o
    cj+&                     @   s  d Z ddlZddlZddlZddlZdD ]ZeeZeeje< qdd ejd _	dd ejd _
d	d ejd _ddlZddlmZ dd
lmZ dZdZdZdZdZdZdZG dd dejZedkreej rndndZe e Ze dd e! D Z"e#de"d e$ddeeeZ%e&  ee%Z'W d   n1 sw   Y  e'( D ]>\Z)Z*e+e*dre#de) de,e*j-  qe.e*e/re#de) de0e* d e*re+e*d dre#d e,e*d j-  qej1dkr
e#d!ej2 d" d#d$ dS dS dS )%uz  DA3-based volume model with factored KV-attention.

Architecture (Cameron 2026-05-18 spec):
  - DA3 backbone + DPT head; aux head's final 1×1 conv replaced to output KEY_DIM (+1 conf).
  - Per-pixel feature F ∈ R^(B × KEY_DIM × H × W) — "value/query" stream.
  - Learnable height embeddings h_emb ∈ R^(N_HEIGHT_BINS × H_DIM).
  - Learnable time   embeddings t_emb ∈ R^(N_WINDOW × T_DIM).
  - Key per (t, z): key(t, z) = concat(t_emb[t], h_emb[z]) ∈ R^(T_DIM + H_DIM = KEY_DIM).
  - Volume logits via bilinear scoring: l(b, t, z, u, v) = F(b, :, u, v) · key(t, z) / sqrt(KEY_DIM).

We replace the original libero (B, N_WINDOW * N_HEIGHT_BINS, H, W) dense head with the
factored KV decomposition: parameter-efficient, structural inductive bias that height/time
are categorical attributes that share representation across spatial locations.

forward(rgb) returns:
  volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, h_out, w_out)
  pred_depth:    (B, H, W)  — for distillation against frozen DA3 depth
  dino_feats:    list of intermediate DINO features (for PCA viz)
  pixel_feats:   (B, KEY_DIM, h_out, w_out) — for debugging / viz
    N)depth_anything_3.utils.export!depth_anything_3.utils.pose_alignc                  O      d S N akr   r   0/data/cameron/para/libero/model_da3_volume_v3.py<lambda>       r   r   c                  O   r   r   r   r   r   r   r
   r      r   r   c                  O   r   r   r   r   r   r   r
   r      r   )DepthAnything3z/data/cameron/da3_large_weightsi         0   c                       sb   e Zd Zeeeeeedfde	de
de
de
de
de
f fdd	Zed
d Zdd Zdd Z  ZS )DA3VolumeModelNweights_pathn_windown_height_binskey_dimtime_dim
height_dimc                    s~  t    ||kr||ksJ || _|| _|| _|| _|| _t|}|j	j
| _
|j	j| _~|d u r?tt| j
dg d| _nt|| _| jjjd }	|	d }
tj|
j|d dddd}tj|j tjj|jdd ||	d< t||| _t||| _tjj| jjd	d
dd tjj| jjd	d
dd | jd| ||dd | jd| ||dd t|| _ t!t"#d| _$d S )NZ
out_layers)      	         r   )kernel_sizestridepaddingg?)stdg{Gz?g)r!   r   bt_sinF)
persistenth_singHzG@)%super__init__r   r   r   r   r   r   from_pretrainedmodelbackboneheadlistgetattrdino_feat_layersscratchZoutput_conv2_auxnnConv2din_channelsinitzeros_biasnormal_weight	Embeddingh_embt_embtrunc_normal_register_buffer_sinusoidal_features	LayerNorm
pixel_norm	Parametertorchtensorlogit_scale)selfr   r   r   r   r   r   r.   fullZlast_aux_seqZold_convZnew_conv	__class__r   r
   r'   3   s@   




zDA3VolumeModel.__init__c                 C   s   |d dksJ |d }t j| t jdt| d d }dt j|t jd }|d|d tj }t jt |t 	|gddS )zNeRF/transformer-style sinusoidal positional encoding.
        Returns (n_values, dim) where each row is sin/cos at log-spaced freqs.
        Normalize position to [0, 1] so all n_values fit one cycle base.
           r   )dtyper   g       @r   )dim)
rA   arangefloat32max	unsqueezemathpicatsincos)n_valuesrJ   LposZfreqsanglesr   r   r
   r=   f   s   z#DA3VolumeModel._sinusoidal_featuresc                 C   s0   | j | jj d}| j| jj d}|| S )u   v3: sinusoidal (fixed) + small learned embedding, SUMMED across t and z.
        key(t, z) = (t_sin[t] + t_emb[t]) + (h_sin[z] + h_emb[z])  ∈ R^key_dim.
        r   r   )r#   r:   r7   rN   r%   r9   )rD   Zt_totalZh_totalr   r   r
   _build_keyst   s   zDA3VolumeModel._build_keysc                 C   sZ  | d}tj rtjntj}tj|jj|d | j	|d| j
dd\}}W d   n1 s0w   Y  |jd |jd }}tj|jjdd	 | j|||d
d}W d   n1 s]w   Y  |d }	|	 dkrv|	ddd
d
f }
n|	 dkr|	ddd
f }
n
tdt|	j |d }| dkr|ddd
f d
ddd}n| dkr|d
ddd}n
tdt|j |j\}}}}| |d
dddd
ddd}||jdddd  }|  }||jdddd  }| jjtdd }td||| }g }|D ]}t|ttfr||d
  q|| q||
||dS )z rgb: (B, 3, 504, 504) in [0, 1].r   )device_typerI   NZsaddle_balanced)Z	cam_tokenZexport_feat_layersZref_view_strategyr   F)rY   enabledr   )Zpatch_start_idxdepthr      zunexpected depth shape ray   rH   zunexpected ray shape T)rJ   keepdimgư>g      Y@)rM   zbchw, tzc -> btzhw)volume_logits
pred_depthpixel_feats
dino_feats)rN   rA   cudais_bf16_supportedbfloat16float16autocastdevicetyper*   r.   shaper+   rJ   RuntimeErrortuplepermuter?   normrX   rC   clamprO   logexpeinsum
isinstancer,   append)rD   rgbxZautocast_dtypeZfeatsZ_auxHWZhead_outr\   rb   r^   rc   BZCfZHfZWfZf_lnZf_unitkeysZ	keys_unitscalera   rd   Zlayer_featsr   r   r
   forward|   sT   
"zDA3VolumeModel.forward)__name__
__module____qualname__DA3_WEIGHTS_DEFAULTN_WINDOWN_HEIGHT_BINSKEY_DIMTIME_DIM
HEIGHT_DIMstrintr'   staticmethodr=   rX   r~   __classcell__r   r   rF   r
   r   2   s(    3
r   __main__re   cpuc                 c   s    | ]
}|j r| V  qd S r   )requires_gradnumel).0pr   r   r
   	<genexpr>   s    r   zTrainable: ,rH   r_   rl   z  z: z: list()z    first: zpeak: g    eAz.2fz GB)3__doc__systypesosrO   n
ModuleTypemmodulesexportZalign_poses_umeyamaZbatch_align_poses_umeyamarA   torch.nnr0   Zdepth_anything_3.apir   r   Z	DA3_INPUTr   r   r   r   r   Moduler   r   rj   re   is_availabletoevalsum
parametersZn_tprintrandrw   no_gradoutitemsr	   vhasattrrn   rl   ru   r,   lenrk   max_memory_allocatedr   r   r   r
   <module>   sN     	 




* 