o
    !j                     @   s  d Z ddlZddlZddlZddlZdD ]ZeeZeeje< qdd ejd _	dd ejd _
d	d ejd _ddlZddlmZ dd
lmZ dZdZdZdZdZdZdZG dd dejZedkreej rndndZe e Ze dd e! D Z"e#de"d e$ddeeeZ%e&  ee%Z'W d   n1 sw   Y  e'( D ]>\Z)Z*e+e*dre#de) de,e*j-  qe.e*e/re#de) de0e* d e*re+e*d dre#d e,e*d j-  qej1dkr
e#d!ej2 d" d#d$ dS dS dS )%uz  DA3-based volume model with factored KV-attention.

Architecture (Cameron 2026-05-18 spec):
  - DA3 backbone + DPT head; aux head's final 1×1 conv replaced to output KEY_DIM (+1 conf).
  - Per-pixel feature F ∈ R^(B × KEY_DIM × H × W) — "value/query" stream.
  - Learnable height embeddings h_emb ∈ R^(N_HEIGHT_BINS × H_DIM).
  - Learnable time   embeddings t_emb ∈ R^(N_WINDOW × T_DIM).
  - Key per (t, z): key(t, z) = concat(t_emb[t], h_emb[z]) ∈ R^(T_DIM + H_DIM = KEY_DIM).
  - Volume logits via bilinear scoring: l(b, t, z, u, v) = F(b, :, u, v) · key(t, z) / sqrt(KEY_DIM).

We replace the original libero (B, N_WINDOW * N_HEIGHT_BINS, H, W) dense head with the
factored KV decomposition: parameter-efficient, structural inductive bias that height/time
are categorical attributes that share representation across spatial locations.

forward(rgb) returns:
  volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, h_out, w_out)
  pred_depth:    (B, H, W)  — for distillation against frozen DA3 depth
  dino_feats:    list of intermediate DINO features (for PCA viz)
  pixel_feats:   (B, KEY_DIM, h_out, w_out) — for debugging / viz
    N)depth_anything_3.utils.export!depth_anything_3.utils.pose_alignc                  O      d S N akr   r   0/data/cameron/para/libero/model_da3_volume_v2.py<lambda>       r   r   c                  O   r   r   r   r   r   r   r
   r      r   r   c                  O   r   r   r   r   r   r   r
   r      r   )DepthAnything3z/data/cameron/da3_large_weightsi         0   c                       sV   e Zd Zeeeeeedfde	de
de
de
de
de
f fdd	Zd
d Zdd Z  ZS )DA3VolumeModelNweights_pathn_windown_height_binskey_dimtime_dim
height_dimc                    s<  t    ||kr||ksJ || _|| _|| _|| _|| _t|}|j	j
| _
|j	j| _~|d u r?tt| j
dg d| _nt|| _| jjjd }	|	d }
tj|
j|d dddd}tj|j tjj|jdd ||	d< t||| _t||| _tjj| jjd	d
dd tjj| jjd	d
dd t|| _d S )NZ
out_layers)      	         r   )Zkernel_sizeZstrideZpaddingg?)stdg?gɿg?)r   r   b)super__init__r   r   r   r   r   r   Zfrom_pretrainedZmodelbackboneheadlistgetattrdino_feat_layersZscratchZoutput_conv2_auxnnZConv2dZin_channelsinitZzeros_ZbiasZnormal_weightZ	Embeddingh_embt_embZtrunc_normal_Z	LayerNorm
pixel_norm)selfr   r   r   r   r   r   r&   ZfullZlast_aux_seqZold_convZnew_conv	__class__r   r
   r!   /   s2   




zDA3VolumeModel.__init__c                 C   s0   | j }| j}| jjd}| jjd}|| S )u   v2: SUM of full-dim time and height embeddings (instead of concat halves).
        Returns keys: (n_window, n_height_bins, key_dim).
        key(t, z) = t_emb[t] + h_emb[z]   (both ∈ R^key_dim).
        r   r   )r   r   r+   r)   	unsqueezer*   )r-   TZZt_eZh_er   r   r
   _build_keysY   s
   zDA3VolumeModel._build_keysc                 C   s"  | d}tj rtjntj}tj|jj|d | j	|d| j
dd\}}W d   n1 s0w   Y  |jd |jd }}tj|jjdd	 | j|||d
d}W d   n1 s]w   Y  |d }	|	 dkrv|	ddd
d
f }
n|	 dkr|	ddd
f }
n
tdt|	j |d }| dkr|ddd
f d
ddd}n| dkr|d
ddd}n
tdt|j |j\}}}}| |d
dddd
ddd}|  }dt| j }td||| }g }|D ]}t|ttfr||d
  q|| q||
||dS )z rgb: (B, 3, 504, 504) in [0, 1].r   )device_typeZdtypeNZsaddle_balanced)Z	cam_tokenZexport_feat_layersZref_view_strategyr   F)r4   enabledr   )Zpatch_start_idxdepthr      zunexpected depth shape ray      zunexpected ray shape g      ?zbchw, tzc -> btzhw)volume_logits
pred_depthpixel_feats
dino_feats)r0   torchcudaZis_bf16_supportedZbfloat16Zfloat16Zautocastdevicetyper"   r&   shaper#   dimRuntimeErrortupleZpermuter,   r3   mathZsqrtr   Zeinsum
isinstancer$   append)r-   rgbxZautocast_dtypeZfeatsZ_auxHWZhead_outr7   r=   r9   r>   BZCfZHfZWfZf_normkeysZscaler<   r?   Zlayer_featsr   r   r
   forwardd   sP   
"zDA3VolumeModel.forward)__name__
__module____qualname__DA3_WEIGHTS_DEFAULTN_WINDOWN_HEIGHT_BINSKEY_DIMTIME_DIM
HEIGHT_DIMstrintr!   r3   rQ   __classcell__r   r   r.   r
   r   .   s$    *r   __main__rA   Zcpuc                 c   s    | ]
}|j r| V  qd S r   )Zrequires_gradZnumel).0pr   r   r
   	<genexpr>   s    ra   zTrainable: ,r;   r:   rD   z  z: z: list()z    first: zpeak: g    eAz.2fz GB)3__doc__systypesosrH   n
ModuleTypemmodulesZexportZalign_poses_umeyamaZbatch_align_poses_umeyamar@   Ztorch.nnr'   Zdepth_anything_3.apir   rU   Z	DA3_INPUTrV   rW   rX   rY   rZ   ZModuler   rR   rB   rA   Zis_availabletoevalsumZ
parametersZn_tprintrandrK   Zno_gradoutitemsr	   vhasattrrG   rD   rI   r$   lenrC   Zmax_memory_allocatedr   r   r   r
   <module>   sL     
n



* 