o
    ja                     @   s,  d Z ddlZddlZddlZddlZdD ]ZeeZeeje< qdd ejd _	dd ejd _
d	d ejd _ddlZddlmZ dd
lmZ dZdZdZdZdZdZdZee eksaJ dG dd dejZedkreej rxdndZe e Ze dd e! D Z"e#de"d e$ddeeeZ%e&  ee%Z'W d   n1 sw   Y  e'( D ]>\Z)Z*e+e*dre#de) de,e*j-  qe.e*e/re#de) d e0e* d! e*re+e*d dre#d"e,e*d j-  qej1dkre#d#ej2 d$ d%d& dS dS dS )'uz  DA3-based volume model with factored KV-attention.

Architecture (Cameron 2026-05-18 spec):
  - DA3 backbone + DPT head; aux head's final 1×1 conv replaced to output KEY_DIM (+1 conf).
  - Per-pixel feature F ∈ R^(B × KEY_DIM × H × W) — "value/query" stream.
  - Learnable height embeddings h_emb ∈ R^(N_HEIGHT_BINS × H_DIM).
  - Learnable time   embeddings t_emb ∈ R^(N_WINDOW × T_DIM).
  - Key per (t, z): key(t, z) = concat(t_emb[t], h_emb[z]) ∈ R^(T_DIM + H_DIM = KEY_DIM).
  - Volume logits via bilinear scoring: l(b, t, z, u, v) = F(b, :, u, v) · key(t, z) / sqrt(KEY_DIM).

We replace the original libero (B, N_WINDOW * N_HEIGHT_BINS, H, W) dense head with the
factored KV decomposition: parameter-efficient, structural inductive bias that height/time
are categorical attributes that share representation across spatial locations.

forward(rgb) returns:
  volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, h_out, w_out)
  pred_depth:    (B, H, W)  — for distillation against frozen DA3 depth
  dino_feats:    list of intermediate DINO features (for PCA viz)
  pixel_feats:   (B, KEY_DIM, h_out, w_out) — for debugging / viz
    N)depth_anything_3.utils.export!depth_anything_3.utils.pose_alignc                  O      d S N akr   r   -/data/cameron/para/libero/model_da3_volume.py<lambda>       r   r   c                  O   r   r   r   r   r   r   r
   r      r   r   c                  O   r   r   r   r   r   r   r
   r      r   )DepthAnything3z/data/cameron/da3_large_weightsi         0      z(TIME_DIM + HEIGHT_DIM must equal KEY_DIMc                       sV   e Zd Zeeeeeedfde	de
de
de
de
de
f fdd	Zd
d Zdd Z  ZS )DA3VolumeModelNweights_pathn_windown_height_binskey_dimtime_dim
height_dimc                    s8  t    || |ksJ || _|| _|| _|| _|| _t|}|j	j
| _
|j	j| _~|d u r=tt| j
dg d| _nt|| _| jjjd }	|	d }
tj|
j|d dddd}tj|j tjj|jdd ||	d< t||| _t||| _tjj| jjd	d
dd tjj| jjd	d
dd t|| _d S )NZ
out_layers)      	         r   )kernel_sizestridepaddingg?)stdg?gɿg?)r"   r   b)super__init__r   r   r   r   r   r   from_pretrainedmodelbackboneheadlistgetattrdino_feat_layersscratchZoutput_conv2_auxnnConv2din_channelsinitzeros_biasnormal_weight	Embeddingh_embt_embtrunc_normal_	LayerNorm
pixel_norm)selfr   r   r   r   r   r   r,   fullZlast_aux_seqZold_convZnew_conv	__class__r   r
   r%   ,   s2   




zDA3VolumeModel.__init__c                 C   sR   | j }| j}| jjd||| j}| jjd||| j}t	j
||gddS )zjReturns keys: (n_window, n_height_bins, key_dim).
        key(t, z) = concat(t_emb[t], h_emb[z]).
        r   r   r   )dim)r   r   r8   r5   	unsqueezeexpandr   r7   r   torchcat)r<   TZZt_eZh_er   r   r
   _build_keysU   s
   zDA3VolumeModel._build_keysc                 C   s"  | d}tj rtjntj}tj|jj|d | j	|d| j
dd\}}W d   n1 s0w   Y  |jd |jd }}tj|jjdd	 | j|||d
d}W d   n1 s]w   Y  |d }	|	 dkrv|	ddd
d
f }
n|	 dkr|	ddd
f }
n
tdt|	j |d }| dkr|ddd
f d
ddd}n| dkr|d
ddd}n
tdt|j |j\}}}}| |d
dddd
ddd}|  }dt| j }td||| }g }|D ]}t|ttfr||d
  q|| q||
||dS )z rgb: (B, 3, 504, 504) in [0, 1].r   )device_typedtypeNZsaddle_balanced)Z	cam_tokenZexport_feat_layersZref_view_strategyr   F)rH   enabledr   )Zpatch_start_idxdepthr      zunexpected depth shape ray      zunexpected ray shape g      ?zbchw, tzc -> btzhw)volume_logits
pred_depthpixel_feats
dino_feats)rA   rC   cudais_bf16_supportedbfloat16float16autocastdevicetyper(   r,   shaper)   r@   RuntimeErrortuplepermuter;   rG   mathsqrtr   einsum
isinstancer*   append)r<   rgbxZautocast_dtypeZfeatsZ_auxHWZhead_outrL   rR   rN   rS   BZCfZHfZWff_normkeysscalerQ   rT   Zlayer_featsr   r   r
   forward_   sP   
"zDA3VolumeModel.forward)__name__
__module____qualname__DA3_WEIGHTS_DEFAULTN_WINDOWN_HEIGHT_BINSKEY_DIMTIME_DIM
HEIGHT_DIMstrintr%   rG   rm   __classcell__r   r   r>   r
   r   +   s$    )
r   __main__rU   cpuc                 c   s    | ]
}|j r| V  qd S r   )requires_gradnumel).0pr   r   r
   	<genexpr>   s    r   zTrainable: ,rP   rO   r\   z  z: z: list()z    first: zpeak: g    eAz.2fz GB)3__doc__systypesosr`   n
ModuleTypemmodulesexportZalign_poses_umeyamaZbatch_align_poses_umeyamarC   torch.nnr.   Zdepth_anything_3.apir   rq   	DA3_INPUTrr   rs   rt   ru   rv   Moduler   rn   rZ   rU   is_availabletoevalsum
parametersZn_tprintrandre   no_gradoutitemsr	   vhasattrr^   r\   rc   r*   lenr[   max_memory_allocatedr   r   r   r
   <module>   sN     
l



* 