o
    jR#                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlm  mZ	 dD ]Z
e
ejvr2ee
eje
< q#dd ejd _dd ejd _d	d ejd _ejdd
 ddlmZ ddlmZmZmZmZmZmZmZmZmZmZ dZdZ G dd deZ!e"dkre#ej$% rdndZ#e!ddddd&e#' Z(e)dd e(* D Z+e,de+d e-dddd&e#Z.e-dd&e#d Z/e0  e(e.e/Z1W d   n1 sw   Y  e12 D ]\Z3Z4e4dure,de3 de5e4j6  qdS dS ) us  DinoVolumeQuery with DA3 geometric features fused in.

Architecture:
  1. DINOv3 backbone → (B, D_dino, 28, 28) patches
  2. DA3-LARGE backbone (frozen) → last-layer patch tokens at (B, D_da3=2048, 28, 28)
  3. Project DA3 patches: 1×1 conv 2048 → 256
  4. Concat: (D_dino + 256, 28, 28)
  5. Fusion: 2 or 3 conv layers with GELU, 3×3 kernels → (D_dino, 28, 28)
  6. Upsample to (D_dino, 56, 56)
  7. Refine to (d_feat=32, 56, 56) — F_feat
  8. Existing query-MLP volume scoring

DA3 features carry geometric/depth priors. The fusion module learns to combine
DINO's semantic features with DA3's geometric features.
    N)depth_anything_3.utils.export!depth_anything_3.utils.pose_alignc                  O      d S N akr   r   2/data/cameron/para/libero/model_dino_da3_fusion.py<lambda>       r   r   c                  O   r   r   r   r   r   r   r
   r      r   r   c                  O   r   r   r   r   r   r   r
   r      r   z/data/cameron/da3_repo/src)DepthAnything3)
DinoVolumeQueryN_WINDOWN_HEIGHT_BINSN_GRIPPER_BINS
N_ROT_BINSD_FEATD_SINZD_SINTIMG_SIZE	PRED_SIZEz/data/cameron/da3_large_weightsi  c                       s>   e Zd ZdZedddd fdd
Zdd	 ZdddZ  ZS )DinoVolumeQueryDA3zN1view query-MLP with DA3 geometric features fused in via concat + conv layers.      T)da3_weightsda3_proj_dimfusion_layers
freeze_da3c             	      s   t  j|i | t|}|jj| _~|r&| j D ]}d|_q| j	  d| _
tj| j
|dd| _| j| }	g }
t|D ]'}|dkrF|	n| j}| j}|
tj||ddd ||d k re|
t  q>tj|
 | _|| _|| _d S )NFi      )kernel_sizer      )r    padding)super__init__r   from_pretrainedmodelbackboneda3_backbone
parametersrequires_gradevalZda3_token_dimnnConv2dda3_proj	embed_dimrangeappendGELU
Sequentialfusionr   r   )selfr   r   r   r   argskwargsfullpZfused_inlayersiZin_chZout_ch	__class__r   r
   r$   .   s,   




zDinoVolumeQueryDA3.__init__c                 C   s`  |j \}}}}|d}tj rtjntj}t 5 tj|j	j
|d | j|dtt| jdg ddd\}}	W d   n1 sDw   Y  W d   n1 sSw   Y  |d }
t|
ttfrg|
d	 }
|
ddd	f }
t| jd
d	}|d }|
j d | }t|d }|| }|
dd||||  f }|||||
j d d	ddd }| S )zRun DA3 backbone, return last-layer patch tokens reshaped to (B, 2048, H_p, W_p).

        DA3 expects (B, S=1, 3, H, W) input.
        r   )device_typedtypeNZ
out_layers)      	      Zsaddle_balanced)Z	cam_tokenZexport_feat_layersZref_view_strategyr   n_storage_tokensg      ?r!   r   )shape	unsqueezetorchcudais_bf16_supportedbfloat16float16no_gradautocastdevicetyper(   listgetattr
isinstancetupleintreshapepermute
contiguousfloat)r5   rgbB_ZH_inZW_inxZautocast_dtypeZfeatsZ_auxlastZ	n_storageZn_prefixZn_patchZHp_da3ZWp_da3patchesr   r   r
   _extract_da3_patchesO   s2   

&z'DinoVolumeQueryDA3._extract_da3_patchesNc           '      C   s  |j d }| j}| j}| |\}}|j dd  \}	}
| |}|j dd  |	|
fkr6tj||	|
fddd}| |}tj	||gdd}| 
|}tj|| j| jfddd}| |}|j dd  \}}| jr|d || j   d|d }|d	 || j   d|d }tj||jd
}||d d ||f }| tj	||gdd}n| |}|d||||| |}| | j}|d||d|| d}|}| jD ]}|||}q| |}||||}| |}| |}| jdkr	| |||d| j }n| |}|dd | j!f } |d| j!| j!| j" f }!|d| j!| j" d f }"t#d| |}#t#d|!| j$}$t#d|"| j}%|#d|$dd |%ddd }&|&|||dS )Nr   bilinearF)sizemodealign_cornersr   )dim).r   ).r   )rO   rD   per_axisr!   .zbtc, bchw -> bthwzbtc, zc   -> btzzbtc, tc   -> btr   )volume_logitsgripper_logitsrotation_logitspixel_feats)%rF   n_windowd_model_extract_dino_featuresr`   Finterpolater.   rH   catr4   	pred_sizerefineuse_eef
image_sizelongclamparangerO   
input_projrG   expandrV   t_cond_projt_sinblocks
final_normviewq_head	grip_headrotation_moderot_head
n_rot_binsd_featd_sin_zeinsumz_sin)'r5   rZ   	start_pixkp_zyxr[   TdZ
patch_dinoclsZH_dinoZW_dinoZ	patch_da3r.   fusedfeat_upF_featHWsxsyb_idxeef_featq_inq_in_btcond_tcond_bthblkpenult	q_spatialgripperrotationq_Fq_zq_tscore_yxscore_zscore_trh   r   r   r
   forwards   sn   




""
  




zDinoVolumeQueryDA3.forwardr   )	__name__
__module____qualname____doc__DA3_WEIGHTS_DEFAULTr$   r`   r   __classcell__r   r   r<   r
   r   +   s    !$r   __main__rI   cpu   i  1d_pcar   )rl   ru   r   r   c                 c   s    | ]
}|j r| V  qd S r   )r*   numel).0r9   r   r   r
   	<genexpr>   s    r   zTrainable: ,r!   z  z: )7r   ossystypesrH   torch.nnr,   torch.nn.functional
functionalro   nmodules
ModuleTypeexportZalign_poses_umeyamaZbatch_align_poses_umeyamapathinsertZdepth_anything_3.apir   model_dino_volume_queryr   r   r   r   r   r   r   r   r   r   r   ZDA3_INPUT_SIZEr   r   rO   rI   is_availabletor+   msumr)   n_tprintrandrZ   sprM   outitemsr	   vrT   rF   r   r   r   r
   <module>   sP    
0 

"
