o
    Kid                     @   s   d Z ddlZddlZddlmZ ddlm  mZ ddlm	Z	 dZ
dZdZdZdZdZejdd	Zd
d ZG dd dejZdS )u  DA3 baseline — Depth Anything 3 pretrained DINOv2 backbone + PARA heads.

Uses the DINOv2 ViT-S/14 backbone initialized with DA3-SMALL weights, pretrained
for monocular depth estimation. The hypothesis: geometry-pretrained features
improve 3D EEF prediction.

Architecture: same as PARA (heatmap volume + gripper/rotation MLPs) but with
DA3's depth-pretrained backbone instead of vanilla DINOv3.
    N)	load_file       @      DA3_WEIGHTS_PATHz+/data/cameron/da3_weights/model.safetensorsc                    s   t jjdddd}t| }d  fdd| D }|j|dd\}}|r6td	t| d
|dd  d |rItdt| d
|dd  d tdt| d |S )zILoad standard DINOv2 ViT-S/14 and initialize with DA3 pretrained weights.zfacebookresearch/dinov2Zdinov2_vits14F)
pretrainedzmodel.backbone.pretrained.c                    s,   i | ]\}}|  r|t d  |qS N)
startswithlen).0kvprefix &/data/cameron/para/libero/model_da3.py
<dictcomp>!   s   , z0_load_dinov2_vits14_from_da3.<locals>.<dictcomp>)strictz  DA3 backbone missing keys (z): N   z...z   DA3 backbone unexpected keys (u$   ✓ Loaded DA3 pretrained backbone (z keys))torchhubloadr   itemsload_state_dictprintr   )Zweights_pathbackbonesdZbackbone_sdmissing
unexpectedr   r   r   _load_dinov2_vits14_from_da3   s   ""r    c                       sX   e Zd ZdZdeedf fdd	Z fddZdd	 Zd
d Z	dd Z
dddZ  ZS )DA3PredictorzCDA3-pretrained DINOv2 ViT-S/14 backbone + PARA-style heatmap heads.i  Fc                    s  t    || _|| _|| _t| _d| _td t	t
| _|r4| j D ]}d|_q$| j  td ntd d| _| j}tt|d | _td| d	 ttj||d
ddt tj||d
ddt tj||d
ddt | _td|  tj||t dd| _td| dt d| d| d		 tt|t||t t|t| _tt|t||t t|d
t | _ td| dt d	 td| dt d	 d S )NZda3z)Loading DA3 pretrained DINOv2 backbone...Fu   ✓ Frozen DA3 backboneu   ✓ DA3 backbone is trainablei  g{Gz?u,   ✓ Learnable start keypoint embedding (dim=)      )kernel_sizepaddingu1   ✓ Feature convs: 3× Conv2d(3×3) at pred_size=)r%   u   ✓ Volume   head → (B, z, u   ✓ Gripper  MLP  → (B, u   ✓ Rotation MLP  → (B, z, 3, )!super__init__target_size	pred_sizen_window
PATCH_SIZE
patch_size
model_typer   r    r   r   
parametersrequires_gradeval	embed_dimnn	Parameterr   randnstart_keypoint_embedding
SequentialConv2dGELUfeature_convsN_HEIGHT_BINSvolume_head	LayerNormLinearN_GRIPPER_BINSgripper_mlp
N_ROT_BINSrotation_mlp)selfr)   r*   r+   freeze_backbonekwargsparamD	__class__r   r   r(   /   sD   



""&zDA3Predictor.__init__c                    s   t  | | j|| _| S r	   )r'   tor   )rC   devicerH   r   r   rJ   [   s   zDA3Predictor.toc                 C   sd   |j d }| jj|dgd}|d }t|j d d  }}||||| j}|dddd }|S )z3Extract patch features using DA3's DINOv2 backbone.r      )nr$   g      ?r#      )shaper   get_intermediate_layersintreshaper2   permute
contiguous)rC   xBfeatspatch_tokensH_pW_ppatch_featuresr   r   r   _extract_features`   s   
zDA3Predictor._extract_featuresc                 C   s|   |j \}}}}|j d }|d  d|d }|d  d|d }	tj||jd|d||}
||
d d |	|f S )Nr$   ).r   r   ).r$   rK   )rO   longclampr   arangerK   viewexpand)rC   rW   query_pixelsrV   rG   HWNpxpy	batch_idxr   r   r   _index_featuresj   s   
 zDA3Predictor._index_featuresc           	      C   sf   |j d d \}}| | |}||| | j}| |||t}| |||dt}||fS )NrN   r#   )	rO   rj   detachrR   r2   r@   r?   rB   rA   )	rC   rW   rc   rV   rf   indexedflatZgripperrotationr   r   r   predict_at_pixelsr   s   zDA3Predictor.predict_at_pixelsNc                 C   s<  |j d }| |}|j \}}}}	| dkr |d|d}|d d df |	 | j  d|	d }
|d d df | | j  d|d }tj	||j
d}||d d ||
f  | jd7  < tj|| j| jfddd}| |}| |}||| jt| j| j}|d ur| ||\}}nd  }}||||fS )Nr   r$   r]   bilinearF)sizemodealign_corners)rO   r\   dim	unsqueezerb   r)   r^   r_   r   r`   rK   r6   Finterpolater*   r:   r<   ra   r+   r;   ro   )rC   rU   start_keypoint_2drc   rV   r[   _rG   rY   rZ   Zstart_patch_xZstart_patch_ybatch_indicesrW   volZvolume_logitsZgripper_logitsZrotation_logitsr   r   r   forwardz   s"   

**$

zDA3Predictor.forwardr	   )__name__
__module____qualname____doc__	PRED_SIZEN_WINDOWr(   rJ   r\   rj   ro   r}   __classcell__r   r   rH   r   r!   ,   s    ,
r!   )r   osr   torch.nnr3   Ztorch.nn.functional
functionalrw   Zsafetensors.torchr   r   r;   r?   rA   r   r,   environgetr   r    Moduler!   r   r   r   r   <module>   s    
