o
    çIÄi^  ã                   @   sv  d Z ddlZddlZddlmZ ddlm  mZ ej 	dd¡Z
ej 	dd¡ZdZdZd	Zd	Zd	Zd
ZdZG dd„ dejƒZedkr¹e ej ¡ rKdnd¡ZededZe e¡Ze dddd¡ e¡Ze de¡ e¡Ze ded¡ e¡Ze  ¡  eee !ddg¡ e¡eed\Z"Z#Z$Z%W d  ƒ n1 sšw   Y  e&de"j'ƒ e&de#j'ƒ e&de$j'ƒ e&de%j'ƒ dS dS )u¤  DINO-VLA baseline â€” DINOv3 backbone + CLIP task conditioning + PARA heads.

Same architecture as PARA, but with a precomputed CLIP text embedding of the
current task added to every DINO patch token via a learned projection MLP.
This tests whether language-conditioned features help multi-task performance.

The CLIP embeddings are precomputed (frozen) and loaded as .pt tensors â€”
no CLIP forward pass at train time.
é    NÚDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3ÚDINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   é   é    é@   i   c                       sZ   e Zd ZdZdeedef‡ fdd„	Z‡ fdd„Zdd	„ Z	d
d„ Z
dd„ Zddd„Z‡  ZS )ÚDinoVLAPredictorzDDINOv3 backbone + CLIP task conditioning + PARA-style heatmap heads.éÀ  Fc           	         s  t ƒ  ¡  || _|| _|| _t| _d| _tdƒ t	j
jtddtd| _|r:| j ¡ D ]}d|_q*| j ¡  tdƒ ntdƒ | jj| _| j}t t ||¡t ¡ t ||¡¡| _td	|› d
|› ƒ t t	 |¡d ¡| _td|› dƒ t tj||dddt ¡ tj||dddt ¡ tj||dddt ¡ ¡| _td|› ƒ tj||t dd| _td|› dt› d|› d|› d	ƒ t t |¡t ||¡t ¡ t |d¡¡| _ t t |¡t ||¡t ¡ t |dt! ¡¡| _"td|› dt#› dƒ td|› dt!› dƒ d S )NÚdino_vlazLoading DINOv2 model...Údinov3_vits16plusÚlocal)ÚsourceÚweightsFu   âœ“ Frozen DINOv2 backboneu    âœ“ DINOv2 backbone is trainableu   âœ“ CLIP projection: u    â†’ g{®Gáz”?u,   âœ“ Learnable start keypoint embedding (dim=ú)é   é   )Úkernel_sizeÚpaddingu1   âœ“ Feature convs: 3Ã— Conv2d(3Ã—3) at pred_size=)r   u   âœ“ Volume   head â†’ (B, z, u   âœ“ Gripper  MLP  â†’ (B, u   âœ“ Rotation MLP  â†’ (B, z, 3, )$ÚsuperÚ__init__Útarget_sizeÚ	pred_sizeÚn_windowÚDINO_PATCH_SIZEÚ
patch_sizeÚ
model_typeÚprintÚtorchÚhubÚloadr   r   ÚdinoÚ
parametersÚrequires_gradÚevalÚ	embed_dimÚnnÚ
SequentialÚLinearÚGELUÚ	clip_projÚ	ParameterÚrandnÚstart_keypoint_embeddingÚConv2dÚfeature_convsÚN_HEIGHT_BINSÚvolume_headÚ	LayerNormÚgripper_mlpÚ
N_ROT_BINSÚrotation_mlpÚN_GRIPPER_BINS)	Úselfr   r   r   Úfreeze_backboneZclip_dimÚkwargsÚparamÚD©Ú	__class__© ú+/data/cameron/para/libero/model_dino_vla.pyr      sZ   
ü




ýý""ÿ&ÿzDinoVLAPredictor.__init__c                    s(   t ƒ  |¡ t| dƒr| j |¡| _| S )Nr    )r   ÚtoÚhasattrr    )r6   Údevicer;   r=   r>   r?   \   s   
zDinoVLAPredictor.toc           
      C   sÎ   |j d }| j |¡\}\}}| jjD ]}| jjr!| jj||dnd}|||ƒ}q| jjr@| j |dd…| jjd d…f ¡}n| j |¡dd…| jjd d…f }| |||| j	¡}	|	 
dddd¡ ¡ }	|	S )z,Extract patch features from DINOv3 backbone.r   )ÚHÚWNr   r   é   )Úshaper    Úprepare_tokens_with_masksÚblocksÚ
rope_embedÚuntie_cls_and_patch_normsÚnormÚn_storage_tokensÚreshaper$   ÚpermuteÚ
contiguous)
r6   ÚxÚBÚx_tokensÚH_pÚW_pÚblkÚrope_sincosÚx_norm_patchesÚpatch_featuresr=   r=   r>   Ú_extract_dino_featuresb   s   
&$z'DinoVLAPredictor._extract_dino_featuresc                 C   s|   |j \}}}}|j d }|d  ¡  d|d ¡}|d  ¡  d|d ¡}	tj||jd |d¡ ||¡}
||
d d …|	|f S )Nr   ).r   r   ).r   ©rA   )rE   ÚlongÚclampr   ÚarangerA   ÚviewÚexpand)r6   ÚfeatsÚquery_pixelsrP   r:   rB   rC   ÚNÚpxÚpyÚ	batch_idxr=   r=   r>   Ú_index_featuresr   s   
 z DinoVLAPredictor._index_featuresc           	      C   sd   |j d d… \}}|  | ¡ |¡}| || | j¡}|  |¡ ||¡}|  |¡ ||dt¡}||fS )NrD   r   )rE   re   ÚdetachrL   r$   r2   r4   r3   )	r6   r_   r`   rP   ra   ÚindexedÚflatÚgripperÚrotationr=   r=   r>   Úpredict_at_pixelsz   s   z"DinoVLAPredictor.predict_at_pixelsNc                 C   sb  |j d }|  |¡}|j \}}}	}
|dur$|  |¡}|| d¡ d¡ }| ¡ dkr3| d¡ |d¡}|dd…df |
 | j  ¡  d|
d ¡}|dd…df |	 | j  ¡  d|	d ¡}t	j
||jd}||dd…||f  | j d¡7  < tj|| j| jfddd}|  |¡}|  |¡}| || jt| j| j¡}|dur§|  ||¡\}}nd }}||||fS )	ax  
        Args:
            x:                 (B, 3, H, W)
            start_keypoint_2d: (B, 2) or (2,) current EEF pixel
            query_pixels:      (B, N_WINDOW, 2) for gripper/rotation heads
            clip_embedding:    (B, clip_dim) precomputed CLIP text embedding for task

        Returns:
            volume_logits, gripper_logits, rotation_logits, feats
        r   Néÿÿÿÿr   rY   ÚbilinearF)ÚsizeÚmodeÚalign_corners)rE   rX   r)   Ú	unsqueezeÚdimr^   r   rZ   r[   r   r\   rA   r,   ÚFÚinterpolater   r.   r0   r]   r   r/   rk   )r6   rO   Ústart_keypoint_2dr`   Úclip_embeddingrP   rW   Ú_r:   rR   rS   r)   Ústart_patch_xÚstart_patch_yÚbatch_indicesr_   ÚvolÚvolume_logitsÚgripper_logitsÚrotation_logitsr=   r=   r>   Úforward‚   s(   


**$

zDinoVLAPredictor.forward)NN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú	PRED_SIZEÚN_WINDOWÚCLIP_DIMr   r?   rX   re   rk   r   Ú__classcell__r=   r=   r;   r>   r      s    ÿ=r   Ú__main__ÚcudaÚcpur	   )r   r   rD   r   g      l@)ru   r`   rv   zvolume_logits  zgripper_logits r~   zfeats          )(rƒ   Úosr   Útorch.nnr%   Útorch.nn.functionalÚ
functionalrs   ÚenvironÚgetr   r   r   r…   r/   r5   r3   r„   r†   ÚModuler   r€   rA   r‰   Úis_availableÚmodelr?   r+   rO   Zclip_embÚzerosZ
fake_queryÚno_gradÚtensorr{   ÚgripÚrotr_   r   rE   r=   r=   r=   r>   Ú<module>   sB    
 

ÿÿó