o
    çIÄiß  ã                   @   s˜  d Z ddlZddlZddlmZ ej dd¡Zej dd¡ZdZ	dZ
G d	d
„ d
ejƒZedkrÊe ej ¡ r8dnd¡Zede
dZe e¡Ze dddd¡ e¡Ze ddg¡ e¡Ze dd¡ e¡Ze dd¡ e¡Ze ¡  eeeeed\ZZZW d  ƒ n1 s‡w   Y  edejde ¡ d›de  ¡ d›dƒ edejde ¡ d›de  ¡ d›dƒ edejde ¡ d›de  ¡ d›dƒ dS dS )u±  ACT baseline â€” direct regression from DINO CLS token + proprioception.

Same DINOv3 ViT-S/16 backbone as PARA, but instead of pixel-aligned heatmaps,
the CLS token is concatenated with proprioceptive state (current EEF position +
gripper) and passed through MLPs with sigmoid outputs to predict normalized [0,1]
targets:
  - 3D EEF position (N_WINDOW Ã— 3)  â€” normalized via dataset min/max
  - Euler rotation  (N_WINDOW Ã— 3)  â€” normalized via dataset min/max
  - Gripper value    (N_WINDOW Ã— 1)  â€” normalized via dataset min/max

All outputs are sigmoid â†’ [0,1], all targets are min/max normalized â†’ [0,1],
so MSE losses are naturally balanced without manual weight tuning.
é    NÚDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3ÚDINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   é   c                       sJ   e Zd ZdZdedf‡ fdd„	Z‡ fdd„Zdd	„ Z	
	
ddd„Z‡  Z	S )ÚACTPredictoruZ   Direct regression baseline: DINO CLS token + proprioception â†’ MLP â†’ sigmoid â†’ [0,1].éÀ  Fc                    s2  t ƒ  ¡  || _|| _d| _tdƒ tjjt	ddt
d| _|r4| j ¡ D ]}d|_q$| j ¡  tdƒ ntdƒ | jj| _| j}|d	 d
 d }| dd¡}t t ||¡t ¡ t ||¡¡| _||7 }td|› d|› ƒ t t |¡t ||¡t ¡ t ||¡t ¡ t ||d
 ¡t ¡ ¡| _t t |¡t ||¡t ¡ t ||¡t ¡ t ||d
 ¡t ¡ ¡| _t t |¡t ||¡t ¡ t ||¡t ¡ t ||¡¡| _tdd„ |  ¡ D ƒƒ}	tdd„ |  ¡ D ƒƒ}
td|
d›d|	d›dƒ td|› d|› ƒ td|› dƒ td|› dƒ td|› dƒ d S )NÚactzLoading DINOv2 model...Údinov3_vits16plusÚlocal)ÚsourceÚweightsFu   âœ“ Frozen DINOv2 backboneu    âœ“ DINOv2 backbone is trainableé   é   é   Úclip_dimi   u   âœ“ CLIP projection: u    â†’ c                 s   s    | ]}|  ¡ V  qd S ©N)Únumel©Ú.0Úp© r   ú&/data/cameron/para/libero/model_act.pyÚ	<genexpr>^   s   € z(ACTPredictor.__init__.<locals>.<genexpr>c                 s   s    | ]
}|j r| ¡ V  qd S r   )Úrequires_gradr   r   r   r   r   r   _   s   € u   âœ“ ACT model: ú,z / z trainable paramsz  Input: CLS(z,) + start_kp(2) + eef_pos(3) + gripper(1) = u     pos_mlp:     â†’ (B, z, 3) [sigmoid, normalized]u     rot_mlp:     â†’ (B, u     gripper_mlp: â†’ (B, z)    [sigmoid, normalized])ÚsuperÚ__init__Útarget_sizeÚn_windowÚ
model_typeÚprintÚtorchÚhubÚloadr   r   ÚdinoÚ
parametersr   ÚevalÚ	embed_dimÚgetÚnnÚ
SequentialÚLinearÚGELUÚ	clip_projÚ	LayerNormÚSigmoidÚpos_mlpÚrot_mlpÚgripper_mlpÚsum)Úselfr   r   Úfreeze_backboneÚkwargsÚparamÚDÚinp_dimr   Zn_totalZn_trainable©Ú	__class__r   r   r      sz   
ü




ý

ù	

ù	


ú
zACTPredictor.__init__c                    s(   t ƒ  |¡ t| dƒr| j |¡| _| S )Nr$   )r   ÚtoÚhasattrr$   )r4   Údevicer:   r   r   r<   f   s   
zACTPredictor.toc           	      C   s´   |j d }| j |¡\}\}}| jjD ]}| jjr!| jj||dnd}|||ƒ}q| jjr@| j |dd…d| jjd …f ¡}n| j |dd…d| jjd …f ¡}|dd…df S )z%Extract CLS token from DINO backbone.r   )ÚHÚWNr   )	Úshaper$   Úprepare_tokens_with_masksÚblocksÚ
rope_embedÚuntie_cls_and_patch_normsÚcls_normÚn_storage_tokensÚnorm)	r4   ÚxÚBÚx_tokensÚH_pÚW_pÚblkÚrope_sincosÚ
x_norm_clsr   r   r   Ú_extract_clsl   s   
&$zACTPredictor._extract_clsNc                 C   s.  |j d }|j}|  |¡}	| ¡ dkr| d¡ |d¡}|| j }
|du r-tj|d|d}|du r9tj|d|d}| ¡ dkrD| d¡}| ¡ dkrS| d¡ |d¡}|dur]|  	|¡}n	tj|| j
|d}tj|	|
|||gdd}|  |¡ || jd¡}|  |¡ || jd¡}|  |¡ || j¡}|||fS )aÄ  
        Args:
            x:                 (B, 3, H, W)
            start_keypoint_2d: (B, 2) or (2,) current EEF pixel in image coords
            current_eef_pos:   (B, 3) current EEF 3D position (normalized to [0,1])
            current_gripper:   (B, 1) or (B,) current gripper state (normalized to [0,1])
            query_pixels:      ignored (kept for interface compatibility)
            clip_embedding:    (B, clip_dim) precomputed CLIP text embedding for task

        Returns:
            pos_pred:     (B, N_WINDOW, 3)  normalized [0,1] position
            rot_pred:     (B, N_WINDOW, 3)  normalized [0,1] rotation
            gripper_pred: (B, N_WINDOW)     normalized [0,1] gripper
        r   r   éÿÿÿÿNr   )r>   )Údim)rA   r>   rQ   rS   Ú	unsqueezeÚexpandr   r!   Úzerosr-   r'   Úcatr0   Úreshaper   r1   r2   )r4   rI   Ústart_keypoint_2dÚcurrent_eef_posÚcurrent_gripperÚquery_pixelsÚclip_embeddingrJ   r>   Ú	cls_tokenZstart_kp_normr-   ÚinpÚpos_predÚrot_predÚgripper_predr   r   r   Úforwardy   s,   




zACTPredictor.forward)NNNN)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚN_WINDOWr   r<   rQ   rc   Ú__classcell__r   r   r:   r   r      s    Iÿr   Ú__main__ÚcudaÚcpur   )r   r   r   r   g      l@r   )rZ   r[   zpos  zrange=[z.3fz, ú]zrot  zgrip )!rg   Úosr!   Útorch.nnr)   Úenvironr(   r   r   ÚDINO_PATCH_SIZErh   ÚModuler   rd   r>   rk   Úis_availableÚmodelr<   ÚrandnrI   ÚtensorÚkpÚeefÚgripÚno_gradÚposÚrotZgrip_outr    rA   ÚminÚmaxr   r   r   r   Ú<module>   s2     

ÿ((,ô