o
    omŠi2  ã                   @   sd  d Z ddlZddlmZ ddlm  mZ dZdZdZ	dZ
dZdZd	Zd	Zd
ZdZdZdZdZG dd„ dejƒZedkr°e ejj ¡ rFdnd¡ZededdZe e¡Ze dddd¡ e¡Ze dd¡ e¡d Z e d¡ e¡d Z!e d¡ e¡Z"e #¡  eee e!e"d\Z$Z%Z&W d  ƒ n1 s—w   Y  e'de$j(ƒ e'de%j(ƒ e'de&j(ƒ dS dS )a?  Motion tracks baseline: image + current (2d, height, gripper) -> 2D location + height + gripper per timestep (camera frame).

Conditioned on current 2D (2), height (1), and gripper (1) concatenated to CLS.
Factorized as 2d (N_WINDOW, 2) + height (N_WINDOW,) + gripper (N_WINDOW,). Same lifting as volume for eval/live.
é    NÚdinov3z?dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   )g
×£p=
ß?gÉv¾Ÿ/Ý?g–C‹lçûÙ?)gZd;ßOÍ?gyé&1¬Ì?gÍÌÌÌÌÌÌ?é   gœiÂö“1¦?gš™™™™™É¿gš™™™™™é?é    é   c                       sL   e Zd ZdZdeddf‡ fdd„	Z‡ fdd„Zd	d
„ Z		ddd„Z‡  Z	S )ÚMotionTracksTrajectoryPredictorz€Predict 2D (camera/image) + height + gripper per timestep. Conditioned on current 2d, height, gripper. Lift to 3D for eval/live.éÀ  Fi   c                    sî   t ƒ  ¡  || _|| _t| _tdƒ tjj	t
ddtd| _|r4| j ¡ D ]}d|_q$| j ¡  tdƒ ntdƒ | jj| _|d | | | _t t | jt |¡t ¡ t d	¡t ||¡t ¡ t d	¡t || j¡¡| _td
| j› dƒ d S )NzLoading DINOv2 model...Údinov3_vits16plusÚlocal)ÚsourceÚweightsFu   âœ“ Frozen DINOv2 backboneu    âœ“ DINOv2 backbone is trainableé   çš™™™™™¹?uX   âœ“ MotionTracks head: [CLS, current_2d, current_height, current_gripper] -> MLP -> (B, ú))ÚsuperÚ__init__Útarget_sizeÚn_windowÚDINO_PATCH_SIZEÚ
patch_sizeÚprintÚtorchÚhubÚloadÚDINO_REPO_DIRÚDINO_WEIGHTS_PATHÚdinoÚ
parametersÚrequires_gradÚevalÚ	embed_dimÚout_dimÚnnÚ
SequentialÚLinearÚCURRENT_STATE_DIMÚGELUÚDropoutÚmlp)Úselfr   r   Úfreeze_backboneÚ
hidden_dimÚparam©Ú	__class__© úH/data/cameron/keygrip/volume_dino_tracks_motion_tracks_baseline/model.pyr      s:   
ü



ù	z(MotionTracksTrajectoryPredictor.__init__c                    s(   t ƒ  |¡ t| dƒr| j |¡| _| S )Nr   )r   ÚtoÚhasattrr   )r)   Údevicer-   r/   r0   r1   A   s   
z"MotionTracksTrajectoryPredictor.toc                 C   sÖ   |j d }| j |¡\}\}}| jjD ]}| jjr!| jj||dnd }|||ƒ}q| jjr[| j |d d …d | jjd …f ¡}| j |d d …| jjd d …f ¡}	t	j
||	gdd}n| j |¡}|d d …df }
|
S )Nr   )ÚHÚWé   ©Údim)Úshaper   Úprepare_tokens_with_masksÚblocksÚ
rope_embedÚuntie_cls_and_patch_normsÚcls_normÚn_storage_tokensÚnormr   Úcat)r)   ÚxÚBÚx_tokensÚH_pÚW_pÚblkÚrope_sincosÚ
x_norm_clsÚx_norm_patchesÚ	cls_tokenr/   r/   r0   Ú_extract_clsG   s   
$$z,MotionTracksTrajectoryPredictor._extract_clsNc	                 C   sJ  |   |¡}	|	jd }
|	j}|du rtj|
d||	jd}|du r(tj|
||	jd}| ¡ dkr3| d¡}| ¡ dkr>| d¡}|du rKtj|
||	jd}| ¡ dkrV| d¡}tj|	|||gdd}|  	|¡}|dd…d| j
d …f  |
| j
d¡}|dd…| j
d | j
d …f  |
| j
¡}|dd…| j
d d…f  |
| j
¡}|||fS )a\  
        Args:
            x: (B, 3, H, W)
            current_2d: (B, 2) current 2D position in image coords. If None, zeros.
            current_height: (B,) or (B, 1) current height (z). If None, zeros.
            current_gripper_state: (B,) or (B, 1) current gripper. If None, zeros.
            start_keypoint_2d: ignored (API compatibility; use current_2d).

        Returns:
            trajectory_2d: (B, N_WINDOW, 2) in image/camera pixel coords
            trajectory_height: (B, N_WINDOW) height (z in world) per timestep
            gripper: (B, N_WINDOW) gripper value per timestep
        r   Nr   )r3   Údtyper6   r7   é   )rL   r9   r3   r   ÚzerosrM   r8   Ú	unsqueezerA   r(   r   Úview)r)   rB   Úgt_target_heatmapÚtrainingÚstart_keypoint_2dÚcurrent_heightÚcurrent_gripperÚ
current_2dÚcurrent_gripper_stateÚclsrC   r3   ÚcondÚoutÚtrajectory_2dÚtrajectory_heightÚgripperr/   r/   r0   ÚforwardV   s*   





&*$
z'MotionTracksTrajectoryPredictor.forward)NFNNNNN)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚN_WINDOWr   r1   rL   r_   Ú__classcell__r/   r/   r-   r0   r      s    #ÿr   Ú__main__ÚmpsÚcpur   T)r   r   r*   r   rN   r   )rW   rU   rX   r\   r]   r^   ))rc   r   Útorch.nnr"   Útorch.nn.functionalÚ
functionalÚFr   r   r   ÚIMAGENET_MEANÚIMAGENET_STDrd   Ú
MIN_HEIGHTÚ
MAX_HEIGHTÚMIN_GRIPPERÚMAX_GRIPPERÚN_HEIGHT_BINSÚN_GRIPPER_BINSr%   ÚModuler   r`   r3   Úbackendsrg   Úis_availableÚmodelr1   ÚrandnrB   ÚrandÚcur_2dÚcur_hÚcur_gripÚno_gradÚt2dÚthÚgripr   r9   r/   r/   r/   r0   Ú<module>   sB    a

ÿô