o
    ÉlŠiƒ&  ã                   @   sj  d Z ddlZddlZddlmZ ddlm  mZ dZdZ	dZ
dZdZdZd	Zd	Zd
ZdZdZdZdZdZd#dd„ZG dd„ dejƒZG dd„ dejƒZedkr³e ejj ¡ rZdnd¡ZededdZe  e¡Ze !dddd¡  e¡Z"e !dd¡  e¡d Z#e $d¡  e¡Z%e &¡  ee"de#e%d \Z'Z(W d  ƒ n1 s w   Y  e)d!e'j*ƒ e)d"e(j*ƒ dS dS )$aH  Diffusion policy baseline: image + current robot state -> diffusion over (3D + gripper) state, N~10 steps, global robot frame.

Conditioned on current 3D position (3) and gripper state (1) concatenated to CLS.
State = (trajectory_3d flattened, gripper flattened) = (N_WINDOW*4,). Condition = [CLS, current_3d, current_gripper].
é    NÚdinov3z?dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   )g
×£p=
ß?gÉv¾Ÿ/Ý?g–C‹lçûÙ?)gZd;ßOÍ?gyé&1¬Ì?gÍÌÌÌÌÌÌ?é   gœiÂö“1¦?gš™™™™™É¿gš™™™™™é?é    é
   é   çü©ñÒMb€?c                 C   st   | d }t  d| |¡}t  ||  | d|  tj d ¡d }||d  }d|dd… |dd…   }t  |dd¡S )	z-Cosine schedule as proposed in improved DDPM.é   r   g      à?é   Néÿÿÿÿg-Cëâ6?g§èH.ÿï?)ÚtorchÚlinspaceÚcosÚmathÚpiÚclip)Ú	timestepsÚsÚstepsÚxÚalphas_cumprodÚbetas© r   úD/data/cameron/keygrip/volume_dino_tracks_diffusion_baseline/model.pyÚcosine_beta_schedule   s   (r   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚSinusoidalPositionEmbeddingsc                    s   t ƒ  ¡  || _d S )N)ÚsuperÚ__init__Údim)Úselfr   ©Ú	__class__r   r   r   )   s   

z%SinusoidalPositionEmbeddings.__init__c                 C   s‚   |j }| jd }t d¡|d  }t tj||d ¡ |  ¡}|d d …d f  ¡ |d d d …f  }tj| 	¡ | 
¡ gdd}|S )Nr
   i'  r	   ©Údevicer   ©r   )r#   r   r   Úlogr   ÚexpÚarangeÚfloatÚcatÚsinr   )r   Útimer#   Úhalf_dimÚ
embeddingsr   r   r   Úforward-   s   
$z$SinusoidalPositionEmbeddings.forward)Ú__name__Ú
__module__Ú__qualname__r   r.   Ú__classcell__r   r   r    r   r   (   s    r   c                       sj   e Zd ZdZdeddedf‡ fdd„	Z‡ fdd	„Zd
d„ Zdd„ Z	dd„ Z
dd„ Z			ddd„Z‡  ZS )ÚDiffusionTrajectoryPredictorzcDiffusion over (trajectory_3d, gripper) state conditioned on image. Global robot frame. N~10 steps.éÀ  Fi   é@   c                    s†  t ƒ  ¡  || _|| _|d | _|| _t| _tdƒ t	j
jtddtd| _|r<| j ¡ D ]}d|_q,| j ¡  tdƒ ntdƒ | jj| _t | jt |¡| _t|ƒ}d	| }	t	j|	d
d}
|  d|¡ |  d|	¡ |  d|
¡ |  dt	 |
¡¡ |  dt	 d	|
 ¡¡ t|ƒ| _t t | j| | |d ¡t ¡ t d¡t |d |¡t ¡ t d¡t || j¡¡| _ td| j› d|› dƒ d S )Nr   zLoading DINOv2 model...Údinov3_vits16plusÚlocal)ÚsourceÚweightsFu   âœ“ Frozen DINOv2 backboneu    âœ“ DINOv2 backbone is trainableg      ð?r   r$   r   Úalphasr   Úsqrt_alphas_cumprodÚsqrt_one_minus_alphas_cumprodr
   çš™™™™™¹?u   âœ“ Diffusion: state_dim=z, num_steps=z), cond=[CLS, current_3d, current_gripper])!r   r   Útarget_sizeÚn_windowÚ	state_dimÚ	num_stepsÚDINO_PATCH_SIZEÚ
patch_sizeÚprintr   ÚhubÚloadÚDINO_REPO_DIRÚDINO_WEIGHTS_PATHÚdinoÚ
parametersÚrequires_gradÚevalÚ	embed_dimÚnnÚLinearÚCURRENT_STATE_DIMÚ	cond_projr   ÚcumprodÚregister_bufferÚsqrtr   Út_embedÚ
SequentialÚGELUÚDropoutÚdenoiser)r   r>   r?   Úfreeze_backboneÚ
hidden_dimrA   Út_embed_dimÚparamr   r:   r   r    r   r   r   :   sP   

ü



ù	z%DiffusionTrajectoryPredictor.__init__c                    s(   t ƒ  |¡ t| dƒr| j |¡| _| S )NrI   )r   ÚtoÚhasattrrI   )r   r#   r    r   r   r^   m   s   
zDiffusionTrajectoryPredictor.toc                 C   sÖ   |j d }| j |¡\}\}}| jjD ]}| jjr!| jj||dnd }|||ƒ}q| jjr[| j |d d …d | jjd …f ¡}| j |d d …| jjd d …f ¡}	t	j
||	gdd}n| j |¡}|d d …df }
|
S )Nr   )ÚHÚWr	   r$   )ÚshaperI   Úprepare_tokens_with_masksÚblocksÚ
rope_embedÚuntie_cls_and_patch_normsÚcls_normÚn_storage_tokensÚnormr   r)   )r   r   ÚBÚx_tokensÚH_pÚW_pÚblkÚrope_sincosÚ
x_norm_clsÚx_norm_patchesÚ	cls_tokenr   r   r   Ú_extract_clss   s   
$$z)DiffusionTrajectoryPredictor._extract_clsc                 C   s\   |j d }|dd…d| jd …f  || jd¡}|dd…| jd d…f  || j¡}||fS )zNstate (B, state_dim) -> trajectory_3d (B, N_WINDOW, 3), gripper (B, N_WINDOW).r   Né   )rb   r?   Úview)r   Ústaterj   ÚtrajÚgripr   r   r   Ú_state_to_trajectory_gripper‚   s   
&$z9DiffusionTrajectoryPredictor._state_to_trajectory_gripperc                 C   s4   |j d }| |d¡}| |d¡}tj||gddS )z2(B, N_WINDOW, 3), (B, N_WINDOW) -> (B, state_dim).r   r   r	   r$   )rb   Úreshaper   r)   )r   Útrajectory_3dÚgripperrj   Ú	traj_flatÚ	grip_flatr   r   r   Ú_trajectory_gripper_to_state‰   s   
z9DiffusionTrajectoryPredictor._trajectory_gripper_to_statec                 C   sÖ   |   |¡}tj|||gdd}|  |¡}| j|  dd¡}| j|  dd¡}| j|  dd¡}	||	|  | }
|dkre| j|d   dd¡}t 	|¡}tj
||jd}||	|  | }|||  }||
fS |
}||
fS )zMSingle denoise step: predict noise and return x_{t-1} (or x_0 for last step).r	   r$   r   r   r"   )rU   r   r)   rY   r   ru   r;   r<   r   rT   Ú
randn_liker#   )r   Úx_tÚtÚcondrU   Údenoiser_inÚ
noise_predÚalpha_tÚ
sqrt_alphaÚsqrt_one_minusÚx0_predÚbeta_tÚsigma_tÚnoiseÚx_prevr   r   r   Ú_denoise_step   s    


ÿz*DiffusionTrajectoryPredictor._denoise_stepNc
                 C   s¼  |   |¡}
|
jd }|
j}|du rtj|d||
jd}|	du r(tj|||
jd}	|	 ¡ dkr3|	 d¡}	tj|
||	gdd}|  	|¡}|r£|dur£|dur£|  
||¡}|jd }tjd| j|f|jtjd}tj||jd}| j|  dd¡}| j|  dd¡}|| ||  }|  |¡}tj|||gdd}|  |¡}t ||¡}|S |jd }tj|| j|j|jd}tt| jƒƒD ]}tj|f||jtjd}|  |||¡\}}q»|  |¡\}}||fS )	aö  
        Args:
            x: (B, 3, H, W)
            current_3d: (B, 3) current gripper 3D in world. If None, zeros.
            current_gripper_state: (B,) or (B, 1) current gripper. If None, zeros.
            gt_trajectory_3d, gt_gripper: for training only
            training: if True, compute diffusion loss; else sample and return (trajectory_3d, gripper)

        Returns:
            If training: loss (scalar)
            Else: trajectory_3d (B, N_WINDOW, 3), gripper (B, N_WINDOW)
        r   Nrt   )r#   Údtyper	   r$   r"   r   )rs   rb   r#   r   Úzerosr   r   Ú	unsqueezer)   rQ   r   ÚrandintrA   Úlongr€   r;   ru   r<   rU   rY   ÚFÚmse_lossÚrandnr@   ÚreversedÚrangeÚfullrŽ   ry   )r   r   Úgt_trajectory_3dÚ
gt_gripperÚtrainingÚstart_keypoint_2dÚcurrent_heightÚcurrent_gripperÚ
current_3dÚcurrent_gripper_stateÚclsrj   r#   rƒ   Úx0r‚   rŒ   r‡   rˆ   r   rU   r„   r…   ÚlossÚt_batchÚ_r{   r|   r   r   r   r.   ¤   s>   







z$DiffusionTrajectoryPredictor.forward)NNFNNNNN)r/   r0   r1   Ú__doc__ÚN_WINDOWÚNUM_DIFFUSION_STEPSr   r^   rs   ry   r   rŽ   r.   r2   r   r   r    r   r3   7   s    ÿ3þr3   Ú__main__ÚmpsÚcpur4   T)r>   r?   rZ   r
   rt   r=   F)rœ   r    r¡   r{   r|   )r   )+r§   r   r   Útorch.nnrN   Útorch.nn.functionalÚ
functionalr”   rG   rH   rB   ÚIMAGENET_MEANÚIMAGENET_STDr¨   Ú
MIN_HEIGHTÚ
MAX_HEIGHTÚMIN_GRIPPERÚMAX_GRIPPERÚN_HEIGHT_BINSÚN_GRIPPER_BINSr©   rP   r   ÚModuler   r3   r/   r#   Úbackendsr«   Úis_availableÚmodelr^   r–   r   Úcur_3dÚrandÚcur_gripÚno_gradrw   rx   rD   rb   r   r   r   r   Ú<module>   sH    

 !

ÿö