o
    
i                     @   s   d Z ddlZddlmZ ddlm  mZ dZdZdZ	dZ
dZdZd	Zd	Zd
ZdZdZdZG dd dejZedkreejj rDdndZededdZeeZeddddeZe  eededdgd\Z Z!W d   n1 s{w   Y  e"de j# e"de!j# dS dS )ae  Model for trajectory volume prediction using DINOv2.

Predicts a pixel-aligned volume: N_WINDOW x N_HEIGHT_BINS logits per pixel (cross-entropy).
Gripper is per-pixel (N_WINDOW x N_GRIPPER_BINS per pixel): supervised at GT pixel during training,
decoded at predicted pixel during inference (teacher forcing in train, argmax at pred pixel in val/inference).
    Ndinov3z?dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth   )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?   gi1?gɿg?    c                       sF   e Zd ZdZdedf fdd	Z fddZdd	 ZdddZ  Z	S )TrajectoryHeatmapPredictorzPredicts pixel-aligned volume (N_WINDOW x N_HEIGHT_BINS per pixel) and per-pixel gripper (N_WINDOW x N_GRIPPER_BINS per pixel).  Fc              
      s8  t    || _|| _t| _td tjj	t
ddtd| _|r4| j D ]}d|_q$| j  td ntd | jj| _td| j  tj| j| jt d	d
| _td| j dt d| j dt d	 tt| jd | _td| j d tj| j| jt d	d
| _td| j dt d| j dt d	 d S )NzLoading DINOv2 model...dinov3_vits16pluslocal)sourceweightsFu   ✓ Frozen DINOv2 backboneu    ✓ DINOv2 backbone is trainableu   ✓ DINO embedding dim:    )kernel_sizeu   ✓ Volume head: (B, *z, H_p, W_p) -> upsample to (B, z, z, H, W)g{Gz?u,   ✓ Learnable start keypoint embedding (dim=)u"   ✓ Gripper head (per-pixel): (B, )super__init__target_sizen_windowDINO_PATCH_SIZE
patch_sizeprinttorchhubloadDINO_REPO_DIRDINO_WEIGHTS_PATHdino
parametersrequires_gradeval	embed_dimnnConv2dN_HEIGHT_BINSvolume_head	Parameterrandnstart_keypoint_embeddingN_GRIPPER_BINSgripper_head)selfr   r   freeze_backboneparam	__class__ 1/data/cameron/keygrip/volume_dino_tracks/model.pyr      sB   



&*z#TrajectoryHeatmapPredictor.__init__c                    s(   t  | t| dr| j|| _| S )Nr   )r   tohasattrr   )r*   devicer-   r/   r0   r1   I   s   
zTrajectoryHeatmapPredictor.toc                 C   s  |j d }| j|\}\}}| jjD ]}| jjr!| jj||dnd}|||}q| jjr[| j|ddd| jjd f }| j|dd| jjd df }	t	j
||	gdd}n| j|}|dddf }
|dd| jjd df }||||| j}|dddd }||
fS )zExtract patch features and CLS token.
        Returns:
            patch_features: (B, D, H_p, W_p)
            cls_token: (B, D)
        r   )HWNr   )dim      )shaper   prepare_tokens_with_masksblocks
rope_embeduntie_cls_and_patch_normscls_normn_storage_tokensnormr   catreshaper    permute
contiguous)r*   xBx_tokensH_pW_pblkrope_sincos
x_norm_clsx_norm_patches	cls_tokenpatch_tokenspatch_featuresr/   r/   r0   _extract_dino_featuresO   s   
$$z1TrajectoryHeatmapPredictor._extract_dino_featuresNc                 C   sr  |j d }| |\}}	|j \}
}}}| dkr"|d|d}|dddf | | j  d|d }|dddf | | j  d|d }tj	||j
d}||dd||f  | jd7  < | |}||| jt||}tj||| jt ||| j| jfddd}||| jt| j| j}| |}tj|| j| jfddd}||| jt| j| j}||fS )	a  
        Args:
            x: (B, 3, H, W)
            start_keypoint_2d: (B, 2) or (2,) optional
            current_height, current_gripper: ignored (kept for API compatibility)

        Returns:
            volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, H, W)
            gripper_logits: (B, N_WINDOW, N_GRIPPER_BINS, H, W)  # per-pixel; index at GT pixel (train) or pred pixel (inference)
        r   r   N)r3   bilinearF)sizemodealign_corners)r9   rQ   r6   	unsqueezeexpandr   longclampr   aranger3   r'   r$   viewr   r#   Finterpolater)   r(   )r*   rE   gt_target_heatmaptrainingstart_keypoint_2dcurrent_heightcurrent_gripperrF   rP   rN   _DrH   rI   start_patch_xstart_patch_ybatch_indicesvolvolume_logitsgripgripper_logitsr/   r/   r0   forwardg   s4   
**$



z"TrajectoryHeatmapPredictor.forward)NFNNN)
__name__
__module____qualname____doc__N_WINDOWr   r1   rQ   rm   __classcell__r/   r/   r-   r0   r      s    +r   __main__mpscpur   T)r   r   r+   r8   r7   Fg      l@)r`   ra   rj   rl   )$rq   r   torch.nnr!   torch.nn.functional
functionalr]   r   r   r   IMAGENET_MEANIMAGENET_STDrr   
MIN_HEIGHT
MAX_HEIGHTMIN_GRIPPERMAX_GRIPPERr#   r(   Moduler   rn   r3   backendsru   is_availablemodelr1   r&   rE   no_gradtensorri   rk   r   r9   r/   r/   r/   r0   <module>   s8    y

