o
    Ii=)                     @   s  d Z ddlZddlZddlmZ ddlm  mZ ej	ddZ
ej	ddZdZdZd	Zd
ZdZdZdZdZg dZg dZg dZg dZdZdZdZdZG dd dejZedkreej j!" rfdndZededdZ#e#$eZ#e%dddd$eZ&e'ded$eZ(e)  e#e&e*ddg$ee(d\Z+Z,Z-Z.W d   n1 sw   Y  e/d e+j0 e/d!e,j0 e/d"e-j0 e/d#e.j0 dS dS )$ae  Model for trajectory volume prediction using DINOv2.

Predicts a pixel-aligned volume: N_WINDOW x N_HEIGHT_BINS logits per pixel (cross-entropy).
Gripper is per-pixel (N_WINDOW x N_GRIPPER_BINS per pixel): supervised at GT pixel during training,
decoded at predicted pixel during inference (teacher forcing in train, argmax at pred pixel in val/inference).
    NDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3DINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth   )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?   gi1?            ?)n!	r   r   )n!	@r	   r	   )r   r   g        )r   r   g       @    @   c                       sX   e Zd ZdZdeedf fdd	Z fddZdd	 Zd
d Z	dd Z
dddZ  ZS )TrajectoryHeatmapPredictorzPredicts pixel-aligned volume (N_WINDOW x N_HEIGHT_BINS per pixel) and per-pixel gripper (N_WINDOW x N_GRIPPER_BINS per pixel).  Fc                    s  t    || _|| _|| _t| _td tj	j
tddtd| _|r7| j D ]}d|_q'| j  td ntd | jj| _td| j  tt| jd	 | _td
| j d | j}ttj||dddt tj||dddt tj||dddt | _td|  tj|| jt dd| _td| j dt d| d| d	 tt|t||t t|d| _tt|t||t t|dt | _ td| j d td| j dt d d S )NzLoading DINOv2 model...Zdinov3_vits16pluslocal)sourceweightsFu   ✓ Frozen DINOv2 backboneu    ✓ DINOv2 backbone is trainableu   ✓ DINO embedding dim: g{Gz?u,   ✓ Learnable start keypoint embedding (dim=)      )kernel_sizepaddingu1   ✓ Feature convs: 3× Conv2d(3×3) at pred_size=)r   u   ✓ Volume   head → (B, z, u   ✓ Gripper  MLP  → (B, z+, 1)  [BCE sigmoid, indexed at query pixel]u   ✓ Rotation MLP  → (B, z, 3, z )   [CE, indexed at query pixel])!super__init__target_size	pred_sizen_windowDINO_PATCH_SIZEZ
patch_sizeprinttorchhubloadr   r   dino
parametersrequires_gradeval	embed_dimnn	Parameterrandnstart_keypoint_embedding
SequentialConv2dGELUfeature_convsN_HEIGHT_BINSvolume_head	LayerNormLineargripper_mlp
N_ROT_BINSrotation_mlp)selfr   r   r   freeze_backboneparamD	__class__ "/data/cameron/para/libero/model.pyr   &   sT   



$"&z#TrajectoryHeatmapPredictor.__init__c                    s(   t  | t| dr| j|| _| S )Nr    )r   tohasattrr    )r4   devicer8   r:   r;   r<   `   s   
zTrajectoryHeatmapPredictor.toc                 C   s  |j d }| j|\}\}}| jjD ]}| jjr!| jj||dnd}|||}q| jjr[| j|ddd| jjd f }| j|dd| jjd df }	t	j
||	gdd}n| j|}|dddf }
|dd| jjd df }||||| j}|dddd }||
fS )zExtract patch features and CLS token.
        Returns:
            patch_features: (B, D, H_p, W_p)
            cls_token: (B, D)
        r   )HWNr   )dimr      )shaper    Zprepare_tokens_with_masksblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensnormr   catreshaper$   permute
contiguous)r4   xBZx_tokensH_pW_pZblkZrope_sincosZ
x_norm_clsZx_norm_patches	cls_tokenZpatch_tokenspatch_featuresr:   r:   r;   _extract_dino_featuresf   s   
$$z1TrajectoryHeatmapPredictor._extract_dino_featuresc                 C   s|   |j \}}}}|j d }|d  d|d }|d  d|d }	tj||jd|d||}
||
dd|	|f S )zIndex spatial feature map at specified pixel locations.

        Args:
            feats:         (B, D, H, W)
            query_pixels:  (B, N, 2) pixel coords [x, y] in feats coordinate space

        Returns:
            indexed: (B, N, D)
        r   ).r   r   ).r   r>   N)rC   longclampr   aranger>   viewexpand)r4   featsquery_pixelsrK   r7   r?   r@   NpxpyZ	batch_idxr:   r:   r;   _index_features~   s   

 z*TrajectoryHeatmapPredictor._index_featuresc           
      C   sh   |j dd \}}| | |}||| | j}| |||}| |}|||dt}	||	fS )a  Apply gripper/rotation MLP heads at the specified pixel locations.

        Called with GT pixels during training (teacher forcing) and with predicted
        pixels (from volume argmax) during inference.

        Args:
            feats:         (B, D, pred_size, pred_size)
            query_pixels:  (B, N_WINDOW, 2) in pred_size coordinate space

        Returns:
            gripper_logits:  (B, N_WINDOW) raw logits (apply sigmoid for probability)
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)
        NrB   r   )rC   r\   detachrG   r$   r1   r3   r2   )
r4   rW   rX   rK   rY   indexedflatZgripperZrot_flatrotationr:   r:   r;   predict_at_pixels   s   
z,TrajectoryHeatmapPredictor.predict_at_pixelsNc                 C   s@  |j d }| |\}}|j \}}}	}
| dkr"|d|d}|dddf |
 | j  d|
d }|dddf |	 | j  d|	d }tj	||j
d}||dd||f  | jd7  < tj|| j| jfddd}| |}| |}||| jt| j| j}|dur| ||\}}nd }}||||fS )	u;  
        Args:
            x:                  (B, 3, H, W)
            start_keypoint_2d:  (B, 2) or (2,) current EEF pixel in image coords
            query_pixels:       (B, N_WINDOW, 2) pixel coords in pred_size space to query
                                for gripper/rotation.  Pass GT pixels during training;
                                pass predicted pixels (volume argmax) during inference.
                                If None, gripper/rotation logits are not computed.

        Returns:
            volume_logits:   (B, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
            gripper_logits:  (B, N_WINDOW, N_GRIPPER_BINS)  or None
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)   or None
            feats:           (B, D, pred_size, pred_size)  — for downstream predict_at_pixels
        r   r   NrQ   bilinearF)sizemodealign_corners)rC   rP   rA   	unsqueezerV   r   rR   rS   r   rT   r>   r(   Finterpolater   r,   r.   rU   r   r-   ra   )r4   rJ   start_keypoint_2drX   rK   rO   rN   _r7   rL   rM   Zstart_patch_xZstart_patch_yZbatch_indicesrW   volvolume_logitsgripper_logitsrotation_logitsr:   r:   r;   forward   s"   
**$

z"TrajectoryHeatmapPredictor.forward)N)__name__
__module____qualname____doc__	PRED_SIZEN_WINDOWr   r<   rP   r\   ra   rp   __classcell__r:   r:   r8   r;   r   #   s    :r   __main__mpscpur   T)r   r   r5   rB   r   g      l@)rj   rX   zvolume_logits  zgripper_logits ro   zfeats          )1rt   osr   torch.nnr%   torch.nn.functional
functionalrh   environgetr   r   r   IMAGENET_MEANIMAGENET_STDrv   
MIN_HEIGHT
MAX_HEIGHTMIN_GRIPPERMAX_GRIPPERMIN_ROTMAX_ROTMIN_POSMAX_POSr-   N_GRIPPER_BINSr2   ru   Moduler   rq   r>   backendsry   is_availablemodelr<   r'   rJ   zerosZ
fake_queryno_gradtensorrl   ZgriprotrW   r   rC   r:   r:   r:   r;   <module>   sN     3

(