o
    i*                     @   s  d Z ddlZddlZddlmZ ddlm  mZ ej	ddZ
ej	ddZdZdZd	Zd
ZdZdZdZdZg dZg dZg dZg dZg dZdZdZdZdZG dd dejZedkre ej!j"# rjdndZ ededdZ$e$%e Z$e&dddd%e Z'e(ded%e Z)e*  e$e'e+ddg%e e)d \Z,Z-Z.Z/W d   n1 sw   Y  e0d!e,j1 e0d"e-j1 e0d#e.j1 e0d$e/j1 dS dS )%ae  Model for trajectory volume prediction using DINOv2.

Predicts a pixel-aligned volume: N_WINDOW x N_HEIGHT_BINS logits per pixel (cross-entropy).
Gripper is per-pixel (N_WINDOW x N_GRIPPER_BINS per pixel): supervised at GT pixel during training,
decoded at predicted pixel during inference (teacher forcing in train, argmax at pred pixel in val/inference).
    NDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3DINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth   )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?   gi1?            ?)n!	r   r   )n!	@r	   r	   )        r
   r
   r   )r   r   r
   )r   r   g       @    @   c                       sX   e Zd ZdZdeedf fdd	Z fddZdd	 Zd
d Z	dd Z
dddZ  ZS )TrajectoryHeatmapPredictorzPredicts pixel-aligned volume (N_WINDOW x N_HEIGHT_BINS per pixel) and per-pixel gripper (N_WINDOW x N_GRIPPER_BINS per pixel).  Fc                    s  t    || _|| _|| _t| _td tj	j
tddtd| _|r7| j D ]}d|_q'| j  td ntd | jj| _td| j  tt| jd	 | _td
| j d | j}ttj||dddt tj||dddt tj||dddt | _td|  tj|| jt dd| _td| j dt d| d| d	 d}|| _tj|| j| dd| _tj|| jd t dd| _td| j d| d| d| d	 td| j dt d| d| d	 d S )NzLoading DINOv2 model...Zdinov3_vits16pluslocal)sourceweightsFu   ✓ Frozen DINOv2 backboneu    ✓ DINOv2 backbone is trainableu   ✓ DINO embedding dim: g{Gz?u,   ✓ Learnable start keypoint embedding (dim=)      )kernel_sizepaddingu1   ✓ Feature convs: 3× Conv2d(3×3) at pred_size=)r   u   ✓ Volume   head → (B, z,    u   ✓ Gripper  head → (B, u   )  [1×1 conv, CE 2-class]u   ✓ Rotation head → (B, z, 3, u   )  [1×1 conv, CE]) super__init__target_size	pred_sizen_windowDINO_PATCH_SIZEZ
patch_sizeprinttorchhubloadr   r   dino
parametersrequires_gradeval	embed_dimnn	Parameterrandnstart_keypoint_embedding
SequentialConv2dGELUfeature_convsN_HEIGHT_BINSvolume_headn_gripper_classesgripper_head
N_ROT_BINSrotation_head)selfr   r   r   freeze_backboneparamDZN_GRIPPER_CLASSES	__class__ 9/data/cameron/567_augmentation_viewpoint_project/model.pyr   '   sP   



$$(z#TrajectoryHeatmapPredictor.__init__c                    s(   t  | t| dr| j|| _| S )Nr"   )r   tohasattrr"   )r5   devicer9   r;   r<   r=   \   s   
zTrajectoryHeatmapPredictor.toc                 C   s  |j d }| j|\}\}}| jjD ]}| jjr!| jj||dnd}|||}q| jjr[| j|ddd| jjd f }| j|dd| jjd df }	t	j
||	gdd}n| j|}|dddf }
|dd| jjd df }||||| j}|dddd }||
fS )zExtract patch features and CLS token.
        Returns:
            patch_features: (B, D, H_p, W_p)
            cls_token: (B, D)
        r   )HWNr   )dimr   r   )shaper"   Zprepare_tokens_with_masksblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensnormr   catreshaper&   permute
contiguous)r5   xBZx_tokensH_pW_pZblkZrope_sincosZ
x_norm_clsZx_norm_patches	cls_tokenZpatch_tokenspatch_featuresr;   r;   r<   _extract_dino_featuresb   s   
$$z1TrajectoryHeatmapPredictor._extract_dino_featuresc                 C   s|   |j \}}}}|j d }|d  d|d }|d  d|d }	tj||jd|d||}
||
dd|	|f S )zIndex spatial feature map at specified pixel locations.

        Args:
            feats:         (B, D, H, W)
            query_pixels:  (B, N, 2) pixel coords [x, y] in feats coordinate space

        Returns:
            indexed: (B, N, D)
        r   .r   r   .r   r?   N)rC   longclampr   aranger?   viewexpand)r5   featsquery_pixelsrK   r8   r@   rA   Npxpy	batch_idxr;   r;   r<   _index_featuresz   s   

 z*TrajectoryHeatmapPredictor._index_featuresc                 C   s  |j d }|j d }| j }}| j}t}|d  d|d }	|d  d|d }
| ||||||}tj	||j
d|d||}tj	||j
dd|||}|||dd|
|	f }| |||d|||}|||dddd|
|	f }||fS )al  Index dense gripper/rotation maps at specified pixel locations.

        Args:
            feats:         (B, D, pred_size, pred_size)
            query_pixels:  (B, N_WINDOW, 2) in pred_size coordinate space

        Returns:
            gripper_logits:  (B, N_WINDOW, 2) logits for [open, close]
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)
        r   r   rQ   rR   rS   Nr   )rC   r   r1   r3   rT   rU   r2   rW   r   rV   r?   rX   r4   )r5   rY   rZ   rK   r[   r@   rA   ZNcZNrr\   r]   Zgrip_mapr^   Ztime_idxgripper_logitsZrot_maprotation_logitsr;   r;   r<   predict_at_pixels   s   


  z,TrajectoryHeatmapPredictor.predict_at_pixelsNc                 C   s@  |j d }| |\}}|j \}}}	}
| dkr"|d|d}|dddf |
 | j  d|
d }|dddf |	 | j  d|	d }tj	||j
d}||dd||f  | jd7  < tj|| j| jfddd}| |}| |}||| jt| j| j}|dur| ||\}}nd }}||||fS )	u+  
        Args:
            x:                  (B, 3, H, W)
            start_keypoint_2d:  (B, 2) or (2,) current EEF pixel in image coords
            query_pixels:       (B, N_WINDOW, 2) pixel coords in pred_size space to query
                                for gripper/rotation.  Pass GT pixels during training;
                                pass predicted pixels (volume argmax) during inference.
                                If None, gripper/rotation logits are not computed.

        Returns:
            volume_logits:   (B, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
            gripper_logits:  (B, N_WINDOW, 2)  or None
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)   or None
            feats:           (B, D, pred_size, pred_size)  — for predict_at_pixels at eval
        r   r   NrS   bilinearF)sizemodealign_corners)rC   rP   rB   	unsqueezerX   r   rT   rU   r   rV   r?   r*   Finterpolater   r.   r0   rW   r   r/   rb   )r5   rJ   start_keypoint_2drZ   rK   rO   rN   _r8   rL   rM   Zstart_patch_xZstart_patch_yZbatch_indicesrY   volvolume_logitsr`   ra   r;   r;   r<   forward   s"   
**$

z"TrajectoryHeatmapPredictor.forward)N)__name__
__module____qualname____doc__	PRED_SIZEN_WINDOWr   r=   rP   r_   rb   ro   __classcell__r;   r;   r9   r<   r   $   s    5!r   __main__mpscpur   T)r   r   r6   r   r   g      l@)rk   rZ   zvolume_logits  zgripper_logits ra   zfeats          )2rs   osr   torch.nnr'   torch.nn.functional
functionalri   environgetr   r   r   IMAGENET_MEANIMAGENET_STDru   
MIN_HEIGHT
MAX_HEIGHTMIN_GRIPPERMAX_GRIPPERMIN_ROTMAX_ROTREF_ROTATION_QUATMIN_POSMAX_POSr/   N_GRIPPER_BINSr3   rt   Moduler   rp   r?   backendsrx   is_availablemodelr=   r)   rJ   zerosZ
fake_queryno_gradtensorrm   ZgriprotrY   r   rC   r;   r;   r;   r<   <module>   sP     8

(