o
    ci*                     @   s  d Z ddlZddlZddlmZ ddlm  mZ ej	ddZ
ej	ddZdZdZd	Zd
ZdZdZdZdZg dZg dZg dZg dZg dZdZdZdZdZG dd dejZedkre ej!j"# rjdndZ ededdZ$e$%e Z$e&dddd%e Z'e(ded%e Z)e*  e$e'e+ddg%e e)d \Z,Z-Z.Z/W d   n1 sw   Y  e0d!e,j1 e0d"e-j1 e0d#e.j1 e0d$e/j1 dS dS )%ae  Model for trajectory volume prediction using DINOv2.

Predicts a pixel-aligned volume: N_WINDOW x N_HEIGHT_BINS logits per pixel (cross-entropy).
Gripper is per-pixel (N_WINDOW x N_GRIPPER_BINS per pixel): supervised at GT pixel during training,
decoded at predicted pixel during inference (teacher forcing in train, argmax at pred pixel in val/inference).
    NDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3DINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth   )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?   gi1?            ?)n!	r   r   )n!	@r	   r	   )        r
   r
   r   )r   r   r
   )r   r   g       @    @   c                       sX   e Zd ZdZdeedf fdd	Z fddZdd	 Zd
d Z	dd Z
dddZ  ZS )TrajectoryHeatmapPredictorzPredicts pixel-aligned volume (N_WINDOW x N_HEIGHT_BINS per pixel) and per-pixel gripper (N_WINDOW x N_GRIPPER_BINS per pixel).  Fc                    s  t    || _|| _|| _t| _td tj	j
tddtd| _|r7| j D ]}d|_q'| j  td ntd | jj| _td| j  tt| jd	 | _td
| j d | j}ttj||dddt tj||dddt tj||dddt | _td|  tj|| jt dd| _td| j dt d| d| d	 d}|| _tj|| j| dd| _tj|| jd t dd| _td| j d| d| d| d	 td| j dt d| d| d	 d S )NzLoading DINOv2 model...Zdinov3_vits16pluslocal)sourceweightsFu   ✓ Frozen DINOv2 backboneu    ✓ DINOv2 backbone is trainableu   ✓ DINO embedding dim: g{Gz?u,   ✓ Learnable start keypoint embedding (dim=)      )kernel_sizepaddingu1   ✓ Feature convs: 3× Conv2d(3×3) at pred_size=)r   u   ✓ Volume   head → (B, z,    u   ✓ Gripper  head → (B, u   )  [1×1 conv, CE 2-class]u   ✓ Rotation head → (B, z, 3, u   )  [1×1 conv, CE]) super__init__target_size	pred_sizen_windowDINO_PATCH_SIZEZ
patch_sizeprinttorchhubloadr   r   dino
parametersrequires_gradeval	embed_dimnn	Parameterrandnstart_keypoint_embedding
SequentialConv2dGELUfeature_convsN_HEIGHT_BINSvolume_headn_gripper_classesgripper_head
N_ROT_BINSrotation_head)selfr   r   r   freeze_backboneparamDZN_GRIPPER_CLASSES	__class__ 1/data/cameron/para_droid_pretrain/libero/model.pyr   '   sP   



$$(z#TrajectoryHeatmapPredictor.__init__c                    s(   t  | t| dr| j|| _| S )Nr"   )r   tohasattrr"   )r5   devicer9   r;   r<   r=   \   s   
zTrajectoryHeatmapPredictor.toc                 C   s  |j d }| j|\}\}}| jjD ]}| jjr!| jj||dnd}|||}q| jjr[| j|ddd| jjd f }| j|dd| jjd df }	t	j
||	gdd}n| j|}|dddf }
|dd| jjd df }||||| j}|dddd }||
fS )zExtract patch features and CLS token.
        Returns:
            patch_features: (B, D, H_p, W_p)
            cls_token: (B, D)
        r   )HWNr   )dimr   r   )shaper"   Zprepare_tokens_with_masksblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensnormr   catreshaper&   permute
contiguous)r5   xBZx_tokensH_pW_pblkZrope_sincosZ
x_norm_clsZx_norm_patches	cls_tokenZpatch_tokenspatch_featuresr;   r;   r<   _extract_dino_featuresb   s   
$$z1TrajectoryHeatmapPredictor._extract_dino_featuresc                 C   s|   |j \}}}}|j d }|d  d|d }|d  d|d }	tj||jd|d||}
||
dd|	|f S )zIndex spatial feature map at specified pixel locations.

        Args:
            feats:         (B, D, H, W)
            query_pixels:  (B, N, 2) pixel coords [x, y] in feats coordinate space

        Returns:
            indexed: (B, N, D)
        r   .r   r   .r   r?   N)rC   longclampr   aranger?   viewexpand)r5   featsquery_pixelsrK   r8   r@   rA   Npxpy	batch_idxr;   r;   r<   _index_featuresz   s   

 z*TrajectoryHeatmapPredictor._index_featuresc                 C   s  |j d }|j d }| j }}| j}t}|d  d|d }	|d  d|d }
| ||||||}tj	||j
d|d||}tj	||j
dd|||}|||dd|
|	f }| |||d|||}|||dddd|
|	f }||fS )al  Index dense gripper/rotation maps at specified pixel locations.

        Args:
            feats:         (B, D, pred_size, pred_size)
            query_pixels:  (B, N_WINDOW, 2) in pred_size coordinate space

        Returns:
            gripper_logits:  (B, N_WINDOW, 2) logits for [open, close]
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)
        r   r   rR   rS   rT   Nr   )rC   r   r1   r3   rU   rV   r2   rX   r   rW   r?   rY   r4   )r5   rZ   r[   rK   r\   r@   rA   ZNcZNrr]   r^   Zgrip_mapr_   Ztime_idxgripper_logitsZrot_maprotation_logitsr;   r;   r<   predict_at_pixels   s   


  z,TrajectoryHeatmapPredictor.predict_at_pixelsNc                 C   s@  |j d }| |\}}|j \}}}	}
| dkr"|d|d}|dddf |
 | j  d|
d }|dddf |	 | j  d|	d }tj	||j
d}||dd||f  | jd7  < tj|| j| jfddd}| |}| |}||| jt| j| j}|dur| ||\}}nd }}||||fS )	u+  
        Args:
            x:                  (B, 3, H, W)
            start_keypoint_2d:  (B, 2) or (2,) current EEF pixel in image coords
            query_pixels:       (B, N_WINDOW, 2) pixel coords in pred_size space to query
                                for gripper/rotation.  Pass GT pixels during training;
                                pass predicted pixels (volume argmax) during inference.
                                If None, gripper/rotation logits are not computed.

        Returns:
            volume_logits:   (B, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
            gripper_logits:  (B, N_WINDOW, 2)  or None
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)   or None
            feats:           (B, D, pred_size, pred_size)  — for predict_at_pixels at eval
        r   r   NrT   bilinearF)sizemodealign_corners)rC   rQ   rB   	unsqueezerY   r   rU   rV   r   rW   r?   r*   Finterpolater   r.   r0   rX   r   r/   rc   )r5   rJ   start_keypoint_2dr[   rK   rP   rO   _r8   rL   rM   Zstart_patch_xZstart_patch_yZbatch_indicesrZ   volZvolume_logitsra   rb   r;   r;   r<   forward   s"   
**$

z"TrajectoryHeatmapPredictor.forward)N)__name__
__module____qualname____doc__	PRED_SIZEN_WINDOWr   r=   rQ   r`   rc   ro   __classcell__r;   r;   r9   r<   r   $   s    5!r   __main__mpscpur   T)r   r   r6   r   r   g      l@)rl   r[   zvolume_logits  zgripper_logits rb   zfeats          )2rs   osr   torch.nnr'   Ztorch.nn.functional
functionalrj   environgetr   r   r   IMAGENET_MEANIMAGENET_STDru   Z
MIN_HEIGHTZ
MAX_HEIGHTMIN_GRIPPERMAX_GRIPPERZMIN_ROTZMAX_ROTREF_ROTATION_QUATZMIN_POSZMAX_POSr/   ZN_GRIPPER_BINSr3   rt   Moduler   rp   r?   backendsrx   is_availablemodelr=   r)   rJ   zerosZ
fake_queryno_gradtensorrn   ZgriprotrZ   r   rC   r;   r;   r;   r<   <module>   sP     8

(