o
    jA;                     @   s  d Z ddlZddlZddlmZ ddlm  mZ ej	ddZ
ej	ddZej	ddZej	d	d
ZdZdZdZdZdZdZdZdZg dZg dZg dZg dZdZdZdZdZG dd dejZe dkre!ej"j#$ rtdndZ!ededd Z%e%&e!Z%e'd!d"dd&e!Z(e)d!ed!&e!Z*e+  e%e(e,d#d#g&e!e*d$\Z-Z.Z/Z0W d   n1 sw   Y  e1d%e-j2 e1d&e.j2 e1d'e/j2 e1d(e0j2 dS dS ))ae  Model for trajectory volume prediction using DINOv2.

Predicts a pixel-aligned volume: N_WINDOW x N_HEIGHT_BINS logits per pixel (cross-entropy).
Gripper is per-pixel (N_WINDOW x N_GRIPPER_BINS per pixel): supervised at GT pixel during training,
decoded at predicted pixel during inference (teacher forcing in train, argmax at pred pixel in val/inference).
    NDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3DINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthDINO_CONVNEXT_WEIGHTS_PATHzx/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_convnext_small_pretrain_lvd1689m-296db49d.pthDINO_BACKBONE
vits16plus   )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?   gi1?            ?)n!	r   r   )n!	@r   r   )r	   r	   g        )r
   r
          @    
   @   c                       sl   e Zd ZdZdeeddfdedB f fddZ fdd	Zd
d Z	dd Z
dd Zdd ZdddZ  ZS )TrajectoryHeatmapPredictorzPredicts pixel-aligned volume (N_WINDOW x N_HEIGHT_BINS per pixel) and per-pixel gripper (N_WINDOW x N_GRIPPER_BINS per pixel).  FNbackbonec           
         sN  t    || _|| _|| _t| _|pt| _| jdkr.t	d t
jjtddtd| _d| _n!| jdkrFt	d t
jjtd	dtd| _d
| _n	td| jd|rk| j D ]}d|_qV| j  t	d| j d n	t	d| j d | jj| _t	d| j  tt
| jd | _t	d| j d | j}ttj||dd
dt tj||dd
dt tj||dd
dt | _t	d|  tj|| jt d
d| _ t	d| j dt d| d| d	 | j| }tj|t!d
d| _"tj|dt# d
d| _$| jdkrdnd}	t	d| j dt! d|	 d  t	d!| j d"t# d|	 d  d S )#Nr   z!Loading DINOv3 ViT-S/16+ model...Zdinov3_vits16pluslocal)sourceweights   convnext_smallz&Loading DINOv3 ConvNeXt-Small model...Zdinov3_convnext_small   zunknown backbone z, (expected 'vits16plus' or 'convnext_small')Fu   ✓ Frozen z	 backboneu   ✓ z backbone is trainableu   ✓ DINO embedding dim: g{Gz?u,   ✓ Learnable start keypoint embedding (dim=)   )kernel_sizepaddingu1   ✓ Feature convs: 3× Conv2d(3×3) at pred_size=)r   u   ✓ Volume   head → (B, z, zfeats||CLS, integer-indexedz!feats only, grid_sample sub-pixelu   ✓ Gripper  head → (B, u   )   [1×1 conv on ]u   ✓ Rotation head → (B, z, 3, )%super__init__target_size	pred_sizen_windowDINO_PATCH_SIZEZ
patch_sizer   r   printtorchhubloadr   r   dinoZ_head_channel_multr   
ValueError
parametersrequires_gradeval	embed_dimnn	Parameterrandnstart_keypoint_embedding
SequentialConv2dGELUfeature_convsN_HEIGHT_BINSvolume_headN_GRIPPER_BINSgripper_head
N_ROT_BINSrotation_head)
selfr!   r"   r#   freeze_backboner   paramDZhead_in_dimZ	_head_tag	__class__ $/data/cameron/para/para_mac/model.pyr    4   sf   





$
"z#TrajectoryHeatmapPredictor.__init__c                    s(   t  | t| dr| j|| _| S )Nr)   )r   tohasattrr)   )r=   devicerA   rC   rD   rE      s   
zTrajectoryHeatmapPredictor.toc                 C   s  |j d }| j|\}\}}| jjD ]}| jjr!| jj||dnd}|||}q| jjr[| j|ddd| jjd f }| j|dd| jjd df }	t	j
||	gdd}n| j|}|dddf }
|dd| jjd df }||||| j}|dddd }||
fS )zViT path: extract patch features and CLS token.
        Returns:
            patch_features: (B, D, H_p, W_p)   stride=patch_size (16)
            cls_token: (B, D)
        r   )HWNr   dimr   r   )shaper)   Zprepare_tokens_with_masksblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensnormr&   catreshaper.   permute
contiguous)r=   xBZx_tokensH_pW_pblkZrope_sincosZ
x_norm_clsZx_norm_patches	cls_tokenpatch_tokenspatch_featuresrC   rC   rD   _extract_dino_features   s   
$$z1TrajectoryHeatmapPredictor._extract_dino_featuresc           	      C   s   |j d }| j|}|d }|d }|j d }tt|d }|| |ks-J d| ||||| j}|dddd }||fS )	u=  ConvNeXt path: take the stride-32 final stage feature map.
        Returns:
            patch_features: (B, D, H_p, W_p)   stride 32 (448 -> 14×14)
            cls_token: (B, D)                   synthetic global-pool; ignored
                                                downstream (kept for API parity)
        r   Zx_norm_clstokenZx_norm_patchtokensr   g      ?z$convnext patch tokens not square: N=r   r   )	rL   r)   Zforward_featuresintroundrP   r.   rQ   rR   )	r=   rS   rT   outrX   rY   NsiderZ   rC   rC   rD   _extract_convnext_features   s   

z5TrajectoryHeatmapPredictor._extract_convnext_featuresc                 C   s|   |j \}}}}|j d }|d  d|d }|d  d|d }	tj||jd|d||}
||
dd|	|f S )zIndex spatial feature map at specified pixel locations.

        Args:
            feats:         (B, D, H, W)
            query_pixels:  (B, N, 2) pixel coords [x, y] in feats coordinate space

        Returns:
            indexed: (B, N, D)
        r   .r   r   .r   rG   N)rL   longclampr&   arangerG   viewexpand)r=   featsquery_pixelsrT   r@   rH   rI   r_   pxpyZ	batch_idxrC   rC   rD   _index_features   s   

 z*TrajectoryHeatmapPredictor._index_featuresc                 C   s   |j \}}}}|j d }| |}| |}	|d  }
|d  }d|
 t|d d d }d| t|d d d }tj||gddd}tj	||dd	d
}tj	|	|dd	d
}|
ddd}|
ddd||dt}||fS )u  Apply gripper/rotation 1×1 conv heads densely, then sample at the
        query pixels via F.grid_sample (bilinear, sub-pixel). head_input is
        (B, C, H, W) where C is 2D (ViT path: feats||CLS) or D (ConvNeXt path:
        feats only).

        Called with GT pixels during training (teacher forcing) and with
        predicted pixels (from volume argmax) during inference. No detach:
        gradients flow back into the shared features.

        Args:
            head_input:    (B, C, pred_size, pred_size)
            query_pixels:  (B, N_WINDOW, 2) in pred_size coordinate space; can
                           be float for sub-pixel queries.

        Returns:
            gripper_logits:  (B, N_WINDOW, N_GRIPPER_BINS)
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)
        r   rb   rc   r   r
   rJ   bilinearT)modealign_cornersr   r   )rL   r:   r<   floatmaxr&   stack	unsqueezeFgrid_samplesqueeze	transposerP   r;   )r=   
head_inputrk   rT   _rH   rI   r_   Zgrip_mapZrot_mapqxqynorm_xZnorm_ygridgriprotgripperrotationrC   rC   rD   predict_at_pixels   s   


z,TrajectoryHeatmapPredictor.predict_at_pixelsc                 C   s  |j d }| jdkr| |\}}n| |\}}|j \}}}	}
| dkr/|d|d}|dddf |
 | j  	d|
d }|dddf |	 | j  	d|	d }t
j||jd}||dd||f  | jd7  < tj|| j| jfddd	}| |}| |}||| jt| j| j}| jdkr|ddddddf dd| j| j}t
j||gdd
}n|}|dur| ||\}}nd }}||||fS )u;  
        Args:
            x:                  (B, 3, H, W)
            start_keypoint_2d:  (B, 2) or (2,) current EEF pixel in image coords
            query_pixels:       (B, N_WINDOW, 2) pixel coords in pred_size space to query
                                for gripper/rotation.  Pass GT pixels during training;
                                pass predicted pixels (volume argmax) during inference.
                                If None, gripper/rotation logits are not computed.

        Returns:
            volume_logits:   (B, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
            gripper_logits:  (B, N_WINDOW, N_GRIPPER_BINS)  or None
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)   or None
            feats:           (B, D, pred_size, pred_size)  — for downstream predict_at_pixels
        r   r   r   ro   Nrd   rp   F)sizerq   rr   rJ   )rL   r   r[   ra   rK   rv   ri   r!   re   rf   r&   rg   rG   r2   rw   interpolater"   r6   r8   rh   r#   r7   rO   r   )r=   rS   start_keypoint_2drk   rT   rZ   rX   r|   r@   rU   rV   Zstart_patch_xZstart_patch_yZbatch_indicesrj   volvolume_logitsZcls_broadcastr{   gripper_logitsrotation_logitsrC   rC   rD   forward   s6   

**$



z"TrajectoryHeatmapPredictor.forward)N)__name__
__module____qualname____doc__	PRED_SIZEN_WINDOWstrr    rE   r[   ra   rn   r   r   __classcell__rC   rC   rA   rD   r   1   s    M&r   __main__mpscpur   T)r!   r#   r>   r   r   g      l@)r   rk   zvolume_logits  zgripper_logits r   zfeats          )3r   osr&   torch.nnr/   torch.nn.functional
functionalrw   environgetr   r   r   r   r$   ZIMAGENET_MEANZIMAGENET_STDr   
MIN_HEIGHT
MAX_HEIGHTMIN_GRIPPERMAX_GRIPPERMIN_ROTMAX_ROTMIN_POSMAX_POSr7   r9   r;   r   Moduler   r   rG   backendsr   is_availablemodelrE   r1   rS   zerosZ
fake_queryno_gradtensorr   r   r   rj   r%   rL   rC   rC   rC   rD   <module>   sX     u

(