
    3id1              	          S r SSKrSSKrSSKJr  SSKJs  Jr  SrSr	Sr
SrSrSrSrSrS	 r " S
 S\R"                  5      r\S:X  a  \R(                  " \R*                  R-                  5       (       a  SO,\R.                  R0                  R-                  5       (       a  SOS5      r\" S\SS9r\R5                  \5      r\R6                  " SSSS5      R5                  \5      r\R:                  " 5          \" \S\R<                  " SS/5      SS9u  rr SSS5        \!" S\RD                  5        \!" S\ RD                  5        gg! , (       d  f       N6= f)aj  Model for trajectory volume prediction using SmolVLA (LeRobot).

Uses pretrained SmolVLA: image + language go through the VLM; we take the image token
outputs after language-image self-attention, bilinearly upsample to target resolution,
then 1x1 convs for volume and gripper logits (same API as volume_dino_tracks).
All parameters are fine-tuned (no freezing).
    N   gi1?gɿg?    zlerobot/smolvla_basec                     U R                   S   U:X  a  U $ U R                   S   n[        R                  " XAX2S9nXSS2SU R                   S   24'   U$ )z=Pad state vector to new_dim (batch, dim) -> (batch, new_dim).r   dtypedeviceN)shapetorchzeros)vectornew_dimr	   r   bouts         V/Users/cameronsmith/Projects/robotics_testing/3dkeygrip/volume_tracks_smolvla/model.py_pad_vectorr      sV    ||B7"QA
++a
=C!'V\\"J    c                   t   ^  \ rS rSrSrS\\SS4U 4S jjrU 4S jrS r	S	 r
SS
 jr       SS jrSrU =r$ )TrajectoryHeatmapPredictor&   ae  Predicts pixel-aligned volume and gripper using SmolVLA image features after VLM attention.

- Loads pretrained SmolVLA (e.g. lerobot/smolvla_base), uses image + language prefix only.
- Extracts image token outputs from the VLM, reshapes to 2D, bilinear upsample, then
  same volume/gripper heads as DINO version.
- Fine-tunes all parameters (no freezing).
  F   c                 	  > [         TU ]  5         SS Kn[        S5        SSKJnJnJn	  W
c  UR                  R                  SS 5        OXR                  S'   Xl
        Xl        Xl        X l        XPl        UR                  R                  SS 5      n
SUR                  S'   S n[        S5        [        SU S35        UR                  U5      nUR!                  S	5      nUR"                  U l        UR&                  U l        U R$                  R(                  R*                  U l        U R*                  R,                  U l        U(       aN  U R$                  R/                  5        H
  nS
Ul        M     U R$                  R3                  5         [        S5        O3U R$                  R/                  5        H
  nSUl        M     [        S5        [4        R6                  " 5          [4        R8                  " SSU R&                  R:                  S   U R&                  R:                  S   [=        U R$                  R/                  5       5      R>                  S9nU R$                  R(                  RA                  U5      nURB                  S   U l"        URB                  S   U l#        S S S 5        U R$                  RH                  (       a  SOSU l%        U RJ                  U RD                  -   U l&        [O        [P        RR                  " U RD                  5      5      nUU-  U RD                  :  a  US-  nU=U l*        U l+        U RT                  U RV                  -  U l,        [        SU RD                   SU RT                   SU RV                   SU RF                   35        U RF                  U l-        [\        R^                  " U RZ                  U R                  [`        -  SS9U l1        [\        Rd                  " [4        Rf                  " U RZ                  5      S-  5      U l4        [\        R^                  " U RZ                  U R                  [j        -  SS9U l6        [        SU R                   S[`         SU R                   S[`         S3	5        [        SU R                   S[j         SU R                   S[j         S3	5        g ! , (       d  f       GN= f)Nr   ztring import)SmolVLAPolicyresize_with_padmake_att_2d_masksPYTORCH_MPS_DISABLE1zloading policyzLoading SmolVLA from z$ (on CPU to avoid MPS init hang) ...cpuFu   ✓ SmolVLA backbone frozenTu/   ✓ SmolVLA backbone trainable (full fine-tune)      r	      u   ✓ SmolVLA image tokens: z	 -> grid xz, embed_dim=)kernel_sizeg{Gz?u   ✓ Volume head: (B, *z, H_p, W_p) -> upsample to (B, z, z, H, W)u   ✓ Gripper head: (B, )7super__init__osprint)lerobot.policies.smolvla.modeling_smolvlar   r   r   environpop_resize_with_pad_make_att_2d_maskstarget_sizen_windowmax_lang_lenfrom_pretrainedtomodel	vla_modelconfigvlm_with_expert	processor	tokenizer
parametersrequires_gradevalr   no_gradr   resize_imgs_with_paddingnextr	   embed_imager
   _num_img_embs
_embed_dimadd_image_special_tokens_image_start_ix_image_end_ixintmathsqrt_H_p_W_p_num_spatial	embed_dimnnConv2dN_HEIGHT_BINSvolume_head	Parameterrandnstart_keypoint_embeddingN_GRIPPER_BINSgripper_head)selfr0   r1   pretrained_ckptfreeze_backboner2   r)   r   r   r   _mps_disabled_prev_defaultpolicyp	dummy_imgimg_embs	__class__s                    r   r(   #TrajectoryHeatmapPredictor.__init__/   s    	n	
 	

  JJNN0$70=JJ,- /"3& (
 

'<dC,/

()%o%66Z[\..?5!mm77AA11 ^^..0"' 1NN!/0^^..0"& 1CD ]]_144Q744Q7DNN5578??	I nn44@@KG!(q!1D%mmA.DO  %)NN$K$KqQR!11D4F4FF 		$,,-.q54%%%FA !!	DI II		1*4+=+=*>i		{RSTXT]T]S^^jkokzkzj{|} 99NNMMM)

 )+U[[5PSW5W(X%IINNMMN*

 	%dmm_Am_Dcdhdqdqcrrt  vC  uD  DK  L  	M&t}}oQ~6FFefjfsfsettv  xF  wG  GN  O  	PI _s   B?R44
Sc                    > [         TU ]  U5        [        U S5      (       a   U R                  R                  U5      U l        U $ )Nr6   )r'   r4   hasattrr6   )rW   r	   ra   s     r   r4   TrajectoryHeatmapPredictor.to   s7    
64%%!^^..v6DNr   c                    UR                  5       S:  d  UR                  5       S:  a  [        R                  " / SQUR                  UR
                  S9R                  SSSS5      n[        R                  " / SQUR                  UR
                  S9R                  SSSS5      nX-  U-   R                  SS	5      nU R                  R                  b)  U R                  R                  u  pEU R                  XUS
S9nUS-  S	-
  nU$ )zResize with padding to SmolVLA size and normalize to [-1, 1] (SigLIP).
Accepts either [0, 1] or ImageNet-normalized (0.485/0.229 etc.) input.
g      g      ?)g
ףp=
?gv/?gCl?r	   r   r    r!   )gZd;O?gy&1?g?g        g      ?r   )	pad_valueg       @)minmaxr   tensorr	   r   viewclampr7   r?   r.   )rW   r$   meanstdwhs         r   _preprocess_images-TrajectoryHeatmapPredictor._preprocess_images   s    
 557T>QUUWs]<< 5ahhaggV[[\]_`bcefgD,,4QXXQWWUZZ[\^_abdefC4&&sC0A;;//;;;77DA%%aA%;AGcMr   c           	          [        U[        5      (       a  U/U-  nU R                  USSSU R                  SS9nUS   R	                  U5      nUS   R	                  U5      nXV4$ )zWTokenize task strings to input_ids and attention_mask. task: list of str or single str.pt
max_lengthT)return_tensorspadding
truncationrv   add_special_tokens	input_idsattention_mask)
isinstancestrr:   r2   r4   )rW   taskr	   
batch_sizeencr{   r|   s          r   _tokenize_task)TrajectoryHeatmapPredictor._tokenize_task   s|    dC  6J&Dnn ((#  
 $''/	-.11&9((r   c           	         UR                   S   nUR                  n[        U R                  R	                  5       5      R
                  nU R                  U5      nU/n[        R                  " U[        R                  US9/n	U R                  X%U5      u  pUc9  [        R                  " X@R                  R                  U[        R                  S9nO/[        X0R                  R                  U[        R                  5      nU R                  R!                  XXUS9u  pnU R#                  X5      n[        R$                  " USS9S-
  nU R                  R&                  R)                  UUSUS/SSS	9u  u  n  nUSS2U R*                  U R,                  2SS24   nUR                   S   U R.                  :  a  USS2SU R.                  2SS24   nOSUR                   S   U R.                  :  a6  U R.                  UR                   S   -
  n[0        R2                  " USSSU4SS
9nUR5                  X@R6                  U R8                  U R:                  5      nUR=                  SSSS5      R?                  5       nU$ )z
Run SmolVLA prefix only (image + language + state), return image token outputs
after language-image self-attention. Reshape to (B, D, H_p, W_p).
r   r   Nrg   )stater    )dimF)r|   position_idspast_key_valuesinputs_embeds	use_cachefill_kv_cache)valuer!   r#   ) r
   r	   r@   r6   r;   r   rr   r   onesboolr   r   r7   max_state_dimfloat32r   embed_prefixr/   cumsumr8   forwardrE   rF   rL   FpadreshaperJ   rK   rC   permute
contiguous)rW   r$   r   r   Br	   r   imgimages	img_maskslang_tokens
lang_masksprefix_embsprefix_pad_masksprefix_att_masksprefix_att_2dprefix_position_ids
prefix_out_img_outpad_lenpatch_featuress                         r   #_get_image_features_after_attention>TrajectoryHeatmapPredictor._get_image_features_after_attention   s?   
 GGAJT^^..0177 %%a(ZZFCD	 #'"5"5dA"F =KK;;#<#<VSXS`S`aE{{'@'@&%--XE ;?..:U:U{e ;V ;
7'7 //0@S#ll+;CaG!^^;;CC(, &- D 
Q Q 4 4t7I7I I1LM==d///a!44#4#4!4a78G]]1 1 11'''--*::GeeGaAw%7qAG //!YY		4??K Aq!4??Ar   c	                    UR                   S   n	U R                  XUS9n
U
R                   u  ppUb  UR                  5       S:X  a!  UR                  S5      R	                  U	S5      nUSS2S4   U-  U R
                  -  R                  5       R                  SUS-
  5      nUSS2S4   U-  U R
                  -  R                  5       R                  SUS-
  5      n[        R                  " XR                  S9nU
USS2UU4==   U R                  R                  S5      -  ss'   U R                  U
5      nUR                  XR                  [        X5      n[         R"                  " UR                  XR                  [        -  X5      U R
                  U R
                  4SSS	9nUR                  XR                  [        U R
                  U R
                  5      nU R%                  U
5      n[         R"                  " UU R
                  U R
                  4SSS	9nUR                  XR                  [&        U R
                  U R
                  5      nUU4$ )
aK  
Args:
    x: (B, 3, H, W) RGB in [0, 1]
    start_keypoint_2d: (B, 2) or (2,) optional
    task: str or list of str, language instruction (optional)
    state: (B, state_dim) optional robot state; zeros if None
Returns:
    volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, H, W)
    gripper_logits: (B, N_WINDOW, N_GRIPPER_BINS, H, W)
r   )r   r   Nr    r   r"   bilinearF)sizemodealign_corners)r
   r   r   	unsqueezeexpandr0   longrm   r   aranger	   rT   rQ   rl   r1   rP   r   interpolaterV   rU   )rW   r$   gt_target_heatmaptrainingstart_keypoint_2dcurrent_heightcurrent_gripperr   r   r   r   r   DH_pW_pstart_patch_xstart_patch_ybatch_indicesvolvolume_logitsgripgripper_logitss                         r   r   "TrajectoryHeatmapPredictor.forward   s!   * GGAJAA!V[A\'--c( $$&!+$5$?$?$B$I$I!R$P!.q!t4s:T=M=MMSSU[[\]_bef_fgM.q!t4s:T=M=MMSSU[[\]_bef_fgM!LL3H3HIM=!]MIJdNkNkNuNuvwNxxJ ~.hhq--AHHQ5s@""D$4$45	
 &**1mm]DL\L\^b^n^no   0}}""D$4$45	
 1mm^TEUEUW[WgWghn,,r   )rJ   rK   rC   rF   rE   r/   rB   rL   r.   r7   rM   rV   r2   r1   r9   rT   r0   r:   r6   rQ   ) N)NFNNNr   N)__name__
__module____qualname____firstlineno____doc__N_WINDOWDEFAULT_SMOLVLA_CKPTr(   r4   rr   r   r   r   __static_attributes____classcell__)ra   s   @r   r   r   &   sW     ,_PB)"3p 6- 6-r   r   __main__cudampsr   r   F)r0   r1   rY   r#   r!   g      l@zpick the block)r   r   r   r   r   )#r   rH   r   torch.nnrN   torch.nn.functional
functionalr   r   
MIN_HEIGHT
MAX_HEIGHTMIN_GRIPPERMAX_GRIPPERrP   rU   r   r   Moduler   r   r	   r   is_availablebackendsr   r5   r4   randr$   r>   rk   r   r   r*   r
    r   r   <module>r      s>       
 

 . {- {-| z\\EJJ$;$;$=$=&ENNL^L^LkLkLmLm5sxyF&3[`aEHHVE

1ac"%%f-A	!eu||UTYN?[brs	T 
	/399%	
DJJ' 
 
s   4E
E