
    8i.              	          d Z ddlZddlZddlmZ ddlmc mZ dZdZ	dZ
dZdZdZdZdZd	 Z G d
 dej                  Zedk    r ej        ej                                        rdn!ej        j                                        rdnd          Z eded          Ze                    e          Z ej        dddd                              e          Z ej                    5   eed ej        ddg          d          \  ZZ ddd           n# 1 swxY w Y    e!dej"                    e!de j"                   dS dS )aj  Model for trajectory volume prediction using SmolVLA (LeRobot).

Uses pretrained SmolVLA: image + language go through the VLM; we take the image token
outputs after language-image self-attention, bilinearly upsample to target resolution,
then 1x1 convs for volume and gripper logits (same API as volume_dino_tracks).
All parameters are fine-tuned (no freezing).
    N   gi1?gɿg?    zlerobot/smolvla_basec                     | j         d         |k    r| S | j         d         }t          j        ||||          }| |ddd| j         d         f<   |S )z=Pad state vector to new_dim (batch, dim) -> (batch, new_dim).r   dtypedeviceN)shapetorchzeros)vectornew_dimr	   r   bouts         V/Users/cameronsmith/Projects/robotics_testing/3dkeygrip/volume_tracks_smolvla/model.py_pad_vectorr      sa    |B7""QA
+af
=
=
=C!'CV\"J    c                   b     e Zd ZdZdeeddf fd	Z fdZd Zd Z	ddZ
	 	 	 	 	 	 	 ddZ xZS )TrajectoryHeatmapPredictoray  Predicts pixel-aligned volume and gripper using SmolVLA image features after VLM attention.

    - Loads pretrained SmolVLA (e.g. lerobot/smolvla_base), uses image + language prefix only.
    - Extracts image token outputs from the VLM, reshapes to 2D, bilinear upsample, then
      same volume/gripper heads as DINO version.
    - Fine-tunes all parameters (no freezing).
      F   c                 l   t                                                       ddlm}m}m} || _        || _        || _        || _	        || _
        t          d| d           |                    |          }	|	j        | _        |	j        | _        | j        j        j        | _        | j        j        | _        |rL| j                                        D ]	}
d|
_        
| j                                         t          d           n2| j                                        D ]	}
d|
_        
t          d           t-          j                    5  t-          j        d	d
| j        j        d         | j        j        d	         t5          | j                                                  j                  }| j        j                            |          }|j        d	         | _        |j        d         | _        d d d            n# 1 swxY w Y   | j        j         rdnd| _!        | j!        | j        z   | _"        tG          tI          j%        | j                            }||z  | j        k     r|d	z  }|x| _&        | _'        | j&        | j'        z  | _(        t          d| j         d| j&         d| j'         d| j                    | j        | _)        tU          j+        | j)        | j	        tX          z  d	          | _-        tU          j.        t-          j/        | j)                  dz            | _0        tU          j+        | j)        | j	        tb          z  d	          | _2        t          d| j	         dtX           d| j	         dtX           d	           t          d| j	         dtb           d| j	         dtb           d	           d S )Nr   )SmolVLAPolicyresize_with_padmake_att_2d_maskszLoading SmolVLA from z ...Fu   ✓ SmolVLA backbone frozenTu/   ✓ SmolVLA backbone trainable (full fine-tune)      r	      u   ✓ SmolVLA image tokens: z	 -> grid xz, embed_dim=)kernel_sizeg{Gz?u   ✓ Volume head: (B, *z, H_p, W_p) -> upsample to (B, z, z, H, W)u   ✓ Gripper head: (B, )3super__init__)lerobot.policies.smolvla.modeling_smolvlar   r   r   _resize_with_pad_make_att_2d_maskstarget_sizen_windowmax_lang_lenprintfrom_pretrainedmodel	vla_modelconfigvlm_with_expert	processor	tokenizer
parametersrequires_gradevalr   no_gradr   resize_imgs_with_paddingnextr	   embed_imager
   _num_img_embs
_embed_dimadd_image_special_tokens_image_start_ix_image_end_ixintmathsqrt_H_p_W_p_num_spatial	embed_dimnnConv2dN_HEIGHT_BINSvolume_head	Parameterrandnstart_keypoint_embeddingN_GRIPPER_BINSgripper_head)selfr(   r)   pretrained_ckptfreeze_backboner*   r   r   r   policyp	dummy_imgimg_embs	__class__s                 r   r$   z#TrajectoryHeatmapPredictor.__init__/   sb    		
 	
 	
 	
 	
 	
 	
 	
 	
 	
 !0"3& (;o;;;<<<..??m7A1  	E^..00 ( ("'N!!!/0000^..00 ' '"&CDDD ]__ 		/ 		/14Q74Q7DN557788?	  I n4@@KKG!(q!1D%mA.DO		/ 		/ 		/ 		/ 		/ 		/ 		/ 		/ 		/ 		/ 		/ 		/ 		/ 		/ 		/ %)N$KRqqQR!1D4FF 	$,--..q54%%%FA !!	DI I	1|4+=||	||TXT]||kokz||}}} 9NMM)
 
 

 )+U[5P5PSW5W(X(X%INMN*
 
 

 	  Ldm  L  Lm  L  Ldhdq  L  L  vC  L  L  L  	M  	M  	M  Ot}  O  O~  O  Ofjfs  O  O  xF  O  O  O  	P  	P  	P  	P  	Ps   	B&G;;G?G?c                     t                                          |           t          | d          r| j                            |          | _        | S )Nr.   )r#   tohasattrr.   )rO   r	   rW   s     r   rY   zTrajectoryHeatmapPredictor.to   sG    

64%% 	7!^..v66DNr   c                    |                                 dk     s|                                dk    rt          j        g d|j        |j                                      dddd          }t          j        g d|j        |j                                      dddd          }||z  |z                       dd	          }| j        j	        (| j        j	        \  }}| 
                    |||d          }|dz  d	z
  }|S )zResize with padding to SmolVLA size and normalize to [-1, 1] (SigLIP).
        Accepts either [0, 1] or ImageNet-normalized (0.485/0.229 etc.) input.
        g      g      ?)g
ףp=
?gv/?gCl?r	   r   r   r   )gZd;O?gy&1?g?g        g      ?Nr   )	pad_valueg       @)minmaxr   tensorr	   r   viewclampr/   r7   r&   )rO   r    meanstdwhs         r   _preprocess_imagesz-TrajectoryHeatmapPredictor._preprocess_images   s    
 5577T>>QUUWWs]]< 5 5 5ahagVVV[[\]_`bcefggD,444QXQWUUUZZ[\^_abdeffCS4&&sC00A;/;;7DAq%%aA%;;AGcMr   c                     t          |t                    r|g|z  }|                     |ddd| j        d          }|d                             |          }|d                             |          }||fS )zWTokenize task strings to input_ids and attention_mask. task: list of str or single str.pt
max_lengthT)return_tensorspadding
truncationrj   add_special_tokens	input_idsattention_mask)
isinstancestrr2   r*   rY   )rO   taskr	   
batch_sizeencro   rp   s          r   _tokenize_taskz)TrajectoryHeatmapPredictor._tokenize_task   s    dC   	'6J&Dnn (#  
 
 $''//	-.11&99.((r    Nc                    |j         d         }|j        }t          | j                                                  j        }|                     |          }|g}t          j        |t          j	        |          g}	| 
                    |||          \  }
}|-t          j        || j        j        |t          j                  }n&t          || j        j        |t          j                  }| j                            ||	|
||          \  }}}|                     ||          }t          j        |d          dz
  }| j        j                            |	                                |d|dgdd	          \  \  }}}|dd| j        | j        ddf         }|j         d         | j        k    r|ddd| j        ddf         }nF|j         d         | j        k     r0| j        |j         d         z
  }t1          j        |ddd|fd
          }|                    || j        | j        | j                  }|                    dddd                                          }|S )z
        Run SmolVLA prefix only (image + language + state), return image token outputs
        after language-image self-attention. Reshape to (B, D, H_p, W_p).
        r   r   Nr\   )stater   )dimT)rp   position_idspast_key_valuesinputs_embeds	use_cachefill_kv_cache)valuer   r   ) r
   r	   r8   r.   r3   r   rg   r   onesboolrv   r   r/   max_state_dimfloat32r   embed_prefixr'   cumsumr0   forwardr=   r>   rD   FpadreshaperB   rC   r;   permute
contiguous)rO   r    rs   ry   Br	   r   imgimages	img_maskslang_tokens
lang_masksprefix_embsprefix_pad_masksprefix_att_masksprefix_att_2dprefix_position_ids
prefix_out_img_outpad_lenpatch_featuress                         r   #_get_image_features_after_attentionz>TrajectoryHeatmapPredictor._get_image_features_after_attention   so   
 GAJT^..00117 %%a((ZFCCCD	 #'"5"5dFA"F"FZ =K4;#<VSXS`aaaEEt{'@&%-XXE ;?.:U:UI{Je ;V ;
 ;
7%'7 //0@BRSS#l+;CCCaG!^;CC(--//, &- D 
 
Q QQQ 4t7I I111LM=d///aaa!44#4!4aaa78GG]1 111''-*::GeGaAw%7qAAAG //!TY	4?KK Aq!44??AAr   c	                    |j         d         }	|                     |||          }
|
                                }
|
j         \  }}}}|
|                                dk    r)|                    d                              |	d          }|dddf         |z  | j        z                                                      d|dz
            }|dddf         |z  | j        z                                                      d|dz
            }t          j
        |	|
j                  }|
|dd||fxx         | j                            d          z  cc<   |                     |
          }|                    |	| j        t           ||          }t#          j        |                    |	| j        t           z  ||          | j        | j        fdd	          }|                    |	| j        t           | j        | j                  }|                     |
          }t#          j        || j        | j        fdd	          }|                    |	| j        t(          | j        | j                  }||fS )
a  
        Args:
            x: (B, 3, H, W) RGB in [0, 1]
            start_keypoint_2d: (B, 2) or (2,) optional
            task: str or list of str, language instruction (optional)
            state: (B, state_dim) optional robot state; zeros if None
        Returns:
            volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, H, W)
            gripper_logits: (B, N_WINDOW, N_GRIPPER_BINS, H, W)
        r   )rs   ry   Nr   r   r   bilinearF)sizemodealign_corners)r
   r   floatrz   	unsqueezeexpandr(   longrb   r   aranger	   rL   rI   ra   r)   rH   r   interpolaterN   rM   )rO   r    gt_target_heatmaptrainingstart_keypoint_2dcurrent_heightcurrent_gripperrs   ry   r   r   r   DH_pW_pstart_patch_xstart_patch_ybatch_indicesvolvolume_logitsgripgripper_logitss                         r   r   z"TrajectoryHeatmapPredictor.forward   sq   * GAJAA!$V[A\\'--//'-1c3( $$&&!++$5$?$?$B$B$I$I!R$P$P!.qqq!t4s:T=MMSSUU[[\]_bef_fggM.qqq!t4s:T=MMSSUU[[\]_bef_fggM!L>3HIIIM=!!!]MIJJJdNkNuNuvwNxNxxJJJ ~..hhq$-SAAHHQ5sC@@"D$45	
 
 
 &**1dm]DL\^b^noo   00}"D$45	
 
 
 1dm^TEUW[Wghhn,,r   )rw   N)NFNNNrw   N)__name__
__module____qualname____doc__N_WINDOWDEFAULT_SMOLVLA_CKPTr$   rY   rg   rv   r   r   __classcell__)rW   s   @r   r   r   &   s          ,PP PP PP PP PP PPd      ) ) )"3 3 3 3p 8- 8- 8- 8- 8- 8- 8- 8-r   r   __main__cudampscpur   F)r(   r)   rQ   r   r   g      l@zpick the block)r   r   rs   r   r   )#r   r@   r   torch.nnrF   torch.nn.functional
functionalr   r   
MIN_HEIGHT
MAX_HEIGHTMIN_GRIPPERMAX_GRIPPERrH   rM   r   r   Moduler   r   r	   r   is_availablebackendsr   r-   rY   randr    r6   r`   r   r   r+   r
    r   r   <module>r      s8                    
 

 .   n- n- n- n- n- n- n- n-b zU\EJ$;$;$=$=x&&ENL^LkLkLmLmCx55sxyyF&&3[`aaaEHHVE
1ac""%%f--A	 t tE!e|u|UTYN?[?[brsss	Tt t t t t t t t t t t t t t t	E/39%%%	E
DJ''''' s   *#DD D