o
    ,i6                     @   sB  d Z ddlZddlZddlmZ ddlm  mZ dZdZ	dZ
dZdZdZdZdZd	d
 ZG dd dejZedkrej rEedZnejj rQedZnedZeddeddZeeZeddddeZe  eededdgdd\ZZ W d   n1 sw   Y  e!dej" e!de j" dS dS )u  Model for trajectory volume prediction using SmolVLA (LeRobot).

Uses pretrained SmolVLA: image + language go through the VLM; we take the image token
outputs after language-image self-attention, upsample the feature grid to an intermediate
resolution (e.g. 64×64), then a small conv stack (3×3 → 3×3 → 1×1) for volume and gripper
logits, then upsample to target resolution. All parameters are fine-tuned (no freezing).
    N   gi1?gɿg?    zlerobot/smolvla_basec                 C   sL   | j d |kr	| S | j d }tj||||d}| |ddd| j d f< |S )z=Pad state vector to new_dim (batch, dim) -> (batch, new_dim).r   dtypedeviceN)shapetorchzeros)vectornew_dimr   r   bout r   4/data/cameron/keygrip/volume_tracks_smolvla/model.py_pad_vector   s   
r   c                       sn   e Zd ZdZddeedddf fdd	Z fd	d
Zdd Zdd Z	dddZ
							dddZ  ZS )TrajectoryHeatmapPredictoray  Predicts pixel-aligned volume and gripper using SmolVLA image features after VLM attention.

    - Loads pretrained SmolVLA (e.g. lerobot/smolvla_base), uses image + language prefix only.
    - Extracts image token outputs from the VLM, reshapes to 2D, bilinear upsample, then
      same volume/gripper heads as DINO version.
    - Fine-tunes all parameters (no freezing).
      @   F      c                    sV  t    ddlm}m}	m}
 |	| _|
| _|| _|| _	|| _
|| _td| d ||}|j| _|j| _| jjj| _| jj| _|rX| j D ]}d|_qH| j  td n| j D ]}d|_q]td t 3 tjd	d
| jjd | jjd	 t| j jd}| jj|}|jd	 | _|jd | _ W d    n1 sw   Y  | jj!rdnd| _"| j"| j | _#t$t%&| j}|| | jk r|d	7 }| | _'| _(| j'| j( | _)| j)| jkrtd| j d| j' d| j( d td| j d| j' d| j( d| j   | j | _*t+,t+j-| j*|d
d	dt+j.ddt+j-||d
d	dt+j.ddt+j-|| j
t/ d	d| _0t+1t2| j*d | _3t+,t+j-| j*|d
d	dt+j.ddt+j-||d
d	dt+j.ddt+j-|| j
t4 d	d| _5td| j' d| j( d| j	 d| j	 d| j d| j  td| j	 d| j	 d| j
 dt/ d	 td | j	 d| j	 d| j
 dt4 d	 d S )!Nr   )SmolVLAPolicyresize_with_padmake_att_2d_maskszLoading SmolVLA from z ...Fu   ✓ SmolVLA backbone frozenTu/   ✓ SmolVLA backbone trainable (full fine-tune)      r      u   ⚠ SmolVLA image tokens z% is not a perfect square; using grid xz (may truncate/pad)u   ✓ SmolVLA image tokens: z	 -> grid z, embed_dim=)kernel_sizepadding)inplace)r   g{Gz?u   ✓ Feature grid: patch z -> upsample to u)   , then 3×3→3×3→1×1 -> upsample to u   ✓ Volume head: (B, embed, z, z	) -> (B, *z, H, W)u   ✓ Gripper head: (B, embed, )6super__init__)lerobot.policies.smolvla.modeling_smolvlar   r   r   _resize_with_pad_make_att_2d_maskstarget_sizefeature_grid_sizen_windowmax_lang_lenprintfrom_pretrainedmodel	vla_modelconfigvlm_with_expert	processor	tokenizer
parametersrequires_gradevalr	   no_gradr
   resize_imgs_with_paddingnextr   embed_imager   _num_img_embs
_embed_dimadd_image_special_tokens_image_start_ix_image_end_ixintmathsqrt_H_p_W_p_num_spatial	embed_dimnn
SequentialConv2dReLUN_HEIGHT_BINSvolume_head	Parameterrandnstart_keypoint_embeddingN_GRIPPER_BINSgripper_head)selfr(   r)   r*   pretrained_ckptfreeze_backboner+   head_hidden_dimr   r   r   policyp	dummy_imgimg_embs	__class__r   r   r$   /   s|   









"(



8(,z#TrajectoryHeatmapPredictor.__init__c                    s(   t  | t| dr| j|| _| S )Nr/   )r#   tohasattrr/   )rR   r   r[   r   r   r]      s   
zTrajectoryHeatmapPredictor.toc                 C   s   |  dk s| dkr:tjg d|j|jddddd}tjg d|j|jddddd}|| | dd	}| jj	d
urO| jj	\}}| j
|||dd}|d d	 }|S )zResize with padding to SmolVLA size and normalize to [-1, 1] (SigLIP).
        Accepts either [0, 1] or ImageNet-normalized (0.485/0.229 etc.) input.
        g      g      ?)g
ףp=
?gv/?gCl?r   r   r   r   )gZd;O?gy&1?g?g        g      ?Nr   )	pad_valueg       @)minmaxr	   tensorr   r   viewclampr0   r8   r&   )rR   r   meanstdwhr   r   r   _preprocess_images   s   $$z-TrajectoryHeatmapPredictor._preprocess_imagesc                 C   sP   t |tr
|g| }| j|ddd| jdd}|d |}|d |}||fS )zWTokenize task strings to input_ids and attention_mask. task: list of str or single str.pt
max_lengthT)return_tensorsr    
truncationrl   add_special_tokens	input_idsattention_mask)
isinstancestrr3   r+   r]   )rR   taskr   
batch_sizeencrp   rq   r   r   r   _tokenize_task   s   

z)TrajectoryHeatmapPredictor._tokenize_task Nc                 C   s  |j d }|j}t| j j}| |}|g}tj|tj	|dg}	| 
|||\}
}|du r<tj|| jj|tjd}n
t|| jj|tj}| jj||	|
||d\}}}| ||}tj|ddd }| jjj|	 |d|dgddd	\\}}}|dd| j| jddf }|j d | jkr|ddd| jddf }n|j d | jk r| j|j d  }tj|ddd|fdd
}||| j| j| j}|dddd }|S )z
        Run SmolVLA prefix only (image + language + state), return image token outputs
        after language-image self-attention. Reshape to (B, D, H_p, W_p).
        r   r   Nr_   )stater   )dimT)rq   position_idspast_key_valuesinputs_embeds	use_cachefill_kv_cache)valuer   r   ) r   r   r9   r/   r4   r   rj   r	   onesboolrw   r
   r0   max_state_dimfloat32r   embed_prefixr'   cumsumr1   forwardr>   r?   rE   FpadreshaperC   rD   r<   permute
contiguous)rR   r   rt   ry   Br   r   imgimages	img_maskslang_tokens
lang_masksprefix_embsprefix_pad_masksprefix_att_masksprefix_att_2dprefix_position_ids
prefix_out_img_outpad_lenpatch_featuresr   r   r   #_get_image_features_after_attention   s@   



z>TrajectoryHeatmapPredictor._get_image_features_after_attentionc	                 C   s  |j d }	| j|||d}
|
 }
|
j \}}}}tj|
| j| jfddd}
| j| j}}|dur| dkr?|d|	d}|dddf | | j	 
 d|d }|dddf | | j	 
 d|d }tj|	|
jd	}|
|dd||f  | jd7  < | |
}||	| jt||}tj||	| jt ||| j	| j	fddd}||	| jt| j	| j	}| |
}tj|| j	| j	fddd}||	| jt| j	| j	}||fS )
a  
        Args:
            x: (B, 3, H, W) RGB in [0, 1]
            start_keypoint_2d: (B, 2) or (2,) optional
            task: str or list of str, language instruction (optional)
            state: (B, state_dim) optional robot state; zeros if None
        Returns:
            volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, H, W)
            gripper_logits: (B, N_WINDOW, N_GRIPPER_BINS, H, W)
        r   )rt   ry   bilinearF)sizemodealign_cornersNr   r   r   )r   r   floatr   interpolater)   rz   	unsqueezeexpandr(   longre   r	   aranger   rO   rL   rd   r*   rK   rQ   rP   )rR   r   gt_target_heatmaptrainingstart_keypoint_2dcurrent_heightcurrent_gripperrt   ry   r   r   r   DH_pW_pHgWgstart_patch_xstart_patch_ybatch_indicesvolvolume_logitsgripgripper_logitsr   r   r   r      sH   

**$



z"TrajectoryHeatmapPredictor.forward)rx   N)NFNNNrx   N)__name__
__module____qualname____doc__N_WINDOWDEFAULT_SMOLVLA_CKPTr$   r]   rj   rw   r   r   __classcell__r   r   r[   r   r   &   s,    
]
:r   __main__cudampscpur   r   F)r(   r)   r*   rT   r   r   g      l@zpick the block)r   r   rt   r   r   )#r   rA   r	   torch.nnrG   torch.nn.functional
functionalr   r   
MIN_HEIGHT
MAX_HEIGHTMIN_GRIPPERMAX_GRIPPERrK   rP   r   r   Moduler   r   r   is_availabler   backendsr   r.   r]   randr   r7   rc   r   r   r,   r   r   r   r   r   <module>   s@    
  	



 