o
    –€Äiý-  ã                   @   sš  d Z ddlZddlmZ ddlm  mZ dZdZdZ	dZ
G dd„ dejƒZedkrËe ej ¡ r3d	nd
¡Zede	ddZe e¡Ze dddd¡ e¡Ze ddg¡ e¡Ze dd¡ e¡Ze dd¡ e¡ZddgZe ¡  eeeeeed\ZZZW d  ƒ n1 sˆw   Y  edejde  ¡ d›de !¡ d›dƒ edejde  ¡ d›de !¡ d›dƒ edejde  ¡ d›de !¡ d›dƒ dS dS )uÞ  InternVL VLA baseline â€” action-token decoding via the LLM (pi0-style).

Same InternVL2.5-1B backbone as model_vla_internvl.py, but instead of PARA
pixel-aligned heatmaps, appends learnable action query tokens + a proprioception
token to the LLM input sequence:

    [image_patches, text_tokens, proprio_token, action_0, ..., action_{N-1}]

The LLM's causal self-attention lets each action token attend to all image
patches, the task description, proprioception, and all *preceding* action
tokens â€” giving autoregressive action prediction for free.  The hidden states
at the action token positions are then decoded by lightweight per-timestep
heads into normalised [0,1] position / rotation / gripper values (MSE loss).

This is the standard modern VLA formulation (pi0, RT-2, Octo) and serves as
the non-PARA baseline for comparing action representations on the same backbone.

Usage:
    python train.py --model_type internvl_act --task_ids all --cache_root /data/libero/parsed_libero
é    N)g
×£p=
ß?gÉv¾Ÿ/Ý?g–C‹lçûÙ?)gZd;ßOÍ?gyé&1¬Ì?gÍÌÌÌÌÌÌ?é   c                       sP   e Zd ZdZdeddf‡ fdd„	Zdd„ Zd	d
„ Zdd„ Z		ddd„Z	‡  Z
S )ÚInternVLACTPredictorz.InternVL2.5-1B + action-token VLA (pi0-style).éÀ  FzOpenGVLab/InternVL2_5-1Bc              	      sÞ  t ƒ  ¡  || _|| _d| _td|› ƒ ddlm}m} |j	|t
jdd| _|j	|dd| _|rF| j ¡ D ]}d|_q6| j ¡  td	ƒ ntd
ƒ |  dd¡| _|  dd¡| _|  d¡| _| jj}	t|	dt|	dd ƒƒ}
|
j | _}|	j}t|ddƒ| _t|	ddƒ| _td|› ƒ td| j› d| j› ƒ |  dt
jtt
jd  ddd¡¡ |  dt
jt!t
jd  ddd¡¡ |  dt
jtt
jd  ddd¡¡ |  dt
jt!t
jd  ddd¡¡ t" #t
 $||¡d  ¡| _%t" &t" 't(|¡t" )¡ t" '||¡¡| _*t" &t" +|¡t" '||¡t" )¡ t" '|d¡t" ,¡ ¡| _-t" &t" +|¡t" '||¡t" )¡ t" '|d¡t" ,¡ ¡| _.t" &t" +|¡t" '||¡t" )¡ t" '|d¡t" ,¡ ¡| _/td!|› d"|› ƒ td#t(› d$|› ƒ td%|d › d&ƒ td'ƒ td(ƒ td)ƒ d S )*NZinternvl_actzLoading InternVL model: r   )Ú	AutoModelÚAutoTokenizerT)Ztorch_dtypeÚtrust_remote_code)r   Fz  Frozen InternVL backbonez   InternVL backbone is trainableZvision_modelZvisualZmlp1Zmulti_modal_projectorZlanguage_modelZ
llm_configZtext_configZ
image_sizer   Údownsample_ratiog      à?z  VLM hidden dim: z
  Vision: zpx, downsample Úvlm_mean)Údtypeé   é   Úvlm_stdÚ	inet_meanÚinet_stdg{®Gáz”?z  Action tokens: z x z  Proprio proj:  z -> z4  Sequence: [img_patches, text, proprio, act_0..act_ú]z+  pos_head:     per-token -> (3,) [sigmoid]z+  rot_head:     per-token -> (3,) [sigmoid]z+  gripper_head: per-token -> (1,) [sigmoid])0ÚsuperÚ__init__Útarget_sizeÚn_windowZ
model_typeÚprintZtransformersr   r   Zfrom_pretrainedÚtorchZbfloat16ÚvlmÚ
_tokenizerZ
parametersZrequires_gradÚevalÚ
_find_attrÚ_vision_encÚ
_projectorÚ_lmZconfigÚgetattrZhidden_sizeZvlm_hidden_dimZvision_configÚvlm_image_sizer   Zregister_bufferÚtensorÚIMAGENET_MEANZfloat32ÚviewÚIMAGENET_STDÚnnZ	ParameterÚrandnÚaction_tokensZ
SequentialZLinearÚPROPRIO_DIMZGELUÚproprio_projZ	LayerNormZSigmoidÚpos_headÚrot_headÚgripper_head)Úselfr   r   Úfreeze_backboneZ
model_nameÚkwargsr   r   ÚpZcfgZllm_cfgÚDZvis_cfg©Ú	__class__© ú3/data/cameron/para/libero/model_vla_internvl_act.pyr   $   s„   
ÿÿ

""""

ý

û

û

ûzInternVLACTPredictor.__init__c              	   G   sX   |D ]}t | j|ƒrt| j|ƒ  S qtd|› dt| jƒj› ddd„ t| jƒD ƒ› ƒ‚)NzCannot find any of z	 on VLM (z). Available: c                 S   s   g | ]	}|  d ¡s|‘qS )Ú_)Ú
startswith)Ú.0Úar3   r3   r4   Ú
<listcomp>†   s    z3InternVLACTPredictor._find_attr.<locals>.<listcomp>)Úhasattrr   r   ÚAttributeErrorÚtypeÚ__name__Údir)r,   ÚnamesÚnr3   r3   r4   r   €   s   ÿÿÿzInternVLACTPredictor._find_attrc                 C   s^   || j  | j }|| j | j }|jd | jks |jd | jkr-tj|| j| jfddd}|S )NéÿÿÿÿéþÿÿÿZbilinearF)ÚsizeÚmodeZalign_corners)r   r   r	   r   Úshaper   ÚFZinterpolate)r,   ÚxZx_rawZx_vlmr3   r3   r4   Ú_renormalize_image‰   s    ÿz'InternVLACTPredictor._renormalize_imagec                 C   sD   t | jdƒr| j |¡S |  |¡}t |dƒr|jn|d }|  |¡S )NÚextract_featureÚlast_hidden_stater   )r:   r   rI   r   rJ   r   )r,   Úpixel_valuesZvis_outZvit_featuresr3   r3   r4   Ú_get_image_tokens‘   s
   

z&InternVLACTPredictor._get_image_tokensNc              	   K   s6  |j d }|j}	|du rdg| }| ¡ dkr | d¡ |d¡}|| j }
|du r1tj|d|	d}|du r=tj|d|	d}| ¡ dkrH| d¡}| ¡ dkrW| d¡ |d¡}tj|
||gdd}|  	|¡}|  
| | jj¡¡}|j d }| j|d	d
d
dd |	¡}| j ¡ |d ƒ}| |j¡}|j d }|  | ¡ ¡ |j¡ d¡}| j d¡ |dd¡}| |j¡}tj||||gdd}tjtj|||	tjd|d tj|d| j |	tjdgdd}| j||d
d}|jd }|| d }|dd…||| j …dd…f  ¡ }|  |¡}|  |¡}|  |¡ d¡}|||fS )aæ  
        Args:
            x:                 (B, 3, H, W) ImageNet-normalized
            start_keypoint_2d: (B, 2) or (2,) current EEF pixel
            current_eef_pos:   (B, 3) normalized [0,1]
            current_gripper:   (B, 1) or (B,) normalized [0,1]
            task_text:         list of B strings

        Returns:
            pos_pred:     (B, N_WINDOW, 3)  [0,1]
            rot_pred:     (B, N_WINDOW, 3)  [0,1]
            gripper_pred: (B, N_WINDOW)     [0,1]
        r   Nz.Pick up the object and place it on the target.r   rA   r   )Údevice)ÚdimZptTé@   )Zreturn_tensorsZpaddingZ
truncationZ
max_lengthZ	input_ids)rM   r
   Úattention_mask)Úinputs_embedsrP   Zoutput_hidden_states)rE   rM   rN   Z	unsqueezeÚexpandr   r   ZzerosÚcatrH   rL   Útor   r
   r   r   Zget_input_embeddingsr(   Úfloatr&   ZonesZlongr   Zhidden_statesr)   r*   r+   Zsqueeze)r,   rG   Zstart_keypoint_2dÚcurrent_eef_posÚcurrent_gripperZquery_pixelsÚ	task_textr.   ÚBrM   Zstart_kp_normZpropriorK   Zimage_tokensZN_imgZtext_inputsZtext_embedsZT_textZproprio_tokenZaction_queriesrQ   Z	attn_maskZlm_outZhiddenZaction_startZaction_hiddenZpos_predZrot_predZgripper_predr3   r3   r4   Úforward™   sn   





þý
ÿþýüý
$


zInternVLACTPredictor.forward)NNNN)r=   Ú
__module__Ú__qualname__Ú__doc__ÚN_WINDOWr   r   rH   rL   rZ   Ú__classcell__r3   r3   r1   r4   r   !   s    ÿ\	ÿr   Ú__main__ÚcudaZcpur   T)r   r   r-   é   r   g      l@r   zpick up the bowlzplace it on the plate)rV   rW   rX   zpos  zrange=[z.3fz, r   zrot  zgrip )"r]   r   Ztorch.nnr$   Ztorch.nn.functionalZ
functionalrF   r!   r#   r^   r'   ZModuler   r=   rM   ra   Zis_availableÚmodelrT   r%   rG   r    ÚkpÚeefÚgripZtextsÚno_gradÚposÚrotZgrip_outr   rE   ÚminÚmaxr3   r3   r3   r4   Ú<module>   s4     V

ÿ((,ó