o
    çIÄiØ9  ã                   @   sV  d Z ddlZddlZddlmZ ddlm  mZ dZdZ	dZ
dZdZdZdZdZG d	d
„ d
ejƒZedkr©e ej ¡ r?dnd¡Zede
ddZe e¡Ze dddd¡ e¡Ze de
d¡ e¡ZddgZe ¡  eee ddg¡ e¡eed\ZZ Z!Z"W d  ƒ n1 sŠw   Y  e#dej$ƒ e#de j$ƒ e#de!j$ƒ e#de"j$ƒ dS dS )uÿ  InternVL VLA â€” InternVL3.5-1B as vision-language backbone + PARA action heads.

Runs the image + task description through InternVL's full pipeline (vision encoder
+ projector + LLM). Language-conditioned image patch features from the LLM's last
hidden layer are extracted, projected down, upsampled to pred_size, refined with
conv layers, and fed to PARA heads for heatmap + gripper/rotation prediction.

Usage:
    python train.py --model_type internvl --task_ids all --cache_root /data/libero/parsed_libero
é    N)g
×£p=
ß?gÉv¾Ÿ/Ý?g–C‹lçûÙ?)gZd;ßOÍ?gyé&1¬Ì?gÍÌÌÌÌÌÌ?é   é    é@   é   c                       sf   e Zd ZdZdeeddf‡ fdd„	Zdd„ Zd	d
„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zddd„Z‡  ZS )ÚInternVLAPredictorz3InternVL3.5-1B backbone + PARA-style heatmap heads.éÀ  FzOpenGVLab/InternVL2_5-1Bc                    s>  t ƒ  ¡  || _|| _|| _d| _td|› ƒ ddlm}m	} |j
|tjdd| _|j
|dd| _|rI| j ¡ D ]}	d|	_q9| j ¡  td	ƒ ntd
ƒ |  dd¡| _|  dd¡| _|  d¡| _| jj}
t|
dt|
dd ƒƒ}|j| _|
j}t|ddƒ| _t|ddƒ| _t|
ddƒ| _td| j› ƒ td| j› d| j› d| j› ƒ |  dtjt tj!d "ddd¡¡ |  d tjt#tj!d "ddd¡¡ |  d!tjt tj!d "ddd¡¡ |  d"tjt#tj!d "ddd¡¡ t$}|| _%t& 't& (| j|¡t& )¡ ¡| _*t& 't&j+||ddd#t& )¡ t&j+||ddd#t& )¡ t&j+||ddd#t& )¡ ¡| _,t& -t .|¡d$ ¡| _/t&j+||t0 dd%| _1t& 't& 2|¡t& (||¡t& )¡ t& (|t3¡¡| _4t& 't& 2|¡t& (||¡t& )¡ t& (|dt5 ¡¡| _6td&| j› d'|› ƒ td(|› ƒ td)|› d*t0› d*|› d*|› d+	ƒ td,|› d*t3› d+ƒ td-|› d.t5› d+ƒ d S )/NÚinternvlzLoading InternVL model: r   )Ú	AutoModelÚAutoTokenizerT)Ztorch_dtypeÚtrust_remote_code)r   Fz  Frozen InternVL backbonez   InternVL backbone is trainableZvision_modelÚvisualZmlp1Zmulti_modal_projectorZlanguage_modelZ
llm_configZtext_configÚ
image_sizer   Ú
patch_sizeé   Údownsample_ratiog      à?z  VLM hidden dim: z
  Vision: z
px, patch zpx, downsample Úvlm_mean)Údtypeé   é   Úvlm_stdÚ	inet_meanÚinet_std)Úpaddingg{®Gáz”?)Úkernel_sizez  Feature proj: z -> z-  Feature convs: 3x Conv2d(3x3) at pred_size=z  Volume   head -> (B, z, ú)z  Gripper  MLP  -> (B, z  Rotation MLP  -> (B, z, 3, )7ÚsuperÚ__init__Útarget_sizeÚ	pred_sizeÚn_windowÚ
model_typeÚprintÚtransformersr	   r
   Úfrom_pretrainedÚtorchÚbfloat16ÚvlmÚ
_tokenizerÚ
parametersÚrequires_gradÚevalÚ
_find_attrÚ_vision_encÚ
_projectorÚ_lmÚconfigÚgetattrÚhidden_sizeZvlm_hidden_dimZvision_configÚvlm_image_sizeZvlm_patch_sizer   Úregister_bufferÚtensorÚIMAGENET_MEANÚfloat32ÚviewÚIMAGENET_STDÚFEAT_DIMÚ	embed_dimÚnnÚ
SequentialÚLinearÚGELUÚ	feat_projÚConv2dÚfeature_convsÚ	ParameterÚrandnÚstart_keypoint_embeddingÚN_HEIGHT_BINSÚvolume_headÚ	LayerNormÚN_GRIPPER_BINSÚgripper_mlpÚ
N_ROT_BINSÚrotation_mlp)Úselfr   r   r   Úfreeze_backboneÚ
model_nameÚkwargsr	   r
   ÚpÚcfgZllm_cfgZvis_cfgÚD©Ú	__class__© ú//data/cameron/para/libero/model_vla_internvl.pyr      sv   
ÿÿ

 """"þý"ÿ&ÿ"zInternVLAPredictor.__init__c              	   G   sX   |D ]}t | j|ƒrt| j|ƒ  S qtd|› dt| jƒj› ddd„ t| jƒD ƒ› ƒ‚)z:Return the first matching attribute on self.vlm, or raise.zCannot find any of z	 on VLM (z). Available: c                 S   s   g | ]	}|  d ¡s|‘qS )Ú_)Ú
startswith)Ú.0ÚarU   rU   rV   Ú
<listcomp>}   s    z1InternVLAPredictor._find_attr.<locals>.<listcomp>)Úhasattrr&   r0   ÚAttributeErrorÚtypeÚ__name__Údir)rL   ÚnamesÚnrU   rU   rV   r+   v   s   ÿÿÿzInternVLAPredictor._find_attrc                 C   s^   || j  | j }|| j | j }|jd | jks |jd | jkr-tj|| j| jfddd}|S )z<ImageNet-normalized (B,3,H,W) -> VLM-normalized (B,3,H',W').éÿÿÿÿéþÿÿÿÚbilinearF©ÚsizeÚmodeÚalign_corners)r   r   r   r   Úshaper2   ÚFÚinterpolate)rL   ÚxZx_rawZx_vlmrU   rU   rV   Ú_renormalize_image€   s    ÿz%InternVLAPredictor._renormalize_imagec                 C   sD   t | jdƒr| j |¡S |  |¡}t |dƒr|jn|d }|  |¡S )zÒVision encoder + projector -> image tokens for LLM.

        Args:
            pixel_values: (B, 3, H, W) VLM-normalized

        Returns:
            image_tokens: (B, N_img, D_lm) in VLM dtype (bf16)
        Úextract_featureÚlast_hidden_stater   )r\   r&   ro   r,   rp   r-   )rL   Úpixel_valuesZvis_outZvit_featuresrU   rU   rV   Ú_get_image_tokens   s
   

z$InternVLAPredictor._get_image_tokensc                 C   sN  |j d }|j}|  | | jj¡¡}|j d }| j|ddddd |¡}| j ¡ |d ƒ}| |j¡}t	j
||gdd}	t	j
t	j|||t	jd	|d
 gdd}
| j|	|
dd}|jd }|dd…d|…dd…f  ¡ }tt |¡ƒ}|| |kr£tt t |¡¡ƒ}|| | }|dkr£t	j|||j d ||jd	}t	j
||gdd}||fS )aU  Full VLM pipeline: vision + text -> language-conditioned image features.

        Args:
            pixel_values: (B, 3, H, W) VLM-normalized, bf16 or fp32
            task_texts:   list of B strings

        Returns:
            image_hidden: (B, N_img, D_lm) fp32
            grid_size:    int, spatial side length (sqrt of N_img)
        r   r   ÚptTr   )Zreturn_tensorsr   Z
truncationÚ
max_lengthÚ	input_ids)Údim)Údevicer   Úattention_mask)Úinputs_embedsrx   Zoutput_hidden_statesrc   N)rj   rw   rr   Útor&   r   r'   r.   Zget_input_embeddingsr$   ÚcatÚonesÚlongZhidden_statesÚfloatÚintÚmathÚsqrtÚceilÚzeros)rL   rq   Ú
task_textsÚBrw   Zimage_tokensZN_imgZtext_inputsZtext_embedsry   Ú	attn_maskZlm_outÚhiddenÚimage_hiddenÚ	grid_sizeÚpad_lenr   rU   rU   rV   Ú_extract_vlm_featuresŸ   sH   

þýþýý
ÿz(InternVLAPredictor._extract_vlm_featuresc                 C   s|   |j \}}}}|j d }|d  ¡  d|d ¡}|d  ¡  d|d ¡}	tj||jd |d¡ ||¡}
||
d d …|	|f S )Nr   ).r   r   ).r   ©rw   )rj   r}   Úclampr$   Úarangerw   r7   Úexpand)rL   ÚfeatsÚquery_pixelsr…   rR   ÚHÚWÚNÚpxÚpyÚ	batch_idxrU   rU   rV   Ú_index_featuresÚ   s   
 z"InternVLAPredictor._index_featuresc           	      C   sf   |j dd… \}}|  | ¡ |¡}| || | j¡}|  |¡ ||t¡}|  |¡ ||dt¡}||fS )aP  Apply gripper / rotation MLPs at specified pixel locations.

        Args:
            feats:        (B, D, pred_size, pred_size)
            query_pixels: (B, N_WINDOW, 2) in pred_size coords

        Returns:
            gripper_logits:  (B, N_WINDOW, N_GRIPPER_BINS)
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)
        Né   r   )	rj   r˜   ÚdetachÚreshaper:   rI   rH   rK   rJ   )	rL   r   r‘   r…   r”   ÚindexedÚflatÚgripperÚrotationrU   rU   rV   Úpredict_at_pixelsâ   s   z$InternVLAPredictor.predict_at_pixelsNc                 C   sv  |j d }|du rdg| }|  |¡}|  ||¡\}}|  |¡}	|	 ddd¡ || j||¡}
| ¡ dkr=| d¡ 	|d¡}|dd…df | | j
  ¡  d|d ¡}|dd…df | | j
  ¡  d|d ¡}tj||
jd}|
|dd…||f  | j d¡7  < tj|
| j| jfdd	d
}|  |¡}|  |¡}| || jt| j| j¡}|dur±|  ||¡\}}nd }}||||fS )um  
        Args:
            x:                 (B, 3, H, W) ImageNet-normalized
            start_keypoint_2d: (B, 2) or (2,) current EEF pixel in image coords
            query_pixels:      (B, N_WINDOW, 2) in pred_size space (GT during train)
            task_text:         list of B strings â€” LIBERO task description

        Returns:
            volume_logits:   (B, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
            gripper_logits:  (B, N_WINDOW, N_GRIPPER_BINS)  or None
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)   or None
            feats:           (B, D, pred_size, pred_size)
        r   Nz.Pick up the object and place it on the target.r™   r   rc   rŒ   re   Frf   )rj   rn   r‹   r?   Úpermuter›   r:   rv   Ú	unsqueezer   r   r}   r   r$   rŽ   rw   rD   rk   rl   r   rA   rF   r7   r   rE   r    )rL   rm   Ústart_keypoint_2dr‘   Ú	task_textr…   rq   rˆ   r‰   Zfeat_tokensZfeat_mapZkp_xZkp_yr—   r   ÚvolÚvolume_logitsÚgripper_logitsÚrotation_logitsrU   rU   rV   Úforward÷   s.   



**$ÿ

zInternVLAPredictor.forward)NN)r_   Ú
__module__Ú__qualname__Ú__doc__Ú	PRED_SIZEÚN_WINDOWr   r+   rn   rr   r‹   r˜   r    r©   Ú__classcell__rU   rU   rS   rV   r      s    ÿW
;r   Ú__main__ÚcudaÚcpur   T)r   r   rM   r™   r   zRpick up the black bowl between the plate and the ramekin and place it on the platez=pick up the black bowl on the stove and place it on the plateg      l@)r£   r‘   r¤   zvolume_logits  zgripper_logits r¨   zfeats          )%r¬   r€   r$   Útorch.nnr;   Útorch.nn.functionalÚ
functionalrk   r5   r8   r®   rE   rH   rJ   r­   r9   ÚModuler   r_   rw   r±   Úis_availableÚmodelrz   rC   rm   rƒ   Z
fake_queryr„   Úno_gradr4   r¥   ÚgripÚrotr   r!   rj   rU   rU   rU   rV   Ú<module>   sL      
þ
üÿê