o
    +jZ                     @   sz  d Z ddlZddlZddlZddlmZ ddlm  mZ ddl	m
Z
mZmZmZmZmZmZmZmZmZ dZdZG dd de
Zedkreej rNd	nd
Zedddde Zedd e D Z e!de d e"ddddeZ#e"dded Z$e%  ee#e$Z&W d   n1 sw   Y  e&' D ]\Z(Z)e)dure!de( de*e)j+  qdS dS )uT  DinoVolumeQuery with InternVL VLM trunk instead of DINOv3.

Take the same query-MLP architecture (refine + EEF query + AdaLN blocks + volume scoring)
but swap the DINOv3 patch extractor for InternVL3.5-1B's language-conditioned vision pipeline.

Forward: image + task prompt → full VLM forward → extract image hidden states from the LLM's
last layer at the image-token positions → reshape to (B, D_lm, grid, grid) → projected down
to (D_dino_equiv, grid, grid) → existing query-MLP pipeline.

VLM is frozen by default (otherwise training is huge memory). Image features are cached not.
    N)
DinoVolumeQueryN_WINDOWN_HEIGHT_BINSN_GRIPPER_BINS
N_ROT_BINSD_FEATD_SINZD_SINTIMG_SIZE	PRED_SIZE)g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?c                       sV   e Zd ZdZddddd fdd
Zdd	 Zd
d Zdd ZdddZdd Z	  Z
S )DinoVolumeQueryVLMz61view query-MLP with InternVL trunk instead of DINOv3.zOpenGVLab/InternVL2_5-1BFzOpick the black bowl between the plate and the ramekin and place it on the plate)vlm_model_name
freeze_vlmfreeze_llm_onlytask_text_defaultc                   s  t  j|i | | `|| _ddlm}m} td|  |j|t	j
dd| _|j|dd| _|rF| j D ]}	d|	_q6| j  td n|rX| j D ]}	d|	_qMtd	 ntd
 | dd| _| dd| _| d| _|r|s| j D ]}	d|	_qy| j D ]}	d|	_q| jj}
t|
dt|
dd }|j| _|
j}t|dd| _t|dd| _t|
dd| _td| j d| j  t| j| j| _d S )Nr   )	AutoModelAutoTokenizerzLoading VLM: T)Ztorch_dtypetrust_remote_code)r   Fz  VLM fully frozenz.  VLM LLM frozen, vision + projector trainablez  VLM fully trainableZvision_modelvisualZmlp1Zmulti_modal_projectorZlanguage_modelZ
llm_configZtext_config
image_size  Z
patch_size   downsample_ratiog      ?z  VLM hidden dim: z, image_size=) super__init__dino_task_text_defaulttransformersr   r   printfrom_pretrainedtorchbfloat16vlm
_tokenizer
parametersrequires_gradeval
_find_attr_vision_enc
_projector_lmconfiggetattrhidden_sizeZvlm_hidden_dimZvision_configvlm_image_sizeZvlm_patch_sizer   nnLinear	embed_dimvlm_proj)selfr   r   r   r   argskwargsr   r   pcfgZllm_cfgZvis_cfg	__class__ ,/data/cameron/para/libero/model_vlm_query.pyr      sL   


zDinoVolumeQueryVLM.__init__c                 G   s4   |D ]}t | j|rt| j|  S qtd| )NzVLM has none of )hasattrr"   r,   AttributeError)r3   namesnr:   r:   r;   r'   X   s
   zDinoVolumeQueryVLM._find_attrc                 C   s>   |j d | jks|j d | jkrtj|| j| jfddd}|S )uV   ImageNet [B,3,H,W] → [B,3,vlm_size,vlm_size] (both use ImageNet stats; just resize).bilinearF)sizemodealign_corners)shaper.   Finterpolater3   xr:   r:   r;   _renormalize_image^   s
    z%DinoVolumeQueryVLM._renormalize_imagec                 C   sD   t | jdr| j|S | |}t |dr|jn|d }| |S )u4   Vision encoder + projector → image tokens for LLM.extract_featurelast_hidden_stater   )r<   r"   rL   r(   rM   r)   )r3   Zpixel_valuesZvis_outfeatr:   r:   r;   _get_image_tokense   s
   

z$DinoVolumeQueryVLM._get_image_tokensNc              	   C   s  |j d }|j}|du r| jg| }| |}|| jj}| |}|j d }| j|ddddd|}| j	
 |d |j}	tj||	gdd	}
tjtj|||tjd
|d gdd	}| j	|
|dd}|jd }|ddd|ddf  }tt|}|| |krttt|}|| | }|dkrtj|tj|||j d ||jd
gdd	}||||ddddd }| |dddddddd }|jdd	}| |}||fS )aJ  Run vision + LLM forward, return image hidden states from last layer + grid size.

        Args:
            x: (B, 3, H, W) ImageNet-normalized.
            task_texts: list of B strings (or None to use default).

        Returns:
            patch: (B, vlm_hidden_dim, grid, grid)
            cls:   (B, vlm_hidden_dim)
        r   N   ptT@   )Zreturn_tensorspaddingZ
truncation
max_length	input_ids)dim)devicedtypeattention_mask)inputs_embedsrY   Zoutput_hidden_statesr@         )rF   rW   r   rK   tor"   rX   rO   r#   r*   Zget_input_embeddingsr    catoneslongZhidden_statesfloatintmathsqrtceilzerosreshapepermute
contiguousr2   mean)r3   rJ   
task_textsBrW   Zx_vlmZimage_tokensZN_imgZtext_inputsZtext_embedsrZ   	attn_maskZlm_outhiddenZimage_hiddengridpadZpatch_lmpatchclsr:   r:   r;   _extract_vlm_featuresm   sZ   




 &
z(DinoVolumeQueryVLM._extract_vlm_featuresc                 C   s   | j |ddS )z4Override: return VLM patches in DINO-feature format.N)rk   )rs   rI   r:   r:   r;   _extract_dino_features   s   z)DinoVolumeQueryVLM._extract_dino_featuresN)__name__
__module____qualname____doc__r   r'   rK   rO   rs   rt   __classcell__r:   r:   r8   r;   r      s    9
6r   __main__cudacpu   r   1d_pca)n_windowr   rotation_modec                 c   s    | ]
}|j r| V  qd S ru   )r%   numel).0r6   r:   r:   r;   	<genexpr>   s    r   zTrainable: ,r\   r[   z  z: ),ry   rc   osr    torch.nnr/   torch.nn.functional
functionalrG   model_dino_volume_queryr   r   r   r   r   r   r   r	   r
   r   IMAGENET_MEANIMAGENET_STDr   rv   rW   r|   is_availabler]   r&   msumr$   n_tr   randrgbspno_gradoutitemskvtuplerF   r:   r:   r:   r;   <module>   s4    0 
"	