o
    ?߱i0                     @   s`   d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z ddlm
Z G dd deZdS )	z
This file is modified from cosmos_predict2/_src/reason1/models/vlm_qwen.py for extracting reason embeddings.
Main change is to return the hidden states from the language model.
    )ListOptionalN)DTensor)	QwenModel)get_rope_indexc                %       s  e Zd ZdZ fddZ	 																	ddejdeej deej dee	ej
  d	eej
 d
eej dee dee dee dee deej deej
 deej deej deej deej deej dejf$ddZ	 i dfdedejfddZ  ZS )QwenVLBaseModelz
    This is a base class for QwenVL models.
    Here we override the forward method and the training_step method to
    obtain more intermediate results from the language model.
    c                    s   t  j|i | d S )N)super__init__)selfargskwargs	__class__ ]/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/predict2/text_encoders/reason1.pyr	   &   s   zQwenVLBaseModel.__init__N	input_idsattention_maskposition_idspast_key_valuesinputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictpixel_valuespixel_values_videosimage_grid_thwvideo_grid_thwrope_deltascache_positionsecond_per_grid_tsreturnc           '      C   s  |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
|du r| j|}t|t}|r9|j}|j	}|
 }|dur|| jj}| j||d}|| j jk  }|jd }||krhtd| d| || j jk}|d}||}||j}||j|j}|||}|dur|| jj}| j||d}|| j jk  }|jd }||krtd| d| || j jk}|d}||}||j}||j|j}|||}|rtj||dj|d	 }|dur||j}|du r|du s|jd
kr|dur|d dks&| jdu s&|du s&| dkrf| j jdkr:t | j |||||\}}n(| j jdkrMt!| j ||||\}}n| j jdkrYd}d}n	td| j j || _nJ|j\} }!}"|dur||d | j |jnd}#t"j#|!|jd}|$dd%| d}|dur|#j&| |#jd  dd}#|'|#}|d%ddd}| jd|||||||	|
|d
}$|$d }%| (|%}&| j)durtj|&| j)t*dgd
 }&|&|$fS )a  
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

        >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
        >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

        >>> messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
        ```N)Zgrid_thwr   z6Image features and image tokens do not match: tokens: z, features z6Video features and video tokens do not match: tokens: )device_mesh)
placements   
qwen2_5_vlqwen2_vlZqwen2_5zUnsupported model type: )device   )dim   )
r   r   r   r   r   r   r   r   r   r    )r$   r%   )+configr   r   use_return_dictmodelembed_tokens
isinstancer   r$   r%   full_tensortypevisualdtypeZimage_token_idsumitemshape
ValueError	unsqueeze	expand_astor)   masked_scatterZvideo_token_id
from_localredistributeto_localndimr   get_seq_length
model_typeget_rope_index_v2_5get_rope_index_v2torcharangeviewexpandrepeat_interleaveaddlm_headcp_meshShard)'r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   Zis_inputs_embeds_dtensorZtarget_device_meshtarget_placementsimage_embedsZn_image_tokensZn_image_featuresmaskZmask_unsqueezedZmask_expanded
image_maskZvideo_embedsZn_video_tokensZn_video_featuresZ
video_mask
batch_size
seq_length_deltaoutputshidden_stateslogitsr   r   r   _forward.   s   <






 

"


zQwenVLBaseModel._forwardr   	start_posc                 C   s   d|vsJ d| dd}| dd}| dd}| dd}| dd}|dur8t|d	ks4J d
|d }|durJt|d	ksFJ d
|d }| j||||||d\}	}
|	|
fS )zQ
        The training step of the model, including the loss computation.
        r   z<pixel_values should not be in data_batch, use images insteadimagesNr   videosr   padding_maskr*   z:Only batch=1 is supported for now, due to `get_rope_index`r   )r   r   r   r   r   r   )getlenrZ   )r
   tokens
data_batchr[   r   r   r   r   r   rY   rW   r   r   r   forward   s*   
zQwenVLBaseModel.forward)NNNNNNNNNNNNNNNNN)__name__
__module____qualname____doc__r	   rF   
LongTensorr   Tensorr   FloatTensorboolrZ   intrc   __classcell__r   r   r   r   r      sz    	

 8"r   )rg   typingr   r   rF   torch.distributed._tensorr   Z,cosmos_predict2._src.reason1.models.vlm_qwenr   Z0cosmos_predict2._src.reason1.networks.qwen2_5_vlr   rD   Z.cosmos_predict2._src.reason1.networks.qwen2_vlrE   r   r   r   r   r   <module>   s   