o
    ?߱iN                     @   s  d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZ d dlmZ d d	lmZmZ d dlmZ d d
lmZmZ d dlmZ d dlmZ zd dlm Z  W n e!yq   e"d Y nw d dl#m$Z$m%Z% G dd deZ&dS )    )ListOptionalN)DTensor)log)FSDP2ModelConfig)VLMBaseModel	init_mesh)(Qwen2_5_VisionTransformerPretrainedModelQwen2_5_VLModel)get_rope_index)%Qwen2VisionTransformerPretrainedModelQwen2VLModel)build_lr_schedulersbuild_optimizers)parallelize_qwen)	Processor)ShardzHtorch.distributed.tensor is not available. DeepSeek model will not work.)
AutoConfig
Qwen2Modelc                %       s  e Zd ZdZdededd f fddZdd Zed	d
 Z	edd Z
deejjejjjf fddZdd Zedd Zedd Z																	d1dejdeej deej deeej  deej deej dee dee dee dee d eej d!eej d"eej d#eej d$eej d%eej d&eej dejf$d'd(Zi d)fd*edejfd+d,Zd-eeejf d.edeeeejf ejf f fd/d0Z   Z!S )2	QwenModela  
    A class to build and use a AutoRegressiveModel model for text generation.
    This class is mimicing Qwen2_5_VLForConditionalGenerationSimple

    Methods:
        generate: Generate text sequences based on provided prompts using the language generation model.
    model_config	tokenizerreturnc                    s   t  || g | _d S N)super__init__Zforward_time)selfr   r   	__class__ V/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/reason1/models/vlm_qwen.pyr   2   s   
zQwenModel.__init__c                 C   s  |j dkrt|j| _t|| _nC|j dkr"t|j| _t|| _n2|j dkrLd | _tj	|j
tjdd}t|| _|j|_|j|_dd | j_d | j_ntd|j  |j| _tj|j|jd	d
| _d | _tj rt|\| _| _t| | j| j| | j| j d S d S )N
qwen2_5_vlqwen2_vlqwen2_5flash_attention_2)torch_dtypeattn_implementationc                 S   s   d S r   r   )xr   r   r    <lambda>I   s    z'QwenModel.build_model.<locals>.<lambda>Unsupported model type: F)bias)
model_typer	   vision_configvisualr
   modelr   r   r   from_pretrainedname_or_pathtorchbfloat16r   hidden_size
vocab_sizeZset_cp_meshcp_mesh
ValueErrornnLinearlm_headrope_deltasdistributedis_initializedr   
world_meshparallel_dimsr   )r   r   configr   r   r    build_model:   s2   






zQwenModel.build_modelc                 C   s   | j S r   )r-   r   r   r   r    vision_encoderV   s   zQwenModel.vision_encoderc                 C   s   | j d ur	| jjS d S r   )rB   r-   mergerrA   r   r   r    mm_projector[   s   
zQwenModel.mm_projectorc                 C   sb  g }g }g }| j jsB| jdurBtd| j jj  || jj	 || j jj |d || jj
 || j jj |d | j jsj| jdurjtd| j jj  || jj || j jj |d | j jstd| j jj  || j || j jj |d || j || j jj |d t|| j ||}t|| j }||fS )	zCreates the optimizer and scheduler for the model.

        Args:


        Returns:
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
        Nz3adding vision_encoder to optimizer, lr_multiplier: zvisual.patch_embedzvisual.blocksz1adding mm_projector to optimizer, lr_multiplier: zvisual.mergerz(adding llm to optimizer, lr_multiplier: Zllm)r?   freeze_vision_encoderrB   r   info	optimizerZlr_multiplier_vision_encoderappendr-   patch_embedblocksfreeze_mm_projectorrD   Zlr_multiplier_mm_projectorrC   
freeze_llmZlr_multiplier_llmr.   r9   r   r   )r   optimizer_configscheduler_configZmodel_partsZmodel_part_namesZlr_multiplier
optimizersZlr_schedulersr   r   r    init_optimizer_schedulerc   s>   




z"QwenModel.init_optimizer_schedulerc              	   C   s  | j jr!td | jj D ]}d|_q| jj D ]}d|_q| j j	r6td | jj
 D ]}d|_q0| j jrUtd | j D ]}d|_qD| j D ]}d|_qOtdd |  D }tdd |  D }tdd |  D }td	|d
 dd|dd|d d S )NzFreezing vision_encoderFzFreezing mm_projectorzFreezing llmc                 s   s    | ]}|  V  qd S r   )numel.0pr   r   r    	<genexpr>   s    z<QwenModel.maybe_freeze_pretrained_modules.<locals>.<genexpr>c                 s   s    | ]
}|j s| V  qd S r   requires_gradrQ   rR   r   r   r    rU          c                 s   s    | ]
}|j r| V  qd S r   rV   rR   r   r   r    rU      rX   zTotal parameters: g    eAz.2fzB, Frozen parameters: ,z, Trainable parameters: )r?   rE   r   rF   r-   rI   
parametersrW   rJ   rK   rC   rL   r.   r9   sum)r   paramtotal_paramsfrozen_paramstrainable_paramsr   r   r    maybe_freeze_pretrained_modules   s,   


z)QwenModel.maybe_freeze_pretrained_modulesc                 C   4   t j sd S | jjd urd| jjv r| jd S d S )Ncpr1   r;   r<   r=   mesh_dim_namesrA   r   r   r    r5      
   

zQwenModel.cp_meshc                 C   ra   )Ntprc   rA   r   r   r    tp_mesh   re   zQwenModel.tp_meshN	input_idsattention_maskposition_idspast_key_valuesinputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictpixel_valuespixel_values_videosimage_grid_thwvideo_grid_thwr:   cache_positionsecond_per_grid_tsc           '      C   s  |dur|n| j j}|	dur|	n| j j}	|
dur|
n| j j}
|du r| j|}t|t}|r9|j}|j	}|
 }|dur|| jj}| j||d}|| j jk  }|jd }||krhtd| d| || j jk}|d}||}||j}||j|j}|||}|dur|| jj}| j||d}|| j jk  }|jd }||krtd| d| || j jk}|d}||}||j}||j|j}|||}|rtj||dj|d	 }|dur||j}|du r|du s|jd
kr|dur|d dks&| jdu s&|du s&| dkrf| j jdkr:t | j |||||\}}n(| j jdkrMt!| j ||||\}}n| j jdkrYd}d}n	td| j j || _nJ|j\} }!}"|dur||d | j |jnd}#t"j#|!|jd}|$dd%| d}|dur|#j&| |#jd  dd}#|'|#}|d%ddd}| jd|||||||	|
|d
}$|$d }%| (|%}&| j)durtj|&| j)t*dgd
 }&|&S )a  
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

        >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
        >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

        >>> messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
        ```N)grid_thwr   z6Image features and image tokens do not match: tokens: z, features z6Video features and video tokens do not match: tokens: )device_mesh)
placements   r!   r"   r#   r)   )device   )dim   )
rh   rj   ri   rk   rl   rn   ro   rp   rq   rv   )rz   r{   )+r?   ro   rp   use_return_dictr.   embed_tokens
isinstancer   rz   r{   full_tensortyper-   dtypeimage_token_idr[   itemshaper6   	unsqueeze	expand_astor}   masked_scattervideo_token_id
from_localredistributeto_localndimr:   get_seq_lengthr+   get_rope_index_v2_5get_rope_index_v2r1   arangeviewexpandrepeat_interleaveaddr9   r5   r   )'r   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   r:   rv   rw   is_inputs_embeds_dtensortarget_device_meshtarget_placementsimage_embedsn_image_tokensn_image_featuresmaskmask_unsqueezedmask_expanded
image_maskvideo_embedsn_video_tokensn_video_features
video_mask
batch_size
seq_length_deltaoutputshidden_stateslogitsr   r   r    _forward   s   <






 

"


zQwenModel._forwardr   	start_posc           
      C   s   d|vsJ d| dd}| dd}| dd}| dd}d}|dur6t|dks0J d	|d
 }d}|durUt|dksDJ d	|d
 }d|v rS|d d
 }nd}| j||||||d}	|	S )zQ
        The training step of the model, including the loss computation.
        rr   z<pixel_values should not be in data_batch, use images insteadimagesNrt   videosru   r~   z:Only batch=1 is supported for now, due to `get_rope_index`r   rw   )rh   rr   rt   rs   ru   rw   )getlenr   )
r   tokens
data_batchr   rr   rt   rs   ru   rw   r   r   r   r    forwardu  s2   zQwenModel.forwardr   	iterationc                    s   |dk r?d|v rt d|d j  d|v r0t d|d j  d|v r0t d|d   d|v r?t d	|d j  t ||S )
N   Z	raw_videozRaw video shape: r   zProcessed video tokens shape: rw   zsecond_per_grid_ts: r   zimages shape: )r   rF   r   r   training_step)r   r   r   r   r   r    r     s   zQwenModel.training_step)NNNNNNNNNNNNNNNNN)"__name__
__module____qualname____doc__r   r   r   r@   propertyrB   rD   tupler1   optim	Optimizerlr_schedulerLRSchedulerrP   r`   r5   rg   
LongTensorr   Tensorr   FloatTensorboolr   intr   dictstrr   __classcell__r   r   r   r    r   )   s    


.
	
	

 8r   )'typingr   r   r1   torch.nnr7   torch.distributed._tensorr   %cosmos_predict2._src.imaginaire.utilsr   Z9cosmos_predict2._src.reason1.configs.default.model_configr   Z,cosmos_predict2._src.reason1.models.vlm_baser   r   0cosmos_predict2._src.reason1.networks.qwen2_5_vlr	   r
   r   r   .cosmos_predict2._src.reason1.networks.qwen2_vlr   r   r   Z3cosmos_predict2._src.reason1.parallelisms.optimizerr   r   Z:cosmos_predict2._src.reason1.parallelisms.parallelize_qwenr   0cosmos_predict2._src.reason1.tokenizer.processorr   torch.distributed.tensorr   ImportErrorprinttransformersr   r   r   r   r   r   r    <module>   s*   