o
    ?߱i>[                     @   s>  d dl Z d dlmZmZmZmZ d dlZd dlmZ	 d dl
m  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZ d dl m!Z! d dl"mZ d dl#m$Z$m%Z% dejj&fddZ'G dd dejj&Z(de)e*ej+f dede,fddZ-dee* defddZ.dd Z/dS )    N)AnyDictListOptional)DTensor)
DeviceMesh)_IncompatibleKeys)log)FSDP2ModelConfig)build_lr_schedulersbuild_optimizers)ParallelDims)device_moduledevice_type)	Processor)distributed)	broadcastbroadcast_with_shape_checkmodelc              
   C   s   g }t j }|  D ]W\}}|jddD ]L\}}|du rq||jv rat|dr>|j|jdd\}|_	|j
d||dd td| d	| d
 t |dkra|||t|j|j|jd qqt|dkspJ d| |S )a'  
    Loop over all submodules of a model and report buffers
    that are registered with persistent=False and are all zeros.

    Args:
        model (nn.Module): The PyTorch model (e.g. from transformers).

    Returns:
        List of dicts with module name, buffer name, and tensor info.
    F)recurseNrope_init_fndeviceinv_freq)
persistentz[Check]buffer_name: [z] in module [z] is marked as non-persistentr   )modulebuffershapedtyper   z4nonpersistent_zero_buffers should be empty, but got )torchcudacurrent_devicenamed_modulesnamed_buffers_non_persistent_buffers_sethasattrr   configZattention_scalingregister_buffertor	   infoallappendtupler   r   r   len)r   resultsr!   module_namer   buffer_namer    r1   V/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/reason1/models/vlm_base.pyfind_nonpersistent_zero_buffers$   s0   


r3   c                
       s  e Zd ZdZdededdf fddZejfdej	dd	fd
dZ
dejjdejjjdedd	fddZd2dedd	fddZdd Zdeejjejjjf fddZdefddZd3deeef dedef fdd Ze d!eeejf dedeeeejf ejf fd"d#Z		d4d$eej  fd%d&Z!e"d'd( Z#e"d)d* Z$d!eeejf dedeeeejf ejf fd+d,Z%d-d. Z&i dfd/edejfd0d1Z'  Z(S )5VLMBaseModela  
    A class for base VLM model, has the shared methods for all VLM models

    Methods:
        build_model: build the model, should be implemented by each VLM model
        maybe_freeze_pretrained_modules: freeze the pretrained modules
        init_optimizer_scheduler: initialize the optimizer and scheduler
        get_num_params: get the number of parameters in the model
        load_state_dict: load the state dict
        validation_step: validation step
        forward: forward pass, should be implemented by each VLM model
        training_step: training step
        init_weights: initialize the weights, should be implemented by each VLM model
    model_config	tokenizerreturnZAutoRegressiveModelc                    s   t    	 t }tt|j}t| td| d|  || _	|| _
tt|j| _| | t| td|  d S )Nz!Setting torch default dtype from z to zReset torch default dtype to )super__init__r   get_default_dtypegetattr	precisionset_default_dtyper	   r)   r6   r&   build_model)selfr5   r6   Zorig_precisionr<   	__class__r1   r2   r9   [   s   



zVLMBaseModel.__init__memory_formatNc                 C      dS )zThe model preparation before the training is launched

        Args:
            memory_format (torch.memory_format): Memory format of the model.
        Nr1   )r?   rB   r1   r1   r2   on_train_start}      zVLMBaseModel.on_train_start	optimizer	scheduler	iterationc                 C   rC   )a  Hook before zero_grad() is called.

        Args:
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
            iteration (int): Current iteration number.
        Nr1   )r?   rF   rG   rH   r1   r1   r2   on_before_zero_grad   s   
z VLMBaseModel.on_before_zero_gradr   c                 C   rC   )a,  Hook after loss.backward() is called.

        This method is called immediately after the backward pass, allowing for custom operations
        or modifications to be performed on the gradients before the optimizer step.

        Args:
            iteration (int): Current iteration number.
        Nr1   )r?   rH   r1   r1   r2   on_after_backward   s   	zVLMBaseModel.on_after_backwardc              	   C   s   | j jrtd | j D ]}d|_q| j jr(td | j D ]}d|_q"| j j	r<td | j
 D ]}d|_q6tdd |  D }tdd |  D }tdd |  D }td	|d
 dd|dd|d d S )NzFreezing vision_encoderFzFreezing mm_projectorzFreezing llmc                 s       | ]}|  V  qd S Nnumel.0pr1   r1   r2   	<genexpr>       z?VLMBaseModel.maybe_freeze_pretrained_modules.<locals>.<genexpr>c                 s   s    | ]
}|j s| V  qd S rL   requires_gradrN   rO   r1   r1   r2   rR          c                 s   s    | ]
}|j r| V  qd S rL   rT   rO   r1   r1   r2   rR      rV   zTotal parameters: g    eAz.2fzB, Frozen parameters: ,z, Trainable parameters: )r&   freeze_vision_encoderr	   r)   vision_encoder
parametersrU   freeze_mm_projectormm_projector
freeze_llmr   sum)r?   paramtotal_paramsfrozen_paramstrainable_paramsr1   r1   r2   maybe_freeze_pretrained_modules   s$   


z,VLMBaseModel.maybe_freeze_pretrained_modulesc                 C   s   g }g }| j js&| jdur&td| j jj  || j || j jj | j jsH| j	durHtd| j jj
  || j	 || j jj
 | j jsetd| j jj  || j || j jj t|| j |}t|| j }||fS )zCreates the optimizer and scheduler for the model.

        Args:


        Returns:
            optimizer (torch.optim.Optimizer): The model optimizer.
            scheduler (torch.optim.lr_scheduler.LRScheduler): The optimization scheduler.
        Nz3adding vision_encoder to optimizer, lr_multiplier: z1adding mm_projector to optimizer, lr_multiplier: z(adding llm to optimizer, lr_multiplier: )r&   rX   rY   r	   r)   rF   lr_multiplier_vision_encoderr+   r[   r\   lr_multiplier_mm_projectorr]   lr_multiplier_llmr   r   r   )r?   optimizer_configscheduler_configmodel_partslr_multiplier
optimizerslr_schedulersr1   r1   r2   init_optimizer_scheduler   s*   z%VLMBaseModel.init_optimizer_schedulerc                 C   s   t dd |  D }|S )z?
        Return the number of parameters in the model.
        c                 s   rK   rL   rM   rO   r1   r1   r2   rR      rS   z.VLMBaseModel.get_num_params.<locals>.<genexpr>)r^   rZ   )r?   n_paramsr1   r1   r2   get_num_params   s   zVLMBaseModel.get_num_paramsTF
state_dictstrictassignc                    sP   t  j|d|d\}}|r#t|dkst|dkr#td| d| t||S )z
        Ignore the missing keys with substrings matching `substring_to_ignore` (e.g., "_extra_state" keys imposed by
        TransformerEngine for FP8).
        F)rq   rr   r   zMissing keys: z

Unexpected keys: )r8   load_state_dictr-   
ValueErrorr   )r?   rp   rq   rr   Zactual_missing_keysunexpected_keysr@   r1   r2   rs      s
   
zVLMBaseModel.load_state_dict
data_batchc                 C   s   |  ||S )z
        Perform a validation step for the model, which is the same as the training step (but without backpropagation).
        )training_step)r?   rv   rH   r1   r1   r2   validation_step   s   zVLMBaseModel.validation_stepbuffer_devicec                 C   s~   | j jdv rt| j n| j| | jd ur1| j jdrn| j jdv r+| j  n| j| | jd ur=| j  d S d S )N)qwen2_5siglip)zinternvit-300m-448px-v2.5zinternvit-6b-448px-v2.5)r&   
model_typer3   r   init_weightsrY   
startswithr\   )r?   ry   r1   r1   r2   r}      s   

zVLMBaseModel.init_weightsc                 C      d S rL   r1   r?   r1   r1   r2   cp_mesh     zVLMBaseModel.cp_meshc                 C   r   rL   r1   r   r1   r1   r2   tp_mesh  r   zVLMBaseModel.tp_meshc                 C   s  i }|dk rKd|   }|  D ]}t|| tjr.|d| d|| j d|| j 7 }qdD ]}||v rC|d| d||  7 }q1tj|dd	 | jd urWt	|| j n| j
d urbt	|| j
 |d
 }|jdd}|dd }|d u}|d u rtj|tjd}|jdd}| jjdkr| ||dd\}	}
t|
dkr|
d d usJ t|
}||d< t|
D ]\}}||d| < qnd }n| ||}	|	d d d df }	|d d dd f }|d d dd f  }d}|r||| < |  }|  }|jd }tj|jd |jd}tj|tjjd tj|tjjd || }d|v rg|d }|   }tj|tjjd || }|  jdd }tj|tjjd |  jdd }tj|tjjd nEtjt|  |jd}tj|tjjd || }tjt|jd |jd}tj|tjjd tjt|jd |jd}tj|tjjd |!||	" |" ||" # |" # |" # |" # |  " # |d
 |	$dd}	|$dd}| jj%rt&j'|	||dd}||| "  }nt&j'|	||d}| jj(dkr@t|	t)r|	* }n|	}tj+|dd}| jj(|d ,  }t|t)r9t)j-||j.|j/d}n|}|| }n|}||d < | jjdkrZ|d urZ||| jj 7 }||fS )!N   zdata_batch: z | z shape: z	, dtype: )__url____key__image_grid_thwvideo_grid_thwz: F)
rank0_onlytokensr    r   
token_maskr   r   T)Zreturn_aux_lossZaux_loss_sumZ	aux_loss_   i)oppadding_mask)dim)
Zencode_tokenslogitslabelsignore_indexavg_num_assistant_tokensavg_num_real_tokensmax_num_real_tokensmin_num_real_tokenscurrent_num_assistant_tokensbatch_size_localr^   )inputtargetr   	reduction)r   r   r      )device_mesh
placementsce_loss)0keys
isinstancer   Tensorr   r   r	   r)   r   _broadcast_to_cp_or_tp_ranksr   r(   get	ones_likeboolr&   aux_loss_coeffr-   r^   	enumerateclonefloattensorr   dist
all_reduceReduceOpSUMmaxMAXminMINrN   updatedetachitemflattenloss_per_tokenFcross_entropyz_loss_coeffr   to_local	logsumexpmean
from_localr   r   )r?   rv   rH   output_batchsummary_strkeyr   r   Zapply_token_maskr   Zaux_loss_listaux_lossir   r   num_assistant_tokensr   r   Zbatch_size_globalr   r   Znum_real_tokensr   r   r   r   Zlocal_logitsZlog_z_localZz_loss_localZz_loss_dtensor
total_lossr1   r1   r2   rw     s   &












zVLMBaseModel.training_stepc                 C      t rL   NotImplementedError)r?   r5   r1   r1   r2   r>     s   zVLMBaseModel.build_model	start_posc                 C   r   )z}
        The forward pass of the model.
        Returns:
            logits (torch.Tensor): The logits of the model.
        r   )r?   r   rv   r   r1   r1   r2   forward  rE   zVLMBaseModel.forward)r   )TFrL   ))__name__
__module____qualname____doc__r
   r   r9   r   preserve_formatrB   rD   optim	Optimizerlr_schedulerLRSchedulerintrI   rJ   rc   r,   rm   ro   r   strr   r   rs   no_graddictr   rx   r   r   r}   propertyr   r   rw   r>   r   __classcell__r1   r1   r@   r2   r4   K   sl    "

#
$	




 "r4   rv   cp_or_tp_meshr7   c                 C   s  |  d}t||| d< d| v r|  d}t||| d< |  dd}|du r.tj|tjd}t||| d< d| v rD| d }t||| d< d| v rUtjdtjdj|jd	ntjdtjdj|jd	}t||}|sqd| v rq| d= n|rd| vrtddd
ddj|jd	| d< |  dd}|durt||| d< |  dd}|durt	||| d< |  dd}	|	durt|	|| d< |  dd}
|
durt	|
|| d< dD ]}|| vrd| g| |< t
| | || |< qd| vrd| d< t
| d || d< dS )a  Copies tensors in data_batch to the GPU and broadcasts across CP or TP ranks.

    The contents of data_batch are updated with the copied and broadcasted
    tensors. The inputs are replicated across CP ranks. The output logits
    and loss calculations are also replicated across CP ranks.

    Args:
        data_batch: Inputs (tokens, token_mask, images) needed for training.
        cp_or_tp_mesh: The DeviceMesh for context parallelism or tensor parallelism.
    r   attention_maskr   Nr   r   imagesr   r      i  r   videosr   )r   Z
dialog_strr   placeholder_dataset_namedefault)r   r   r   r   r   onesr(   r   zerosr   broadcast_object)rv   r   r   r   r   r   Z
has_imagesr   r   r   r   r   r1   r1   r2   r     s^   



r   	local_strc                 C   s>   |  }dd ttj|dD }tj|| |d |d }|S )z*
    Broadcast a string to all ranks.
    c                 S   s   g | ]}d qS rL   r1   )rP   _r1   r1   r2   
<listcomp>  s    z$broadcast_object.<locals>.<listcomp>)groupr   )	get_groupranger   get_world_sizeall_gather_object)r   r   r   gathered_list
output_strr1   r1   r2   r     s
   r   c              	   C   sz   t  }t| jj| jj| jj| jj| jj	|| jj
 d}ttdd}tt d| }t| |jtd}||fS )N)dp_sharddp_replicatecptppp
world_sizeZenable_loss_parallel
LOCAL_RANKr   :)r   )r   r   r   trainingdata_parallel_shard_degreedata_parallel_replicate_degreecontext_parallel_degreetensor_parallel_degreeexperimentalpipeline_parallel_degreedisable_loss_parallelr   osgetenvr   r   r   r   
set_deviceZ
build_mesh)r5   r   parallel_dims
local_rankr   
world_meshr1   r1   r2   	init_mesh  s   	
r  )0r  typingr   r   r   r   r   torch.distributedr   r   Ztorch.nn.functionalnn
functionalr   torch.distributed._tensorr   torch.distributed.device_meshr   torch.nn.modules.moduler   %cosmos_predict2._src.imaginaire.utilsr	   9cosmos_predict2._src.reason1.configs.default.model_configr
   3cosmos_predict2._src.reason1.parallelisms.optimizerr   r   Z7cosmos_predict2._src.reason1.parallelisms.parallel_dimsr   Z;cosmos_predict2._src.reason1.parallelisms.torchtitan_utiltsr   r   0cosmos_predict2._src.reason1.tokenizer.processorr   Z"cosmos_predict2._src.reason1.utilsZ.cosmos_predict2._src.reason1.utils.parallelismr   r   Moduler3   r4   r   r   r   r   r   r   r  r1   r1   r1   r2   <module>   s.   '   aJ