o
    ?߱ih`                     @  sp  d dl mZ d dlZd dlZd dlZd dlmZmZmZm	Z	 d dl
Z
d dlZd dlmZ d dlZd dlZd dlZd dlmZ d dlmZmZmZmZ d dlmZ zd dlmZ W n eyc   dZY nw erxd dlmZ d d	lm Z  d d
l!m"Z" G dd dZ#G dd dZ$G dd de$Z%G dd de$Z&G dd de$Z'G dd de$Z(G dd de$Z)G dd de$Z*dS )    )annotationsN)TYPE_CHECKINGAnyCallableOptional)instantiate)distributedlogmisc
wandb_util)get_local_tensor_if_DTensor)parallel_state)Config)ImaginaireModel)ImaginaireTrainerc                   @  s$   e Zd ZdZddd	ZdddZdS )CallBackGroupa  A class for hosting a collection of callback objects.

    It is used to execute callback functions of multiple callback objects with the same method name.
    When callbackgroup.func(args) is executed, internally it loops through the objects in self._callbacks and runs
    self._callbacks[0].func(args), self._callbacks[1].func(args), etc. The method name and arguments should match.

    Attributes:
        _callbacks (list[Callback]): List of callback objects.
    configr   trainerr   returnNonec                 C  s   g | _ |jj}|rft|tst|tjjr&tj	dt
dd dd t|D }| D ]=\}}d|vr>td| d|  q*td	| d
|  t|}t|tsYJ | d||_||_| j | q*dS dS )zInitializes the list of callback objects.

        Args:
            config (Config): The config object for the Imaginaire codebase.
            trainer (ImaginaireTrainer): The main trainer.
        zdThe 'config.trainer.callbacks' parameter should be a dict instead of a list. Please update your code   
stacklevelc                 S     i | ]
\}}d | |qS )	callback_ ).0ivr   r   X/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/imaginaire/utils/callback.py
<dictcomp>J       z*CallBackGroup.__init__.<locals>.<dictcomp>_target_z	Callback z) is missing the '_target_' field. 
 SKip zInstantiating callback z: z is not a valid callback.N)
_callbacksr   	callbacks
isinstancelist	omegaconf
listconfig
ListConfigwarningswarnDeprecationWarning	enumerateitemsr	   criticalr   Callbackr   append)selfr   r   Zcallback_configsZcallback_nameZcurrent_callback_cfg	_callbackr   r   r   __init__9   s0   	zCallBackGroup.__init__method_namestrr   c                   s   d fdd}|S )zLoops through the callback objects to call the corresponding callback function.

        Args:
            method_name (str): Callback method name.
        r   r   c                    sB   j D ]}t| sJ t| }t|sJ || i |}qd S N)r#   hasattrgetattrcallable)argskwargscallbackmethod_r5   r2   r   r   multi_callback_wrapper_   s   

z9CallBackGroup.__getattr__.<locals>.multi_callback_wrapperNr   r   r   )r2   r5   rA   r   r@   r   __getattr__X   s   zCallBackGroup.__getattr__N)r   r   r   r   r   r   )r5   r6   r   r   )__name__
__module____qualname____doc__r4   rC   r   r   r   r   r   .   s    

r   c                   @  st  e Zd ZdZd^d_ddZd`daddZd`dbddZ		d`dbddZd`dcddZd`dcddZ			d`ddd d!Z
d`ded"d#Zd`dcd$d%Zd`dcd&d'Zdfd(d)Zdfd*d+Z		d`dgd2d3Z		d`dhd4d5Z		d`did8d9Z		d`did:d;Z		d`djd>d?Z		d`dbd@dAZ		d`didBdCZd`dadDdEZdkdFdGZ	dldmdJdKZdndNdOZd`dadPdQZd`dadRdSZdodpdVdWZdndXdYZd`dadZd[Zdfd\d]Z dS )qr0   zThe base class for all callbacks.

    All callbacks should inherit from this class and adhere to the established method names and signatures.
    Nr   Optional['Config']r   Optional['ImaginaireTrainer']c                 C  s(   |dus|durt jdtdd ~~dS )a  Initializes a Callback object.

        Args:
            config (Optional[Config]): The configuration object for the Imaginaire codebase, if available.
            trainer (Optional[ImaginaireTrainer]): The main trainer handling the training loop, if available.

        Notes:
            The config and trainer parameters are optional to maintain backward compatibility.
            In future releases, these parameters will be removed. Upon using these parameters, a deprecation
            warning will be issued.

        NzThe 'config' and 'trainer' parameters are deprecated and will be removed in a future release. Please update your code to create Callback instances without these parameters.r   r   )r*   r+   r,   )r2   r   r   r   r   r   r4   o   s   zCallback.__init__r   modelr   	iterationintr   r   c                 C     d S r7   r   r2   rJ   rK   r   r   r   on_train_start      zCallback.on_train_startdatadict[str, torch.Tensor]c                 C     dS )a`  
        Called before the training step, for each batch. This is paired with on_training_step_end() but note that
        when using gradient accumulation, while on_training_step_end() is only called when the optimizer is updated,
        this function is called for every batch.
        Use on_training_step_batch_start and on_training_step_batch_end if you need callbacks that are called
        for every batch, albeit with the same iteration number.
        FIXME - should this either be deprecated, or called only when a new training step is started after having updated
        the optimizer?
        Nr   r2   rJ   rQ   rK   r   r   r   on_training_step_start      
zCallback.on_training_step_startc                 C  rS   )a  
        Called before the training step, for each batch, similarly to on_training_step_start(). This function is paired with
        on_training_step_batch_end(), and both functions are called for every batch even when using gradient accumulation.
        Note that the iteration is only updated when the optimizer is updated, and therefore it may be the same for multiple invocations.
        Nr   rT   r   r   r   on_training_step_batch_start      z%Callback.on_training_step_batch_startc                 C  rM   r7   r   r2   rK   r   r   r   on_before_forward   rP   zCallback.on_before_forwardc                 C  rM   r7   r   rY   r   r   r   on_after_forward   rP   zCallback.on_after_forward	model_ddp#distributed.DistributedDataParallellosstorch.Tensorc                 C  rM   r7   r   r2   r\   r^   rK   r   r   r   on_before_backward      zCallback.on_before_backwardc                 C  rM   r7   r   r2   r\   rK   r   r   r   on_after_backward   rP   zCallback.on_after_backwardc                 C  rM   r7   r   rY   r   r   r   on_before_dataloading   rP   zCallback.on_before_dataloadingc                 C  rM   r7   r   rY   r   r   r   on_after_dataloading   rP   zCallback.on_after_dataloadingc                 C  rM   r7   r   r2   r   r   r   on_optimizer_init_start   rP   z Callback.on_optimizer_init_startc                 C  rM   r7   r   rg   r   r   r   on_optimizer_init_end   rP   zCallback.on_optimizer_init_end	optimizertorch.optim.Optimizer	scheduler$torch.optim.lr_scheduler.LRSchedulergrad_scalertorch.amp.GradScalerc                 C  rM   r7   r   r2   r\   rj   rl   rn   rK   r   r   r   on_before_optimizer_step   rX   z!Callback.on_before_optimizer_stepc                 C  rM   r7   r   r2   r\   rj   rl   rK   r   r   r   on_before_zero_grad   s   zCallback.on_before_zero_grad
data_batchoutput_batchc                 C  rS   )a/  
        Called at the end of a training step for every batch even when using gradient accumulation.
        This is paired with on_training_step_batch_start(). Note that the iteration is only updated when the optimizer is updated,
        and therefore it may be the same for multiple batches.
        Nr   r2   rJ   rt   ru   r^   rK   r   r   r   on_training_step_batch_end   s   z#Callback.on_training_step_batch_endc                 C  rS   )a}  
        Called at the end of a training step, but note that when using gradient accumulation, this is only called
        when the optimizer is updated, and the iteration incremented, whereas on_training_step_start is called every time.
        Use on_training_step_batch_start and on_training_step_batch_end if you need callbacks that are called
        for every batch.
        Nr   rv   r   r   r   on_training_step_end   s   zCallback.on_training_step_enddataloader_valtorch.utils.data.DataLoaderc                 C  rM   r7   r   r2   rJ   ry   rK   r   r   r   on_validation_start   rb   zCallback.on_validation_startc                 C  rM   r7   r   rT   r   r   r   on_validation_step_start   rb   z!Callback.on_validation_step_startc                 C  rM   r7   r   rv   r   r   r   on_validation_step_end   rX   zCallback.on_validation_step_endc                 C  rM   r7   r   rN   r   r   r   on_validation_end   rP   zCallback.on_validation_endc                 C  rM   r7   r   )r2   rJ   r   r   r   on_load_checkpoint_start  rP   z!Callback.on_load_checkpoint_startcheckpoint_pathOptional[str]c                 C  rM   r7   r   )r2   rJ   rK   r   r   r   r   on_load_checkpoint_end  rb   zCallback.on_load_checkpoint_end
state_dict	dict[Any]c                 C  rS   )z
        Called when checkpoint loading is about to start, but after on_save_checkpoint_start().
        FIXME - why do we need this callback, can't we just use on_save_checkpoint_start()?
        Nr   r2   rJ   r   r   r   r   on_load_checkpoint	  s   zCallback.on_load_checkpointc                 C  rS   )zB
        Called when checkpoint saving is about to start.
        Nr   rN   r   r   r   on_save_checkpoint_start  s   z!Callback.on_save_checkpoint_startc                 C  rS   )a  
        Called when the synchronous part of checkpointing is finished, this function can be used
        along with on_save_checkpoint_start() to measure the exposed (synchronous) checkpoint time.
        Note that for asynchronous checkpoint, the checkpoint may still be ongoing, so this function
        does not mean the checkpoint is finished for the asynchronous case, use on_save_checkpoint_success()
        for that.
        Nr   rN   r   r   r   on_save_checkpoint_end  rX   zCallback.on_save_checkpoint_endelapsed_timefloatc                 C  rS   )a  
        Called when checkpoint saving is fully finished, and succeeded. Not called if checkpoint failed.
        For synchronous checkpoint, it is called at the same time as on_save_checkpoint_end(), but for asynchronous
        checkpoint, it is called after the asynchronous part has also finished. For checkpointers with out-of-process
        checkpointing, this function is called as soon as the notification is received from the checkpointer process,
        which may not be immediately after the checkpoint has completed but later on. Therefore, if you need to measure
        the full checkpoint duration for the asynchronous part, use the elapsed_time parameter, do not measure it directly
        as this would be a significant overestimate.
        Nr   )r2   rK   r   r   r   r   on_save_checkpoint_success   rV   z#Callback.on_save_checkpoint_successc                 C  rM   r7   r   r   r   r   r   on_save_checkpoint,  rP   zCallback.on_save_checkpointc                 C  rM   r7   r   rN   r   r   r   on_train_end/  rP   zCallback.on_train_endc                 C  rM   r7   r   rg   r   r   r   
on_app_end2  rP   zCallback.on_app_end)NN)r   rH   r   rI   r   rJ   r   rK   rL   r   r   rJ   r   rQ   rR   rK   rL   r   r   rK   rL   r   r   r\   r]   r^   r_   rK   rL   r   r   r\   r]   rK   rL   r   r   rB   r\   r]   rj   rk   rl   rm   rn   ro   rK   rL   r   r   
r\   r]   rj   rk   rl   rm   rK   rL   r   r   rJ   r   rt   rR   ru   rR   r^   r_   rK   rL   r   r   rJ   r   ry   rz   rK   rL   r   r   )rJ   r   r   r   )r   N)rJ   r   rK   rL   r   r   r   r   )rJ   r   r   r   r   r   )r   r   )rK   rL   r   r   r   r   )!rD   rE   rF   rG   r4   rO   rU   rW   rZ   r[   ra   rd   re   rf   rh   ri   rq   rs   rw   rx   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   i   sR    


	




r0   c                   @  s*   e Zd ZdZddd	d
Z	ddddZdS )EMAModelCallbackz2The callback class for tracking EMA model weights.r   rJ   r   rK   rL   r   r   c                 C  sH   |j jjrt|dsJ d|jjtjd|_d S t|dr"J dd S )Nemaz.EMA should be initialized from ImaginaireModeldtypez#There should be no EMA initialized.)r   r   enabledr8   totorchfloat32rN   r   r   r   rO   9  s   
zEMAModelCallback.on_train_startrt   rR   ru   r^   r_   c                 C  s    |j jjr|j|| d S d S r7   )r   r   r   Zupdate_averagerv   r   r   r   rx   B  s   
	z%EMAModelCallback.on_training_step_endNr   r   r   )rD   rE   rF   rG   rO   rx   r   r   r   r   r   6  s
    r   c                   @  s   e Zd ZdZejddd	d
Zej	ddddZej	dd ddZej	ddddZ	ejddddZ
ejddddZdS )!ProgressBarCallbackWThe callback class for visualizing the training/validation progress bar in the console.r   rJ   r   rK   rL   r   r   c                 C  s   t j| jjj|dd| _d S )NTraining)initialdesc)tqdmtranger   r   max_iter
train_pbarrN   r   r   r   rO   R  s   z"ProgressBarCallback.on_train_startrt   rR   ru   r^   r_   c                 C     | j   d S r7   )r   updaterv   r   r   r   rx   V     	z(ProgressBarCallback.on_training_step_endry   rz   c                 C  sX   | j jjd ur| j jj}nt|}|d ur|dks J d| tj|dddd| _d S )Nr   z)Invalid number of validation iterations: Z
Validating   F)r   positionleave)r   r   max_val_iterlenr   r   val_pbar)r2   rJ   ry   rK   num_iterr   r   r   r|   a  s
   z'ProgressBarCallback.on_validation_startc                 C  r   r7   )r   r   rv   r   r   r   r~   l  r   z*ProgressBarCallback.on_validation_step_endc                 C  r   r7   )r   closerN   r   r   r   r   w     z%ProgressBarCallback.on_validation_endc                 C  s   | j j  | j  d S r7   )r   checkpointerfinalizer   r   rN   r   r   r   r   {  s   z ProgressBarCallback.on_train_endNr   r   r   r   )rD   rE   rF   rG   r   
rank0_onlyrO   rx   r|   r~   r   r   r   r   r   r   r   O  s"    


r   c                   @  sH   e Zd ZdZejddd	d
ZejddddZej	ddddZdS )IterationLoggerCallbackr   r   rJ   r   rK   rL   r   r   c                 C  s   t   | _d| _d S )Nr   )timestart_iteration_timeelapsed_iteration_timerN   r   r   r   rO     s   

z&IterationLoggerCallback.on_train_startrQ   rR   c                 C  s   t   | _d S r7   )r   r   rT   r   r   r   rU     r   z.IterationLoggerCallback.on_training_step_startrt   ru   r^   r_   c              	   C  sl   |  j t | j 7  _ || jjj dkr4| j | jjj }td| d|dd| d d| _ d S d S )Nr   zIteration: z, average iter time: Z2fz, total loss 4f)	r   r   r   r   r   logging_iterr	   infoitem)r2   rJ   rt   ru   r^   rK   Zavg_timer   r   r   rx     s   
$
z,IterationLoggerCallback.on_training_step_endNr   r   r   r   )	rD   rE   rF   rG   r   r   rO   rU   rx   r   r   r   r   r     s    r   c                   @  sl   e Zd ZdZd'd(d	d
Z	d'd)ddZ	d'd*ddZ	d'd+ddZ	d'd*d d!Zd'd(d"d#Z	d'd(d$d%Z
d&S ),WandBCallbackaP  The callback class for logging to Weights and Biases (W&B).

    By default, WandBCallback logs the following training stats to W&B every config.trainer.logging_iter:
    - iteration: The current iteration number (useful for visualizing the training progress over time).
    - train/loss: The computed overall loss in the training batch.
    - optim/lr: The current learning rate.
    - timer/*: The averaged timing results of each code block recorded by trainer.training_timer.
    For validation, WandBCallback logs:
    - val/loss: The computed overall loss in the validation dataset.
    r   rJ   r   rK   rL   r   r   c                 C  s   t j| j|d d S )N)rJ   )r   Z
init_wandbr   rN   r   r   r   rO     s   zWandBCallback.on_train_startr\   r]   rj   rk   rl   rm   rn   ro   c                 C  sV   || j jj dkr't r)tjd| d i|d tjd| i|d d S d S d S )Nr   zoptim/lrstepzoptim/grad_scale)	r   r   r   r   is_rank0wandbr	   get_last_lr	get_scalerp   r   r   r   rq     s   z&WandBCallback.on_before_optimizer_steprt   rR   ru   r^   r_   c                 C  sz   || j jj dkr;| jj }t r3tjdd |	 D |d tjd|i|d tjd|i|d | jj
  d S d S )Nr   c                 S  r   )ztimer/r   )r   keyvaluer   r   r   r      r!   z6WandBCallback.on_training_step_end.<locals>.<dictcomp>r   z
train/lossrK   )r   r   r   training_timercompute_average_resultsr   r   r   r	   r.   reset)r2   rJ   rt   ru   r^   rK   Ztimer_resultsr   r   r   rx     s   z"WandBCallback.on_training_step_endry   rz   c                 C  s*   t g g tjdddtjdddd| _d S )Ng        cuda)devicer   )data_batchesZoutput_batchesr^   sample_size)dictr   tensor
_val_cacher{   r   r   r   r|     s   z!WandBCallback.on_validation_startc                 C  s6   t |}| jd  || 7  < | jd  |7  < d S )Nr^   r   )r
   get_data_batch_sizer   )r2   rJ   rt   ru   r^   rK   
batch_sizer   r   r   r~     s   
	z$WandBCallback.on_validation_step_endc                 C  s   t j| jd t jjd t j| jd t jjd | jd  | jd  }t r?t	d| d|d t
jd|i|d d S d S )	Nr^   )opr   zValidation loss (iteration z): r   zval/lossr   )dist
all_reducer   ReduceOpSUMr   r   r   r	   r   r   )r2   rJ   rK   r^   r   r   r   r     s   zWandBCallback.on_validation_endc                 C  s   t   d S r7   )r   finishrN   r   r   r   r     s   zWandBCallback.on_train_endNr   r   r   r   r   )rD   rE   rF   rG   rO   rq   rx   r|   r~   r   r   r   r   r   r   r     s    	
r   c                   @  sN   e Zd ZdZd!dd	Zd"d#ddZd"d$ddZ	
d"d$ddZ	
d"d%ddZd S )&LowPrecisionCallbackz2The callback class handling low precision trainingr   r   r   r   update_iterrL   c                 C  s
   || _ d S r7   )r   )r2   r   r   r   r   r   r   r4     s   
zLowPrecisionCallback.__init__r   rJ   r   rK   r   r   c                 C  sJ   |j tjkrtd tj| _n|j tjtj	tj
fv sJ d|j | _d S )Nz4Using fp32. We should disable master weights update.z4LowPrecisionCallback must use a low precision dtype.)	precisionr   r   r	   r/   sysmaxsizer   bfloat16float16halfprecision_typerN   r   r   r   rO     s   

z#LowPrecisionCallback.on_train_startrQ   rR   c                 C  B   |  D ]\}}t|tjrt|| r|j| jd||< qd S Nr   r.   r%   r   Tensoris_floating_pointr   r   r2   rJ   rQ   rK   kr   r   r   r   rU     s
   z+LowPrecisionCallback.on_training_step_startc                 C  r   r   r   r   r   r   r   r}   	  s
   z-LowPrecisionCallback.on_validation_step_startr\   r]   rj   rk   rl   rm   c                 C  s   || j  dkrDt|ddrFg g }}t|j|jD ]"\}}t|d |d D ]\}	}
|t|	j |t|
j q&qt	|| d S d S d S )Nr   master_weightsFparams)
r   r9   zipparam_groupsZparam_groups_masterr1   r   rQ   r   _foreach_copy_)r2   r\   rj   rl   rK   r   Zmaster_paramsgroupZgroup_masterpZp_masterr   r   r   rs     s   
z(LowPrecisionCallback.on_before_zero_gradN)r   r   r   r   r   rL   r   r   r   r   )	rD   rE   rF   rG   r4   rO   rU   r}   rs   r   r   r   r   r     s    
r   c                      s   e Zd ZdZ			d+d, fd
dZd-d.ddZd-d.ddZ	d-d/ddZd-d0ddZ	d-d1d#d$Z		d-d2d%d&Z
d-d.d'd(Zd-d.d)d*Z  ZS )3NVTXCallbackz%The callback for creating NVTX rangesFNsynchronizeboolr   rH   r   rI   c                   s   t  || || _d S r7   )superr4   r   )r2   r   r   r   	__class__r   r   r4   $  s   
zNVTXCallback.__init__r   rK   rL   r   r   c                 C  "   | j rtj   tjjd d S )Nforwardr   r   r   nvtx
range_pushrY   r   r   r   rZ   -  s   
zNVTXCallback.on_before_forwardc                 C      | j rtj   tjj  d S r7   r   r   r   r   	range_poprY   r   r   r   r[   2     
zNVTXCallback.on_after_forwardr\   r]   r^   r_   c                 C  r   )Nbackwardr   r`   r   r   r   ra   7  s   
zNVTXCallback.on_before_backwardc                 C  r   r7   r   rc   r   r   r   rd   >  r  zNVTXCallback.on_after_backwardrj   rk   rl   rm   rn   ro   c                 C  r   )Noptimizer_stepr   rp   r   r   r   rq   C  s   
z%NVTXCallback.on_before_optimizer_stepc                 C  r   r7   r   rr   r   r   r   rs   O  s   
z NVTXCallback.on_before_zero_gradc                 C  s   t jjd d S )Ndataloading)r   r   r   r   rY   r   r   r   re   Z  s   z"NVTXCallback.on_before_dataloadingc                 C  s   t jj  d S r7   )r   r   r   r  rY   r   r   r   rf   ]  s   z!NVTXCallback.on_after_dataloading)FNN)r   r   r   rH   r   rI   r   r   r   r   r   r   )rD   rE   rF   rG   r4   rZ   r[   ra   rd   rq   rs   re   rf   __classcell__r   r   r   r   r   !  s"    	r   )+
__future__r   r   r   r*   typingr   r   r   r   r'   r   torch.distributedr   r   torch.utils.datar   r   +cosmos_predict2._src.imaginaire.lazy_configr   %cosmos_predict2._src.imaginaire.utilsr	   r
   r   *cosmos_predict2._src.imaginaire.utils.miscr   megatron.corer   ImportError&cosmos_predict2._src.imaginaire.configr   %cosmos_predict2._src.imaginaire.modelr   'cosmos_predict2._src.imaginaire.trainerr   r   r0   r   r   r   r   r   r   r   r   r   r   <module>   s@   ; N2 Q/