o
    ?߱i(                     @  s   d dl mZ d dlZd dlmZ d dlmZmZ d dlZd dl	m
Z d dlZd dlZd dlmZ d dlm
Z
mZmZmZ d dlmZ d dlmZ eG d	d
 d
ZG dd deZdS )    )annotationsN)	dataclass)AnyTuple)ImaginaireModel)distributedlogmisc
wandb_util)Callback)easy_ioc                   @  sF   e Zd ZU dZded< dZded< dZded< dd	d
ZdddZdS )_LossRecordr   floatlossint
iter_countedm_lossreturnNonec                 C  s   d| _ d| _d| _d S )Nr   )r   r   r   )self r   [/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/predict2/callbacks/wandb_log.pyreset'   s   
z_LossRecord.resetTuple[float, float]c                 C  sp   | j dkr,| j| j  }| j| j  }tj|tjjd tj|tjjd | }| }nd}d}|   ||fS )Nr   op)	r   r   r   dist
all_reduceReduceOpAVGitemr   )r   Zavg_lossZavg_edm_lossr   r   r   get_stat,   s   

z_LossRecord.get_statN)r   r   )r   r   )	__name__
__module____qualname__r   __annotations__r   r   r   r!   r   r   r   r   r   !   s   
 
r   c                      s   e Zd ZdZ			d.d/ fddZejd0d1ddZ	d0d2ddZ	d0d3d"d#Z		d0d4d&d'Z
	d0d3d(d)Zd0d1d*d+Zd0d1d,d-Z  ZS )5WandbCallbackz
    This callback is used to log the loss, average loss over logging_iter_multipler, and unstable counts of image and video to wandb.
       Flogging_iter_multiplerr   save_logging_iter_multiplersave_s3boolr   r   c                   s   t    t | _t | _t | _tjddd| _tjddd| _	|| _
|| _| j
dks0J d|| _|dkr<d| nd| _d| j | _d S )	Nr'   cudadevicer   z/logging_iter_multipler should be greater than 0@ Zwandb_loss_log)super__init__r   train_image_logtrain_video_logfinal_loss_logtorchzerosimg_unstable_countvideo_unstable_countr(   r)   r*   wandb_extra_tagname)r   r(   r)   r*   	__class__r   r   r2   @   s   
zWandbCallback.__init__r   modelr   	iterationc                 C  s   t j| j|d | j}|jj}tj| dr3t	| d}t
jr3t
jjjdd | D dd tj|jj drXdtjv rZt|jj dtjtjd d	 d S d S d S )
N)r>   z/job_env.yamlc                 S     i | ]
\}}d | |qS )z	JOB_INFO/r   ).0kvr   r   r   
<dictcomp>^       z0WandbCallback.on_train_start.<locals>.<dictcomp>T)allow_val_changez/config.yamlZSLURM_LOG_DIRzconfig.yaml)r
   
init_wandbconfigjob
path_localospathexistsr   loadwandbrunupdateitemsenvironcopyfilejoin)r   r>   r?   rH   Zjob_local_pathjob_infor   r   r   on_train_startU   s     zWandbCallback.on_train_start	model_ddp#distributed.DistributedDataParallel	optimizertorch.optim.Optimizer	scheduler$torch.optim.lr_scheduler.LRSchedulergrad_scalertorch.amp.GradScalerc           	      C  s   || j jj dkr=t r?i }t| jd||d< t|jD ]\}}|d |d| < |d |d| < qtj	||d d S d S d S )Nr   sample_counterlrz	optim/lr_weight_decayzoptim/weight_decay_step)
rH   trainerlogging_iterr   is_rank0getattr	enumerateparam_groupsrO   r   )	r   rX   rZ   r\   r^   r?   infoiparam_groupr   r   r   on_before_optimizer_stepf   s   z&WandbCallback.on_before_optimizer_step
data_batchdict[str, torch.Tensor]output_batchr   torch.Tensorc                 C  s  d}t |st |r!d}tjd| d| d|| dd |s||rK| j j| 	 7  _| j j
d7  _
| j j|d  	 7  _n"| j j| 	 7  _| j j
d7  _
| j j|d  	 7  _| j j| 	 7  _| j j
d7  _
| j j|d  	 7  _n||r|  jd7  _n|  jd7  _|| jjj| j  d	kr| jdkri }n| jj }| j \}}	| j \}
}| j \}}tj| jtjjd
 tj| jtjjd
 t rodd | D }|d| j d|d| j d|	d| j d|
d| j d|d| j d|d| j d|d| j d| j  d| j d| j  d|dt!| jd|i
 | j"re|| jjj| j | j#  d	kret$%|d| j& d|dd t'rot'j||d | jdkr{| jj(  | j)  | j)  d S d S )NFTzUnstable loss z at iteration z with is_image_batch: )
rank0_onlyr'   r   r   r   c                 S  r@   )ztimer/r   )rA   keyvaluer   r   r   rD      rE   z6WandbCallback.on_training_step_end.<locals>.<dictcomp>trainz/image_lossz/image_edm_lossz/video_lossz/video_edm_lossz/lossz	/edm_lossz/img_unstable_countz/video_unstable_countr?   r`   zs3://rundir/z/Train_Iter09dz.jsonrc   )*r6   isnanisinfr   criticalis_image_batchr3   r   detachr   r   r   r4   r5   r8   r9   rH   re   rf   r(   training_timercompute_average_resultsr!   r   r   r   SUMr   rg   rR   rQ   r:   r    rh   r*   r)   r   dumpr;   rO   r   zero_)r   r>   ro   rq   r   r?   Z skip_update_due_to_unstable_losstimer_resultsZavg_image_lossZavg_image_edm_lossZavg_video_lossZavg_video_edm_lossZavg_final_lossZavg_final_edm_lossrk   r   r   r   on_training_step_endx   s   




z"WandbCallback.on_training_step_enddataloader_valtorch.utils.data.DataLoaderc                 C  s*   t g g tjdddtjdddd| _d S )Ng        r,   r-   r   )data_batchesoutput_batchesr   sample_size)dictr6   tensor
_val_cache)r   r>   r   r?   r   r   r   on_validation_start   s   z!WandbCallback.on_validation_startc                 C  s6   t |}| jd  || 7  < | jd  |7  < d S )Nr   r   )r	   get_data_batch_sizer   )r   r>   ro   rq   r   r?   
batch_sizer   r   r   on_validation_step_end   s   
	z$WandbCallback.on_validation_step_endc                 C  s   t j| jd t jjd t j| jd t jjd | jd  | jd  }t r>t	d| d|  t
jd|i|d d S d S )Nr   r   r   zValidation loss (iteration z): zval/lossrc   )r   r   r   r   r   r    r   rg   r   rk   rO   )r   r>   r?   r   r   r   r   on_validation_end   s   zWandbCallback.on_validation_endc                 C  s   t   d S )N)rO   finish)r   r>   r?   r   r   r   on_train_end   s   zWandbCallback.on_train_end)r'   r'   F)r(   r   r)   r   r*   r+   r   r   )r   )r>   r   r?   r   r   r   )rX   rY   rZ   r[   r\   r]   r^   r_   r?   r   r   r   )r>   r   ro   rp   rq   rp   r   rr   r?   r   r   r   )r>   r   r   r   r?   r   r   r   )r"   r#   r$   __doc__r2   r   rs   rW   rn   r   r   r   r   r   __classcell__r   r   r<   r   r&   ;   s$    X
r&   )
__future__r   rK   dataclassesr   typingr   r   r6   torch.distributedr   r   torch.utils.datarO   %cosmos_predict2._src.imaginaire.modelr   %cosmos_predict2._src.imaginaire.utilsr   r	   r
   .cosmos_predict2._src.imaginaire.utils.callbackr   -cosmos_predict2._src.imaginaire.utils.easy_ior   r   r&   r   r   r   r   <module>   s   