import collections
import functools
import itertools
import math
from copy import deepcopy
from typing import Any, Dict, List

import torch
import torch.nn as nn
from torch.distributed.checkpoint.state_dict import (
    StateDictOptions,
    get_optimizer_state_dict,
    set_optimizer_state_dict,
)
from torch.distributed.checkpoint.stateful import Stateful
from torch.optim.lr_scheduler import LambdaLR

from cosmos_policy._src.imaginaire.utils import log
from cosmos_policy._src.reason1.configs.default.model_config import FSDP2ModelConfig
from cosmos_policy._src.reason1.utils.fused_adam import FusedAdam

def _optimizer_cls(params: List[nn.Parameter], optimizer_kwargs: Dict[str, Any], name: str):
    if name == "Adam":
        optimizer = torch.optim.Adam(params, **optimizer_kwargs)
    elif name == "AdamW":
        optimizer = torch.optim.AdamW(params, **optimizer_kwargs)
    elif name == "FusedAdam":
        optimizer = FusedAdam(
            params,
            lr=optimizer_kwargs["lr"],
            weight_decay=optimizer_kwargs["weight_decay"],
            betas=optimizer_kwargs["betas"],
            capturable=True,
            master_weights=True,
        )
    else:
        raise NotImplementedError(f"Optimizer {name} not added.")
    return optimizer

class OptimizersContainer(Stateful):
    """Util for calling step/zero_grad on multiple optimizers needed for virtual pipeline stages
    and saving/loading optimizer state_dict at checkpoint.
    """

    def __init__(
        self,
        model_parts: List[nn.Module],
        optimizer_kwargs: Dict[str, Any],
        name: str,
        lr_multiplier: List[float],
        model_part_names: List[str],
    ) -> None:
        assert len(lr_multiplier) == len(model_parts), "lr_multiplier must have the same length as model_parts"
        self.model_parts = model_parts
        self.optimizers = [[] for _ in self.model_parts]
        self.model_part_names = model_part_names
        for model_id, model in enumerate(self.model_parts):
            optimizer_kwargs_copy = deepcopy(optimizer_kwargs)
            optimizer_kwargs_copy["lr"] *= lr_multiplier[model_id]
            if optimizer_kwargs_copy["fused"]:
                # Group trainable parameters by device mesh so each fused optimizer
                # only sees parameters that live on the same mesh.
                parameters_by_mesh = collections.defaultdict(list)
                for p in model.parameters():
                    if p.requires_grad:
                        device_mesh = p.device_mesh if hasattr(p, "device_mesh") else "default"
                        parameters_by_mesh[device_mesh].append(p)
                for params in parameters_by_mesh.values():
                    optimizer = _optimizer_cls(params, optimizer_kwargs_copy, name)
                    self.optimizers[model_id].append(optimizer)
            else:
                for p in model.parameters():
                    if p.requires_grad:
                        optimizer = _optimizer_cls([p], optimizer_kwargs_copy, name)
                        self.optimizers[model_id].append(optimizer)

    def __iter__(self) -> torch.optim.Optimizer:
        return iter(itertools.chain(*self.optimizers))

    def step(self) -> None:
        for optimizer in itertools.chain(*self.optimizers):
            optimizer.step()

    def zero_grad(self, set_to_none: bool = False) -> None:
        for optimizer in itertools.chain(*self.optimizers):
            optimizer.zero_grad(set_to_none=set_to_none)

    def state_dict(self) -> Dict[str, Any]:
        sd = {}
        for model, optimizers in zip(self.model_parts, self.optimizers):
            sd.update(
                get_optimizer_state_dict(
                    model=model,
                    optimizers=optimizers,
                    options=StateDictOptions(flatten_optimizer_state_dict=True),
                )
            )
        return sd

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        for model, optimizers in zip(self.model_parts, self.optimizers):
            set_optimizer_state_dict(
                model=model,
                optimizers=optimizers,
                optim_state_dict=state_dict,
                options=StateDictOptions(flatten_optimizer_state_dict=True),
            )

class OptimizersInBackwardContainer(OptimizersContainer):
    """Optimizers in backward to skip .step() and .zero_grad()"""

    def __init__(
        self,
        model_parts: List[nn.Module],
        optimizer_kwargs: Dict[str, Any],
        name: str,
        lr_multiplier: List[float] = [1.0, 1.0, 1.0],
        model_part_names: List[str] = [],
    ) -> None:
        self.model_parts = model_parts
        self.optimizers = [None for _ in self.model_parts]
        self.model_part_names = model_part_names
        optim_dict = {}
        for model_id, model in enumerate(self.model_parts):
            optimizer_kwargs_copy = deepcopy(optimizer_kwargs)
            optimizer_kwargs_copy["lr"] *= lr_multiplier[model_id]
            for param in model.parameters():
                optim_dict[param] = _optimizer_cls([param], optimizer_kwargs_copy, name)

        def optim_hook(param) -> None:
            # Fires once the gradient for `param` has been fully accumulated.
            optim_dict[param].step()
            optim_dict[param].zero_grad()

        for model_id, model in enumerate(self.model_parts):
            for param in model.parameters():
                if param.requires_grad:
                    param.register_post_accumulate_grad_hook(optim_hook)
            self.optimizers[model_id] = [optim_dict[param] for param in model.parameters()]

    def step(self) -> None:
        pass

    def zero_grad(self) -> None:
        pass


def build_optimizers(
    model_parts: List[nn.Module],
    job_config: FSDP2ModelConfig,
    lr_multiplier: List[float],
    model_part_names: List[str],
) -> OptimizersContainer:
    """Wrap one optimizer per model part in an OptimizersContainer which provides a single
    step() and zero_grad() method for all the child optimizers.
    """
    assert (
        len(model_parts) == len(lr_multiplier) == len(model_part_names)
    ), "lr_multiplier and model_part_names must have the same length as model_parts"
    optim_in_bwd = job_config.optimizer.early_step_in_backward
    if optim_in_bwd and job_config.experimental.pipeline_parallel_degree > 1:
        raise NotImplementedError("Optimizers in backward is not supported with pipeline parallelism.")
    name = job_config.optimizer.name
    lr = job_config.optimizer.lr
    fused = job_config.optimizer.fused
    optimizer_kwargs = {
        "lr": lr,
        "betas": (0.9, 0.95),
        "weight_decay": 0.1,
        "fused": fused,
        "foreach": not fused,
    }
    if not optim_in_bwd:
        return OptimizersContainer(model_parts, optimizer_kwargs, name, lr_multiplier, model_part_names)
    return OptimizersInBackwardContainer(model_parts, optimizer_kwargs, name, lr_multiplier, model_part_names)
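
# Illustrative sketch only (hypothetical helper, not used elsewhere in this module):
# the container returned by build_optimizers behaves like a single optimizer
# (step/zero_grad) and, through the Stateful protocol above, exposes a flattened
# optimizer state_dict suitable for distributed checkpointing.
def _example_step_and_checkpoint(optimizers: OptimizersContainer, loss: torch.Tensor) -> Dict[str, Any]:
    """Run one optimizer update and round-trip the optimizer state."""
    loss.backward()
    optimizers.step()  # steps every child optimizer; no-op for optimizers-in-backward
    optimizers.zero_grad()
    state = optimizers.state_dict()  # flattened via get_optimizer_state_dict(...)
    optimizers.load_state_dict(state)  # restoring is symmetric
    return state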

class SchedulersContainer(Stateful):
    """Util for calling step on multiple learning rate schedulers needed for virtual pipeline stages"""

    def __init__(self, optimizers: OptimizersContainer, lr_lambda) -> None:
        self.schedulers = []
        for optimizer in optimizers:
            self.schedulers.append(LambdaLR(optimizer, lr_lambda=lr_lambda))

    def step(self) -> None:
        for scheduler in self.schedulers:
            scheduler.step()

    def state_dict(self) -> Dict[str, Any]:
        assert len(self.schedulers) > 0, "Must have at least one scheduler to save state_dict"
        return self.schedulers[0].state_dict()

    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
        last_epoch = state_dict["last_epoch"]
        step_count = state_dict["_step_count"]
        log.info(f"Resuming schedulers by stepping them to last_epoch: {last_epoch}; _step_count: {step_count}")
        for idx, scheduler in enumerate(self.schedulers):
            for _ in range(step_count):
                scheduler.step()
            log.info(f"Scheduler {idx + 1}/{len(self.schedulers)} stepped {step_count} times.")
            log.info(f"Updated learning rate: {scheduler.get_last_lr()}")

    def get_last_lr(self) -> List[float]:
        return [scheduler.get_last_lr() for scheduler in self.schedulers]

def linear_warmup_linear_decay(warmup_steps: int, decay_steps: int, current_step: int) -> float:
    """Computes linear warmup followed by linear decay.
    Per LambdaLR requirement, this is accomplished by returning
    a multiplicative factor to adjust the learning rate to
    create the desired schedule.
    """
    if current_step < warmup_steps:
        # linear warmup; current_step is 0-indexed, hence the +1 adjustments
        current_step += 1
        curr_adjustment = float(current_step / (warmup_steps + 1))
    else:
        # linear decay
        normalized_step = decay_steps - (current_step - warmup_steps)
        curr_adjustment = 1 - (decay_steps - normalized_step) / decay_steps
    return curr_adjustment


def linear_warmup(warmup_steps: int, current_step: int) -> float:
    """Computes linear warmup only
    Per LambdaLR requirement, this is accomplished by returning
    a multiplicative factor to adjust the learning rate to
    create the desired schedule.
    """
    if current_step < warmup_steps:
        current_step += 1
        curr_adjustment = float(current_step / (warmup_steps + 1))
    else:
        curr_adjustment = 1.0
    return curr_adjustment


def linear_warmup_cosine_cooldown(
    warmup_steps: int,
    cooldown_steps: int,
    current_step: int,
    base_lr: float,
    init_lr: float,
    end_lr: float,
) -> float:
    """This scheduler will warmup the learning rate from init_lr to base_lr for warmup_steps,
    then decay the learning rate from base_lr to end_lr for cooldown_steps. After cooldown_steps + warmup_steps,
    the learning rate will be set to end_lr.
    Per LambdaLR requirement, this is accomplished by returning
    a multiplicative factor to adjust the learning rate to
    create the desired schedule.

    Args:
        warmup_steps (int): The number of steps to warmup the learning rate.
        cooldown_steps (int): The number of steps to decay the learning rate.
        current_step (int): The current step.
        base_lr (float): The base learning rate.
        init_lr (float): The initial learning rate before warmup.
        end_lr (float): The final learning rate after cooldown.

    Returns:
        float: The multiplicative factor to adjust the learning rate.
    """
    total_steps = warmup_steps + cooldown_steps
    init_multiplier = init_lr / base_lr
    end_multiplier = end_lr / base_lr
    if current_step < warmup_steps:
        # linear warmup from init_lr to base_lr
        progress = float(current_step / warmup_steps)
        return init_multiplier + (1.0 - init_multiplier) * progress
    elif current_step <= total_steps:
        # cosine cooldown from base_lr to end_lr
        progress = float((current_step - warmup_steps) / cooldown_steps)
        return end_multiplier + (1.0 - end_multiplier) * 0.5 * (1 + math.cos(math.pi * progress))
    return end_multiplier

def build_lr_schedulers(optimizers: OptimizersContainer, job_config: FSDP2ModelConfig) -> SchedulersContainer:
    warmup_steps = int(job_config.training.warmup_steps)
    decay_steps = float(max(1, job_config.training.steps - warmup_steps))
    if job_config.training.use_cosine_decay:
        lr_lambda = functools.partial(
            linear_warmup_cosine_cooldown,
            warmup_steps,
            decay_steps,
            base_lr=job_config.training.base_lr,
            init_lr=job_config.training.init_lr,
            end_lr=job_config.training.end_lr,
        )
    elif job_config.training.use_linear_decay:
        lr_lambda = functools.partial(linear_warmup_linear_decay, warmup_steps, decay_steps)
    else:
        lr_lambda = functools.partial(linear_warmup, warmup_steps)
    return SchedulersContainer(optimizers, lr_lambda)