o
    ?߱iA                     @  s   d dl mZ d dlmZmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlZz
d dlmZ dZW n ey;   dZY nw d dlmZmZ erLd d	lmZ G d
d dZd!d"ddZG dd dejjZG dd deZed#d$ddZed%dd ZdS )&    )annotations)contextmanagernullcontext)TYPE_CHECKINGAny	GeneratorListOptionalUnionN)parallel_stateTF)distributedlog)ImaginaireModelc                   @  sD   e Zd ZdZdd ZddddZdddZddddZdddZdS ) FastEmaModelUpdatera  
    This class is used to update target model~(EMA) given source model~(regular model) and beta.
    The method interaface mimic :class:`EMAModelTracker` and :class:`PowerEMATracker`.
    Different from two classes, this class does not maintain the EMA model weights as buffers. It expects the user to have two module with same architecture and weights shape.
    The class is proposed to work with FSDP model where above two classes are not working as expected. Besides, it is strange to claim model weights as buffers and do unnecessary name changing in :class:`EMAModelTracker` and :class:`PowerEMATracker`. Moeving forward, we should use this class instead of above two classes.
    c                 C  s
   d| _ d S )NF)	is_cached)self r   S/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/imaginaire/utils/ema.py__init__-   s   
zFastEmaModelUpdater.__init__H.?	src_modeltorch.nn.Module	tgt_modelbetafloatreturnNonec                 C  s|   g }g }t | | D ]\}}|jtjks J d|j d|| ||j qt|| tj||d| d d S Nz(EMA model only works in FP32 dtype, got z	 instead.      ?)alpha)	zip
parametersdtypetorchfloat32appenddata_foreach_mul__foreach_add_)r   r   r   r   target_listsource_list
tgt_params
src_paramsr   r   r   update_average1   s   
z"FastEmaModelUpdater.update_averagec                 C  s.   t | | D ]\}}|j|j q	d S N)r    r!   r&   copy_)r   r   r   r+   r,   r   r   r   copy_to=   s   zFastEmaModelUpdater.copy_toFr!   r   is_cpuboolc                   <   | j du s	J d|rdnd  fdd|D | _d| _ dS )	Save the current parameters for restoring later.

        Args:
            parameters (iterable): Iterable of torch.nn.Parameter to be temporarily stored.
        F9EMA cache is already taken. Did you forget to restore it?cpucudac                      g | ]	}|   qS r   cloneto.0paramdevicer   r   
<listcomp>I       z-FastEmaModelUpdater.cache.<locals>.<listcomp>TNr   collected_paramsr   r!   r1   r   r?   r   cacheA      
zFastEmaModelUpdater.cachec                 C  N   | j sJ dt| j|ddD ]\}}|j|j|j qg | _d| _ dS a  Restore the parameters in self.collected_params.

        Useful to validate the model with EMA parameters without affecting the
        original optimization process. Store the parameters before copy_to().
        After validation (or model saving), use this to restore the former parameters.

        Args:
            parameters (iterable): Iterable of torch.nn.Parameter to be updated with the stored parameters.
        zEMA cache is not taken yet.F)strictNr   r    rD   r&   r/   type_asr   r!   Zc_paramr>   r   r   r   restoreL   
   

zFastEmaModelUpdater.restoreN)r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   Fr!   r   r1   r2   r   r   r!   r   r   r   )	__name__
__module____qualname____doc__r   r-   r0   rF   rN   r   r   r   r   r   %   s    
r   
param_namestrtorch_compile_buffer_renamingr2   r   c                 C  s    |  dd}|r| dd}|S )z
    This function creates buffer name used by EMA from parameter's name

    Args:
        param_name (str): Model's parameter name
    Returns:
        buffer_name (str): buffer name to be used for given parameter name
    .-z
_orig_mod- )replace)rW   rY   buffer_namer   r   r   get_buffer_name^   s   
r_   c                      sj   e Zd ZdZd'd( fd
dZe d)d*ddZd+ddZd,d-ddZ	d.ddZ
e	d/d0d%d&Z  ZS )1EMAModelTrackera  This is a class to track the EMA model weights.

    The EMA weights are registered as buffers, which are extractable as state dicts. The names follow those of the
    regular weights, except all "." are replaced with "-" (limitation of register_buffer()). This is similar to SDXL's
    implementation of EMA. There are no optimizable parameters.
    TODO(snah): multi-EMA weights.

    Attributes:
        collected_params (list): temporarily stores the regular weights while in EMA mode.
        beta (float): EMA decay rate. (default: 0.9999).
        torch_compile_buffer_renaming (bool): whether to remove '_orig_mod-' from buffer names when torch.compile is used
    r   Fmodelr   r   r   rY   r2   c                   s   t    || _d|  krdkstd td|| _| D ]\}}|jr9t|| j}| ||	 
 j q!g | _d| _dS )zConstructor of the EMA model weight tracker.

        Args:
            model (ImaginaireModel): The PyTorch model.
            beta (float): EMA decay rate. (default: 0.9999).
                r   zDecay must be between 0 and 1FN)superr   rY   
ValueErrorr   named_parametersrequires_gradr_   register_bufferr:   detachr&   rD   r   )r   ra   r   rY   namer>   r^   	__class__r   r   r      s   

zEMAModelTracker.__init__N	iterationOptional[int]r   r   c           
      C  s   ~g }g }|   }| D ]+\}}|jr8t|| j}|| }	|	jtjks-J d|	j d||	 ||j	 qt
|| j tj||d| j d d S r   )
state_dictre   rf   r_   rY   r"   r#   r$   r%   r&   r'   r   r(   )
r   ra   rl   r)   r*   ema_buffersri   r>   r^   bufferr   r   r   r-      s   
zEMAModelTracker.update_averagec                 C  sF   |   }| D ]\}}|jr t|| j}|| }|j|j qd S r.   )rn   re   rf   r_   rY   r&   r/   )r   ra   ro   ri   r>   r^   rp   r   r   r   r0      s   zEMAModelTracker.copy_tor!   r   r1   c                   r3   )	r4   Fr5   r6   r7   c                   r8   r   r9   r<   r?   r   r   rA      rB   z)EMAModelTracker.cache.<locals>.<listcomp>TNrC   rE   r   r?   r   rF      rG   zEMAModelTracker.cachec                 C  rH   rI   rK   rM   r   r   r   rN      rO   zEMAModelTracker.restore   Tr   rateUnion[float, List[float]]numintenabledOptional[EMAModelTracker]c                 C  s   |sdS t r t r tjdd}tjd| dd td nt }tjd| dd t	|t
r5|n|g}t|t|}||k rG|| n|d	 }||k rYtd
| d|  | ||S )a=  
        Class method to initialize per rank EMA Model Tracker with different rate.
        Each rank will have a different rate based on the given configuration, resulting in different EMA weights.

        Args:
            model (torch.nn.Module): The neural network model to be tracked.
            rate (Union[float, List[float]]): The decay rate(s) for the EMA. If a list is provided,
                                              it corresponds to rates for different ranks.
            num (int, optional): The number of leading ranks to consider for different rates.
                                 Defaults to 1.
            enabled (bool, optional): Flag to enable or disable the creation of the tracker.
                                      If False, returns None. Defaults to True.

        Returns:
            Optional[EMAModelTracker]: An instance of EMAModelTracker if enabled, otherwise None.

        Example:
            >>> model = torch.nn.Linear(10, 2)
            >>> tracker = EMAModelTracker.initialize_ema_from_settings(model, rate=[0.1, 0.2], num=2)
            >>> print(tracker)

        Notes:
            If `rate` is a list and the current rank is less than `num`, the rate for the current rank
            is used. If the current rank exceeds `num`, the first rate in the list is used by default.
        NTwith_context_parallel<using MCore parallel_state for EMA initialization. DP RANK: F
rank0_only&It should not used together with FSDP!9using torch.distributed for EMA initialization. DP RANK: r   zEMAModelTracker: rank , rate )USE_MEGATRONr   is_initializedget_data_parallel_rankr   criticalwarningr   get_rank
isinstancelistminlenprint)clsra   rr   rt   rv   cur_dp_rankr   r   r   initialize_multi_rank_ema   s   
z)EMAModelTracker.initialize_multi_rank_ema)r   F)ra   r   r   r   rY   r2   r.   ra   r   rl   rm   r   r   )ra   r   r   r   rP   rQ   rR   )rq   T)
ra   r   rr   rs   rt   ru   rv   r2   r   rw   )rS   rT   rU   rV   r   r#   no_gradr-   r0   rF   rN   classmethodr   __classcell__r   r   rj   r   r`   q   s    

r`   c                      sJ   e Zd Zdd fd	d
Ze dd fddZe	dd ddZ  Z	S )!PowerEMATracker皙?Fra   r   sr   rY   r2   c                   s@   t  j|d|d tddd|d  d|d  gj | _dS )	a0  Constructor of the EMA model weight tracker.

        Args:
            model (ImaginaireModel): The PyTorch model.
            s (float): EMA decay rate. See EDM2 paper
            torch_compile_buffer_renaming (bool): whether to remove '_orig_mod-' from buffer names when torch.compile is used
        rb   )ra   r   rY   rq            N)rc   r   nprootsrealmaxexp)r   ra   r   rY   rj   r   r   r      s   .zPowerEMATracker.__init__Nrl   rm   r   r   c                   sD   |dkrd}n|d }dd|  | j d  }|| _t || d S )Nr   rb   rq   )r   r   rc   r-   )r   ra   rl   r   irj   r   r   r-     s   zPowerEMATracker.update_averageTr   rr   rt   ru   rv   Optional[PowerEMATracker]c                 C  s   |sdS t r t r tjdd}tjd| dd td nt }tjd| dd ||k r6d	| nd
}||k rHt	d| d||   | ||| S )a  
        Class method to initialize per rank EMA Model Tracker with different rate.
        Each rank will have a different rate based on the given configuration, resulting in different EMA weights.

        Args:
            model (torch.nn.Module): The neural network model for which the EMA tracker is being set up.
            num (int): The number of ranks for which the rate adjustment is applied. Beyond this, the rate remains unchanged.
            rate (float): The base decay rate for the EMA calculation.
            enabled (bool, optional): Flag to enable or disable the initialization of the tracker. If False, returns None.
                                      Defaults to True.

        Returns:
            Optional[PowerEMATracker]: An instance of PowerEMATracker with adjusted rate if enabled, otherwise None.

        Raises:
            None

        Example:
            >>> model = torch.nn.Linear(10, 2)
            >>> tracker = PowerEMATracker.initialize_multi_rank_ema(model, num=3, rate=0.99)
            >>> print(tracker)

        Notes:
            The decay rate is modified by dividing it by 2 raised to the power of the rank for each rank less than `num`.
            If the rank is greater than or equal to `num`, the base rate is used without modification. This approach
            allows higher ranked processes to have a less aggressive decay, potentially reflecting their delayed synchronization
            in a distributed training scenario.
        NTrx   rz   Fr{   r}   r~      rq   zPowerEMATracker: rank r   )
r   r   r   r   r   r   r   r   r   r   )r   ra   rr   rt   rv   r   dividerr   r   r   r     s    z)PowerEMATracker.initialize_multi_rank_ema)r   F)ra   r   r   r   rY   r2   r.   r   )T)
ra   r   rr   r   rt   ru   rv   r2   r   r   )
rS   rT   rU   r   r#   r   r-   r   r   r   r   r   rj   r   r      s    
r   ra   r   rv   context
str | NoneGenerator[None, None, None]c                 #  sD     fdd}|  dV  W d   dS 1 sw   Y  dS )a  Context manager for switching between regular and EMA model weights.

    This function is a dispatcher that handles two main cases:
    1.  If the model has its own `ema_scope` method, it will be used.
        This allows models to define custom EMA logic (e.g., for FSDP).
    2.  If not, it falls back to a generic mechanism that expects the model
        to have a `.ema` attribute containing an EMA tracker object.

    Args:
        model (ImaginaireModel): The PyTorch model.
        enabled (bool): Whether switching to EMA weights is enabled (default: False).
        context (str | None): A logging context string, passed to the model's ema_scope if used.
    c                    s^   r,t dotj} t dotjtttf}| s |s J | r(j dS tS t	 S )N	ema_scopeema)r   )
hasattrcallabler   r   r   r   r`   r   ema_scope_genericr   )Zhas_custom_scopeZhas_generic_emar   rv   ra   r   r   scope_functionL  s   z!ema_scope.<locals>.scope_functionNr   )ra   rv   r   r   r   r   r   r   <  s
   "r   c              
   c  sl    | j |   | j |  td zdV  W | j |   td dS | j |   td w )zGeneric context manager for switching between regular and EMA model weights.

    Args:
        model (ImaginaireModel): The PyTorch model, which must have a `.ema` attribute.
    zEMA: switched to EMA weights.NzEMA: restored regular weights.)r   rF   r!   r0   r   inforN   )ra   r   r   r   r   _  s   
r   rP   )rW   rX   rY   r2   r   rX   )FN)ra   r   rv   r2   r   r   r   r   )ra   r   r   r   ) 
__future__r   
contextlibr   r   typingr   r   r   r   r	   r
   numpyr   r#   megatron.corer   r   ImportError%cosmos_predict2._src.imaginaire.utilsr   r   %cosmos_predict2._src.imaginaire.modelr   r   r_   nnModuler`   r   r   r   r   r   r   r   <module>   s.    9 G"