o
    viG                     @  s  d Z ddlmZ ddlZddlZddlZddlmZmZm	Z	m
Z
mZmZ ddlZddlZddlZddlZddlmZ z
ddlmZ dZW n eyS   dZed	 Y nw dd
lmZ ddlmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ edZ%dGddZ&dHddZ'dIdJddZ(dKdLd!d"Z)e'ej*dd#G d$d% d%Z+e'ej*dd#G d&d' d'Z,e'ej*dd#G d(d) d)Z-e'ej*dd#G d*d+ d+Z.e'ej*dd#G d,d- d-Z/e'ej*dd#G d.d/ d/Z0e'ej*dd#G d0d1 d1Z1e'ej*dd#G d2d3 d3Z2e'ej*dd#G d4d5 d5Z3e'ej*dd#G d6d7 d7Z4e'ej*dd#G d8d9 d9Z5e'ej*dd#G d:d; d;Z6e'ej*dd#G d<d= d=Z7dMdNdBdCZ8dOdPdEdFZ9dS )Qz%Training config system for Imaginare4    )annotationsN)AnyDictOptionalTypeTypeVarUnion)logger)ModelParallelConfigTFzMegatron-core is not installed.)LazyCall)LazyDict)	from_yamlload_callable)callbackdistributed)ColorTobjobjectreturnboolc                 C  s
   t | dS )z
    Helper function to check if an object is an instance of an attrs-defined class.

    Args:
        obj: The object to check.

    Returns:
        bool: True if the object is an instance of an attrs-defined class, False otherwise.
    __attrs_attrs__)hasattr)r    r   J/data/cameron/vidgen/cosmos-policy/cosmos_policy/_src/imaginaire/config.py_is_attrs_instance0   s   

r   clsc                   s@   t | ds	td| j d fdd}|| _dd	d
}|| _| S )a~  
    A decorator that adds the capability to freeze instances of an attrs-defined class.

    NOTE: This requires the wrapped attrs to be defined with attrs.define(slots=False) because we need
    to hack on a "_is_frozen" attribute.

    This decorator enhances an attrs-defined class with the ability to be "frozen" at runtime.
    Once an instance is frozen, its attributes cannot be changed. It also recursively freezes
    any attrs-defined objects that are attributes of the class.

    Usage:
        @make_freezable
        @attrs.define(slots=False)
        class MyClass:
            attribute1: int
            attribute2: str

        obj = MyClass(1, 'a')
        obj.freeze()  # Freeze the instance
        obj.attribute1 = 2  # Raises AttributeError

    Args:
        cls: The class to be decorated.

    Returns:
        The decorated class with added freezing capability.
    __dict__zmake_freezable cannot be used with classes that do not define __dict__. Make sure that the wrapped class was defined with `@attrs.define(slots=False)`r   Nonec                   s0   t | dr| jr|dkrtd | || dS )z
        Override __setattr__ to allow modifications during initialization
        and prevent modifications once the instance is frozen.
        
_is_frozenzCannot modify frozen instanceN)r   r   AttributeError)selfkeyvalueoriginal_setattrr   r   setattr_overrideb   s   z(make_freezable.<locals>.setattr_overrider!   r   c                 S  s@   t j| dd D ]\}}t|rt|dr|  q	d| _dS )zK
        Freeze the instance and all its attrs-defined attributes.
        F)recursefreezeTN)attrsasdictitemsr   r   r(   r   )r!   _r#   r   r   r   r(   m   s
   
zmake_freezable.<locals>.freezeNr   r   )r!   r   r   r   )r   	TypeError__setattr__r(   )r   r&   r(   r   r$   r   make_freezable=   s   
	
	r0   indentint	use_colorstrc                 C  s  t | jsJ g }t | jD ]m}t| |j}t |jrO|r5|d| td t	|j d  n|d| d |j d  |t
||d | q|rk|d| td t	|j d t|  q|d| d |j d t|  qd|S )z=
    Recursively pretty prints attrs objects with color.
       * :   : 
)r)   has	__class__fieldsgetattrnameappendr   cyangreen_pretty_print_attrs_instanceyellowr4   join)r   r1   r3   lines	attributer#   r   r   r   rC   {   s   **&
rC   	overridesOptional[list[str]]c                 C  s   g }| tdtd d  | D ]B}|dkrq|dr(|dd }d}n|d\}}|rH| d	td t| d t|  q| d
| d t|  qd|S )z"
    Pretty prints overrides.
    r6   rH   r9   z--~r8   N=r5   z   * r:   )	r@   r   rA   rB   
startswithsplitrD   r4   rE   )rH   r3   rF   overrideattribute_nameattribute_valuer   r   r   pretty_print_overrides   s   
.
rQ   )slotsc                   @  s2   e Zd ZU dZded< dZded< dZded< dS )	ObjectStoreConfigFr   enabled r4   credentialsbucketN)__name__
__module____qualname__rT   __annotations__rV   rW   r   r   r   r   rS         
 rS   c                   @  sf   e Zd ZU dZded< dZded< dZded< dZded< dZd	ed
< e	dddZ
e	dddZdS )	JobConfigrU   r4   projectgroupr?   online
wandb_modeNzOptional[Any]clusterr   c                 C  s   | j  d| j d| j S )N/)r^   r_   r?   r!   r   r   r   path   s   zJobConfig.pathc                 C  s   t jdd}| d| j S )NZIMAGINAIRE_OUTPUT_ROOTz/tmp/imaginaire4-outputrc   )osenvirongetre   )r!   Z
local_rootr   r   r   
path_local   s   zJobConfig.path_local)r   r4   )rX   rY   rZ   r^   r[   r_   r?   ra   rb   propertyre   ri   r   r   r   r   r]      s   
 r]   c                   @  2   e Zd ZU dZded< dZded< dZded< dS )		EMAConfigFr   rT   gH.?floatbetatorch_compile_buffer_renamingN)rX   rY   rZ   rT   r[   rn   ro   r   r   r   r   rl      r\   rl   c                   @  rk   )	PowerEMAConfigFr   rT   g?rm   sro   N)rX   rY   rZ   rT   r[   rq   ro   r   r   r   r   rp      r\   rp   c                   @  s2   e Zd ZU dZded< dZded< dZded< dS )	DDPConfigFr   find_unused_parametersTstatic_graphbroadcast_buffersN)rX   rY   rZ   rs   r[   rt   ru   r   r   r   r   rr      r\   rr   c                   @  s&   e Zd ZU dZded< dZded< dS )CuDNNConfigFr   deterministicT	benchmarkN)rX   rY   rZ   rw   r[   rx   r   r   r   r   rv      s   
 rv   c                   @  sJ   e Zd ZU dZded< dZded< dZded	< d
Zded< dZded< dS )	JITConfigFr   rT   NzUnion[list[int], None]input_shapecudar4   devicebfloat16dtypeTstrict)	rX   rY   rZ   rT   r[   rz   r|   r~   r   r   r   r   r   ry      s   
 ry   c                   @  s   e Zd ZU dZded< dZded< ejedZ	ded	< d
Z
ded< ejedZded< dZded< dZded< dZded< dZded< ejedZded< dZded< g Zded< dZded< dZded< dZded< dZded< dS )CheckpointConfigNzOptional[Dict]typeFr   dcp_async_mode_enabledfactoryrS   save_to_object_storeɚ;r2   	save_iterload_from_object_storerU   r4   	load_pathload_training_stateonly_load_scheduler_stateTstrict_resumery   jitverbose	list[str]keys_not_to_resumebroadcast_via_filesystemload_ema_to_regdcp_allow_mismatched_sizeenable_gcs_patch_in_boto3)rX   rY   rZ   r   r[   r   r)   fieldrS   r   r   r   r   r   r   r   ry   r   r   r   r   r   r   r   r   r   r   r   r     s"   
 r   c                   @  s*   e Zd ZU dZdZded< dZded< dS )
NVTXConfigzConfig for NVTX ranges used in the main training loop.

    See tutorials/nanogpt for more details on how to integrate profiling into your model.Fr   rT   cuda_synchronizeN)rX   rY   rZ   __doc__rT   r[   r   r   r   r   r   r   ,  s   
 r   c                   @  s~   e Zd ZU dZdZded< dZded< dZded	< d
Zded< dZ	ded< dZ
ded< dZded< dZded< dZded< dS )StragglerDetectionConfigzConfig for Straggler detection tool: https://gitlab-master.nvidia.com/dl/gwe/fault_tolerance_related/straggler/-/tree/cupti?ref_type=headsFr   rT   d   r2   report_freqr8   profile_freqg       @rm   max_diffTraise_erroranalyze_forwardanalyze_backwardanalyze_optimizeranalyze_dataloadingN)rX   rY   rZ   r   rT   r[   r   r   r   r   r   r   r   r   r   r   r   r   r   9  s   
 r   c                   @  s   e Zd ZU dZded< dZded< dZded< dZded< ee	d	Z
d
ed< dZded< dZded< dZded< dZded< dS )	ProfilingFr   enable_profilingenable_memory_snapshotsave_s3r8   r2   r      z	list[int]target_ranksrecord_shapeprofile_memoryT
with_stackwith_modulesN)rX   rY   rZ   r   r[   r   r   r   listranger   r   r   r   r   r   r   r   r   r   R  s   
 r   c                   @  sF  e Zd ZU ddlmZ eZded< eee	e
j e	e
j e	e
j dZded< dZd	ed
< ejedZded< ejedZded< dZded< ejdd dZded< dZded< dZded< dZded< dZded< dZded< d Zded!< dZded"< ej Z!d#ed$< d%Z"ded&< eje#dZ$d'ed(< eje%dZ&d)ed*< dS )+TrainerConfigr   )ImaginaireTrainerzType[ImaginaireTrainer]r   )emaprogress_barwandbr   	callbacksddpr4   distributed_parallelismr   rr   rv   cudnnr2   seedc                   C  s
   t ddS )NF)rT   )dictr   r   r   r   <lambda>z  s   
 zTrainerConfig.<lambda>r   grad_scaler_argsr   max_iterNz
int | Nonemax_val_iterr   logging_iterTr   run_validationvalidation_iterFrun_validation_on_starttimeout_periodztorch.memory_formatmemory_formatr8   grad_accum_iterr   straggler_detectionr   	profiling)'rX   rY   rZ   Z%cosmos_policy._src.imaginaire.trainerr   r   r[   r   r   Lr   ZEMAModelCallbackZProgressBarCallbackZWandBCallbackr   r   r)   r   rr   r   rv   r   r   r   r   r   r   r   r   r   r   torchpreserve_formatr   r   r   r   r   r   r   r   r   r   r   b  s4   
 


r   c                   @  s   e Zd ZU dZded< ded< ded< ded< ded< ejedZd	ed
< eje	dZ
ded< er<ejedZded< ndZded< ejedZded< dZded< d d!ddZd"ddZd#ddZdS )$Configz[Config for an imaginaire4 job.

    See /README.md/Configuration System for more info.
    r   model	optimizer	schedulerZdataloader_trainZdataloader_valr   r]   jobr   trainerr
   model_parallelNr   r   
checkpointFr   upload_reproducible_setupr3   r   r4   c                 C  s   t | d|S )Nr   )rC   )r!   r3   r   r   r   pretty_print  s   zConfig.pretty_printdict[str, Any]c                 C  s
   t | S )N)r)   r*   rd   r   r   r   to_dict  s   
zConfig.to_dictc                 C  sr   t t| jjd }t|d | 	 
 d| j_| jjdks'J | jjdks/J | jjdks7J dS )z1Validate that the config has all required fields.zutf-8r   rU   N)r   
ByteTensor	bytearrayr   r?   r{   r   	broadcastcpunumpytobytesdecoder^   r_   )r!   Zjob_name_tensorr   r   r   validate  s   zConfig.validateF)r3   r   r   r4   )r   r   r-   )rX   rY   rZ   r   r[   r)   r   r]   r   r   r   USE_MEGATRONr
   r   r   r   r   r   r   r   r   r   r   r   r     s"   
 
r   config_pathoptsr   enable_one_loggerc                 C  s   t  }| dr"t| }t|j }ddlm} |||dd}nt	| |dd}|rWz"ddl
m} t  }||}t  }	td	|	| d
 dd W n	 tyV   Y nw t  }
td|
| d
 dd |S )Nz.yamlr   )rN   T)Zremove_defaultsF)r   )override_one_logger_callbackz#override_one_logger_callback: took     .A.2fmsztoal time to load config: )timemonotonic_nsendswithr   r   rY   make_config1cosmos_policy._src.imaginaire.utils.config_helperrN   _load_py_configZHcosmos_policy._src.imaginaire.utils.one_logger.one_logger_override_utilsr   loggingdebugImportError)r   r   r   t1configr,   rN   r   Zol_t1Zol_t2t2r   r   r   load_config  s(   
 r   r   c           	      C  s   ddl m}m} t }|| }t }td|| d dd t }t|	 }t }td|| d dd t }|||}t }td|| d dd |rvt }|
  t }td	|| d dd |S )
Nr   )get_config_modulerN   zget_config_module: took r   r   r   zimportlib.import_module: took zoverride: took zconfig.validate: took )r   r   rN   r   r   r   r   	importlibimport_moduler   r   )	r   r   r   r   rN   r   config_moduler   r   r   r   r   r     s&   
r   )r   r   r   r   )r   r   r   r   )r   F)r   r   r1   r2   r3   r   r   r4   )NF)rH   rI   r3   r   r   r4   r   )r   r4   r   r   r   r   r   r   )T)r   r4   r   r   r   r   r   r   ):r   
__future__r   r   rf   r   typingr   r   r   r   r   r   r)   r   torch.utils.datatorch.utils.data.distributedZlogurur	   r   Zmegatron.corer
   r   r   print)cosmos_policy._src.imaginaire.lazy_configr   r   r   Z+cosmos_policy._src.imaginaire.serializationr   r   #cosmos_policy._src.imaginaire.utilsr   r   Z(cosmos_policy._src.imaginaire.utils.miscr   r   r   r0   rC   rQ   definerS   r]   rl   rp   rr   rv   ry   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s    

>
	

	
	
	


"



/
7 