o
    ?߱i                     @  s   d dl mZ d dlmZ d dlmZ d dlZd dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZmZmZmZmZ d d	lmZ d d
lmZmZ dd ZedddZdddZdS )    )annotations)contextmanager)partialN)CheckpointImplapply_activation_checkpointingcheckpoint_wrapper)init_device_mesh)FullyShardedDataParallel)_post_forward_post_forward_reshard_pre_forward_pre_forward_unshard_root_pre_forward)	_p_assert)distributedlogc                   s6   t d tttjd} fdd}t| ||d dS )zZapply activation checkpointing to model
    returns None as model is updated directly
    z---> applying fdsp activation checkpointing...)checkpoint_implc                   s&   d} D ]}t | |rd} |S q|S )NFT)
isinstance)	submoduleresultZ	block_clslist_block_cls [/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/imaginaire/utils/fsdp_helper.pycheck_fn4   s   
z*apply_fsdp_checkpointing.<locals>.check_fn)checkpoint_wrapper_fnr   N)r   criticalr   r   r   NO_REENTRANTr   )modelr   Znon_reentrant_wrapperr   r   r   r   apply_fsdp_checkpointing)   s   
r   r   torch.nn.Modulec                 c  s   t | t}|r_t rJ d| j}dgtdd}}tjjd4 t	| | ||\}}d }t
| |t| j||\}}|rPt|jj| jkd| j d|jj  W d    n1 sZw   Y  zd V  W |ruddi}t| |t| || d S d S |rddi}t| |t| || w w )	Nz1FSDP context should be entered with grad disabledr   )dummyz,FullyShardedDataParallel.possible_fsdp_scopez5Expected `FlatParameter` to be on the compute device z	 but got output   )r   FSDPtorchis_grad_enabled_handledictautogradprofilerrecord_functionr   r   r   _fsdp_wrapped_moduler   
flat_paramdevicecompute_devicer
   r   )r   enabledhandleargskwargsunusedr"   r   r   r   possible_fsdp_scope?   sJ   
r5   c                 C  s   t  }|du rt|d}t||}| du r|| } |pd}|| dkr/td| d| d|| |  dkr?td|  dt|| |fd	d
}|du rPtdtd|  d|  |S )a  
     Initializes a device mesh for use with Hybrid Sharding strategy in FSDP (HSDP) training.

    This function requires explicit sizes for replica and sharding groups to accommodate models
    whose GPU fit is unknown, providing flexibility in distributed training setups.

    Args:
        replica_group_size (int): The size of each replica group. Must be provided to ensure
            the model fits within the available resources.
        sharding_group_size (int): The size of each sharding group that the model can fit. Must be provided to
            ensure the correct distribution of model parameters.
        device (str, optional): The device to use (e.g., "cuda:0"). If None, defaults to "cuda"
            with the local rank as the device index.

    Returns:
        A device mesh object compatible with FSDP.

    Raises:
        ValueError: If replica_group_size or sharding_group_size are not provided, or if the
            world size is not evenly divisible by the sharding group size.
        RuntimeError: If a valid device mesh cannot be created.

    Usage:
        If your model fits on 4 GPUS, and you have 3 nodes of 8 GPUs, then:
        Sharding_Group_Size = 4
        Replica_Groups_Size = (24 total gpus, 4 per sharding group) = 6 Replica Groups
        >>> device_mesh = initialize_device_mesh(replica_group_size, sharding_group_size)
        >>> sharded_model = FSDP(model, device_mesh=device_mesh, ...)
    N   cudar   zWorld size z0 is not evenly divisible by sharding group size .zVThe calculated number of replica groups is not evenly divisible by replica_group_size )	replicateshard)mesh_dim_namesz%Failed to create a valid device mesh.z0Device mesh initialized with replica group size z and sharding group size )r   get_world_sizemin
ValueErrorr   RuntimeErrorr   r   )Zreplica_group_sizesharding_group_sizer.   
world_sizedevice_meshr   r   r   hsdp_device_mesha   s4    


rC   )r   r    )NNN)
__future__r   
contextlibr   	functoolsr   r%   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   r   r   torch.distributed.device_meshr   torch.distributed.fsdpr	   r$   %torch.distributed.fsdp._runtime_utilsr
   r   r   r   r   torch.distributed.utilsr   %cosmos_predict2._src.imaginaire.utilsr   r   r   r5   rC   r   r   r   r   <module>   s   !