o
    ?߱iq                     @   s,  d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZ zd dlmZ W n eyE   dZed Y nw d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl(m*Z* dZ+dZ,	 d dl-m.Z. e.dg dZ/G dd dej0Z1G dd de1Z2dd Z3de/fdd Z4G d!d" d"ej0Z5G d#d$ d$ej6Z7G d%d& d&e!Z8G d'd( d(ej0Z9G d)d* d*e9Z:G d+d, d,e9Z;e:e;d-Z<G d.d/ d/ej0Z=G d0d1 d1ej0Z>G d2d3 d3ejj0Z?G d4d5 d5e'Z@dS )6    N)Optional)	rearrangerepeat)MinimalA2AAttnOp)	attention)apply_rotary_embzflash_attn is not installed.)ProcessGroupget_process_group_ranks)fully_shard)checkpoint_wrapper)_IncompatibleKeys)
transforms)DotProductAttention)log)split_inputs_cp)WeightTrainingStat)CheckpointMode)	SACConfig   i  )
namedtuple	VideoSizeTHWc                       sl   e Zd Z fddZdefddZdd Zedd	 Zd
e	j
de	j
fddZde	jfddZdd Z  ZS )VideoPositionEmbc                    s   t    d | _d S N)super__init__	_cp_groupself	__class__ X/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/predict2/networks/wan2pt1.pyr   B   s   

zVideoPositionEmb.__init__process_groupc                 C   s
   || _ d S r   r   )r!   r&   r$   r$   r%   enable_context_parallelF      
z(VideoPositionEmb.enable_context_parallelc                 C   s
   d | _ d S r   r'   r    r$   r$   r%   disable_context_parallelI   r)   z)VideoPositionEmb.disable_context_parallelc                 C      dS )N   r$   r    r$   r$   r%   seq_dimL      zVideoPositionEmb.seq_dimx_B_T_H_W_Creturnc                 C   sV   |j }| jdur!t| j}t|}|\}}}}}	||| |||	f}| |}
| |
S )z
        With CP, the function assume that the input tensor is already split.
        It delegates the embedding generation to generate_embeddings function.
        N)shaper   r	   lengenerate_embeddings_split_for_context_parallel)r!   r/   	B_T_H_W_Ccp_rankscp_sizeBr   r   r   C
embeddingsr$   r$   r%   forwardP   s   



zVideoPositionEmb.forwardr5   c                 C   s   t r   )NotImplementedError)r!   r5   r$   r$   r%   r3   _   s   z$VideoPositionEmb.generate_embeddingsc                 C   s    | j d urt|| j| j d}|S )N)xr-   cp_group)r   r   r-   )r!   r:   r$   r$   r%   r4   b   s   
z,VideoPositionEmb._split_for_context_parallel)__name__
__module____qualname__r   r   r(   r*   propertyr-   torchTensorr;   Sizer3   r4   __classcell__r$   r$   r"   r%   r   A   s    
r   c                       s   e Zd Z			ddededededededef fd	d
ZdddZ			ddejde	e de	e de	e fddZ
edd Z  ZS )VideoRopePosition3DEmb      ?head_dimlen_hlen_wlen_th_extrapolation_ratiow_extrapolation_ratiot_extrapolation_ratioc              	      s   t    || _|| _|| _|}|d d }	|	}
|d|	  }||	|
 | ks6J d| d|	 d|
 d| |	| _|| _||	|	d   | _||
|
d   | _|||d   | _	d| _
d S )N      z	bad dim: z != z + F)r   r   max_hmax_wmax_t_dim_h_dim_th_ntk_factorw_ntk_factort_ntk_factor_is_initialized)r!   rI   rJ   rK   rL   rM   rN   rO   dimdim_hdim_wdim_tr"   r$   r%   r   i   s   

0
zVideoRopePosition3DEmb.__init__r0   Nc                 C   s   | j rd S | j}| j}tt| j| j| j	 
 | _td|dd |d  	 
 | | _td|dd |d  	 
 | | _d| _ d S )Nr   rQ   T)rZ   rU   rV   rC   arangemaxrR   rS   rT   floatcudaseqdim_spatial_rangedim_temporal_range)r!   r\   r^   r$   r$   r%   cache_parameters   s   "((
z'VideoRopePosition3DEmb.cache_parametersr5   rW   rX   rY   c              
   C   sP  |    |dur
|n| j}|dur|n| j}|dur|n| j}d| }d| }d| }d|| j  }d|| j  }	d|| j  }
|\}}}}}|| jkrQ|| jksdJ d| d| d| j d| j d	t	| j
d| |}t	| j
d| |	}t	| j
d| |
}tjt|d	||d
t|d||dt|d||dgdd}t|d S )a  
        Generate embeddings for the given input size.

        Args:
            B_T_H_W_C (torch.Size): Input tensor size (Batch, Time, Height, Width, Channels).
            fps (Optional[torch.Tensor], optional): Frames per second. Defaults to None.
            h_ntk_factor (Optional[float], optional): Height NTK factor. If None, uses self.h_ntk_factor.
            w_ntk_factor (Optional[float], optional): Width NTK factor. If None, uses self.w_ntk_factor.
            t_ntk_factor (Optional[float], optional): Time NTK factor. If None, uses self.t_ntk_factor.

        Returns:
            Not specified in the original code snippet.
        Ng     @rH   zInput dimensions (H=z, W=z') exceed the maximum dimensions (max_h=z, max_w=)zt d -> t h w d)hwzh d -> t h w d)tri   zw d -> t h w d)rj   rh   r[   zt h w d -> (t h w) 1 1 d)rf   rW   rX   rY   rd   re   rR   rS   rC   outerrc   catr   r   ra   )r!   r5   rW   rX   rY   h_thetaw_thetat_thetah_spatial_freqsw_spatial_freqstemporal_freqsr8   r   r   r   _Zfreqs_hZfreqs_wZfreqs_tZfreqs_T_H_W_Dr$   r$   r%   r3      s2    	z*VideoRopePosition3DEmb.generate_embeddingsc                 C   r+   )Nr   r$   r    r$   r$   r%   r-      r.   zVideoRopePosition3DEmb.seq_dim)rH   rH   rH   )r0   N)NNN)r?   r@   rA   intra   r   rf   rC   rE   r   r3   rB   r-   rF   r$   r$   r"   r%   rG   h   sD    

4rG   c              	   C   sn   | d dksJ | d }| tj}t|tdt||| }tjt	|t
|gdd}|S )NrQ   r   i'  r,   rl   )typerC   float64rm   powr_   todivrn   cossin)r[   positionhalfZsinusoidr=   r$   r$   r%   sinusoidal_embedding_1d   s   (r   
video_sizec                 C   s   | j \}}}}|\}}}	|| |	 }
||
ksJ d|||d }t|tj}t|tj}t| tj||ddd}|| jS )a  
    Optimized version of rope_apply using flash_attention's rotary embedding implementation.
    This version processes the entire batch at once for efficiency.

    Args:
        x (Tensor): Input tensor with shape [batch_size, seq_len, n_heads, head_dim]
        video_size (VideoSize): Video dimensions with shape [T, H, W]
        freqs (Tensor): Complex frequencies with shape [max_seq_len, head_dim // 2]

    Returns:
        Tensor: Rotary-embedded tensor with same shape as input
    z&Sequence length must be equal to T*H*WrQ   TF)interleavedinplace)	r1   viewrC   r|   rz   float32r}   flash_apply_rotary_embdtype)r=   r   freqs
batch_sizeseq_lenn_headsrI   r   r   r   Zcurr_seq_lenr|   r}   rotatedr$   r$   r%   
rope_apply   s   
r   c                       s6   e Zd Zd
 fdd	Zdd Zdd Zdd	 Z  ZS )
WanRMSNormh㈵>c                    s,   t    || _|| _tt|| _d S r   )	r   r   r[   epsnn	ParameterrC   onesweight)r!   r[   r   r"   r$   r%   r      s   
zWanRMSNorm.__init__c                 C   s   | j jd d S )NrH   )r   datafill_r    r$   r$   r%   reset_parameters   s   zWanRMSNorm.reset_parametersc                 C   s   |  | || j S z>
        Args:
            x(Tensor): Shape [B, L, C]
        )_normra   type_asr   r!   r=   r$   r$   r%   r;      s   zWanRMSNorm.forwardc                 C   s$   |t |djddd| j  S )NrQ   rk   T)r[   keepdim)rC   rsqrtry   meanr   r   r$   r$   r%   r     s   $zWanRMSNorm._norm)r   )r?   r@   rA   r   r   r;   r   rF   r$   r$   r"   r%   r      s
    r   c                       s*   e Zd Zd fdd	Z fddZ  ZS )WanLayerNormư>Fc                    s   t  j|||d d S )N)elementwise_affiner   )r   r   )r!   r[   r   r   r"   r$   r%   r     s   zWanLayerNorm.__init__c                    s   t  |S r   r   r;   r   r"   r$   r%   r;     s   zWanLayerNorm.forward)r   F)r?   r@   rA   r   r;   rF   r$   r$   r"   r%   r     s    r   c                       s*   e Zd Z	ddee f fddZ  ZS )
SelfAttnOpNr   c                    s   t  |||S r   r   )r!   	q_B_L_H_D	k_B_L_H_D	v_B_L_H_Dr   r"   r$   r%   r;     s   zSelfAttnOp.forwardr   )r?   r@   rA   r   r   r;   rF   r$   r$   r"   r%   r     s
    r   c                       sF   e Zd Z					d fdd	Zdd	 Zd
efddZdd Z  ZS )WanSelfAttentionrk   rk   Tr   p2ptransformer_enginec                    s  || dksJ t    || _|| _|| | _|| _|| _|| _|| _|| _|| _	t
||| _t
||| _t
||| _t
||| _|rNt||dnt
 | _|r[t||dnt
 | _| j	dkrut| j| j| jdddd| _d S | j	dkrt | _d S J d	| j	 )
Nr   r   r   bshdno_masknum_gqa_groupsattention_dropout
qkv_formatattn_mask_typeminimal_a2aF!Unreckognized attention backend: )r   r   r[   	num_headsrI   window_sizeqk_normr   cp_comm_typeattention_backendr   Linearqkvor   Identitynorm_qnorm_kr   attn_opr   r!   r[   r   r   r   r   r   r   r"   r$   r%   r   $  s:   




zWanSelfAttention.__init__c                 C   s   dt | j }tjjj| jj|d tjjj| j	j|d tjjj| j
j|d tjjj| jj|d | jjj  | j	jj  | j
jj  | jjj  | jr_| j  | j  d S d S NrH   std)mathsqrtr[   rC   r   inittrunc_normal_r   r   r   r   r   biasr   zero_r   r   r   r   r!   r   r$   r$   r%   init_weightsP  s   
zWanSelfAttention.init_weightsr   c           	         s   g |j dd jjR \  fdd}||\}}}t|||t|||||}|d}|}|S )z
        Args:
            x(Tensor): Shape [B, L, num_heads, C / num_heads]
            seq_lens(Tensor): Shape [B]
            video_size(VideoSize): Shape [T, H, W]
            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
        NrQ   c                    sX    |  }|  }|  }|||fS r   )r   r   r   r   r   r   )r=   r   r   r   bdnsr!   r$   r%   qkv_fnk  s   
z(WanSelfAttention.forward.<locals>.qkv_fn)r1   r   rI   r   r   flattenr   )	r!   r=   seq_lensr   r   r   r   r   r   r$   r   r%   r;   `  s   ( 

zWanSelfAttention.forwardc                 C   sR   | j dkr| jj|||| jd d S | j dkr!| j||| d S J d| j  )Nr   )r   r   Fr   )r   r   set_context_parallel_groupr   )r!   r&   ranksstreamr$   r$   r%   r   z  s
   

z+WanSelfAttention.set_context_parallel_groupr   Tr   r   r   )	r?   r@   rA   r   r   r   r;   r   rF   r$   r$   r"   r%   r   #  s    ,r   c                   @   s   e Zd Zdd ZdS )WanT2VCrossAttentionc           
      C   s   | d| j| j}}}| | ||d||}| | ||d||}| ||d||}	| 	|||	d}|
d}| |}|S )
        Args:
            x(Tensor): Shape [B, L1, C]
            context(Tensor): Shape [B, L2, C]
            context_lens(Tensor): Shape [B]
        r   rk   NrQ   )sizer   rI   r   r   r   r   r   r   r   r   r   )
r!   r=   contextcontext_lensr   r   r   r   r   r   r$   r$   r%   r;     s   

zWanT2VCrossAttention.forwardN)r?   r@   rA   r;   r$   r$   r$   r%   r     s    r   c                       s<   e Zd Z					d fdd	Z fdd	Zd
d Z  ZS )WanI2VCrossAttentionr   Tr   r   r   c              	      s   t  ||||||| t||| _t||| _|r"t||dnt | _| j	dkr<t
| j| j| jdddd| _d S | j	dkrFt| _d S J d	| j	 )
Nr   r   r   r   r   r   r   Fr   )r   r   r   r   k_imgv_imgr   r   
norm_k_imgr   r   r   rI   attn_op_imager   r   r"   r$   r%   r     s    



zWanI2VCrossAttention.__init__c                    sz   t    dt| j }tjjj| j	j
|d tjjj| jj
|d | j	jj  | jjj  | jr;| j  d S d S r   )r   r   r   r   r[   rC   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r$   r%   r     s   
z!WanI2VCrossAttention.init_weightsc                 C   s  |j d t }|ddd|f }|dd|df }|d| j| j}}}| | ||d||}	| | 	||d||}
| 
||d||}| | ||d||}| ||d||}| |	||}| |	|
|}|d}|d}|| }| |}|S )r   r,   Nr   rk   rQ   )r1   T5_CONTEXT_TOKEN_NUMBERr   r   rI   r   r   r   r   r   r   r   r   r   r   r   r   r   )r!   r=   r   r   Zimage_context_lengthZcontext_imgr   r   r   r   r   r   r   r   Zimg_xr$   r$   r%   r;     s    


zWanI2VCrossAttention.forwardr   r?   r@   rA   r   r   r;   rF   r$   r$   r"   r%   r     s    r   )t2v_cross_attni2v_cross_attnc                       sF   e Zd Z						ddef fdd	Zd
d ZdefddZ  ZS )WanAttentionBlockr   TFr   r   r   r   c                    s   t    || _|| _|| _|| _|| _|| _|| _t	||| _
t||||||	|
| _|r4t	||ddnt | _t| ||d|||	|
| _t	||| _tt||tjddt||| _ttdd||d  | _d S )	NT)r   r   tanhapproximater,   rP         ?)r   r   r[   ffn_dimr   r   r   cross_attn_normr   r   norm1r   	self_attnr   r   norm3WAN_CROSSATTENTION_CLASSES
cross_attnnorm2
Sequentialr   GELUffnr   rC   randn
modulation)r!   cross_attn_typer[   r   r   r   r   r   r   r   r   r"   r$   r%   r     s"   
("zWanAttentionBlock.__init__c                 C   sZ   | j   | j  | j  | j  | j  dt| j	 }t
jjj| j|d d S r   )r   r   r   r   r   r   r   r   r   r[   rC   r   r   r   r   r   r$   r$   r%   r     s   




zWanAttentionBlock.init_weightsr   c           
         s   |j tjksJ tjdtjd  j| jddd}W d   n1 s%w   Y  |d j tjks4J   |	 d|d   |d  
||||}tjdtjd |||d 
|  }W d   n1 slw   Y   fd	d
}	|	||||}|S )a)  
        Args:
            x(Tensor): Shape [B, L, C]
            e(Tensor): Shape [B, 6, C]
            seq_lens(Tensor): Shape [B], length of each sequence in batch
            video_size(VideoSize): Shape [T, H, W]
            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
        rb   r   rP   r,   rl   Nr   rQ   c                    s   |    | || }   |  d|d   |d  | }tjdtj	d | ||d |   } W d    | S 1 sBw   Y  | S )Nr,         rb   r      )
r   r   r   r   ra   r   ampautocastrC   r   )r=   r   r   eyr    r$   r%   cross_attn_ffn6  s   .
z1WanAttentionBlock.forward.<locals>.cross_attn_ffn)r   rC   r   r   r  r   chunkr   r   ra   r   )
r!   r=   r  r   r   r   r   r   r  r  r$   r    r%   r;     s   4zWanAttentionBlock.forward)r   TFr   r   r   )	r?   r@   rA   strr   r   r   r;   rF   r$   r$   r"   r%   r     s    #r   c                       .   e Zd Zd fdd	Zdd Zdd Z  ZS )	Headr   c                    sl   t    || _|| _|| _|| _t|| }t||| _	t
||| _t
tdd||d  | _d S )Nr,   rQ   r   )r   r   r[   out_dim
patch_sizer   r   prodr   normr   r   headr   rC   r   r   )r!   r[   r	  r
  r   r"   r$   r%   r   B  s   
"zHead.__init__c                 C   sV   | j   dt| j }tjjj| j	|d tjjj| j
j|d | j
jj  d S r   )r  r   r   r   r[   rC   r   r   r   r   r  r   r   r   r   r   r$   r$   r%   r   Q  s
   
zHead.init_weightsc                 C   s   |j tjksJ tjdtjd( | j|d jddd}| | 	|d|d   |d  }W d   |S 1 s;w   Y  |S )zc
        Args:
            x(Tensor): Shape [B, L1, C]
            e(Tensor): Shape [B, C]
        rb   r   r,   rQ   rl   r   N)
r   rC   r   r   r  r   	unsqueezer  r  r  )r!   r=   r  r$   r$   r%   r;   Y  s   &
zHead.forward)r   r   r$   r$   r"   r%   r  A  s    r  c                       r  )	MLPProjFc              	      sl   t    tjtj|tj||tj tj||tj|| _|r4t	t
dtd| _d S d S )Nr,      )r   r   rC   r   r   	LayerNormr   r   projr   zeros%FIRST_LAST_FRAME_CONTEXT_TOKEN_NUMBERemb_pos)r!   in_dimr	  flf_pos_embr"   r$   r%   r   g  s   


zMLPProj.__init__c                 C   sV   | j d   | j d   | j d   | j d   t| dr)| jj  d S d S )Nr   r,   r   r   r  )r  r   hasattrr  r   r   r    r$   r$   r%   r   t  s   
zMLPProj.init_weightsc                 C   s@   t | dr|j\}}}|dd| |}|| j }| |}|S )Nr  rk   rQ   )r  r1   r   r  r  )r!   image_embedsbsr   r   Zclip_extra_context_tokensr$   r$   r%   r;   }  s   


zMLPProj.forward)Fr   r$   r$   r"   r%   r  f  s    	r  c                       s   e Zd ZdZdddddddd	ddd
ddddde ddfdedededef fddZ						d,dee	j
 fddZdd Zdd Zdd  Zd-d!ee fd"d#Zed$d% Zded&ejfd'd(Zd.d)ef fd*d+Z  ZS )/WanModelzR
    Wan diffusion backbone supporting both text-to-video and image-to-video.
    t2v)r,   rQ   rQ   r      i   i       i       r   Tr   Fr   r   concat_padding_mask
sac_configr   r   c                    s  t    |dv sJ |_|_|_|___|_|_	|	_
_|_	____|_|_ _jrH|d n|}t||d  |d  |d  _tt|tjddt_tt|t t_tt td _|dkrd	nd
t 	f
ddt|D _t |	|_! dkrȈ d dksJ  }t"|dddd_#|dks|dkrt$d|dkd_%&  '|j dS )a  
        Initialize the diffusion model backbone.

        Args:
            model_type (`str`, *optional*, defaults to 't2v'):
                Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video) or 'flf2v' (first-last-frame-to-video)
            patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
                3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
            text_len (`int`, *optional*, defaults to 512):
                Fixed length for text embeddings
            in_dim (`int`, *optional*, defaults to 16):
                Input video channels (C_in)
            dim (`int`, *optional*, defaults to 2048):
                Hidden dimension of the transformer
            ffn_dim (`int`, *optional*, defaults to 8192):
                Intermediate dimension in feed-forward network
            freq_dim (`int`, *optional*, defaults to 256):
                Dimension for sinusoidal time embeddings
            text_dim (`int`, *optional*, defaults to 4096):
                Input dimension for text embeddings
            out_dim (`int`, *optional*, defaults to 16):
                Output video channels (C_out)
            num_heads (`int`, *optional*, defaults to 16):
                Number of attention heads
            num_layers (`int`, *optional*, defaults to 32):
                Number of transformer blocks
            window_size (`tuple`, *optional*, defaults to (-1, -1)):
                Window size for local attention (-1 indicates global attention)
            qk_norm (`bool`, *optional*, defaults to True):
                Enable query/key normalization
            cross_attn_norm (`bool`, *optional*, defaults to False):
                Enable cross-attention normalization
            eps (`float`, *optional*, defaults to 1e-6):
                Epsilon value for normalization layers
            concat_padding_mask (`bool`, *optional*, defaults to False):
                Enable concat padding mask
            cp_comm_type (str, *optional*, defaults to 'p2p'):
                CP communication type passed to TE.
            attention_backend (str, defaults to 'transformer_engine', options are: ['transformer_engine', 'minimal_a2a'])
                Backend used for attention
        )r  i2vflf2vr,   r   rQ   r   r   rP   r  r   r   c                    s(   g | ]}t 	j 
qS r$   )r   r   ).0ru   
r   r   r   r[   r   r   r   r   r!   r   r$   r%   
<listcomp>  s    z%WanModel.__init__.<locals>.<listcomp>   r  )rI   rJ   rK   rL   r"  r#  r  )r  N)(r   r   
model_typer
  text_lenr  r[   r   freq_dimtext_dimr	  r   
num_layersr   r   r   r   r   r   Zattention_backedr   r   patch_embeddingr   r   text_embeddingSiLUtime_embeddingtime_projection
ModuleListrangeblocksr  r  rG   rope_position_embeddingr  img_embr   enable_selective_checkpoint)r!   r(  r
  r)  r  r[   r   r*  r+  r	  r   r,  r   r   r   r   r   r!  r   r   r   r"   r%  r%   r     sX   
@&($ zWanModel.__init__Npadding_maskc
                 K   s  |j d dks	J |dddf }~
| jdks| jdkr&|dur$|dus&J |dur3tj||gdd}| jr^tjj|t|j dd tj	j
d}tj||ddd|j d	 ddgdd}t|d
| jd | jd | jd	 d}| |}t|j d |j d	 |j d d}t|d}tjdd |D tjd}|  }| |ksJ tjdtjd- | t| j| }| |dd| jf}|jtjkr|jtjksJ W d   n1 sw   Y  d}|  |}|dur| !|}tj"||gdd}t#|||| $|||d}
t%| j&D ]\}}|	dur ||	v r |r q||fi |
}q| '||}|\}}}t|d| jd | jd | jd	 |||| j(d	}|S )aW  
        Forward pass through the diffusion model

        Args:
            x_B_C_T_H_W (Tensor):
                Input video tensor with shape [B, C_in, T, H, W]
            t (Tensor):
                Diffusion timesteps tensor of shape [B]
            context (List[Tensor]):
                List of text embeddings each with shape [L, C]
            seq_len (`int`):
                Maximum sequence length for positional encoding
            frame_cond_crossattn_emb_B_L_D (Tensor, *optional*):
                CLIP image features for image-to-video mode or first-last-frame-to-video mode
            y_B_C_T_H_W (Tensor, *optional*):
                Conditional video inputs for image-to-video mode, shape [B, C_in, T, H, W]

        Returns:
            Tensor:
                Denoised video tensor with shape [B, C_out, T, H / 8, W / 8]
        r,   Nr   r"  r#  rl   )interpolationrQ   z0b c (t kt) (h kh) (w kw) -> b t h w (c kt kh kw))ktkhkwr   r   zb t h w d -> b (t h w) dc                 S   s   g | ]}| d qS )r   )r   )r$  ur$   r$   r%   r&  U  s    z$WanModel.forward.<locals>.<listcomp>r   rb   rP   )r  r   r   r   r   r   z2b (t h w) (nt nh nw d) -> b d (t nt) (h nh) (w nw))ntnhnwrj   rh   ri   r   ))r1   r(  rC   rn   r   r   
functionalresizelistInterpolationModeNEARESTr  r   r   r
  r-  r   tensorlongr`   itemr   r  r   r0  r   r*  ra   r1  	unflattenr[   r   r.  r6  concatdictr5  	enumerater4  r  r	  )r!   x_B_C_T_H_Wtimesteps_B_Tcrossattn_embr   frame_cond_crossattn_emb_B_L_Dy_B_C_T_H_Wr8  Z	is_uncondZ
slg_layerskwargst_Bx_B_T_H_W_Dr   Zx_B_L_Dr   Ze_B_DZe0_B_6_Dr   context_B_L_DZcontext_clip	block_idxblockrj   rh   ri   r$   r$   r%   r;     s~   ""
 


	
zWanModel.forwardc                 C   s  |   D ]}t|tjrtj|j |jdurtj|j q| j	D ]}|
  q#| j
  tj| jjd tj| jj | j  D ]}t|tjrdtjj|jdd |jdurdtj|j qG| j  D ]}t|tjrtjj|jdd |jdurtj|j qj| j  D ]}t|tjrtjj|jdd |jdurtj|j qtj| jjj | jjjdurtj| jjj dS dS )zJ
        Initialize model parameters using Xavier initialization.
        Nr,   g{Gz?r   )modules
isinstancer   r   r   xavier_uniform_r   r   zeros_r4  r   r  r-  r   r.  normal_r0  r1  )r!   mrX  r$   r$   r%   r     sB   






zWanModel.init_weightsc                 C   sf   t | jD ]\}}t||dd qt| j|dd t| j|dd t| j|dd t| j|dd d S )NT)meshreshard_after_forwardF)rM  r4  r
   r  r.  r0  r-  )r!   r_  irX  r$   r$   r%   r
     s   zWanModel.fully_shardc                 C   s8   | j   | jD ]}|jjd d tj d qd| _d S )Nr&   r   r   F)	r5  r*   r4  r   r   rC   rb   Stream_is_context_parallel_enabled)r!   rX  r$   r$   r%   r*     s   


z!WanModel.disable_context_parallelr&   c                 C   sD   | j j|d t|}| jD ]}|jj||tj d qd| _	d S )N)r&   rb  T)
r5  r(   r	   r4  r   r   rC   rb   rc  rd  )r!   r&   r6   rX  r$   r$   r%   r(     s   

z WanModel.enable_context_parallelc                 C   s   | j S r   )rd  r    r$   r$   r%   is_context_parallel_enabled  s   z$WanModel.is_context_parallel_enabledr4  c              	   C   s   |j tjkr	 td|j  d|j dt|  | }| D ]"\}}t	||j dkrCtd|  t
||dd}||| q!| dt
| j|dd d S )	Nz!Enable selective checkpoint with z, for every z blocks. Total blocks: r   z&Enable selective checkpoint for block F)
context_fnpreserve_rng_stater  )moder   NONEr   infoevery_n_blocksr2   get_context_fnnamed_childrenrv   ptd_checkpoint_wrapperregister_moduler  )r!   r!  r4  _context_fnblock_idrX  r$   r$   r%   r7    s0   z$WanModel.enable_selective_checkpointr0   c           	         s   i }|  D ]\}}d|v rtd| d q|||< q|}t j|d|d\}}|du rDtdd |D s9J td	d |D sDJ t||S )
N_extra_statezSkipping key z; introduced by TransformerEngine for FP8 in the checkpoint.F)strictassignTc                 s       | ]}d |v V  qdS rr  Nr$   r$  r   r$   r$   r%   	<genexpr>      z+WanModel.load_state_dict.<locals>.<genexpr>c                 s   ru  rv  r$   rw  r$   r$   r%   rx    ry  )itemsr   warningr   load_state_dictallr   )	r!   
state_dictrs  rt  Zfiltered_state_dictr   r   missing_keysunexpected_keysr"   r$   r%   r|    s   

zWanModel.load_state_dict)NNNNFNr   )TF)r?   r@   rA   __doc__r   boolr  r   r   rC   rD   r;   r   r
   r*   r   r(   rB   re  r   r2  r7  r   r|  rF   r$   r$   r"   r%   r    s`     
s+
r  )Ar   typingr   rC   Z	torch.ampr   torch.nnr   einopsr   r   -cosmos_predict2._src.predict2.networks.a2a_cpr   0cosmos_predict2._src.predict2.networks.attentionr   flash_attn.layers.rotaryr   r   ImportErrorprinttorch.distributedr   r	   "torch.distributed._composable.fsdpr
   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   rn  torch.nn.modules.moduler   torchvisionr   $transformer_engine.pytorch.attentionr   %cosmos_predict2._src.imaginaire.utilsr   6cosmos_predict2._src.imaginaire.utils.context_parallelr   :cosmos_predict2._src.predict2.networks.model_weights_statsr   Fcosmos_predict2._src.predict2.networks.selective_activation_checkpointr   r   r   r  collectionsr   r   Moduler   rG   r   r   r   r  r   r   r   r   r   r   r   r  r  r  r$   r$   r$   r%   <module>   s^   'b `KW% 