o
    viX                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlZddl	m
Z
 ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" G dd deZ						dde#de#fddZ$G dd dZ%G dd deZ&dS )z
Cosmos Policy Wan2pt1 VAE with deterministic seeding support.

This module provides a complete wrapper around the wan2pt1 tokenizer to add
deterministic seeding without modifying the main branch files.
    N)nullcontext)Optional)parallel_state)INTERNALSMOKE)log)	broadcastget_ranksync_model_states)easy_io)VideoTokenizerInterface)WanVAE_)plugin_mount)BenchmarkTimes)resolve_checkpoint_pathc                       s"   e Zd ZdZd fdd	Z  ZS )r   z
    Extended WanVAE_ for Cosmos Policy with deterministic seeding support.

    Adds optional deterministic seeding in the encode method for reproducibility.
    Tc           	         s~   t jdd dk}|r7dtj }d|v p$d|v p$d|v p$d|v }|r+J d	d
dlm} d}|| t	 
|||S )z
        Extended encode with optional deterministic seeding for reproducibility.

        When DETERMINISTIC environment variable is set to "true", uses fixed seed
        for reproducible VAE encoding.
        ZDETERMINISTIC true zscripts.trainzscripts/train.pyz	/train.pyz-m cosmos_policy.scripts.trainzDETERMINISTIC mode is enabled but this is a training run! Deterministic seeding breaks random sampling during training. Please unset the DETERMINISTIC environment variable before training.r   )set_seed_everywhere   )osenvirongetlowerjoinsysargvcosmos_policy.utils.utilsr   superencode)	selfxscaleclear_encoder_cacheZdeterministic_enabledcmd_argsis_trainingr   seed	__class__ F/data/cameron/vidgen/cosmos-policy/cosmos_policy/tokenizers/wan2pt1.pyr   3   s"   	zWanVAE_.encodeT)__name__
__module____qualname____doc__r   __classcell__r)   r)   r'   r*   r   ,   s    r   cpucredentials/s3_training.secretFBhf://nvidia/Cosmos-Predict2-2B-Video2World/tokenizer/tokenizer.pths3_credential_pathmean_std_pathc                 K   s"  t d|g ddg g ddd}|jdi | td tdi |}W d   n1 s.w   Y  ts9| du rp|j|d	 |rotjd
dd
d
d
|d	tjd
dd
d
d
|d	}	}
tjd
ddd
d
|d	tjd
ddd
d
|d	}}nt dkrt	sddl
m} || } | drd}tj|d|dd nd}t| }tj|||d}|r|dd}|dd}t	sddl
m} ||}||}tj|||d\}	}
tj|||d\}}|	d
dd
d
d
}	|
d
dd
d
d
}
|d
ddd
d
}|d
ddd
d
}td|   |j|dd n7|j|d	 |r?tjd
dd
d
d
|d	tjd
dd
d
d
|d	}	}
tjd
ddd
d
|d	tjd
ddd
d
|d	}}t| |rftd t|	d t|
d t|d t|d ||	|
||fS |tjd
d
d
d
d
|d	tjd
d
d
d
d
|d	tjd
d
dd
d
|d	tjd
d
dd
d
|d	fS )z
    Modified _video_vae that uses the policy WanVAE_ with deterministic seeding.

    This is a copy of the original _video_vae function, but instantiates our
    WanVAE_ subclass instead of the base one.
    `   )         r9   r8   )FTTg        )dimz_dimdim_multnum_res_blocksattn_scalestemperal_downsampledropoutmetaN)devicer7          r   )get_checkpoint_pathzs3://wan2pt1_vaes3)backendr4   )keybackend_args)backend_keymap_locationzmean_std.ptzimages_mean_std.ptzvideo_mean_std.ptzloading T)assignz"broadcast mean and std for wan2pt12   r)   )dictupdatetorchrB   r   r   to_emptyrandnr	   r   1cosmos_policy._src.imaginaire.utils.checkpoint_dbrE   
startswithr   set_s3_backendr   loadreplacereshaper   infoload_state_dictr
   r   zerosones)pretrained_pathr;   rB   r4   load_mean_stdr5   kwargscfgmodelimg_meanimg_std
video_mean	video_stdrE   rK   resolved_pathckptimg_mean_stdvideo_mean_stdr)   r)   r*   _policy_video_vaeV   s   	.





rk   c                   @   s  e Zd ZdZdddddejdddd	dd
fdededededede	e
eef  fddZdd Ze d0ddZe d0ddZedd Zedd Zedd Zdejd ejfd!d"Zd#ejfd$d%Zd&d' Zd(d) Zd*ejd efd+d,Zd-ejd d
fd.d/Zd
S )1CosmosPolicyWanVAEz
    Cosmos Policy WanVAE wrapper that uses deterministic WanVAE_.

    This is a complete reimplementation of WanVAE that uses our
    _policy_video_vae() function to instantiate the deterministic WanVAE_.
    rC   r3   r2   Fz@hf://nvidia/Cosmos-Predict2-2B-Video2World/tokenizer/mean_std.ptcudaTr9   Nr4   r5   	benchmarktemporal_windowis_parallelcp_grid_shapec              	   C   s0  || _ || _|	| _|
| _|| _|| _d| _d| _g d}g d}tj	|||d| _
tj	|||d| _| j
d| j g| _t|||||||
d\| _| _| _| _| _|rqd }t rgt }|d u rfd| f}nJ d| || | j d| _|| _|s| jj|d	| _t | _d S tjjd
|d	| _d S )NF)gy):gMOg^)gQ?gtVƿgZӼ?gBfjÿgU0*?gL
F%u?gMg&?gz6>׿gF%uȿg[ AcgMJ?gW2ıҿ)g_L@gNё\C?gQ@g?@g9#J{?g|a2U?gHPs@g0* @gJ{/L&
@gJY8@g]C@g(?gK46?gS:?go_Ι@g-?)dtyperB   g      ?)r^   r;   r4   r_   r5   rB   ro   r7   z7is_parallel set, but context parallelism is initialized)rr   rm   ) rr   rB   rn   ro   rp   rq   context_parallel_enabledcp_group_initializedrQ   tensormeanstdr"   rk   rb   rc   rd   re   rf   r   is_initializedget_context_parallel_groupsize_initialize_context_parallelevalrequires_grad_is_amptor   contextampautocast)r    r;   vae_pthr4   r_   r5   rr   rB   r~   rn   ro   rp   rq   rv   rw   cp_groupr)   r)   r*   __init__   sJ   
zCosmosPolicyWanVAE.__init__c                 C   s   t dd | j D S )Nc                 s   s    | ]}|  V  qd S N)numel).0pr)   r)   r*   	<genexpr>,  s    z1CosmosPolicyWanVAE.count_param.<locals>.<genexpr>)sumrb   
parametersr    r)   r)   r*   count_param+  s   zCosmosPolicyWanVAE.count_paramc           	   
   C   sD  | j r6| |r|   n)z| |}|   W n ty5 } ztt| |   W Y d}~nd}~ww | j	rEt
j  t }t }|j}| j5 | jsU|| j}| j	rat
j  t }| j|| j|}| j	ryt
j  t | |_W d   n1 sw   Y  ||}| j	rt
j  t | |_||fS |S )zH
        videos: A list of videos each with shape [C, T, H, W].
        N)rp   _is_image_batch_disable_context_parallel&_broadcast_split_for_model_parallelsim_enable_context_parallel
ValueErrorr   warningstrrn   rQ   rm   synchronizer   timeperf_counterrr   r   r~   r   rb   r   r"   model_invocationtotal)	r    videosr#   ebenchmark_times
total_timein_dtype
model_timelatentr)   r)   r*   r   .  sD   








zCosmosPolicyWanVAE.encodec           	      C   s  | j rtj  t }t }| jra| |r| 	  nE|j
d | jd  dko3|j
d | jd  dk}|s]td| j d|j
d  d| jd  d|j
d  d| jd  d	 | 	  n|   |j}| j5 | jsq|| j}| j r}tj  t }| j|| j|}| j rtj  t | |_W d    n1 sw   Y  ||}| jr| jr| |}| j rtj  t | |_||fS |S )
N   r   r9   r7   &For parallel encoding with grid_shape z9 latent height should be divisible by grid_shape[0], got z / z5 and width should be divisible by grid_shape[1], got z, falling back to non CP)rn   rQ   rm   r   r   r   r   rp   r   r   shaperq   r   r   r   rr   r   r~   r   rb   decoder"   r   rs   _cat_outputs_cpr   )	r    zsclear_decoder_cacher   r   can_apply_cpr   r   video_reconr)   r)   r*   r   T  sF   


0<






zCosmosPolicyWanVAE.decodec                 C      dS N   r)   r   r)   r)   r*   spatial_compression_factor|     z-CosmosPolicyWanVAE.spatial_compression_factorc                 C   r   Nr9   r)   r   r)   r)   r*   temporal_compression_factor  r   z.CosmosPolicyWanVAE.temporal_compression_factorc                 C   r   )Nr   r)   r   r)   r)   r*   _cp_dim  r   zCosmosPolicyWanVAE._cp_dimstatereturnc                    sZ  t  jdksJ d| j\}} jd || j  dko' jd || j  dk}|sStd| j d jd  d| jd  d	| j d
 jd  d| jd  d	| j d    fddt|| D }tj| | j	d |d   jd | } jd | }tj
| j	d}|| }	|| }
 d d d d d d |	| |	d | |
| |
d | f S )N   zState should be of shape BCTHWr   r   r9   r   zE height should be divisible by compression_factor*grid_shape[0], got z / (z * zI) and width should be divisible by compression_factor*grid_shape[1], got r7   z), falling back to non CPc                       g | ]}t  qS r)   rQ   
zeros_liker   _r   r)   r*   
<listcomp>      zMCosmosPolicyWanVAE._broadcast_split_for_model_parallelsim.<locals>.<listcomp>group)lenr   rq   r   r   
contiguousrangedistributed
all_gatherr   r	   )r    r   cp_rowscp_colscan_cp_be_applied_to_shape
state_listchunk_hchunk_w
group_rankrow_idcol_idr)   r   r*   r     s&   
L>z9CosmosPolicyWanVAE._broadcast_split_for_model_parallelsimlocal_video_reconc                    sV    fddt jD tj jd tjfddt jd D dd}|S )Nc                    r   r)   r   r   )r   r)   r*   r     r   z6CosmosPolicyWanVAE._cat_outputs_cp.<locals>.<listcomp>r   c                    s*   g | ]}t j|d  jd  ddqS )Nr7   r   r:   )rQ   catrq   )r   c)r    video_recon_chunksr)   r*   r     s   * r7   r9   r   )r   cp_group_sizer   r   r   rQ   r   rq   )r    r   r   r)   )r   r    r   r*   r     s   z"CosmosPolicyWanVAE._cat_outputs_cpc                 C   :   d| _ | j D ]\}}| D ]	\}}|d qqd S )NTrs   pluginsitems
set_enabler    r   plugin_listpluginr)   r)   r*   r        z+CosmosPolicyWanVAE._enable_context_parallelc                 C   r   )NFr   r   r)   r)   r*   r     r   z,CosmosPolicyWanVAE._disable_context_parallelr!   c                 C   s$   t |jdksJ d|jd dkS )Nr   z#Expected tensor's shape to be BCTHWr8   r7   )r   r   )r    r!   r)   r)   r*   r     s   z"CosmosPolicyWanVAE._is_image_batchr   c                 C   sh   | j du sJ d| _|| _d| _|| _tt| j| _t	| j
| j|| _|   td| d d S )NFTzEnabled CP with grid_shape: z for Wan2.1 tokenizer)rt   rp   rq   rs   r   r   r   get_process_group_ranksr   r   rb   r   r   r   rZ   r    r   rq   r)   r)   r*   r{     s   z/CosmosPolicyWanVAE._initialize_context_parallelr+   )r,   r-   r.   r/   rQ   bfloat16r   boolintr   tupler   r   no_gradr   r   propertyr   r   r   Tensorr   r   r   r   r   r   ProcessGroupr{   r)   r)   r)   r*   rl      sX    	

^%'


rl   c                   @   s   e Zd ZdZd-defddZdejdeeef d	d
fddZ	e
dd Zdd Zdd Zdejd	ejfddZdejd	ejfddZded	efddZded	efddZe
dd  Ze
d!d" Ze
d#d$ Ze
d%d& Ze
d'd( Ze
d)d* Ze
d+d, Zd
S ).Wan2pt1VAEInterfacez
    Cosmos Policy Wan2pt1VAE Interface with deterministic seeding support.

    Uses CosmosPolicyWanVAE which instantiates our deterministic WanVAE_ subclass.
    Q   Fchunk_durationc                 K   sr   | dd| _| dd| _ttjd|| dd| dd| dd	| d
d| dd d| _~|| _d| _d S )Nkeep_decoder_cacheFkeep_encoder_cacher   r3   r4   r2   ro   r9   rp   rq   )rr   r~   r_   r   r4   ro   rp   rq   )	r   r   r   rl   rQ   r   rb   r   cp_initialized)r    r   r_   r`   r)   r)   r*   r     s$   




zWan2pt1VAEInterface.__init__r   rq   r   Nc                 C   s&   | j du sJ d| _ | j|| d S )NFT)r   rb   r{   r   r)   r)   r*   initialize_context_parallel  s   z/Wan2pt1VAEInterface.initialize_context_parallelc                 C   s   | j jS r   )rb   rr   r   r)   r)   r*   rr     s   zWan2pt1VAEInterface.dtypec                 C   s   d S r   r)   r   r)   r)   r*   reset_dtype  s   zWan2pt1VAEInterface.reset_dtypec                 C   s   | j j   dS )z5Clear the feature cache for both encoder and decoder.N)rb   clear_cacher   r)   r)   r*   r     s   zWan2pt1VAEInterface.clear_cacher   c                 C   s   | j j|| j d}|jd }|dkr#|| j j| | j j| S || j jd d d d d |f | | j jd d d d d |f | S )N)r#   r8   r7   )	rb   r   r   r   rc   type_asrd   re   rf   )r    r   latents
num_framesr)   r)   r*   r     s   
 ,zWan2pt1VAEInterface.encoder   c                 C   s   |j d }|dkr$| jj|| jj| | jj|  | j d}n0| jj|| jjd d d d d |f | | jj	d d d d d |f |  | j d}t
|trjt|dkscJ d|d d}|S )Nr8   r7   )r   zAssuming batch_size=1 was usedr   )r   rb   r   rd   r   rc   r   r   rf   re   
isinstancelistr   	unsqueeze)r    r   r   reconr)   r)   r*   r     s"   
"&"
zWan2pt1VAEInterface.decodenum_pixel_framesc                 C   s   d|d d  S Nr7   r9   r)   )r    r   r)   r)   r*   get_latent_num_frames     z)Wan2pt1VAEInterface.get_latent_num_framesnum_latent_framesc                 C   s   |d d d S r   r)   )r    r   r)   r)   r*   get_pixel_num_frames  r   z(Wan2pt1VAEInterface.get_pixel_num_framesc                 C   r   r   r)   r   r)   r)   r*   r   "  r   z.Wan2pt1VAEInterface.spatial_compression_factorc                 C   r   r   r)   r   r)   r)   r*   r   &  r   z/Wan2pt1VAEInterface.temporal_compression_factorc                 C   s   | j S r   )r   r   r)   r)   r*   pixel_chunk_duration*  s   z(Wan2pt1VAEInterface.pixel_chunk_durationc                 C   s   |  | jS r   )r   r   r   r)   r)   r*   latent_chunk_duration.  s   z)Wan2pt1VAEInterface.latent_chunk_durationc                 C   r   )NrC   r)   r   r)   r)   r*   	latent_ch2  r   zWan2pt1VAEInterface.latent_chc                 C   r   )Ni   r)   r   r)   r)   r*   spatial_resolution6  r   z&Wan2pt1VAEInterface.spatial_resolutionc                 C   r   )NZcosmos_policy_wan2pt1_tokenizerr)   r   r)   r)   r*   name:  r   zWan2pt1VAEInterface.name)r   F)r,   r-   r.   r/   r   r   r   r   r   r   r   rr   r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r  r)   r)   r)   r*   r     s4     







r   )NNr1   r2   Fr3   )'r/   r   r   r   
contextlibr   typingr   rQ   torch.distributedr   megatron.corer   #cosmos_policy._src.imaginaire.flagsr   r   #cosmos_policy._src.imaginaire.utilsr   Z/cosmos_policy._src.imaginaire.utils.distributedr   r	   r
   +cosmos_policy._src.imaginaire.utils.easy_ior   0cosmos_policy._src.predict2.tokenizers.interfacer   .cosmos_policy._src.predict2.tokenizers.wan2pt1r   ZBaseWanVAE_9cosmos_policy._src.predict2.tokenizers.wan2pt1_2d_pluginsr   8cosmos_policy._src.predict2.utils.tokenizer_benchmarkingr   $cosmos_policy.utils.checkpoint_utilsr   r   rk   rl   r   r)   r)   r)   r*   <module>   sD   +
o  