o
    vi8                  	   @   s  U d Z ddlZddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZ ddl
mZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZ eddG dd deZG dd deZG dd deZG dd deZe eedddddeedddddeedgdddeeddddd Z!eed0i e!Z"ee#d!< eed0i e!Z$ee#d"< eddG d#d$ d$eZ%G d%d& d&eZ&ee&eedgdd'eedddddeedddddeed(d)egdd*d+d,Z'ee#d-< d.d/ Z(dS )1z
Video2World conditioner configurations for Cosmos Policy.

Provides mutable versions of Video2WorldCondition and related classes.
These need to be mutable for Cosmos Policy since it modifies parts of the condition
objects during training.
    N)	dataclass)DictOptional)ConfigStore)LazyCall)LazyDict)broadcast_split_tensor)BooleanFlagReMapkeyTextAttrTextAttrEmptyStringDrop)WAN2PT1_I2V_COND_LATENT_KEY)Wan2pt1CLIPEmb)GeneralConditionerText2WorldConditionF)frozenc                   @   s   e Zd ZU dZdZeed< dZee	j
 ed< dZee	j
 ed< 		dde	j
deded	ee d
eeeef  dd fddZ	dded	edd fddZde	jjdd fddZdS )Video2WorldConditionz:Mutable version of Video2WorldCondition for Cosmos Policy.Fuse_video_conditionN	gt_frames$condition_video_input_mask_B_C_T_H_W!random_min_num_conditional_frames!random_max_num_conditional_framesnum_conditional_framesconditional_frames_probsreturnc              	   C   sH  | j dd}||d< |j\}}}	}
}tj|d|	|
||j|jd}|	dkr-tj|tjd}nL|durOt|tjrDtj	|tjd|
  }n5tj	|tjd| }n*|durnt| }t| }tjtj|||dtjd}ntj||d |fd	}t|D ]}||ddd|| ddddf  d7  < q}||d
< t| di |S )a	  
        Sets the video conditioning frames for video-to-video generation.

        This method creates a conditioning mask for the input video frames that determines
        which frames will be used as context frames for generating new frames. The method
        handles both image batches (T=1) and video batches (T>1) differently.

        Args:
            gt_frames: A tensor of ground truth frames with shape [B, C, T, H, W], where:
                B = batch size
                C = number of channels
                T = number of frames
                H = height
                W = width

            random_min_num_conditional_frames: Minimum number of frames to use for conditioning
                when randomly selecting a number of conditioning frames.

            random_max_num_conditional_frames: Maximum number of frames to use for conditioning
                when randomly selecting a number of conditioning frames.

            num_conditional_frames: Optional; If provided, all examples in the batch will use
                exactly this many frames for conditioning. If None, a random number of frames
                between random_min_num_conditional_frames and random_max_num_conditional_frames
                will be selected for each example in the batch.

            conditional_frames_probs: Optional; Dictionary mapping number of frames to probabilities.
                If provided, overrides the random_min/max_num_conditional_frames with weighted sampling.
                Example: {0: 0.5, 1: 0.25, 2: 0.25} for 50% chance of 0 frames, 25% for 1, 25% for 2.

        Returns:
            A new Video2WorldCondition object with the gt_frames and conditioning mask set.
            The conditioning mask (condition_video_input_mask_B_C_T_H_W) is a binary tensor
            of shape [B, 1, T, H, W] where 1 indicates frames used for conditioning and 0
            indicates frames to be generated.

        Notes:
            - For image batches (T=1), no conditioning frames are used (num_conditional_frames_B = 0).
            - For video batches:
                - If num_conditional_frames is provided, all examples use that fixed number of frames.
                - Otherwise, each example randomly uses between random_min_num_conditional_frames and
                random_max_num_conditional_frames frames.
            - The mask marks the first N frames as conditioning frames (set to 1) for each example.
        Fskip_underscorer      )dtypedevice)r   N)weightsk)sizer    )to_dictshapetorchzerosr   r   int32
isinstanceTensoronescpulistkeysvaluestensorrandomchoicesrandintrangetype)selfr   r   r   r   r   kwargsB_THWr   num_conditional_frames_Bframes_optionsr    idxr#   r#   ^/data/cameron/vidgen/cosmos-policy/cosmos_policy/config/conditioner/video2world_conditioner.pyset_video_condition9   s2   40z(Video2WorldCondition.set_video_conditionTr   is_cfg_conditionalc                 C   s(   | j | jdd|d}|s|jd |S )Nr   r   r   r   r   T)rA   r   r   fill_r6   rB   r   
_conditionr#   r#   r@   edit_for_inference   s   z'Video2WorldCondition.edit_for_inferenceprocess_groupc                 C   s   | j r| S | j}| j}| jdd}d |d< d |d< tt| di ||}|jdd}|j\}}}}}|d urP|dkrP| dkrPt	|d|d}t	|d|d}||d< ||d< t| di |S )	NFr   r   r   r      seq_dimrH   r#   )
is_broadcastedr   r   r$   r   	broadcastr5   r%   r"   r   )r6   rH   r   r   r7   new_conditionr9   r:   r#   r#   r@   rM      s,   zVideo2WorldCondition.broadcastNNTr   )__name__
__module____qualname____doc__r   bool__annotations__r   r   r&   r*   r   intr   floatrA   rG   distributedProcessGrouprM   r#   r#   r#   r@   r   0   s:   
 
V
r   c                       sp   e Zd ZdZ		ddejdededee deeee	f  dd f fd	d
Z
	ddededd f fddZ  ZS )Video2WorldConditionV2z
    compared to Video2WorldCondition, this class apply zero frames when use_video_condition is False~(unconditional generation in cfg)
    in the case, we do zero-out conditional frames in the video condition
    Nr   r   r   r   r   r   c                    s$   | j sdn|}t j|||||dS )Nr   )r   r   r   r   r   )r   superrA   )r6   r   r   r   r   r   	__class__r#   r@   rA      s   z*Video2WorldConditionV2.set_video_conditionTr   rB   c                    s   ~t  j| jdd|d}|S )Nr   rC   )r\   rA   r   rE   r]   r#   r@   rG      s   z)Video2WorldConditionV2.edit_for_inferencerO   rP   )rQ   rR   rS   rT   r&   r*   rW   r   r   rX   rA   rU   rG   __classcell__r#   r#   r]   r@   r[      s2    
r[   c                       :   e Zd Z	ddedeeeef  def fddZ  Z	S )Video2WorldConditionerNbatchoverride_dropout_rater   c                       t  ||}tdi |S Nr#   )r\   _forwardr   r6   rb   rc   outputr]   r#   r@   forward      zVideo2WorldConditioner.forwardN)
rQ   rR   rS   r   r   strrX   r   ri   r_   r#   r#   r]   r@   ra          ra   c                       r`   )Video2WorldConditionerV2Nrb   rc   r   c                    rd   re   )r\   rf   r[   rg   r]   r#   r@   ri      rj   z Video2WorldConditionerV2.forwardrk   )
rQ   rR   rS   r   r   rl   rX   r[   ri   r_   r#   r#   r]   r@   rn      rm   rn   fpsg        )	input_key
output_keydropout_rater   padding_maskt5_text_embeddingsg?)rp   rr   use_empty_stringr   )rp   rq   rr   )ro   rs   textr   VideoPredictionConditionerVideoPredictionConditionerV2c                   @   sL   e Zd ZU dZdZeej ed< dZ	eej ed< dej
jddfdd	ZdS )
VideoPredictionWan2pt1ConditionzEMutable version of VideoPredictionWan2pt1Condition for Cosmos Policy.Nframe_cond_crossattn_emb_B_L_Dy_B_C_T_H_WrH   r   r   c                 C   sz   | j r| S | j}| jdd}d|d< tt| di ||}|jdd}|dur0t|d|d}||d< t| di |S )aU  Broadcasts and splits the condition across the checkpoint parallelism group.
        For most condition, such asT2VCondition, we do not need split.

        Args:
            process_group: The process group for broadcast and split

        Returns:
            A new BaseCondition instance with the broadcasted and split condition.
        Fr   Nr{   rI   rJ   r#   )rL   r{   r$   r   rM   r5   r   )r6   rH   r{   r7   rN   r#   r#   r@   rM     s   
z)VideoPredictionWan2pt1Condition.broadcast)rQ   rR   rS   rT   rz   r   r&   r*   rV   r{   rY   rZ   rM   r#   r#   r#   r@   ry     s
   
 ry   c                       r`   )!VideoPredictionWan2pt1ConditionerNrb   rc   r   c                    rd   re   )r\   rf   ry   rg   r]   r#   r@   ri   6  rj   z)VideoPredictionWan2pt1Conditioner.forwardrk   )
rQ   rR   rS   r   r   rl   rX   ry   ri   r_   r#   r#   r]   r@   r|   5  rm   r|   )rp   rr   imagesvideobfloat16)rp   rr   r   )rv   ro   rs   wanclip/VideoConditionerFpsPaddingEmptyStringDrppConfigc                  C   sB   t  } | jdddtd | jdddtd | jdddtd d S )Nconditionerzmodel.config.conditionervideo_prediction_conditioner)grouppackagenamenodevideo_prediction_conditioner_v26wan2pt1_video_prediction_conditioner_empty_string_drop)r   instancestorerw   rx   r   )csr#   r#   r@   register_conditionerX  s&   
r   r#   ))rT   r1   dataclassesr   typingr   r   r&   hydra.core.config_storer   )cosmos_policy._src.imaginaire.lazy_configr   Lr   4cosmos_policy._src.imaginaire.utils.context_parallelr   'cosmos_policy._src.predict2.conditionerr	   r
   r   r   <cosmos_policy._src.predict2.models.video2world_wan2pt1_modelr   )cosmos_policy._src.predict2.networks.clipr   Zcosmos_policy.conditionerr   r   r   r[   ra   rn   dict_SHARED_CONFIGrw   rV   rx   ry   r|   r   r   r#   r#   r#   r@   <module>   s    $



"
