o
    vi                     @   s  d Z ddlZddlZddlmZmZmZmZmZ ddl	Z	ddl
Z
ddlmZ ddlmZ ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZm Z m!Z! e"dZ#e"dZ$e	j%ddG dd deZ&G dd deZ'dS )a  
Cosmos Policy Video2World Model - extends CosmosPolicyDiffusionModel with video conditioning.

IMPORTANT: This class inherits from CosmosPolicyDiffusionModel (not Video2WorldModel)
to ensure it gets all the policy-specific functionality (training_step, compute_loss, etc.)
    N)AnyCallableDictOptionalTuple)	rearrange)parallel_state)Tensor)DenoisePrediction)HighSigmaStrategy)DataType)NUM_CONDITIONAL_FRAMES_KEYConditioningStrategy)Text2WorldCondition)Video2WorldCondition)CosmosPolicyDiffusionModelCosmosPolicyModelConfig replace_latent_with_action_chunkreplace_latent_with_proprio   i F)slotsc                       s   e Zd ZU dZdZeed< dZeed< dZe	ed< e
ejZe
ed< d	Zeed
< e
ejZe
ed< dZe	ed< dZe	ed< dZeeee	f  ed<  fddZ  ZS )CosmosPolicyVideo2WorldConfigz
    Extended config for Cosmos Policy Video2World model.
    Inherits from CosmosPolicyModelConfig and adds some video-specific parameters in the same way that
    Video2WorldConfig adds video-specific parameters to Text2WorldModelConfig.
       min_num_conditional_frames   max_num_conditional_framesg-C6?sigma_conditionalconditioning_strategyTdenoise_replace_gt_frameshigh_sigma_strategyg?high_sigma_ratiolow_sigma_ratioNconditional_frames_probsc                    sb   t    | jttjfv sJ | jttjttj	ttj
ttjttjttjfv s/J d S N)super__attrs_post_init__r   strr   FRAME_REPLACEr   r   NONEUNIFORM80_2000LOGUNIFORM200_100000BALANCED_TWO_HEADS_V1SHIFT24HARDCODED_20steps)self	__class__ S/data/cameron/vidgen/cosmos-policy/cosmos_policy/models/policy_video2world_model.pyr%   D   s   

z1CosmosPolicyVideo2WorldConfig.__attrs_post_init__)__name__
__module____qualname____doc__r   int__annotations__r   r   floatr&   r   r'   r   r   boolr   r)   r   r    r!   r"   r   r   r%   __classcell__r1   r1   r/   r2   r   2   s   
 r   c                       s   e Zd ZdZdef fddZdeeej	f de
e	e	ef f fddZd	ej	d
ej	dedefddZ					ddededededej	dedefddZdededej	f fddZdej	dej	dedej	fddZ  ZS ) CosmosPolicyVideo2WorldModelu  
    Cosmos Policy Video2World Model - extends CosmosPolicyDiffusionModel with video conditioning.

    Inheritance: CosmosPolicyDiffusionModel → Text2WorldModel

    Adds Video2World functionality to the policy base:
    - Video frame conditioning with gt_frames
    - High sigma sampling strategies
    - FlowUniPC scheduler support
    - Policy-specific mask manipulation for world model/value function
    configc                    s   t  | || _d S r#   )r$   __init__r=   )r.   r=   r/   r1   r2   r>   `   s   
z%CosmosPolicyVideo2WorldModel.__init__
data_batchreturnc                    s
  t  |\}}}|j|jd i | j| jj| jj|t	d| jj
d}d|v r|d }|d }|jjdd \}}|ddd	d
ddd|||jj}|ddd	d
ddd|||jj}tj|jd |jd}	|dddddddddf |j|	dd|d ddddf< |jjd }
|dd|
dd}t| t|j|j|_t|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< | jjrt|d dkr^t|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< t|d dkrt|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< d|v rt|d dkrt|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< t|d dkrbt|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< d|v rt|d dkrt|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< t|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< | jjrt|d dkrat|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< t|d dkrt|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< d|v rt|d dkrt|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< t|d dkret|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< d|v rt|d dkrt|dddddddddf  t|j|	dd|d ddddf |j|	dd|d ddddf |j|	dd|d ddddf< |j |_t|j|d |d d|_d|v rt|d dkrt|j|d |d d|_d|v r
t|d dkr
t|j|d |d d|_t|d dkrTd|v rTtj|jjd |jjd}	|jj\}}}}}|d ddddd||||jj|j|	dd|d ddddf< |||fS )!aA  
        Extended get_data_and_condition with policy-specific conditioning logic.

        Adds:
        - Condition mask manipulation for world model and value function samples
        - Latent injection for actions and proprio in gt_frames
        - Input masking for different prediction modes (V(s'), Q(s,a))
        N	gt_frames!random_min_num_conditional_frames!random_max_num_conditional_framesnum_conditional_framesr"   Zrollout_data_maskworld_model_sample_maskvalue_function_sample_maskr   r         r   deviceaction_latent_idxvalue_latent_idxcurrent_proprio_latent_idxcurrent_wrist_image_latent_idxcurrent_wrist_image2_latent_idxcurrent_image_latent_idxcurrent_image2_latent_idxfuture_proprio_latent_idxfuture_wrist_image_latent_idxfuture_wrist_image2_latent_idxfuture_image_latent_idxfuture_image2_latent_idxactions)action_indicesproprioZproprio_indicesZfuture_proprioZvalue_function_returnr1   ) r$   get_data_and_conditionset_video_conditiontotensor_kwargsr=   r   r   getr   r"   $condition_video_input_mask_B_C_T_H_Wshape	unsqueezeexpanddtypetorcharangerM   wherer:   	ones_like
zeros_like.mask_current_state_action_for_value_predictionall'mask_future_state_for_qvalue_predictionrB   cloneorig_gt_framesr   r   reshape)r.   r?   	raw_statelatent_state	conditionrF   rG   ZH_latentZW_latentbatch_indicesTZvalue_mask_all_frames_ZC_latentr/   r1   r2   r^   d   s  



 "
$"
$$
$$
$$
$$
$$
$

	
"
z3CosmosPolicyVideo2WorldModel.get_data_and_conditionxt_B_C_T_H_Wsigmaru   c           $   
   C   sp  |j dkrt|d}n|j dkr|}n	td|j dt|d}| j|d\}}}}	|| }
|jr|j|
| jj	 }|j
sC|d }|j\}}}}}|jd|ddd|
}|| |
d|   }
t|| jj }| j|d\}}}}|jg d	d
d}|| |	d|   }	| jd"|
jd"i | j|	jg d	djd"i i | jd| jjrtjn| jd id|  }|| ||  }|jr| jjr|j|| |d|   }|| | }d}|r2d}ddlm} | r| r2| dkr2ddlm} t j!dd
d |j}| j"#|}|d d $dd%dddddtj&' ( }t)|jd D ]!}t j*+dd| d}|,|||f -| t.d|  q1|j/}| j"#|}|d d $dd%dddddtj&' ( }t)|jd D ]!}t j*+dd| d}|,|||f -| t.d|  q||
}| j"#|}|d d $dd%dddddtj&' ( } t)| jd D ]!}t j*+dd| d}|,| ||f -| t.d|  q|}!| j"#|!}"|"d d $dd%dddddtj&' ( }#t)|#jd D ]!}t j*+dd | d}|,|#||f -| t.d!|  qt0||dS )#z
        Performs denoising with optional debugging visualization support.

        Extended from base to add debugging code for visualizing latent frames.
        r   b -> b 1r   zsigma shape  is not supportedb t -> b 1 t 1 1)rz   r   )r   rI   rJ   T)dimkeepdimr~   rg   )x_B_C_T_H_Wtimesteps_B_TFN)Imagetemp)exist_ok      ?g     _@   rI   rJ   z"contaminated_ground_truth_frames--z.pngz/Saved contaminated ground-truth frame at path: zclean_ground_truth_frames--z(Saved clean ground-truth frame at path: znoised_latent_frames--z#Saved noised latent frame at path: zdenoised_latent_frames--z%Saved denoised latent frame at path: r1   )1ndimr   
ValueErrorrd   scalingis_videorB   type_asr=   
sigma_datause_video_conditionrc   repeatrh   rk   r   meannetr`   ra   squeezeuse_wan_fp32_strategyfloat32to_dictr9   r   torch.distributeddistributedis_initializedget_rankPILr   osmakedirs	tokenizerdecodeclamppermuteuint8cpunumpyrangepathjoin	fromarraysaveprintrq   r
   )$r.   ry   rz   ru   	sigma_B_Tsigma_B_1_T_1_1c_skip_B_1_T_1_1c_out_B_1_T_1_1c_in_B_1_T_1_1c_noise_B_1_T_1_1net_state_in_B_C_T_H_Wcondition_state_in_B_C_T_H_Wrx   Ccondition_video_masksigma_cond_B_1_T_1_1c_noise_cond_B_1_T_1_1condition_video_mask_B_1_T_1_1net_output_B_C_T_H_Wx0_pred_B_C_T_H_Weps_pred_B_C_T_H_W	DEBUGGINGZSAMPLE_INDEXdistr   Zground_truth_framesZdecoded_ground_truth_framesZ(unnormalized_decoded_ground_truth_framesidx	save_pathZnoisy_inputZdecoded_noisy_inputZ unnormalized_decoded_noisy_inputdenoised_outputZdecoded_denoised_outputZ$unnormalized_decoded_denoised_outputr1   r1   r2   denoise  s   








z$CosmosPolicyVideo2WorldModel.denoise      ?FNguidanceis_negative_promptskip_vae_encodingprevious_generated_latentreturn_orig_clean_latent_framesc                    s  t v r	t  }nd}|rj\ nj\ } |r*tjntj dur=|r9tjntj|rL|dusGJ d|	 }	n
\}
}	}
 j|	jjjj|jjd duruj|	jjjj|d jd|d durjd|d j	  _ jjd	 }d
v rtd dkrɈd
 }d }tj||jd}d j|dd|ddddf< t j||d _dv rd rtj| jjd}td dkrd	 j|ddd ddddf< td dkrd	 j|ddd ddddf< dv r4td dkr4d	 j|ddd ddddf< td dkrPd	 j|ddd ddddf< dv rqtd dkrqd	 j|ddd ddddf< d	 j|ddd ddddf< dv r,d r,tj| jjd}td dkrd	 j|ddd ddddf< td dkrd	 j|ddd ddddf< dv rtd dkrd	 j|ddd ddddf< td dkrd	 j|ddd ddddf< dv r,td dkr,d	 j|ddd ddddf< |	 dd\}
 }
}
durI|	dd\}
}
}
t rOn	jjrXJ ddtjdtjdtjf fdd }|rt| jfS |S )!aG  
        NOTE (user): Modified to remove the negative prompt and "uncondition" since we are doing always-conditional generation instead of CFG.

        Generates a callable function `x0_fn` based on the provided data batch and guidance factor.

        Extended to support:
        - Handling uncondition=None case (always-conditional generation)
        - Skip VAE encoding for autoregressive generation
        - Return original clean latent frames
        - Latent injection for proprio and actions in conditioning

        Args:
        - data_batch (Dict): A batch of data used for conditioning.
        - guidance (float, optional): Guidance scale. Defaults to 1.5.
        - is_negative_prompt (bool): use negative prompt t5 in uncondition if true
        - skip_vae_encoding (bool): Skip VAE encoding if True
        - previous_generated_latent (torch.Tensor): Previous generated sample
        - return_orig_clean_latent_frames (bool): Whether to return the original condition

        Returns:
        - Callable: A function `x0_fn(noise_x, sigma)` that returns x0 prediction
        r   NzHprevious_generated_latent must be provided if skip_vae_encoding is True!rA   )rB   rC   rD   rE   T)is_cfg_conditionalrE   Fr   r\   rP   rK   rL   r]   rm   rQ   rR   rS   rT   rN   ro   rU   rV   rW   rX   rY   zIparallel_state is not initialized, context parallel should be turned off.noise_xrz   r@   c           
         s   j jr| | }| |}|||   }|S | | j}d ur9| |j}|||   }n|}dv rYdv sGJ dd }d }	|	| d|	 |  }|S )Nguided_imageguided_maskz>guided_mask should be in data_batch if guided_image is presentr   )r=   use_flowunipc_schedulerdenoise_with_velocityr   x0)
r   rz   cond_velocityuncond_velocityvelocitycond_x0	uncond_x0raw_x0guide_image
guide_maskru   r?   r   r.   unconditionr1   r2   x0_fn  s    z@CosmosPolicyVideo2WorldModel.get_x0_fn_from_batch.<locals>.x0_fn) r   conditioner"get_condition_with_negative_promptget_condition_unconditionis_image_batchedit_data_typer   IMAGEVIDEOrp   r^   r_   r=   r   r   r"   edit_for_inferencerB   rq   rc   rd   rh   rn   ri   rM   r   %broadcast_split_for_model_parallelsimr   r   r   is_context_parallel_enabledr	   )r.   r?   r   r   r   r   r   rE   r   r   rx   Br\   rP   rv   r   r1   r   r2   get_x0_fn_from_batch9  s    




 


$




(
z1CosmosPolicyVideo2WorldModel.get_x0_fn_from_batchx0_sizec                    s  t  ||\}}|jtjk}|r| jjttj	krCt
j|j|jd| jjk }t
j|j|jd|d d }t
|||}||fS | jjttjkrxt
j|j|jd| jjk }t
j|j|jd|tt  t }t
|| |}||fS | jjttjkrt
j|j|jd }	d|	 d|	 d |	  }	|	d|	   }t
j|j|jd| jjk }t
j|j|jd|d d }t
|||}||fS | jjttjkr"t
j|j|jd| jjk }t
j|j|jd|tt  t }t
|| |}t
j|j|jd| jjk }t
j|j|jd|d d }
t
||
|}||fS | jjttjkrjt| d	sUd
dlm} || jj| jj ddd}t
j!t
j"dg|jd|gd
d| _#| j#t
$d
t%| j#|j |}||fS | jjttj&kry	 ||fS t'd| jj d||fS )z
        Extended sigma sampling with high sigma strategy support.

        Inherited from Video2WorldModel to support various high sigma strategies.
        rL   i  P      r   r   g       @gh㈵>hardcoded_20steps_sigmar   )
get_rev_ts   g      @)t_mint_max	num_stepsts_orderg     j@r   zHigh sigma strategy r|   )(r$   draw_training_sigma_and_epsilon	data_typer   r   r=   r   r&   r   r)   rh   randrd   rM   r    r   rj   r*   
LOG_100000LOG_200expr,   doubler9   r+   r!   r-   hasattr1cosmos_policy._src.imaginaire.modules.res_samplerr   sde	sigma_min	sigma_maxcattensorr   randintlenr(   r   )r.   r   ru   	sigma_B_1epsilonis_video_batchmask	new_sigmalog_new_sigma_tlow_sigma_B_1r   r   r/   r1   r2   r     sp    0)   z<CosmosPolicyVideo2WorldModel.draw_training_sigma_and_epsilonnoise_x_in_t_spacet_B_Tc           	      C   sx   |j dkrt|d}n|j dkr|}n	td|j d|d|  }|dt|d  }| |||}|j}|j}|| S )zX
        This function is used when self.config.use_flowunipc_scheduler is set.
        r   r{   r   zt_B_T shape r|   r   r}   )r   r   r   rd   r   r   eps)	r.   r  r  ru   r   x_B_C_T_H_W_in_sigma_spacedenoise_output_B_C_T_H_Wr   r   r1   r1   r2   r   D  s   

z2CosmosPolicyVideo2WorldModel.denoise_with_velocity)r   FFNF)r3   r4   r5   r6   r   r>   dictr&   rh   r	   r   r   r^   r   r
   r   r   r9   r:   r   r   r7   r   r   r   r;   r1   r1   r/   r2   r<   S   sf      (
 3
 L@r<   )(r6   mathr   typingr   r   r   r   r   attrsrh   einopsr   megatron.corer   r	   6cosmos_policy._src.imaginaire.utils.denoise_predictionr
   7cosmos_policy._src.imaginaire.utils.high_sigma_strategyr   'cosmos_policy._src.predict2.conditionerr   4cosmos_policy._src.predict2.models.video2world_modelr   r   cosmos_policy.conditionerr   8cosmos_policy.config.conditioner.video2world_conditionerr   Z,cosmos_policy.models.policy_text2world_modelr   r   r   r   logr   r   definer   r<   r1   r1   r1   r2   <module>   s*   


 