o
    vi߱                     @  s  d Z ddlmZ ddlmZmZmZ ddlZddlZddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" d"ddZ#d#ddZ$ej%ddG dd deZ&G d d! d!eZ'dS )$z]
Cosmos Policy Diffusion Model - extends Text2WorldModel with policy-specific functionality.
    )annotations)DictOptionalTupleN)	rearrange)LazyCall)LazyDict)instantiate)COMMON_SOLVER_OPTIONS)misc)broadcast_split_tensorcat_outputs_cp)DiffusionModel)Text2WorldModelConfig)Text2WorldCondition)CosmosPolicySampler)HybridEDMSDEx0torch.Tensoraction_chunkaction_indicesreturnc                 C  s  t j| jd | jd}| |dd|ddddf }t |}|j\}}}}	||d}
|
jd }|| |	 }||ksFJ d| d| d|| d | }|
d|}|ddd|f }||d}||ddddf< |||||	}| }|||dd|ddddf< |S )	a  
    Replaces the image latent (at the specified action index) in clean input image latents x0 with the action chunk.

    Example:
    Let's say x0 has shape (B=32, C'=16, T', H'=28, W'=28) and action_chunk has shape (B=32, chunk_size=8, action_dim=7).
    Then, this function will overwrite the (C'=16, H'=28, W'=28) volume at x0[:,:,action_indices,:,:] with the action chunk,
    repeating it as many times as needed to fill the entire volume.

    Args:
        x0 (torch.Tensor): Clean image latents.
        action_chunk (torch.Tensor): Ground-truth action chunk.
        action_indices (torch.Tensor): Batch indices of the image latents to replace.

    Returns:
        torch.Tensor: Modified image latents.
    r   deviceN   z@Not enough room in the latent tensor for the full action chunk: z action elements >  latent elements!)torcharangeshaper   
zeros_likereshaperepeat)r   r   r   batch_indicesZaction_image_latentresult
batch_sizelatent_channelslatent_hlatent_wZflat_actionnum_action_elementslatent_elementsnum_repeatsZrepeated_actionflat_resultnew_x0 r.   R/data/cameron/vidgen/cosmos-policy/cosmos_policy/models/policy_text2world_model.py replace_latent_with_action_chunk-   s&   


r0   proprioproprio_indicesc                 C  s  t j| jd | jd}| |dd|ddddf }t |}|j\}}}}	|jd }
|| |	 }|
|ks@J d|
 d| d||
 d |
 }|d|}|ddd|f }||d}||ddddf< |||||	}| }|||dd|ddddf< |S )	a  
    Replaces the image latent (at the specified proprio index) in clean input image latents x0 with the proprio.

    Example:
    Let's say x0 has shape (B=32, C'=16, T', H'=28, W'=28) and proprio has shape (B=32, proprio_dim=9).
    Then, this function will overwrite the (C'=16, H'=28, W'=28) volume at x0[:,:,proprio_indices,:,:] with the proprio,
    repeating it as many times as needed to fill the entire volume.

    Args:
        x0 (torch.Tensor): Clean image latents.
        proprio (torch.Tensor): Ground-truth proprio.
        proprio_indices (torch.Tensor): Batch indices of the image latents to replace.

    Returns:
        torch.Tensor: Modified image latents.
    r   r   Nr   z;Not enough room in the latent tensor for the full proprio: z proprio elements > r   r   )r   r   r   r   r    r"   r!   )r   r1   r2   r#   Zproprio_image_latentr$   r%   r&   r'   r(   Znum_proprio_elementsr*   r+   Zrepeated_proprior,   r-   r.   r.   r/   replace_latent_with_propriop   s$   


r3   F)slotsc                	      s   e Zd ZU dZeeddddddddZd	ed
< dZded< dZ	ded< dZ
ded< dZded< dZded<  fddZ  ZS )CosmosPolicyModelConfigz
    Extended config for Cosmos Policy diffusion model.
    Uses Cosmos Policy's HybridEDMSDE instead of the original EDMSDE.
    Also adds policy-specific parameters for loss masking and action prediction.
    g              ?P   g-C6*?Tg     @U@)p_meanp_std	sigma_max	sigma_minZhybrid_sigma_distributionZuniform_lowerZuniform_upperr   sdeFbool,mask_loss_for_action_future_state_prediction0mask_value_prediction_loss_for_policy_prediction.mask_current_state_action_for_value_prediction'mask_future_state_for_qvalue_predictionr   intaction_loss_multiplierc                   s&   t    | jr| jrJ dd S d S )NzuCannot enable both mask_loss_for_action_future_state_prediction and mask_value_prediction_loss_for_policy_prediction!)super__attrs_post_init__r>   r?   )self	__class__r.   r/   rE      s   
z+CosmosPolicyModelConfig.__attrs_post_init__)__name__
__module____qualname____doc__Lr   r<   __annotations__r>   r?   r@   rA   rC   rE   __classcell__r.   r.   rG   r/   r5      s"   
 r5   c                      sb   e Zd ZdZdD fddZdEddZdFd$d%Z	&	'	(	(	)	*	+	(	(	)	,	)	(	)dGdH fdBdCZ  ZS )ICosmosPolicyDiffusionModelat  
    Cosmos Policy Diffusion Model - extends Text2WorldModel with policy-specific functionality.

    Adds support for:
    - Action chunk prediction and injection
    - Proprioception (proprio) prediction and injection
    - Value function prediction
    - Loss masking for different prediction types (action, future state, value)
    - Multi-component loss tracking
    configr5   c                   s*   t  | || _t|j| _t | _d S )N)rD   __init__rQ   lazy_instantiater<   r   sampler)rF   rQ   rG   r.   r/   rR      s   z#CosmosPolicyDiffusionModel.__init__
data_batchdict[str, torch.Tensor]	iterationrB   r   ,tuple[dict[str, torch.Tensor], torch.Tensor]c                 C  s|  |  | | jjdur,| jjjr,| j|| j}||d< tj|j	d |j	d dd|d< | 
|\}}}| | |\}}| ||||\}}}}| j||||f|d |d	 |d
 |d |d |d |d d|v rn|d nd|d d|v rz|d nd|d |d |d |d |d d\}	}
}}| jdkr|
 | j }
|	|
fS | jdkr|
jdd | j }
|	|
fS td| j )a:  
        Performs a single training step for the Cosmos Policy diffusion model.

        Extended from base to pass policy-specific data (actions, proprio, values, masks).

        Args:
            data_batch (dict): raw data batch draw from the training data loader.
            iteration (int): Current iteration number.

        Returns:
            tuple: A tuple containing two elements:
                - dict: additional data that used to debug / logging / callbacks
                - Tensor: The computed loss for the training step as a PyTorch Tensor.
        Nt5_text_embeddingsr   r   cudar   t5_text_maskactionsaction_latent_idxr1   current_proprio_latent_idxfuture_propriofuture_proprio_latent_idxfuture_wrist_image_latent_idxfuture_wrist_image2_latent_idxfuture_image_latent_idxfuture_image2_latent_idxrollout_data_maskworld_model_sample_maskvalue_function_sample_maskvalue_function_returnvalue_latent_idx)r   r   r1   current_proprio_indicesr_   future_proprio_indicesfuture_wrist_image_indicesfuture_wrist_image2_indicesfuture_image_indicesfuture_image2_indicesre   rf   rg   rh   value_indicesmeansumdimzInvalid loss_reduce: )_update_train_statsrQ   text_encoder_configcompute_onlinetext_encodercompute_text_embeddings_onlineinput_caption_keyr   onesr   get_data_and_conditiondraw_training_sigma_and_epsilonsize%broadcast_split_for_model_parallelsim#compute_loss_with_epsilon_and_sigmaloss_reducerq   
loss_scalerr   
ValueError)rF   rU   rW   text_embeddings_x0_B_C_T_H_W	condition	sigma_B_Tepsilon_B_C_T_H_Woutput_batchkendall_lossr.   r.   r/   training_step   sN   
 

z(CosmosPolicyDiffusionModel.training_stepr   r   r   r   r   r   r   r   r1   rj   r_   rk   rl   rm   Optional[torch.Tensor]rn   ro   re   rf   rg   rh   rp   c           \      C  s|  |  |_tj|jd |jd}|jd |jd |jd }}}t|||d}t|dkr6t|||d}t|
dkrDt||	|
d}|	dddd
d||||j||d	d	|d	d	d	d	f< | j||\}}||t|d
  }| |||}| j|d}|jd |jd }}tj||ftj|jd}| jjs| jjrtj||ftj|jd} |dk|dk@ tj|j}!t|!rtj|!dddtj|j}"d| |"d	d	f< d| |"||" f< ||  }| jjr|jd |jd }}tj||ftj|jd} |dktj|j}#t|#r0tj|#dddtj|j}$d| |$||$ f< |dk|dktj|j@ }%t|%rtj|%dddtj|j}&t|dkrhd| |&||& f< |d	ur}t|dkr}d| |&||& f< t|dkrd| |&||& f< |d	urt|dkrd| |&||& f< t|
dkrd| |&|
|& f< |dk|dk@ tj|j}!t|!rtj|!dddtj|j}"d| |"||" f< ||  }| jjr| dksJ d|jd |jd }}tj||ftj|jd} |dktj|j}#t|#rtj|#dddtj|j}$d| |$||$ f< t|dkrId| |$||$ f< |d	ur^t|dkr^d| |$||$ f< t|dkrnd| |$||$ f< |d	urt|dkrd| |$||$ f< t|
dkrd| |$|
|$ f< |dk|dktj|j@ }%t|%rtj|%dddtj|j}&t|dkrd| |&||& f< |d	urt|dkrd| |&||& f< t|dkrd| |&||& f< |d	urt|dkrd| |&||& f< t|
dkrd| |&|
|& f< ||  }| jjdkr0|||f t| jj |||f< ||j  d }'|'t|d
 }(|(})| jjsV| jjsV| jjsV| jjdkr]|)t|d
 })t|dkrtj|jd |jd}||d	d	|d	d	d	d	f |j |d	d	|d	d	d	d	f  }*|*|dk }+|*|dk },|*|dk }-|+d ! }.t"|+! }/|,d ! }0t"|,! }1|*d ! }2t"|*! }3n<tj#t$d|jd}.tj#t$d|jd}/tj#t$d|jd}0tj#t$d|jd}1tj#t$d|jd}2tj#t$d|jd}3t|dkrf||d	d	|d	d	d	d	f |j |d	d	|d	d	d	d	f  }4|4|dk }5|4|dk }6|4|dk }7|5d ! }8t"|5! }9|6d ! }:t"|6! };|4d ! }<t"|4! }=n<tj#t$d|jd}8tj#t$d|jd}9tj#t$d|jd}:tj#t$d|jd};tj#t$d|jd}<tj#t$d|jd}=t|
dkr||d	d	|
d	d	d	d	f |j |d	d	|
d	d	d	d	f  }>|>|dk }?|>|dk }@|>|dk }A|?d ! }Bt"|?! }C|@d ! }Dt"|@! }E|>d ! }Ft"|>! }Gn<tj#t$d|jd}Btj#t$d|jd}Ctj#t$d|jd}Dtj#t$d|jd}Etj#t$d|jd}Ftj#t$d|jd}G||d	d	|d	d	d	d	f |j |d	d	|d	d	d	d	f  }H|H|dk }I|H|dk }J|Id ! }Kt"|I! }L|Hd ! }Mt"|H! }N||d	d	|d	d	d	d	f |j |d	d	|d	d	d	d	f  }O|O|dk }P|O|dk }Q|O|dk }R|Pd ! }St"|P! }T|Qd ! }Ut"|Q! }V|Rd ! }Wt"|R! }X|Od ! }Yt"|O! }Zi d|d|d|d|d|d|d|'! d|(! dtj!|(g ddd|Kd|Ld|Bd |Cd!|8d"|9d#|.d$|/|S|T|D|E|:|;|0|1|U|V|W|Xd%}[|[|)|'|(fS )&a  
        NOTE (user): Modified to add action chunk prediction and future image prediction + action chunk loss logging.

        Compute loss given epsilon and sigma with policy-specific functionality.

        This method extends the base implementation to support:
        1. Latent injection for actions, proprioception, and values
        2. Loss masking for different prediction types (action, future state, value)
        3. Detailed per-component loss tracking

        Args:
            x0_B_C_T_H_W: image/video latent
            condition: text condition
            epsilon_B_C_T_H_W: noise
            sigma_B_T: noise level
            action_chunk: ground truth action chunk
            action_indices: indices for action latent frames
            proprio: current proprioception
            current_proprio_indices: indices for current proprio latent frames
            future_proprio: future proprioception
            future_proprio_indices: indices for future proprio latent frames
            future_wrist_image_indices: indices for future wrist image latent frames
            future_wrist_image2_indices: indices for future wrist image #2 latent frames
            future_image_indices: indices for future primary image latent frames
            future_image2_indices: indices for future secondary image latent frames
            rollout_data_mask: mask for rollout vs demo data
            world_model_sample_mask: mask for world model samples
            value_function_sample_mask: mask for value function samples
            value_function_return: ground truth value function returns
            value_indices: indices for value latent frames

        Returns:
            tuple: A tuple containing four elements:
                - dict: additional data that used to debug / logging / callbacks
                - Tensor 1: kendall loss,
                - Tensor 2: MSE loss,
                - Tensor 3: EDM loss
        r   r   r         )r   r   )r2   Nzb t -> b 1 t 1 1)sigma   )dtyper   F)as_tuplezhNo value function samples should be present when mask_value_prediction_loss_for_policy_prediction==True!nanr   xtr   weights_per_sigmar   
model_predmse_lossedm_lossedm_loss_per_frame)r   r   r   rs   demo_sample_action_mse_lossdemo_sample_action_l1_loss#demo_sample_future_proprio_mse_loss"demo_sample_future_proprio_l1_loss'demo_sample_future_wrist_image_mse_loss&demo_sample_future_wrist_image_l1_loss!demo_sample_future_image_mse_loss demo_sample_future_image_l1_loss)demo_sample_value_mse_lossdemo_sample_value_l1_loss*world_model_sample_future_proprio_mse_loss)world_model_sample_future_proprio_l1_loss.world_model_sample_future_wrist_image_mse_loss-world_model_sample_future_wrist_image_l1_loss(world_model_sample_future_image_mse_loss'world_model_sample_future_image_l1_loss!world_model_sample_value_mse_loss world_model_sample_value_l1_loss$value_function_sample_value_mse_loss#value_function_sample_value_l1_loss)%cloneZorig_x0_B_C_T_H_Wr   r   r   r   r0   allr3   r!   expandtor   r<   marginal_probr   denoiseget_per_sigma_loss_weightsr{   longrQ   r@   rA   anynonzerosqueezer>   zerosr?   rr   rC   rB   r   rq   abstensorfloat)\rF   r   r   r   r   r   r   r1   rj   r_   rk   rl   rm   rn   ro   re   rf   rg   rh   rp   r#   C_latentH_latentW_latentmean_B_C_T_H_Wstd_B_Txt_B_C_T_H_Wr   weights_per_sigma_B_TBTZfinal_mask_B_TZmask_B_TZvalue_idx_BZvalue_batch_indicesZ
demo_idx_BZdemo_batch_indicesZworld_idx_BZworld_batch_indicespred_mse_B_C_T_H_Wedm_loss_B_C_T_H_Wr   Zfuture_image_diffZfuture_image_diff_demoZfuture_image_diff_world_modelZ future_image_diff_value_functionr   r   r   r   Z!all_samples_future_image_mse_lossZ all_samples_future_image_l1_lossZfuture_wrist_image_diffZfuture_wrist_image_diff_demoZ#future_wrist_image_diff_world_modelZ&future_wrist_image_diff_value_functionr   r   r   r   Z'all_samples_future_wrist_image_mse_lossZ&all_samples_future_wrist_image_l1_lossZfuture_proprio_diffZfuture_proprio_diff_demoZfuture_proprio_diff_world_modelZ"future_proprio_diff_value_functionr   r   r   r   Z#all_samples_future_proprio_mse_lossZ"all_samples_future_proprio_l1_lossZaction_diffZaction_diff_demoZaction_diff_world_modelr   r   Zall_samples_action_mse_lossZall_samples_action_l1_lossZ
value_diffZvalue_diff_demoZvalue_diff_world_modelZvalue_diff_value_functionr   r   r   r   r   r   Zall_samples_value_mse_lossZall_samples_value_l1_lossr   r.   r.   r/   r   6  s  
C""
"
" ""
" "<<	"z>CosmosPolicyDiffusionModel.compute_loss_with_epsilon_and_sigma      ?r   NF#   2abr   r   guidancer   seedstate_shapeTuple | Nonen_sample
int | Noneis_negative_promptr=   	num_stepssolver_optionr
   x_sigma_maxr:   float | Noneuse_variance_scale	worker_idskip_vae_encodingprevious_generated_latentreturn_orig_clean_latent_framesc                   s  |  | | | | |}|r| jn| j}|du r"|| jd }|du rF|| jdd \}}}| jj| j	||| jj
 || jj
 g}|rV| j|||||dd\}}n
| j|||||d}|r~t| td d d	 }td d
 d }nd	}d	}| jjrt j|||||||||	|
f
i |S |	du rt|ft| tj| jd || jj | }	| jjrt|	d|  d}	|
du r| jj}
| j||	||
| | jj| |d}| jjrt |d|  d}|r||fS |S )a  
        Generate samples from the batch with Cosmos Policy extensions.

        Extended to support:
        - Variance scaling for increased diversity
        - Autoregressive generation (skip_vae_encoding, previous_generated_latent)
        - Returning original clean latent frames

        Args:
            data_batch (dict): raw data batch draw from the training data loader.
            guidance (float): guidance weights
            seed (int): random seed
            state_shape (tuple): shape of the state, default to data batch if not provided
            n_sample (int): number of samples to generate
            is_negative_prompt (bool): use negative prompt t5 in uncondition if true
            num_steps (int): number of steps for the diffusion process
            solver_option (str): differential equation solver option, default to "2ab"
            use_variance_scale (bool): use variance scale to increase diversity in outputs
            worker_id (int): worker id for random seed
            skip_vae_encoding (bool): Skip VAE encoding if True
            previous_generated_latent (torch.Tensor): Previous generated sample
            return_orig_clean_latent_frames (bool): Whether to return the clean latent frames
        Nr   T)r   r   r   r   )r   r   r   r   g       @r6   g?g?r   r   )seq_dimprocess_group)r   r:   r;   r   )r   cp_group)!"_normalize_video_databatch_inplace_augment_image_dim_inplaceis_image_batchinput_image_keyinput_data_keyr   rQ   state_ch	tokenizerget_latent_num_framesspatial_compression_factorget_x0_fn_from_batchr   manual_seedranditemuse_flowunipc_schedulerrD   generate_samples_from_batchr   arch_invariant_randtuplefloat32tensor_kwargsr<   r:   netis_context_parallel_enabledr   get_context_parallel_grouprT   r;   r   )rF   rU   r   r   r   r   r   r   r   r   r:   r   r   r   r   r   kwargsr   	input_key_T_H_Wx0_fnorig_clean_latent_framesZsigma_max_variance_scaleZsigma_min_variance_scalesamplesrG   r.   r/   r     s   
*




		


z6CosmosPolicyDiffusionModel.generate_samples_from_batch)rQ   r5   )rU   rV   rW   rB   r   rX   )&r   r   r   r   r   r   r   r   r   r   r   r   r1   r   rj   r   r_   r   rk   r   rl   r   rm   r   rn   r   ro   r   re   r   rf   r   rg   r   rh   r   rp   r   )r   r   NNFr   r   NNFr   FNF) rU   r   r   r   r   rB   r   r   r   r   r   r=   r   rB   r   r
   r   r   r:   r   r   r=   r   rB   r   r=   r   r   r   r=   r   r   )	rI   rJ   rK   rL   rR   r   r   r   rO   r.   r.   rG   r/   rP      s.    

F   rP   )r   r   r   r   r   r   r   r   )r   r   r1   r   r2   r   r   r   )(rL   
__future__r   typingr   r   r   attrsr   einopsr   )cosmos_policy._src.imaginaire.lazy_configr   rM   r   r	   rS   1cosmos_policy._src.imaginaire.modules.res_samplerr
   #cosmos_policy._src.imaginaire.utilsr   4cosmos_policy._src.imaginaire.utils.context_parallelr   r   3cosmos_policy._src.predict2.models.text2world_modelr   ZBaseDiffusionModelr   ZBaseText2WorldModelConfigcosmos_policy.conditionerr   Z$cosmos_policy.modules.cosmos_samplerr   Z$cosmos_policy.modules.hybrid_edm_sder   r0   r3   definer5   rP   r.   r.   r.   r/   <module>   s,   

C
@+