o
    vi@Y                     @  s  d dl mZ d dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* edddZ+G dd de,eZ-d3d4ddZ.eddG dd deZ/eddG dd  d e/Z0eddG d!d" d"e0Z1G d#d$ d$ej2Z3G d%d& d&e3Z4G d'd( d(e3Z5G d)d* d*e3Z6G d+d, d,e3Z7G d-d. d.ej2eZ8G d/d0 d0e8Z9G d1d2 d2e8Z:dS )5    )annotationsN)ABCabstractmethod)defaultdict)nullcontext)	dataclassfields)Enum)AnyDictListOptionalTupleTypeVarUnion)ProcessGroup)	batch_mul)instantiate)log)	broadcast)count_params)disabled_train)easy_ioTBaseCondition)boundc                   @  s"   e Zd ZdZdZdZd	ddZdS )
DataTypeimagevideomixreturnstrc                 C     | j S N)valueself r'   M/data/cameron/vidgen/cosmos-policy/cosmos_policy/_src/predict2/conditioner.py__str__/   s   zDataType.__str__Nr    r!   )__name__
__module____qualname__IMAGEVIDEOZMIXr)   r'   r'   r'   r(   r   *   s
    r   	conditionprocess_groupOptional[ProcessGroup]r    c                 C  sl   | j r| S | jdd}| D ]\}}|dur(t|tjr!| }t||||< qd|d< t| di |S )zR
    Broadcast the condition from the minimum rank in the specified group(s).
    Fskip_underscoreNT_is_broadcastedr'   )	is_broadcastedto_dictitems
isinstancetorchTensorcudar   type)r0   r1   kwargskeyr$   r'   r'   r(   broadcast_condition3   s   r@   T)frozenc                   @  sB   e Zd ZU dZdZded< ddd	d
ZedddZdddZ	dS )r   z
    Attributes:
        _is_broadcasted: Flag indicating if parallel broadcast splitting
            has been performed. This is an internal implementation detail.
    Fboolr5   Tr4   r    Dict[str, Any]c                   s    fddt  D S )zConverts the condition to a dictionary.

        Returns:
            Dictionary containing the condition's fields and values.
        c                   s,   i | ]}|j d rs|j t |j qS )_)name
startswithgetattr).0fr&   r4   r'   r(   
<dictcomp>U   s   , z)BaseCondition.to_dict.<locals>.<dictcomp>)r   rJ   r'   rJ   r(   r7   N   s   zBaseCondition.to_dictc                 C  r"   r#   )r5   r%   r'   r'   r(   r6   W      zBaseCondition.is_broadcastedr1   torch.distributed.ProcessGroupc                 C  s   | j r| S t| |S )a]  Broadcasts and splits the condition across the checkpoint parallelism group.
        For most condition, such as Text2WorldCondition, we do not need split.

        Args:
            process_group: The process group for broadcast and split

        Returns:
            A new BaseCondition instance with the broadcasted and split condition.
        )r6   r@   )r&   r1   r'   r'   r(   r   [   s   

zBaseCondition.broadcastN)T)r4   rB   r    rC   r    rB   )r1   rM   r    r   )
r+   r,   r-   __doc__r5   __annotations__r7   propertyr6   r   r'   r'   r'   r(   r   D   s   
 	c                   @  sX   e Zd ZU dZded< ejZded< dZded< dZ	ded< dd	d
Z
edddZdS )Text2WorldConditionNOptional[torch.Tensor]crossattn_embr   	data_typepadding_maskfpsr    c                 C  s&   | j dd}||d< t| di |S )zEdit the data type of the condition.

        Args:
            data_type: The new data type.

        Returns:
            A new Text2WorldCondition instance with the new data type.
        Fr3   rU   Nr'   )r7   r=   )r&   rU   r>   r'   r'   r(   edit_data_typeq   s   	z"Text2WorldCondition.edit_data_typerB   c                 C  s   | j tjkS r#   )rU   r   r/   r%   r'   r'   r(   is_video~   s   zText2WorldCondition.is_video)rU   r   r    rR   rN   )r+   r,   r-   rT   rP   r   r/   rU   rV   rW   rX   rQ   rY   r'   r'   r'   r(   rR   j   s   
 
rR   c                   @  s@   e Zd ZU dZded< dZded< dZded< 	ddddZdS )GR00TV1Img2VidConditionNrS   gt_first_frameFrB   use_image_condition$condition_video_input_mask_B_C_T_H_Wr1   r2   r    c                 C  s   |du rdn|  }| jdd}|j\}}}}}	tj|d|||	f|j|jd}
|dks1| dkrO|dddddf  |d< |
dddddf  d7  < |
|d< t	| d	i |S )
zEdit the video condition to include the video mask information.

        Args:
            x0_B_C_T_H_W: The first frame of the video.

        Returns:
            A new GR00TV1Img2VidCondition instance with the video mask information.
        N   Fr3   dtypedevicer   r[   r]   r'   )
sizer7   shaper:   zerosr`   ra   rankdetachr=   )r&   x0_B_C_T_H_Wr1   pg_sizer>   BrD   r   HWZcondition_video_input_maskr'   r'   r(   edit_video_condition   s   z,GR00TV1Img2VidCondition.edit_video_conditionr#   )r1   r2   r    rZ   )r+   r,   r-   r[   rP   r\   r]   rl   r'   r'   r'   r(   rZ      s   
 rZ   c                      s   e Zd Z fddZed%ddZed&dd	Zed'ddZed%ddZej	d(ddZej	d)dd	Zej	d*ddZej	d(ddZej
dd Zej
dd	 Zej
dd Zej
dd Z	d+d,dd Zd'd!d"Zd'd#d$Z  ZS )-AbstractEmbModelc                   s&   t    d | _d | _d | _d| _d S )NF)super__init___is_trainable_dropout_rate
_input_key_return_dictr%   	__class__r'   r(   ro      s
   

zAbstractEmbModel.__init__r    rB   c                 C  r"   r#   rp   r%   r'   r'   r(   is_trainable   rL   zAbstractEmbModel.is_trainableUnion[float, torch.Tensor]c                 C  r"   r#   rq   r%   r'   r'   r(   dropout_rate   rL   zAbstractEmbModel.dropout_rater!   c                 C  r"   r#   rr   r%   r'   r'   r(   	input_key   rL   zAbstractEmbModel.input_keyc                 C  r"   r#   rs   r%   r'   r'   r(   is_return_dict   rL   zAbstractEmbModel.is_return_dictr$   c                 C  
   || _ d S r#   rv   r&   r$   r'   r'   r(   rw         
c                 C  r   r#   ry   r   r'   r'   r(   rz      r   c                 C  r   r#   r{   r   r'   r'   r(   r|      r   c                 C  r   r#   r}   r   r'   r'   r(   r~      r   c                 C     | ` d S r#   rv   r%   r'   r'   r(   rw         c                 C  r   r#   ry   r%   r'   r'   r(   rz      r   c                 C  r   r#   r{   r%   r'   r'   r(   r|      r   c                 C  r   r#   r}   r%   r'   r'   r(   r~      r   N	in_tensortorch.Tensorrz   Optional[float]r?   Optional[str]c                 C  s>   ~|d ur|n| j }ttd| t|jd  ||S )N      ?r   )rz   r   r:   	bernoullionesrc   type_asr&   r   rz   r?   r'   r'   r(   random_dropout_input   s   "z%AbstractEmbModel.random_dropout_inputc                 C     dS )N r'   r%   r'   r'   r(   details      zAbstractEmbModel.detailsc                 C  sT   | j d ur| j nt| dd }| jj d| dt| d d| j d| j d|   S )N
input_keysz 
	input key: z
	Param count: Fz 
	Trainable: z
	Dropout rate: z
	)r|   rG   ru   r+   r   rw   rz   r   )r&   r|   r'   r'   r(   summary   s   zAbstractEmbModel.summaryrN   )r    rx   r*   )r$   rB   )r$   rx   )r$   r!   NNr   r   rz   r   r?   r   r    r   )r+   r,   r-   ro   rQ   rw   rz   r|   r~   setterdeleterr   r   r   __classcell__r'   r'   rt   r(   rm      s<    	





rm   c                      sX   e Zd Z				dd  fddZd!ddZd"ddZ	d#d$ fddZd%ddZ  ZS )&TextAttr        F>s3://bucket/predict2_assets/reason1_empty_string_embeddings.ptcredentials/s3_training.secretr|   	List[str]rz   r   use_empty_stringrB   empty_string_embeddings_pathr!   credential_pathc                   s2   t    || _|| _|| _d | _|| _|| _d S r#   )rn   ro   rr   rq   r   _empty_string_embeddings_cacher   r   )r&   r|   rz   r   r   r   rt   r'   r(   ro      s   

zTextAttr.__init__tokenr   c                 C     d|iS NrT   r'   r&   r   r'   r'   r(   forward     zTextAttr.forwardr    c                 C  s*   | j du rtj| jd| jdd| _ | j S )z,Lazy load and cache empty string embeddings.Ns3backends3_credential_pathbackend_args)r   r   loadr   r   r%   r'   r'   r(   _get_empty_string_embeddings  s   

z%TextAttr._get_empty_string_embeddingsNr   r?   r   c                   s   |d ur
d|v r
|S | j st |||S |jd }|d ur |n| j}|  }||jj|j|j	d}t
d| t
j||j	d |}|j|gdg| d  R  }|| d| |  S )Nmaskr   r_   r   ra   r^   )r   rn   r   rc   rz   r   expandtor`   ra   r:   r   r   r   viewdim)r&   r   rz   r?   ri   Zempty_string_embeddings	keep_maskrt   r'   r(   r     s   
$ zTextAttr.random_dropout_inputc                 C  r   NzOutput key: [crossattn_emb]r'   r%   r'   r'   r(   r   !  r   zTextAttr.details)r   Fr   r   )
r|   r   rz   r   r   rB   r   r!   r   r!   r   r   )r    r   r   r   r*   )	r+   r,   r-   ro   r   r   r   r   r   r'   r'   rt   r(   r      s    


r   c                      sB   e Zd Zdd fddZdd
dZ	ddddZdddZ  ZS )TextAttrEmptyStringDropr   r|   r   rz   r   c                   s    t    || _|| _d | _d S r#   )rn   ro   rr   rq   empty_prompt_data)r&   r|   rz   rt   r'   r(   ro   &     

z TextAttrEmptyStringDrop.__init__r   r   c                 C  r   r   r'   r   r'   r'   r(   r   ,  r   zTextAttrEmptyStringDrop.forwardNr   r?   r   r    c                 C  s  |d ur
d|v r
|S ~| j d u rtjddddd| _ |d ur!|n| j}|jd }td| tj||jd	 	|}|j
|gd
g| d
  R  }| j j|j|jd}|jd |kr|jd d
krr|j|g|jd
d  R  }ntd|jd  d| || d| |  S )Nr   zWs3://bucket/edify_video/v4/validation/item_dataset/negative_prompt/empty_string_umt5.ptr   r   r   r   r   r   r   r^   r_   zempty_prompt_data batch size z% does not match in_tensor batch size )r   r   r   rz   rc   r:   r   r   ra   r   r   r   r   r`   r   
ValueError)r&   r   rz   r?   ri   r   Zempty_promptr'   r'   r(   r   /  s(   

$ z,TextAttrEmptyStringDrop.random_dropout_inputr!   c                 C  r   r   r'   r%   r'   r'   r(   r   N  r   zTextAttrEmptyStringDrop.details)r   )r|   r   rz   r   r   r   r   r*   r+   r,   r-   ro   r   r   r   r   r'   r'   rt   r(   r   %  s    
r   c                      s:   e Zd Z			dd fd
dZdddZdddZ  ZS )ReMapkeyNr   r|   r!   
output_keyr   rz   r   r`   c                   sL   t    || _d tjtjtjtjtjtjd| | _	|| _
|| _|| _d S )N)Nfloatbfloat16halffloat16intlong)rn   ro   r   r:   float32r   r   int32int64r`   rr   Z_output_keyrq   )r&   r|   r   rz   r`   rt   r'   r(   ro   S  s   
	
zReMapkey.__init__elementr   r    Dict[str, torch.Tensor]c                 C  s4   | j r| j n| j}t|tjr|j| jd}||iS )N)r`   )r   r|   r9   r:   r;   r   r`   )r&   r   r?   r'   r'   r(   r   i  s   zReMapkey.forwardc                 C  s$   | j r| j n| j}d| d| j S )NOutput key: z
 
	Dtype: )r   r|   r`   r&   r?   r'   r'   r(   r   o  s   zReMapkey.details)Nr   N)r|   r!   r   r   rz   r   r`   r   )r   r   r    r   r*   )r+   r,   r-   ro   r   r   r   r'   r'   rt   r(   r   R  s    
r   c                      sB   e Zd Zdd fd	d
ZdddZ	ddddZdddZ  ZS )BooleanFlagNr   r|   r!   r   r   rz   r   c                   s    t    || _|| _|| _d S r#   )rn   ro   rr   rq   r   )r&   r|   r   rz   rt   r'   r(   ro   u  r   zBooleanFlag.__init__r    r   c                 O  s    ~~| j r| j n| j}|| jiS r#   )r   r|   flag)r&   argsr>   r?   r'   r'   r(   r   {  s   
zBooleanFlag.forwardr   r   r?   c                 C  s@   ~|d ur|n| j }td| td  j|jd| _|S )Nr   r^   r   )rz   r:   r   r   rB   r   ra   r   r   r'   r'   r(   r     s   (z BooleanFlag.random_dropout_inputc                 C  s   | j r| j n| j}d| dS )Nr   z 
	 This is a boolean flag)r   r|   r   r'   r'   r(   r     s   zBooleanFlag.details)Nr   )r|   r!   r   r   rz   r   )r    r   r   r   r*   r   r'   r'   rt   r(   r   t  s    
r   c                      s^   e Zd ZdZddiZd fddZe	ddddZ	ddddZdddZ	dddZ
  ZS )GeneralConditionera,  
    An abstract module designed to handle various embedding models with conditional and unconditional configurations.
    This abstract base class initializes and manages a collection of embedders that can dynamically adjust
    their dropout rates based on conditioning.

    Attributes:
        KEY2DIM (dict): A mapping from output keys to dimensions used for concatenation.
        embedders (nn.ModuleDict): A dictionary containing all embedded models initialized and configured
                                   based on the provided configurations.

    Parameters:
        emb_models (Union[List, Any]): A dictionary where keys are embedder names and values are configurations
                                       for initializing the embedders.

    Example:
        See Edify4ConditionerConfig
    rT   r^   
emb_modelsUnion[List, Any]c              
     s   t    t | _t| D ]P\}\}}t|}t|t	s)J d|j
j dt|dd|_t|dd|_|jsKt|_| D ]}d|_qA|  td| d	| d
|   || j|< qd S )Nzembedder model z% has to inherit from AbstractEmbModelrw   Trz   r   FzInitialized embedder #-z: 
 )rn   ro   nn
ModuleDict	embedders	enumerater8   r   r9   rm   ru   r+   rG   rw   rz   r   train
parametersrequires_gradevalr   infor   )r&   r   nemb_nameZ
emb_configembedderparamrt   r'   r(   ro     s"   

 zGeneralConditioner.__init__Nbatchr   override_dropout_rateOptional[Dict[str, float]]r    r
   c                 C  s   t )z?Should be implemented in subclasses to handle conditon datatype)NotImplementedError)r&   r   r   r'   r'   r(   r     s   zGeneralConditioner.forwardc              
     s.  t t}du r
i  D ]jv sJ d qj D ]i\jr+tntj}| B t	j
trH j
 d}n$t	j
ttjjfrb fddj
D  }n
tdjj dW d   n1 svw   Y  | D ]\}}|| | qq"fdd| D S )	a<  
        Processes the input batch through all configured embedders, applying conditional dropout rates if specified.
        Output tensors for each key are concatenated along the dimensions specified in KEY2DIM.

        Parameters:
            batch (Dict): The input data batch to process.
            override_dropout_rate (Optional[Dict[str, float]]): Optional dictionary to override default dropout rates
                                                                per embedder key.

        Returns:
            Dict: A dictionary of output tensors concatenated by specified dimensions.

        Note:
            In case the network code is sensitive to the order of concatenation, you can either control the order via             config file or make sure the embedders return a unique key for each output.
        Nzinvalid name found c              	     s(   g | ]}  |d |qS r#   )r   get)rH   k)r   r   r   r   r'   r(   
<listcomp>  s    z/GeneralConditioner._forward.<locals>.<listcomp>z
Embedder 'zW' requires an 'input_key' attribute to be defined as either a string or list of stringsc              	     s*   i | ]\}}|t j| j|d dqS ))r   )r:   catKEY2DIMr   )rH   r   vr%   r'   r(   rK     s   * z/GeneralConditioner._forward.<locals>.<dictcomp>)r   listkeysr   r8   rw   r   r:   no_gradr9   r|   r!   r   r   	omegaconf
listconfig
ListConfigKeyErrorru   r+   append)r&   r   r   outputZembedding_contextZemb_outr   r   r'   )r   r   r   r   r&   r(   _forward  s:   zGeneralConditioner._forward
data_batchTuple[Any, Any]c                 C  s\   i i }}| j  D ]\}}d||< |jdkrdnd||< q
| ||d}| ||d}||fS )u  
        Processes the provided data batch to generate two sets of outputs: conditioned and unconditioned. This method
        manipulates the dropout rates of embedders to simulate two scenarios — one where all conditions are applied
        (conditioned), and one where they are removed or reduced to the minimum (unconditioned).

        This method first sets the dropout rates to zero for the conditioned scenario to fully apply the embedders' effects.
        For the unconditioned scenario, it sets the dropout rates to 1 (or to 0 if the initial unconditional dropout rate
        is insignificant) to minimize the embedders' influences, simulating an unconditioned generation.

        Parameters:
            data_batch (Dict): The input data batch that contains all necessary information for embedding processing. The
                            data is expected to match the required format and keys expected by the embedders.

        Returns:
            Tuple[Any, Any]: A tuple containing two condition:
                - The first one contains the outputs with all embedders fully applied (conditioned outputs).
                - The second one contains the outputs with embedders minimized or not applied (unconditioned outputs).
        r   -C6?r   r   )r   r8   rz   )r&   r   cond_dropout_ratesZdropout_ratesr   r   r0   un_conditionr'   r'   r(   get_condition_uncondition  s   
z,GeneralConditioner.get_condition_unconditionc           	      C  s   i i }}| j  D ]\}}d||< t|trd||< q
|jdkr#dnd||< q
t|}d|v r?t|d tjr?|d |d< | ||d}| ||d}||fS )zt
        Similar functionality as get_condition_uncondition
        But use negative prompts for unconditon
        r   r   r   Zneg_t5_text_embeddingst5_text_embeddingsr   )	r   r8   r9   r   rz   copydeepcopyr:   r;   )	r&   r   r   Zuncond_dropout_ratesr   r   Zdata_batch_neg_promptr0   r   r'   r'   r(   "get_condition_with_negative_prompt  s   



z5GeneralConditioner.get_condition_with_negative_prompt)r   r   r#   )r   r   r   r   r    r
   )r   r   r   r   r    r   )r   r   r    r   )r+   r,   r-   rO   r   ro   r   r   r   r   r   r   r'   r'   rt   r(   r     s    
6r   c                      "   e Zd Z	d
d fdd	Z  ZS )VideoConditionerNr   r   r   r   r    rR   c                      t  ||}tdi |S Nr'   )rn   r   rR   r&   r   r   r   rt   r'   r(   r   /     zVideoConditioner.forwardr#   )r   r   r   r   r    rR   r+   r,   r-   r   r   r'   r'   rt   r(   r  .      r  c                      r  )GR00TV1Img2VidConditionerNr   r   r   r   r    rZ   c                   r  r  )rn   r   rZ   r  rt   r'   r(   r   9  r  z!GR00TV1Img2VidConditioner.forwardr#   )r   r   r   r   r    rZ   r  r'   r'   rt   r(   r	  8  r  r	  r#   )r0   r   r1   r2   r    r   );
__future__r   r   abcr   r   collectionsr   
contextlibr   dataclassesr   r   enumr	   typingr
   r   r   r   r   r   r   r   r:   torch.nnr   torch.distributedr   2cosmos_policy._src.imaginaire.functional.batch_opsr   )cosmos_policy._src.imaginaire.lazy_configr   #cosmos_policy._src.imaginaire.utilsr   4cosmos_policy._src.imaginaire.utils.context_parallelr   0cosmos_policy._src.imaginaire.utils.count_paramsr   Z2cosmos_policy._src.imaginaire.utils.disabled_trainr   +cosmos_policy._src.imaginaire.utils.easy_ior   r   r!   r   r@   r   rR   rZ   Modulerm   r   r   r   r   r   r  r	  r'   r'   r'   r(   <module>   sJ   $	%Q5-" "
