o
    ?߱iSK                     @   s  d dl Z d dlmZmZmZ d dlZd dlmZ d dlm  m	Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d	gZG d
d dejZG dd dejZG dd dejZG dd dejZ G dd dejZ!G dd dejZ"G dd dejZ#G dd deZ$G dd dejZ%dde%dddej&dfdd Z'd*d"d#Z(d+d$ee) fd%d&Z*G d'd	 d	Z+G d(d) d)eZ,dS ),    N)DictListOptional)distributedlog)easy_io)AbstractEmbModel)HuggingfaceTokenizer)	attention)
XLMRoberta	CLIPModelc                   @   s   e Zd Zdd ZdS )	QuickGELUc                 C   s   |t d|  S )NgZd;?)torchsigmoidselfx r   U/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/predict2/networks/clip.pyforward(   s   zQuickGELU.forwardN)__name__
__module____qualname__r   r   r   r   r   r   '   s    r   c                       s   e Zd Z fddZ  ZS )	LayerNormc                    s   t  | |S N)superr   floattype_asr   	__class__r   r   r   -   s   zLayerNorm.forward)r   r   r   r   __classcell__r   r   r   r   r   ,   s    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )SelfAttentionF        c                    sf   || dksJ t    || _|| _|| | _|| _|| _|| _t	||d | _
t	||| _d S )Nr      )r   __init__dim	num_headshead_dimcausalattn_dropoutproj_dropoutnnLinearto_qkvproj)r   r%   r&   r(   r)   r*   r   r   r   r$   2   s   

zSelfAttention.__init__c                 C   s   g |  | j| jR \}}}}}| |||d||d\}}}	| jr*| jnd}
t|||	|
| j	d}|
|||}| |}t|| j| j}|S )z!
        x:   [B, L, C].
        r#      r"   )	dropout_pr(   )sizer&   r'   r-   viewunbindtrainingr)   r
   r(   reshaper.   Fdropoutr*   )r   r   bscndqkvpr   r   r   r   @   s   $$
zSelfAttention.forward)Fr"   r"   r   r   r   r$   r   r    r   r   r   r   r!   1   s    r!   c                       s$   e Zd Z fddZdd Z  ZS )SwiGLUc                    sD   t    || _|| _t||| _t||| _t||| _d S r   )	r   r$   r%   mid_dimr+   r,   fc1fc2fc3)r   r%   rC   r   r   r   r$   U   s   
zSwiGLU.__init__c                 C   s(   t | || | }| |}|S r   )r6   silurD   rE   rF   r   r   r   r   r   _   s   
zSwiGLU.forwardrA   r   r   r   r   rB   T   s    
rB   c                       s2   e Zd Z						d	 fdd	Zdd Z  ZS )
AttentionBlockF
quick_gelur"   h㈵>c
           
   	      s   |dv sJ t    || _|| _|| _|| _|| _|	| _t||	d| _	t
|||||| _t||	d| _|dkrDt|t|| | _d S tt|t|| |dkrVt nt tt|| |t|| _d S )N)rI   geluswi_gluepsrL   rI   )r   r$   r%   	mlp_ratior&   	post_normr(   norm_epsr   norm1r!   attnnorm2rB   intmlpr+   
Sequentialr,   r   GELUDropout)
r   r%   rO   r&   rP   r(   
activationr)   r*   rQ   r   r   r   r$   f   s&   

zAttentionBlock.__init__c                 C   s^   | j r|| | | }|| | | }|S || | | }|| | | }|S r   )rP   rR   rS   rT   rV   r   r   r   r   r      s   zAttentionBlock.forward)FFrI   r"   r"   rJ   rA   r   r   r   r   rH   e   s    #rH   c                       s&   e Zd Zd fdd	Zdd Z  ZS )	AttentionPoolrK   r"   rJ   c              	      s   || dksJ t    || _|| _|| _|| | _|| _|| _dt	| }t
|tdd| | _t
||| _t
||d | _t
||| _t||d| _t
t
|t|| |dkret nt
 t
t|| |t
|| _d S )Nr         ?   r/   rM   rI   )r   r$   r%   rO   r&   r'   r*   rQ   mathsqrtr+   	Parameterr   randncls_embeddingr,   to_qto_kvr.   r   normrW   rU   r   rX   rY   rV   )r   r%   rO   r&   rZ   r*   rQ   gainr   r   r   r$      s(   


zAttentionPool.__init__c           
      C   s   g |  | j| jR \}}}}}| | jdd|||ddd}| |||d||d\}}	t	|||	}|
|d|}| |}t|| j| j}|| | | }|dddf S )z 
        x:  [B, L, C].
        r]   r/   Nr   )r1   r&   r'   rc   rb   r2   expandrd   r3   r
   r5   r.   r6   r7   r*   r4   rV   re   )
r   r   r8   r9   r:   r;   r<   r=   r>   r?   r   r   r   r      s   $$"
zAttentionPool.forward)rK   r"   rJ   rA   r   r   r   r   r[      s    r[   c                       sF   e Zd Z												
				d fdd	ZdddZ  ZS )VisionTransformer                  tokenTFrI   r"   rJ   c              	      s  || dkrt ddd |dv sJ |p}t   || _|| _|| d | _| _| _|| _| _	|| _
|| _| _| _dt }tjd|||	 d	| _|d
v rdt|tdd | _t|td| j|d
v rrdnd  | _t|| _|	rtdnd | _tj fddt|D  | _td| _|dkrt|t| | _d S |dkrt || _d S |dkrt! | _d S d S )Nr   z3[WARNING] image_size is not divisible by patch_sizeT)flush)rp   token_fc	attn_poolr/   r\   r#   )kernel_sizestridebiasrp   rr   r]   rM   c                    s$   g | ]}t d  	qS )F)rH   ).0_rZ   r)   r%   rO   rQ   r&   rP   r*   r   r   
<listcomp>   s    z.VisionTransformer.__init__.<locals>.<listcomp>rp   rr   rs   )"printr   r$   
image_size
patch_sizenum_patchesr%   rO   out_dimr&   
num_layers	pool_typerP   rQ   r^   r_   r+   Conv2dpatch_embeddingr`   r   ra   rb   pos_embeddingrY   r7   r   pre_normrW   rangetransformerheadr,   r[   )r   r}   r~   r%   rO   r   r&   r   r   r   rP   rZ   r)   r*   embedding_dropoutrQ   rf   r   rz   r   r$      sL   
"zVisionTransformer.__init__c                 C   s   | d}| |dddd}| jdv r&tj| j|dd|gdd}|r2t	| j
| d}n| j
}| || }| jd urF| |}|rS| jd d |}|S | |}|S )Nr   r/   r]   rw   rg   r%   )r1   r   flattenpermuter   r   catrb   rh   Zpos_interpolater   r7   r   r   )r   r   interpolationuse_31_blockr8   er   r   r   r     s   




zVisionTransformer.forward)rj   rk   rl   rm   rn   ro   ro   rp   TFrI   r"   r"   r"   rJ   )FFrA   r   r   r   r   ri      s$    Ari   c                       s(   e Zd Z fddZ fddZ  ZS )XLMRobertaWithHeadc              	      sb   | d| _t jdi | | j| j d }ttj| j|ddt tj|| jdd| _	d S )Nr   r/   F)rv   r   )
popr   r   r$   r%   r+   rW   r,   rX   r   )r   kwargsrC   r   r   r   r$     s   &
zXLMRobertaWithHead.__init__c                    sN   t  |}|| jd|}|| jdd|jdd }| |}|S )Nrg   r]   r   )r   r   nepad_id	unsqueezetosumr   )r   idsr   maskr   r   r   r   (  s
   
zXLMRobertaWithHead.forwardrA   r   r   r   r   r     s    
r   c                       s^   e Zd Z											
															d fdd	Zdd Zdd Z  ZS )XLMRobertaCLIP   rj         rm   rk       rp   TFrK      r]      皙?r"   rJ   c                    s   t    || _|| _|| _|| _|| _|| _|| _|	| _	|
| _
|| _|| _|| _|| _|| _|| _|| _|| _|| _|| _t|||||||||	|
|||||d| _t||||||||||d
| _ttdtg  | _d S )N)r}   r~   r%   rO   r   r&   r   r   r   rP   rZ   r)   r*   r   rQ   )

vocab_sizemax_seq_len	type_sizer   r%   r   r&   r   rP   r7   g$I$I,@) r   r$   	embed_dimr}   r~   
vision_dimvision_mlp_ratiovision_headsvision_layersvision_pre_normvision_post_normrZ   r   max_text_lenr   r   text_dim
text_headstext_layerstext_post_normrQ   ri   visualr   textualr+   r`   r^   r   r   ones	log_scale)r   r   r}   r~   r   r   r   r   vision_poolr   r   rZ   r   r   r   r   r   r   r   r   text_dropoutr)   r*   r   rQ   r   r   r   r$   6  sd   
 zXLMRobertaCLIP.__init__c                 C   s   |  |}| |}||fS )a
  
        imgs:       [B, 3, H, W] of torch.float32.
        - mean:     [0.48145466, 0.4578275, 0.40821073]
        - std:      [0.26862954, 0.26130258, 0.27577711]
        txt_ids:    [B, L] of torch.long.
                    Encoded by data.CLIPTokenizer.
        )r   r   )r   imgsZtxt_idsxixtr   r   r   r     s   

zXLMRobertaCLIP.forwardc                 C   s2   dd |   D ddddd |   D ig}|S )Nc                 S   s&   g | ]\}}d |v s| dr|qS re   rv   endswithrx   r;   r@   r   r   r   r{        & z/XLMRobertaCLIP.param_groups.<locals>.<listcomp>r"   )paramsweight_decayr   c                 S   s&   g | ]\}}d |v s| ds|qS r   r   r   r   r   r   r{     r   )named_parameters)r   groupsr   r   r   param_groups  s   zXLMRobertaCLIP.param_groups)r   rj   r   r   rm   rk   r   rp   TFrK   r   r   r]   r]   r   rk   r   Tr   r"   r"   r"   rJ   )r   r   r   r$   r   r   r    r   r   r   r   r   5  s8    Pr   Feoscpuc                 K   s   t | |d
i |}	W d    n1 sw   Y  |	j||d}	|	f}
|r_d| v r8g dg d}}ng d}g d}ttj|	j|	jftjj	dt
 tj||dg}|
|f7 }
t|
dkri|
d	 S |
S )N)dtypedevicesiglip)      ?r   r   )g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?)r   )meanstdr]   r   r   )r   r   r   lowerTComposeResizer}   InterpolationModeBICUBICToTensor	Normalizelen)
pretrainedpretrained_name	model_clsreturn_transformsreturn_tokenizerZtokenizer_paddingr   r   r   modeloutputr   r   
transformsr   r   r   _clip  s$   
r   'open-clip-xlm-roberta-large-vit-huge-14c                 K   s   t d&i ddddddddd	d
ddddddddddddddddddddddddd d!d"d#d$d#d%d#}|jd&i | t| |tfi |S )'Nr   r   r}   rj   r~   r   r   r   r   rm   r   rk   r   r   r   rp   rZ   rK   r   r   r   r   r   r]   r   r   r   r   r   r   Tr   r   r)   r"   r*   r   r   )dictupdater   r   )r   r   r   cfgr   r   r   clip_xlm_roberta_vit_h_14  sZ   	
r   credential_pathc                 C   sl   t d|  t r-|drd}tj|d|dd nd }tj||dd}| | tj	| d	d
 | S )Nzloading weights from zs3://Z_clip_models3)backends3_credential_path)keybackend_argsr   )backend_keymap_locationr   )src)
r   infor   is_rank0
startswithr   set_s3_backendloadload_state_dictsync_model_states)r   	ckpt_pathr   r   ckptr   r   r   load_model_torch  s   

r   c                   @   s4   e Zd Zejddddfdee fddZdd	 Zd
S )r   cudazms3://bucket/cosmos_diffusion_v2/pretrain_weights/models_clip_open-clip-xlm-roberta-large-vit-huge-14_fp16.pthzxlm-roberta-largezcredentials/s3_training.secretr   c                 C   sv   || _ || _|| _|| _tddd||d\| _| _| j  	d| _t
| j||d| _t|| jjd dd| _d S )NFT)r   r   r   r   r   )r   r/   
whitespace)nameseq_lenclean)r   r   checkpoint_pathtokenizer_pathr   r   r   r   evalrequires_grad_r   r	   r   	tokenizer)r   r   r   r   r   r   r   r   r   r$     s   
zCLIPModel.__init__c                 C   s   | j jfd }tj||ddd}| jjd |dd}tjj	d| j
d | j j|d	d
}|W  d    S 1 s<w   Y  d S )Nr/   bicubicF)r1   modealign_cornersrg   r   r   )r   T)r   )r   r}   r6   interpolater   mul_add_r   ampautocastr   r   )r   Zvideos_B_C_H_W_n1_p1r1   videosoutr   r   r   r     s   $zCLIPModel.visualN)	r   r   r   r   float16r   strr$   r   r   r   r   r   r     s    
c                       s   e Zd Z			ddee dee dedef fdd	Z	
ddee	j
 dee dee dee	j
 f fddZ	
	
	
ddee	j
 dee	j
 dee	j
 deee	j
f fddZdefddZ  ZS )Wan2pt1CLIPEmbr"     bfloat16	input_keydropout_rate	num_tokenr   c                    sL   t    || _d| _t | _|| _d | _|| _t	j
t	jt	jd| | _d S )Nr   )r  r  float32)r   r$   r  	model_dimr   
clip_model
_input_key_output_key_dropout_rater   r  r  r  r   )r   r  r  r  r   r   r   r   r$     s   

zWan2pt1CLIPEmb.__init__N	in_tensorr   returnc                    s   |d u rd S t  |||S r   )r   random_dropout_input)r   r  r  r   r   r   r   r  3  s   z#Wan2pt1CLIPEmb.random_dropout_inputimage_tensorvideo_tensormedia_latentsc                 C   s   |j \}}}}}t|d||||| j}	|d ur-t|| j| j|| j}
n<|d d d d dd d d d f }t  | j	
|| j}
W d    n1 sWw   Y  d|	d d d d d df< tj|	|| jgdd}|
|dS )Nrm   r   r\   r]   r   )frame_cond_crossattn_emb_B_L_Dy_B_C_T_H_W)shaper   zerosr   r   r   r  r  no_gradr  r   concat)r   r  r  r   r8   ry   Zlatent_fZlatent_hZlatent_wr   Zcontext_B_L_DZfirst_frame_B_C_H_Wyr   r   r   r   :  s    ""

zWan2pt1CLIPEmb.forwardc                 C   s   ddg}d| j  d| S )Nr!  r"  zInput key: z 
	Output key: )r  )r   
output_keyr   r   r   detailsN  s   zWan2pt1CLIPEmb.details)r"   r  r  )NNN)r   r   r   r   r  r   r   rU   r$   r   Tensorr  r   r   r)  r    r   r   r   r   r    sH    	
r  )Fr   r   )-r^   typingr   r   r   r   torch.nnr+   Ztorch.nn.functional
functionalr6   Ztorchvision.transformsr   r   %cosmos_predict2._src.imaginaire.utilsr   r   -cosmos_predict2._src.imaginaire.utils.easy_ior   )cosmos_predict2._src.predict2.conditionerr   Z4cosmos_predict2._src.predict2.inference.get_umt5_embr	   Z0cosmos_predict2._src.predict2.networks.attentionr
   Z2cosmos_predict2._src.predict2.networks.xlm_robertar   __all__Moduler   r   r!   rB   rH   r[   ri   r   r   r  r   r   r  r   r   r  r   r   r   r   <module>   sH   #.0Zi

(&