o
    9i                     @   sP  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZ d	d
lm Z  d	dl!m"Z" d	dl#m$Z$ d	dl%m&Z&m'Z' d	dl(m)Z) d	dl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 G dd dej2Z3G dd dej2Z4G dd dej2Z5G dd de3Z6G dd de3Z7G dd de7Z8G dd de3Z9G dd de3Z:G d d! d!e3Z;G d"d# d#e3Z<G d$d% d%e3Z=G d&d' d'e3Z>G d(d) d)e3Z?G d*d+ d+ej2Z@G d,d- d-ej2ZAG d.d/ d/e3ZBG d0d1 d1e"e3ZCG d2d3 d3e3ZDG d4d5 d5e3ZEdS )6    N)nullcontext)partial)DictListOptionalTupleUnion)	rearrangerepeat)
ListConfig)
checkpoint)ByT5TokenizerCLIPTextModelCLIPTokenizerT5EncoderModelT5Tokenizer   )DiagonalGaussianRegularizer)Encoder)Timestep)extract_into_tensormake_beta_schedule)DiagonalGaussianDistribution)append_dimsautocastcount_paramsdefaultdisabled_trainexpand_dims_likeinstantiate_from_configc                       s   e Zd Z fddZedefddZedeee	j
f fddZedefdd	Zejd
efddZejd
eee	j
f fddZejd
efdd	Zejdd Zejdd Zejdd	 Z  ZS )AbstractEmbModelc                    s    t    d | _d | _d | _d S N)super__init___is_trainable	_ucg_rate
_input_keyself	__class__ F/data/cameron/vidgen/generative-models/sgm/modules/encoders/modules.pyr#      s   

zAbstractEmbModel.__init__returnc                 C      | j S r!   r$   r'   r+   r+   r,   is_trainable"      zAbstractEmbModel.is_trainablec                 C   r.   r!   r%   r'   r+   r+   r,   ucg_rate&   r1   zAbstractEmbModel.ucg_ratec                 C   r.   r!   r&   r'   r+   r+   r,   	input_key*   r1   zAbstractEmbModel.input_keyvaluec                 C   
   || _ d S r!   r/   r(   r6   r+   r+   r,   r0   .      
c                 C   r7   r!   r2   r8   r+   r+   r,   r3   2   r9   c                 C   r7   r!   r4   r8   r+   r+   r,   r5   6   r9   c                 C      | ` d S r!   r/   r'   r+   r+   r,   r0   :      c                 C   r:   r!   r2   r'   r+   r+   r,   r3   >   r;   c                 C   r:   r!   r4   r'   r+   r+   r,   r5   B   r;   )__name__
__module____qualname__r#   propertyboolr0   r   floattorchTensorr3   strr5   setterdeleter__classcell__r+   r+   r)   r,   r       s(    

r    c                       s   e Zd ZddddZddddddZdeeef f fd	d
Zde	de
de
fddZ	dde
dee de
fddZ			dde
dee
 deee  deee  fddZ  ZS )GeneralConditionervector	crossattnconcat)   r         rL   )rI   rJ   rK   	cond_viewcond_motion
emb_modelsc              
      s.  t    g }t|D ]\}}t|}t|ts"J d|jj d|dd|_	|dd|_
|j	sDt|_| D ]}d|_q:|  td| d|jj d	t|d d
|j	  d|v rd|d |_nd|v rn|d |_n	td|jj |dd |_|jd urtj |_|| qt|| _d S )Nzembedder model z% has to inherit from AbstractEmbModelr0   Fr3           zInitialized embedder #z: z with z params. Trainable: r5   
input_keysz5need either 'input_key' or 'input_keys' for embedder legacy_ucg_value)r"   r#   	enumerater   
isinstancer    r*   r<   getr0   r3   r   train
parametersrequires_gradevalprintr   r5   rS   KeyErrorlegacy_ucg_valnprandomRandomStateucg_prngappendnn
ModuleList	embedders)r(   rQ   rf   n	embconfigembedderparamr)   r+   r,   r#   K   sF   

zGeneralConditioner.__init__ri   batchr-   c                 C   s\   |j d usJ |j}|j }tt||j D ]}|jjdd| |gdr+|||j |< q|S )NrL   rN   )p)r^   r3   rangelenr5   rb   choice)r(   ri   rk   rl   valir+   r+   r,   possibly_get_ucg_valo   s   z'GeneralConditioner.possibly_get_ucg_valNforce_zero_embeddingsc           	   
      s  t  }|d u r	g }| jD ]}|jrtntj}| 6 t|dr7|jd ur7|jd ur/| 	|  | |j }nt|drH| fdd|j
D  }W d    n1 sRw   Y  t|tjttfsiJ dt| t|ttfss|g}|D ][}|jdv r|j}n| j|  }|jdkr|jd u rttd|j tj|jd	 |jd
 || }t|dr|j|v rt|}||v rt|| |f| j| ||< qu|||< quq|S )Nr5   rS   c                    s   g | ]} | qS r+   r+   ).0krk   r+   r,   
<listcomp>   s    z.GeneralConditioner.forward.<locals>.<listcomp>z7encoder outputs must be tensors or a sequence, but got )rO   rP   rR         ?r   device)dictrf   r0   r   rB   no_gradhasattrr5   r^   rr   rS   rV   rC   listtupletypeOUTPUT_DIM2KEYSdimr3   r   	bernoullionesshaperz   
zeros_likecat
KEY2CATDIM)	r(   rk   rs   outputri   embedding_contextemb_outembout_keyr+   rv   r,   forwardx   s`   







zGeneralConditioner.forwardbatch_cbatch_ucforce_uc_zero_embeddingsforce_cond_zero_embeddingsc           
      C   st   |d u rg }t  }| jD ]}||j d|_q| ||}| |d u r$|n||}t| j|D ]\}}	|	|_q.||fS )NrR   )r~   rf   rc   r3   zip)
r(   r   r   r   r   	ucg_ratesri   cucrater+   r+   r,   get_unconditional_conditioning   s   

z1GeneralConditioner.get_unconditional_conditioningr!   )NNN)r<   r=   r>   r   r   r   r   r   r#   r    r   rr   r   r   rD   r   rG   r+   r+   r)   r,   rH   G   s2    $

5

rH   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )InceptionV3zsWrapper around the https://github.com/mseitzer/pytorch-fid inception
    port with an additional squeeze at the endFc                    s8   t    ddlm} d|d< |jdd|i|| _d S )Nr   )	inceptionTresize_inputnormalize_inputr+   )r"   r#   pytorch_fidr   r   model)r(   r   kwargsr   r)   r+   r,   r#      s   
zInceptionV3.__init__c                 C   s&   |  |}t|dkr|d  S |S )NrN   r   )r   rn   squeeze)r(   inpoutpr+   r+   r,   r      s   
zInceptionV3.forwardFr<   r=   r>   __doc__r#   r   rG   r+   r+   r)   r,   r      s    r   c                   @   s   e Zd Zdd Zdd ZdS )IdentityEncoderc                 C      |S r!   r+   r(   xr+   r+   r,   encode      zIdentityEncoder.encodec                 C   r   r!   r+   r   r+   r+   r,   r      r   zIdentityEncoder.forwardN)r<   r=   r>   r   r   r+   r+   r+   r,   r      s    r   c                       s0   e Zd Zd
 fdd	Zdd Zddd	Z  ZS )ClassEmbedder  Fc                    s(   t    t||| _|| _|| _d S r!   )r"   r#   rd   	Embedding	embedding	n_classesadd_sequence_dim)r(   	embed_dimr   r   r)   r+   r,   r#      s   

zClassEmbedder.__init__c                 C   s*   |  |}| jr|d d d d d f }|S r!   )r   r   )r(   r   r+   r+   r,   r      s   
zClassEmbedder.forwardcudac                 C   s0   | j d }tj|f|d| }| j| i}|S )NrN   ry   )r   rB   r   keylong)r(   bsrz   uc_classr   r+   r+   r,   r      s
   z,ClassEmbedder.get_unconditional_conditioning)r   F)r   )r<   r=   r>   r#   r   r   rG   r+   r+   r)   r,   r      s    r   c                       s   e Zd Zd fdd	Z  ZS )ClassEmbedderForMultiCondNFc                    sX   |}t || j}t|| t}|r|| d ||< t |||}|r&|gn|||< |S )Nr   )r   r   rV   r~   r"   r   )r(   rk   r   disable_dropoutoutislistc_outr)   r+   r,   r      s   z!ClassEmbedderForMultiCond.forwardNF)r<   r=   r>   r   rG   r+   r+   r)   r,   r      s    r   c                       <   e Zd ZdZ	d fdd	Zdd	 Zd
d Zdd Z  ZS )FrozenT5Embedderz(Uses the T5 transformer encoder for textgoogle/t5-v1_1-xxlr   M   Tc                    B   t    t|| _t|| _|| _|| _|r| 	  d S d S r!   )
r"   r#   r   from_pretrained	tokenizerr   transformerrz   
max_lengthfreezer(   versionrz   r   r   r)   r+   r,   r#         
zFrozenT5Embedder.__init__c                 C   $   | j  | _ |  D ]}d|_q
d S r   r   r[   rY   rZ   r(   rj   r+   r+   r,   r        zFrozenT5Embedder.freezec              	   C   n   | j |d| jddddd}|d | j}tjddd | j|d	}W d    n1 s-w   Y  |j}|S 
NTFr   pt
truncationr   return_lengthreturn_overflowing_tokenspaddingreturn_tensors	input_idsr   enabled)r   r   r   torz   rB   r   r   last_hidden_stater(   textbatch_encodingtokensoutputszr+   r+   r,   r        	zFrozenT5Embedder.forwardc                 C      | |S r!   r+   r(   r   r+   r+   r,   r   !     zFrozenT5Embedder.encode)r   r   r   T	r<   r=   r>   r   r#   r   r   r   rG   r+   r+   r)   r,   r      s    r   c                       r   )FrozenByT5EmbedderzI
    Uses the ByT5 transformer encoder for text. Is character-aware.
    google/byt5-baser   r   Tc                    r   r!   )
r"   r#   r   r   r   r   r   rz   r   r   r   r)   r+   r,   r#   *  r   zFrozenByT5Embedder.__init__c                 C   r   r   r   r   r+   r+   r,   r   5  r   zFrozenByT5Embedder.freezec              	   C   r   r   r   r   r+   r+   r,   r   ;  r   zFrozenByT5Embedder.forwardc                 C   r   r!   r+   r   r+   r+   r,   r   K  r   zFrozenByT5Embedder.encode)r   r   r   Tr   r+   r+   r)   r,   r   %  s    r   c                       sT   e Zd ZdZg dZ								d fd
d	Zdd Zedd Zdd Z	  Z
S )FrozenCLIPEmbedderz=Uses the CLIP transformer encoder for text (from huggingface))lastpooledhiddenopenai/clip-vit-large-patch14r   r   Tr   NFc                    s   t    || jv sJ t|| _t|| _|| _|| _	|r$| 
  || _|| _|| _|dkrG|d us7J dt|  krDdksIJ  J d S d S )Nr   r      )r"   r#   LAYERSr   r   r   r   r   rz   r   r   layer	layer_idxreturn_pooledabs)r(   r   rz   r   r   r   r   always_return_pooledr)   r+   r,   r#   T  s    

 zFrozenCLIPEmbedder.__init__c                 C   r   r   r   r   r+   r+   r,   r   m  r   zFrozenCLIPEmbedder.freezec              	   C   s   | j |d| jddddd}|d | j}| j|| jdkd}| jd	kr(|j}n| jd
kr:|jd d d d d f }n|j| j	 }| j
rH||jfS |S )NTFr   r   r   r   r   )r   output_hidden_statesr   r   )r   r   r   rz   r   r   r   pooler_outputhidden_statesr   r   r   r+   r+   r,   r   s  s*   	



zFrozenCLIPEmbedder.forwardc                 C   r   r!   r+   r   r+   r+   r,   r     r   zFrozenCLIPEmbedder.encode)r   r   r   Tr   NF)r<   r=   r>   r   r   r#   r   r   r   r   rG   r+   r+   r)   r,   r   O  s    
r   c                       sx   e Zd ZdZg dZ									d fd
d	Zdd Zedd Zdd Z	dd Z
ddejfddZdd Z  ZS )FrozenOpenCLIPEmbedder2z8
    Uses the OpenCLIP transformer encoder for text
    )r   r   penultimateViT-H-14laion2b_s32b_b79kr   r   Tr   Fc	                    s   t    || jv sJ tj|td|d\}	}
}
|	`|	| _|| _|| _	|| _
|r.|   || _| jdkr:d| _n| jdkrCd| _nt || _d S Ncpurz   
pretrainedr   r   r   rN   )r"   r#   r   	open_clipcreate_model_and_transformsrB   rz   visualr   r   r   r   r   r   NotImplementedErrorlegacy)r(   archr   rz   r   r   r   r   r  r   _r)   r+   r,   r#     s*   



z FrozenOpenCLIPEmbedder2.__init__c                 C   r   r   r   r[   rY   rZ   r   r+   r+   r,   r        zFrozenOpenCLIPEmbedder2.freezec                 C   sX   t |}| || j}| js| jr|S | jr'| jrJ || j |d fS || j S )Nr   )r  tokenizeencode_with_transformerr   rz   r   r  r   r(   r   r   r   r+   r+   r,   r     s   


zFrozenOpenCLIPEmbedder2.forwardc                 C   s   | j |}|| j j }|ddd}| j|| j jd}| jr,|| j }| j |}|S |d }| j |}| 	||}||d< |S )NrN   r   rL   	attn_maskr   r   )
r   token_embeddingpositional_embeddingpermutetext_transformer_forwardr  r  r   ln_finalpool)r(   r   r   or   r+   r+   r,   r    s   
z/FrozenOpenCLIPEmbedder2.encode_with_transformerc                 C   s,   |t |jd |jddf | jj }|S )Nr   r   )rB   aranger   argmaxr   text_projection)r(   r   r   r+   r+   r,   r    s
   zFrozenOpenCLIPEmbedder2.poolNr   c                 C   s   i }t | jjjD ]/\}}|t| jjjd kr!|ddd|d< | jjjr2tj	 s2t
|||}q	|||d}q	|ddd|d< |S )NrN   r   rL   r   r  r   )rU   r   r   	resblocksrn   r  grad_checkpointingrB   jitis_scriptingr   )r(   r   r  r   rq   rr+   r+   r,   r    s   z0FrozenOpenCLIPEmbedder2.text_transformer_forwardc                 C   r   r!   r+   r   r+   r+   r,   r     r   zFrozenOpenCLIPEmbedder2.encode)r   r   r   r   Tr   FTr!   )r<   r=   r>   r   r   r#   r   r   r   r  r  rB   rC   r  r   rG   r+   r+   r)   r,   r     s&    #

r   c                       sd   e Zd ZddgZ						d fdd		Zd
d Zdd Zdd Zddej	fddZ
dd Z  ZS )FrozenOpenCLIPEmbedderr   r   r   r   r   r   Tc           	         s   t    || jv sJ tj|td|d\}}}|`|| _|| _|| _	|r+| 
  || _| jdkr8d| _d S | jdkrBd| _d S t r   )r"   r#   r   r  r  rB   rz   r  r   r   r   r   r   r  )	r(   r  r   rz   r   r   r   r   r  r)   r+   r,   r#     s"   
	



zFrozenOpenCLIPEmbedder.__init__c                 C   r   r   r	  r   r+   r+   r,   r     r
  zFrozenOpenCLIPEmbedder.freezec                 C   s    t |}| || j}|S r!   )r  r  r  r   rz   r  r+   r+   r,   r     s   
zFrozenOpenCLIPEmbedder.forwardc                 C   sV   | j |}|| j j }|ddd}| j|| j jd}|ddd}| j |}|S )NrN   r   rL   r  )r   r  r  r  r  r  r  )r(   r   r   r+   r+   r,   r  $  s   z.FrozenOpenCLIPEmbedder.encode_with_transformerNr   c                 C   sh   t | jjjD ]*\}}|t| jjj| j kr |S | jjjr+tj	 s+t
|||}q|||d}q|S )Nr  )rU   r   r   r  rn   r   r  rB   r  r  r   )r(   r   r  rq   r   r+   r+   r,   r  -  s   z/FrozenOpenCLIPEmbedder.text_transformer_forwardc                 C   r   r!   r+   r   r+   r+   r,   r   :  r   zFrozenOpenCLIPEmbedder.encode)r   r   r   r   Tr   r!   )r<   r=   r>   r   r#   r   r   r  rB   rC   r  r   rG   r+   r+   r)   r,   r!    s     	r!  c                       sh   e Zd ZdZ													
d fdd	Zdd Zdd ZedddZdd Z	dd Z
  ZS )FrozenOpenCLIPImageEmbedderzA
    Uses the OpenCLIP vision transformer encoder for images
    r   r   r   r   TrR   Fr   Nc                    s   t    tj|tt|d|d\}}}|`|| _|
| _	| j	dk| _
|	o)| j
 | _|| _|| _|r7|   || _| jdtg ddd | jdtg d	dd || _|| _d | _|| jj_|| _d S )
Nr   r   r   mean)g3<4'?gwgM?gy{ ?F)
persistentstd)gB91?gwt.?g	U?)r"   r#   r  r  rB   rz   r   r   r   	max_cropspad_to_max_lenrepeat_to_max_lenr   r   	antialiasregister_bufferrC   r3   unsqueeze_dimstored_batchr  output_tokens)r(   r  r   rz   r   r   r)  r3   r+  r(  num_image_cropsr-  init_devicer   r  r)   r+   r,   r#   C  s6   


z$FrozenOpenCLIPImageEmbedder.__init__c                 C   s<   t jj|ddd| jd}|d d }t j|| j| j}|S )N)   r0  bicubicT)interpolationalign_cornersr)  rx   g       @)korniageometryresizer)  enhance	normalizer#  r%  r   r+   r+   r,   
preprocessp  s   z&FrozenOpenCLIPImageEmbedder.preprocessc                 C   r   r   r	  r   r+   r+   r,   r   ~  r
  z"FrozenOpenCLIPImageEmbedder.freezec              	   C   s  |  |}d }| jr|d |d }}||j}| jdkr\|s\| jdks\td| j tj|j	d |j
d d d d f | }|d ur\ttd| j tj|j	d |j
d || }| jrj|d d d d d f }| jr{| jrrJ | jrwJ ||fS | jr| dkr|d d d d d f }n|}t|d| jd|fS | jr| d	ksJ t|tj|j	d | j|j	d  |j	d |j
dfd}||d d dd
f fS |S )Nr   rN   rR   rx   ry   rL   zb 1 d -> b n drg   r   .)encode_with_vision_transformerr-  r   dtyper3   r&  rB   r   r   r   rz   r   r+  r(  r'  r   r
   r   r   zeros)r(   image
no_dropoutr   r   z_z_padr+   r+   r,   r     sh   




	z#FrozenOpenCLIPImageEmbedder.forwardc              	   C   s  |  dkr| j|jd ksJ t|d}| |}| js-| jjjr$J | j|}d }n| jjjs4J | j|\}}| jdkryt|d| jd}t	d| j
 tj|jd |jd d|jd | }|d uryt|d	| jd}td
| jj d | jr||fS |S )N   rN   zb n c h w -> (b n) c h wr   z(b n) d -> b n dr:  rx   ry   z(b n) t d -> b t (n d)z2You are running very experimental token-concat in z9. Check what you are doing, and then remove this message.)r   r&  r   r	   r9  r-  r   r  rB   r   r3   r   rz   r\   r*   r<   )r(   imgr   r   r+   r+   r,   r;    s8   


z:FrozenOpenCLIPImageEmbedder.encode_with_vision_transformerc                 C   r   r!   r+   r   r+   r+   r,   r     r   z"FrozenOpenCLIPImageEmbedder.encode)r   r   r   r   TTrR   FFr   FNr   )r<   r=   r>   r   r#   r9  r   r   r   r;  r   rG   r+   r+   r)   r,   r"  >  s*    -5"r"  c                       s8   e Zd Z					d fdd	Zdd Zd	d
 Z  ZS )FrozenCLIPT5Encoderr   google/t5-v1_1-xlr   r   c              	      sr   t    t|||d| _t|||d| _t| jjj dt	| jd dd| jjj dt	| jd dd d S )N)r   z has gư>z.2fz M parameters, z comes with z
 M params.)
r"   r#   r   clip_encoderr   
t5_encoderr\   r*   r<   r   )r(   clip_version
t5_versionrz   clip_max_lengtht5_max_lengthr)   r+   r,   r#     s   
zFrozenCLIPT5Encoder.__init__c                 C   r   r!   r+   r   r+   r+   r,   r     r   zFrozenCLIPT5Encoder.encodec                 C   s    | j |}| j|}||gS r!   )rF  r   rG  )r(   r   clip_zt5_zr+   r+   r,   r     s   zFrozenCLIPT5Encoder.forward)r   rE  r   r   r   )r<   r=   r>   r#   r   r   rG   r+   r+   r)   r,   rD    s    rD  c                       s@   e Zd Z									d fdd	Zd	d
 Zdd Z  ZS )SpatialRescalerrN   bilinear      ?r   NFc
           
         s   t    || _| jdksJ |dv sJ || _ttjjj|d| _	|d up'|	| _
| j
rDtd| d| d tj|||||d d| _|| _d S )	Nr   )nearestlinearrO  	trilinearr1  area)modezSpatial Rescaler mapping from z to z channels after resizing.rL   )kernel_sizebiasr   )r"   r#   n_stages
multiplierr   rB   rd   
functionalinterpolateinterpolatorremap_outputr\   Conv2dchannel_mapper
wrap_video)
r(   rX  methodrY  in_channelsout_channelsrW  r`  rV  r]  r)   r+   r,   r#     s&   

zSpatialRescaler.__init__c                 C   s   | j r|jdkr|j\}}}}}t|d}t|d}t| jD ]
}| j|| jd}q| j r;t|d|||d}t|d}| jrC| 	|}|S )NrB  zb c t h w -> b t c h wzb t c h w -> (b t) c h w)scale_factorz(b t) c h w -> b t c h w)btr   zb t c h w -> b c t h w)
r`  ndimr   r	   rm   rX  r\  rY  r]  r_  )r(   r   BCTHWstager+   r+   r,   r   #  s   



zSpatialRescaler.forwardc                 C   r   r!   r+   r   r+   r+   r,   r   3  r   zSpatialRescaler.encode)	rN   rO  rP  r   NFFrN   F)r<   r=   r>   r#   r   r   rG   r+   r+   r)   r,   rN    s    'rN  c                       sT   e Zd Z				d fdd	Z						
dddZdddZdd Zdd Z  ZS )LowScaleEncoderr      @   rx   c                    s<   t    || _t|| _| j|||d| _|| _|| _d S )N)	timestepslinear_start
linear_end)	r"   r#   max_noise_levelr   r   register_scheduleaugmentation_scheduleout_sizerd  )r(   model_configrr  rs  rq  rt  output_sizerd  r)   r+   r,   r#   8  s   



zLowScaleEncoder.__init__rR  -C6?{Gz?Mb?c                 C   s8  t |||||d}d| }tj|dd}td|d d }	|j\}t|| _|| _|| _|jd | jks9J dt	t
jt
jd}
| d|
| | d	|
| | d
|
|	 | d|
t| | d|
td|  | d|
td|  | d|
td|  | d|
td| d  d S )N)rr  rs  cosine_srx   r   )axisr  z+alphas have to be defined for each timestep)r<  betasalphas_cumprodalphas_cumprod_prevsqrt_alphas_cumprodsqrt_one_minus_alphas_cumprodlog_one_minus_alphas_cumprodsqrt_recip_alphas_cumprodsqrt_recipm1_alphas_cumprodrN   )r   r_   cumprodrc   r   intnum_timestepsrr  rs  r   rB   tensorfloat32r*  sqrtlog)r(   beta_schedulerq  rr  rs  r}  r  alphasr  r  to_torchr+   r+   r,   ru  K  sD   
z!LowScaleEncoder.register_scheduleNc                    s:   t | fdd}t| j| j  t| j| j|  S )Nc                      s
   t  S r!   )rB   
randn_liker+   x_startr+   r,   <lambda>|  s   
 z*LowScaleEncoder.q_sample.<locals>.<lambda>)r   r   r  r   r  )r(   r  rf  noiser+   r  r,   q_sample{  s   zLowScaleEncoder.q_samplec                 C   s~   | j |}t|tr| }|| j }tjd| j|j	d f|j
d }| ||}| jd ur;tjjj|| jdd}||fS )Nr   ry   rQ  )sizerU  )r   r   rV   r   samplerd  rB   randintrt  r   rz   r   r  rw  rd   rZ  r[  )r(   r   r   noise_levelr+   r+   r,   r     s   


zLowScaleEncoder.forwardc                 C   s   || j  }| j|S r!   )rd  r   decode)r(   r   r+   r+   r,   r    s   
zLowScaleEncoder.decode)r   ro  rp  rx   )rR  r   rz  r{  r|  r!   )	r<   r=   r>   r#   ru  r  r   r  rG   r+   r+   r)   r,   rn  7  s    

0rn  c                       s(   e Zd ZdZ fddZdd Z  ZS )ConcatTimestepEmbedderNDz9embeds each dimension independently and concatenates themc                    s   t    t|| _|| _d S r!   )r"   r#   r   timestepoutdim)r(   r  r)   r+   r,   r#     s   


z!ConcatTimestepEmbedderND.__init__c                 C   sn   |j dkr|d d d f }t|jdksJ |jd |jd }}t|d}| |}t|d||| jd}|S )NrN   rL   r   zb d -> (b d)z(b d) d2 -> b (d d2))re  dd2)rg  rn   r   r	   r  r  )r(   r   re  dimsr   r+   r+   r,   r     s   


z ConcatTimestepEmbedderND.forwardr   r+   r+   r)   r,   r    s    r  c                       sF   e Zd Z	d
dedef fddZdeeej	f f fdd	Z
  ZS )GaussianEncoderrx   Tweightflatten_outputc                    s*   t  j|i | t | _|| _|| _d S r!   )r"   r#   r   	posteriorr  r  )r(   r  r  argsr   r)   r+   r,   r#     s   
zGaussianEncoder.__init__r-   c                    sH   t  |}| |\}}|d |d< | j|d< | jr t|d}||fS )Nkl_losslossr  zb c h w -> b (h w ) c)r"   r   r  r  r  r	   )r(   r   r   r  r)   r+   r,   r     s   

zGaussianEncoder.forward)rx   T)r<   r=   r>   rA   r@   r#   r   r   rB   rC   r   rG   r+   r+   r)   r,   r    s    $r  c                       s   e Zd Z						ddedededee dee d	ed
ededee f fddZde	j
dee	j
ee	j
e	j
f ee	j
ef eee	j
e	j
f ef f fddZ  ZS )"VideoPredictionEmbedderWithEncoderNFrx   n_cond_framesn_copiesencoder_configsigma_sampler_configsigma_cond_configis_aerd  disable_encoder_autocasten_and_decode_n_samples_a_timec
           
         sh   t    || _|| _t|| _|d urt|nd | _|d ur#t|nd | _|| _|| _	|| _
|	| _d S r!   )r"   r#   r  r  r   encodersigma_sampler
sigma_condr  rd  r  r  )
r(   r  r  r  r  r  r  rd  r  r  r)   r+   r,   r#     s    


z+VideoPredictionEmbedderWithEncoder.__init__vidr-   c                 C   s  | j d urM|jd | j }|  ||j}| jd ur6| |}| jdkr.t|d| jd}nt|d| jd}t|d| jd}t	|}||t
||j  }tjd| j dP | jd ur_| jn|jd }t|jd | }g }t|D ]*}	| jr| j||	| |	d |  }
n| ||	| |	d |  }
||
 qtW d    n1 sw   Y  tj|dd}|| j9 }| jdkrt|d	| jd}t|d
| jd}| jd ur||f}|S |}|S )Nr   rN   zb d -> (b t) drf  z
b -> (b t)r   r   r  z(b t) c h w -> b () (t c) h wzb 1 c h w -> (b t) c h w)r  r   r  r   rz   r  r
   r  rB   r  r   rg  r   r  r  mathceilrm   r  r  r   rc   r   rd  r	   )r(   r  re  sigmasr  r  	n_samplesn_roundsall_outrg   r   
return_valr+   r+   r,   r     sD   





"

z*VideoPredictionEmbedderWithEncoder.forward)NNFrx   FN)r<   r=   r>   r  r{   r   r@   rA   r#   rB   rC   r   r   r   rG   r+   r+   r)   r,   r    sJ    	
 r  c                       s2   e Zd Zdededef fddZdd Z  ZS )%FrozenOpenCLIPImagePredictionEmbedderopen_clip_embedding_configr  r  c                    s$   t    || _|| _t|| _d S r!   )r"   r#   r  r  r   r  )r(   r  r  r  r)   r+   r,   r#     s   
z.FrozenOpenCLIPImagePredictionEmbedder.__init__c                 C   s.   |  |}t|d| jd}t|d| jd}|S )Nz(b t) d -> b t dr  zb t d -> (b s) t d)s)r  r	   r  r
   r  )r(   r  r+   r+   r,   r     s   
z-FrozenOpenCLIPImagePredictionEmbedder.forward)r<   r=   r>   r   r  r#   r   rG   r+   r+   r)   r,   r    s    r  )Fr  
contextlibr   	functoolsr   typingr   r   r   r   r   r4  numpyr_   r  rB   torch.nnrd   einopsr	   r
   	omegaconfr   torch.utils.checkpointr   transformersr   r   r   r   r   !modules.autoencoding.regularizersr   modules.diffusionmodules.modelr   $modules.diffusionmodules.openaimodelr   modules.diffusionmodules.utilr   r   #modules.distributions.distributionsr   utilr   r   r   r   r   r   r   Moduler    rH   r   r   r   r   r   r   r   r   r!  r"  rD  rN  rn  r  r  r  r  r+   r+   r+   r,   <module>   sN    $,x(*AfH "<^Q