o
    㮬i(                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
Zd dlmZ d dlm  mZ d dlmZ d dlmZ ddlmZ ddlmZmZmZmZmZmZ dd	lmZ dd
l m!Z! e "e#Z$G dd dej%Z&G dd dej%Z'G dd dej(e'Z)G dd dej%Z*G dd dej%Z+G dd de'Z,G dd dej%Z-G dd dej%Z.G dd dej%Z/G dd dej%Z0G dd  d ej%Z1dS )!    N)abstractmethod)IterableListOptionalTupleUnion)	rearrange)
checkpoint   )SpatialTransformer)avg_pool_ndconv_ndlinearnormalizationtimestep_embeddingzero_module)SpatialVideoTransformer)existsc                
       sP   e Zd ZdZ	ddedededee f fddZd	ejd
ejfddZ	  Z
S )AttentionPool2dzS
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    Nspacial_dim	embed_dimnum_heads_channels
output_dimc                    sp   t    tt||d d |d  | _td|d| d| _td||p'|d| _	|| | _
t| j
| _d S )N      g      ?r
   )super__init__nn	Parameterthrandnpositional_embeddingr   qkv_projc_proj	num_headsQKVAttention	attention)selfr   r   r   r   	__class__ R/data/cameron/vidgen/generative-models/sgm/modules/diffusionmodules/openaimodel.pyr      s   

zAttentionPool2d.__init__xreturnc                 C   s   |j \}}}|||d}tj|jddd|gdd}|| jd d d d d f |j }| |}| 	|}| 
|}|d d d d df S )NT)dimkeepdimr/   r   )shapereshaper   catmeanr!   todtyper"   r&   r#   )r'   r,   bc_r*   r*   r+   forward+   s   $


zAttentionPool2d.forwardN)__name__
__module____qualname____doc__intr   r   r   Tensorr;   __classcell__r*   r*   r(   r+   r      s    	r   c                   @   s*   e Zd ZdZedejdejfddZdS )TimestepBlockzT
    Any module where forward() takes timestep embeddings as a second argument.
    r,   embc                 C   s   dS )zJ
        Apply the module to `x` given `emb` timestep embeddings.
        Nr*   r'   r,   rE   r*   r*   r+   r;   ;   s    zTimestepBlock.forwardN)r=   r>   r?   r@   r   r   rB   r;   r*   r*   r*   r+   rD   6   s    rD   c                   @   s   e Zd ZdZ									ddejdejdeej deej deej deej d	eej d
ee dee dee dee fddZ	dS )TimestepEmbedSequentialzt
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    Nr,   rE   contextcamimage_only_indicator	cond_viewcond_motiontime_contextnum_video_frames	time_stepnamec                 C   s  ddl m}m} ddlm}m}m} | D ]t}|}t|||fr-||||||	||||
|
}qt||frA||||||	||||
|
}qt|trO|||||	|}qt||r\||||	|}qt||ri||||	|}qt|t	ryt||sy|||}qt|t
r|||}q||}q|S )Nr
   )VideoResBlockPostHocResBlockWithTime)BasicTransformerTimeMixBlock'PostHocSpatialTransformerWithTimeMixing0PostHocSpatialTransformerWithTimeMixingAndMotion)$modules.diffusionmodules.video_modelrQ   rR   modules.spacetime_attentionrS   rT   rU   
isinstancer   rD   r   )r'   r,   rE   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   layermoduler*   r*   r+   r;   H   sv   




zTimestepEmbedSequential.forward)	NNNNNNNNN)
r=   r>   r?   r@   r   rB   r   rA   strr;   r*   r*   r*   r+   rG   B   sD    		
rG   c                       sj   e Zd ZdZ						ddeded	ed
ee dedededef fddZdej	dej	fddZ
  ZS )UpsampleaA  
    An upsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    r   Nr   Fr
   channelsuse_convdimsout_channelspaddingthird_upkernel_sizescale_factorc	           	         sV   t    || _|p|| _|| _|| _|| _|| _|r)t|| j| j||d| _	d S d S )Nra   )
r   r   r]   r`   r^   r_   rb   rd   r   conv)	r'   r]   r^   r_   r`   ra   rb   rc   rd   r(   r*   r+   r      s   

zUpsample.__init__r,   r-   c                 C   s   |j d | jks
J | jdkr4| jsdn| j}tj|||j d  |j d | j |j d | j fdd}n	tj|| jdd}| jrE| |}|S )Nr   r
   r      nearest)mode)rd   ri   )	r2   r]   r_   rb   rd   Finterpolater^   rf   )r'   r,   t_factorr*   r*   r+   r;      s   


zUpsample.forward)r   Nr   Fr
   r   r=   r>   r?   r@   rA   boolr   r   r   rB   r;   rC   r*   r*   r(   r+   r\      s4    	r\   c                       s^   e Zd ZdZ				ddededed	ee d
edef fddZdej	dej	fddZ
  ZS )
DownsampleaD  
    A downsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    r   Nr   Fr]   r^   r_   r`   ra   
third_downc              
      s   t    || _|p|| _|| _|| _|dkrdn|sdnd}|rXtd| d td| j d| j d	| d
|  |dkrItd|  t|| j| jd||d| _	d S | j| jks`J t
|||d| _	d S )Nr
   r   )r   r   r   )r   r   r   z!Building a Downsample layer with z dims.z  --> settings are: 
 in-chn: z, out-chn: z, kernel-size: 3, stride: z, padding: z&  --> Downsampling third axis (time): )stridera   )rc   rq   )r   r   r]   r`   r^   r_   logpyinfor   opr   )r'   r]   r^   r_   r`   ra   rp   rq   r(   r*   r+   r      s6   
	
	zDownsample.__init__r,   r-   c                 C   s   |j d | jks
J | |S )Nr   )r2   r]   rt   )r'   r,   r*   r*   r+   r;      s   
zDownsample.forward)r   Nr   Frm   r*   r*   r(   r+   ro      s(    #ro   c                       s   e Zd ZdZ										ddededed	ee d
ededededededededef fddZde	j
de	j
de	j
fddZde	j
de	j
de	j
fddZ  ZS )ResBlocka  
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    NFr   r
   r]   emb_channelsdropoutr`   r^   use_scale_shift_normr_   use_checkpointupdownrc   exchange_temb_dims
skip_t_embc                    s  t    || _|| _|| _|p|| _|| _|| _|| _|| _	t
|tr,dd |D }n|d }tt|t t||| j||d| _|	pG|
| _|	rZt|d|| _t|d|| _n|
rkt|d|| _t|d|| _nt  | _| _|| _|r}d| j n| j| _| jrtd| jj  | jrJ d | _d| _	ntt t|| j| _tt| jt tj |dt!t|| j| j||d| _"| j|krt | _#d S |rt||| j||d| _#d S t||| jd| _#d S )	Nc                 S   s   g | ]}|d  qS )r   r*   ).0kr*   r*   r+   
<listcomp>/  s    z%ResBlock.__init__.<locals>.<listcomp>r   re   FzSkipping timestep embedding in )pr   )$r   r   r]   rv   rw   r`   r^   ry   rx   r|   rX   r   r   
Sequentialr   SiLUr   	in_layersupdownr\   h_updx_updro   Identityr}   emb_out_channelsrr   rs   r)   r=   
emb_layersr   Dropoutr   
out_layersskip_connection)r'   r]   rv   rw   r`   r^   rx   r_   ry   rz   r{   rc   r|   r}   ra   r(   r*   r+   r     sz   






zResBlock.__init__r,   rE   r-   c                 C   s    | j r
t| j||S | ||S )a  
        Apply the block to a Tensor, conditioned on a timestep embedding.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        )ry   r	   _forwardrF   r*   r*   r+   r;   n  s   zResBlock.forwardc                 C   sn  |d ur:|j d |j d kr:|j d |j d  dkr:|j d |j d  }|j|j d |g|j dd  R  jdd}| jr]| jd d | jd }}||}| |}| |}||}n| |}| jrkt	|}n	| 
||j}|j d |j d kr|j d |j d  dkr|j d |j d  }|j|j d |g|j dd  R  jdd}n1|j d |j d  dkr|j|j d |j d  dd}n|d d j|j d g|j dd  R  }t|j t|j k r|d }t|j t|j k s| jr| jd | jdd  }}	tj|ddd\}
}||d|
  | }|	|}n| jr't|d}|| }| |}| || S )Nr   r   r1   r.   ).Nr   zb t c ... -> b c t ...)r2   r3   r5   r   r   r   r   r}   r   
zeros_liker   typer7   repeat_interleaveexpandlenrx   r   chunkr|   r   r   )r'   r,   rE   nin_restin_convhemb_outout_normout_restscaleshiftr*   r*   r+   r   z  sB   4,



. *


zResBlock._forward)
NFFr   FFFr
   FF)r=   r>   r?   r@   rA   floatr   rn   r   r   rB   r;   r   rC   r*   r*   r(   r+   ru     sR    	
Z$ru   c                       sl   e Zd ZdZ				ddedededed	ef
 fd
dZdejdejfddZ	dejdejfddZ
  ZS )AttentionBlocka  
    An attention block that allows spatial positions to attend to each other.
    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    r   r.   Fr]   r$   num_head_channelsry   use_new_attention_orderc                    s   t    || _|dkr|| _n|| dks J d| d| || | _|| _t|| _td||d d| _|r@t	| j| _
nt| j| _
ttd||d| _d S )Nr.   r   zq,k,v channels z' is not divisible by num_head_channels r   r
   )r   r   r]   r$   ry   r   normr   qkvr%   r&   QKVAttentionLegacyr   proj_out)r'   r]   r$   r   ry   r   r(   r*   r+   r     s   


zAttentionBlock.__init__r,   r-   c                 K   s   t | j|S r<   )r	   r   )r'   r,   kwargsr*   r*   r+   r;        zAttentionBlock.forwardc                 C   sV   |j ^}}}|||d}| | |}| |}| |}|| j||g|R  S )Nr.   )r2   r3   r   r   r&   r   )r'   r,   r8   r9   spatialr   r   r*   r*   r+   r     s   

zAttentionBlock._forward)r   r.   FF)r=   r>   r?   r@   rA   rn   r   r   rB   r;   r   rC   r*   r*   r(   r+   r     s&    	r   c                       <   e Zd ZdZdef fddZdejdejfddZ  Z	S )	r   zh
    A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
    n_headsc                       t    || _d S r<   r   r   r   r'   r   r(   r*   r+   r        

zQKVAttentionLegacy.__init__r   r-   c                 C   s   |j \}}}|d| j  dksJ |d| j  }||| j |d |j|dd\}}}dtt| }	td||	 ||	 }
tj|
	 dd
|
j}
td|
|}||d|S )z
        Apply QKV attention.
        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        r
   r   r   r1   bct,bcs->btsr.   bts,bcs->bct)r2   r   r3   splitmathsqrtr   einsumsoftmaxr   r   r7   r'   r   bswidthlengthchqr   vr   weightar*   r*   r+   r;     s   (zQKVAttentionLegacy.forward
r=   r>   r?   r@   rA   r   r   rB   r;   rC   r*   r*   r(   r+   r         r   c                       r   )	r%   zP
    A module which performs QKV attention and splits in a different order.
    r   c                    r   r<   r   r   r(   r*   r+   r     r   zQKVAttention.__init__r   r-   c              	   C   s   |j \}}}|d| j  dksJ |d| j  }|jddd\}}}dtt| }	td||	 || j ||||	 || j ||}
tj|
	 dd
|
j}
td|
||| j ||}||d|S )z
        Apply QKV attention.
        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        r
   r   r   r1   r   r.   r   )r2   r   r   r   r   r   r   viewr   r   r   r7   r3   r   r*   r*   r+   r;     s   zQKVAttention.forwardr   r*   r*   r(   r+   r%     r   r%   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Timestepr/   c                    r   r<   )r   r   r/   )r'   r/   r(   r*   r+   r     r   zTimestep.__init__tr-   c                 C   s   t || jS r<   )r   r/   )r'   r   r*   r*   r+   r;     r   zTimestep.forward)	r=   r>   r?   rA   r   r   rB   r;   rC   r*   r*   r(   r+   r     s    r   c                4       s  e Zd ZdZ																				
	d-dededededededeeef de	dede
eeef  de	dededede	de	dede
e de
ee	  de
ee  de	d e	d!e	d"ed#e
e f2 fd$d%Z			d.d&ejd'e
ej d(e
ej d)e
ej d*ejf
d+d,Z  ZS )/	UNetModela  
    The full UNet model with attention and timestep embedding.
    :param in_channels: channels in the input Tensor.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_heads_channels: if specified, ignore num_heads and instead use
                               a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
                               of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
                                    increased efficiency.
            r   r   rg      Tr   NFr.   r   r   in_channelsmodel_channelsr`   num_res_blocksattention_resolutionsrw   channel_multconv_resampler_   num_classesry   r$   r   num_heads_upsamplerx   resblock_updowntransformer_depthcontext_dimdisable_self_attentionsnum_attention_blocksdisable_middle_self_attndisable_middle_transformeruse_linear_in_transformerspatial_transformer_attn_typeadm_in_channelsc           (         s  t    |dkr|}|dkr|dksJ d|dkr#|dks#J d|_|_|_t|tr8t||g }|d }t|trJt||g _nt|t|krVt	d|_|d urgt|t|ksgJ  d urt tjksvJ t
t fddtt sJ td  d| d |_|_|_|_|
_|_|_|_|_|d	 }tt||t t||_jd ur)tjtrt|
|_nRjd
krtd td|_n@jdkrtt |tt||t t||_n#jdkr'|d usJ ttt||t t||_nt	t!t"t#|	||dddg_$|_%|g}|}d}t&|D ]\}} tj| D ]m}!t'|||| | |	||dg}"| | }||v r|dkru|| }#n|| }|}#|d urt(|r|| }$nd}$t( r|! | k r|")t*|||#|| ||$|||d	 j$)t"|"   j%|7  _%|)| qR|t|d kr|}%j$)t"|rt'||||%|	||ddnt+|||	|%d |%}|)| |d9 } j%|7  _%qG|dkr|| }#n|| }|}#t"t'|||||	||d|s't*|||#||||||d	nt,j- t'||||	||d_. j%|7  _%t!g _/t0t&|d d d D ]\}} tj| d D ]}&|1 }'t'||' ||||  |	||dg}"||  }||v r|dkr|| }#n|| }|}#t(|r|| }$nd}$t( r|& | k r|")t*|||#|| ||$|||d	 |r|&j| kr|}%|")|rt'||||%|	||ddnt2|||	|%d |d }j/)t"|"   j%|7  _%q\qOtt3|t t4t#|	||ddd_5d S )Nr.   z3Either num_heads or num_head_channels has to be setzprovide num_res_blocks either as an int (globally constant) or as a list/tuple (per-level) with the same length as channel_multc                    s   j |   |  kS r<   )r   )ir   r'   r*   r+   <lambda>w  s    z$UNetModel.__init__.<locals>.<lambda>z7Constructor of UNetModel received num_attention_blocks=z;. This option has LESS priority than attention_resolutions zz, i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, attention will still not be set.rg   
continuousz'setting up linear c_adm embedding layerr   timestep
sequentialr
   re   )r`   r_   ry   rx   F)depthr   disable_self_attn
use_linear	attn_typery   T)r`   r_   ry   rx   r{   )r_   r`   r   )r_   ry   rx   )r`   r_   ry   rx   rz   )6r   r   r   r   r`   rX   rA   r   r   
ValueErrorallmaprangerr   rs   r   rw   r   r   r   ry   r$   r   r   r   r   r   r   
time_embed	Embedding	label_embLinearr   
ModuleListrG   r   input_blocks_feature_size	enumerateru   r   appendr   ro   r   r   middle_blockoutput_blockslistpopr\   r   r   out)(r'   r   r   r`   r   r   rw   r   r   r_   r   ry   r$   r   r   rx   r   r   r   r   r   r   r   r   r   r   transformer_depth_middletime_embed_diminput_block_chansr   dslevelmultnrlayersdim_headdisabled_saout_chr   ichr(   r   r+   r   5  s  














 




?
zUNetModel.__init__r,   	timestepsrH   yr-   c                 K   s   |du| j duksJ dg }t|| jdd}| |}| j dur4|jd |jd ks-J || | }|}	| jD ]}
|
|	||}	||	 q9| |	||}	| j	D ]}
t
j|	| gdd}	|
|	||}	qQ|	|j}	| |	S )aQ  
        Apply the model to an input batch.
        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param context: conditioning plugged in via crossattn
        :param y: an [N] Tensor of labels, if class-conditional.
        :return: an [N x C x ...] Tensor of outputs.
        Nz<must specify y if and only if the model is class-conditionalF)repeat_onlyr   r   r1   )r   r   r   r   r2   r   r   r   r   r   r   r4   r   r   r7   r   )r'   r,   r  rH   r  r   hst_embrE   r   rZ   r*   r*   r+   r;   o  s*   




zUNetModel.forward)r   r   Tr   NFr.   r.   r.   FFr   NNNFFFr   N)NNN)r=   r>   r?   r@   rA   r   r   r   r   rn   r   r[   r   r   rB   r;   rC   r*   r*   r(   r+   r     s    $
	


  ?r   )2loggingr   abcr   typingr   r   r   r   r   torchr   torch.nnr   torch.nn.functional
functionalrj   einopsr   torch.utils.checkpointr	   modules.attentionr   modules.diffusionmodules.utilr   r   r   r   r   r   modules.video_attentionr   utilr   	getLoggerr=   rr   Moduler   rD   r   rG   r\   ro   ru   r   r   r%   r   r   r*   r*   r*   r+   <module>   s4     
 [52 !0	