o
    9i-                     @   s   d dl mZmZmZ d dlZd dlmZmZ d dlm	Z	m
Z
mZmZmZ d dlmZmZ d dlmZ d dlmZ G dd	 d	eZG d
d dejjZG dd de
ZG dd deZ			 	ddedefddZG dd dejjZG dd deZdS )    )CallableIterableUnionN)	rearrangerepeat)XFORMERS_IS_AVAILABLE	AttnBlockDecoderMemoryEfficientAttnBlockResnetBlock)ResBlocktimestep_embedding)VideoTransformerBlock)partialclassc                       s>   e Zd Zddddd fdd
Zdd Zd fdd	Z  ZS )VideoResBlock           learned)dropoutvideo_kernel_sizealphamerge_strategyc                   s   t  j|||d| |d u rg d}t|d|ddddd|ddd| _|| _| jdkr8| d	t|g d S | jd
krM| d	tj	
t|g d S td| j )N)out_channelsr   )r      r   r   r   FT)channelsemb_channelsr   dimsuse_scale_shift_normuse_convupdownkernel_sizeuse_checkpoint
skip_t_embfixed
mix_factorr   unknown merge strategy )super__init__r   
time_stackr   register_buffertorchTensorregister_parameternn	Parameter
ValueError)selfr   r   r   r   r   argskwargs	__class__ N/data/cameron/vidgen/generative-models/sgm/modules/autoencoding/temporal_ae.pyr(      s0   


zVideoResBlock.__init__c                 C   s,   | j dkr| jS | j dkrt| jS t )Nr$   r   r   r%   r+   sigmoidNotImplementedError)r1   bsr6   r6   r7   	get_alpha6   s
   

zVideoResBlock.get_alphaFNc                    s   |d u r| j }|j\}}}}t ||}|sBt|d|d}	t|d|d}| ||}| j|| d}
|
| d|
 |	  }t|d}|S )N(b t) c h w -> b c t h wt)r;         ?b c t h w -> (b t) c h w)	timestepsshaper'   forwardr   r)   r<   )r1   xtemb
skip_videorB   bchwx_mixr   r4   r6   r7   rD   >   s   
zVideoResBlock.forward)FN)__name__
__module____qualname__r(   r<   rD   __classcell__r6   r6   r4   r7   r      s    %r   c                       s,   e Zd Zd fdd	Zd fdd	Z  ZS )	AE3DConvr   c                    s\   t  j||g|R i | t|trdd |D }nt|d }tjj||||d| _d S )Nc                 S   s   g | ]}t |d  qS )   )int).0kr6   r6   r7   
<listcomp>X   s    z%AE3DConv.__init__.<locals>.<listcomp>rR   )in_channelsr   r!   padding)	r'   r(   
isinstancer   rS   r+   r.   Conv3dtime_mix_conv)r1   rW   r   r   r2   r3   rX   r4   r6   r7   r(   U   s   
zAE3DConv.__init__Fc                    s6   t  |}|r
|S t|d|d}| |}t|dS )Nr=   r>   rA   )r'   rD   r   r[   )r1   inputrB   rG   rE   r4   r6   r7   rD   c   s   

zAE3DConv.forward)r   F)rM   rN   rO   r(   rD   rP   r6   r6   r4   r7   rQ   T   s    rQ   c                       D   e Zd Z	ddededef fddZd fd	d
	Zdd Z  Z	S )
VideoBlockr   r   rW   r   r   c                       t  | t|d|dddd| _| jd }tjtj| j|tj	 tj|| j| _
|| _| jdkrB| dt|g d S | jd	krW| dtjt|g d S td
| j )Nr   FTsoftmaxdimn_headsd_head
checkpointff_in	attn_mode   r$   r%   r   r&   r'   r(   r   time_mix_blockrW   r+   r.   
SequentialLinearSiLUvideo_time_embedr   r*   r,   r-   r/   r0   r1   rW   r   r   time_embed_dimr4   r6   r7   r(   m   .   
	

zVideoBlock.__init__Fc                       |rt  |S |}| |}|jdd  \}}t|d}|}tj||jd}t|d|jd | d}t|d}t	|| j
dd	}	| |	}
|
d d d d d f }
||
 }|  }| j||d
}|| d| |  }t|d||d}| |}|| S NrR   zb c h w -> b (h w) c)devicezt -> b tr   )rH   zb t -> (b t)F)repeat_only)rB   r@   zb (h w) c -> b c h w)rJ   rK   r'   rD   	attentionrC   r   r+   arangeru   r   r   rW   ro   r<   rk   proj_out)r1   rE   rB   rG   x_inrJ   rK   rL   
num_framest_embembr   r4   r6   r7   rD      (   




zVideoBlock.forwardc                 C   6   | j dkr| jS | j dkrt| jS td| j  Nr$   r   r&   r8   r1   r6   r6   r7   r<      
   

zVideoBlock.get_alphar   r   r]   
rM   rN   rO   rS   floatstrr(   rD   r<   rP   r6   r6   r4   r7   r_   l       r_   c                       r^   )MemoryEfficientVideoBlockr   r   rW   r   r   c                    r`   )Nr   FTzsoftmax-xformersrb   ri   r$   r%   r   r&   rj   rp   r4   r6   r7   r(      rr   z"MemoryEfficientVideoBlock.__init__Fc                    rs   rt   rw   )r1   rE   rB   skip_time_blockr{   rJ   rK   rL   r|   r}   r~   r   r4   r6   r7   rD      r   z!MemoryEfficientVideoBlock.forwardc                 C   r   r   r8   r   r6   r6   r7   r<      r   z#MemoryEfficientVideoBlock.get_alphar   r]   r   r6   r6   r4   r7   r      r   r   vanillar   r   r   c                 C   s   |dv sJ d| dt d| d|  d ts*|dkr*t d| d	tj  d
}|d
kr<|d u s4J tt| ||dS |dkrPt d|  d tt| ||dS t S )N)r   vanilla-xformersz
attn_type z, not supported for spatio-temporal attentionz/making spatial and temporal attention of type 'z' with z in_channelsr   zAttention mode 'z' is not available. Falling back to vanilla attention. This is not a problem in Pytorch >= 2.0. FYI, you are running with PyTorch version r   r   r   z'building MemoryEfficientAttnBlock with z in_channels...)printr   r+   __version__r   r_   r   r:   )rW   	attn_typeattn_kwargsr   r   r6   r6   r7   make_time_attn   s6   

r   c                       s*   e Zd Zdejdejf fddZ  ZS )Conv2DWrapperr\   returnc                    s   t  |S )N)r'   rD   )r1   r\   r3   r4   r6   r7   rD     s   zConv2DWrapper.forward)rM   rN   rO   r+   r,   rD   rP   r6   r6   r4   r7   r     s    "r   c                	       s   e Zd Zg dZddddddeeef ded	ed
ef fddZ	dddZ
def fddZdefddZdef fddZ  ZS )VideoDecoder)all	conv-only	attn-onlyr   r   r   r   )r   r   r   	time_moder   r   r   r   c                   sJ   || _ || _|| _|| _| j| jv sJ d| j t j|i | d S )Nz!time_mode parameter has to be in )r   r   r   r   available_time_modesr'   r(   )r1   r   r   r   r   r2   r3   r4   r6   r7   r(   &  s   	
zVideoDecoder.__init__Fc                 K   s(   | j dkr	td|s| jjjS | jjS )Nr   TODO)r   r:   conv_outr[   weight)r1   skip_time_mixr3   r6   r6   r7   get_last_layer8  s   

zVideoDecoder.get_last_layerr   c                    s&   | j dvrtt| j| jdS t  S )N)r   only-last-convr   )r   r   r   r   r   r'   
_make_attnr   r4   r6   r7   r   B  s   

zVideoDecoder._make_attnc                 C   s   | j dkrtt| jdS tS )Nr   )r   )r   r   rQ   r   r   r   r6   r6   r7   
_make_convL  s   
zVideoDecoder._make_convc                    s*   | j dvrtt| j| j| jdS t  S )N)r   r   )r   r   r   )r   r   r   r   r   r   r'   _make_resblockr   r4   r6   r7   r   R  s   

zVideoDecoder._make_resblockr]   )rM   rN   rO   r   r   rS   listr   r   r(   r   r   r   r   r   rP   r6   r6   r4   r7   r   #  s&    



r   )r   Nr   r   ) typingr   r   r   r+   einopsr   r   "sgm.modules.diffusionmodules.modelr   r   r	   r
   r   (sgm.modules.diffusionmodules.openaimodelr   r   sgm.modules.video_attentionr   sgm.utilr   r   r.   Conv2drQ   r_   r   r   r   r   r   r   r6   r6   r6   r7   <module>   s,    DFH
&