o
    ѫi+                     @   s\   d dl Z ddlT ddlmZmZmZ G dd dejZG dd dej	Z
G d	d
 d
eZdS )    N   )*)AlphaBlenderlineartimestep_embeddingc                   @   s   e Zd ZdddZdS )TimeMixSequentialNc                 C   s   | D ]}||||}q|S )N )selfxcontext	timestepslayerr   r   E/data/cameron/vidgen/generative-models/sgm/modules/video_attention.pyforward	   s   zTimeMixSequential.forwardNN)__name__
__module____qualname__r   r   r   r   r   r      s    r   c                	       st   e Zd ZeedZ											d fdd	Z	dd	ejd
ejde	dejfddZ
dddZdd Z  ZS )VideoTransformerBlock)softmaxzsoftmax-xformers        NTFr   c                    sL  t    | j| }|	p|
d u| _|
d u r|}
t|| |
ks!J |
|k| _| jr8t|| _t	||
||d| _|| _
|| _| jrL||
||||d| _n	||
|||d| _t	|
|||d| _|rh|rdtd | _nt|
| _|rz||
|||d| _n
||
||||d| _t|
| _t|
| _|| _|| _| jrt| jj d d S d S )N)dim_outdropoutglu)	query_dimheadsdim_headcontext_dimr   )r   r   r   r   )r   r   r   r   r   z is using checkpointing)super__init__ATTENTION_MODESff_inintis_resnn	LayerNormnorm_inFeedForwardr   disable_self_attnattn1ff
ValueErrorattn2norm2norm1norm3switch_temporal_ca_to_sa
checkpointprint	__class__r   )r	   dimn_headsd_headr   r   gated_ffr1   r   r!   	inner_dim	attn_moder(   disable_temporal_crossattentionr0   attn_clsr3   r   r   r      sb   




zVideoTransformerBlock.__init__r
   r   r   returnc                 C   s&   | j rt | j|||S | j|||dS )N)r   )r1   _forward)r	   r
   r   r   r   r   r   r   f   s   zVideoTransformerBlock.forwardc           
      C   s  | j s|sJ | j r|r| j |ksJ | j p|}|j\}}}t|d|d}|d ur|jd |jd kr|jd | dkr|jd | }|jd | }|| |jd krt|||ddd d ddf ||dd|| d|jd }n#|dd |jd d|jd }n|dd |jd d|jd }| jr|}	| | |}| jr||	7 }| jr| j	| 
||d| }n
| 	| 
|| }| jd ur| jr| | || }n| j| ||d| }|}	| | |}| jr||	7 }t|d||| ||d}|S )	Nz(b t) s c -> (b s) t c)tr      r   z(b s) t c -> (b t) s c)sbcr?   )r   shape	rearrangereshapeexpandr!   r&   r#   r(   r)   r.   r,   r0   r-   r*   r/   )
r	   r
   r   r   BSCbatch_videos	n_spatialx_skipr   r   r   r>   n   sH   
$"
zVideoTransformerBlock._forwardc                 C   s   | j jd jS )NrA   )r*   netweight)r	   r   r   r   get_last_layer   s   z$VideoTransformerBlock.get_last_layer)r   NTTNFNr   FFFr   )r   r   r   CrossAttentionMemoryEfficientCrossAttentionr    r   torchTensorr"   r   r>   rR   __classcell__r   r   r<   r   r      s8    
Q

0r   c                       s   e Zd Z																dd	ed
edef fddZ				ddejde	ej de	ej de	e de	ej dejfddZ
  ZS )SpatialVideoTransformerr@   r   FNfixed      ?r   '  merge_strategymerge_factormax_time_embed_periodc                    s   t  j|||| ||d
 || _|| _|| _|	|t	 
|| |r+|t 	
fddt| jD | _	t
| j	t
| jksRJ || _|| _| jd }tt| j|t t|| j| _t||
d| _d S )N)depthr   	attn_typeuse_checkpointr   
use_linearr(   c                    s,   g | ]}t 	
 d qS ))	r   r   r   r1   r!   r8   r9   r(   r:   )r   ).0_r9   r1   r(   r:   r   r!   r8   n_time_mix_headstime_context_dimtime_mix_d_headtime_mix_inner_dimr   r   r   
<listcomp>   s"    z4SpatialVideoTransformer.__init__.<locals>.<listcomp>   )alphar\   )r   r   
time_depthr_   r^   r"   r$   
ModuleListrange
time_stacklentransformer_blocksuse_spatial_contextin_channels
Sequentialr   SiLUtime_pos_embedr   
time_mixer)r	   rt   r5   r6   r_   r   rb   r   rs   r   r\   r]   rg   r!   r1   rm   r9   r(   r:   r^   time_embed_dimr<   re   r   r      sL    


z SpatialVideoTransformer.__init__r
   r   time_contextr   image_only_indicatorr=   c                 C   s<  |j \}}}}|j d }	|}
|d u r|j d }d }t|r|}| jrB|jdks/J d|j |}|d d | }t|d|| d}n|d ur\| js\t|d|| d}|jdkr\t|d}| |}| jsi| |}t|d}| jrv| |}t	j
||jd	}t|d
|j d | d}t|d}t|| jd| jd}| |}|d d d d d f }tt| j| jD ]!\}\}}|||d}|}|| }||||d}| j|||d}q|j d |	kr|j d |	 dkr|j d |	 }|j|	|g|j dd  R  jdd}| jr| |}t|d||d}| js| |}||
 }|S )Nr      z.n dims of spatial context should be 3 but are zb ... -> (b n) ...)nr   zb c -> b 1 czb c h w -> b (h w) c)devicezt -> b t)rD   zb t -> (b t)F)repeat_only
max_periodrB   )r   r   )	x_spatial
x_temporalr{   r@   )r4   zb (h w) c -> b c h w)hw)rF   existsrs   ndimrepeatrG   normrb   proj_inrU   aranger~   r   rt   r^   rw   	enumerateziprr   rp   rx   rH   meanproj_out)r	   r
   r   rz   r   r{   rd   r   r   batch_inx_inspatial_contexttime_context_first_timestep
num_framest_embembit_block	mix_blockx_mixr}   outr   r   r   r      s~   











 &

zSpatialVideoTransformer.forward)r@   r   FNFNrY   rZ   NFFr@   r   FFr[   )NNNN)r   r   r   strfloatr"   r   rU   rV   Optionalr   rW   r   r   r<   r   rX      sR    VrX   )rU   modules.attentionmodules.diffusionmodules.utilr   r   r   r$   ru   r   Moduler   SpatialTransformerrX   r   r   r   r   <module>   s     