o
    9iT                     @   s   d dl mZ d dlZd dlm  mZ ddlT ddlm	Z	m
Z
mZmZmZ G dd dejZG dd	 d	ejZG d
d deZG dd deZdS )    )partialN   )*)AlphaBlender	get_alphalinearmixed_checkpointtimestep_embeddingc                   @   s   e Zd ZdddZdS )TimeMixSequentialNc                 C   s   | D ]}||||}q|S )N )selfxcontext	timestepslayerr   r   I/data/cameron/vidgen/generative-models/sgm/modules/spacetime_attention.pyforward   s   zTimeMixSequential.forwardNN)__name__
__module____qualname__r   r   r   r   r   r
      s    r
   c                	       st   e Zd ZeedZ											d fdd	Z	dd	ejd
ejde	dejfddZ
dddZdd Z  ZS )BasicTransformerTimeMixBlock)softmaxzsoftmax-xformers        NTFr   c                    sN  t    | j| }|	p|
d u| _|
d u r|}
t|| |
ks!J |
|k| _| jr8t|| _t	||
||d| _|| _
|| _| jrL||
||||d| _n	||
|||d| _t	|
|||d| _|rh|rdtd | _nt|
| _|rz||
|||d| _n
||
||||d| _t|
| _t|
| _|| _|| _| jrt| jj d d S d S )N)dim_outdropoutglu)	query_dimheadsdim_headcontext_dimr   )r   r   r   r   )r   r    r   r   r   z is using checkpointing)super__init__ATTENTION_MODESff_inintis_resnn	LayerNormnorm_inFeedForwardr   disable_self_attnattn1ff
ValueErrorattn2norm2norm1norm3switch_temporal_ca_to_sa
checkpointlogpyinfo	__class__r   )r   dimn_headsd_headr   r    gated_ffr4   r   r$   	inner_dim	attn_moder+   disable_temporal_crossattentionr3   attn_clsr7   r   r   r"      sb   




z%BasicTransformerTimeMixBlock.__init__r   r   r   returnc                 C   s&   | j rt | j|||S | j|||dS )N)r   )r4   _forward)r   r   r   r   r   r   r   r   n   s   z$BasicTransformerTimeMixBlock.forwardc                 C   s"  | j s|sJ | j r|r| j |ksJ | j p|}|j\}}}t|d|d}| jr9|}| | |}| jr9||7 }| jrI| j| ||d| }n
| | || }| j	d urr| j
rf| 	| || }n| j	| ||d| }|}| | |}| jr||7 }t|d||| ||d}|S )Nz(b t) s c -> (b s) t c)tr   z(b s) t c -> (b t) s c)sbcrC   )r   shape	rearranger$   r)   r&   r+   r,   r1   r/   r3   r0   r-   r2   )r   r   r   r   BSCx_skipr   r   r   rB   v   s2   

z%BasicTransformerTimeMixBlock._forwardc                 C   s   | j jd jS )N)r-   netweight)r   r   r   r   get_last_layer   s   z+BasicTransformerTimeMixBlock.get_last_layer)r   NTTNFNr   FFFr   )r   r   r   CrossAttentionMemoryEfficientCrossAttentionr#   r"   torchTensorr%   r   rB   rQ   __classcell__r   r   r@   r   r      s8    
Q

!r   c                       s   e Zd Z																			dd
ededededef
 fddZ								ddej	de
ej	 de
ej	 de
e de
ej	 de
ej	 de
ej	 de
e de
e dej	fddZ  ZS )'PostHocSpatialTransformerWithTimeMixing   r   FNfixed      ?Tr   '  merge_strategymerge_factorapply_sigmoid_to_mergetime_mix_legacymax_time_embed_periodc                    s  t  j|||| ||d
 || _|| _|| _|	|t	 
|| |r+|t 	
fddt| jD | _	t
| j	t
| jksRJ || _|| _| jd }tt| j|t t|| j| _|| _| jr|
dkr| dt|g n&|
dks|
dkr| dtjt|g n|
d	krd | _ntd
|
 tt|
| j|dd| _d S t||
d| _d S )Ndepthr   	attn_typeuse_checkpointr    
use_linearr+   c                    ,   g | ]}t 	
 d qS )	r   r    r   r4   r$   r<   r=   r+   r>   r   .0_r=   r4   r+   r>   r   r$   r<   n_time_mix_headstime_context_dimtime_mix_d_headtime_mix_inner_dimr   r   r   
<listcomp>   "    zDPostHocSpatialTransformerWithTimeMixing.__init__.<locals>.<listcomp>   rY   
mix_factorlearnedlearned_with_imagesfixed_with_imagesunknown merge strategy Tapply_sigmoidis_attnalphar\   )r!   r"   
time_depthrb   r`   r%   r'   
ModuleListrangetime_mix_blockslentransformer_blocksuse_spatial_contextin_channels
Sequentialr   SiLUtime_mix_time_embedr_   register_bufferrT   rU   register_parameter	Parameterrt   r.   r   r   get_alpha_fnr   
time_mixer)r   r   r9   r:   rb   r   re   r    r   r   r\   r]   r^   rn   r$   r4   r~   r=   r+   r>   r_   r`   time_embed_dimr@   rl   r   r"      sp    


z0PostHocSpatialTransformerWithTimeMixing.__init__r   r   time_contextr   image_only_indicator	cond_viewcond_motion	time_stepnamerA   c
                 C   s  |j \}
}
}}|}d }t|r|}| jr4|jdks!J d|j |}|d d | }t|d|| d}n|d urN| jsNt|d|| d}|jdkrNt|d}| |}| js[| |}t|d}| jrh| |}| j	rq| j
|d}tj||jd	}t|d
|j d | d}t|d}t|| jd| jd}| |}|d d d d d f }tt| j| jD ]?\}\}}|||||	d t| d}|}|| }||||d}| j	r||j| d| |j|  }q| j|||d}q| jr| |}t|d||d}| js| |}|| }|S )N   z.n dims of spatial context should be 3 but are zb ... -> (b n) ...)nr   zb c -> b 1 cb c h w -> b (h w) cr   devicet -> b tr   rF   b t -> (b t)Frepeat_only
max_periodrk   )r   r   r   r   r         ?	x_spatial
x_temporalr   b (h w) c -> b c h whw)rH   existsr   ndimrepeatrI   normre   proj_inr_   r   rT   aranger   r	   r   r`   r   	enumeratezipr   r   strtodtyper   proj_out)r   r   r   r   r   r   r   r   r   r   rk   r   r   x_inspatial_contexttime_context_first_timestepr}   
num_framest_embembit_block	mix_blockx_mixoutr   r   r   r     s~   









&

z/PostHocSpatialTransformerWithTimeMixing.forward)rX   r   FNFNrY   rZ   TNFFrX   r   FFTr[   )NNNNNNNNr   r   r   r   floatboolr%   r"   rT   rU   Optionalr   rV   r   r   r@   r   rW      sv    m	
rW   c                       s   e Zd Z																									dd
edededededef fddZ									d dej	de
ej	 de
ej	 de
ej	 de
e de
ej	 de
ej	 de
ej	 de
e de
e dej	fddZ  ZS )!0PostHocSpatialTransformerWithTimeMixingAndMotionrX   r   FNrY   rZ   Tr   r[   r\   r]   merge_factor_motionr^   r_   r`   c                    s  t  j|||| ||d
 || _|| _|| _|	| _| _|
| _|| _|
|t	
 || |r7|	t
 	
fddt| jD | _t
 
fddt| jD | _t| jt| jksyJ || _|| _| jd }| jr|n| j}t
t||t
 t|| j| _t
t| j|t
 t|| j| _|| _| jr|dkr| dt|g n&|dks|d	kr| dtj
t|g n|d
krd | _ntd| t t!|| j|dd| _"d S t#||d| _$| jrt#||d| _%d S d S )Nra   c                    rf   rg   rh   ri   rl   r   r   rq     rr   zMPostHocSpatialTransformerWithTimeMixingAndMotion.__init__.<locals>.<listcomp>c                    s,   g | ]}t 	
 d qS rg   rh   ri   )r=   r4   r+   r>   r   r$   r<   motion_context_dimrm   ro   rp   r   r   r   rq     rr   rs   rY   rt   ru   rv   rw   rx   Try   r|   )&r!   r"   r~   rb   r`   use_camera_embr   use_3d_attentionseparate_motion_merge_factorr%   r'   r   r   r   motion_blocksr   r   r   r   r   r   r   r   time_mix_motion_embedr_   r   rT   rU   r   r   rt   r.   r   r   r   r   r   time_mixer_motion)r   r   r9   r:   rb   r   re   r    r   r   r   r   adm_in_channelsr   r\   r]   r   r^   rn   r   r$   r4   r~   r=   r+   r>   r_   r`   r   time_embed_channelsr@   )r=   r4   r+   r>   r   r$   r<   r   rm   rn   ro   rp   r   r   r"   a  s     



z9PostHocSpatialTransformerWithTimeMixingAndMotion.__init__r   r   camr   r   r   r   r   r   r   rA   c           &      C   s  |j \}}}|j d | |j d }}|j \}}}}|}d }t|r$|}tjjj|||fdd}|d d d d d f dd|ddd|}|d d d d d f dd|| ddd|}|dddddd|}| 	|}| j
sx| |}t|d}| j
r| |}| jr| j|d	}tj||jd
}t|d|d}t|d}t|| jd| jd}| |}|d d d d d f }| jr| ||||dd d df || d}|d d d d d f }n.tj||jd
}t|d|d}t|d}t|| jd| jd}| |}|d d d d d f }| jr"|d|| ddd|}tt| j| j| jD ]\}\} }!}"| ||d}| jrW|||||| |dddddd||}n|||||| |ddddd|| d|}|| }#|!|#||d}#| jr||j| d| |j|#  }n| j ||#t!|d d d df d|j d | d}| jr||||| ||ddddd|| d|}n|||||| |ddddd|| d|}|| }#|"|#||d}#| jr||j| d| |j|#  }n%| j"r| j#n| j }$|$||#t!|d d d df d|j d | d}|||||| |d|| |}q-| j
rI| $|}t|d||d}| j
sZ| $|}|| }%|%S )Nr   rX   bilinear)sizemoderN   r   r   r   r   r   r   r   r   Fr   rD   rs   r   r   r   r   r   )%rH   r   rT   r'   
functionalinterpolater   reshapepermuter   re   r   rI   r_   r   r   r   r	   r   r`   r   r   r   viewr   r   r   r   r   r   r   r   r   
zeros_liker   r   r   )&r   r   r   r   r   r   r   r   r   r   r   rF   rC   d1vd2rk   rG   r   r   r   r   camera_contextmotion_contextr}   r   r   emb_timeemb_view	num_viewsv_embr   r   
time_block	mot_blockr   motion_mixerr   r   r   r   r     s   ,0





.

02&*42&*(

z8PostHocSpatialTransformerWithTimeMixingAndMotion.forward)rX   r   FNFFFFNNrY   rZ   rZ   TNNFFrX   r   FFTr[   )	NNNNNNNNNr   r   r   r@   r   r   `  s     	
r   )	functoolsr   rT   torch.nn.functionalr'   r   Fmodules.attentionmodules.diffusionmodules.utilr   r   r   r   r	   r   r
   Moduler   SpatialTransformerrW   r   r   r   r   r   <module>   s    	  F