o
    ԫiK                     @   s   d dl mZ d dlmZmZmZ d dlmZ ddlT ddl	m
Z
 ddlmZmZmZ ddlmZ d	d
lmZmZ G dd deZG dd dejZG dd deZG dd deZG dd dejZdS )    )partial)ListOptionalUnion)	rearrange   )*SpatialVideoTransformer)BasicTransformerTimeMixBlock'PostHocSpatialTransformerWithTimeMixing0PostHocSpatialTransformerWithTimeMixingAndMotion)default   )AlphaBlender	get_alphac                       s   e Zd Z										ddeded	ed
eeee f dededee de	de	dede	de	de	f fddZ
	ddejdejdedeej dejf
 fddZ  ZS )VideoResBlockr   fixed      ?NF   channelsemb_channelsdropoutvideo_kernel_sizemerge_strategymerge_factorout_channelsuse_convuse_scale_shift_normdimsuse_checkpointupdownc                    sb   t  j||||||	|
|||d
 tt||||dt||dddd||dd| _t||dd| _d S )Nr   r   r   r   r    r!   r"   r   FT
r   r   r   r   r   r!   r"   kernel_sizer    exchange_temb_dimsb t -> b 1 t 1 1alphar   rearrange_pattern)super__init__ResBlockr   
time_stackr   
time_mixer)selfr   r   r   r   r   r   r   r   r   r   r    r!   r"   	__class__ R/data/cameron/vidgen/generative-models/sgm/modules/diffusionmodules/video_model.pyr,      s>   zVideoResBlock.__init__xembnum_video_framesimage_only_indicatorreturnc                    s^   t  ||}t|d|d}t|d|d}| |t|d|d}| j|||d}t|d}|S )N(b t) c h w -> b c t h wt(b t) ... -> b t ...	x_spatial
x_temporalr8   b c t h w -> (b t) c h w)r+   forwardr   r.   r/   )r0   r5   r6   r7   r8   x_mixr1   r3   r4   rB   C   s   
zVideoResBlock.forward)
r   r   r   NFFr   FFF)N)__name__
__module____qualname__intfloatr   r   strr   boolr,   thTensorrB   __classcell__r3   r3   r1   r4   r      sd    	
6r   c                ?       sR  e Zd Z																					
						d8dededededededee dededee dededededededeee ef dee d ee d!ed"ee d#ed$ed%e	d&ed'e	d(eeee f d)ed*ee d+ed,ef> fd-d.Z
					d9d/ejd0ejd1eej d2eej d3eej d4ee d5eej fd6d7Z  ZS ):	VideoUNet        r   r         Tr   NFr   r   r   softmaxr   '  in_channelsmodel_channelsr   num_res_blocksattention_resolutionsr   channel_multconv_resampler   num_classesr    	num_headsnum_head_channelsnum_heads_upsampler   resblock_updowntransformer_depthtransformer_depth_middlecontext_dimtime_downuptime_context_dimextra_ff_mix_layeruse_spatial_contextr   r   spatial_transformer_attn_typer   use_linear_in_transformeradm_in_channelsdisable_temporal_crossattentionmax_ddpm_temb_periodc            .         s2  t    |d usJ |dkr|}|dkr|dksJ |dkr%|dks%J || _|| _|| _t|tr:t||g }t||d }|| _	|| _
| _|| _|| _|
| _|| _|| _|| _|| _|d } tt|| t t| | | _| jd urt| jtrt|
| | _nO| jdkrtd td| | _n>| jdkrtt|tt|| t t| | | _n"| jdkr|d usJ ttt|| t t| | | _nt ttt |	||ddd	g| _!|| _"|g}!|}"d}#		 	
	
d 	f
dd	}$	
	
ddd}%t#|D ]\}&}'t$|D ]Q}(|%||"| |'| |	||d
g})|'| }"|#|v rR|dkr<|"| }*n|"| }|}*|)%|$|"||*||& ||d
d | j!%t|)  |  j"|"7  _"|!%|" q|&t|d kr|#d9 }#|"}+| j!%t|r|%||"| |+|	||ddnt&|"||	|+|d |+}"|!%|" |  j"|"7  _"q|dkr|"| }*n|"| }|}*t|%||"| d |	||d
|$|"||*|||d|%||"d | |	||d
| _'|  j"|"7  _"tg | _(t)t#|d d d D ]\}&}'t$|d D ]~},|!* }-|%||"|- | ||' |	||d
g})||' }"|#|v rC|dkr-|"| }*n|"| }|}*|)%|$|"||*||& ||d
d |&ro|,|kro|"}+|#d }#|)%|re|%||"| |+|	||ddnt+|"||	|+|d | j(%t|)  |  j"|"7  _"qqtt,|"t t-t |	||ddd	| _.d S )NrS   rQ   
continuous'setting up linear c_adm embedding layerr   timestep
sequentialr   paddingFc                    s.   t | ||f||	|| dS )N)depthrc   re   r   ff_inrg   r   r   
checkpoint
use_linear	attn_modedisable_self_attnrk   max_time_embed_periodr	   chr]   dim_headrs   rc   r    disabled_sa
rk   r   rf   rl   r   r   rh   re   ri   rg   r3   r4   get_attention_layer   s(   	z/VideoUNet.__init__.<locals>.get_attention_layerc                 S   s    t | |||||||||	|
|dS )N)r   r   r   r   r   r   r   r   r    r   r"   r!   )r   )r   r   r   r{   time_embed_dimr   out_chr   r    r   r"   r!   r3   r3   r4   get_resblock   s   z(VideoUNet.__init__.<locals>.get_resblock)
r   r   r   r{   r   r   r   r   r    r   rs   rc   r    r}   r   T)r   r   r   r{   r   r   r   r   r    r   r"   r   r   
third_down)
r   r   r   r{   r   r   r   r   r    r   rs   rc   r    )
r   r   r   r{   r   r   r   r   r    r   )r   r   r   r{   r   r   r   r   r    r   r!   r   r   third_upr   NFFFF)/r+   r,   rV   rW   r   
isinstancerG   lenr   rX   rY   r   rZ   r[   r\   r    r]   r^   r_   nn
SequentiallinearSiLU
time_embed	Embedding	label_embprintLinearTimestep
ValueError
ModuleListTimestepEmbedSequentialconv_ndinput_blocks_feature_size	enumeraterangeappend
Downsamplemiddle_blockoutput_blockslistpopUpsamplenormalizationzero_moduleout).r0   rV   rW   r   rX   rY   r   rZ   r[   r   r\   r    r]   r^   r_   r   r`   ra   rb   rc   rd   re   rf   rg   r   r   rh   r   ri   rj   rk   rl   r   input_block_chansr{   dsr   r   levelmult_layersr|   r   iichr1   r~   r4   r,   Z   s  
"




	 (






"


A
zVideoUNet.__init__r5   	timestepscontextytime_contextr7   r8   c              	      sX  |d u| j d uksJ dg }t|| jdd}	| |	}
| j d ur4|jd |jd ks-J |
| | }
|jd   fdd}|}| jD ]}|||
||||d}||}|| qD| j||
||||d}||}| j	D ]}|
 }|jd |jd kr|jd |jd  dkr|jd |jd  }|j|jd |g|jdd  R  jdd	}n1|jd |jd  dkr|j|jd |jd  dd	}n|d d j|jd g|jdd  R  }| d
kr	|jd |jd ks|jd |jd kr	tjjj||jd |jd fddd}tj||gdd	}|||
||||d}||}ql||j}| |S )NSmust specify y if and only if the model is class-conditional -> no, relax this TODOFrepeat_onlyr   c                    s   t | tjr| jd  kr| S | jd   dkr2| jd   }| j |g| jdd  R  jddS  | jd  dkrG| j | jd  ddS | d   S )Nr   r   dim)r   rK   rL   shapereshapemeanrepeat_interleave)r<   n	batch_refr3   r4   _align_batch  s   &z'VideoUNet.forward.<locals>._align_batch)r   r8   r   r7   r   r   rQ   r   r   bilinear)sizemodealign_corners)r\   timestep_embeddingrW   r   r   r   r   r   r   r   r   r   r   r   expandr   rK   r   
functionalinterpolatecattypedtyper   )r0   r5   r   r   r   r   r7   r8   hst_embr6   r   hmoduleskipr   r3   r   r4   rB     sv   






. *8

zVideoUNet.forward)rO   rP   Tr   NFrS   rS   rS   FFr   NNFNFFr   r   rT   r   FNFrU   )NNNNNrD   rE   rF   rG   rH   r   rJ   r   r   rI   r,   rK   rL   rB   rM   r3   r3   r1   r4   rN   Y   s    	
   krN   c                       s   e Zd Z										d deded	ed
edededededededededef fddZ						d!dej	de
ej	 de
ej	 de
e de
ej	 de
ej	 de
ej	 f fddZ  ZS )"#PostHocAttentionBlockWithTimeMixingFrO   r   r   TrT   rV   n_headsd_headr    use_new_attention_orderr   rg   r   r   apply_sigmoid_to_mergert   rw   rk   c                    s   t  j|||||d || }tt||||||||dg| _|| _| jd }tt| j|t	 t|| j| _
|| _|dkrL| dt|	g n&|dksT|dkrc| dtjt|	g n|dkrkd | _ntd	| tjt|| j|
d
| _d S )N)r    r   )r   ru   rt   rw   rk   rQ   r   
mix_factorlearnedlearned_with_imagesfixed_with_imagesunknown merge strategy apply_sigmoid)r+   r,   r   r   r   time_mix_blocksrV   r   r   r   time_mix_time_embedrg   register_bufferrK   rL   register_parameter	Parameterr   r   	functoolsr   r   get_alpha_fn)r0   rV   r   r   r    r   r   rg   r   r   r   rt   rw   rk   	inner_dimr   r1   r3   r4   r,     sX   


z,PostHocAttentionBlockWithTimeMixing.__init__Nr5   r   r   r   r8   	conv_viewconv_motionc                    s  |d urt |j\}}}	}
t|rt|d}| jr(t|d d df d|	|
 d}t |}t|d}|}tj	||j
d}t|d|jd | d}t|d	}t|| jd
d}| |}|d d d d d f }|| }| jd |||d}| j|d}|| d| |  }t|d|	|
d}|S )Nzb t ... -> (b t) ...r   zb ... -> (b n) ...)r   zb c h w -> b (h w) c)devicezt -> b t)bzb t -> (b t)Fr   )r   r   r8         ?zb (h w) c -> b c h w)r   w)NotImplementedErrorr   existsr   rg   repeatr+   rB   rK   aranger   r   rV   r   r   r   )r0   r5   r   r   r   r8   r   r   r   r   r   rC   
num_framesr   r6   r)   r1   r3   r4   rB   W  s4   



z+PostHocAttentionBlockWithTimeMixing.forward)
FFrO   Fr   r   TFrT   F)NNNNNN)rD   rE   rF   rG   rJ   rH   rI   r,   rK   rL   r   rB   rM   r3   r3   r1   r4   r     sv    	
G	r   c                !       s   e Zd Z													d#ded	ed
edeeee f dedededee dedededededededef  fddZ				d$de
jde
jdedee
j dee
j dee
j d e
jf fd!d"Z  ZS )%PostHocResBlockWithTimer   r   r   TNFr   r   r   r   time_kernel_sizer   r   r   r   r   r   r   r    r!   r"   time_mix_legacyreplicate_bugc                    s   t  j|||||	|
||||d
 tt||||dt||dddd||dd| _|| _| jro|dkr<| dt|g n&|dksD|d	krS| 	dtj
t|g n|d
kr[d | _ntd| tjt|| j|d| _d S 	 t||dd| _d S )Nr#   r   FTr$   r   r   r   r   r   r   r   a  *****************************************************************************************
GRAVE WARNING: YOU'RE USING THE BUGGY LEGACY ALPHABLENDER!!! ARE YOU SURE YOU WANT THIS?!
*****************************************************************************************r'   r(   )r+   r,   r-   r   r   r   r   rK   rL   r   r   r   r   r   r   r   r   r   logpywarningLegacyAlphaBlenderWithBugr/   r   )r0   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r   r   r1   r3   r4   r,     sb   z PostHocResBlockWithTime.__init__r5   r6   r7   r8   	cond_viewcond_motionr9   c           	         s   t  ||}t|d|d}t|d|d}| |t|d|d}| jr>| j|d d}||j| d| |j|  }n
| j|||d d}t|d}|S )	Nr:   r;   r=   rO   r   r   r>   rA   )	r+   rB   r   r   r   r   tor   r/   )	r0   r5   r6   r7   r8   r   r   rC   r)   r1   r3   r4   rB     s   	&

zPostHocResBlockWithTime.forward)r   r   r   TNFFr   FFFTF)NNN)rD   rE   rF   rG   rH   r   r   rJ   r   r,   rK   rL   rB   rM   r3   r3   r1   r4   r     s    	
Yr   c                [       s  e Zd Z																									
	
	
														dJdededededededee dededee dedededededededed eee ef d!ee d"ee d#ed$ee d%ee d&ee d'ed(ed)e	d*ed+ed,ed-e	d.eeee f d/ed0ed1ee d2ed3ed4ed5ed6ed7ed8ed9ed:efZ fd;d<Z
									dKd=ejd>ejd?eej d@eej dAeej dBeej dCee dDeej dEeej dFeej dGee fdHdIZ  ZS )LSpatialUNetModelWithTimerO   rP   Tr   NFrS   r   r   r   rT   r   rU   rV   rW   r   rX   rY   r   rZ   r[   r   r\   r    r]   r^   r_   r   r`   r   use_spatial_transformerra   rb   rc   rd   re   view_context_dimmotion_context_dimrf   rg   time_block_merge_strategytime_block_merge_factorview_block_merge_factormotion_block_merge_factorrh   r   ri   legacyrj   use_temporal_resblockrk   r   rl   replicate_time_mix_buguse_motion_attentionuse_camera_embuse_3d_attentionseparate_motion_merge_factorc.           <         s  t    r|d usJ |d ursJ |dkr|}|dkr%|dks%J |dkr/|dks/J |_|_|_t|trDt||g }t||d }|_	|_
_|_|_|
_|_|_|_|_|%_|d }.tt||.t t|.|._jd urtjtrt|
|._nOjdkrtd td|._n>jdkrtt|tt||.t t|.|._n"jdkr߈ d usJ ttt |.t t|.|._nt tt t!|	||ddd	g_"|_#|g}/|}0d}1		 	
	
d 	
fdd	}2	
	
dfdd	}3t$|D ]\}4}5t%|D ]]}6|3|!|0|.|5| |	||d
g}7|5| }0|1|v ry|dkrW|0| }8n|0| }|}8|#rirg|0| n|}8|7&|2|0||8||4 ||d
d j"&t |7   j#|07  _#|/&|0 q1|4t|d kr|1d9 }1|0}9j"&t |r|3|!|0|.|9|	||ddnt'|0||	|9|d |9}0|/&|0  j#|07  _#q)|dkr|0| }8n|0| }|}8|#rr|0| n|}8t |3|!|0|.d |	||d
|2|0||8|||d|3|!|0d |.|	||d
_( j#|07  _#tg _)t*t$|d d d D ]\}4}5t%|d D ]}:|/+ };|3|!|0|; |.||5 |	||d
g}7||5 }0|1|v r|dkr`|0| }8n|0| }|}8|#rrrp|0| n|}8|7&|2|0||8||4 ||d
d |4r|:|kr|0}9|1d }1|7&|r|3|!|0|.|9|	||ddnt,|0||	|9|d j)&t |7   j#|07  _#q4q*tt-|0t t.t!|	||ddd	_/d S )NrS   rQ   rm   rn   r   ro   rp   r   rq   Fc                    s   st | |||
	dS r[t| ||fi d|d|dddddd	d
dd d
d	dd|ddd|dddS t| ||f||
	||dS )N)	r    r   r   rt   rg   r   r   rw   rk   rs   rc   re   r   r   rt   rg   r	  r
  r  rj   r   r   merge_factor_motionru   rv   rw   rx   rk   r   ry   )rs   rc   re   r   rt   rg   r   r   ru   rv   rw   rx   rk   r   ry   )r   r   r   rz   )rj   rk   r   rf   rl   r  r   r  rh   r  r  re   r   r
  r	  ri   r  r   rg   r   r3   r4   r   |  s   		
z>SpatialUNetModelWithTime.__init__.<locals>.get_attention_layerc                    sD   j rt| |||||||||	|
| dS t|||||||	|
|d	S )N)r   r   r   r   r   r   r   r   r    r   r"   r!   r   r   )	r   r   r   r   r    r   r   r"   r!   )use_temporal_resblocksr   r-   )r  r  r   r{   r   r   r   r   r    r   r"   r!   )r  r0   r   r3   r4   r     s8   z7SpatialUNetModelWithTime.__init__.<locals>.get_resblock)
r  r  r   r{   r   r   r   r   r    r   r   r   T)r  r  r   r{   r   r   r   r   r    r   r"   r   )
r  r  r   r{   r   r   r   r   r    r   r   )
r  r  r   r{   r   r   r   r   r    r   )r  r  r   r{   r   r   r   r   r    r   r!   r   r   r   )0r+   r,   rV   rW   r   r   rG   r   r   rX   rY   r   rZ   r[   r\   r    r]   r^   r_   r  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )<r0   rV   rW   r   rX   rY   r   rZ   r[   r   r\   r    r]   r^   r_   r   r`   r   r   ra   rb   rc   rd   re   r   r   rf   rg   r  r  r  r  rh   r   ri   r  rj   r  rk   r   rl   r  r  r	  r
  r  r   r   r{   r   r   r   r   r   r   r   r|   r   r   r   r1   )rj   rk   r   rf   rl   r  r   r  r0   r  rh   r  r  re   r   r
  r	  ri   r  r   rg   r   r4   r,     s  
0




	4V,





"


G
z!SpatialUNetModelWithTime.__init__r5   r   r   r   camr   r7   r8   r   r   	time_stepc                 C   sT  |d u| j d uksJ dg }t|| jdd}| |}t|d j  }| j d ur?|jd |jd ks8J || 	| }|}t
| jD ]\}}|||||||	|
|||d||d}|| qF| j||||||	|
|||d||d}t
| jD ]"\}}tj|| gdd	}|||||||	|
|||d
||d}q|||j}| |S )Nr   Fr   r   zencoder_{}_{})	r   r  r8   r   r   r   r7   r  namezmiddle_{}_0r   r   zdecoder_{}_{})r\   r   rW   r   rI   datacpunumpyr   r   r   r   formatr   r   r   rK   r   r   r   r   r   )r0   r5   r   r   r   r  r   r7   r8   r   r   r  r   r   r6   timer   r   r   r3   r3   r4   rB     st   





z SpatialUNetModelWithTime.forward)(rO   rP   Tr   NFrS   rS   rS   FFFFr   NNFNNNFFr   r   r   r   rT   r   FTNTFTrU   FFFFF)	NNNNNNNNNr   r3   r3   r1   r4   r     sN   	
 !"#$%&'()*+,-.   L	
r   N)r   r   typingr   r   r   einopsr   $modules.diffusionmodules.openaimodelmodules.video_attentionr
   modules.spacetime_attentionr   r   r   utilr   r   r   r-   r   r   ModulerN   AttentionBlockr   r   r   r3   r3   r3   r4   <module>   s     H   <rr