o
    #i5x                     @   s  d dl mZ d dlmZ d dlmZmZ d dlZd dlZd dlm	Z	 d dl
m	  mZ d dlmZ ddlmZ dd	 Zd
d Zd>ddZd?ddZdd Zdd Zd@ddZdAddZG dd de	jZG d d! d!e	jZG d"d# d#eZG d$d% d%e	jZG d&d' d'e	jZG d(d) d)e	jZG d*d+ d+e	jZG d,d- d-e	jZ G d.d/ d/e	jZ!G d0d1 d1e	jZ"G d2d3 d3e	jZ#G d4d5 d5e	jZ$G d6d7 d7e	jZ%G d8d9 d9e%Z&G d:d; d;e	jZ'G d<d= d=e'Z(dS )B    )Callable)beartype)TupleUnionN)	rearrange   )
checkpointc           	      C   s   t | jdksJ dt | j| j\}}}}}t| d} || |} tj| d||d} t| d} || |} tj| d|||d} | S )	N   z-input should be 5D tensor, but got {}D tensorb c t h w -> (b t) c h w(b t) c h w -> b c t h wbtzb c t h w -> (b h w) c tz(b h w) c t -> b c t h w)r   hw)lenshapeformateinopsr   )	xZblock_sZblock_ttembBCTHW r   </data/cameron/vidgen/VidTok/vidtok/modules/model_3dcausal.pyspatial_temporal_resblk   s   "

r   c                 C   s   | t |  S N)torchsigmoidr   r   r   r   nonlinearity      r#       	groupnormc                 C   s4   |dkrt jj|| dddS |dkrt| ddS d S )Nr&   ư>T)
num_groupsnum_channelsepsaffineZ	layernorm)r)   r*   )r    nn	GroupNorm	LayerNorm)in_channelsr(   	norm_typer   r   r   	Normalize   s
   r1   constant        c                 C   sp   |dv sJ |dk r| d n| j | d }d| }|dkr+tj| g ||R |dS tj| g ||R |dS )N)r3   	replicatereflectr   r   )r   r   r3   )valuemode)ndimFpad)r   r<   dimpad_moder7   Zdims_from_rightzerosr   r   r   
pad_at_dim%   s    r@   c                 C   s   | | dkS )Nr   r   )numdenr   r   r   divisible_by.      rC   c                 C   s   t | d S )N   )rC   )nr   r   r   is_odd2   rD   rG   c                 C   s   t | tr| S | f| S r   )
isinstancetuple)r   lengthr   r   r   
cast_tuple6   s   rK   Fc                 C   s   t | ||dS )Nuse_checkpointr0   )AttnBlockWrapper)r/   rM   r0   r   r   r   	make_attn:   r$   rO   c                       s&   e Zd Zd fdd	Zdd Z  ZS )r.   r'   c                    s*   t  j|i | tjj||dd| _d S )NT)r*   elementwise_affine)super__init__r    r,   r.   norm)selfr)   r*   argskwargs	__class__r   r   rR   ?   s   zLayerNorm.__init__c                 C   s~   |  dkrt|d}| |}t|d}|S |  dkr.t|d}| |}t|d}|S t|d}| |}t|d}|S )	Nr	   zb c t h w -> b t h w czb t h w c -> b c t h w   zb c h w -> b h w czb h w c -> b c h wzb c s -> b s czb s c -> b c s)r=   r   rS   rT   r   r   r   r   forwardC   s   


	





zLayerNorm.forward)r'   __name__
__module____qualname__rR   r[   __classcell__r   r   rW   r   r.   >   s    r.   c                       sD   e Zd Zd fdd	ZdejdejfddZd	d
 Zdd Z  Z	S )	AttnBlockFr&   c                    s   t    || _|| _t|| jd| _tjj||dddd| _	tjj||dddd| _
tjj||dddd| _tjj||dddd| _|| _d S )Nr0   r   r   kernel_sizestridepadding)rQ   rR   r/   r0   r1   rS   r    r,   Conv2dqkvproj_outrM   rT   r/   rM   r0   rW   r   r   rR   T   s   

zAttnBlock.__init__h_returnc           	      C   sv   |  |}| |}| |}| |}|j\}}}}tdd |||f\}}}tjj	|||}t
|d||||dS )Nc                 S      t | d S )Nzb c h w -> b 1 (h w) cr   
contiguousr"   r   r   r   <lambda>h       z%AttnBlock.attention.<locals>.<lambda>zb 1 (h w) c -> b c h wr   r   cr   )rS   rh   ri   rj   r   mapr    r,   
functionalscaled_dot_product_attentionr   )	rT   rm   rh   ri   rj   r   ru   r   r   r   r   r   	attentiona   s   



zAttnBlock.attentionc                 K   s(   | j rt| j|f|  | j S | |S r   rM   r   _forward
parameters)rT   r   rV   r   r   r   r[   l   s   
zAttnBlock.forwardc                 K   s    |}|  |}| |}|| S r   )ry   rk   )rT   r   rV   rm   r   r   r   r{   r   s   

zAttnBlock._forwardFr&   )
r]   r^   r_   rR   r    Tensorry   r[   r{   r`   r   r   rW   r   ra   S   s
    ra   c                       s4   e Zd Zd	 fdd	ZdejdejfddZ  ZS )
rN   Fr&   c                    s^   t  j|||d t||ddd| _t||ddd| _t||ddd| _t||ddd| _d S )NrL   r   rd   re   )rQ   rR   CausalConv3drh   ri   rj   rk   rl   rW   r   r   rR   z   s
   zAttnBlockWrapper.__init__rm   rn   c                 C   s   |j d }t|d}| |}t|d|d}| |}| |}| |}|j \}}}}	}
tdd |||f\}}}tjj	
|||}t|d|	|
||dS )	Nr   r
   r   r   c                 S   ro   )Nzb c t h w -> b t (h w) crp   r"   r   r   r   rr      rs   z,AttnBlockWrapper.attention.<locals>.<lambda>zb t (h w) c -> b c t h wrt   )r   r   rS   rh   ri   rj   rv   r    r,   rw   rx   )rT   rm   r   rh   ri   rj   r   ru   r   r   r   r   r   r   ry      s   





zAttnBlockWrapper.attentionr}   )r]   r^   r_   rR   r    r~   ry   r`   r   r   rW   r   rN   y   s    rN   c                       s0   e Zd Zeddef fddZdd Z  ZS )CausalConv1dr3   rd   c                    sl   t    |dd}|dd}|| _||d  d|  | _| jdf| _tj|||f||d|| _d S )Ndilationr   re   r   re   r   )	rQ   rR   popr>   time_padtime_causal_paddingr,   Conv1dconv)rT   chan_inchan_outrd   r>   rV   r   re   rW   r   r   rR      s   
"zCausalConv1d.__init__c                 C   6   | j |jd k r| jnd}tj|| j|d}| |S NrE   r3   r8   r   r   r>   r;   r<   r   r   rT   r   r>   r   r   r   r[      s   
zCausalConv1d.forwardr3   )r]   r^   r_   r   intrR   r[   r`   r   r   rW   r   r      s    
r   c                	       sB   e Zd Zeddeeeeeef f f fddZdd Z  Z	S )r   r3   rd   c                    s
  t    t|d}|dd}|dd}t|d}t|d}|\}}	}
t|	r-t|
s/J || _|d |d  d|d   }|d |	d  d|d   }|d |	d  d|d   }|| _|d ||d  |d ||d  |df| _tj	|||f||d|| _
d S )N   r   r   re   r   rE   r   )rQ   rR   rK   r   rG   r>   r   r   r,   Conv3dr   )rT   r   r   rd   r>   rV   r   re   Ztime_kernel_sizeZheight_kernel_sizeZwidth_kernel_sizer   Z
height_padZ	width_padrW   r   r   rR      s*   






"	zCausalConv3d.__init__c                 C   r   r   r   r   r   r   r   r[      s   
zCausalConv3d.forwardr   )
r]   r^   r_   r   r   r   r   rR   r[   r`   r   r   rW   r   r      s    (r   c                       $   e Zd Z fddZdd Z  ZS )Upsamplec                    s<   t    || _|| _| jrtjj||dddd| _d S d S )Nr   r   rc   rQ   rR   r/   	with_convr    r,   rg   r   rT   r/   r   rW   r   r   rR         
zUpsample.__init__c                 C   s8   t jjj|t jddd|j}| jr| |}|S )N       @nearestscale_factorr9   )	r    r,   rw   interpolatetofloat32dtyper   r   rZ   r   r   r   r[      s   $
zUpsample.forwardr\   r   r   rW   r   r          r   c                       r   )
Downsamplec                    s<   t    || _|| _| jrtjj||dddd| _d S d S )Nr   rE   r   rc   r   r   rW   r   r   rR      r   zDownsample.__init__c                 C   sF   | j rd}tjjj||ddd}| |}|S tjjj|ddd}|S )N)r   r   r   r   r3   r   r9   r7   rE   r   )r   r    r,   rw   r<   r   
avg_pool2d)rT   r   r<   r   r   r   r[      s   
zDownsample.forwardr\   r   r   rW   r   r      r   r   c                       .   e Zd Z	ddef fddZdd Z  ZS )TimeDownsampleResCausal2xr   
mix_factorc                    sL   t    d| _tjddd| _t||ddd| _tj	t
|g| _d S )N)r   r   r   )r   r   r   )rE   r   r   )re   r   )rQ   rR   rd   r,   	AvgPool3davg_poolr   r   r    	Parameterr~   r   rT   r/   out_channelsr   rW   r   r   rR      s
   
z"TimeDownsampleResCausal2x.__init__c                 C   sJ   t | j}d}| t jjj||ddd}| |}|| d| |  S )N)r   r   r   r   r   r   r3   r   r   r   )r    r!   r   r   r,   rw   r<   r   )rT   r   alphar<   x1x2r   r   r   r[      s
   
z!TimeDownsampleResCausal2x.forwardr   r]   r^   r_   floatrR   r[   r`   r   r   rW   r   r      s    r   c                       r   )TimeUpsampleResCausal2xr   r   c                    s2   t    t||d| _tjt|g| _d S )Nr   )	rQ   rR   r   r   r    r,   r   r~   r   r   rW   r   r   rR      s   
z TimeUpsampleResCausal2x.__init__c                 C   sR   t | j}t jjj|t jg ddd|j}| 	|}|| d| |  S )N)r         ?r   r   r   r   )
r    r!   r   r,   rw   r   r   r   r   r   )rT   r   r   x_r   r   r   r[     s    
zTimeUpsampleResCausal2x.forwardr   r   r   r   rW   r   r      s    r   c                       <   e Zd Zdddddd fdd
Zdd	 Zdd
dZ  ZS )ResnetBlockNF   r&   r   conv_shortcuttemb_channelsrM   r0   c                   s   t    || _|d u r|n|}|| _|| _|| _t|| jd| _tj	j
||dddd| _|dkr9tj	||| _t|| jd| _tj	|| _tj	j
||dddd| _| j| jkrv| jrjtj	j
||dddd| _ntj	j
||dddd| _|| _d S )Nrb   r   r   rc   r   )rQ   rR   r/   r   use_conv_shortcutr0   r1   norm1r    r,   rg   conv1Linear	temb_projnorm2Dropoutdropoutconv2r   nin_shortcutrM   rT   r/   r   r   r   r   rM   r0   rW   r   r   rR     s$   

zResnetBlock.__init__c                 C   :   | j r|d u sJ dt| j|f|  | j S | ||S Nz%checkpointing not supported with tembrz   rT   r   r   r   r   r   r[   6     zResnetBlock.forwardc                 C   s   |}|  |}t|}| |}|d ur'|| t|d d d d d d f  }| |}t|}| |}| |}| j| jkrQ| j	rL| 
|}|| S | |}|| S r   )r   r#   r   r   r   r   r   r/   r   r   r   r   )rT   r   r   r   r   r   r   r{   =  s    

&




zResnetBlock._forwardr   r]   r^   r_   rR   r[   r{   r`   r   r   rW   r   r     s    !r   c                       r   )ResnetCausalBlockNFr   r&   r   c                   s   t    || _|d u r|n|}|| _|| _|| _t|| jd| _t||ddd| _	|dkr6t
j||| _t|| jd| _t
j|| _t||ddd| _| j| jkrj| jrat||ddd| _n	t||ddd| _|| _d S Nrb   r   r   r   r   )rQ   rR   r/   r   r   r0   r1   r   r   r   r    r,   r   r   r   r   r   r   r   r   rM   r   rW   r   r   rR   U  sL   


zResnetCausalBlock.__init__c                 C   r   r   rz   r   r   r   r   r[     r   zResnetCausalBlock.forwardc                 C   s   |j d }|}t|d}| |}t|}t|d|d}| |}|d ur8|| t|d d d d d d f  }t|d}| |}t|}| |}t|d|d}| |}| j	| j
krn| jri| |}|| S | |}|| S )Nr   r
   r   r   r   r   r   r#   r   r   r   r   r   r/   r   r   r   r   rT   r   r   r   r   r   r   r   r{     s*   



&





zResnetCausalBlock._forwardr   r   r   r   rW   r   r   T  s    4r   c                       s>   e Zd Zddddddd fdd
Zdd	 Zdd
dZ  ZS )ResnetCausalBlock1DNFr   r&   )r   r   r   	zero_initrM   r0   c          	         s  t    || _|d u r|n|}|| _|| _|| _t|| jd| _t||ddd| _	|dkr6t
j||| _t|| jd| _t
j|| _t||ddd| _| j| jkrj| jrat||ddd| _n	t||ddd| _|r|| jjjj  | jjjj  || _d S r   )rQ   rR   r/   r   r   r0   r1   r   r   r   r    r,   r   r   r   r   r   r   r   r   r   weightdatazero_biasrM   )	rT   r/   r   r   r   r   r   rM   r0   rW   r   r   rR     s*   

zResnetCausalBlock1D.__init__c                 C   r   r   rz   r   r   r   r   r[     r   zResnetCausalBlock1D.forwardc                 C   s   |j d }|}t|d|d}| |}t|}t|d|d}| |}|d ur:|| t|d d d d d d f  }t|d|d}| |}t|}| |}t|d|d}| |}| j	| j
krr| jrm| |}|| S | |}|| S )Nr   z(b s) c t -> (b t) c sr   z(b t) c s -> (b s) c tr   r   r   r   r   r{     s*   


&




zResnetCausalBlock1D._forwardr   r   r   r   rW   r   r     s    &r   c                       s`   e Zd Zdddddddd fdd
Zd	efd
dZd	efddZd	efddZdd Z  Z	S )EncoderCausal3Dr   rE   rY      Nr4   Tr&   )ch_mult
spatial_dstempo_dsr   resamp_with_convdouble_zr0   c                   sv  t    |dd}|| _d| _t|| _|| _|	| _|| _	|dd| _
d| _|  }|  }|  }||	| jddd| _d	t| }|| _|d u rVttd| jd n|| _|d u rg| jd
 | jd gn|| _t | _t | _t| jD ]}}|||  }|||  }t }t }t }t }t| jD ]#}|t||| j||| j	d |t||| j|d|| j	d |}qt }||_||_t }||_||_|| jv rt |||_!|| jv rt"|||_!| j| | j| qyt | _#|||| j||| j	d| j#_$||| j	d| j#_%|||| j||| j	d| j#_&t'|| j	d| _(|||r2d
|
 n|
ddd| _)d S )NrM   Fr   fix_encoderTr   r   r   r   rE   r/   r   r   r   rM   r0   r/   r   r   r   r   rM   r0   rb   )*rQ   rR   getchtemb_chr   num_resolutionsnum_res_blocksr/   r0   r   	is_causal
_make_conv
_make_attn_make_resblockconv_inrI   
in_ch_multlistranger   r   r,   
ModuleListdowndown_temporalappendr   r   Moduleblockattnr   
downsampler   midblock_1attn_1block_2r1   norm_outconv_out)rT   r   out_chr   r   r   r   r   r   r/   
z_channelsr   r0   ignore_kwargsrM   make_conv_clsmake_attn_clsmake_resblock_clsr   i_levelblock_in	block_outr   r   Zblock_temporalZattn_temporali_blockr   r   rW   r   r   rR     s   

""








zEncoderCausal3D.__init__rn   c                 C      t S r   rO   rT   r   r   r   r   n     zEncoderCausal3D._make_attnc                 C   r  r   r   r
  r   r   r   r   q  r  zEncoderCausal3D._make_resblockc                 C   r  r   r   r
  r   r   r   r   t  r  zEncoderCausal3D._make_convc                 C   sp  d }|j \}}}}}| |g}t| jD ]d}	t| jD ]}
t|d | j|	 j|
 | j|	 j|
 |}|	| q|	| j
v ryt|d d}| j|	 |}tj|d||d}|j \}}}}}|	| jv rl| j|	 |}|	| |j \}}}}}q|d }| j||}| j|}| j||}|j \}}}}}t|d}| |}t|}tj|d|d}| |}|S )Nr2   r
   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r   )rT   r   r   r   _r   r   r   hsr  r  r   Zhtmpr   r   r   r   r[   w  s<   $




zEncoderCausal3D.forward)
r]   r^   r_   rR   r   r   r   r   r[   r`   r   r   rW   r   r     s    wr   c                       (   e Zd Z fddZ fddZ  ZS )EncoderCausal3DPaddingc                    s\   t  j|i | |dd| _|dd| _| jd | _| jr*|  D ]}d|_q$d S d S )Ntime_downsample_factorrY   init_pad_moder5   r   F)	rQ   rR   r   r  r  time_paddingr   r|   requires_gradrT   rU   r   paramrW   r   r   rR     s   zEncoderCausal3DPadding.__init__c                    s>   |j d }|| j dkrt|| jdfd| jdd}t |S )NrE   r   r4   )r=   r>   r7   )r   r  r@   r  r  rQ   r[   )rT   r   Z	video_lenrW   r   r   r[     s   
zEncoderCausal3DPadding.forwardr\   r   r   rW   r   r    s    
r  c                	       sj   e Zd Zddddddddd fdd	
Zd
efddZd
efddZd
efddZdd Zdd Z	  Z
S )DecoderCausal3Dr   Nr4   TFr&   )r   
spatial_ustempo_usr   r   give_pre_endtanh_outr0   c                   s  t    |dd}|| _d| _t|| _|| _|	| _|| _	|| _
|| _|dd| _dt| }||| jd   }|  }|  }|  }||
|ddd| _t | _|||| j||| jd	| j_|||| jd
| j_|||| j||| jd	| j_|d u rttd| jn|| _|d u rddgn|| _t | _tt| jD ]G}t }t }|||  }t| jd D ]}|t ||| j||| jd	 |}qt }||_!||_"|| jv rt#|||_$| j%d| qt | _&tt| jD ]P}t }t }|||  }|||  }t| jd D ]}|t'||| j|d|| jd |}qt }||_!||_"|| jv r>t(|||_$| j&%d| qt)|| jd| _*|||ddd| _+d S )NrM   Fr   fix_decoderr   r   r   r   r   rL   rE   Tr   rb   ),rQ   rR   r   r   r   r   r   r   r/   r  r  r0   r  rI   r   r   r   r   r,   r   r   r   r   r   r   r   r  r  r   upreversedr   r   r   r   r   upsampleinsertup_temporalr   r   r1   r   r   )rT   r   r   r   r  r  r   r   r   r/   r   r  r  r0   ZignorekwargsrM   r   r  r  r  r  r  r   r   r  r  r  r"  rW   r   r   rR     s   










zDecoderCausal3D.__init__rn   c                 C   r  r   r	  r
  r   r   r   r   -  r  zDecoderCausal3D._make_attnc                 C   r  r   r  r
  r   r   r   r   0  r  zDecoderCausal3D._make_resblockc                 C   r  r   r  r
  r   r   r   r   3  r  zDecoderCausal3D._make_convc                 K   s    z| j jjW S    | j j Y S r   )r   r   r   )rT   rV   r   r   r   get_last_layer6  s   zDecoderCausal3D.get_last_layerc                 K   s  d }|j \}}}}}| |}	| jj|	|fi |}	| jj|	fi |}	| jj|	|fi |}	tt| jD ]X}
t| j	d D ]}t
|	| j|
 j| | j|
 j| |}	q?|
| jv rt|	d}	| j|
 |	}	tj|	d||d}	|	j \}}}}}|
| jv r| j|
 |	}	|	j \}}}}}q6| jr|	S |	j \}}}}}t|	d}	| |	}	t|	d|d}	t|	}	| j|	fi |}	| jrt|	}	|	S )Nr   r
   r   r   r   )r   r   r   r   r   r   r  r   r   r   r   r  r   r"  r  r   r   r   r  r  r   r#   r   r  r    tanh)rT   zrV   r   r   r  r   r   r   r   r  r  r   r   r   r   r[   <  s>   
 



zDecoderCausal3D.forward)r]   r^   r_   rR   r   r   r   r   r#  r[   r`   r   r   rW   r   r    s    xr  c                       r  )DecoderCausal3DPaddingc                    sN   t  j|i | |dd| _| jd | _| jr#|  D ]}d|_qd S d S )Nr  rY   r   F)rQ   rR   r   r  r  r  r|   r  r  rW   r   r   rR   j  s   zDecoderCausal3DPadding.__init__c                    s4   t  |}|d d d d | jd d d d d f S r   )rQ   r[   r  rZ   rW   r   r   r[   s  s   (zDecoderCausal3DPadding.forwardr\   r   r   rW   r   r&  i  s    	r&  )r%   r&   )r2   r3   r4   r   r}   ))typingr   r   beartype.typingr   r   r   r    torch.nnr,   Ztorch.nn.functionalrw   r;   r   utilr   r   r#   r1   r@   rC   rG   rK   rO   r   r.   ra   rN   r   r   r   r   r   r   r   r   r   r   r  r  r&  r   r   r   r   <module>   sF    

	

&&@WK - 6