o
    9i]                     @   sz  d dl Z d dlZd dlmZmZmZ d dlZd dlZd dl	m
Z
 d dlmZ d dlmZ e eZzd dlZd dlZdZW n   dZed Y dd	lmZmZ d
d Zdd Zd(ddZG dd de
jZG dd de
jZG dd de
jZG dd deZ G dd de
jZ!G dd de
jZ"G dd deZ#d)d d!Z$G d"d# d#e
jZ%G d$d% d%e
jZ&G d&d' d'e
jZ'dS )*    N)AnyCallableOptional	rearrange)versionTFz+no module 'xformers'. Processing without...   )LinearAttentionMemoryEfficientCrossAttentionc                 C   s   t | jdks	J |d }td|d  }ttj|tjd|  }|j| j	d}| 
 dddf |dddf  }tjt|t|gdd}|d dkrZtjj|d}|S )	a  
    This matches the implementation in Denoising Diffusion Probabilistic Models:
    From Fairseq.
    Build sinusoidal embeddings.
    This matches the implementation in tensor2tensor, but differs slightly
    from the description in Section 3.5 of "Attention Is All You Need".
          i'  )dtype)deviceNdim)r   r   r   r   )lenshapemathlogtorchexparangefloat32tor   floatcatsincosnn
functionalpad)	timestepsembedding_dimhalf_dimemb r%   L/data/cameron/vidgen/generative-models/sgm/modules/diffusionmodules/model.pyget_timestep_embedding   s   $r'   c                 C   s   | t |  S N)r   sigmoidxr%   r%   r&   nonlinearity/   s   r,       c                 C   s   t jj|| dddS )Ngư>T)
num_groupsnum_channelsepsaffine)r   r   	GroupNorm)in_channelsr.   r%   r%   r&   	Normalize4   s   r4   c                       $   e Zd Z fddZdd Z  ZS )Upsamplec                    s6   t    || _| jrtjj||dddd| _d S d S )Nr   r   kernel_sizestridepaddingsuper__init__	with_convr   r   Conv2dconvselfr3   r>   	__class__r%   r&   r=   ;   s   

zUpsample.__init__c                 C   s(   t jjj|ddd}| jr| |}|S )Ng       @nearest)scale_factormode)r   r   r   interpolater>   r@   )rB   r+   r%   r%   r&   forwardC   s   
zUpsample.forward__name__
__module____qualname__r=   rI   __classcell__r%   r%   rC   r&   r6   :   s    r6   c                       r5   )
Downsamplec                    s6   t    || _| jrtjj||dddd| _d S d S )Nr   r   r   r7   r;   rA   rC   r%   r&   r=   K   s   

zDownsample.__init__c                 C   sF   | j rd}tjjj||ddd}| |}|S tjjj|ddd}|S )N)r   r   r   r   constantr   )rG   valuer   )r8   r9   )r>   r   r   r   r    r@   
avg_pool2d)rB   r+   r    r%   r%   r&   rI   T   s   
zDownsample.forwardrJ   r%   r%   rC   r&   rO   J   s    	rO   c                       s.   e Zd Zdddd fdd
Zdd Z  ZS )	ResnetBlockNFi   )out_channelsconv_shortcuttemb_channelsc                   s   t    || _|d u r|n|}|| _|| _t|| _tjj	||dddd| _
|dkr3tj||| _t|| _tj|| _tjj	||dddd| _| j| jkrp| jrbtjj	||dddd| _d S tjj	||dddd| _d S d S )Nr   r   r7   r   )r<   r=   r3   rT   use_conv_shortcutr4   norm1r   r   r?   conv1Linear	temb_projnorm2Dropoutdropoutconv2rU   nin_shortcut)rB   r3   rT   rU   r^   rV   rC   r%   r&   r=   _   s2   
	





zResnetBlock.__init__c                 C   s   |}|  |}t|}| |}|d ur'|| t|d d d d d d f  }| |}t|}| |}| |}| j| jkrQ| j	rL| 
|}|| S | |}|| S r(   )rX   r,   rY   r[   r\   r^   r_   r3   rT   rW   rU   r`   )rB   r+   tembhr%   r%   r&   rI      s    

&




zResnetBlock.forwardrJ   r%   r%   rC   r&   rS   ^   s    $rS   c                       s    e Zd ZdZ fddZ  ZS )LinAttnBlockzto match AttnBlock usagec                    s   t  j|d|d d S )Nr   )r   headsdim_head)r<   r=   rB   r3   rC   r%   r&   r=      s   zLinAttnBlock.__init__)rK   rL   rM   __doc__r=   rN   r%   r%   rC   r&   rc      s    rc   c                       s:   e Zd Z fddZdejdejfddZdd Z  ZS )		AttnBlockc                    s~   t    || _t|| _tjj||dddd| _tjj||dddd| _	tjj||dddd| _
tjj||dddd| _d S Nr   r   r7   )r<   r=   r3   r4   normr   r   r?   qkvproj_outrf   rC   r%   r&   r=      s   





zAttnBlock.__init__h_returnc           	      C   sv   |  |}| |}| |}| |}|j\}}}}tdd |||f\}}}tjj	|||}t
|d||||dS )Nc                 S   s   t | d S )Nzb c h w -> b 1 (h w) c)r   
contiguousr*   r%   r%   r&   <lambda>   s    z%AttnBlock.attention.<locals>.<lambda>zb 1 (h w) c -> b c h w)rb   wcb)rj   rk   rl   rm   r   mapr   r   r   scaled_dot_product_attentionr   )	rB   ro   rk   rl   rm   ru   rt   rb   rs   r%   r%   r&   	attention   s   




zAttnBlock.attentionc                 K       |}|  |}| |}|| S r(   rx   rn   rB   r+   kwargsro   r%   r%   r&   rI         

zAttnBlock.forward)	rK   rL   rM   r=   r   Tensorrx   rI   rN   r%   r%   rC   r&   rh      s    rh   c                       s>   e Zd ZdZ fddZdejdejfddZdd	 Z  Z	S )
MemoryEfficientAttnBlockz
    Uses xformers efficient implementation,
    see https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
    Note: this is a single-head self-attention operation
    c                    s   t    || _t|| _tjj||dddd| _tjj||dddd| _	tjj||dddd| _
tjj||dddd| _d | _d S ri   )r<   r=   r3   r4   rj   r   r   r?   rk   rl   rm   rn   attention_oprf   rC   r%   r&   r=      s    






z!MemoryEfficientAttnBlock.__init__ro   rp   c                    s   |  |}| |}| |}| |}|j\ }}tdd |||f\}}}t fdd|||f\}}}tjj|||d | j	d}|
d d|jd dddd |jd }t|d	 ||d
S )Nc                 S   s
   t | dS )Nb c h w -> b (h w) cr   r*   r%   r%   r&   rr      s   
 z4MemoryEfficientAttnBlock.attention.<locals>.<lambda>c                    s@   |  d | jd ddddd d | jd  S )Nr   r   r   r   )	unsqueezereshaper   permuterq   )tBCr%   r&   rr      s
    )	attn_biasopr   r   r   r   b (h w) c -> b c h w)ru   rb   rs   rt   )rj   rk   rl   rm   r   rv   xformersopsmemory_efficient_attentionr   r   r   r   r   )rB   ro   rk   rl   rm   HWoutr%   r   r&   rx      s&   




z"MemoryEfficientAttnBlock.attentionc                 K   ry   r(   rz   r{   r%   r%   r&   rI     r}   z MemoryEfficientAttnBlock.forward)
rK   rL   rM   rg   r=   r   r~   rx   rI   rN   r%   r%   rC   r&   r      s
    r   c                       s   e Zd Zd fdd	Z  ZS )$MemoryEfficientCrossAttentionWrapperNc           
         sD   |j \}}}}t|d}t j|||d}	t|	d|||d}	||	 S )Nr   )contextmaskr   )rb   rs   rt   )r   r   r<   rI   )
rB   r+   r   r   unused_kwargsru   rt   rb   rs   r   rC   r%   r&   rI     s
   
z,MemoryEfficientCrossAttentionWrapper.forwardNN)rK   rL   rM   rI   rN   r%   r%   rC   r&   r     s    r   vanillac                 C   s   |dv sJ d| dt tjt dk r(|dkr(ts&J dtj dd}td	| d
|  d |dkrB|d u s>J t| S |dkrStd|  d t| S t	dkrb| |d< t
di |S |dkrkt| S t| S )N)r   vanilla-xformersmemory-efficient-cross-attnlinearnonez
attn_type z unknownz2.0.0r   z'We do not support vanilla attention in za anymore, as it is too expensive. Please install xformers via e.g. 'pip install xformers==0.0.16'r   zmaking attention of type 'z' with z in_channelsr   z'building MemoryEfficientAttnBlock with z in_channels...r   	query_dimr%   )r   parser   __version__XFORMERS_IS_AVAILABLElogpyinforh   r   typer   r   Identityrc   )r3   	attn_typeattn_kwargsr%   r%   r&   	make_attn  s0   



r   c                       s>   e Zd Zddddddd fdd
Zdd
dZdd Z  ZS )Modelr   r                 TFr   )ch_multr^   resamp_with_convuse_timestepuse_linear_attnr   c             
      s  t    |r	d}|| _| jd | _t|| _|| _|	| _|| _|
| _	| j	rBt
 | _t
tj
| j| jtj
| j| jg| j_tj
j|| jdddd| _|	}dt| }t
 | _t| jD ]Z}t
 }t
 }|||  }|||  }t| jD ]}|t||| j|d |}||v r|t||d q|t
 }||_||_|| jd krt|||_|d	 }| j| qat
 | _t||| j|d| j_t||d| j_t||| j|d| j_ t
 | _!t"t| jD ]k}t
 }t
 }|||  }|||  }t| jd D ]-}|| jkr|||  }|t|| || j|d |}||v r4|t||d qt
 }||_||_|d
krOt#|||_$|d	 }| j!%d
| qt&|| _'tj
j||dddd| _(d S )Nr   r   r   r   r7   r   r3   rT   rV   r^   r   r   r   ))r<   r=   chtemb_chr   num_resolutionsnum_res_blocks
resolutionr3   r   r   Modulera   
ModuleListr   rZ   denser?   conv_intupledownrangeappendrS   r   blockattnrO   
downsamplemidblock_1attn_1block_2upreversedr6   upsampleinsertr4   norm_outconv_out)rB   r   out_chr   r   attn_resolutionsr^   r   r3   r   r   r   r   curr_res
in_ch_multi_levelr   r   block_in	block_outi_blockr   skip_inr   rC   r%   r&   r=   9  s   











zModel.__init__Nc           	      C   s  |d urt j||fdd}| jr1|d usJ t|| j}| jjd |}t|}| jjd |}nd }| |g}t	| j
D ]D}t	| jD ](}| j| j| |d |}t| j| jdkrh| j| j| |}|| qE|| j
d kr|| j| |d  q>|d }| j||}| j|}| j||}tt	| j
D ]@}t	| jd D ]*}| j| j| t j|| gdd|}t| j| jdkr| j| j| |}q|dkr| j| |}q| |}t|}| |}|S )Nr   r   r   )r   r   r   r'   r   ra   r   r,   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   popr   r   r   )	rB   r+   r   r   ra   hsr   r   rb   r%   r%   r&   rI     sL   

zModel.forwardc                 C      | j jS r(   r   weightrB   r%   r%   r&   get_last_layer     zModel.get_last_layerr   )rK   rL   rM   r=   rI   r   rN   r%   r%   rC   r&   r   8  s    
y1r   c                       s4   e Zd Zddddddd fdd
Zd	d
 Z  ZS )Encoderr   r   TFr   )r   r^   r   double_zr   r   c             
      s  t    |r	d}|| _d| _t|| _|| _|	| _|| _t	j
j|| jdddd| _|	}dt| }|| _t
 | _t| jD ]Z}t
 }t
 }|||  }|||  }t| jD ]}|t||| j|d |}||v rw|t||d qZt
 }||_||_|| jd krt|||_|d	 }| j| q?t
 | _t||| j|d| j_t||d| j_t||| j|d| j_t|| _t	j
j||rd	|
 n|
dddd| _ d S )
Nr   r   r   r   r7   r   r   r   r   )!r<   r=   r   r   r   r   r   r   r3   r   r   r?   r   r   r   r   r   r   r   rS   r   r   r   r   rO   r   r   r   r   r   r4   r   r   )rB   r   r   r   r   r   r^   r   r3   r   
z_channelsr   r   r   ignore_kwargsr   r   r   r   r   r   r   r   r   rC   r%   r&   r=     s~   






zEncoder.__init__c                 C   s   d }|  |g}t| jD ]D}t| jD ](}| j| j| |d |}t| j| jdkr7| j| j| |}|| q|| jd krQ|| j| 	|d  q|d }| j
||}| j
|}| j
||}| |}t|}| |}|S )Nr   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r,   r   )rB   r+   ra   r   r   r   rb   r%   r%   r&   rI   @  s&   

zEncoder.forwardrJ   r%   r%   rC   r&   r     s    Xr   c                       sh   e Zd Zdddddddd fdd
Zd	efd
dZd	efddZd	efddZdd Zdd Z	  Z
S )Decoderr   r   TFr   )r   r^   r   give_pre_endtanh_outr   r   c             
      s  t    |r	d}|| _d| _t|| _|| _|	| _|| _|| _	|| _
dt| }||| jd   }|	d| jd   }d|
||f| _td| jt| j |  }|  }|  }tjj|
|dddd| _t | _|||| j|d	| j_|||d
| j_|||| j|d	| j_t | _t t!| jD ]T}t }t }|||  }t!| jd D ]}|"|||| j|d	 |}||v r|"|||d
 qt }||_#||_$|dkrt%|||_&|d }| j'd| qt(|| _)|||dddd| _*d S )Nr   r   r   r   r   z+Working with z of shape {} = {} dimensions.r   r7   r   r   )+r<   r=   r   r   r   r   r   r   r3   r   r   r   z_shaper   r   formatnpprod
_make_attn_make_resblock
_make_convr   r   r?   r   r   r   r   r   r   r   r   r   r   r   r   r   r6   r   r   r4   r   r   )rB   r   r   r   r   r   r^   r   r3   r   r   r   r   r   r   ignorekwargsr   r   r   make_attn_clsmake_resblock_clsmake_conv_clsr   r   r   r   r   r   rC   r%   r&   r=   ]  s   








zDecoder.__init__rp   c                 C      t S r(   )r   r   r%   r%   r&   r        zDecoder._make_attnc                 C   r   r(   )rS   r   r%   r%   r&   r     r   zDecoder._make_resblockc                 C   s   t jjS r(   )r   r   r?   r   r%   r%   r&   r     r   zDecoder._make_convc                 K   r   r(   r   )rB   r|   r%   r%   r&   r     r   zDecoder.get_last_layerc                 K   s&  |j | _d }| |}| jj||fi |}| jj|fi |}| jj||fi |}tt| j	D ]?}t| j
d D ])}| j| j| ||fi |}t| j| jdkrd| j| j| |fi |}q;|dkrq| j| |}q2| jrw|S | |}t|}| j|fi |}| jrt|}|S )Nr   r   )r   last_z_shaper   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r,   r   r   r   tanh)rB   zr|   ra   rb   r   r   r%   r%   r&   rI     s.   


zDecoder.forward)rK   rL   rM   r=   r   r   r   r   r   rI   rN   r%   r%   rC   r&   r   \  s    br   )r-   )r   N)(loggingr   typingr   r   r   numpyr   r   torch.nnr   einopsr   	packagingr   	getLoggerrK   r   r   xformers.opsr   warningmodules.attentionr	   r
   r'   r,   r4   r   r6   rO   rS   rc   rh   r   r   r   r   r   r   r%   r%   r%   r&   <module>   s@   

<+@
	# 0u