o
     ݱi                     @   st   d dl mZ ddlmZmZmZ dd ZG dd dejZG dd	 d	ejZ	G d
d dejZ
G dd dejZdS )    N   )	AttentionCrossAttentionMLPc                 C   s   | d| d  | d S )Nr   )	unsqueeze)xshiftscale r
   I/data/cameron/vidgen/unified-world-model/models/common/adaln_attention.pymodulate   s   r   c                	       sB   e Zd ZdZdddddejejddf	 fdd	Zdd
dZ  Z	S )AdaLNAttentionBlockzNMultiheaded self-attention block with adaptive layer normalization modulation.         @F        r   c              	      s   t    |	|ddd| _t||||||
|d| _|	|ddd| _t|t|| |||d| _t	
t	 t	|d| | _d S )NFư>elementwise_affineeps)dim	num_headsqkv_bias	attn_drop	proj_drop	is_causalcausal_blockin_dim
hidden_dimout_dimactdrop   )super__init__norm1r   attnnorm2r   intmlpnn
SequentialSiLULinearadaLN_modulation)selfr   cond_dimr   	mlp_ratior   r!   r   r    normr   r   	__class__r
   r   r$      s.   
	

zAdaLNAttentionBlock.__init__Nc                 C   st   |  |jddd\}}}}}	}
||d| t| |||||  }||
d| t| |||	  }|S Nr"   r   r   )r.   chunkr   r&   r   r%   r)   r'   )r/   r   cond	pos_embed	attn_mask	shift_msa	scale_msagate_msa	shift_mlp	scale_mlpgate_mlpr
   r
   r   forward3   s   zAdaLNAttentionBlock.forwardNN
__name__
__module____qualname____doc__r*   GELU	LayerNormr$   rA   __classcell__r
   r
   r3   r   r   
   s    &r   c                       >   e Zd ZdZdddddejejf fdd	Zdd	d
Z  Z	S )AdaLNCrossAttentionBlockzOMultiheaded cross-attention block with adaptive layer normalization modulation.r   r   Fr   c
           
         s|   t    |	|ddd| _t|||||d| _|	|ddd| _t|t|| |||d| _t	
t	 t	|d| | _d S NFr   r   )r   r   r   r   r   r   r"   )r#   r$   r%   r   xattnr'   r   r(   r)   r*   r+   r,   r-   r.   
r/   r   r0   r   r1   r   r!   r   r    r2   r3   r
   r   r$   C   s*   


z!AdaLNCrossAttentionBlock.__init__Nc                 C   sv   |  |jddd\}}}}	}
}||d| t|||| |||  }||d| t| ||	|
  }|S r5   )r.   r7   r   rN   r   r%   r)   r'   r/   r   cr8   Zx_pos_embedZc_pos_embedr;   r<   r=   r>   r?   r@   r
   r
   r   rA   e   s   z AdaLNCrossAttentionBlock.forwardrB   rC   r
   r
   r3   r   rL   @   s    "rL   c                       rK   )AdaLNHybridAttentionBlockzPMultiheaded hybrid attention block with adaptive layer normalization modulation.r   r   Fr   c
           
         s   t    |	|ddd| _t|||||d| _|	|ddd| _t|||||d| _|	|ddd| _t	|t
|| |||d| _tt t|d| | _d S rM   )r#   r$   r%   r   r&   r'   r   rN   norm3r   r(   r)   r*   r+   r,   r-   r.   rO   r3   r
   r   r$   u   s:   


z"AdaLNHybridAttentionBlock.__init__Nc                 C   s   |  |jddd\}}}}	}
}||d| t| ||||  }|| | |||| }||d| t| 	||	|
  }|S r5   )
r.   r7   r   r&   r   r%   rN   r'   r)   rS   rP   r
   r
   r   rA      s   z!AdaLNHybridAttentionBlock.forwardrB   rC   r
   r
   r3   r   rR   r   s    *rR   c                       s$   e Zd Z fddZdd Z  ZS )AdaLNFinalLayerc                    sL   t    tj|ddd| _t||| _tt t|d| | _	d S )NFr   r      )
r#   r$   r*   rI   r2   r-   linearr+   r,   r.   )r/   r   r0   r3   r
   r   r$      s   

zAdaLNFinalLayer.__init__c                 C   s4   |  |jddd\}}| t| |||}|S )NrU   r   r6   )r.   r7   rV   r   r2   )r/   r   r8   r   r	   r
   r
   r   rA      s   zAdaLNFinalLayer.forward)rD   rE   rF   r$   rA   rJ   r
   r
   r3   r   rT      s    	rT   )torch.nnr*   	attentionr   r   r   r   Moduler   rL   rR   rT   r
   r
   r
   r   <module>   s    62;