o
     ݱiD"                     @   s   d dl Z d dlmZ d dlm  mZ ddlmZ G dd dejZ	G dd dejZ
G dd	 d	ejZG d
d dejZG dd dejZG dd dejZdS )    N   )apply_rotary_embedc                       s0   e Zd ZdZejdf fdd	Zdd Z  ZS )MLPz-Multilayer perceptron with two hidden layers.        c                    s>   t    t||| _| | _t||| _t|| _d S N)	super__init__nnLinearfc1actfc2Dropoutdrop)selfin_dim
hidden_dimout_dimr   r   	__class__ C/data/cameron/vidgen/unified-world-model/models/common/attention.pyr      s
   
zMLP.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r   )r   r   r   r   )r   xr   r   r   forward   s   




zMLP.forward)	__name__
__module____qualname____doc__r	   GELUr   r   __classcell__r   r   r   r   r      s    r   c                       s:   e Zd ZdZ							d fdd	Zdd
dZ  ZS )	AttentionzMultiheaded self-attention.   Fr   r   Tc	           	         s   t    || _|| | _tj||d |d| _t|| _t||| _	t|| _
|| _|| _|r?|dkr?td d| _n|| _| jsVtjt||f dd| _d S d S )N   biasr   z6Disabling torch spda kernel for block causal attentionF)requires_grad)r   r   	num_headshead_dimr	   r
   qkvr   	attn_dropproj	proj_drop	is_causalcausal_blockprintuse_sdpa	Parametertorchonesboolcausal_block_mat)	r   dimr&   qkv_biasr)   r+   r,   r-   r/   r   r   r   r      s&   

zAttention.__init__Nc                    s  |j \}}}|d urt|j dksJ |d} |||d j| j ddddd}|d\}}	}
|d urEt||}t|	|}	 j	rWt
j||	|
| jj jd}ns||	dd  jd	  } jr|d u smJ | j dksvJ | j }tj fd
dt|D  }tjtj|||jddd }t| |}||dd||td}|d ur|| td}|jdd} |}||
 }|dd|||} |} |}|S )Nr"   r      r      )	dropout_pr,         ?c                    s   g | ]} j qS r   )r4   ).0_r   r   r   
<listcomp>]   s    z%Attention.forward.<locals>.<listcomp>)device)diagonalz-infr5   ) shapelen	unsqueezer(   reshaper&   permuteunbindr   r/   Fscaled_dot_product_attentionr)   pr,   	transposer'   r-   r1   
block_diagrangetriur2   rA   r3   logical_andmasked_fillviewfloatsoftmaxr*   r+   )r   r   	pos_embed	attn_maskBNDr(   qkvattn
num_blocksZblock_diag_matZtriu_matmaskr   r?   r   r   ?   sP   






zAttention.forward)r!   Fr   r   Fr   TNNr   r   r   r   r   r   r   r   r   r   r   r       s    !r    c                	       B   e Zd ZdZdddddejejddf	 fdd	Zdd
dZ  Z	S )AttentionBlockzmMultiheaded self-attention block.

    Combines an attention layer and an MLP with residual connections.
    r!         @Fr   r   c              	      sV   t    ||| _t||||||	|
d| _||| _t|t|| |||d| _d S )Nr5   r&   r6   r)   r+   r,   r-   r   r   r   r   r   )	r   r   norm1r    r^   norm2r   intmlpr   r5   r&   	mlp_ratior6   r   r)   r   normr,   r-   r   r   r   r   v   s&   


	
zAttentionBlock.__init__Nc                 C   s0   ||  | ||| }|| | | }|S r   )r^   rh   rk   ri   )r   r   rV   rW   r   r   r   r      s   zAttentionBlock.forwardra   
r   r   r   r   r	   r   	LayerNormr   r   r   r   r   r   r   rd   p   s    !rd   c                       s6   e Zd ZdZ					d fdd	Zdd	d
Z  ZS )CrossAttentionzMultiheaded cross-attention.r!   Fr   Tc                    sr   t    || _|| | _tj|||d| _tj||d |d| _t|| _	t||| _
t|| _|| _d S )Nr#   r7   )r   r   r&   r'   r	   r
   r[   kvr   r)   r*   r+   use_spda)r   r5   r&   r6   r)   r+   rs   r   r   r   r      s   
	

zCrossAttention.__init__Nc                 C   s&  |j \}}}| |||| j|| j dddd}|d ur$t||}|j \}}	}| |||	d| j|| j ddddd}
|
d\}}|d urQt||}| jr`t	j
|||| jjd}n||dd | jd	  }|jdd
}| |}|| }|dd|||}| |}| |}|S )Nr   r7   r   r"   r8   )r9   r:   r;   r<   rC   )rD   r[   rG   r&   rH   r   rr   rI   rs   rJ   rK   r)   rL   rM   r'   rU   r*   r+   )r   r   cx_pos_embedc_pos_embedrX   ZNxrZ   r[   ZNcrr   r\   r]   xattnr   r   r   r      s2   




zCrossAttention.forward)r!   Fr   r   Tra   rb   r   r   r   r   rq      s    rq   c                       s>   e Zd ZdZdddddejejf fdd	Zdd	d
Z  Z	S )CrossAttentionBlockzsMultiheaded cross-attention block.

    Combines a cross-attention layer and an MLP with residual connections.
    r!   re   Fr   c	           	         sR   t    ||| _t|||||d| _||| _t|t|| |||d| _d S )Nr5   r&   r6   r)   r+   rg   )	r   r   rh   rq   rw   ri   r   rj   rk   )	r   r5   r&   rm   r6   r   r)   r   rn   r   r   r   r      s"   



zCrossAttentionBlock.__init__Nc                 C   s2   ||  || ||| }|| | | }|S r   )rw   rh   rk   ri   r   r   rt   ru   rv   r   r   r   r      s   zCrossAttentionBlock.forwardra   ro   r   r   r   r   rx      s    rx   c                	       rc   )MixedAttentionBlockzMultiheaded mixed-attention block.

    Combines a self-attention, a cross-attention, and an MLP with residual connections.
    r!   re   Fr   r   c              	      st   t    ||| _t||||||	|
d| _||| _t|||||d| _||| _t	|t
|| |||d| _d S )Nrf   ry   rg   )r   r   rh   r    r^   ri   rq   rw   norm3r   rj   rk   rl   r   r   r   r     s6   


	

zMixedAttentionBlock.__init__Nc                 C   sH   ||  | || }|| | |||| }|| | | }|S r   )r^   rh   rw   ri   rk   r|   rz   r   r   r   r   -  s   zMixedAttentionBlock.forwardra   ro   r   r   r   r   r{      s    )r{   )r1   torch.nnr	   torch.nn.functional
functionalrJ   utilsr   Moduler   r    rd   rq   rx   r{   r   r   r   r   <module>   s    U-8)