o
    Bfi)                  	   @   s   d dl mZmZmZ d dlZd dlmZmZ d dlmZm	Z	 ddl
mZmZ ddlmZ ddlmZ zd	ejj_d
ejj_W n eefyI   Y nw G dd dejZG dd dejZdS )    )CallableListOptionalN)Tensornn)cat_keep_shapesuncat_with_shapes   )CausalSelfAttentionSelfAttention)Mlp)
LayerScaleFi   c                       s  e Zd Zddddddddejejeeddfdedede	d	e
d
e
de
de	de	de	dedejf dedejf dedejf dedejf de
ddf fddZedeeef dB dedeeef dB fddZd$dedefddZd$dee dee fd d!Zd$dee fd"d#Z  ZS )%SelfAttentionBlock      @FT        Ndim	num_heads	ffn_ratioqkv_bias	proj_biasffn_biasdrop	attn_drop	drop_path	act_layer.
norm_layer
attn_class	ffn_layermask_k_biasreturnc              
      s   t    ||| _|||||||||d| _|	r t||	|dnt | _||| _t	|| }|||||||d| _
|	rDt||	|dnt | _|
| _d S )N)r   r   r   r   	proj_dropr   device)init_valuesr!   )in_featureshidden_featuresr   r   biasr!   )super__init__norm1attnr   r   Identityls1norm2intmlpls2sample_drop_ratio)selfr   r   r   r   r   r   r   r   r"   r   r   r   r   r   r   r!   Zmlp_hidden_dim	__class__ ?/data/cameron/keygrip/volume_dino_tracks/dinov3/layers/block.pyr'      s2   




zSelfAttentionBlock.__init__ropeindicesc                 C   sF   | d u rd S | \}}|j |j ksJ |j dkr|| || fS ||fS )N   )ndim)r6   r7   sincosr4   r4   r5   _maybe_index_ropeI   s   
z$SelfAttentionBlock._maybe_index_ropexc                 C   s   |j \}}}tt|d| j  d}|| }| jrr| jdkrrtj||jdd| }|| }| ||}	| j	| 
||	d}
tj|d| |
||d}tj||jdd| }|| }| | |}tj|d| |||d}|S || | j	| 
||d }|| | | | }|S )z
        This is the reference implementation for a single tensor, matching what is done below for a list.
        We call the list op on [x] instead of this function.
        r	   r   r!   Nr6   r   r   sourceindexalpha)shapemaxr-   r0   trainingtorchrandpermr!   r<   r)   r(   	index_addr+   r.   r,   r/   )r1   r=   r6   b_sample_subset_sizeresidual_scale_factor	indices_1Z
x_subset_1Zrope_subset
residual_1x_attn	indices_2Z
x_subset_2
residual_2x_ffnr4   r4   r5   _forwardW   s:   zSelfAttentionBlock._forwardx_listc              	      s  dd |D } fdd|D }dd t ||D } jr jdkrdd t |||D }dd t ||D }|durH fd	dt ||D }n|}t|\}	}
}t |	|
|} jj||d
} fddt ||||D }dd t |||D }dd t ||D }t|\}	}
} |	}t||
|} j	|} fddt ||||D }|S g }t ||D ]%\}}| 
 j ||d }|  	 | }|| q|}|S )z
        This list operator concatenates the tokens from the list of inputs together to save
        on the elementwise operations. Torch-compile memory-planning allows hiding the overhead
        related to concat ops.
        c                 S   s   g | ]}|j d  qS )r   )rD   .0r=   r4   r4   r5   
<listcomp>   s    z4SelfAttentionBlock._forward_list.<locals>.<listcomp>c                    s$   g | ]}t t|d  j  d qS )r	   )rE   r-   r0   )rW   rJ   r1   r4   r5   rX      s   $ c                 S   s   g | ]\}}|| qS r4   r4   )rW   rJ   rL   r4   r4   r5   rX          r   c                 S   *   g | ]\}}}t j||jd d| qS r>   NrG   rH   r!   rW   r=   rJ   rL   r4   r4   r5   rX          c                 S      g | ]\}}|| qS r4   r4   )rW   r=   rN   r4   r4   r5   rX      rZ   Nc                    s   g | ]
\}}  ||qS r4   )r<   )rW   r6   rN   rY   r4   r5   rX      s    	rope_listc              	      .   g | ]\}}}}t j|d  |||dqS r   r@   )rG   rI   r+   )rW   r=   rO   rN   rM   rY   r4   r5   rX          
c                 S   r[   r\   r]   r^   r4   r4   r5   rX      r_   c                 S   r`   r4   r4   )rW   r=   rQ   r4   r4   r5   rX      rZ   c              	      rc   rd   )rG   rI   r/   )rW   rP   rR   rQ   rM   rY   r4   r5   rX      re   r?   )ziprF   r0   r   r   r(   r)   forward_listr,   r.   r+   r/   append)r1   rU   rb   Zb_listZsample_subset_sizesZresidual_scale_factorsZindices_1_listZx_subset_1_listZrope_subset_list	flattenedshapes
num_tokensr(   Zresidual_1_listZx_attn_listZindices_2_listZx_subset_2_listZ
norm2_flatZ
norm2_listZresidual_2_listrS   Zx_outr=   r6   rP   r4   rY   r5   _forward_list   sT   





z SelfAttentionBlock._forward_listc                 C   sR   t |tr| j|g|gdd S t |tr'|d u r dd |D }| j||dS t)Nra   r   c                 S   s   g | ]}d qS Nr4   rV   r4   r4   r5   rX      s    z.SelfAttentionBlock.forward.<locals>.<listcomp>)
isinstancer   rl   listAssertionError)r1   Zx_or_x_listZrope_or_rope_listr4   r4   r5   forward   s   

zSelfAttentionBlock.forwardrm   )__name__
__module____qualname__r   GELU	LayerNormr   r   r-   floatboolr   Moduler'   staticmethodtupler   r<   rT   r   rl   rq   __classcell__r4   r4   r2   r5   r      sf    	00*Jr   c                       s   e Zd Zdddejejdfdedededee d	e	d
e
de
def fddZ				ddedB dedB dedB deddf
ddZdejfddZ  ZS )CausalSelfAttentionBlockr   NTr   r   r   r   ls_init_value	is_causalr   r   dropout_probc	           
         s   t    || _|| _|rt||dnt | _||| _t	||||d| _
||| _t|| }	t||	||d| _|rEt||d| _d S t | _d S )N)r"   )r   r    )r#   r$   r   r   )r&   r'   r   r   r   r   r*   r+   attention_normr
   	attentionffn_normr-   r   feed_forwardr/   )
r1   r   r   r   r~   r   r   r   r   Zffn_hidden_dimr2   r4   r5   r'      s   


$z!CausalSelfAttentionBlock.__init__      ?init_attn_stdinit_proj_stdinit_fc_stdfactorr   c                 C   s~   |p| j d }|p|| }|pd| j  d }| j|| | j  tjj| jj	j
|d tjj| jjj
|d | j  d S )Ng         )std)r   r   init_weightsr   reset_parametersr   initnormal_r   fc1weightfc2r   )r1   r   r   r   r   r4   r4   r5   r      s   
z%CausalSelfAttentionBlock.init_weightsr=   c                 C   s<   ||  | | || j }|| | | | }|S rm   )r+   r   r   r   r/   r   r   )r1   r=   rP   rS   r4   r4   r5   rq   	  s   z CausalSelfAttentionBlock.forward)NNNr   )rr   rs   rt   r   ru   rv   r-   rw   r   rx   r   r'   r   rG   r   rq   r|   r4   r4   r2   r5   r}      sT    	 
r}   )typingr   r   r   rG   r   r   dinov3.utilsr   r   r   r
   r   
ffn_layersr   layer_scaler   _dynamoconfigautomatic_dynamic_shapesZaccumulated_cache_size_limitAttributeErrorRuntimeErrorry   r   r}   r4   r4   r4   r5   <module>   s    
 C