o
    {i:                     @   sb  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
mZ d dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZ e dZeeeedd	eed
d	eedd	dZeejddeejddedZejej ej!dZ"d(dej#de$fddZ%G dd dej#Z&d)ddZ'd)ddZ(d)ddZ)d)d d!Z*d)d"d#Z+d)d$d%Z,d)d&d'Z-dS )*    N)partial)AnyDictListLiteralOptionalSequenceTupleUnion)Tensornn)
LayerScaleMlp
PatchEmbedRMSNormRopePositionEmbeddingSelfAttentionBlock	SwiGLUFFN)named_applydinov3    )align_to@      )mlpswigluZswiglu32swiglu64Z	swiglu128gư>)epsgh㈵>)	layernormlayernormbf16Zrmsnorm)fp32fp16bf16 modulenamec                 C   s   t | tjr?tjjj| jdd | jd urtj| j t	| dr?| j
d ur?| j}| j
d | j
|d d| d  d t | tjrI|   t | trR|   t | tr[|   t | trf|   d S d S )N{Gz?std	bias_mask         r   )
isinstancer   Lineartorchinittrunc_normal_weightbiaszeros_hasattrr)   out_featuresfill_	LayerNormreset_parametersr   r   r   )r$   r%   o r;   L/data/cameron/keygrip/volume_dino_tracks/dinov3/models/vision_transformer.pyinit_weights_vit(   s"   
 


r=   c                8       s   e Zd Zddddddddddddd	d	d
ddddddddddddddedededededB dedB ded dedB dedB dedB dededed ed!ed"ed#ed$edB d%ed&ed'ed(ed)ed*ed+ed,ed-edB f6 fd.d/Z	d0d1 Z
dMd2ed3eeee f fd4d5Zd6ee d7ee d3eeeef  fd8d9ZdMd2eee B d:ee d3eeeef  fd;d<ZdNd2ed>ed3ee fd?d@Zd=dddddAd2ejd>eeef dBedCedDedEed3eeejeejdFf f  fdGdHZddIdJed3eeeef  eB fdKdLZ  ZS )ODinoVisionTransformer      r+   g      Y@Nseparater"         g      @Tg        r   r   r   F)img_size
patch_sizein_chanspos_embed_rope_basepos_embed_rope_min_periodpos_embed_rope_max_periodpos_embed_rope_normalize_coordspos_embed_rope_shift_coordspos_embed_rope_jitter_coordspos_embed_rope_rescale_coordspos_embed_rope_dtype	embed_dimdepth	num_heads	ffn_ratioqkv_biasdrop_path_ratelayerscale_init
norm_layer	ffn_layerffn_bias	proj_biasn_storage_tokensmask_k_biasuntie_cls_and_patch_normsuntie_global_and_local_cls_normdevicerD   rE   rF   rG   rH   rI   rJ   )minmaxrA   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   c                   s$  t    t|dkrtd|  ~t|  | _| _|| _	| _	|| _
t|||dd| _ttjdd d| _|| _| jdkrSttjd| d| _td| d td	| d td
| d td| d td| d td|
 d td|	 d td| d t	||||||	|
t|  d| _td| d t| |g|  	
fddt|D }d| _t|| _| _|| _|r| _nd | _|| _ |r| _!nd | _!t" | _#ttjd d| _$d S )Nr   zIgnored kwargs: F)rD   rE   rF   rO   flatten_embeddingr*   )r^   zusing base=z for rope newzusing min_period=zusing max_period=zusing normalize_coords=zusing shift_coords=zusing rescale_coords=zusing jitter_coords=zusing dtype=)rO   rQ   base
min_period
max_periodnormalize_coordsshift_coordsjitter_coordsrescale_coordsdtyper^   zusing z layer as FFNc                    s4   g | ]}t 	| 
tj d qS ))dimrQ   rR   rS   rY   rX   	drop_pathrV   	act_layerrW   init_valuesr[   r^   )r   r   GELU).0ir^   rT   rO   rX   Zffn_layer_clsZffn_ratio_sequencerU   r[   Znorm_layer_clsrQ   rY   rS   r;   r<   
<listcomp>   s$    z2DinoVisionTransformer.__init__.<locals>.<listcomp>)%super__init__lenloggerwarningnorm_layer_dictnum_featuresrO   n_blocksrQ   rE   r   patch_embedr   	Parameterr/   empty	cls_tokenrZ   storage_tokensinfor   
dtype_dict
rope_embedffn_layer_dictrangeZchunked_blocks
ModuleListblocksnormr\   cls_normr]   local_cls_normIdentityhead
mask_token)selfrD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   Zignored_kwargsZblocks_list	__class__rq   r<   rt   <   sx   
 

 

zDinoVisionTransformer.__init__c                 C   sT   | j   tjj| jdd | jdkrtjj| jdd tj| j	 t
t|  d S )Nr&   r'   r   )r   _init_weightsr   r0   normal_r~   rZ   r   r4   r   r   r=   r   r;   r;   r<   init_weights   s   

z"DinoVisionTransformer.init_weightsxreturnc           	      C   s   |  |}|j\}}}}|dd}|d ur,t|d| j|jd|}| j	}n| j	d| j  }| j
dkr=| j}ntjdd|jd |j|jd}tj||dd||dd|gdd}|||ffS )Nr*   r,   r   )ri   r^   rj   )r{   shapeflattenr/   where	unsqueezer   tori   r~   rZ   r   r}   r^   catexpand)	r   r   masksBHW_r~   r   r;   r;   r<   prepare_tokens_with_masks   s0   
$
	z/DinoVisionTransformer.prepare_tokens_with_masksx_list
masks_listc              	      s  g }g }t ||D ]\}} ||\}}|| || q	t jD ]\}	}
 jd ur8 fdd|D }ndd |D }|
||}q%|}g }tt ||D ]\}\}} js\ jr jrw jrw|dkrw 	|d d d  j
d f }n$ jr |d d d  j
d f }n |d d d  j
d f } |d d  j
d d f }n |}|d d d  j
d f }|d d  j
d d f }||d d df |d d dd f |||d qP|S )Nc                    s   g | ]\}} j ||d qS )r   r   )r   )ro   r   r   r   r;   r<   rr      s    z?DinoVisionTransformer.forward_features_list.<locals>.<listcomp>c                 S   s   g | ]}d qS Nr;   )ro   rr;   r;   r<   rr      s    r*   r   )x_norm_clstokenZx_storage_tokensx_norm_patchtokensZ	x_prenormr   )zipr   append	enumerater   r   r\   r]   trainingr   rZ   r   r   )r   r   r   r   ropeZt_xZt_masksZt2_xZhw_tupler   blkrope_sincosZall_xoutputidxr   x_norm_cls_regx_norm_patchZx_normr;   r   r<   forward_features_list   sB   

"" "
	z+DinoVisionTransformer.forward_features_listr   c                 C   s,   t |tjr| |g|gd S | ||S Nr   )r-   r/   r   r   )r   r   r   r;   r;   r<   forward_features  s   z&DinoVisionTransformer.forward_featuresr*   nc                 C   s   |  |\}\}}g t| j}}t|trt|| |n|}t| jD ]!\}}	| jd ur5| j||d}
nd }
|	||
}||v rE|| q$t|t|ks]J dt| dt| d|S )Nr   zonly z / z blocks found)	r   ru   r   r-   intr   r   r   r   )r   r   r   r   r   r   Ztotal_block_lenblocks_to_takerp   r   r   r;   r;   r<   $_get_intermediate_layers_not_chunked  s   


.z:DinoVisionTransformer._get_intermediate_layers_not_chunked)r   reshapereturn_class_tokenreturn_extra_tokensr   r   r   r   r   .c                   sJ   ||}|rIg }|D ]:}	jr>|	d d d jd f }
|	d d jd d f }|tj|
|fdd q||	 q|}dd |D }fdd|D }fdd|D }|rw|j\ } fdd|D }|s|st	|S |r|st	t
||S |s|rt	t
||S |r|rt	t
|||S d S d S )Nr*   r   c                 S   s   g | ]
}|d d df qS r   r;   ro   outr;   r;   r<   rr   2  s    zADinoVisionTransformer.get_intermediate_layers.<locals>.<listcomp>c                    s&   g | ]}|d d d j d f qS Nr*   rZ   r   r   r;   r<   rr   3     & c                    s&   g | ]}|d d  j d d f qS r   r   r   r   r;   r<   rr   4  r   c                    s8   g | ]}|  j j d dddd qS )r   r   r+   r*   r,   )r   rE   permute
contiguousr   r   hr   wr;   r<   rr   7  s    *)r   r\   r   rZ   r   r   r/   r   r   tupler   )r   r   r   r   r   r   r   outputsZoutputs_normedr   r   r   Zclass_tokensZextra_tokensr   r;   r   r<   get_intermediate_layers  s6   
  z-DinoVisionTransformer.get_intermediate_layers)is_trainingr   c                O   s&   | j |i |}|r|S | |d S )Nr   )r   r   )r   r   argskwargsretr;   r;   r<   forwardD  s   zDinoVisionTransformer.forwardr   )r*   )__name__
__module____qualname__r   floatr   strboolr   rt   r   r   r	   r   r   r   r   r   r   r   r/   r
   r   r   r   __classcell__r;   r;   r   r<   r>   ;   s    	
z * 0)
	
0'r>   r@   c                 K      t d| ddddd|}|S )Ni  rC         rE   rO   rP   rQ   rR   r;   r>   rE   r   modelr;   r;   r<   	vit_smallL     r   c                 K   s   t d| ddddd|}|S )NrB   rC   r   r   r;   r   r   r;   r;   r<   vit_baseX  r   r   c                 K   r   )Ni      r@   r   r   r;   r   r   r;   r;   r<   	vit_larged  r   r   c                 K   r   )Ni        g>@8@r   r;   r   r   r;   r;   r<   
vit_so400mp  r   r   c                 K   r   )Ni   r      r   r   r;   r   r   r;   r;   r<   	vit_huge2|  r   r   c                 K   s   t d| ddddd|}|S )zW
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    i   (   r   r   r   Nr;   r   r   r;   r;   r<   
vit_giant2  s   r   c                 K   r   )Ni   r   r   r+   r   r;   r   r   r;   r;   r<   vit_7b  r   r   )r#   )r@   ).logging	functoolsr   typingr   r   r   r   r   r   r	   r
   r/   Ztorch.nn.initr   r   Zdinov3.layersr   r   r   r   r   r   r   dinov3.utilsr   	getLoggerrv   r   r8   rx   float32float16bfloat16r   Moduler   r=   r>   r   r   r   r   r   r   r   r;   r;   r;   r<   <module>   sD   ($



	  





