o
    {i                     @   s   d dl Z d dlmZmZmZ d dlmZ d dlZd dlmZ ddl	m
Z
mZmZ ddlmZ G dd	 d	eZd
ejejddddedeeef deeef dededeejef fddZdS )    N)AnyTupleUnion)Enum)nn   )dinov3_vitl16Weightsconvert_path_or_url_to_url)DINOV3_BASE_URLc                   @   s   e Zd ZdZdS )DINOTxtWeights	LVTD2300MN)__name__
__module____qualname__r    r   r   >/data/cameron/keygrip/volume_dino_tracks/dinov3/hub/dinotxt.pyr      s    r   TzMhttps://dl.fbaipublicfiles.com/dinov3/thirdparty/bpe_simple_vocab_16e6.txt.gzF)
pretrainedweightsbackbone_weightsbpe_path_or_url
check_hashr   r   r   r   r   returnc                 C   sv  ddl m}m} ddlm} ddlm} |d1i ddddd	d
dddddddddddddddddddddddddddtd d!d d"d}	t	| |d#}
|d$d%d&d'd(d)dd dd*	}||	|
|d+}| r|
|j
_|  t|tu r|tjkrt d,}nt|tu r|tjkrtd-| t|}tjj||d.}|j|dd/ n|  |||d0fS )2Nr   )DINOTxtDINOTxtConfig)TextTransformer)get_tokenizer	embed_dimi   Zvision_model_freeze_backboneTZvision_model_train_img_size   Zvision_model_use_class_tokenZvision_model_use_patch_tokensZvision_model_num_head_blocks   Z"vision_model_head_blocks_drop_pathg333333?Z"vision_model_use_linear_projectionFZ%vision_model_patch_tokens_pooler_typemeanZvision_model_patch_token_layerr   Ztext_model_freeze_backboneZtext_model_num_head_blocksZ text_model_head_blocks_is_causalZ text_model_head_blocks_drop_probg        Ztext_model_tokens_pooler_typeargmaxZ text_model_use_linear_projectionZinit_logit_scaleg$I$I,@Zinit_logit_biasZfreeze_logit_scale)r   r   M   i   i            )	Zcontext_lengthZ
vocab_sizedim	num_heads
num_layers	ffn_ratio	is_causalZls_init_valueZdropout_prob)Zmodel_configvision_backbonetext_backbonezN/dinov3_vitl16/dinov3_vitl16_dinotxt_vision_head_and_text_encoder-a442d8f5.pthz Unsuported weights for DINOTxt: )r   )strict)r   r   )Zdinov3.eval.text.dinotxt_modelr   r   Z!dinov3.eval.text.text_transformerr   Zdinov3.eval.text.tokenizerr   mathlogr   Zvisual_modelbackboneevaltyper   r   r   AssertionErrorr
   torchhubload_state_dict_from_urlload_state_dictinit_weights)r   r   r   r   r   r   r   r   r   Zdinotxt_configr+   r,   modelurlZ'vision_head_and_text_encoder_state_dictr   r   r   $dinov3_vitl16_dinotxt_tet1280d20h24l   s   	

r;   )r.   typingr   r   r   enumr   r4   r   	backbonesr   r	   BackboneWeightsr
   utilsr   r   r   LVD1689MboolstrModuler;   r   r   r   r   <module>   s4   

