o
    i;                     @   s  d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlZd dlmZ ddlmZmZmZmZmZ edZd dedejdejfddZG dd dejZG dd dejZd!dejdefddZ d"ddZ!d"ddZ"d"ddZ#d"ddZ$dS )#    )partialN)SequenceTupleUnionCallableOptionalList)trunc_normal_   )Mlp
PatchEmbedSwiGLUFFNFusedMemEffAttentionNestedTensorBlockdinov2 TFfnmodulereturnc                 C   sf   |s
|r
| ||d |  D ]\}}|rd||fn|}t| |||dd q|r1|r1| ||d |S )Nr   name.T)r   r   r   depth_firstinclude_root)named_childrenjoinnamed_apply)r   r   r   r   r   
child_nameZchild_module r   F/data/cameron/moge_repo/moge/model/dinov2/models/vision_transformer.pyr      s   r   c                   @   s   e Zd Zdd ZdS )
BlockChunkc                 C   s   | D ]}||}q|S Nr   )selfxbr   r   r   forward&   s   
zBlockChunk.forwardN)__name__
__module____qualname__r%   r   r   r   r   r    %   s    r    c                       s   e Zd Zdddddddddddd	d
eejedddd	df fdd	Zedd Z	e	j
defddZ	dd Zdd Zd-ddZdd Zd-ddZd.dd Zd.d!d"Z						d/d#ejd$eeef d%ed&ed'eeejeej f  f
d(d)Zd	d*d+d,Z  ZS )0DinoVisionTransformer               g      @Tg        FNmlp   r   g?c                    s&  t    ttjdd | _| _d| _|| _	| _	|| _
|| _|| _|| _||||d| _| jj}ttdd| _ttd|| j | _|dksSJ |r_ttd|nd| _|du rl|g| ndd	 td||D d
krtd tn"dksdkrtd tndkrtd dd }|nt 	
fdd	t|D }|dkrd| _g }|| }td||D ]}|t g| ||||    qt dd	 |D | _!n	d| _t || _!| _"t | _#ttd| _$| %  dS )a  
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            weight_init (str): weight init scheme
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
        ư>)epsr0   )img_size
patch_sizein_chans	embed_dimr   NTc                 S   s   g | ]}|  qS r   )item).0r#   r   r   r   
<listcomp>w       z2DinoVisionTransformer.__init__.<locals>.<listcomp>r/   zusing MLP layer as FFNswiglufusedZswigluzusing SwiGLU layer as FFNidentityzusing Identity layer as FFNc                  _   s   t  S r!   )nnIdentity)argskwargsr   r   r   f   s   z)DinoVisionTransformer.__init__.<locals>.fc                    s.   g | ]}	
|  d qS ))dim	num_heads	mlp_ratioqkv_bias	proj_biasffn_biasZ	drop_path
norm_layer	act_layer	ffn_layerinit_valuesr   )r8   irI   block_fnZdprr6   rG   rJ   rK   rD   rH   rC   rF   rE   r   r   r9      s     c                 S   s   g | ]}t |qS r   )r    )r8   pr   r   r   r9      r:   F)&super__init__r   r=   	LayerNormnum_featuresr6   Z
num_tokensZn_blocksrC   r4   num_register_tokensinterpolate_antialiasinterpolate_offsetpatch_embednum_patches	Parametertorchzeros	cls_token	pos_embedregister_tokenslinspaceloggerinfor   r   NotImplementedErrorrangechunked_blocksappendr>   
ModuleListblocksnormhead
mask_tokeninit_weights)r"   r3   r4   r5   r6   depthrC   rD   rE   rG   rF   drop_path_ratedrop_path_uniformrK   Zembed_layerrI   rN   rJ   block_chunksrT   rU   rV   rX   rA   Zblocks_listrd   	chunksizerL   	__class__rM   r   rQ   -   s^   
1


 &

zDinoVisionTransformer.__init__c                 C   s   t | ddS )N_onnx_compatible_modeF)getattrr"   r   r   r   onnx_compatible_mode   s   z*DinoVisionTransformer.onnx_compatible_modevaluec                 C   s
   || _ d S r!   )rs   )r"   rw   r   r   r   rv      s   
c                 C   sJ   t | jdd tjj| jdd | jd urtjj| jdd tt|  d S )N{Gz?stdr1   )	r	   r]   r=   initnormal_r\   r^   r   init_weights_vit_timmru   r   r   r   rk      s
   
z"DinoVisionTransformer.init_weightsc                 C   s  |j }|jd d }|jd }| jjd d }| js%||kr%||kr%| jS | j }|d d dd d f }	|d d dd d d f }
|jd }|| j || j }}tt|}||| ksaJ i }| js| j	dkrt|| j	 | }t|| j	 | }||f|d< n||f|d< t
jj|
d|||ddddfd| jd	|}
||f|
jd
d  ksJ |
dddddd}
tj|	d d d d d f |
jd dd|
fdd|S )Nr0   r   scale_factorsizer,   r
   bicubic)mode	antialiasrB   )dtypeshaper]   rv   floatr4   intmathsqrtrV   r=   
functionalinterpolatereshapepermuterU   flattenrZ   catexpandto)r"   r#   hwZprevious_dtypeZnpatch
batch_sizeNr]   Zclass_pos_embedZpatch_pos_embedrB   Zh0w0Mr@   sxsyr   r   r   interpolate_pos_encoding   s<   


:z.DinoVisionTransformer.interpolate_pos_encodingc                 C   s   |j \}}}}| |}|d ur"t|d| j|jd|}tj| j	
|j d dd|fdd}|| ||| }| jd uretj|d d d df | j
|j d dd|d d dd f fdd}|S )Nr~   r   r0   r   )r   rW   rZ   where	unsqueezerj   r   r   r   r\   r   r   r^   )r"   r#   masksBncr   r   r   r   r   prepare_tokens_with_masks   s   
$$
	z/DinoVisionTransformer.prepare_tokens_with_masksc           	   	      s    fddt ||D } jD ]}||}q|}g }t ||D ]0\}} |}||d d df |d d d jd f |d d  jd d f ||d q|S )Nc                    s   g | ]\}}}  ||qS r   )r   )r8   r#   r   arru   r   r   r9      s    z?DinoVisionTransformer.forward_features_list.<locals>.<listcomp>r   r0   x_norm_clstokenZx_norm_regtokensZx_norm_patchtokensZ	x_prenormr   )ziprg   rh   re   rT   )	r"   x_listZ
masks_listr#   blkZall_xoutputr   x_normr   ru   r   forward_features_list   s    


	z+DinoVisionTransformer.forward_features_listc                 C   s   t |tr| ||S | ||}| jD ]}||}q| |}|d d df |d d d| jd f |d d | jd d f ||dS )Nr   r0   r   )
isinstancelistr   r   rg   rh   rT   )r"   r#   r   r   r   r   r   r   forward_features	  s   



z&DinoVisionTransformer.forward_featuresc                 C   s   |  |}g t| j}}t|trt|| |n|}t| jD ]\}}||}||v r1|| q t|t|ksIJ dt| dt| d|S )Nonly  /  blocks found)r   lenrg   r   r   rc   	enumeratere   )r"   r#   nr   total_block_lenblocks_to_takerL   r   r   r   r   $_get_intermediate_layers_not_chunked  s   

.z:DinoVisionTransformer._get_intermediate_layers_not_chunkedc           	      C   s   |  |}g dt| jd }}}t|trt|| |n|}| jD ]}||d  D ]}||}||v r:|| |d7 }q+q#t|t|ksWJ dt| dt| d|S )Nr   r~   r0   r   r   r   )r   r   rg   r   r   rc   re   )	r"   r#   r   r   rL   r   r   Zblock_chunkr   r   r   r    _get_intermediate_layers_chunked'  s   



.z6DinoVisionTransformer._get_intermediate_layers_chunkedr#   r   r   return_class_tokenr   c           	         s   j r
||}n||}|rfdd|D }dd |D }fdd|D }|r@|j\ } fdd|D }|rItt||S t|S )Nc                    s   g | ]}  |qS r   )rh   r8   outru   r   r   r9   B  s    zADinoVisionTransformer.get_intermediate_layers.<locals>.<listcomp>c                 S   s   g | ]
}|d d df qS )Nr   r   r   r   r   r   r9   C  s    c                    s&   g | ]}|d d d j  d f qS )Nr0   )rT   r   ru   r   r   r9   D  s   & c                    s8   g | ]}|  j j d dddd qS )r~   r   r,   r0   r
   )r   r4   r   
contiguousr   r   r   r"   r   r   r   r9   G  s    *)rd   r   r   r   tupler   )	r"   r#   r   r   r   rh   outputsZclass_tokens_r   r   r   get_intermediate_layers5  s   z-DinoVisionTransformer.get_intermediate_layers)is_trainingc                O   s&   | j |i |}|r|S | |d S )Nr   )r   ri   )r"   r   r?   r@   retr   r   r   r%   O  s   zDinoVisionTransformer.forwardr!   )r0   )r0   FFT)r&   r'   r(   r   r=   GELUBlockrQ   propertyrv   setterboolrk   r   r   r   r   r   r   rZ   Tensorr   r   r   r   r   r%   __classcell__r   r   rq   r   r)   ,   sd    

$




r)   r   c                 C   s>   t | tjrt| jdd | jdurtj| j dS dS dS )zCViT weight initialization, original timm impl (for reproducibility)rx   ry   N)r   r=   Linearr	   weightbiasr{   zeros_r   r   r   r   r}   W  s   
r}   r+   c                 K   *   t d| ddddtttd|d|}|S )Ni  r.         Z
attn_classr4   r6   rl   rC   rD   rN   rT   r   r)   r   r   r   r4   rT   r@   modelr   r   r   	vit_small_     

r   c                 K   s*   t d| ddddtttd|d|}|S )Nr-   r.   r   r   r   r   r   r   r   r   r   vit_basem  r   r   c                 K   r   )Ni      r+   r   r   r   r   r   r   r   r   r   	vit_large{  r   r   c                 K   s*   t d| ddddtttd|d|}|S )	zW
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    i   (   r   r   r   r   Nr   r   r   r   r   r   
vit_giant2  s   

r   )r   TF)r   )r+   r   )%	functoolsr   r   loggingtypingr   r   r   r   r   r   rZ   torch.nnr=   Ztorch.utils.checkpointZtorch.nn.initr	   layersr   r   r   r   r   r   	getLoggerr`   Moduler   rf   r    r)   strr}   r   r   r   r   r   r   r   r   <module>   s(   	 
  -


