
    |2g<<                        d dl Z d dlZd dlmZ d dlZd dlZd dlmZ d dlmZ d dl Z d dlZd dl	m
Z
mZmZmZ d dlZd dlmZ d dlZd dlmZ d dlmZmZmZmZmZ  ej.                  d      Zddedej2                  d	ej2                  fd
Z G d dej6                        Z G d dej2                        Zd Zddede fdZ!ddej2                  de"fdZ#ddZ$ddZ%ddZ&ddZ' G d dej2                        Z(y)    N)partial)SequenceTupleUnionCallable)trunc_normal_)Mlp
PatchEmbedSwiGLUFFNFusedMemEffAttentionNestedTensorBlockdinov2fnmodulereturnc                     |s|r
 | ||       |j                         D ],  \  }}|rdj                  ||f      n|}t        | |||d       . |r|r
 | ||       |S )Nr   name.T)r   r   r   depth_firstinclude_root)named_childrenjoinnamed_apply)r   r   r   r   r   
child_namechild_modules          </home/cameronsmith/repos/FeatUp/featup/featurizers/DINOv2.pyr   r      sn    <
&t$$*$9$9$; m 
L59SXXtZ01z
r,Z[gklm |
&t$M    c                       e Zd Zd Zy)
BlockChunkc                 $    | D ]
  } ||      } |S N )selfxbs      r   forwardzBlockChunk.forward%   s     	A!A	r   N)__name__
__module____qualname__r'   r#   r   r   r    r    $   s    r   r    c                        e Zd Zdddddddddddd	d
eej
                  eddf fd	ZddZd Z	d Z
ddZd ZddZd dZd dZ	 	 	 	 d!dej"                  deeef   dededeeej"                  eej"                     f      f
dZd	ddZ xZS )"DinoVisionTransformer               g      @T        FNmlp   c                    t         |           t        t        j                  d      }|x| _        | _        d| _        || _        || _	        || _
         |||||      | _        | j                  j                  }t        j                  t        j                  dd|            | _        t        j                  t        j                  d|| j                  z   |            | _        |du r|g|z  }n2t        j$                  d||      D cg c]  }|j'                          }}|dk(  rt(        j+                  d       t,        }nL|d	k(  s|d
k(  rt(        j+                  d       t.        }n&|dk(  rt(        j+                  d       d }|}nt0        t3        |      D cg c]  } ||||||
|	||   ||||       }}|dkD  rd| _        g }||z  }t3        d||      D ]2  }|j7                  t        j8                         g|z  ||||z    z          4 t        j:                  |D cg c]  }t=        |       c}      | _        n!d| _        t        j:                  |      | _         ||      | _         t        j8                         | _!        t        j                  t        j                  d|            | _"        | jG                          yc c}w c c}w c c}w )a  
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            weight_init (str): weight init scheme
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
        ư>)epsr4   )img_size
patch_sizein_chans	embed_dimTr   r3   zusing MLP layer as FFNswiglufusedswigluzusing SwiGLU layer as FFNidentityzusing Identity layer as FFNc                  *    t        j                         S r"   )nnIdentity)argskwargss     r   fz)DinoVisionTransformer.__init__.<locals>.fs   s    {{}$r   )dim	num_heads	mlp_ratioqkv_bias	proj_biasffn_bias	drop_path
norm_layer	act_layer	ffn_layerinit_valuesFN)$super__init__r   r@   	LayerNormnum_featuresr;   
num_tokensn_blocksrF   r9   patch_embednum_patches	Parametertorchzeros	cls_token	pos_embedlinspaceitemloggerinfor	   r   NotImplementedErrorrangechunked_blocksappendrA   
ModuleListr    blocksnormhead
mask_tokeninit_weights)r$   r8   r9   r:   r;   depthrF   rG   rH   rJ   rI   drop_path_ratedrop_path_uniformrO   embed_layerrM   block_fnrN   block_chunksrL   rW   dprr%   rD   iblocks_listrc   	chunksizep	__class__s                                r   rQ   zDinoVisionTransformer.__init__+   s   V 	R\\t4
-66DN"$&ZZbnwx&&22ekk!Q	&BCekk![4??5RT]&^_$!"U*C%*^^A~u%MN1668NCNKK01I-'9+@KK34&I*$KK56% I%%  5\
  ##!#!a&%##'
 
  !"&DN-I1eY/ \%%r{{}o&9KAPYM<Z&Z[\ --(O1A(OPDK"'D--4DKy)	KKM	,,u{{1i'@Ag O$
. )Ps   K
K%Kc                    | j                  |      }g }t        | j                        D ]]  \  }} ||      }t        | j                        |z
  |k  s*|r!|j	                  | j                  |             M|j	                  |       _ |S r"   )prepare_tokens_with_masks	enumeraterf   lenrd   rg   )r$   r%   nrg   featrr   blks          r   get_intermediate_featz+DinoVisionTransformer.get_intermediate_feat   s{    **1-, 	#FAsAA4;;!#q(KK		!-KKN	# r   c                     t        | j                  d       t        j                  j	                  | j
                  d       t        t        |        y )N{Gz?stdr6   )r   r\   r@   initnormal_r[   r   init_weights_vit_timm)r$   s    r   rj   z"DinoVisionTransformer.init_weights   s3    dnn$/
D1)40r   c                    |j                   }|j                  d   dz
  }| j                  j                  d   dz
  }||k(  r||k(  r| j                  S | j                  j                         }|d d df   }|d d dd f   }	|j                  d   }
|| j                  z  }|| j                  z  }|dz   |dz   }}t
        j                  j                  |	j                  dt        t        j                  |            t        t        j                  |            |
      j                  dddd      |t        j                  |      z  |t        j                  |      z  fd      }	t        |      |	j                  d	   k(  rt        |      |	j                  d   k(  sJ |	j                  dddd      j                  dd|
      }	t        j                  |j!                  d      |	fd
      j#                  |      S )Nr4   r   g?r/      bicubic)scale_factormoderE   )dtypeshaper\   floatr9   r@   
functionalinterpolatereshapeintmathsqrtpermuteviewrY   cat	unsqueezeto)r$   r%   whprevious_dtypenpatchNr\   class_pos_embedpatch_pos_embedrE   w0h0s                r   interpolate_pos_encodingz.DinoVisionTransformer.interpolate_pos_encoding   s   aNN  #a'Q;16>>!NN((*	#AqD/#AqrE*ggbk$//!$//! c28B--33##As499Q<'8#diil:KSQYYZ[]^`acdetyy|+R$))A,->? 4 
 2w///33B?CXCXY[C\8\\)11!Q1=BB1b#Nyy/33A6HaPSSTbccr   c                    |j                   \  }}}}| j                  |      }|Xt        j                  |j	                  d      | j
                  j                  |j                        j	                  d      |      }t        j                  | j                  j                  |j                   d   dd      |fd      }|| j                  |||      z   }|S )Nr   r   r4   r   )r   rV   rY   wherer   ri   r   r   r   r[   expandr   )r$   r%   masksBncr   r   s          r   rx   z/DinoVisionTransformer.prepare_tokens_with_masks   s    gg2q!QEOOB/1C1CAGG1L1V1VWX1Y[\]AIIt~~,,QWWQZR@!D!L--aA66r   c                 @   t        ||      D cg c]  \  }}| j                  ||       c}}}| j                  D ]
  } ||      } |}g }t        ||      D ]<  \  }}| j                  |      }|j	                  |d d df   |d d dd f   ||d       > |S c c}}w Nr   r4   )x_norm_clstokenx_norm_patchtokens	x_prenormr   )ziprx   rf   rg   rd   )	r$   x_list
masks_listr%   r   r}   all_xoutputx_norms	            r   forward_features_listz+DinoVisionTransformer.forward_features_list   s    FI&R\F]^(!UT++Au5^;; 	CAA	 E:. 		HAuYYq\FMM'-ad|*0AB-!""			 ! _s   Bc                     t        |t              r| j                  ||      S | j                  ||      }| j                  D ]
  } ||      } | j                  |      }|d d df   |d d dd f   ||dS r   )
isinstancelistr   rx   rf   rg   )r$   r%   r   r}   r   s        r   forward_featuresz&DinoVisionTransformer.forward_features   s    a--a77**1e4;; 	CAA	 1%ad|"(AB-	
 	
r   c                 |   | j                  |      }g t        | j                        }}t        |t              rt        ||z
  |      n|}t        | j                        D ]#  \  }} ||      }||v s|j                  |       % t        |      t        |      k(  s J dt        |       dt        |       d       |S )Nonly  /  blocks found)rx   rz   rf   r   r   rb   ry   rd   )r$   r%   r{   r   total_block_lenblocks_to_takerr   r}   s           r   $_get_intermediate_layers_not_chunkedz:DinoVisionTransformer._get_intermediate_layers_not_chunked   s    **1-"$c$++&6HRSTVYHZ2OD`a, 	!FAsAAN"a 	! 6{c.11mU3v;-s3~K^J__l3mmr   c                    | j                  |      }g dt        | j                  d         }}}t        |t              rt        ||z
  |      n|}| j                  D ].  }||d  D ]$  } ||      }||v r|j                  |       |dz  }& 0 t        |      t        |      k(  s J dt        |       dt        |       d       |S )Nr   r   r4   r   r   r   )rx   rz   rf   r   r   rb   rd   )	r$   r%   r{   r   rr   r   r   block_chunkr}   s	            r    _get_intermediate_layers_chunkedz6DinoVisionTransformer._get_intermediate_layers_chunked  s    **1-%'CB,@?HRSTVYHZ2OD`a;; 	K"12 F&MM!$Q		 6{c.11mU3v;-s3~K^J__l3mmr   r%   r{   r   return_class_tokenr   c           	      H   | j                   r| j                  ||      }n| j                  ||      }|r|D cg c]  }| j                  |       }}|D cg c]  }|d d df    }}|D cg c]  }|d d dd f    }}|rl|j                  \  }	}
}}|D cg c]P  }|j                  |	|| j                  z  || j                  z  d      j                  dddd      j                         R }}|rt        t        ||            S t        |      S c c}w c c}w c c}w c c}w )Nr   r4   r   r/   r   )rc   r   r   rg   r   r   r9   r   
contiguoustupler   )r$   r%   r{   r   r   rg   outputsoutclass_tokensr   _r   r   s                r   get_intermediate_layersz-DinoVisionTransformer.get_intermediate_layers  s*    ;;AqAG??1EG189#tyy~9G9-45cAqD	55)01#3q!"u:11JAq!Q # AqDOO3Q$//5I2NVVWXZ[]^`abmmoG  Wl344W~ :51s   DD,DAD)is_trainingc                V     | j                   |i |}|r|S | j                  |d         S )Nr   )r   rh   )r$   r   rB   rC   rets        r   r'   zDinoVisionTransformer.forward-  s6    #d##T4V4J99S!2344r   )r4   Tr"   r4   )r4   FFT)r(   r)   r*   r
   r@   GELUr   rQ   r~   rj   r   rx   r   r   r   r   rY   Tensorr   r   r   boolr   r   r'   __classcell__rv   s   @r   r,   r,   *   s     ''"'pf1
d4	&
"
" #$#(<< h 	
 ! 
uU\\5#667	84 */ 5 5r   r,   c                    d }||d|z  z
  k  s||d|z  z   kD  rt        j                  dd       t        j                         5   |||z
  |z        } |||z
  |z        }| j	                  d|z  dz
  d|z  dz
         | j                          | j                  |t        j                  d      z         | j                  |       | j                  ||       | cd d d        S # 1 sw Y   y xY w)Nc                 d    dt        j                  | t        j                  d      z        z   dz  S )Ng      ?       @)r   erfr   )r%   s    r   norm_cdfz(_no_grad_trunc_normal_.<locals>.norm_cdf7  s(    TXXa$))B-/00B66r   r   zjmean is more than 2 std from [a, b] in nn.init.trunc_normal_. The distribution of values may be incorrect.)
stacklevelr4   r   )minmax)warningswarnrY   no_graduniform_erfinv_mul_r   r   add_clamp_)tensormeanr   ar&   r   lus           r   _no_grad_trunc_normal_r   4  s    7 	q1s7{q1s7{ 2 E!"	$ 
  a$h#%&a$h#%& 	A	1q519- 	 	C$))B-'(D 	!#+  s   BC&&C/	drop_probtrainingc                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )Nr2   r4   r   r   )r   device)r   ndimrY   randr   r   floor_div)r%   r   r   	keep_probr   random_tensorr   s          r   rK   rK   Y  s}    BhIIWWQZMDAFFQJ//E

5 QQMUU9-FMr   r   c                     t        | t        j                        rNt        | j                  d       | j
                  *t        j                  j                  | j
                         yyy)zCViT weight initialization, original timm impl (for reproducibility)r   r   N)r   r@   Linearr   weightbiasr   zeros_r   s     r   r   r   d  sF    &"))$fmm.;;"GGNN6;;' # %r   c                 P    t        d| ddddt        t        t              d|}|S )N  r1         
attn_classr9   r;   rk   rF   rG   ro   r#   r,   r   r   r   r9   rC   models      r   	vit_smallr  l  s<    ! *G E Lr   c                 P    t        d| ddddt        t        t              d|}|S )Nr0   r1   r   r  r  r#   r  r  s      r   vit_baser	  y  s<    ! *G E Lr   c                 P    t        d| ddddt        t        t              d|}|S )Ni      r.   r   r  r  r#   r  r  s      r   	vit_larger    s<    ! *G E Lr   c                 P    t        d| ddddt        t        t              d|}|S )zW
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    i   (   r  r   r  r  r#   r  r  s      r   
vit_giant2r    s>     " *G E Lr   c                   ,     e Zd Z fdZd ZddZ xZS )DINOv2Featurizerc                     t         |           || _        || _        || _        d| _        t        j                  j                  dd      | _	        y )N   zfacebookresearch/dinov2dinov2_vits14)
rP   rQ   archr9   	feat_typen_featsrY   hubloadr  )r$   r  r9   r  rv   s       r   rQ   zDINOv2Featurizer.__init__  sA    	$"YY^^$=O
r   c                 8    | j                   j                  |      S r"   )r  r'   )r$   imgs     r   get_cls_tokenzDINOv2Featurizer.get_cls_token  s    zz!!#&&r   c                     |j                   d   | j                  z  }|j                   d   | j                  z  }| j                  j                  |      d   j	                  d||d      j                  dddd      S )Nr   r/   r   r   r   r   r4   )r   r9   r  r   r   r   )r$   r  r{   include_clsr   r   s         r   r'   zDINOv2Featurizer.forward  sr    IIaLDOO+IIaLDOO+zz**3/0DEMMbRSUVX[\ddefhiklnoppr   )r4   F)r(   r)   r*   rQ   r  r'   r   r   s   @r   r  r    s    P'qr   r  ) TF)r2   F)r  )r.   ))r   r   	functoolsr   timmrY   torch.nnr@   loggingtypingr   r   r   r   torch.utils.checkpointtorch.nn.initr    featup.featurizers.dinov2.layersr	   r
   r   r   r   	getLoggerr_   Moduler   re   r    r,   r   r   r   rK   strr   r  r	  r  r  r  r#   r   r   <module>r+     s             3 3    ' p p 
		8	$H bii ceclcl  H5BII H5T!JE $ (")) (3 (


 qryy qr   