
    |2gpO                         d dl mZ d dlmZmZ d dlZd dlZd dlm	c m
Z d dlm	Z	 ddlmZ  G d de	j                        Z G d	 d
e	j                        Z G d de	j                        Z G d de	j$                        Z G d de	j                        Z G d de	j                        Z G d de	j                        Z G d de	j                        Z G d de	j                        Zde	j                  fdZdefdZy)    )OrderedDict)TupleUnionN)nn   ) interpolate_positional_embeddingc                   D     e Zd ZdZd fd	Zdej                  fdZ xZS )
Bottleneck   c                    t         |           t        j                  ||dd      | _        t        j
                  |      | _        t        j                  d      | _        t        j                  ||ddd      | _	        t        j
                  |      | _
        t        j                  d      | _        |dkD  rt        j                  |      nt        j                         | _        t        j                  ||| j                  z  dd      | _        t        j
                  || j                  z        | _        t        j                  d      | _        d | _        || _        |dkD  s||t*        j                  z  k7  rt        j,                  t/        dt        j                  |      fd	t        j                  ||| j                  z  ddd
      fdt        j
                  || j                  z        fg            | _        y y )Nr   F)biasTinplace   )paddingr   z-10)strider   1)super__init__r   Conv2dconv1BatchNorm2dbn1ReLUrelu1conv2bn2relu2	AvgPool2dIdentityavgpool	expansionconv3bn3relu3
downsampler   r
   
Sequentialr   )selfinplanesplanesr   	__class__s       D/home/cameronsmith/repos/FeatUp/featup/featurizers/maskclip/model.pyr   zBottleneck.__init__   s{    YYx?
>>&)WWT*
YYvvq!%H
>>&)WWT*
/5zr||F+r{{}YYvv'>N
>>&4>>"9:WWT*
A:Vj.B.B%BB mmKr||F+,bii&4>>*A1QUZ[\bnnVdnn%<=>9 - DO C    xc                    |}| j                  | j                  | j                  |                  }| j                  | j	                  | j                  |                  }| j                  |      }| j                  | j                  |            }| j                  | j                  |      }||z  }| j                  |      }|S N)r   r   r   r   r   r   r"   r%   r$   r'   r&   )r)   r/   identityouts       r-   forwardzBottleneck.forward,   s    jj$**Q-01jj$**S/23ll3hhtzz#'??&q)Hxjjo
r.   r   )	__name__
__module____qualname__r#   r   torchTensorr4   __classcell__r,   s   @r-   r
   r
      s    I: r.   r
   c            	       X     e Zd Zd	dedededef fdZd Zdej                  fdZ xZ	S )
AttentionPool2dspacial_dim	embed_dim	num_heads
output_dimc                    t         |           t        j                  t	        j
                  |dz  dz   |      |dz  z        | _        t        j                  ||      | _        t        j                  ||      | _	        t        j                  ||      | _
        t        j                  ||xs |      | _        || _        || _        y )N   r         ?)r   r   r   	Parameterr9   randnpositional_embeddingLineark_projq_projv_projc_projrA   r?   )r)   r?   r@   rA   rB   r,   s        r-   r   zAttentionPool2d.__init__=   s    $&LL[A=MPQ=QS\1]`imp`p1p$q!ii	95ii	95ii	95ii	:+BC"&r.   c           
      t   |j                  d      j                  ddd      }t        j                  |j	                  dd      |gd      }|| j
                  d d d d d f   j                  |j                        z   }t        j                  di d|d d d	|d
|d|j                  d   d| j                  d| j                  j                  d| j                  j                  d| j                  j                  dd dt        j                  | j                  j                   | j                  j                   | j                  j                   g      dd dd ddddd| j"                  j                  d| j"                  j                   ddd| j$                  dd\  }}|j'                  d      S )NrD   	start_dimr   r   TdimkeepdimrR   querykeyvalueembed_dim_to_checkrA   q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weighttrainingneed_weights )flattenpermuter9   catmeanrH   todtypeFmulti_head_attention_forwardshaperA   rK   weightrJ   rL   r   rM   rf   squeeze)r)   r/   _s      r-   r4   zAttentionPool2d.forwardG   s   III"**1a3IIqvv!Tv2A6A>))!T1*588AA-- 
BQ%

&'
 wwr{
 nn
 ++,,	

 ++,,
 ++,,
  
 DKK$4$4dkk6F6FHXHX#YZ
 
 
  
 
 !KK..
 ++**
 &*
  ]]!
" #
1& yy|r.   r/   c                 X   |j                   \  }}}}|j                  d      j                  ddd      }t        j                  |j                  dd      |gd      }t        | j                  |j                  ddd      d||      }||d	d	d	d	d	f   z   }t        j                  || j                  j                  | j                  j                        }t        j                  || j                  j                  | j                  j                        }|j                  ddd      }|S )

        Forward function for computing the value features for dense prediction (i.e., features for every image patch).
        rD   rO   r   r   TrQ   rT   
patch_sizewhN)rq   ri   rj   r9   rk   rl   r   rH   ro   linearrL   rr   r   rM   )r)   r/   rt   ry   rz   interpolated_pev_inv_outs           r-   	forward_vzAttentionPool2d.forward_v`   s    WW
1aIII"**1a3IIqvv!Tv2A6A> ;4;T;TVWV_V_`acdfgVhuvz{  @A  B4
++xx4;;--t{{/?/?@t{{114;;3C3CDaA&r.   r1   )
r6   r7   r8   intr   r4   r9   r:   r   r;   r<   s   @r-   r>   r>   <   s:    'C 'C 'C 'UX '25<< r.   r>   c                   :     e Zd ZdZd fd	ZddZddefdZ xZS )	ModifiedResNeta  
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    c                    t         |           || _        || _        t	        j
                  d|dz  dddd      | _        t	        j                  |dz        | _        t	        j                  d      | _
        t	        j
                  |dz  |dz  ddd      | _        t	        j                  |dz        | _        t	        j                  d      | _        t	        j
                  |dz  |ddd      | _        t	        j                  |      | _        t	        j                  d      | _        t	        j"                  d      | _        || _        | j)                  ||d	         | _        | j)                  |dz  |d   d
      | _        | j)                  |dz  |d   d
      | _        | j)                  |dz  |d   d
      | _        |dz  }t3        |dz  |||      | _        y )Nr   rD   r   F)kernel_sizer   r   r   Tr   )r   r   r   r   )r   r          )r   r   rB   input_resolutionr   r   r   r   r   r   r   r   r   r   r$   r%   r&   r    r"   	_inplanes_make_layerlayer1layer2layer3layer4r>   attnpool)r)   layersrB   headsr   widthr@   r,   s          r-   r   zModifiedResNet.__init__z   s   $ 0 YYq%1*!AqW\]
>>%1*-WWT*
YYuz5A:1aV[\
>>%1*-WWT*
YYuz5aQVW
>>%(WWT*
||A &&ufQi8&&uqy&)A&F&&uqy&)A&F&&uqy&)A&FBJ	'(8B(>	5R\]r.   c                     t        | j                  ||      g}|t         j                  z  | _        t        d|      D ]'  }|j	                  t        | j                  |             ) t        j                  | S )Nr   )r
   r   r#   rangeappendr   r(   )r)   r+   blocksr   r   rt   s         r-   r   zModifiedResNet._make_layer   sg    T^^VV<=*"6"66q&! 	>AMM*T^^V<=	> }}f%%r.   patch_outputc                      fd}|j                   j                  j                  j                        } ||      } j	                  |      } j                  |      } j                  |      } j                  |      }|r+ j                  j                  |      }|d d dd d d f   }|S  j                  |      }|S )Nc                 D   j                  j                  j                  |                   } j                  j	                  j                  |                   } j                  j                  j                  |                   } j                  |       } | S r1   )
r   r   r   r   r   r   r&   r%   r$   r"   )r/   r)   s    r-   stemz$ModifiedResNet.forward.<locals>.stem   sr    

488DJJqM23A

488DJJqM23A

488DJJqM23AQAHr.   r   )
typer   rr   rn   r   r   r   r   r   r   )r)   r/   r   r   s   `   r-   r4   zModifiedResNet.forward   s    	 FF4::$$**+GKKNKKNKKNKKN''*A!QR(A  a Ar.   )   @   r5   F)	r6   r7   r8   __doc__r   r   boolr4   r;   r<   s   @r-   r   r   r   s    ^6&t r.   r   c                   <     e Zd ZdZdej
                  f fdZ xZS )	LayerNormz*Subclass torch's LayerNorm to handle fp16.r/   c                     |j                   }t        | 	  |j                  t        j
                              }|j                  |      S r1   )rn   r   r4   r   r9   float32)r)   r/   	orig_typeretr,   s       r-   r4   zLayerNorm.forward   s6    GG	goaffU]]34xx	""r.   )r6   r7   r8   r   r9   r:   r4   r;   r<   s   @r-   r   r      s    4# # #r.   r   c                   ,    e Zd Zdej                  fdZy)	QuickGELUr/   c                 8    |t        j                  d|z        z  S )NgZd;?)r9   sigmoidr)   r/   s     r-   r4   zQuickGELU.forward   s    5==+++r.   N)r6   r7   r8   r9   r:   r4   rh   r.   r-   r   r      s    , ,r.   r   c                        e Zd Zd	dededej
                  f fdZdej
                  fdZdej
                  fdZdej
                  fdZ	 xZ
S )
ResidualAttentionBlockd_modeln_head	attn_maskc                 j   t         |           t        j                  ||      | _        t        |      | _        t        j                  t        dt        j                  ||dz        fdt               fdt        j                  |dz  |      fg            | _        t        |      | _        || _        y )Nc_fcr   gelurM   )r   r   r   MultiheadAttentionattnr   ln_1r(   r   rI   r   mlpln_2r   )r)   r   r   r   r,   s       r-   r   zResidualAttentionBlock.__init__   s    ))'6:	g&	==RYYw!45Y[!ryy1g67.
 " 
 g&	"r.   r/   c                     | j                   1| j                   j                  |j                  |j                        nd | _         | j	                  |||d| j                         d   S )Nrn   deviceF)rg   r   r   )r   rm   rn   r   r   r   s     r-   	attentionz ResidualAttentionBlock.attention   sT    NRnnNh***JnryyAquyOPQRRr.   c                    | j                   j                  | j                   j                   d }| j                   j                  | j                   j                   d }t	        j
                  | j                  |      ||      }t	        j
                  || j                   j                  j                  | j                   j                  j                        }|S )rv   N)
r   r]   r@   r^   ro   r{   r   out_projrr   r   )r)   r/   v_in_proj_weightv_in_proj_biasr}   r~   s         r-   r   z ResidualAttentionBlock.forward_v   s    
  9933TYY5H5H4H4IJ//1D1D0D0EFxx		!&6Gtyy1188$)):L:L:Q:QR r.   c                     || j                  | j                  |            z   }|| j                  | j                  |            z   }|S r1   )r   r   r   r   r   s     r-   r4   zResidualAttentionBlock.forward   s=    tyy|,,1&&r.   r1   )r6   r7   r8   r   r9   r:   r   r   r   r4   r;   r<   s   @r-   r   r      sS    # #S #U\\ #S5<< S5<<  r.   r   c            	       f     e Zd Zddedededej
                  f fdZdej
                  fdZ xZS )	Transformerr   r   r   r   c           
          t         |           || _        || _        t	        j
                  t        |      D cg c]  }t        |||       c} | _        y c c}w r1   )	r   r   r   r   r   r(   r   r   	resblocks)r)   r   r   r   r   rt   r,   s         r-   r   zTransformer.__init__   sL    
bghnbo(p]^)?ui)X(pq(ps   Ar/   c                 $    | j                  |      S r1   )r   r   s     r-   r4   zTransformer.forward   s    ~~a  r.   r1   )	r6   r7   r8   r   r9   r:   r   r4   r;   r<   s   @r-   r   r      s=    rc r3 rs ru|| r! !r.   r   c                   ^     e Zd Zdedededededef fdZddej                  d	efd
Z xZ	S )VisionTransformerr   rx   r   r   r   rB   c                 .   t         |           || _        || _        t	        j
                  d|||d      | _        |dz  }t	        j                  |t        j                  |      z        | _
        t	        j                  |t        j                  ||z  dz  dz   |      z        | _        t        |      | _        t        |||      | _        t        |      | _        t	        j                  |t        j                  ||      z        | _        || _        y )Nr   F)in_channelsout_channelsr   r   r         rD   r   )r   r   r   rB   r   r   r   rF   r9   rG   class_embeddingrH   r   ln_prer   transformerln_postprojrx   )	r)   r   rx   r   r   r   rB   scaler,   s	           r-   r   zVisionTransformer.__init__   s     0$YY15jakrwx
!||EEKK4F,FG$&LLFVZdFdijEjmnEnpu9v1v$w!&&ufe< 'LLUJ)G!GH	$r.   r/   r   c           
      H   |j                   \  }}}}| j                  |      }|j                  |j                   d   |j                   d   d      }|j                  ddd      }t	        j
                  | j                  j                  |j                        t	        j                  |j                   d   d|j                   d   |j                  |j                        z   |gd      }|t        | j                  || j                  ||      z   }| j                  |      }|j                  ddd      }|r| j                  j                   ^ }}t#        j$                  | } ||      }|j'                  |      }|j                  ddd      }|d d dd d d f   }| j)                  |      }| j*                  || j*                  z  }|S | j                  |      }|j                  ddd      }| j)                  |d d dd d f         }| j*                  || j*                  z  }|S )Nr   r   rY   rD   r   rT   rw   )rq   r   reshaperj   r9   rk   r   rm   rn   zerosr   r   rH   rx   r   r   r   r   r(   r   r   r   )	r)   r/   r   rt   ry   rz   r   last_resblockpenultimates	            r-   r4   zVisionTransformer.forward	  s    WW
1aJJqMIIaggaj!''!*b1IIaAIIt++..qww7%++aggajRSUVU\U\]_U`hihohoxy  yA  yA  ;B  B  DE  F  LM  N01J1JAZ^ZiZimnrsttKKNIIaA%)%5%5%?%?"V]--0KAA''*A		!Q"A !QR(AQAyy$		MHQIIaALL1a7$99 DIIAr.   r   )
r6   r7   r8   r   r   r9   r:   r   r4   r;   r<   s   @r-   r   r      sL    % %# %c %SV %_b %ps %$$ $T $r.   r   c                        e Zd Zdededeeeeeef   ef   dededededed	ed
ef fdZd Zd Ze	d        Z
d Zdej                  fdZdej                   fdZd Zd Z xZS )CLIPr@   image_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc                    t         |           || _        t        |t        t
        f      r|dz  dz  }t        |||||      | _        n|dz  }t        ||||||      | _        t        ||
|	| j                               | _        || _        t        j                  ||      | _        t        j                   t#        j$                  | j                  |            | _        t)        |      | _        t        j                   t#        j$                  ||            | _        t        j                   t#        j.                  g       t1        j2                  d      z        | _        | j7                          y )Nr   r   )r   rB   r   r   r   )r   rx   r   r   r   rB   )r   r   r   r   g$I$I,@)r   r   r   
isinstancetuplelistr   visualr   r   build_attention_maskr   r   r   	Embeddingtoken_embeddingrF   r9   emptyrH   r   ln_finaltext_projectiononesnploglogit_scaleinitialize_parameters)r)   r@   r   r   r   r   r   r   r   r   r   vision_headsr,   s               r-   r   zCLIP.__init__1  s;    	,meT]3'",2L($$"!1"DK (2-L+!1,"$"$DK '#%#//1	
 %!||J8IJ$&LLT=P=PRc1d$e!!"34!||EKK8I9,UV<<

29I(IJ""$r.   c                    t         j                  j                  | j                  j                  d       t         j                  j                  | j
                  d       t        | j                  t              r| j                  j                  Q| j                  j                  j                  j                  dz  }t         j                  j                  | j                  j                  j                  j                  |       t         j                  j                  | j                  j                  j                  j                  |       t         j                  j                  | j                  j                  j                  j                  |       t         j                  j                  | j                  j                  j                  j                  |       | j                  j                  | j                  j                   | j                  j"                  | j                  j$                  fD ]K  }|j'                         D ]6  \  }}|j)                  d      st         j                  j+                  |       8 M | j,                  j.                  dz  d| j,                  j0                  z  dz  z  }| j,                  j.                  dz  }d| j,                  j.                  z  dz  }| j,                  j2                  D ]  }t         j                  j                  |j4                  j6                  |       t         j                  j                  |j4                  j8                  j                  |       t         j                  j                  |j:                  j<                  j                  |       t         j                  j                  |j:                  j                  j                  |        | j>                  Ct         j                  j                  | j>                  | j,                  j.                  dz         y y )Ng{Gz?)stdg{Gz?r   z
bn3.weightrD   ) r   initnormal_r   rr   rH   r   r   r   r   rM   in_featuresrK   rJ   rL   r   r   r   r   named_parametersendswithzeros_r   r   r   r   r   r]   r   r   r   r   )	r)   r   resnet_blocknameparamproj_stdattn_stdfc_stdblocks	            r-   r   zCLIP.initialize_parametersh  s   
,,33>
11t<dkk>2{{##/kk**11==E 4 4 ; ; B BL 4 4 ; ; B BL 4 4 ; ; B BL 4 4 ; ; B BL!%!3!3T[[5G5GI[I[]a]h]h]o]o p .#/#@#@#B .KD%}}\2u-..
 $$**d2D<L<L<S<S8SX\7\]##))T1d&&,,,5%%// 	CEGGOOEJJ558ODGGOOEJJ//66HOEGGOOEIINN11vO>GGOOEII,,33OB		C +GGOOD00d6F6F6L6LPT6TOU ,r.   c                     t        j                  | j                  | j                        }|j                  t	        d             |j                  d       |S )Nz-infr   )r9   r   r   fill_floattriu_)r)   masks     r-   r   zCLIP.build_attention_mask  s@     {{4..0C0CD

5=!

1r.   c                 V    | j                   j                  j                  j                  S r1   )r   r   rr   rn   r)   s    r-   rn   z
CLIP.dtype  s    {{  ''---r.   c                 V    | j                  |j                  | j                              S r1   r   r   rn   r)   images     r-   encode_imagezCLIP.encode_image  s    {{5::djj122r.   returnc                 Z    | j                  |j                  | j                        d      S )z/ Get the encodings for each patch in the image T)r   r
  r  s     r-   get_patch_encodingszCLIP.get_patch_encodings  s"    {{5::djj1{EEr.   c                 f    t        | j                  t              sJ | j                  j                  S )z* Get vision transformer projection matrix.)r   r   r   r   r  s    r-   get_image_encoder_projectionz!CLIP.get_image_encoder_projection  s$    $++'899{{r.   c                    | j                  |      j                  | j                        }|| j                  j                  | j                        z   }|j	                  ddd      }| j                  |      }|j	                  ddd      }| j                  |      j                  | j                        }|t        j                  |j                  d         |j                  d      f   | j                  z  }|S )Nr   r   rD   rY   rT   )r   r   rn   rH   rj   r   r   r9   arangerq   argmaxr   )r)   textr/   s      r-   encode_textzCLIP.encode_text  s      &++DJJ7))..tzz::IIaAQIIaAMM!!!$**- ell1771:&(;;<t?S?SSr.   c                 &   | j                  |      }| j                  |      }||j                  dd      z  }||j                  dd      z  }| j                  j	                         }||z  |j                         z  }|j                         }||fS )Nr   TrQ   )r  r  normr   expt)r)   r  r  image_featurestext_featuresr   logits_per_imagelogits_per_texts           r-   r4   zCLIP.forward  s    **51((. (.*=*=!T*=*RR%(:(:q$(:(OO &&**,&7-//:KK*,,.  00r.   )r6   r7   r8   r   r   r   r   r   r   propertyrn   r  r9   r:   r  r   rF   r  r  r4   r;   r<   s   @r-   r   r   0  s    5%5% $'5% !&eCc3,>&?&D E	5%
  #5% %(5% "%5% !5% %(5% %(5% &)5%nV: . .3FELL F bll  
1r.   r   modelc                 ,    d }| j                  |       y)z+Convert applicable model parameters to fp16c                    t        | t        j                  t        j                  t        j                  f      rr| j
                  j                  j                         | j
                  _        | j                  3| j                  j                  j                         | j                  _        t        | t        j                        rNg dD cg c]  }| d	 c}dddD ]0  }t        | |      }||j                  j                         |_        2 dD ]=  }t        | |      st        | |      }||j                  j                         |_        ? y c c}w )N)inqkv_proj_weightr^   r_   r`   )r   r   )r   r   Conv1dr   rI   rr   datahalfr   r   getattrhasattr)lsattrtensorr   s        r-   _convert_weights_to_fp16z1convert_weights.<locals>._convert_weights_to_fp16  s   a"))RYY		:;HHMM..0AHHMvv!ffkk..0a../r7LM!aS-Mr~r_griqr 5 D)%"(++"2"2"4FK5
 0 	1Dq$q$'# $		 0DI		1 Ns   EN)apply)r!  r2  s     r-   convert_weightsr4    s    1$ 
KK()r.   
state_dictc                    d| v }|r| d   j                   d   }t        | j                         D cg c](  }|j                  d      s|j	                  d      s'|* c}      }| d   j                   d   }t        | d   j                   d   dz
  d	z        }||z  }nd
D cg c]  t        t        fd| D                    ! }	}t        |	      }| d   j                   d   }t        | d   j                   d   dz
  d	z        }
d }|
dz  dz   | d   j                   d   k(  sJ |
dz  }| d   j                   d   }| d   j                   d   }| d   j                   d   }| d   j                   d   }|dz  }t        t        d | D                    }t        ||||||||||
      }dD ]
  }|| v s| |=  t        |       |j                  |        |j                         S c c}w c c}w )Nzvisual.projzvisual.conv1.weightr   zvisual.z.attn.in_proj_weightrY   zvisual.positional_embeddingr   rE   )r   rD   r   r   c              3   p   K   | ]-  }|j                  d        s|j                  d      d    / yw)zvisual.layer.rD   N
startswithsplit).0r&  bs     r-   	<genexpr>zbuild_model.<locals>.<genexpr>  s2     hAq||VbcdbeTfGgQhs   66zvisual.layer1.0.conv1.weightz$visual.attnpool.positional_embeddingrD   r   r   rH   ztoken_embedding.weightzln_final.weightr   c              3   h   K   | ]*  }|j                  d       s|j                  d      d    , yw)ztransformer.resblocksr8  rD   Nr9  )r<  r&  s     r-   r>  zbuild_model.<locals>.<genexpr>  s(      nQUlHma ns   22)r   r   r   )rq   lenkeysr:  r   roundsetr   r   r4  load_state_dicteval)r5  vitr   r&  r   r   	grid_sizer   r=  countsoutput_widthr@   r   r   r   r   r   r!  rV   s           `          r-   build_modelrJ    sB   
:
%C
!"78>>qA
(9|1Q\\)=TYZYcYcdzY{Q|}&'<=CCBG:&CDJJ1MPQQVYYZ	,y8s  AnoChhhi  A  Af!"@AGGJj)OPVVWXY\]]beef q 1$
3Y(Z(`(`ab(ccc'",,-33A6I 67==a@N45;;A>J"#45;;A>)R/S n* nno-7H
$57HJ\E D  *3  E	*%::<A }
 As   G"G"G"#$G')collectionsr   typingr   r   numpyr   r9   torch.nn.functionalr   
functionalro   interpolater   Moduler
   r>   r   r   r   r   r   r   r   r4  dictrJ  rh   r.   r-   <module>rS     s    #       9- -`3bii 3lARYY AH# #,		 ,
$RYY $N!")) !7		 7tJ1299 J1Z*299 *0%D %r.   