
     hb)                         d dl Z d dlmZ de j                  j                  j
                  _        d dlmZ d dl	m
Z
mZmZ d dlmZmZ d dlmZ  G d dej$                        Zy)	    NT)partial)BlockDecoderBlock
PatchEmbed)get_2d_sincos_pos_embedRoPE2D)
RandomMaskc                        e Zd Zdddddddddd eej
                  d	
      ddf fd	ZddZd Zd Z	d Z
d Zd Zd ZddZddZd ZddZd Z xZS )CroCoNet      g?      i         gư>)epsTcosinec                 ~   t         t        |           | j                  |||       | j	                  | j
                  j                  |       || _        |dk(  rt        |t        | j
                  j                  dz        d      }| j                  dt        j                  |      j                                t        |t        | j
                  j                  dz        d      }| j                  dt        j                  |      j                                d | _        ng|j                  d      rHd | _        d | _        t$        t'        d      t        |t)        d      d        }t%        |	      | _        nt+        d
|z         || _        || _        t1        j2                  t5        |      D cg c]  }t7        |||
d|| j                         c}      | _         ||      | _        | j=                  |       | j?                  |||	||
||       | jA                  ||       | jC                          y c c}w )Nr         ?r   )n_cls_tokenenc_pos_embeddec_pos_embedRoPEzICannot find cuRoPE2D, please install it following the README instructions)freqzUnknown pos_embed T)qkv_bias
norm_layerrope)"superr   __init___set_patch_embed_set_mask_generatorpatch_embednum_patches	pos_embedr   intregister_buffertorch
from_numpyfloatr   
startswithr   r   r   ImportErrorlenNotImplementedError	enc_depthenc_embed_dimnn
ModuleListranger   
enc_blocksenc_norm_set_mask_token_set_decoder_set_prediction_headinitialize_weights)selfimg_size
patch_size
mask_ratior/   r.   enc_num_headsdec_embed_dim	dec_depthdec_num_heads	mlp_ratior   norm_im2_in_decr$   r   r   r   i	__class__s                     @/home/cameronsmith/repos/controll3r/dust3r/croco/models/croco.pyr   zCroCoNet.__init__   s     	h&( 	h
MB 	  !1!1!=!=zJ"h3M3tGWGWGcGcegGgChvwxM  %2B2B=2Q2W2W2YZ3M3tGWGWGcGcegGgChvwxM  %2B2B=2Q2W2W2YZDI!!&)!%D!%D~[1|%}}3v;<01DD)DI%&:9&DEE #*--9%)' -	DU_fjfofop)' ( #=1 	]+ 	-yR[]gixy 	!!-< 	!)'s   4"H:c                 *    t        ||d|      | _        y )N   )r   r"   )r9   r:   r;   r/   s       rE   r    zCroCoNet._set_patch_embedV   s    %h
A}M    c                 &    t        ||      | _        y )N)r	   mask_generator)r9   r#   r<   s      rE   r!   zCroCoNet._set_mask_generatorY   s    (jArH   c                 b    t        j                  t        j                  dd|            | _        y )N   )r0   	Parameterr'   zeros
mask_token)r9   r>   s     rE   r5   zCroCoNet._set_mask_token\   s    ,,u{{1a'GHrH   c                    || _         || _        t        j                  ||d      | _        t        j
                  t        |      D cg c]  }t        |||d||| j                          c}      | _	         ||      | _
        y c c}w )NTbias)rA   r   r   norm_memr   )r?   r>   r0   Lineardecoder_embedr1   r2   r   r   
dec_blocksdec_norm)	r9   r/   r>   r@   r?   rA   r   rB   rC   s	            rE   r6   zCroCoNet._set_decoder_   s    "*YY}m$O--9%)' UYfp  |K  RV  R[  R[  \)' ( #=1	)'s   #Bc                 J    t        j                  ||dz  dz  d      | _        y )N   rG   TrQ   )r0   rT   prediction_head)r9   r>   r;   s      rE   r7   zCroCoNet._set_prediction_headk   s!     "		-Q9JQU V	rH   c                     | j                   j                          | j                  5t        j                  j
                  j                  | j                  d       | j                  | j                         y )Ng{Gz?)std)r"   _init_weightsrO   r'   r0   initnormal_apply)r9   s    rE   r8   zCroCoNet.initialize_weightso   sO    &&(??&(=(=dooSV(=(W

4%%&rH   c                 "   t        |t        j                        rt        j                  j                  j                  |j                         t        |t        j                        r8|j                  +t        j                  j                  |j                  d       y y y t        |t        j                        rUt        j                  j                  |j                  d       t        j                  j                  |j                  d       y y )Nr   g      ?)

isinstancer0   rT   r'   r^   xavier_uniform_weightrR   	constant_	LayerNorm)r9   ms     rE   r]   zCroCoNet._init_weightsw   s    a#HHMM))!((3!RYY'AFF,>!!!&&!, -?'2<<(GGaffa(GGahh, )rH   c                 n   | j                  |      \  }}| j                  || j                  d   z   }|j                         \  }}}|r@| j                  |      }	||	    j	                  |d|      }||	    j	                  |dd      }
n3|j                         \  }}}t        j                  ||ft              }	|}
|rIg }| j                  D ]  } |||
      }|j                  |        | j                  |d         |d<   |||	fS | j                  D ]  } |||
      } | j                  |      }|||	fS )a)  
        image has B x 3 x img_size x img_size 
        do_mask: whether to perform masking or not
        return_all_blocks: if True, return the features at the end of every block 
                           instead of just the features from the last block (eg for some prediction heads)
        )N.rY   dtype)r"   r   sizerJ   viewr'   rN   boolr3   appendr4   )r9   imagedo_maskreturn_all_blocksxposBNCmasksposvisoutblks                rE   _encode_imagezCroCoNet._encode_image   sF    !!%(3)D&&x00A!A''*E5&	q"a(A%[%%aQ/FFFHEAaKK1T2EFC 6N

1 mmCG,CGU?" #6N#a Ac5= rH   c                    | j                  |      }| j                  |      }|j                         \  }	}
}||}na|j                  d      }| j                  j                  |	|d      j	                  |j
                        }|j                  |	|
z  |      || <   | j                  || j                  z   }|| j                  z   }|}|}|rM|g }}| j                  D ]!  } |||||      \  }}|j                  |       # | j                  |d         |d<   |S | j                  D ]  } |||||      \  }} | j                  |      }|S )a  
        return_all_blocks: if True, return the features at the end of every block 
                           instead of just the features from the last block (eg for some prediction heads)
                           
        masks1 can be None => assume image1 fully visible 
        rL   rj   ri   )rU   rl   rO   repeattork   rm   r   rV   ro   rW   )r9   feat1pos1masks1feat2pos2rr   visf1f2ru   Nencrw   f1_Ntotalrz   out2_outr{   s                     rE   _decoderzCroCoNet._decoder   s`    ""5)&::<$q>C[[^F//((FA6999LC ::a$h2CL)***Cd(((BR#D ! tT48
d

4 ! mmCG,CG
 
  7T46	T7--$C
rH   c                    | j                   j                  d   }|j                  d   |j                  d   k(  r|j                  d   |z  dk(  sJ |j                  d   |z  x}}|j                  |j                  d   d||||f      }t	        j
                  d|      }|j                  |j                  d   ||z  |dz  dz  f      }|S )zH
        imgs: (B, 3, H, W)
        x: (B, L, patch_size**2 *3)
        r   rY   rG   shapeznchpwq->nhwpqc)r"   r;   r   reshaper'   einsum)r9   imgsphwrs   s         rE   patchifyzCroCoNet.patchify   s    
 ''*zz!}

1-$**Q-!2Cq2HH

1""ALL

1q!Q1=L>LL)1-IITZZ]AE1a4!8<I=rH   c                 h   | j                   j                  d   }t        |j                  d   dz        x}}||z  |j                  d   k(  sJ |j	                  |j                  d   |||||f      }t        j                  d|      }|j	                  |j                  d   |||z  ||z  f      }|S )zO
        x: (N, L, patch_size**2 *channels)
        imgs: (N, 3, H, W)
        r   rL   r   r   znhwpqc->nchpwq)r"   r;   r%   r   r   r'   r   )r9   rs   channelsr;   r   r   r   s          rE   
unpatchifyzCroCoNet.unpatchify   s    
 %%003
AGGAJN##A1u
""IIQWWQZAz:xPIQLL)1-yy
Ha*na*nUyVrH   c                     | j                  |d      \  }}}| j                  |d      \  }}}| j                  |||||      }	| j                  |	      }
| j                  |      }|
||fS )z
        img1: tensor of size B x 3 x img_size x img_size
        img2: tensor of size B x 3 x img_size x img_size
        
        out will be    B x N x (3*patch_size*patch_size)
        masks are also returned as B x N just in case 
        T)rq   F)r|   r   rZ   r   )r9   img1img2r   r   mask1r   r   _decfeatrz   targets               rE   forwardzCroCoNet.forward   s|     "//d/CtU++D%+@tQ--tUE4@""7+t$E6!!rH   )r   r   r   )FF)F)rG   )__name__
__module____qualname__r   r0   rf   r   r    r!   r5   r6   r7   r8   r]   r|   r   r   r   r   __classcell__)rD   s   @rE   r   r      s     "!"!#BLLd;!%#="~NBI
2W'-#!J#J"rH   r   )r'   torch.nnr0   backendscudamatmul
allow_tf32	functoolsr   models.blocksr   r   r   models.pos_embedr   r   models.maskingr	   Moduler    rH   rE   <module>r      sF     (,     %  9 9 < %d"ryy d"rH   