o
    YjX                     @   s  d Z ddlZddlZddlZddlmZ ddlm  mZ ej	
ddZej	
ddZdZdZd	Zd
ZdZdZG dd dejZedkreej rMdndZe e Zedd e D Zeded e ddeeeZ!e"  ee!Z#W d   n1 sw   Y  e#$ D ]@\Z%Z&e'e&drede% de(e&j)  qe*e&e+e(frede% de,e& d e&re'e&d drede(e&d j)  qej-dkredej. d d d! dS dS dS )"u+  Vanilla DINOv3 + 1×1 conv heads — the simpler baseline.

Per Cameron 2026-05-18: go back to the DINOv3 prototype, the DA3 path may be
over-engineered. Same volume formulation (T × Z × H × W joint logits) and CE loss,
just swap the heavy DA3 backbone for vanilla DINOv3 ViT-S/16.

Inputs:  rgb (B, 3, IMG, IMG) in [0, 1] — we ImageNet-normalize internally
Outputs:
  volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, h_out, w_out)
  pred_depth:    None (no depth head — keeps things simple)
  dino_feats:    list of intermediate features (for PCA viz)
    NDINO_REPO_DIRz/data/cameron/keygrip/dinov3DINO_WEIGHTS_PATHzU/data/cameron/keygrip/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth   i         )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?c                       sT   e Zd Zeeedddfdededededed	ef fd
dZ	dd Z
dd Z  ZS )DinoVanillaModel   Fdinov3_vits16plusn_windown_height_bins
image_sizehead_hiddenfreeze_backbonedino_variantc           
   
      sn  t    || _|| _|| _t| _|t | _|| _|| _	t
tjvr(tjdt
 tdd}||t}tjjt
|d|d| _|rM| j D ]}	|	d qEt| jdd| _| jd	 | _ttj| j|d
ddt tj||d
ddt | _t||| d| _tj | jj! tjj"| jj#dd | j$dt%t&'dd
dddd | j$dt%t('dd
dddd d S )Nr   zQ/data/cameron/keygrip/dinov3/weights/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth)r	   dinov3_vitl16local)sourceweightsF	embed_dimi           )paddingg{Gz?)stdmean)
persistentr   ))super__init__r
   r   r   DINO_PATCH_SIZEZ
patch_sizegridr   r   r   syspathinsertr   gettorchhubloaddino
parametersrequires_grad_getattrr   	pred_sizenn
SequentialConv2dGELUrefinevolume_headinitzeros_biasnormal_weightregister_buffertensorIMAGENET_MEANviewIMAGENET_STD)
selfr
   r   r   r   r   r   Zvariant_weightswp	__class__ //data/cameron/para/libero/model_dino_vanilla.pyr      s>   



"&zDinoVanillaModel.__init__c                 C   s   || j  | j S N)r   r   )r<   Zrgb01rA   rA   rB   
_normalizeJ   s   zDinoVanillaModel._normalizec                 C   sT  |j d }|j d | jkrtj|| j| jfddd}| |}tj r'tjntj	}tj
|jj|d | j|}W d   n1 sDw   Y  t|tr_|dd}|du r^|d	d}n|}|tj}|j d }| j }}	|dd
d||||	}
tj|
| j| jfddd}
| |
}
| |
}||| j| j| j| j}|d|
|gdS )z*rgb: (B, 3, IMG_SIZE, IMG_SIZE) in [0, 1].r   bilinearF)sizemodealign_corners)device_typedtypeNZx_norm_patchtokensZ	x_prenormr   r   )volume_logitsZ
pred_depthZpixel_featsZ
dino_feats)shaper   FinterpolaterD   r$   cudais_bf16_supportedbfloat16float16autocastdevicetyper'   Zforward_features
isinstancedictr#   tofloat32r   permutereshaper+   r0   r1   r:   r
   r   )r<   rgbBxZautocast_dtypeZfeatsZpatch_tokensDhr=   fZvol_flatZvolrA   rA   rB   forwardM   sB   






zDinoVanillaModel.forward)__name__
__module____qualname__N_WINDOWN_HEIGHT_BINSIMG_SIZEintboolstrr   rD   rc   __classcell__rA   rA   r?   rB   r      s     ,r   __main__rP   cpuc                 c   s    | ]
}|j r| V  qd S rC   )requires_gradnumel).0r>   rA   rA   rB   	<genexpr>v   s    rs   zTrainable: ,r   r   rM   z  z: z: list()z    first: zpeak: g    eAz.2fz GB)/__doc__osr    r$   torch.nnr,   Ztorch.nn.functional
functionalrN   environr#   r   r   r   ri   rg   rh   r9   r;   Moduler   rd   rU   rP   is_availablerY   evalmsumr(   n_tprintrandr]   no_gradoutitemskvhasattrtuplerM   rW   listlenrV   max_memory_allocatedrA   rA   rA   rB   <module>   sH    V


*
 