o
    j;                     @   sF  d Z ddlZddlZddlZddlZddlmZ ddlm  mZ	 ej
ddZej
ddZdZdZd	Zd
ZdZdZdZdedefddZG dd dejZedkreej r\dndZdD ]?ZdD ]8Zeeede  Z!e"dd e!# D Z$e!% Z&e'deddedde$dd e&j(d!d") * d# qeqadS dS )$u  Vanilla DINOv3 + factored KV volume head (v3-style architecture but DINO backbone).

Ablation flags:
  height_enc: 'sin' | 'learned' | 'sin_plus_learned'
  time_enc:   'sin' | 'learned' | 'sin_plus_learned'

Per Cameron 2026-05-19: try the volume projection approach with DINO features and all
combinations of height/time encoding.

Architecture:
  DINOv3 ViT-S/16 → patch tokens (B, N, D), upsampled to (B, 48, h_out, w_out).
  Per-pixel feature F is the "value" stream.
  Key per (t, z): h_emb[z] + t_emb[t] ∈ R^48.
  L2-normalize F and keys → cosine similarity → scaled by learnable logit_scale.
  Volume logits: einsum(F, K) → (B, T, Z, h_out, w_out).
    NDINO_REPO_DIRz/data/cameron/keygrip/dinov3DINO_WEIGHTS_PATHzU/data/cameron/keygrip/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth   i         0   )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?n_valuesdimc                 C   s   |d dksJ |d }t j| t jdt| d d }dt j|t jd }|d|d tj }t jt |t 	|gddS )zCNeRF-style sinusoidal positional encoding. Returns (n_values, dim).   r   )dtype   g       @r	   )
torcharangefloat32max	unsqueezemathpicatsincos)r   r	   LposZfreqsangles r   1/data/cameron/para/libero/model_dino_volume_kv.pysinusoidal_features"   s   r   c                       sh   e Zd Zeeeeddddfdededededed	ed
edef fddZ	dd Z
dd Zdd Z  ZS )DinoVolumeKVr      dinov3_vits16plusn_windown_height_binskey_dim
image_size
height_enctime_enchead_hiddendino_variantc	                    s.  t    |dv sJ |dv sJ || _|| _|| _|| _|| _|| _|| _t	| _
|t	 | _| jd | _ttjvr@tjdt tdd}	|	|t}
tjjt|d|
d| _t| jdd	| _ttj| j|d
ddt tj||d
ddt t||d| _t|| _d|v rt ||| _!|dkrdnd}tj"j#| j!j$|d| d
| d d|v r| j%dt&||dd d|v rt ||| _'|dkrdnd}tj"j#| j'j$|d| d
| d d|v r| j%dt&||dd t(t)d| _*| j%dt)t+,dd
dddd | j%dt)t-,dd
dddd d S )Nr   learnedsin_plus_learnedr
   r   zQ/data/cameron/keygrip/dinov3/weights/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth)r!   dinov3_vitl16local)sourceweights	embed_dimi     r   )paddingr+   r,   g{Gz?g?)stdabr   	t_emb_sinF)
persistent	h_emb_singHzG@meanr5   ).super__init__r"   r#   r$   r%   r&   r'   r)   DINO_PATCH_SIZEZ
patch_sizegrid	pred_sizer   syspathinsertr   getr   hubloaddinogetattrr1   nn
SequentialConv2dGELUrefine	LayerNorm
pixel_norm	Embeddingt_emb_learnedinittrunc_normal_weightregister_bufferr   h_emb_learned	Parametertensorlogit_scaleIMAGENET_MEANviewIMAGENET_STD)selfr"   r#   r$   r%   r&   r'   r(   r)   Zvariant_weightswZinit_std	__class__r   r   r=   -   sb   


  "&zDinoVolumeKV.__init__c                 C   s   || j  | j S N)r;   r5   )r]   Zrgb01r   r   r   
_normalizej   s   zDinoVolumeKV._normalizec                 C   s|   | j dkr	| j}n| j dkr| jj}n| j| jj }| jdkr#| j}n| jdkr-| jj}n| j| jj }|d|d S )Nr   r+   r   r   )r'   r8   rQ   rT   r&   r:   rV   r   )r]   Zt_totalZh_totalr   r   r   _build_keysm   s   zDinoVolumeKV._build_keysc                 C   s  |j d }|j d }|| jkrtj|| j| jfddd}| |}tj r)tjntj	}tj
|jj|d | j|}W d    n1 sFw   Y  t|trZ|d|d}n|}|tj}|j d }| j }	}
|dd	d
|||	|
}tj|| j| jfddd}| |}| |dd	dd
ddd
d	}||jd
ddd  }|  }||jdddd  }| jjtdd  }t!d||| }|d ||gdS )Nr   r   bilinearF)sizemodealign_corners)device_typer   Zx_norm_patchtokensZ	x_prenormr
   r   r2   T)r	   keepdimgư>g      Y@)r   zbchw, tzc -> btzhw)volume_logits
pred_depthpixel_feats
dino_feats)"shaper%   Finterpolaterb   r   cudais_bf16_supportedbfloat16float16autocastdevicetyperG   Zforward_features
isinstancedictrD   tor   r?   permutereshaper@   rM   rO   normrc   rY   clampr   logexpeinsum)r]   rgbBin_sizexZautocast_dtypefeatsZpatch_tokensDhr^   Zfeat_2drl   Zf_lnZf_unitkeysZ	keys_unitscalerj   r   r   r   forwardy   sB   







"zDinoVolumeKV.forward)__name__
__module____qualname__N_WINDOWN_HEIGHT_BINSKEY_DIMIMG_SIZEintstrr=   rb   rc   r   __classcell__r   r   r_   r   r   ,   s,    =r   __main__rq   cpur*   )r&   r'   c                 c   s    | ]
}|j r| V  qd S ra   )requires_gradnumel).0pr   r   r   	<genexpr>   s    r   zh=z<18z t=z
 | params=z>11,z  keys_mean_norm=r   r   z.3f)+__doc__osrA   r   r   torch.nnrI   torch.nn.functional
functionalro   environrD   r   r   r>   r   r   r   r   rZ   r\   r   r   Moduler   r   rv   rq   is_availableZh_encZt_encrz   evalmsum
parametersn_trc   r   printr}   r;   itemr   r   r   r   <module>   s:    
t: