o
    rj#                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlm  mZ	 ej
dej
e ddlmZmZmZmZmZmZmZmZmZmZ dZdZdZdZdZG dd dejZe d	kre!ej"# rdd
ndZ!e $e!% Z&e'dd e&( D Z)e*de)d e+ddee$e!Z,e-ddgddgg$e!Z.e/  e&e,e.Z0W d   n1 sw   Y  e01 D ]\Z2Z3e4e3dre*de2 de5e3j6  qe!j7d
kre*dej"8 d dd dS dS dS )u  DINOv3 volume with REGRESSED abstract keys (Cameron 2026-05-19 spec).

Architecture:
  - DINOv3 backbone → per-pixel features F ∈ R^(B × key_dim × H × W)  (same as kv model)
  - Sample F at start-EEF pixel → f_start ∈ R^(B × key_dim)
  - Shared MLP trunk on f_start → 3 outputs:
        keys           (B, T, D_key)         — abstract per-timestep keys
        gripper_logits (B, T, n_grip)        — direct gripper-bin predictions
        rotation_logits (B, T, 3, n_rot)     — direct rotation-bin predictions
  - Volume values: V[b, t, z, u, v] = Linear_F(F[b,u,v]) + Linear_t(sin_t[t]) + Linear_z(sin_z[z])
                   (all D_key-dim)
  - Volume logits: keys[b, t] · V[b, t, z, u, v]  (computed efficiently via 3 einsums)

The gripper/rotation predictions DO NOT use the volume decoding path — they're regressed
directly from f_start. The volume decoding uses the same upstream f_start to produce keys.
Single shared representation; all three heads pull on it.
    N)
DINO_REPO_DIRDINO_WEIGHTS_PATHIMG_SIZEN_WINDOWN_HEIGHT_BINSKEY_DIMIMAGENET_MEANIMAGENET_STDDINO_PATCH_SIZEsinusoidal_features    8   i  c                       sn   e Zd Zeeeeeddee	f	de
de
de
de
de
de
d	ed
e
de
f fddZdd Zdd Zdd Z  ZS )DinoVolumeRegressedi   dinov3_vits16plusn_windown_height_binskey_dim_volume
image_sized_keytrunk_hiddendino_variant
n_rot_binsn_gripper_binsc
                    s  t    || _|| _|| _|| _|| _|| _|	| _t	| _
|t	 | _| jd | _ttjvr4tjdt tdd}
|
|t}tjjt|d|d| _t| jdd| _ttj| jd	d
ddt tjd	d	d
ddt td	|d| _t||| _| j dt!||dd | j dt!||dd t||| _"t||| _#tt$|t||t t||t | _%t||| | _&t|||	 | _'t||d
 | | _(| j dt)t*+dd
dddd | j dt)t,+dd
dddd d S )N   r   zQ/data/cameron/keygrip/dinov3/weights/dinov3_vitl16_pretrain_lvd1689m-8aa4cbdd.pth)r   dinov3_vitl16local)sourceweights	embed_dimi           )paddingt_sinF)
persistenth_sinmeanstd)-super__init__r   r   r   r   r   r   r   r
   Z
patch_sizegrid	pred_sizer   syspathinsertr   gettorchhubloaddinogetattrr   nn
SequentialConv2dGELUrefineLinearfeat_to_valueregister_bufferr   
t_sin_proj
z_sin_proj	LayerNormstart_trunk	keys_headgripper_headrotation_headtensorr   viewr	   )selfr   r   r   r   r   r   r   r   r   Zvariant_weightsw	__class__ 8/data/cameron/para/libero/model_dino_volume_regressed.pyr)   $   sL   

"&zDinoVolumeRegressed.__init__c                 C   s   || j  | j S N)r&   r'   )rF   Zrgb01rJ   rJ   rK   
_normalize]   s   zDinoVolumeRegressed._normalizec                 C   s  |j d }|j d | jkrtj|| j| jfddd}| |}tj r'tjntj	}tj
|jj|d | j|}W d   n1 sDw   Y  t|trX|d|d	}n|}|tj}|j d }| j }}	|dd
d||||	}
tj|
| j| jfddd}
| |
|fS )uK   rgb (B, 3, *, *) in [0, 1] → F (B, key_dim_volume, pred_size, pred_size).r   bilinearF)sizemodealign_corners)device_typedtypeNZx_norm_patchtokensZ	x_prenormr   r!   )shaper   FinterpolaterM   r0   cudais_bf16_supportedbfloat16float16autocastdevicetyper3   Zforward_features
isinstancedictr/   tofloat32r*   permutereshaper+   r9   )rF   rgbBxZautocast_dtypefeatsZpatch_tokensDhrG   Zfeat_2drJ   rJ   rK   _pixel_features`   s*   




z#DinoVolumeRegressed._pixel_featuresc                 C   s  |j d }| |\}}|j dd \}}}| j}	| j}
|t }|t }|dddf |  d|d }|dddf |  d|d }|tj||j	ddd||f }| 
|}| |||	| j}| |||	| j}| |||	d| j}| |dddddddd}| | j}| | j}td||}td||}td	||}|d|||	ddd |||	|
dd }|||||gdd
S )a6  rgb (B, 3, *, *) in [0, 1]; start_pix_504 (B, 2) GT current EEF pixel in 504-space.
        Returns:
          volume_logits   (B, T, Z, ph, pw)
          gripper_logits  (B, T, n_grip)
          rotation_logits (B, T, 3, n_rot)
          pixel_feats     (B, key_dim_volume, ph, pw)  for compatibility
        r   r!   N)r]   r    r   zbtd, bdhw -> bthwzbtd, td -> btzbtd, zd -> btz)volume_logitsgripper_logitsrotation_logitspixel_feats
dino_feats
pred_depth)rU   rk   r   r   	DA3_INPUTlongclampr0   aranger]   r@   rA   rE   r   rB   r   rC   r   r;   rc   r=   r#   r>   r%   einsum	unsqueeze)rF   re   start_pix_504rf   ro   Zdino_tokensCphpwTZsxsygxgyf_startrj   keysgriprotZf_projZt_projZz_projterm1term2term3volrJ   rJ   rK   forwardv   s>   
$$ 
"zDinoVolumeRegressed.forward)__name__
__module____qualname__r   r   r   r   D_KEY
N_ROT_BINSN_GRIP_BINSintstrr)   rM   rk   r   __classcell__rJ   rJ   rH   rK   r   #   s0    9r   __main__rX   cpuc                 c   s    | ]
}|j r| V  qd S rL   )requires_gradnumel).0prJ   rJ   rK   	<genexpr>   s    r   zTrainable: ,r   r    g      i@g     r@g     @o@rU   z  z: z
peak mem: g    eAz.2fz GB)9__doc__osr,   mathr0   torch.nnr5   torch.nn.functional
functionalrV   r-   r.   dirname__file__Zmodel_dino_volume_kvr   r   r   r   r   r   r   r	   r
   r   r   Z	PRED_GRIDr   r   rr   Moduler   r   r]   rX   is_availablera   evalmsum
parametersn_tprintrandre   rD   spno_gradoutitemskvhasattrtuplerU   r^   max_memory_allocatedrJ   rJ   rJ   rK   <module>   s>    0 

$
 