o
    ßjd/  ã                   @   s²  d Z ddlZddlZddlZddlmZ ddlm  mZ ej	 
dd¡Zej	 
dd¡ZdZdZd	Zd	Zd	Zd
Zd	ZdZdZee e ZdZdZd"dd„ZG dd„ dejƒZG dd„ dejƒZedkrÕe ej  ¡ rkdnd¡Zedd !e¡ "¡ Z#e$dd„ e# %¡ D ƒƒZ&e'de&d›ƒ e (ddee¡ !e¡Z)e (dd¡ !e¡e Z*e +¡  e#e)e*ƒZ,W d  ƒ n1 sµw   Y  e, -¡ D ]\Z.Z/e0e/dƒrÔe'd e.› d!e1e/j2ƒ› ƒ q¾dS dS )#u  DINO + per-timestep query MLP with AdaLN-Zero(t) conditioning.

Cameron's redesign (2026-05-20): keep the volume formulation but move *all* the
nonlinearity into the per-timestep query, so the spatial scoring stays a cheap
dot product. Architecturally this is cross-attention from per-timestep query
tokens to the (factored) volume features.

Computation graph
â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
  rgb â†’ DINO â†’ patch tokens + cls
      â†’ 1Ã—1 conv refine â†’ F (B, d_feat, H, W)            # spatial feature map
      â†’ cls token        (B, embed_dim)

  eef_feat = F[b, :, y_eef, x_eef]                        # current EEF feature
  q_input  = Linear(concat(eef_feat, cls)) â†’ (B, d_model)
  # 5-layer residual MLP with AdaLN-Zero on sin(t), applied per timestep
  # (B, T) copies of the same input; AdaLN(t) differentiates per-step
  penult = MLP_with_AdaLN_t(q_input)                       # (B, T, d_model)

  q_F, q_z, q_t = split(q_head(penult), [d_feat, d_sin_z, d_sin_t])
  gripper       = grip_head(penult)                        # (B, T, n_grip)
  rotation      = rot_head(penult)                         # (B, T, n_rot) â€” 1D PCA

  # Spatial scoring: dot product of q with the *implicit* volume
  # V[b, t, z, y, x] = concat(F[y,x], sin_z[z], sin_t[t]) â€” never materialised.
  # The concat structure lets the dot product factor:
  score_yx = einsum('btc, bchw -> bthw', q_F, F)           # (B, T, H, W)
  score_z  = einsum('btc, zc -> btz',    q_z, z_sin)       # (B, T, Z)
  score_t  = einsum('btc, tc -> bt',     q_t, t_sin)       # (B, T)  â€” constant per (b,t)

  volume_logits = (score_yx[:, :, None] + score_z[..., None, None]
                                       + score_t[..., None, None, None])     # (B, T, Z, H, W)

The 6-D feature volume (B, T, Z, H, W, d) is never instantiated; only the
5-D scalar logit volume is, which is what the CE loss needs anyway.

Memory usage stays trunk-bound â€” the head adds <100 MB at any reasonable B,
because the MLP runs B*T times (not B*T*Z*H*W like the FiLM-per-voxel design).
é    NÚDINO_REPO_DIRz/data/cameron/keygrip/dinov3ÚDINO_WEIGHTS_PATHzU/data/cameron/keygrip/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthiø  é2   é    é8   é   é€   é   ç     ˆÃ@c                 C   s–   t j| t jd}t  t jd|dt jdt |¡|   ¡}t  | |¡}t  | d¡| ¡|d d …dd d…f< t  	| d¡| ¡|d d …dd d…f< |S )N)Údtyper   é   é   )
ÚtorchÚarangeÚfloat32ÚexpÚmathÚlogÚzerosÚsinÚ	unsqueezeÚcos)ÚnÚdimÚbaseÚposÚdivÚpe© r   ú4/data/cameron/para/libero/model_dino_volume_query.pyÚsinusoidal_features?   s   *&&r    c                       s*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )ÚAdaLNZeroMLPBlockuQ   DiT-style block on (N, d): LN â†’ FiLM(Î³,Î²) â†’ MLP(dâ†’4dâ†’d) â†’ +Î±Â·resid.é   c                    s|   t ƒ  ¡  tj|dd| _t |d| ¡| _tj | jj	¡ tj | jj
¡ t t ||| ¡t ¡ t || |¡¡| _d S )NF)Úelementwise_affineé   )ÚsuperÚ__init__ÚnnÚ	LayerNormÚnormÚLinearÚ	cond_projÚinitÚzeros_ÚweightÚbiasÚ
SequentialÚGELUÚmlp)ÚselfÚdÚd_condZ	mlp_ratio©Ú	__class__r   r   r&   J   s   

ýzAdaLNZeroMLPBlock.__init__c                 C   sJ   |   |¡jddd\}}}|  |¡}|d|  | }|  |¡}|||  S )Nr$   éÿÿÿÿ©r   g      ð?)r+   Úchunkr)   r2   )r3   ÚxÚcondÚgÚbÚaÚhr   r   r   ÚforwardV   s
   

zAdaLNZeroMLPBlock.forward)r"   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r&   rA   Ú__classcell__r   r   r6   r   r!   H   s    r!   c                       sN   e Zd Zeeeeeee	e
eeeddddf‡ fdd„	Zdd„ Zdd
d„Z‡  ZS )ÚDinoVolumeQueryFTÚper_axisr   c              	      sÐ  t ƒ  ¡  |ˆ_|ˆ_|ˆ_|ˆ_|ˆ_|ˆ_|ˆ_|| | ˆ_	|
ˆ_
|ˆ_|ˆ_|dv s0J ‚|ˆ_|ˆ_tjjtddtdˆ_|rNˆj ¡ D ]}d|_qHˆjjˆ_t tjˆjˆjdddt ¡ tjˆj|dd	¡ˆ_ˆjd
t||ƒdd ˆjdt||ƒdd t |ˆ ¡ˆ_ |r‘|ˆj nˆj}t |ˆj	¡ˆ_!t "‡ ‡fdd„t#|	ƒD ƒ¡ˆ_$t %ˆj	¡ˆ_&t ˆj	ˆj	¡ˆ_'t ˆj	|¡ˆ_(|dkrÍd| }n|dkrÔ|}n
|dksÜJ dƒ‚|}t ˆj	|¡ˆ_)d S )N)rH   Ú1d_pcaÚkmeansZdinov3_vits16plusÚlocal)ÚsourceÚweightsFr$   r   )Úkernel_sizeÚpadding)rN   Úz_sin)Ú
persistentÚt_sinc                    s   g | ]}t ˆjˆ ƒ‘qS r   )r!   Úd_model)Ú.0Ú_©r5   r3   r   r   Ú
<listcomp>‘   s    ÿz,DinoVolumeQuery.__init__.<locals>.<listcomp>rH   rI   r   z3rotation_mode='kmeans' requires kmeans_n_clusters>0)*r%   r&   Ún_windowÚn_height_binsÚn_gripper_binsÚ
n_rot_binsÚd_featÚd_sin_zÚd_sin_trS   Ú
image_sizeÚ	pred_sizeÚuse_eefÚrotation_modeÚkmeans_n_clustersr   ÚhubÚloadr   r   ÚdinoÚ
parametersÚrequires_gradÚ	embed_dimr'   r0   ÚConv2dr1   ÚrefineÚregister_bufferr    r*   Út_cond_projÚ
input_projÚ
ModuleListÚrangeÚblocksr(   Ú
final_normÚq_headÚ	grip_headÚrot_head)r3   rX   rY   rZ   r[   r\   r]   r^   r5   Ún_blocksr_   r`   Zfreeze_backbonera   rb   rc   ÚpÚin_dimZrot_out_dimr6   rV   r   r&   _   sZ   

ÿ
ý
ÿ
zDinoVolumeQuery.__init__c                 C   s  |j d }| j |¡\}\}}| jjD ]}| jjr!| jj||dnd }|||ƒ}q| jjr[| j |d d …d | jjd …f ¡}| j |d d …| jjd d …f ¡}	t	j
||	gdd}n| j |¡}|d d …df }
|d d …| jjd d …f }| |||| j¡ dddd¡ ¡ }||
fS )Nr   )ÚHÚWr   r9   r$   r   )Úshaperf   Zprepare_tokens_with_masksrq   Z
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensr)   r   ÚcatÚreshaperi   ÚpermuteÚ
contiguous)r3   r;   ÚBZx_tokensZH_pZW_pÚblkZropeZx_clsZx_patÚclsÚpatchr   r   r   Ú_extract_dino_features¢   s   
$$"z&DinoVolumeQuery._extract_dino_featuresNc           #      C   sb  |j d }| j}| j}| j}|  |¡\}}	tj|| j| jfddd}
|  |
¡}|j dd… \}}| j	rt|d || j
   ¡  d|d ¡}|d	 || j
   ¡  d|d ¡}tj||jd
}||dd…||f }|  tj||	gdd¡}n|  |	¡}| d¡ |||¡ || |¡}|  | j¡}| d¡ ||d¡ || d¡}|}| jD ]}|||ƒ}q¤|  |¡}| |||¡}|  |¡}|  |¡}| jdkrÔ|  |¡ ||d| j¡}n|  |¡}|dd| j…f }|d| j| j| j  …f }|d| j| j  d…f }t !d||¡}t !d|| j"¡} t !d|| j¡}!| d¡|  d¡ d¡ |! d¡ d¡ d¡ }"|"|||dœS )u    rgb: (B, 3, IMG, IMG). start_pix: (B, 2) â€” current EEF pixel in IMG-coords.
           kp_zyx: unused (kept so the train loop's call signature stays uniform).r   ÚbilinearF)ÚsizeÚmodeÚalign_cornerséþÿÿÿN).r   r   ).r   )Údevicer8   r9   rH   r$   .zbtc, bchw -> bthwzbtc, zc   -> btzzbtc, tc   -> btr   )Úvolume_logitsÚgripper_logitsÚrotation_logitsÚpixel_feats)#r{   rX   rY   rS   r„   ÚFÚinterpolater`   rk   ra   r_   ÚlongÚclampr   r   rŠ   rn   r|   r   Úexpandr}   rm   rR   rq   rr   Úviewrs   rt   rb   ru   r[   r\   r]   ÚeinsumrP   )#r3   ÚrgbÚ	start_pixZkp_zyxr€   ÚTÚZr4   rƒ   r‚   Zfeat_upZF_featry   rz   ÚsxÚsyZb_idxZeef_featÚq_inZq_in_btZcond_tZcond_btr@   r   ZpenultZ	q_spatialÚgripperÚrotationZq_FZq_zZq_tZscore_yxZscore_zZscore_tr‹   r   r   r   rA   ³   s^   
ÿ
""
  





ÿþÿüzDinoVolumeQuery.forward©N)rB   rC   rD   ÚN_WINDOWÚN_HEIGHT_BINSÚN_GRIPPER_BINSÚ
N_ROT_BINSÚD_FEATÚD_SINZÚD_SINTÚD_CONDÚN_BLOCKSÚIMG_SIZEÚ	PRED_SIZEr&   r„   rA   rF   r   r   r6   r   rG   ^   s    ùCrG   Ú__main__ÚcudaÚcpu)rX   c                 c   s    | ]
}|j r| ¡ V  qd S rŸ   )rh   Únumel)rT   rw   r   r   r   Ú	<genexpr>ù   s   € r¯   zTrainable: ú,r   r$   r{   z  z: )r
   )3rE   Úosr   r   Útorch.nnr'   Útorch.nn.functionalÚ
functionalr   ÚenvironÚgetr   r   r©   r    r¡   r¢   r£   rª   r¤   r¥   r¦   ZD_MODELr§   r¨   r    ÚModuler!   rG   rB   rŠ   r¬   Úis_availableÚtoÚevalÚmÚsumrg   Ún_tÚprintÚrandr–   ÚspÚno_gradÚoutÚitemsÚkÚvÚhasattrÚtupler{   r   r   r   r   Ú<module>   sN    '
	 
ÿ
€õ	