o
    Ôjm&  ã                   @   sþ  d Z ddlZddlZddlZddlZddlmZ ddlm  mZ	 ej
 dd¡Zej
 dd¡ZdZdZd	Zd
ZdZdZG dd„ dejƒZedkrûe ej ¡ rQdnd¡Zeƒ  e¡ ¡ Zedd„ e ¡ D ƒƒZe ded›ƒ e !ddee¡ e¡Z"e #ddgddgg¡ e¡Z$e %¡  ee"e$ƒZ&W d  ƒ n1 sœw   Y  e& '¡ D ]@\Z(Z)e*e)dƒr¼e de(› de+e)j,ƒ› ƒ q¥e-e)e.e+fƒråe de(› de/e)ƒ› dƒ e)råe*e)d dƒråe de+e)d j,ƒ› ƒ q¥ej0dkrýe d ej 1¡ d! d"›d#ƒ dS dS dS )$u  DINOv3 + EEF-patch attention as heatmap.

Per Cameron 2026-05-18: instead of vanilla heatmap prediction via 1Ã—1 conv, have the
EEF-projecting patch ATTEND to all other patches and interpret the attention output
as the heatmap response. Architectural shift: the heatmap IS the attention map, not
a separate dense prediction.

Forward outline:
  1. DINOv3 forward â†’ patch tokens P âˆˆ R^(B Ã— N Ã— D), N = grid Ã— grid.
  2. Compute current EEF patch index from `start_pixel` (passed in or = first GT pixel
     at training time). Extract that token p_eef âˆˆ R^(B Ã— D).
  3. Per timestep t, the query is q_t = q_proj(p_eef) + t_query[t], where t_query is a
     learnable embedding âˆˆ R^(T Ã— D). Keys/values are the full patch token stack.
  4. Attention scores s_t = (q_t Â· K^T) / âˆšD âˆˆ R^(B Ã— N). These are the per-timestep
     heatmap logits, reshaped to (h, w).
  5. For the volume formulation we ALSO need a height distribution per pixel. We add a
     small per-pixel MLP that, conditioned on the patch token AND timestep t, outputs
     N_HEIGHT_BINS logits. So the joint volume logit is:
        vol[b, t, z, h, w] = score_t[b, h, w] + height_t_pixel[b, t, z, h, w]
     This separates the 2D attention from the height; each timestep still gets its own
     z distribution, conditioned on the visited patch.

Inputs:  rgb (B, 3, IMG, IMG) in [0, 1], start_pixel (B, 2) in 504-space (training: GT
         current EEF; inference: predicted current pixel from prior step or a heuristic).
Outputs: volume_logits (B, T, Z, h_out, w_out)
é    NÚDINO_REPO_DIRz/data/cameron/keygrip/dinov3ÚDINO_WEIGHTS_PATHzU/data/cameron/keygrip/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   iÀ  é   é    )g
×£p=
ß?gÉv¾Ÿ/Ý?g–C‹lçûÙ?)gZd;ßOÍ?gyé&1¬Ì?gÍÌÌÌÌÌÌ?c                       sV   e Zd Zeeeddfdededededef
‡ fdd	„Zd
d„ Zdd„ Z	dd„ Z
‡  ZS )ÚDinoEefAttnModelé   é   Ún_windowÚn_height_binsÚ
image_sizeÚn_headsÚattn_pred_upsamplec                    sV  t ƒ  ¡  || _|| _|| _t| _|t | _| j| | _t	t
jvr(t
j dt	¡ tjjt	ddtd| _t| jddƒ| _| j}t t ||¡¡| _tjj| jdd || dksYJ ‚|| _|| | _tj||d	d
| _tj||d	d
| _t t || d¡t  ¡ t d|¡¡| _!| j"dt #t$¡ %dddd¡d	d | j"dt #t&¡ %dddd¡d	d d S )Nr   Zdinov3_vits16plusÚlocal)ÚsourceÚweightsÚ	embed_dimi€  g{®Gáz”?)ÚstdF)Úbiasé€   Úmeané   é   )Ú
persistentr   )'ÚsuperÚ__init__r
   r   r   ÚDINO_PATCH_SIZEZ
patch_sizeÚgridÚ	pred_sizer   ÚsysÚpathÚinsertÚtorchÚhubÚloadr   ÚdinoÚgetattrr   ÚnnÚ	ParameterÚzerosÚt_queryÚinitÚtrunc_normal_r   Úhead_dimÚLinearÚq_projÚk_projÚ
SequentialÚGELUÚheight_headÚregister_bufferÚtensorÚIMAGENET_MEANÚviewÚIMAGENET_STD)Úselfr
   r   r   r   r   ÚD©Ú	__class__© ú0/data/cameron/para/libero/model_dino_eef_attn.pyr   -   s6   



ÿ

þ"&zDinoEefAttnModel.__init__c                 C   s   || j  | j S ©N)r   r   )r9   Zrgb01r=   r=   r>   Ú
_normalizeV   s   zDinoEefAttnModel._normalizec           
      C   s˜   |j \}}}| j}|dd…df | j |  ¡  d|d ¡}|dd…df | j |  ¡  d|d ¡}|| |  |dd¡ |d|¡}	| d|	¡ d¡S )zRpatch_tokens: (B, N, D); start_pixel: (B, 2) in image_size coords; returns (B, D).Nr   r   )	Úshaper   r   ÚlongÚclampr7   ÚexpandÚgatherÚsqueeze)
r9   Úpatch_tokensÚstart_pixelÚBÚNr:   ÚgÚuÚvÚidxr=   r=   r>   Ú_gather_eef_tokenY   s   ** z"DinoEefAttnModel._gather_eef_tokenc                 C   s¾  |j d }|j d }|| jkrtj|| j| jfddd}| ¡ | j|  }|  |¡}tj ¡ r2tj	ntj
}tj|jj|d | j |¡}W d  ƒ n1 sOw   Y  t|tƒrc| d| d	¡¡}	n|}	|	 tj¡}	|	j d
 }
| j}| j }}|  |	|¡}|  |¡ d
¡| j d¡ }|  |	¡}|j \}}}| ||| j| j¡ ddd
d¡}| ||
| j| j¡ dddd
¡}t  d||¡t! "| j¡ }|j#d
d}| ||||¡}tj|| j$| j$fddd}|	 ddd
¡ %||||¡}tj|| j$| j$fddd}| dddd
¡ %|d|¡}tj&||j|j'd}| d
¡ (||d|¡}| d
|d
|¡ (|||j d
 |¡}tj)||gdd}|  *|¡}| ||| j$| j$| j+¡}| dd
ddd¡}| d¡| }||d||	gdœS )zWrgb: (B, 3, *, *) in [0, 1]. start_pixel_504: (B, 2) GT current EEF pixel in 504-space.r   éÿÿÿÿÚbilinearF)ÚsizeÚmodeÚalign_corners)Údevice_typeÚdtypeNZx_norm_patchtokensZ	x_prenormr   r	   r   zbhtd, bhdn -> bhtn)Údim)ÚdevicerV   r   )Úvolume_logitsZattn_scores_2dÚ
pred_depthZpixel_featsÚ
dino_feats),rA   r   ÚFÚinterpolateÚfloatr@   r"   ÚcudaÚis_bf16_supportedÚbfloat16Úfloat16ÚautocastrX   Útyper%   Zforward_featuresÚ
isinstanceÚdictÚgetÚtoÚfloat32r   r   rO   r/   Ú	unsqueezer*   r0   r7   r   r-   ÚpermuteÚeinsumÚmathÚsqrtÚsumr   ÚreshapeÚeyerV   rD   Úcatr3   r   )r9   ÚrgbZstart_pixel_504rI   Úin_sizerH   ÚxZautocast_dtypeÚfeatsrG   rJ   r:   ÚhÚwZp_eefZq_tZk_tÚBnÚTÚ_ÚscoresZ	scores_2dZfeat_2dZ	feat_flatZt_onehotZ
feat_tiledZ	toh_tiledÚjointZz_logitsÚvolr=   r=   r>   Úforwardb   sd   


ÿ
ÿ



  ÿÿ"
ûzDinoEefAttnModel.forward)Ú__name__Ú
__module__Ú__qualname__ÚN_WINDOWÚN_HEIGHT_BINSÚIMG_SIZEÚintr   r@   rO   r   Ú__classcell__r=   r=   r;   r>   r   ,   s    þÿÿþ)	r   Ú__main__r_   Úcpuc                 c   s    | ]
}|j r| ¡ V  qd S r?   )Úrequires_gradÚnumel)Ú.0Úpr=   r=   r>   Ú	<genexpr>¯   s   € rŽ   zTrainable: ú,r	   r   g      i@g     Àr@rA   z  z: z: list(ú)z    first: zpeak: g    eÍÍAz.2fz GB)2Ú__doc__Úosr   rm   r"   Útorch.nnr'   Útorch.nn.functionalÚ
functionalr\   Úenvironrg   r   r   r   r…   rƒ   r„   r6   r8   ÚModuler   r€   rX   r_   Úis_availablerh   ÚevalÚmro   Ú
parametersÚn_tÚprintÚrandrs   r5   ÚspÚno_gradÚoutÚitemsÚkrM   ÚhasattrÚtuplerA   re   ÚlistÚlenrd   Úmax_memory_allocatedr=   r=   r=   r>   Ú<module>   sN    ÿ 
ÿ
*€
 ð