o
    •

j U  ã                	   @   s¦  d Z ddlZddlZddlZddlmZ ddlm  mZ ej	 
dd¡Zej	 
dd¡ZdZdZd	Zd
ZdZdZdZdZdZd	Zd	Zdd„ Zdd„ ZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZ G dd „ d e ƒZ!G d!d"„ d"e ƒZ"e#d#krOe $ej% &¡ rd$nd%¡Z$e!d&fe"d'ffD ]¨\Z'Z(e)d(e(› d)ƒ e'd
d*dd+d, *e$¡Z+e,d-d.„ e+ -¡ D ƒƒZ.e)d/e.d0›ƒ d1\Z/Z0e 1e/e0d2dd¡ *e$¡Z2e 3e/e0d3¡ *e$¡d Z4e 5d2¡ 6d¡ 7e/d2d2¡ *e$¡Z8e 5d¡ 6d¡ 7e/dd¡ *e$¡Z9e :g d4¢g¡ *e$¡Z;e <¡  e+e2e4e8e9e;d5Z=W d  ƒ n	1 s2w   Y  e= >¡ D ]\Z?Z@e)d6e?› d7eAe@jBƒ› ƒ q;q¨dS dS )8u.  Voxel-token AR policy (Cameron's variants B and C).

Architecture:
  Stage A (PatchEncoder, reused from v2): DINO patches per frame.
  Voxel grid (per current frame): (G_xy Ã— G_xy Ã— G_z) voxels. Each voxel feature =
      Linear(PE(xyz)) + dino_patch[x_pix, y_pix]
    where xyz is either:
      - variant B: absolute world xyz of the voxel center
      - variant C: (xyz - eef_start_xyz), the EEF-anchored delta
    The image-aligned formulation means voxel (x, y, z) projects to the same pixel as
    (x, y, 0), so dino_patch indexing is trivial.

  Stage B (cross-attention only â€” Perceiver-IO style):
    Query tokens: past H-1 EEF tokens (causal) + 1 EEF query at current frame = H tokens
    KV tokens:    H Ã— N_patches past patches (cached, small) + V current-frame voxels (large)
    Self-attention among the H query tokens, cross-attention from queries to KV.
    No voxelâ†”voxel attention. Total compute O(K Ã— V) per layer.

  Output heads: same 7-DoF (xy, height, gripper, rotation) read off the last EEF query.

Two flavors share this skeleton:
  - VoxelARPolicyAbs: PE input = world xyz
  - VoxelARPolicyRel: PE input = world xyz - eef_start_xyz_world

eef_start_xyz_world: the EEF position at the FIRST frame of the current attention context
(t = current_step - H + 1). Documented choice, see inbox spec.
é    NÚDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3ÚDINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   é8   é    é   i€  é   é   iÀ  c                 C   sV   |d }t  t j|||dt d¡|   ¡}|  d¡| }t j| ¡ | ¡ gddS )Né   ©ÚdeviceÚdtypeg     ˆÃ@éÿÿÿÿ©Údim)	ÚtorchÚexpÚarangeÚmathÚlogÚ	unsqueezeÚcatÚsinÚcos)Ú	positionsr   r   r   ÚhalfZfreqsÚangles© r   ú+/data/cameron/para/libero/model_voxel_ar.pyÚ_sincos_pe_1d6   s   &r   c           
      C   s‚   |d }t | d |||ƒ}t | d |||ƒ}t | d |||ƒ}tj|||gdd}|jd |kr?||jd  }	t |d|	f¡}|S )zYxyz: (..., 3); returns (..., dim) summing PEs for each axis (dim must be divisible by 6).é   ©.r   ©.é   ).r
   r   r   r   )r   r   r   ÚshapeÚFÚpad)
Úxyzr   r   r   ÚperÚpxÚpyÚpzÚoutr&   r   r   r   Ú_sincos_pe_3d=   s   r-   c                       s4   e Zd Zeedf‡ fdd„	Zdd„ Zdd„ Z‡  ZS )Ú_PatchEncoderTc                    s®   t ƒ  ¡  || _|t | _| jd | _td|› dƒ tjj	t
ddtd| _|r8| j ¡ D ]}d|_q-| j ¡  | jj| _| j|ksDJ ‚|| _t t |¡t ||¡¡| _d S )Nr
   z%PatchEncoder: loading DINOv3 (frozen=z)...Zdinov3_vits16plusÚlocal)ÚsourceÚweightsF)ÚsuperÚ__init__Útarget_sizeÚDINO_PATCH_SIZEÚpatches_per_sideÚ	n_patchesÚprintr   ÚhubÚloadr   r   ÚdinoÚ
parametersÚrequires_gradÚevalÚ	embed_dimÚfreeze_backboneÚnnÚ
SequentialÚ	LayerNormÚLinearÚproj)Úselfr4   Úd_modelr@   Úp©Ú	__class__r   r   r3   M   s   



 z_PatchEncoder.__init__c           	      C   sÞ  | j r…t ¡ t | j |¡\}\}}| jjD ]}| jjr$| jj||dnd }|||ƒ}q| jjr^| j |d d …d | jj	d …f ¡}| j 
|d d …| jj	d d …f ¡}tj||gdd}n| j 
|¡}|d d …| jj	d d …f  ¡ W  d   ƒ S 1 s~w   Y  d S | j |¡\}\}}| jjD ]}| jjr¡| jj||dnd }|||ƒ}q“| jjrÛ| j |d d …d | jj	d …f ¡}| j 
|d d …| jj	d d …f ¡}tj||gdd}n| j 
|¡}|d d …| jj	d d …f S )N)ÚHÚWr#   r   )r@   r   Úno_gradr;   Zprepare_tokens_with_masksÚblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensÚnormr   Údetach)	rF   ÚxÚtokensZH_pZW_pÚblkZropeZcls_nZpat_nr   r   r   Ú_dino_patches]   s.   
$$$õ$$z_PatchEncoder._dino_patchesc                 C   sZ   |j d d… \}}|j|| g|j dd … ¢R Ž }|  |¡}|  |¡}| ||| j| j¡S )Nr
   )r$   ÚviewrT   rE   r7   r?   )rF   ÚframesÚBrL   rQ   Úpatchesr   r   r   Úforwardx   s
    

z_PatchEncoder.forward)	Ú__name__Ú
__module__Ú__qualname__Ú
IMAGE_SIZEÚTRANSFORMER_Dr3   rT   rY   Ú__classcell__r   r   rI   r   r.   L   s    r.   c                       ó,   e Zd ZdZd	‡ fdd„	Zd
dd„Z‡  ZS )ÚCrossAttnBlockuT   Pre-LN cross-attention: q â† q + Attn(LN(q), LN(kv), LN(kv)); q â† q + FFN(LN(q)).r	   ç        c                    sr   t ƒ  ¡  t |¡| _t |¡| _tj|||dd| _t |¡| _t 	t 
||| ¡t ¡ t 
|| |¡¡| _d S ©NT)ÚdropoutÚbatch_first)r2   r3   rA   rC   Úln_qÚln_kvÚMultiheadAttentionÚattnÚln2rB   rD   ÚGELUÚffn©rF   rG   Ún_headsZffn_multrd   rI   r   r   r3   …   s   

þzCrossAttnBlock.__init__Nc                 C   sL   |   |¡}|  |¡}| j||||dd\}}|| }||  |  |¡¡ }|S ©NF)Ú	attn_maskÚneed_weights)rf   rg   ri   rl   rj   )rF   ÚqÚkvrp   Úq_nZkv_nÚaÚ_r   r   r   rY      s
   zCrossAttnBlock.forward©r	   rb   ©N©rZ   r[   r\   Ú__doc__r3   rY   r_   r   r   rI   r   ra   ‚   s    ra   c                       r`   )ÚSelfAttnBlockz-Pre-LN causal self-attention over EEF tokens.r	   rb   c                    sf   t ƒ  ¡  t |¡| _tj|||dd| _t |¡| _t t 	||| ¡t 
¡ t 	|| |¡¡| _d S rc   )r2   r3   rA   rC   Úlnrh   ri   rj   rB   rD   rk   rl   rm   rI   r   r   r3   ›   s   

þzSelfAttnBlock.__init__Nc                 C   sB   |   |¡}| j||||dd\}}|| }||  |  |¡¡ }|S ro   )r|   ri   rl   rj   )rF   rr   rp   rt   ru   rv   r   r   r   rY   ¥   s
   
zSelfAttnBlock.forwardrw   rx   ry   r   r   rI   r   r{   ˜   s    
r{   c                       s>   e Zd ZdZeeeeee	e
 f‡ fdd„	Ze	fdd„Z‡  ZS )ÚVoxelARHeaduò  Perceiver-IO style temporal+voxel attention with H EEF query tokens.

    Inputs to forward:
      patch_tokens:   (B, H, Np, D)   â€” past H frames' DINO patches (cached)
      eef_history_xy: (B, H, 2)        â€” past H EEF pixel coords (state EEF, teacher-forced)
      voxel_feats:    (B, V, D)        â€” voxel features (Linear(PE(xyz)) + image_feat[x_pix,y_pix])
                                          ONLY the current (last) frame's voxels. KV-only.
    Output: dict of 7-DoF logits.
    c              
      sÜ  t ƒ  ¡  || _|| _ˆ | _|| _|d | _t t	 
ˆ ¡d ¡| _t t	 
ˆ ¡d ¡| _t t	 
ˆ ¡d ¡| _t t	 
ˆ ¡d ¡| _t ‡ ‡fdd„t|ƒD ƒ¡| _t ‡ ‡fdd„t|ƒD ƒ¡| _t t ˆ ¡t ˆ ˆ ¡t ¡ t ˆ || ¡¡| _t ˆ ¡| _t t ˆ ˆ ¡t ¡ t ˆ t¡¡| _t t ˆ ˆ ¡t ¡ t ˆ d¡¡| _t t ˆ ˆ ¡t ¡ t ˆ dt ¡¡| _t	 | j¡d | j }t	 | j¡d | j }t	j||d	d
\}	}
| j dt	j!|
|	gdd "| jd¡dd d S )Nr
   g{®Gáz”?c                    ó   g | ]}t ˆ ˆƒ‘qS r   )r{   ©Ú.0rv   ©rG   rn   r   r   Ú
<listcomp>Ê   ó    z(VoxelARHead.__init__.<locals>.<listcomp>c                    r~   r   )ra   r   r   r   r   r‚   Ë   rƒ   r#   r    ç      à?Úij©ÚindexingÚpatch_xy_01r   r   F©Ú
persistent)#r2   r3   Úhistory_lenÚ	grid_sizerG   r6   r7   rA   Ú	Parameterr   ÚrandnÚ	eef_tokenÚtype_embed_eefÚtype_embed_patchÚtype_embed_voxelÚ
ModuleListÚrangeÚself_blocksÚcross_blocksrB   rC   rD   rk   Ú
readout_xyÚ	feat_normÚN_HEIGHT_BINSÚheight_headÚgripper_headÚ
N_ROT_BINSÚrotation_headr   ÚmeshgridÚregister_bufferÚstackÚreshape)rF   r‹   rŒ   rG   rn   Ún_layersr6   ÚysÚxsÚgyÚgxrI   r   r   r3   º   s6   

  þ$$(
þzVoxelARHead.__init__c              	   C   s2  |j \}}}}|j d }	|| jksJ d| j› d|› ƒ‚|j}
|j}|t|ƒ }tj||
|d}t|||
|ƒ}tjt|d |d |
|ƒt|d |d |
|ƒgdd	}| j	| j
  dd|¡ |||¡}|| | }tjt| j |¡d |d |
|ƒt| j |¡d |d |
|ƒgdd	 d
¡ |||¡}|| j | d¡  ||| |¡}|| j }tj||gdd	}tj|||
d}| tjtj|||
tjdddtdƒ¡ t| j| jƒD ]\}}|||d}|||ƒ}qÖ|d d …dd d …f }|  |¡}|  |¡}|  |¡}|  |¡ d¡}|   |¡ ddt!¡}||||dœS )Nr#   zexpected H=z, got r   r!   r
   r"   r   r   r   )r   )Údiagonalz-inf)rp   r    )Ú	xy_logitsÚheight_logitsÚgripper_logitÚrotation_logits)"r$   r‹   r   r   Úfloatr   r   r   r   r   r   rU   Úexpandrˆ   Útor   r‘   r¡   r’   ÚzerosÚmasked_fill_ÚtriuÚonesÚboolÚzipr•   r–   r—   r˜   rš   r›   Úsqueezer   rœ   )rF   Úpatch_tokensÚeef_history_xyÚvoxel_featsr4   rW   rK   ÚNpÚDÚVr   r   Zeef_01Ztime_idxZtime_peZeef_absZ	eef_protoZeef_qZ	patch_absZ
patches_kvZ	voxels_kvrs   ZcausalÚsaÚcarr   r¨   Úfr©   rª   r«   r   r   r   rY   Þ   sZ   
 þý þýý"
 ÿ


üzVoxelARHead.forward)rZ   r[   r\   rz   ÚHISTORY_LENÚ	GRID_SIZEr^   ÚTRANSFORMER_HÚTRANSFORMER_Lr]   r5   r3   rY   r_   r   r   rI   r   r}   ¯   s    
þ$r}   c                       sF   e Zd ZdZeeeddeee f‡ fdd„	Z	dd„ Z
dd	d
„Z‡  ZS )ÚVoxelFeatureBuilderuÃ  Build voxel features for the current frame.

    Given:
      patch_tokens:   (B, Np, D)            â€” current frame's DINO patches (already projected)
      cam_K:          (B, 3, 3)             â€” image-pixel intrinsics
      cam_extrinsic:  (B, 4, 4)             â€” cameraâ†’world (used to unproject voxel centers to world)
      eef_start_xyz_world: (B, 3) or None   â€” if not None, subtract from xyz before PE (variant C)

    The voxel grid is image-aligned: x,y span the image pixel space, z spans
    [MIN_HEIGHT, MAX_HEIGHT] in world coords. For each voxel:
      - find its (x_pix, y_pix) â†’ look up the corresponding DINO patch via bilinear/nearest sample
      - unproject (x_pix, y_pix, world_z) â†’ world xyz using the camera matrices
      - feature = Linear(PE(world_xyz [- eef_start_xyz])) + patch_feature

    Returns:
      voxel_feats: (B, V, D) where V = G_xy * G_xy * G_z
      voxel_xyz_world: (B, V, 3) â€” for inspection / debug
    ç333333ë?çÍÌÌÌÌÌø?c                    s°   t ƒ  ¡  || _|| _|| _|| _|| _|| _|| _t	 
||¡| _|| }t |¡d | }	t |¡d | }
t |||¡}| jd|	dd | jd|
dd | jd|dd d S )Nr„   Úvox_pxFr‰   Úvox_pyÚvox_pz)r2   r3   Ú
image_sizeÚgrid_xyÚgrid_zÚmin_hÚmax_hrG   r6   rA   rD   Úpe_to_dr   r   ÚlinspacerŸ   )rF   rÉ   rÊ   rË   Ú
min_heightÚ
max_heightrG   r6   Úcellr¤   r£   ÚzsrI   r   r   r3   .  s    
zVoxelFeatureBuilder.__init__c                 C   s°   |j \}}}| j}| ||||¡ dddd¡}| j| j d d }| j| j d d }tj||dd\}	}
tj	|
|	gdd 
d¡ |ddd¡}tj||d	d
d}| dddd¡S )u\   patch_tokens: (B, Np, D). Returns (B, grid_xy, grid_xy, D) â€” bilinear-sampled at voxel xy.r   r    r#   r
   r…   r†   r   r   ÚbilinearF)ÚmodeÚalign_corners)r$   r6   rU   ÚpermuterÆ   rÉ   rÇ   r   rž   r    r   r­   r%   Úgrid_sample)rF   r¶   rW   r¹   rº   ZPsÚfeatsÚnxÚnyr¥   r¦   ÚgridÚsampledr   r   r   Ú_patch_lookupC  s   $z!VoxelFeatureBuilder._patch_lookupNc                 C   sR  |j d }| j}|j}|j}|  |¡}	|	 d¡ ddd| jd¡}
| j| j}}| j	 
|¡| j  dd|dd¡ |||| jd¡}| j 
|¡| j  d|ddd¡ |||| jd¡}| j 
|¡ ddd| jd¡ |||| jd¡}tj|||gdd}|dur†|| |dddd¡ }t||||ƒ}|  |¡}|
| }
|
 ||| | j |¡| |dd¡fS )zTBuild voxel feats. patch_tokens: (B, Np, D). cam_K: (B,3,3). cam_extrinsic: (B,4,4).r   r    r   r#   r   N)r$   rG   r   r   rÞ   r   r­   rË   rÊ   rÆ   r®   rÉ   rU   rÇ   rÈ   r   r   r-   rÎ   r¡   )rF   r¶   Úcam_KÚcam_extrinsicÚeef_start_xyz_worldrW   rº   r   r   Zpatch_feats_xyrÙ   r¦   r¥   Zpx_nZpy_nr+   r'   Úper   r   r   rY   P  s"   

	00,
&zVoxelFeatureBuilder.forwardrx   )rZ   r[   r\   rz   r]   ÚVOXEL_XYÚVOXEL_Zr^   r5   r3   rÞ   rY   r_   r   r   rI   r   rÃ     s    þrÃ   c                       sB   e Zd Zeeeeeee	e
ddddf‡ fdd„	Z	d
dd	„Z‡  ZS )Ú_VoxelARPolicyBaseTFrÄ   rÅ   c              	      sj   t ƒ  ¡  || _|| _|| _|
| _t|||	ƒ| _t|||||||t	 d| _
t||||||t	 d| _d S )N)rÉ   rÊ   rË   rÐ   rÑ   rG   r6   )r‹   rŒ   rG   rn   r¢   r6   )r2   r3   r4   r‹   rŒ   Úuse_eef_relativer.   Úpatch_encoderrÃ   r5   Úvoxel_builderr}   Úar_head)rF   r4   r‹   rŒ   Úvoxel_xyÚvoxel_zrG   rn   r¢   r@   ræ   rÐ   rÑ   rI   r   r   r3   w  s    
ýýz_VoxelARPolicyBase.__init__Nc                 C   sN   |   |¡}|dd…df }| jr|nd}|  ||||¡\}	}
|  |||	| j¡S )zû
        frames:              (B, H, 3, target_size, target_size)
        eef_history_xy:      (B, H, 2)
        cam_K:               (B, 3, 3)
        cam_extrinsic:       (B, 4, 4)
        eef_start_xyz_world: (B, 3)  required for variant C
        Nr   )rç   ræ   rè   ré   r4   )rF   rV   r·   rß   rà   rá   rX   ÚcurrentÚanchorr¸   rv   r   r   r   rY     s
   
	z_VoxelARPolicyBase.forwardrx   )rZ   r[   r\   r]   r¿   rÀ   rã   rä   r^   rÁ   rÂ   r3   rY   r_   r   r   rI   r   rå   v  s    üÿrå   c                       ó   e Zd Z‡ fdd„Z‡  ZS )ÚVoxelARPolicyAbsc                    ó   d|d< t ƒ jdi |¤Ž d S )NFræ   r   ©r2   r3   ©rF   ÚkwargsrI   r   r   r3   ž  ó   zVoxelARPolicyAbs.__init__©rZ   r[   r\   r3   r_   r   r   rI   r   rï     ó    rï   c                       rî   )ÚVoxelARPolicyRelc                    rð   )NTræ   r   rñ   rò   rI   r   r   r3   ¤  rô   zVoxelARPolicyRel.__init__rõ   r   r   rI   r   r÷   £  rö   r÷   Ú__main__ÚcudaÚcpuÚabsÚrelz
== smoke: z ==é   T)r‹   rê   rë   r@   c                 c   s    | ]
}|j r| ¡ V  qd S rx   )r=   Únumel)r€   rH   r   r   r   Ú	<genexpr>®  s   € rÿ   zTrainable: ú,)r#   r   r    r
   )gš™™™™™Ù?gš™™™™™¹¿g      ð?)rá   z  z: )Crz   Úosr   r   Útorch.nnrA   Útorch.nn.functionalÚ
functionalr%   ÚenvironÚgetr   r   r5   rã   rä   r¿   rÀ   r^   rÁ   rÂ   r]   r™   rœ   r   r-   ÚModuler.   ra   r{   r}   rÃ   rå   rï   r÷   rZ   r   rù   Úis_availableÚclsÚlabelr8   r®   ÚmodelÚsumr<   Ún_trainrW   rK   rŽ   rV   ÚrandZeefÚeyer   r­   ÚKÚEÚtensorrí   rM   r,   ÚitemsÚkÚvÚtupler$   r   r   r   r   Ú<module>   sd    6k\'
  
ÿÿñ