o
    Œ
j,                     @   s  d Z ddlZddlZddlmZ ddlm  mZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZ ejddZejddZdZd	Zd
ZdZdZG dd dejZG dd dejZedkre ej!" ridndZ e #e Z$e%dd e$& D Z'e%dd e$& D Z(e)de'dde(d dZ*e+e*ddd#e Z,e+e*ed#e d e-g d#e  Z.e.dddf Z/e0d1d2e*dd#e Z3e$e,e.e/e3Z4e)de4d  j5d!e4d" j5d#e4d$ j5 e)d%e4d& j5 e)e j6dkrd'ej!7 d( d)d*nd+ dS dS ),u   Minimal volume AR model per Cameron's spec (2026-05-17).

Pipeline:
  rgb 448²  → DINO (frozen) 28² patches D=384
            → bilinear up 64² → 1×1 conv MLP → 64² × 32D image features
  voxels (32³ = 32,768) + 20 past EEFs + current EEF, all projected to image pixel via cam,
            grid_sample → per-token 32D image feature
  per-token PE: sincos(xyz - current_eef) → 2-layer MLP → 32D
  token feature = image_feature + PE_feature + type_embed
  KV pool = 20 past + 1 current EEF tokens (21 tokens)
  Q       = 32k voxel tokens
  4× cross-attention layers (Q ← KV) with 1×1 conv between (no FFN, "really cheap")
  final 1×1 conv: 32 → 8 timestep logits per voxel
  per-timestep argmax voxel → MLP → grip logit + (3,32) rot bins
    N)
voxel_centers_worldworld_to_pixel_torchpixel_to_normalized_gridsincos_pe_3dPE_DIM
N_PAST_EEFT_FUTURE
N_ROT_BINS
IMAGE_SIZEN_VOXDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3DINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth   @          c                       s.   e Zd ZdZeef fdd	Zdd Z  ZS )	CrossAttnuO   One cross-attention layer (Q ← KV), pre-LN, no FFN. Cameron's 'really cheap'.c                    s8   t    t|| _t|| _tj||dd| _d S )NT)batch_first)super__init__nn	LayerNormln_qln_kvMultiheadAttentionattn)selfdheads	__class__ ,/data/cameron/para/libero/model_volume_ar.pyr   %   s   
zCrossAttn.__init__c                 C   s0   | j | || || |dd\}}|| S )NF)need_weights)r   r   r   )r   qkva_r!   r!   r"   forward+   s   (zCrossAttn.forward)	__name__
__module____qualname____doc__TOKEN_DN_HEADSr   r(   __classcell__r!   r!   r   r"   r   #   s    r   c                       s8   e Zd Zeedf fdd	Zdd Z	d	ddZ  ZS )
VolumeARModelTc              	      sH  t    || _|| _td| d tjjtddt	d| _
|r0| j
 D ]}d|_q%| j
  | j
j| _|| _ttj| jtddt tjttdd| _ttttt ttt| _td	t| _td
d ttD | _tdd ttD | _ tt|| _!ttd| _"ttd	t# | _$| j%dt& dd d S )NzLoading DINOv3 (frozen=z)...Zdinov3_vits16pluslocal)sourceweightsF   )kernel_size   c                 S   s   g | ]}t ttqS r!   )r   r-   r.   .0r'   r!   r!   r"   
<listcomp>R   s    z*VolumeARModel.__init__.<locals>.<listcomp>c                 S   s   g | ]}t ttqS r!   )r   Linearr-   r7   r!   r!   r"   r9   S   s    voxel_centers)
persistent)'r   r   n_pastt_futureprinttorchhubloadr   r   dino
parametersrequires_gradeval	embed_dimdino_dfreeze_backboner   
SequentialConv2dr-   GELU	image_mlpr:   r   pe_mlp	Embedding
type_embed
ModuleListrangeN_LAYERSattn_layersbetween_convfinal	grip_headr	   rot_headregister_bufferr   )r   r=   r>   rI   pr   r!   r"   r   1   s8   




zVolumeARModel.__init__c                 C   s2  | j rt t | j|\}\}}| jjD ]}| jjr$| jj||dnd}|||}q| jjr^| j|ddd| jj	d f }| j
|dd| jj	d df }tj||gdd}n| j
|}|dd| jj	d df  }	W d   n1 s~w   Y  |	jd }
|	|
||| jddddS t)u%   x: (B, 3, H, W) → (B, 384, 28, 28).)HWNr4   dimr   r6      )rI   r@   no_gradrC   Zprepare_tokens_with_masksblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensnormcatdetachshapereshaperH   permuteNotImplementedError)r   xtokensZH_pZW_pblkZropeZcls_nZpat_nrZ   Br!   r!   r"   _dino_patches`   s    
$$"
zVolumeARModel._dino_patchesNc           (         sF  |j d }|j}| |}tj|ttfddd |   | jd	|dd}	t
|	|}
t
||}t
|d|d} fdd}||
}||}||dd}|d}|	| }|| }tj|dd	||jd
}| t|}| t|}| t|}| tj||	dtj|d}| tj||dtj|d}| tj|dfdtj|d}|| | }|| | }|d| | }tj||gdd}|}t| j| jD ]\} }!| ||}|!|}q| |}"|du r|"jdd}#n|}#|#d	ddt}$|d|$}%| |%d}&| |% || j!d	t"}'|"|&|'|#dS )u  
        rgb:                 (B, 3, 448, 448)
        past_eef_world:      (B, N=20, 3) — world frame; index 19 = most recent
        current_eef_world:   (B, 3)       — world frame (= past_eef_world[:, -1])
        world_to_camera:     (B, 4, 4)    — robosuite-style world→pixel matrix
        target_voxel_idx:    (B, T=8) or None  — for teacher-forced grip/rot heads
        r   bilinearF)sizemodealign_cornersr4   c                    s8   t | td}tj |dddd}|ddddS )	Nr_   rn   Fzeros)rp   rq   padding_moderr   r   r4   )r   r
   	unsqueezeFgrid_samplesqueezerg   )pix_uvgridsZfeatsr!   r"   sample   s
   z%VolumeARModel.forward.<locals>.sampler6   )devicedtype)r   r~   r_   r]   N)voxel_logits
grip_logit
rot_logitspred_voxel_idx)#re   r~   rm   rv   interpolateUPSAMPLE_RESrM   r;   ru   expandr   rx   r@   rs   r   rN   r   rP   ro   longonesfullrc   ziprT   rU   rV   argmaxr-   gatherrW   rX   rf   r>   r	   )(r   rgbpast_eef_worldcurrent_eef_worldworld_to_cameratarget_voxel_idxrl   r~   patchesZ	vox_worldZvox_pixZpast_pixZcur_pixr}   Zvox_imgZpast_imgZcur_imgceZvox_relZpast_relZcur_relZvox_peZpast_peZcur_peZtype_voxZ	type_pastZtype_curZ
vox_tokensZpast_tokensZ	cur_tokenr%   ri   r   ZlinZ
logits_v_tr   Z
idx_expandZtimestep_featr   r   r!   r|   r"   r(   t   s\   
	




  


zVolumeARModel.forwardN)	r)   r*   r+   r   r   r   rm   r(   r/   r!   r!   r   r"   r0   0   s
    /r0   __main__cudacpuc                 c   s    | ]
}|j r| V  qd S r   )rE   numelr8   rZ   r!   r!   r"   	<genexpr>   s    r   c                 c   s    | ]}|  V  qd S r   )r   r   r!   r!   r"   r      s    zTrainable: ,z / r_   r6   i  g?)        r   g      ?rr   zvoxel_logits:r   zgrip:r   zrot:r   zpred_voxel_idx:r   zpeak memory: g    eAz.2fz GB )8r,   osr@   torch.nnr   torch.nn.functional
functionalrv   robot_volumer   r   r   r   r   r   r   r	   r
   r   environgetr   r   ZDINO_PATCH_SIZEr   r-   r.   rS   Moduler   r0   r)   r~   r   is_availabletomsumrD   Zn_trainn_totr?   rl   randnr   tensorZpast_eefZcur_eefeyeru   r   w2coutre   typemax_memory_allocatedr!   r!   r!   r"   <module>   sF    0 
, 

0