o
    ÉYj‚(  ã                   @   s"  d Z ddlZddlZddlmZ ddlm  mZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZ ej dd¡Zej dd¡ZdZd	Zd
ZdZdZdZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZ e!dkre "ej# $¡ rtdnd¡Z"e ƒ  %e"¡Z&e'dd„ e& (¡ D ƒƒZ)e'dd„ e& (¡ D ƒƒZ*e+de)d›de*d›ƒ dZ,e -e,ddd¡ %e"¡Z.e -e,ed¡ %e"¡d e /g d¢¡ %e"¡ Z0e0dd…d f Z1e 2d¡ 3d¡ 4e,dd¡ %e"¡Z5e&e.e0e1e5ƒZ6e6 7¡ D ]\Z8Z9e+d!e8› d"e:e9j;ƒ› ƒ qåe+e"j<dkr
d#ej# =¡ d$ d%›d&nd'ƒ dS dS )(uà  SmoothVolumeARModel v2 â€” Cameron's two tweaks on the smooth arch (2026-05-18):

(1) cur_img feature is added to the timestep query embeddings BEFORE cross-attn.
    This anchors each timestep query with visual context at the starting EEF.

(2) Gripper + rotation regression moved to a separate stage:
      - argmax voxel per timestep (or GT during training)
      - gather VOXEL TOKEN features at those indices  â†’  (B, T, D)
      - 2 rounds of self-attention among the T gathered tokens
      - MLP per token â†’ grip logit + 3-axis rot bins
    Rationale: the voxel feature carries "what's at that spot visually" â€” better
    for grasp-timing + orientation than the timestep query feature (which is
    more about navigation).
é    N)
Úvoxel_centers_worldÚworld_to_pixel_torchÚpixel_to_normalized_gridÚsincos_pe_3dÚPE_DIMÚ
N_PAST_EEFÚT_FUTUREÚ
N_ROT_BINSÚ
IMAGE_SIZEÚN_VOXÚDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3ÚDINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   é@   é    é   é   c                       ó*   e Zd Zeef‡ fdd„	Zdd„ Z‡  ZS )ÚSelfAttnc                    s,   t ƒ  ¡  t |¡| _tj||dd| _d S ©NT)Úbatch_first)ÚsuperÚ__init__ÚnnÚ	LayerNormÚlnÚMultiheadAttentionÚattn©ÚselfÚdÚheads©Ú	__class__© ú3/data/cameron/para/libero/model_volume_smooth_v2.pyr   $   s   
zSelfAttn.__init__c                 C   s(   |   |¡}| j|||dd\}}|| S ©NF)Úneed_weights)r   r   )r   ÚxÚnÚaÚ_r$   r$   r%   Úforward)   s   
zSelfAttn.forward©Ú__name__Ú
__module__Ú__qualname__ÚTOKEN_DÚN_HEADSr   r,   Ú__classcell__r$   r$   r"   r%   r   #   s    r   c                       r   )Ú	CrossAttnc                    s8   t ƒ  ¡  t |¡| _t |¡| _tj||dd| _d S r   )r   r   r   r   Úln_qÚln_kvr   r   r   r"   r$   r%   r   0   s   
zCrossAttn.__init__c                 C   s0   | j |  |¡|  |¡|  |¡dd\}}|| S r&   )r   r5   r6   )r   ÚqÚkvr*   r+   r$   r$   r%   r,   6   s   (zCrossAttn.forwardr-   r$   r$   r"   r%   r4   /   s    r4   c                       s8   e Zd Zeedf‡ fdd„	Zdd„ Z	d	dd„Z‡  ZS )
ÚSmoothVolumeARModelV2Tc              	      s¬  t ƒ  ¡  || _|| _td|› dƒ tjjtddt	d| _
|r0| j
 ¡ D ]}d|_q%| j
 ¡  | j
j| _|| _t tj| jtddt ¡ tjttdd¡| _t t tt¡t ¡ t tt¡¡| _t d	t¡| _t t |t¡d
 ¡| _t t |t¡d
 ¡| _t  dd„ t!t"ƒD ƒ¡| _#t  dd„ t!t"ƒD ƒ¡| _$t tt¡| _%t tt¡| _&t  dd„ t!t'ƒD ƒ¡| _(t )t¡| _*t td¡| _+t td	t, ¡| _-| j.dt/ƒ dd d S )NzLoading DINOv3 (frozen=z)...Zdinov3_vits16plusÚlocal)ÚsourceÚweightsFé   )Úkernel_sizeé   g{®Gáz”?c                 S   ó   g | ]}t ttƒ‘qS r$   ©r   r1   r2   ©Ú.0r+   r$   r$   r%   Ú
<listcomp>W   ó    z2SmoothVolumeARModelV2.__init__.<locals>.<listcomp>c                 S   r@   r$   )r4   r1   r2   rB   r$   r$   r%   rD   X   rE   c                 S   r@   r$   rA   rB   r$   r$   r%   rD   _   rE   Úvoxel_centers)Ú
persistent)0r   r   Ún_pastÚt_futureÚprintÚtorchÚhubÚloadr   r   ÚdinoÚ
parametersÚrequires_gradÚevalÚ	embed_dimÚdino_dÚfreeze_backboner   Ú
SequentialÚConv2dr1   ÚGELUÚ	image_mlpÚLinearr   Úpe_mlpÚ	EmbeddingÚ
type_embedÚ	ParameterÚrandnÚtimestep_tokenÚtimestep_peÚ
ModuleListÚrangeÚN_LAYERSÚself_blocksÚcross_blocksÚq_projÚk_projÚN_RG_SELFATTNÚrg_self_blocksr   Úrg_normÚ	grip_headr	   Úrot_headÚregister_bufferr   )r   rH   rI   rT   Úpr"   r$   r%   r   <   s<   


þÿzSmoothVolumeARModelV2.__init__c                 C   s2  | j r—t ¡ t | j |¡\}\}}| jjD ]}| jjr$| jj||dnd }|||ƒ}q| jjr^| j |d d …d | jj	d …f ¡}| j 
|d d …| jj	d d …f ¡}tj||gdd}n| j 
|¡}|d d …| jj	d d …f  ¡ }	W d   ƒ n1 s~w   Y  |	jd }
|	 |
||| j¡ dddd¡S t‚)N)ÚHÚWr=   ©Údimr   r?   r   )rT   rK   Úno_gradrN   Zprepare_tokens_with_masksÚblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensÚnormÚcatÚdetachÚshapeÚreshaperS   ÚpermuteÚNotImplementedError)r   r(   ÚtokensZH_pZW_pÚblkZropeZcls_nZpat_nrn   ÚBr$   r$   r%   Ú_dino_patchesf   s    
$$"õ
z#SmoothVolumeARModelV2._dino_patchesNc           -         sÄ  |j d }|j}|  |¡}tj|ttfddd‰ |  ˆ ¡‰ | j d¡ 	|dd¡}	t
|	|ƒ}
t
||ƒ}t
| d¡|ƒ d¡}‡ fdd„}||
ƒ}||ƒ}|| d¡ƒ d¡}| d¡}|	| }|| }tj|dd	||jd
}|  t|ƒ¡}|  t|ƒ¡}|  t|ƒ¡}|  tj||	 d¡tj|d¡}|  tj|| d¡tj|d¡}|  tj|dfdtj|d¡}|| | }|| | }| d¡| | }tj|||gdd}| j| j  d¡ 	|dd¡}|| d¡ }|} t| j| jƒD ]\}!}"|!| ƒ} |"| |ƒ} qî|  | ¡}#|  |¡}$t d|#|$¡td  }%|%  ddd¡}&|&j!dd}'|d ur%|n|'}(| "d|( d¡ 	ddt¡¡})|)}*| j#D ]}!|!|*ƒ}*q:|  $|*¡}*|  %|*¡ d¡}+|  &|*¡ '|| j(d	t)¡},|&|+|,|'dœS )Nr   ÚbilinearF)ÚsizeÚmodeÚalign_cornerséÿÿÿÿr=   c                    s8   t | tƒ d¡}tjˆ |dddd}| d¡ ddd¡S )	Nr   r€   FÚzeros)r‚   rƒ   Úpadding_moder„   r   r=   )r   r
   Ú	unsqueezeÚFÚgrid_sampleÚsqueezerz   )Úpix_uvÚgridÚs©Zfeatsr$   r%   ÚsampleŠ   s   z-SmoothVolumeARModelV2.forward.<locals>.sampler?   )ÚdeviceÚdtype)r‘   r   r   rq   zbtd, bvd -> btvg      à?)Úvoxel_logitsÚ
grip_logitÚ
rot_logitsÚpred_voxel_idx)*rx   r   r   rˆ   ÚinterpolateÚUPSAMPLE_RESrX   rF   r‡   Úexpandr   rŠ   rK   r…   r‘   rZ   r   r\   r   ÚlongÚonesÚfullrv   r_   r`   Úziprd   re   rf   rg   Úeinsumr1   rz   ÚargmaxÚgatherri   rj   rk   rl   ry   rI   r	   )-r   ÚrgbÚpast_eef_worldÚcurrent_eef_worldÚworld_to_cameraÚtarget_voxel_idxr~   r   ÚpatchesZ	vox_worldZvox_pixZpast_pixZcur_pixr   Zvox_imgZpast_imgZcur_imgÚceZvox_relZpast_relZcur_relZvox_peZpast_peZcur_peZtype_voxZ	type_pastZtype_curZ
vox_tokensZpast_tokensZ	cur_tokenr8   Zts_qr(   ÚsaÚcar7   ÚkZvoxel_logits_tvr’   r•   Zref_idxZtarget_vox_featÚrgr“   r”   r$   rŽ   r%   r,   x   sj   

ÿ



  



üzSmoothVolumeARModelV2.forward©N)	r.   r/   r0   r   r   r   r   r,   r3   r$   r$   r"   r%   r9   ;   s
    *ÿr9   Ú__main__ÚcudaÚcpuc                 c   s    | ]
}|j r| ¡ V  qd S r«   )rP   Únumel©rC   rn   r$   r$   r%   Ú	<genexpr>Í   s   € r±   c                 c   s    | ]}|  ¡ V  qd S r«   )r¯   r°   r$   r$   r%   r±   Î   s   € zTrainable: ú,z / r?   iÀ  gš™™™™™É?)ç        r³   g      ð?r„   z  z: zpeak: g    eÍÍAz.2fz GBÚ )>Ú__doc__ÚosrK   Útorch.nnr   Útorch.nn.functionalÚ
functionalrˆ   Úrobot_volumer   r   r   r   r   r   r   r	   r
   r   ÚenvironÚgetr   r   ZDINO_PATCH_SIZEr—   r1   r2   rc   rh   ÚModuler   r4   r9   r.   r   r­   Úis_availableÚtoÚmÚsumrO   Zn_tZn_arJ   r~   r^   r    ÚtensorÚpastÚcurÚeyer‡   r˜   Úw2cÚoutÚitemsr©   ÚvÚtuplerx   ÚtypeÚmax_memory_allocatedr$   r$   r$   r%   Ú<module>   sD    0 
, 0ò