o
    X¸
j)  ã                   @   s,  d Z ddlZddlZddlmZ ddlm  mZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZ ej dd¡Zej dd¡ZdZd	Zd
ZdZdZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZe dkre !ej" #¡ rrdnd¡Z!eƒ  $e!¡Z%e&dd„ e% '¡ D ƒƒZ(e&dd„ e% '¡ D ƒƒZ)e*de(d›de)d›ƒ dZ+e ,e+ddd¡ $e!¡Z-e ,e+ed¡ $e!¡d e .g d¢¡ $e!¡ Z/e/dd…d f Z0e 1d¡ 2d¡ 3e+dd¡ $e!¡Z4e%e-e/e0e4ƒZ5e5 6¡ D ]\Z7Z8e*d!e7› d"e9e8d#ƒröe:e8j;ƒne8› ƒ qãe*e!j<dkrd$ej" =¡ d% d&›d'nd(ƒ dS dS ))uŸ  Smooth-trajectory variant of model_volume_ar.py (Cameron's proposed "A" arch).

Difference from VolumeARModel:
  - Queries are 8 timestep-tokens (one per future step), each w/ temporal PE
  - KV pool = [32k voxel tokens + 21 EEF tokens]
  - Per layer: SelfAttn(timestep tokens)  then  CrossAttn(timestep tokens â† KV)
    â†’ the SelfAttn between timesteps is the smoothness inductive bias
  - Output voxel logits = einsum('btd, bvd -> btv', timestep_features, voxel_tokens)
    (attention-style scoring, no per-voxel head)
  - Grip/rot heads operate directly on timestep query features (no gather)

Same I/O dict as VolumeARModel so train_volume_ar.py works unchanged.
é    N)
Úvoxel_centers_worldÚworld_to_pixel_torchÚpixel_to_normalized_gridÚsincos_pe_3dÚPE_DIMÚ
N_PAST_EEFÚT_FUTUREÚ
N_ROT_BINSÚ
IMAGE_SIZEÚN_VOXÚDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3ÚDINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   é@   é    é   c                       ó*   e Zd Zeef‡ fdd„	Zdd„ Z‡  ZS )ÚSelfAttnc                    s,   t ƒ  ¡  t |¡| _tj||dd| _d S ©NT)Úbatch_first)ÚsuperÚ__init__ÚnnÚ	LayerNormÚlnÚMultiheadAttentionÚattn©ÚselfÚdÚheads©Ú	__class__© ú0/data/cameron/para/libero/model_volume_smooth.pyr   "   s   
zSelfAttn.__init__c                 C   s(   |   |¡}| j|||dd\}}|| S ©NF)Úneed_weights)r   r   )r   ÚxÚnÚaÚ_r#   r#   r$   Úforward'   s   
zSelfAttn.forward©Ú__name__Ú
__module__Ú__qualname__ÚTOKEN_DÚN_HEADSr   r+   Ú__classcell__r#   r#   r!   r$   r   !   s    r   c                       r   )Ú	CrossAttnc                    s8   t ƒ  ¡  t |¡| _t |¡| _tj||dd| _d S r   )r   r   r   r   Úln_qÚln_kvr   r   r   r!   r#   r$   r   .   s   
zCrossAttn.__init__c                 C   s0   | j |  |¡|  |¡|  |¡dd\}}|| S r%   )r   r4   r5   )r   ÚqÚkvr)   r*   r#   r#   r$   r+   4   s   (zCrossAttn.forwardr,   r#   r#   r!   r$   r3   -   s    r3   c                       s8   e Zd Zeedf‡ fdd„	Zdd„ Z	d	dd„Z‡  ZS )
ÚSmoothVolumeARModelTc              	      s†  t ƒ  ¡  || _|| _td|› dƒ tjjtddt	d| _
|r0| j
 ¡ D ]}d|_q%| j
 ¡  | j
j| _|| _t tj| jtddt ¡ tjttdd¡| _t t tt¡t ¡ t tt¡¡| _t d	t¡| _t t |t¡d
 ¡| _t t |t¡d
 ¡| _t  dd„ t!t"ƒD ƒ¡| _#t  dd„ t!t"ƒD ƒ¡| _$t tt¡| _%t tt¡| _&t td¡| _'t td	t( ¡| _)| j*dt+ƒ dd d S )NzLoading DINOv3 (frozen=z)...Zdinov3_vits16plusÚlocal)ÚsourceÚweightsFé   )Úkernel_sizeé   g{®Gáz”?c                 S   ó   g | ]}t ttƒ‘qS r#   )r   r0   r1   ©Ú.0r*   r#   r#   r$   Ú
<listcomp>Y   ó    z0SmoothVolumeARModel.__init__.<locals>.<listcomp>c                 S   r?   r#   )r3   r0   r1   r@   r#   r#   r$   rB   Z   rC   Úvoxel_centers)Ú
persistent),r   r   Ún_pastÚt_futureÚprintÚtorchÚhubÚloadr   r   ÚdinoÚ
parametersÚrequires_gradÚevalÚ	embed_dimÚdino_dÚfreeze_backboner   Ú
SequentialÚConv2dr0   ÚGELUÚ	image_mlpÚLinearr   Úpe_mlpÚ	EmbeddingÚ
type_embedÚ	ParameterÚrandnÚtimestep_tokenÚtimestep_peÚ
ModuleListÚrangeÚN_LAYERSÚself_blocksÚcross_blocksÚq_projÚk_projÚ	grip_headr	   Úrot_headÚregister_bufferr   )r   rF   rG   rR   Úpr!   r#   r$   r   :   s8   


þÿzSmoothVolumeARModel.__init__c                 C   s2  | j r—t ¡ t | j |¡\}\}}| jjD ]}| jjr$| jj||dnd }|||ƒ}q| jjr^| j |d d …d | jj	d …f ¡}| j 
|d d …| jj	d d …f ¡}tj||gdd}n| j 
|¡}|d d …| jj	d d …f  ¡ }	W d   ƒ n1 s~w   Y  |	jd }
|	 |
||| j¡ dddd¡S t‚)N)ÚHÚWr<   ©Údimr   r>   é   )rR   rI   Úno_gradrL   Zprepare_tokens_with_masksÚblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensÚnormÚcatÚdetachÚshapeÚreshaperQ   ÚpermuteÚNotImplementedError)r   r'   ÚtokensZH_pZW_pÚblkZropeZcls_nZpat_nri   ÚBr#   r#   r$   Ú_dino_patchesh   s    
$$"õ
z!SmoothVolumeARModel._dino_patchesNc           *         sd  |j d }|j}|  |¡}tj|ttfddd‰ |  ˆ ¡‰ | j d¡ 	|dd¡}	t
|	|ƒ}
t
||ƒ}t
| d¡|ƒ d¡}‡ fdd„}||
ƒ}||ƒ}|| d¡ƒ d¡}| d¡}|	| }|| }tj|dd	||jd
}|  t|ƒ¡}|  t|ƒ¡}|  t|ƒ¡}|  tj||	 d¡tj|d¡}|  tj|| d¡tj|d¡}|  tj|dfdtj|d¡}|| | }|| | }| d¡| | }tj|||gdd}| j| j  d¡ 	|dd¡}|} t| j| jƒD ]\}!}"|!| ƒ} |"| |ƒ} qç|  | ¡}#|  |¡}$t d|#|$¡td  }%|%  ddd¡}&|&j!dd}'|  "| ¡ d¡}(|  #| ¡ $|| j%d	t&¡})|&|(|)|'dœS )zÑSame I/O contract as VolumeARModel.

        Returns dict:
          voxel_logits:   (B, V, T)
          grip_logit:     (B, T)
          rot_logits:     (B, T, 3, 32)
          pred_voxel_idx: (B, T)
        r   ÚbilinearF)ÚsizeÚmodeÚalign_cornerséÿÿÿÿr<   c                    s8   t | tƒ d¡}tjˆ |dddd}| d¡ ddd¡S )	Nrn   r|   FÚzeros)r~   r   Úpadding_moder€   r   r<   )r   r
   Ú	unsqueezeÚFÚgrid_sampleÚsqueezerv   )Úpix_uvÚgridÚs©Zfeatsr#   r$   Úsample”   s   z+SmoothVolumeARModel.forward.<locals>.sampler>   )ÚdeviceÚdtype)r   rŒ   rn   rl   zbtd, bvd -> btvg      à?)Úvoxel_logitsÚ
grip_logitÚ
rot_logitsÚpred_voxel_idx)'rt   rŒ   r{   r„   ÚinterpolateÚUPSAMPLE_RESrV   rD   rƒ   Úexpandr   r†   rI   r   r   rX   r   rZ   r}   ÚlongÚonesÚfullrr   r]   r^   Úziprb   rc   rd   re   Úeinsumr0   rv   Úargmaxrf   rg   ru   rG   r	   )*r   ÚrgbÚpast_eef_worldÚcurrent_eef_worldÚworld_to_cameraÚtarget_voxel_idxrz   rŒ   ÚpatchesZ	vox_worldZvox_pixZpast_pixZcur_pixr‹   Zvox_imgZpast_imgZcur_imgÚceZvox_relZpast_relZcur_relZvox_peZpast_peZcur_peZtype_voxZ	type_pastZtype_curZ
vox_tokensZpast_tokensZ	cur_tokenr7   Zts_qr'   ÚsaÚcar6   ÚkZvoxel_logits_tvrŽ   r‘   r   r   r#   rŠ   r$   r+   z   s\   


ÿ



  

üzSmoothVolumeARModel.forward©N)	r-   r.   r/   r   r   r   r{   r+   r2   r#   r#   r!   r$   r8   9   s
    .ÿr8   Ú__main__ÚcudaÚcpuc                 c   s    | ]
}|j r| ¡ V  qd S r¥   )rN   Únumel©rA   ri   r#   r#   r$   Ú	<genexpr>Ò   s   € r«   c                 c   s    | ]}|  ¡ V  qd S r¥   )r©   rª   r#   r#   r$   r«   Ó   s   € zTrainable: ú,z / rn   r>   iÀ  gš™™™™™É?)ç        r­   g      ð?r€   z  z: rt   zpeak: g    eÍÍAz.2fz GBÚ )>Ú__doc__ÚosrI   Útorch.nnr   Útorch.nn.functionalÚ
functionalr„   Úrobot_volumer   r   r   r   r   r   r   r	   r
   r   ÚenvironÚgetr   r   ZDINO_PATCH_SIZEr“   r0   r1   ra   ÚModuler   r3   r8   r-   rŒ   r§   Úis_availableÚtoÚmÚsumrM   Zn_tZn_arH   rz   r\   r›   ÚtensorÚpastÚcurÚeyerƒ   r”   Úw2cÚoutÚitemsr¤   ÚvÚhasattrÚtuplert   ÚtypeÚmax_memory_allocatedr#   r#   r#   r$   Ú<module>   sB    0 
, *0ò