o
    ;ojÍc  ã                   @   sH  d Z ddlZddlZddlZddlmZ ddlm  mZ ej	 
dd¡Zej	 
dd¡ZdZdZd	Zd	Zd	Zd
Zd	ZdZdZee e ZdZdZdZd-dd„ZG dd„ dejƒZdd„ Zdd„ Zd.dd„ZG dd„ dejƒZ e!dkr e "ej# $¡ r{dnd¡Z"e dd d! %e"¡ &¡ Z'e(d"d#„ e' )¡ D ƒƒZ*e+d$e*d%›ƒ ej,eeed&e"d'Z-ej,dd&eee"d'Z.ej,dd&eee"d'Z/ej,dde"d'e Z0ej1d&e"d' 2d¡ 3dd(d(¡ 4¡ Z5ej1d)e"d' 2d¡ 3dd(d(¡ 4¡ Z6e 7¡  e'e.e/e0e-e5e6ƒZ8W d  ƒ n1 sþw   Y  e8 9¡ D ]\Z:Z;e<e;d*ƒre+d+e:› d,e=e;j>ƒ› ƒ qdS dS )/u&  Two-view (BEV + wrist) DinoVolumeQuery â€” same architecture skeleton as the
single-view query-MLP model, but with:

  - Shared DINO trunk applied to both views â†’ patch tokens for each
  - 1-2 cross-attention layers at the last spatial layer mixing BEV â†” wrist
  - Two refine heads â†’ F_bev, F_wrist âˆˆ (B, d_feat, H, W)
  - Query input: Linear(concat(eef_feat_bev, cls_bev, cls_wrist)) â†’ d_model
  - 5-layer AdaLN-Zero MLP per timestep with sin(t) conditioning
  - Per-step query split into (q_F_bev, q_F_wrist, q_z, q_t)
  - Volume scoring sums contributions from BEV (direct lookup) + WRIST
    (project each voxel's world XYZ through the wrist camera, grid_sample
    F_wrist at the projected uv with padding_mode='zeros' so out-of-frustum
    voxels contribute 0). Plus the height/time terms.
é    NÚDINO_REPO_DIRz/data/cameron/keygrip/dinov3ÚDINO_WEIGHTS_PATHzU/data/cameron/keygrip/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthiÀ  é   é    é8   é   é€   é   é   ç     ˆÃ@c                 C   s–   t j| t jd}t  t jd|dt jdt |¡|   ¡}t  | |¡}t  | d¡| ¡|d d …dd d…f< t  	| d¡| ¡|d d …dd d…f< |S )N)Údtyper   r
   é   )
ÚtorchÚarangeÚfloat32ÚexpÚmathÚlogÚzerosÚsinÚ	unsqueezeÚcos)ÚnÚdimÚbaseÚposÚdivÚpe© r   ú:/data/cameron/para/libero/model_dino_volume_query_2view.pyÚsinusoidal_features'   s   *&&r    c                       s*   e Zd ZdZd‡ fdd„	Zdd„ Z‡  ZS )ÚAdaLNZeroMLPBlockuQ   DiT-style block on (N, d): LN â†’ FiLM(Î³,Î²) â†’ MLP(dâ†’4dâ†’d) â†’ +Î±Â·resid.é   c                    s|   t ƒ  ¡  tj|dd| _t |d| ¡| _tj | jj	¡ tj | jj
¡ t t ||| ¡t ¡ t || |¡¡| _d S )NF)Úelementwise_affineé   )ÚsuperÚ__init__ÚnnÚ	LayerNormÚnormÚLinearÚ	cond_projÚinitÚzeros_ÚweightÚbiasÚ
SequentialÚGELUÚmlp)ÚselfÚdÚd_condZ	mlp_ratio©Ú	__class__r   r   r&   2   s   

þzAdaLNZeroMLPBlock.__init__c                 C   sJ   |   |¡jddd\}}}|  |¡}|d|  | }|  |¡}|||  S )Nr$   éÿÿÿÿ©r   ç      ð?)r+   Úchunkr)   r2   )r3   ÚxÚcondÚgÚbÚaÚhr   r   r   Úforward=   s
   

zAdaLNZeroMLPBlock.forward)r"   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r&   rB   Ú__classcell__r   r   r6   r   r!   0   s    r!   c	                 C   s  |   ¡ }	|	d  t|ƒ9  < |	d  t|ƒ9  < t|ƒt|ƒ }
tj||d ¡ d |
 }tj||d ¡ d |
 }tj||dd\}}t |	¡}t |¡}tj|||gdd dd	¡j	}|| j	 ||d	¡}|d
d	…d
d	…f }|d
d	…d	f }|| dd	¡j	 j	 ||d	¡}tj
||||d}|d  d¡}t | ¡ dk t |d¡|¡}| |dd¡|d  | }| dddd	¡| d¡| d¡  }|S )u  Compute world XYZ for every voxel (z_bin, y_grid, x_grid) â€” BEV camera is static.

    K_norm_bev:    (3, 3) intrinsics normalised by image dims.
    bev_extrinsic: (4, 4) cameraâ†’world transform (from get_camera_extrinsic_matrix).
    Returns: (Z, H, W, 3) world XYZ table.
    r   r   ©Údeviceç      à?Úxy©Úindexingr8   r9   r$   N©.r
   çíµ ÷Æ°>çíµ ÷Æ°¾r
   )ÚcloneÚfloatr   r   ÚmeshgridÚinverseÚ	ones_likeÚstackÚreshapeÚTÚlinspacer   ÚwhereÚabsÚ	full_likeÚview)Ú
K_norm_bevÚbev_extrinsicÚn_height_binsÚ
min_heightÚ
max_heightÚHÚWÚ
image_sizerI   ÚKÚscaleÚysÚxsÚgrid_xÚgrid_yÚK_invÚonesÚuv1Úrays_camÚR_cwÚt_cwÚ
rays_worldÚheightsÚrwzÚrwz_safeÚsÚxyzr   r   r   Úbuild_bev_world_xyz_tableE   s&   (

$rx   c                 C   sà  | j }| jd }	|  ¡ }
|
dd…df  t|ƒ9  < |
dd…df  t|ƒ9  < t|ƒt|ƒ }tj||d ¡ d | }tj||d ¡ d | }tj||dd\}}t |¡}tj|||gdd	 	dd
¡j
}t |
¡}|| d¡ |	dd¡  dd¡}| 	|	||d
¡}|dd…dd
…dd
…f }|dd…dd
…d
f }t d||¡}tj||||d}|d  d¡}t | ¡ dk t |d¡|¡}| d|dd¡|dd…df  |	ddd¡ | }| |	dddd
¡| d¡| d¡  }|S )uð   Batched version of build_bev_world_xyz_table.

    K_norm_bev:    (B, 3, 3) intrinsics normalised by image dims.
    bev_extrinsic: (B, 4, 4) cameraâ†’world transform per sample.
    Returns: (B, Z, H, W, 3) world XYZ table per sample.
    r   Nr   rH   rJ   rK   rL   r8   r9   r$   r
   zbij,bhwj->bhwirN   rO   rP   )rI   ÚshaperQ   rR   r   r   rS   rU   rV   rW   rX   rT   r   ÚexpandÚ	transposeÚeinsumrY   rZ   r[   r\   r]   )r^   r_   r`   ra   rb   rc   rd   re   rI   ÚBrf   rg   rh   ri   rj   rk   rm   rn   rl   ro   rp   rq   rr   rs   rt   ru   rv   rw   r   r   r   Ú!build_bev_world_xyz_table_batchedj   s,   
8

 0&r~   Fc                 C   s:  |j d }| ¡ }|dd…df  t|ƒ9  < |dd…df  t|ƒ9  < t |¡}|  ¡ dkr_| j \}}	}
}|  dd¡}tj|t |dd…dd…f ¡gdd}||j	 
d¡ |dd¡ }n(| j \}}}	}
}|  |dd¡}tj|t |ddd…f ¡gdd}t d	||¡}|dd…d
f }|dk}|dd…dd…f | d¡ 
d¡ }|| }|dd…dd
…f }|dd…df t|d ƒ d
 d }|dd…df t|d ƒ d
 d }|| ¡ dkB | ¡ dkB }t |t |d¡|¡}t |t |d¡|¡}tj||gdd |||	|
d
¡}|r|  |||	|
¡}||fS |S )uj  xyz_table:        (Z, H, W, 3) OR (B, Z, H, W, 3) in world.
       wrist_K_norm:     (B, 3, 3) normalised intrinsics.
       wrist_extrinsic:  (B, 4, 4) cameraâ†’world (from get_camera_extrinsic_matrix).
       Returns (B, Z, H, W, 2) of normalised [-1, 1] UV grid for grid_sample.
       If return_mask=True, also returns in_frustum_mask (B, Z, H, W) bool.
    r   Nr   r"   r8   r$   r9   .zbij,bnj->binr
   gü©ñÒMbP?rO   r:   g       @)ry   rQ   rR   r   rT   r   rW   ÚcatrU   rX   r   rz   r|   Ú	clamp_minr[   rZ   r\   rV   r]   )Z	xyz_tableÚwrist_K_normÚwrist_extrinsicre   Úreturn_maskr}   rf   Zworld_to_camÚZrc   rd   Ú_Zxyz_flatZpts_hZpts_camZz_camZbehindZpts_normZpixZpix_uvÚuÚvÚinvalidÚgridÚ
in_frustumr   r   r   Úproject_world_to_wrist_uv_grid‹   s8   
8
($$$$ r‹   c                       sN   e Zd Zeeeeeee	e
eeeeddddf‡ fdd„	Zdd„ Zd	d
„ Z‡  ZS )ÚDinoVolumeQuery2ViewÚ1d_pcar   FÚsumc              	      sr  t ƒ  ¡  |ˆ_t t |¡¡ˆ_t t |¡d ¡ˆ_	|ˆ_
|ˆ_|ˆ_|ˆ_|ˆ_|ˆ_|ˆ_|| | ˆ_|ˆ_|ˆ_|dv sDJ ‚|ˆ_|ˆ_tjjtddtdˆ_|rbˆj ¡ D ]}d|_q\ˆjjˆ_tjˆjdˆjd d	d
ddd}tj||
dˆ_ t !dˆj¡ˆ_"t #tj$ˆjˆjdddt %¡ t $ˆj|d¡¡ˆ_&t #tj$ˆjˆjdddt %¡ t $ˆj|d¡¡ˆ_'ˆj(dt)||ƒdd ˆj(dt)||ƒdd t *|ˆ ¡ˆ_+t *|dˆj  ˆj¡ˆ_,t -‡ ‡fdd„t.|	ƒD ƒ¡ˆ_/t 0ˆj¡ˆ_1d| | | ˆ_2t *ˆjˆj2¡ˆ_3t *ˆj|¡ˆ_4|dkrd| }n|dkr&|}n	|dks-J ‚|}t *ˆj|¡ˆ_5d S )Ng{®Gáz”?)r   ÚkmeansÚper_axisZdinov3_vits16plusÚlocal)ÚsourceÚweightsFé   r
   g        ÚgeluT)Úd_modelÚnheadÚdim_feedforwardÚdropoutÚ
activationÚbatch_firstÚ
norm_first)Ú
num_layersr$   r   )ÚpaddingÚz_sin)Ú
persistentÚt_sinc                    s   g | ]}t ˆjˆ ƒ‘qS r   )r!   r–   )Ú.0r…   ©r5   r3   r   r   Ú
<listcomp>ÿ   s    ÿz1DinoVolumeQuery2View.__init__.<locals>.<listcomp>r   r   r   )6r%   r&   Úfusion_moder'   Ú	Parameterr   r   Úwrist_oof_logitÚrandnÚF_oof_tokenÚn_windowr`   Ún_gripper_binsÚ
n_rot_binsÚd_featÚd_sin_zÚd_sin_tr–   re   Ú	pred_sizeÚrotation_modeÚkmeans_n_clustersÚhubÚloadr   r   ÚdinoÚ
parametersÚrequires_gradÚ	embed_dimÚTransformerEncoderLayerÚTransformerEncoderÚ
cross_attnÚ	EmbeddingÚview_embr0   ÚConv2dr1   Ú
refine_bevÚrefine_wristÚregister_bufferr    r*   Út_cond_projÚ
input_projÚ
ModuleListÚrangeÚblocksr(   Ú
final_normZ
q_head_dimÚq_headÚ	grip_headÚrot_head)r3   rª   r`   r«   r¬   r­   r®   r¯   r5   Ún_blocksZn_cross_layersre   r°   r±   r²   Zfreeze_backboner¥   ÚpÚ	enc_layerZrot_out_dimr6   r£   r   r&   ·   sp   


ÿ
þþþ
ÿ


zDinoVolumeQuery2View.__init__c                 C   sü   |j d }| j |¡\}\}}| jjD ]}| jjr!| jj||dnd}|||ƒ}q| jjr[| j |dd…d| jjd …f ¡}| j |dd…| jjd d…f ¡}	t	j
||	gdd}n| j |¡}|dd…df }
|dd…| jjd d…f }|
|||ffS )zMReturns (cls_token (B, embed), patch_tokens (B, n_patch, embed), (H_p, W_p)).r   )rc   rd   Nr   r9   )ry   rµ   Zprepare_tokens_with_masksrÆ   Z
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensr)   r   r   )r3   r<   r}   Zx_tokensZH_pZW_pÚblkZropeZx_clsZx_patÚclsÚpatchesr   r   r   Ú_extract_dino_tokens  s   
$$z)DinoVolumeQuery2View._extract_dino_tokensc           S   	      sÖ  |j d ‰ ˆjˆjˆj}}}	ˆ |¡\}
}\‰‰ˆ |¡\}}\}}ˆˆ }ˆˆf||fks4J dƒ‚t tj|tj|j	dtj
|tj|j	dg¡}ˆ |¡ d¡}tj||gdd| }ˆ |¡}|dd…d|…f }|dd…|d…f }‡ ‡‡‡fdd„}tj||ƒˆjˆjfd	d
d}tj||ƒˆjˆjfd	d
d}ˆ |¡}ˆ |¡}|j dd… \}}|d |ˆj   ¡  d|d ¡}|d |ˆj   ¡  d|d ¡}tjˆ |j	d}||dd…||f } ˆ tj| |
|gdd¡}!|! d¡ ˆ ||	¡ ˆ | |	¡}"ˆ ˆj¡}#|# d¡ ˆ |d¡ ˆ | d¡}$|"}%ˆjD ]}&|&|%|$ƒ}%qˆ |%¡}%|% ˆ ||	¡}'ˆ |'¡}(ˆ |'¡})ˆj dkrOˆ !|'¡ ˆ |dˆj"¡}*nˆ !|'¡}*ˆj#ˆj$ˆj%}+},}-|(dd|+…f }.|(d|+d|+ …f }/|(dd|+ d|+ |, …f }0|(dd|+ |, d…f }1t &d|.|¡}2t '¡  |}3t(|3||ˆjdd\}4}5W d  ƒ n	1 s°w   Y  |4j \}6}7}8}9}:|4 |6|7|8 |9d¡};tj)||;d	ddd}<|< |6ˆj#|7|8|9¡}<t &d|/|<¡}=|5 d¡ *¡ }>ˆj+dkrˆj, ddddd¡}?|>|= d|> |?  }@nˆj+dkr|>|= }@n|=}@t &d|0ˆj-¡}At &d|1ˆj¡}B|A d¡ d¡}C|B d¡ d¡ d¡}D|2 d¡|C |D }E|@|C |D }Fˆj+d v rU|2 d¡|@ |C |D }Gn‹ˆj+dkrŠ|2 d¡|@ |C |D }H|Hj \}I}J}:}:}:t &d!|/ˆj.¡ d¡}K|H |I|Jd¡}Ltj|L|Kgdd}GnVˆj+d"kr¡tj/|E|Fgdd}Mtj0|Mdd}Gn?ˆj+d#krØ|Ej \}I}J}N}O}P|E |I|Jd¡j1dd |I|J|N|O|P¡}Q|F |I|Jd¡j1dd |I|J|N|O|P¡}R|Q|R }Gnt2d$ˆj+› ƒ‚|G|E|F|5|)|*||d%œS )&u©  rgb_bev / rgb_wrist: (B, 3, IMG, IMG)
        start_pix_bev: (B, 2) â€” current EEF pixel in IMG-coords on the BEV image
        bev_xyz_table: (Z, H, W, 3) â€” world XYZ at each (z_bin, y_grid, x_grid) voxel
                       (static for libero â€” same for all samples in batch).
        wrist_K_norm: (B, 3, 3) normalised wrist intrinsics
        wrist_world_to_cam: (B, 4, 4) wrist worldâ†’camera per sample
        r   z+Both views must produce the same patch grid)r   rI   r   r9   Nc                    s"   |   ˆ ˆˆˆj¡ dddd¡ ¡ S )Nr   r$   r   r
   )rW   r¸   ÚpermuteÚ
contiguous)rÌ   ©r}   ZHp_bZWp_br3   r   r   Ú_to_gridC  s   "z.DinoVolumeQuery2View.forward.<locals>._to_gridÚbilinearF)ÚsizeÚmodeÚalign_cornerséþÿÿÿ).r   ).r   rH   r8   r   r$   .r
   zbtc, bchw -> bthwT)rƒ   r   )rØ   Úpadding_moderÙ   zbtc, bczhw -> btzhwÚoof_maskr:   Z	aug_tokenzbtc, zc -> btzzbtc, tc -> bt)rŽ   rÜ   zbtc, c -> btÚmaxÚpoezUnknown fusion_mode: )Úvolume_logitsÚvol_bevÚ	vol_wristrŠ   Úgripper_logitsÚrotation_logitsÚpixel_featsÚpixel_feats_wrist)3ry   rª   r`   r–   rÑ   r   r   r   ÚlongrI   rm   r½   r   r»   ÚFÚinterpolater°   r¿   rÀ   re   Úclampr   rÃ   rz   rW   rÂ   r¡   rÆ   rÇ   r]   rÈ   rÉ   r±   rÊ   r¬   r­   r®   r¯   r|   Úno_gradr‹   Úgrid_samplerR   r¥   r§   rŸ   r©   rV   Ú	logsumexpÚlog_softmaxÚ
ValueError)Sr3   Úrgb_bevÚ	rgb_wristÚstart_pix_bevÚbev_xyz_tabler   r‚   rX   r„   r4   Zcls_bevZpatches_bevZ	cls_wristZpatches_wristZHp_wZWp_wZn_pZview_idsZview_eÚjointZ	pat_bev_xZpat_wrist_xrÕ   Zfeat_bev_upZfeat_wrist_upÚF_bevÚF_wristrc   rd   ÚsxÚsyZb_idxZeef_featZq_inZq_in_btZcond_tZcond_btrA   rÎ   ZpenultZ	q_spatialÚgripperÚrotationZd_FZd_zZd_tZq_F_bevZ	q_F_wristZq_zZq_tZscore_bev_yxrw   Zuv_gridrŠ   ZBvZZvZHvZWvr…   Z	grid_flatZF_w_sampledZscore_wrist_from_featZin_frustum_BTZHWZoof_logit_BTZscore_wrist_zyxZscore_zZscore_tZz_termÚt_termZvol_bev_fullÚvol_wrist_fullrß   Zvol_spatialÚB_ZT_Z	oof_scoreZflat_spatialÚstackedZZ_ZH_ZW_Z	log_p_bevZlog_p_wristr   rÔ   r   rB   $  sÐ   
	þ
ÿÿ

""  





ÿþ
ÿ
ÿ
ÿ$$
øzDinoVolumeQuery2View.forward)rC   rD   rE   ÚN_WINDOWÚN_HEIGHT_BINSÚN_GRIPPER_BINSÚ
N_ROT_BINSÚD_FEATÚD_SINZÚD_SINTÚD_CONDÚN_BLOCKSÚN_CROSS_ATTN_LAYERSÚIMG_SIZEÚ	PRED_SIZEr&   rÑ   rB   rG   r   r   r6   r   rŒ   ¶   s    ÷\rŒ   Ú__main__ÚcudaÚcpur   )rª   r±   c                 c   s    | ]
}|j r| ¡ V  qd S )N)r·   Únumel)r¢   rÌ   r   r   r   Ú	<genexpr>Á  s   € r  zTrainable: ú,r$   rH   r8   r"   ry   z  z: )r   )F)?rF   Úosr   r   Útorch.nnr'   Útorch.nn.functionalÚ
functionalrç   ÚenvironÚgetr   r   r  rþ   rÿ   r   r  r	  r  r  r  ZD_MODELr  r  r  r    ÚModuler!   rx   r~   r‹   rŒ   rC   rI   r  Úis_availableÚtoÚevalÚmrŽ   r¶   Ún_tÚprintÚrandZbev_xyzrï   rð   ÚspÚeyer   rz   rQ   Úwrist_KZ	wrist_wtcrê   ÚoutÚitemsÚkr‡   ÚhasattrÚtuplery   r   r   r   r   Ú<module>   s`    
	%
!+  

""
ÿ€ð