o
    +†jD  ã                   @   s˜  d Z ddlZddlZddlmZ ddlm  mZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd„ deƒZedkrÈe ej ¡ rRdnd¡Zed	d
d e¡ ¡ Z e!dd„ e  "¡ D ƒƒZ#e$de#d›ƒ e %ddee¡ e¡Z&e %ddee¡ e¡Z'e %dd¡ e¡e Z(e )¡  e e&e'e(ƒZ*W d  ƒ n1 s©w   Y  e* +¡ D ]\Z,Z-e-durÇe$de,› de.e-j/ƒ› ƒ q²dS dS )uî  DinoVolumeQuery, image-concat 2view variant.

Inputs (rgb_bev, rgb_wrist) â†’ horizontally concat to (3, IMG, 2*IMG) â†’ DinoV3 â†’ patches
over 28Ã—56 â†’ upsample to 56Ã—112 â†’ take first 56 columns (BEV half) for volume head.

DINO's self-attention naturally crosses both views via the extra patches; the wrist's
features influence the BEV-side patches through attention, but the world-space scoring
remains anchored on the BEV camera (which is the only one with a fixed world frame here).
é    N)ÚDINO_REPO_DIRÚDINO_WEIGHTS_PATHÚN_WINDOWÚN_HEIGHT_BINSÚN_GRIPPER_BINSÚ
N_ROT_BINSÚD_FEATÚD_SINZÚD_SINTÚD_CONDÚN_BLOCKSÚIMG_SIZEÚ	PRED_SIZEÚsinusoidal_featuresÚAdaLNZeroMLPBlockÚDinoVolumeQueryc                       s*   e Zd ZdZ‡ fdd„Zddd„Z‡  ZS )ÚDinoVolumeQueryConcatzOImage-concat variant: takes (rgb_bev, rgb_wrist) and concatenates side-by-side.c                    s   t ƒ j|i |¤Ž d S ©N)ÚsuperÚ__init__)ÚselfÚargsÚkwargs©Ú	__class__© ú;/data/cameron/para/libero/model_dino_volume_query_concat.pyr      s   zDinoVolumeQueryConcat.__init__Nc           &      C   sŠ  |j d }| j}| j}| j}tj||gdd}	|  |	¡\}
}tj|
| j	d| j	 fddd}|  
|¡}|dd	| j	…f }|j d
d	… \}}| jrˆ|d || j   ¡  d|d ¡}|d || j   ¡  d|d ¡}tj||jd}||d	d	…||f }|  tj||gdd¡}n|  |¡}| d¡ |||¡ || |¡}|  | j¡}| d¡ ||d¡ || d¡}|}| jD ]}|||ƒ}q¸|  |¡}| |||¡}|  |¡}|  |¡}| jdkrè|  |¡ ||d| j¡}n|  |¡}|dd	| j…f }|d| j| j| j  …f } |d| j| j  d	…f }!t !d||¡}"t !d| | j"¡}#t !d|!| j¡}$|" d¡|# d¡ d¡ |$ d¡ d¡ d¡ }%|%|||dœS )uY  rgb_bev, rgb_wrist: each (B, 3, IMG, IMG)
           start_pix: (B, 2) â€” EEF pixel in BEV frame (IMG-coords).

        BEV is left half, wrist is right half. We crop the BEV-half from the upsampled
        feature map so volume_logits live in BEV pixel/world space (matched to bev_xyz table
        used at inference for 3D recovery).
        r   éÿÿÿÿ)Údimé   ÚbilinearF)ÚsizeÚmodeÚalign_corners.Néþÿÿÿ).r   é   ).r%   )ÚdeviceZper_axisé   zbtc, bchw -> bthwzbtc, zc   -> btzzbtc, tc   -> bt)Úvolume_logitsZgripper_logitsZrotation_logitsZpixel_feats)#ÚshapeÚn_windowZn_height_binsÚd_modelÚtorchÚcatZ_extract_dino_featuresÚFÚinterpolateZ	pred_sizeÚrefineZuse_eefÚ
image_sizeÚlongÚclampÚaranger&   Z
input_projÚ	unsqueezeÚexpandÚreshapeZt_cond_projZt_sinÚblocksZ
final_normÚviewZq_headZ	grip_headÚrotation_modeZrot_headZ
n_rot_binsZd_featZd_sin_zÚeinsumZz_sin)&r   Úrgb_bevÚ	rgb_wristZ	start_pixZkp_zyxÚBÚTÚZÚdZrgb_catZ	patch_catÚclsZfeat_upZF_fullZF_featÚHÚWÚsxÚsyZb_idxZeef_featZq_inZq_in_btZcond_tZcond_btÚhZblkZpenultZ	q_spatialZgripperÚrotationZq_FZq_zZq_tZscore_yxZscore_zZscore_tr(   r   r   r   Úforward   sb   
ÿ
""
  





ÿþÿüzDinoVolumeQueryConcat.forwardr   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   rI   Ú__classcell__r   r   r   r   r      s    r   Ú__main__ÚcudaÚcpué   Ú1d_pca)r*   r:   c                 c   s    | ]
}|j r| ¡ V  qd S r   )Úrequires_gradÚnumel)Ú.0Úpr   r   r   Ú	<genexpr>o   s   € rX   zTrainable: ú,r   r'   z  z: )0rM   Úosr,   Útorch.nnÚnnZtorch.nn.functionalÚ
functionalr.   Zmodel_dino_volume_queryr   r   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   rJ   r&   rP   Úis_availableÚtoÚevalÚmÚsumÚ
parametersÚn_tÚprintÚrandr<   r=   ÚspÚno_gradÚoutÚitemsÚkÚvÚtupler)   r   r   r   r   Ú<module>   s.    	H	T
ÿ"€õ
