o
    ,eÇiŒ=  ã                   @   sH  d Z ddlZddlZddlmZ ddlm  mZ ej 	dd¡Z
ej 	dd¡ZdZdZd	Zd	Zd
ZdZG dd„ dejƒZedkr"ddlZde_de_e ej ¡ rTdnd¡ZededZe e¡ZdZe eddd¡ e¡Z e eddd¡ e¡Z!e "ddgge ¡ e¡Z#e $eed¡ e¡Z%e $ee¡ &¡  e¡Z'e (d¡ )d¡ *edd¡ e¡ +¡ Z,de,dd…ddf< de,dd…ddf< e "g d¢g d¢g d¢g¡ )d¡ *edd¡ e¡ +¡ Z-e .¡  ee e!e#e%e'e,e-e,e-d	\Z/Z0Z1Z2W d  ƒ n1 sûw   Y  e3de/j4› ƒ e3de0j4› ƒ e3d e1j4› ƒ e3d!e2j4› ƒ dS dS )"u  Cost-volume PARA: fused third-view + wrist-view features for heatmap prediction.

For each pixel (u,v) in the agentview and each height bin h:
  1. Project the 3D cell onto the wrist camera, sample 16-dim wrist features
  2. Get 16-dim agentview features at (u,v)
  3. Get 16-dim learned height embedding for bin h
  4. Concatenate â†’ 48-dim â†’ 3-layer MLP â†’ per-timestep heatmap score

Gripper/rotation: at the selected 3D cell (GT during train, argmax during eval),
extract the same 48-dim fused feature and map through 2-layer MLPs.
é    NÚDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3ÚDINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   é   é    é@   c                       sh   e Zd Zdeeeedf‡ fdd„	Z‡ fdd„Zdd„ Z	d	d
„ Z
dd„ Zdd„ Z				ddd„Z‡  ZS )ÚCostVolumePredictoréÀ  Fc                    sV  t ƒ  ¡  || _|| _|| _|| _t| _|| _d| _	t
dƒ tjjtddtd| _|r;| j ¡ D ]}d|_q0| j ¡  | jj| _| j}	t t |	¡d ¡| _t tj|	|	dd	d
t ¡ tj|	|	dd	d
t ¡ tj|	|dd	d
t ¡ ¡| _t tj|	|	dd	d
t ¡ tj|	|	d dd	d
t ¡ tj|	d |dd	d
t ¡ ¡| _t t ||¡d ¡| _|d }
t t |
|
d d	¡t ¡ t |
d |d	¡t ¡ t ||d	¡¡| _t t  |
|
d ¡t ¡ t  |
d d¡¡| _!t"dd„ |  ¡ D ƒƒ}t"dd„ |  ¡ D ƒƒ}t
d|d›d|d›dƒ t
d|› d|
› ƒ t
d|
› d|› dƒ t
d|
› dƒ t
dƒ d S )NÚcost_volumezLoading DINOv2 model...Údinov3_vits16plusÚlocal)ÚsourceÚweightsFg{®Gáz”?é   é   )Úpaddingé   c                 s   s    | ]}|  ¡ V  qd S ©N)Únumel©Ú.0Úp© r   ú@/data/cameron/para_normalized_losses/libero/model_cost_volume.pyÚ	<genexpr>Z   s   € z/CostVolumePredictor.__init__.<locals>.<genexpr>c                 s   s    | ]
}|j r| ¡ V  qd S r   )Úrequires_gradr   r   r   r   r   r   [   s   € u   âœ“ CostVolume model: ú,z / z trainable paramsz  Feature dim: z	, fused: z  Volume MLP: u    â†’ z	 per cellz  Gripper MLP: u    â†’ 2z  No rotation prediction)#ÚsuperÚ__init__Útarget_sizeÚ	pred_sizeÚn_windowÚn_height_binsÚDINO_PATCH_SIZEÚ
patch_sizeÚfeat_dimÚ
model_typeÚprintÚtorchÚhubÚloadr   r   ÚdinoÚ
parametersr   ÚevalÚ	embed_dimÚnnÚ	ParameterÚrandnÚstart_keypoint_embeddingÚ
SequentialÚConv2dÚGELUÚagent_convsÚwrist_convsÚheight_embeddingsÚ
volume_mlpÚLinearÚgripper_mlpÚsum)Úselfr   r    r!   r"   r%   Úfreeze_backboneÚkwargsÚparamÚDZ	fused_dimÚn_totalÚn_trainable©Ú	__class__r   r   r      s^   
ÿ

ýýýþzCostVolumePredictor.__init__c                    s   t ƒ  |¡ | j |¡| _| S r   )r   Útor+   )r=   ÚdevicerD   r   r   rF   b   s   zCostVolumePredictor.toc           	      C   sÊ   |j d }| j |¡\}\}}| jjD ]}| jjr!| jj||dnd }|||ƒ}q| jjr@| j |d d …| jjd d …f ¡}n| j |¡d d …| jjd d …f }| |||| j	¡ 
dddd¡ ¡ }|S )Nr   )ÚHÚWr   r   r   )Úshaper+   Úprepare_tokens_with_masksÚblocksÚ
rope_embedÚuntie_cls_and_patch_normsÚnormÚn_storage_tokensÚreshaper.   ÚpermuteÚ
contiguous)	r=   ÚxÚBÚx_tokensÚH_pÚW_pÚblkÚrope_sincosÚpatchesr   r   r   Ú_extract_featuresg   s   
&$"z%CostVolumePredictor._extract_featuresc           1      C   sœ  |j \}}}	}
|j}t|ƒ}|	|
 }| j| j }tjtj|	|tjdtj|
|tjddd\}}|d | }|d | }t 	|¡}tj
|||gdd |d¡}t |¡}t d||¡}|d	d	…d	d…d	d…f }t d
||¡}|d	d	…d	d…df }|d	d	…d	d	…df }| d|d¡}|d	d	…df  |dd¡}|| | d¡d  }|dk}| |ddd¡| d¡| d¡  }|d	d	…d	d…d	d…f  dd¡} |d	d	…d	d…df }!||! |ddd¡ }"|" ||| d¡}#t d
| |#¡}$|$ |||d¡}$|$d	d	…d	d	…d	d	…df jdd}%|d	d	…ddf  |dd¡}&|d	d	…ddf  |dd¡}'|d	d	…ddf  |dd¡}(|d	d	…ddf  |dd¡})|&|$d	d	…d	d	…d	d	…df  |% |( }*|'|$d	d	…d	d	…d	d	…df  |% |) }+d|* | jd  d },d|+ | jd  d }-t ||,t |,d¡¡},t ||-t |-d¡¡}-tj
|,|-gdd || |	|
d¡}.| d¡ ||||	|
¡ || ||	|
¡}/tj|/|.dddd}0|0 ||||	|
¡S )u¡   Sample wrist features for each (u,v,h) cell in agentview. Fully vectorized.

        Returns: (B, Nh, feat_dim, H, W) â€” wrist features at each 3D cell
        )rG   ÚdtypeÚij)Úindexingç      à?éÿÿÿÿ©Údimr   zbij,nj->bniNzbij,bnj->bnir   r   g:Œ0âŽyE>r   g-Cëâ6?)Úming       @ç      ð?g      $@ÚbilinearÚzerosF)ÚmodeÚpadding_modeÚalign_corners)rJ   rG   Úlenr   r    r(   ÚmeshgridÚarangeÚfloat32Ú	ones_likeÚstackrQ   ÚinverseÚeinsumÚviewÚ	unsqueezeÚ	transposeÚclampÚwhereÚ	full_likeÚexpandÚFÚgrid_sample)1r=   Úwrist_featsÚagent_cam_poseZagent_cam_KÚwrist_cam_poseZwrist_cam_KÚheight_binsrU   ÚCrH   rI   rG   ÚNhZHWÚscaleÚysÚxsZxs_imgZys_imgÚonesZpix_hZagent_K_invZrays_camZagent_RZ
rays_worldÚcam_posZray_zÚheightsZcam_zÚtÚvalidÚ	points_3dZwrist_R_invÚwrist_tZpts_centeredZpts_flatZpts_camÚzÚfxÚfyÚcxÚcyZu_wZv_wÚgrid_xÚgrid_yÚgridZ	wrist_expÚsampledr   r   r   Ú_sample_wrist_featurest   s\   
ý

$"$(("(z*CostVolumePredictor._sample_wrist_featuresc                 C   sf   |j \}}}}| j}| d¡ |||||¡}| j d||dd¡ |||||¡}	tj|||	gdd}
|
S )a  Build fused feature volume: concat(agent, wrist, height) at each cell.

        Args:
            agent_feats_16: (B, feat_dim, H, W)
            wrist_sampled:  (B, Nh, feat_dim, H, W)

        Returns:
            fused: (B, Nh, 3*feat_dim, H, W)
        r   r   rb   )rJ   r"   rt   ry   r8   rs   r(   Úcat)r=   Úagent_feats_16Úwrist_sampledrU   r€   rH   rI   r   Z	agent_expZh_embÚfusedr   r   r   Ú_build_fused_volumeÂ   s   
"z'CostVolumePredictor._build_fused_volumec                 C   sÈ   |j \}}}}}|j d }	|j}
|d  ¡  d|d ¡}|d  ¡  d|d ¡}| ¡  d|d ¡}tj||
d |d¡ ||	¡}|||dd…||f }| ||	 |¡}|  	|¡ ||	d¡}d}||fS )a»  Extract fused features at specific (u, v, h) cells and predict gripper/rotation.

        Args:
            fused_volume:     (B, Nh, 3*feat_dim, H, W)
            query_pixels:     (B, N_WINDOW, 2) pixel coords in pred_size space
            query_height_bins:(B, N_WINDOW) height bin indices

        Returns:
            gripper_logits:  (B, N_WINDOW, 2)
            rotation_pred:   (B, N_WINDOW, 3) sigmoid [0,1] delta axis-angle
        r   ).r   r   ).r   ©rG   Nr   )
rJ   rG   Úlongrv   r(   rm   rs   ry   rQ   r;   )r=   Zfused_volumeÚquery_pixelsZquery_height_binsrU   r   ÚC3rH   rI   ÚNrG   ÚpxÚpyZhbÚ	batch_idxÚfeatsÚflatÚgripperÚrotationr   r   r   Úpredict_at_pixelsÙ   s   
z%CostVolumePredictor.predict_at_pixelsNc
           (      C   s¶  |j d }
|j}|  |¡}|j \}}}}|durj| ¡ dkr'| d¡ |
d¡}|dd…df | | j  ¡  d|d ¡}|dd…df | | j  ¡  d|d ¡}t	j
|
|d}||dd…||f  | j d¡7  < tj|| j| jfddd}|  |¡}|durF|durF|  |¡}tj|| j| jfddd}|  |¡}| ¡ }|dd…df  | j9  < |dd…df  | j9  < |	 ¡ }|dd…df  | j9  < |dd…df  | j9  < ddl}t	j|j|j| j|d}|  ||||||¡}|  ||¡}| j}| j } }!| |
| | jd	 | |!¡}"|  |"¡}#|# |
|| j| |!¡}$|$ dd
dd	d¡ ¡ }%|dur<|dur<|  |||¡\}&}'nd }&}'|%|&|'|fS t	j |
| j| j| j| j|d}%|%dddfS )u;  
        Returns:
            volume_logits:   (B, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
            gripper_logits:  (B, N_WINDOW, 2) or None
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS) or None
            fused_volume:    (B, Nh, 3*feat_dim, H, W) â€” for predict_at_pixels at eval
        r   Nr   ra   r›   rf   F)Úsizerh   rj   r   r   r   )!rJ   rG   r\   rc   rt   ry   r   rœ   rv   r(   rm   r2   rz   Úinterpolater    r6   r7   ÚcloneÚmodelÚlinspaceÚ
MIN_HEIGHTÚ
MAX_HEIGHTr"   r•   rš   rQ   r%   r9   r!   rR   rS   r§   rg   )(r=   Z	agent_imgZ	wrist_imgÚstart_keypoint_2dÚagent_query_pixelsÚagent_query_height_binsr}   Úagent_cam_K_normr~   Úwrist_cam_K_normrU   rG   Zagent_patchesÚ_rA   rW   rX   ZskxZskyÚbiÚagent_featsr—   Zwrist_patchesr|   Zwrist_feats_16Zagent_KZwrist_KÚmodel_moduler   r˜   r™   r   rH   rI   Z
fused_flatZvol_flatÚvolÚvolume_logitsÚgripper_logitsÚrotation_logitsr   r   r   Úforward÷   sf   

**$ÿ

ÿ
ÿÿ


ÿ
ÿzCostVolumePredictor.forward)NNNNNNNN)Ú__name__Ú
__module__Ú__qualname__Ú	PRED_SIZEÚN_WINDOWÚN_HEIGHT_BINSÚFEAT_DIMr   rF   r\   r•   rš   r§   r¼   Ú__classcell__r   r   rD   r   r      s    þCNýr   Ú__main__g…ëQ¸í?g333333ó?ÚcudaÚcpur	   )r   r!   r   r   g      l@ra   gÍÌÌÌÌÌä?gš™™™™™ù?)ç®Gázæ?r   r`   )r   rÈ   r`   )r   r   re   )r¯   r°   r±   r}   r²   r~   r³   z
volume:   z
gripper:  z
rotation: z
fused:    )5Ú__doc__Úosr(   Útorch.nnr/   Útorch.nn.functionalÚ
functionalrz   ÚenvironÚgetr   r   r#   rÁ   rÂ   Ú
N_ROT_BINSrÀ   rÃ   ÚModuler   r½   r«   r·   r­   r®   rG   rÆ   Úis_availableÚmrF   rU   r1   ÚaÚwÚtensorÚkprg   Úqprœ   ZqhÚeyert   ry   ÚfloatÚcamÚKÚno_gradr¸   ÚgripÚrotr™   r'   rJ   r   r   r   r   Ú<module>   sX      
.
$$6
ýÿä