o
    Ú[Ëi³¾  ã                   @   s"  d Z ddlZddlZddlZddlZddlmZ ddlZddlZddl	Z
ddlZddlm  mZ ddlmZ ej dej e¡¡ ddlZddlmZmZmZmZmZ ddlmZ dd„ Zdd	lm Z!m"Z" dd
l#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z) e
j*g d¢e
j+dZ,e
j*g d¢e
j+dZ-dZ.e.fdd„Z/e.fdd„Z0e.fdd„Z1e.ddfdd„Z2e.dfdd„Z3e.dfdd„Z4e.dfdd„Z5dd „ Z6d!d"„ Z7e8d#krej9d$d%Z:e:j;d&e<d'g d(¢d)d* e:j;d+e<d,d-d. e:j;d/e<d0d1d2 e:j;d3e<d4d5d. e:j;d6e=dd7 e:j;d8e<d9d7 e:j;d:e=d;d<d. e:j;d=e=d>d?d. e:j;d@e=dd7 e:j;dAe<dBd7 e:j;dCdDdEdF e:j;dGdDdHdF e:j;dIe=dJdKd. e:j;dLe<dMdNd. e:j;dOdDdPdF e:j;dQdDdRdF e:j;dSe>dTdUd. e:j;dVe>dTdWd. e:j;dXdDdYdF e:j;dZdDd[dF e:j;d\dDd]dF e: ?¡ Z@e7e@ƒ dS dS )^u¬  eval.py â€” Evaluate a trained PARA checkpoint in the LIBERO simulation environment.

PARA predicts the next N_WINDOW absolute EEF 3D positions from a single RGB image.
This script runs closed-loop rollouts: at each env step, re-run the model on the
current observation, decode the first predicted position into a delta OSC_POSE action,
and step the sim. Success is the LIBERO binary predicate check (env returns done=True).

Usage:
    python libero/eval.py         --checkpoint libero/checkpoints/para_libero_spatial_t0/best.pth         --benchmark libero_spatial         --task_id 0         --n_episodes 20

Action format: 7D OSC_POSE [delta_pos (3), delta_rot (3)=0, gripper (1)]
é    N)ÚPath)Útqdm)ÚTrajectoryHeatmapPredictorÚN_HEIGHT_BINSÚN_GRIPPER_BINSÚ
N_ROT_BINSÚ	PRED_SIZE)Ú*recover_3d_from_direct_keypoint_and_heightc           
      C   sò   | dkrt S | dkrddlm} |S | dkrddlm} |S | dkr*ddlm} |S | d	kr6dd
lm} |S | dkrBddl	m
} |S | dkrNddlm} |S | dkrZddlm} |S | dkrfddlm} |S | dkrrddlm}	 |	S td| › ƒ‚)NÚparaÚactr   )ÚACTPredictorÚda3)ÚDA3PredictorÚmoge)ÚMoGePredictorÚdino_vla)ÚDinoVLAPredictorÚinternvl)ÚInternVLAPredictorÚinternvl_act)ÚInternVLACTPredictorÚdual_da3)ÚDualDA3PredictorÚ	dual_para)ÚDualParaPredictorÚcost_volume)ÚCostVolumePredictorzUnknown model_type: )r   Z	model_actr   Z	model_da3r   Z
model_moger   Zmodel_dino_vlar   Zmodel_vla_internvlr   Zmodel_vla_internvl_actr   Zmodel_dual_da3r   Zmodel_dual_parar   Zmodel_cost_volumer   Ú
ValueError)
Ú
model_typer   r   r   r   r   r   r   r   r   © r   ú5/data/cameron/para_normalized_losses/libero/./eval.pyÚget_model_class&   s<   r!   )Ú	benchmarkÚget_libero_path)ÚOffScreenRenderEnv)Úget_camera_transform_matrixÚget_camera_extrinsic_matrixÚget_camera_intrinsic_matrixÚ#project_points_from_world_to_camera)g
×£p=
ß?gÉv¾Ÿ/Ý?g–C‹lçûÙ?©Údtype)gZd;ßOÍ?gyé&1¬Ì?gÍÌÌÌÌÌÌ?iÀ  c                 C   sb   |   tj¡d }t |¡ ¡ }tj|||ftjd}|t t	 }t
 | ddd¡¡ ¡  d¡}|S )uà   HxWx3 uint8 â†’ (1, 3, H, W) float tensor, ImageNet-normalized.

    LIBERO obs images are already upright (already flipped vs raw render),
    so we flipud to match training convention (flipud(obs) â†’ training image).
    ç     ào@©Zinterpolationé   r   é   )ÚastypeÚnpÚfloat32ÚflipudÚcopyÚcv2ÚresizeÚINTER_LINEARÚIMAGENET_MEANÚIMAGENET_STDÚtorchÚ
from_numpyZ	transposeÚfloatÚ	unsqueeze)Úrgb_obsÚ
image_sizeZimgÚtensorr   r   r    Úpreprocess_obsY   s   r@   c                 C   sx   t | |||ƒ}t| |ƒ}t| |||ƒ}|d  |  < |d  |  < | ¡ }|d  |9  < |d  |9  < |||fS )uª  Return camera matrices needed for projection and 3D recovery.

    - world_to_camera: (4,4) worldâ†’camera transform  â†’ for project_points_from_world_to_camera
    - camera_pose:     (4,4) cameraâ†’world transform   â†’ for recover_3d (ray unprojection)
    - cam_K:           (3,3) intrinsic at image_size  â†’ for recover_3d
    These are two different matrices; using the wrong one for 3D recovery gives bad targets.
    r   r.   )r%   r&   r'   r3   )ÚsimZcamera_namer>   Úworld_to_cameraÚcamera_poseZ
cam_K_normÚcam_Kr   r   r    Úget_camera_paramsg   s   

rE   c                 C   sP   t |  dd¡ tj¡|||dd }t|d ƒ}t|d ƒ}tj||gtjdS )uQ   Project current EEF world position â†’ (u, v) pixel in training image convention.r.   é   )ZpointsZworld_to_camera_transformZcamera_heightZcamera_widthr   r)   )	r(   Úreshaper/   r0   Úfloat64r;   r9   r?   r1   )Zeef_posrB   r>   Úpix_rcÚuÚvr   r   r    Úeef_to_start_kpz   s   üûrL   c              
   C   s4  |j d }|| }	|  tj¡d }
t |
¡ ¡ }
tj|
||ftjd}
|d }t	j
| d¡dd |j ¡}|jddd  ¡  ¡ }tj|||ftjd}|| ¡  | ¡ d  }t |
¡}||d< t |
d	 |d
  dd¡}|d  tj¡}| ¡ }|| || }}t|d |	 ƒ}t|d |	 ƒ}t |||fdtjddtj¡ t| dd¡ tj¡|||ƒd }ttt|d ƒƒƒttt|d ƒƒƒ}}d|  krÒ|k rën nd|  krÞ|k rën nt |||fddd¡ d|› }|durü||rùdnd7 }t ||dtjd	ddtj¡ t ||dtjd	ddtj¡ |S )zPRender a single eval step: RGB + heatmap overlay + predicted pixel + GT EEF dot.éÿÿÿÿr+   r,   ©r   r   r   ©Údimç:Œ0âŽyE>©.r   çš™™™™™á?çÍÌÌÌÌÌÜ?r.   ç      à?©r   éÿ   r   é   r-   rF   é   ©rW   rW   rW   ústep Nz	  SUCCESSz	  running©é
   é   ©é   r`   r`   )Úshaper/   r0   r1   r2   r3   r4   r5   r6   ÚFÚsoftmaxrG   ÚmaxÚcpuÚnumpyÚminÚ
zeros_likeÚclipÚuint8ÚargmaxÚintÚ
drawMarkerÚMARKER_CROSSÚLINE_AAr(   rH   Úroundr;   ÚcircleÚputTextÚFONT_HERSHEY_SIMPLEX)r=   Úvolume_logitsÚcurrent_eef_posrB   rD   r>   Ústep_idxÚsuccessÚ	pred_sizeÚscaleÚframeÚvol_tÚ	vol_probsÚ
heat_smallÚheatÚheat_rgbÚoverlayÚvisÚflat_idxÚpyÚpxÚpx_fullÚpy_fullrI   rJ   rK   Úlabelr   r   r    Úrender_eval_frame‡   sD   

þý*0
rˆ   c                 C   sV  |j d }|j d }|| }	|  tj¡d }
t |
¡ ¡ }
tj|
||ftjd}
t	| 
dd¡ tj¡|||ƒd }ttt|d ƒƒƒttt|d ƒƒƒ}}g }t|ƒD ]Ï}|d|f }tj| 
d¡dd 
|j ¡}|jddd  ¡  ¡ }tj|||ftjd}|| ¡  | ¡ d  }t |
¡}||d	< t |
d
 |d  dd¡}|d  tj¡}| ¡ }|| || }}t|d |	 ƒ}t|d |	 ƒ}t |||fdtjddtj¡ d|  krá|k rún nd|  krí|k rún nt |||fddd¡ d|› d|› }t ||dtjdddtj¡ t ||dtjdddtj¡ |  |¡ qTtj!|ddS )a
  Render a horizontal strip showing heatmaps for all N_WINDOW predicted timesteps.

    Each tile: RGB + heatmap overlay (red) + predicted pixel (green cross) + GT EEF (white dot).
    Returns a single wide image: (image_size, image_size * n_window, 3) uint8 RGB.
    r.   rM   r+   r,   rF   r   rO   rQ   rR   rS   rT   rU   rV   é   r-   é   rZ   r[   z t+)é   é   gš™™™™™Ù?r_   )Úaxis)"ra   r/   r0   r1   r2   r3   r4   r5   r6   r(   rG   rH   rl   rp   r;   Úrangerb   rc   rd   re   rf   rg   rh   ri   rj   rk   rm   rn   ro   rq   rr   rs   ÚappendZconcatenate)r=   rt   ru   rB   rD   r>   rv   Ún_windowrx   ry   rz   rI   Zeef_uZeef_vZtilesÚtr{   r|   r}   r~   r   r€   r   r‚   rƒ   r„   r…   r†   r‡   r   r   r    Úrender_window_strip·   sH   

þý*
0r’   çš™™™™™©?c	           3         sØ  ddl m}	 d}
d}| jd }| jd }|| }tjtj}}tjtj}}tj	tj
tjd‰tj	tjtjd‰ g }g }t|ƒD ]:‰| dˆf }|jddd }| d¡ ¡  ¡ }|| }|| }|d	d	…||f  ¡  ¡ }| ||f¡ | |¡ q@tjd
d„ |D ƒtj|jd d¡}tj|tj|jd d¡}t ¡ $ t|dƒr³|jdkr³| |||¡\}‰n| ||¡\}‰W d	  ƒ n1 sÅw   Y  g }g }| ¡ } t|ƒD ]\‰\}}| dˆf }|d | }!|d | }"|d	d	…||f  ¡  ¡ }|ttd dƒ ||  | }#t tj	|!|"gtjd|#||ƒ}$|$d	u r*|r&|d n|  ¡ }$| |$¡ |$|  }%tj! "|%¡}&|&|krD|%|& | }%t #|%|
 dd¡}'tj	tj$tjd}(|	 %|(¡})ˆd	u ritj&dtjd}*nLˆ '¡ dkr‡ˆdˆf  (¡  )¡  *tj¡}+|+ˆ ˆ  ˆ },nt 	‡ ‡‡‡fdd„tdƒD ƒ¡},|)|	 +|,¡ }-|	 %|¡}.|-|. ,¡  }/t #|/ -¡ | dd¡}*t.|dˆf  ¡  ¡ ƒ}0|0dkrÈdnd}1tj&dtjd}2|'|2d	d…< |*|2dd…< |1|2d< | |2¡ qÖ||fS )uá  Decode all N_WINDOW predicted timesteps into OSC_POSE delta actions.

    Gripper/rotation are predicted by indexing features at the argmax pixel of each
    timestep and passing through the model's MLP heads (same as training inference path).

    Args:
        volume_logits:   (1, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
        model:           TrajectoryHeatmapPredictor (for predict_at_pixels)
        feats:           (1, D, pred_size, pred_size) feature map from forward()
        camera_pose:     (4,4) cameraâ†’world extrinsic  â† get_camera_extrinsic_matrix()
        cam_K:           (3,3) intrinsic at image_size
        current_eef_pos: (3,) numpy EEF position at start of window
        max_delta:       max position delta magnitude in metres before OSC normalisation

    Returns:
        actions:         list of N_WINDOW (7,) numpy [delta_pos(3), delta_rot_axisangle(3), gripper(1)]
        pred_3d_targets: list of N_WINDOW (3,) absolute EEF targets (for debug)
    r   ©ÚRotationr“   rU   r.   rM   r)   rO   Nc                 S   s   g | ]\}}||g‘qS r   r   )Ú.0r„   rƒ   r   r   r    Ú
<listcomp>  s    z)decode_window_actions.<locals>.<listcomp>©r*   ÚdeviceZgripper_mlpr   ç      ð¿ç      ð?rF   c                    sN   g | ]#}ˆd ˆ|dd…f   ¡  ¡ ttd dƒ ˆ | ˆ|   ˆ|  ‘qS ©r   Nr.   ©rk   Úitemrd   r   ©r–   r   ©Úmax_rÚmin_rZrotation_logitsr‘   r   r    r—   N  ó    (þÿÿÿé   rY   )/Úscipy.spatial.transformr•   ra   Úmodel_moduleÚ
MIN_HEIGHTÚ
MAX_HEIGHTÚMIN_GRIPPERÚMAX_GRIPPERr0   ÚarrayÚMIN_ROTrH   ÚMAX_ROTrŽ   rd   rG   rk   rž   r   r9   r?   r1   r™   r<   ZlongÚno_gradÚhasattrr   Úpredict_at_pixelsr3   Ú	enumerater   r	   ÚlinalgÚnormri   ÚREF_ROTATION_QUATÚ	from_quatÚzerosrP   re   rf   r/   Zfrom_rotvecÚinvÚ	as_rotvecrl   )3rt   ÚmodelÚfeatsrC   rD   ru   Úcurrent_eef_quatr>   Ú	max_deltaÚScipyRÚOSC_POS_SCALEÚOSC_ROT_SCALEr   rx   ry   Úmin_hÚmax_hÚmin_gÚmax_gZpred_px_listZpred_hbin_listr{   Ú
max_over_hr‚   rƒ   r„   Úh_binÚpred_pixelsZ
pred_hbinsZgripper_logitsÚactionsÚpred_3d_targetsÚref_posr…   r†   ÚheightÚpred_3dÚ	delta_posr³   Ú
delta_normZref_quatZR_refÚdelta_rot_normZrot_sigmoidZdelta_rotvec_predÚR_predÚ	R_currentÚR_deltaÚg_classÚgripper_cmdÚactionr   r    r    Údecode_window_actionsï   s–   

ÿþ
€ü	ÿ




ý
rÕ   c           E   	      s  ddl m} d}d}| jd }| jd }|| }tjtj}}tjtjtj	d‰tjtj
tj	d‰ g }g }|
 ¡ }t|ƒD ]Ê}| d|f }|d|f }|jddd }| d¡ ¡  ¡ }|| }|| }|d | } |d | }!|d	d	…||f  ¡  ¡ }"|"ttd dƒ ||  | }#ttj| |!gtj	d|#||ƒ}$d
}%|$d	urÜt|$ dd¡|	||ƒd }&t|&d ƒt|&d ƒ}'}(d})|)|'  krÊ||) k rÜn n|)|(  krØ||) k rÜn nd}%|%ré|}*|}+|},d}-|}.n
|}*|}+|},d}-|}.|*jddd }/|/ d¡ ¡  ¡ }0|0| }1|0| }2|2d | }3|1d | }4|*d	d	…|1|2f  ¡  ¡ }5|5ttd dƒ ||  | }6ttj|3|4gtj	d|6|+|,ƒ}7|7d	u rZ|$d	urO|$n
|rV|d n| ¡ }7| |7¡ |7| }8tj |8¡}9|9|krt|8|9 | }8t |8| dd¡}:tj|2|1ggtj|.jd d¡};t ¡  |j|.|;|-d\}<‰W d	  ƒ n	1 s§w   Y  t ‡ ‡‡fdd„tdƒD ƒ¡}=|  d|=¡}>| !|¡}?|>|? "¡  }@t |@ #¡ | dd¡}At$|<d  ¡  ¡ ƒ}B|Bdkrédnd}Ctj%dtjd}D|:|Dd	d…< |A|Ddd…< |C|Dd< | |D¡ q=||fS )a/  Decode dual-camera predictions with agentview-guided wrist selection.

    Strategy: use agentview for coarse 3D prediction, then check if that 3D point
    projects into the wrist camera frustum. If yes, use wrist view's prediction
    (more precise at close range). If no, fall back to agentview.
    r   r”   r“   rU   r.   rM   r)   rO   NFrF   r`   TZwristZagentrš   r›   r˜   )Ú	view_namec                    sN   g | ]#}ˆd d |dd…f   ¡  ¡ ttd dƒ ˆ | ˆ|   ˆ|  ‘qS rœ   r   rŸ   ©r¡   r¢   Z
rot_logitsr   r    r—   Æ  r£   z.decode_dual_window_actions.<locals>.<listcomp>ÚxyzrN   r¤   rY   )&r¥   r•   ra   r¦   r§   r¨   r0   r«   r¬   rH   r­   r3   rŽ   rd   rG   rk   rž   r   r	   r(   r;   r   r²   r³   ri   r9   r?   r1   r™   r<   r®   r°   Ú
from_eulerrµ   r·   r¸   rl   r¶   )EZ	agent_volZ	wrist_volr¹   Úagent_featsÚwrist_featsÚagent_cam_poseZagent_cam_KÚwrist_cam_poseÚwrist_cam_KÚ	wrist_w2cru   r»   r>   r¼   r½   r¾   r¿   r   rx   ry   rÀ   rÁ   rÇ   rÈ   rÉ   r‘   Za_vol_tZw_vol_tZa_max_over_hZa_flatZa_pyZa_pxZ	a_px_fullZ	a_py_fullZa_h_binZa_heightZagent_3dZ	use_wristZwrist_pix_rcZwuZwvZmarginr{   Zcam_poserD   rÖ   rº   rÄ   r‚   rƒ   r„   r…   r†   rÅ   rÊ   rË   rÌ   r³   rÍ   rÆ   Zgrip_logitsÚ
euler_predrÏ   rÐ   rÑ   rÎ   rÒ   rÓ   rÔ   r   r×   r    Údecode_dual_window_actionse  s¬   


ÿÿþ8ÿ
$

 
ÿý
rá   c           "      C   sä  ddl m} d}d}tjtjtjd}tjtjtjd}	tjtjtjd}
tjtj	tjd}t
tjƒ}t
tjƒ}| jd }g }| ¡ }t|ƒD ]¨}| d|f  ¡  ¡  tj¡}||	|  | }|d|f  ¡  ¡  tj¡}|||
  |
 }t
|d|f  ¡ ƒ}|||  | }|| }tj |¡}|dkr›|| d }t || dd¡}| d	|¡}| |¡}|| ¡  }t | ¡ | dd¡}t
|d|f  ¡ ƒ}|d
krÐdnd} tjdtjd}!||!dd…< ||!dd…< | |!d< | |!¡ qG|S )ad  Decode ACT normalized [0,1] predictions into OSC_POSE delta actions.

    Model outputs are in [0,1] (sigmoid). We denormalize using dataset min/max,
    then compute deltas for the OSC_POSE controller.

    Args:
        pos_pred:     (1, N_WINDOW, 3) normalized [0,1] positions (tensor)
        rot_pred:     (1, N_WINDOW, 3) normalized [0,1] rotations (tensor)
        gripper_pred: (1, N_WINDOW) normalized [0,1] gripper (tensor)
        current_eef_pos:  (3,) numpy
        current_eef_quat: (4,) numpy

    Returns:
        actions: list of N_WINDOW (7,) numpy [delta_pos(3), delta_rot(3), gripper(1)]
    r   r”   r“   rU   r)   r.   rš   r›   rØ   ç        r¤   NrF   rY   )r¥   r•   r0   r«   r¦   ÚMIN_POSrH   ÚMAX_POSr¬   r­   r;   r©   rª   ra   r3   rŽ   re   rf   r/   r²   r³   ri   rÙ   rµ   r·   r¸   r¶   r1   r   )"Úpos_predÚrot_predÚgripper_predru   r»   r½   r¾   r¿   Úmin_posÚmax_posÚmin_rotÚmax_rotrÂ   rÃ   r   rÇ   rÉ   r‘   Zpos_normrË   Zrot_normrà   Zg_normZgripper_valrÌ   r³   rÍ   rÏ   rÐ   rÑ   Z	delta_rotZg_logitrÓ   rÔ   r   r   r    Údecode_act_actionsÜ  sH   



rì   c           o         s<  t  t j ¡ r	dn	t jj ¡ rdnd¡}td|› ƒ t| jƒ}| 	¡ s+t
d|› ƒ‚t j|dd}t| dtj¡ƒt_t| dtj¡ƒt_t| d	tj¡ƒt_t| d
tj¡ƒt_d|v rh|d t_|d t_d|v rv|d t_|d t_d|v r|d t_tdtjd›dtjd›dƒ tdtjd›dtjd›dƒ tddd„ tjD ƒ› ddd„ tjD ƒ› ƒ tddd„ tjD ƒ› ƒ tddd„ tjD ƒ› ddd„ tjD ƒ› ƒ t| jƒ}| jdkrç|ttd}n| jdv rô|t| jd }n|td!}|j|d" d#d$ | |¡}| ¡  td%| j› d&|› ƒ t  !¡ | j" ƒ }| #| j$¡}|j%}td'| j"› d(|› ƒ t&j' (t)d)ƒ| *| j$¡¡}	t+ ,|	d*¡‰ t-d+d„ ˆ d,  .¡ D ƒƒ}
‡ fd-d„|
D ƒ}W d   ƒ n	1 sgw   Y  t/| j0t1|ƒƒ}td.|› d/t1|ƒ› d0ƒ t&j' (t)d1ƒ|j2|j3¡}t4|tt| j5g| jd2v rd3gng  d4}| 6| j6¡ | 7¡  | j8r|j9j:}d5D ]!}z|j; <|¡}t= >g d6¢¡|j;j?|< W q¶ t@y×   Y q¶w | A¡  g d7¢}tBƒ }|D ]}z| C|j; <|¡¡ W qå t@yþ   Y qåw tD|j;jEƒD ]}|j;jF| |v rd8|j;jG| d9< qtd:ƒ td;ƒ d }| jd<v rbt&j' (| jH| j"d=| j$› d>¡}t&j' 	|¡rRt j||d Id?¡}td@|› ƒ ntdA|› dBƒ t jJdCdD|dE}d }| jdv rz| KdFdG¡g}tdH|d? › ƒ g }g }tLtD|ƒdIdJD ]æ}| 7¡  | j8rõ|j9j:}d5D ]}zt= >g d6¢¡|j;j?|j; <|¡< W q— t@y¶   Y q—w | A¡  tBƒ }d7D ]}z| C|j; <|¡¡ W qÀ t@yÙ   Y qÀw tD|j;jEƒD ]}|j;jF| |v ród8|j;jG| d9< qà||  M¡ }| jNd?ks| jOd?kr9dKD ]} | dC }!||!  | jN7  < ||!dC   | jO7  < q	dLD ]} | dC }!g dM¢||!|!d9 …< q'| P|¡}"tDdNƒD ]}#| Qt=jJdOt=jRdP¡\}"}#}#}#qBtS|j:| j5tƒ\}$}%}&d#}'d#}(| jTrjg nd })g }*d?}+d?},dQ}-|+| jUk rr|'srt=j>|"dR t=jVdP}.|"| j5› dS }/tW|.|$tƒ |¡}0tX|/tƒ |¡}1t=j>|"dT t=jVdP}2d }3| jdUv rHt=j>tjt=jVdP}4t=j>tjt=jVdP}5t jY|.|4 |5|4 dV  t jR|dW Zd?dC¡ Id?¡}6t|" dXd?d?g¡d? ƒ}7t jY|7tj tjtj dV  gt jR|dW Zd?dC¡ Id?¡}8i }9|d ur||9dY< |d ur||9dZ< t  [¡  ||1|0f|6|8d[œ|9¤Ž\}:};}<W d   ƒ n	1 s7w   Y  t\|:|;|<|.|2ƒ}=d }>n?| jd\v r |"d] }?tX|?tƒ |¡}@tS|j:d3tƒ\}A}B}Ct  [¡  ||1|@|0d^}DW d   ƒ n	1 s{w   Y  t]|Dd_ |Dd` ||Dda |Ddb |%|&|B|C|A|.|2tdc\}=}#|Dd_ }>nç| jddkrC|"d] }?tX|?tƒ |¡}@tS|j:d3tƒ\}E}F}G|& M¡ }H|Hd?  t  < |HdC  t  < |G M¡ }I|Id?  t  < |IdC  t  < t  [¡ C ||1|@|0t  ^|%¡ ¡  Id?¡ |¡t  ^|H¡ ¡  Id?¡ |¡t  ^|F¡ ¡  Id?¡ |¡t  ^|I¡ ¡  Id?¡ |¡de\}>}#}#}JW d   ƒ n	1 s/w   Y  t_|>||J|%|&|.|2tdc\}=}3nDi }K|d urN||KdY< |d urW||KdZ< t  [¡  ||1|0fi |K¤Ž\}>}#}#}JW d   ƒ n	1 stw   Y  t_|>||J|%|&|.|2tdc\}=}3t`| dfd#ƒ}L|Lr§|>d ur§ta|/|>|.|$|&t|+ƒ}M|* b|,|+|Mf¡ |,dC7 },tc|=ƒD ]¸\}N}O| jdr¾d8|Od9dg…< |)d ur,|>d urÞ|) bte|"| j5› dS |>|.|$|&t|+d dh¡ nN|"| j5› dS  ft=jR¡di }Pt= g|P¡ M¡ }Pthji|Pttfthjjdj}P|Pdi  ft=jk¡}Qdk|+› }Rth l|Q|Rdlthjmdmdndothjn¡ th l|Q|RdlthjmdmdpdCthjn¡ |) b|Q¡ | jorÛ|3d urÛ|3|N  ft=jV¡}S|Od9dg…  M¡ }T|Odg }Ud?}Vdq}Wdr}X|V|Wk r¶t=j>|"dR t=jVdP}Y|S|Y }Zt=jp q|Z¡}[|[|Xk rnnHt= r|Zds dQdt¡}\t=jJdOt=jRdP}]|\|]d d9…< | jds|T|]d9dg…< |-|]dg< | Q|]¡\}"}#}'}#|+dC7 }+|VdC7 }V|'rªdu}(n|+| jUkr±n|V|Wk sT|'sÚ|+| jUk rÚt=jJdOt=jRdP}^|U|^dg< | Q|^¡\}"}#}'}#|+dC7 }+|U}-n}| jsr|O M¡ }_|-|_dg< | Q|_¡\}"}#}'}#|+dC7 }+|'rûdu}( nn|+| jUkr nf| Q|O¡\}"}#}'}#|Odg }-|+dC7 }+nC| jtrG| Q|O¡\}"}#}'}#|Odg }-|+dC7 }+|'r1du}( n8|+| jUkr9 n0| Q|O¡\}"}#}'}#|+dC7 }+n| Q|O¡\}"}#}'}#|Odg }-|+dC7 }+|'r_du}( n
|+| jUkrg nq¯|+| jUk rr|'r}|)rü|>d ur—te|"| j5› dS |>t=j>|"dR t=jVdP|$|&t|+|(dh|)dv< n|(rœdwndx}`th l|)dv |`dythjmdz|(r­d{nd|dothjn¡ t| juƒd} d=| j$›  }a|ajvdudud~ |ad|d€›dF|(rÒdnd‚› dƒ }bth wtx|bƒthjyd„Ž | jzttf¡}c|)D ]‰ |c {th |ˆ thj}¡¡ qê|c ~¡  |*rF|(rdnd‚}dt| juƒd… d=| j$›  d|d€›dF|d›  }e|ejvdudud~ |*D ] \}f}g}h|ed†|fd€›d‡|gdˆ›d‰ }ith tx|iƒth ||hthj}¡¡ q%| bt|(ƒ¡ | b|+dC ¡ tL {dŠ|dC d‹›dŒ|(rbdndŽ› d|+dC › ¡ q†| €¡  tt= |¡ƒ}jtt= |¡ƒ}ktdd‘› ƒ td’| j"› ƒ td“| j$› d”|› ƒ td•|› ƒ td–t‚tƒ|ƒƒ› d/|› ƒ td—|jd˜ d™›dšƒ td›|kd™›d/| jU› ƒ td‘› ƒ | j"| j$|tx|ƒ||j|||k| jUdsdœœ}lt| juƒ}m|mjvdudud~ |md| j"› dž| j$› dŸ }nt„|nd ƒ‰ t…j†|lˆ dod¡ W d   ƒ n	1 	sw   Y  td¢|n› ƒ |lS )£NÚcudaÚmpsre   zDevice: zCheckpoint not found: )Zmap_locationZ
min_heightZ
max_heightZmin_gripperZmax_gripperrê   rë   rè   ré   Zref_rotation_quatzHeight  range: [ú.4fz, ú]zGripper range: [zRot     range: c                 S   ó   g | ]}|d ›‘qS ©z.3fr   ©r–   rK   r   r   r    r—   B  ó    zrun_eval.<locals>.<listcomp>z .. zRef rot:       c                 S   rñ   )rï   r   ró   r   r   r    r—   C  rô   zPos     range: c                 S   rñ   rò   r   ró   r   r   r    r—   D  rô   r
   )Útarget_sizerx   )r   r   )rõ   Ú
model_name)rõ   Zmodel_state_dictF)ÚstrictzLoaded model (z) from zTask: [z] ZdatasetsÚrc                 S   s   g | ]	}|  d ¡r|‘qS )Zdemo_)Ú
startswith©r–   Úkr   r   r    r—   [  s    Údatac                    s    g | ]}ˆ d |› d d ‘qS )zdata/z/statesr   r   rú   ©Úfr   r    r—   \  s     zRunning z / z episodes...Z
bddl_files)r   r   r   Zrobot0_eye_in_hand)Zbddl_file_nameZcamera_heightsZcamera_widthsZcamera_names)Zwooden_cabinet_1_mainZflat_stove_1_main)r   r   g      À)Zakita_black_bowl_2_mainZcookies_1_mainZ#glazed_rim_porcelain_ramekin_1_mainrâ   rF   u6   âœ“ Clean scene: distractors hidden, furniture removedzEnvironment ready.)r   r   Ztask_z_clip.ptr   zLoaded CLIP embedding: z%WARNING: CLIP embedding not found at z, using zerosr.   i   )r™   Ú_ú zTask text for VLA: ZEpisodes)Zdesc)é	   é%   )rŒ   é   é   )ç      $@r  gÍÌÌÌÌÌì?rŠ   r¤   r)   rš   Zrobot0_eef_posZ_imageZrobot0_eef_quat)r   r   rQ   r˜   Zrobot0_gripper_qposÚclip_embeddingZ	task_text)ru   Zcurrent_gripper)r   r   Zrobot0_eye_in_hand_image)Ústart_keypoint_2dZagent_volumeZwrist_volumerÚ   rÛ   )r>   r   )r  rÜ   Zagent_cam_K_normrÝ   Zwrist_cam_K_normÚsave_visrY   )rw   r+   r,   r[   r\   rS   rZ   r-   r_   é   g{®Gázt?r“   r›   TrM   ÚSUCCESSÚFAILURE)r]   é,   gffffffæ?rV   )r   r   rW   Zvideos)ÚparentsÚexist_okZepZ03drw   Zfailz.mp4Zmp4vr   ZreplanZ_stepZ04dz.pngz  Ep Z3dz: u   âœ“ SUCCESSu   âœ— FAILUREz  steps=Ú
z4====================================================z  Benchmark:    z  Task z:      z  Episodes:     z  Successes:    z  Success Rate: éd   z.1fú%z  Avg steps:    )r"   Útask_idÚ	task_nameÚ
checkpointÚ
n_episodesÚsuccess_rateÚ	successesÚstep_countsÚ	avg_stepsÚ	max_stepsr¼   Zeval_Z_taskz.jsonÚw)Zindentu   Results saved â†’ )‡r9   r™   rí   Zis_availableZbackendsrî   Úprintr   r  ÚexistsÚFileNotFoundErrorÚloadr;   Úgetr¦   r§   r¨   r©   rª   r¬   r­   rã   rä   r´   r!   r   Ú
IMAGE_SIZEr   rö   Zload_state_dictÚtoÚevalÚbmZget_benchmark_dictr"   Zget_taskr  ÚnameÚosÚpathÚjoinr#   Zget_task_demonstrationÚh5pyZFileÚsortedÚkeysrg   r  ÚlenZproblem_folderÚ	bddl_filer$   ÚcameraÚseedÚresetÚclean_sceneÚenvrA   r¹   Zbody_name2idr0   r«   Zbody_posÚ	ExceptionZforwardÚsetÚaddrŽ   ZngeomZgeom_bodyidZ	geom_rgbaÚclip_embeddings_dirr<   r¶   Úreplacer   r3   Úshift_dxÚshift_dyZset_init_stateÚstepr1   rE   Ú
save_videor  rH   rL   r@   r?   Zclampr®   rì   rá   r:   rÕ   Úgetattrr’   r   r±   Úzero_rotationrˆ   r/   r2   r4   r5   r6   rj   rr   rs   ro   Úteleportr²   r³   ri   Úmove_then_gripÚduplicate_actionsÚout_dirÚmkdirZVideoWriterÚstrZVideoWriter_fourccÚ	video_fpsÚwriteZcvtColorZCOLOR_RGB2BGRÚreleaseZimwriteÚcloseZmeanrl   ÚsumÚopenÚjsonÚdump)oÚargsr™   Z	ckpt_pathZckptZ
ModelClassr¹   ZbenchZtaskr  Z	demo_pathZ	demo_keysZinit_statesr  r-  r2  rA   ZfnameZbidZdistractor_namesZdistractor_bodiesZdnameZgeom_idr  Z	clip_pathZtask_text_for_evalr  r  Zep_idxZdist_bodiesZdnÚgidZ
init_stateZqpsZsiZobsrÿ   rB   rC   rD   Zdonerw   ZframesZ
vis_stripsrv   Z
replan_idxZcurrent_gripper_cmdru   r=   Zstart_kpZ
img_tensorr»   rÈ   rè   ré   Zeef_normZ
grip_stateZ	grip_normZ	act_extrarå   ræ   rç   Zwindow_actionsrt   Z	wrist_obsZwrist_tensorrß   rÝ   rÞ   ÚoutZwrist_w2c_matZwrist_cam_pose_matZwrist_cam_K_matZagent_K_normZwrist_K_normrº   Zextra_kwargsr  Ústripr‘   rÔ   rz   r   r‡   Z
target_posZpred_rot_deltaZnew_gripperZservo_stepsZ	max_servoZ	thresholdZcur_posZdeltaZdistZdelta_clippedZservo_actionZgrip_actionZmove_actionZtag_textZ	video_dirZ
video_pathÚwriterÚtagZvis_dirZreplan_iZstep_iZ	strip_imgZvis_pathr  r  ZresultsrA  Zout_pathr   rý   r    Úrun_eval%  s  ý





,,




þüÿÿ€$€
"þýþý


ÿýüÿþ
ÿ
ÿ
ú

ÿ
ûÿý


ÿ
ýþ



þ




í€

ÿ ° T

ýÿ"ü,ÿÿ
õ
ÿrR  Ú__main__z"Evaluate PARA in LIBERO simulation)Zdescriptionz--model_typer
   )r
   r   r   r   r   r   r   r   r   Z
wrist_onlyr   zModel architecture to evaluate)ÚtypeÚdefaultZchoicesÚhelpz--model_namezOpenGVLab/InternVL2_5-1Bz4HuggingFace model name (used by internvl model_type))rT  rU  rV  z--checkpointTzQPath to .pth checkpoint (e.g. libero/checkpoints/para_libero_spatial_t0/best.pth))rT  ZrequiredrV  z--benchmarkÚlibero_spatialzMLIBERO benchmark name (libero_spatial, libero_goal, libero_object, libero_10)z	--task_id)rT  rU  z--cameraÚ	agentviewz--n_episodesr`   z&Number of rollout episodes to evaluatez--max_stepsi,  z(Max env steps per episode before failurez--seedz	--out_dirzlibero/out/evalz--save_videoZ
store_truez<Save per-episode MP4 with heatmap overlay to out_dir/videos/)rÔ   rV  z
--save_viszISave per-replan-step visualization strips (all N_WINDOW heatmaps) as PNGsz--video_fpsr]   z!FPS for saved videos (default 10)z--clip_embeddings_dirz/data/libero/parsed_liberoz?Directory containing precomputed CLIP embeddings (for dino_vla)z--zero_rotationzPZero out rotation deltas (position-only control, for diagnosing rotation issues)z--clean_scenezBRemove distractors and furniture (match OOD objpos training setup)z
--shift_dxrâ   z)Shift pick/place objects by dx in world Xz
--shift_dyz)Shift pick/place objects by dy in world Yz
--teleportz[Servo to predicted 3D targets with closed-loop control (bypasses open-loop delta execution)z--move_then_gripzFExecute EEF move and gripper as separate steps (move first, then grip)z--duplicate_actionszUExecute each action twice [a1,a1,a2,a2,...] for sanity checking (should match normal))AÚ__doc__ÚargparserJ  r&  ÚsysÚpathlibr   r4   r)  rf   r0   r9   Ztorch.nn.functionalZnnZ
functionalrb   r   r'  ÚinsertÚdirnameÚ__file__r¹   r¦   r   r   r   r   r   Zutilsr	   r!   Zlibero.liberor"   r$  r#   Zlibero.libero.envsr$   Zrobosuite.utils.camera_utilsr%   r&   r'   r(   r«   r1   r7   r8   r!  r@   rE   rL   rˆ   r’   rÕ   rá   rì   rR  Ú__name__ZArgumentParserZparserZadd_argumentrC  rl   r;   Z
parse_argsrL  r   r   r   r    Ú<module>   sÈ    !
ÿ1
ÿ9
ÿy
ýwI   
K
þ
ÿ
ÿ
ÿ
ÿ
ÿÿÿ
ÿ
ÿÿÿ
ÿ
ÿÿÿÿÖ