o
    ¯§{i ¥  ã                   @   sì   d Z ddlZddlmZ ddlmZ ddlZddl	Z	ddl
Z
ddlZddlmZ ddlmZ ddlmZ ddlZdd„ Zd%d	d
„Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zd&dd„Zdd„ Zd'dd„Z	 	d(d!d"„Z	d)d#d$„Z dS )*zMGeneral utilities for token selection model: geometry, visualization, and IK.é    N)ÚGridSpec)ÚPath)Ú	make_grid)ÚRotationc                 C   sú   t j| ddd}t|tjƒrt  |¡}| ¡ }d }}| ¡ dkr-|jd dkr-| 	d¡}| ¡ dkrQ|jd |jd	 k rD| 
dd
d¡}|j\}}}| d	|¡}n'| ¡ d
kro|jd |jd k rn|jd dkrn| dd¡}n	| |jd d	¡}|||fS )aB  Load and process DINO features from file.
    
    Args:
        path: Path to DINO features file (.pt)
    
    Returns:
        feats: (num_patches, dino_feat_dim) tensor of features
        H_patches: Height in patches (or None if not determinable)
        W_patches: Width in patches (or None if not determinable)
    ÚcpuF)Úmap_locationÚweights_onlyNé   r   é   é   éÿÿÿÿé   é€   )ÚtorchÚloadÚ
isinstanceÚnpÚndarrayÚ
from_numpyÚfloatÚdimÚshapeÚsqueezeÚpermuteÚreshapeÚ	transpose)ÚpathÚfeatsÚ	H_patchesÚ	W_patchesÚD© r!   ú1/data/cameron/keygrip/volume_dino_tracks/utils.pyÚload_dino_features   s$   

"€
r#   c                 C   s’   |du s|du r#t t | ¡ƒ}| | dkr|d8 }| | dks| | }tjt |¡t |¡dd\}}tj| ¡ | ¡ gdd tj¡}|||fS )ah  Build patch position coordinates.
    
    Args:
        num_patches: Total number of patches
        H_patches: Height in patches (if known)
        W_patches: Width in patches (if known)
    
    Returns:
        patch_positions: (num_patches, 2) array of [x, y] patch coordinates
        H_patches: Height in patches
        W_patches: Width in patches
    Nr   r
   Úij)Úindexing©Úaxis)	Úintr   ÚsqrtÚmeshgridÚarangeÚstackÚflattenÚastypeÚfloat32)Únum_patchesr   r   Úy_coordsÚx_coordsÚpatch_positionsr!   r!   r"   Úbuild_patch_positions.   s   ÿ "
r4   c                 C   sŠ   t |jƒd›}| d|› d }| d|› d }| ¡ r)| ¡ r)t |¡t |¡fS | d }| d }| ¡ rC| ¡ rCt |¡t |¡fS dS )a'  Load camera pose and intrinsics for a frame.
    
    Args:
        episode_dir: Path to episode directory
        frame_file: Path to frame file
    
    Returns:
        camera_pose: 4x4 camera pose matrix (or None if not found)
        cam_K: 3x3 camera intrinsics (or None if not found)
    Ú06dÚrobot_camera_pose_z.npyÚcam_K_zrobot_camera_pose.npyz	cam_K.npy©NN)r(   ÚstemÚexistsr   r   )Úepisode_dirÚ
frame_fileÚ	frame_strÚcam_pose_pathÚ
cam_K_pathÚcam_pose_staticÚcam_K_staticr!   r!   r"   Úload_cam_dataD   s   rB   c                 C   sH   t  | d¡}|| dd… }|d dkrdS || }|dd… |d  S )z)Project 3D point to 2D image coordinates.ç      ð?Nr   r   r   )r   Úappend)Úpoint_3dÚcamera_poseÚcam_KÚ
point_3d_hÚ	point_camÚ
point_2d_hr!   r!   r"   Úproject_3d_to_2d\   s   rK   c           	      C   sŒ   | du rdS t j| t jd} | jdkr| S | jdkr#|  dd¡} d}nd}|| }|| }t j| d | | d	 | gdd
}|rD|d S |S )aU  Rescale 2D coordinates from original image size to new size.
    
    Args:
        coords: Can be None, empty, 1D array (2,), or 2D array (N, 2)
        H_orig, W_orig: Original image dimensions
        H_new, W_new: New image dimensions
    
    Returns:
        Rescaled coordinates in same shape as input (or None if input was None)
    N©Údtyper   r
   r   TF).r   ).r
   r&   )r   Úasarrayr/   ÚsizeÚndimr   r,   )	ÚcoordsÚH_origÚW_origÚH_newÚW_newÚwas_1dÚscale_xÚscale_yÚcoords_rescaledr!   r!   r"   Úrescale_coordse   s"   

ÿrZ   c                 C   s¨   t j |¡}|dd…df }|dd…dd…f }|d |d }}|d |d }}	| d | | }
| d |	 | }d	}t  |
||g¡}|t j |¡ }|| }||fS )
a\  Unproject 2D point to a ray in robot frame.
    
    Args:
        point_2d: 2D point in image coordinates
        camera_pose: 4x4 transformation matrix from robot frame to camera frame
        cam_K: 3x3 camera intrinsics
    
    Returns:
        cam_pos_robot: Camera position in robot frame
        ray_robot: Ray direction in robot frame
    Nr   ©r   r   )r
   r
   ©r   r   )r
   r   r   r
   rC   )r   ÚlinalgÚinvÚarrayÚnorm)Úpoint_2drF   rG   Úcam_pose_invÚcam_pos_robotÚcam_rot_c2rÚfxÚfyÚcxÚcyÚx_camÚy_camÚz_camÚray_camÚ	ray_robotr!   r!   r"   Úunproject_2d_to_ray…   s   rn   c           	      C   sj   | du s|du r
dS t | ||ƒ\}}t|d ƒdk rdS |d  |d  }|||  }| ¡ }||d< |S )a¦  Recover 3D keypoint from 2D image projection of ground projection and height.
    
    The 2D point is the projection of the keypoint's ground projection (Y=0), not the keypoint itself.
    So we need to:
    1. Find where the ray from the camera through the 2D point intersects the ground plane (Y=0, index 2)
    2. Then move up by the height to get the actual 3D keypoint
    
    Args:
        kp_2d_image: 2D image coordinates of the ground projection of the keypoint
        height: Height (Z coordinate, index 2) of the actual 3D keypoint
        camera_pose: 4x4 camera pose matrix
        cam_K: 3x3 camera intrinsics
    
    Returns:
        3D keypoint position
    Nr   çíµ ÷Æ°>)rn   ÚabsÚcopy)	Úkp_2d_imageÚheightrF   rG   Úcam_posÚ	ray_imageÚt_groundÚground_pointÚkp_3dr!   r!   r"   Ú#recover_3d_from_keypoint_and_heightž   s   ry   c                 C   sh   | du s|du r
dS t | ||ƒ\}}t|d ƒdk rdS ||d  |d  }|dk r,dS |||  }|S )aS  Recover 3D keypoint from direct 2D keypoint projection and height.
    
    The 2D point is the direct projection of the 3D keypoint (not its ground projection).
    We find the point along the ray from the camera through the 2D point at the specified height.
    
    Args:
        kp_2d_image: 2D image coordinates of the direct keypoint projection
        height: Height (Y coordinate, index 2) of the 3D keypoint in MuJoCo Z-up convention
        camera_pose: 4x4 camera pose matrix
        cam_K: 3x3 camera intrinsics
    
    Returns:
        3D keypoint position, or None if invalid
    Nr   ro   r   )rn   rp   )rr   rs   rF   rG   rt   Úray_directionÚtrx   r!   r!   r"   Ú*recover_3d_from_direct_keypoint_and_heightÄ   s   r|   Fc                 C   sv  g }g }g }	t d|d ƒD ]U}
||
 }|t|ƒkr nHt|| jƒd›}| |› d }| ¡ s0qt |¡}|dd…dd…f }|dd…df }|| | }| |¡ |r[| |d ¡ |rb|	 |¡ qt|ƒdkrnt |¡nt g ¡ 	dd¡}|g}|r‘t|ƒdkr‡t |¡nt g ¡}| |¡ |r­t|	ƒdkržt |	¡n	t g ¡ 	ddd¡}	| |	¡ t|ƒdkr·|d S t
|ƒS )aÖ  
    Load ground truth 3D trajectory from gripper poses.
    
    Args:
        episode_dir: Path to episode directory
        frame_files: List of frame file paths
        start_idx: Starting frame index
        window_size: Number of future frames to load
        kp_local: Local keypoint offset (from KEYPOINTS_LOCAL_M_ALL[KP_INDEX])
        return_heights: If True, also return heights array
        return_orientations: If True, also return orientations as rotation matrices
    
    Returns:
        trajectory_gt_3d: (N, 3) array of 3D keypoints
        heights_gt: (N,) array of heights (only if return_heights=True)
        orientations_gt: (N, 3, 3) array of rotation matrices (only if return_orientations=True)
    r
   r5   z_gripper_pose.npyNr   r   r   )ÚrangeÚlenr(   r9   r:   r   r   rD   r_   r   Útuple)r;   Úframe_filesÚ	start_idxÚwindow_sizeÚkp_localÚreturn_heightsÚreturn_orientationsÚtrajectory_gt_3dÚ
heights_gtÚorientations_gtÚoffsetÚf_idxr=   Ú	pose_pathÚposeÚrotÚposrx   Úretr!   r!   r"   Úload_gt_trajectory_3dê   s@   


€( 
*
r   c                 C   s6  t  | ¡} t  |¡}| jdkr|  dd¡} |jdkr| d¡}| jd }| jd }	| jdd}
g }|
D ]}|| }|| }| ||g¡ q3t  |¡}t|||||ƒ}|t	t
  t
 }g }ttt|ƒt|ƒƒƒD ]}|| }|| }t||||ƒ}|dur| |¡ qgt|ƒdkrt  |¡nt  g ¡ dd¡}|||fS )a”  
    Post-process model predictions: convert pixel scores to 3D trajectory.
    
    Args:
        pixel_scores: (window_size, num_patches) or (num_patches,) - attention/pixel scores
        heights_pred: (window_size,) or scalar - predicted heights (normalized)
        H_patches, W_patches: Patch grid dimensions
        H_orig, W_orig: Original image dimensions
        camera_pose: 4x4 camera pose matrix
        cam_K: 3x3 camera intrinsics
    
    Returns:
        trajectory_pred_3d: (N, 3) array of 3D keypoints
        trajectory_pred_2d_image: (N, 2) array of 2D image coordinates
        heights_pred_denorm: (N,) array of denormalized heights
    r
   r   r   r&   Nr   )r   rN   rP   r   r   ÚargmaxrD   r_   rZ   Ú
MAX_HEIGHTÚ
MIN_HEIGHTr}   Úminr~   r|   )Úpixel_scoresÚheights_predr   r   rR   rS   rF   rG   r‚   r0   Úpred_patch_idxÚpred_patchesÚidxÚpyÚpxÚpred_image_coordsÚheights_pred_denormÚtrajectory_pred_3dr{   Úkp_2dÚhÚ
kp_3d_predr!   r!   r"   Úpost_process_predictions"  s6   








€(
r¢   éd   ç{®Gáz”?c                 C   s‚  ddl m}m}	 |  ¡ }
|
d |k r||
d< t|ƒD ]}||||ƒ}|	||||ƒ t ||¡ | |j¡ t	j
ddddd}t |tjjd¡}t |j| g d	¢ ¡ ¡ }t |¡ ¡ }| t	jt |d
 |d |d |d g|
g¡d¡ t |¡ ¡ }t |tjjd¡}|j|  ¡ }t	j
ddddd}| t	jt |d
 |d |d |d g|g¡d¡ t	j|dd}| |j¡ t	j||||gddt	j|dgd}| |d¡ |j|jdd…< |jdt|jƒ… |jdd…< t  ||¡ t ||¡ t |tjjd¡}|j|  ¡ }|d |k rt!|
d |ƒ|
d< |d dkr/tj" #||
 ¡}|dk r/ nqt ||¡ t |tjjd¡}|j|  ¡ }|d |k r=||
d< tdƒD ]é}||||ƒ}|	||||ƒ t ||¡ | |j¡ t	j
ddddd}t |j| g d	¢ ¡ ¡ }t |¡ ¡ }| t	jt |d
 |d |d |d g|
g¡d¡ t |¡ ¡ }t |tjjd¡}|j|  ¡ }t	j
ddddd}| t	jt |d
 |d |d |d g|g¡d¡ t	j|dd}| |j¡ t	j||||gddt	j|dgd}| |d¡ |j|jdd…< |jdt|jƒ… |jdd…< t  ||¡ t ||¡ |j|  ¡ }|d |kr; dS qUdS dS )a  Solve IK with separate tasks for keypoint position and gripper rotation.
    
    Args:
        target_kp_pos: (3,) target keypoint position
        target_gripper_rot: (3, 3) target gripper rotation matrix
        configuration: mink.Configuration object
        robot_config: Robot configuration
        mj_model: MuJoCo model
        mj_data: MuJoCo data
        max_iterations: Maximum number of IK iterations
        min_height_above_ground: Minimum height above ground plane (Y=0) in meters. Default 0.02m (2cm).
    r   )Úget_link_poses_from_robotÚposition_exoskeleton_meshesr   Úvirtual_gripper_keypointÚbodyrC   ç        )Úposition_costÚorientation_cost)r
   r   r   r   r   r
   )Úwxyz_xyzÚ	Fixed_Jawg¸…ëQ¸Ž?gü©ñÒMbP?)Úcostg{®Gáz„?Údaqp)Úmodel)ÚlimitsNé
   gš™™™™™¹?)$Ú	exo_utilsr¥   r¦   rq   r}   ÚmujocoÚ
mj_forwardÚupdateÚqposÚminkÚ	FrameTaskÚ
mj_name2idÚmjtObjÚ
mjOBJ_BODYÚRÚ	from_quatÚxquatÚ	as_matrixÚfrom_matrixÚas_quatÚ
set_targetÚSE3r   ÚconcatenateÚxposÚPostureTaskÚsolve_ikÚConfigurationLimitÚintegrate_inplaceÚqr~   ÚctrlÚmj_stepÚmaxr]   r`   )Útarget_kp_posÚtarget_gripper_rotÚconfigurationÚrobot_configÚmj_modelÚmj_dataÚmax_iterationsÚmin_height_above_groundr¥   r¦   Útarget_kp_pos_constrainedÚ	iterationÚ
link_posesÚkp_taskÚ
kp_body_idÚkp_rotÚkp_quatÚgripper_quatÚgripper_body_idÚgripper_posÚgripper_taskÚposture_taskÚvelÚcurrent_kp_posÚerrorÚfinal_kp_posÚ_r!   r!   r"   Úik_to_keypoint_and_rotation`  sŠ   44$
€44$ÿárè   r²   c           +   
   C   s  | j d }|du r~tjdd}tdd|ddd}i }| |d ¡|d	< | |d
 ¡|d< | |d ¡|d< d}t|ƒD ]B}|d }|d }|dkrK n3| |||f ¡|d|› < |d7 }|d }|d }|dkrk n| |||f ¡|d|› < |d7 }q;|d	 }| ¡  | | ¡ |durt|ƒdkr|j	|dd…df |dd…df ddddd t
|ƒD ]\}\}}tj |t|ƒ ¡}|j	||d|dddd q²|durt|ƒdkr|j	|dd…df |dd…df ddddd t
|ƒD ]\}\}}tj |t|ƒ ¡}|j	||d|ddd qö|dur(|j	|d |d d d!ddd"d#d$ |jd%|› d|› d&|› d'|› d#d(d) | d*¡ |jd+d!d, |d }| ¡  | |¡ |durÞt|ƒdkrÞ|j	|dd…df |dd…df ddddd t
|ƒD ]\}\}}tj |t|ƒ ¡}|j	||d|dddd q||durÞt|ƒdkrÞ|j	|dd…df |dd…df ddddd t
|ƒD ]\}\}}tj |t|ƒ ¡}|j	||d|ddd qÁ|durô|j	|d |d d d!ddd"d#d$ |jd-|	› d|
› d&|› d'|› d#d(d) | d*¡ |jd+d!d, d|v rã|d }| ¡  | |¡ tj|	|
dftjd.}|dur{t|ƒdkr{tt|ƒƒD ]7}||df ||df }}tt t |d|
d ¡¡ƒ} tt t |d|	d ¡¡ƒ}!g d/¢||!| dd…f< qC|durÅt|ƒdkrÅtt|ƒƒD ]7}||df ||df }"}#tt t |"d|
d ¡¡ƒ}"tt t |#d|	d ¡¡ƒ}#g d0¢||#|"dd…f< qd}$|d|$  ||$  }%| |%¡ |jd1d#d(d) | d*¡ tttd#ƒƒD ]º}d|› |vröqê|d|›  }&|& ¡  tj|	|
dftjd.}'|durL|t|ƒk rL||df ||df }}tt t |d|
d ¡¡ƒ} tt t |d|	d ¡¡ƒ}!g d/¢|'|!| dd…f< |durŒ|t|ƒk rŒ||df ||df }"}#tt t |"d|
d ¡¡ƒ}"tt t |#d|	d ¡¡ƒ}#g d0¢|'|#|"dd…f< |& |'¡ |&jd2|d › d3d#d4 |& d*¡ qêtttd#ƒƒD ]W}d|› |vr¸q¬|dur||j d k r||  |	|
¡}(|(|( ¡  |( ¡ |( ¡  d5  })|d|›  }*|* ¡  |*j|)d6ddd7 |*jd8|d › d#d4 |* d*¡ q¬||fS )9aj  
    Create 4x4 grid visualization of predictions.
    
    Args:
        rgb_lowres: (H, W, 3) RGB image at low resolution
        dino_vis: (H_patches, W_patches, 3) DINO visualization
        trajectory_points_lowres: (N, 2) GT trajectory in low-res coordinates
        trajectory_points_patches: (N, 2) GT trajectory in patch coordinates
        predicted_trajectory_lowres: (M, 2) Predicted trajectory in low-res coordinates
        predicted_trajectory_patches: (M, 2) Predicted trajectory in patch coordinates
        current_kp_2d_lowres: (2,) Current EEF position in low-res coordinates
        current_kp_2d_patches: (2,) Current EEF position in patch coordinates
        attention_scores: (window_size, num_patches) Attention scores
        H_patches, W_patches: Patch grid dimensions
        episode_id: Episode identifier string
        start_idx: Start frame index
        window_size: Number of future timesteps
        fig: Optional figure to plot on (for live updates)
        axes_dict: Optional dict of axes (for live updates)
    
    Returns:
        fig, axes_dict: Figure and axes dict for live updates
    r   N)é   ré   )Úfigsizeé   ç333333Ó?)ÚfigureÚhspaceÚwspacer[   Úrgb)r   r
   Údinor\   Údino_onehotr   Úonehot_r
   Ú
attention_úb-r   çffffffæ?zGT Trajectory©Ú	linewidthÚalphaÚlabelÚoÚwhiteç      à?©ÚcolorÚ
markersizeÚmarkeredgecolorÚmarkeredgewidthúr-zPred TrajectoryÚxé   ©rÿ   r   r  Úroé   úCurrent EEFr²   ©r   r  r  rú   ÚzorderzLow-Res Image (z)
z	 - Frame Úbold)ÚfontsizeÚ
fontweightÚoffúupper right©Úlocr  zDINO Patch Features (rL   )rC   rC   rC   )rC   r©   r©   z*DINO + One-hot Pixels
(White=GT, Red=Pred)z
One-hot t+z
(White=GT, Red=Pred)©r  ç:Œ0âŽyE>Úhot)ÚcmapÚvminÚvmaxzAttention t+)r   Úpltrí   r   Úadd_subplotr}   ÚclearÚimshowr~   ÚplotÚ	enumerateÚcmÚviridisÚplasmaÚ	set_titler'   Úlegendr   Úzerosr/   r(   ÚroundÚclipr”   Úmax_timestepsr   rÎ   )+Ú
rgb_lowresÚdino_visÚtrajectory_points_lowresÚtrajectory_points_patchesÚpredicted_trajectory_lowresÚpredicted_trajectory_patchesÚcurrent_kp_2d_lowresÚcurrent_kp_2d_patchesÚattention_scoresr   r   Ú
episode_idr   r‚   ÚfigÚ	axes_dictÚRES_LOWÚgsÚgrid_idxr{   ÚrowÚcolÚax1Úir  Úyrÿ   Úax2Úax3ÚoverlayÚkp_xÚkp_yÚ
patch_x_gtÚ
patch_y_gtÚpatch_x_predÚpatch_y_predrù   ÚblendedÚ	ax_onehotÚ
onehot_imgÚattention_mapÚattention_map_normÚax_attnr!   r!   r"   Úvisualize_predictionsð  sè   
 

..
ÿ(

..
ÿ(





 
€rK  c           E   
   C   s`  |j dd}g }|D ]}|| }|| }| ||g¡ q
t |¡}|j dd ¡ }g }|D ]}|| }|| }| ||g¡ q-t |¡}|durM| ¡ }nt|ƒdkrW|d n
t |d |
d g¡}| dd…dd…f  |
|d¡ ¡ }tdƒD ]7}|dd…dd…|f }| ¡ | 	¡ }}||kr£|| ||  |dd…dd…|f< qwd|dd…dd…|f< qwt 
|dd¡}| ¡  | |¡ t|ƒdkr=|j|dd…df |dd…df d	dd
dd t|ƒD ]\}\} }!tj |t|ƒ ¡}"|j| |!d|"dddd qát|ƒdkr=|j|dd…df |dd…df ddd
dd t|ƒD ]\}\} }!tj |t|ƒ ¡}"|j| |!d|"ddd q |durS|j|d |d ddddddd |j|› d|d › d|
› d|› ddd |jdd d! | d"¡ g }#g }$d}%|dur™t|tjƒr‘| |
|¡ ¡ dk}%n| |
|¡dk}%tt|dƒƒD ]	}&|&|jd k r||&  |
|¡}'|%durÉt |%¡rÉ|'|%  ¡ }(|(|'|% < |'|' ¡  |' 	¡ |' ¡  d#  })tj |)¡dd…dd…dd…f }*|%durt |*¡}+g d$¢|+|% < d
|* d%|+  }*t |*¡ ddd¡  ¡ },|# |,¡ tj!d|
|ftj"d&}-|&t|ƒk rX||&df ||&df }.}/t#t $t 
|.d|d ¡¡ƒ}0t#t $t 
|/d|
d ¡¡ƒ}1d'|-dd…|1|0f< |&t|ƒk r||&df ||&df }2}3t#t $t 
|2d|d ¡¡ƒ}2t#t $t 
|3d|
d ¡¡ƒ}3d'|-d|3|2f< d(|-d|3|2f< d(|-d|3|2f< t |-¡  ¡ }4|$ |4¡ q t|#ƒdkrÅt%|#dddd)}5|5 ddd¡ &¡  ¡ }6t|$ƒdkrßt%|$ddd(d)}7|7 ddd¡ &¡  ¡ }8| ¡  t|#ƒdkr,t|$ƒdkr,t	|6jd |8jd ƒ}9t'j(|6|9|6jd ft'j)d*}:t'j(|8|9|8jd ft'j)d*};t *|:|;g¡}<| |<¡ |jd+dd n3t|#ƒdkrL| |6¡ |%dur?d,nd-}=|jd.|=› dd nt|$ƒdkr_| |8¡ |jd/dd | d"¡ | ¡  | ¡ }>|t+t,  t, }?|>t+t,  t, }@t -dt|@ƒd ¡}A|j.|Ad0 |@d1d2d3d4d5 |j.|Ad0 |?dt|@ƒ… d1d2d6d7d5 |j/d8d9d |j0d:d9d |jd;dd |jd d |j1d%d< | ¡  t|tjƒrÖ| ¡ n|}Bt|	tj2ƒrá|	n|	}Ct -dt|Bƒd ¡}D|j.|Dd0 |Bd1d2d=d>d5 |j.|Dd0 |Cdt|Bƒ… d1d2d?d@d5 |j/d8d9d |j0dAd9d |jdBdd |jd d |j1d%d< dS )Ca  
    Visualize a training sample during training.
    
    Args:
        dino_tokens_sample: (num_patches, dino_feat_dim) DINO tokens
        groundplane_coords_sample: (num_patches, 2) Ground-plane XZ coordinates
        current_eef_pos_sample: (2,) Current EEF position (optional, can be None)
        onehot_targets_sample: (max_timesteps, num_patches) One-hot targets
        heights_sample: (max_timesteps,) GT heights
        seq_id_sample: Episode ID string
        attention_scores: (max_timesteps, num_patches) Attention scores
        heights_pred_sample: (max_timesteps,) Predicted heights
        H_patches, W_patches: Patch grid dimensions
        max_timesteps: Maximum number of timesteps
        ax_vis, ax_attn, ax_height: Matplotlib axes
        epoch: Current epoch number
    r
   r&   )r   Nr   r   r   rý   rõ   rö   ÚGTr÷   rû   r	   rü   rþ   r  ÚPredr  rë   r  r  r  r	  r²   r
  z	 | Epoch z | DINO Patches (ú)r  r  r  r  r  r  )r©   rC   rC   rì   rL   rC   r©   )ÚnrowÚpaddingÚ	pad_value)Úinterpolationz;Attention Maps (top) | One-hot: White=GT, Red=Pred (bottom)z (Volume Masked)Ú zAttention MapszOne-hot: White=GT, Red=Predgš™™™™™É?gš™™™™™Ù?g333333ã?Úgreenz	GT Height)Úwidthrù   rÿ   rú   ÚredzPred HeightÚTimestepé	   z
Height (m)zHeight Trajectory)rù   Úbluez
GT GripperÚorangezPred GripperzGripper ValuezGripper Open/Close Trajectory)3r‘   rD   r   r_   Únumpyr~   Úviewr}   r”   rÎ   r&  r  r  r  r  r  r  r   r!  r"  r#  r'   r   r   ÚTensorr   r   Úanyr  Ú
zeros_liker   r   r   r$  r/   r(   r%  r   r   Úcv2ÚresizeÚINTER_LINEARÚvstackr’   r“   r+   ÚbarÚ
set_xlabelÚ
set_ylabelÚgridr   )EÚdino_tokens_sampleÚgroundplane_coords_sampleÚcurrent_eef_pos_sampleÚonehot_targets_sampleÚheights_sampleÚgrippers_sampleÚseq_id_sampler0  Úheights_pred_sampleÚgrippers_pred_sampler   r   r'  Úax_visrJ  Ú	ax_heightÚ
ax_gripperÚepochÚvolume_mask_sampleÚpredicted_patch_indicesr-  r™   Úpatch_yÚpatch_xÚgt_patch_indicesr+  r/  r)  r:  ÚchannelÚmin_valÚmax_valr  r;  rÿ   Úattention_imgsÚonehot_imgsÚvolume_mask_2dr{   rH  Úmin_valid_valuerI  Úattention_rgbÚmask_overlayÚattention_tensorrG  r?  r@  rA  rB  rC  rD  Úonehot_tensorÚattention_gridÚattention_grid_npÚonehot_gridÚonehot_grid_npÚtarget_widthÚattention_resizedÚonehot_resizedÚcombinedÚ
mask_labelÚheights_gt_npr   Úheights_gt_denormÚ	timestepsÚgrippers_gt_npÚgrippers_pred_npÚtimesteps_gripperr!   r!   r"   Úvisualize_training_sample°  sú   


*"$
..
ÿ*


 "






&&r”  r8   )FF)r£   r¤   )r²   NN)N)!Ú__doc__r[  r   Úmatplotlib.pyplotÚpyplotr  Úmatplotlib.gridspecr   r   r`  r´   ÚtimeÚpathlibr   Útorchvision.utilsr   Úscipy.spatial.transformr   r½   r¸   r#   r4   rB   rK   rZ   rn   ry   r|   r   r¢   rè   rK  r”  r!   r!   r!   r"   Ú<module>   s<    
	 &
&8
> 
ù Eü