o
    Hi                    @   sN  d Z ddlZddlmZ ddlm  mZ ddlmZ ddl	m
Z
mZ ddlZddlmZ ddlmZ ddlZddlZddlZddlZddlZddlZddlZddlZddlZejdeje ddlZddlm Z  ddl!m"  m#Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl)Z/dd	l"m0Z0 d
d Z1dd Z2dd Z3dd Z4dd Z5dd Z6dd Z7dd Z8dd Z9dGddZ:dd Z;d Z<d!Z=d"Z>d#Z?d$a@dHd&d'ZAdGd(d)ZBd*d+ ZCdGd,d-ZDd.d/ ZEd0d1 ZFd2d3 ZGd4d5 ZH	$		6		7	8				9dId:d;ZIe?d8dfd<d=ZJd>d? ZKd@dA ZLe?d8fdBdCZMdDdE ZNeOdFkr%eN  dS dS )Ja;  Train trajectory volume predictor on LIBERO demonstrations.

Model predicts a pixel-aligned volume: N_WINDOW x N_HEIGHT_BINS logits per pixel (CE at trajectory pixel only).
Gripper is per-pixel (N_WINDOW x N_GRIPPER_BINS per pixel): supervised at GT pixel (teacher forcing), decoded at pred pixel in val/inference.
    N)
DataLoaderConcatDataset)Path)tqdm)#project_points_from_world_to_camera)RealTrajectoryDatasetCachedTrajectoryDatasetN_WINDOW)TrajectoryHeatmapPredictorN_HEIGHT_BINSN_GRIPPER_BINS
N_ROT_BINS	PRED_SIZE)*recover_3d_from_direct_keypoint_and_heightc           
      C   s   | dkrt S | dkrddlm} |S | dkrddlm} |S | dkr*ddlm} |S | d	kr6dd
lm} |S | dkrBddl	m
} |S | dkrNddlm} |S | dkrZddlm} |S | dkrfddlm} |S | dkrrddlm}	 |	S | dkrxt S td|  )Nparaactr   )ACTPredictorda3)DA3Predictormoge)MoGePredictordino_vla)DinoVLAPredictorinternvl)InternVLAPredictorinternvl_act)InternVLACTPredictordual_da3)DualDA3Predictor	dual_para)DualParaPredictorcost_volume)CostVolumePredictor
wrist_onlyzUnknown model_type: )r
   Z	model_actr   Z	model_da3r   Z
model_moger   Zmodel_dino_vlar   Zmodel_vla_internvlr   Zmodel_vla_internvl_actr   Zmodel_dual_da3r   Zmodel_dual_parar    Zmodel_cost_volumer"   
ValueError)

model_typer   r   r   r   r   r   r   r    r"    r&   9/data/cameron/567_augmentation_viewpoint_project/train.pyget_model_class$   s@   r(   c                 C   s   | dvS )zTACT-style models use direct regression; everything else uses pixel-aligned heatmaps.)r   r   r&   r%   r&   r&   r'   is_heatmap_modelG      r*   c                 C   s   | dv S )z%Dual-camera models process two views.)r   r   r&   r)   r&   r&   r'   is_dual_modelK   r+   r,   c                 C   sL   t j}t j}| | || d  }|dd}|td   dtd }|S )a  Discretize continuous height values into bin indices.

    Args:
        height_values: (B, N_WINDOW) or (N_WINDOW,) tensor of heights in [MIN_HEIGHT, MAX_HEIGHT]

    Returns:
        bin_indices: (B, N_WINDOW) or (N_WINDOW,) tensor of bin indices in [0, N_HEIGHT_BINS-1]
    :0yE>              ?   r   )model_module
MIN_HEIGHT
MAX_HEIGHTclampr   long)height_valuesmin_hmax_h
normalizedbin_indicesr&   r&   r'   discretize_heightP   s   	r;   c                 C   sH   t j}t j}| jdd}tjddt| jd}|| }|||  | }|S )zDecode height bin logits back to continuous height values.

    Args:
        bin_logits: (B, N_WINDOW, N_HEIGHT_BINS) logits for each bin

    Returns:
        height_values: (B, N_WINDOW) continuous height values in [MIN_HEIGHT, MAX_HEIGHT]
    dimr.   r/   device)r1   r2   r3   argmaxtorchlinspacer   r@   )
bin_logitsr7   r8   r:   bin_centersr9   r6   r&   r&   r'   decode_height_binsc   s   	rF   c                 C   sH   t j}t j}| | || d  }|dd}|td   dtd S )z6Discretize continuous gripper values into bin indices.r-   r.   r/   r0   r   )r1   MIN_GRIPPERMAX_GRIPPERr4   r   r5   )Zgripper_valuesmin_gmax_gr9   r&   r&   r'   discretize_gripperu   s
   rK   c                 C   s@   t j}t j}| jdd}tjddt| jd}|| ||  | S )uX   Decode (B, N_WINDOW, N_GRIPPER_BINS) logits → (B, N_WINDOW) continuous gripper values.r<   r=   r.   r/   r?   )r1   rG   rH   rA   rB   rC   r   r@   )rD   rI   rJ   r:   rE   r&   r&   r'   decode_gripper_bins~   s
   rL   c                 C   sh   t jtj| jt jd}t jtj| jt jd}| | || d  }|dd}|td  	 dtd S )uT   Discretize (B, N_WINDOW, 3) euler angles → (B, N_WINDOW, 3) bin indices, per axis.r@   dtyper-   r.   r/   r0   r   )
rB   tensorr1   MIN_ROTr@   float32MAX_ROTr4   r   r5   )Zeuler_valuesmin_rmax_rr9   r&   r&   r'   discretize_rotation   s
   rU   c                 C   s`   t jtj| jt jd}t jtj| jt jd}| jdd}t jddt	| jd}|| ||  | S )uQ   Decode (B, N_WINDOW, 3, N_ROT_BINS) → (B, N_WINDOW, 3) continuous euler angles.rM   r<   r=   r.   r/   r?   )
rB   rO   r1   rP   r@   rQ   rR   rA   rC   r   )Z
rot_logitsrS   rT   r:   rE   r&   r&   r'   decode_rotation_bins   s
   rV   c                 C   s   t |}| j\}}}}g }tdD ]B}	| dddd|	ddf || |}
|dddd|	f || }tj|
|dd}|durL||||  }||  qt	|
 }|durq|
 jdd}|||  d|  S |d S )zCross-entropy for 3 euler axes, averaged.

    Args:
        pred_rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)
        target_euler:         (B, N_WINDOW, 3) euler angles in radians
        mask: (B, N_WINDOW) optional, 1=valid 0=ignore
       NnoneZ	reductionr/   ming      @)rU   shaperangereshapeFcross_entropyappendmeanrB   stacksumr4   )Zpred_rotation_logitsZtarget_eulermaskZtarget_binsBN_ZNrlossesaxisZlogits_axisZtarget_axiscetotaln_validr&   r&   r'   compute_rotation_loss   s   ( rn   c                 C   sT   t jtj| jt jd}t jtj| jt jd}|| || d  dd}t	| |S )zMSE loss for continuous rotation prediction (sigmoid [0,1] vs normalized target).

    Args:
        pred_rotation_sigmoid: (B, N_WINDOW, 3) sigmoid outputs in [0, 1]
        target_delta_rotvec:   (B, N_WINDOW, 3) delta axis-angle values
    rM   r-   r.   r/   )
rB   rO   r1   rP   r@   rQ   rR   r4   r_   mse_loss)Zpred_rotation_sigmoidZtarget_delta_rotvecrS   rT   Ztarget_normr&   r&   r'   compute_rotation_loss_mse   s   rp      -C6?  i  F    c                 C   s~   g }t d| |D ]*}t d||D ]!}|D ]}	ttj||gtjdt|	||}
|
dur0||
 qqq|r:t|S tdS )zABuild 3D points for volume visualization (numpy). Returns (N, 3).r   rN   N)r   rW   )r]   r   nparrayfloat64floatra   zeros)HWcamera_posecam_KZheight_bucket_centersZ
pixel_steppointsyxZheightptr&   r&   r'   build_volume_3d_points_for_vis   s   
r   c                 C   sT  | j \}}}}}| j}	|dddddf  d|d }
|dddddf  d|d }|d|d }g }t|D ]M}| dd|f }||d}|dd|f ||  |dd|f |  |
dd|f   }tj||dd}|dur||dd|f  }||	  q?t
| }|dur| jdd}|| | S || S )	al  Cross-entropy with softmax over all 3D cells (per timestep).

    Args:
        pred_volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, H, W)
        trajectory_2d: (B, N_WINDOW, 2) pixel coords [x, y]
        target_height_bins: (B, N_WINDOW) bin indices in [0, N_HEIGHT_BINS-1]
        mask: (B, N_WINDOW) optional, 1=valid 0=ignore (for out-of-view wrist targets)
    Nr   r0   r<   rX   rY   r/   rZ   )r\   r@   r5   r4   r]   r^   r_   r`   ra   rb   rB   rc   rd   )Zpred_volume_logitstrajectory_2dtarget_height_binsre   rf   rg   Nhr{   r|   r@   pxpyZh_binri   tZlogits_tZlogits_flatZ
target_idxrk   rl   rm   r&   r&   r'   compute_volume_loss   s&   	&&@r   c                 C   s.  | j \}}}}}| j}tj||d|tjd}tj|||tjd}t|D ]S}	| dd|	f }
|
jdd\}}||dj	dd}|| }|| }|
 |dd|	df< |
 |dd|	df< |
tj||ddd||f j	dd|dd|	f< q$tjd	d
t|d}tj}tj}|| }|||  | }||fS )an  From volume (B, N_WINDOW, N_HEIGHT_BINS, H, W) get pred 2D and height per timestep.

    For each t: max over height bins gives (H,W) score; argmax gives (x,y); at (x,y) argmax over bins gives height bin.
    Returns:
        pred_2d: (B, N_WINDOW, 2) float pixel coords
        pred_height: (B, N_WINDOW) continuous height from decode_height_bins at that pixel
       rM   Nr0   r=   r<   r   r?   r.   r/   )r\   r@   rB   rz   rQ   r5   r]   maxr^   rA   ry   ZarangerC   r   r1   r2   r3   )volume_logitsrf   rg   r   r{   r|   r@   pred_2dZpred_height_binsr   vol_tZ
max_over_hrh   Zflat_idxr   r   rE   r7   r8   r9   pred_heightr&   r&   r'   &extract_pred_2d_and_height_from_volume   s.   r   c                 C   sv   |j \}}|dk }tj| || d||| dd}|dur7||||  }| | jdd S | S )u  2-class CE for gripper (open/close).

    Args:
        pred_gripper_logits: (B, N_WINDOW, 2) logits for [open, close]
        target_gripper:      (B, N_WINDOW) values in [-1, 1] → class 0 (open) or 1 (close)
        mask: (B, N_WINDOW) optional, 1=valid 0=ignore
    r   r   rX   rY   Nr/   rZ   )r\   r5   r_   r`   r^   rd   r4   rb   )pred_gripper_logitstarget_gripperre   rf   rg   Ztarget_classrk   r&   r&   r'   compute_gripper_loss  s   
r   c                 C   s   t jg d| jdddd}t jg d| jdddd}| | |   }t|ddddd}|  }|  }	t	|
 |j\}
}t||
g}|||	|fS )z-Get visualization arrays for a single sample.g
ףp=
?gv/?gCl?r?   rW   r0   gZd;O?gy&1?g?r   r   )rB   rO   r@   viewcpunumpyrv   clipZ	transposeunravel_indexrA   r\   rw   )rgbtarget_heatmappred_heatmapZ	target_2drb   stdZ
rgb_denormZrgb_visZ	pred_heatZ	target_ptpred_ypred_xZpred_ptr&   r&   r'   visualize_sample(  s   r   c                 C   s    || d }| | |  ddS )zDNormalize values to [0, 1] given per-axis min/max. Clamps to [0, 1].r-   r.   r/   )r4   valuesZmin_valsZmax_valsZ
range_valsr&   r&   r'   normalize_to_015  s   r   c                 C   s   || }| | | S )z6Denormalize values from [0, 1] back to original scale.r&   r   r&   r&   r'   denormalize_from_01:  s   r   c                 C   s   | j }tjtj|tjd}tjtj|tjd}tjtj|tjd}	tjtj|tjd}
t	|||}t	||	|
}t
| |}t
||}|dk }t
||}|||fS )zGACT losses: MSE for pos/rot (normalized [0,1]), BCE for binary gripper.rM   r   )r@   rB   rO   r1   MIN_POSrQ   MAX_POSrP   rR   r   r_   ro   ry   Z binary_cross_entropy_with_logits)pos_predrot_predgripper_predtrajectory_3dtrajectory_eulertrajectory_gripperr@   min_posmax_posmin_rotmax_rotZ
pos_targetZ
rot_targetZpos_lossrot_lossZgrip_targetgripper_lossr&   r&   r'   compute_act_loss?  s   
r   2   
   r   皙?c           H         s  |    d}d}d}d}d}t|	}t|}t|ddd}|D ]}|d |}|d |}|d |}|d |}|d	d	df }|d
 |}d|v rX|d |n|}i }|	dv rnd|v rn|d ||d< n|	dv r|d|v r||d |d< t|	ru|d	d	d	d	df }t|} tt }!||! }"|d |}#|d |}$|$|! }%|d |}&| ||#||"|%d}'t	|'d |"| }(t
|'d |})t|'d |}*t	|'d |%| d	d}+t
|'d |d	d},t|'d |d	d}-|(}.|*}/|)}0|( |* |) |+ |- |, d}1d	urH|1D ]}2|2vr|1|2 |2< d| |2  ||1|2   |2< qtfdd |1D  t|1 fd!d"|1D }3nd#d" |1D }3|3d$ |( |3d% |*  |3d& |)  |3d' |+  |3d( |-  |3d) |,  }4n|r"|	d*krd|v r|d |}|d |}|d	d	df }|d	d	d	d	df }t|} tt }!||! }"|	d+krd|v r|d |}#| ||#||"| |d, ||d- ||d. ||d/ |d0	\}5}6}7}8n| ||fd1|"i|\}5}6}7}8t	|5|"| }.t
|6|}0|7d	u strtjd2|jd3}/ny|	d+krt|7|}/nnt|7|}/nh|j}tjtj|tjd4}9tjtj|tjd4}:tjtj|tjd4};tjtj|tjd4}<t|d	d	df |9|:}=t|d	d	df |;|<}>| ||f|=|>d5|\}?}@}At|?|@|A|||\}.}/}0trtjd2|d3}/t|	s.|/ dk}B|. |0 d6}1|Br|/ |1d7< d	ur|1D ]}2|2vr|1|2 |2< d| |2  ||1|2   |2< qfd8d9|1D }C|Crtfd:d |CD  t|C fd;d"|CD }3nd<d" |1D }3|3d=d>}+|3d7d2}-|3d?d>},nd> }+ }-},|r|r|.}4n|+|. |,|0  }4|Br.|4|-|/  }4|  |4  |  ||4 7 }|d7 }|d7 }t|	r|j |4 d@|( d@|* d@|) d@|+ d@|- d@|, d@dA ||( 7 }||) 7 }||* 7 }|dkr|| dkr|4 |( |* |) |+ |- |, dB}Dd	ur|3D ]}2|3|2 |DdC|2 < qt!j"|D|dD nN|+|.  }E|-|/  }F|,|0  }G||E7 }||G7 }||F7 }|j |4 d@|Ed@|Gd@|Fd@dE |dkr|| dkrt!j"|4 |E|G|F|+|-|,dF|dD |d	ur*|dkr*|| dkr*|| |d	ur?|
dkr?||
 dkr?|| q|| || || || |fS )GzTrain for one epoch.

    Args:
        just_heatmap: if True, only volume loss is applied (gripper loss skipped).
        model_type: 'para'/'da3'/'moge' use heatmap CE, 'act' uses direct MSE.
    r   ZTrainFdescleaver   r   r   r   Nr   trajectory_delta_rotvecr   r   clip_embeddingr   r   task_description	task_textr   	wrist_rgbwrist_trajectory_2dwrist_in_viewstart_keypoint_2dagent_query_pixelsZwrist_query_pixelsagent_volumeagent_gripperagent_rotationwrist_volumere   wrist_gripperwrist_rotation)a_vola_rota_gripw_volw_rotw_gripr0   c                 3        | ]}d  | d  V  qdS r/   r-   Nr&   .0k	loss_emasr&   r'   	<genexpr>      ztrain_epoch.<locals>.<genexpr>c                    "   i | ]}|  | d   qS r-   r&   r   inv_sumr   Zn_termsr&   r'   
<dictcomp>     " ztrain_epoch.<locals>.<dictcomp>c                 S      i | ]}|d qS r/   r&   r   r&   r&   r'   r         r   r   r   r   r   r   r#   r!   r}   
cam_K_normwrist_camera_posewrist_cam_K_normr   r   Zagent_query_height_binsagent_cam_poseagent_cam_K_normwrist_cam_poser   query_pixelsr.   r?   rM   Zcurrent_eef_posZcurrent_gripper)volgriprotc                    s    g | ]}  |d dkr|qS )r   绽|=)getr   r   r&   r'   
<listcomp>       ztrain_epoch.<locals>.<listcomp>c                 3   r   r   r&   r   r   r&   r'   r     r   c                    r   r   r&   r   r   r&   r'   r     r   c                 S   r   r   r&   r   r&   r&   r'   r     r   r   r/   r   .4f)lossr   r   r   r   r   r   )train_step/lossztrain_step/agent_volume_lossztrain_step/agent_rotation_lossztrain_step/agent_gripper_lossztrain_step/wrist_volume_lossztrain_step/wrist_rotation_lossztrain_step/wrist_gripper_lossztrain_step/w_step)r   r   r   r   )r   ztrain_step/volume_lossztrain_step/gripper_lossztrain_step/rotation_lossztrain_step/w_volztrain_step/w_rotztrain_step/w_grip)#trainr*   intr   tor,   r;   r   
IMAGE_SIZEr   r   rn   itemrd   lenSKIP_ROTATIONrB   rO   r@   rp   r1   r   rQ   r   rG   rH   r   r   r   Z	zero_gradZbackwardr   set_postfixwandblog)Hmodel
dataloader	optimizerr@   just_heatmapglobal_step_startvis_every_stepsvis_callbackZlog_scalars_everyr%   save_every_stepssave_callbackr   Z	ema_alpha
total_losstotal_volume_losstotal_gripper_lossZtotal_rotation_lossZ	n_batchesheatmap_modeglobal_steppbarbatchr   r   r   r   r   r   trajectory_rotextra_kwargstarget_heightr   coord_scaletrajectory_2d_predr   wrist_traj_2dZwrist_traj_2d_predZ
wrist_maskoutr   r   r   r   r   r   volume_lossrotation_lossr   Z
raw_lossesr   Zweightsr   r   gripper_logitsrotation_logitsZ_featsr   r   Zmin_gripZmax_gripcurrent_eef_normcurrent_grip_normr   r   r   Zhas_rotZactive_keysZlog_dataZweighted_volZweighted_rotZweighted_gripr&   r   r'   train_epochS  s  



$"









$

	"	"r#  c           p      C   sp  |    t|s|dkrFd}d}t  |D ]	}|d |}	|d |}
|d |}|d |}|d |}t| }t|ddddd	f }|dkr|d
 |}| |	||
dddf |
| ||d ||d ||d ||d |d	\}}}}t||
| |t|| }|durd|v r|d |n|}|t	|| }nc|d
 |}|d |}| |	||
dddf |
| || d}t|d |
| |}t|d || |dd}t|d |}t|d |dd}t
|d |}t
|d |dd}|| | | | | }|| |	jd  7 }||	jd 7 }qW d   n	1 s/w   Y  |t|d }||dddddddf	S d}d} d}!d}"d}#d}$d}%d}&d}'t|}(t , t|ddd})t|)D ]\}*}|d |}	|d |}+|d |},|d |}-|d }.|d }/|+dddf }0|,ddddd	f }1|d |}2d|v r|d |n|2}3i }4|dv rd|v r|d ||4d< n|d v rd!|v r|d! |4d"< |(stjtj|tjd#}5tjtj|tjd#}6tjtj|tjd#}7tjtj|tjd#}8t|,dddf |5|6}9t|-dddf |7|8}:| |	|0f|9|:d$|4\};}<}=t|;|<|=|,|2|-\}>}?}@|>|@ |? }|| |	jd  7 }| |> |	jd  7 } |!|@ |	jd  7 }!t|;|5|6}A|A|, jd%d&jdd&  d' }B|"|B7 }"t|=dkt|=t|= }C|%t |C|- jdd&  7 }%|&|	jd 7 }&|)j!| d(|B|	jd  d)d* qmt|1}Dt}E|E| }F|+|F }G| |	|0fd+|Gi|4\}H}I}J}Kt|H|G|D}Lt|I|-}Mt"s|Jdu rtjd,|d-}Nnt
|J|3}N|durVd.d/ |# D }O|OrOtd0d1 |O$ D }Pt%|O}d2|Ov r/||P |&d2dd3  nd4}Qd5|Ov rB||P |&d5dd3  nd4}R|Q|L |R|M  |N }n|L|M |N }n|L|M |N }|| |	jd  7 }| |L |	jd  7 } |!|M |	jd  7 }!t'|H\}S}T|S|F }U| (|K|S\}V}|Vj)d%d&}W|W* d6 d4 }X|Hjd |Hjd |Hjd7 |Hjd8 f\}Y}Z}[}\t+|ZD ] }]tj|Udd|]f |+dd|]f  dd& }^|"|^ 7 }"q|#t |T|1 jdd&  7 }#|$d,7 }$|%t |X|- jdd&  7 }%|&|	jd 7 }&|)j!| d(tj|Udddf |+dddf  dd&  d9d: |*dkr|'du rg }_t+t,D ]C}]|Hd|]f - }`t.j/|`0d%dd&0|`jd |`jd |`jd	 }a|ajdd&d }bt.j1|b2d2d||fd;dd<d= }c|_3|c q9t4|_}_|Td }d|Xd }e|d5 dkr|d2d6t,}d|e5 dkr|e2d6t,}e|.d 7 8 }f|/d 7 8 }g|g9 }h|hd  |9  < |hd  |9  < g }it+t,D ]F}]|Ud|]df  |Ud|]df  }j}k|Td|]f  }lt:t;j<|j|kgt;j=d>|l|f|h}m|mdur
|i3|m q|i3|,d|]f 7 8  qt;<|i}ni d|	d d?|d@ d |dA|_d|+d d|,d dB|dB d dC|dC d dD|dD d 7 8 dE|dE d dF|nd|.d 7 8 d|/d 7 8 dG|hdH|ddI|1d dJ|edK|-d }'qmW d   n	1 sw   Y  td|&}|"|t,  }o|| | | d,|!| |o|#| |$| |%| |'f	S )LzValidate model.r!   r   r   r   r   r   r   Nr   r   r}   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   ZValFr   r   r   r   r   r   rM   r   r<   r=   rs   r   .1f)r   Zpos_mmr   r.   r?   c                 S   s   i | ]\}}|d kr||qS )r   r&   r   r   vr&   r&   r'   r     s    zvalidate.<locals>.<dictcomp>c                 s   s    | ]	}d |d  V  qdS r   r&   r   r&  r&   r&   r'   r     s    zvalidate.<locals>.<genexpr>r   r-   r/   r          @rW      .2f)r   r   bilinearsizemodeZalign_cornersr   r   ru   r   heatmap_targetr   trajectory_quatrgb_frames_rawworld_to_camerabase_zpred_trajectory_3dcam_K_at_sizer   r  pred_gripperr   )>evalr,   rB   no_gradr   r   r;   r   r   rp   rn   r   r\   r   r*   r   	enumeraterO   r1   r   rQ   r   rG   rH   r   r   r   normrb   rd   whereZ	ones_likeabsr  r  itemsr   r  r   r   predict_at_pixelsrA   ry   r]   r	   
contiguousr_   softmaxr^   interpolate	unsqueezera   rc   r>   expandr   r   copyr   rv   rw   rx   )pr  r  r@   
image_sizer%   r   r  nr  r   traj_2dtraj_3d	traj_gripZ
traj_eulercsZthr   r   r   r   rh   r   Ztraj_rot_valr  r  r   r   r   r   r   r   val_lossr  r  Ztotal_pixel_errorZtotal_height_errorZtotal_height_error_tfZtotal_gripper_errorZ	n_samplesZsample_datar  r  Z	batch_idxr   r   r   r}   r   r   r  r   r  r  	min_pos_t	max_pos_t
min_grip_t
max_grip_tr!  r"  r   r   r   Zvol_lossr   Z	grip_lossZpos_pred_denormZ
pos_err_mmZgrip_pred_binaryr   	pred_sizer  r  r   r  r   featsr  r   r  Zactiver   Zw_vZw_gr   r   pred_2d_fullr   pred_gripper_classr7  rf   rg   r{   r|   r   Zpixel_error_tpred_heatmapsr   	vol_probs	heatmap_t
heatmap_upZpred_h_0Zpred_g_0Zcam_pose_npZcam_K_norm_npZcam_K_nppred_trajectory_3d_listr   r   hr   pred_trajectory_3d_npZavg_pixel_errorr&   r&   r'   validateM  s  
	(
$

"$


&&,.""H0&
&

	
  
'r\  c                 C   sD   t j| t jd}|jdkr|dd}t||||d}dd |D S )aL  Project (N,3) world points to pixel coords on the training image (flipud of obs).

    Matches debug_libero_projection.py exactly: project_points_from_world_to_camera returns
    (row, col) that can be drawn directly on flipud(obs_img) with NO additional row flip.
    Returns list of (u, v) = (col, row) ready for cv2 drawing.
    ru   r0   rW   )r   Zworld_to_camera_transformZcamera_heightZcamera_widthc                 S   s4   g | ]}t tt|d  t tt|d fqS )r0   r   )r   roundry   )r   Zrcr&   r&   r'   r   K  s   4 z&_proj_world_to_vis.<locals>.<listcomp>)rv   Zasarrayrx   ndimr^   r   )Z	points_3dr3  r{   r|   ptsZpix_rcr&   r&   r'   _proj_world_to_vis;  s   
r`  c           *      C   sL  | du rdS | d }| d }| d }| d }| d }t | d }g }ttD ]h}	||	   }
|
jdd \}}| d	 |	    }||  }| d
krX||  }t	
|
}||d< t	|
d |d  dd}|d t	j}t	| |j\}}t|t|}}d|  kr|k rn n/d|  kr|k rn n#t|||fdtjddtj t|d|d |d ftjdddtj ||	   t	j}t||||d \}}d|  kr|k rhn n~d|  kr|k rhn npt|||fddd t|d|d |d ftjdddtj | }||d< t||||d \}}d|  kr6|k rtn n<d|  krD|k rtn n.t|||fddd t|d|d |d ftjdddtj t|||f||fddtj d| v r| d |	 }| }||d< t||||d \}}d|  kr|k rn nXd|  kr|k rn nJt|||fddd t|d |d |d ftjdddtj d|  kr|k rn nd|  kr|k rn nt|||f||fddtj ||	   t	j} t| }!d!}"g d"}#t|#D ]T\}$}%||!dd|$f |"  }&t|&|||d \}'}(d|'  kr:|k rfn qd|(  krI|k rfn qt|||f|'|(f|%dtj t||'|(fd#|%d qt|d$|	 d%tjdddtj t|d$|	 d%tjdd&dtj || q&t	j |dd'})t!j"|)| d(td  d)d*S )+a  Build a horizontal strip (one tile per timestep) matching debug_libero_projection.py style.

    Each tile shows the actual RGB frame at that timestep with:
      - predicted heatmap blended in red
      - predicted pixel: green crosshair
      - GT EEF projection: white filled circle + label
      - GT base-plane projection: cyan ring + yellow line to EEF
      - GT EEF rotation axes: red (x), green (y), blue (z) lines
    Nr3  r   r1  r   r2  r4  r   r   r-   ).r   g??r   r0   g     o@r      r      Zpredrq   gffffff?   rc  rc  rc  r<   Zeefr   rc  rc  Zgt_base   )rc  rc  r   r5  )rc     r   	pred_baseg{Gz?))rc  r   r   rb  r   r   rc  rW   zt=)rq      )   rm  rm  rj   z: timesteps 0..z (left->right))Zcaption)#ry   r]   r	   r   r   r\   detachr[   r   rv   Z
zeros_liker   astypeuint8r   rA   r   cv2Z
drawMarkerZMARKER_CROSSLINE_AAputTextFONT_HERSHEY_SIMPLEXrx   r`  circlerE  lineT_robZquat2matr:  ra   concatenater  Image)*sample
split_namer3  r   r1  r   r2  r4  Ztilesr   framer{   r|   Zpred_heatmap_tZheatZheat_rgbZoverlayZvisr   r   r   r   Zeef_posur&  Zeef_baseZugZvgZ	pred_3d_trj  ZupZvpZeef_quatZeef_rotZaxis_lenZaxis_colors_rgbiZcolorZendpointZuavastripr&   r&   r'   build_wandb_timestep_stripN  s~   

0(8(8(
8(8
<""r  c           ;      C   s  |    t  |d dd |}|d dd |}|d dd |}|d dd |}|d d   }	|d d   }
|
 }|d  |9  < |d  |9  < |d	d	df }|d
krd|v r|d dd |}|d dd |}|d	d	df }|d d   }	|d d   }
|
 }|d  |9  < |d  |9  < i }|dkrd|v r|d dd ||d< n|dv rd|v r|d d g|d< t|r|d dd |}|d dd |}t| }| ||||| || d}|d }|d }nX|dkr]|d dd |}t| }| ||||| |d dd ||d dd ||d dd ||d dd |d\}}}}n| ||fi |\}}}}t	|\}}|j
d }|| }|| }|dkrg }ttD ]7}|d|f }|d|df  d|d }|d|df  d|d }||d	d	||f    qtj||dd}| |||\} }nt|r| j||dd\} }n| ||\} }| jdd}!|! d d }"g }#ttD ]C}$|d|$f  }%tj|%ddd|%j
d |%j
d |%j
d  }&|&jddd }'tj|'dd||fd!d"d#d$ }(|#|( q t|#}#g })ttD ]A}$|d|$df  |d|$df  }*}+|d|$f  },ttj|*|+gtj d%|,|	|}-|)|-d	ur|-n	|d|$f    qOt|)}.|d
krd|v rtg d&!d'dd}/tg d(!d'dd}0|d d  |0 |/ dd"dd d}1|1d#tddd }2|d) d   }3n|d* d }2|d+ d   }3i d|d d,|d- d |d.|#d|d d|d d/|d/ d d*|2d+|3d0|d0 d d1|.d|	d2|d3|d d4|dd	d	d f d5|"d d6|d }4t|rd7|v r|d7 }5t	|5\}6}7g }8ttD ]9}$|5d|$f  }%tj|%ddd|%j
}&|&jddd }9tj|9dd||fd!d"d#d$ }:|8|: qc|d |4d< t|8|4d8< |d |4d< |d) d   |4d)< |d9 d   |4d9< |4W  d	   S 1 sw   Y  d	S ):z4Build visualization sample dict from a single batch.r   r   r0   r   r   r   r}   r   Nr#   r   r   r   r   r   r   r   r   r   r   r   Zagent_featsr!   )r   r   r   r   r   r   r<   r?   Zagent)Z	view_namer=   r(  r/   r   r+  Fr,  r/  ru   r   rW   r   wrist_world_to_camerar2  r3  r   r0  r   r1  r4  r5  r6  r   r  r7  r   r   wrist_pred_heatmapr   )$r8  rB   r9  r   r   r   rE  r,   r   r   r\   r]   r	   r5   r4   ra   rA   r   rO   rC  r?  ry   r@  r_   rA  r^   r   rB  rc   r   rv   rw   rx   r   permuterD  );r  r  r@   rF  r%   r   r   r   r   r}   r   r~   r   r  r   r  rK  r  r   rR  rh   r   r   rQ  r  rS  Z_pred_hbinsZ_tZ_vol_tZ_pxZ_pyZ_pred_hbins_tr   rT  r7  rU  r   r   rV  rW  rX  rY  r   r   rZ  r   r[  Z_meanZ_stdwrist_denormZrgb_frames_for_visZw2c_for_visresultZ	wrist_volZ	w_pred_2dZw_pred_heightZw_pred_heatmapsZhm_tZhm_upr&   r&   r'   build_sample_data_for_logging  s   


	

$
0&
&,
*	

 &r  c            G         sP  t jdd} | jdtdg ddd | jdtd	d
d | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdttdd | jdttd d | jd!ttd"d | jd#tdd$d | jd%td&d'd | jd(td&d)d | jd*td d+d | jd,td-g d.d/d | jd0tdd1d | jd2td3d4d | jd5td6d7d | jd8td9d:d | jd;td<d=d | jd>tdd?d | jd@tdAg dBdCd | jdDdEdFdG | jdHdEdIdG | jdJtdKdKdLgdMd | jdNtdOdPd | jdQtdOdRd | jdStdTdUd | jdVtdWdXd | jdYtdOdZd | jd[td\d]d | jd^tdd_d | jd`td3dad | jdbdEdcdG | jdddEdedG | jdfdEdgdG | jdhtddid | 	 t
tjdj j   jdkdkdl jrt
jn dm }ttj rdnn
tjj rdondptdq  tjjjjjjjjjj j!j"j#t$t%t&t'drds tdt j(rd }j)rj)* + dukrd }ndvdw j),dxD }njg}t-j(j|t$t%j.jj/dAkrj/ndydz}j(j|j.d{n{j)rwdd|l0m} |1 j  2 }j)* + dukr;t3t4|}nd}dw j),dxD }td~|  g }|D ]}t5t$j|jjj.d}|6| qQt7|}j|jjdnt5t$jjjjj.d}jjjjdtd  tdt8| d j9rtd |d }	G dd dtj:j;j<}
|
|	dd|
|	ddn%t8|}t=dt|j  }|| }tj:j;j>|||gt? @d6d\tdt8 d tdt8 d tAj!dkddkdtAj!dyddkdtdjB d tCjB}tDt$t%jEjFjGd}jBdv r8jH|d< |d4i |IjJrtKjLMjJrtdjJ  tjNjJd}|d }O }dd |P D }g }g }|P D ]#\}}||v r|jQ|| jQkr|||< |6| qz|6| qzjR|dyd tdt8| d |rtdt8| d|d d   |Sdd}|Sdd}td| d|  njJrtdjJ  tTdd U D }tTdd U D }td|dxd|dxdd| | dd tVjWtXdd U j"ddd}d }d }d }jY} jYrjtKjLM| sj| Zdd}!t8|!dkrj|!d + dkrNdnd}"|!d |" }#tKjLM|#rj|#} tdjY d|   jYrtKjLM| rtd|   tjN| d}|d }$O }i }%g }&|$P D ]*\}}||v r|jQ|| jQkr||%|< q|&6| d|jQ d|| jQ  qt[|\ t[|$\  }'t[|$\ t[|\  }(|'rtdt]|'  |(rtdt]|(  |&rtd |&D ]
})td|)  q|^|% jR|dyd |&rtd n$z	R|d  W n t_y9 }* ztd|*  W Y d }*~*nd }*~*ww |Sddd }d|v red|v re|d |d f}td|d dțd|d dțdʝ d|v rd|v r|d |d f}td|d dțd|d dțdΝ td|  njYrtdЈjY  d }+|M rz&t`|dddӍ},taN|,}+W d    n	1 sw   Y  td|  W n t_y }* ztd|*  d }+W Y d }*~*nd }*~*ww fddׄ}-|d ur
|\tb_ctb_dtdtbjcdțdtbjddțdʝ nc|+d u rA|- }+|jjdkdkdl t`|dddӍ},taje|+|,ddڍ W d    n	1 s5w   Y  td|  t|+d tb_ct|+d tb_dtdtbjcdțdtbjddțdʝ tftbjctbjd dk rmtdރ |d ur|\tb_gtb_htdtbjgdțdtbjhdțdΝ nT|+d u r|- }+|jjdkdkdl t`|dddӍ},taje|+|,ddڍ W d    n	1 sw   Y  td|  t|+d tb_gt|+d tb_htdtbjgdțdtbjhdțdΝ d }.|d urd|v rd|v r|d |d f}.|.d ur|.\tb_itb_jtdtbji dtbjj  nU|+d u r=|- }+|jjdkdkdl t`|dddӍ},taje|+|,ddڍ W d    n	1 s8w   Y  d|+v rb|+d tb_i|+d tb_jtdddw tbjiD  dddw tbjjD   |d urrd|v rr|d tb_kn|+d urd|+v r|+d tb_ktdddw tbjkD   d }/|d urd|v rd|v r|d |d f}/|/d ur|/\tb_ltb_mtdtbjl dtbjm  nU|+d u r|- }+|jjdkdkdl t`|dddӍ},taje|+|,ddڍ W d    n	1 sw   Y  d|+v r|+d tb_l|+d tb_mtdddw tbjlD  dddw tbjmD   tdj# d td}0d}1jnaotor-td jpr8d }2td ndddd}2td|2d dd|2d dd|2d dd d d fddfdd}3jqdkr}|3d  fdd}4d d	 jdkdkdl d gd g fd
d	fdd
|3	
fdd}5trr }6tst4|j#ddD ]5}7jtdkrtrr |6 d }8|8jtkrtd|8ddjtdd  ntdd  td|7 dj#  td  tudy|1jq|5jBjv|4|2d\}9}:};}<}1td|9dd|:dd|;dd|<dd	 twjB|2d \	}=}>}?}@}A}B}C}D}Etd!|=dd"|>dd#|Add$|Bd\ d%d&|Dd
 tjx|7|9|:|;|<|=|>|@|A|Bd' |Dd(|1d) i d|7dO dÈO d*|9d|=dtbjcdtbjddtbjgdtbjhdtbjidtbjjdtbjkdtbjldtbjmd+jFd,jG}Fty|F d-  |=|0k 
r|=}0ty|F d.  td/|=dd qtz  td0 td1 td2|0d td3   d S (5  Nz1Train PARA trajectory heatmap predictor on LIBERO)Zdescriptionz--model_typer   )r   r   r   r   r   r   r   r   r   r#   r!   zModel architecture to train)typedefaultZchoiceshelpz--model_namezOpenGVLab/InternVL2_5-1Bz4HuggingFace model name (used by internvl model_type))r  r  r  z--benchmarkZlibero_spatialzLIBERO benchmark namez	--task_idr   zTask index within benchmarkz
--task_ids zbComma-separated task indices to train on all at once (e.g. '0,1,2' or 'all'). Overrides --task_id.z--cameraZ	agentviewz*Camera name used for training observationsz--max_demosz'Maximum demos to load from task datasetz--val_splitr   z*Fraction of episodes to use for validationz--batch_sizezBatch size for trainingz--lrzLearning ratez--epochszNumber of epochsz--checkpointz!Path to checkpoint to resume fromz
--run_nameZpara_liberoz/Name of run (used for checkpoint paths and W&B)z--wandb_projectzW&B project namez--wandb_entityzW&B entity/team (optional)z--wandb_modeonline)r  ZofflineZdisabledzW&B modez--stats_cache_pathz+Path to JSON cache for height/gripper statsz--stats_sample_limiti  zHRandom number of samples to use for stats computation (0 = full dataset)z--stats_seed*   z!Random seed for stats subsamplingz--vis_every_stepsr   z,Log visualization images every N train stepsz--frame_striderW   uZ   Sample every Nth frame from the demo (default 3 → ~6.7Hz @ 20Hz, N_WINDOW=4 spans ~0.6s)z--cache_rootzfPath to pre-rendered dataset (e.g. /data/libero/parsed_libero). Uses CachedTrajectoryDataset when set.z	--augmentrX   )rX   ZperspectiveZcropallzLAugmentation mode: none, perspective, crop, all (crop+rot+shear+perspective)z--freeze_backboneZ
store_truezCFreeze DINOv2 backbone to preserve pre-trained viewpoint invariance)actionr  z--drop_start_kpz@Zero out start_keypoint_2d input (removes 2D pixel conditioning)z
--backboneZdinoZresnetzABackbone: dino (DINOv2 ViT-S/16) or resnet (ResNet-18 pretrained)z--pos_loss_weightr/   zDWeight for position loss (ACT: normalizes pos MSE to match rot/grip)z--volume_loss_weightzGWeight for volume/heatmap CE loss (PARA: scale relative to grip/rot CE)z--gripper_loss_weightg      @zWeight for gripper CE lossz--rotation_loss_weight      ?zWeight for rotation CE lossz--act_gripper_loss_weightzWeight for ACT gripper BCE lossz--save_every_stepsrs   z@Save checkpoint every N training steps (0 = only save per-epoch)z--max_minutesz8Stop training after N minutes (0 = no limit, use epochs)z--async_eval_everyz<Launch background eval every N training steps (0 = disabled)z--no_ema_lossz7Disable EMA adaptive loss weighting (use equal weights)z--skip_rotationz5Skip rotation loss entirely (for zero-rotation tasks)z--overfit_one_samplez1Overfit to a single dataset sample (sanity check)z--pretrained_backboneu   Path to point-track pretrained checkpoint (e.g. point_track_pretraining/checkpoints/.../best.pth). Loads only the DINO backbone weights (dino.* keys) from the checkpoint, initializing all other heads randomly. Use this for the pretrain→finetune pipeline.ZcheckpointsT)parentsexist_okzdataset_stats.jsoncudampsr   zUsing device: )	benchmarktask_idcamera	max_demos	val_split
batch_sizelrepochsrF  n_windowZn_height_binsZn_gripper_bins)ZprojectZentitynamer.  Zconfigz
Loading dataset...r  c                 S      g | ]}t |qS r&   r   r   r   r&   r&   r'   r         zmain.<locals>.<listcomp>,F)
cache_rootbenchmark_nametask_idsrF  r  frame_strider  augment)r  r  r  r  )r  c                 S   r  r&   r  r  r&   r&   r'   r     r  z  Multi-task mode: tasks )rF  r  r  r  r  r  )r  r  r  r  )r  r  r  r  z
  Source: z	  Total:  samplesuC   
⚠ OVERFIT MODE: using a single repeated sample for train and valc                   @   s&   e Zd Zd	ddZdd Zdd ZdS )
zmain.<locals>._RepeatDataset  c                 S   s   || _ || _d S N)r{  rG  )selfr{  rG  r&   r&   r'   __init__  s   
z%main.<locals>._RepeatDataset.__init__c                 S      | j S r  rG  )r  r&   r&   r'   __len__     z$main.<locals>._RepeatDataset.__len__c                 S   r  r  )r{  )r  rh   r&   r&   r'   __getitem__  r  z(main.<locals>._RepeatDataset.__getitem__N)r  )__name__
__module____qualname__r  r  r  r&   r&   r&   r'   _RepeatDataset  s    
r  r  r  r   r0   )	generatoru   ✓ Train: u	   ✓ Val: rl  )r  ZshuffleZnum_workersZ
pin_memoryz
Initializing model (type=z)...)Ztarget_sizer  freeze_backbonedrop_start_kpbackboner   
model_namez#
Loading pretrained backbone from: )Zmap_locationmodel_state_dictc                 S   s    i | ]\}}| d r||qS )zdino.)
startswithr%  r&   r&   r'   r     r   zmain.<locals>.<dictcomp>)strictz	  Loaded z) backbone keys from pretrained checkpointz
  Skipped z( keys (shape mismatch or not in model):    epoch?rL  z  Pretrained checkpoint: epoch=z, val_loss=z)
WARNING: pretrained backbone not found: c                 s   s    | ]
}|j r| V  qd S r  )requires_gradnumelr   pr&   r&   r'   r   0  s    zmain.<locals>.<genexpr>c                 s   s    | ]}|  V  qd S r  )r  r  r&   r&   r'   r   1  s    zTrainable parameters: z / z (d   r*  z%)c                 S   r  r  )r  r  r&   r&   r'   <lambda>5  s    zmain.<locals>.<lambda>rr   )r  Zweight_decay.r   r   .pthz.ptzCheckpoint not found at z, using z
Loading checkpoint: z: checkpoint z
 vs model u    ⚠ Missing keys (random init): u   ⚠ Unexpected keys (ignored): u#   ⚠ Shape mismatches (random init):z    u2   ⚠ Skipping optimizer state (model shape changed)optimizer_state_dictu$   ⚠ Could not load optimizer state: 
min_height
max_heightu#   ✓ Height range from checkpoint: [z.6fz, z] mmin_grippermax_gripperu$   ✓ Gripper range from checkpoint: []u   ✓ Resumed from epoch u   
⚠ Checkpoint not found: rzutf-8)encodingu   ✓ Loaded stats cache: u    ⚠ Failed to read stats cache: c                     sT  t t  } jr,jdkr,tj| }tj}td| d|  dj d n
| }td|  d g }g }g }g }g }t }d}	t|d || krO| n|}
d}t	|d	d
d}|	|k r||
k r|d7 }|| krn|	}n
|
| }||v rxq[|| |t k rn}|t k r|n|t  }z|| }W n	 ty   Y q[w |d  }|d  }|d  }|d  }||d d df   ||  || || || |	d7 }	|d |	|k r||
k sc|  t |dkst |dkr td ttjttjttjttjtjtjdddS tj|tjd}tj|tjd}tj|dd}tj|dd}tj|dd}ddlm   |d ! } "|tj# fdd|D dd}t| t| t| t| |jdd |jdd | |jdd |jdd t$|j%t$|j%t$t&t$t'dS )Nr   z-
Computing dataset stats from random subset: /z samples (seed=)z,
Computing dataset stats from full dataset: r  rm  zStats subsetF)rl   r   r   r0   r   r   r   r1  r   uE   ⚠ No valid samples found for stats; falling back to model defaults.)r  r  r  r  r   r   num_height_valuesnum_gripper_valuesru   rn  )Rotationc                    s"   g | ]}   |  qS r&   )inv	from_quatZ	as_rotvec)r   qZScipyRZref_rotr&   r'   r     s    z<main.<locals>.compute_stats_from_dataset.<locals>.<listcomp>)r  r  r  r  r   r   ref_rotation_quatr   r   r  r  dataset_sourcer  rF  )(r  Zstats_sample_limitr[   randomZRandomZ
stats_seedprintsetr   r   Z	randrangeadd	Exceptionr   extendtolistra   updateclosery   r1   r2   r3   rG   rH   rP   rR   rv   rw   rx   ry  Zscipy.spatial.transformr  rE  r  rc   r   r-  r	   r   )Z	total_lenZsample_countrngZall_heightsZall_grippersZ
all_eulersZall_positionsZ	all_quatsseenZsuccess_countZmax_attemptsZattemptsr  Z
global_idxZdatasetZ	local_idxr{  r   r   r   r1  Zall_heights_npZall_grippers_npZall_eulers_npZall_positions_npZall_quats_npZref_quatZall_delta_rotvec)argsr  train_datasetval_datasetr  r'   compute_stats_from_dataset  s    










z(main.<locals>.compute_stats_from_datasetu)   ✓ Using height range from checkpoint: [w)Zindentu   ✓ Saved stats cache: u    ✓ Height range from dataset: [gư>uT     ⚠ WARNING: MIN_HEIGHT == MAX_HEIGHT — all height predictions will be constant!u*   ✓ Using gripper range from checkpoint: [u!   ✓ Gripper range from dataset: [r   r   u$   ✓ Rotation range from checkpoint: z .. u#   ✓ Rotation range (delta rotvec): c                 S      g | ]}|d qS .3fr&   r'  r&   r&   r'   r     r  r  u   ✓ Reference rotation: c                 S   r  )r   r&   r'  r&   r&   r'   r     r  r   r   u$   ✓ Position range from checkpoint: u!   ✓ Position range from dataset: c                 S   r  r  r&   r'  r&   r&   r'   r   )  r  z
Starting training for z
 epochs...infu   ✓ Rotation loss SKIPPEDu/   ✓ EMA loss weighting DISABLED (equal weights)g'@g      @gGz?)r   r   r   u&   ✓ EMA loss weights initialized (vol=r   r$  z, rot=r   z, grip=r   r  c                 S   s  d| vrdS t | }tg dddd}tg dddd}| d  | | dd}|dd	d}|dt	d
d
d

 |d< | d |d< | d |d< d| v r\| d |d< t|| d}|durd| v rzl| d }|j}	t|	}
|
jdkr|
jd t	 }tt	D ]D}|| dk r|| }t|
dd||| df td ddtj|
dd||| df< t|
d|d dftjdddtj qt|
}W |S W |S  ty   Y |S w |S )zJBuild a wrist view strip for dual models (same layout as agentview strip).r  Nr   rW   r0   r   r   r   r   r<   r2  r   r   r   r  r3  Z_wristr   r     P   rc  zOUT OF FRUSTUMr  rm  皙?)rc  r   r   )dictrB   rO   r   r   r4   r  rC  rD  r	   r@  r  Z_imagerv   rw   r^  r\   r]   r   rp  r   rq  rr  rt  ru  rs  r  rz  r  )r{  r|  Zwrist_samplerb   r   r  Z	wrist_hwcr  Zin_viewZpil_imgZarrZtile_wr   Zx0r&   r&   r'   _build_wrist_strip<  sJ   

Lz main.<locals>._build_wrist_stripc           '   	      s  ddl }  t  | d dd  }| d d   }| d dd  }| d dd  }| d dddf  }tjtj	 tj
d}tjtj tj
d}	tjtj tj
d}
tjtj tj
d}t|dddf ||	}t|dddf |
|}i }d	| v r| d	 dd  |d	< ||f||d
|\}}}t|d ||	  }W d   n1 sw   Y  |d  }tg dddd}tg dddd}|| | ddddd d tj}|||j}|d   }tjj|| ddd }| }d| v r%| d d   ntd}dd }d| v r| d d   }||||}||||}t|D ].\} }!|!durz|||!ddd | dkrz|| d  durz|||| d  |!dd qMt|D ].\} }!|!dur|||!ddd | dkr|| d  dur|||| d  |!dd qtt |D ]} ||  dur||  dur||||  ||  dd qnnd| v rCd| v rC| d d   }"| d d   }#|#! }$|$d  t"9  < |$d  t"9  < dd }%|%||"|$}|%||"|$}t|D ]\} }!|!r+|||!ddd qt|D ]\} }!|!rA|||!ddd q1|#|| dd |j$d!d"d |#|d#|d$d%d&|j$d'dd |||j%}&t&'|&S )(zHBuild a visualization image for ACT: input image with GT/pred keypoints.r   Nr   r0   r   r   r   rM   r   r   r   rW   r   r   rc  r<   rn  rs   Z
aug_matrixc           
      S   s   g }| D ]R}t |d}|d dd d f | }|d dkrQ|d |d  |d |d  }}|t ||dg }	|t|	d |	d  t|	d |	d  f q|d  q|S )Nr/   rW   r   r   r0   )rv   ra   rw   r   )
Zpts_3dw2cZaug_matZpts_2dr   r  Zprojr~  r&  Zpt_hr&   r&   r'   proj_3d_to_2d  s   "0z3main.<locals>._act_vis_strip.<locals>.proj_3d_to_2dr3  re  rb  rk  rg  r}   r   c                 S   s|   g }| D ]7}t |d}|| }|d dkr6||d d  }|t|d |d  t|d |d  f q|d  q|S )Nr/   r   r   rW   r0   )rv   ra   r   )r_  ZcpZckr  r   r  Zpcr   r&   r&   r'   _proj_fallback  s   @z4main.<locals>._act_vis_strip.<locals>._proj_fallbackz green=GT red=pred yellow=err)r     r  rf  z3D err: r$  Zmm)r  rt   ra  )(rr  r8  rB   r9  r   r   r   rO   r1   r   rQ   r   rG   rH   r   r   r   r4   r  rp  rv   rq  ZcvtColorZCOLOR_RGB2BGRZlinalgr;  rb   Zeyer:  rv  rw  r]   r  rE  r   rt  ru  ZCOLOR_BGR2RGBr  rz  )'r  tagZ_cv2r   rH  rI  rJ  Zstart_kprM  rN  rO  rP  Zcur_eefZcur_gripZextra_kwr   rh   Z	grip_predZpos_pred_worldZimg_trb   r   Zimg_npZimg_bgrZgt_3dZ	err_3d_mmZmean_errZaug_Mr  r  Zgt_ptsZpred_ptsr   r   Zcam_poser   r~   r  Zimg_rgb)r@   r  r&   r'   _act_vis_strip_  s   
.(


 


zmain.<locals>._act_vis_stripc              
      s  t jrmtt}tt}t|tjd}t|tjd}i }t|d}t|d}|d ur9||d< |d urA||d< tjr`|d}|d}	|d urX||d< |	d ur`|	|d< |rktj	|| d d S d S z8tt}t
d	td
 }
|
 }dd | D }i } |d|d<  |d|d< tj	|| d W d S  ty } ztd|  W Y d }~d S d }~ww )N)rF  r%   r   valzvis/train_stripzvis/val_stripzvis/wrist_train_stripzvis/wrist_val_stripr   r   )r0   c                 S   s,   i | ]\}}|t |tjr|d n|qS )r   )
isinstancerB   ZTensorrC  r%  r&   r&   r'   r     s     z4main.<locals>.log_visualizations.<locals>.<dictcomp>z	vis/trainzvis/valz"  [vis] ACT visualization failed: )r*   r%   nextiterr  r   r  r,   r  r  rB   Zrandintr  r   r>  r  r  )r   Ztrain_batchZ	val_batchZsample_trainZsample_val_localZpayloadZtrain_stripZ	val_stripZwrist_trainZ	wrist_valZval_idxZ
val_samplee)r  r  r  r@   r  train_loaderr  
val_loaderr&   r'   log_visualizations  sJ   





z main.<locals>.log_visualizationsc                    sp   |     tjtjtjtjtjtjtjtj	d}t
| d|  d  t
| d  td|  d dS )z3Save a mid-epoch checkpoint at a given global step.)r  r  r  r  r  r  r  r   r   r   r   Zstep_r  
latest.pthz
  Saved step checkpoint: step_N)
state_dictr1   r2   r3   rG   rH   rP   rR   r   r   rB   saver  )r   Zckpt)CHECKPOINT_DIRr  r  r&   r'   save_step_checkpoint  s   z"main.<locals>.save_step_checkpointZlogsZscratchc                    sF  j dkrdS | j  dks| dkrdS d dur$d  du r$dS  d }t tjtjtjtj	tj
tjtjtjtjd
| d|  dj d }dtjd	d
 dtjdd
 dtjdd
 dtjdd
 d dj d| dj djsjnd d d}tj|dtjtjdd< td|   dS )z9Save scratch checkpoint and launch eval.py in background.r   Nzscratch_eval.pth)
r  r  r  r  r  r   r   r  r   r   Zeval_rh   z.mp4zPYTHONPATH=Z
PYTHONPATHr  z DINO_REPO_DIR=ZDINO_REPO_DIRz DINO_WEIGHTS_PATH=ZDINO_WEIGHTS_PATHz CUDA_VISIBLE_DEVICES=ZCUDA_VISIBLE_DEVICESz python z/eval.py --model_type z --checkpoint z --benchmark z --task_id zR --n_episodes 1 --save_video --teleport --zero_rotation --max_steps 600 --out_dir z:/eval_run --clip_embeddings_dir /data/libero/parsed_liberoT)shellstdoutstderrz!
  [async eval] launched at step )async_eval_everyZpollrB   r  r  r1   r2   r3   rG   rH   rP   rR   REF_ROTATION_QUATr   r   run_nameosenvironr   r%   r  r  r  
subprocessPopenZDEVNULLr  )r   Zscratch_ckptZ
video_pathZeval_cmd)r	  
_eval_procr  r  scratch_dir
script_dirr&   r'   maybe_launch_async_eval  sP   
z%main.<locals>.maybe_launch_async_evalc              
      s*  j dkrdS d d djsjnd  }| sdS t|ddd d	d
}|s.dS |d } d | jkr| j d< z6|d}t	
d| d|  | r[t|nt|}tjdtj|dddi| d td|j  W dS  ty } ztd|  W Y d}~dS d}~ww dS )z5Check if a new eval video exists and log it to wandb.r   NZeval_runvideosZtask_z*.mp4c                 S   s
   |   jS r  )statst_mtimer  r&   r&   r'   r  G  s   
 z4main.<locals>.maybe_log_eval_video.<locals>.<lambda>T)keyreversez	.h264.mp4zffmpeg -y -i z2 -vcodec libx264 -pix_fmt yuv420p -loglevel error zeval/rollout_videor   Zmp4)Zfpsformatr   z'
  [async eval] logged video to wandb: z%
  [async eval] failed to log video: )r  r  r  existssortedglobr  r  with_suffixr  systemstrr  r  ZVideor  r  r  )r   Z	video_dirr  ZlatestZ	h264_pathZvid_pathr  )_last_logged_eval_videor  r  r&   r'   maybe_log_eval_video?  s,   
"
z"main.<locals>.maybe_log_eval_videoc                    s    |  |  |  d S r  r&   r   )_orig_vis_callbackr  r&  r&   r'   vis_and_eval_callbackY  s   z#main.<locals>.vis_and_eval_callbackZEpochs)r   g      N@u   
⏱ Time limit reached (z.0fz min). Stopping.
z<============================================================zEpoch r  )r	  r
  r  r  r%   r  r  r   zTrain Loss: r   z
 (Volume: z, Gripper: z, Rotation: )r%   r   zVal - Loss: z
, Volume: z, Pixel Error: zpx, Height Error: r  zmm, Gripper: g     @@)r  z
train/lossztrain/volume_lossztrain/gripper_lossztrain/rotation_losszval/losszval/volume_losszval/gripper_losszval/pixel_errorzval/height_error_mmzval/gripper_abs_errorr   
train_lossr  r  r  zbest.pthu   ✓ Saved best model (val_loss=z=
============================================================u   ✓ Training complete!zBest val loss: zCheckpoints saved to: r&   ){argparseZArgumentParserZadd_argumentr$  r   ry   
BATCH_SIZELEARNING_RATE
NUM_EPOCHSZ
parse_argsr   __file__parentr  mkdirstats_cache_pathrB   r@   r  Zis_availableZbackendsr  r  r  initZwandb_projectZwandb_entityZ
wandb_moder  r  r  r  r  r  r  r  r   r	   r   r   r  r  r  lowersplitr   r  r  Zlibero.liberoZget_benchmark_dictZget_num_taskslistr]   r   ra   r   r  Zoverfit_one_sampleutilsdataZDatasetr   Zrandom_split	GeneratorZmanual_seedr   r%   r(   r  r  r  r  r  r   Zpretrained_backboner  pathr  loadr  r>  r\   Zload_state_dictr   rd   Z
parametersoptimZAdamWfilter
checkpointrsplitr  keysr   r  r  openjsonr1   r2   r3   dumpr=  rG   rH   rP   rR   r  r   r   Zskip_rotationr  Zno_ema_lossr  timer   Zmax_minutesr#  r  r\  r  r  Zfinish)GZparserr2  Ztask_id_listZfull_datasetZ_bmZn_tasksZdatasetstidZdsZ
one_sampler  Zdataset_sizeZval_sizeZ
train_sizeZ
ModelClassZmodel_kwargsZpt_ckptZpt_stateZ
model_dictZbackbone_keysZloadedZskippedr   r&  Zpt_epochZpt_val_lossZn_trainableZn_totalZstart_epochr>  Zcheckpoint_height_valuesZcheckpoint_gripper_valuesZcheckpoint_pathZaltZ	other_extZalt_pathZmodel_stateZfiltered_stateZshape_mismatchesZmissing_keysZunexpected_keysmsgr  Zstats_cachefr  Zcheckpoint_rot_valuesZcheckpoint_pos_valuesZbest_val_lossr  r   r  r
  r(  Ztraining_start_timer  elapsedr*  Ztrain_volume_lossZtrain_gripper_lossZtrain_rotation_lossrL  Zval_heatmap_lossZval_height_lossZval_gripper_lossZ	val_errorZval_height_errorZval_height_error_tfZval_gripper_errorZ
sample_valZcheckpoint_datar&   )r	  r  r  r  r%  r'  r  r  r@   r  r&  r  r  r  r  r  r  r  r  r'   main;  s  































	



 *
"

""

^ 

 





,





,
.#t)&":

F	



rI  __main__r  )rt   )
Fr   r   Nr   r   r   NNr   )P__doc__rB   Ztorch.nnZnnZtorch.nn.functionalZ
functionalr_   Ztorch.optimr<  Ztorch.utils.datar   r   r   rv   pathlibr   r   r+  r  rB  Zmathr  r  rD  sysr  r:  insertdirnamer/  rr  Zrobosuite.utils.camera_utilsr   Zrobosuite.utils.transform_utilsr7  Ztransform_utilsrx  r8  r   r   r	   r  r
   r   r   r   r   r1   r   r(   r*   r,   r;   rF   rK   rL   rU   rV   rn   rp   r,  r-  r.  r   r  r   r   r   r   r   r   r   r   r#  r\  r`  r  r  rI  r  r&   r&   r&   r'   <module>   s    #			
	



 { o\       
y
