o
    {iBd                    @   sN  d Z ddlZddlmZ ddlm  mZ ddlmZ ddl	m
Z
mZ ddlZddlmZ ddlmZ ddlZddlZddlZddlZddlZddlZddlZddlZddlZejdeje ddlZddlm Z  ddl!m"  m#Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl)Z/dd	l"m0Z0 d
d Z1dd Z2dd Z3dd Z4dd Z5dd Z6dd Z7dd Z8dd Z9dGddZ:dd Z;d Z<d!Z=d"Z>d#Z?d$a@dHd&d'ZAdGd(d)ZBd*d+ ZCdGd,d-ZDd.d/ ZEd0d1 ZFd2d3 ZGd4d5 ZH	$		6		7	8				9dId:d;ZIe?d8dfd<d=ZJd>d? ZKd@dA ZLe?d8fdBdCZMdDdE ZNeOdFkr%eN  dS dS )Ja;  Train trajectory volume predictor on LIBERO demonstrations.

Model predicts a pixel-aligned volume: N_WINDOW x N_HEIGHT_BINS logits per pixel (CE at trajectory pixel only).
Gripper is per-pixel (N_WINDOW x N_GRIPPER_BINS per pixel): supervised at GT pixel (teacher forcing), decoded at pred pixel in val/inference.
    N)
DataLoaderConcatDataset)Path)tqdm)#project_points_from_world_to_camera)RealTrajectoryDatasetCachedTrajectoryDatasetN_WINDOW)TrajectoryHeatmapPredictorN_HEIGHT_BINSN_GRIPPER_BINS
N_ROT_BINS	PRED_SIZE)*recover_3d_from_direct_keypoint_and_heightc           
      C   s   | dkrt S | dkrddlm} |S | dkrddlm} |S | dkr*ddlm} |S | d	kr6dd
lm} |S | dkrBddl	m
} |S | dkrNddlm} |S | dkrZddlm} |S | dkrfddlm} |S | dkrrddlm}	 |	S | dkrxt S td|  )Nparaactr   )ACTPredictorda3)DA3Predictormoge)MoGePredictordino_vla)DinoVLAPredictorinternvl)InternVLAPredictorinternvl_act)InternVLACTPredictordual_da3)DualDA3Predictor	dual_para)DualParaPredictorcost_volume)CostVolumePredictor
wrist_onlyzUnknown model_type: )r
   	model_actr   Z	model_da3r   Z
model_moger   Zmodel_dino_vlar   Zmodel_vla_internvlr   Zmodel_vla_internvl_actr   Zmodel_dual_da3r   Zmodel_dual_parar    Zmodel_cost_volumer"   
ValueError)

model_typer   r   r   r   r   r   r   r    r"    r'   4/data/cameron/para_normalized_losses/libero/train.pyget_model_class$   s@   r)   c                 C   s   | dvS )zTACT-style models use direct regression; everything else uses pixel-aligned heatmaps.)r   r   r'   r&   r'   r'   r(   is_heatmap_modelG      r+   c                 C   s   | dv S )z%Dual-camera models process two views.)r   r   r'   r*   r'   r'   r(   is_dual_modelK   r,   r-   c                 C   sL   t j}t j}| | || d  }|dd}|td   dtd }|S )a  Discretize continuous height values into bin indices.

    Args:
        height_values: (B, N_WINDOW) or (N_WINDOW,) tensor of heights in [MIN_HEIGHT, MAX_HEIGHT]

    Returns:
        bin_indices: (B, N_WINDOW) or (N_WINDOW,) tensor of bin indices in [0, N_HEIGHT_BINS-1]
    :0yE>              ?   r   )model_module
MIN_HEIGHT
MAX_HEIGHTclampr   long)height_valuesmin_hmax_h
normalizedbin_indicesr'   r'   r(   discretize_heightP   s   	r<   c                 C   sH   t j}t j}| jdd}tjddt| jd}|| }|||  | }|S )zDecode height bin logits back to continuous height values.

    Args:
        bin_logits: (B, N_WINDOW, N_HEIGHT_BINS) logits for each bin

    Returns:
        height_values: (B, N_WINDOW) continuous height values in [MIN_HEIGHT, MAX_HEIGHT]
    dimr/   r0   device)r2   r3   r4   argmaxtorchlinspacer   rA   )
bin_logitsr8   r9   r;   bin_centersr:   r7   r'   r'   r(   decode_height_binsc   s   	rG   c                 C   sH   t j}t j}| | || d  }|dd}|td   dtd S )z6Discretize continuous gripper values into bin indices.r.   r/   r0   r1   r   )r2   MIN_GRIPPERMAX_GRIPPERr5   r   r6   )Zgripper_valuesmin_gmax_gr:   r'   r'   r(   discretize_gripperu   s
   rL   c                 C   s@   t j}t j}| jdd}tjddt| jd}|| ||  | S )uX   Decode (B, N_WINDOW, N_GRIPPER_BINS) logits → (B, N_WINDOW) continuous gripper values.r=   r>   r/   r0   r@   )r2   rH   rI   rB   rC   rD   r   rA   )rE   rJ   rK   r;   rF   r'   r'   r(   decode_gripper_bins~   s
   rM   c                 C   sh   t jtj| jt jd}t jtj| jt jd}| | || d  }|dd}|td  	 dtd S )uT   Discretize (B, N_WINDOW, 3) euler angles → (B, N_WINDOW, 3) bin indices, per axis.rA   dtyper.   r/   r0   r1   r   )
rC   tensorr2   MIN_ROTrA   float32MAX_ROTr5   r   r6   )Zeuler_valuesmin_rmax_rr:   r'   r'   r(   discretize_rotation   s
   rV   c                 C   s`   t jtj| jt jd}t jtj| jt jd}| jdd}t jddt	| jd}|| ||  | S )uQ   Decode (B, N_WINDOW, 3, N_ROT_BINS) → (B, N_WINDOW, 3) continuous euler angles.rN   r=   r>   r/   r0   r@   )
rC   rP   r2   rQ   rA   rR   rS   rB   rD   r   )
rot_logitsrT   rU   r;   rF   r'   r'   r(   decode_rotation_bins   s
   rX   c                 C   s   t |}| j\}}}}g }tdD ]B}	| dddd|	ddf || |}
|dddd|	f || }tj|
|dd}|durL||||  }||  qt	|
 }|durq|
 jdd}|||  d|  S |d S )zCross-entropy for 3 euler axes, averaged.

    Args:
        pred_rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)
        target_euler:         (B, N_WINDOW, 3) euler angles in radians
        mask: (B, N_WINDOW) optional, 1=valid 0=ignore
       Nnone	reductionr0   ming      @)rV   shaperangereshapeFcross_entropyappendmeanrC   stacksumr5   )Zpred_rotation_logitsZtarget_eulermasktarget_binsBN_NrlossesaxisZlogits_axisZtarget_axiscetotaln_validr'   r'   r(   compute_rotation_loss   s   ( rs   c                 C   sT   t jtj| jt jd}t jtj| jt jd}|| || d  dd}t	| |S )zMSE loss for continuous rotation prediction (sigmoid [0,1] vs normalized target).

    Args:
        pred_rotation_sigmoid: (B, N_WINDOW, 3) sigmoid outputs in [0, 1]
        target_delta_rotvec:   (B, N_WINDOW, 3) delta axis-angle values
    rN   r.   r/   r0   )
rC   rP   r2   rQ   rA   rR   rS   r5   rb   mse_loss)Zpred_rotation_sigmoidZtarget_delta_rotvecrT   rU   target_normr'   r'   r(   compute_rotation_loss_mse   s   rv      -C6?  i  F    c                 C   s~   g }t d| |D ]*}t d||D ]!}|D ]}	ttj||gtjdt|	||}
|
dur0||
 qqq|r:t|S tdS )zABuild 3D points for volume visualization (numpy). Returns (N, 3).r   rO   N)r   rY   )r`   r   nparrayfloat64floatrd   zeros)HWcamera_posecam_KZheight_bucket_centersZ
pixel_steppointsyxheightptr'   r'   r(   build_volume_3d_points_for_vis   s   
r   c                 C   sT  | j \}}}}}| j}	|dddddf  d|d }
|dddddf  d|d }|d|d }g }t|D ]M}| dd|f }||d}|dd|f ||  |dd|f |  |
dd|f   }tj||dd}|dur||dd|f  }||	  q?t
| }|dur| jdd}|| | S || S )	al  Cross-entropy with softmax over all 3D cells (per timestep).

    Args:
        pred_volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, H, W)
        trajectory_2d: (B, N_WINDOW, 2) pixel coords [x, y]
        target_height_bins: (B, N_WINDOW) bin indices in [0, N_HEIGHT_BINS-1]
        mask: (B, N_WINDOW) optional, 1=valid 0=ignore (for out-of-view wrist targets)
    Nr   r1   r=   rZ   r[   r0   r]   )r_   rA   r6   r5   r`   ra   rb   rc   rd   re   rC   rf   rg   )Zpred_volume_logitstrajectory_2dtarget_height_binsrh   rj   rk   Nhr   r   rA   pxpyZh_binrn   tZlogits_tZlogits_flatZ
target_idxrp   rq   rr   r'   r'   r(   compute_volume_loss   s&   	&&@r   c                 C   s.  | j \}}}}}| j}tj||d|tjd}tj|||tjd}t|D ]S}	| dd|	f }
|
jdd\}}||dj	dd}|| }|| }|
 |dd|	df< |
 |dd|	df< |
tj||ddd||f j	dd|dd|	f< q$tjd	d
t|d}tj}tj}|| }|||  | }||fS )an  From volume (B, N_WINDOW, N_HEIGHT_BINS, H, W) get pred 2D and height per timestep.

    For each t: max over height bins gives (H,W) score; argmax gives (x,y); at (x,y) argmax over bins gives height bin.
    Returns:
        pred_2d: (B, N_WINDOW, 2) float pixel coords
        pred_height: (B, N_WINDOW) continuous height from decode_height_bins at that pixel
       rN   Nr1   r>   r=   r   r@   r/   r0   )r_   rA   rC   r   rR   r6   r`   maxra   rB   r   arangerD   r   r2   r3   r4   )volume_logitsrj   rk   r   r   r   rA   pred_2dZpred_height_binsr   vol_tZ
max_over_hrl   flat_idxr   r   rF   r8   r9   r:   pred_heightr'   r'   r(   &extract_pred_2d_and_height_from_volume   s.   r   c                 C   sv   |j \}}|dk }tj| || d||| dd}|dur7||||  }| | jdd S | S )u  2-class CE for gripper (open/close).

    Args:
        pred_gripper_logits: (B, N_WINDOW, 2) logits for [open, close]
        target_gripper:      (B, N_WINDOW) values in [-1, 1] → class 0 (open) or 1 (close)
        mask: (B, N_WINDOW) optional, 1=valid 0=ignore
    r   r   rZ   r[   Nr0   r]   )r_   r6   rb   rc   ra   rg   r5   re   )pred_gripper_logitstarget_gripperrh   rj   rk   target_classrp   r'   r'   r(   compute_gripper_loss  s   
r   c                 C   s   t jg d| jdddd}t jg d| jdddd}| | |   }t|ddddd}|  }|  }	t	|
 |j\}
}t||
g}|||	|fS )z-Get visualization arrays for a single sample.g
ףp=
?gv/?gCl?r@   rY   r1   gZd;O?gy&1?g?r   r   )rC   rP   rA   viewcpunumpyr|   clip	transposeunravel_indexrB   r_   r}   )rgbtarget_heatmappred_heatmapZ	target_2dre   stdZ
rgb_denormZrgb_visZ	pred_heatZ	target_ptpred_ypred_xZpred_ptr'   r'   r(   visualize_sample(  s   r   c                 C   s    || d }| | |  ddS )zDNormalize values to [0, 1] given per-axis min/max. Clamps to [0, 1].r.   r/   r0   )r5   valuesmin_valsmax_valsZ
range_valsr'   r'   r(   normalize_to_015  s   r   c                 C   s   || }| | | S )z6Denormalize values from [0, 1] back to original scale.r'   r   r'   r'   r(   denormalize_from_01:  s   r   c                 C   s   | j }tjtj|tjd}tjtj|tjd}tjtj|tjd}	tjtj|tjd}
t	|||}t	||	|
}t
| |}t
||}|dk }t
||}|||fS )zGACT losses: MSE for pos/rot (normalized [0,1]), BCE for binary gripper.rN   r   )rA   rC   rP   r2   MIN_POSrR   MAX_POSrQ   rS   r   rb   rt   r    binary_cross_entropy_with_logits)pos_predrot_predgripper_predtrajectory_3dtrajectory_eulertrajectory_gripperrA   min_posmax_posmin_rotmax_rot
pos_targetZ
rot_targetpos_lossrot_lossgrip_targetgripper_lossr'   r'   r(   compute_act_loss?  s   
r   2   
   r   皙?c           H         s  |    d}d}d}d}d}t|	}t|}t|ddd}|D ]}|d |}|d |}|d |}|d |}|d	d	df }|d
 |}d|v rX|d |n|}i }|	dv rnd|v rn|d ||d< n|	dv r|d|v r||d |d< t|	ru|d	d	d	d	df }t|} tt }!||! }"|d |}#|d |}$|$|! }%|d |}&| ||#||"|%d}'t	|'d |"| }(t
|'d |})t|'d |}*t	|'d |%| d	d}+t
|'d |d	d},t|'d |d	d}-|(}.|*}/|)}0|( |* |) |+ |- |, d}1d	urH|1D ]}2|2vr|1|2 |2< d| |2  ||1|2   |2< qtfdd |1D  t|1 fd!d"|1D }3nd#d" |1D }3|3d$ |( |3d% |*  |3d& |)  |3d' |+  |3d( |-  |3d) |,  }4n|r"|	d*krd|v r|d |}|d |}|d	d	df }|d	d	d	d	df }t|} tt }!||! }"|	d+krd|v r|d |}#| ||#||"| |d, ||d- ||d. ||d/ |d0	\}5}6}7}8n| ||fd1|"i|\}5}6}7}8t	|5|"| }.t
|6|}0|7d	u strtjd2|jd3}/ny|	d+krt|7|}/nnt|7|}/nh|j}tjtj|tjd4}9tjtj|tjd4}:tjtj|tjd4};tjtj|tjd4}<t|d	d	df |9|:}=t|d	d	df |;|<}>| ||f|=|>d5|\}?}@}At|?|@|A|||\}.}/}0trtjd2|d3}/t|	s.|/ dk}B|. |0 d6}1|Br|/ |1d7< d	ur|1D ]}2|2vr|1|2 |2< d| |2  ||1|2   |2< qfd8d9|1D }C|Crtfd:d |CD  t|C fd;d"|CD }3nd<d" |1D }3|3d=d>}+|3d7d2}-|3d?d>},nd> }+ }-},|r|r|.}4n|+|. |,|0  }4|Br.|4|-|/  }4|  |4  |  ||4 7 }|d7 }|d7 }t|	r|j |4 d@|( d@|* d@|) d@|+ d@|- d@|, d@dA ||( 7 }||) 7 }||* 7 }|dkr|| dkr|4 |( |* |) |+ |- |, dB}Dd	ur|3D ]}2|3|2 |DdC|2 < qt!j"|D|dD nN|+|.  }E|-|/  }F|,|0  }G||E7 }||G7 }||F7 }|j |4 d@|Ed@|Gd@|Fd@dE |dkr|| dkrt!j"|4 |E|G|F|+|-|,dF|dD |d	ur*|dkr*|| dkr*|| |d	ur?|
dkr?||
 dkr?|| q|| || || || |fS )GzTrain for one epoch.

    Args:
        just_heatmap: if True, only volume loss is applied (gripper loss skipped).
        model_type: 'para'/'da3'/'moge' use heatmap CE, 'act' uses direct MSE.
    r   ZTrainFdescleaver   r   r   r   Nr   trajectory_delta_rotvecr   r   clip_embeddingr   r   task_description	task_textr   	wrist_rgbwrist_trajectory_2dwrist_in_viewstart_keypoint_2dagent_query_pixelsZwrist_query_pixelsagent_volumeagent_gripperagent_rotationwrist_volumerh   wrist_gripperwrist_rotation)a_vola_rota_gripw_volw_rotw_gripr1   c                 3        | ]}d  | d  V  qdS r0   r.   Nr'   .0k	loss_emasr'   r(   	<genexpr>      ztrain_epoch.<locals>.<genexpr>c                    "   i | ]}|  | d   qS r.   r'   r   inv_sumr   n_termsr'   r(   
<dictcomp>     " ztrain_epoch.<locals>.<dictcomp>c                 S      i | ]}|d qS r0   r'   r   r'   r'   r(   r         r   r   r   r   r   r   r#   r!   r   
cam_K_normwrist_camera_posewrist_cam_K_normr   r   Zagent_query_height_binsagent_cam_poseagent_cam_K_normwrist_cam_poser   query_pixelsr/   r@   rN   current_eef_poscurrent_gripper)volgriprotc                    s    g | ]}  |d dkr|qS )r   绽|=)getr   r   r'   r(   
<listcomp>       ztrain_epoch.<locals>.<listcomp>c                 3   r   r   r'   r   r   r'   r(   r     r   c                    r   r   r'   r   r   r'   r(   r     r   c                 S   r   r   r'   r   r'   r'   r(   r     r   r  r0   r  .4f)lossr   r   r   r   r   r   )train_step/lossztrain_step/agent_volume_lossztrain_step/agent_rotation_lossztrain_step/agent_gripper_lossztrain_step/wrist_volume_lossztrain_step/wrist_rotation_lossztrain_step/wrist_gripper_lossztrain_step/w_step)r  r  r  r  )r  ztrain_step/volume_lossztrain_step/gripper_lossztrain_step/rotation_lossztrain_step/w_volztrain_step/w_rotztrain_step/w_grip)#trainr+   intr   tor-   r<   r   
IMAGE_SIZEr   r   rs   itemrg   lenSKIP_ROTATIONrC   rP   rA   rv   r2   r   rR   r   rH   rI   r   r   r  	zero_gradbackwardr  set_postfixwandblog)Hmodel
dataloader	optimizerrA   just_heatmapglobal_step_startvis_every_stepsvis_callbackZlog_scalars_everyr&   save_every_stepssave_callbackr   Z	ema_alpha
total_losstotal_volume_losstotal_gripper_lossZtotal_rotation_loss	n_batchesheatmap_modeglobal_steppbarbatchr   r   r   r   r   r   trajectory_rotextra_kwargstarget_heightr   coord_scaletrajectory_2d_predr   wrist_traj_2dZwrist_traj_2d_predZ
wrist_maskoutr   r   r   r   r   r   volume_lossrotation_lossr   Z
raw_lossesr   weightsr  r   gripper_logitsrotation_logitsZ_featsr   r   Zmin_gripZmax_gripcurrent_eef_normcurrent_grip_normr   r   r   Zhas_rotZactive_keysZlog_dataZweighted_volZweighted_rotZweighted_gripr'   r   r(   train_epochS  s  



$"









$

	"	"r;  c           p      C   sp  |    t|s|dkrFd}d}t  |D ]	}|d |}	|d |}
|d |}|d |}|d |}t| }t|ddddd	f }|dkr|d
 |}| |	||
dddf |
| ||d ||d ||d ||d |d	\}}}}t||
| |t|| }|durd|v r|d |n|}|t	|| }nc|d
 |}|d |}| |	||
dddf |
| || d}t|d |
| |}t|d || |dd}t|d |}t|d |dd}t
|d |}t
|d |dd}|| | | | | }|| |	jd  7 }||	jd 7 }qW d   n	1 s/w   Y  |t|d }||dddddddf	S d}d} d}!d}"d}#d}$d}%d}&d}'t|}(t , t|ddd})t|)D ]\}*}|d |}	|d |}+|d |},|d |}-|d }.|d }/|+dddf }0|,ddddd	f }1|d |}2d|v r|d |n|2}3i }4|dv rd|v r|d ||4d< n|d v rd!|v r|d! |4d"< |(stjtj|tjd#}5tjtj|tjd#}6tjtj|tjd#}7tjtj|tjd#}8t|,dddf |5|6}9t|-dddf |7|8}:| |	|0f|9|:d$|4\};}<}=t|;|<|=|,|2|-\}>}?}@|>|@ |? }|| |	jd  7 }| |> |	jd  7 } |!|@ |	jd  7 }!t|;|5|6}A|A|, jd%d&jdd&  d' }B|"|B7 }"t|=dkt|=t|= }C|%t |C|- jdd&  7 }%|&|	jd 7 }&|)j!| d(|B|	jd  d)d* qmt|1}Dt}E|E| }F|+|F }G| |	|0fd+|Gi|4\}H}I}J}Kt|H|G|D}Lt|I|-}Mt"s|Jdu rtjd,|d-}Nnt
|J|3}N|durVd.d/ |# D }O|OrOtd0d1 |O$ D }Pt%|O}d2|Ov r/||P |&d2dd3  nd4}Qd5|Ov rB||P |&d5dd3  nd4}R|Q|L |R|M  |N }n|L|M |N }n|L|M |N }|| |	jd  7 }| |L |	jd  7 } |!|M |	jd  7 }!t'|H\}S}T|S|F }U| (|K|S\}V}|Vj)d%d&}W|W* d6 d4 }X|Hjd |Hjd |Hjd7 |Hjd8 f\}Y}Z}[}\t+|ZD ] }]tj|Udd|]f |+dd|]f  dd& }^|"|^ 7 }"q|#t |T|1 jdd&  7 }#|$d,7 }$|%t |X|- jdd&  7 }%|&|	jd 7 }&|)j!| d(tj|Udddf |+dddf  dd&  d9d: |*dkr|'du rg }_t+t,D ]C}]|Hd|]f - }`t.j/|`0d%dd&0|`jd |`jd |`jd	 }a|ajdd&d }bt.j1|b2d2d||fd;dd<d= }c|_3|c q9t4|_}_|Td }d|Xd }e|d5 dkr|d2d6t,}d|e5 dkr|e2d6t,}e|.d 7 8 }f|/d 7 8 }g|g9 }h|hd  |9  < |hd  |9  < g }it+t,D ]F}]|Ud|]df  |Ud|]df  }j}k|Td|]f  }lt:t;j<|j|kgt;j=d>|l|f|h}m|mdur
|i3|m q|i3|,d|]f 7 8  qt;<|i}ni d|	d d?|d@ d |dA|_d|+d d|,d dB|dB d dC|dC d dD|dD d 7 8 dE|dE d dF|nd|.d 7 8 d|/d 7 8 dG|hdH|ddI|1d dJ|edK|-d }'qmW d   n	1 sw   Y  td|&}|"|t,  }o|| | | d,|!| |o|#| |$| |%| |'f	S )LzValidate model.r!   r   r   r   r   r   r   Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r1   ZValFr   r   r   r   r   r   rN   r  r=   r>   ry   r  .1f)r  Zpos_mmr   r/   r@   c                 S   s   i | ]\}}|d kr||qS )r  r'   r   r   vr'   r'   r(   r     s    zvalidate.<locals>.<dictcomp>c                 s   s    | ]	}d |d  V  qdS r   r'   r   r>  r'   r'   r(   r     s    zvalidate.<locals>.<genexpr>r  r.   r0   r         @rY      .2f)r  r   bilinearsizemodealign_cornersr   r   r{   r   heatmap_targetr   trajectory_quatrgb_frames_rawworld_to_camerabase_zpred_trajectory_3dcam_K_at_sizer   r/  pred_gripperr   )>evalr-   rC   no_gradr  r   r<   r   r   rv   rs   r  r_   r   r+   r   	enumeraterP   r2   r   rR   r   rH   rI   r   r   r   normre   rg   where	ones_likeabsr  r  itemsr   r  r  r   predict_at_pixelsrB   r   r`   r	   
contiguousrb   softmaxra   interpolate	unsqueezerd   rf   r?   expandr   r   copyr   r|   r}   r~   )pr  r  rA   
image_sizer&   r   r%  nr,  r   Ztraj_2dZtraj_3dZ	traj_gripZ
traj_eulercsthr   r  r  r  rl   r  Ztraj_rot_valr2  r3  r   r   r   r   r   r   val_lossr&  r'  Ztotal_pixel_errorZtotal_height_errorZtotal_height_error_tfZtotal_gripper_error	n_samplessample_datar)  r+  	batch_idxr   r   r   r   r   r   r/  r   r-  r.  Z	min_pos_tZ	max_pos_tZ
min_grip_tZ
max_grip_tr9  r:  r   r   r   vol_lossr   	grip_lossZpos_pred_denormZ
pos_err_mmZgrip_pred_binaryr   	pred_sizer0  r1  r   r7  r8  featsr4  r   r5  activer   w_vZw_gr   r   pred_2d_fullr   pred_gripper_classrP  rj   rk   r   r   r   Zpixel_error_tpred_heatmapsr   	vol_probs	heatmap_t
heatmap_upZpred_h_0Zpred_g_0Zcam_pose_npZcam_K_norm_npZcam_K_nppred_trajectory_3d_listr   r   hr   pred_trajectory_3d_npZavg_pixel_errorr'   r'   r(   validateM  s  
	(
$

"$


&&,.""H0&
&

	
  
'rw  c                 C   sD   t j| t jd}|jdkr|dd}t||||d}dd |D S )aL  Project (N,3) world points to pixel coords on the training image (flipud of obs).

    Matches debug_libero_projection.py exactly: project_points_from_world_to_camera returns
    (row, col) that can be drawn directly on flipud(obs_img) with NO additional row flip.
    Returns list of (u, v) = (col, row) ready for cv2 drawing.
    r{   r1   rY   )r   world_to_camera_transformcamera_heightcamera_widthc                 S   s4   g | ]}t tt|d  t tt|d fqS )r1   r   )r  roundr   )r   rcr'   r'   r(   r	  K  s   4 z&_proj_world_to_vis.<locals>.<listcomp>)r|   asarrayr~   ndimra   r   )Z	points_3drL  r   r   ptspix_rcr'   r'   r(   _proj_world_to_vis;  s   
r  c           *      C   sL  | du rdS | d }| d }| d }| d }| d }t | d }g }ttD ]h}	||	   }
|
jdd \}}| d	 |	    }||  }| d
krX||  }t	
|
}||d< t	|
d |d  dd}|d t	j}t	| |j\}}t|t|}}d|  kr|k rn n/d|  kr|k rn n#t|||fdtjddtj t|d|d |d ftjdddtj ||	   t	j}t||||d \}}d|  kr|k rhn n~d|  kr|k rhn npt|||fddd t|d|d |d ftjdddtj | }||d< t||||d \}}d|  kr6|k rtn n<d|  krD|k rtn n.t|||fddd t|d|d |d ftjdddtj t|||f||fddtj d| v r| d |	 }| }||d< t||||d \}}d|  kr|k rn nXd|  kr|k rn nJt|||fddd t|d |d |d ftjdddtj d|  kr|k rn nd|  kr|k rn nt|||f||fddtj ||	   t	j} t| }!d!}"g d"}#t|#D ]T\}$}%||!dd|$f |"  }&t|&|||d \}'}(d|'  kr:|k rfn qd|(  krI|k rfn qt|||f|'|(f|%dtj t||'|(fd#|%d qt|d$|	 d%tjdddtj t|d$|	 d%tjdd&dtj || q&t	j |dd'})t!j"|)| d(td  d)d*S )+a  Build a horizontal strip (one tile per timestep) matching debug_libero_projection.py style.

    Each tile shows the actual RGB frame at that timestep with:
      - predicted heatmap blended in red
      - predicted pixel: green crosshair
      - GT EEF projection: white filled circle + label
      - GT base-plane projection: cyan ring + yellow line to EEF
      - GT EEF rotation axes: red (x), green (y), blue (z) lines
    NrL  r   rJ  r   rK  rM  r   r   r.   ).r   g?g?r   r1   g     o@r      r      predrw   gffffff?   )r  r  r  r=   eef)r   r  r  Zgt_base   )r  r  r   rN  )r     r   	pred_baseg{Gz?))r  r   r   r  )r   r   r  rY   zt=)rw      )   r  r  ro   z: timesteps 0..z (left->right))caption)#r   r`   r	   r   r   r_   detachr^   r   r|   
zeros_liker   astypeuint8r   rB   r  cv2
drawMarkerMARKER_CROSSLINE_AAputTextFONT_HERSHEY_SIMPLEXr~   r  circler_  lineT_robquat2matrS  rd   concatenater  Image)*sample
split_namerL  r   rJ  r   rK  rM  tilesr   framer   r   Zpred_heatmap_tZheatZheat_rgboverlayvisr   r   r   r   eef_posur>  Zeef_baseZugZvgZ	pred_3d_tr  upvpeef_quatZeef_rotZaxis_lenZaxis_colors_rgbicolorendpointuavastripr'   r'   r(   build_wandb_timestep_stripN  s~   

0(8(8(
8(8
<""r  c           ;      C   s  |    t  |d dd |}|d dd |}|d dd |}|d dd |}|d d   }	|d d   }
|
 }|d  |9  < |d  |9  < |d	d	df }|d
krd|v r|d dd |}|d dd |}|d	d	df }|d d   }	|d d   }
|
 }|d  |9  < |d  |9  < i }|dkrd|v r|d dd ||d< n|dv rd|v r|d d g|d< t|r|d dd |}|d dd |}t| }| ||||| || d}|d }|d }nX|dkr]|d dd |}t| }| ||||| |d dd ||d dd ||d dd ||d dd |d\}}}}n| ||fi |\}}}}t	|\}}|j
d }|| }|| }|dkrg }ttD ]7}|d|f }|d|df  d|d }|d|df  d|d }||d	d	||f    qtj||dd}| |||\} }nt|r| j||dd\} }n| ||\} }| jdd}!|! d d }"g }#ttD ]C}$|d|$f  }%tj|%ddd|%j
d |%j
d |%j
d  }&|&jddd }'tj|'dd||fd!d"d#d$ }(|#|( q t|#}#g })ttD ]A}$|d|$df  |d|$df  }*}+|d|$f  },ttj|*|+gtj d%|,|	|}-|)|-d	ur|-n	|d|$f    qOt|)}.|d
krd|v rtg d&!d'dd}/tg d(!d'dd}0|d d  |0 |/ dd"dd d}1|1d#tddd }2|d) d   }3n|d* d }2|d+ d   }3i d|d d,|d- d |d.|#d|d d|d d/|d/ d d*|2d+|3d0|d0 d d1|.d|	d2|d3|d d4|dd	d	d f d5|"d d6|d }4t|rd7|v r|d7 }5t	|5\}6}7g }8ttD ]9}$|5d|$f  }%tj|%ddd|%j
}&|&jddd }9tj|9dd||fd!d"d#d$ }:|8|: qc|d |4d< t|8|4d8< |d |4d< |d) d   |4d)< |d9 d   |4d9< |4W  d	   S 1 sw   Y  d	S ):z4Build visualization sample dict from a single batch.r   r   r1   r   r   r   r   r   Nr#   r   r   r   r   r   r   r   r   r   r   r   Zagent_featsr!   )r   r   r   r   r   r   r=   r@   agent)Z	view_namer>   r@  r0   r   rC  FrD  rH  r{   r   rY   r   wrist_world_to_camerarK  rL  r   rI  r   rJ  rM  rN  rO  r   r/  rP  r   r   wrist_pred_heatmapr   )$rQ  rC   rR  r  r   r   r_  r-   r   r   r_   r`   r	   r6   r5   rd   rB   r  rP   r]  rY  r   rZ  rb   r[  ra   r   r\  rf   r   r|   r}   r~   r   permuter^  );r  r,  rA   r`  r&   r   r   r   r   r   r   r   r   r.  r   r2  rb  r3  r   rk  rl   r   r   rj  r0  rn  Z_pred_hbins_tZ_vol_t_pxZ_pyZ_pred_hbins_tr   ro  rP  rp  r   r   rq  rr  rs  rt  r   r   ru  r   rv  _mean_stdwrist_denormZrgb_frames_for_visZw2c_for_visresultZ	wrist_volZ	w_pred_2dZw_pred_heightZw_pred_heatmapsZhm_tZhm_upr'   r'   r(   build_sample_data_for_logging  s   


	

$
0&
&,
*	

 &r  c            G         sh  t jdd} | jdtdg ddd | jdtd	d
d | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdttdd | jdttd d | jd!ttd"d | jd#tdd$d | jd%td&d'd | jd(td&d)d | jd*td d+d | jd,td-g d.d/d | jd0tdd1d | jd2td3d4d | jd5td6d7d | jd8td9d:d | jd;td<d=d | jd>tdd?d | jd@tdAdBd | jdCtdAdDd | jdEtdFdGd | jdHtdIdJd | jdKtdAdLd | jdMtdNdOd | jdPtddQd | jdRtd3dSd | jdTdUdVdW | jdXdUdYdW | jdZdUd[dW | jd\tdd]d | 	 t
tjd^ j   jd_d_d` jr[t
jn da }ttj ridbn
tjj rrdcnddtde  tjjjjjjjjjj j!j"j#t$t%t&t'dfdg tdh j(rd }j)rƈj)* + dikrd }ndjdk j),dlD }njg}t-j(j|t$t%j.jdm}j(j|j.dnn{j)rEddol/m} |0 j  1 }j)* + dikr	t2t3|}ndpdk j),dlD }tdq|  g }|D ]}t4t$j|jjj.dr}|5| qt6|}j|jjdsnt4t$jjjjj.dr}jjjjdttdu  tdvt7| dw j8rtdx |d }	G dydz dztj9j:j;}
|
|	d{d||
|	d}d|n%t7|}t<d~t|j  }|| }tj9j:j=|||gt> ?d6d\tdt7 dw tdt7 dw t@j!d_dd_dt@j!ddd_dtdjA d tBjA}tCt$t%dd}jAdv rjD|d< |d$i |

E
jFrtGjHIjFrtdjF  tjJjFd}|d }
K }dd |L D }g }g }|L D ]#\}}||v r`|jM|| jMkr`|||< |5| qC|5| qC
jN|dd tdt7| d |rtdt7| d|d d   |Odd}|Odd}td| d|  njFrtdjF  tPdd 
Q D }tPdd 
Q D }td|dld|dldd| | dd tRjStTdd 
Q j"ddd}d }d }d }jU} jUr3tGjHI| s3| Vdd~}!t7|!dkr3|!d~ + dkrdnd}"|!d |" }#tGjHI|#r3|#} tdjU d|   jUrYtGjHI| rYtd|   tjJ| d}|d }$
K }i }%g }&|$L D ]*\}}||v r|jM|| jMkrt||%|< q\|&5| d|jM d|| jM  q\tW|X tW|$X  }'tW|$X tW|X  }(|'rtdtY|'  |(rtdtY|(  |&rtd |&D ]
})td|)  q|Z|% 
jN|dd |&rtd n$z	N|d  W n t[y }* ztd|*  W Y d }*~*nd }*~*ww |Oddd~ }d|v r.d|v r.|d |d f}td|d dd|d~ dd d|v rQd|v rQ|d |d f}td|d dd|d~ dd td|  njUretdĈjU  d }+|I rz&t\|dddǍ},t]J|,}+W d    n	1 sw   Y  td|  W n t[y }* ztd|*  d }+W Y d }*~*nd }*~*ww fdd˄}-|d ur|\t^__t^_`tdt^j_ddt^j`dd nc|+d u r
|- }+|jjd_d_d` t\|dddǍ},t]ja|+|,dd΍ W d    n	1 sw   Y  td|  t|+d t^__t|+d t^_`tdt^j_ddt^j`dd tbt^j_t^j` dk r6td҃ |d urQ|\t^_ct^_dtdt^jcddt^jddd nT|+d u r|- }+|jjd_d_d` t\|dddǍ},t]ja|+|,dd΍ W d    n	1 s|w   Y  td|  t|+d t^_ct|+d t^_dtdt^jcddt^jddd d }.|d urd|v rd|v r|d |d f}.|.d ur|.\t^_et^_ftdt^je dt^jf  nU|+d u r|- }+|jjd_d_d` t\|dddǍ},t]ja|+|,dd΍ W d    n	1 sw   Y  d|+v r+|+d t^_e|+d t^_ftdddk t^jeD  dddk t^jfD   |d ur;d|v r;|d t^_gn|+d urJd|+v rJ|+d t^_gtdddk t^jgD   d }/|d urpd|v rpd|v rp|d |d f}/|/d ur|/\t^_ht^_itdt^jh dt^ji  nU|+d u r|- }+|jjd_d_d` t\|dddǍ},t]ja|+|,dd΍ W d    n	1 sw   Y  d|+v r|+d t^_h|+d t^_itdddk t^jhD  dddk t^jiD   tdj# d td}0d}1jjaktkrtd jlrd }2td ndddd}2td|2d dd|2d dd|2d dd dd 
fdd}3jmdkr7|3d  
fdd}4d d jd_d_d` d gd g 
fddfdd	|3	fd d}5tnn }6tot3|j#ddD ]}7jpdkrtnn |6 d }8|8jpkrtd|8ddjpdd  ntdd	  td
|7 dj#  td	  tq
d|1jm|5jAjr|4|2d\}9}:};}<}1td|9dd|:dd|;dd|<dd	 ts
jA|2d\	}=}>}?}@}A}B}C}D}Etd|=dd|>dd|Add|BdN dd|Dd
 tjt|7|9|:|;|<|=|>|@|A|Bd |Dd|1d |7
K K |9|=t^j_t^j`t^jct^jdt^jet^jft^jgt^jht^jid}Ftu|F d  |=|0k 	r|=}0tu|F d  td|=dd qtv  td  td! td"|0d td#   d S (%  Nz1Train PARA trajectory heatmap predictor on LIBERO)descriptionz--model_typer   )r   r   r   r   r   r   r   r   r   r#   r!   zModel architecture to train)typedefaultchoiceshelpz--model_namezOpenGVLab/InternVL2_5-1Bz4HuggingFace model name (used by internvl model_type))r  r  r  z--benchmarklibero_spatialzLIBERO benchmark namez	--task_idr   zTask index within benchmarkz
--task_ids zbComma-separated task indices to train on all at once (e.g. '0,1,2' or 'all'). Overrides --task_id.z--camera	agentviewz*Camera name used for training observationsz--max_demosz'Maximum demos to load from task datasetz--val_splitr   z*Fraction of episodes to use for validationz--batch_sizezBatch size for trainingz--lrzLearning ratez--epochszNumber of epochsz--checkpointz!Path to checkpoint to resume fromz
--run_namepara_liberoz/Name of run (used for checkpoint paths and W&B)z--wandb_projectzW&B project namez--wandb_entityzW&B entity/team (optional)z--wandb_modeonline)r  ZofflinedisabledzW&B modez--stats_cache_pathz+Path to JSON cache for height/gripper statsz--stats_sample_limiti  zHRandom number of samples to use for stats computation (0 = full dataset)z--stats_seed*   z!Random seed for stats subsamplingz--vis_every_stepsr   z,Log visualization images every N train stepsz--frame_striderY   uZ   Sample every Nth frame from the demo (default 3 → ~6.7Hz @ 20Hz, N_WINDOW=4 spans ~0.6s)z--cache_rootzfPath to pre-rendered dataset (e.g. /data/libero/parsed_libero). Uses CachedTrajectoryDataset when set.z--pos_loss_weightr0   zDWeight for position loss (ACT: normalizes pos MSE to match rot/grip)z--volume_loss_weightzGWeight for volume/heatmap CE loss (PARA: scale relative to grip/rot CE)z--gripper_loss_weightg      @zWeight for gripper CE lossz--rotation_loss_weight      ?zWeight for rotation CE lossz--act_gripper_loss_weightzWeight for ACT gripper BCE lossz--save_every_stepsry   z@Save checkpoint every N training steps (0 = only save per-epoch)z--max_minutesz8Stop training after N minutes (0 = no limit, use epochs)z--async_eval_everyz<Launch background eval every N training steps (0 = disabled)z--no_ema_loss
store_truez7Disable EMA adaptive loss weighting (use equal weights))actionr  z--skip_rotationz5Skip rotation loss entirely (for zero-rotation tasks)z--overfit_one_samplez1Overfit to a single dataset sample (sanity check)z--pretrained_backboneu   Path to point-track pretrained checkpoint (e.g. point_track_pretraining/checkpoints/.../best.pth). Loads only the DINO backbone weights (dino.* keys) from the checkpoint, initializing all other heads randomly. Use this for the pretrain→finetune pipeline.checkpointsT)parentsexist_okzdataset_stats.jsoncudampsr   zUsing device: )	benchmarktask_idcamera	max_demos	val_split
batch_sizelrepochsr`  n_windowZn_height_binsZn_gripper_bins)projectentitynamerF  configz
Loading dataset...allc                 S      g | ]}t |qS r'   r  r   r   r'   r'   r(   r	        zmain.<locals>.<listcomp>,)
cache_rootbenchmark_nametask_idsr`  r  frame_strider  )r  r  r  r  )r  c                 S   r  r'   r  r  r'   r'   r(   r	    r  z  Multi-task mode: tasks )r`  r  r  r  r  r  )r  r  r  r  )r  r  r  r  z
  Source: z	  Total:  samplesuC   
⚠ OVERFIT MODE: using a single repeated sample for train and valc                   @   s&   e Zd Zd	ddZdd Zdd ZdS )
zmain.<locals>._RepeatDataset  c                 S   s   || _ || _d S N)r  ra  )selfr  ra  r'   r'   r(   __init__  s   
z%main.<locals>._RepeatDataset.__init__c                 S      | j S r  ra  )r  r'   r'   r(   __len__     z$main.<locals>._RepeatDataset.__len__c                 S   r  r  )r  )r  rl   r'   r'   r(   __getitem__  r  z(main.<locals>._RepeatDataset.__getitem__N)r  )__name__
__module____qualname__r  r  r  r'   r'   r'   r(   _RepeatDataset  s    
r  r  r  r   r1   )	generatoru   ✓ Train: u	   ✓ Val: r  )r  shufflenum_workers
pin_memoryFz
Initializing model (type=z)...)target_sizer  freeze_backboner   
model_namez#
Loading pretrained backbone from: )map_locationmodel_state_dictc                 S   s    i | ]\}}| d r||qS )zdino.)
startswithr=  r'   r'   r(   r     r
  zmain.<locals>.<dictcomp>)strictz	  Loaded z) backbone keys from pretrained checkpointz
  Skipped z( keys (shape mismatch or not in model):    epoch?rd  z  Pretrained checkpoint: epoch=z, val_loss=z)
WARNING: pretrained backbone not found: c                 s   s    | ]
}|j r| V  qd S r  )requires_gradnumelr   pr'   r'   r(   r   '  s    zmain.<locals>.<genexpr>c                 s   s    | ]}|  V  qd S r  )r  r  r'   r'   r(   r   (  s    zTrainable parameters: z / z (d   rB  z%)c                 S   r  r  )r  r  r'   r'   r(   <lambda>,  s    zmain.<locals>.<lambda>rx   )r  weight_decay.r   r   .pthz.ptzCheckpoint not found at z, using z
Loading checkpoint: z: checkpoint z
 vs model u    ⚠ Missing keys (random init): u   ⚠ Unexpected keys (ignored): u#   ⚠ Shape mismatches (random init):z    u2   ⚠ Skipping optimizer state (model shape changed)optimizer_state_dictu$   ⚠ Could not load optimizer state: 
min_height
max_heightu#   ✓ Height range from checkpoint: [z.6fz, z] mmin_grippermax_gripperu$   ✓ Gripper range from checkpoint: []u   ✓ Resumed from epoch u   
⚠ Checkpoint not found: rzutf-8)encodingu   ✓ Loaded stats cache: u    ⚠ Failed to read stats cache: c                     sT  t t  } jr,jdkr,tj| }tj}td| d|  dj d n
| }td|  d g }g }g }g }g }t }d}	t|d || krO| n|}
d}t	|d	d
d}|	|k r||
k r|d7 }|| krn|	}n
|
| }||v rxq[|| |t k rn}|t k r|n|t  }z|| }W n	 ty   Y q[w |d  }|d  }|d  }|d  }||d d df   ||  || || || |	d7 }	|d |	|k r||
k sc|  t |dkst |dkr td ttjttjttjttjtjtjdddS tj|tjd}tj|tjd}tj|dd}tj|dd}tj|dd}ddlm   |d ! } "|tj# fdd|D dd}t| t| t| t| |jdd |jdd | |jdd |jdd t$|j%t$|j%t$t&t$t'dS )Nr   z-
Computing dataset stats from random subset: /z samples (seed=)z,
Computing dataset stats from full dataset: r  r  zStats subsetF)rq   r   r   r1   r   r   r   rJ  r   uE   ⚠ No valid samples found for stats; falling back to model defaults.)r  r  r  r  r   r   num_height_valuesnum_gripper_valuesr{   r  )Rotationc                    s"   g | ]}   |  qS r'   )inv	from_quat	as_rotvec)r   qScipyRref_rotr'   r(   r	    s    z<main.<locals>.compute_stats_from_dataset.<locals>.<listcomp>)r  r  r  r  r   r   ref_rotation_quatr   r   r  r  dataset_sourcer  r`  )(r  Zstats_sample_limitr^   randomRandomZ
stats_seedprintsetr   r   	randrangeadd	Exceptionr   extendtolistrd   updatecloser   r2   r3   r4   rH   rI   rQ   rS   r|   r}   r~   r  Zscipy.spatial.transformr  r_  r  rf   r  rE  r	   r  )	total_lenZsample_countrngZall_heightsZall_grippersZ
all_eulersZall_positionsZ	all_quatsseenZsuccess_countZmax_attemptsattemptsr+  Z
global_idxdatasetZ	local_idxr  r   r   r   rJ  Zall_heights_npZall_grippers_npZall_eulers_npZall_positions_npZall_quats_npref_quatZall_delta_rotvec)argsr"  train_datasetval_datasetr  r(   compute_stats_from_dataset{  s    










z(main.<locals>.compute_stats_from_datasetu)   ✓ Using height range from checkpoint: [w)indentu   ✓ Saved stats cache: u    ✓ Height range from dataset: [gư>uT     ⚠ WARNING: MIN_HEIGHT == MAX_HEIGHT — all height predictions will be constant!u*   ✓ Using gripper range from checkpoint: [u!   ✓ Gripper range from dataset: [r   r   u$   ✓ Rotation range from checkpoint: z .. u#   ✓ Rotation range (delta rotvec): c                 S      g | ]}|d qS .3fr'   r?  r'   r'   r(   r	    r  r!  u   ✓ Reference rotation: c                 S   r:  )r  r'   r?  r'   r'   r(   r	    r  r   r   u$   ✓ Position range from checkpoint: u!   ✓ Position range from dataset: c                 S   r:  r;  r'   r?  r'   r'   r(   r	     r  z
Starting training for z
 epochs...infu   ✓ Rotation loss SKIPPEDu/   ✓ EMA loss weighting DISABLED (equal weights)g'@g      @gGz?)r  r  r  u&   ✓ EMA loss weights initialized (vol=r  r<  z, rot=r  z, grip=r  r  c                 S   s  d| vrdS t | }tg dddd}tg dddd}| d  | | dd}|dd	d}|dt	d
d
d

 |d< | d |d< | d |d< d| v r\| d |d< t|| d}|durd| v rzl| d }|j}	t|	}
|
jdkr|
jd t	 }tt	D ]D}|| dk r|| }t|
dd||| df td ddtj|
dd||| df< t|
d|d dftjdddtj qt|
}W |S W |S  ty   Y |S w |S )zJBuild a wrist view strip for dual models (same layout as agentview strip).r  Nr   rY   r1   r   r   r   r   r=   rK  r   r   r   r  rL  Z_wristr   r     P   r  zOUT OF FRUSTUMr   r  g?)r  r   r   )dictrC   rP   r   r   r5   r  r]  r^  r	   rZ  r  _imager|   r}   r~  r_   r`   r   r  r  r  r  r  r  r  r  r  r)  )r  r  Zwrist_samplere   r   r  Z	wrist_hwcr  in_viewZpil_imgarrZtile_wr   x0r'   r'   r(   _build_wrist_strip3  sJ   

Lz main.<locals>._build_wrist_stripc           
         s   t jsd S tt}tt}t|tjd}t|tjd}i }t|d}t|d}|d ur;||d< |d urC||d< tjrb |d} |d}	|d urZ||d< |	d urb|	|d< |rmtj	|| d d S d S )	N)r`  r&   r  valzvis/train_stripzvis/val_stripzvis/wrist_train_stripzvis/wrist_val_stripr  )
r+   r&   nextiterr  r  r  r-   r  r  )
r  Ztrain_batchZ	val_batchZsample_trainZsample_val_localpayloadZtrain_stripZ	val_stripZwrist_trainZ	wrist_val)rE  r4  rA   r  train_loader
val_loaderr'   r(   log_visualizationsV  s.   





z main.<locals>.log_visualizationsc                    sp   |     tjtjtjtjtjtjtjtj	d}t
| d|  d  t
| d  td|  d dS )z3Save a mid-epoch checkpoint at a given global step.)r*  r  r  r  r  r  r  r   r   r   r   Zstep_r  
latest.pthz
  Saved step checkpoint: step_N)
state_dictr2   r3   r4   rH   rI   rQ   rS   r   r   rC   saver%  )r  Zckpt)CHECKPOINT_DIRr  r  r'   r(   save_step_checkpointr  s   z"main.<locals>.save_step_checkpointlogsscratchc                    sF  j dkrdS | j  dks| dkrdS d dur$d  du r$dS  d }t tjtjtjtj	tj
tjtjtjtjd
| d|  dj d }dtjd	d
 dtjdd
 dtjdd
 dtjdd
 d dj d| dj djsjnd d d}tj|dtjtjdd< td|   dS )z9Save scratch checkpoint and launch eval.py in background.r   Nzscratch_eval.pth)
r  r  r  r  r  r   r   r!  r   r   eval_rl   z.mp4zPYTHONPATH=
PYTHONPATHr  z DINO_REPO_DIR=DINO_REPO_DIRz DINO_WEIGHTS_PATH=DINO_WEIGHTS_PATHz CUDA_VISIBLE_DEVICES=CUDA_VISIBLE_DEVICESz python z/eval.py --model_type z --checkpoint z --benchmark z --task_id zR --n_episodes 1 --save_video --teleport --zero_rotation --max_steps 600 --out_dir z:/eval_run --clip_embeddings_dir /data/libero/parsed_liberoT)shellstdoutstderrz!
  [async eval] launched at step )async_eval_everypollrC   rO  rN  r2   r3   r4   rH   rI   rQ   rS   REF_ROTATION_QUATr   r   run_nameosenvironr  r&   r  r  r  
subprocessPopenDEVNULLr%  )r  Zscratch_ckptZ
video_pathZeval_cmd)rP  
_eval_procr4  r  scratch_dir
script_dirr'   r(   maybe_launch_async_eval  sP   
z%main.<locals>.maybe_launch_async_evalc              
      s*  j dkrdS d d djsjnd  }| sdS t|ddd d	d
}|s.dS |d } d | jkr| j d< z6|d}t	
d| d|  | r[t|nt|}tjdtj|dddi| d td|j  W dS  ty } ztd|  W Y d}~dS d}~ww dS )z5Check if a new eval video exists and log it to wandb.r   NZeval_runvideosZtask_z*.mp4c                 S   s
   |   jS r  )statst_mtimer  r'   r'   r(   r	    s   
 z4main.<locals>.maybe_log_eval_video.<locals>.<lambda>T)keyreversez	.h264.mp4zffmpeg -y -i z2 -vcodec libx264 -pix_fmt yuv420p -loglevel error zeval/rollout_videor   Zmp4)fpsformatr  z'
  [async eval] logged video to wandb: z%
  [async eval] failed to log video: )r\  r  r  existssortedglobrj  rk  with_suffixr`  systemstrr  r  ZVideor%  r  r)  )r  Z	video_dirri  latestZ	h264_pathZvid_pathe)_last_logged_eval_videor4  rf  r'   r(   maybe_log_eval_video  s,   
"
z"main.<locals>.maybe_log_eval_videoc                    s    |  |  |  d S r  r'   r  )_orig_vis_callbackrh  ry  r'   r(   vis_and_eval_callback  s   z#main.<locals>.vis_and_eval_callbackZEpochs)r   g      N@u   
⏱ Time limit reached (z.0fz min). Stopping.
z<============================================================zEpoch r  )r  r   r!  r"  r&   r#  r$  r   zTrain Loss: r  z
 (Volume: z, Gripper: z, Rotation: )r&   r   zVal - Loss: z
, Volume: z, Pixel Error: zpx, Height Error: r<  zmm, Gripper: g     @@)r  z
train/lossztrain/volume_lossztrain/gripper_lossztrain/rotation_losszval/losszval/volume_losszval/gripper_losszval/pixel_errorzval/height_error_mmzval/gripper_abs_errorr  )r  r  r  
train_lossrd  r  r  r  r  r   r   r!  r   r   rM  zbest.pthu   ✓ Saved best model (val_loss=z=
============================================================u   ✓ Training complete!zBest val loss: zCheckpoints saved to: r'   )wargparseArgumentParseradd_argumentru  r  r   
BATCH_SIZELEARNING_RATE
NUM_EPOCHS
parse_argsr   __file__parentr_  mkdirstats_cache_pathrC   rA   r  is_availablebackendsr  r%  r  initwandb_projectZwandb_entity
wandb_moder  r  r  r  r  r  r  r  r  r	   r   r   r  r  r  lowersplitr   r  libero.liberoget_benchmark_dictget_num_taskslistr`   r   rd   r   r  Zoverfit_one_sampleutilsdataDatasetr   random_split	Generatormanual_seedr   r&   r)   r@  r  r  Zpretrained_backboner`  pathrp  loadrN  rX  r_   load_state_dictr  rg   
parametersoptimAdamWfilter
checkpointrsplitr&  keysrq  r,  r)  openjsonr2   r3   r4   dumprW  rH   rI   rQ   rS   r^  r   r   Zskip_rotationr  Zno_ema_lossr!  timer   max_minutesr;  r#  rw  r  rO  finish)Gparserr  Ztask_id_listZfull_dataset_bmn_tasksdatasetstiddsZ
one_sampler  Zdataset_sizeZval_sizeZ
train_sizeZ
ModelClassZmodel_kwargsZpt_ckptZpt_stateZ
model_dictZbackbone_keysloadedskippedr   r>  Zpt_epochZpt_val_lossZn_trainableZn_totalZstart_epochr  Zcheckpoint_height_valuesZcheckpoint_gripper_valuesZcheckpoint_pathaltZ	other_extZalt_pathZmodel_stateZfiltered_stateZshape_mismatchesmissing_keysunexpected_keysmsgrw  Zstats_cachefr7  Zcheckpoint_rot_valuesZcheckpoint_pos_valuesbest_val_lossr*  r   rL  rQ  r{  Ztraining_start_timer  elapsedr}  Ztrain_volume_lossZtrain_gripper_lossZtrain_rotation_lossrd  Zval_heatmap_lossZval_height_lossZval_gripper_lossZ	val_errorZval_height_errorZval_height_error_tfZval_gripper_errorZ
sample_valZcheckpoint_datar'   )rP  rE  re  rx  rz  r4  r"  rA   rh  ry  r  r  rf  rg  r5  rJ  r6  rK  r(   main;  s  






























		



 *
"

""

^ 

 





,





,
.#&":

F


r  __main__r  )rz   )
Fr   r   Nr   r   r   NNr   )P__doc__rC   torch.nnnntorch.nn.functional
functionalrb   Ztorch.optimr  torch.utils.datar   r   r   r|   pathlibr   r   r~  r  r  mathr#  rb  r  sysr`  r  insertdirnamer  r  robosuite.utils.camera_utilsr   robosuite.utils.transform_utilsr  transform_utilsr  r  r   r   r	   r  r
   r   r   r   r   r2   r   r)   r+   r-   r<   rG   rL   rM   rV   rX   rs   rv   r  r  r  r  r  r   r   r   r   r   r   r   r   r;  rw  r  r  r  r  r  r'   r'   r'   r(   <module>   s    #			
	



 { o\      
i
