o
    i                     @   s*  d Z ddlZddlmZ ddlm  mZ ddlmZ ddl	m
Z
mZ ddlZddlmZ ddlmZ ddlZddlZddlZddlZddlZddlZejdeje ddlZddlmZ ddlm  m Z! ddl"m#Z#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl&Z,dd	lm-Z- d
d Z.dd Z/dd Z0dd Z1dd Z2dd Z3dd Z4dd Z5dd Z6dZ7dZ8dZ9dZ:d a;d!a<d"a=d a>d a?dEd$d%Z@d&d' ZAd(d) ZBd*d+ ZCd,d- ZDd.d/ ZEd0d1 ZFd2d3 ZG	4		5		6	7		dFd8d9ZHe:d7fd:d;ZId<d= ZJd>d? ZKe:d7fd@dAZLdBdC ZMeNdDkreM  dS dS )Ga;  Train trajectory volume predictor on LIBERO demonstrations.

Model predicts a pixel-aligned volume: N_WINDOW x N_HEIGHT_BINS logits per pixel (CE at trajectory pixel only).
Gripper is per-pixel (N_WINDOW x N_GRIPPER_BINS per pixel): supervised at GT pixel (teacher forcing), decoded at pred pixel in val/inference.
    N)
DataLoaderConcatDataset)Path)tqdm)#project_points_from_world_to_camera)RealTrajectoryDatasetCachedTrajectoryDatasetN_WINDOW)TrajectoryHeatmapPredictorN_HEIGHT_BINSN_GRIPPER_BINS
N_ROT_BINS	PRED_SIZE)*recover_3d_from_direct_keypoint_and_heightc                 C   s   | dkrt S | dkrddlm} |S | dkrddlm} |S | dkr*ddlm} |S | d	kr6dd
lm} |S | dkrBddl	m
} |S | dkrNddlm} |S td|  )Nparaactr   )ACTPredictorda3)DA3Predictormoge)MoGePredictordino_vla)DinoVLAPredictorinternvl)InternVLAPredictorinternvl_act)InternVLACTPredictorzUnknown model_type: )r
   Z	model_actr   Z	model_da3r   Z
model_moger   Zmodel_dino_vlar   Zmodel_vla_internvlr   Zmodel_vla_internvl_actr   
ValueError)
model_typer   r   r   r   r   r    r   5/data/cameron/para/panda_streaming/../libero/train.pyget_model_class!   s*   r!   c                 C   s   | dvS )zTACT-style models use direct regression; everything else uses pixel-aligned heatmaps.)r   r   r   r   r   r   r    is_heatmap_model9   s   r#   c                 C   sL   t j}t j}| | || d  }|dd}|td   dtd }|S )a  Discretize continuous height values into bin indices.

    Args:
        height_values: (B, N_WINDOW) or (N_WINDOW,) tensor of heights in [MIN_HEIGHT, MAX_HEIGHT]

    Returns:
        bin_indices: (B, N_WINDOW) or (N_WINDOW,) tensor of bin indices in [0, N_HEIGHT_BINS-1]
    :0yE>              ?   r   )model_module
MIN_HEIGHT
MAX_HEIGHTclampr   long)height_valuesmin_hmax_h
normalizedbin_indicesr   r   r    discretize_height>   s   	r2   c                 C   sH   t j}t j}| jdd}tjddt| jd}|| }|||  | }|S )zDecode height bin logits back to continuous height values.

    Args:
        bin_logits: (B, N_WINDOW, N_HEIGHT_BINS) logits for each bin

    Returns:
        height_values: (B, N_WINDOW) continuous height values in [MIN_HEIGHT, MAX_HEIGHT]
    dimr%   r&   device)r(   r)   r*   argmaxtorchlinspacer   r7   )
bin_logitsr.   r/   r1   bin_centersr0   r-   r   r   r    decode_height_binsQ   s   	r=   c                 C   sH   t j}t j}| | || d  }|dd}|td   dtd S )z6Discretize continuous gripper values into bin indices.r$   r%   r&   r'   r   )r(   MIN_GRIPPERMAX_GRIPPERr+   r   r,   )Zgripper_valuesmin_gmax_gr0   r   r   r    discretize_gripperc   s
   rB   c                 C   s@   t j}t j}| jdd}tjddt| jd}|| ||  | S )uX   Decode (B, N_WINDOW, N_GRIPPER_BINS) logits → (B, N_WINDOW) continuous gripper values.r3   r4   r%   r&   r6   )r(   r>   r?   r8   r9   r:   r   r7   )r;   r@   rA   r1   r<   r   r   r    decode_gripper_binsl   s
   rC   c                 C   sh   t jtj| jt jd}t jtj| jt jd}| | || d  }|dd}|td  	 dtd S )uT   Discretize (B, N_WINDOW, 3) euler angles → (B, N_WINDOW, 3) bin indices, per axis.r7   dtyper$   r%   r&   r'   r   )
r9   tensorr(   MIN_ROTr7   float32MAX_ROTr+   r   r,   )Zeuler_valuesmin_rmax_rr0   r   r   r    discretize_rotationu   s
   rL   c                 C   s`   t jtj| jt jd}t jtj| jt jd}| jdd}t jddt	| jd}|| ||  | S )uQ   Decode (B, N_WINDOW, 3, N_ROT_BINS) → (B, N_WINDOW, 3) continuous euler angles.rD   r3   r4   r%   r&   r6   )
r9   rF   r(   rG   r7   rH   rI   r8   r:   r   )Z
rot_logitsrJ   rK   r1   r<   r   r   r    decode_rotation_bins~   s
   rM   c                 C   s   t |}| j\}}}}g }tdD ]/}| dddd|ddf || |}	|dddd|f || }
|t|	|
 qt|	 S )zCross-entropy for 3 euler axes, averaged so scale matches gripper loss (~log(N_ROT_BINS)).

    Args:
        pred_rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS)
        target_euler:         (B, N_WINDOW, 3) euler angles in radians
       N)
rL   shaperangereshapeappendFcross_entropyr9   stackmean)Zpred_rotation_logitsZtarget_eulerZtarget_binsBN_ZNrlossesaxisZlogits_axisZtarget_axisr   r   r    compute_rotation_loss   s   ( r\      -C6?  i  r&         @      ?    c                 C   s~   g }t d| |D ]*}t d||D ]!}|D ]}	ttj||gtjdt|	||}
|
dur0||
 qqq|r:t|S tdS )zABuild 3D points for volume visualization (numpy). Returns (N, 3).r   rE   N)r   rN   )rP   r   nparrayfloat64floatrR   zeros)HWcamera_posecam_KZheight_bucket_centersZ
pixel_steppointsyxheightptr   r   r    build_volume_3d_points_for_vis   s   
rr   c                 C   s  | j \}}}}}| j}|dddddf  d|d }	|dddddf  d|d }
|d|d }g }t|D ];}| dd|f }||d}|dd|f ||  |
dd|f |  |	dd|f   }|tj||dd q?t	
| S )a  Cross-entropy with softmax over all 3D cells (per timestep).

    For each timestep, flatten volume to (B, H*W*N_HEIGHT_BINS), softmax over cells,
    and supervise with the correct 3D cell index: (height_bin, y, x) -> h_bin*(H*W) + y*W + x.

    Args:
        pred_volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, H, W)
        trajectory_2d: (B, N_WINDOW, 2) pixel coords [x, y]
        target_height_bins: (B, N_WINDOW) bin indices in [0, N_HEIGHT_BINS-1]
    Nr   r'   r3   rV   )	reduction)rO   r7   r,   r+   rP   rQ   rR   rS   rT   r9   rU   rV   )Zpred_volume_logitstrajectory_2dtarget_height_binsrW   rX   Nhri   rj   r7   pxpyZh_binrZ   tZlogits_tZlogits_flatZ
target_idxr   r   r    compute_volume_loss   s   &&@rz   c                 C   s.  | j \}}}}}| j}tj||d|tjd}tj|||tjd}t|D ]S}	| dd|	f }
|
jdd\}}||dj	dd}|| }|| }|
 |dd|	df< |
 |dd|	df< |
tj||ddd||f j	dd|dd|	f< q$tjd	d
t|d}tj}tj}|| }|||  | }||fS )an  From volume (B, N_WINDOW, N_HEIGHT_BINS, H, W) get pred 2D and height per timestep.

    For each t: max over height bins gives (H,W) score; argmax gives (x,y); at (x,y) argmax over bins gives height bin.
    Returns:
        pred_2d: (B, N_WINDOW, 2) float pixel coords
        pred_height: (B, N_WINDOW) continuous height from decode_height_bins at that pixel
       rD   Nr'   r4   r3   r   r6   r%   r&   )rO   r7   r9   rh   rH   r,   rP   maxrQ   r8   rg   aranger:   r   r(   r)   r*   )volume_logitsrW   rX   rv   ri   rj   r7   pred_2dZpred_height_binsry   vol_tZ
max_over_hrY   Zflat_idxrx   rw   r<   r.   r/   r0   pred_heightr   r   r    &extract_pred_2d_and_height_from_volume   s.   r   c                 C   s   |dk  }t| |S )u  Binary cross-entropy for gripper (open/close).

    IMPORTANT: gripper MLP outputs (B, N_WINDOW) raw logits — NOT bins.
    Target: -1 (open) → 0, +1 (close) → 1.
    DO NOT revert this to cross_entropy — the model outputs 1 logit, not 32 bins.

    Args:
        pred_gripper_logits: (B, N_WINDOW) raw logits (pre-sigmoid)
        target_gripper:      (B, N_WINDOW) values in [-1, 1]
    r   )rg   rS    binary_cross_entropy_with_logits)pred_gripper_logitstarget_gripperZtarget_binaryr   r   r    compute_gripper_loss   s   r   c                 C   s   t jg d| jdddd}t jg d| jdddd}| | |   }t|ddddd}|  }|  }	t	|
 |j\}
}t||
g}|||	|fS )z-Get visualization arrays for a single sample.)g
ףp=
?gv/?gCl?r6   rN   r'   )gZd;O?gy&1?g?r{   r   )r9   rF   r7   viewcpunumpyrd   clip	transposeunravel_indexr8   rO   re   )rgbtarget_heatmappred_heatmapZ	target_2drV   stdZ
rgb_denormZrgb_visZ	pred_heatZ	target_ptpred_ypred_xZpred_ptr   r   r    visualize_sample   s   r   c                 C   s    || d }| | |  ddS )zDNormalize values to [0, 1] given per-axis min/max. Clamps to [0, 1].r$   r%   r&   )r+   valuesmin_valsmax_valsZ
range_valsr   r   r    normalize_to_01  s   r   c                 C   s   || }| | | S )z6Denormalize values from [0, 1] back to original scale.r   r   r   r   r    denormalize_from_01  s   r   c                 C   s   | j }tjtj|tjd}tjtj|tjd}tjtj|tjd}	tjtj|tjd}
t	|||}t	||	|
}t
| |}t
||}|dk }tt
|| }|||fS )zGACT losses: MSE for pos/rot (normalized [0,1]), BCE for binary gripper.rD   r   )r7   r9   rF   r(   MIN_POSrH   MAX_POSrG   rI   r   rS   mse_lossrg   ACT_GRIPPER_LOSS_WEIGHTr   )pos_predrot_predgripper_predtrajectory_3dtrajectory_eulertrajectory_gripperr7   min_posmax_posmin_rotmax_rotZ
pos_targetZ
rot_targetZpos_lossrot_lossZgrip_targetgripper_lossr   r   r    compute_act_loss  s   
r   F2   
   r   c           3      C   s  |    d}d}d}d}d}t|	}t|}t|ddd}|D ]}|d |}|d |}|d |}|d |}|d	d	df }|d
 |}i }|	dv rad|v ra|d ||d< n|	dv rod|v ro|d |d< |r|d	d	d	d	df }t|}t}|t }|| } | ||fd| i|\}!}"}#}$|!jd }%|%t }t	t
|!| | }&tt|"| }'tt|#| }(|r|&})nk|&|' |( })nd|j}tjtj|tjd}*tjtj|tjd}+tjtj|tjd},tjtj|tjd}-t|d	d	df |*|+}.t|d	d	df |,|-}/| ||f|.|/d|\}0}1}2t|0|1|2|||\}&}(}'|&|' |( })|  |)  |  ||) 7 }||& 7 }||' 7 }||( 7 }|d7 }|j|) d|& d|' d|( dd |d7 }|dkr|| dkrtj |) |& |' |( d|d |d	ur|dkr|| dkr|| |d	ur|
dkr||
 dkr|| q|| || || || |fS )zTrain for one epoch.

    Args:
        just_heatmap: if True, only volume loss is applied (gripper loss skipped).
        model_type: 'para'/'da3'/'moge' use heatmap CE, 'act' uses direct MSE.
    r   ZTrainFdescleaver   r   rt   r   Nr   r   r   clip_embeddingr   r   task_description	task_textr{   query_pixelsr3   rD   Zcurrent_eef_posZcurrent_gripperr'   .4f)lossvolgriprot)ztrain_step/lossztrain_step/volume_lossztrain_step/gripper_lossztrain_step/rotation_lossstep)!trainr#   intr   tor2   r   
IMAGE_SIZErO   VOLUME_LOSS_WEIGHTrz   GRIPPER_LOSS_WEIGHTr   ROTATION_LOSS_WEIGHTr\   r7   r9   rF   r(   r   rH   r   r>   r?   r   r   	zero_gradbackwardr   itemset_postfixwandblog)3model
dataloader	optimizerr7   just_heatmapglobal_step_startvis_every_stepsvis_callbackZlog_scalars_everyr   save_every_stepssave_callback
total_losstotal_volume_losstotal_gripper_lossZtotal_rotation_lossZ	n_batchesheatmap_modeglobal_steppbarbatchr   r   rt   r   start_keypoint_2dr   extra_kwargstarget_heightru   Zpred_size_tmpcoord_scaletrajectory_2d_predr~   gripper_logitsrotation_logitsZ_feats	pred_sizevolume_lossr   rotation_lossr   r   r   Zmin_gripZmax_gripcurrent_eef_normcurrent_grip_normr   r   r   r   r   r    train_epoch%  s   


""r   c           U   	   C   s
  |    d}d}d}d}d}	d}
d}d}d}t|}t  t|ddd}t|D ]\}}|d |}|d |}|d |}|d	 |}|d
 }|d }|dddf }|dddddf }|d |}i }|dv rd|v r|d ||d< n|dv rd|v r|d |d< |sltjtj	|tj
d}tjtj|tj
d}tjtj|tj
d}tjtj|tj
d}t|dddf ||} t|dddf ||}!| ||f| |!d|\}"}#}$t|"|#|$|||\}%}&}'|%|' |& }(||( |jd  7 }||% |jd  7 }||' |jd  7 }t|"||})|)| jddjdd  d }*||*7 }t|$dkt|$t|$ }+|t|+| jdd  7 }||jd 7 }|j|( d|*|jd  dd q+t|},t}-|-| }.||. }/| ||fd|/i|\}0}1}2}3tt|0|/|, }4tt|1| }5t t!|2| }6|4|5 |6 }(||( |jd  7 }||4 |jd  7 }||5 |jd  7 }t"|0\}7}8|7|. }9| #|3|7\}:};tt$|:dkt|:t|: }<|0jd |0jd |0jd |0jd f\}=}>}?}@t%|>D ] }Atj|9dd|Af |dd|Af  dd }B||B 7 }q|	t|8| jdd  7 }	|
d 7 }
|t|<| jdd  7 }||jd 7 }|j|( dtj|9dddf |dddf  dd  d!d" |dkr|du rg }Ct%t&D ]C}A|0d|Af ' }Dt(j)|D*ddd*|Djd |Djd |Djd }E|Ej+ddd }Ft(j,|F-d-d||fd#dd$d% }G|C.|G qt/|C}C|8d }H|<d }I|H0 dkr|H-d1t&}H|I0 dkr|I-d1t&}I|d 2 3 }J|d 2 3 }K|K4 }L|Ld  |9  < |Ld  |9  < g }Mt%t&D ]F}A|9d|Adf  |9d|Adf  }N}O|8d|Af  }Pt5t6j7|N|Ogt6j8d&|P|J|L}Q|QdurX|M.|Q q |M.|d|Af 2 3  q t67|M}Ri d|d d'|d( d |d)|Cd|d d|d d*|d* d d+|d+ d d,|d, d 2 3 d-|d- d d.|Rd
|d 2 3 d|d 2 3 d/|Ld0|Hd1|d d2|Id3|d }q+W d   n	1 sw   Y  t+d|}S||St&  }T||S ||S d ||S |T|	|S |
|S ||S |f	S )4zValidate model.r   NZValFr   r   rt   r   r   rk   
cam_K_normr{   r   r   r   r   r   r   rD   r   r3   r4   r'   r_   r   z.1f)r   Zpos_mmr   ra   rN      r%   .2f)r   rw   bilinearsizemodealign_cornersr   r   rc   r   heatmap_targetr   trajectory_quatrgb_frames_rawworld_to_camerabase_zpred_trajectory_3dcam_K_at_sizer   r   pred_gripperr   )9evalr#   r9   no_gradr   	enumerater   rF   r(   r   rH   r   r>   r?   r   r   r   rO   r   normrV   sumwhere	ones_likeabsr   r2   r   r   rz   r   r   r   r\   r   predict_at_pixelssigmoidrP   r	   
contiguousrS   softmaxrQ   r|   interpolate	unsqueezerR   rU   r5   expandr   r   copyr   rd   re   rf   )Ur   r   r7   
image_sizer   r   r   r   Ztotal_pixel_errorZtotal_height_errorZtotal_height_error_tfZtotal_gripper_error	n_samplesZsample_datar   r   	batch_idxr   r   rt   r   r   rk   r   r   r   r   r   Z	min_pos_tZ	max_pos_tZ
min_grip_tZ
max_grip_tr   r   r   r   r   vol_lossr   	grip_lossr   Zpos_pred_denormZ
pos_err_mmZgrip_pred_binaryru   r   r   r   r~   r   r   featsr   r   r   r   r   pred_2d_fullr   rY   r   rW   rX   ri   rj   ry   Zpixel_error_tpred_heatmapsr   	vol_probs	heatmap_t
heatmap_upZpred_h_0Zpred_g_0Zcam_pose_npZcam_K_norm_npZcam_K_nppred_trajectory_3d_listrw   rx   hrq   pred_trajectory_3d_npnZavg_pixel_errorr   r   r    validate  sZ  
$

"$
,.""H0&
&

	
  
r  c                 C   sD   t j| t jd}|jdkr|dd}t||||d}dd |D S )aL  Project (N,3) world points to pixel coords on the training image (flipud of obs).

    Matches debug_libero_projection.py exactly: project_points_from_world_to_camera returns
    (row, col) that can be drawn directly on flipud(obs_img) with NO additional row flip.
    Returns list of (u, v) = (col, row) ready for cv2 drawing.
    rc   r'   rN   )rm   Zworld_to_camera_transformZcamera_heightZcamera_widthc                 S   s4   g | ]}t tt|d  t tt|d fqS )r'   r   )r   roundrg   ).0rcr   r   r    
<listcomp>\  s   4 z&_proj_world_to_vis.<locals>.<listcomp>)rd   asarrayrf   ndimrQ   r   )Z	points_3dr   ri   rj   ptsZpix_rcr   r   r    _proj_world_to_visL  s   
r   c           &      C   sH  | du rdS | d }| d }| d }| d }| d }t | d }g }ttD ]}	||	   }
|
jdd \}}| d	 |	    }||  }| d
krX||  }t	
|
}||d< t	|
d |d  dd}|d t	j}t	| |j\}}t|t|}}d|  kr|k rn n/d|  kr|k rn n#t|||fdtjddtj t|d|d |d ftjdddtj ||	   t	j}t||||d \}}d|  kr|k rn nd|  kr|k rn nt|||fddd t|d|d |d ftjdddtj | }||d< t||||d \}}d|  kr4|k rvn n@d|  krB|k rvn n2t|||fddd t|d|d|d |d ftjdddtj t|||f||fddtj ||	   t	j}t|}d}g d }t|D ]T\} }!||dd| f |  }"t|"|||d \}#}$d|#  kr|k rn qd|$  kr|k rn qt|||f|#|$f|!dtj t||#|$fd!|!d qt|d"|	 d#tjdddtj t|d"|	 d#tjdd$dtj || q&t	j |dd%}%t!j"|%| d&td  d'd(S ))a  Build a horizontal strip (one tile per timestep) matching debug_libero_projection.py style.

    Each tile shows the actual RGB frame at that timestep with:
      - predicted heatmap blended in red
      - predicted pixel: green crosshair
      - GT EEF projection: white filled circle + label
      - GT base-plane projection: cyan ring + yellow line to EEF
      - GT EEF rotation axes: red (x), green (y), blue (z) lines
    Nr   r   r   rt   r   r   r{   r   r$   ).r   g?g?r   r'   g     o@r      r      predr]   gffffff?   )r"  r"  r"  r3   Zeef)r   r"  r"  zbase z=.3f   )r"  r"  r   g{Gz?))r"  r   r   r!  )r   r   r"  rN   zt=)r]      )   r)  r)  r[   z: timesteps 0..z (left->right))caption)#rg   rP   r	   r   r   rO   detachminr|   rd   
zeros_liker   astypeuint8r   r8   r   cv2
drawMarkerMARKER_CROSSLINE_AAputTextFONT_HERSHEY_SIMPLEXrf   r   circler  lineT_robZquat2matr   rR   concatenater   Image)&sampleZ
split_namer   r   r   rt   r   r   Ztilesry   frameri   rj   Zpred_heatmap_tZheatZheat_rgbZoverlayvisr   r   rw   rx   eef_posuvZeef_baseZugZvgeef_quatZeef_rotZaxis_lenZaxis_colors_rgbicolorendpointuavastripr   r   r    build_wandb_timestep_strip_  sj   

0(4(80
<""rI  c           $      C   s  |    t  |d dd |}|d dd |}|d dd |}|d dd |}|d d   }	|d d   }
|
 }|d  |9  < |d  |9  < |d	d	df }i }|d
kr~d|v r~|d dd ||d< n|dv rd|v r|d d g|d< | ||fi |\}}}}t|\}}|jd }|| }|| }| 	||\}}t
t|dkt|t| }g }ttD ]B}|d|f  }tj|ddd|jd |jd |jd }|jddd }tj|dd||fdddd }|| qt|}g }ttD ]A}|d|df  |d|df  }} |d|f  }!ttj|| gtjd|!|	|}"||"d	urS|"n	|d|f    qt|}#i d|d d|d d |d|d|d d|d d|d d d|d d d|d d   d|d d d|#d|	d |d!|d d"|dd	d	df d#|d d$|d W  d	   S 1 sw   Y  d	S )%z4Build visualization sample dict from a single batch.r   r   r'   rt   r   r   rk   r   Nr   r   r   r   r   r3   ra   r4   r{   r   Fr   r   rc   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r9   r   r   r   r   r  r   rO   r  r   r  r   rP   r	   r  rS   r  rQ   r|   r  r  rR   rU   r   r   rd   re   rf   )$r   r   r7   r	  r   r   rt   r   r   rk   r   rl   r   r   r~   rY   r  r   r   r   r   r  r   r   r  ry   r   r  r  r  r  rw   rx   r  rq   r  r   r   r    build_sample_data_for_logging  s   

0&
&,
	
&rJ  c            D         sf  t jdd} | jdtdg ddd | jdtd	d
d | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdtddd | jdttdd | jd ttd!d | jd"ttd#d | jd$tdd%d | jd&td'd(d | jd)td'd*d | jd+td d,d | jd-td.g d/d0d | jd1tdd2d | jd3td4d5d | jd6td7d8d | jd9td:d;d | jd<td=d>d | jd?tdd@d | jdAtdBdCd | jdDtdBdEd | jdFtdGdHd | jdItdJdKd | jdLtdBdMd | jdNtdOdPd | jdQdRdSdT | jdUtddVd | 	 j
ajajajajattj}|dW j   jdXdXdY jrHtjn dZ }ttj rVd[n
tjj r_d\nd]t d^  t!j"j#j$jj%j&j'j(j)j*j+j,j-t.t/t0t1d_d` t da j2rd }j3rj34 5 dbkrd }ndcdd j36deD }nj'g}t7j2j&|t.t/j8j)df}j2j&|j8dgn{j3r2ddhl9m&} |: j&  ; }j34 5 dbkrt<t=|}ndidd j36deD }t dj|  g }|D ]}t>t.j&|j(j)j8dk}	|?|	 qt@|}j&|j(j)dlnt>t.j&j'j(j)j8dk}j&j'j(j)dmt dn  t dotA| dp jBrt dq |d }
G drds dstjCjDjE}||
dtdu||
ddun%tA|}tFdvt|j* }|| }tjCjDjG|||gtH Id7dw\t dxtA dp t dytA dp tJj+dXdzdXd{tJj+d|dzdXd{	t d}jK d~ tLjK}tMt.t/d|d}jKdv rjN|d< |di |OjPrtQjRSjPrt djP  tjTjPd}|d }U }dd |V D }g }g }|V D ]#\}}||v rL|jW|| jWkrL|||< |?| q/|?| q/jX|d|d t dtA| d |rwt dtA| d|d d   |Ydd}|Ydd}t d| d|  njPrt djP  tZdd [ D }tZdd [ D }t d|ded|dedd| | dd t\j]t^dd [ j,ddd}d }d }d } j_}!j_rtQjRS|!s|!`ddv}"tA|"dkr|"dv 5 dkrdnd}#|"d |# }$tQjRS|$r|$}!t dj_ d|!  j_rEtQjRS|!rEt d|!  tjT|!d}|d }%U }i }&g }'|%V D ]*\}}||v rq|jW|| jWkr`||&|< qH|'?| d|jW d|| jW  qHta|b ta|%b  }(ta|%b ta|b  })|(rt dtc|(  |)rt dtc|)  |'rt d |'D ]
}*t d|*  q|d|& jX|d|d |'rt d n$z	X|d  W n tey }+ zt d|+  W Y d }+~+nd }+~+ww |Ydddv }d|v rd|v r|d |d f}t d|d dd|dv dd d|v r=d|v r=|d |d f} t d| d dd| dv dd t d|  nj_rQt dj_  d },|S rz&tf|ddd}-tgT|-},W d    n	1 sqw   Y  t d|  W n tey }+ zt d|+  d },W Y d }+~+nd }+~+ww fddÄ}.|d ur|\th_ith_jt dthjiddthjjdd nc|,d u r|. },|jjdXdXdY tf|ddd}-tgjk|,|-ddƍ W d    n	1 sw   Y  t d|  t|,d th_it|,d th_jt dthjiddthjjdd tlthjithjj dk r"t dʃ | d ur=| \th_mth_nt dthjmddthjndd nT|,d u rt|. },|jjdXdXdY tf|ddd}-tgjk|,|-ddƍ W d    n	1 shw   Y  t d|  t|,d th_mt|,d th_nt dthjmddthjndd d }/|d urd|v rd|v r|d |d f}/|/d ur|/\th_oth_pt dthjo dthjp  nU|,d u r|. },|jjdXdXdY tf|ddd}-tgjk|,|-ddƍ W d    n	1 sw   Y  d|,v r|,d th_o|,d th_pt dddd thjoD  dddd thjpD   d }0|d ur0d|v r0d|v r0|d |d f}0|0d urH|0\th_qth_rt dthjq dthjr  nU|,d u rx|. },|jjdXdXdY tf|ddd}-tgjk|,|-ddƍ W d    n	1 ssw   Y  d|,v r|,d th_q|,d th_rt dddd thjqD  dddd thjrD   t d؈j- dٝ tdڃ}1d}2	fdd܄}3jsdkr|3d  fddބ}4ttt=|j-ddD ]}5t dd  t d|5 dj-  t d  tud||2js|3jKjv|4d\}6}7}8}9}2t d|6dd|7dd|8dd|9dd	 tw	jKd\	}:};}<}=}>}?}@}A}Bt d|:dd|;dd|>dd|?dO dd|Ad
 t!jx|5|6|7|8|9|:|;|=|>|?d |Ad|2d |5U U |6|:thjithjjthjmthjnthjothjpthjqthjrd}Cty|C d  |:|1k r|:}1ty|C d  t d|:dd qt!z  t d t d t d|1d t d   d S )Nz1Train PARA trajectory heatmap predictor on LIBERO)descriptionz--model_typer   )r   r   r   r   r   r   r   zModel architecture to train)typedefaultchoiceshelpz--model_namezOpenGVLab/InternVL2_5-1Bz4HuggingFace model name (used by internvl model_type))rL  rM  rO  z--benchmarkZlibero_spatialzLIBERO benchmark namez	--task_idr   zTask index within benchmarkz
--task_ids zbComma-separated task indices to train on all at once (e.g. '0,1,2' or 'all'). Overrides --task_id.z--cameraZ	agentviewz*Camera name used for training observationsz--max_demosr   z'Maximum demos to load from task datasetz--val_splitg?z*Fraction of episodes to use for validationz--batch_sizezBatch size for trainingz--lrzLearning ratez--epochszNumber of epochsz--checkpointz!Path to checkpoint to resume fromz
--run_nameZpara_liberoz/Name of run (used for checkpoint paths and W&B)z--wandb_projectzW&B project namez--wandb_entityzW&B entity/team (optional)z--wandb_modeonline)rQ  offlinedisabledzW&B modez--stats_cache_pathz+Path to JSON cache for height/gripper statsz--stats_sample_limiti  zHRandom number of samples to use for stats computation (0 = full dataset)z--stats_seed*   z!Random seed for stats subsamplingz--vis_every_stepsr   z,Log visualization images every N train stepsz--frame_striderN   uZ   Sample every Nth frame from the demo (default 3 → ~6.7Hz @ 20Hz, N_WINDOW=6 spans ~0.9s)z--cache_rootzfPath to pre-rendered dataset (e.g. /data/libero/parsed_libero). Uses CachedTrajectoryDataset when set.z--pos_loss_weightr&   zDWeight for position loss (ACT: normalizes pos MSE to match rot/grip)z--volume_loss_weightzGWeight for volume/heatmap CE loss (PARA: scale relative to grip/rot CE)z--gripper_loss_weightr`   zWeight for gripper CE lossz--rotation_loss_weightra   zWeight for rotation CE lossz--act_gripper_loss_weightzWeight for ACT gripper BCE lossz--save_every_stepsr_   z@Save checkpoint every N training steps (0 = only save per-epoch)z--overfit_one_sample
store_truez1Overfit to a single dataset sample (sanity check))actionrO  z--pretrained_backboneu   Path to point-track pretrained checkpoint (e.g. point_track_pretraining/checkpoints/.../best.pth). Loads only the DINO backbone weights (dino.* keys) from the checkpoint, initializing all other heads randomly. Use this for the pretrain→finetune pipeline.checkpointsT)parentsexist_okzdataset_stats.jsoncudampsr   zUsing device: )	benchmarktask_idcamera	max_demos	val_split
batch_sizelrepochsr	  n_windowZn_height_binsZn_gripper_bins)projectentitynamer   configz
Loading dataset...allc                 S      g | ]}t |qS r   r   r  ro   r   r   r    r  i      zmain.<locals>.<listcomp>,)
cache_rootbenchmark_nametask_idsr	  rd  frame_strider_  )ro  r\  rq  rr  )r\  c                 S   rj  r   rk  rl  r   r   r    r  |  rm  z  Multi-task mode: tasks )r	  rp  r]  r^  r_  rr  )r\  rq  r^  r_  )r\  r]  r^  r_  z
  Source: z	  Total:  samplesuC   
⚠ OVERFIT MODE: using a single repeated sample for train and valc                   @   s&   e Zd Zd	ddZdd Zdd ZdS )
zmain.<locals>._RepeatDataset  c                 S   s   || _ || _d S N)r<  r  )selfr<  r  r   r   r    __init__  s   
z%main.<locals>._RepeatDataset.__init__c                 S      | j S ru  r  )rv  r   r   r    __len__     z$main.<locals>._RepeatDataset.__len__c                 S   rx  ru  )r<  )rv  rY   r   r   r    __getitem__  r{  z(main.<locals>._RepeatDataset.__getitem__N)rt  )__name__
__module____qualname__rw  rz  r|  r   r   r   r    _RepeatDataset  s    
r  rt  ry  r'   )	generatoru   ✓ Train: u	   ✓ Val: r(  )ra  shufflenum_workers
pin_memoryFz
Initializing model (type=z)...)target_sizerd  freeze_backboner   
model_namez#
Loading pretrained backbone from: )map_locationmodel_state_dictc                 S   s    i | ]\}}| d r||qS )zdino.)
startswith)r  krA  r   r   r    
<dictcomp>  s     zmain.<locals>.<dictcomp>)strictz	  Loaded z) backbone keys from pretrained checkpointz
  Skipped z( keys (shape mismatch or not in model):    epoch?val_lossz  Pretrained checkpoint: epoch=z, val_loss=z)
WARNING: pretrained backbone not found: c                 s   s    | ]
}|j r| V  qd S ru  )requires_gradnumelr  pr   r   r    	<genexpr>  s    zmain.<locals>.<genexpr>c                 s   s    | ]}|  V  qd S ru  )r  r  r   r   r    r    s    zTrainable parameters: z / z (d   r   z%)c                 S   rx  ru  )r  )r  r   r   r    <lambda>  s    zmain.<locals>.<lambda>r^   )rb  weight_decay.r{   rq   .pthz.ptzCheckpoint not found at z, using z
Loading checkpoint: z: checkpoint z
 vs model u    ⚠ Missing keys (random init): u   ⚠ Unexpected keys (ignored): u#   ⚠ Shape mismatches (random init):z    u2   ⚠ Skipping optimizer state (model shape changed)optimizer_state_dictu$   ⚠ Could not load optimizer state: 
min_height
max_heightu#   ✓ Height range from checkpoint: [z.6fz, z] mmin_grippermax_gripperu$   ✓ Gripper range from checkpoint: []u   ✓ Resumed from epoch u   
⚠ Checkpoint not found: rzutf-8)encodingu   ✓ Loaded stats cache: u    ⚠ Failed to read stats cache: c                     s  t t  }  jr, jdkr,t j| }t j}td| d|  d j d n
| }td|  d g }g }g }g }t }d}t|d || krM| n|}	d}
t	|d	d
d}||k r|
|	k r|
d7 }
|| krl|}n
|
| }||v rvqY|| |t k rn}|t k r|n|t  }z|| }W n	 ty   Y qYw |d  }|d  }|d  }||d d df   ||  || || |d7 }|d ||k r|
|	k sa|  t |dkst |dkrtd ttjttjttjttjtjtjdddS tj|tjd}tj|tjd}tj|dd}tj|dd}t| t| t| t| |jdd |jdd |jdd |jdd t|j t|j tt!tt"dS )Nr   z-
Computing dataset stats from random subset: /z samples (seed=)z,
Computing dataset stats from full dataset: rs  r)  zStats subsetF)totalr   r   r'   r   r   r   r{   uE   ⚠ No valid samples found for stats; falling back to model defaults.)r  r  r  r  r   r   num_height_valuesnum_gripper_valuesrc   r*  )r  r  r  r  r   r   r   r   r  r  dataset_sourcerd  r	  )#lenZstats_sample_limitr-  randomRandomZ
stats_seedprintsetr|   r   	randrangeadd	Exceptionr   extendtolistrR   updatecloserg   r(   r)   r*   r>   r?   rG   rI   rd   re   rf   r:  r   r   r	   r   )	total_lenZsample_countrngZall_heightsZall_grippersZ
all_eulersZall_positionsseenZsuccess_countZmax_attemptsattemptsr   
global_idxdataset	local_idxr<  r   r   r   Zall_heights_npZall_grippers_npZall_eulers_npZall_positions_np)argsr  train_datasetval_datasetr   r    compute_stats_from_dataset0  s    








z(main.<locals>.compute_stats_from_datasetu)   ✓ Using height range from checkpoint: [w)indentu   ✓ Saved stats cache: u    ✓ Height range from dataset: [gư>uT     ⚠ WARNING: MIN_HEIGHT == MAX_HEIGHT — all height predictions will be constant!u*   ✓ Using gripper range from checkpoint: [u!   ✓ Gripper range from dataset: [r   r   u$   ✓ Rotation range from checkpoint: z .. u!   ✓ Rotation range from dataset: c                 S      g | ]}|d qS r&  r   r  rA  r   r   r    r    rm  r   r   u$   ✓ Position range from checkpoint: u!   ✓ Position range from dataset: c                 S   r  r  r   r  r   r   r    r    rm  z
Starting training for z
 epochs...infc                    s   t  jsd S tt}tt}t|t jd}t|t jd}i }t|d}t|d}|d ur;||d< |d urC||d< |rNtj|| d d S d S )N)r	  r   r   valzvis/train_stripzvis/val_stripr   )	r#   r   nextiterrJ  r   rI  r   r   )r   Ztrain_batchZ	val_batchZsample_trainZsample_val_localpayloadZtrain_stripZ	val_strip)r  r7   r   train_loader
val_loaderr   r    log_visualizations  s    


z main.<locals>.log_visualizationsc                    sp   |     tjtjtjtjtjtjtjtj	d}t
| d|  d  t
| d  td|  d dS )z3Save a mid-epoch checkpoint at a given global step.)r   r  r  r  r  r  r  r   r   r   r   Zstep_r  
latest.pthz
  Saved step checkpoint: step_N)
state_dictr(   r)   r*   r>   r?   rG   rI   r   r   r9   saver  )r   Zckpt)CHECKPOINT_DIRr   r   r   r    save_step_checkpoint  s   z"main.<locals>.save_step_checkpointZEpochs)r   
z<============================================================zEpoch r  )r   r   r   r   r   r   r   zTrain Loss: r   z
 (Volume: z, Gripper: z, Rotation: r  r"   zVal - Loss: z
, Volume: z, Pixel Error: zpx, Height Error: r&  zmm, Gripper: g     @@)r  z
train/lossztrain/volume_lossztrain/gripper_lossztrain/rotation_losszval/losszval/volume_losszval/gripper_losszval/pixel_errorzval/height_error_mmzval/gripper_abs_errorr   )r  r  r  
train_lossr  r  r  r  r  r   r   r   r   r  zbest.pthu   ✓ Saved best model (val_loss=z=
============================================================u   ✓ Training complete!zBest val loss: zCheckpoints saved to: r   ){argparseArgumentParseradd_argumentstrr   rg   
BATCH_SIZELEARNING_RATE
NUM_EPOCHS
parse_argsZpos_loss_weightPOS_LOSS_WEIGHTZvolume_loss_weightr   Zgripper_loss_weightr   Zrotation_loss_weightr   Zact_gripper_loss_weightr   r   __file__parentrun_namemkdirstats_cache_pathr9   r7   rZ  is_availablebackendsr[  r  r   initZwandb_projectZwandb_entity
wandb_moder\  r]  r^  r_  r`  ra  rb  rc  r   r	   r   r   ro  rq  rH  lowersplitr   rr  Zlibero.liberoZget_benchmark_dictZget_num_taskslistrP   r   rR   r   r  Zoverfit_one_sampleutilsdataDatasetr|   random_split	Generatormanual_seedr   r   r!   dictr  r   Zpretrained_backboneospathexistsloadr  itemsrO   load_state_dictgetr   
parametersoptimAdamWfilter
checkpointrsplitr  keyssortedr  r  openjsonr(   r)   r*   dumpr   r>   r?   rG   rI   r   r   r   r   r   r   r  r   r  finish)DparserZ
script_dirr  Ztask_id_listZfull_dataset_bmZn_tasksdatasetstiddsZ
one_sampler  Zdataset_sizeZval_sizeZ
train_sizeZ
ModelClassZmodel_kwargsZpt_ckptZpt_state
model_dictZbackbone_keysloadedskippedr  rA  Zpt_epochZpt_val_lossZn_trainableZn_totalZstart_epochr  Zcheckpoint_height_valuesZcheckpoint_gripper_valuesZcheckpoint_pathaltZ	other_extZalt_pathZmodel_stateZfiltered_stateZshape_mismatchesmissing_keysunexpected_keysmsgeZstats_cachefr  Zcheckpoint_rot_valuesZcheckpoint_pos_valuesbest_val_lossr   r  r  r  r  Ztrain_volume_lossZtrain_gripper_lossZtrain_rotation_lossr  Zval_heatmap_lossZval_height_lossZval_gripper_lossZ	val_errorZval_height_errorZval_height_error_tfZval_gripper_errorZ
sample_valZcheckpoint_datar   )
r  r  r  r7   r   r   r  r  r  r  r    main  s`  




























		



 *
"

""

O 

 





,




,
*



4
r  __main__)rb   )Fr   r   Nr   r   r   N)O__doc__r9   torch.nnnntorch.nn.functional
functionalrS   torch.optimr  torch.utils.datar   r   r   rd   pathlibr   r   r  r   r  r  sysr  r  insertdirnamer  r1  Zrobosuite.utils.camera_utilsr   Zrobosuite.utils.transform_utilsr  Ztransform_utilsr9  r  r   r   r	   r   r
   r   r   r   r   r(   r   r!   r#   r2   r=   rB   rC   rL   rM   r\   r  r  r  r   r   r   r   r  r   rr   rz   r   r   r   r   r   r   r   r  r   rI  rJ  r  r}  r   r   r   r    <module>   s    				

y /OE    
B
