o
    LiY5                     @   s*  d Z ddlZddlZddlZddlZddlZddlmZ ddlZddl	Z
ddlZddlmZ ddlm  mZ ddlmZ ddlmZ ejdeje ddlZejZddlmZ ddlmZmZ ej !dd	Z"ej !d
d	Z#dZ$dZ%dd Z&dd Z'G dd dej(Z)dd Z*e+dkre*  dS dS )u  2D Point Track Pretraining — trains PARA or ACT on 2D pixel trajectories only.

PARA mode: N_HEIGHT_BINS=1, CE over H×W per timestep (predict future EEF pixel location)
ACT mode: CLS→MLP→(u,v) pixel coordinates, L2 loss

No height, no gripper, no rotation supervision.

Usage:
    # PARA 2D pretrain on arm-deleted data
    python train_pretrain_2d.py --model_type para --cache_root /data/libero/ood_objpos_arm_deleted         --run_name para_pretrain_arm_deleted --max_minutes 10

    # ACT pixel pretrain on circle overlay data
    python train_pretrain_2d.py --model_type act --cache_root /data/libero/ood_objpos_circle_overlay         --run_name act_pretrain_circle --max_minutes 10
    N)Path)
DataLoader)tqdm)CachedTrajectoryDataset)TrajectoryHeatmapPredictor	PRED_SIZEDINO_REPO_DIR DINO_WEIGHTS_PATH     c                 C   s   | j \}}}}}| dddddf |||| }|dddddf t |  d|d }	|dddddf t |  d|d }
|
| |	 }t||| || ||| }|S )u   CE loss over H×W for each timestep. No height dimension.

    pred_volume: (B, N_WINDOW, 1, H, W) — single height bin
    trajectory_2d: (B, N_WINDOW, 2) — target (u, v) pixel coords
    Nr      )shapereshape
IMAGE_SIZElongclampFcross_entropy)Zpred_volumetrajectory_2d	pred_sizeBT_HWlogitsZtarget_uZtarget_vZtarget_flatloss r   @/data/cameron/para_normalized_losses/libero/train_pretrain_2d.pycompute_2d_pixel_loss_para0   s   $..&r    c                 C   s   |t  }t| |S )u   L2 loss on predicted (u, v) pixel coordinates.

    pred_uv: (B, N_WINDOW, 2) — predicted pixel coords (sigmoid → [0, 1])
    trajectory_2d: (B, N_WINDOW, 2) — target (u, v) pixel coords
    )r   r   mse_loss)pred_uvr   Ztarget_normr   r   r   compute_2d_pixel_loss_actC   s   r#   c                       s6   e Zd ZdZdef fdd	Zdd Zdd Z  ZS )	ACTPixelPredictorzOACT-style model that predicts (u, v) pixel coordinates instead of 3D positions.r   c                    s   t    || _|| _d| _tjjtddt	d| _
| j
j| _| j}|d }tt|t||t t||t t||d t | _tdd |  D }td|d	d
| d d S )NZ	act_pixeldinov3_vits16pluslocal)sourceweights   c                 s   s    | ]}|  V  qd S )N)numel).0pr   r   r   	<genexpr>h   s    z-ACTPixelPredictor.__init__.<locals>.<genexpr>u   ✓ ACT Pixel model: ,z params, output (B, z, 2))super__init__target_sizen_window
model_typetorchhubloadr   r
   dino	embed_dimnn
Sequential	LayerNormLinearGELUSigmoid	pixel_mlpsum
parametersprint)selfr1   r2   kwargsDinp_dimZn_params	__class__r   r   r0   Q   s*   




	zACTPixelPredictor.__init__c                 C   s   | j |\}\}}| j jD ]}| j jr| j j||dnd }|||}q| j jr;| j |d d d | j jd f }n| j |d d d | j jd f }|d d df S )N)r   r   r   r   )r7   prepare_tokens_with_masksblocks
rope_embeduntie_cls_and_patch_normscls_normn_storage_tokensnorm)rC   xx_tokensH_pW_pblkrope_sincosZx_normr   r   r   _extract_clsk   s   &$zACTPixelPredictor._extract_clsc           	      K   sh   |j d }| |}| dkr|d|d}|| j }tj||gdd}| |	|| j
d}|S )Nr   r   dimr)   )r   rV   rY   	unsqueezeexpandr1   r4   catr?   r   r2   )	rC   rP   start_keypoint_2drD   r   clsZkp_norminppredr   r   r   forwardv   s   


zACTPixelPredictor.forward)	__name__
__module____qualname____doc__N_WINDOWr0   rV   ra   __classcell__r   r   rG   r   r$   N   s
    r$   c            :      C   s  t  } | jdtdddgd | jdtdd | jdtdd | jd	td
d | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd |  }ttj	
 rldnd}td|  t|j|j|jgttdd}tdt| d  td!t|d" }t|| }tjjj|||gt d#d$\}}t||jdd%ddd&}t||jd'd(dd)}	|jdkrd!t_tttd*}
ntttd*}
|
 |}
tj!j"|
# |j$d+d,}t%d-|j& }|j'ddd. zdd l(}|j)|j*|j&|j+d/ W n   Y td0}t,, }d}t-|j.D ],}t,, | d1 |j/kr1 n|
0  d}d}|D ]9}t,, | d1 |j/krM n)|d2  |}|d3  |}|d d df }|jdkr|
||\}}}}|j1d }t2 }}|3|td!||}t4||t2}n
|
||}t5||}|6  |7  tj8j9|
# d4 |:  ||; 7 }|d!7 }|d!7 }|d5 dkrz|<|; |d6 W n   Y |d7 dkrt|jdkrtzy|
=  t>  |
|d d! |d d! \} }}}W d    n	1 sw   Y  |
0  | 3d!td!t2t2} |d ? @d!d(dA }!|!d8 Bdd8CtDjE}!tFG|!tFjH}!|d ? A }"tFI|!t|"d t|"d! fd9d:d; tFJ|!d<g}#tFJ|!K d<g}$t-tD ]}%| d|%df ? }&tjL|&3d;dd=3t2t2A }'tFJ|'ttf}(|(|( d>  d8 CtDjE})tFM|)tFjN}*tFO|!d?|*d@d}+tDP|(Q |(j1},tFI|+|,d! |,d fdAdBd; tFR|+dC|%d!  dDtFjSd@dBd( |#TtFJ|+d< |!K }-|d|%f ? A }.tFI|-t|.d t|.d! fddEd; tFR|-dF|%d!  dDtFjSd@dEd( |$TtFJ|-d< qftDjU|#d!dG}/tDjU|$d!dG}0tDV|/|0g}1|dH }2|2j'ddI tFWt|2dJ|dKdL |1 tFG|1tFjX}3|<|Y|3|dM W q; tZys }4 ztdN|4  W Y d }4~4q;d }4~4ww q;|dkr} n|| }5|
=  d}6d}7t> \ |	D ]Q}|d2  |}|d3  |}|d d df }|jdkr|
||\}}}}|j1d }|3|td!t2t2}t4||t2}n
|
||}t5||}|6|; 7 }6|7d!7 }7qW d    n	1 sw   Y  |6td!|7 }8t,, | d1 }9tdO|d!  dP|5dQdR|8dQdS|9dTdU	 |8|k r;|8}t[|
\ |\ ||8|j|jdkr2d!nd ddV|dW  z|<|8|dX W q   Y qtdYdZ  td[|dQ td\|dW   tdZ  z|]  W d S    Y d S )]Nz--model_typeZparaact)typedefaultchoicesz--cache_rootT)ri   requiredz
--run_namez--benchmarkZlibero_spatial)ri   rj   z	--task_idr   z--batch_size   z--lrg-C6?z--epochsi'  z--max_minutes
   z--wandb_projectZpara_liberoz--wandb_modeZonlinecudacpuzUsing device:    )
cache_rootZbenchmark_nameZtask_idsZ
image_sizer2   Zframe_stridez	Dataset: z samplesr      *   )	generatorr   )
batch_sizeshufflenum_workers
pin_memory	drop_lastFr)   )rv   rw   rx   ry   )r1   r2   gh㈵>)lrweight_decayzcheckpoints/)parentsexist_ok)projectnamemodeinf<   rgbr   g      ?2   )z
train/lossstep         )r   r   r   rW   )r   r   rX   g:0yE>g?g333333?   )r   r   r   zt+)rn      )r   r   r   zGT t+)axisZvis)r~   Zstep_06dz.png)zvis/heatmapr   z  Vis error: zEpoch z: train=z.4fz val=z [z.1fzmin])model_state_dictZoptimizer_state_dictepochval_lossr3   Zn_height_binsZpretrain_2dzbest.pth)zval/lossr   
z2==================================================z"Training complete. Best val loss: zCheckpoint: )^argparseArgumentParseradd_argumentstrintfloat
parse_argsr4   devicero   is_availablerB   r   rr   	benchmarkZtask_idr   rf   lenmaxutilsdatarandom_split	Generatormanual_seedr   rv   r3   model_moduleN_HEIGHT_BINSr   r$   tooptimAdamWrA   r{   r   Zrun_namemkdirwandbinitZwandb_projectZ
wandb_modetimerangeepochsZmax_minutestrainr   r   r   r    r#   	zero_gradbackwardr9   clip_grad_norm_r   itemlogevalno_gradrp   permutenumpyclipastypenpuint8cv2cvtColorCOLOR_RGB2BGRcircleresizecopysoftmaxapplyColorMapCOLORMAP_JETaddWeightedunravel_indexargmaxputTextFONT_HERSHEY_SIMPLEXappendconcatenatevstackimwriteCOLOR_BGR2RGBImage	Exceptionsave
state_dictfinish):parserargsr   datasetZn_valZn_trainZtrain_dsZval_dsZtrain_loaderZ
val_loadermodel	optimizerZckpt_dirr   Zbest_val_loss
start_timeZglobal_stepr   Z
epoch_lossZ	n_batchesbatchimgZtraj_2dstart_kpvolume_logitsr   r   r   r   volr   r"   Zvis_volvis_imgkppanelsZ	gt_panelstr   probhmhm_normhm_coloroverlaypeakZgt_imgZgt_uvZvis_rowZgt_rowZvis_combinedZvis_dirZvis_rgbeZ	avg_trainr   Zval_nZavg_valelapsedr   r   r   main   sb  









$& "&"




,


r   __main__),re   r   jsonossysr   pathlibr   r   r   r   r4   torch.nnr9   torch.nn.functional
functionalr   torch.utils.datar   r   pathinsertdirname__file__r   r   r   ZORIGINAL_N_HEIGHT_BINSr   r   r   r   environgetr   r
   rf   r   r    r#   Moduler$   r   rb   r   r   r   r   <module>   s>    3 `
