o
    [2Äi.‚  ã                   @   s  d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlZddlZddlmZ ddlm  mZ ddlmZ ddlmZ ddlmZ ddlmZ ej deeeƒj ƒ¡ eeƒ !¡ j j d	 d
 Z"ej dee"ƒ¡ ddl#m$Z$ ddl%m&Z& ddl'm(Z( dZ)dZ*dZ+dZ,dZ-dZ.dZ/dZ0dZ1dZ2da3da4da5da6g d¢a7g d¢a8dZ9dZ:dZ;G dd„ dej<ƒZ=e.fdd „Z>d!d"„ Z?d#d$„ Z@d%d&„ ZAd'd(„ ZBd)d*„ ZCd+d,„ ZDd-d.„ ZEd/d0„ ZFd1d2„ ZGe*fd3d4„ZHd<d7d8„ZId9d:„ ZJeKd;kreJƒ  dS dS )=aO  Train UVA (MAR video model) + PARA action heads jointly on LIBERO.

Joint training: video diffusion loss + PARA volume/gripper/rotation CE losses.
UVA decoder tokens are upsampled to 64x64 and used as pixel-aligned features for PARA
heads (same architecture as DINO features in model.py, but with UVA backbone).

Usage:
    CUDA_VISIBLE_DEVICES=5 python libero/train_uva_para.py         --cache_root /data/libero/parsed_libero         --uva_checkpoint video_training/unified_video_action/checkpoints/simple_uva_libero_stride3_latest.pt         --log_wandb --run_name uva_para_libero_spatial
é    N)ÚPath)ÚSimpleNamespace)Ú
DataLoader)Ú	rearrange)Útqdm)ÚRotationZvideo_trainingZunified_video_action)ÚAutoencoderKL)Úmar_base_video_only)ÚCachedTrajectoryDataseté   é   iÀ  é@   gÃõ(\ÂÍ?é    i   é   ç        ç      ð?g      ð¿)çn†ðù!	Àr   r   )çn†ðù!	@r   r   g      @ç      à?c                       s>   e Zd ZdZeeeeef‡ fdd„	Z	d	dd„Z
dd„ Z‡  ZS )
Ú	ParaHeadsaw  Volume + gripper + rotation heads on MAR decoder tokens.

    Takes (B, T, S, C) decoder tokens, reshapes to spatial grid,
    upsamples 16->64 with convs, then:
      - volume_head: 1x1 conv -> (B, T, N_HEIGHT_BINS, 64, 64)
      - gripper/rotation MLPs indexed at query pixel (teacher forcing in train)
    Same architecture as the DINO feature processing in model.py.
    c                    sò   t ƒ  ¡  |}|| _|| _t tj||dddt ¡ tjddddtj||dddt ¡ tjddddtj||dddt ¡ ¡| _	t ||d¡| _
t t |¡t ||¡t ¡ t ||¡¡| _t t |¡t ||¡t ¡ t |d| ¡¡| _d S )Né   é   )Úpaddingé   ÚbilinearF)Úscale_factorÚmodeÚalign_corners)ÚsuperÚ__init__Úpara_out_sizeÚn_height_binsÚnnÚ
SequentialÚConv2dÚGELUÚUpsampleÚfeature_netÚvolume_headÚ	LayerNormÚLinearÚgripper_mlpÚrotation_mlp)ÚselfÚdecoder_dimr    r!   Zn_gripper_binsZ
n_rot_binsÚD©Ú	__class__© ú+/data/cameron/para/libero/train_uva_para.pyr   R   s$   
û"ÿ&
ÿzParaHeads.__init__Nc                 C   sf  |j \}}}}tt|d ƒƒ }}| || |||¡ dddd¡}	|  |	¡}
|  |
¡}| j}|
 |||||¡}| ||| j	||¡}d }}|dur­|d  
¡  d|d ¡}|d  
¡  d|d ¡}| ¡ }tj||
jd	 |d¡ ||¡}tj||
jd	 d|¡ ||¡}|||dd…||f }| || |¡}|  |¡ ||t¡}|  |¡ ||dt¡}||||fS )
a¢  
        Args:
            dec_tokens: (B, T, S, C) where S = MAR_GRID^2 = 256
            query_pixels: (B, T, 2) in PARA_OUT_SIZE coords [x, y]. Optional.
        Returns:
            volume_logits: (B, T, N_HEIGHT_BINS, out_size, out_size)
            feats: (B, T, D, out_size, out_size)
            gripper_logits: (B, T, N_GRIPPER_BINS) or None
            rotation_logits: (B, T, 3, N_ROT_BINS) or None
        r   r   r   r   r   N©.r   ©.r   ©Údevice)ÚshapeÚintÚroundÚreshapeÚpermuter'   r(   r    Úviewr!   ÚlongÚclampÚdetachÚtorchÚaranger7   Úexpandr+   ÚN_GRIPPER_BINSr,   Ú
N_ROT_BINS)r-   Ú
dec_tokensÚquery_pixelsÚBÚTÚSÚCZH_latZW_latÚxÚfeatsÚvolÚPÚfeats_5dÚvolume_logitsÚgripper_logitsÚrotation_logitsÚpxÚpyZ	feats_detÚ	batch_idxÚtime_idxÚindexedÚflatr2   r2   r3   Úforwardl   s(    

  zParaHeads.forwardc                 C   sî   |j dd… \}}|j d }|j d }|d  ¡  d|d ¡}|d  ¡  d|d ¡}tj||jd |d¡ ||¡}	tj||jd d|¡ ||¡}
||	|
dd…||f }| || |¡}|  	|¡ ||t
¡}|  |¡ ||d	t¡}||fS )
a.  Inference: predict gripper/rotation at given pixel locations.

        Args:
            feats_5d: (B, T, D, P, P)
            query_pixels: (B, T, 2) in PARA_OUT_SIZE coords
        Returns:
            gripper_logits: (B, T, N_GRIPPER_BINS)
            rotation_logits: (B, T, 3, N_ROT_BINS)
        Nr   éÿÿÿÿr4   r   r   r5   r6   r   )r8   r>   r?   rA   rB   r7   r=   rC   r;   r+   rD   r,   rE   )r-   rP   rG   rH   rI   rO   rK   rT   rU   rV   rW   rX   rY   rR   rS   r2   r2   r3   Úpredict_at_pixels‘   s   


  zParaHeads.predict_at_pixels©N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚDECODER_DIMÚPARA_OUT_SIZEÚN_HEIGHT_BINSrD   rE   r   rZ   r\   Ú__classcell__r2   r2   r0   r3   r   H   s    	þ
%r   c                 C   s<   | | || d  }|  dd¡}||d   ¡   d|d ¡S ©Nç:Œ0âŽyE>r   r   r   r   )r?   r>   )Zheight_valuesÚmin_hÚmax_hÚn_binsÚ
normalizedr2   r2   r3   Údiscretize_height­   ó   rl   c                 C   ó<   | | || d  }|  dd¡}|td   ¡   dtd ¡S rf   )r?   rD   r>   )Zgripper_valuesÚmin_gÚmax_grk   r2   r2   r3   Údiscretize_gripper³   rm   rq   c                 C   rn   rf   )r?   rE   r>   )Zeuler_valuesÚmin_rÚmax_rrk   r2   r2   r3   Údiscretize_rotation¹   rm   rt   c                 C   sú   | j \}}}}}|dd…dd…df  ¡  d|d ¡}|dd…dd…df  ¡  d|d ¡}	| d|d ¡}
g }t|ƒD ]9}| dd…|f  |d¡}|
dd…|f ||  |	dd…|f |  |dd…|f   ¡ }| tj||dd¡ q<t 	|¡ 
¡ S )z(CE over flattened (Nh*H*W) per timestep.Nr   r   r[   Úmean)Ú	reduction)r8   r>   r?   Úranger;   ÚappendÚFÚcross_entropyrA   Ústackru   )Zpred_volume_logitsÚtrajectory_2dÚtarget_height_binsrH   ÚNÚNhÚHÚWrT   rU   Zh_binÚlossesÚtZlogits_flatZ
target_idxr2   r2   r3   Úcompute_volume_loss¿   s   &&@r„   c                 C   s:   t |||ƒ}| j\}}}t |  || |¡| || ¡¡S r]   )rq   r8   ry   rz   r;   )Zpred_gripper_logitsZtarget_gripperro   rp   Útarget_binsrH   r~   ZNgr2   r2   r3   Úcompute_gripper_lossÍ   s   "r†   c                 C   s”   t |||ƒ}| j\}}}}g }	tdƒD ]/}
| d d …d d …|
d d …f  || |¡}|d d …d d …|
f  || ¡}|	 t ||¡¡ qt |	¡ 	¡ S )Nr   )
rt   r8   rw   r;   rx   ry   rz   rA   r{   ru   )Zpred_rotation_logitsZtarget_eulerrr   rs   r…   rH   r~   Ú_ZNrr‚   ÚaxisZlogits_axisZtarget_axisr2   r2   r3   Úcompute_rotation_lossÓ   s   ( r‰   c                 C   s  | j \}}}}}| j}tj||d|d}	tj|||tjd}
t|ƒD ]S}| dd…|f }|jdd\}}| |d¡jdd}|| }|| }| 	¡ |	dd…|df< | 	¡ |	dd…|df< |tj
||ddd…||f jdd|
dd…|f< q"tjd	d
||d}||
 ||  | }|	|fS )zFFrom volume (B, T, Nh, H, W) -> pred_2d (B, T, 2), pred_height (B, T).r   r6   )r7   ÚdtypeNr   ©Údimr[   r   r   r   )r8   r7   rA   Úzerosr>   rw   Úmaxr=   ÚargmaxÚfloatrB   Úlinspace)rQ   rh   ri   rH   r~   r   r€   r   r7   Úpred_2dZpred_height_binsrƒ   Zvol_tZ
max_over_hr‡   Zflat_idxrU   rT   Zbin_centersZpred_heightr2   r2   r3   Úextract_pred_2d_and_heightÞ   s    4r“   c                 C   sF   | j \}}}}}tj|  ||d¡dd |||||¡}|jddd S )zGvolume_logits (B, T, Nh, H, W) -> max-over-height heatmap (B, T, H, W).r[   r   r‹   r   )r8   ry   Úsoftmaxr;   r=   rŽ   )rQ   rH   rI   r   r€   r   Z	vol_probsr2   r2   r3   Úbuild_max_along_ray_heatmapsö   s   &r•   c                 C   sj   t | ƒ}| ¡ st|  }tdg d¢d}t| ¡ rt|ƒnd |d}| |¡ ¡  | 	¡ D ]}d|_
q-|S )Nr   )r   r   r   r   r   )Úvae_embed_dimZch_mult)Zautoencoder_pathÚddconfigF)r   Úis_absoluteÚUVA_ROOTr   r   ÚexistsÚstrÚtoÚevalÚ
parametersÚrequires_grad)Úvae_ckptr7   Ú	ckpt_pathr—   ÚvaeÚpr2   r2   r3   Ú	build_vae  s   r¤   c                    s8  t |ƒ}| ¡ s| ¡ st| }| ¡ std|› dƒ dS zddl}tj|||d}W n ty=   tj||dd}Y nw d|v rZ|d  	d	¡pO|d  	d
¡}dd„ | 
¡ D ƒ}nd
|v rc|d
 }ntdt| ¡ ƒ› ƒ‚|  ¡ ‰ ‡ fdd„| 
¡ D ƒ}ˆ  |¡ | jˆ dd tdt|ƒ› dt|ƒ› d|› ƒ dS )zGLoad pretrained UVA MAR weights (strict=False to allow new PARA heads).zUVA checkpoint not found: z; training from scratchNr   )Úmap_locationÚpickle_moduleF)r¥   Úweights_onlyZstate_dictsZ	ema_modelÚmodelc                 S   s(   i | ]\}}|  d ¡r|dd… |“qS )zmodel.é   N)Ú
startswith©Ú.0ÚkÚvr2   r2   r3   Ú
<dictcomp>   s   ( z'load_mar_checkpoint.<locals>.<dictcomp>z Unrecognized checkpoint format: c                    s.   i | ]\}}|ˆ v rˆ | j |j kr||“qS r2   )r8   r«   ©Úcurrentr2   r3   r¯   '  s   . )ÚstrictzLoaded ú/z MAR keys from )r   rš   r˜   r™   ÚprintÚdillrA   ÚloadÚ	ExceptionÚgetÚitemsÚKeyErrorÚlistÚkeysÚ
state_dictÚupdateÚload_state_dictÚlen)r¨   r¡   r7   rµ   ÚpayloadÚsdZmodel_sdZloadabler2   r°   r3   Úload_mar_checkpoint  s.   ÿ

&rÃ   c                 C   s–   | j d }| dd…d|…f }| ddddd¡}| || d|j d |j d ¡}tj|||fddd	}| ||d||¡}| ddddd¡}|d
 d S )zbConvert (B, N_WINDOW, H, W, 3) raw frames [0,1] to (B, 3, n_frames, uva_size, uva_size) in [-1,1].r   Nr   r   r   r   r   F)Úsizer   r   ç       @r   )r8   r<   r;   ry   Úinterpolate)Úrgb_frames_rawÚn_framesZuva_img_sizerH   Úframesr2   r2   r3   Úframes_to_video_tensor-  s   
 rÊ   éô  é*   c              	   C   s*  t  |¡}t|t| ƒƒ}| tt| ƒƒ|¡}g g g }}}t|dddD ]9}	z| |	 }
W n	 ty6   Y q%w |
d  ¡ }| 	|dd…df  
¡ ¡ | 	|
d  ¡  
¡ ¡ | |
d  ¡ ¡ q%t |¡}t |¡}tj|d	d
}t| ¡ ƒt| ¡ ƒt| ¡ ƒt| ¡ ƒ|jd	d
 
¡ |jd	d
 
¡ dœS )z/Scan dataset for height/gripper/rotation range.zComputing statsF©ÚdescÚleaveÚtrajectory_3dNr   Útrajectory_gripperÚtrajectory_eulerr   ©rˆ   )Ú
min_heightÚ
max_heightÚmin_gripperÚmax_gripperÚmin_rotÚmax_rot)ÚrandomÚRandomÚminrÀ   Úsamplerw   r   r·   ÚnumpyÚextendÚtolistrx   ÚnpÚarrayÚconcatenater   rŽ   )ÚdatasetZsample_limitÚseedÚrngÚnÚindicesZall_heightsZall_grippersZ
all_eulersÚidxrÝ   Zt3dZheightsZgrippersZeulersr2   r2   r3   Úcompute_dataset_stats=  s*   
ÿ

ýrê   c            f         sÈ  t jdd} | jdtdd | jdtdd | jdtd	d | jd
tddd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jddd  | jd!td"d#d | jd$td%d&d | jd'td(d)d | jd*td+d | jd,td-d | jd.td-d/d | jd0dd1d2 | jd3td4d5d |  ¡ }t |j¡}t	t
ƒj}|d6 |j }|jd7d7d8 |jrãd9d l}|jdt|ƒ|jd:d; t|j|ƒ}ttd<d=d<d>d?d@dA |¡}t||j|ƒ tttdB |¡}tdCtdDdE„ | ¡ D ƒƒdF›dGƒ d }	|jr4|j  ¡  !¡ d	kr4dHdI„ |j "dF¡D ƒ}	t#|j$|j%|	t&t'|j(dJ}
tdKt)|
ƒ› dLƒ t*d=tt)|
ƒ|j+ ƒƒ}t)|
ƒ| }tj,j-j.|
||gt /¡  0dM¡dN\}}tdOt)|ƒ› dPt)|ƒ› ƒ t1||j2d7|j3d7dQ}t1||j2dR|j3d7dQ}|dS }| 4¡ r¿t5|ƒ}t6 7|¡}W d   ƒ n	1 s²w   Y  tdT|› ƒ n)t8|
ƒ}t5|dUƒ}t6j9||dVdW W d   ƒ n	1 sÜw   Y  tdX|› ƒ |dY a:|dZ a;|d[ a<|d\ a=|d] a>|d^ a?tj@t>tjA|d_}tj@t?tjA|d_}td`t:da›dbt;da›dcƒ tddt<da›dbt=da›dcƒ tdet>› dft?› ƒ tt& }|jBrX| ¡ D ]}dR|_CqB| D¡  tE| ¡ ƒ}tdgƒ ntE| ¡ ƒtE| ¡ ƒ }tjFjG||jHddh}d9}|jIr¬t	|jIƒ 4¡ r¬tj7|jI|di}| J|dj ¡ | J|dk ¡ | J|dl ¡ | Kdmd9¡d= }tdn|jI› do|› ƒ d9}tdpƒ}tLtM||jNƒdqdrD ]‘}|jBsÇ| O¡  | O¡  d9d9d9d9d9dsœ}d9} tL|dt|› dRdu}!|!D ]­}"|"dv  |¡}#tP|#t'tƒ}$|$jQ\}%}&}'}(})|"dw  |¡d d …d t'…f }*|"dx  |¡d d …d t'…f }+|"dy  |¡d d …d t'…f },|"dz  |¡d d …d t'…f }-|*| }.|. Rd9td{ ¡}.|+d d …d d …dVf }/tS|/t:t;ƒ}0tT|$d|ƒ}1t U¡  | V|1 ¡ ¡}2|2 W¡ tX }3W d   ƒ n	1 stw   Y  tT|3d}|%d~}3tT|3dƒ}4| Y|4¡}5tT|5d€|%d~}5|5d d …d d=…f  Zd|'dd¡}6|jBs¬| [|5|6¡}7ntj@d‚|dƒ}7|j\|5|6d d„}8||8|.d…\}9}:};}<t]|9|.|0ƒ}=t^|;|,t<t=ƒ}>t_|<|-||ƒ}?|=t`|>  ta|?  }@|jb|7 |jc|@  }A| d¡  |A e¡  tjfj, g|d-¡ | h¡  |d†  |A i¡ 7  < |d‡  |7 i¡ 7  < |dˆ  |= i¡ 7  < |d‰  |> i¡ 7  < |dŠ  |? i¡ 7  < | d=7 } |!jj|7 i¡ d‹›|= i¡ d‹›|> i¡ d‹›|? i¡ d‹›dŒ |jrod9d l}|jk|A i¡ |7 i¡ |= i¡ |> i¡ |? i¡ dœ|dŽ ||jl d9krX|jrXd9d l}| D¡  | D¡  t U¡ » tm|9d d=… ƒ}B|Bd9  n¡  o¡ }C|#d  n¡  o¡ }Dtp q|Dttf¡}Ed9d lr}F|F sd¡ d9d ltmu}G |GjvdVt'dt' d?fd‘\}H}ItMt'ƒD ]™}J|Id9|Jf }K|C|J }L|L|L w¡  |L *¡ |L w¡  d’  }M|Ed“ txjy|Mtx z|M¡tx z|M¡gdd”d“  }N|K {tx ||Nd9d=¡¡ |Kj}|.d9|Jd9f  n¡  i¡ |.d9|Jd=f  n¡  i¡ d•d–d—dVd˜ |K ~d™|J› dš¡ |K d›¡ |#d9|Jf  n¡  o¡ }Otp q|Ottf¡}P|Id=|Jf  {|P¡ |Id=|Jf  ~d™|J› dœ¡ |Id=|Jf  d›¡ qÎ|G €¡  t ‚¡ }Q|Gjƒ|Qdd"dž |Q „d9¡ d9dŸl…m†}R |jkd | †|R 5|Q¡¡i|dŽ |G ‡d	¡ ||jld¡  d9kr@d9d lˆ}S|$d d=…d d …d9f }T| V|T ¡ ¡}U|U W¡ tX }V|V ‰d=¡ Zd=t'ddd¡}W|jŠd=|W|j‹d-d¢d£\}X}Y| Œ|XtX ¡}Z|Z d=t'dtt¡}Z|Zd9  n¡ d- d¤  Rd9d=¡}[tŽjd¥dRd¦}|j}\W d   ƒ n	1 s
w   Y  |[ ‘d9dVdd=¡ o¡ d§  ’d¨¡}]|Sjj“|\t ”|]¡dd© |jkdª|j•|\d«d¬i|dŽ t	|\ƒj–d7d­ W d   ƒ n	1 sKw   Y  | O¡  | O¡  |d9krŠ||j— d9krŠ||| ˜¡ | ˜¡ | ˜¡ |t:t;t<t=t>t?d®œ}^t ™|^|d¯ ¡ td°|› d±ƒ |d=7 }qát*d=| ƒ‰ ‡ fd²d³„| š¡ D ƒ}_tdt|› d´|_d† da›dµ|_d‡ da›d¶|_dˆ da›d·|_d‰ da›d¸|_dŠ da›ƒ | D¡  | D¡  d9d9d9d¹œ}`d9}at U¡ û tL|dºdRduD ]ë}"|"dv  |¡}#tP|#t'tƒ}$|$jQd9 }%|"dw  |¡d d …d t'…f }*|"dx  |¡d d …d t'…f }+|"dy  |¡d d …d t'…f },|"dz  |¡d d …d t'…f }-|*| }.|. Rd9td{ ¡}.tS|+d d …d d …dVf t:t;ƒ}0tT|$d|ƒ}1| V|1 ¡ ¡}2|2 W¡ tX }3tT|3d}|%d~}3tT|3dƒ}4| Y|4¡}5tT|5d€|%d~}5|5d d …d d=…f  Zdt'dd¡}6|j\|5|6d d„}8||8ƒ\}9}:}Y}Yt]|9|.|0ƒ}=t›|9t:t;ƒ\}b}Ytjœ|b|. dd» ¡  i¡ }c|`dˆ  |= i¡ |% 7  < |`d¼  |c|% 7  < |a|%7 }aqáW d   ƒ n	1 sØw   Y  t*d=|aƒ}a|`dˆ |a }d|`d¼ |a }etd½|dda›d¾|ed¿›dÀƒ |jrd9d l}|jk||_d† |_d‡ |_dˆ |_d‰ |_dŠ |d|edÁœ|dŽ |d|k rM|d}||| ˜¡ | ˜¡ | ˜¡ |t:t;t<t=t>t?d®œ}^t ™|^|dÂ ¡ tdÃ|dda›dÄ|› d±ƒ q¼|jr[d9d l}| ž¡  tdÅ|› ƒ d S )ÆNzTrain UVA + PARA on LIBERO)Údescriptionz--cache_rootz/data/libero/parsed_libero)ÚtypeÚdefaultz--benchmarkÚlibero_spatialz
--task_idsÚallz--uva_checkpointz/checkpoints/simple_uva_libero_stride3_latest.ptzPretrained UVA MAR checkpoint)rì   rí   Úhelpz
--vae_ckptzpretrained_models/vae/kl16.ckptz--batch_sizer   z--lrg-Cëâ6?z--epochsrË   z--frame_strider   z	--workersé   z--deviceÚcudaz
--run_nameZuva_para_liberoz--log_wandbÚ
store_true)Úactionz--vis_everyéd   zSteps between vis updatesz--checkpoint_everyiè  zSave checkpoint every N stepsz
--num_iterr   z Diffusion sampling steps for visz--val_splitgš™™™™™©?z--video_loss_weightr   z--para_loss_weightz8Weight for total PARA loss (volume + gripper + rotation)z--freeze_marz*Freeze MAR backbone, only train PARA heads)rô   rð   z--resumeÚ zPath to resume checkpointÚcheckpointsT)ÚparentsÚexist_okr   Zonline)ÚprojectÚconfigÚnamer   r   r   Ú100r©   i   )Úimg_sizeÚ
vae_strideÚ
patch_sizer–   Únum_sampling_stepsÚ
diffloss_dÚ
diffloss_w)r.   r    zParaHeads: c                 s   s    | ]}|  ¡ V  qd S r]   )Únumel)r¬   r£   r2   r2   r3   Ú	<genexpr>  s   € zmain.<locals>.<genexpr>ú,z paramsc                 S   s   g | ]}t |ƒ‘qS r2   )r9   )r¬   rL   r2   r2   r3   Ú
<listcomp>’  s    zmain.<locals>.<listcomp>)Ú
cache_rootZbenchmark_nameÚtask_idsÚ
image_sizeÚn_windowÚframe_stridez	Dataset: z samplesrÌ   )Ú	generatorzTrain: z, Val: )Ú
batch_sizeÚshuffleÚnum_workersÚ
pin_memoryFzdataset_stats.jsonzLoaded stats from Úwr   )ÚindentzSaved stats to rÔ   rÕ   rÖ   r×   rØ   rÙ   )rŠ   r7   z	Height: [z.4fz, ú]z
Gripper: [z
Rotation: z .. u0   Frozen MAR backbone â€” only training PARA heads)ÚlrÚweight_decay)r¥   Úmar_state_dictÚpara_heads_state_dictÚoptimizer_state_dictÚepochzResumed from z
 at epoch ÚinfZEpochs)rÎ   )ÚtotalÚvideoÚvolumeÚgripperÚrotationzEpoch rÍ   rÇ   r|   rÐ   rÑ   rÒ   gj¼t“ð?zb c t h w -> (b t) c h wz(b t) c h w -> b t c h w)Úbzb t c h w -> (b t) c h wz(b t) s c -> b t s cr[   r   r6   )Úmask)rG   r  r  r  r  r   z.3f)ÚvidrN   ÚgripÚrot)ztrain_step/total_lossztrain_step/video_lossztrain_step/volume_lossztrain_step/gripper_lossztrain_step/rotation_loss)Ústep)r   r   ZAgg)Úfigsizerg   r   rÓ   Zcyané(   rL   )ÚcÚsÚmarkerZ
linewidthszt=z heatmapÚoffz	 GT frameÚpng)ÚformatÚdpi)ÚImagezvis/heatmap_and_framesé   gffffffî?)ÚbszÚcondÚnum_iterÚcfgÚtemperaturerÅ   z.mp4)ÚsuffixÚdeleteéÿ   Úuint8)Úfpszvis/predicted_videoZmp4)r.  )Ú
missing_ok)r  Úglobal_stepr  r  r  ÚstatsrÔ   rÕ   rÖ   r×   rØ   rÙ   z
latest.pthz  Saved latest.pth (step ú)c                    s   i | ]	\}}||ˆ  “qS r2   r2   r«   ©rç   r2   r3   r¯     s    zmain.<locals>.<dictcomp>z: total=z vid=z vol=z grip=z rot=)r  r  Úpixel_errorZValr‹   rA  z  Val: volume=z, pixel_error=z.2fzpx (in 64-space))r  ztrain/total_lossztrain/video_lossztrain/volume_lossztrain/gripper_lossztrain/rotation_losszval/volume_losszval/pixel_error_64zbest.pthz  Saved best (val_vol=z, step zDone. Checkpoints at )ŸÚargparseÚArgumentParserÚadd_argumentr›   r9   r   Ú
parse_argsrA   r7   r   Ú__file__ÚparentZrun_nameÚmkdirZ	log_wandbÚwandbÚinitÚvarsr¤   r    r	   ÚUVA_IMG_SIZErœ   rÃ   Zuva_checkpointr   rb   rc   r´   Úsumrž   r	  ÚstripÚlowerÚsplitr
   r  Ú	benchmarkÚPRERENDER_SIZEÚN_FRAMESr  rÀ   rŽ   Z	val_splitÚutilsÚdataÚrandom_splitÚ	GeneratorÚmanual_seedr   r  Zworkersrš   ÚopenÚjsonr¶   rê   ÚdumpÚ
MIN_HEIGHTÚ
MAX_HEIGHTÚMIN_GRIPPERÚMAX_GRIPPERÚMIN_ROTÚMAX_ROTÚtensorÚfloat32Z
freeze_marrŸ   r   r»   ÚoptimÚAdamWr  Zresumer¿   r¸   r   rw   ÚepochsÚtrainrÊ   r8   r?   rl   r   Úno_gradÚencoderÝ   ÚLATENT_SCALEÚpatchifyrC   Úcompute_lossÚforward_decode_tokensr„   r†   r‰   ÚGRIPPER_LOSS_WEIGHTÚROTATION_LOSS_WEIGHTZvideo_loss_weightZpara_loss_weightÚ	zero_gradÚbackwardr"   Úclip_grad_norm_r&  ÚitemÚset_postfixÚlogZ	vis_everyr•   ÚcpurÞ   Úcv2ÚresizeÚ
matplotlibÚuseÚmatplotlib.pyplotÚpyplotÚsubplotsrÜ   rá   r{   Ú
zeros_likeÚimshowÚclipÚscatterÚ	set_titlerˆ   Útight_layoutÚioÚBytesIOÚsavefigÚseekÚPILr0  ÚcloseÚtorchvisionÚ	unsqueezeZsample_tokensr4  Údecoder=   ÚtempfileÚNamedTemporaryFilerü   r<   ÚastypeZwrite_videoÚ
from_numpyZVideoÚunlinkZcheckpoint_everyr½   Úsaver¹   r“   Únormru   Úfinish)fr£   Úargsr7   Z
script_dirZckpt_dirrI  r¢   ÚmarZ
para_headsr	  rä   Zval_sizeZ
train_sizeZtrain_datasetZval_datasetZtrain_loaderZ
val_loaderZ
stats_pathÚfr>  Úmin_r_tÚmax_r_tÚcoord_scaleZp_paramÚ
all_paramsÚoptZstart_epochZckptr=  Zbest_val_lossr  Zepoch_lossesZ	n_batchesÚpbarÚbatchrÇ   r  rH   rK   rI   r€   r   Útraj_2dÚtraj_3dÚtraj_gripperÚ
traj_eulerÚ	traj_paraZtarget_heightr}   Úframes_flatZ	posteriorÚzZz_flatÚx_tokensÚcond_tokensZ
video_lossrF   rQ   rM   rR   rS   Zvolume_lossZgripper_lossZrotation_lossZ	para_lossZ
total_lossZheatmapsZheatmaps_npZinput_frameZinput_frame_smallry  ÚpltÚfigÚaxesrƒ   ÚaxZheatZ	heat_normZoverlayZgt_frameZgt_smallÚbufr0  rŠ  Zfirst_frameZ
posterior0Úz0r3  Útokensr‡   ÚpredZpred_npZtmp_pathZ	frames_npZ	ckpt_dataÚavgZ
val_lossesZval_nr’   Z	pixel_errZval_volZval_pxr2   r@  r3   Úmain\  s¢  þ
ÿÿ
þýÿþ$ú
ÿ
ÿ
ÿ

ÿÿ




þ

 ÿ



üûú	
 ,ý



ÿÿ€Â@ø

"ÿÿÿ




 áÿ
"ø	÷
ø
€r±  Ú__main__)rË   rÌ   )Lra   rB  r„  rZ  rÚ   Úsysr  Úpathlibr   Útypesr   rw  rÞ   rá   rA   Útorch.nnr"   Ztorch.nn.functionalÚ
functionalry   Útorch.utils.datar   Úeinopsr   r   Zscipy.spatial.transformr   ZScipyRÚpathÚinsertr›   rF  rG  Úresolver™   Zsimple_uva.vaer   Úsimple_uva.modelr	   rU  r
   rS  rL  rR  rc   rj  rd   rD   rE   rb   ZMAR_GRIDr\  r]  r^  r_  r`  ra  rn  ro  ZPARA_LOSS_WEIGHTÚModuler   rl   rq   rt   r„   r†   r‰   r“   r•   r¤   rÃ   rÊ   rê   r±  r^   r2   r2   r2   r3   Ú<module>   s~    e 
   


ÿ