o
    ¢?Ñi€  ã                   @   sb  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlZddlZddlZddlmZ ddlm  mZ ddlZddlmZmZmZ ddlZddlZddlmZ ddlmZ ddlmZm Z  ddl!m"Z" dd	l#m$Z$ dd
l%m&Z' ddl(Z(ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ej/ 0de1e
e2ƒj3ƒ¡ ej/ 0dd¡ ddl4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:Z:ddl;Z;ddl<Z<ee=ddZ>dZ?dZ@dZAdZBdZCdZDdZEd aFaGd aHaId aJaKG dd„ dejLƒZMdd„ ZNdd„ ZOd d!„ ZPd"d#„ ZQdd$d%ejRfd&d'„ZSd(d)„ ZTd6d,d-„ZUG d.d/„ d/eƒZVd7d1d2„ZWd3d4„ ZXe=d5kr/eXƒ  dS dS )8uò  Joint SVD video diffusion + PARA action prediction training.

During each training step:
  1. UNet forward pass with random noise level (standard diffusion training)
  2. Hook into up_block_1 (1280ch, 20x36) and up_block_2 (640ch, 40x72)
  3. Project each to 128 dims via linear, concat â†’ 256ch
  4. Bilinear upsample to 64x64, conv refinement, PARA heads
  5. Loss = EMA-weighted(volume_loss, gripper_loss, diffusion_loss)

Usage:
    CUDA_VISIBLE_DEVICES=0,3,5,8 python train_svd_para_joint.py
é    N)ÚPath)ÚDatasetÚ
DataLoaderÚRandomSampler)ÚAccelerator)Ú
get_logger)ÚProjectConfigurationÚset_seed)Útqdm)ÚImage)ÚRotation)ÚAutoencoderKLTemporalDecoder)Úget_scheduler)ÚCLIPImageProcessorÚCLIPVisionModelWithProjectionz/data/cameron/para_videopolicy)ÚStableVideoDiffusionPipeline)Ú UNetSpatioTemporalConditionModel)ÚCachedTrajectoryDatasetÚINFO)Ú	log_levelé@   é    éÀ  g{®Gáz„?é€   g        c                       s2   e Zd ZdZdeef‡ fdd„	Zddd„Z‡  ZS )	ÚParaHeadsOnUNetz?PARA heads that attach to SVD UNet's up_block_1 and up_block_2.é   c                    s¼   t ƒ  ¡  || _t d|d¡| _t d|d¡| _|d }t tj||dddt ¡ tj||dddt ¡ tj||dddt ¡ ¡| _	t ||d¡| _
t |td¡| _t |dt d¡| _d S )Ni   r   i€  é   é   )Úpadding)ÚsuperÚ__init__Ún_height_binsÚnnÚConv2dÚproj_block1Úproj_block2Ú
SequentialÚGELUÚfeature_convsÚvolume_headÚN_GRIPPER_BINSÚgripper_headÚ
N_ROT_BINSÚrotation_head)ÚselfÚn_windowr!   Zproj_dimÚD©Ú	__class__© úH/data/cameron/vidgen/svd_motion_lora/Motion-LoRA/train_svd_para_joint.pyr    Q   s   
ýzParaHeadsOnUNet.__init__Nc                 C   s0  t }|  |¡}tj|||fddd}|  |¡}tj|||fddd}tj||gdd}|  |¡}|  |¡}d }	}
|dur“|j	d }|dd…df  
¡  d|d ¡}|dd…df  
¡  d|d ¡}tj||jd}|  | ¡ ¡}||dd…||f }	|  | ¡ ¡}||dd…||f }| |d	t¡}
||	|
fS )
a  
        Args:
            feat_block1: (B*T, 1280, 20, 36) from up_block_1
            feat_block2: (B*T, 640, 40, 72) from up_block_2
            query_pixels: (B*T, 2) in PARA_OUT_SIZE coords [x, y]
        Returns:
            volume_logits: (B*T, N_HEIGHT_BINS, P, P)
            gripper_logits: (B*T, N_GRIPPER_BINS) or None
            rotation_logits: (B*T, 3, N_ROT_BINS) or None
        ÚbilinearF©ÚsizeÚmodeÚalign_cornersr   ©ÚdimNr   ©Údevicer   )ÚPARA_OUT_SIZEr$   ÚFÚinterpolater%   ÚtorchÚcatr(   r)   ÚshapeÚlongÚclampÚaranger=   r+   Údetachr-   Úviewr,   )r.   Zfeat_block1Zfeat_block2Úquery_pixelsÚPÚf1Úf2ZfeatsÚvolZgripper_logitsZrotation_logitsÚBTÚpxÚpyÚidxZgrip_mapZrot_mapZ	rot_at_pxr3   r3   r4   Úforwardd   s(   




  
zParaHeadsOnUNet.forward©N)	Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚN_HEIGHT_BINSÚPROJ_DIMr    rR   Ú__classcell__r3   r3   r1   r4   r   N   s    r   c                 C   s8   | | || d  }|  dd¡|d   ¡   d|d ¡S )Nç:Œ0âŽyE>r   r   )rE   rD   )ÚvaluesZmin_vZmax_vÚn_binsÚnormr3   r3   r4   Ú
discretize”   s   $r_   c                 C   s’   | j \}}}}|dd…df  ¡  d|d ¡}|dd…df  ¡  d|d ¡}| d|d ¡}	|  |d¡}
|	||  ||  |  ¡ }t |
|¡S )zBvol_logits: (BT, Nh, H, W), traj_2d: (BT, 2), target_h_bins: (BT,)Nr   r   éÿÿÿÿ)rC   rD   rE   Úreshaper?   Úcross_entropy)Ú
vol_logitsÚtraj_2dÚtarget_h_binsrN   ÚNhÚHÚWrO   rP   Zh_binZlogits_flatÚ
target_idxr3   r3   r4   Úcompute_volume_loss˜   s     rj   c                 C   s   t |||tƒ}t | |¡S rS   )r_   r*   r?   rb   )ÚlogitsÚtargetÚmin_gÚmax_gZtarget_binsr3   r3   r4   Úcompute_gripper_loss¢   s   ro   c              
   C   sf   g }t dƒD ]%}t|dd…|f || || tƒ}| t | dd…|dd…f |¡¡ qt |¡ ¡ S )z*logits: (BT, 3, Nr), target_euler: (BT, 3)r   N)	Úranger_   r,   Úappendr?   rb   rA   ÚstackÚmean)rk   Ztarget_eulerZmin_rZmax_rÚlossesÚaxisÚbinsr3   r3   r4   Úcompute_rotation_loss¦   s
   "&rw   ç      ð?Úcpuc                 C   s0   t j| ||dd d }t j ||¡ |¡ ¡ S )N©Údtyper=   g•5 ”ÿÿï?gH¯¼šò×z>)rA   ÚrandÚdistributionsÚNormalÚicdfÚexp)rC   ÚlocÚscaler=   r{   Úur3   r3   r4   Úrand_log_normal²   s   r„   c           	      C   sz   | }|j \}}}}}| ddddd¡ || |||¡}| |¡j ¡ }|j||g|j dd… ¢R Ž }| ddddd¡}|S )z4t: (B, C, T, H, W) -> latents (B, C_latent, T, h, w)r   r   r   r   é   N)rC   Úpermutera   ÚencodeÚlatent_distÚsample)	ÚtÚvaeÚvideoÚbÚcÚfÚhÚwÚlatentsr3   r3   r4   Útensor_to_vae_latent¶   s   "r“   ÚbicubicTc                 C   sÄ  | j dd … \}}||d  ||d  f}t|d d d dƒt|d d d dƒf}ttd|d  dƒƒttd|d  dƒƒf}|d d	 dkrS|d d |d f}|d d	 dkre|d |d d f}tjjj| |d d	 gd	 |d d	 gd	  d
d} tjd| j| j	d}	t
||ƒD ]2\}
}tj|| j| j	d|d d	  }t |d	  d	|
d	   ¡}|| ¡  }|	 d¡| d¡ }	qŽ|	 | j d dd¡ d¡}	tj| |	| j d d} tj| |||dS )Néþÿÿÿr   r   rx   ç       @gü©ñÒMbP?g      @r   r   Úreflect)r8   rz   r`   éýÿÿÿ)Úgroupsr6   )rC   ÚmaxÚintrA   r"   Ú
functionalÚpadÚonesr{   r=   ÚziprF   r€   ÚsumÚ	unsqueezeÚexpandr?   Úconv2dr@   )Úinputr7   Úinterpolationr9   r   r‘   ÚfactorsÚsigmasÚksÚkernelÚsÚkÚcoordÚgr3   r3   r4   Ú_resize_with_antialiasingÀ   s    ,,$$4 r®   c                   @   s.   e Zd ZdZ		ddd„Zd	d
„ Zdd„ ZdS )ÚVideoParaDatasetzQWraps CachedTrajectoryDataset to provide video frames for SVD + PARA annotations.é@  é@  é   r   r   c	           	      C   s2   t ||||||d| _|| _|| _|| _|| _d S )N)Ú
cache_rootZbenchmark_nameÚtask_idsÚ
image_sizer/   Úframe_stride)r   Úpara_datasetÚwidthÚheightÚsample_framesrµ   )	r.   r³   Ú	benchmarkr´   r¸   r¹   rº   rµ   r¶   r3   r3   r4   r    Ù   s   ý
zVideoParaDataset.__init__c                 C   s
   t | jƒS rS   )Úlenr·   )r.   r3   r3   r4   Ú__len__å   s   
zVideoParaDataset.__len__c                 C   sˆ   | j | }|d }|jd }| dddd¡}tj|| j| jfddd}|d	 d
 }| dddd¡}||d |d |d |d |d dœS )NÚrgb_frames_rawr   r   r   r   r5   Fr6   r–   rx   Útrajectory_2dÚtrajectory_3dÚtrajectory_gripperÚtrajectory_euler)Úpixel_valuesr¿   rÀ   rÁ   rÂ   r¾   )r·   rC   r†   r?   r@   r¹   r¸   )r.   rQ   r‰   Z
rgb_framesÚTÚframesZframes_resizedrŒ   r3   r3   r4   Ú__getitem__è   s    

ÿúzVideoParaDataset.__getitem__N)r°   r±   r²   r   r   )rT   rU   rV   rW   r    r½   rÆ   r3   r3   r3   r4   r¯   Ö   s    
ÿr¯   éô  c                 C   s  t  d¡}| tt| ƒƒt|t| ƒƒ¡}g g g }}}t|dddD ]4}z| j| }W n   Y q#| |d  	¡ d d …df  
¡ ¡ | |d  	¡  
¡ ¡ | |d  	¡ ¡ q#t |¡t |¡t |d	¡}	}
}t|	 ¡ ƒt|	 ¡ ƒt|
 ¡ ƒt|
 ¡ ƒ| d	¡ 
¡ | d	¡ 
¡ d
œS )Né*   ÚStatsF)ÚdescÚleaverÀ   r   rÁ   rÂ   r   )Ú
min_heightÚ
max_heightÚmin_gripperÚmax_gripperÚmin_rotÚmax_rot)ÚrandomÚRandomr‰   rp   r¼   Úminr
   r·   ÚextendÚnumpyÚtolistrq   ÚnpÚarrayÚconcatenateÚfloatrš   )ÚdatasetÚnÚrngÚindicesZall_hZall_gZall_erQ   rª   r   r­   Úer3   r3   r4   Úcompute_dataset_stats  s   

"$þrá   c            ˆ         s*  t  ¡ } | jdtdd | jdtdd | jdtdd | jdtd	d | jd
tdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdddd | jdtdd | jdtdd | jdtd d | jd!td"d | jd#td$d |  ¡ }d%d&lm} |d'd(}t	d)d|gd*}|j
}t|jƒ |jrÄtj|jd'd+ tjd,t|jƒjt|ƒd- tj}tj|jd.d/j||d0}| d1¡ tj|jd2d/j||d0‰ˆ d1¡ tj|jd3d/}tj|j d4|j vrþd4nd d/j||d0}	|	 !¡  t"dd5j|tj#d0}
i ‰ ‡ fd6d7„}|	j$d  %|d8ƒ¡ |	j$d9  %|d:ƒ¡ ‡fd;d<„}d=d>„ |j& 'd?¡D ƒ}t(|j)|j*||j+|j,|j-|j.d@}t|jƒdA }| /¡ rht0 1t2|ƒ¡}nt3|ƒ}|jr{t0j4|t2|dBƒd9dC |dD |dE a5a6|dF |dG a7a8|dH |dI a9a:tj;t9|dJ}tj;t:|dJ}t<t= }t> ?dKt@|ƒ› dLt5dM›dNt6dM›dO¡ tA||jBd'dPd'd'dQ}|jCrä|	 d1¡ |	 D¡  |
 E¡ |jFdR dSœg}t> ?dT¡ n|	 E¡ |jFdU dSœ|
 E¡ |jFdR dSœg}tjGjH|dVdW}tIdX|d%|jJdY}| K|	|
|||¡\}	}
}}}i ‰d%}tLtM|jJƒ|j dZ}tMd[ƒD ]Z}|D ]K}||jJkr; nA| N|	|
¡„ |d\  |¡}|jO\}}}} }!|d]  |¡d d …d |…f }"|d^  |¡d d …d |…f }#|d_  |¡d d …d |…f }$|d`  |¡d d …d |…f }%|"|  Pd%t<da ¡}&tQ|#d d …d d …d9f t5t6tRƒ}'|d d …d d …d%d…f }(tS||ƒ})t T|)¡}*|)jOd% }+tU|+gdbdcdd |)¡},|,d d …d d d d f }-t T|(¡|- |( }.tS|.|ƒ}/|/d d …d d …d%d…f }/|/|jVjW }/tU|+gdedfdd |)j
¡}0|0d d …d d d d f }1|)|*|1  }2t Xdgd>„ |0D ƒ¡ |¡}3|2|1d9 d dc  }4||d d …d d …d%f  ¡ ƒ}5tjYt ;dhg¡ Z|+¡t ;dig¡ Z|+¡|, [¡ gddjj||d0}6|/ Zdkdk|dkdk¡}7tj\|4|7gddj}8|8 ]d%d9ddldP¡}8ˆ  ^¡  |	|8|3|5|6dmj_}9|*}:|1 ]d%d9ddldP¡};|; |;d9 d dc  }<d|;d9 d  }=|9|< |=|2 ]d%d9ddldP¡  }>|) ]d%d9ddldP¡}?d|1 ]d%d9ddldP¡d9  |1 ]d%d9ddldP¡d9  }@|@|>|? d9   `¡ }Aˆ d8  ¡ }Bˆ d:  ¡ }C|& a|| d9¡}D|' a|| ¡}E|$ a|| ¡}F|% a|| dl¡}G|
|B|C|Ddn\}H}I}Jtb|H|D|Eƒ}Ktc|I|Ft7t8ƒ}Ltd|J|G||ƒ}M|K e¡ |L e¡ |A e¡ doœ}N|ND ]}O|Oˆvr@|N|O ˆ|O< dtf ˆ|O  tf|N|O   ˆ|O< q3‡fdpd>„|ND ƒ}P|Prytg‡fdqdr„|PD ƒƒ‰t@|Pƒ‰‡‡‡fdsdt„|PD ƒ}Qndudt„ |ND ƒ}Q|Q hdvd¡|K |Q hdwd¡|L  |Q hdxd¡|A  }R| i|R¡ |jjr³tk|	 E¡ ƒtk|
 E¡ ƒ }S| l|Sdy¡ | m¡  | m¡  | n¡  W d   ƒ n	1 sÊw   Y  |jrq|dz d%kr tjo|R e¡ |K e¡ |L e¡ |M e¡ |A e¡ |Q hdvd¡|Q hdwd¡|Q hdxd¡d{œ|d| |jp|K e¡ dM›|L e¡ dM›|A e¡ dM›do ||jq d%kr!|d%ks&|dzkr,zÝ| r|	¡}T| r|
¡}U|Hd |…  s¡  ¡ }Vttju|V a|dk¡ddj}W|W v|tRt<t<¡}W|Wjwddjd%  [¡  x¡ }X|Wjwddjd%  v|dk¡jyddj}Y|Yt< }Z|Yt< }[d%d lz}\|\ {¡ }]|T |tj} ~|]d4¡¡ tj|]d4tjd}}^t€j|j|^tjd~d}_|_ |¡ i ‰‡fd€d„}`|^j$d  %|`d‚ƒ¡}a|^j$d9  %|`dƒƒ¡}b|d„ d…  [¡  x¡ }ct ‚|cd†  ƒt„j…¡¡ †|j+|j,f¡}dt ‡¡  |_|d|j,|j+|j-dPd‡dˆjˆd% }eW d   ƒ n	1 söw   Y  |a ‰¡  |b ‰¡  d }fd  }g}hd‚ˆv rzdƒˆv rzt Š¡  |Uˆd‚ ˆdƒ ƒ\}i}j}jW d   ƒ n	1 s/w   Y  t‹|ijOd% t@|eƒ|ƒ}kttju|id |k…  a|kdk¡ddj}l|l v|ktRt<t<¡}l|ljwddjd%  [¡  x¡ }f|ljwddjd%  v|kdk¡jyddj}m|mt< }h|mt< }g~_~^tŒj|]d'd‰ ˆ ^¡  tjŽ ¡  g }nt‹|t@|eƒdƒ}otM|oƒD ]B}pt|"d%|pd%f  e¡ ƒ}qt|"d%|pdf  e¡ ƒ}r|d„ d%|pf  [¡  x¡ d†  ƒt„j…¡ ¡ }s|X|p }t|t|t ‹¡  |t w¡ |t ‹¡  dŠ  }ut‘ †|ud‹¡}vt‘ ’|vd†  ƒt„j…¡t‘j“¡}wt‘ ”|wt‘j•¡}w|sdc |wdc   ƒt„j…¡}xt‘ –|x|q|rfdŒddl¡ t|[|p  e¡ | ƒ}yt|Z|p  e¡ | ƒ}zt‘ –|x|y|zfdŒdŽdl¡ t‘ —|xd|p› dd‘t‘j˜dcd’d9¡ t‘ †t„ ™|e|p ¡d‹¡}{|fd ur¼|pt@|fƒk r¼|f|p }||||| ‹¡  || w¡ || ‹¡  dŠ  }}t‘ †|}d‹¡}~t‘ ’|~d†  ƒt„j…¡t‘j“¡}t‘ ”|t‘j•¡}|{dc |dc   ƒt„j…¡}€|gd ur°t|g|p  e¡ | ƒ}t|h|p  e¡ | ƒ}‚t‘ –|€||‚fdŒdŽdl¡ t‘ –|€|q|rfdŒdd9¡ n|{}€t‘ —|€d“|p› dd‘t‘j˜dcd’d9¡ t„jš|x|€gdd”}ƒ|n ›|ƒ¡ qš|nrt„ Y|n¡ œd%dldd9¡}„tjod•tj|„d9d–d—i|d| t> ?d˜|› ¡ W n' tžy+ }… zd%d lŸ}†t>  d™|› dš|…› d›|† ¡¡ › ¡ W Y d }…~…nd }…~…ww |d%krq||j¢ d%krqt|jƒdœ|›  }‡|‡j£d'd+ | r|	¡}T|T ||‡d4 ¡ t ¤| r|
¡ ¥¡ | ¥¡ ||dœ|‡dž ¡ t> ?dŸ|› ¡ |d7 }| ¦d¡ q/||jJkr„ nq*|jrŽt §¡  t> ?d ¡ d S )¡Nz--pretrainedz1checkpoints/stable-video-diffusion-img2vid-xt-1-1)ÚtypeÚdefaultz--pretrain_unetz.output_libero_ood_objpos/checkpoint-31500/unetz--cache_rootz/data/libero/ood_objpos_task0z--benchmarkÚlibero_spatialz
--task_idsÚ0z--widthr°   z--heightr±   z--num_framesr²   z--frame_strider   z--batch_sizez--lrg-Cëâ6
?z--freeze_unetÚ
store_truez"Freeze UNet, train only PARA heads)ÚactionÚhelpz--max_stepsi?B z--output_dirZoutput_svd_para_jointz--ckpt_everyiè  z--vis_everyéÈ   z--seedé{   r   )ÚDistributedDataParallelKwargsT)Úfind_unused_parametersÚbf16)Úmixed_precisionÚgradient_accumulation_stepsÚkwargs_handlers)Úexist_okZsvd_para_joint)ÚprojectÚnameÚconfigr‹   )Ú	subfolder)r{   FÚimage_encoderÚfeature_extractorÚunet)r/   c                    ó   ‡‡ fdd„}|S )Nc                    s   t |tƒr	|d n|ˆ ˆ< d S ©Nr   )Ú
isinstanceÚtuple)ÚmoduleÚinpÚout)Úcapturedró   r3   r4   Úhook_fnT  s   z(main.<locals>.make_hook.<locals>.hook_fnr3   )ró   r  )r   ©ró   r4   Ú	make_hookS  ó   zmain.<locals>.make_hookZ
up_block_1r   Z
up_block_2c                    sD   t | dƒ} | d d } tjj | g d¢g d¢¡} ˆ | ƒj}| d¡S )N)éà   r  rx   r–   )g3<Í4'ÐÞ?gwgí¶MÝ?gy{Îå Ú?)g‡Bô91Ñ?g•wÝt.¹Ð?gÝ	U¦Ñ?r   )r®   ÚtorchvisionÚ
transformsrœ   Ú	normalizeÚimage_embedsr¡   )rÃ   Úimage_embeddings)rö   r3   r4   Úencode_image[  s   
þ

zmain.<locals>.encode_imagec                 S   s   g | ]}t |ƒ‘qS r3   )r›   )Ú.0Úxr3   r3   r4   Ú
<listcomp>e  s    zmain.<locals>.<listcomp>Ú,)r³   r»   r´   r¸   r¹   rº   r¶   zdataset_stats.jsonr‘   )ÚindentrÌ   rÍ   rÎ   rÏ   rÐ   rÑ   r<   z	Dataset: z samples, Height: [z.3fz, ú]r…   )Ú
batch_sizeÚshuffleÚnum_workersÚ
pin_memoryÚ	drop_lastr–   )ÚparamsÚlru(   UNet FROZEN â€” training PARA heads onlyg{®Gáz”?g-Cëâ6?)Úweight_decayÚconstant)Ú	optimizerÚnum_warmup_stepsÚnum_training_steps)ÚdisableiŸ† rÃ   r¿   rÀ   rÁ   rÂ   gj¼t“ð?g      Àg      à?)rC   r   r‚   gffffffæ?gš™™™™™ù?c                 S   s   g | ]}d |  ¡  ‘qS )g      Ð?)Úlog)r  Úsigmar3   r3   r4   r  ¾  s    g      @g     À_@r:   r`   r   )Úencoder_hidden_statesÚadded_time_ids)rI   )rM   ÚgripÚdiffc                    s    g | ]}ˆ   |d ¡dkr|‘qS )r   g»½×Ùß|Û=)Úget©r  r«   ©Ú	loss_emasr3   r4   r  ú  s     c                 3   s     | ]}d ˆ | d  V  qdS )rx   r[   Nr3   r&  r'  r3   r4   Ú	<genexpr>ü  s   € zmain.<locals>.<genexpr>c                    s"   i | ]}|ˆˆ  ˆ| d   “qS )r[   r3   r&  )Úinv_sumr(  rÝ   r3   r4   Ú
<dictcomp>þ  s   " zmain.<locals>.<dictcomp>c                 S   s   i | ]}|d “qS )rx   r3   r&  r3   r3   r4   r+     s    rM   r#  r$  rx   é   )ztrain/total_lossztrain/volume_lossztrain/gripper_lossztrain/rotation_lossztrain/diffusion_lossztrain/w_volztrain/w_gripztrain/w_diff)Ústep)rõ   Útorch_dtypeÚfp16)rø   r.  Úvariantc                    rù   )Nc                    s&   t |tƒr	|d n| ¡  ¡ ˆ ˆ< d S rú   )rû   rü   rG   rÛ   )Úmodrþ   rÿ   )Úgen_capturedró   r3   r4   Úfn?  s   &z"main.<locals>.gen_hook.<locals>.fnr3   )ró   r3  )r2  r  r4   Úgen_hook>  r  zmain.<locals>.gen_hookÚub1Úub2r¾   )r   r   éÿ   é   )r¹   r¸   Ú
num_framesÚdecode_chunk_sizeÚnum_inference_steps)Úignore_errorsr[   )r   r   é   )r   r7  r7  )r7  r   r   zGT t=z PRED=red GT=cyan)r,  r8  )r7  r7  r7  zGen t=)ru   zvis/gt_vs_gen_heatmapsZmp4)ÚfpsÚformatzLogged vis at step zVis failed at step z: Ú
zcheckpoint-)Ú
para_headsr  ÚstatsÚglobal_stepzpara_checkpoint.ptzSaved checkpoint at step zTraining complete!)¨ÚargparseÚArgumentParserÚadd_argumentÚstrr›   rÛ   Ú
parse_argsÚ
acceleraterë   r   r=   r	   ÚseedÚis_main_processÚosÚmakedirsÚ
output_dirÚwandbÚinitr   ró   ÚvarsrA   Úbfloat16r   Úfrom_pretrainedZ
pretrainedÚtoÚrequires_grad_r   r   r   Zpretrain_unetÚenable_gradient_checkpointingr   Úfloat32Ú	up_blocksÚregister_forward_hookr´   Úsplitr¯   r³   r»   r¸   r¹   r9  r¶   ÚexistsÚjsonÚloadÚopenrá   ÚdumpÚ
MIN_HEIGHTÚ
MAX_HEIGHTÚMIN_GRIPPERÚMAX_GRIPPERÚMIN_ROTÚMAX_ROTÚtensorr>   ÚPRERENDER_SIZEÚloggerÚinfor¼   r   r  Zfreeze_unetÚevalÚ
parametersr  ÚoptimÚAdamWr   Ú	max_stepsÚpreparer
   rp   Ú
accumulaterC   rE   r_   rX   r“   Ú
randn_liker„   rô   Úscaling_factorÚTensorrr   r¢   ry   rB   r†   Úclearr‰   rs   ra   rj   ro   rw   ÚitemÚ	EMA_ALPHAr    r%  ÚbackwardÚsync_gradientsÚlistÚclip_grad_norm_r-  Ú	zero_gradr  Úset_postfixZ	vis_everyÚunwrap_modelrG   r?   ÚsoftmaxrH   rš   rÖ   ÚargmaxÚtempfileÚmkdtempÚsave_pretrainedÚpathÚjoinÚfloat16r   r   Ú	fromarrayÚastyperØ   Úuint8ÚresizeÚinference_moderÅ   ÚremoveÚno_gradrÔ   ÚshutilÚrmtreeÚcudaÚempty_cacheÚcopyÚcv2ÚapplyColorMapÚCOLORMAP_JETÚcvtColorÚCOLOR_BGR2RGBÚcircleÚputTextÚFONT_HERSHEY_SIMPLEXrÙ   rÚ   rq   Ú	transposeZVideoÚ	ExceptionÚ	tracebackÚwarningÚ
format_excZ
ckpt_everyÚmkdirÚsaveÚ
state_dictÚupdateÚfinish)ˆÚpÚargsrë   Z
ddp_kwargsÚacceleratorr=   Úweight_dtyper‹   r÷   rø   rA  r  r  r´   rÜ   Z
stats_pathrB  Zmin_r_tZmax_r_tÚcoord_scaleÚ
dataloaderÚ
all_paramsr  Úlr_schedulerrC  Úprogress_barÚepochÚbatchrÃ   ÚBÚCrÄ   rg   rh   rd   Ztraj_3dZtraj_gripperZ
traj_eulerZ	traj_parare   Zconditional_pixel_valuesr’   ÚnoiseÚbszZcond_sigmasZcond_sigmas_5dZconditional_pixel_values_noisedZconditional_latentsr§   Z	sigmas_5dZnoisy_latentsÚ	timestepsZinp_noisy_latentsr
  r"  Zconditional_latents_expandedrþ   Z
model_predrl   Z	sigmas_bcÚc_outÚc_skipÚdenoisedZtarget_latentsZ	weightingZdiffusion_lossÚfeat1Úfeat2Ztraj_para_flatZtarget_h_flatZtraj_grip_flatZtraj_euler_flatrc   Úgrip_logitsZ
rot_logitsZvolume_lossZgripper_lossZrotation_lossÚrawr«   ÚactiveÚweightsÚ
total_lossZ
all_p_flatZunet_unwrappedZpara_unwrappedZvol_detÚ	vol_probsZheatmaps_trainZ	pred_flatZpred_py_trainZpred_px_trainr€  Útmp_dirZ
fresh_unetÚpiper4  Úh1Úh2Zfirst_frame_rawZ	first_pilZgen_pilZgen_heatmapsZgen_pred_pxZgen_pred_pyZgen_volÚ_Zn_genZgv_probsZgfZ
vis_framesZn_visrŠ   Zgt_xZgt_yÚgt_frameÚhtZht_nZht_upZht_cÚleftZpx_tZpy_tZgen_npÚhgZhg_nZhg_upZhg_cÚrightZgpxZgpyÚcombinedZvis_vidrà   rœ  Úsave_dirr3   )r   r2  rö   r*  r(  rÝ   r4   Úmain  sÀ  ÿÿ
ÿ
ÿÿÿ
ÿÿ
ÿÿþ

ý
&
ÿ
þÿ
ÿ
 


ÿ
ÿýüüÿÿ0ÿÿÿÿ
$ÿþ

“p


ø	÷


ý$

 ÿþ

ÿ
þþÿ
ÿ 
( 
ÿ 

ÿ
ÿÿ€,€þ
üûÿrË  Ú__main__)r”   T)rÇ   )YrW   rD  Úior\  ÚmathrL  rÒ   r  ÚsysÚpathlibr   r’  rÖ   rØ   rA   Útorch.nnr"   Útorch.nn.functionalrœ   r?   Útorch.utils.checkpointÚtorch.utils.datar   r   r   r  rI  r   Zaccelerate.loggingr   Úaccelerate.utilsr   r	   Ú	tqdm.autor
   ÚPILr   Zscipy.spatial.transformr   ZScipyRÚ	diffusersr   Zdiffusers.optimizationr   Útransformersr   r   rƒ  ÚinsertrG  Ú__file__ÚparentÚsvd.pipelinesr   Ú
svd.modelsr   Údatar   ÚdecordÚimageiorO  rT   rh  r>   rX   r*   r,   rg  rv  rY   r`  ra  rb  rc  rd  re  ÚModuler   r_   rj   ro   rw   rW  r„   r“   r®   r¯   rá   rË  r3   r3   r3   r4   Ú<module>   s‚    F



,   

ÿ