o
    TSÍiP^  ã                   @   s¸  d Z ddlZddlZddlZddlZddlZddlmZ ddlZddl	Z
ddlZddlmZ ddlm  mZ ddlmZ ddlmZ ddlmZ ej deeeƒjƒ¡ edƒZej deeƒ¡ ddlmZ d	Zd
Z dZ!dZ"dZ#dZ$dZ%dZ&d\Z'Z(dZ)dZ*e&d Z+da,da-da.da/g d¢a0g d¢a1dZ2dZ3G dd„ dej4ƒZ5G dd„ dej4ƒZ6e"fdd„Z7d d!„ Z8d"d#„ Z9d$d%„ Z:d&d'„ Z;d(d)„ Z<d*d+„ Z=d3d.d/„Z>d0d1„ Z?e@d2krÚe?ƒ  dS dS )4aè  Train SVD Video Diffusion + PARA action heads on LIBERO.

Frozen SVD UNet as feature extractor: extract up_block_2 features at mid and late
denoising timesteps, concatenate, upsample to 64x64, feed to PARA heads.

Usage:
    CUDA_VISIBLE_DEVICES=4 python train_svd_para.py         --cache_root /data/libero/ood_objpos_task0         --svd_checkpoint /data/cameron/vidgen/svd_motion_lora/Motion-LoRA/output_libero_7f/checkpoint-46000/unet         --log_wandb --run_name svd_para_ood_objpos
é    N)ÚPath)Ú
DataLoader)Útqdm)ÚRotationz0/data/cameron/vidgen/svd_motion_lora/Motion-LoRA)ÚCachedTrajectoryDatasetiÀ  )i@  i@  é@   é    é   i€  )é(   éH   é   é   é   g        ç      ð?g      ð¿)çn†ðù!	Àr   r   )çn†ðù!	@r   r   g      @ç      à?c                       s<   e Zd ZdZd‡ fdd„	Zdd„ Ze ¡ dd	d
„ƒZ‡  Z	S )ÚSVDFeatureExtractorzFFrozen SVD UNet that extracts up_block_2 features at two noise levels.NÚcudac                    sJ  t ƒ  ¡  ddlm} ddlm}m} ddlm}m	} |p|}	t
d|	› ƒ |j|	dt|	ƒvr1dnd tjd| _| j ¡  | j ¡ D ]}
d|
_qB|j|d	tjd| _| j ¡  | j ¡ D ]}
d|
_q\|j|d
tjd| _| j ¡  | j ¡ D ]}
d|
_qv|j|dd| _|j|dd| _| jjd|d i | _| jjd  |  d¡¡ d S )Nr   )Ú UNetSpatioTemporalConditionModel)ÚAutoencoderKLTemporalDecoderÚEulerDiscreteScheduler)ÚCLIPImageProcessorÚCLIPVisionModelWithProjectionzLoading SVD UNet from: Úunet)Ú	subfolderZtorch_dtypeFÚvaeÚimage_encoderÚfeature_extractor)r   Ú	scheduleré   ©Údevicer   Ú
up_block_2)ÚsuperÚ__init__Z
svd.modelsr   Z	diffusersr   r   Ztransformersr   r   ÚprintÚfrom_pretrainedÚstrÚtorchÚfloat16r   ÚevalÚ
parametersÚrequires_gradr   r   r   r   Zset_timestepsÚ	_capturedZ	up_blocksÚregister_forward_hookÚ
_make_hook)ÚselfÚsvd_base_pathÚsvd_unet_pathr"   r   r   r   r   r   Z	unet_pathÚp©Ú	__class__© ú0/data/cameron/para_videopolicy/train_svd_para.pyr%   M   s@   
þ
ÿ
ÿ
ÿzSVDFeatureExtractor.__init__c                    s   ‡ ‡fdd„}|S )Nc                    s2   t |tƒr|d  ¡ ˆjˆ < d S | ¡ ˆjˆ < d S )Nr   )Ú
isinstanceÚtupleÚdetachr.   )ÚmoduleÚinputÚoutput©Únamer1   r7   r8   Úhook_fn{   s   
z/SVDFeatureExtractor._make_hook.<locals>.hook_fnr7   )r1   r@   rA   r7   r?   r8   r0   z   s   zSVDFeatureExtractor._make_hooké   c                 C   sN  |j }|jd }|d d }| j | ¡ ¡j ¡ }|| jjj }| 	d¡ 
dd|dd¡}|| jjj }g }t|ƒD ]*}	t ||	 dd¡}
|
 ddd¡ ¡  ¡ d  tj¡}ddlm} | | |¡¡ q9| j|dd	j}|j|tjd
}|  |¡j 	d¡}tjg d¢g|tjd 
|d¡}t |¡}g }t t!fD ]…}| j"j#| }| j"j$| }|||  }||d d d  }| ddddd¡}| ddddd¡}tj%||gdd}| j& '¡  | j(|| 	d¡||dj}| j&d }|jd || kr|j)||g|jdd… ¢R Ž dd…df }n|jd |kr|dd… }| | *¡ ¡ q˜tj%|dd}|S )a  
        Extract UNet features at mid and late denoising timesteps.

        Args:
            image_tensor: (B, 3, H, W) in [0, 1], at SVD resolution
        Returns:
            features: (B, COMBINED_FEAT_DIM, feat_H, feat_W) = (B, 1280, 40, 72)
        r   g       @r   r   rB   éÿ   ©ÚImageÚpt)ÚimagesZreturn_tensors)Údtype)é   é   g{®Gáz”?©r"   rH   r   é   r	   ©Údim)Zencoder_hidden_statesÚadded_time_idsr#   N)+r"   Úshaper   ÚencodeÚhalfZlatent_distÚsampleÚconfigZscaling_factorÚ	unsqueezeÚrepeatÚranger)   ÚclampÚpermuteÚcpuÚnumpyÚastypeÚnpÚuint8ÚPILrE   ÚappendÚ	fromarrayr   Zpixel_valuesÚtor*   r   Zimage_embedsÚtensorÚ
randn_likeÚMID_STEPÚ	LATE_STEPr   ZsigmasZ	timestepsÚcatr.   Úclearr   ÚviewÚfloat)r1   Zimage_tensorZ
num_framesr"   ÚBZimg_normZlatentZcond_latentZ
clip_inputÚiZimg_pilZimg_nprE   Zclip_processedZimage_embeddingsrO   ÚnoiseZ	all_featsZstep_idxÚsigmaÚtZnoisy_latentÚscaled_inputZnoisy_5dZcond_5dZ
unet_inputÚ_ZfeatÚcombinedr7   r7   r8   Úextract_features‚   sV   

" 


ýü
,z$SVDFeatureExtractor.extract_features)Nr   )rB   )
Ú__name__Ú
__module__Ú__qualname__Ú__doc__r%   r0   r)   Úno_gradrs   Ú__classcell__r7   r7   r5   r8   r   J   s    -r   c                       s4   e Zd ZdZeeeef‡ fdd„	Zddd„Z	‡  Z
S )Ú	ParaHeadsz<Volume + gripper + rotation heads on SVD diffusion features.c                    s°   t ƒ  ¡  |}|| _|| _|| _t tj|ddddt ¡ tjdddddt ¡ tjdddddt ¡ ¡| _	t d|| d¡| _
t d|t d¡| _t d|d t d¡| _d S )Ni   rL   rB   )Úpadding)r$   r%   Úpara_out_sizeÚn_height_binsÚn_windowÚnnÚ
SequentialÚConv2dÚGELUÚfeature_netÚvolume_headÚN_GRIPPER_BINSÚgripper_headÚ
N_ROT_BINSÚrotation_head)r1   Úfeat_dimr|   r~   r}   ÚDr5   r7   r8   r%   Ó   s   
ýzParaHeads.__init__Nc                 C   sV  |j d }| j}tj|||fddd}|  |¡}|  |¡}| || j| j||¡}d }}	|dur¥| j}
|d  	¡  
d|d ¡}|d  	¡  
d|d ¡}|  | ¡ ¡}| ||
t||¡}tj||jd	 |d¡ ||
¡}tj|
|jd	 d|
¡ ||
¡}|||dd…||f }|  | ¡ ¡}| ||
d
t||¡}|||dd…dd…||f }	||||	fS )a–  
        Args:
            features: (B, D, H_feat, W_feat) from SVD extractor
            query_pixels: (B, N_WINDOW, 2) in PARA_OUT_SIZE coords [x, y]
        Returns:
            volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, P, P)
            feats: (B, 512, P, P)
            gripper_logits: (B, N_WINDOW, N_GRIPPER_BINS) or None
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS) or None
        r   ÚbilinearF©ÚsizeÚmodeÚalign_cornersN).r   rB   ).rB   r!   rL   )rP   r|   ÚFÚinterpolaterƒ   r„   ri   r~   r}   ÚlongrX   r†   r;   r…   r)   Úaranger"   Úexpandrˆ   r‡   )r1   ÚfeaturesÚquery_pixelsrk   ÚPÚxÚfeatsÚvolZgripper_logitsZrotation_logitsÚNÚpxÚpyZgrip_mapZ	batch_idxZtime_idxZrot_mapr7   r7   r8   Úforwardæ   s(   


  zParaHeads.forward©N)rt   ru   rv   rw   ÚCOMBINED_FEAT_DIMÚPARA_OUT_SIZEÚN_WINDOWÚN_HEIGHT_BINSr%   rž   ry   r7   r7   r5   r8   rz   Ð   s    ÿrz   c                 C   s8   | | || d  }|  dd¡|d   ¡   d|d ¡S ©Nç:Œ0âŽyE>r   rB   )rX   r’   )Zheight_valuesÚmin_hÚmax_hÚn_binsÚ
normalizedr7   r7   r8   Údiscretize_height  ó   $rª   c                 C   ó8   | | || d  }|  dd¡td   ¡   dtd ¡S r¤   )rX   r…   r’   )Zgripper_valuesÚmin_gÚmax_gr©   r7   r7   r8   Údiscretize_gripper  r«   r¯   c                 C   r¬   r¤   )rX   r‡   r’   )Zeuler_valuesÚmin_rÚmax_rr©   r7   r7   r8   Údiscretize_rotation  r«   r²   c                 C   sú   | j \}}}}}|d d …d d …df  ¡  d|d ¡}|d d …d d …df  ¡  d|d ¡}	| d|d ¡}
g }t|ƒD ]9}| d d …|f  |d¡}|
d d …|f ||  |	d d …|f |  |d d …|f   ¡ }| tj||dd¡ q<t 	|¡ 
¡ S )Nr   rB   éÿÿÿÿÚmean)Ú	reduction)rP   r’   rX   rW   Úreshaper`   r   Úcross_entropyr)   Ústackr´   )ÚpredÚtraj_2dÚtarget_h_binsrk   r›   ÚNhÚHÚWrœ   r   Zh_binÚlossesro   Zlogits_flatZ
target_idxr7   r7   r8   Úcompute_volume_loss  s   &&@rÀ   c                 C   s:   t |||ƒ}| j\}}}t |  || |¡| || ¡¡S rŸ   )r¯   rP   r   r·   r¶   )r¹   Útargetr­   r®   Útarget_binsrk   r›   ZNgr7   r7   r8   Úcompute_gripper_loss+  s   "rÃ   c                 C   s”   t |||ƒ}| j\}}}}g }	tdƒD ]/}
| d d …d d …|
d d …f  || |¡}|d d …d d …|
f  || ¡}|	 t ||¡¡ qt |	¡ 	¡ S )NrL   )
r²   rP   rW   r¶   r`   r   r·   r)   r¸   r´   )r¹   Ztarget_eulerr°   r±   rÂ   rk   r›   rq   ZNrr¿   ÚaxisÚlogitsrÁ   r7   r7   r8   Úcompute_rotation_loss0  s   ( rÆ   c                 C   s  | j \}}}}}| j}tj||d|d}	tj|||tjd}
t|ƒD ]S}| d d …|f }|jdd\}}| |d¡jdd}|| }|| }| 	¡ |	d d …|df< | 	¡ |	d d …|df< |tj
||dd d …||f jdd|
d d …|f< q"tjdd||d}||
 ||  | }|	|fS )Nr   r!   rK   rB   rM   r³   r   )rP   r"   r)   Úzerosr’   rW   Úmaxri   Úargmaxrj   r“   Úlinspace)Zvolume_logitsr¦   r§   rk   r›   r¼   r½   r¾   r"   Úpred_2dZpred_h_binsro   Zvol_tZ
max_over_hrq   Zflat_idxr   rœ   Zbin_centersÚpred_heightr7   r7   r8   Úextract_pred_2d_and_height:  s    4rÍ   éô  é*   c              	   C   s   t  |¡}t|t| ƒƒ}| tt| ƒƒ|¡}g g g }}}t|dddD ]7}	z| |	 }
W n	 ty6   Y q%w | |
d  	¡ d d …df  
¡ ¡ | |
d  	¡  
¡ ¡ | |
d  	¡ ¡ q%t |¡}t |¡}t |d¡}t| ¡ ƒt| ¡ ƒt| ¡ ƒt| ¡ ƒ| d¡ 
¡ | d¡ 
¡ d	œS )
NzComputing statsF©ÚdescÚleaveÚtrajectory_3dr   Útrajectory_gripperÚtrajectory_eulerr   )Ú
min_heightÚ
max_heightÚmin_gripperÚmax_gripperÚmin_rotÚmax_rot)ÚrandomÚRandomÚminÚlenrS   rW   r   Ú	ExceptionÚextendr[   Útolistr`   r]   ÚarrayÚconcatenaterj   rÈ   )ÚdatasetZsample_limitÚseedÚrngÚnÚindicesZall_hZall_gZall_eÚidxÚsÚhÚgÚer7   r7   r8   Úcompute_dataset_statsP  s$   
ÿ" ýrï   c            <      C   sö  t  ¡ } | jdtdd | jdtdd | jdtdd | jdtd	d | jd
tdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jdtdd | jddd | jdtdd | jdtdd |  ¡ }t |j¡}t	t
ƒjd  |j }|jd!d!d" |jr¨d#d l}|jd$t|ƒ|jd% t|j|j|d& |¡}td'ƒ td(tj ¡ d) d*›d+ƒ ttttd, |¡}td-td.d/„ | ¡ D ƒƒd0›d1ƒ d2d3„ |j  d0¡D ƒ}t!|j"|j#|t$t|j%d4}td5t&|ƒ› d6ƒ t'd7tt&|ƒd8 ƒƒ}	t&|ƒ|	 }
tj(j)j*||
|	gt +¡  ,d9¡d:\}}t-||j.d!|j/d!d;}t-||j.d<|j/d!d;}td=t&|ƒ› d>t&|ƒ› ƒ |d? }| 0¡ r]t1 2t3|ƒ¡}nt4|ƒ}t1j5|t3|d@ƒdAdB |dC |dD a6a7|dE |dF a8a9|dG |dH a:a;tj<t:tj=|dI}tj<t;tj=|dI}tdJt6dK›dLt7dK›dMƒ tdNt8dK›dLt9dK›dMƒ tt$ }tj>j?| ¡ |j@ddO}tj>jAjB||jCt&|ƒ dP}d#}tDtE|jCƒdQdRD ]‰}| F¡  tD|dS|› d<dT}|D ]t}|dU }t <g dV¢¡ Gd7dd7d7¡ |¡}t <g dW¢¡ Gd7dd7d7¡ |¡}| |¡| | }tHjI|tJdXd<dY}t K¡  | L|¡}W d   ƒ n	1 s=w   Y  |dZ  |¡d d …d t…f } |d[  |¡d d …d t…f }!|d\  |¡d d …d t…f }"|d]  |¡d d …d t…f }#| |  Md#td^ ¡}$tN|!d d …d d …dAf t6t7ƒ}%|||$d_\}&}'}(})tO|&|$|%ƒ}*tP|(|"t8t9ƒ}+tQ|)|#||ƒ},|*tR|+  tS|,  }-| T¡  |- U¡  tjVj( W| ¡ d`¡ | X¡  | X¡  |jY|* Z¡ da›|+ Z¡ da›|, Z¡ da›db |jrd#d l}|j[|- Z¡ |* Z¡ |+ Z¡ |, Z¡ | \¡ d# dcœ|dd ||j] d#kr5|jr5d#d l}d#d l^}.|. _de¡ d#d l`ma}/ | b¡  t K¡ ó tHjc|&d d7…  dd7tdf¡dAdg}0|0 Gd7ttett¡}0|0j'dAdgd# }1|d#  fd7dAd#¡ g¡  h¡ dh  itjjk¡}2tl m|2ttf¡}3|/jnd7tdt dfdi\}4}5tEtƒD ]m}6|1d#|6f  g¡  h¡ }7|7|7 o¡  |7 '¡ |7 o¡  dj  }8|3dk dl tjjp|8tj q|8¡tj q|8¡gdfdmdl  }9|5|6  rtj s|9d#d7¡¡ |5|6 jt|$d#|6d#f  g¡ |$d#|6d7f  g¡ dndodpdAdq |5|6  udr|6› ¡ |5|6  vds¡ q„|/ w¡  tx y¡ }:|/jz|:dtddu |: {d#¡ d#dvl|m}}; |j[dw| }|; 3|:¡¡i|dd |/ ~dx¡ W d   ƒ n	1 s,w   Y  | F¡  |d#kr_||j d#kr_t €||| ¡ | ¡ |dyœ|dz|› d{ ¡ td||› ƒ |d77 }qïqÜtd}ƒ |jryd#d l}| ‚¡  d S d S )~Nz--cache_rootz/data/libero/ood_objpos_task0)ÚtypeÚdefaultz--benchmarkÚlibero_spatialz
--task_idsÚ0z
--svd_basezb/data/cameron/vidgen/svd_motion_lora/Motion-LoRA/checkpoints/stable-video-diffusion-img2vid-xt-1-1z--svd_checkpointzW/data/cameron/vidgen/svd_motion_lora/Motion-LoRA/output_libero_7f/checkpoint-46000/unetz--batch_sizer	   z--lrg-Cëâ6?z--epochsrÎ   z--frame_striderL   z	--workersz--devicer   z
--run_nameZsvd_para_ood_objposz--log_wandbÚ
store_true)Úactionz--vis_everyéd   z--checkpoint_everyiè  ÚcheckpointsT)ÚparentsÚexist_okr   Zsvd_para_libero)ZprojectrT   r@   )r2   r3   r"   z%SVD feature extractor loaded (frozen)z  GPU memory: g    eÍÍAz.2fz GB)r‰   r|   r~   zParaHeads: c                 s   s    | ]}|  ¡ V  qd S rŸ   )Únumel)Ú.0r4   r7   r7   r8   Ú	<genexpr>”  s   € zmain.<locals>.<genexpr>ú,z trainable paramsc                 S   s   g | ]}t |ƒ‘qS r7   )Úint)rû   r˜   r7   r7   r8   Ú
<listcomp>—  s    zmain.<locals>.<listcomp>)Ú
cache_rootÚbenchmark_nameÚtask_idsÚ
image_sizer~   Úframe_stridez	Dataset: z samplesrB   gš™™™™™©?rÏ   )Ú	generator)Ú
batch_sizeÚshuffleÚnum_workersÚ
pin_memoryFzTrain: z, Val: zdataset_stats.jsonÚwr   )ÚindentrÖ   r×   rØ   rÙ   rÚ   rÛ   )rH   r"   z	Height: [z.4fz, ú]z
Gripper: [)ÚlrÚweight_decay)ÚT_maxZEpochs)rÑ   zEpoch rÐ   Úrgb)g
×£p=
ß?gÉv¾Ÿ/Ý?g–C‹lçûÙ?)gZd;ßOÍ?gyé&1¬Ì?gÍÌÌÌÌÌÌ?r‹   rŒ   Ztrajectory_2drÓ   rÔ   rÕ   gj¼t“ð?)r–   r   z.3f)rš   ZgripZrot)ztrain/total_lossztrain/volume_lossztrain/gripper_lossztrain/rotation_lossztrain/lr)ÚstepZAggr³   rM   rC   )Úfigsizer¥   g     ào@r   )rÄ   Úcyanr
   r˜   )Úcrë   ÚmarkerZ
linewidthszt=ÚoffÚpng)ÚformatÚdpirD   zvis/heatmapsÚall)ÚepochÚglobal_stepÚpara_heads_state_dictZoptimizer_state_dictÚstatsZcheckpoint_z.ptzSaved checkpoint at step zTraining complete!)ƒÚargparseÚArgumentParserÚadd_argumentr(   rþ   rj   Ú
parse_argsr)   r"   r   Ú__file__ÚparentZrun_nameÚmkdirZ	log_wandbÚwandbÚinitÚvarsr   Úsvd_baseZsvd_checkpointrb   r&   r   Úmemory_allocatedrz   r    r¡   r¢   Úsumr,   r  Úsplitr   r   Ú	benchmarkÚPRERENDER_SIZEr  rß   rÈ   ÚutilsÚdataÚrandom_splitÚ	GeneratorÚmanual_seedr   r  ZworkersÚexistsÚjsonÚloadÚopenrï   ÚdumpÚ
MIN_HEIGHTÚ
MAX_HEIGHTÚMIN_GRIPPERÚMAX_GRIPPERÚMIN_ROTÚMAX_ROTrc   Úfloat32ÚoptimÚAdamWr  Úlr_schedulerÚCosineAnnealingLRÚepochsr   rW   Útrainri   r   r‘   ÚSVD_SIZErx   rs   rX   rª   rÀ   rÃ   rÆ   ÚGRIPPER_LOSS_WEIGHTÚROTATION_LOSS_WEIGHTÚ	zero_gradÚbackwardr   Úclip_grad_norm_r  Úset_postfixÚitemÚlogÚget_last_lrZ	vis_everyÚ
matplotlibZuseÚmatplotlib.pyplotÚpyplotr+   Úsoftmaxr¶   r£   rY   rZ   r[   r\   r]   r^   Úcv2ÚresizeÚsubplotsrÞ   r¸   Ú
zeros_likeÚimshowÚclipÚscatterÚ	set_titlerÄ   Ztight_layoutÚioÚBytesIOÚsavefigÚseekr_   rE   ÚcloseZcheckpoint_everyÚsaveÚ
state_dictÚfinish)<r4   Úargsr"   Zckpt_dirr&  Úsvd_extractorÚ
para_headsr  rå   Zval_sizeZ
train_sizeZtrain_dsZval_dsZtrain_loaderZ
val_loaderZ
stats_pathr  Zmin_r_tZmax_r_tÚcoord_scaleÚoptr   r  r  ÚpbarÚbatchr  r´   ÚstdÚrgb_01Úrgb_svdr•   rº   Ztraj_3dZtraj_gripperZ
traj_eulerZ	traj_parar»   Ú
vol_logitsr™   Úgrip_logitsÚ
rot_logitsZvol_lossZ	grip_lossZrot_lossZ
total_lossrP  ÚpltÚheatmapsZheatmaps_2dZinput_frameZinput_smallÚfigÚaxesro   ÚheatÚ	heat_normÚoverlayÚbufrE   r7   r7   r8   Úmainh  sd  ÿÿýüýü$ú
ÿ
ÿ
ÿ

  
ÿÿ
ÿ
ûú	

 & ÿÿ(ÿ
çüûeþry  Ú__main__)rÎ   rÏ   )Arw   r  r\  r5  rÜ   ÚsysÚpathlibr   rT  r[   r]   r)   Útorch.nnr   Útorch.nn.functionalÚ
functionalr   Útorch.utils.datar   r   Zscipy.spatial.transformr   ZScipyRÚpathÚinsertr(   r#  r$  ÚSVD_ROOTr0  r   r.  rF  r¡   r£   r…   r‡   r¢   ZSVD_FEAT_DIMZ
SVD_FEAT_HZ
SVD_FEAT_Wre   rf   r    r9  r:  r;  r<  r=  r>  rG  rH  ÚModuler   rz   rª   r¯   r²   rÀ   rÃ   rÆ   rÍ   rï   ry  rt   r7   r7   r7   r8   Ú<module>   sn     C

 N
ÿ