o
    ²Ç¯iÆ#  ã                   @   sÞ   d Z ddlZddlZddlZddlmZ ddlZddlm  m	Z
 ddlmZ ddlmZ ddlZddlmZ eeƒ ¡ jd Zej deeƒ¡ ddlmZ dd	lmZmZmZ d
Zddd„Z dd„ Z!e"dkrme!ƒ  dS dS )zeTrain student to match teacher DINO features. Optional wandb with --log_wandb. PCA vis of GT vs pred.é    N)ÚPath)Ú
DataLoader)Ú	make_grid)Útqdmé   )ÚStudentDinoVideo)ÚDroidVideoDatasetÚCachedClipDatasetÚcollate_batché   é   c                 C   s`  | j \}}}}}|  ddddd¡ d|¡}| ¡  ¡  ¡ }|durY| ddddd¡ d|¡ ¡  ¡  ¡ }	|	jdd}
|	|
 }tjj	|d	d
\}}}|d|… j
}||
 }|| }n |jdd}
||
 }tjj	|d	d
\}}}|d|… j
}|| }| |||||¡}| ddddd¡}| ¡ | ¡ }}||kr || ||  }| |||||¡}t |¡ ¡ S )z…feats: (B, T, D, H, W). Returns (B, T, 3, H, W) in [0,1] for RGB.
    PCA fit from fit_ref if given. Min-max norm across whole video.r   r   r   é   é   éÿÿÿÿN)ZaxisF)Zfull_matrices)ÚshapeÚpermuteZreshapeZdoubleÚcpuÚnumpyÚmeanÚnpZlinalgZsvdÚTZ	transposeÚminÚmaxÚtorchZ
from_numpyÚfloat)ZfeatsZn_componentsÚfit_refÚBr   ÚDÚHÚWÚxÚrefr   Zref_centeredÚUÚSZVhZprojZ
x_centeredZoutZloZhi© r$   ú,/data/cameron/vidgen/dino_vid_model/train.pyÚpca_vis   s.   &
r&   c            &      C   s|  t  ¡ } | jdtdd | jdtdd | jdtd dd | jd	td
d | jdtddd | jdtdd | jdtdd | jdtdd | jdtdd | jdddd | jdtddd | jdtddd | jd td!d"d | jd#td$d%d | jd&td'd(d |  ¡ }t|jƒ 	¡ }t
 |j¡}|jr¦d)d l}|jd't|ƒ|jd* t|ƒ |¡}t
jj| ¡ |jd+}|jrÍt|jƒ}t|ƒ}td,|› d-ƒ nt|jdd.d/d0}t|ƒ}td,|› d1ƒ t||jd2|jt d2|jd)k|jd)kröd
nd d3}	d)}
t!|j"d4d5d6}|
|j"k r*|jrHt|jƒ}t|ƒ}|d)kr'td7ƒ t# $d8¡ qt||jd2|jt d2|jd)k|jd)kr<d
nd d3}	td9|› d:ƒ d }d2}|	D ]Ô}|r]|d u r[|}|}|
|j"krf n¾|j|d2d;}| %d)d<d=d>d
¡}|j&|||j'd?\}}|}|}t( )| ¡ | ¡ ¡}t( )| ¡ | ¡ ¡}|| }|j*| }| +¡  | ,¡  | -¡  |jrÊd)d l}|j.| /¡ | /¡ | /¡ | /¡ d@œ|
dA |
|j0 d)krâ|jrâtdB|
› ƒ d)d l}t
 1¡   |d d=…  2¡  3¡ }|j4|d d=… |j'd?\}}W d   ƒ n	1 s	w   Y  t5|d dC}|d) }t6|d
dD}|d d=… }|d)  %d=d)d<d>¡d! dE  7d)d=¡}t6|d
dD}t
 1¡  |d d=…  2¡  3¡ }W d   ƒ n	1 sVw   Y  |d) d! dE  7d)d=¡}t6|d
dD} |d)  2¡  3¡ d! dE  7d)d=¡}!|d)  2¡  3¡ d! dE  7d)d=¡}"t6|!d
dD}#t6|"d
dD}$|j.| 8| %d=d<d)¡ 3¡  9¡ ¡| 8|  %d=d<d)¡ 3¡  9¡ ¡| 8|# %d=d<d)¡ 3¡  9¡ ¡| 8|$ %d=d<d)¡ 3¡  9¡ ¡| 8| %d=d<d)¡ 3¡  9¡ ¡dFœ|
dA |
d=7 }
| :d=¡ |
d)kr"|
|j; d)kr"t|j<ƒ}%|%j=d2d2dG t
 >|
| ?¡ | ?¡ dHœ|%dI|j› dJ ¡ tdK|j› dLƒ qN|
|j"k s
| @¡  |jr<d)d l}| A¡  d S d S )MNz	--keygripz/data/cameron/keygrip)ÚtypeÚdefaultz--data-rootz /data/weiduoyuan/droid_raw/1.0.1z--cache-dirzJUse pre-extracted .pt clips (run precache_clips.py first) for fast loading)r'   r(   Úhelpz--batch-sizer   z	--workersr   z-DataLoader workers (increase for large batch)z--lrg-Cëâ6?z--stepsi'  z--vis-everyéd   z--deviceÚcudaz--log_wandbZ
store_truezLog to wandb)Úactionr)   z--checkpoint-dirZcheckpointsz6Save student checkpoint every --checkpoint-every stepsz--checkpoint-everyiè  z*Save interval when --checkpoint-dir is setz--rgb-loss-weightg      ð?z0Weight for RGB L1 reconstruction loss at 256x256z--t-stargÍÌÌÌÌÌì?z+MIP t* (noise/interpolant level for step-2)z--nameZdino_vid_modelzName of the runr   )ZprojectZconfigÚname)Úlrz	Dataset: z( samples (cache; will rescan each epoch)g      @é   )Z
num_framesZ
sample_fpsÚsizez videosT)Ú
batch_sizeZshuffleZnum_workersZ
collate_fnZ
pin_memoryZpersistent_workersZprefetch_factorÚstepsÚstep)ÚtotalZdescZunitzCache empty, waiting 10s...é
   zRescan cache: z samples)Znon_blockingr   r   r   )Út_star)ztrain/loss_totalztrain/loss_rgbztrain/loss_rgb0ztrain/loss_rgb1)r3   zVisualizing at step )r   )Znrowg       @)zvis/input_rgbzvis/recon_rgbzvis/recon_step0zvis/recon_step1zvis/pca_pred)ÚparentsÚexist_ok)r3   ÚstudentZ	optimizerZrun_z
_latest.ptzSaved checkpoint run_z.pt)BÚargparseZArgumentParserZadd_argumentÚstrÚintr   Z
parse_argsr   ZkeygripÚresolver   ÚdeviceZ	log_wandbÚwandbZinitÚvarsr-   r   ZtoZoptimZAdamWZ
parametersr.   Z	cache_dirr	   ÚlenÚprintr   Z	data_rootr   r1   Zworkersr
   r   r2   ÚtimeÚsleepr   Zmip_train_predsr6   ÚFZmse_lossZrgb_loss_weightZ	zero_gradZbackwardr3   ÚlogÚitemZ	vis_everyZno_gradÚdetachr   Zmip_infer_stepsr&   r   ZclampZImager   ÚupdateZcheckpoint_everyZcheckpoint_dirÚmkdirZsaveZ
state_dictÚcloseZfinish)&ÚpÚargsZkeygrip_rootr>   r?   r9   ZoptZdatasetZ	n_samplesÚloaderZglobal_stepZpbarÚyZoverfitr    Zx_targetZa0_hatZa1_hatZx_recZz_predZ	rgb_loss0Z	rgb_loss1Zrgb_lossZlossZ
z_pred_visZinfer0Zinfer1Zpca_predZ	grid_predZinp_rgbZgrid_rgbZ	x_rec_visZgrid_rgb_recZ
infer0_visZ
infer1_visZgrid_rgb_step0Zgrid_rgb_step1Zckpt_dirr$   r$   r%   Úmain5   s  
ø


ø


üù

ü$
ÿ  úù

þ€ŸbþrP   Ú__main__)r   N)#Ú__doc__r:   ÚsysrC   Zpathlibr   r   Ztorch.nn.functionalZnnZ
functionalrE   Ztorch.utils.datar   Ztorchvision.utilsr   r   r   r   Ú__file__r=   r7   Zvidgen_rootÚpathÚinsertr;   Zdino_vid_model.modelr   Zdino_vid_model.datasetr   r	   r
   Z
NUM_FRAMESr&   rP   Ú__name__r$   r$   r$   r%   Ú<module>   s,    
 
ÿ