o
    #iM                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
lmZm Z m!Z! G dd dej"Z#G dd de#Z$dS )    N)abstractmethod)contextmanager)AnyDictTupleUnionOptionalList)
ListConfig)version)	load_file)LitEma)defaultget_obj_from_strinstantiate_from_configprint0)pack_one
unpack_one	rearrangec                
       s   e Zd ZdZ				ddedef dedef dedef def fdd	Zee	 d
fdedee
eef deddfddZedefddZdd Zed ddZedejfddZedejfddZdd ZedefddZ  ZS )!AbstractAutoencoderz5
    This is the base class for all autoencoders
    Njpg	ema_decaymonitormode	input_keyc                    s`   t    || _|d u| _|| _|d ur|| _|d ur|| _tt	j
tdkr.d| _d S d S )N2.0.0F)super__init__r   use_emar   r   r   r   parsetorch__version__automatic_optimization)selfr   r   r   r   	__class__ 8/data/cameron/vidgen/VidTok/vidtok/models/autoencoder.pyr      s   


zAbstractAutoencoder.__init__Tpathignore_keysverbosereturnc                 C      t  NNotImplementedError)r#   r(   r)   r*   r&   r&   r'   init_from_ckpt+      z"AbstractAutoencoder.init_from_ckptc                 C   r,   r-   r.   r#   batchr&   r&   r'   	get_input/   r1   zAbstractAutoencoder.get_inputc                 O   s   | j r
| |  d S d S r-   )r   	model_emar#   argskwargsr&   r&   r'   on_train_batch_end3   s   z&AbstractAutoencoder.on_train_batch_endc              
   c   s    | j r| j|   | j|  |d urtd| d z!d V  W | j r<| j|   |d ur>td| d d S d S d S | j rX| j|   |d urYtd| d w w w )NzO[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] z: Switched to EMA weightsz: Restored training weights)r   r5   store
parameterscopy_tor   restore)r#   contextr&   r&   r'   	ema_scope8   s4   


zAbstractAutoencoder.ema_scopec                 O      t d)Nz|[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] encode()-method of abstract base class calledr.   r6   r&   r&   r'   encodeK      zAbstractAutoencoder.encodec                 O   r@   )Nz|[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] decode()-method of abstract base class calledr.   r6   r&   r&   r'   decodeQ   rB   zAbstractAutoencoder.decodec                 C   s:   t d|d  d t|d |fd|i|dt S )Nz[[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] loading >>> targetz <<< optimizer from configlrparams)r   r   getdict)r#   rF   rE   cfgr&   r&   r'   !instantiate_optimizer_from_configW   s   &z5AbstractAutoencoder.instantiate_optimizer_from_configc                 C   r,   r-   r.   r#   r&   r&   r'   configure_optimizers]   r1   z(AbstractAutoencoder.configure_optimizers)NNNr   r-   )__name__
__module____qualname____doc__r   floatstrr   r   tupler   listr
   boolr0   r   r4   r9   r   r?   r    TensorrA   rC   rJ   rL   __classcell__r&   r&   r$   r'   r      s:    


.r   c                       s  e Zd ZdZdddddededed	ed
eedf dedef fddZe	 dfde
deeeef deddfddZdedejfddZdefddZdefddZdd Zdejd ejd!edejfd"d#Zdejd ejd!edejfd$d%Z				dJd&ee d'ee d(ee d)ee ddf
d*d+ZdKd,d-ZdLd.ed/edefd0d1Zd.edefd2d3Zd4ejdejfd5d6ZdLd7ed8edejfd9d:Zd7edejfd;d<Z d.edeejejejf fd=d>Z!defd?d@Z"defdAdBZ#dMdefdDdEZ$defdFdGZ%e& dedefdHdIZ'  Z(S )NAutoencodingEnginez;
    Base class for all video tokenizers that we train
    N      ?F)optimizer_configlr_g_factorcompile_modelencoder_configdecoder_configloss_configregularizer_configrZ   r[   r\   c                   s  |	 dd }
|	 dd}|	 dd}t j|i |	 ttjtdkr+|r+tjndd }|t|| _	|t|| _
t|| _t|| _t|d	d
i| _|| _| j	j| _dt| j	j | _d| _d| _| j| j | _d| _d| _t| jdt| j	j  | _t| jdt| j	j  | _d| _d| _| jrt | | j!d| _"t#dtt$| j"%  d t#d|
  |
d ur| j&|
||d d S d S )N	ckpt_pathr)   r&   r*   Tr   c                 S   s   | S r-   r&   )xr&   r&   r'   <lambda>{   s    z-AutoencodingEngine.__init__.<locals>.<lambda>rD   ztorch.optim.Adam   F      r   )decayz^[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Keeping EMAs of .z][bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Use ckpt_path: )r)   r*   )'popr   r   r   r   r    r!   compiler   encoderdecoderlossregularizationr   rZ   r[   	is_causallenZtempo_dsZtemporal_compression_ratio
use_tilingZnum_sample_frames_batch_sizeZnum_latent_frames_batch_sizetile_sample_min_heighttile_sample_min_widthint
spatial_dstile_latent_min_heighttile_latent_min_widthtile_overlap_factor_heighttile_overlap_factor_widthr   r   r   r5   r   rT   buffersr0   )r#   r]   r^   r_   r`   rZ   r[   r\   r7   r8   ra   r)   r*   rj   r$   r&   r'   r   g   sH   


zAutoencodingEngine.__init__Tr(   r)   r*   r+   c                 C   s  | drtj|dd}d|v r|d n|}n| dr!t|}ntd| t| }|D ]}|D ]}t||rGt	d| d ||= q4q0| j
|d	d
\}	}
t	d| dt|	 dt|
 d |rt|	dkrst	d|	  t|
dkrt	d|
  d S d S d S )Nckptcpu)map_location
state_dictsafetensorszUnknown checkpoint: z[[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Deleting key z from state_dict.F)strictz\[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Restored from z with z missing and z unexpected keysr   z\[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Missing Keys: z_[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Unexpected Keys: )endswithr    loadload_safetensorsr/   rT   keysrematchr   load_state_dictrp   )r#   r(   r)   r*   r{   weightsr   kikmissing
unexpectedr&   r&   r'   r0      s>   



z!AutoencodingEngine.init_from_ckptr3   c                 C   s
   || j  S r-   )r   r2   r&   r&   r'   r4         
zAutoencodingEngine.get_inputc                 C   sP   t tdd | j t tdd | j  t | j  t | j  }|S )Nc                 S      | j S r-   requires_gradpr&   r&   r'   rc          z;AutoencodingEngine.get_autoencoder_params.<locals>.<lambda>c                 S   r   r-   r   r   r&   r&   r'   rc      r   )	rT   filterrk   r;   rl   rn   get_trainable_parametersrm   Z$get_trainable_autoencoder_parametersr#   rF   r&   r&   r'   get_autoencoder_params   s   z)AutoencodingEngine.get_autoencoder_paramsc                 C   s   t | j }|S r-   )rT   rm   r   r   r&   r&   r'   get_discriminator_params   s   z+AutoencodingEngine.get_discriminator_paramsc                 C   s
   | j  S r-   )rl   get_last_layerrK   r&   r&   r'   r      r   z!AutoencodingEngine.get_last_layerabblend_extentc              	   C   s   t |jd |jd |}t|D ]@}|d d d d d d | | d d f d||   |d d d d d d |d d f ||   |d d d d d d |d d f< q|S )N      minshaperange)r#   r   r   r   yr&   r&   r'   blend_v      R&zAutoencodingEngine.blend_vc                 C   s   t |jd |jd |}t|D ]@}|d d d d d d d d | | f d||   |d d d d d d d d |f ||   |d d d d d d d d |f< q|S )N   r   r   )r#   r   r   r   rb   r&   r&   r'   blend_h   r   zAutoencodingEngine.blend_hrr   rs   rx   ry   c                 C   sr   d| _ |p| j| _|p| j| _t| jdt| jj  | _t| jdt| jj  | _|p/| j	| _	|p5| j
| _
d S )NTrd   )rq   rr   rs   rt   rp   rk   ru   rv   rw   rx   ry   )r#   rr   rs   rx   ry   r&   r&   r'   enable_tiling   s   z AutoencodingEngine.enable_tilingc                 C   s
   d| _ d S )NF)rq   rK   r&   r&   r'   disable_tiling   r   z!AutoencodingEngine.disable_tilingrb   return_reg_logc                 C   s\   | j r| |}| j|| jd d\}}n| |}| j|| jd d\}}|r,||fS |S )Nrd   )n_steps)rq   tile_encodern   global_steprk   )r#   rb   r   zreg_logr&   r&   r'   rA      s   

zAutoencodingEngine.encodec                 C   s  |j dd  \}}}t| jd| j  }t| jd| j  }t| j| j }t| j| j }| j| }	| j| }
g }td||D ]Q}g }td||D ]A}d|gg}g }t	|D ])\}\}}|d d d d ||||| j ||| j f }| 
|}|| qW|tj|dd qJ|| q@g }t	|D ]O\}}g }t	|D ]:\}}|dkr| ||d  | ||}|dkr| ||d  ||}||d d d d d d d |	d |
f  q|tj|dd qtj|dd}|S Nr   r   rd   )dimr   r   )r   rt   rr   rx   rs   ry   rv   rw   r   	enumeraterk   appendr    catr   r   )r#   rb   
num_framesheightwidthoverlap_heightoverlap_widthblend_extent_heightblend_extent_widthrow_limit_heightrow_limit_widthrowsirowj	start_endZresult_zidxstart_frame	end_frametileresult_rows
result_rowencr&   r&   r'   r      sL   



.zAutoencodingEngine.tile_encodetoken_indicesc                 C   sT   t |d}t|d\}}| j|}t |d}| j|}t||d}t |d}|S )Nz... -> ... 1zb * dzb d n c -> b n (c d)zb ... d -> b d ...)r   r   rn   Zindices_to_codesZproject_outr   )r#   r   pscodesr   r&   r&   r'   indices_to_latent/  s   


z$AutoencodingEngine.indices_to_latentr   decode_from_indicesc                 C   s0   |r|  |}| jr| |}|S | |}|S r-   )r   rq   tile_decoderl   )r#   r   r   rb   r&   r&   r'   rC   9  s   


zAutoencodingEngine.decodec                 C   s  |j dd  \}}}t| jd| j  }t| jd| j  }t| j| j }t| j| j }| j| }	| j| }
g }td||D ]j}g }td||D ]Z}d|gg}g }t	|D ]B\}\}}|d d d d ||||| j ||| j f }| 
|}| jr|d |kr|d d d d d | jj f }|| qW|tj|dd qJ|| q@g }t	|D ]O\}}g }t	|D ]:\}}|dkr| ||d  | ||}|dkr| ||d  ||}||d d d d d d d |	d |
f  q|tj|dd qtj|dd}|S r   )r   rt   rv   rx   rw   ry   rr   rs   r   r   rl   ro   rk   time_downsample_factorr   r    r   r   r   )r#   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   timer   r   r   r   r   r   decr&   r&   r'   r   B  sP   



 .zAutoencodingEngine.tile_decodec                 C   sj   | j jr"t  | j|dd\}}W d    n1 sw   Y  n	| j|dd\}}| |}|||fS )NT)r   )rk   Zfix_encoderr    no_gradrA   rC   )r#   rb   r   r   r   r&   r&   r'   forwardt  s   


zAutoencodingEngine.forwardc              	   C   sz  |  |}|jdkr|d}| |\}}}|jdkr%|jdkr%|d}|  \}}| | | j|||d| j|  dd\}	}
|  | 	|	 t
jj|  d |  | | | | | j|||d| j|  dd\}}|  | 	| t
jj|  d |  | | |	|d	}||
 || | j|d
d
d
d
d |jd d }| jd|d
d
d
dd
d d S )Nr   rd      r   trainZ
last_layersplitg      4@r   )ztrain/aelossztrain/disclossT)prog_barloggeron_stepon_epochrE   Zlr_absF)r   r   r   r   	sync_dist)r4   ndim	unsqueeze
optimizerstoggle_optimizerrm   r   r   	zero_gradmanual_backwardr    nnutilsclip_grad_norm_r   stepuntoggle_optimizerr   updatelog_dictparam_groupslog)r#   r3   	batch_idxrb   r   xrecregularization_logZopt_gZopt_daelosslog_dict_aedisclosslog_dict_discr   rE   r&   r&   r'   training_step~  sj   





	



	




z AutoencodingEngine.training_stepc                 C   sT   |  ||}|   | j ||dd}|| W d    |S 1 s#w   Y  |S )NZ_ema)postfix)_validation_stepr?   r   )r#   r3   r   r   Zlog_dict_emar&   r&   r'   validation_step  s   

z"AutoencodingEngine.validation_step c              	   C   s   |  |}|jdkr|d}| |\}}}|jdkr%|jdkr%|d}| j|||d| j|  d| d\}}	| j|||d| j|  d| d\}
}| d| d|	d| d  |	| | |	 |	S )	Nr   rd   r   r   valr   r   z	/rec_loss)	r4   r   r   rm   r   r   r   r   r   )r#   r3   r   r   rb   r   r   r   r   r   r   r   r&   r&   r'   r     s8   






 


z#AutoencodingEngine._validation_stepc                 C   sL   |   }|  }| |t| jd| j | j}| || j| j}||gg fS )NrY   )r   r   rJ   r   r[   learning_raterZ   )r#   Z	ae_paramsZdisc_paramsZopt_aeZopt_discr&   r&   r'   rL     s   z'AutoencodingEngine.configure_optimizersc                 C   sr   t  }| |}| |\}}}||d< ||d< |   | |\}}}||d< W d    |S 1 s2w   Y  |S )NinputsrecsZrecs_ema)rH   r4   r?   )r#   r3   r   rb   _r   Zxrec_emar&   r&   r'   
log_images  s   



zAutoencodingEngine.log_images)NNNN)r+   N)F)r   ))rM   rN   rO   rP   r   r   rQ   rU   r   rS   rR   r   rT   r
   r0   r    rV   r4   r   r   r   rt   r   r   r   r   r   r   rA   r   r   rC   r   r   r   r   r   rL   r   r   rW   r&   r&   r$   r'   rX   b   sn    
	,8 	  


/
	"2
E$rX   )%r   abcr   
contextlibr   typingr   r   r   r   r   r	   	omegaconfr
   	packagingr   r    lightning.pytorchpytorchplZsafetensors.torchr   r   Zvidtok.modules.emar   vidtok.modules.utilr   r   r   r   Zvidtok.modules.regularizersr   r   r   LightningModuler   rX   r&   r&   r&   r'   <module>   s     P