o
    #iH\                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
lmZm Z m!Z! G dd dej"Z#G dd de#Z$dS )    N)abstractmethod)contextmanager)AnyDictTupleUnionOptionalList)
ListConfig)version)	load_file)LitEma)defaultget_obj_from_strinstantiate_from_configprint0)pack_one
unpack_one	rearrangec                
       s   e Zd ZdZ				ddedef dedef dedef def fdd	Zee	 d
fdedee
eef deddfddZedefddZdd Zed ddZedejfddZedejfddZdd ZedefddZ  ZS )!AbstractAutoencoderz5
    This is the base class for all autoencoders
    Njpg	ema_decaymonitormode	input_keyc                    s`   t    || _|d u| _|| _|d ur|| _|d ur|| _tt	j
tdkr.d| _d S d S )N2.0.0F)super__init__r   use_emar   r   r   r   parsetorch__version__automatic_optimization)selfr   r   r   r   	__class__ =/data/cameron/vidgen/VidTok/vidtok/models/autoencoder_v1_1.pyr      s   


zAbstractAutoencoder.__init__Tpathignore_keysverbosereturnc                 C      t  NNotImplementedError)r#   r(   r)   r*   r&   r&   r'   init_from_ckpt+      z"AbstractAutoencoder.init_from_ckptc                 C   r,   r-   r.   r#   batchr&   r&   r'   	get_input/   r1   zAbstractAutoencoder.get_inputc                 O   s   | j r
| |  d S d S r-   )r   	model_emar#   argskwargsr&   r&   r'   on_train_batch_end3   s   z&AbstractAutoencoder.on_train_batch_endc              
   c   s    | j r| j|   | j|  |d urtd| d z!d V  W | j r<| j|   |d ur>td| d d S d S d S | j rX| j|   |d urYtd| d w w w )NzO[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] z: Switched to EMA weightsz: Restored training weights)r   r5   store
parameterscopy_tor   restore)r#   contextr&   r&   r'   	ema_scope8   s4   


zAbstractAutoencoder.ema_scopec                 O      t d)Nz|[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] encode()-method of abstract base class calledr.   r6   r&   r&   r'   encodeK      zAbstractAutoencoder.encodec                 O   r@   )Nz|[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] decode()-method of abstract base class calledr.   r6   r&   r&   r'   decodeQ   rB   zAbstractAutoencoder.decodec                 C   s:   t d|d  d t|d |fd|i|dt S )Nz[[bold magenta]\[vidtok.models.autoencoder][AbstractAutoencoder][/bold magenta] loading >>> targetz <<< optimizer from configlrparams)r   r   getdict)r#   rF   rE   cfgr&   r&   r'   !instantiate_optimizer_from_configW   s   &z5AbstractAutoencoder.instantiate_optimizer_from_configc                 C   r,   r-   r.   r#   r&   r&   r'   configure_optimizers]   r1   z(AbstractAutoencoder.configure_optimizers)NNNr   r-   )__name__
__module____qualname____doc__r   floatstrr   r   tupler   listr
   boolr0   r   r4   r9   r   r?   r    TensorrA   rC   rJ   rL   __classcell__r&   r&   r$   r'   r      s:    


.r   c                       s\  e Zd ZdZddddddededed	ed
eedf dededef fddZe	 dfde
deeeef deddfddZdedejfddZdefddZdefddZdd Zd d! ZdVd"d#ZdWd%d&Zd'ejd(ejd)edejfd*d+Zd'ejd(ejd)edejfd,d-ZdXd.d/Z				dYd0ee d1ee d2ee d3ee ddf
d4d5ZdZd6d7ZdXd8ed9edefd:d;Z d8edefd<d=Z!d>ejdejfd?d@Z"d>ejdejfdAdBZ#dXdCedDedejfdEdFZ$dCedejfdGdHZ%d8edeejejejf fdIdJZ&defdKdLZ'defdMdNZ(d[defdPdQZ)defdRdSZ*e+ dedefdTdUZ,  Z-S )\AutoencodingEnginez;
    Base class for all video tokenizers that we train
    N      ?F)optimizer_configlr_g_factorcompile_model
use_tilingencoder_configdecoder_configloss_configregularizer_configrZ   r[   r\   r]   c                   s  |
 dd }|
 dd}|
 dd}|
 dd| _|
 dd	| _t j|	i |
 ttjtd
kr9|r9tj	ndd }|t
|| _|t
|| _t
|| _t
|| _t|ddi| _|| _| j| jj | _d| _| jj| _dt| jj | _|| _d	| _| j| j | _d| _d| _t| jdt| jj  | _t| jdt| jj  | _ d| _!d| _"| j#rt$| | j%d| _&t'dtt(| j&)  d t'd|  |d ur| j*|||d d S d S )N	ckpt_pathr)   r&   r*   Tr]   Ft_chunk_enc   r   c                 S   s   | S r-   r&   )xr&   r&   r'   <lambda>~   s    z-AutoencodingEngine.__init__.<locals>.<lambda>rD   ztorch.optim.Adam      r   )decayz^[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Keeping EMAs of .z][bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Use ckpt_path: )r)   r*   )+popr]   rc   r   r   r   r   r    r!   compiler   encoderdecoderlossregularizationr   rZ   r[   time_downsample_factort_chunk_decZuse_overlap	is_causallenZtempo_dsZtemporal_compression_ratioZnum_sample_frames_batch_sizeZnum_latent_frames_batch_sizetile_sample_min_heighttile_sample_min_widthint
spatial_dstile_latent_min_heighttile_latent_min_widthtile_overlap_factor_heighttile_overlap_factor_widthr   r   r   r5   r   rT   buffersr0   )r#   r^   r_   r`   ra   rZ   r[   r\   r]   r7   r8   rb   r)   r*   rl   r$   r&   r'   r   g   sP   


zAutoencodingEngine.__init__Tr(   r)   r*   r+   c                 C   s  | drtj|dd}d|v r|d n|}n| dr!t|}ntd| t| }|D ]}|D ]}t||rGt	d| d ||= q4q0| j
|d	d
\}	}
t	d| dt|	 dt|
 d |rt|	dkrst	d|	  t|
dkrt	d|
  d S d S d S )Nckptcpu)map_location
state_dictsafetensorszUnknown checkpoint: z[[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Deleting key z from state_dict.F)strictz\[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Restored from z with z missing and z unexpected keysr   z\[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Missing Keys: z_[bold magenta]\[vidtok.models.autoencoder][AutoencodingEngine][/bold magenta] Unexpected Keys: )endswithr    loadload_safetensorsr/   rT   keysrematchr   load_state_dictrt   )r#   r(   r)   r*   r~   weightsr   kikmissing
unexpectedr&   r&   r'   r0      s>   



z!AutoencodingEngine.init_from_ckptr3   c                 C   s
   || j  S r-   )r   r2   r&   r&   r'   r4         
zAutoencodingEngine.get_inputc                 C   sP   t tdd | j t tdd | j  t | j  t | j  }|S )Nc                 S      | j S r-   requires_gradpr&   r&   r'   rf          z;AutoencodingEngine.get_autoencoder_params.<locals>.<lambda>c                 S   r   r-   r   r   r&   r&   r'   rf      r   )	rT   filterrm   r;   rn   rp   get_trainable_parametersro   Z$get_trainable_autoencoder_parametersr#   rF   r&   r&   r'   get_autoencoder_params   s   z)AutoencodingEngine.get_autoencoder_paramsc                 C   s   t | j }|S r-   )rT   ro   r   r   r&   r&   r'   get_discriminator_params   s   z+AutoencodingEngine.get_discriminator_paramsc                 C   s
   | j  S r-   )rn   get_last_layerrK   r&   r&   r'   r      r   z!AutoencodingEngine.get_last_layerc                 C   s&   |  D ]\}}t|drd |_qd S )Ncausal_cache)named_moduleshasattrr   )r#   parentnamemoduler&   r&   r'   _empty_causal_cached   s
   
z'AutoencodingEngine._empty_causal_cachedc                 C   s"   |   D ]
}t|dr||_qd S )Nis_first_chunk)modulesr   r   )r#   r   r   r&   r&   r'   _set_first_chunk   s
   
z#AutoencodingEngine._set_first_chunkr   c                 C   s,   |D ]}|  D ]
}t|dr||_qqd S )Ncache_offset)r   r   r   )r#   r   r   r   	submoduler&   r&   r'   _set_cache_offset   s   
z$AutoencodingEngine._set_cache_offsetabblend_extentc              	   C   s   t |jd |jd |}t|D ]@}|d d d d d d | | d d f d||   |d d d d d d |d d f ||   |d d d d d d |d d f< q|S )N      minshaperange)r#   r   r   r   yr&   r&   r'   blend_v      R&zAutoencodingEngine.blend_vc                 C   s   t |jd |jd |}t|D ]@}|d d d d d d d d | | f d||   |d d d d d d d d |f ||   |d d d d d d d d |f< q|S )N   r   r   )r#   r   r   r   re   r&   r&   r'   blend_h   r   zAutoencodingEngine.blend_hc                 C   sP   ddgg}d}|}	 ||kr	 |S t |||r| jn| j }|||g |}q
)Nr   r   )r   rr   rc   append)r#   tdecoder_mode	start_endstartendr&   r&   r'   build_chunk_start_end   s   
z(AutoencodingEngine.build_chunk_start_endru   rv   r{   r|   c                 C   sr   d| _ |p| j| _|p| j| _t| jdt| jj  | _t| jdt| jj  | _|p/| j	| _	|p5| j
| _
d S )NTrg   )r]   ru   rv   rw   rt   rm   rx   ry   rz   r{   r|   )r#   ru   rv   r{   r|   r&   r&   r'   enable_tiling  s   z AutoencodingEngine.enable_tilingc                 C   s
   d| _ d S )NF)r]   rK   r&   r&   r'   disable_tiling  r   z!AutoencodingEngine.disable_tilingre   return_reg_logc                 C   sr   |  | j | d | jr | |}| j|| jd d\}}n| |}| j|| jd d\}}|r7||fS |S )NTrg   )n_steps)r   rm   r   r]   tile_encoderp   global_step)r#   re   r   zreg_logr&   r&   r'   rA     s   


zAutoencodingEngine.encodec                 C   s  |j dd  \}}}t| jd| j  }t| jd| j  }t| j| j }t| j| j }| j| }	| j| }
g }td||D ]X}g }td||D ]H}| 	|}g }t
|D ]0\}\}}| |dk |d d d d ||||| j ||| j f }| |}|| qW|tj|dd qJ|| q@g }t
|D ]O\}}g }t
|D ]:\}}|dkr| ||d  | ||}|dkr| ||d  ||}||d d d d d d d |	d |
f  q|tj|dd qtj|dd}|S )Nr   r   rg   dimr   r   )r   rw   ru   r{   rv   r|   ry   rz   r   r   	enumerater   rm   r   r    catr   r   )r#   re   
num_framesheightwidthoverlap_heightoverlap_widthblend_extent_heightblend_extent_widthrow_limit_heightrow_limit_widthrowsirowjr   result_zidxstart_frame	end_frametileresult_rows
result_rowencr&   r&   r'   r   '  sN   



.zAutoencodingEngine.tile_encodetoken_indicesc                 C   s   |  dks
J d|j\}}}}|d|dd}| j|}|dddd||jd d}| j|}|||||ddddddS )Nr   z-token_indices should be of shape (b, t, h, w)r   r   rg   r   )r   r   	unsqueezereshaperp   Zindices_to_codespermuteZproject_out)r#   r   r   r   hwcodesr   r&   r&   r'   indices_to_latentU  s     z$AutoencodingEngine.indices_to_latentc           	      C   sp   |j d }| j|dd}g }|D ] \}}|d d ||d d d d f }| |}||  qtj|ddS )Nr   Tr   rg   r   )r   r   r   r   cloner    r   )	r#   r   r   r   r   r   r   chunkZchunk_zr&   r&   r'   tile_indices_to_latent^  s   
 
z)AutoencodingEngine.tile_indices_to_latentr   decode_from_indicesc                 C   sX   |r| j r| |}n| |}| | j | d | j r%| |}|S | |}|S )NT)r]   r   r   r   rn   r   tile_decode)r#   r   r   re   r&   r&   r'   rC   h  s   



zAutoencodingEngine.decodec                 C   s  |j dd  \}}}t| jd| j  }t| jd| j  }t| j| j }t| j| j }| j| }	| j| }
g }td||D ]-}g }td||D ]}| j	r| j
jdv s[J d| j
jdkr| | jgd | | jjd j| jjd gd | | jjd j| jjd | jjgd nf| j
jdkr| | jgd | | jjd j| jjd | jjd | jjgd n>| | jgd | | jjd j| jjd gd | | jjd j| jjd gd | | jjd j| jjd | jjgd	 | j|d
d}g }t|D ][\}\}}| |dk |d d d d || j	r&|d |kr&|d n|||| j ||| j f }| |}| j	rW|d |krW|d d d d d | j
j f }|| q|tj|dd qK|| q@g }t|D ]S\}}g }t|D ]=\}}|dkr| ||d  | ||}|dkr| ||d  ||}||d d d d d d d |	d |
f  q|tj|dd qutj|dd}|S )Nr   r   r   )rg   r      z4Only support 2x, 4x or 8x temporal downsampling now.r   rg   r   r   Tr   r   )r   rw   ry   r{   rz   r|   ru   rv   r   rs   rm   rq   r   rn   Zup_temporalupsampleconv_outr   r   r   r   r    r   r   r   )r#   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   timer   r   r   r   r   r   decr&   r&   r'   r   x  sl   

"*4""($
 

0zAutoencodingEngine.tile_decodec                 C   s   | j jr"t  | j|dd\}}W d    n1 sw   Y  n	| j|dd\}}| |}|jd |jd krL|d d d d |jd  d df }|||fS )NT)r   rg   .)rm   Zfix_encoderr    no_gradrA   rC   r   )r#   re   r   r   r   r&   r&   r'   forward  s   

$
zAutoencodingEngine.forwardc              	   C   sz  |  |}|jdkr|d}| |\}}}|jdkr%|jdkr%|d}|  \}}| | | j|||d| j|  dd\}	}
|  | 	|	 t
jj|  d |  | | | | | j|||d| j|  dd\}}|  | 	| t
jj|  d |  | | |	|d	}||
 || | j|d
d
d
d
d |jd d }| jd|d
d
d
dd
d d S )Nr   rg      r   trainZ
last_layersplitg      4@r   )ztrain/aelossztrain/disclossT)prog_barloggeron_stepon_epochrE   Zlr_absF)r   r   r   r   	sync_dist)r4   ndimr   
optimizerstoggle_optimizerro   r   r   	zero_gradmanual_backwardr    nnutilsclip_grad_norm_r   stepuntoggle_optimizerr   updatelog_dictparam_groupslog)r#   r3   	batch_idxre   r   xrecregularization_logZopt_gZopt_daelosslog_dict_aedisclosslog_dict_discr  rE   r&   r&   r'   training_step  sj   





	



	




z AutoencodingEngine.training_stepc                 C   sT   |  ||}|   | j ||dd}|| W d    |S 1 s#w   Y  |S )NZ_ema)postfix)_validation_stepr?   r
  )r#   r3   r  r  Zlog_dict_emar&   r&   r'   validation_step
  s   

z"AutoencodingEngine.validation_step c              	   C   s   |  |}|jdkr|d}| |\}}}|jdkr%|jdkr%|d}| j|||d| j|  d| d\}}	| j|||d| j|  d| d\}
}| d| d|	d| d  |	| | |	 |	S )	Nr   rg   r   r   valr   r   z	/rec_loss)	r4   r   r   ro   r   r   r  r
  r  )r#   r3   r  r  re   r   r  r  r  r  r  r  r&   r&   r'   r    s8   






 


z#AutoencodingEngine._validation_stepc                 C   sL   |   }|  }| |t| jd| j | j}| || j| j}||gg fS )NrY   )r   r   rJ   r   r[   learning_raterZ   )r#   Z	ae_paramsZdisc_paramsZopt_aeZopt_discr&   r&   r'   rL   5  s   z'AutoencodingEngine.configure_optimizersc                 C   sr   t  }| |}| |\}}}||d< ||d< |   | |\}}}||d< W d    |S 1 s2w   Y  |S )NinputsrecsZrecs_ema)rH   r4   r?   )r#   r3   r  re   _r  Zxrec_emar&   r&   r'   
log_imagesB  s   



zAutoencodingEngine.log_images)T)r   )F)NNNN)r+   N)r  ).rM   rN   rO   rP   r   r   rQ   rU   r   rS   rR   r   rT   r
   r0   r    rV   r4   r   r   r   r   r   r   rw   r   r   r   r   r   r   r   rA   r   r   r   rC   r   r   r  r  r  rL   r   r  rW   r&   r&   r$   r'   rX   b   s~    
	
,@ 	

  


.	
"BE$rX   )%r   abcr   
contextlibr   typingr   r   r   r   r   r	   	omegaconfr
   	packagingr   r    Zlightning.pytorchpytorchplZsafetensors.torchr   r   Zvidtok.modules.emar   vidtok.modules.utilr   r   r   r   Zvidtok.modules.regularizersr   r   r   LightningModuler   rX   r&   r&   r&   r'   <module>   s     P