o
    ?߱iRE                     @   s   d dl Z d dlmZmZ d dlmZ d dlZd dlm  m	Z
 d dlmZ d dlmZ d dlmZ G dd dejjeZG d	d
 d
eZG dd deZG dd deZG dd deZdS )    N)ABCabstractmethod)Optional)rank0_first)	CRED_ENVS)load_from_s3_with_cachec                       s   e Zd ZdZddedef fddZedefd	d
Ze	de
jde
jfddZe	de
jde
jfddZedefddZ  ZS )BaseVAEz
    Abstract base class for a Variational Autoencoder (VAE).

    All subclasses should implement the methods to define the behavior for encoding
    and decoding, along with specifying the latent channel size.
       vaechannelnamec                    s   t    || _|| _d S N)super__init__r   r   )selfr   r   	__class__ [/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/predict2/tokenizers/base_vae.pyr   %   s   

zBaseVAE.__init__returnc                 C   s   | j S )zC
        Returns the number of latent channels in the VAE.
        )r   r   r   r   r   	latent_ch*   s   zBaseVAE.latent_chstatec                 C      dS )z
        Encodes the input tensor into a latent representation.

        Args:
        - state (torch.Tensor): The input tensor to encode.

        Returns:
        - torch.Tensor: The encoded latent tensor.
        Nr   )r   r   r   r   r   encode1      zBaseVAE.encodelatentc                 C   r   )z
        Decodes the latent representation back to the original space.

        Args:
        - latent (torch.Tensor): The latent tensor to decode.

        Returns:
        - torch.Tensor: The decoded tensor.
        Nr   )r   r   r   r   r   decode>   r   zBaseVAE.decodec                 C   s   t d)zC
        Returns the spatial reduction factor for the VAE.
        zQThe spatial_compression_factor property must be implemented in the derived class.)NotImplementedErrorr   r   r   r   spatial_compression_factorK   s   z"BaseVAE.spatial_compression_factor)r	   r
   )__name__
__module____qualname____doc__intstrr   propertyr   r   torchTensorr   r   r   __classcell__r   r   r   r   r      s    r   c                       s   e Zd ZdZdddee dfdededededed	ed
df fddZded
dfddZ	e
 de
jd
e
jfddZe
 de
jd
e
jfddZdd Z  ZS )BasePretrainedImageVAEa#  
    A base class for pretrained Variational Autoencoder (VAE) that loads mean and standard deviation values
    from a remote store, handles data type conversions, and normalization
    using provided mean and standard deviation values for latent space representation.
    Derived classes should load pre-trained encoder and decoder components from a remote store

    Attributes:
        latent_mean (Tensor): The mean used for normalizing the latent representation.
        latent_std (Tensor): The standard deviation used for normalizing the latent representation.
        dtype (dtype): Data type for model tensors, determined by whether bf16 is enabled.

    Args:
        mean_std_fp (str): File path to the pickle file containing mean and std of the latent space.
        latent_ch (int, optional): Number of latent channels (default is 16).
        is_image (bool, optional): Flag to indicate whether the output is an image (default is True).
        is_bf16 (bool, optional): Flag to use Brain Floating Point 16-bit data type (default is True).
       Tr   mean_std_fpr   is_imageis_bf16load_mean_stdr   Nc           	         s   t  || |rtjntj}|| _|| _|| _|| _|| _	|d u s)t
|ts)J |d u r1d | _ntj|s<tjdv rDdd |d| _ntd| d| | d S )N)proddevstgs3)backendpath_mappings3_credential_pathzInvalid s3_credential_path: z  and APP_ENV is not prod/dev/stg)r   r   r'   bfloat16float32dtyper-   r,   r   r/   
isinstancer%   backend_argsospathexistsr   APP_ENVFileNotFoundErrorregister_mean_std)	r   r   r,   r   r-   r.   r6   r/   r9   r   r   r   r   f   s"   

zBasePretrainedImageVAE.__init__c                 C   s  | j r
d| jddgnd| jdddg}| jrV|dd }t|d| j d| dttj	 i| j
d\}}| jd|| jj| d	d
 | jd|| jj| d	d
 d S tj rcttj	 ntd}| jdtj|| j|dd	d
 | jdtj|| j|dd	d
 d S )N   .vae/z
_mean_std.map_locationZeasy_io_kwargsr;   latent_meanF
persistent
latent_stdcpu)r9   device)r-   r   r/   splitr   r   r'   rM   cudacurrent_devicer;   register_buffertor9   reshapeis_availablezerosones)r   r,   target_shapeZ	extentionrH   rK   rM   r   r   r   rA      s<   $

$
z(BasePretrainedImageVAE.register_mean_stdr   c                 C   s   |j }| j|}| j|}| || j }t|tjrnt|tr3t|d tjs.J |d }nt	d||| | S )zp
        Encode the input state to latent space; also handle the dtype conversion, mean and std scaling
        r   zInvalid type of encoded state)
r9   rH   rR   rK   encoderr:   r'   r(   tuple
ValueError)r   r   in_dtyperH   rK   Zencoded_stater   r   r   r      s   

zBasePretrainedImageVAE.encoder   c                 C   s:   |j }|| j| | j| }| || j |S )zj
        Decode the input latent to state; also handle the dtype conversion, mean and std scaling
        )r9   rK   rR   rH   decoder)r   r   r[   r   r   r   r      s   zBasePretrainedImageVAE.decodec                 O   s$   ~~| j | j | j| j dS z
        Resets the data type of the encoder and decoder to the model's default data type.

        Args:
            *args, **kwargs: Unused, present to allow flexibility in method calls.
        N)r\   rR   r9   rX   r   argskwargsr   r   r   reset_dtype   s   z"BasePretrainedImageVAE.reset_dtype)r    r!   r"   r#   r   r%   r$   boolr   rA   r'   no_gradr(   r   r   ra   r)   r   r   r   r   r*   S   s8    	 #r*   c                       sz   e Zd ZdZ					ddedededed	ee d
edededef fddZdeddfddZ	deddfddZ
  ZS )JITVAEa?  
    A JIT compiled Variational Autoencoder (VAE) that loads pre-trained encoder
    and decoder components from a remote store, handles data type conversions, and normalization
    using provided mean and standard deviation values for latent space representation.

    Attributes:
        encoder (Module): The JIT compiled encoder loaded from storage.
        decoder (Module): The JIT compiled decoder loaded from storage.
        latent_mean (Tensor): The mean used for normalizing the latent representation.
        latent_std (Tensor): The standard deviation used for normalizing the latent representation.
        dtype (dtype): Data type for model tensors, determined by whether bf16 is enabled.

    Args:
        enc_fp (str): File path to the encoder's JIT file on the remote store.
        dec_fp (str): File path to the decoder's JIT file on the remote store.
        name (str): Name of the model, used for differentiating cache file paths.
        mean_std_fp (str): File path to the pickle file containing mean and std of the latent space.
        latent_ch (int, optional): Number of latent channels (default is 16).
        is_image (bool, optional): Flag to indicate whether the output is an image (default is True).
        is_bf16 (bool, optional): Flag to use Brain Floating Point 16-bit data type (default is True).
    Nr+   Tenc_fpdec_fpr   r,   r6   r   r-   r.   r/   c
           
   	      s2   t  j|||||||	d | | | | d S )N)r6   r/   )r   r   load_encoderload_decoder)
r   re   rf   r   r,   r6   r   r-   r.   r/   r   r   r   r      s   
	zJITVAE.__init__r   c                 C   `   t |d| j ddttj i| jd| _| j  | j	 D ]}d|_
q!| j| j dS )z
        Load the encoder from the remote store.

        Args:
        - enc_fp (str): File path to the encoder's JIT file on the remote store.
        rE   _enc.jitrF   rG   FN)r   r   r'   rM   rO   rP   r;   rX   eval
parametersrequires_gradrR   r9   )r   re   paramr   r   r   rg         
zJITVAE.load_encoderc                 C   ri   )z
        Load the decoder from the remote store.

        Args:
        - dec_fp (str): File path to the decoder's JIT file on the remote store.
        rE   _dec.jitrF   rG   FN)r   r   r'   rM   rO   rP   r;   r\   rk   rl   rm   rR   r9   )r   rf   rn   r   r   r   rh     ro   zJITVAE.load_decoder)Nr+   TTT)r    r!   r"   r#   r%   r   r$   rb   r   rg   rh   r)   r   r   r   r   rd      s8    	
rd   c                       s~   e Zd ZdZ				ddededejjded	ed
ee de	de
de
f fddZdededejjddfddZdd Z  ZS )StateDictVAEa  
    A Variational Autoencoder (VAE) that loads pre-trained weights into
    provided encoder and decoder components from a remote store, handles data type conversions,
    and normalization using provided mean and standard deviation values for latent space representation.

    Attributes:
        encoder (Module): The encoder with weights loaded from storage.
        decoder (Module): The decoder with weights loaded from storage.
        latent_mean (Tensor): The mean used for normalizing the latent representation.
        latent_std (Tensor): The standard deviation used for normalizing the latent representation.
        dtype (dtype): Data type for model tensors, determined by whether bf16 is enabled.

    Args:
        enc_fp (str): File path to the encoder's JIT file on the remote store.
        dec_fp (str): File path to the decoder's JIT file on the remote store.
        vae (Module): Instance of VAE with not loaded weights
        name (str): Name of the model, used for differentiating cache file paths.
        mean_std_fp (str): File path to the pickle file containing mean and std of the latent space.
        latent_ch (int, optional): Number of latent channels (default is 16).
        is_image (bool, optional): Flag to indicate whether the output is an image (default is True).
        is_bf16 (bool, optional): Flag to use Brain Floating Point 16-bit data type (default is True).
    Nr+   Tre   rf   r
   r   r,   r6   r   r-   r.   c
           
         s*   t  j|||||	|d | ||| d S )N)r6   )r   r   load_encoder_and_decoder)
r   re   rf   r
   r   r,   r6   r   r-   r.   r   r   r   r   <  s   zStateDictVAE.__init__r   c                 C   s   t |d| j ddttj i| jd}t |d| j ddttj i| jd}| | B }dd | D }|	| |
  | D ]}d|_qJ|| j || _| jj| _| jj| _d	S )
z
        Load the encoder from the remote store.

        Args:
        - vae_fp (str): File path to the vae's state dict file on the remote store.
        - vae (str): VAE module into which weights will be loaded.
        rE   rj   rF   rG   rp   c                 S   s   i | ]\}}|d vr||qS ))zencoder.patcher.waveletszencoder.patcher._arangezdecoder.unpatcher.waveletszdecoder.unpatcher._aranger   ).0kvr   r   r   
<dictcomp>c  s    z9StateDictVAE.load_encoder_and_decoder.<locals>.<dictcomp>FN)r   r   r'   rM   rO   rP   r;   
state_dictitemsload_state_dictrk   rl   rm   rR   r9   r
   r   rX   r   r\   )r   re   rf   r
   Zstate_dict_encZstate_dict_decZjit_weights_state_dictrn   r   r   r   rr   L  s0   

z%StateDictVAE.load_encoder_and_decoderc                 O   s   ~~| j | j dS r]   r
   rR   r9   r^   r   r   r   ra   z  s   zStateDictVAE.reset_dtype)Nr+   TT)r    r!   r"   r#   r%   r'   nnModuler   r$   rb   r   rr   ra   r)   r   r   r   r   rq   $  s6    	
.rq   c                       s   e Zd Zddededdf fdd	Zd
d ZedddZe	 dej
dej
fddZe	 dej
dej
fddZedefddZ  ZS )SDVAEr+   FT	count_stdis_downsampler   Nc              	      s   t  jddd tj| _| jdtjg d| jd ddddd	d
 | jddtjg d| jddddd | j	 d	d
 || _
|| _|| _|   |   d S )N   Zsd_vae)r   r   scale)gGz@g{Gz@gGz@g=
ףp=
@)r9   rB   rD   FrI   biasg      )g=
ףp=@g      
@gQ?g333333)r   r   r'   r7   r9   rQ   tensor
reciprocalrS   r   
batch_sizer~   r   load_vaera   )r   r   r~   r   r   r   r   r     s"   "(zSDVAE.__init__c                 O   s   ~~| j | j d S r   rz   r^   r   r   r   ra     s   zSDVAE.reset_dtypec                 C   sb   dt jd< dt jd< dd l}d}z|jjj|dd}W n   |jj|}Y | d| _d S )	N1HF_HUB_DISABLE_SYMLINKS_WARNINGHF_HUB_DISABLE_PROGRESS_BARSr   zstabilityai/sd-vae-ft-mseT)local_files_onlyF)	r<   environ	diffusersmodelsAutoencoderKLfrom_pretrainedrk   requires_grad_r
   )r   r   Zvae_namer
   r   r   r   r     s   

zSDVAE.load_vaer   c           	      C   s   | j r|jdd \}}tj||d |d fddd}|j}|| j}|d d }| j|d	 }|j|j	}}| j
rF|t||  }n|}|| j }|| j }||S )
z-
        state : pixel range [-1, 1]
        N   bilinearFsizemodealign_corners      ?g       @latent_dist)r   shapeFinterpolater9   rR   r
   r   meanstdr~   r'   
randn_liker   r   )	r   r   _h_wr[   r   r   r   r   r   r   r   r     s   


zSDVAE.encoder   c                    s   |j }| j }| j }| j }t fdd| jD } jr>|j	dd  \}}t
j||d |d fddd}||d d S )	Nc                    s   g | ]
} j |d  qS )sample)r
   r   )rs   batchr   r   r   
<listcomp>  s    z SDVAE.decode.<locals>.<listcomp>r   r   r   Fr   r   )r9   rR   r   r   r'   catrN   r   r   r   r   r   )r   r   r[   r   r   r   r   r   r     s   

 zSDVAE.decodec                 C   r   )N   r   r   r   r   r   r     s   z SDVAE.spatial_compression_factor)r+   FT)r   N)r    r!   r"   rb   r   ra   r   r   r'   rc   r(   r   r   r&   r$   r   r)   r   r   r   r   r}     s    r}   )r<   abcr   r   typingr   r'   Ztorch.nn.functionalr{   
functionalr   Z1cosmos_predict2._src.imaginaire.utils.distributedr   Acosmos_predict2._src.imaginaire.utils.env_parsers.cred_env_parserr   Z.cosmos_predict2._src.imaginaire.utils.s3_utilsr   r|   r   r*   rd   rq   r}   r   r   r   r   <module>   s   6}Ta