o
     ݱi                     @   s   d dl mZ d dlZd dlmZ d dlm  mZ d dl	m
Z
 d dlmZ d dlmZmZmZmZmZ G dd dejZG dd	 d	ejZG d
d dejZG dd dejZG dd dejZG dd deZdS )    )OptionalN)AutoencoderKL)	rearrange)
CenterCropColorJitter
RandomCropResize	Normalizec                   @   s    e Zd ZdZdejfddZdS )ToTensorz}
    Convert a batch of images from (B, H, W, C) to (B, C, H, W)
    and normalize the pixel values to the range [0, 1].
    inputsc                 C   s   | d  dS )N)r            g     o@)permute
contiguousfloatdiv_selfr    r   D/data/cameron/vidgen/unified-world-model/models/common/transforms.pyforward   s   zToTensor.forwardN)__name__
__module____qualname____doc__torchTensorr   r   r   r   r   r
      s    r
   c                       s>   e Zd ZdZdeeef f fddZdejfddZ	  Z
S )AutoRandomCropzR
    Perform random cropping during training and center cropping during eval.
    sizec                    s    t    || _t|d| _d S )N)r   )super__init__r   r   random_crop)r   r   	__class__r   r   r!       s   
zAutoRandomCrop.__init__r   c                 C   s    | j r| |S tj|| jdS )N)imgoutput_size)trainingr"   ttfcenter_cropr   r   r   r   r   r   %   s   
zAutoRandomCrop.forward)r   r   r   r   tupleintr!   r   r   r   __classcell__r   r   r#   r   r      s    r   c                	       sF   e Zd ZdZdedededee f fddZdejfd	d
Z	  Z
S )AutoColorJitterzH
    Perform color jittering during training and no-op during eval.
    
brightnesscontrast
saturationhuec                    s$   t    t|||t|d| _d S )Nr.   r/   r0   r1   )r    r!   r   r*   color_jitter)r   r.   r/   r0   r1   r#   r   r   r!   2   s   
zAutoColorJitter.__init__r   c                 C   s   | j r| |S |S N)r'   r3   r   r   r   r   r   A   s   
zAutoColorJitter.forward)r   r   r   r   r   r*   r!   r   r   r   r,   r   r   r#   r   r-   -   s    r-   c                       s@   e Zd ZdZ fddZdejfddZdejfdd	Z  Z	S )
VAEDownsamplez4
    Downsample images using a pre-trained VAE.
    c                    s   t    tg dg ddd| _tg dg ddd| _tg ddddd}tg d	dddd}| d
| | d| t	
d| _| j D ]}d|_qN| jjj| _d S )N)      ?r6   r6   Tmeanstdinplace)r;   r;   )r   r   r   )        r<   r<   r<   r      )      @r>   r>   r>   shiftscalezstabilityai/sdxl-vaeF)r    r!   r	   norminv_normr   tensorviewregister_bufferr   from_pretrainedvae
parametersrequires_gradconfigscaling_factor)r   r?   r@   pr#   r   r   r!   M   s   
zVAEDownsample.__init__imagesc                 C   s@   |  |}| j|j }|| j}|| j	| j
}|S r4   )rA   rG   encodeZlatent_distsamplemul_rK   sub_r?   r   r@   )r   rM   featsr   r   r   r   _   s
   
zVAEDownsample.forwardrR   c                 C   s<   | | j| j}|| j}| j|j}| 	|}|S r4   )
rP   r@   add_r?   r   rK   rG   decoderO   rB   )r   rR   rM   r   r   r   inversef   s
   
zVAEDownsample.inverse)
r   r   r   r   r!   r   r   r   rU   r,   r   r   r#   r   r5   H   s
    r5   c                       sx   e Zd ZdZ						ddeeeef  deeeef  dedee d	ed
ef fddZ	e
dd Zdd Z  ZS )ImageTransformz3
    Apply a sequence of transforms to images.
    NTFresize_shape
crop_shaper"   r3   
downsampleimagenet_normc              	      s   t    t }|t  |d ur|t| |d ur.|r'|t| n|t| |d urG|t|d |d |d t	|d d |rV|rOt
d |t  n|rf|tg dg dd	d
 tj| | _d S )Nr.   r/   r0   r1   r2   z=Disabling imagenet normalization since downsample is enabled.)g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?Tr7   )r    r!   listappendr
   r   r   r   r-   r*   printr5   r	   nn
Sequential	transform)r   rW   rX   r"   r3   rY   rZ   r`   r#   r   r   r!   s   s:   
	

zImageTransform.__init__c                 C   s   t | jd ts
J | jd S )Nr;   )
isinstancer`   r5   )r   r   r   r   rG      s   
zImageTransform.vaec                 C   s
   |  |S r4   )r`   )r   rM   r   r   r   r      s   
zImageTransform.forward)NNTNFT)r   r   r   r   r   r*   r+   booldictr!   propertyrG   r   r,   r   r   r#   r   rV   n   s0    3
rV   c                   @   s   e Zd ZdZdd ZdS )VideoTransformzQ
    Flatten videos to images, apply transforms, and reshape back to videos.
    c                 C   s0   |j d }t|d}| |}t|d|d}|S )Nr   zb t h w c-> (b t) h w cz(b t) c h w-> b c t h w)t)shaper   r`   )r   rM   
num_framesr   r   r   r      s
   


zVideoTransform.forwardN)r   r   r   r   r   r   r   r   r   re      s    re   )typingr   r   torch.nnr^   Z!torchvision.transforms.functional
transforms
functionalr(   	diffusersr   einopsr   Ztorchvision.transformsr   r   r   r   r	   Moduler
   r   r-   r5   rV   re   r   r   r   r   <module>   s    	
&A