o
     ݱi                     @   sz   d dl mZmZmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZ G dd dejZdS )	    )DictOptionalTupleUnionN)	rearrange)CLIPTextEncoder)VideoTransformVAEDownsample)ResNetImageEncoderViTImageEncoderc                       s   e Zd Z								d'dedededeeef d	eeef d
edee dede	dedef fddZ
deeee f fddZ		d(deejeej f dedefddZdefddZdefdd Zdedefd!d"Zd#d$ Zd%d& Z  ZS ))UWMObservationEncoderNTFvit
shape_meta
num_frames	embed_dimresize_shape
crop_shaperandom_cropcolor_jitterimagenet_normvision_backboneuse_low_dimuse_languagec                    s   t    || _|| _tdd |d  D | _tdd |d  D | _t| j| _	|| _
t|||||d| _|	dkrGt| j	|d| _n|	dkrTt| j	|d| _ntd	|	 |
| _|| _|rht|d
nd | _t | _d S )Nc                 S       g | ]\}}|d  dkr|qS )typergb .0kvr   r   B/data/cameron/vidgen/unified-world-model/models/uwm/obs_encoder.py
<listcomp>        z2UWMObservationEncoder.__init__.<locals>.<listcomp>obsc                 S   r   )r   low_dimr   r   r   r   r!   r"   "   r#   )r   r   r   r   r   r   )	num_viewsr   resnetzUnsupported vision backbone: )r   )super__init__r   r   sorteditemsrgb_keyslow_dim_keyslenr&   r   r   obs_transformr   img_encoderr
   NotImplementedErrorr   r   r   text_encoderr	   vae)selfr   r   r   r   r   r   r   r   r   r   r   	__class__r   r!   r)      sD   
	

zUWMObservationEncoder.__init__	obs_dictsc           	         s   t |tr|g}d}nd}t |tsJ t|}dd t|D }| jD ], tj fdd|D dd}| |}|j	|dd}t
|D ]\}}|| | qDq$dd |D }|r^|d }|S )	za
        Accept a list of observation dictionaries and apply the same transform to each.
        TFc                 S   s   g | ]}g qS r   r   )r   _r   r   r!   r"   W   s    z9UWMObservationEncoder.apply_transform.<locals>.<listcomp>c                    s   g | ]}|  qS r   r   )r   obs_dictkeyr   r!   r"   Y       r   dimc                 S   s   g | ]	}t j|d dqS )   r=   )torchstack)r   imgsr   r   r!   r"   a   s    )
isinstancedictlistr.   ranger,   r@   catr/   chunk	enumerateappend)	r4   r7   is_singletonZnum_obstransformed_imgsZcombined_imgsZchunked_imgsiimgr   r:   r!   apply_transformJ   s$   


z%UWMObservationEncoder.apply_transformH   	imgs_listinversemicrobatch_sizec                 C   s   t |tjr|g}d}nd}t |tsJ tj|dd}|jdd \}}t|d}g }td|jd |D ]}	||	|	|  }
|rH| j	|
}n| |
}|
| q5tj|dd}t|d||d	}|std
d |D }t|j|dd}|S )z
        Accept a list of images and apply VAE to downsample or upsample images.
        If inverse is False, downsample images. Otherwise, upsample images.
        Process images in microbatches to reduce memory usage.
        TFr   r=   N   zb v c t h w -> (b v t) c h wz(b v t) c h w -> b v c t h w)br    c                 S   s   g | ]}|j d  qS )r   )shape)r   rN   r   r   r!   r"      s    z3UWMObservationEncoder.apply_vae.<locals>.<listcomp>)rC   r@   TensorrE   rG   rV   r   rF   r3   rR   rJ   split)r4   rQ   rR   rS   rK   rB   BVrL   rM   Z
batch_imgsZbatch_transformed_imgschunk_sizesr   r   r!   	apply_vaef   s.   

zUWMObservationEncoder.apply_vaecurr_obs_dictc                    s   |   }| |}| jr* fdd| jD }tj|ddd}tj||gdd}| jrA| j d  d d}tj||gdd}|S )	Nc                       g | ]} | qS r   r   r   r;   r]   r   r!   r"      r<   z9UWMObservationEncoder.encode_curr_obs.<locals>.<listcomp>r=   r?   	input_idsattention_maskrb   rc   )	rO   r0   r   r-   r@   rG   flattenr   r2   )r4   r]   	curr_imgs
curr_featslow_dims
lang_featsr   r`   r!   encode_curr_obs   s   

z%UWMObservationEncoder.encode_curr_obsnext_obs_dictc                 C   s   |  |}| |}|S )N)rO   r\   )r4   rk   	next_imgsnext_latentsr   r   r!   encode_next_obs   s   

z%UWMObservationEncoder.encode_next_obsc           	         s   |   |g\}}| |}| jr. fdd| jD }tj|ddd}tj||gdd}| jrE| j d  d d}tj||gdd}| 	|}||fS )	Nc                    r^   r   r   r_   r`   r   r!   r"      r<   zBUWMObservationEncoder.encode_curr_and_next_obs.<locals>.<listcomp>ra   r=   r?   rb   rc   rd   )
rO   r0   r   r-   r@   rG   re   r   r2   r\   )	r4   r]   rk   rf   rl   rg   rh   ri   rm   r   r`   r!   encode_curr_and_next_obs   s   

z.UWMObservationEncoder.encode_curr_and_next_obsc                    sN   t  fdd jD } j j  j t j j |  t j j  S )Nc                 3   s&    | ]} j d  | d d V  qdS )r$   rV   ra   N)r   r_   r4   r   r!   	<genexpr>   s    
z1UWMObservationEncoder.feat_dim.<locals>.<genexpr>)sumr-   r&   r   r   intr   r   )r4   Zlow_dim_sizer   rp   r!   feat_dim   s   zUWMObservationEncoder.feat_dimc                 C   s   i }| j D ]}| jd | d }tjd| jg|R dtji||< qt  | |}W d    n1 s6w   Y  t|j	dd  S )Nr$   rV   r?   dtype)
r,   r   r@   zerosr   uint8no_gradrn   tuplerV   )r4   Z	dummy_obsr   	img_shapeZlatentr   r   r!   latent_img_shape   s   

z&UWMObservationEncoder.latent_img_shape)NNTNFr   TT)FrP   )__name__
__module____qualname__rD   rs   r   boolr   r   strr)   r   rE   rO   r@   rW   r\   rj   rn   ro   rt   r{   __classcell__r   r   r5   r!   r      s^    

	
=
+r   )typingr   r   r   r   r@   torch.nnnneinopsr   Zmodels.common.languager   Zmodels.common.transformsr   r	   Zmodels.common.visionr
   r   Moduler   r   r   r   r!   <module>   s    