o
    \is                     @   sv   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 d Z
Zd ZZd	Zd
Zee Zd	ZG dd dejZdS )z|Video latent model: DINO first frame -> 16x16 patches; learnable patches for other frames; self-attention; conv up to 32x32.    )PathN)	rearrange   )DINOFeaturizer             c                	       sD   e Zd ZdZddedededef fd	d
Zdd Zdd Z  Z	S )VideoLatentModelzPredict VidTok-style continuous latents (B, C, T, 32, 32) from 8-frame 256x256 video.
    First frame -> DINO patch tokens 16x16; frames 1..7 -> learnable 16x16 + time embed; self-attn; conv up to 32x32.
            keygrip_root
hidden_dim	num_heads
num_layersc                    s  t    t|| _t|| _| jj}tj| dd| _	t
tdtd  ttd | _tt | _| jjjdd t fddt|D | _t | _tj  tddftddfd| _ttj  ddd	t tj  t!d
ddd| _"| #  d S )Nr   )kernel_size{Gz?r   c                    s(   g | ]}t j  d  dddddqS )r   g        ZgeluT)Zd_modelZnheadZdim_feedforwardZdropoutZ
activationZbatch_firstZ
norm_first)nnZTransformerEncoderLayer).0_r   r    +/data/cameron/vidgen/our_vid_model/model.py
<listcomp>#   s    
z-VideoLatentModel.__init__.<locals>.<listcomp>)r   stride   )paddingr   r	   )r   r   )$super__init__r   r   r   dinoZ	embed_dimr   Conv2d	dino_projZ	ParametertorchZrandn
NUM_FRAMESPATCH_HPATCH_Wother_frames_embedZ	Embedding
time_embedweightdataZnormal_Z
ModuleListrangeattn_layersZ	LayerNorm	attn_normZConv3d	TIME_DOWNtemporal_poolZ
SequentialZGELUConvTranspose2d
Z_CHANNELSspatial_upsample_init_weights)selfr   r   r   r   Zdino_dim	__class__r   r   r      s&   


"

 zVideoLatentModel.__init__c                 C   s   | j | jfD ]}t|dr"tjj|jdd |jd ur"tj|j q| j	D ] }t
|tjtjfrFtjj|jdd |jd urFtj|j q&d S )Nr)   r   )Zgain)r"   r/   hasattrr   ZinitZxavier_uniform_r)   ZbiasZzeros_r2   
isinstancer!   r0   )r4   mr   r   r   r3   :   s   



zVideoLatentModel._init_weightsc                 C   s8  |j d }|dddddf }t  | |}W d   n1 s$w   Y  | |}| j|dddd}tjtd |j	d}|| 
|dtd ddd }tj|d|gdd}t|d}| jD ]}	|	|}qf| |}t|dtttd	}| |}|j \}
}}}}t|d
}| |}t|d|td}|S )z
        x: (B, 3, 8, 256, 256) in [-1, 1]
        out: (B, Z_CHANNELS, LATENT_T, LATENT_H, LATENT_W) = (B, 8, 4, 32, 32)
        r   Nr   )device)Zdimzb t c h w -> b (t h w) czb (t h w) c -> b c t h w)thwzb c t h w -> (b t) c h wz(b t) c h w -> b c t h w)br<   )shaper#   Zno_gradr    r"   r'   ZexpandZaranger$   r;   r(   ZviewcatZ	unsqueezer   r,   r-   r%   r&   r/   r2   LATENT_T)r4   xBZfirstZ	dino_featZfirst_tokensZother_tokensZtime_idxZtokensZlayerr?   cr<   r=   r>   Zoutr   r   r   forwardF   s*   


 






zVideoLatentModel.forward)r   r   r   )
__name__
__module____qualname____doc__r   intr   r3   rF   __classcell__r   r   r5   r   r
      s
     "r
   )rJ   Zpathlibr   r#   Ztorch.nnr   Zeinopsr   Zdino_featurizerr   ZLATENT_HZLATENT_Wr%   r&   r$   r.   rB   r1   ZModuler
   r   r   r   r   <module>   s    