
    \is                         d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	 dxZ
ZdxZZd	Zd
Zeez  Zd	Z G d dej$                        Zy)z|Video latent model: DINO first frame -> 16x16 patches; learnable patches for other frames; self-attention; conv up to 32x32.    )PathN)	rearrange   )DINOFeaturizer             c            	       B     e Zd ZdZd	dedededef fdZd Zd Z xZ	S )
VideoLatentModelzPredict VidTok-style continuous latents (B, C, T, 32, 32) from 8-frame 256x256 video.
    First frame -> DINO patch tokens 16x16; frames 1..7 -> learnable 16x16 + time embed; self-attn; conv up to 32x32.
    keygrip_root
hidden_dim	num_heads
num_layersc                     t         |           t        |      | _        t	        |      | _        | j
                  j                  }t        j                  ||d      | _	        t        j                  t        j                  dt        dz
  |t        t              dz        | _        t        j"                  t        |      | _        | j$                  j&                  j(                  j+                  dd       t        j,                  t/        |      D cg c]!  }t        j0                  |||dz  dddd	      # c}      | _        t        j4                  |      | _        t        j8                  ||t:        ddft:        ddf
      | _        t        j>                  t        j                  ||dd      t        j@                         t        jB                  |tD        ddd            | _#        | jI                          y c c}w )Nr   )kernel_size{Gz?r      g        geluT)d_modelnheaddim_feedforwarddropout
activationbatch_first
norm_first)r   stride   )paddingr
   )r   r   )%super__init__r   r   r   dino	embed_dimnnConv2d	dino_proj	Parametertorchrandn
NUM_FRAMESPATCH_HPATCH_Wother_frames_embed	Embedding
time_embedweightdatanormal_
ModuleListrangeTransformerEncoderLayerattn_layers	LayerNorm	attn_normConv3d	TIME_DOWNtemporal_pool
SequentialGELUConvTranspose2d
Z_CHANNELSspatial_upsample_init_weights)selfr   r   r   r   dino_dim_	__class__s          +/data/cameron/vidgen/our_vid_model/model.pyr!   zVideoLatentModel.__init__   s    ."<0	99&&8ZQG"$,,u{{1j1njZacj/knr/r"s,,z:>##++At4== :&*
  &&" *Q! *
  j1  YYz:IWXZ[K\foqrtuevw "IIj*a;GGIz:qAN!

 	+*
s   &G;c                 T   | j                   | j                  fD ]p  }t        |d      st        j                  j                  |j                  d       |j                  Ht        j                  j                  |j                         r | j                  D ]  }t        |t        j                  t        j                  f      s.t        j                  j                  |j                  d       |j                  ft        j                  j                  |j                          y )Nr0   r   )gain)r&   r;   hasattrr$   initxavier_uniform_r0   biaszeros_r@   
isinstancer%   r>   )rB   ms     rF   rA   zVideoLatentModel._init_weights:   s    ..$"4"45 	+Aq(#''t'<66%GGNN166*		+
 && 	+A!bii););<=''t'<66%GGNN166*		+    c                 X   |j                   d   }|dddddf   }t        j                         5  | j                  |      }ddd       | j	                        }| j
                  j                  |dddd      }t        j                  t        dz
  |j                        }|| j                  |      j                  dt        dz
  ddd      z   }t        j                  |j                  d      |gd      }t        |d      }| j                  D ]
  }	 |	|      } | j!                  |      }t        |dt        t"        t$        	      }| j'                  |      }|j                   \  }
}}}}t        |d
      }| j)                  |      }t        |d|t*              }|S # 1 sw Y   _xY w)z
        x: (B, 3, 8, 256, 256) in [-1, 1]
        out: (B, Z_CHANNELS, LATENT_T, LATENT_H, LATENT_W) = (B, 8, 4, 32, 32)
        r   Nr   )device)dimzb t c h w -> b (t h w) czb (t h w) c -> b c t h w)thwzb c t h w -> (b t) c h wz(b t) c h w -> b c t h w)brU   )shaper(   no_gradr"   r&   r-   expandaranger*   rS   r/   viewcat	unsqueezer   r6   r8   r+   r,   r;   r@   LATENT_T)rB   xBfirst	dino_featfirst_tokensother_tokenstime_idxtokenslayerrX   crU   rV   rW   outs                   rF   forwardzVideoLatentModel.forwardF   s   
 GGAJ!Q'
]]_ 	)		%(I	)~~i0..55aRRH<<
Qqxx@#dooh&?&D&DQ
UVXZ\]_`&aaL2215|D!L6#=>%% 	#E6]F	#'6#=wZab##F+1aA6#=>##F+71I
#	) 	)s   FF))i     r   )
__name__
__module____qualname____doc__r   intr!   rA   rl   __classcell__)rE   s   @rF   r   r      s5     T  s  S  be  D
+rP   r   )rq   pathlibr   r(   torch.nnr$   einopsr   dino_featurizerr   LATENT_HLATENT_Wr+   r,   r*   r:   r`   r?   Moduler    rP   rF   <module>r|      sY    C     + 8 '
	"
Kryy KrP   