o
    jCÄiO(  ã                   @   sz   d Z ddlZddlZddlZddlmZ ej dd¡Zej dd¡Z	dZ
dZdZd	Zd
d„ ZeeƒZG dd„ dejƒZdS )uÒ  DinoVideoModel: predict future VAE latent tokens from a single DINO-encoded frame.

Uses a 3-step diffusion process: during training, a random noise level is sampled
and the 3D conv blocks predict the noise residual. During inference, 3 reverse
diffusion steps denoise from pure Gaussian noise to predicted VAE tokens.

Architecture:
  1. Frozen DINO ViT-S/16 extracts (B, D, 16, 16) patch features from first frame
  2. Temporal expand + temporal embeddings â†’ (B, D, N_FRAMES, 16, 16)
  3. Concatenate with noised VAE tokens z_t â†’ (B, D + vae_embed_dim, N_FRAMES, 16, 16)
  4. Add timestep embedding (broadcast over spatial/temporal dims)
  5. 3D conv blocks predict noise Îµ â†’ (B, vae_embed_dim, N_FRAMES, 16, 16)
é    NÚDINO_REPO_DIRz/data/cameron/dinov3ÚDINO_WEIGHTS_PATHzM/data/cameron/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   é   é   c                    s   ‡ fdd„t ˆ ƒD ƒS )uI   Cosine schedule for alpha_bar: alpha_bar[t] = cosÂ²(Ï€/2 Â· (t+1)/(T+1)).c                    s0   g | ]}t  t jd  |d  ˆ d  ¡d  ‘qS )é   é   )ÚmathÚcosÚpi)Ú.0Út©Zn_steps© ú</data/cameron/para/video_training/custom_dino_video/model.pyÚ
<listcomp>#   s   0 z._cosine_alpha_bar_schedule.<locals>.<listcomp>)Úranger   r   r   r   Ú_cosine_alpha_bar_schedule!   s   r   c                       sn   e Zd ZdZeeef‡ fdd„	Z‡ fdd„Zdd„ Z	dd	„ Z
d
d„ Zdd„ Zddd„Ze ¡ dd„ ƒZ‡  ZS )ÚDinoVideoModelzWPredict future VAE latent tokens via 3-step diffusion from a single DINO-encoded frame.c                    sŒ  t ƒ  ¡  || _|| _|| _t| _t|ƒ}|  dt	j
|t	jd¡ td|› ddd„ |D ƒ› ƒ tdƒ t	jjtdd	td
| _| j ¡ D ]}d|_qD| j ¡  | jj| _td| j› dƒ | j}t t	 d||dd¡d ¡| _t ||| ¡| _|| }t tj||dddt ¡ tj||dddt ¡ tj|ddddt ¡ tjd|dd¡| _td|› d|› d|› d|› ƒ td|› d|› dƒ d S )NÚ	alpha_bar)ÚdtypezDiffusion schedule (z steps): alpha_bar = c                 S   s   g | ]}|d ›‘qS )z.4fr   )r   Úar   r   r   r   7   s    z+DinoVideoModel.__init__.<locals>.<listcomp>zLoading DINOv3 model...Zdinov3_vits16plusÚlocal)ÚsourceÚweightsFz"Frozen DINOv3 backbone (embed_dim=ú)r   g{®Gáz”?r   )Úkernel_sizeÚpaddingé   )r   z3D conv blocks: z -> z -> 256 -> zOutput: (B, z, u   , 16, 16) â€” noise prediction) ÚsuperÚ__init__Ún_framesÚvae_embed_dimÚn_diffusion_stepsÚDINO_PATCH_SIZEÚ
patch_sizer   Úregister_bufferÚtorchÚtensorÚfloat32ÚprintÚhubÚloadr   r   ÚdinoÚ
parametersÚrequires_gradÚevalÚ	embed_dimÚnnÚ	ParameterÚrandnÚtemporal_embedÚ	EmbeddingÚtimestep_embedÚ
SequentialÚConv3dÚGELUÚconv3d_blocks)Úselfr!   r"   r#   r   ÚparamÚDÚin_ch©Ú	__class__r   r   r    ,   sF   
ü

ù 	zDinoVideoModel.__init__c                    s(   t ƒ  |¡ t| dƒr| j |¡| _| S )Nr-   )r   ÚtoÚhasattrr-   )r<   Údevicer@   r   r   rB   ]   s   
zDinoVideoModel.toc                 C   s  |j d }| j |¡\}\}}| jjD ]}| jjr!| jj||dnd}|||ƒ}q| jjr[| j |dd…d| jjd …f ¡}| j |dd…| jjd d…f ¡}	t	j
||	gdd}n| j |¡}|dd…| jjd d…f }
|
 |||| j¡}| dddd¡ ¡ }|S )zÂExtract patch features from frozen DINO backbone.

        Args:
            x: (B, 3, 256, 256) ImageNet-normalized input

        Returns:
            patch_features: (B, D, H_p, W_p)
        r   )ÚHÚWNr   ©Údimr   r   )Úshaper-   Zprepare_tokens_with_masksÚblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensÚnormr'   ÚcatÚreshaper1   ÚpermuteÚ
contiguous)r<   ÚxÚBZx_tokensÚH_pZW_pÚblkZrope_sincosZ
x_norm_clsZx_norm_patchesZpatch_tokensZpatch_featuresr   r   r   Ú_extract_dino_featuresc   s   
	$$z%DinoVideoModel._extract_dino_featuresc                 C   sZ   t  ¡  |  |¡}W d  ƒ n1 sw   Y  | d¡ dd| jdd¡}|| j }|S )uã   Extract DINO features and expand temporally.

        Args:
            x: (B, 3, 256, 256) ImageNet-normalized input

        Returns:
            cond: (B, D, N_FRAMES, 16, 16) â€” DINO features + temporal embeddings
        Nr   éÿÿÿÿ)r'   Úno_gradrT   Ú	unsqueezeÚexpandr!   r5   )r<   rP   ZfeatsÚcondr   r   r   Ú_get_dino_cond}   s   
	ÿ
zDinoVideoModel._get_dino_condc                 C   sR   |   |¡}tj||gdd}|  |¡}||dd…dd…dddf  }|  |¡}|S )aˆ  Predict noise given conditioning image, noised tokens, and timestep.

        Args:
            x: (B, 3, 256, 256) ImageNet-normalized input frame
            z_t: (B, vae_embed_dim, N_FRAMES, 16, 16) noised VAE tokens
            t: (B,) integer timestep indices in [0, n_diffusion_steps)

        Returns:
            eps_pred: (B, vae_embed_dim, N_FRAMES, 16, 16) predicted noise
        r   rG   N)rZ   r'   rL   r7   r;   )r<   rP   Úz_tr   rY   ÚhÚt_embZeps_predr   r   r   Úpredict_noise   s   


zDinoVideoModel.predict_noisec                 C   sR   | j | }|dd…ddddf }t |¡}t |¡| t d| ¡|  }||fS )a  Forward diffusion: add noise to clean tokens.

        Args:
            z_0: (B, C, T, H, W) clean VAE tokens
            t: (B,) integer timestep indices

        Returns:
            z_t: (B, C, T, H, W) noised tokens
            eps: (B, C, T, H, W) the noise that was added
        Nç      ð?)r   r'   Ú
randn_likeÚsqrt)r<   Úz_0r   Úalpha_bar_tÚepsr[   r   r   r   Úforward_diffusion¥   s
   

 z DinoVideoModel.forward_diffusionNc           	      C   sV   |j d }|j}|du rtjd| j|f|d}|  ||¡\}}|  |||¡}|||fS )u  Training forward pass: sample noise level, predict clean xâ‚€.

        Args:
            x: (B, 3, 256, 256) ImageNet-normalized input
            z_0: (B, vae_embed_dim, N_FRAMES, 16, 16) clean target VAE tokens
            t: (B,) optional timestep indices. If None, sampled randomly.

        Returns:
            x0_pred: (B, vae_embed_dim, N_FRAMES, 16, 16) predicted clean tokens
            z_0: (B, vae_embed_dim, N_FRAMES, 16, 16) actual clean tokens (target)
            t: (B,) sampled timestep indices
        r   N©rD   )rI   rD   r'   Úrandintr#   re   r^   )	r<   rP   rb   r   rQ   rD   r[   rd   Úx0_predr   r   r   Úforward·   s   

zDinoVideoModel.forwardc                 C   s,  |j d }|j}tj|| j| jdd|d}tt| jƒƒD ]w}tj	|f||tj
d}|  |||¡}| j| }|dkr‘|t |¡|  t d| ¡ }	| j|d  }
||
 }d| }t |
¡| d|  }t |¡d|
  d|  }|| ||  }d|
 d|  | }t |¡}||t |¡  }q|}q|S )u¸  3-step reverse diffusion (xâ‚€-prediction): denoise from pure noise to predicted VAE tokens.

        At each step the network directly predicts xâ‚€. We derive Îµ from xâ‚€_pred
        to compute the DDPM posterior for stepping to the next noise level.

        Args:
            x: (B, 3, 256, 256) ImageNet-normalized input

        Returns:
            z_0_pred: (B, vae_embed_dim, N_FRAMES, 16, 16) predicted clean VAE tokens
        r   r   rf   )rD   r   r_   r   )rI   rD   r'   r4   r"   r!   Úreversedr   r#   ÚfullÚlongr^   r   r	   ra   r`   )r<   rP   rQ   rD   ÚzÚstepr   rh   rc   Zeps_derivedÚalpha_bar_prevÚalpha_tZbeta_tZcoef_z0Zcoef_ztÚmuZposterior_varÚsigmar   r   r   ÚsampleÓ   s(   

 
zDinoVideoModel.sample)NN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚN_FRAMESÚVAE_EMBED_DIMÚN_DIFFUSION_STEPSr    rB   rT   rZ   r^   re   ri   r'   rV   rs   Ú__classcell__r   r   r@   r   r   )   s    ÿ1
r   )rw   Úosr	   r'   Útorch.nnr2   ÚenvironÚgetr   r   r$   rx   ry   rz   r   Z	ALPHA_BARÚModuler   r   r   r   r   Ú<module>   s"    þ