o
    2±i\                     @   s  d Z ddlZddlZddlmZ ddlZddlmZ ee	 j
d d Ze r9eeejvr9ejdee ddlmZ ddlmZmZ dZd	Zd
Zee ZdZd ZZdd Zdd ZefddZefddZefddZ eefddZ!dd Z"G dd dej#Z$G dd dej#Z%G dd dej#Z&G d d! d!ej#Z'G d"d# d#ej#Z(G d$d% d%ej#Z)G d&d' d'ej#Z*G d(d) d)ej#Z+G d*d+ d+ej#Z,d0d.d/Z-dS )1z
DINO-conditioned diffusion in VAE latent space.
- Tokenizer & diffusion from simple_uva (UVA).
- Single-frame DINO encoder at 32x32 from dino_vid_model.
- Lightweight self-attention denoising net conditioned on DINO features.
    N)Path   unified_video_action)create_diffusion)load_dino_encoderencode_frameg(\?         c                 C   s&   | j \}}}}| ||ddddS )z=latent: (B, C, H, W) with H=W=16, C=16. Returns (B, 256, 16).r      r   )shapereshapepermute)ZlatentBCHW r   ,/data/cameron/vidgen/diffusion_dino/model.pypatchify_latent    s   r   c                 C   s.   | j \}}}d }}| ddd||||S )z.tokens: (B, 256, 16). Returns (B, 16, 16, 16).r
   r   r   r   )r   r   r   )tokensr   Sr   hwr   r   r   unpatchify_latent&   s   r   c                 C   sJ   t   || }| }W d   n1 sw   Y  || }t|S )z[Encode frame to VAE latent, scale, and patchify. frame: (B, 3, H, W). Returns (B, 256, 16).N)torchno_gradencodemoder   )framevaescale	posteriorzr   r   r   frame_to_tokens-   s   


r%   c                 C   s   | j \}}}}}| ddddd|| |||}t  ||}	|	 | }
W d   n1 s3w   Y  t|
}|||t t	}|S )z
    video: (B, 3, T, H, W) in [-1,1] (or compatible). Returns (B, T*256, 16).
    Uses a single VAE encode over flattened frames.
    r   r   r         N)
r   r   r   r   r   r   r   r   TOKENS_PER_FRAME	TOKEN_DIM)videor!   r"   r   r   Tr   r   framesr#   r$   r   r   r   r   video_to_tokens5   s   "

r-   c                 C   sB   t | | }t  ||W  d   S 1 sw   Y  dS )zHUnpatchify, unscale, decode. tokens: (B, 256, 16). Returns (B, 3, H, W).N)r   r   r   decode)r   r!   r"   r$   r   r   r   tokens_to_frameD   s   
$r/   c           
      C   s   | j \}}}||t krtd|t  d| | || t|}t|| }t  ||}	W d   n1 s;w   Y  |	||d|	j d |	j d ddd	dd
}	|	S )zr
    tokens: (B, T*256, 16). Returns (B, 3, T, 256, 256) (VAE output range depends on VAE; typically [-1,1]).
    zExpected tokens second dim , got Nr&   r   r   r   r   r'   )	r   r(   
ValueErrorr   r   r   r   r.   r   )
r   r!   r"   tr   r   r   Z	tokens_btr$   r,   r   r   r   tokens_to_videoK   s   
,r4   c                 C   s   | d|  | S )Nr   r   )xshiftr"   r   r   r   modulateZ   s   r7   c                       s4   e Zd Zd	 fdd	Zed
ddZdd Z  ZS )TimestepEmbedderr	   c              	      s@   t    ttj||ddt tj||dd| _|| _d S )NTbias)super__init__nn
SequentialLinearSiLUmlpfrequency_embedding_size)selfhidden_sizerB   	__class__r   r   r<   _   s   

zTimestepEmbedder.__init__'  c              	   C   s   |d }t t| t jd|t j| jd | }| d d d f  |d   }t jt 	|t 
|gdd}|d rOt j|t |d d d df gdd}|S )Nr   r   )startenddtypedevicer   dimr   )r   expmathlogarangefloat32rK   floatcatcossin
zeros_like)r3   rM   
max_periodhalffreqsargs	embeddingr   r   r   timestep_embeddingh   s   
(z#TimestepEmbedder.timestep_embeddingc                 C   s   |  || j}| |S N)r]   rB   rA   )rC   r3   t_freqr   r   r   forwardv   s   
zTimestepEmbedder.forward)r	   )rG   )__name__
__module____qualname__r<   staticmethodr]   r`   __classcell__r   r   rE   r   r8   ^   s
    	r8   c                       s.   e Zd ZdZdef fddZdd Z  ZS )ResBlockzOPer-token MLP block with AdaLN modulation (same pattern as UVA SimpleMLPAdaLN).channelsc              	      sl   t    tj|dd| _ttj||ddt tj||dd| _tt tj|d| dd| _	d S )Nư>epsTr9   r&   )
r;   r<   r=   	LayerNormin_lnr>   r?   r@   rA   adaLN_modulation)rC   rg   rE   r   r   r<   ~   s   

zResBlock.__init__c                 C   sB   |  |jddd\}}}t| |||}| |}|||  S )Nr&   r   rL   )rm   chunkr7   rl   rA   )rC   r5   yr6   r"   gater   r   r   r   r`      s   
zResBlock.forwardra   rb   rc   __doc__intr<   r`   re   r   r   rE   r   rf   {   s    rf   c                       s.   e Zd Zdedef fddZdd Z  ZS )
FinalLayermodel_channelsout_channelsc                    sT   t    tj|ddd| _tj||dd| _tt tj|d| dd| _	d S )NFrh   )elementwise_affinerj   Tr9   r   )
r;   r<   r=   rk   
norm_finalr?   linearr>   r@   rm   )rC   ru   rv   rE   r   r   r<      s   

zFinalLayer.__init__c                 C   s4   |  |jddd\}}t| |||}| |S Nr   r   rL   )rm   rn   r7   rx   ry   )rC   r5   ro   r6   r"   r   r   r   r`      s   
zFinalLayer.forward)ra   rb   rc   rs   r<   r`   re   r   r   rE   r   rt      s    rt   c                       >   e Zd ZdZeded df fdd	Zdd Zd	d
 Z  ZS )DinoCondDiffusionNetz
    Denoising net: noisy VAE tokens (N, 16) + timestep t + DINO cond (N, D).
    Per-token MLP (UVA SimpleMLPAdaLN-style) conditioned on t + DINO via AdaLN.
    Output (N, out_channels) for epsilon (+ variance).
       r      c                    s~   t    || _|| _ | _|| _t | _t	  | _
t	| | _t fddt|D | _t || _|   d S )Nc                    s   g | ]}t  qS r   )rf   .0_ru   r   r   
<listcomp>   s    z1DinoCondDiffusionNet.__init__.<locals>.<listcomp>)r;   r<   in_channelsrv   ru   num_res_blocksr8   
time_embedr=   r?   
cond_embed
input_proj
ModuleListrange
res_blocksrt   final_layer_init_weights)rC   r   ru   rv   r   rE   r   r   r<      s   

zDinoCondDiffusionNet.__init__c                 C   s   dd }|  | tjj| jjd jdd tjj| jjd jdd | jD ]}tj|j	d jd tj|j	d j
d q&tj| jj	d jd tj| jj	d j
d tj| jjjd tj| jjj
d d S )Nc                 S   @   t | tjrtj| j | jd urtj| jd d S d S d S Nr   
isinstancer=   r?   initxavier_uniform_weightr:   	constant_mr   r   r   _basic_init      
z7DinoCondDiffusionNet._init_weights.<locals>._basic_initr   {Gz?stdr   r   )applyr=   r   normal_r   rA   r   r   r   rm   r:   r   ry   rC   r   blockr   r   r   r      s   

z"DinoCondDiffusionNet._init_weightsc                 C   s@   |  |}| || | }| jD ]}|||}q| ||S r^   )r   r   r   r   r   )rC   r5   r3   cro   r   r   r   r   r`      s
   

zDinoCondDiffusionNet.forward	ra   rb   rc   rr   r)   r<   r   r`   re   r   r   rE   r   r|      s    r|   c                       s8   e Zd ZdZd
dededef fddZdd	 Z  ZS )Conv3DBlockuU   3D conv block: GroupNorm, 3×3×3 conv, AdaLN from global conditioning (B, cond_dim).r   rg   cond_dim
num_groupsc                    sR   t    t||| _tj||ddd| _tt tj	|d| dd| _
d S )Nr&   r   )kernel_sizepaddingr   Tr9   )r;   r<   r=   	GroupNormnormConv3dconvr>   r@   r?   rm   )rC   rg   r   r   rE   r   r   r<      s   

zConv3DBlock.__init__c                 C   sp   |  |jddd\}}|d d d d d d d f }|d d d d d d d f }t| |||}| |}|| S rz   )rm   rn   r7   r   r   )rC   r5   ro   r6   r"   r   r   r   r   r`      s   
zConv3DBlock.forwardr   rq   r   r   rE   r   r      s    	r   c                       r{   )DinoCondDiffusionNetConv3Du   
    Denoising net over latent volume: reshape (B*SEQ_LEN, 16) -> (B, 16, 8, 16, 16),
    in_proj -> 5× Conv3DBlock(cond) -> out_proj -> (B*SEQ_LEN, 32).
    Conditioning c (N, model_channels) is aggregated to (B, model_channels) for AdaLN.
    r}   r      c                    s   t    || _|| _ | _|| _t | _t	  | _
tj| dd| _t fddt|D | _tj |dd| _|   d S )Nr   )r   c                    s   g | ]}t   qS r   )r   r   r   r   r   r     s    
z7DinoCondDiffusionNetConv3D.__init__.<locals>.<listcomp>)r;   r<   r   rv   ru   
num_blocksr8   r   r=   r?   r   r   in_projr   r   blocksout_projr   )rC   r   ru   rv   r   rE   r   r   r<      s   


z#DinoCondDiffusionNetConv3D.__init__c                 C   s   dd }|  | tjj| jjd jdd tjj| jjd jdd | jD ]}tj|j	d jd tj|j	d j
d q&tj| jjd tj| jj
d d S )Nc                 S   sF   t | tjtjfrtj| j | jd ur!tj| jd d S d S d S r   )	r   r=   r?   r   r   r   r   r:   r   r   r   r   r   r     s   
z=DinoCondDiffusionNetConv3D._init_weights.<locals>._basic_initr   r   r   r   r   )r   r=   r   r   r   rA   r   r   r   rm   r:   r   r   r   r   r   r   
  s   

z(DinoCondDiffusionNetConv3D._init_weightsc           
      C   s   |j d }|t }|t |ksJ ||t| j}|ddd|| jttt}||td d df }||tdj	dd}| 
|| | }| |}| jD ]}	|	||}qQ| |}|ddddd|d| j}||| j}|S )Nr   r   r   r   rL   r&   r'   )r   SEQ_LENviewr   r   r   
NUM_FRAMESSEQ_HSEQ_Wmeanr   r   r   r   r   rv   )
rC   r5   r3   r   Nr   t_batchZc_globalro   r   r   r   r   r`     s   



z"DinoCondDiffusionNetConv3D.forwardr   r   r   rE   r   r      s    r   c                       s4   e Zd ZdZd	dedef fddZdd Z  ZS )
TransformerLayerzZSimple self-attention transformer block with AdaLN modulation from per-token conditioning.r   rg   	num_headsc              	      s   t    || _|| _tj|dd| _tj|dd| _tj||dd| _	t
tj|d| ddt tjd| |dd| _t
t tj|d| dd| _t
t tj|d| dd| _d S )Nrh   ri   T)batch_firstr'   r9   r&   )r;   r<   rg   r   r=   rk   ln1ln2MultiheadAttentionattnr>   r?   r@   rA   adaLN_modulation1adaLN_modulation2)rC   rg   r   rE   r   r   r<   4  s"   

zTransformerLayer.__init__c                 C   s   |  |jddd\}}}t| |||}| j|||dd\}}|||  }| |jddd\}	}
}t| ||	|
}| |}|||  }|S )zg
        x: (B, S, C) token features
        y: (B, S, C) per-token conditioning (time + DINO)
        r&   r   rL   F)need_weights)r   rn   r7   r   r   r   r   rA   )rC   r5   ro   Zshift1Zscale1Zgate1r   Zh_attnr   Zshift2Zscale2Zgate2h2r   r   r   r`   G  s   
zTransformerLayer.forwardr   rq   r   r   rE   r   r   1  s    r   c                       s@   e Zd ZdZeded ddf fdd	Zdd	 Zd
d Z  ZS )DinoCondDiffusionNetTransformera  
    Self-attention transformer denoiser over the token sequence.
    Reshapes (N, C) -> (B, SEQ_LEN, C), runs several TransformerLayer blocks,
    then maps back to (N, out_channels). Uses the same (time + cond) AdaLN
    conditioning pattern as the MLP denoiser.
    r}   r   r'   r   c                    s   t    || _|| _ | _|| _| _t | _t	
  | _t	
| | _t	 fddt|D | _t || _|   d S )Nc                    s   g | ]}t  d qS ))r   )r   r   ru   r   r   r   r   u  s    z<DinoCondDiffusionNetTransformer.__init__.<locals>.<listcomp>)r;   r<   r   rv   ru   
num_layersr   r8   r   r=   r?   r   r   r   r   layersrt   r   r   )rC   r   ru   rv   r   r   rE   r   r   r<   b  s   

z(DinoCondDiffusionNetTransformer.__init__c                 C   s  dd }|  | tjj| jjd jdd tjj| jjd jdd | jD ].}tj|j	d jd tj|j	d j
d tj|jd jd tj|jd j
d q&tj| jjd jd tj| jjd j
d tj| jjjd tj| jjj
d d S )Nc                 S   r   r   r   r   r   r   r   r   {  r   zBDinoCondDiffusionNetTransformer._init_weights.<locals>._basic_initr   r   r   r   r   )r   r=   r   r   r   rA   r   r   r   r   r:   r   r   rm   ry   )rC   r   layerr   r   r   r   z  s   

z-DinoCondDiffusionNetTransformer._init_weightsc           	      C   s   |j d }|t }|t |ksJ d| |}| || | }||t| j}||t| j}| jD ]}|||}q5||| j}||| j}| ||S )Nr   zN must be divisible by SEQ_LEN)	r   r   r   r   r   r   ru   r   r   )	rC   r5   r3   r   r   r   ro   Zy_seqr   r   r   r   r`     s   


z'DinoCondDiffusionNetTransformer.forwardr   r   r   rE   r   r   Z  s    	r   c                       sL   e Zd ZdZ					ddef fdd	Zd
d Zdd ZdddZ  Z	S )DinoDiffusiona  
    Full model: DINO encoder (single frame -> 32x32 features) + diffusion on residuals in VAE token space.
    - Tokenizer & diffusion schedule from UVA.
    - Conditioning: DINO tokens + first-frame VAE tokens, broadcast to all 8 frames with video position embeddings.
    - Target: residual tokens r = x_true - x_ref, where x_ref is the first-frame tokens broadcast to all frames.
    T1r}   r~   rA   keygrip_rootc           
         s(  t    tt||d\| _| _t| jdd| _t| _	t
| _t| _t| _|| _tdt|| _d}d}t||| jd }	| d|	 t| j| j || _ttdt|| _|dkrjtt
|t
d	 d
d| _n|dkr{tt
|t
d	 ddd| _nt t
|t
d	 |d| _t
| _!tj"j#| jdd d S )N)freeze	embed_dimi  r         ?g        noise_levelsconv3dr   r   )r   ru   rv   r   transformerr'   r   )r   ru   rv   r   r   )r   ru   rv   r   r   r   )$r;   r<   r   r   dino	dino_normgetattrdino_dimr   seq_lenr)   	token_dimr   
num_framesr(   tokens_per_frameru   maxrs   	num_stepsr   linspaceregister_bufferr=   r?   	cond_proj	Parameterzerosvideo_pos_embedr   netr   r|   r   r   r   )
rC   r   Zdino_freezenum_sampling_stepsrD   r   denoiserZ	noise_maxZ	noise_minr   rE   r   r   r<     sL   
	

zDinoDiffusion.__init__c                 C   sF   t | j| j|}|j\}}}}tjj|d}|d	ddd}|S )zNframe: (B, 3, H, W). Returns (B, seq_len, dino_dim) by pooling 32x32 -> 16x16.)r
   r
   r   r   r   )
r   r   r   r   r   r=   
functionaladaptive_avg_pool2dflattenr   )rC   r    featr   Dr   r   r   r   r   get_dino_cond  s
   zDinoDiffusion.get_dino_condc                 C   s  |j \}}}|tkrtdt d| |j |t| jfkr,td|t| jf d|j  |dddtddf }tj||gdd}| |}|d	|| j
| j| j}||t| j}|| j }||| d}	|d	|| j
| j| j|t| j}
||
 }||| d}|j d }t|}tjd| jd |f|jd	}| j| d}|||  }|}| |||	}tj|| jdd\}}t|| d
 S )a  
        target_tokens: (B, T*256, token_dim) VAE tokens for all 8 frames (this is x0).
        dino_cond: (B, 256, dino_dim) from get_dino_cond(first_frame).

        We predict the residual latent r0 = x0 - x_ref at each diffusion timestep,
        where x_ref is the first-frame tokens broadcast across all frames. The
        full latent is reconstructed as x0 = x_ref + r0.
        zExpected target_tokens seq_len=r0   Expected dino_cond shape Nr   rL   r   r   rK   r   )r   r   r2   r(   r   r   rT   r   	unsqueezeexpandr   r   ru   r   r   r   
randn_likerandintr   rK   r   r   splitr   )rC   target_tokens	dino_condr   r   r   first_tokens
cond_firstcond_b	cond_flatx_refr0r0_flatr   noiseksigmaZr_tr3   pred_allZpred_r0r   r   r   compute_loss  s8   	




zDinoDiffusion.compute_lossr   Nc                 C   s  |pt |  j}|jd }|t }|j|t| jfkr*td|t| jf d|j tj	||gdd}| 
|}|d|| j| j| j}||t| j}|| j }||d}	|d|| j| j| j|t| j}
|
|| j}| j}tj|| j|d| j|  }|}t|ddD ]:}tj|f|tj|d}| |||	}tj|| jdd\}}|dkrt|| }| j|d  }|||  }q|}q|| |t| j}|S )	a.  
        Sample full latent tokens x0 for 8 frames, conditioned on first-frame tokens + DINO.

        first_tokens: (B, 256, token_dim) first-frame VAE tokens.
        dino_cond: (B, 256, dino_dim) DINO tokens from the same frame.
        Returns: (B, T*256, token_dim) tokens for all frames.
        r   r   r0   r   rL   r   r   )rJ   rK   )next
parametersrK   r   r   r(   r   r2   r   rT   r   r   r   r   r   ru   r   r   r   r   randnr   r   fulllongr   r   r   r   )rC   r   r   temperaturerK   r   r   r   r   r   r   Z
x_ref_flatKr   rr   r3   r   r   r   rj   Z
sigma_prevx0r   r   r   sample  sB   




zDinoDiffusion.sample)Tr   r}   r~   rA   )r   N)
ra   rb   rc   rr   r   r<   r   r  r  re   r   r   rE   r   r     s    
8.r   r   rA   c                 K   s   t dt| ||d|S )N)r   r   r   r   )r   r   )r   r   r   kwargsr   r   r   build_dino_diffusionL  s   r  )r   rA   ).rr   rO   syspathlibr   r   torch.nnr=   __file__resolveparentsZ	_uva_rootexistsstrpathinsertsimple_uva.diffusionr   Zdino_encoderr   r   VAE_LATENT_SCALEr   r(   r   r)   r   r   r   r   r%   r-   r/   r4   r7   Moduler8   rf   rt   r|   r   r   r   r   r   r  r   r   r   r   <module>   sF    7D)I *