
    i&                        d Z ddlZddlZddlmZ ddlZddlmZ  ee      j                         j                  d   dz  Zej                         r6 ee      ej                  vr"ej                  j                  d ee             ddlmZ ddlmZmZ dZd	Zd
Zd
xZZd Zd ZefdZefdZd Z G d dej>                        Z  G d dej>                        Z! G d dej>                        Z" G d dej>                        Z#ddZ$y)z
DINO-conditioned diffusion in VAE latent space.
- Tokenizer & diffusion from simple_uva (UVA).
- Single-frame DINO encoder at 32x32 from dino_vid_model.
- Lightweight self-attention denoising net conditioned on DINO features.
    N)Path   unified_video_action)create_diffusion)load_dino_encoderencode_frameg(\?      c                 l    | j                   \  }}}}| j                  ||d      j                  ddd      S )z=latent: (B, C, H, W) with H=W=16, C=16. Returns (B, 256, 16).r      r   )shapereshapepermute)latentBCHWs        ,/data/cameron/vidgen/diffusion_dino/model.pypatchify_latentr      s5    JAq!Q>>!Q#++Aq!44    c                 t    | j                   \  }}}dx}}| j                  ddd      j                  ||||      S )z.tokens: (B, 256, 16). Returns (B, 16, 16, 16).r
   r   r   r   )r   r   r   )tokensr   Sr   hws         r   unpatchify_latentr   $   s>    llGAq!JA>>!Q"**1aA66r   c                     t        j                         5  |j                  |       }|j                         }ddd       |z  }t	        |      S # 1 sw Y   xY w)z[Encode frame to VAE latent, scale, and patchify. frame: (B, 3, H, W). Returns (B, 256, 16).N)torchno_gradencodemoder   )framevaescale	posteriorzs        r   frame_to_tokensr)   +   sP    	 JJu%	NN 	
E	A1	 s   "AAc                     t        |       |z  }t        j                         5  |j                  |      cddd       S # 1 sw Y   yxY w)zHUnpatchify, unscale, decode. tokens: (B, 256, 16). Returns (B, 3, H, W).N)r   r    r!   decode)r   r%   r&   r(   s       r   tokens_to_framer,   4   s:    &!E)A	 zz!}  s	   >Ac                     | d|z   z  |z   S )Nr    )xshiftr&   s      r   modulater1   ;   s    E	?U""r   c                   8     e Zd Zd fd	Zedd       Zd Z xZS )TimestepEmbedderc           	          t         |           t        j                  t        j                  ||d      t        j
                         t        j                  ||d            | _        || _        y )NT)bias)super__init__nn
SequentialLinearSiLUmlpfrequency_embedding_size)selfhidden_sizer=   	__class__s      r   r7   zTimestepEmbedder.__init__@   sS    ==II.$GGGIIIk;T:

 )A%r   c           	         |dz  }t        j                  t        j                  |       t        j                  d|t         j
                  | j                        z  |z        }| d d d f   j                         |d    z  }t        j                  t        j                  |      t        j                  |      gd      }|dz  r5t        j                  |t        j                  |d d d df         gd      }|S )Nr   r   )startenddtypedevicer   dimr   )r    expmathlogarangefloat32rE   floatcatcossin
zeros_like)trG   
max_periodhalffreqsargs	embeddings          r   timestep_embeddingz#TimestepEmbedder.timestep_embeddingI   s    ax		XXj!!llEMM!((ST

 Dz!E$K/IIuyy		$@bI	7		9e.>.>yBQB?O.P"QWYZIr   c                 \    | j                  || j                        }| j                  |      S )N)rX   r=   r<   )r>   rR   t_freqs      r   forwardzTimestepEmbedder.forwardW   s)    ((D,I,IJxxr   )r	   )i'  )__name__
__module____qualname__r7   staticmethodrX   r[   __classcell__r@   s   @r   r3   r3   ?   s"    A   r   r3   c                   *     e Zd ZdZd fd	Zd Z xZS )AdaLNSelfAttentionBlockz_LayerNorm (AdaLN with y) -> self-attention -> residual -> LayerNorm (AdaLN) -> MLP -> residual.c           
         t         |           t        j                  |d      | _        t        j
                  t        j                         t        j                  |d|z              | _        t        j                  ||d      | _
        t        j                  |d      | _        t        j
                  t        j                         t        j                  |d|z              | _        t        j
                  t        j                  |t        ||z              t        j                         t        j                  t        ||z        |            | _        y )NFelementwise_affine   T)batch_first)r6   r7   r8   	LayerNormnorm1r9   r;   r:   adaLN1MultiheadAttentionattnnorm2adaLN2intGELUr<   )r>   r?   	num_heads	mlp_ratior@   s       r   r7   z AdaLNSelfAttentionBlock.__init___   s    \\+%H
mmBGGIryya+o/VW))+ydS	\\+%H
mmBGGIryya+o/VW==IIk3{Y'>#?@GGIIIc+	12K@
r   c                 |   | j                  |      j                  dd      \  }}}t        | j                  |      ||      }| j	                  |||d      \  }}|||z  z   }| j                  |      j                  dd      \  }}	}
t        | j                  |      ||	      }||
| j                  |      z  z   }|S )Nrg   r   rF   F)need_weights)rk   chunkr1   rj   rm   ro   rn   r<   )r>   r/   yshift1scale1gate1r   _shift2scale2gate2s              r   r[   zAdaLNSelfAttentionBlock.forwardl   s     $A 4 4QB 4 ?TZZ]FF3yyAquy51	M $A 4 4QB 4 ?TZZ]FF3##r   )         @)r\   r]   r^   __doc__r7   r[   r`   ra   s   @r   rc   rc   \   s    i
	r   rc   c                   D     e Zd ZdZededz  ddddf fd	Zd	 Zd
 Z xZS )DinoCondDiffusionNeta  
    Denoising net: noisy VAE tokens (B*256, 16) + timestep t + DINO cond (B*256, D).
    Lightweight self-attention over the 256 token sequence, conditioned on t and DINO via AdaLN.
    Output (B*256, out_channels) for epsilon (and optionally variance).
       r        r   r   c           
      d   t         	|           || _        || _        t	        j
                  ||      | _        t        |      | _        t	        j
                  ||      | _	        t	        j                  t        |      D cg c]  }t        |||       c}      | _        t	        j                  |d      | _        t	        j                   t	        j"                         t	        j
                  |d|z              | _        t	        j
                  ||      | _        | j)                          y c c}w )N)rr   rs   Fre   r   )r6   r7   in_channelsout_channelsr8   r:   
input_projr3   
time_embed
cond_embed
ModuleListrangerc   blocksri   
norm_finalr9   r;   adaLN_finalout_proj_init_weights)
r>   r   model_channelsr   dino_dim
num_layersrr   rs   r{   r@   s
            r   r7   zDinoCondDiffusionNet.__init__   s     	&())K@*>:))Hn=mm:&%
 $NiS\]%
  ,,~%P==BIInaR`N`4ab		.,?%
s    D-c                    | j                         D ]|  }t        |t        j                        st        j                  j                  |j                         |j                  Tt        j                  j                  |j                         ~ t        j                  j                  | j                  j                  d   j                  d       t        j                  j                  | j                  j                  d   j                  d       t        j                  j                  | j                  d   j                  d       t        j                  j                  | j                  d   j                  d       t        j                  j                  | j                  j                  d       t        j                  j                  | j                  j                  d       y )Nr   g{Gz?)stdr   r   )modules
isinstancer8   r:   initxavier_uniform_weightr5   zeros_normal_r   r<   	constant_r   r   )r>   ms     r   r   z"DinoCondDiffusionNet._init_weights   s3    	+A!RYY'''166%GGNN166*		+
 	++A.554@
++A.554@
$**2.55q9
$**2.33Q7
$--..2
$--,,a0r   c                    |j                   d   }|t        z  }| j                  |      }| j                  |      }| j	                  |      }||z   }|j                  |t        d      }|j                  |t        d      }| j                  D ]  }	 |	||      } | j                  |      j                  dd      \  }
}t        | j                  |      |
|      }| j                  |      }|j                  || j                        S )Nr   r   r   rF   )r   SEQ_LENr   r   r   viewr   r   rv   r1   r   r   r   )r>   r/   rR   cNr   t_embc_embrw   blockr0   r&   s               r   r[   zDinoCondDiffusionNet.forward   s    GGAJLOOA""EMFF1gr"FF1gr"[[ 	EaA	''*000;uT__Q'6MM!vva**++r   )	r\   r]   r^   r   	TOKEN_DIMr7   r   r[   r`   ra   s   @r   r   r   x   s2     ]21,r   r   c                   H     e Zd ZdZ	 	 	 	 	 ddef fdZd Zd ZddZ xZ	S )	DinoDiffusionz
    Full model: DINO encoder (single frame -> 32x32 features) + diffusion in VAE token space.
    Same tokenizer (VAE + patchify) and diffusion schedule as UVA; denoising net is DINO-conditioned self-attention.
    keygrip_rootc                    t         |           t        t        |      |      \  | _        | _        t        | j                  dd      | _        t        | _	        t        | _        t        t        |t        dz  | j                  ||      | _        t        dd      | _        t        |d      | _        t        | _        y )	N)freeze	embed_dimr   r   )r   r   r   r   r   rr    cosine)timestep_respacingnoise_schedule)r6   r7   r   r   dino	dino_normgetattrr   r   seq_lenr   	token_dimr   netr   train_diffusiongen_diffusionr   )r>   r   dino_freezenum_sampling_stepsr?   r   rr   r@   s          r   r7   zDinoDiffusion.__init__   s     	$5d<6HQ\$]!	4>		;<"'!&"Q]]!
  0!( 
 .1(
 %r   c                    t        | j                  | j                  |      }|j                  \  }}}}t        j
                  j                  j                  |d      }|j                  d      j                  ddd      }|S )zNframe: (B, 3, H, W). Returns (B, seq_len, dino_dim) by pooling 32x32 -> 16x16.)r
   r
   r   r   r   )
r   r   r   r   r    r8   
functionaladaptive_avg_pool2dflattenr   )r>   r$   featr   Dr   r   s          r   get_dino_condzDinoDiffusion.get_dino_cond   sf    DIIt~~u=ZZ
1axx""66tXF||A&&q!Q/r   c                    |j                   \  }}}|j                  ||z  d      }|j                  ||z  d      }t        j                  d| j                  j
                  |j                   d   f|j                        }t        |      }	| j                  j                  | j                  |||	      }
|
d   j                         S )z
        target_tokens: (B, seq_len, token_dim) in VAE latent space.
        dino_cond: (B, seq_len, dino_dim) from get_dino_cond(first_frame).
        r   r   rE   r   loss)r   r   r    randintr   num_timestepsrE   dicttraining_lossesr   mean)r>   target_tokens	dino_condr   r   r{   target_flat	cond_flatrR   model_kwargs	loss_dicts              r   compute_losszDinoDiffusion.compute_loss   s    
 &++7A#++AK<%%a'k26	MM  ..q!# ''	
 i(((88HHk1l
	  %%''r   c           
      v   |xs# t        | j                               j                  }|j                  d   }|t        z  }t        j                  || j                  |      }|j                  |d      }t        |      }| j                  j                  | j                  || j                  f|d||d|      }	|	S )z]Sample (B*seq_len, token_dim) from diffusion conditioned on dino_cond (B, seq_len, dino_dim).r   r   r   r   F)noiseclip_denoisedr   rE   progresstemperature)next
parametersrE   r   r   r    randnr   r   r   r   p_sample_loopr   )
r>   r   r   rE   r   r   r   r   r   outs
             r   samplezDinoDiffusion.sample   s    94 1299OOAKAt//?%%a,	i(  ..HH  !%# / 	
 
r   )T100r   r   r   )g      ?N)
r\   r]   r^   r   r   r7   r   r   r   r`   ra   s   @r   r   r      s5      %%<((r   r   c                 0    t        dt        |       |d|S )N)r   r   r.   )r   r   )r   r   kwargss      r   build_dino_diffusionr     s)     ,'-  r   )r   )%r   rI   syspathlibr   r    torch.nnr8   __file__resolveparents	_uva_rootexistsstrpathinsertsimple_uva.diffusionr   dino_encoderr   r   VAE_LATENT_SCALEr   r   SEQ_HSEQ_Wr   r   r)   r,   r1   Moduler3   rc   r   r   r   r.   r   r   <module>r      s     
    N""$,,Q/2HH	#i.8HHOOAs9~& 1 9  
	 57 '7  (8 # ryy  :bii 8<,299 <,~RBII Rjr   