o
    ȯi&                     @   s|   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZ dZdZd	 ZZG d
d dejZG dd dejZdS )z|Teacher: frozen DINO on all frames -> (B, 8, D, 32, 32). Student: same -> (B, 8, D, 32, 32). 512x512 input -> 32x32 patches.    )PathN)	rearrange   )	load_dinoextract_patch_featuresDinoNormalize   i       c                       s:   e Zd ZdZdef fddZ fddZdd Z  ZS )	TeacherDinoVideozeFrozen DINO run on each of 8 frames (resized to 512x512 -> 32x32 patches). Returns (B, 8, D, 32, 32).keygrip_rootc                    s2   t    t|dd| _| jj| _ttd| _d S )NTfreezeZ	dino_size)super__init__r   dino	embed_dimr   	DINO_SIZEnorm)selfr   	__class__ ,/data/cameron/vidgen/dino_vid_model/model.pyr   ^   s   

zTeacherDinoVideo.__init__c                    s   t  | | j|| _| S )N)r   tor   )r   devicer   r   r   r   d   s   zTeacherDinoVideo.toc           	      C   sl   |j \}}}}}t|d}| |}t  t| j|dd}W d   n1 s)w   Y  t|d||dS )zOx: (B, 3, 8, 256, 256). Returns (B, 8, D, 32, 32). Batched over time for speed.zb c t h w -> (b t) c h wNz(b t) d h w -> b t d h w)bt)shaper   r   torchno_gradr   r   )	r   xBCTHWZx_flatfeatr   r   r   forwardi   s   


zTeacherDinoVideo.forward)	__name__
__module____qualname____doc__r   r   r   r(   __classcell__r   r   r   r   r
   [   s
    r
   c                       sf   e Zd ZdZdef fddZdd Zdd Zdd
efddZ	dd
efddZ
dd
efddZ  ZS )StudentDinoVideoa  Single-frame DINO + Minimal Iterative Policy (MIP)-style 2-step RGB video regression.

    We treat the "action" as the whole video in pixel space. The same network is invoked twice:
    - step 0: pi(o, 0, 0)
    - step 1: pi(o, I_{t*}, t*)  (training uses GT noisy interpolant; inference uses t* * a0_hat)

    Outputs are in [-1, 1]. The forward() path is deterministic inference (returns step-1 output).
    Use `mip_train_preds()` to get both step predictions for supervision.
    r   c                    s&  t    t|dd| _| jj| _ttd| _dt | _	| jd }t
| jd d}t
| jd d	}tj| j	| jd
d| _ttd
| jt t| j| j| _ttj| j|dd
dt tj||ddd
dt tj||ddd
dt tj||ddd
dt tj|| j	d
dt 
| _d S )NFr   r            @   r   r	   r   )kernel_size)r3   padding)r3   strider4   )r   r   r   r   r   r   r   r   
NUM_FRAMESout_channelsmaxnnConv2dit_proj
SequentialLinearGELUt_embedConvTranspose2dTanhhead)r   r   Zdec_c1Zdec_c2Zdec_c3r   r   r   r   ~   s4   




zStudentDinoVideo.__init__c                 C   s,   |  |d d d d df }t| j|d d S )Nr   )r   r   r   )r   r!   firstr   r   r   _encode_first   s   zStudentDinoVideo._encode_firstc           	      C   s   |j d }|du rtj|| jdd|j|jd}tjjj|t	t
fddd}| |}t|s8tj||j|jd}|dd|d}| | j|jd	|| jdd}|| | }| |S )
z8pi_theta(o_feat, I_t, t) -> (B, 3*NUM_FRAMES, 256, 256).r   N   )r   dtypebilinearF)sizemodealign_cornersr   rF   )r   r   zerosr7   r   rF   r9   
functionalinterpolatePATCH_HPATCH_Wr;   	is_tensortensorreshapeexpandr?   floatr   viewr   rB   )	r   
first_featI_tr   r"   Zit32condtbzr   r   r   pi   s   


&
zStudentDinoVideo.pi?t_starc                 C   s   |j d }| |}| j|ddd|tddd}||| jdd}t|}|| d| |  j	|j
d}	| j||	|d|tddd}
||
fS )	zReturn (a0_hat, a1_hat) in (B, T, 3, 256, 256) for MIP training supervision.

        target_video should be (B, T, 3, 256, 256) in [-1, 1].
        r   N        rX   r   r/   rE   g      ?rK   )r   rD   r\   rV   r6   rS   r7   r   
randn_liker   rF   )r   r!   Ztarget_videor^   r"   rW   a0Za_flatr[   rX   a1r   r   r   mip_train_preds   s   


z StudentDinoVideo.mip_train_predsc           	      C   sd   |j d }| |}| j|ddd}| j||| |d}||tddd}||tddd}||fS )zDeterministic 2-step inference returning both steps (a0_hat, a1_hat).

        Returns:
            a0_hat: (B, T, 3, 256, 256)
            a1_hat: (B, T, 3, 256, 256)
        r   Nr_   r`   r/   rE   )r   rD   r\   rV   r6   )	r   r!   r^   r"   rW   a0_flata1_flatrb   rc   r   r   r   mip_infer_steps   s   

z StudentDinoVideo.mip_infer_stepsc           	      C   sl   |j d }| |}| j|ddd}| j||| |d}||tddd}|d|t| jtt	}||fS )zTDeterministic 2-step inference. Returns (feats, rgb) where rgb is step-1 prediction.r   Nr_   r`   r/   rE   r   )
r   rD   r\   rV   r6   	unsqueezerT   r   rO   rP   )	r   r!   r^   r"   rW   re   rf   rgbfeatsr   r   r   r(      s   

zStudentDinoVideo.forward)r]   )r)   r*   r+   r,   r   r   rD   r\   rU   rd   rg   r(   r-   r   r   r   r   r.   s   s    
r.   )r,   pathlibr   r   torch.nnr9   einopsr   Zdino_loaderr   r   r   r6   r   rO   rP   Moduler
   r.   r   r   r   r   <module>   s    N