o
    Gjz-                  	   @   s  d Z ddlZddlZddlZddlZddlmZ ddlm  mZ	 ej
ddZej
ddZdZdZd	Zd
ZdZdZdZd:ddZG dd dejZG dd dejZG dd dejZd;ddZG dd dejZedkr?eej rydndZe  e! Z"e#d d! e"$ D Z%e&d"e%d# e'd$d%ee eZ(e'd$ed$ ee Z)ee)eeeeeed&Z*e&d'e*+ , d(d)e*- , d( ej.de"j/d*ed+Z0e1e*Z2e"3e*e0e2Z4e"e(e4e0Z5e&d,e6e5j7  e	8e5e2Z9e&d-e9, d. e"j:e(dd/ d0d1Z;e&d2e6e;j7 d3e;- , d4d5e;+ , d4d6 ej<dkrAe&d7ej= d8 d4d9 dS dS dS )<uC  DINOv3-conditioned heatmap diffusion (DDPM).

Per Cameron 2026-05-19: denoise heatmaps directly. The forward process is
  x_t = sqrt(α̅_t) · x_0 + sqrt(1 - α̅_t) · ε,  ε ~ N(0, I)
where x_0 is the GT heatmap stack (B, T, H, W) — Gaussian blob (σ≈2px) at each future GT
pixel. The model predicts ε given (rgb, x_t, t). Standard cosine β schedule, 1000 train
timesteps, 10-step DDIM at inference.

Why heatmaps (not coords): multiple plausible futures = multiple modes in the heatmap;
diffusion samples from one mode per pass rather than collapsing to the centroid.
    NDINO_REPO_DIRz/data/cameron/keygrip/dinov3DINO_WEIGHTS_PATHzU/data/cameron/keygrip/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pth   i     8          @)g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?Mb?+?c                 C   sr   | d }t d| |}t ||  | d|  tj d d }||d  }d|dd  |d d   }|j|dS )N   r   g      ?   )max)torchlinspacecosmathpiclamp)TsZmax_betastepsxfalphas_cumprodbetas r   1/data/cameron/para/libero/model_dino_diffusion.pycosine_betas   s   (r   c                       $   e Zd Z fddZdd Z  ZS )SinusoidalTimeEmbc                    s   t    || _d S N)super__init__dim)selfr#   	__class__r   r   r"   '   s   zSinusoidalTimeEmb.__init__c                 C   sf   | j d }ttd tj||jd | }| d|d }tj	t
|t|gddS )Nr   i'  devicer   r   r#   )r#   r   expr   logaranger(   float	unsqueezecatsinr   )r$   thalfZfreqsar   r   r   forward)   s   
&zSinusoidalTimeEmb.forward__name__
__module____qualname__r"   r4   __classcell__r   r   r%   r   r   &   s    r   c                       r   )ResBlockc                    s`   t    td|| _tj||ddd| _td|| _tj||ddd| _t	||| _
d S )Nr      r
   padding)r!   r"   nn	GroupNormnorm1Conv2dconv1norm2conv2Lineart_proj)r$   cht_dimr%   r   r   r"   2   s   
""zResBlock.__init__c                 C   sV   |  t| |}|| |d d d d d d f  }| t| |}|| S r    )rB   Fsilur@   rF   rD   rC   )r$   r   t_ehr   r   r   r4   7   s   "zResBlock.forwardr5   r   r   r%   r   r:   1   s    r:   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )		SmallUNetuB   Small 3-level UNet for (B, in_ch, 56, 56) → (B, out_ch, 56, 56).      c              	      s8  t    tt|t||t t||| _tj||ddd| _	t
||| _tj||d dddd| _t
|d || _tj|d |d dddd| _t
|d || _tj|d |d dddd| _t
|d || _tj|d |dddd| _t
||| _tj||ddd| _tj| jj tj| jj d S )Nr;   r
   r<   r      )strider=   )r!   r"   r>   
Sequentialr   rE   SiLU	t_emb_mlprA   in_convr:   r1down1r2down2r_midConvTranspose2dup2r2bup1r1bout_convinitzeros_weightbias)r$   in_chout_chbase_chrH   r%   r   r   r"   @   s"   


$zSmallUNet.__init__c           	      C   s   |  |}| |}| ||}| |}| ||}| |}| ||}| || }| ||}| 	|| }| 
||}| |S r    )rT   rU   rV   rW   rX   rY   rZ   r\   r]   r^   r_   r`   )	r$   r   r1   rK   Zh0h1h2u1u0r   r   r   r4   T   s   

zSmallUNet.forward)rN   rO   )r6   r7   r8   __doc__r"   r4   r9   r   r   r%   r   rM   >   s    rM     cpuc                 C   s   | j d }|| }|| }	| d | }
| d |	 }tj||tjddd|d}tj||tjdddd|}|
||dd}
|||dd}t||
 d || d   d|d   }|d d S )	uw  gt_pix_504: (B, T, 2) GT pixels in image_size space.
    Returns: (B, T, H, W) GT heatmap rescaled to [-1, 1] so diffusion noise (N(0,1)) doesn't
    overwhelm the signal at moderate t. Peak = +1 at GT pixel, far background = -1.
    Previous version was [0, 1] which was the bug — 99% of pixels at 0 with N(0,1) noise
    means signal-to-noise is terrible for most pixels.r   ).r   ).r
   r(   dtyper
   r   r         ?)shaper   r,   float32viewr*   )
gt_pix_504r   HWsigma_px
image_sizer(   BZscale_xZscale_ycxcyysxsgr   r   r   make_gaussian_heatmap^   s   
 ,r   c                       sp   e Zd ZdZeeedddfdedef fddZ	d	d
 Z
dd Zdd Zdd Ze ddefddZ  ZS )DinoHeatmapDiffusionu   DINOv3-conditioned heatmap diffusion model.

    Inputs:  rgb (B,3,IMG,IMG), x_t (B, T, H, W) noisy heatmap stack, t (B,) ints in [0, T_diff)
    Outputs: ε_pred (B, T, H, W) — DDPM noise prediction.
    @   i  TT_difffreeze_backbonec                    s  t    || _|| _|| _|| _ttjvrtj	dt t
jjtddtd| _|r7| j D ]}|d q/t| jdd| _tt| j|dtd	|t tj||d
dd| _t|| |ddd| _t|}d| }	t
j|	dd}
| jd|dd | jd|	dd | jd|
dd | jdt
|
dd | jdt
d|
 dd | jdt
t !dd
dddd | jdt
t"!dd
dddd d S )Nr   Zdinov3_vits16pluslocal)sourceweightsF	embed_dimi  r
   r   r;   r<   rN   rO   )re   rf   rg   rH   rq   r)   r   )
persistentalphasr   sqrt_alphas_cumprodsqrt_one_minus_alphas_cumprodmeanstd)#r!   r"   n_windowry   heatmap_resr   r   syspathinsertr   hubloadr   dino
parametersrequires_grad_getattrr   r>   rR   rA   r?   rS   	cond_projrM   unetr   cumprodregister_buffersqrttensorIMAGENET_MEANrt   IMAGENET_STD)r$   r   ry   r   Zcond_dimr   r   pr   r   r   r%   r   r   r"   u   s>   


"&zDinoHeatmapDiffusion.__init__c                 C   s   || j  | j S r    )r   r   )r$   Zrgb01r   r   r   
_normalize   s   zDinoHeatmapDiffusion._normalizec                 C   s  |j d }|j d | jkrtj|| j| jfddd}| |}tj r'tjntj	}tj
|jj|d | j|}W d   n1 sDw   Y  t|trX|d|d	}n|}|tj}|j d }| jt }|dd
d||||}	tj|	| j| jfddd}
| |
S )z$Return cond features at heatmap_res.r   r   bilinearF)sizemodealign_corners)device_typerp   NZx_norm_patchtokensZ	x_prenormr   r
   )rr   ry   rI   interpolater   r   cudais_bf16_supportedbfloat16float16autocastr(   typer   Zforward_features
isinstancedictgettors   DINO_PATCH_SIZEpermutereshaper   r   )r$   rgbrz   r   Zautocast_dtypeZfeatsZpatch_tokensDr   Zfeat_2dZfeat_hmr   r   r   _cond_features   s*   





z#DinoHeatmapDiffusion._cond_featuresc                 C   s(   |  |}tj||gdd}| ||S )u;   ε-prediction. rgb (B,3,*,*), x_t (B, T, H, W), t (B,) int.r
   r)   )r   r   r/   r   )r$   r   x_tr1   condinpr   r   r   r4      s   
zDinoHeatmapDiffusion.forwardc                 C   s<   | j | dddd}| j| dddd}|| ||  S )u:   Forward diffusion: x_t = sqrt(α̅) x_0 + sqrt(1-α̅) ε.r   r
   )r   rt   r   )r$   x_0r1   noisesaZsomar   r   r   q_sample   s   zDinoHeatmapDiffusion.q_sample
   n_stepsc                 C   s  |j }|jd }| j }}tj|| j|||d}t| jd d|d  	|}| 
|}	t|D ]Y}
||
 }||
d  }tj|ft||tjd}tj||	gdd}| ||}| j| }|dkrh| j| ntjd|d}|d|  |  |  }| | d|  |  }q2|S )z9DDIM-style sampling from t=T_diff-1 down to 0 in n_steps.r   r'   r
   ro   r)   rq   )r(   rr   r   r   randnr   r   r   longr   r   rangefullintr/   r   r   r   r   )r$   r   r   r(   rz   rv   rw   r   Z	timestepsr   ir1   Zt_nextZt_batchr   epsr3   Za_nextZx0_hatr   r   r   sample   s    

"

 zDinoHeatmapDiffusion.sample)r   )r6   r7   r8   rl   N_WINDOWIMG_SIZEHEATMAP_RESr   boolr"   r   r   r4   r   r   no_gradr   r9   r   r   r%   r   r   o   s     $r   __main__r   c                 c   s    | ]
}|j r| V  qd S r    )requires_gradnumel).0r   r   r   r   	<genexpr>   s    r   zTrainable: ,r   r;   )ry   r(   zx0 max: z.3fz min: )r   r'   z
eps_pred: zinit MSE loss: z.4fr
      )r   zsample: z	, range [z.2fz, ]zpeak: g    eAz GB)r   r	   )rm   rn   )>rl   osr   r   r   torch.nnr>   torch.nn.functional
functionalrI   environr   r   r   r   r   r   r   GAUSSIAN_SIGMAr   r   r   Moduler   r:   rM   r   r   r6   r(   r   is_availabler   evalmsumr   n_tprintrandr   gt_pixx0r   itemminrandintr   r1   
randn_liker   r   r   eps_predtuplerr   mse_losslossr   sampr   max_memory_allocatedr   r   r   r   <module>   s\    
	
 
e(
6 