o
    'jc&                     @   s  d Z ddlZddlZddlZddlmZ ddlm  mZ ddl	m
Z ejddZejddZdZd	Zd
Zd
Zd
ZdZd
ZdZdZdZd%ddZG dd dejZG dd dejZedkreej ! ridndZed	d"e# Z$e%dd e$& D Z'e(de'd e)ddee"eZ*ej+e,dd
de,ddde,dddgd d!"eZ-e.  e$e*e-Z/W d   n1 sw   Y  e/0 D ]\Z1Z2e3e2d"re(d#e1 d$e4e2j5  qdS dS )&u  DINO + per-voxel AdaLN-Zero MLP volume head (Peebles DiT-style conditioning).

Replaces the rank-1 bilinear F·key scoring of the volume KV head with a small
(t,z)-conditioned MLP per voxel — strictly more expressive, fixes the "per-t
heatmap collapse" diagnosed across volume v1/v2/v3.

Per Cameron 2026-05-20 ("ultrathink" turn):
  - 1×1 conv layout treats (B, T, Z) as the batch dim → no 6D materialisation.
  - Bottleneck d=D_FEAT → D_BOT → D_FEAT per block (4× cheaper than full d).
  - Shared first projection: refine DINO once to F ∈ (B, D_FEAT, H, W),
    then broadcast across (T, Z) and apply N_BLOCKS AdaLN-Zero blocks.
  - α (residual scale) and FiLM γ,β derived from cond = sin(t) + sin(z),
    final cond_proj zero-initialised → block starts as identity at init.
  - Gripper/rotation read from the MLP penultimate (post-blocks, pre-heatmap)
    at the per-(b,t) argmax voxel (GT during training, teacher-forced).
    N)
checkpointDINO_REPO_DIRz/data/cameron/keygrip/dinov3DINO_WEIGHTS_PATHzU/data/cameron/keygrip/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthi  0       8                 @c                 C   s   t j| t jd}t t jd|dt jdt||   }t | |}t |d| |d d dd df< t 	|d| |d d dd df< |S )N)dtyper   r
      )
torcharangefloat32expmathlogzerossin	unsqueezecos)ndimbaseposdivpe r   3/data/cameron/para/libero/model_dino_volume_film.pysinusoidal_features'   s   *&&r    c                       s(   e Zd ZdZ fddZdd Z  ZS )AdaLNZeroBlocku   AdaLN-Zero residual block over a (N, d, H, W) tensor.
       x → GN(x) → FiLM(γ,β) → 1×1conv(d → d_bot) → GELU → 1×1conv(d_bot → d)
       → + α · ...  (α zero-init → block is identity at start)
    c                    sv   t    tjd|dd| _t|d| | _tj| jj	 tj| jj
 tj||dd| _tj||dd| _d S )Nr   F)
num_groupsnum_channelsaffine   kernel_size)super__init__nn	GroupNormnormLinear	cond_projinitzeros_weightbiasConv2ddownup)selfdd_botd_cond	__class__r   r   r)   5   s   
zAdaLNZeroBlock.__init__c                 C   s   |  |}|jddd\}}}|d|jd dd}|d|jd dd}|d|jd dd}| |}|d|  | }| |}t|}| |}|||  S )Nr%   r   r   g      ?)	r.   chunkviewshaper,   r4   Fgelur5   )r6   xcondZgbagbahr   r   r   forward@   s   




zAdaLNZeroBlock.forward)__name__
__module____qualname____doc__r)   rI   __classcell__r   r   r:   r   r!   0   s    r!   c                       sH   e Zd Zeeeeeee	e
eeddf fdd	Zdd Zd
dd	Z  ZS )DinoVolumeFiLMFTc              	      s,  t    || _|| _|| _|| _| _|	| _|
| _|| _	t
jjtddtd| _|r5| j D ]}d|_q/| jj| _ttj| j| jdddt tj| jdd| _| jd	t|dd
 | jdt|dd
 t fddt|D | _tjddd| _t|| _td| | _ d S )NZdinov3_vits16pluslocal)sourceweightsFr%   r   )r'   paddingr&   t_sin)
persistentz_sinc                    s   g | ]}t  qS r   )r!   ).0_r8   r9   d_featr   r   
<listcomp>t   s    z+DinoVolumeFiLM.__init__.<locals>.<listcomp>)!r(   r)   n_windown_height_binsn_gripper_bins
n_rot_binsrZ   
image_size	pred_sizeuse_checkpointr   hubloadr   r   dino
parametersrequires_grad	embed_dimr*   
Sequentialr3   GELUrefineregister_bufferr    
ModuleListrangeblocksheatmap_headr-   gripper_headrotation_head)r6   r\   r]   r^   r_   rZ   r8   r9   Zn_blocksr`   ra   Zfreeze_backbonerb   pr:   rY   r   r)   P   s:   



zDinoVolumeFiLM.__init__c                 C   s   |j d }| j|\}\}}| jjD ]}| jjr!| jj||dnd }|||}q| jjr[| j|d d d | jjd f }| j|d d | jjd d f }	t	j
||	gdd}n| j|}|d d | jjd d f }
|
|||| jdddd S )Nr   )HWr   r=   r%   r
   )r@   re   Zprepare_tokens_with_masksro   Z
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensr,   r   catreshaperh   permute
contiguous)r6   rC   BZx_tokensZH_pZW_pblkZropeZx_clsZx_patpatchr   r   r   _extract_dino_features~   s   
$$"z%DinoVolumeFiLM._extract_dino_featuresNc                 C   sR  |j d }| j| j| j}}}| |}tj|| j| jfddd}| |}	|	j dd \}
}| j	
|dd| j
d|d }|
d||d|||d|| | d }|	
|dd||
||||||
|}||| | ||
| }| jD ]}| jr| jrt|||dd	}q{|||}q{|}| |}|
||||
|}||	d
}|dur'|d d|d }|d d|
d }|d d|d }tj||jd
d|||}tj||jd
|d||}|||  ||  |  }| | }}||dd||f |||}| ||d< | |
||d| j}||d< |S )u   rgb: (B, 3, IMG, IMG). kp_zyx: (B, T, 3) long — (z_bin, y_grid, x_grid)
        at GT during training (teacher forcing), volume argmax at inference.
        r   bilinearF)sizemodealign_cornersNr   r<   )Zuse_reentrant)volume_logitsZpixel_feats).r   ).r   ).r
   )devicegripper_logitsr%   Zrotation_logits)r@   r\   r]   rZ   r}   rA   interpolatera   rk   rT   r?   rV   expandrw   ry   ro   rb   trainingckptrp   clampr   r   r   flattenrq   rr   r_   )r6   rgbkp_zyxrz   TZr7   r|   Zfeat_upZF_featrt   ru   Zcond_tzZ	cond_flatZF_exprC   r{   Zpenultlogitr   outZz_idxy_idxx_idxZt_idxZb_idxflatZyfxfZsampledrotr   r   r   rI      sB   


 0$



  zDinoVolumeFiLM.forwardN)rJ   rK   rL   N_WINDOWN_HEIGHT_BINSN_GRIPPER_BINS
N_ROT_BINSD_FEATD_BOTD_CONDN_BLOCKSIMG_SIZE	PRED_SIZEr)   r}   rI   rN   r   r   r:   r   rO   O   s    .rO   __main__cudacpu)r\   c                 c   s    | ]
}|j r| V  qd S r   )rg   numel)rW   rs   r   r   r   	<genexpr>   s    r   zTrainable: ,r%   )r
   r   r<   r=   r@   z  z: )r   )6rM   osr   r   torch.nnr*   torch.nn.functional
functionalrA   Ztorch.utils.checkpointr   r   environgetr   r   r   r   r   r   r   r   r   r   r   r   r    Moduler!   rO   rJ   r   r   is_availabletoevalmsumrf   Zn_tprintrandr   stackrandintkpno_gradr   itemskvhasattrtupler@   r   r   r   r   <module>   sZ    
	u

