o
    	Cj                     @   s  d Z ddlZddlZddlZddlmZ ddlm  mZ ej	
ddZej	
ddZdZdZd	Zd	Zd	Zd
Zd	ZdZdZG dd dejZG dd dejZedkreej r\dndZedde Ze dd e! D Z"e#de"d e$ddeeeZ%e&dedeZ'e(  ee%e'dZ)W d   n1 sw   Y  e)* D ]\Z+Z,e-e,dre#de+ de.e,j/  qdS dS ) u  DINO + per-pixel 5-layer residual MLP that regresses all T-step bins.

Baseline for the query-MLP design. Same DINO trunk, same F refinement to d_feat,
but the head is a per-pixel 5-layer residual MLP (1×1 conv stack) that at each
pixel outputs the entire flattened-in-time prediction:
  - volume:   T * Z         (height bin per t)
  - gripper:  T * n_grip
  - rotation: T * 3 * n_rot

Volume is materialised densely (B, T, Z, H, W) — needed for the argmax heatmap.
For gripper/rotation: the dense (T * n_grip, H, W) etc. tensors would be huge,
so we run the conv head only at the per-(b, t) GT pixel (sample penult first,
then small linear). The trained weights are equivalent to a dense 1×1 conv —
this is just a memory-efficient evaluation.

Per Cameron 2026-05-20: "produce the same F feature map, and regress the height
bins rotation bins and gripper bins, flattened in time, as a 5 layer residual
mlp on top of each pixel feature (no volume)".
    NDINO_REPO_DIRz/data/cameron/keygrip/dinov3DINO_WEIGHTS_PATHzU/data/cameron/keygrip/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthi  2       8         c                       s,   e Zd ZdZef fdd	Zdd Z  ZS )ResMLPBlocku   Per-pixel residual MLP block, expressed as 1×1 conv stack.
       Mathematically: at each pixel, x → LN(x) → Linear(d → 4d) → GELU → Linear(4d → d) → +x.c                    sL   t    tjd|dd| _tj||| dd| _tj|| |dd| _d S )N   T
num_groupsnum_channelsaffinekernel_size)super__init__nn	GroupNormnormConv2dfc1fc2)selfd	mlp_ratio	__class__ 1/data/cameron/para/libero/model_dino_per_pixel.pyr   +   s   
zResMLPBlock.__init__c                 C   s0   |  |}| |}t|}| |}|| S N)r   r   Fgelur   )r   xhr   r   r   forward1   s
   



zResMLPBlock.forward)__name__
__module____qualname____doc__	MLP_RATIOr   r%   __classcell__r   r   r   r   r	   (   s    r	   c                
       sD   e Zd Zeeeeeee	e
edf
 fdd	Zdd Zd	ddZ  ZS )
DinoPerPixelMLPFc              	      s  t    || _|| _|| _|| _ | _|| _|	| _t	j
jtddtd| _|
r2| j D ]}d|_q,| jj| _ttj| j| jdddt tj| j dd| _t fd	d
t|D | _tjd dd| _tj || dd| _t || | _t |d | | _d S )NZdinov3_vits16pluslocal)sourceweightsF   r
   )r   paddingr   c                    s   g | ]}t  qS r   )r	   ).0_d_featr   r   r   
<listcomp>Y   s    z,DinoPerPixelMLP.__init__.<locals>.<listcomp>Tr   ) r   r   n_windown_height_binsn_gripper_bins
n_rot_binsr5   
image_size	pred_sizetorchhubloadr   r   dino
parametersrequires_grad	embed_dimr   
Sequentialr   GELUrefine
ModuleListrangeblocksr   
final_normvolume_headLineargripper_headrotation_head)r   r7   r8   r9   r:   r5   Zn_blocksr   r;   r<   Zfreeze_backbonepr   r4   r   r   :   s2   


 zDinoPerPixelMLP.__init__c                 C   s   |j d }| j|\}\}}| jjD ]}| jjr!| jj||dnd }|||}q| jjr[| j|d d d | jjd f }| j|d d | jjd d f }	t	j
||	gdd}n| j|}|d d | jjd d f }
|
|||| jdddd S )Nr   )HWr
   )dimr0      )shaper@   Zprepare_tokens_with_masksrI   Z
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensr   r=   catreshaperC   permute
contiguous)r   r#   BZx_tokensZH_pZW_pblkZropeZx_clsZx_patpatchr   r   r   _extract_dino_featuresf   s   
$$"z&DinoPerPixelMLP._extract_dino_featuresNc                 K   s  |j d }| j}| j}| |}tj|| j| jfddd}	| |	}
|
}| jD ]}||}q'| 	|}| 
|}|j dd \}}||||||}||
d}|dur|d  d|d	 }|d
  d|d	 }tj||jd|d	||}||dd||f }| ||||| j}| ||||d| j}tj||jdd	|||}||||f |d< ||||f |d< |S )uM  
        rgb: (B, 3, IMG, IMG)
        start_pix: unused here (kept for interface parity with query model)
        query_pixels: (B, T, 2) of (y_grid, x_grid) in pred_size coords — per-timestep
                       GT pixels at training, volume argmax at inference. If None, grip/rot
                       not returned.
        r   bilinearF)sizemodealign_cornersN)volume_logitsZpixel_feats).r   r
   ).r
   )devicer0   Zgripper_logitsZrotation_logits)rT   r7   r8   r\   r!   interpolater<   rF   rI   rJ   rK   viewlongclampr=   arangerc   expandrM   r9   rN   r:   )r   rgbZ	start_pixquery_pixelskwargsrY   TZr[   Zfeat_upZF_featr$   rZ   volrP   rQ   rb   outqyqxZb_idxZsampledZgrip_allZrot_allZt_idxr   r   r   r%   u   s6   







  zDinoPerPixelMLP.forward)NN)r&   r'   r(   N_WINDOWN_HEIGHT_BINSN_GRIPPER_BINS
N_ROT_BINSD_FEATN_BLOCKSr*   IMG_SIZE	PRED_SIZEr   r\   r%   r+   r   r   r   r   r,   9   s    ,r,   __main__cudacpu)r7   c                 c   s    | ]
}|j r| V  qd S r    )rB   numel)r2   rO   r   r   r   	<genexpr>   s    r   zTrainable: ,rS   r0   )rS   r   rS   )rk   rT   z  z: )0r)   osmathr=   torch.nnr   torch.nn.functional
functionalr!   environgetr   r   ry   rs   rt   ru   rv   rz   rw   r*   rx   Moduler	   r,   r&   rc   r|   is_availabletoevalmsumrA   Zn_tprintrandrj   randintqpno_gradrp   itemskvhasattrtuplerT   r   r   r   r   <module>   sD    l

	