o
    qj                  	   @   s  d Z ddlZddlZddlZddlZddlmZ ddlm  mZ	 ej
dej
e ddlmZmZmZmZmZmZmZmZ dZdZdZdZdZd	Zd
ZG dd deZedkre ej!" rcdndZ e #e $ Z%e&dd e%' D Z(e)de(d e*ddee#e Z+ej,e-dddefe-dd
defe-dd
defgdd#e Z.e/  e%e+e.Z0W d   n1 sw   Y  e01 D ]\Z2Z3e4e3dre)de2 de5e3j6  qdS dS )u  DinoVolumeKV + gripper/rotation MLP heads (libero-style teacher forcing).

Inputs:  rgb (B, 3, IMG, IMG), query_pixels (B, T, 2) in 504-space (GT at train, argmax at eval)
Outputs:
  volume_logits:   (B, T, Z, h, w)         — unchanged from DinoVolumeKV
  gripper_logits:  (B, T, N_GRIPPER_BINS)  — CE over discretised gripper
  rotation_logits: (B, T, 3, N_ROT_BINS)   — CE per euler axis
  pixel_feats:     (B, key_dim, h, w)      — for viz/debug

Gripper/rotation MLPs index `pixel_feats` at `query_pixels` (teacher-forced GT at train).
We DETACH the indexed features so MLP grads don't destabilise the volume objective —
mirrors the original libero pattern where the volume head is the load-bearing one.
    N)DinoVolumeKVDINO_REPO_DIRDINO_WEIGHTS_PATHsinusoidal_featuresIMG_SIZEN_WINDOWN_HEIGHT_BINSKEY_DIM0       i           8   c                       s   e Zd ZdZeeeeddddee	e
eeedfdededed	ed
ededededededededededef fddZdd Zd fdd	Z  ZS )DinoVolumeKVFullu7  Volume KV + TEMPORAL TRANSFORMER head for gripper and rotation.

    Per Cameron 2026-05-19 (later in session): replace per-timestep MLPs with a tiny
    transformer encoder that processes all 8 future keypoints jointly. Each token =
       z_emb[z_t] + y_emb[y_t] + x_emb[x_t] + t_emb[t] + Linear(F[y_t, x_t])
    Self-attention over the 8 tokens lets gripper@t=5 see the keypoints at t=0..4 and t=6,7
    so it can reason about trajectory phase ("descending → grasping → ascending"). Each
    output token is mapped to gripper bins + rotation bins per axis via small linears.

    Inputs at forward time:
      rgb:     (B, 3, IMG, IMG)
      kp_zyx:  (B, T, 3) long — (z_bin, y_grid, x_grid). At training this is GT (teacher
               forcing); at inference it's the argmax over the volume head's output.
    sin   Zdinov3_vits16plusFn_windown_height_binskey_dim
image_size
height_enctime_enchead_hiddendino_variant
n_rot_binsn_gripper_binsd_model	tf_layerstf_heads	pred_griddetach_for_headc              
      s   t  j||||||||d |	| _|
| _|| _|| _|| _| jdt||dd | jdt||dd t	
||| _t	
||| _t	||| _t	j|||d ddd	d	d
}t	j||d| _t	|| _t	||
| _t	||	| _d S )N)r   r   r   r   r   r   r   r   z_sinF)
persistentt_sinr   g        geluT)r   nheaddim_feedforwarddropout
activationbatch_first
norm_first)
num_layers)super__init__r   r   r   r    r!   register_bufferr   nn	Embedding	y_tok_emb	x_tok_embLinearfeat_to_tokenTransformerEncoderLayerTransformerEncodertransformer	LayerNormout_normgripper_outrotation_out)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   	enc_layer	__class__ 6/data/cameron/para/libero/model_dino_volume_kv_full.pyr.   1   s,   
zDinoVolumeKVFull.__init__c                 C   s&  |j \}}}}|j d }| jr| n|}|d d| jd }	|d d|d }
|d d|d }tj||jd|d	||}||dd|
|f }tj||jdd|	||}| j
|	 }| j| }|| | |
 | | | | }| |}| |}| |}| |}||fS )u   pixel_feats: (B, C, ph, pw). kp_zyx: (B, T, 3) long — (z_bin, y_grid, x_grid).
        Returns gripper_logits (B, T, n_grip), rotation_logits (B, T, 3, n_rot).   ).r   r   ).rC   ).   )deviceN)shaper!   detachclampr   torcharangerE   viewexpandr"   r$   r2   r3   r5   r8   r:   r;   r<   )r=   pixel_featskp_zyxBCphpwTZfeatszyxZ	batch_idxZsampledZt_idxZz_peZt_petokenshgriprotrA   rA   rB   predict_from_keypoints[   s&   
  





z'DinoVolumeKVFull.predict_from_keypointsNc                    s<   t  |}|dur| |d |\}}||d< ||d< |S )u   If kp_zyx given, run the transformer head and emit gripper/rotation logits.
        kp_zyx: (B, T, 3) long — (z_bin, y_grid, x_grid). GT at train, argmax at inference.NrM   gripper_logitsrotation_logits)r-   forwardr[   )r=   rgbrN   outrY   rZ   r?   rA   rB   r^   v   s   zDinoVolumeKVFull.forwardN)__name__
__module____qualname____doc__r   r   r	   r   
N_ROT_BINSN_GRIPPER_BINSD_MODEL_HEAD	TF_LAYERSTF_HEADS	PRED_GRIDintstrboolr.   r[   r^   __classcell__rA   rA   r?   rB   r   "   sN    *r   __main__cudacpuc                 c   s    | ]
}|j r| V  qd S ra   )requires_gradnumel).0prA   rA   rB   	<genexpr>   s    rw   zTrainable: ,rD      )dimrF   z  z: )7re   ossysmathrI   torch.nnr0   Ztorch.nn.functional
functionalFpathinsertdirname__file__Zmodel_dino_volume_kvr   r   r   r   r   r   r   r	   rf   rg   Z	DA3_INPUTrh   ri   rj   rk   r   rb   rE   rq   is_availabletoevalmsum
parametersn_tprintrandr_   stackrandintkpno_gradr`   itemskvhasattrtuplerF   rA   rA   rA   rB   <module>   sN    (_

