o
    Í
j¡C  ã                   @   sF  d Z ddlZddlZddlZddlmZ ddlm  mZ ej	 
dd¡Zej	 
dd¡ZdZdZd	Zd
ZdZdZdZdZdZdZdZdd„ Zdd„ ZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dƒZedkr¡e  ej! "¡ rydnd¡Z e#dƒ edd d! $e ¡Z%e&d"d#„ e% '¡ D ƒƒZ(e&d$d#„ e% '¡ D ƒƒZ)e#d%e(d&›d'e)d&›ƒ d(\Z*Z+e ,e*e+d)dd¡ $e ¡Z-e .e*e+d*¡ $e ¡d Z/e 0¡  e%e-e/ƒZ1W d  ƒ n1 sÛw   Y  e#d+e1d, j2d-e1d. j2ƒ e#d/ƒ d	Z3e ,e*e3d)dd¡ $e ¡Z4e .e*e3d*¡ $e ¡d Z5e 0¡  e% 6e4¡Z7W d  ƒ n	1 s w   Y  e#d0e7j2ƒ e8e9e+e3ƒƒZ:e:dd)… D ]\Z;e7dd…e;e+ e;…f Z<e5dd…e;e+ e;…f Z=e 0¡  e% >e<e=e=dd…d1f ¡Z1W d  ƒ n	1 sow   Y  e#d2e;› d3e1d, j2› d4e1d. j2› d5e1d6 j2› d7e1d8 j2› 
ƒ q8e#d9e?e:ƒ› d:ƒ dS dS );u’  Two-stage autoregressive transformer policy (refactor of model_autoregressive.py).

Cameron's design (2026-05-16):
  Stage A â€” PatchEncoder: per-frame DINO + small projection. Cacheable.
            Runs ONCE per frame ever (eval) or once per window (train).
  Stage B â€” ARHead: cross-frame transformer with rel-PE (target-specific).
            Runs per prediction step. Small attention budget.

Wins vs. v1:
  Eval cost: H DINO calls/step â†’ 1 DINO call/step (HÃ— cheaper).
  Train density: one DINO call â†’ K supervision signals (per window of W>H, K = W-H targets).

Defaults: W=20 window, H=8 attention context, grid=56, D=384 (DINOv3 ViT-S/16+).
é    NÚDINO_REPO_DIRz;/Users/cameronsmith/Projects/robotics_testing/random/dinov3ÚDINO_WEIGHTS_PATHzt/Users/cameronsmith/Projects/robotics_testing/random/dinov3/weights/dinov3_vits16plus_pretrain_lvd1689m-4057cbaa.pthé   é   é   é8   i€  é   é   iÀ  é    c                 C   sV   |d }t  t j|||dt d¡|   ¡}|  d¡| }t j| ¡ | ¡ gddS )Né   ©ÚdeviceÚdtypeg     ˆÃ@éÿÿÿÿ©Údim)	ÚtorchÚexpÚarangeÚmathÚlogÚ	unsqueezeÚcatÚsinÚcos)Ú	positionsr   r   r   ÚhalfZfreqsÚangles© r   ú4/data/cameron/para/libero/model_autoregressive_v2.pyÚ_sincos_pe_1d)   s   &r    c                 C   s>   |d }t | d |||ƒ}t | d |||ƒ}tj||gddS )Nr   ).r   ).é   r   r   )r    r   r   )Úxyr   r   r   r   ÚpxÚpyr   r   r   Ú_sincos_pe_2d0   s   r%   c                       s:   e Zd ZdZeeddf‡ fdd„	Zdd„ Zdd„ Z‡  Z	S )	ÚPatchEncoderzÇPer-frame DINO patches + optional learned projection. No temporal/positional info added here.

    The output is the cache primitive: (B, W, Np, D). At eval the cache is a ring buffer of size H.
    Tc                    sÚ   t ƒ  ¡  || _t| _|| j | _| jd | _td|› dƒ tj	j
tddtd| _|r<| j ¡ D ]}d|_q1| j ¡  | jj| _| j|ksQJ d|› d	| j› ƒ‚|| _|rft t |¡t ||¡¡| _d S t ¡ | _d S )
Nr   z%PatchEncoder: loading DINOv3 (frozen=z)...Zdinov3_vits16plusÚlocal)ÚsourceÚweightsFzd_model z	 != DINO )ÚsuperÚ__init__Útarget_sizeÚDINO_PATCH_SIZEZ
patch_sizeÚpatches_per_sideÚ	n_patchesÚprintr   ÚhubÚloadr   r   ÚdinoÚ
parametersÚrequires_gradÚevalÚ	embed_dimÚfreeze_backboneÚnnÚ
SequentialÚ	LayerNormÚLinearÚprojÚIdentity)Úselfr,   Úd_modelr8   Zadd_projectionÚp©Ú	__class__r   r   r+   ?   s&   
ÿ

  zPatchEncoder.__init__c           
      C   sä  | j rˆt ¡ u | j |¡\}\}}| jjD ]}| jjr$| jj||dnd}|||ƒ}q| jjr^| j |dd…d| jj	d …f ¡}| j 
|dd…| jj	d d…f ¡}tj||gdd}n| j 
|¡}|dd…| jj	d d…f }	W d  ƒ |	 ¡ S 1 sw   Y  |	 ¡ S | j |¡\}\}}| jjD ]}| jjr¤| jj||dnd}|||ƒ}q–| jjrÞ| j |dd…d| jj	d …f ¡}| j 
|dd…| jj	d d…f ¡}tj||gdd}n| j 
|¡}|dd…| jj	d d…f S )u   x: (N, 3, H, W) â†’ (N, Np, D))ÚHÚWNr!   r   )r8   r   Úno_gradr3   Zprepare_tokens_with_masksÚblocksZ
rope_embedZuntie_cls_and_patch_normsZcls_normZn_storage_tokensÚnormr   Údetach)
r?   ÚxÚtokensZH_pZW_pÚblkZrope_sincosZcls_nZpat_nÚpatchesr   r   r   Ú_dino_patchesZ   s4   
$$
õô$$zPatchEncoder._dino_patchesc                 C   sZ   |j dd… \}}|j|| g|j dd… ¢R Ž }|  |¡}|  |¡}| ||| j| j¡S )u2   frames: (B, W, 3, H, W) â†’ patches: (B, W, Np, D)Nr   )ÚshapeÚviewrN   r=   r/   r7   )r?   ÚframesÚBrE   rJ   rM   r   r   r   Úforwardw   s
    

zPatchEncoder.forward)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
IMAGE_SIZEÚTRANSFORMER_Dr+   rN   rS   Ú__classcell__r   r   rB   r   r&   9   s    ÿr&   c                       sF   e Zd ZdZeeeeee	e
 f‡ fdd„	Zdd„ Ze	fdd„Z‡  ZS )ÚARHeaduß  Cross-frame AR transformer + readout.

    Inputs (per prediction call):
      patch_tokens:    (B, H, Np, D)  â€” sliced cache of past H frames
      eef_history_xy:  (B, H, 2)      â€” pixel coords at those H frames (state EEF)
      anchor_xy:       (B, 2)         â€” anchor for the relative-PE; usually = eef_history_xy[:, -1]
      target_size:     int (image size, for normalization)

    The "current" frame is index H-1 and the readout is the EEF query token there.
    c              
      sÖ  t ƒ  ¡  || _|| _|| _|| _|d | _t t	 
|¡d ¡| _t t	 
|¡d ¡| _t t	 
|¡d ¡| _t t d|¡t ¡ t ||¡¡| _tj||d| ddddd}tj||d| _t t |¡t ||¡t ¡ t ||| ¡¡| _t |¡| _t t ||¡t ¡ t |t¡¡| _t t ||¡t ¡ t |d	¡¡| _t t ||¡t ¡ t |d
t ¡¡| _t	 | j¡d | j }t	 | j¡d | j }	t	j||	dd\}
}| jdt	j ||
gdd !| jd¡dd d S )Nr   g{®Gáz”?r	   g        ÚgeluT)r@   ÚnheadÚdim_feedforwardÚdropoutÚ
activationÚbatch_firstÚ
norm_first)Ú
num_layersr!   é   g      à?Úij)ÚindexingÚpatch_xy_01r   r   F)Ú
persistent)"r*   r+   Úhistory_lenÚ	grid_sizer@   r.   r/   r9   Ú	Parameterr   ÚrandnÚ	eef_tokenÚtype_embed_patchÚtype_embed_eefr:   r<   ÚGELUÚ
rel_pe_mlpÚTransformerEncoderLayerÚTransformerEncoderÚtransformerr;   ÚreadoutÚ	feat_normÚN_HEIGHT_BINSÚheight_headÚgripper_headÚ
N_ROT_BINSÚrotation_headr   ÚmeshgridÚregister_bufferÚstackÚreshape)r?   ri   rj   r@   Ún_headsÚn_layersr.   ÚlayerÚysÚxsÚgrid_yÚgrid_xrB   r   r   r+   Ž   sV   

ÿ
þý
þ
þþ
ýzARHead.__init__c              	   C   s`   | j }|d }tj|||d}| tjtj|||tjdddtdƒ¡ |j|ddj|ddS )Nr!   ©r   r   )Údiagonalz-infr   r   )	r/   r   ÚzerosÚmasked_fill_ÚtriuÚonesÚboolÚfloatÚrepeat_interleave)r?   rD   r   ÚNpZ	per_frameÚblockr   r   r   Ú_causal_time_maskÅ   s
   *zARHead._causal_time_maskc           %      C   sJ  |j \}}}}|| jksJ d| j› d|› ƒ‚|| jksJ ‚|j}	|j}
|| j }| j| j  dd|¡ 	|||¡}|t
|ƒ }|t
|ƒ }tj||
|	d}t|||
|	ƒ}t| j |	¡||
|	ƒ}| j |	¡ d¡ 	||d¡}|  || d¡ d ¡}t|||
|	ƒ}|  || d¡ d ¡}g }| d¡| }t|ƒD ];}|dd…|f | ||  }|dd…|f |dd…|f  |dd…|f  ||   d¡}| tj||gdd	¡ q™tj|dd	}|  ||
¡}| j||d
}|| jd  d }|dd…|dd…f }|  |¡} |  |¡}!|  |!¡}"|  |!¡ d¡}#|  |!¡ ddt¡}$| |"|#|$dœS )zReturns: logits (B, grid^2).zexpected H=z, got r!   r   r   r   g       @Nr   )Úmaskr   rd   )Ú	xy_logitsÚheight_logitsÚgripper_logitÚrotation_logits)rO   ri   r/   r   r   rn   rm   ro   rP   ÚexpandrŽ   r   r   r    r%   rg   Útor   rq   ÚrangeÚappendr   r’   rt   ru   rv   rx   ry   Úsqueezer{   rz   )%r?   Zpatch_tokensÚeef_history_xyÚ	anchor_xyr,   rR   rD   r   ÚDr   r   rM   Zeef_tokZeef_01Z	anchor_01Ztime_idxZtime_peZ	patch_absZ
patch_xy_bZ	patch_relZeef_absZeef_relZ
frame_seqsZpatches_pos_tÚtrA   ÚeÚseqr“   ÚoutZ	query_idxÚqr”   Úfr•   r–   r—   r   r   r   rS   Ì   sJ    
 >


üzARHead.forward)rT   rU   rV   rW   ÚHISTORY_LENÚ	GRID_SIZErY   ÚTRANSFORMER_HÚTRANSFORMER_LrX   r-   r+   r’   rS   rZ   r   r   rB   r   r[   ‚   s    þ7r[   c                       s:   e Zd ZdZeeeeee	df‡ fdd„	Z
ddd„Z‡  ZS )	ÚARTransformerPolicyV2aI  End-to-end convenience wrapper. For multi-target training, use the two stages directly:
        patches = self.patch_encoder(window_frames)      # (B, W, Np, D), one DINO call
        for t in target_steps:
            history = patches[:, t-H:t]
            logits  = self.ar_head(history, eef_xy[:, t-H:t], eef_xy[:, t-1])
    Tc                    sH   t ƒ  ¡  || _|| _|| _t|||ƒ| _t||||||t d| _	d S )N)r.   )
r*   r+   r,   ri   rj   r&   Úpatch_encoderr[   r-   Úar_head)r?   r,   ri   rj   r@   r€   r   r8   rB   r   r   r+   	  s   
ÿzARTransformerPolicyV2.__init__Nc                 C   s4   |   |¡}|du r|dd…df }|  |||| j¡S )uô   Single-target forward (predicts EEF after the last frame).

        Args:
            frames: (B, H, 3, target_size, target_size)
            eef_history_xy: (B, H, 2)
            anchor_xy: (B, 2) â€” defaults to eef_history_xy[:, -1]
        Nr   )r«   r¬   r,   )r?   rQ   r   rž   rM   r   r   r   rS     s   
zARTransformerPolicyV2.forward©N)rT   rU   rV   rW   rX   r¦   r§   rY   r¨   r©   r+   rS   rZ   r   r   rB   r   rª     s    þrª   c                   @   s(   e Zd ZdZdd„ Zdd„ Zdd„ ZdS )	ÚRolloutCachea÷  Ring buffer of patch tokens for closed-loop AR rollout.

    Usage:
        cache = RolloutCache(history_len=8, n_patches=784, d_model=384, device=device)
        # at each rollout step:
        new_patches = patch_encoder(frame.unsqueeze(0).unsqueeze(0))[:, 0]  # (1, Np, D)
        cache.push(new_patches, eef_xy)
        history_patches, history_eef = cache.window()
        logits = ar_head(history_patches, history_eef, history_eef[:, -1])
        next_eef = grid_idx_to_pixel(logits.argmax())
    c                 C   s:   || _ tjd||||d| _tjd|d|d| _d| _d S )Nr!   r‡   r   r   )rD   r   r‰   rM   Úeef_xyÚfill)r?   ri   r/   r@   r   r   r   r   r+   1  s   
zRolloutCache.__init__c                 C   sÚ   |  ¡ dkr| dd¡n|}| j| jk r1|d | jd| jf< |d | jd| jf< |  jd7  _dS | jdd…dd…f  ¡ | jdd…dd…f< | jdd…dd…f  ¡ | jdd…dd…f< |d | jd< |d | jd< dS )z3new_patches: (1, Np, D)  new_eef_xy: (2,) or (1, 2)r!   r   r   Nr   )r   r   )r   rP   r°   rD   rM   r¯   Úclone)r?   Znew_patchesZ
new_eef_xyr   r   r   Úpush7  s   ,,zRolloutCache.pushc                 C   sˆ   | j | jk r>| j ¡ }| j ¡ }t| j | jƒD ]"}|dtd| j d ƒf |d|f< |dtd| j d ƒf |d|f< q||fS | j| jfS )zQReturn the current H-window, left-padded with the earliest frame if not yet full.r   r!   )r°   rD   rM   r±   r¯   rš   Úmax)r?   rM   ÚeefÚir   r   r   ÚwindowD  s   

 "zRolloutCache.windowN)rT   rU   rV   rW   r+   r²   r¶   r   r   r   r   r®   $  s
    r®   Ú__main__ÚcudaÚcpuz6
== Smoke test: ARTransformerPolicyV2 single-target ==T)ri   r8   c                 c   s    | ]
}|j r| ¡ V  qd S r­   )r5   Únumel©Ú.0rA   r   r   r   Ú	<genexpr>V  s   € r½   c                 c   s    | ]}|  ¡ V  qd S r­   )rº   r»   r   r   r   r½   W  s   € zTrainable: ú,z / )r   r   rd   r   zsingle-target xy:r”   zheight:r•   u1   
== Smoke test: PatchEncoder once, ARHead Ã— K ==zpatches:r   z  target_step=z: xy z	  height z  grip r–   z  rot r—   z(would run z% ARHead calls per DINO call at train))@rW   Úosr   r   Útorch.nnr9   Útorch.nn.functionalÚ
functionalÚFÚenvironÚgetr   r   r-   r¦   Z
WINDOW_LENr§   rY   r¨   r©   rX   rw   ZN_GRIPPER_BINSrz   r    r%   ÚModuler&   r[   rª   r®   rT   r   r¸   Úis_availabler0   r™   ÚmodelÚsumr4   Ún_trainÚn_totalrR   rD   rl   rQ   Úrandr´   rF   r£   rO   rE   Zframes_wZeef_wr«   rM   Úlistrš   Útarget_stepsr    Úhist_pÚhist_er¬   Úlenr   r   r   r   Ú<module>   sz    	I#
,
ÿ
ÿ
ÿ"ÿÿà