o
    =j                     @   s  d Z ddlZddlZddlZdD ]ZeeZeeje< qdd ejd _dd ejd _	ddl
Z
ddlmZ dd	lmZ d
ZdZdZG dd dejZG dd dejZedkre
e
j rbdndZe e Zedd e D Zeded e
ddeeeZ e
!  ee Z"W d   n1 sw   Y  e"# D ]@\Z$Z%e&e%drede$ de'e%j(  qe)e%e*e'frede$ de+e% d e%re&e%d drede'e%d j(  qej,dkred e
j- d! d"d# dS dS dS )$u  DA3-based pixel-aligned heatmap predictor.

Wraps Depth-Anything-3 (DA3-SMALL). Repurposes the DualDPT head:
  - main head: 1 ch depth + 1 ch conf — kept, supervised by frozen-DA3 depth predictions.
  - aux  head: original 7 + 1 ch — REPLACED with (N_WINDOW + 1) channels so the
               DualDPT split (`aux_pred=[..., :-1]`, `aux_conf=[..., -1]`) yields
               N_WINDOW heatmap channels and 1 unused conf channel.

We bypass DA3's `_process_camera_estimation` (which DELETES `ray`/`ray_conf` from the
output) and call `backbone` + `head` directly.

Output of `forward(rgb)`:
  pred_heatmap: (B, N_WINDOW, H_out, W_out)  raw logits
  pred_depth:   (B, H_out, W_out)
  dino_feats:   list of (B, n_patches, C) intermediate DINO features (for PCA viz)
    N)depth_anything_3.utils.export!depth_anything_3.utils.pose_alignc                  O      d S N akr   r   ,/data/cameron/para/libero/model_da3_pixel.py<lambda>       r   r   c                  O   r   r   r   r   r   r   r
   r      r   r   )DepthAnything3z/data/cameron/da3_weightsi     c                       s8   e Zd ZdZdefdedef fddZdd Z  ZS )	FiLMHeaduJ  Per-timestep FiLM modulation + per-timestep 1×1 conv → 1 logit per (pixel, t).

    Input:  (B, in_dim, H, W) shared per-pixel features (from DA3 DPT aux pre-final layers)
    Output: (B, N_WINDOW + 1, H, W) where the last channel is a dummy conf (DualDPT splits
            it off). Heatmaps live in channels [:N_WINDOW].
        in_dimn_windowc                    sx   t    || _|| _tt||| _tt	||| _
tj|ddd| _tj| jj tjj| jjdd d S )N   )kernel_size{Gz?std)super__init__r   r   nn	ParametertorchonesscalezerosshiftConv2dconvinitzeros_biasnormal_weight)selfr   r   	__class__r   r
   r   )   s   
zFiLMHead.__init__c           	      C   s   |j \}}}}|d| jd d d d d d d f  | jd d d d d d d f  }||| j |||}| |}||| j||}tj|d|||j	|j
d}tj||gddS )Nr   )devicedtype)dim)shape	unsqueezer   r    reshaper   r"   r   r   r+   r,   cat)	r(   xBCHWZx_toutZ
dummy_confr   r   r
   forward7   s   
zFiLMHead.forward)	__name__
__module____qualname____doc__N_WINDOWintr   r8   __classcell__r   r   r)   r
   r   !   s    r   c                       s6   e Zd Zeedfdedef fddZdd Z  Z	S )DA3PixelModelNr   weights_pathc                    s   t    || _|| _t|}|jj| _|jj| _~| jd u r-t	t
| jdg d| _nt	| j| _| jjjd }|d }tj|j|d dddd}tj|j tjj|jdd ||d< d S )	NZ
out_layers)      	      r   r   )r   stridepaddingr   r   )r   r   r   Z_user_feat_layersr   from_pretrainedmodelbackboneheadlistgetattrdino_feat_layersscratchZoutput_conv2_auxr   r!   in_channelsr#   r$   r%   r&   r'   )r(   r   rA   rO   fullZlast_aux_seqZold_convZnew_convr)   r   r
   r   I   s    




zDA3PixelModel.__init__c                 C   s  | d}tj rtjntj}tj|jj|d | j	|d| j
dd\}}W d   n1 s0w   Y  |jd |jd }}tj|jjdd	 | j|||d
d}W d   n1 s]w   Y  |d }	|	 dkrv|	ddd
d
f }
n|	 dkr|	ddd
f }
n
tdt|	j |d }| dkr|ddd
f d
ddd}n| dkr|d
ddd}n
tdt|j g }|D ]}t|ttfr||d
  q|| q||
|dS )z rgb: (B, 3, 504, 504) in [0, 1].r   )device_typer,   NZsaddle_balanced)Z	cam_tokenZexport_feat_layersZref_view_strategyrF   F)rS   enabledr   )Zpatch_start_idxdepthrB      zunexpected depth shape ray      zunexpected ray shape )pred_heatmap
pred_depth
dino_feats)r/   r   cudais_bf16_supportedbfloat16float16autocastr+   typerK   rO   r.   rL   r-   RuntimeErrortuplepermute
isinstancerM   append)r(   rgbr2   Zautocast_dtypeZfeatsZ_auxr5   r6   Zhead_outrV   r\   rX   r[   r]   Zlayer_featsr   r   r
   r8   j   sD   
zDA3PixelModel.forward)
r9   r:   r;   r=   DA3_WEIGHTSr>   strr   r8   r?   r   r   r)   r
   r@   H   s
    !r@   __main__r^   cpuc                 c   s    | ]
}|j r| V  qd S r   )requires_gradnumel).0pr   r   r
   	<genexpr>   s    rr   zTrainable: ,rZ   rY   r.   z  z: z: list()z    first: zpeak: g    eAz.2fz GB).r<   systypesosn
ModuleTypemmodulesexportZalign_poses_umeyamar   torch.nnr   Zdepth_anything_3.apir   rj   Z	DA3_INPUTr=   Moduler   r@   r9   r+   r^   is_availabletoevalsum
parametersZn_tprintrandri   no_gradr7   itemsr	   vhasattrre   r.   rg   rM   lenrc   max_memory_allocatedr   r   r   r
   <module>   sD    'S


*
 