o
    !i@                     @   s  d Z ddlZddlZddlZddlmZ ddlm  mZ ddl	m
Z
 ejddZdZdZdZdZd	ZG d
d dejZG dd dejZG dd dejZG dd dejZedkreej redndZededZeeZeddddeZeddddeZ e!ddgeZ"e#dedeZ$e#dedeZ%e&  eee e"e$e%dZ'W d   n1 sw   Y  e'( D ]\Z)Z*e*dure+e) de*j,  qdS dS )uf  Dual-camera PARA with DA3 pretrained backbone + DPT upsampling.

Processes both agentview and wrist camera through a shared DA3 backbone,
upsamples via pretrained DPT refinement to 64×64 with 64-dim features,
then applies per-view 1×1 conv heads for volume/rotation/gripper prediction.

At eval, picks the view with higher heatmap confidence per timestep.
    N)	load_fileDA3_WEIGHTS_PATHz+/data/cameron/da3_weights/model.safetensors       @      c                       s$   e Zd Z fddZdd Z  ZS )ResidualConvUnitc                    sD   t    tj||ddd| _tj||ddd| _tjdd| _d S )N      )paddingT)inplace)super__init__nnConv2dconv1conv2ReLUrelu)selffeatures	__class__ =/data/cameron/para_normalized_losses/libero/model_dual_da3.pyr      s   
zResidualConvUnit.__init__c                 C   s0   |  |}| |}|  |}| |}|| S N)r   r   r   )r   xoutr   r   r   forward%   s
   



zResidualConvUnit.forward__name__
__module____qualname__r   r   __classcell__r   r   r   r   r      s    r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )		RefineNetTc                    s>   t    t||d| _|rt|| _nd | _t|| _d S )Nr
   )r   r   r   r   out_convr   resConfUnit1resConfUnit2)r   r   has_rcu1r   r   r   r   .   s   
zRefineNet.__init__Nc                 C   s<   |d ur|| }| j d ur|  |}| |}| |}|S r   )r&   r'   r%   )r   r   skipr   r   r   r   7   s   



zRefineNet.forward)Tr   r   r   r   r   r   r$   -   s    	r$   c                       s2   e Zd ZdZg ddf fdd	Zdd Z  ZS )DPTFeatureExtractoru	  DPT head that produces 64-dim features at ~64×64 resolution.

    Takes multi-scale ViT features from layers [5,7,9,11] and progressively
    refines them through the pretrained DPT pipeline, stopping at the stage
    that gives us ~64×64 spatial resolution.
    0   `        r   c                    s   t    || _tj|d |dddd| _tj|d |dddd| _tj|d |dddd| _tj|d |dddd| _t	|dd| _
t	|dd| _t	|dd| _d S )	Nr   r	   r
   F)r   bias   )r(   T)r   r   r   r   r   	layer1_rn	layer2_rn	layer3_rn	layer4_rnr$   
refinenet4
refinenet3
refinenet2)r   in_channelsr   r   r   r   r   I   s   
zDPTFeatureExtractor.__init__c                 C   s   |d j d }g }t|D ]\}}||||ddddd}|| q| |d }	| |d }
| |d }| |d }| 	|}t
j||j dd ddd	}| ||}t
j||
j dd ddd	}| ||
}t
j|ttfddd	}|S )
u  
        Args:
            layer_outputs: list of 4 tensors, each (B, N_patches, D) from backbone layers [5,7,9,11]
            patch_h, patch_w: patch grid dimensions

        Returns:
            features: (B, 64, H_out, W_out) where H_out ≈ 64
        r   r	   r
   r1   NbilinearFsizemodealign_corners)shape	enumeratereshapepermuteappendr2   r3   r4   r5   r6   Finterpolater7   r8   	PRED_SIZE)r   layer_outputspatch_hpatch_wBfeatsi	layer_outfl1l2l3l4r4r4_upr3r3_upr2r   r   r   r   r   Y   s    	
zDPTFeatureExtractor.forward)r    r!   r"   __doc__r   r   r#   r   r   r   r   r*   A   s    r*   c                       sh   e Zd ZdZdeedf fdd	Z fddZdd	 Zd
d Z	dddZ
dddZ		dddZ  ZS )DualDA3PredictorzQDual-camera PARA: shared DA3 backbone + DPT features + per-view prediction heads.  Fc                    s  t    |_|_|_t_d_td t	j
jdddd_d_g d_tt}d	  fd
d| D }jj|dd\}}	tdt| dt| d |rjj D ]}
d|
_q[j  td g d}tfdd|D _tjD ]"\}}d| }| d|v r|| d |j_|| d |j_qtjd _d|v r|d jj_|d jj_td t |dd_!d}i }| D ]\}}|"|r|t|d  }|||< qшj!j|dd}tdt|t|j#  d d}t$t	%|d  _&d}d!D ]5}t'| d"t(||t) d# t'| d$t(||| d# t'| d%t(||d& t* d# qtd'| d(t) d)| d(| d*| d+t* d, td-| d.|  t+d/d0  D }t+d1d0  D }td2|d3d4|d3d5 d S )6Ndual_da3z)Loading DA3 backbone (DINOv2 ViT-S/14)...zfacebookresearch/dinov2Zdinov2_vits14F)Z
pretrainedr/   )      	      zmodel.backbone.pretrained.c                    s,   i | ]\}}|  r|t d  |qS r   )
startswithlen).0kv)prefixr   r   
<dictcomp>   s   , z-DualDA3Predictor.__init__.<locals>.<dictcomp>)strictu   ✓ Loaded DA3 backbone (z keys, z	 missing)u   ✓ Frozen DA3 backboner+   c                    s    g | ]}t  jd  |dqS )r1   r
   )r   r   	embed_dim)rc   ch)r   r   r   
<listcomp>   s    z-DualDA3Predictor.__init__.<locals>.<listcomp>zmodel.head.projects.z.weightz.biasr1   zmodel.head.norm.weightzmodel.head.norm.biasu!   ✓ Loaded DPT projection weightsr   )r9   r   zmodel.head.scratch.u#   ✓ Loaded DPT refinement weights (z	 matched)g{Gz?)agentwrist_volume_headr
   _gripper_head_rotation_headr	   u   ✓ Per-view heads: volume(   ×z), gripper(z), rotation(u   ×3×)u   ✓ Feature dim: z, pred_size: c                 s   s    | ]}|  V  qd S r   )numelrc   pr   r   r   	<genexpr>   s    z,DualDA3Predictor.__init__.<locals>.<genexpr>c                 s   s    | ]
}|j r| V  qd S r   )requires_gradrs   rt   r   r   r   rv      s    u   ✓ DualDA3: ,z / z trainable params),r   r   target_size	pred_sizen_window
PATCH_SIZE
patch_size
model_typeprinttorchhubloadbackboneri   
out_layersr   r   itemsload_state_dictrb   
parametersrw   evalr   
ModuleListprojectsrA   weightdatar0   	LayerNorm	head_normr*   dptra   unexpected_keys	Parameterrandnstart_keypoint_embeddingsetattrr   N_HEIGHT_BINS
N_ROT_BINSsum)r   ry   rz   r{   freeze_backbonekwargssdZbackbone_sdmissing
unexpectedparamr9   rM   projpkZ
dpt_prefixZdpt_sdrd   re   new_keyloadedDZN_GRIPviewn_totaln_trainabler   )rf   r   r   r      sr   





  (.zDualDA3Predictor.__init__c                    s   t  | | j|| _| S r   )r   tor   )r   devicer   r   r   r      s   zDualDA3Predictor.toc                 C   s   |j d }| jj|| jdd}tt|d d j d  }}g }|D ]$\}}|dd|j d d}	t	j
||	gdd}
| |
}
||
 q#|||fS )a@  Extract multi-scale features from DA3 backbone.

        Returns:
            layer_outputs: list of 4 tensors, each (B, N_patches, D*2) for layers [5,7,9,11]
                           D*2 because cat_token=True (patch token concatenated with CLS)
            patch_h, patch_w: spatial dimensions of patch grid
        r   T)nZreturn_class_tokenr
   r:   )dim)r@   r   Zget_intermediate_layersr   intmathsqrt	unsqueezeexpandr   catr   rD   )r   r   rK   rL   rI   rJ   rH   patch_tokens	cls_tokenZcls_expandedcombinedr   r   r   _extract_features   s   
 

z"DualDA3Predictor._extract_featuresc           	      C   s\   |d j d }g }t|D ]\}}||||ddddd}| j| |}|| q|S )uE   Project multi-scale features through DPT projects → DPT refinement.r   r:   r	   r
   r1   )r@   rA   rB   rC   r   rD   )	r   rH   rI   rJ   rK   	projectedrM   rN   rO   r   r   r   _project_features   s   z"DualDA3Predictor._project_featuresNc                 C   sL  |j d }| j}| j }}t| | d}t| | d}	t| | d}
||||t||}|dur|d  d|d }|d  d|d }tj	||j
d	|d||}tj	||j
d	d|||}|	|||d
||}|||dd||f }|
|||dt||}|||dddd||f }nd }}|||fS )a  Apply per-view prediction heads.

        Args:
            feats: (B, 64, pred_size, pred_size)
            view_name: 'agent' or 'wrist'
            query_pixels: (B, N_WINDOW, 2) for indexing gripper/rotation

        Returns:
            volume_logits: (B, N_WINDOW, N_HEIGHT_BINS, pred_size, pred_size)
            gripper_logits: (B, N_WINDOW, 2) or None
            rotation_logits: (B, N_WINDOW, 3, N_ROT_BINS) or None
        r   rn   ro   rp   N).r   r
   ).r
   r   r1   r	   )r@   r{   rz   getattrr   r   longclampr   aranger   r   r   )r   rL   	view_namequery_pixelsrK   NHWZvol_headZ	grip_headZrot_headvolpxpy	batch_idxtime_idxgrip_mapgripper_logitsrot_maprotation_logitsr   r   r   _get_view_predictions   s$   

  
z&DualDA3Predictor._get_view_predictionsrl   c                 C   s   |  |||\}}}||fS )z6For eval: predict gripper/rotation at specific pixels.)r   )r   rL   r   r   _griprotr   r   r   predict_at_pixels%  s   z"DualDA3Predictor.predict_at_pixelsc           -      C   s  |j d }i }| |\}}	}
| ||	|
}| j|d }| j|d }| j|d }| j|d }| j|}t	j
||j dd ddd}| j||}t	j
||j dd ddd}| j||}t	j
|| j| jfddd}|dur| dkr|d|d	}|dddf | j | j  d| jd }|dddf | j | j  d| jd }tj||jd
}||dd||f  | jd7  < | |d|\}}}||d< ||d< ||d< ||d< |durs| |\}}}| |||}| j|d } | j|d }!| j|d }"| j|d }#| j|#}$t	j
|$|"j dd ddd}%| j|%|"}&t	j
|&|!j dd ddd}'| j|'|!}(t	j
|(| j| jfddd})| |)d|\}*}+},|*|d< |+|d< |,|d< |)|d< |S )a]  
        Args:
            agent_img:           (B, 3, H, W) agentview image
            wrist_img:           (B, 3, H, W) wrist camera image (optional)
            start_keypoint_2d:   (B, 2) or (2,) current EEF pixel on agentview
            agent_query_pixels:  (B, N_WINDOW, 2) GT pixels on agentview (pred_size space)
            wrist_query_pixels:  (B, N_WINDOW, 2) GT pixels on wrist (pred_size space)

        Returns dict with:
            agent_volume, agent_gripper, agent_rotation, agent_feats
            wrist_volume, wrist_gripper, wrist_rotation, wrist_feats (if wrist_img given)
        r   r
   r1   r	   Nr;   Fr<   r:   r   rl   agent_volumeagent_gripperagent_rotationagent_featsrm   wrist_volumewrist_gripperwrist_rotationwrist_feats)r@   r   r   r   r2   r3   r4   r5   r6   rE   rF   r7   r8   rz   r   r   r   ry   r   r   r   r   r   r   r   )-r   	agent_img	wrist_imgstart_keypoint_2dagent_query_pixelswrist_query_pixelsrK   resultZagent_layersphpwZagent_projectedrP   rQ   rR   rS   rT   rU   rV   rW   rX   r   Zskp_xZskp_ybiavagarZwrist_layersZwphZwpwZwrist_projectedZwl1Zwl2Zwl3Zwl4Zwr4Zwr4_upZwr3Zwr3_upZwr2r   ZwvZwgwrr   r   r   r   *  sZ   
..$
zDualDA3Predictor.forwardr   )rl   )NNNN)r    r!   r"   rY   rG   N_WINDOWr   r   r   r   r   r   r   r#   r   r   r   r   rZ      s    P

'rZ   __main__cudacpur[   )ry   r{   r1   r	   g      l@)r   r   r   z: )-rY   osr   r   torch.nnr   torch.nn.functional
functionalrE   Zsafetensors.torchr   environgetr   r   r   r   rG   r|   Moduler   r$   r*   rZ   r    r   r   is_availablemodelr   r   r   r   tensorkpzerosaqwqno_gradr   r   rd   re   r   r@   r   r   r   r   <module>   sL    	A p

