o
    Ii                     @   sv   d Z ddlZddlZddlmZ ddlm  mZ dZdZ	dZ
dZdZdZejddZd	d
 ZG dd dejZdS )uw  MoGe baseline — MoGe v2 pretrained DINOv2 backbone + PARA heads.

Uses the DINOv2 ViT-S/14 backbone from MoGe v2, pretrained for monocular
geometry estimation. The hypothesis: geometry-pretrained features improve
3D EEF prediction.

Architecture: same as PARA (heatmap volume + gripper/rotation MLPs) but with
MoGe's geometry-pretrained backbone instead of vanilla DINOv3.
    N       @      MOGE_WEIGHTS_PATHz#/data/cameron/moge_weights/model.ptc           
      C   s   t jjdddd}t j| dd}d|v r|d }d}i }| D ]\}}||r5|t|d	 }|||< q |j|dd
\}}	|rRtdt| d|d	d  d |	retdt|	 d|	d	d  d tdt| d |S )zMLoad DINOv2 ViT-S/14 backbone and initialize with MoGe v2 pretrained weights.zfacebookresearch/dinov2Zdinov2_vits14F)Z
pretrainedcpu)map_locationmodelzencoder.backbone.N)strictz  MoGe backbone missing keys (z):    z...z!  MoGe backbone unexpected keys (u%   ✓ Loaded MoGe pretrained backbone (z keys))torchhubloaditems
startswithlenload_state_dictprint)
Zweights_pathbackbonesdprefixZbackbone_sdkvnew_keymissing
unexpected r   '/data/cameron/para/libero/model_moge.py_load_dinov2_vits14_from_moge   s$   
""r   c                       sX   e Zd ZdZdeedf fdd	Z fddZdd	 Zd
d Z	dd Z
dddZ  ZS )MoGePredictorzDMoGe-pretrained DINOv2 ViT-S/14 backbone + PARA-style heatmap heads.i  Fc                    s  t    || _|| _|| _t| _d| _td t	t
| _|r4| j D ]}d|_q$| j  td ntd d| _| j}tt|d | _td| d	 ttj||d
ddt tj||d
ddt tj||d
ddt | _td|  tj||t dd| _td| dt d| d| d		 tt|t||t t|d| _tt|t||t t|d
t | _td| dt  d	 td| dt d	 d S )Nmogez*Loading MoGe pretrained DINOv2 backbone...Fu   ✓ Frozen MoGe backboneu   ✓ MoGe backbone is trainablei  g{Gz?u,   ✓ Learnable start keypoint embedding (dim=)      )kernel_sizepaddingu1   ✓ Feature convs: 3× Conv2d(3×3) at pred_size=)r$   u   ✓ Volume   head → (B, z, u   ✓ Gripper  MLP  → (B, u   ✓ Rotation MLP  → (B, z, 3, )!super__init__target_size	pred_sizen_window
PATCH_SIZE
patch_size
model_typer   r   r   r   
parametersrequires_gradeval	embed_dimnn	Parameterr   randnstart_keypoint_embedding
SequentialConv2dGELUfeature_convsN_HEIGHT_BINSvolume_head	LayerNormLineargripper_mlp
N_ROT_BINSrotation_mlpN_GRIPPER_BINS)selfr(   r)   r*   freeze_backbonekwargsparamD	__class__r   r   r'   5   sD   



""&zMoGePredictor.__init__c                    s   t  | | j|| _| S N)r&   tor   )rB   devicerG   r   r   rJ   a   s   zMoGePredictor.toc                 C   sd   |j d }| jj|dgd}|d }t|j d d  }}||||| j}|dddd }|S )z4Extract patch features using MoGe's DINOv2 backbone.r      )nr#   g      ?r"      )shaper   Zget_intermediate_layersintreshaper1   permute
contiguous)rB   xBfeatspatch_tokensH_pW_ppatch_featuresr   r   r   _extract_featuresf   s   
zMoGePredictor._extract_featuresc                 C   s|   |j \}}}}|j d }|d  d|d }|d  d|d }	tj||jd|d||}
||
d d |	|f S )Nr#   ).r   r   ).r#   rK   )rO   longclampr   arangerK   viewexpand)rB   rV   query_pixelsrU   rF   HWNpxpy	batch_idxr   r   r   _index_featuresp   s   
 zMoGePredictor._index_featuresc           	      C   sd   |j d d \}}| | |}||| | j}| |||}| |||dt}||fS )NrN   r"   )rO   ri   detachrQ   r1   r>   r@   r?   )	rB   rV   rb   rU   re   indexedflatgripperrotationr   r   r   predict_at_pixelsx   s   zMoGePredictor.predict_at_pixelsNc                 C   s<  |j d }| |}|j \}}}}	| dkr |d|d}|d d df |	 | j  d|	d }
|d d df | | j  d|d }tj	||j
d}||d d ||
f  | jd7  < tj|| j| jfddd}| |}| |}||| jt| j| j}|d ur| ||\}}nd  }}||||fS )Nr   r#   r\   bilinearF)sizemodealign_corners)rO   r[   dim	unsqueezera   r(   r]   r^   r   r_   rK   r5   Finterpolater)   r9   r;   r`   r*   r:   ro   )rB   rT   start_keypoint_2drb   rU   rZ   _rF   rX   rY   start_patch_xstart_patch_ybatch_indicesrV   volvolume_logitsgripper_logitsrotation_logitsr   r   r   forward   s"   

**$

zMoGePredictor.forwardrI   )__name__
__module____qualname____doc__	PRED_SIZEN_WINDOWr'   rJ   r[   ri   ro   r   __classcell__r   r   rG   r   r   2   s    ,
r   )r   osr   torch.nnr2   torch.nn.functional
functionalrw   r   r:   rA   r?   r   r+   environgetr   r   Moduler   r   r   r   r   <module>   s    
