o
    i4                     @   s   d dl T d dlmZ d dlmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlm
  mZ d dlZd dlZd dlZd dlZd dlZd dlmZ ddlmZmZmZ d	d
lmZmZmZ d	dlmZmZm Z  G dd de
j!Z"dS )    )*)Number)partial)PathN)hf_hub_download   )normalized_view_plane_uvrecover_focal_shiftangle_diff_vec3   )wrap_dinov2_attention_with_sdpa'wrap_module_with_gradient_checkpointing)unwrap_module_with_gradient_checkpointing)DINOv2EncoderMLP	ConvStackc                       s  e Zd ZU eed< eed< eed< eed< eed< eed< dddddd	d
gfdee	e
f dee	e
f dee	e
f dee	e
f dee	e
f dee	e
f ded dee f fddZedejfddZedejfddZedefddZejdefddZed4dee	eee f deee	e
f  dd fddZdd Zd d! Zd"d# Zd$ejdejfd%d&Z d'ejd(eeej!f dee	ejf fd)d*Z"e# 		+	,	,		,d5d'ejd(ed-ed.ed/ed0eee$ejf  d1edee	ejf fd2d3Z%  Z&S )6	MoGeModelencoderneckpoints_head	mask_head
scale_headonnx_compatible_modeNlineari  i  normal_headremap_output)r   sinhexpsinh_expnum_tokens_rangec	           
         s   t t|   |	rtd|	  || _|| _tdi || _t	di || _
|d ur3t	di || _|d ur?t	di || _|d urKt	di || _|d urYtdi || _d S d S )Nz8The following deprecated/invalid arguments are ignored:  )superr   __init__warningswarnr   r   r   r   r   r   r   r   r   r   r   )
selfr   r   r   r   r   r   r   r   Zdeprecated_kwargs	__class__r    (/data/cameron/moge_repo/moge/model/v2.pyr"      s    zMoGeModel.__init__returnc                 C      t |  jS N)next
parametersdevicer%   r    r    r(   r.   ;      zMoGeModel.devicec                 C   r*   r+   )r,   r-   dtyper/   r    r    r(   r1   ?   r0   zMoGeModel.dtypec                 C   s   t | ddS )N_onnx_compatible_modeF)getattrr/   r    r    r(   r   C   s   zMoGeModel.onnx_compatible_modevaluec                 C   s   || _ || j_d S r+   )r2   r   r   )r%   r4   r    r    r(   r   G   s   pretrained_model_name_or_pathmodel_kwargsc                 K   sv   t | r	|}ntd|ddd|}tj|ddd}|d }|dur)|| | di |}|j|d d	d
 |S )a   
        Load a model from a checkpoint file.

        ### Parameters:
        - `pretrained_model_name_or_path`: path to the checkpoint file or repo id.
        - `compiled`
        - `model_kwargs`: additional keyword arguments to override the parameters in the checkpoint.
        - `hf_kwargs`: additional keyword arguments to pass to the `hf_hub_download` function. Ignored if `pretrained_model_name_or_path` is a local path.

        ### Returns:
        - A new instance of `MoGe` with the parameters loaded from the checkpoint.
        modelzmodel.pt)Zrepo_idZ	repo_typefilenameZcpuT)Zmap_locationZweights_onlymodel_configNF)strictr    )r   existsr   torchloadupdateZload_state_dict)clsr5   r6   Z	hf_kwargsZcheckpoint_pathZ
checkpointr9   r7   r    r    r(   from_pretrainedL   s    
zMoGeModel.from_pretrainedc                 C      | j   d S r+   )r   init_weightsr/   r    r    r(   rB   m      zMoGeModel.init_weightsc                 C   s:   | j   | j  dD ]}t| |rt| |  qd S )Nr   r   r   )r   enable_gradient_checkpointingr   hasattrr3   )r%   headr    r    r(   rE   p   s   


z'MoGeModel.enable_gradient_checkpointingc                 C   rA   r+   )r   enable_pytorch_native_sdpar/   r    r    r(   rH   w   rC   z$MoGeModel.enable_pytorch_native_sdpapointsc                 C   s   | j dkr	 |S | j dkrt|}|S | j dkr6|jddgdd\}}t|}tj|| |gdd}|S | j dkrW|jddgdd\}}tjt|t|gdd}|S td	| j  )
Nr   r   r   r   r   dimr   zInvalid remap output type: )r   r<   r   splitr   cat
ValueError)r%   rI   Zxyzr    r    r(   _remap_pointsz   s    






zMoGeModel._remap_pointsimage
num_tokensc                    s  |j \}}|j|j}} }|| d || d }}	t|tjr3|  |	  }}	n	t|t|	}}	j|||	dd\ }
 d d d d g t	dD ];}t
|	d|  |d|  |||d}|dddd|d	d	d	} | d u r| |< qRtj | |gdd
 |< qR   fdddD \}}}tdr|
nd }fdd|||fD \}}}|d ur|dddd}|}|d ur|dddd}tj|d	d
}|d ur|d }|d ur|d }||||d}dd | D }|S )N      ?T)Zreturn_class_token   r   )widthZheightaspect_ratior1   r.   r   r   rJ   rK   c                 3   s0    | ]}t |rt| d  ndV  qdS )rJ   N)rF   r3   ).0rG   )featuresr%   r    r(   	<genexpr>   s   . z$MoGeModel.forward.<locals>.<genexpr>rD   r   c                 3   s4    | ]}|d urt j| fddddnd V  qd S )NZbilinearF)modeZalign_cornersZ	antialias)FZinterpolate)rX   v)img_himg_wr    r(   rZ      s   2    rI   normalmaskmetric_scalec                 S      i | ]\}}|d ur||qS r+   r    rX   kr]   r    r    r(   
<dictcomp>       z%MoGeModel.forward.<locals>.<dictcomp>)shaper.   r1   
isinstancer<   TensorroundZlongr   ranger   Zpermute	unsqueezeexpandconcatr   rF   r   rQ   r\   	normalizesqueezeZsigmoidr   items)r%   rR   rS   Z
batch_size_r.   r1   rW   Zbase_hZbase_wZ	cls_tokenlevelZuvrI   rb   rc   rd   return_dictr    )rY   r^   r_   r%   r(   forward   sH   " 

 
zMoGeModel.forward	   Tresolution_levelforce_projection
apply_maskfov_xuse_fp16c                    s  |  dkrd}|d}nd}|j| j| jd}|jdd \}	}
|	|
 }|
|	 }|du r?| j\}}t||d ||   }tj	| jj
tj|oM| jtjkd	 | j||d
 W d   n1 sbw   Y   fdddD \}}}}tdd |||||g\}}}}}tj	| jj
tjd |dur|dk}nd}|dur3|du rt||\}}n4|d|d  d  tttj||j|jdd  }|jdkr|d |jd }t|||d\}}|d d|d  d  | |d d|d  d  }}tj||tjd|j|jdtjd|j|jd}|d  |d 7  < |dur,||d dkM }|d  }nd\}}|rG|durGtjj||d}|durm|dur]||dddddf 9 }|durm||ddddf 9 }|r|dur|durt|d |tjnd}|durt||tjnd}|durt|d |t|nd}W d   n	1 sw   Y  |||||d}dd | D }|rdd | D }|S )a  
        User-friendly inference function

        ### Parameters
        - `image`: input image tensor of shape (B, 3, H, W) or (3, H, W)
        - `num_tokens`: the number of base ViT tokens to use for inference, `'least'` or `'most'` or an integer. Suggested range: 1200 ~ 2500. 
            More tokens will result in significantly higher accuracy and finer details, but slower inference time. Default: `'most'`. 
        - `force_projection`: if True, the output point map will be computed using the actual depth map. Default: True
        - `apply_mask`: if True, the output point map will be masked using the predicted mask. Default: True
        - `fov_x`: the horizontal camera FoV in degrees. If None, it will be inferred from the predicted point map. Default: None
        - `use_fp16`: if True, use mixed precision to speed up inference. Default: True
            
        ### Returns

        A dictionary containing the following keys:
        - `points`: output tensor of shape (B, H, W, 3) or (H, W, 3).
        - `depth`: tensor of shape (B, H, W) or (H, W) containing the depth map.
        - `intrinsics`: tensor of shape (B, 3, 3) or (3, 3) containing the camera intrinsics.
        r`   Tr   F)r1   r.   Nry   )device_typer1   enabled)rS   c                 3   s    | ]	}  |d V  qd S r+   )get)rX   rg   outputr    r(   rZ      s    z"MoGeModel.infer.<locals>.<genexpr>ra   c                 S   s   t | tjr
|  S | S r+   )rk   r<   rl   float)xr    r    r(   <lambda>   s    z!MoGeModel.infer.<locals>.<lambda>)r   r1   rT   r   r   )r.   r1   )focal).r   ).NN)NN)
intrinsics).N)rI   r   depthrc   rb   c                 S   re   r+   r    rf   r    r    r(   rh   *  ri   z#MoGeModel.infer.<locals>.<dictcomp>c                 S   s   i | ]
\}}|| d qS )r   )rs   rf   r    r    r(   rh   -  s    )rL   ro   tor1   r.   rj   r   intr<   ZautocasttypeZfloat16rx   mapZfloat32r	   ZtanZdeg2radZ	as_tensorndimrp   utils3dZptZintrinsics_from_focal_centerZtensorZcloneZdepth_map_to_point_mapwhereinfZ
zeros_likert   )r%   rR   rS   rz   r{   r|   r}   r~   Zomit_batch_dimZoriginal_heightZoriginal_widthZarearW   Z
min_tokensZ
max_tokensrI   rb   rc   rd   Zmask_binaryr   shiftru   ZfxZfyr   r   rw   r    r   r(   infer   sr   
$"

8
62



"&-zMoGeModel.inferr+   )Nry   TTNT)'__name__
__module____qualname__r   __annotations__r   r   boolDictstrAnyLiteralListr   r"   propertyr<   r.   r1   r   setterclassmethodUnionr   IObytesOptionalr@   rB   rE   rH   rl   rQ   Z
LongTensorrx   Zinference_moder   r   __classcell__r    r    r&   r(   r      s   
 





4 ,8	r   )#typingZnumbersr   	functoolsr   pathlibr   r#   r<   Ztorch.nnZnnZtorch.nn.functionalZ
functionalr\   Ztorch.utilsZtorch.utils.checkpointZ	torch.ampZtorch.versionr   Zhuggingface_hubr   Zutils.geometry_torchr   r	   r
   Zutilsr   r   r   modulesr   r   r   ZModuler   r    r    r    r(   <module>   s$    