o
    iBF                     @  s   d Z ddlmZ ddlZddlmZmZ ddlZddl	Z	ddl
mZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" de	j#j$_%dZ&dZ'G dd dej(eZ)dS )z
Depth Anything 3 API module.

This module provides the main API for Depth Anything 3, including model loading,
inference, and export capabilities. It supports both single and nested model architectures.
    )annotationsN)OptionalSequence)PyTorchModelHubMixin)Image)create_objectload_config)MODEL_REGISTRY)
Prediction)export)affine_inverse)InputProcessor)OutputProcessor)logger)align_poses_umeyamaFzmodel.safetensorszconfig.jsonc                      s   e Zd ZU dZdZded< dVdW fdd	Ze 				
	
	dXdYddZ	dddd
d
ddddddddddd dd!i fdZd7d8Z
				d[d\d:d;Zd]d=d>Zd^d@dAZ		Bd_d`dEdFZ		
	
	dadbdIdJZdcdLdMZdddNdOZdedQdRZdfdTdUZ  ZS )gDepthAnything3a\  
    Depth Anything 3 main API class.

    This class provides a high-level interface for depth estimation using Depth Anything 3.
    It supports both single and nested model architectures with metric scaling capabilities.

    Features:
    - Hugging Face Hub integration via PyTorchModelHubMixin
    - Support for multiple model presets (vitb, vitg, nested variants)
    - Automatic mixed precision inference
    - Export capabilities for various formats (GLB, PLY, NPZ, etc.)
    - Camera pose estimation and metric depth scaling

    Usage:
        # Load from Hugging Face Hub
        model = DepthAnything3.from_pretrained("huggingface/model-name")

        # Or create with specific preset
        model = DepthAnything3(preset="vitg")

        # Run inference
        prediction = model.inference(images, export_dir="output", export_format="glb")
    N
str | None_commit_hash	da3-large
model_namestrc                   sP   t    || _tt| j | _t| j| _| j  t	 | _
t | _d| _dS )a1  
        Initialize DepthAnything3 with specified preset.

        Args:
        model_name: The name of the model preset to use.
                    Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
        **kwargs: Additional keyword arguments (currently unused).
        N)super__init__r   r   r	   Zconfigr   modelevalr   input_processorr   output_processordevice)selfr   kwargs	__class__ 2/data/cameron/da3_repo/src/depth_anything_3/api.pyr   K   s   
	

zDepthAnything3.__init__Fsaddle_balancedimagetorch.Tensor
extrinsicstorch.Tensor | None
intrinsicsexport_feat_layerslist[int] | Noneinfer_gsbooluse_ray_poseref_view_strategyreturndict[str, torch.Tensor]c           	      C  s   t j rt jnt j}t  4 t j|jj|d | 	|||||||W  d   W  d   S 1 s6w   Y  W d   dS 1 sFw   Y  dS )a  
        Forward pass through the model.

        Args:
            image: Input batch with shape ``(B, N, 3, H, W)`` on the model device.
            extrinsics: Optional camera extrinsics with shape ``(B, N, 4, 4)``.
            intrinsics: Optional camera intrinsics with shape ``(B, N, 3, 3)``.
            export_feat_layers: Layer indices to return intermediate features for.
            infer_gs: Enable Gaussian Splatting branch.
            use_ray_pose: Use ray-based pose estimation instead of camera decoder.
            ref_view_strategy: Strategy for selecting reference view from multiple views.

        Returns:
            Dictionary containing model predictions
        )Zdevice_typeZdtypeN)
torchcudaZis_bf16_supportedZbfloat16Zfloat16Zno_gradZautocastr   typer   )	r   r%   r'   r)   r*   r,   r.   r/   Zautocast_dtyper"   r"   r#   forwardc   s   
"zDepthAnything3.forwardT  upper_bound_resizeZmini_npzg      D@i@B    $list[np.ndarray | Image.Image | str]np.ndarray | Nonealign_to_input_ext_scalerender_extsrender_ixts	render_hwtuple[int, int] | Noneprocess_resintprocess_res_method
export_direxport_formatSequence[int] | Noneconf_thresh_percentilefloatnum_max_pointsshow_camerasfeat_vis_fpsexport_kwargsOptional[dict]r
   c              	   C  s  d|v r
|s
J dd|v rt |d tsJ d| |||||\}}}| |||\}}}| |dur9| nd}|durDt|ng }| |||||||}| |}| 	||||}| 
||}|durd|v r|rwd|vrw| d}d|v rd|vri |d< |d ||	|
d	 d
|v rd
|vri |d
< |d
 |||d d|v rd|vri |d< |d d|i d|v rd|vri |d< |d |||d | j|||fi | |S )u  
        Run inference on input images.

        Args:
            image: List of input images (numpy arrays, PIL Images, or file paths)
            extrinsics: Camera extrinsics (N, 4, 4)
            intrinsics: Camera intrinsics (N, 3, 3)
            align_to_input_ext_scale: whether to align the input pose scale to the prediction
            infer_gs: Enable the 3D Gaussian branch (needed for `gs_ply`/`gs_video` exports)
            use_ray_pose: Use ray-based pose estimation instead of camera decoder (default: False)
            ref_view_strategy: Strategy for selecting reference view from multiple views.
                Options: "first", "middle", "saddle_balanced", "saddle_sim_range".
                Default: "saddle_balanced". For single view input (S ≤ 2), no reordering is performed.
            render_exts: Optional render extrinsics for Gaussian video export
            render_ixts: Optional render intrinsics for Gaussian video export
            render_hw: Optional render resolution for Gaussian video export
            process_res: Processing resolution
            process_res_method: Resize method for processing
            export_dir: Directory to export results
            export_format: Export format (mini_npz, npz, glb, ply, gs, gs_video)
            export_feat_layers: Layer indices to export intermediate features from
            conf_thresh_percentile: [GLB] Lower percentile for adaptive confidence threshold (default: 40.0) # noqa: E501
            num_max_points: [GLB] Maximum number of points in the point cloud (default: 1,000,000)
            show_cameras: [GLB] Show camera wireframes in the exported scene (default: True)
            feat_vis_fps: [FEAT_VIS] Frame rate for output video (default: 15)
            export_kwargs: additional arguments to export functions.

        Returns:
            Prediction object containing depth maps and camera parameters
        Zgsz6must set `infer_gs=True` to perform gs-related export.Zcolmapr   z.`image` must be image paths for COLMAP export.NZgs_videoz	-gs_video)r'   r)   Zout_image_hwZglb)rF   rH   rI   Zfeat_visZfps)Zimage_pathsrF   rB   )
isinstancer   _preprocess_inputs_prepare_model_inputs_normalize_extrinsicsZclonelist_run_model_forward_convert_to_prediction%_align_to_input_extrinsics_intrinsics_add_processed_imagesupdate_export_results)r   r%   r'   r)   r;   r,   r.   r/   r<   r=   r>   r@   rB   rC   rD   r*   rF   rH   rI   rJ   rK   imgs_cpuimgsex_tin_t	ex_t_norm
raw_output
predictionr"   r"   r#   	inference   st   8



zDepthAnything3.inference=tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]c           	      C  sh   t   }| ||dur| nd|dur| nd||\}}}t   }td|| d|j |||fS )z.Preprocess input images using input processor.NzProcessed Images Done takingzseconds. Shape: )timer   copyr   infoshape)	r   r%   r'   r)   r@   rB   
start_timerX   end_timer"   r"   r#   rN     s    	

z!DepthAnything3._preprocess_inputsrX   c                 C  sl   |   }|j|ddd  }|dur|j|ddd  nd}|dur/|j|ddd  nd}|||fS )z Prepare tensors for model input.T)Znon_blockingN)_get_model_devicetorG   )r   rX   r'   r)   r   rY   rZ   r[   r"   r"   r#   rO   -  s   
z$DepthAnything3._prepare_model_inputsrZ   c                 C  s   |du rdS t |ddddf }|| }t |}|ddddf }|jdd}t|}tj|dd}|ddddf | |ddddf< |S )	zNormalize extrinsicsN   .   )Zdimg?)min)r   Znormr2   ZmedianZclamp)r   rZ   Z	transformr\   Zc2wsZtranslationsZdistsZmedian_distr"   r"   r#   rP   G  s   
$z$DepthAnything3._normalize_extrinsics
   r^   ransac_view_threshc           	      C  sz   |du r|S |  |_t|j|  t||kddd\}}}}|r8|dddddf   |_| j|  _|S ||_|S )z#Align depth map to input extrinsicsNT*   )ZransacZreturn_alignedZrandom_state.rj   )numpyr)   r   r'   lenZdepth)	r   r'   r)   r^   r;   rn   _ZscaleZaligned_extrinsicsr"   r"   r#   rT   U  s    	

z4DepthAnything3._align_to_input_extrinsics_intrinsicsrY   r[   c              	   C  s   |j }|jdk}	|	rtj| t }
|durt|nd}| |||||||}|	r1tj| t }t	d||
  d |S )zRun model forward pass.r3   NzModel Forward Pass Done. Time:  seconds)
r   r4   r2   r3   Zsynchronizera   rQ   r5   r   rc   )r   rY   rZ   r[   r*   r,   r.   r/   r   Z	need_syncre   Zfeat_layersoutputrf   r"   r"   r#   rR   o  s   
z!DepthAnything3._run_model_forwardr]   c                 C  s4   t   }| |}t   }td||  d |S )z.Convert raw model output to Prediction object.z%Conversion to Prediction Done. Time: rs   )ra   r   r   rc   )r   r]   re   rt   rf   r"   r"   r#   rS     s
   
z%DepthAnything3._convert_to_predictionc                 C  sh   | dddd  }tg d}tg d}|| | }t|dd}|d tj}||_|S )z5Add processed images to prediction for visualization.r      rj   ri   )g
ףp=
?gv/?gCl?)gZd;O?gy&1?g?   )	ZpermuteZcpurp   npZarrayZclipZastypeZuint8Zprocessed_images)r   r^   rX   Zprocessed_imgsZmeanZstdr"   r"   r#   rU     s   z$DepthAnything3._add_processed_imagesNonec                 K  s>   t   }t|||fi | t   }td||  d dS )z1Export results to specified format and directory.zExport Results Done. Time: rs   N)ra   r   r   rc   )r   r^   rD   rC   r   re   rf   r"   r"   r#   rW     s   zDepthAnything3._export_resultstorch.devicec                 C  sT   | j dur| j S |  D ]
}|j | _ |j   S |  D ]
}|j | _ |j   S td)z
        Get the device where the model is located.

        Returns:
            Device where the model parameters are located

        Raises:
            ValueError: If no tensors are found in the model
        NzNo tensor found in model)r   
parametersZbuffers
ValueError)r   Zparambufferr"   r"   r#   rg     s   



z DepthAnything3._get_model_device)r   )r   r   )NNNFFr$   )r%   r&   r'   r(   r)   r(   r*   r+   r,   r-   r.   r-   r/   r   r0   r1   )*r%   r9   r'   r:   r)   r:   r;   r-   r,   r-   r.   r-   r/   r   r<   r:   r=   r:   r>   r?   r@   rA   rB   r   rC   r   rD   r   r*   rE   rF   rG   rH   rA   rI   r-   rJ   rA   rK   rL   r0   r
   )NNr6   r7   )r%   r9   r'   r:   r)   r:   r@   rA   rB   r   r0   r`   )rX   r&   r'   r(   r)   r(   r0   r`   )rZ   r(   r0   r(   )Trm   )r'   r(   r)   r(   r^   r
   r;   r-   rn   rA   r0   r
   )NFFr$   )rY   r&   rZ   r(   r[   r(   r*   rE   r,   r-   r.   r-   r/   r   r0   r1   )r]   r1   r0   r
   )r^   r
   rX   r&   r0   r
   )r^   r
   rD   r   rC   r   r0   rx   )r0   ry   )__name__
__module____qualname____doc__r   __annotations__r   r2   Zinference_moder5   r_   rN   rO   rP   rT   rR   rS   rU   rW   rg   __classcell__r"   r"   r    r#   r   0   sh   
 $ 




	r   )*r   
__future__r   ra   typingr   r   rp   rw   r2   Ztorch.nnZnnZhuggingface_hubr   ZPILr   Zdepth_anything_3.cfgr   r   Zdepth_anything_3.registryr	   Zdepth_anything_3.specsr
   Zdepth_anything_3.utils.exportr   Zdepth_anything_3.utils.geometryr   Z)depth_anything_3.utils.io.input_processorr   Z*depth_anything_3.utils.io.output_processorr   Zdepth_anything_3.utils.loggerr   Z!depth_anything_3.utils.pose_alignr   ZbackendsZcudnnZ	benchmarkZSAFETENSORS_NAMEZCONFIG_NAMEZModuler   r"   r"   r"   r#   <module>   s,   
