o
    iD                     @  s   d dl mZ d dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZmZmZ d dlmZmZmZ d d	lmZ d
d ZG dd dejZG dd dejZdS )    )annotationsN)Dict)
DictConfig	OmegaConf)create_object)pose_encoding_to_extri_intri)apply_metric_scalingcompute_alignment_maskcompute_sky_maskleast_squares_scale_scalarsample_tensor_for_quantileset_sky_regions_to_max_depth)affine_inverseas_homogeneousmap_pdf_to_opacity)get_extrinsic_from_camrayc                 C  s
   t | S )N)r   create)Zcfg_obj r   8/data/cameron/da3_repo/src/depth_anything_3/model/da3.py	_wrap_cfg$   s   
r   c                      s   e Zd ZdZdZd/ fdd	Zddg dddfd0ddZd1ddZd2ddZd3d$d%Z	d4d&d'Z
		d5d6d)d*Zd7d-d.Z  ZS )8DepthAnything3Neta  
    Depth Anything 3 network for depth estimation and camera pose estimation.

    This network consists of:
    - Backbone: DinoV2 feature extractor
    - Head: DPT or DualDPT for depth prediction
    - Optional camera decoders for pose estimation
    - Optional GSDPT for 3DGS prediction

    Args:
        preset: Configuration preset containing network dimensions and settings

    Returns:
        Dictionary containing:
        - depth: Predicted depth map (B, H, W)
        - depth_conf: Depth confidence map (B, H, W)
        - extrinsics: Camera extrinsics (B, N, 4, 4)
        - intrinsics: Camera intrinsics (B, N, 3, 3)
        - gaussians: 3D Gaussian Splats (world space), type: model.gs_adapter.Gaussians
        - aux: Auxiliary features for specified layers
       Nc                   sP  t    t|tjr|ntt|| _t|tjr|ntt|| _d\| _	| _
|durKt|tjr5|ntt|| _	t|tjrD|ntt|| _
d\| _| _|dur|durt|tjra|ntt|| _| jjd }t|tjr|j|ksJ d| d|j || _dS |d |ksJ d| d|d  tt|| _dS dS dS )zY
        Initialize DepthAnything3Net with given yaml-initialized configuration.
        NNN   zgs_head.out_dim should be z, got Z
output_dimz!gs_head output_dim should set to )super__init__
isinstancennModuler   r   backboneheadcam_deccam_enc
gs_adaptergs_headZd_inZout_dim)selfZnetr    r!   r"   r$   r#   Z
gs_out_dim	__class__r   r   r   B   s4   



zDepthAnything3Net.__init__Fsaddle_balancedxtorch.Tensor
extrinsicstorch.Tensor | None
intrinsicsexport_feat_layerslist[int] | Noneinfer_gsbooluse_ray_poseref_view_strategystrreturnDict[str, torch.Tensor]c              
   C  s"  |dur*t j|jjdd | |||jdd }W d   n1 s$w   Y  nd}| j||||d\}	}
|jd |jd }}t j|jjdd. | |	||}|r]| |||}n| 	|	|||}|rr| 
|	||||||}W d   n1 s|w   Y  | |}| |
||||_|S )a1  
        Forward pass through the network.

        Args:
            x: Input images (B, N, 3, H, W)
            extrinsics: Camera extrinsics (B, N, 4, 4) 
            intrinsics: Camera intrinsics (B, N, 3, 3) 
            feat_layers: List of layer indices to extract features from
            infer_gs: Enable Gaussian Splatting branch
            use_ray_pose: Use ray-based pose estimation
            ref_view_strategy: Strategy for selecting reference view

        Returns:
            Dictionary containing predictions and auxiliary features
        NF)Zdevice_typeenabled)	cam_tokenr.   r3   )torchZautocastdevicetyper"   shaper   _process_depth_head_process_ray_pose_estimation_process_camera_estimation_process_gs_head_process_mono_sky_estimation_extract_auxiliary_featuresZaux)r%   r)   r+   r-   r.   r0   r2   r3   r9   featsZ	aux_featsHWoutputr   r   r   forwardd   s,   

	zDepthAnything3Net.forwardrH   c                 C  s   d|vr|S t |jdd}| dkr|S |  dkr|S |j| }| dkr:tjd| d|jd}|| }n|}t|d	}t	|jd
||d\|_}|S )zProcess mono sky estimation.sky333333?Z	threshold
   順 r   rN   r<   Gz?NZ	max_depth)
r
   rJ   sumdepthnumelr;   randintr<   quantiler   )r%   rH   non_sky_masknon_sky_depthidxsampled_depthnon_sky_max_r   r   r   rC      s"   


z.DepthAnything3Net._process_mono_sky_estimationheightintwidthc                 C  st  d|v rd|v rt |j|j|jjd |jjd \}}}t|}|ddddddddf }tddd |jd |jd	 d	d	 	|j
}|dddddf d
 | |ddddddf< |ddddd	f d
 | |ddddd	d	f< |dddddf | d |dddddd
f< |ddddd	f | d |ddddd	d
f< |`|`||_||_|S )z=Process ray pose estimation if ray pose decoder is available.rayray_confr8   N   r   r   r            ?)r   ra   rb   r>   r   r;   Zeyerepeatclonetor<   r+   r-   )r%   rH   r^   r`   Zpred_extrinsicZpred_focal_lengthsZpred_principal_pointsZpred_intrinsicr   r   r   r@      s&   


 42222z.DepthAnything3Net._process_ray_pose_estimationrE   list[torch.Tensor]rF   rG   c                 C  s   | j |||ddS )z3Process features through the depth prediction head.r   )patch_start_idx)r    )r%   rE   rF   rG   r   r   r   r?      s   z%DepthAnything3Net._process_depth_headc                 C  sZ   | j dur+|  |d d }d|v r|`d|v r|`t|||f\}}t||_||_|S )z>Process camera pose estimation if camera decoder is available.Nr:   r   ra   rb   )r!   ra   rb   r   r   r+   r-   )r%   rE   rF   rG   rH   Zpose_encZc2wZixtr   r   r   rA      s   

z,DepthAnything3Net._process_camera_estimation	in_imagesc              	   C  s   | j du s
| jdu r|S |dddusJ d|dd}|dd}	|dur,|	dus0J d|}
t|}|
dur>t|
}
| j |||d|d}|j}|j}| j||	|jt||||f|
d	}||_|S )
z=Process 3DGS parameters estimation if 3DGS head is available.NrT   z&must provide MV depth for the GS head.r+   r-   z5must process camera info first if GT is not availabler   )rE   rF   rG   rk   Zimages)r+   r-   ZdepthsZ	opacitiesraw_gaussiansZimage_shapeZgt_extrinsics)	r$   r#   getr   Zraw_gsZraw_gs_confrT   r   Z	gaussians)r%   rE   rF   rG   rH   rl   r+   r-   Zctx_extrZctx_intrZgt_extrZgs_outsrm   Z	densitiesZgs_worldr   r   r   rB      s@   	z"DepthAnything3Net._process_gs_headfeat_layers	list[int]c           	   	   C  sr   t  }t|t|ksJ t||D ]$\}}||jd |jd || j || j |jd g}||d| < q|S )z1Extract auxiliary features from specified layers.r   r   r:   Zfeat_layer_)r   lenzipZreshaper>   
PATCH_SIZE)	r%   rE   ro   rF   rG   Zaux_featuresZfeatZ
feat_layerZfeat_reshapedr   r   r   rD     s   	z-DepthAnything3Net._extract_auxiliary_features)NNNNr)   r*   r+   r,   r-   r,   r.   r/   r0   r1   r2   r1   r3   r4   r5   r6   )rH   r6   r5   r6   )rH   r6   r^   r_   r`   r_   r5   r6   )rE   rj   rF   r_   rG   r_   r5   r6   )
rE   rj   rF   r_   rG   r_   rH   r6   r5   r6   r   )rE   rj   rF   r_   rG   r_   rH   r6   rl   r*   r+   r,   r-   r,   r5   r6   )
rE   rj   ro   rp   rF   r_   rG   r_   r5   r6   )__name__
__module____qualname____doc__rs   r   rI   rC   r@   r?   rA   rB   rD   __classcell__r   r   r&   r   r   (   s&    %
7


8r   c                      s\   e Zd ZdZd% fddZddg ddd	fd&ddZd'ddZd'ddZ	 d(d)d#d$Z  Z	S )*NestedDepthAnything3Neta  
    Nested Depth Anything 3 network with metric scaling capabilities.

    This network combines two DepthAnything3Net branches:
    - Main branch: Standard depth estimation
    - Metric branch: Metric depth estimation for scaling alignment

    The network performs depth alignment using least squares scaling
    and handles sky region masking for improved depth estimation.

    Args:
        preset: Configuration for the main depth estimation branch
        second_preset: Configuration for the metric depth branch
    anyviewr   metricc                   s"   t    t|| _t|| _dS )z
        Initialize NestedDepthAnything3Net with two branches.

        Args:
            preset: Configuration for main depth estimation branch
            second_preset: Configuration for metric depth branch
        N)r   r   r   da3
da3_metric)r%   r{   r|   r&   r   r   r   D  s   

z NestedDepthAnything3Net.__init__NFr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   c           
   	   C  sJ   | j |||||||d}| |}	| ||	}| ||	}| ||	}|S )an  
        Forward pass through both branches with metric scaling alignment.

        Args:
            x: Input images (B, N, 3, H, W)
            extrinsics: Camera extrinsics (B, N, 4, 4) - unused
            intrinsics: Camera intrinsics (B, N, 3, 3) - unused
            feat_layers: List of layer indices to extract features from
            infer_gs: Enable Gaussian Splatting branch
            use_ray_pose: Use ray-based pose estimation
            ref_view_strategy: Strategy for selecting reference view

        Returns:
            Dictionary containing aligned depth predictions and camera parameters
        )r.   r0   r2   r3   )r}   r~   _apply_metric_scaling_apply_depth_alignment_handle_sky_regions)
r%   r)   r+   r-   r.   r0   r2   r3   rH   metric_outputr   r   r   rI   P  s   
zNestedDepthAnything3Net.forwardrH   r   c                 C  s   t |j|j|_|S )z0Apply metric scaling to the metric depth output.)r   rT   r-   )r%   rH   r   r   r   r   r   v  s
   z-NestedDepthAnything3Net._apply_metric_scalingc                 C  s   t |jdd}| dksJ d|j| }t|dd}t|d}t|j||j|j|}|j| }|j| }	t	|	|}
| j|
9  _|j
dddddd	d	f  |
9  < d
|_|
 |_|S )z2Apply depth alignment using least squares scaling.rK   rL   rM   z)Insufficient non-sky pixels for alignmentrN   )Zmax_samplesrf   Nrd   r   )r
   rJ   rS   
depth_confr   r;   rW   r	   rT   r   r+   Z	is_metricitemscale_factor)r%   rH   r   rX   Zdepth_conf_nsZdepth_conf_sampledZmedian_confZ
align_maskZvalid_depthZvalid_metric_depthr   r   r   r   r     s    



&
z.NestedDepthAnything3Net._apply_depth_alignment      i@sky_depth_deffloatc           	      C  s|   t |jdd}|j| }| dkr#tjd| d|jd}|| }n|}tt|d|}t	|j|j
||d\|_|_
|S )	z4Handle sky regions by setting them to maximum depth.rK   rL   rN   r   rO   rP   rQ   rR   )r
   rJ   rT   rU   r;   rV   r<   minrW   r   r   )	r%   rH   r   r   rX   rY   rZ   r[   r\   r   r   r   r     s   

z+NestedDepthAnything3Net._handle_sky_regions)r{   r   r|   r   rt   )rH   r6   r   r6   r5   r6   )r   )rH   r6   r   r6   r   r   r5   r6   )
ru   rv   rw   rx   r   rI   r   r   r   ry   r   r   r&   r   rz   4  s    
&
%rz   )
__future__r   r;   Ztorch.nnr   Zaddictr   	omegaconfr   r   depth_anything_3.cfgr   Z&depth_anything_3.model.utils.transformr   Z depth_anything_3.utils.alignmentr   r	   r
   r   r   r   Zdepth_anything_3.utils.geometryr   r   r   Z depth_anything_3.utils.ray_utilsr   r   r   r   rz   r   r   r   r   <module>   s      