
    g#                    b   d dl mZ d dlmZ d dlmZmZmZmZ d dl	Z	d dl	m
Z
 d dlmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ e G d d             Z edddddd      Z	 	 	 	 ddZe e	j<                  d      e	j>                  f	 	 	 	 	 	 	 ddZ  G d de
jB                        Z"y)    )annotations)	dataclass)MappingOptionalTupleUnionN)nn)ComposeConvertImageDtypeLambda	NormalizeToTensor   )MultiresConvDecoder)DepthProEncoder)
FOVNetwork)VIT_CONFIG_DICT	ViTPreset
create_vitc                  Z    e Zd ZU dZded<   ded<   ded<   dZded	<   dZd
ed<   dZded<   y)DepthProConfigzConfiguration for DepthPro.r   patch_encoder_presetimage_encoder_presetintdecoder_featuresNzOptional[str]checkpoint_urizOptional[ViTPreset]fov_encoder_presetTbooluse_fov_head)__name__
__module____qualname____doc____annotations__r   r   r        @/home/cameronsmith/repos/ml-depth-pro/src/depth_pro/depth_pro.pyr   r      s7    %####$(NM(.2+2L$r&   r   dinov2l16_384z./checkpoints/depth_pro.pt   T)r   r   r   r   r   r   c                d    | t         v rt         |    }t        | d      }||fS t        d|  d      )zCreate and load a backbone model given a config.

    Args:
    ----
        preset: A backbone preset to load pre-defind configs.

    Returns:
    -------
        A Torch module and the associated config.

    F)presetuse_pretrainedzPreset z not found.)r   r   KeyError)r+   configmodels      r'   create_backbone_modelr0   1   sD       (&? &= 455r&   cpuc           	        t        | j                        \  }}t        | j                        \  }}d}| j                  r%| j                  t        | j                        \  }}|j
                  }|j                  }	t        ||||	| j                        }
t        | j                  gt        |
j                        z   | j                        }t        |
|d| j                  |      j                        }|t        j                  k(  r|j                          t!        t#               t%        fd      t'        g dg d      t)        |      g      }| j*                  t        j,                  | j*                  d	
      }|j/                  |d      \  }}t1        |      dk7  rt3        d|       |D cg c]	  }d|vs| }}t1        |      dk7  rt3        d|       ||fS c c}w )a  Create a DepthPro model and load weights from `config.checkpoint_uri`.

    Args:
    ----
        config: The configuration for the DPT model architecture.
        device: The optional Torch device to load the model onto, default runs on "cpu".
        precision: The optional precision used for the model, default is FP32.

    Returns:
    -------
        The Torch DepthPro model and associated Transform.

    )r+   N)dims_encoderpatch_encoderimage_encoderhook_block_idsr   )r3   dim_decoder)    r   )encoderdecoder	last_dimsr   fov_encoderc                &    | j                        S )N)to)xdevices    r'   <lambda>z-create_model_and_transforms.<locals>.<lambda>   s    QTT&\ r&   )      ?rB   rB   r1   )map_locationT)
state_dictstrictr   z.Found unexpected keys when loading monodepth: fc_normz)Keys are missing when loading monodepth: )r0   r   r   r   r   encoder_feature_dimsencoder_feature_layer_idsr   r   r   listr3   DepthPror>   torchhalfr
   r   r   r   r   r   loadload_state_dictlenr-   )r.   r@   	precisionr4   patch_encoder_configr5   _r<   r3   r6   r9   r:   r/   	transformrD   missing_keysunexpected_keyskeys    `                r'   create_model_and_transformsrW   H   s   $ +@**+'M' -**M1 Kv88D.f6O6OPQ'<<L)CCN!##%00G "--.g6J6J1KK++G (( 	bj 
 EJJ

J)*o7i(		
I (ZZ 5 5EJ
(-(=(=!$ )> )
%o 1$@@QR  (4Ly7KLL|!F|nUVV)	 Ms   :	G*G*c                       e Zd ZdZ	 	 d	 	 	 	 	 	 	 	 	 d fdZedd       Zd	dZ ej                         	 	 d
	 	 	 	 	 dd       Z
 xZS )rJ   zDepthPro network.c                X   t         |           || _        || _        |j                  }t        j                  t        j                  ||dz  ddd      t        j                  |dz  |dz  dddd      t        j                  |dz  |d   ddd      t        j                  d      t        j                  |d   |d   ddd      t        j                               | _
        | j                  d   j                  j                  j                  d       |rt        ||	      | _        y
y
)am  Initialize DepthPro.

        Args:
        ----
            encoder: The DepthProEncoder backbone.
            decoder: The MultiresConvDecoder decoder.
            last_dims: The dimension for the last convolution layers.
            use_fov_head: Whether to use the field-of-view head.
            fov_encoder: A separate encoder for the field of view.

              r   )kernel_sizestridepaddingr   T)in_channelsout_channelsr\   r]   r^   bias   )num_featuresr<   N)super__init__r9   r:   r7   r	   
SequentialConv2dConvTranspose2dReLUheadra   datafill_r   fov)selfr9   r:   r;   r   r<   r7   	__class__s          r'   re   zDepthPro.__init__   s   & 	))MMII[A-1QPQ '1,(A- IIq ! GGDMIIilIaLaSTUGGI+
	2 			!$$Q' !{TDH r&   c                .    | j                   j                  S )z.Return the internal image size of the network.)r9   img_size)rn   s    r'   rq   zDepthPro.img_size   s     ||$$$r&   c                H   |j                   \  }}}}|| j                  k(  r|| j                  k(  sJ | j                  |      }| j                  |      \  }}| j	                  |      }d}	t        | d      r*| j                  j                  ||j                               }	||	fS )a  Decode by projection and fusion of multi-resolution encodings.

        Args:
        ----
            x (torch.Tensor): Input image.

        Returns:
        -------
            The canonical inverse depth map [m] and the optional estimated field of view [deg].

        Nrm   )	shaperq   r9   r:   rj   hasattrrm   forwarddetach)
rn   r?   rR   HW	encodingsfeatures
features_0canonical_inverse_depthfov_degs
             r'   ru   zDepthPro.forward   s     WW
1aDMM!a4==&88LLO	#||I6*"&))H"54hh&&q**;*;*=>G&//r&   c           	        t        |j                        dk(  r|j                  d      }|j                  \  }}}}|| j                  k7  xs || j                  k7  }|r9t        j
                  j                  || j                  | j                  f|d      }| j                  |      \  }}	|Nd|z  t        j                  dt        j                  |	j                  t        j                              z        z  }|||z  z  }
|j                         }|r%t        j
                  j                  |
||f|d      }
dt        j                  |
dd	      z  }|j                         |d
S )a  Infer depth and fov for a given image.

        If the image is not at network resolution, it is resized to 1536x1536 and
        the estimated depth is resized to the original image resolution.
        Note: if the focal length is given, the estimated value is ignored and the provided
        focal length is use to generate the metric depth values.

        Args:
        ----
            x (torch.Tensor): Input image
            f_px (torch.Tensor): Optional focal length in pixels corresponding to `x`.
            interpolation_mode (str): Interpolation function for downsampling/upsampling. 

        Returns:
        -------
            Tensor dictionary (torch.Tensor): depth [m], focallength [pixels].

        r[   r   F)sizemodealign_cornersrB   g      ?g-C6?g     @)minmax)depthfocallength_px)rO   rs   	unsqueezerq   r	   
functionalinterpolateru   rK   tandeg2radr>   floatsqueezeclamp)rn   r?   f_pxinterpolation_moderR   rw   rx   resizer|   r}   inverse_depthr   s               r'   inferzDepthPro.infer   sD   2 qww<1AAWW
1admm#9qDMM'9))mmT]]3'#	 * A ,0<<?(<7UYYsU]]7::ekk;R-S'STTD/1t8<||~MM55QF1CSX 6 M ekk-TsCC ]]_"
 	
r&   )TN)
r9   r   r:   r   r;   ztuple[int, int]r   r   r<   zOptional[nn.Module])returnr   )r?   torch.Tensorr   z+Tuple[torch.Tensor, Optional[torch.Tensor]])Nbilinear)r?   r   r   z$Optional[Union[float, torch.Tensor]]r   zMapping[str, torch.Tensor])r    r!   r"   r#   re   propertyrq   ru   rK   no_gradr   __classcell__)ro   s   @r'   rJ   rJ      s     "+/6U 6U %6U #	6U
 6U )6Up % %02 U]]_ 6:%	6
6
 36

 
$6
 6
r&   rJ   )r+   r   r   zTuple[nn.Module, ViTPreset])r.   r   r@   ztorch.devicerP   ztorch.dtyper   zTuple[DepthPro, Compose])#
__future__r   dataclassesr   typingr   r   r   r   rK   r	   torchvision.transformsr
   r   r   r   r   network.decoderr   network.encoderr   network.fovr   network.vit_factoryr   r   r   r   DEFAULT_MONODEPTH_CONFIG_DICTr0   r@   float32rW   ModulerJ   r%   r&   r'   <module>r      s   
 # ! 2 2    1 , # G G 	 	 	 !/((/&!  0 ;'5<<."]]OOO O 	OdP
ryy P
r&   