
    gD,                    x    d dl mZ d dlZd dlmZmZ d dlZd dlmZ d dl	mc m
Z  G d dej                        Zy)    )annotationsN)IterableOptionalc                       e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d fdZd Zd Zedd       Z	 	 	 	 ddZ	dddZ
dddZ	 d	 dd	Zdd
Z xZS )DepthProEncoderzmDepthPro Encoder.

    An encoder aimed at creating multi-resolution encodings from Vision Transformers.
    c                \   t         	|           t        |      | _        || _        || _        t        |      | _        |j                  }|j                  }t        |j                  j                  d   |j                  j                  d   z        | _        	 d	 	 	 	 	 	 	 	 	 dd} ||| j                  d   |d      | _         ||| j                  d   d      | _         ||| j                  d   d      | _         ||| j                  d   d      | _         ||| j                  d   d      | _        t%        j&                  || j                  d   dddd	
      | _        t%        j*                  | j                  d   | j                  d   z   | j                  d   dddd	
      | _        | j                  j.                  | j                  d      j1                  | j2                         | j                  j.                  | j                  d      j1                  | j4                         y)a  Initialize DepthProEncoder.

        The framework
            1. creates an image pyramid,
            2. generates overlapping patches with a sliding window at each pyramid level,
            3. creates batched encodings via vision transformer backbones,
            4. produces multi-resolution encodings.

        Args:
        ----
            img_size: Backbone image resolution.
            dims_encoder: Dimensions of the encoder at different layers.
            patch_encoder: Backbone used for patches.
            image_encoder: Backbone used for global image encoder.
            hook_block_ids: Hooks to obtain intermediate features for the patch encoder model.
            decoder_features: Number of feature output in the decoder.

        r   Nc                    ||}t        j                  | |dddd      g}|t        |      D cg c]$  }t        j                  |dk(  r|n||dddd      & c}z  }t        j                  | S c c}w )N   r   Fin_channelsout_channelskernel_sizestridepaddingbias   )nnConv2drangeConvTranspose2d
Sequential)dim_indim_outupsample_layersdim_intblocksis         F/home/cameronsmith/repos/ml-depth-pro/src/depth_pro/network/encoder.py_create_project_upsample_blockz@DepthProEncoder.__init__.<locals>._create_project_upsample_block<   s     ! 		 &!( !	F  /
  ""+,6w!( !
 
F ==&))
s   )A/   )r   r   r   r   r   )r   r   r   r
   Tr   N)
r   intr   r"   r   r"   r   zOptional[int]return	nn.Module)super__init__listdims_encoderpatch_encoderimage_encoderhook_block_ids	embed_dimr"   patch_embedimg_size
patch_sizeout_sizeupsample_latent0upsample_latent1	upsample0	upsample1	upsample2r   r   upsample_lowresr   fuse_lowresr   register_forward_hook_hook0_hook1)
selfr(   r)   r*   r+   decoder_featurespatch_encoder_embed_dimimage_encoder_embed_dimr   	__class__s
            r   r&   zDepthProEncoder.__init__   sP   4 	 .**">2"/"9"9"/"9"9%%..q1]5N5N5Y5YZ[5\\
 &*	!	*!	*!	* !!	* #	!	*
 !	*F !?*%%a($	!
 !?*D4E4Ea4HZ[!
 8*D4E4Ea4HZ[
 8*D4E4Ea4HZ[
 8*D4E4Ea4HZ[
  "11/**1- 
 99**1-0A0A!0DD**1-
 	!!$"5"5a"89OOKK	
 	!!$"5"5a"89OOKK	
    c                    || _         y r!   )backbone_highres_hook0r;   modelinputoutputs       r   r9   zDepthProEncoder._hook0   
    &,#r@   c                    || _         y r!   )backbone_highres_hook1rC   s       r   r:   zDepthProEncoder._hook1   rG   r@   c                N    | j                   j                  j                  d   dz  S )z.Return the full image size of the SPN network.r      )r)   r-   r.   )r;   s    r   r.   zDepthProEncoder.img_size   s&     !!--66q9A==r@   c                x    |}t        j                  |dddd      }t        j                  |dddd      }|||fS )zCreate a 3-level image pyramid.N      ?bilinearF)sizescale_factormodealign_corners      ?)Finterpolate)r;   xx0x1x2s        r   _create_pyramidzDepthProEncoder._create_pyramid   sM    
  ]]Ds5

 ]]Dt*E
 2rzr@   c           	     d   d}t        |d|z
  z        }|j                  d   }t        t        j                  ||z
  |z              dz   }g }t	        |      D ]A  }||z  }	|	|z   }
t	        |      D ]'  }||z  }||z   }|j                  |d|	|
||f          ) C t        j                  |d      S )z7Split the input into small patches with sliding window.i  r
   .r   dim)r"   shapemathceilr   appendtorchcat)r;   rV   overlap_ratior/   patch_stride
image_sizestepsx_patch_listjj0j1r   i0i1s                 r   splitzDepthProEncoder.split   s    
:]):;<WWR[
DIIzJ6,FGH1Lu 	:A\!BjB5\ :%*_##Ac2b5"R%&7$89:		: yy1--r@   c                   t        t        j                  |j                  d   |z              }d}g }t	        |      D ]  }g }t	        |      D ]r  }	|||z  ||dz   z   }
|dk7  r|
d|dddf   }
|	dk7  r|
ddd|df   }
||dz
  k7  r|
dd| ddf   }
|	|dz
  k7  r|
dddd| f   }
|j                  |
       |dz  }t t        j                  |d      }|j                  |        t        j                  |d      }
|
S )z9Merge the patched input into a image with sliding window.r   r
   .Nr\   r]   )r"   r`   sqrtr_   r   rb   rc   rd   )r;   rV   
batch_sizer   rh   idxoutput_listrj   output_row_listr   rF   
output_rows               r   mergezDepthProEncoder.merge   s7   DIIaggajJ678u 	+A O5\ :+jC!G.DE6#C1$45F6#CGH$45F	>#C7(A$56F	>#CIgXI$56F&&v.q ?;Jz*%	+& ;B/r@   c                    |j                   \  }}}|dkD  r|dd|dddf   }|j                  ||||      j                  dddd      }|S )z<Discard class token and reshape 1D feature map to a 2D grid.r   Nr    r
   r   )r_   reshapepermute)r;   
embeddingswidthheightcls_token_offsetbhwcs           r   reshape_featurezDepthProEncoder.reshape_feature   sc     ##2q a#A'7'8!$;<J  ''65!<DDQ1aP
r@   c                   |j                   d   }| j                  |      \  }}}| j                  |d      }| j                  |d      }|}t        j                  |||fd      }	| j                  |	      }
| j                  |
| j                  | j                        }
| j                  | j                  | j                  | j                        }| j                  |d|dz  dz   |d	      }| j                  | j                  | j                  | j                        }| j                  |d|dz  dz   |d	      }t        j                  |
t        |      t        |      t        |      gd      \  }}}| j                  ||d	      }| j                  ||d
	      }|}| j                  |      }| j                  || j                  | j                        }| j                  |      }| j                  |      }| j                  |      }| j!                  |      }| j#                  |      }| j%                  |      }| j'                  t        j                  ||fd            }|||||gS )zEncode input at multiple resolutions.

        Args:
        ----
            x (torch.Tensor): Input image.

        Returns:
        -------
            Multi resolution encoded features.

        r   rS   )re   rM   r]   N   r    )rs   r      r
   )r_   rZ   ro   rc   rd   r)   r   r0   rB   rx   rI   lenr*   r1   r2   r3   r4   r5   r6   r7   )r;   rV   rs   rW   rX   rY   
x0_patches
x1_patches
x2_patchesx_pyramid_patchesx_pyramid_encodingsx_latent0_encodingsx_latent0_featuresx_latent1_encodingsx_latent1_featuresx0_encodingsx1_encodingsx2_encodingsx0_featuresx1_featuresx2_featuresx_global_featuress                         r   forwardzDepthProEncoder.forward   s    WWQZ
 ))!,
B
 ZZ$Z7
ZZ#Z6

 "IIZ0
 #001BC"22
 #22''MMMM

 "ZZ 4*q.1"45*VW ( 
 #22''MMMM

 "ZZ 4*q.1"45*VW ( 

 49;;_c*os:?4
0lL jj*ajP jj*ajP # !..z: 00t}}dmm

 "223EF!223EFnn[1nn[1nn[1 001BC ,,II{$56A>

 
 	
r@   )
r(   Iterable[int]r)   r$   r*   r$   r+   r   r<   r"   )r#   r"   )rV   torch.Tensorr#   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor])rS   )rV   r   re   floatr#   r   )r    )rV   r   rs   r"   r   r"   r#   r   )r
   )r|   r   )rV   r   r#   zlist[torch.Tensor])__name__
__module____qualname____doc__r&   r9   r:   propertyr.   rZ   ro   rx   r   r   __classcell__)r?   s   @r   r   r      s    
v
#v
 !v
 !	v

 &v
 v
p-- > >	8&.(< IJ&c
r@   r   )
__future__r   r`   typingr   r   rc   torch.nnr   torch.nn.functional
functionalrT   Moduler    r@   r   <module>r      s.    #  %    ~
bii ~
r@   