o
    iD                     @   s   d dl mZ d dl mZmZmZ d dlZd dlmZ d dl	mZ d dl
mZ d dlmZmZmZmZ G dd dejZ					
ddedeeef dedededejfddZ	
ddee dedededejf
ddZG dd dejZG dd dejZdS )    )Dict)ListSequenceTupleN)	rearrange)Permutecreate_uv_gridcustom_interpolateposition_grid_to_embedc                #       sX  e Zd ZdZddddddddd	d
dddddddededededededee dedededededededededd f" fd!d"Z	#d9d$e	e
j d%ed&ed'ed(edefd)d*Zd$e	e
j d%ed&ed'edeee
jf f
d+d,Zd$e	e
j de
jfd-d.Z	/d:d0e
jdede
jfd1d2Zd0e
jde
jfd3d4Zd;d0e
jd&ed%ed6ede
jf
d7d8Z  ZS )<DPTaA  
    DPT for dense prediction (main head + optional sky head, sky always 1 channel).

    Returns:
      - Main head:
        * If output_dim>1: { head_name, f"{head_name}_conf" }
        * If output_dim==1: { head_name }
      - Sky head (if use_sky_head=True): { sky_name }  # [B, S, 1, H/down_ratio, W/down_ratio]
          expexpp1   )r   i      r   FdepthTskyreluidt)
patch_size
output_dim
activationconf_activationfeaturesout_channels	pos_embed
down_ratio	head_nameuse_sky_headsky_namesky_activationuse_ln_for_heads	norm_typefusion_block_inplacedim_inr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   returnNc                   sH  t    || _|| _|| _|| _|	| _|
| _|| _|| _	|dk| _
|| _|| _d| _|dkr6t | _n|dkr@t | _ntd| dt fdd|D | _ttj|d	 |d	 d
d
d	dtj|d |d ddd	dt tj|d |d ddddg| _tt||dd| _t||d| j_t||d| j_t||d| j_t|d|d| j_|}d}tj||d dddd| j_ |rt!dt|t!dgng }tj"tj|d |ddddg|tj#ddtj||ddd	dR  | j_$| jr"tj"tj|d |ddddg|tj#ddtj|dddd	dR  | j_%d S d S )Nr   )r   r         layerr   zUnknown norm_type z, should be 'layer' or 'idt'.c              	      s    g | ]}t j |d d ddqS )r   r   kernel_sizestridepadding)nnConv2d).0ocr%    8/data/cameron/da3_repo/src/depth_anything_3/model/dpt.py
<listcomp>_   s     z DPT.__init__.<locals>.<listcomp>r      r*   r'   r(   F)expandinplace)has_residualr9       )r   r'   r(   r   )r   r(   r   r'   T)&super__init__r   r   r   r   r   	head_mainr    out_dimhas_confr   r!   intermediate_layer_idxr.   	LayerNormnormIdentity	Exception
ModuleListprojectsConvTranspose2dr/   resize_layers_make_scratchlistscratch_make_fusion_block
refinenet1
refinenet2
refinenet3
refinenet4output_conv1r   
SequentialReLUoutput_conv2sky_output_conv2)selfr%   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   head_features_1head_features_2ln_seq	__class__r2   r4   r=   *   s   





zDPT.__init__   featsHWpatch_start_idx
chunk_sizec           
         sL  |d d j \  fdd|D }i }d|v r)|dt|d di |du s1|krM| j||||fi |} fdd| D }t|S g td|D ]3t| i }	d|v rq|	d|d  i | jfd	d|D |||fi |	 qUfd
dd 	 D } fdd| D }t|S )a  
        Args:
            feats: List of 4 entries, each entry is a tensor like [B, S, T, C] (or the 0th element of tuple/list is that tensor).
            H, W:  Original image dimensions
            patch_start_idx: Starting index of patch tokens in sequence (for cropping non-patch tokens)
            chunk_size:      Chunk size along time dimension S

        Returns:
            Dict[str, Tensor]
        r   c                    s"   g | ]}|d     qS )r   )reshape)r0   feat)BCNSr3   r4   r5      s   " zDPT.forward.<locals>.<listcomp>imageszB S ... -> (B S) ...Nc                    0   i | ]\}}||j  g|jd d R  qS r   Nviewshaper0   kvre   rh   r3   r4   
<dictcomp>      0 zDPT.forward.<locals>.<dictcomp>c                    s   g | ]}|  qS r3   r3   )r0   f)s0s1r3   r4   r5      s    c                    s*   i | ]  t j fd dD ddqS )c                    s   g | ]}|  qS r3   r3   )r0   odrp   r3   r4   r5      s    z*DPT.forward.<locals>.<dictcomp>.<listcomp>r   )dim)torchcat)r0   )	out_dictsry   r4   rs      s   * c                    rj   rk   rl   ro   rr   r3   r4   rs      rt   )
rn   updater   _forward_implitemsr   rangeminappendkeys)
rW   r^   r_   r`   ra   rb   kwargsextra_kwargsout_dictkwr3   )re   rf   rg   rh   r}   rv   rw   r4   forward   s*   &zDPT.forwardc                 C   s  |d j \}}}|| j || j }}	g }
t| jD ]A\}}|| d d |d f }| |}|ddd ||||	}| j| |}| j	rO| 
|||}| j| |}|
| q| |
}t|| j | j }t|	| j | j }| j|}t|||fddd}| j	r| 
|||}|}| j|}i }| jr|dddd}| |dd d	f | j}| |d
 | j}|d|| j< |d|| j d< n| || jd|| j< | jr| j|}| |d|| j< |S )Nr   r'   r   bilinearTmodealign_cornersr(   .).r   _conf)rn   r   	enumeraterA   rC   permute
contiguousrc   rG   r   _add_pos_embedrI   r   _fuseintr   rL   rR   r	   rU   r@   _apply_activation_singler   r   squeezer>   r   rV   _apply_sky_activationr    )rW   r^   r_   r`   ra   re   _rf   phpwresized_feats	stage_idxtake_idxxfusedh_outw_outrd   main_logitsoutsfmappredconfZ
sky_logitsr3   r3   r4   r      sJ   

zDPT._forward_implc                 C   s   |\}}}}| j |}| j |}| j |}| j |}	| j j|	|jdd d}
| j j|
||jdd d}
| j j|
||jdd d}
| j 	|
|}
|
S )zf
        4-layer top-down fusion, returns finest scale features (after fusion, before neck1).
        r'   Nsize)
rL   	layer1_rn	layer2_rn	layer3_rn	layer4_rnrQ   rn   rP   rO   rN   )rW   r^   l1l2l3l4l1_rnl2_rnl3_rnl4_rnoutr3   r3   r4   r     s   z	DPT._fuselinearr   c                 C   s   t |tr	| n|}|dkrt|S |dkrt|d S |dkr(t|S |dkr1t|S |dkr:t|S |dkrEtjj	
|S |dkrNt|S |S )	z
        Apply activation to single channel output, maintaining semantic consistency with value branch in multi-channel case.
        Supports: exp / relu / sigmoid / softplus / tanh / linear / expp1
        r   r   r   expm1r   sigmoidsoftplustanh)
isinstancestrlowerr{   r   r   r   r   r.   
functionalr   r   )rW   r   r   actr3   r3   r4   r     s    




zDPT._apply_activation_singlec                 C   sD   t | jtr| j n| j}|dkrt|S |dkr t|S |S )z
        Sky head activation (fixed 1 channel):
          * 'sigmoid' -> Sigmoid probability map
          * 'relu'    -> ReLU positive domain output
          * 'linear'  -> Original value (logits)
        r   r   )r   r!   r   r   r{   r   r   )rW   r   r   r3   r3   r4   r   7  s   
	

zDPT._apply_sky_activation皙?ratioc                 C   sp   |j d |j d }}t|||| |j|jd}t||j d | }|dddd |j d ddd}|| S )z:Simple UV position encoding directly added to feature map.r   )aspect_ratiodtypedevicer   r'   r   N)rn   r   r   r   r
   r   r7   )rW   r   r`   r_   r   r   r   per3   r3   r4   r   J  s
   $zDPT._add_pos_embed)r]   )r   )r   )__name__
__module____qualname____doc__r   r   r   boolr=   r   r{   Tensorr   r   TyDictr   r   r   r   floatr   __classcell__r3   r3   r[   r4   r      s    	
{
0
=
,r   Tr   Fr   r   r:   groupsr9   r&   c                 C   s"   t | tj|ddddd|||d	S )Nr8   FT)	r   r   deconvbnr7   r   r   r:   r   )FeatureFusionBlockr.   rT   )r   r   r:   r   r9   r3   r3   r4   rM   V  s   
rM   in_shape	out_shaper7   c           	   	   C   s   t  }|}||rdnd }||rdnd }||rdnd }t j| d |dddd|d|_t j| d |dddd|d|_t j| d |dddd|d|_t j| d |dddd|d|_|S )	Nr'   r   r6   r]   r   r(   Fbiasr   )r.   Moduler/   r   r   r   r   )	r   r   r   r7   rL   c1c2c3c4r3   r3   r4   rJ   j  s   rJ   c                       sP   e Zd ZdZddedejdededdf
 fd	d
Zde	j
de	j
fddZ  ZS )ResidualConvUnitz1Lightweight residual convolution block for fusionr   r   r   r   r   r&   Nc              	      sl   t    || _|| _tj||dddd|d| _tj||dddd|d| _d | _d | _	|| _
tj | _d S )Nr(   r   Tr   )r<   r=   r   r   r.   r/   conv1conv2norm1norm2r   	quantizedFloatFunctionalskip_add)rW   r   r   r   r   r[   r3   r4   r=   ~  s   
zResidualConvUnit.__init__r   c                 C   s^   |  |}| |}| jd ur| |}|  |}| |}| jd ur(| |}| j||S )N)r   r   r   r   r   r   add)rW   r   r   r3   r3   r4   r     s   







zResidualConvUnit.forward)r   )r   r   r   r   r   r.   r   r   r=   r{   r   r   r   r3   r3   r[   r4   r   {  s    &r   c                       s   e Zd ZdZ							ddedejded	ed
ededeeef dededdf fddZ	ddde
jdeeef de
jfddZ  ZS )r   zOTop-down fusion block: (optional) residual merge + upsampling + 1x1 contractionFTNr   r   r   r   r   r7   r   r   r:   r   r&   c
              	      s   t    || _|| _|| _|rt||||	dnd | _t||||	d| _|r*|d n|}
tj	||
dddd|	d| _
tj | _d S )N)r   r'   r   r   Tr   )r<   r=   r   r   r:   r   resConfUnit1resConfUnit2r.   r/   out_convr   r   r   )rW   r   r   r   r   r7   r   r   r:   r   out_featuresr[   r3   r4   r=     s   
zFeatureFusionBlock.__init__r   xsc                G   s   |d }| j rt|dkr| jdur| j|| |d }| |}|du r1| jdu r1ddi}n|du r;d| ji}nd|i}t|fi |d| jd}| 	|}|S )	z
        xs:
          - xs[0]: Top branch input
          - xs[1]: Lateral input (can do residual addition with top branch)
        r   r   Nscale_factorr'   r   r   r   )
r:   lenr   r   r   r   r   r	   r   r   )rW   r   r   yZ	up_kwargsr3   r3   r4   r     s   


zFeatureFusionBlock.forward)FFFTNTr   )r   r   r   r   r   r.   r   r   r   r=   r{   r   r   r   r3   r3   r[   r4   r     s>    
	
0r   )NTr   F)r   F)typingr   r   r   r   r   r{   torch.nnr.   addicteinopsr   'depth_anything_3.model.utils.head_utilsr   r   r	   r
   r   r   r   r   rM   rJ   r   r   r3   r3   r3   r4   <module>   sR     ;


