o
    {i                     @   sZ   d dl Z d dlZd dlZd dlm  mZ d dlm	Z	 ddl
mZ G dd dejjZdS )    N)v2   )NestedTensorc                       s.   e Zd ZdZ fddZdefddZ  ZS )WindowsWrappera  
    This wrapper will take an input (NestedTensor) at size (h, w) and split it
    in `N = n_windows_h * n_windows_w` equally sized windows (the bottom and right windows might
    be a little bit smaller), with sizes that are multiples of the patch size (as the input should be).

    Then, the input will be resized at the size of the top left window (h / n_windows_h, w / n_windows_w).
    This resized input, plus the N windows, will be passed through the backbone.
    Then, the features of the resized input will be resized to the original input size, while the
    features of the windows will be concatenated side by side to reconstruct a feature map also
    corresponding to the original image's size.

    Finally, both the features from the windows and from the resized images are stacked.
    Compared to the output of the backbone of size [B, C, H, W], the output here is [B, 2 * C, H, W]
    c                    s@   t    || _|| _|| _|| _|j| _dd |jD | _d S )Nc                 S   s   g | ]}|d  qS )r    .0elr   r   P/data/cameron/keygrip/volume_dino_tracks/dinov3/eval/detection/models/windows.py
<listcomp>(   s    z+WindowsWrapper.__init__.<locals>.<listcomp>)super__init__	_backbone_n_windows_w_n_windows_h_patch_sizestridesnum_channels)selfbackbonen_windows_wn_windows_h
patch_size	__class__r   r
   r       s   
zWindowsWrapper.__init__tensor_listc              	      s"  |j }|jd |jd }}t| j  j  j }t| j  j  j }|g jd  || jd   g }|g jd  || jd   g }dgtt	| }	dgtt	| }
 fddt
 jD t
 jD ]@}t
 jD ]8}tjj||	| |
| || || d}tjj|j|	| |
| || || d} t||dd | |< qzqstjfd	dt
tD d
d}tjj|||fd} t||jd}tjtjj|d j |jd
d  d|gdd}tj|jd   |jd
d  dtjd }t||dg}|S )Nr         r   c                    s    g | ]}d d t  jD qS )c                 S   s   g | ]}d qS )r   r   r   _r   r   r
   r   4   s    5WindowsWrapper.forward.<locals>.<listcomp>.<listcomp>)ranger   r   )r   r   r
   r   4   s     z*WindowsWrapper.forward.<locals>.<listcomp>)topleftheightwidth)tensorsmaskc                    s(   g | ]}t jd d  | D ddqS )c                 S   s   g | ]}|j qS r   )r&   r   r   r   r
   r   B   s    r    dim)torchcat)r   ih)window_patch_featuresr   r
   r   A   s    r)   )size)r&   shapemathceilr   r   r   listnpcumsumr!   r   
functionalcropr'   r   r   r+   r,   lenresizeFinterpolatefloattobool)r   r   r&   Z
original_hZ
original_wZwindow_hZwindow_wZall_hZall_wZall_h_cumsumZall_w_cumsumr-   iwZwindow_tensorZwindow_maskZwindow_tensorsZresized_global_tensorZglobal_featuresZconcat_tensorsZglobal_maskoutr   )r   r.   r
   forward*   sF   $$ 	

$.zWindowsWrapper.forward)__name__
__module____qualname____doc__r   r   rB   __classcell__r   r   r   r
   r      s    
r   )r2   numpyr5   r+   torch.nn.functionalnnr7   r;   Ztorchvision.transformsr   	util.miscr   Moduler   r   r   r   r
   <module>   s   