o
    {iM                     @   s   d Z ddlZddlZddlm  mZ ddlmZ ddlmZ ddl	m
Z
mZmZmZ ddlmZ dd	lmZ G d
d dejZG dd deZG dd dejZG dd dejZdd ZdS )z.
Deformable DETR model and criterion classes.
    N)nn   )box_ops)NestedTensor_get_clonesinverse_sigmoidnested_tensor_from_tensor_list   )build_backbone)build_transformerc                       sL   e Zd ZdZ						d fdd	Zdefd	d
Zejj	dd Z
  ZS )	PlainDETRzAThis is the Deformable DETR module that performs object detectionTF,  r   c              	      st  t    ||	 }| _| _|j}t|| _t||dd _	| _
|s0t||d  _n	|
r9t|| _tttj|jd |ddtd|g _| _| _| _| _d}td| |  }t||  jj_tj j	jd	 j jd tj j	jd	 jjd  jD ]}tjj!|d j dd
 tj|d jd q|r|j"j#d n|j"j#}|rt$ j| _t$ j	| _	tj j	d jd	 jjdd d  j	 jj"_	n4tj j	jd	 jjdd d t fddt%|D  _t fddt%|D  _	d jj"_	|r2 j jj"_ j	D ]}tj|jd	 jjdd d q| _&|
 _'dS )a  Initializes the model.
        Parameters:
            backbone: torch module of the backbone to be used. See backbone.py
            transformer: torch module of the transformer architecture. See transformer.py
            num_classes: number of object classes
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
            with_box_refine: iterative bounding box refinement
            two_stage: two-stage Deformable DETR
            num_queries_one2one: number of object queries for one-to-one matching part
            num_queries_one2many: number of object queries for one-to-many matching part
            mixed_selection: a trick for Deformable DETR two stage

              r   r   r	   )kernel_size    g{Gz?)gainNg       c                       g | ]} j qS  )class_embed.0_selfr   M/data/cameron/keygrip/volume_dino_tracks/dinov3/eval/detection/models/detr.py
<listcomp>k       z&PlainDETR.__init__.<locals>.<listcomp>c                    r   r   )
bbox_embedr   r   r   r   r   l   r   g        )(super__init__num_queriestransformerd_modelr   Linearr   MLPr   num_feature_levels	Embeddingquery_embed
ModuleList
SequentialConv2dnum_channels	GroupNorm
input_projbackboneaux_losswith_box_refine	two_stagemathlogtorchonesbiasdatainit	constant_layersweightxavier_uniform_decoder
num_layersr   rangenum_queries_one2onemixed_selection)r   r0   r#   num_classesr'   r1   r2   r3   rB   num_queries_one2manyrC   r"   
hidden_dimZ
prior_prob
bias_valueprojZnum_predZ	box_embed	__class__r   r   r!   %   s^   


&"

$
zPlainDETR.__init__samplesc                 C   s  t |ts	t|}| |\}}g }g }t|D ]\}}| \}}	|| j| | ||	 |	dus7J qd}
| jr@| j	rM| j
jd| jddf }
	 tj| j| jgt|jd}d|| jdd| jf< d|d| j| jdf< | ||||
|\}}}}}}}}g }g }g }g }t|jd D ]}|dkr|}n||d  }t|}| j| || }| j| || }|jd dkr||7 }n|jd dksJ |d	ddf  |7  < | }||ddd| jf  ||dd| jdf  ||ddd| jf  ||dd| jdf  qt|}t|}t|}t|}|d |d |d |d d
}| jrM| |||d< | |||d< | jr\| }||d|d< |S )=  The forward expects a NestedTensor, which consists of:
           - samples.tensor: batched images, of shape [batch_size x 3 x H x W]
           - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels

        It returns a dict with the following elements:
           - "pred_logits": the classification logits (including no-object) for all queries.
                            Shape= [batch_size x num_queries x (num_classes + 1)]
           - "pred_boxes": The normalized boxes coordinates for all queries, represented as
                           (center_x, center_y, height, width). These values are normalized in [0, 1],
                           relative to the size of each individual image (disregarding possible padding).
                           See PostProcess for information on how to retrieve the unnormalized bounding box.
           - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                            dictionnaries containing the two above keys for each decoder layer.
        Nr   dtypedeviceTr	   r   r   r   .)pred_logits
pred_boxespred_logits_one2manypred_boxes_one2manyaux_outputsaux_outputs_one2manyrP   rQ   enc_outputs)
isinstancer   r   r0   	enumerate	decomposeappendr/   r3   rC   r)   r=   r"   r6   zerosboolrO   rB   r#   rA   shaper   r   r   sigmoidstackr1   _set_aux_loss)r   rK   featurespossrcsmaskslayerfeatsrcmaskquery_embedsself_attn_maskhsinit_referenceinter_referencesenc_outputs_classenc_outputs_coord_unactenc_outputs_deltaoutput_proposals	max_shapeoutputs_classes_one2oneoutputs_coords_one2oneoutputs_classes_one2manyoutputs_coords_one2manylvl	referenceoutputs_classtmpoutputs_coordoutZenc_outputs_coordr   r   r   forwardw   s   







zPlainDETR.forwardc                 C   s$   dd t |d d |d d D S )Nc                 S   s   g | ]	\}}||d qS )rV   r   )r   abr   r   r   r      s    z+PlainDETR._set_aux_loss.<locals>.<listcomp>r   zip)r   rz   r|   r   r   r   ra      s   $zPlainDETR._set_aux_loss)TFFr   r   F)__name__
__module____qualname____doc__r!   r   r~   r6   jitunusedra   __classcell__r   r   rI   r   r   "   s    Ror   c                   @   s*   e Zd ZdefddZejjdd ZdS )PlainDETRReParamrK   c           "   	   C   s.  t |ts	t|}| |\}}g }g }t|D ]\}}| \}}	|| j| | ||	 |	dus7J qd}
| jr@| j	rM| j
jd| jddf }
	 tj| j| jgt|jd}d|| jdd| jf< d|d| j| jdf< | ||||
|\}}}}}}}}g }g }g }g }g }g }g }g }t|jd D ]}|dkr|}n||d  }| j| || }| j| || }|jd dkrtt|||} nt||ddd| jf  ||dd| jdf  || ddd| jf  || dd| jdf  ||ddd| jf  ||dd| jdf  ||ddd| jf  ||dd| jdf  qt|}t|}t|}t|}|d |d |d |d |d |d |d |d d}!| jr| |||||!d	< | |||||!d
< | jr||||d|!d< |!S )rL   Nr   rM   Tr	   r   r   )rP   rQ   rR   rS   pred_boxes_oldpred_deltasZpred_boxes_old_one2manyZpred_deltas_one2manyrT   rU   rP   rQ   r   r   rW   )rX   r   r   r0   rY   rZ   r[   r/   r3   rC   r)   r=   r"   r6   r\   r]   rO   rB   r#   rA   r^   r   r   r   Zbox_xyxy_to_cxcywhZ
delta2bboxNotImplementedErrorr`   r1   ra   )"r   rK   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   rw   Zoutputs_coords_old_one2oneZoutputs_deltas_one2oneZoutputs_coords_old_one2manyZoutputs_deltas_one2manyrx   ry   rz   r{   r|   r}   r   r   r   r~      s   






zPlainDETRReParam.forwardc                 C   s8   dd t |d d |d d |d d |d d D S )Nc                 S   s"   g | ]\}}}}||||d qS )r   r   )r   r   r   cdr   r   r   r   u  s    
z2PlainDETRReParam._set_aux_loss.<locals>.<listcomp>r   r   )r   rz   r|   Zoutputs_coord_oldZoutputs_deltasr   r   r   ra   p  s   ,zPlainDETRReParam._set_aux_lossN)	r   r   r   r   r~   r6   r   r   ra   r   r   r   r   r      s
     r   c                       s4   e Zd ZdZd	 fdd	Ze d
ddZ  ZS )PostProcesszPThis module converts the model's output into the format expected by the coco apid   Fc                    s   t    || _|| _d S N)r    r!   topkreparam)r   r   r   rI   r   r   r!     s   

zPostProcess.__init__Nc              	   C   s  |d |d }}t |t |ksJ |jd dksJ | jr(|jd dks(J | }tj||jd d| jdd\}}|}	||jd  }
||jd  }t|}t	|d|

dddd}|d\}}| jr|d	d	d	d	f |d	d	d	d	f }}|d
dd	df jt||d |d
dd	df jt||d || d\}}tj||||gdd}ntj||||gdd}||d	d	d	d	d	f  }dd t|	||D }|S )a  Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
                          For evaluation, this must be the original image size (before any data augmentation)
                          For visualization, this should be the image size after data augment, but before padding
        rP   rQ   r	   r   r   r   )dimr   N.)minmaxc                 S   s   g | ]\}}}|||d qS ))scoreslabelsboxesr   )r   slr   r   r   r   r     s    z'PostProcess.forward.<locals>.<listcomp>)lenr^   r   r_   r6   r   viewr   Zbox_cxcywh_to_xyxygather	unsqueezerepeatunbindclamp_
zeros_liker`   r   )r   outputstarget_sizesoriginal_target_sizesZ
out_logitsZout_bboxprobZtopk_valuesZtopk_indexesr   Z
topk_boxesr   r   img_himg_wscale_hscale_wZ	scale_fctresultsr   r   r   r~     s,   	$
&""zPostProcess.forward)r   Fr   )	r   r   r   r   r!   r6   no_gradr~   r   r   r   rI   r   r     s
    r   c                       s(   e Zd ZdZ fddZdd Z  ZS )r&   z4Very simple multi-layer perceptron (also called FFN)c                    sJ   t    || _|g|d  }tdd t|g| ||g D | _d S )Nr	   c                 s   s     | ]\}}t ||V  qd S r   )r   r%   )r   nkr   r   r   	<genexpr>  s    zMLP.__init__.<locals>.<genexpr>)r    r!   r@   r   r*   r   r<   )r   	input_dimrF   
output_dimr@   hrI   r   r   r!     s   
,zMLP.__init__c                 C   s<   t | jD ]\}}|| jd k rt||n||}q|S )Nr	   )rY   r<   r@   Frelu)r   xirf   r   r   r   r~     s   &zMLP.forward)r   r   r   r   r!   r~   r   r   r   rI   r   r&     s    r&   c                 C   sL   t | |}t|}|jstnt}||||j|j|j|j|j	|j
|j|jd
S )N)rD   r'   r1   r2   r3   rB   rE   rC   )r
   r   r   r   r   rD   r'   r1   r2   r3   rB   rE   rC   )Zbackbone_modelargsr0   r#   model_classr   r   r   build_model  s   
r   )r   r4   r6   torch.nn.functionalr   
functionalr   utilr   Z	util.miscr   r   r   r   r0   r
   r#   r   Moduler   r   r   r&   r   r   r   r   r   <module>   s     M 0