o
    ¯§{i·I  ã                   @   s´   d dl Z d dlZd dlm  mZ d dlmZ d dlmZmZm	Z	 ddl
mZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ G dd„ dejƒZG dd„ deƒZdd„ ZdS )é    N)Únn)Ú	constant_Únormal_Úxavier_uniform_é   )Úbox_xyxy_to_cxcywhÚ
delta2bboxé   )Úbuild_global_ape_decoder)Úbuild_global_rpe_decomp_decoder)ÚTransformerEncoderÚTransformerEncoderLayer)ÚLayerNorm2Dc                       s‚   e Zd Zdddddddddd	g d
¢ddddddddf‡ fdd„	Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Zdd„ Z	d!dd „Z
‡  ZS )"ÚTransformeré   é   é   Fi,  Ú	post_normÚdeformr	   é   )r   r   é    é@   é2   Ni   gš™™™™™¹?Úrelué   c              	      s|  t ƒ  ¡  || _|| _|| _|| _|dv sJ d|› ƒ‚|dkr&t|ƒ| _n|dkr0t|ƒ| _nt	‚t
 t ||¡¡| _|r_t
 ||¡| _t
 |¡| _t
 |d |d ¡| _t
 |d ¡| _nt
 |d¡| _|| _|	| _|| _|| _|r|	dkrt|ƒ|	ks‚J ‚|
| _t
 g ¡| _|D ]Š}||
krœ| j t
 ¡ ¡ q||
krÜt t! "||
 ¡ƒ}g }t#|d ƒD ]}|t
j$||dddt%|ƒt
 &¡ g7 }q±| t
j$||ddd¡ | j t
j'|Ž ¡ qt t! "|
| ¡ƒ}g }t#|d ƒD ]}|t
j(||dddt%|ƒt
 &¡ g7 }qí| t
j(||ddd¡ | j t
j'|Ž ¡ qd | _)|r8t*||||||ƒ}|r/t
 |¡nd }t+|||ƒ| _)|  ,¡  d S )N)Úpre_normr   z1expected norm type is pre_norm or post_norm, get Z
global_apeÚglobal_rpe_decompr   r	   )Úkernel_sizeÚstride)-ÚsuperÚ__init__Úd_modelÚnheadÚ	two_stageÚtwo_stage_num_proposalsr
   Údecoderr   ÚNotImplementedErrorr   Ú	ParameterÚtorchÚTensorÚlevel_embedÚLinearÚ
enc_outputÚ	LayerNormÚenc_output_normÚ	pos_transÚpos_trans_normÚreference_pointsÚmixed_selectionÚproposal_feature_levelsÚproposal_tgt_stridesÚproposal_min_sizeÚlenÚproposal_in_strideÚ
ModuleListÚenc_output_projÚappendÚIdentityÚintÚmathÚlog2ÚrangeÚConv2dr   ÚGELUÚ
SequentialÚConvTranspose2dÚencoderr   r   Ú_reset_parameters)Úselfr!   r"   Únum_feature_levelsr#   r$   r2   Ú	norm_typeÚdecoder_typer3   r7   r4   r5   ÚargsÚadd_transformer_encoderÚdim_feedforwardÚdropoutÚ
activationZnormalize_beforeÚnum_encoder_layersr   ÚscaleÚlayersÚ_Úencoder_layerÚencoder_norm©Ú	__class__© úT/data/cameron/keygrip/volume_dino_tracks/dinov3/eval/detection/models/transformer.pyr       s~   
ýýúzTransformer.__init__c                 C   sv   |   ¡ D ]}| ¡ dkrtj |¡ q| js't| jjjdd t	| jj
jdƒ t| jƒ t| jdƒr9| j ¡  d S d S )Nr	   g      ð?)Úgaing        rE   )Ú
parametersÚdimr   Úinitr   r#   r1   ÚweightÚdatar   Úbiasr   r*   Úhasattrr%   rE   )rF   ÚprW   rW   rX   rE      s   €
ÿzTransformer._reset_parametersc              	   C   sÈ   | j d }d}dtj }tj|tj|jd}|d|d  |  }|| }|d d …d d …d d …d f | }tj|d d …d d …d d …dd d…f  ¡ |d d …d d …d d …dd d…f  	¡ fdd 
d¡}|S )Nr   i'  ©ÚdtypeÚdevicer   r	   r   ©r[   )r!   r=   Úpir(   ÚarangeÚfloat32rd   ÚstackÚsinÚcosÚflatten)rF   Ú	proposalsÚnum_pos_featsÚtemperaturerP   Údim_tÚposrW   rW   rX   Úget_proposal_pos_embed   s   

 \z"Transformer.get_proposal_pos_embedc                 C   sF  | j dkr|  |||¡\}}}|j\}}}g }d}t|ƒD ]°\}	\}
}|d d …|||
|  …f  ||
|d¡}t |d d …d d …ddf  d¡}t |d d …dd d …df  d¡}t tjd|
d |
tj	|j
dtjd|d |tj	|j
d¡\}}t | d¡| d¡gd¡}t | d¡| d¡gd¡ |ddd¡}| d¡ |ddd¡d | }t |¡d d|	  }t ||fd¡ |dd	¡}| |¡ ||
| 7 }qt |d¡}|d
k|dk @ jddd}t |d|  ¡}| | d¡tdƒ¡}| | tdƒ¡}|}| | d¡tdƒ¡}| | tdƒ¡}|  |  |¡¡}d }|||fS )Nr	   r   rb   éÿÿÿÿr   ç      à?gš™™™™™©?ç       @r   ç{®Gáz„?ç®Gáz®ï?T©ÚkeepdimÚinf)r3   Úexpand_encoder_outputÚshapeÚ	enumerateÚviewr(   ÚsumÚmeshgridÚlinspacerh   rd   ÚcatÚ	unsqueezeÚexpandÚ	ones_liker:   ÚallÚlogÚmasked_fillÚfloatr.   r,   )rF   ÚmemoryÚmemory_padding_maskÚspatial_shapesÚN_ÚS_ÚC_rm   Ú_curÚlvlÚH_ÚW_Úmask_flatten_Úvalid_HÚvalid_WÚgrid_yÚgrid_xÚgridrP   ÚwhÚproposalÚoutput_proposalsÚoutput_proposals_validÚoutput_memoryÚ	max_shaperW   rW   rX   Úgen_encoder_output_proposalsœ   sB   

ÿ(""þ(

z(Transformer.gen_encoder_output_proposalsc           
      C   sx   |j \}}}t |d d …d d …df  d¡}t |d d …dd d …f  d¡}| ¡ | }| ¡ | }t ||gd¡}	|	S )Nr   r	   rs   )r|   r(   r   r‰   ri   )
rF   ÚmaskrR   ÚHÚWr•   r–   Zvalid_ratio_hZvalid_ratio_wZvalid_ratiorW   rW   rX   Úget_valid_ratioÄ   s     zTransformer.get_valid_ratioc                 C   s  t |ƒdksJ d|› dƒ‚|j\}}}|d \}}| ||||¡ dddd¡}	| |||¡}
g g g }}}t| jƒD ]4}| j| |	ƒ}tj|
d   	¡ |jdd … d 
tj¡}| |¡ | | d¡¡ | |jdd … ¡ q<tjd	d
„ |D ƒdd}tjdd
„ |D ƒdd}|||fS )Nr	   zGet encoder output of shape z, not sure how to expandr   é   r   éþÿÿÿ)Úsizec                 S   s   g | ]}|  d ¡ dd ¡‘qS )r   r	   )rl   Ú	transpose)Ú.0ÚmemrW   rW   rX   Ú
<listcomp>ß   s    z5Transformer.expand_encoder_output.<locals>.<listcomp>re   c                 S   s   g | ]}|  d ¡‘qS )r	   )rl   )r©   r¡   rW   rW   rX   r«   à   ó    )r6   r|   r~   Úpermuter?   r3   r9   ÚFÚinterpolater‰   Útor(   Úboolr:   Úsqueezer‚   )rF   rŠ   r‹   rŒ   ÚbsrR   ÚcÚhÚwZ_out_memoryZ_out_memory_padding_maskZ
out_memoryZout_memory_padding_maskZout_spatial_shapesÚirª   r¡   rW   rW   rX   r{   Í   s   (

z!Transformer.expand_encoder_outputc              	   C   s    |   |||¡\}}}| jj| jj |ƒ}d }| jj| jj |ƒ| }	| j}
tj|d |
ddd }t |	d| 	d¡ 
ddd¡¡}| ¡ }| ¡ }||||	||fS ©N).r   r	   re   rs   r   )r    r%   Úclass_embedÚ
num_layersÚ
bbox_embedr$   r(   ÚtopkÚgatherrƒ   ÚrepeatÚdetachÚsigmoid©rF   rŠ   Úmask_flattenrŒ   rž   rœ   rŸ   Úenc_outputs_classÚenc_outputs_deltaÚenc_outputs_coord_unactr¼   Ztopk_proposalsZtopk_coords_unactr1   rW   rW   rX   Úget_reference_pointsã   s$   
ÿúz Transformer.get_reference_pointsc           $         sf  g }g }g }g }	t t|||ƒƒD ]I\}
\}}}|j\}}}}||f}|	 |¡ | d¡ dd¡}| d¡}| d¡ dd¡}|ˆ j|
  ddd¡ }| |¡ | |¡ | |¡ qt 	|d¡}t 	|d¡}t 	|d¡}d }t 
‡ fdd„|D ƒd¡}ˆ jd ur‰ˆ j|||d}n|}|j\}}}ˆ jrßˆ  |||	¡\}}}}}}|}tj|ˆ jd| f|jd}ˆ  ˆ  ˆ  |¡¡¡}ˆ jsÊtj||dd\}} n>| d	¡ |dd¡} tj||dd\}}n)tj||dd\}} | d	¡ |dd¡}|  d	¡ |dd¡} ˆ  |¡ ¡ }|}d }ˆ  | ||||	||||||¡\}!}"|"}#ˆ jr)|!||#|||||fS |!||#d d d d d fS )
Nr   r	   rs   c                    s   g | ]}ˆ   |¡‘qS rW   )r¤   )r©   Úm©rF   rW   rX   r«     r¬   z'Transformer.forward.<locals>.<listcomp>)Úsrc_key_padding_maskrq   )rd   re   r   )r}   Úzipr|   r:   rl   r¨   r*   r~   r(   r‚   ri   rD   r#   rÆ   Úzerosr$   rd   r0   r/   rr   r2   Úsplitrƒ   r„   r1   rÀ   r%   )$rF   ÚsrcsÚmasksZ
pos_embedsÚquery_embedÚself_attn_maskZsrc_flattenrÂ   Zlvl_pos_embed_flattenrŒ   r‘   Úsrcr¡   Z	pos_embedr³   r´   rµ   r¶   Zspatial_shapeZlvl_pos_embedZlevel_start_indexZvalid_ratiosrŠ   rR   r1   rŸ   rÃ   rÅ   rÄ   rœ   Zinit_reference_outZpos_trans_outÚtgtÚhsÚinter_referencesZinter_references_outrW   rÈ   rX   Úforwardû   sŽ   




ùõø
zTransformer.forward)NN)Ú__name__Ú
__module__Ú__qualname__r    rE   rr   r    r¤   r{   rÆ   rÕ   Ú__classcell__rW   rW   rU   rX   r      s8    ëc(	r   c                   @   s   e Zd Zdd„ Zdd„ ZdS )ÚTransformerReParamc                 C   s   | j dkr|  |||¡\}}}|j\}}}g }d}t|ƒD ]l\}	\}
}| j|	 }t tjd|
d |
tj|j	dtjd|d |tj|j	d¡\}}t 
| d¡| d¡gd¡}| d¡ |ddd¡d | }t |¡| j d|	  }t 
||fd¡ |dd¡}| |¡ ||
| 7 }qt 
|d¡}|d \}
}| jd }|d d …d |
| …f  ||
|d¡}tj|d d …d d …ddf  ddd	| }tj|d d …dd d …df  ddd	| }tj
||||gdd
}| d¡}|d| k|d| k @ jddd	}| | d¡ ddd¡t|
|ƒ| ¡}| | t|
|ƒ| ¡}|}| | d¡tdƒ¡}| | tdƒ¡}|  |  |¡¡}|d d …d d d …f |d d …d d d …f f}|||fS )Nr	   r   rb   rs   rt   ru   r   Trx   re   rv   rw   )r3   r{   r|   r}   r4   r(   r€   r   rh   rd   r‚   rƒ   r„   r…   r5   r~   r:   r   r†   rˆ   r¾   Úmaxr‰   r.   r,   )rF   rŠ   r‹   rŒ   r   rŽ   r   rm   r   r‘   r’   r“   r   r—   r˜   r™   rš   r›   rœ   r”   r•   r–   Úimg_sizer   rž   rŸ   rW   rW   rX   r    T  sP   

ÿ
þ

$**
ÿÿ,
z/TransformerReParam.gen_encoder_output_proposalsc              	   C   s¤   |   |||¡\}}}| jj| jj |ƒ}| jj| jj |ƒ}tt|||ƒƒ}	| j}
tj	|d |
ddd }t 
|	d| d¡ ddd¡¡}| ¡ }|}||||	||fS r¸   )r    r%   r¹   rº   r»   r   r   r$   r(   r¼   r½   rƒ   r¾   r¿   rÁ   rW   rW   rX   rÆ   „  s$   
ÿúz'TransformerReParam.get_reference_pointsN)rÖ   r×   rØ   r    rÆ   rW   rW   rW   rX   rÚ   S  s    0rÚ   c                 C   sV   | j stnt}|| j| j| j| j| j| j | j	| j
| j| j| j| j| | j| j| jdS )N)r!   r"   rG   r#   r$   r2   rH   rI   r3   r7   r4   rJ   r5   rK   rO   )Úreparamr   rÚ   Ú
hidden_dimÚnheadsrG   r#   Únum_queries_one2oneÚnum_queries_one2manyr2   rH   rI   r3   r7   r4   r5   rK   rO   )rJ   Úmodel_classrW   rW   rX   Úbuild_transformer  s$   
ðrã   )r=   r(   Útorch.nn.functionalr   Ú
functionalr®   Ztorch.nn.initr   r   r   Zutil.box_opsr   r   Zglobal_ape_decoderr
   Zglobal_rpe_decomp_decoderr   Ztransformer_encoderr   r   Úutilsr   ÚModuler   rÚ   rã   rW   rW   rW   rX   Ú<module>   s     8J