o
    i                     @   s   d dl T d dlZd dlmZ d dlm  mZ dejfddZdejfddZ	dejfdd	Z
d
ejjdejjej fddZdS )    )*Nmodulec                    s.   ddl m  G  fdddj}|_S )Nr   )
checkpointc                       s$   e Zd ZjZ fddZ  ZS )zFwrap_module_with_gradient_checkpointing.<locals>._CheckpointingWrapperc                    s   t  jg|R ddi|S )Nuse_reentrantF)superforward)selfargskwargs)	__class__r    +/data/cameron/moge_repo/moge/model/utils.pyr      s   zNwrap_module_with_gradient_checkpointing.<locals>._CheckpointingWrapper.forward)__name__
__module____qualname__r   _restore_clsr   __classcell__r   r   r   )r   r   _CheckpointingWrapper	   s    r   )torch.utils.checkpointr   r   )r   r   r   r   r   'wrap_module_with_gradient_checkpointing   s   r   c                 C   s   | j j| _ d S N)r   r   )r   r   r   r   )unwrap_module_with_gradient_checkpointing   s   r   c                 C   s.   t jdks	J dG dd d| j}|| _| S )Nz2.0z"SDPA requires PyTorch 2.0 or laterc                   @   s$   e Zd ZddejdejfddZdS )z:wrap_dinov2_attention_with_sdpa.<locals>._AttentionWrapperNxreturnc           
      S   s   |j \}}}| |||d| j|| j ddddd}t|d\}}}	t|||	|}|dddd|||}| 	|}| 
|}|S )N      r         )shapeqkvreshape	num_headspermutetorchunbindFscaled_dot_product_attentionprojZ	proj_drop)
r   r   	attn_biasBNCr    qkvr   r   r   r      s   .

zBwrap_dinov2_attention_with_sdpa.<locals>._AttentionWrapper.forwardr   )r   r   r   r$   Tensorr   r   r   r   r   _AttentionWrapper   s    r1   )r$   __version__r   )r   r1   r   r   r   wrap_dinov2_attention_with_sdpa   s   r3   bucketr   c                 C   sL   t jjj}| }| }|| t jj||d t j	 }|
| |S )N)group)r$   distributedr5   WORLDsizebufferdiv_
all_reducefuturesFuture
set_result)stater4   group_to_use
world_sizegradfutr   r   r   sync_ddp_hook)   s   



rD   )typingr$   torch.nnnntorch.nn.functional
functionalr&   Moduler   r   r3   r6   
GradBucketr<   r=   r0   rD   r   r   r   r   <module>   s    $