o
    {i                     @   s   d dl Z d dlZd dlmZ d dlmZ dZdd Zdd Zej	j
G d	d
 d
ejjZG dd dejjZG dd deZdejjdedejjfddZdS )    N)LinearKMaskedBias)named_replaceg-q=c                 C   s<   t t jj}t j| td| }| | t j}||fS )N)min)torchfinfofloat8_e4m3fnmaxclampfloatEPSto)tZamax_tZmax_vZscale_tZt_fp8 r   D/data/cameron/keygrip/volume_dino_tracks/dinov3/layers/fp8_linear.pyscale   s   r   c           
   	   C   sv   t | |\}}t ||\}}tj|| |d| dd tjdd}	|	| |  tj}	|d ur9|	| }	|	S )N)   r   F)scale_ascale_bbias	out_dtypeuse_fast_accum)r   r   
_scaled_mmr   new_onesbfloat16r   )
firstZ
amax_firstZsecond_tZamax_second_tr   Z	first_fp8scale_firstZsecond_t_fp8Zscale_second_toutputr   r   r   matmul   s   	r   c                   @   s$   e Zd Zedd Zedd ZdS )Fp8LinearFnc                 C   sn   |  jddd}|  jddd}t|||||}|j| _|j| _|d ur)|jnd| _| |||  |S )NTdimkeepdimF)	absamaxr   requires_grada_requires_gradb_requires_gradbias_requires_gradsave_for_backwardr   )ctxab_tr   Zamax_aZamax_b_toutr   r   r   forward/   s   zFp8LinearFn.forwardc           
      C   s   | j \}}}| jr*|  }| jddd}||jd d}t||||d }nd }| j	r6| | }nd }| j
rB|jdd}	nd }	|||	fS )Nr   Tr    r   r   )r!   )saved_tensorsr&   r   
contiguousr#   r$   repeatshaper   r'   r(   sum)
r*   grad_outr+   r,   Zamax_bbZamax_grad_outZgrad_aZgrad_b	grad_biasr   r   r   backward=   s   
zFp8LinearFn.backwardN)__name__
__module____qualname__staticmethodr.   r7   r   r   r   r   r   -   s
    
r   c                   @   "   e Zd ZdejdejfddZdS )	Fp8Linearinputreturnc                 C   s4   t |jdd| j| j}|d|jd d }|S N)end_dimr   r   )r   applyflattenweightr   	unflattenr2   )selfr>   r-   r   r   r   r.   U   s   zFp8Linear.forwardNr8   r9   r:   r   Tensorr.   r   r   r   r   r=   T       r=   c                   @   r<   )Fp8LinearKMaskedBiasr>   r?   c                 C   sL   | j d ur| j | j nd }t|jdd| j|}|d|jd d }|S r@   )r   	bias_maskr   rC   rD   rE   rF   r2   )rG   r>   masked_biasr-   r   r   r   r.   \   s   zFp8LinearKMaskedBias.forwardNrH   r   r   r   r   rK   [   rJ   rK   root_modulefilterr?   c                   sl   t | ddtjjdtdtjjf fdd}t|| }dks&J dtj  ddl	m
} |  |S )	Nr   modulenamer?   c                    s   t | tjjr |s| S t| tjjkrt}nt| tkr"t}nJ t	t| | j
d dks8| jd dkr<td|| j
| j| jd u| jj| jjd}| j|_| j|_d7 |S )NF@   r   z_fp8 requires all dimensions to be multiples of 64 (consider using ffn_layer=swiglu64 or higher))in_featuresout_featuresr   dtypedevicer   )
isinstancer   nnLinearsearchtyper=   r   rK   strrS   rT   RuntimeErrorr   rE   rU   rV   )rP   rQ   new_cls
new_moduleZ	filter_retotal_countr   r   replaceg   s,   z'convert_linears_to_fp8.<locals>.replacezfp8: no layer found to convert)reset_cudagraph_trees)recompiler   rX   Moduler\   r   _dynamoreset_code_cachestorch._inductor.cudagraph_treesrc   )rN   rO   rb   r-   rc   r   r`   r   convert_linears_to_fp8c   s   
$

rj   )rd   r   Zdinov3.layers.attentionr   dinov3.utilsr   r   r   r   compilerallow_in_graphautogradFunctionr   rX   rY   r=   rK   rf   r\   rj   r   r   r   r   <module>   s   &"