o
    ?߱it                     @   s   d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	 									dded	ed
ede
de	dB dedB dedB dedB dedB dedB de
dedB deeeef B fddZdS )z
Imaginaire4 Attention Subpackage:
Unified implementation for all Attention implementations.

Flash Attention v2 (flash2) Backend: intermediate APIs
Only safe to import when FLASH2_SUPPORTED is True.
    )flash_attn_funcflash_attn_varlen_func)Tensor)flash2_attention_check)
CausalTypeFNquerykeyvalue	is_causalcausal_typescalecumulative_seqlen_Qcumulative_seqlen_KVmax_seqlen_Qmax_seqlen_KV
return_lsebackend_kwargsreturnc                 C   s  |du}t | |||||ddsJ |dur|n| jd d }|dur$|ni }|r| jd |jd   krA|jd   krAdksDJ  J | d}|d}|d}| |   kri|   kridkslJ  J td| d|d|d||||	||dd	
|\}}}| dksJ | d
ksJ |d}|d}ntd| ||||dd|\}}}t|tsJ t|tsJ | dksJ | dksJ |	dd
d}|
r||fS |S )a
  
    Runs Flash Attention v2 on given operands (Q, K, V) with the heads-last contiguous layout
        (`[batch, seqlen, heads, head_dim]`).

    Parameters:
        query (Tensor): 4-D query tensor, with the heads-last contiguous layout
            (`[batch, seqlen, heads, head_dim]`)

        key (Tensor): 4-D key tensor, with the heads-last contiguous layout
            (`[batch, seqlen_kv, heads_kv, head_dim]`)

        value (Tensor): 4-D value tensor, with heads-last contiguous layout
            (`[batch, seqlen_kv, heads_kv, head_dim_v]`)

        is_causal (bool): whether or not causal masking is enabled. Default is False.

        causal_type (CausalType): causal masking mode. Choices: `CausalType.TopLeft`,
            `CausalType.BottomRight`. Required when `is_causal = True`.

        scale (float | None): Dot product scale (attention scale). Defaults to head_dim ** -0.5.

        cumulative_seqlen_Q (Tensor | None): (varlen) Optional 1-D tensor with size `batch + 1`
            indicating the cumulative sum of number of query tokens in each batch, with an
            additional 0 element in the beginning. Must be passed together with
            `cumulative_seqlen_KV` and `max_seqlen_{Q,KV}`.

        cumulative_seqlen_KV (Tensor | None): (varlen) Optional 1-D tensor with size `batch + 1`
            indicating the cumulative sum of number of key/value tokens in each batch, with an
            additional 0 element in the beginning. Must be passed together with
            `cumulative_seqlen_Q` and `max_seqlen_{Q,KV}`.

        max_seqlen_Q (int | None): (varlen) Optional integer indicating the maximum query
            sequence length in all batches. Must be passed together with `cumulative_seqlen_{Q,KV}`
            and `max_seqlen_KV`.

        max_seqlen_KV (int | None): (varlen) Optional integer indicating the maximum key/value
            sequence length in all batches. Must be passed together with `cumulative_seqlen_{Q,KV}`
            and `max_seqlen_Q`.

    Other Parameters:
        return_lse (bool): Whether to return the logsumexp values. Default is False.

        backend_kwargs (dict | None): Key-value pair for passing arguments specific to Flash's
            attention operator, if any.

    Returns:
        output (Tensor): 4-D output tensor, with the heads-last contiguous layout
            (`[batch, seqlen, heads, head_dim_v]`).

        logsumexp (Tensor): logsumexp tensor, with the heads-last contiguous layout
            (`[batch, seqlen, heads, 1]`). Only returned when return_lse is True.
            NOTE: this tensor is not contiguous in this backend (Flash2) and it should not be made
            contiguous unless we can guarantee its results aren't merged via `merge_attentions`.
    NT)r   r   r	   r
   r   	is_varlenraise_errorg      r         )
qkvcu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_ksoftmax_scalecausalreturn_attn_probs   )r   r   r   r    r!   r"       )
r   shapesqueezedimr   	unsqueezer   
isinstancer   permute)r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   outZlse__outputlser%   r%   d/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/imaginaire/attention/flash2/functions.pyflash2_attention   sl   E

8


2
r1   )	FNNNNNNFN)__doc__flash_attn.flash_attn_interfacer   r   torchr   7cosmos_predict2._src.imaginaire.attention.flash2.checksr   /cosmos_predict2._src.imaginaire.attention.masksr   boolfloatintdicttupler1   r%   r%   r%   r0   <module>   sR   	
