o
    vic                  $   @   s$  d Z ddlZddlmZ ddlmZmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ eeeedZdeiZ												d*dedededededB dedB dedB dedB dedB dedB dedB dedB de dB dede!dB dee"eef B f dd Z#	!	"	"					d+dededed#e"eB d$e"eB d%e"eB de"eB dedB de dB dede!dB dee"eef B fd&d'Z$	!	"	"				d,dededed#e"eB d$e"eB d%e"eB dedB de dB dede!dB dee"eef B fd(d)Z%dS )-zl
Imaginaire4 Attention Subpackage:
Unified implementation for all Attention implementations.

Frontend APIs
    N)Tensor)choose_backendchoose_multi_dim_backend)attention_param_checksattention_tensor_checks multi_dim_attention_param_checks multi_dim_attention_param_filter!multi_dim_attention_tensor_checksvarlen_tensor_checks)cudnn_attention)flash2_attention)flash3_attention)
CausalType)natten_attentionnatten_multi_dim_attention)safe_log)cudnnnattenZflash2Zflash3r   Fquerykeyvalue	is_causalcausal_typescale	seqlens_Q
seqlens_KVcumulative_seqlen_Qcumulative_seqlen_KVmax_seqlen_Qmax_seqlen_KVbackend
return_lsebackend_kwargsreturnc                 C   s  t | ||dds
J t| ||||d t| ||||||	|
|d	\}}	}
}|du}|dur.|n| jd d }|du rD|durDd}td |durY|tvrYtd	|d
t  dt	| ||||||dd}|du r| jd |jd ko|| jd |jd k}|r| jd }|jd }|| dksJ || }| }t
j||d|d}t
j||d|d}td t||||||||	|
||||dS |du rtdtd| d|tv sJ t| | |||||||	|
|||dS )a  
    Runs Attention on given operands (Q, K, V) with the heads-last contiguous layout
        (`[batch, seqlen, heads, head_dim]`).

    Varlen Attention is only supported for the sequence-packed layout: QKV tensors have batch size
    1, and tokens from different batches are concatenated without any padding along the sequence
    dimension. Sequence lengths for different batches can be provided in two ways:
        1. `seqlens_Q` and `seqlens_KV` (less efficient): only provide the sequence lengths as
            integer tensors (must be on the same device as QKV), and cumulative and maximum sequence
            lengths are recomputed on each call.
        2. `cumulative_seqlen_{Q,KV}` and `max_seqlen_{Q,KV}` (more efficient):
            compute cumulative and maximum sequence lengths. `cumulative_seqlen_{Q,KV}` are integer
            tensors on the same device as QKV containing the cumulative sum of `seqlens_{Q,KV}`,
            with an additional `0` element in the beginning, therefore sized `batch+1`.
            `max_seqlen_{Q,KV}` are integers (not Tensors) that represent the maximum sequence
            lengths for Q and KV among all sequence batches.
            You can use `generate_varlen_parameters` to generate these
            parameters:
                ```python3
                from cosmos_policy._src.imaginaire.attention.varlen import generate_varlen_parameters
                (
                    cumulative_seqlen_Q,
                    cumulative_seqlen_KV,
                    max_seqlen_Q,
                    max_seqlen_KV,
                ) = generate_varlen_parameters(q, k, v, seqlens_Q, seqlens_KV)
                ```

    Parameters:
        query (Tensor): 4-D query tensor, with the heads-last contiguous layout
            (`[batch, seqlen_q, heads, head_dim]`)

        key (Tensor): 4-D key tensor, with the heads-last contiguous layout
            (`[batch, seqlen_kv, heads_kv, head_dim]`)

        value (Tensor): 4-D value tensor, with heads-last contiguous layout
            (`[batch, seqlen_kv, heads_kv, head_dim_v]`)

        is_causal (bool): whether or not causal masking is enabled. Default is False.

        causal_type (CausalType): causal masking mode. Choices: `CausalType.TopLeft`,
            `CausalType.BottomRight`, `CausalType.DontCare` (only valid when seqlen_q == seqlen_kv).
            Required when `is_causal = True`.

        scale (float | None): Dot product scale (attention scale). Defaults to head_dim ** -0.5.

        seqlens_Q (Tensor | None): (varlen) Optional 1-D tensor with size `batch`
            indicating the number of query tokens in each batch. Must be passed together with
            `seqlens_KV`.

        seqlens_KV (Tensor | None): (varlen) Optional 1-D tensor with size `batch`
            indicating the number of key/value tokens in each batch. Must be passed together with
            `seqlens_Q`.

        cumulative_seqlen_Q (Tensor | None): (varlen) Optional 1-D tensor with size `batch + 1`
            indicating the cumulative sum of number of query tokens in each batch, with an
            additional 0 element in the beginning. Must be passed together with
            `cumulative_seqlen_KV` and `max_seqlen_{Q,KV}`.

        cumulative_seqlen_KV (Tensor | None): (varlen) Optional 1-D tensor with size `batch + 1`
            indicating the cumulative sum of number of key/value tokens in each batch, with an
            additional 0 element in the beginning. Must be passed together with
            `cumulative_seqlen_Q` and `max_seqlen_{Q,KV}`.

        max_seqlen_Q (int | None): (varlen) Optional integer indicating the maximum query
            sequence length in all batches. Must be passed together with `cumulative_seqlen_{Q,KV}`
            and `max_seqlen_KV`.

        max_seqlen_KV (int | None): (varlen) Optional integer indicating the maximum key/value
            sequence length in all batches. Must be passed together with `cumulative_seqlen_{Q,KV}`
            and `max_seqlen_Q`.

    Other Parameters:
        backend (str | None): Backend to run with. If unspecified (default), it will try to
            select the best available.

        return_lse (bool): Whether to return the logsumexp values. Default is False.

        backend_kwargs (dict | None): Key-value pair for passing arguments specific to the backend's
            attention operator, if any. Only valid when a specific backend is selected (backend is
            not None).

    Returns:
        output (Tensor): 4-D output tensor, with the heads-last contiguous layout
            (`[batch, seqlen_q, heads, head_dim_v]`).

        logsumexp (Tensor): logsumexp tensor, with the heads-last contiguous layout
            (`[batch, seqlen_q, heads, 1]`). Only returned when return_lse is True.
    Tr   r   r   raise_error)r   r   r   r   r   )	r   r   r   r   r   r   r   r   r   N      AA backend was not specified, but got backend_kwargs. Ignoring... Selected backend=, but available choices are . F)r   r   r   r   r   	is_varlenr    r%   r   )repeatsdimoutput_sizezVBackend incompatible with GQA/MQA use case. Trying again with graph transformation... )r   r   r   r   r   r   r   r   r   r   r!   r    r"   zvCould not find a compatible Attention backend for this use case / device. Try running with debug logs to find out why.zSelected Attention backend zZ is incompatible with this use case / device. Try running with debug logs to find out why.)r   r   r   r   r   r   r   r   r   r   r!   r"   )r   r   r
   shapelogdebugBACKEND_MAP
ValueErrorkeysr   torchrepeat_interleave	attention)r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r,   Zcompatible_backendZ
is_gqa_mqaheadsZheads_kvZh_kquery_tkey_tvalue_t r>   V/data/cameron/vidgen/cosmos-policy/cosmos_policy/_src/imaginaire/attention/frontend.pyr9   7   s   m
(



r9   r&      window_sizestridedilationc                    sL  t | ||dds
J t| ||||d\}}}}t}fddt|D  t dkrt fddtD }t fd	dt|D }t fd
dt|D }t fddt|D }t fddt|D }tdd |D syJ tdd |D sJ | j| jd g|| jd | jd R  }|j|jd g||jd |jd R  }|j|jd g||jd |jd R  }t	
dd|d|d|d|d|d|d|d|d|d t||||||||||	|
dS t| ||||d tdd t|D r]tdd |D r|dkr]t	
d d|d|d! |d"ur;t	
d#|d$ | d|}|d|}|d|}|d }t|||||tj|	d%S |d"urd|n| jd d& }|d"u r||
d"ur|d"}
t	
d' t| |||d(}|tvrtd)|d*t  d+t| | ||||||||	|
d,
S )-a  
    Runs Multi-Dimensional Attention on given operands (Q, K, V) with the heads-last contiguous
    layout (`[batch, *, heads, head_dim]`). Supports up to and including 3 dimensions:
        * 1-D: `[batch, X, heads, head_dim]`, with masking arguments expecting tuples of size 1.
        * 2-D: `[batch, X, Y, heads, head_dim]`, with masking arguments expecting tuples of size 2.
        * 3-D: `[batch, X, Y, Z, heads, head_dim]`, with masking arguments expecting tuples of size 3.

    The dimensions here refer to the layout of tokens; that is the arrangement of tokens for each
    batch/head, or the `[X]`, `[X, Y]`, `[X, Y, Z]` part of the input shape.
    We refer to these as the "token layout shape".

    For now, it is always expected that Q, K, and V match in the sizes of those dimensions.

    Masking arguments, all of which can be set uniformly across all dimensions or per dimension, are:
        * `window_size`: determines the sliding window size. -1 is interpreted as the maximum window
            size. Must be either -1 or at least 2 and at most the token layout shape.
            For example, if inputs are `[batch, X, Y, Z, heads_{q,kv}, head_dim_{qk,v}]`,
            `window_size` must be either an integer == -1 or an integer <= `min(X, Y, Z)`,
            or a tuple of size 3 corresponding to the three dimensions / axes, where:
                * `window_size[0] == -1 or 2 <= window_size[0] <= X`
                * `window_size[1] == -1 or 2 <= window_size[1] <= Y`
                * `window_size[2] == -1 or 2 <= window_size[2] <= Z`
            When `window_size` is set to the maximum for any dimension, we're effectively performing
            self attention (no sparsity) along that dimension.
            Default is -1 (self attention).

        * `stride`: determines the step size of the sliding window. Only matters when the
            corresponding `window_size` is not -1 / maximum (self attention).
            Default is 1, indicating the smallest sliding window delay.
            Larger values trade off translational equivariance for potentially improved efficiency.
            Maximum value for `stride` along each dimension is the corresponding `window_size`.
            If `stride == window_size` along any dimension, it is equivalent to blocked / windowed
            attention (from works such as Swin Transformer, SAM, ViTDet, etc) along that dimension,
            meaning no overlap between windows.
            For more details, please refer to the GNA paper:
            https://arxiv.org/abs/2504.16922

        * `dilation`: introduces gaps between tokens in a sliding window, similarly to dilated
            convolution.
            Default is 1, indicating no gaps.
            Maximum value is the largest positive integer that satisfies
            `window_size * dilation <= token_layout_shape` along that dimension.
            Higher dilation means more sparse and global context. Lower dilation means more
            locality.
            For more details, please refer to the DiNAT paper:
            https://arxiv.org/abs/2209.15001

        * `is_causal`: per-dimension causal mask.

    Parameters:
        query (Tensor): 4-D, 5-D, or 6-D query tensor, with the heads-last contiguous layout
            (`[batch, *token_layout_shape, heads, head_dim]`)

        key (Tensor): 4-D, 5-D, or 6-D key tensor, with the heads-last contiguous layout
            (`[batch, *token_layout_shape, heads_kv, head_dim]`)

        value (Tensor): 4-D, 5-D, or 6-D value tensor, with heads-last contiguous layout
            (`[batch, *token_layout_shape, heads_kv, head_dim_v]`)

        window_size (tuple | int): Attention window (kernel) size / shape. If an
            integer, it will be repeated for all dimensions. For example `window_size=3`, when
            `len(token_layout_shape) == 3`, is interpreted as `window_size=(3, 3, 3)`.
            `-1`s are replaced with the corresponding `token_layout_shape`.
            Final window size must satisfy `2 <= window_size <= token_layout_shape`.
            Default is -1 (no sparsity).

        stride (tuple | int): Sliding window step size/shape. If an integer, it will be repeated
            for all dimensions. For example `stride=2`, when `len(token_layout_shape) == 3`, is
            interpreted as `stride=(2, 2, 2)`.
            Final stride must satisfy `1 <= stride <= window_size`.
            Default is 1.

        dilation (tuple | int): Dilation step size/shape. If an integer, it will be repeated for
            all dimensions. For example `dilation=4`, when `len(token_layout_shape) == 3`, is
            interpreted as `dilation=(4, 4, 4)`.
            Final dilation must satisfy `2 <= dilation * window_size <= token_layout_shape`.
            Default is 1.

        is_causal (tuple | bool): Toggle causal masking. If a boolean, it will be repeated for all
            dimensions. For example `is_causal=True`, when `len(token_layout_shape) == 3`, is
            interpreted as `is_causal=(True, True, True)`.
            Default is False.

        scale (float | None): Dot product scale (attention scale). Defaults to head_dim ** -0.5.

    Other Parameters:
        backend (str | None): Backend to run with. If unspecified (default), it will try to
            select the best available.

        return_lse (bool): Whether to return the logsumexp values. Default is False.

        backend_kwargs (dict | None): Key-value pair for passing arguments specific to the backend's
            multi-dim / sparse attention operator, if any. Only valid when a specific backend is
            selected (backend is not None).

    Returns:
        output (Tensor): 4-D, 5-D, or 6-D output tensor, with the heads-last contiguous layout
            (`[batch, *token_layout_shape, heads, head_dim_v]`).

        logsumexp (Tensor): logsumexp tensor, with the heads-last contiguous layout
            (`[batch, *token_layout_shape, heads, 1]`). Only returned when return_lse is True.
    Tr$   )rA   rB   rC   r   c                    s   g | ]
} | d kr|qS )r@   r>   ).0i)token_layout_shaper>   r?   
<listcomp>  s    z/multi_dimensional_attention.<locals>.<listcomp>r   c                 3        | ]\}}| vr|V  qd S Nr>   rD   rE   stoken_layout_onesr>   r?   	<genexpr>      z.multi_dimensional_attention.<locals>.<genexpr>c                 3   rH   rI   r>   )rD   rE   wrL   r>   r?   rN     rO   c                 3   rH   rI   r>   rJ   rL   r>   r?   rN     rO   c                 3   rH   rI   r>   )rD   rE   drL   r>   r?   rN     rO   c                 3   rH   rI   r>   )rD   rE   crL   r>   r?   rN     rO   c                 s       | ]}|d kV  qdS    Nr>   )rD   xr>   r>   r?   rN         c                 s   rS   rT   r>   )rD   rP   r>   r>   r?   rN     rW   r-   r&   zvThis Multi-Dimensional Attention problem has 1s in the token layout, which can be simplified from <token_layout_shape=z, window_size=z	, stride=z, dilation=z, is_causal=z> into <token_layout_t=z, window_size_t=z, stride_t=z, dilation_t=z, is_causal_t=z>.)r   r   r   rA   rB   rC   r   r   r    r!   r"   c                 s   s    | ]	\}}||kV  qd S rI   r>   )rD   rV   rP   r>   r>   r?   rN     s    c                 s   s    | ]}|V  qd S rI   r>   )rD   rR   r>   r>   r?   rN     s    r@   zfThis Multi-Dimensional Attention problem is implementable with standard Attention: token_layout_shape=.NzIgnoring backend=z and backend args...)r   r   r   r!   r'   r(   )r   r   r   r    r)   r*   r+   
r   r   r   rA   rB   rC   r   r   r!   r"   )r	   r   lenrangetuple	enumerateallreshaper1   r2   r3   multi_dimensional_attentionr   zipanyflattenr9   r   ZDontCarer   MULTI_DIM_BACKEND_MAPr5   r6   )r   r   r   rA   rB   rC   r   r   r    r!   r"   num_dimsZtoken_layout_tZwindow_size_tZstride_tZ
dilation_tZis_causal_tr;   r<   r=   Zquery_1dZkey_1dZvalue_1dZis_causal_1dr>   )rM   rF   r?   r`     s   u,,,	




r`   c
           
      C   s:   |   dkrtd| jdt| |||||d|||	d
S )a	  
    Runs Spatio-Temporal Attention on unflattened QKV with the heads-last contiguous layout
    (`[batch, T, H, W, heads, head_dim]`).
    For now, it is always expected that Q, K, and V match in their shapes.

    Parameters:
        query (Tensor): 6-D query tensor, with the heads-last contiguous layout
            (`[batch, T, H, W, heads, head_dim]`)

        key (Tensor): 6-D key tensor, with the heads-last contiguous layout
            (`[batch, T, H, W, heads_kv, head_dim]`)

        value (Tensor): 6-D value tensor, with heads-last contiguous layout
            (`[batch, T, H, W, heads_kv, head_dim_v]`)

        window_size (tuple | int): Attention window (kernel) size / shape. If an
            integer, it will be repeated for all dimensions. For example `window_size=3` is
            interpreted as `window_size=(3, 3, 3)`.
            `-1`s are replaced with the corresponding value in `(T, H, W)`.
            Default is -1 (no sparsity).

        stride (tuple | int): Sliding window step size/shape. If an integer, it will be repeated
            for all dimensions. For example `stride=2` is interpreted as `stride=(2, 2, 2)`.
            Final stride must satisfy `1 <= stride <= window_size`.
            Default is 1.

        dilation (tuple | int): Dilation step size/shape. If an integer, it will be repeated for
            all dimensions. For example `dilation=4` is interpreted as `dilation=(4, 4, 4)`.
            Final dilation must satisfy `2 <= dilation * window_size <= (T, H, W)`.
            Default is 1.

        scale (float | None): Dot product scale (attention scale). Defaults to head_dim ** -0.5.

    Other Parameters:
        backend (str | None): Backend to run with. If unspecified (default), it will try to
            select the best available.

        return_lse (bool): Whether to return the logsumexp values. Default is False.

        backend_kwargs (dict | None): Key-value pair for passing arguments specific to the backend's
            multi-dim / sparse attention operator, if any. Only valid when a specific backend is
            selected (backend is not None).

    Returns:
        output (Tensor): 6-D output tensor, with the heads-last contiguous layout
            (`[batch, T, H, W, heads, head_dim_v]`).

        logsumexp (Tensor): logsumexp tensor, with the heads-last contiguous layout
            (`[batch, T, H, W, heads, 1]`). Only returned when return_lse is True.
       zjSpatio-Temporal Attention requires 6-D input tensors ([batch, T, H, W, heads, head_dim]), got query.shape=z).)TFFrY   )r/   r5   r1   r`   )
r   r   r   rA   rB   rC   r   r    r!   r"   r>   r>   r?   spatio_temporal_attention  s$   ?rg   )FNNNNNNNNNFN)r&   r@   r@   FNNFN)r&   r@   r@   NNFN)&__doc__r7   r   Z0cosmos_policy._src.imaginaire.attention.backendsr   r   Z.cosmos_policy._src.imaginaire.attention.checksr   r   r   r   r	   r
   Z-cosmos_policy._src.imaginaire.attention.cudnnr   Z.cosmos_policy._src.imaginaire.attention.flash2r   Z.cosmos_policy._src.imaginaire.attention.flash3r   Z-cosmos_policy._src.imaginaire.attention.masksr   Z.cosmos_policy._src.imaginaire.attention.nattenr   r   Z-cosmos_policy._src.imaginaire.attention.utilsr   r2   r4   rd   boolfloatintstrdictr\   r9   r`   rg   r>   r>   r>   r?   <module>   s    		

 c

 k	
