o
    ?߱iSt                  $   @   s  d Z ddlmZ ddlmZmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ eeed
ZdeiZ												d1dedededededB dedB dedB dedB dedB dedB dedB dedB dedB dededB dee eef B f ddZ!	 	!	!					d2dededed"e eB d#e eB d$e eB de eB dedB dedB dededB dee eef B fd%d&Z"				d3dededed'ededB dedB dededB dee eef B fd(d)Z#	 	!	!				d4dededed"e eB d#e eB d$e eB dedB dedB dededB dee eef B fd*d+Z$	d5d,e%e d-e%e d.ede eef fd/d0Z&dS )6zl
Imaginaire4 Attention Subpackage:
Unified implementation for all Attention implementations.

Frontend APIs
    )Tensor)choose_backendchoose_multi_dim_backend)attention_param_checksattention_tensor_checks multi_dim_attention_param_checks multi_dim_attention_param_filter!multi_dim_attention_tensor_checksvarlen_tensor_checks)flash2_attention)flash3_attention)
CausalType)natten_attentionnatten_multi_dim_attention)safe_log)nattenZflash2Zflash3r   FNquerykeyvalue	is_causalcausal_typescale	seqlens_Q
seqlens_KVcumulative_seqlen_Qcumulative_seqlen_KVmax_seqlen_Qmax_seqlen_KVbackend
return_lsebackend_kwargsreturnc                 C   s*  t | ||dds
J t| ||||d t| ||||||	|
|d	\}}	}
}|du}|dur.|n| jd d }|du rD|durDd}td |durY|tvrYtd	|d
t  dt	| ||||||dd}|du rq|du rqtd|du r}td| d|tv sJ t| | |||||||	|
|||dS )a  
    Runs Attention on given operands (Q, K, V) with the heads-last contiguous layout
        (`[batch, seqlen, heads, head_dim]`).

    Varlen Attention is only supported for the sequence-packed layout: QKV tensors have batch size
    1, and tokens from different batches are concatenated without any padding along the sequence
    dimension. Sequence lengths for different batches can be provided in two ways:
        1. `seqlens_Q` and `seqlens_KV` (less efficient): only provide the sequence lengths as
            integer tensors (must be on the same device as QKV), and cumulative and maximum sequence
            lengths are recomputed on each call.
        2. `cumulative_seqlen_{Q,KV}` and `max_seqlen_{Q,KV}` (more efficient):
            compute cumulative and maximum sequence lengths. `cumulative_seqlen_{Q,KV}` are integer
            tensors on the same device as QKV containing the cumulative sum of `seqlens_{Q,KV}`,
            with an additional `0` element in the beginning, therefore sized `batch+1`.
            `max_seqlen_{Q,KV}` are integers (not Tensors) that represent the maximum sequence
            lengths for Q and KV among all sequence batches.
            You can use `generate_varlen_parameters` to generate these
            parameters:
                ```python3
                from cosmos_predict2._src.imaginaire.attention.varlen import generate_varlen_parameters
                (
                    cumulative_seqlen_Q,
                    cumulative_seqlen_KV,
                    max_seqlen_Q,
                    max_seqlen_KV,
                ) = generate_varlen_parameters(q, k, v, seqlens_Q, seqlens_KV)
                ```

    Parameters:
        query (Tensor): 4-D query tensor, with the heads-last contiguous layout
            (`[batch, seqlen_q, heads, head_dim]`)

        key (Tensor): 4-D key tensor, with the heads-last contiguous layout
            (`[batch, seqlen_kv, heads_kv, head_dim]`)

        value (Tensor): 4-D value tensor, with heads-last contiguous layout
            (`[batch, seqlen_kv, heads_kv, head_dim_v]`)

        is_causal (bool): whether or not causal masking is enabled. Default is False.

        causal_type (CausalType): causal masking mode. Choices: `CausalType.TopLeft`,
            `CausalType.BottomRight`, `CausalType.DontCare` (only valid when seqlen_q == seqlen_kv).
            Required when `is_causal = True`.

        scale (float | None): Dot product scale (attention scale). Defaults to head_dim ** -0.5.

        seqlens_Q (Tensor | None): (varlen) Optional 1-D tensor with size `batch`
            indicating the number of query tokens in each batch. Must be passed together with
            `seqlens_KV`.

        seqlens_KV (Tensor | None): (varlen) Optional 1-D tensor with size `batch`
            indicating the number of key/value tokens in each batch. Must be passed together with
            `seqlens_Q`.

        cumulative_seqlen_Q (Tensor | None): (varlen) Optional 1-D tensor with size `batch + 1`
            indicating the cumulative sum of number of query tokens in each batch, with an
            additional 0 element in the beginning. Must be passed together with
            `cumulative_seqlen_KV` and `max_seqlen_{Q,KV}`.

        cumulative_seqlen_KV (Tensor | None): (varlen) Optional 1-D tensor with size `batch + 1`
            indicating the cumulative sum of number of key/value tokens in each batch, with an
            additional 0 element in the beginning. Must be passed together with
            `cumulative_seqlen_Q` and `max_seqlen_{Q,KV}`.

        max_seqlen_Q (int | None): (varlen) Optional integer indicating the maximum query
            sequence length in all batches. Must be passed together with `cumulative_seqlen_{Q,KV}`
            and `max_seqlen_KV`.

        max_seqlen_KV (int | None): (varlen) Optional integer indicating the maximum key/value
            sequence length in all batches. Must be passed together with `cumulative_seqlen_{Q,KV}`
            and `max_seqlen_Q`.

    Other Parameters:
        backend (str | None): Backend to run with. If unspecified (default), it will try to
            select the best available.

        return_lse (bool): Whether to return the logsumexp values. Default is False.

        backend_kwargs (dict | None): Key-value pair for passing arguments specific to the backend's
            attention operator, if any. Only valid when a specific backend is selected (backend is
            not None).

    Returns:
        output (Tensor): 4-D output tensor, with the heads-last contiguous layout
            (`[batch, seqlen_q, heads, head_dim_v]`).

        logsumexp (Tensor): logsumexp tensor, with the heads-last contiguous layout
            (`[batch, seqlen_q, heads, 1]`). Only returned when return_lse is True.
            NOTE: this tensor is not guaranteed to be contiguous with some backends and it should
            not be made contiguous unless we can guarantee its results aren't merged via
            `merge_attentions`.
    Tr   r   r   raise_error)r   r   r   r   r   )	r   r   r   r   r   r   r   r   r   N      AA backend was not specified, but got backend_kwargs. Ignoring... Selected backend=, but available choices are . F)r   r   r   r   r   	is_varlenr   r#   zvCould not find a compatible Attention backend for this use case / device. Try running with debug logs to find out why.zSelected Attention backend zZ is incompatible with this use case / device. Try running with debug logs to find out why.)r   r   r   r   r   r   r   r   r   r   r   r    )
r   r   r
   shapelogdebugBACKEND_MAP
ValueErrorkeysr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r*   Zcompatible_backend r1   \/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/imaginaire/attention/frontend.py	attention4   s   p

r3   r$      window_sizestridedilationc                    s  t | ||dds
J t| ||||d\}}}}t}fddt|D  t dkr%t fddtD }t fd	dt|D }t fd
dt|D }t fddt|D }t fddt|D }tdd |D szJ tdd |D sJ | j| jd g|| jd | jd R  }|j|jd g||jd |jd R  }|j|jd g||jd |jd R  }dd | jdd D |jd g }t	
dd|d|d|d|d|d|d|d|d|d t|||||||||d|
d\}}|j| }|j|dd  }|	r#||fS |S t| ||||d tdd t|D rtd d |D rJ|d!krt	
d"d|d|d# |durgt	
d$|d% | d!|}|d!|}|d!|}|d }d&d | jdd D |jd g }t|||||tjdd'\}}|j| }|j|dd  }|	r||fS |S |dur|n| jd d( }|du r|
durd}
t	
d) t| |||d*}|tvrtd+|d,t  d-t| | ||||||||	|
d.
S )/a  
    Runs Multi-Dimensional Attention on given operands (Q, K, V) with the heads-last contiguous
    layout (`[batch, *, heads, head_dim]`). Supports up to and including 3 dimensions:
        * 1-D: `[batch, X, heads, head_dim]`, with masking arguments expecting tuples of size 1.
        * 2-D: `[batch, X, Y, heads, head_dim]`, with masking arguments expecting tuples of size 2.
        * 3-D: `[batch, X, Y, Z, heads, head_dim]`, with masking arguments expecting tuples of size 3.

    The dimensions here refer to the layout of tokens; that is the arrangement of tokens for each
    batch/head, or the `[X]`, `[X, Y]`, `[X, Y, Z]` part of the input shape.
    We refer to these as the "token layout shape".

    For now, it is always expected that Q, K, and V match in the sizes of those dimensions.

    Masking arguments, all of which can be set uniformly across all dimensions or per dimension, are:
        * `window_size`: determines the sliding window size. -1 is interpreted as the maximum window
            size. Must be either -1 or at least 2 and at most the token layout shape.
            For example, if inputs are `[batch, X, Y, Z, heads_{q,kv}, head_dim_{qk,v}]`,
            `window_size` must be either an integer == -1 or an integer <= `min(X, Y, Z)`,
            or a tuple of size 3 corresponding to the three dimensions / axes, where:
                * `window_size[0] == -1 or 2 <= window_size[0] <= X`
                * `window_size[1] == -1 or 2 <= window_size[1] <= Y`
                * `window_size[2] == -1 or 2 <= window_size[2] <= Z`
            When `window_size` is set to the maximum for any dimension, we're effectively performing
            self attention (no sparsity) along that dimension.
            Default is -1 (self attention).

        * `stride`: determines the step size of the sliding window. Only matters when the
            corresponding `window_size` is not -1 / maximum (self attention).
            Default is 1, indicating the smallest sliding window delay.
            Larger values trade off translational equivariance for potentially improved efficiency.
            Maximum value for `stride` along each dimension is the corresponding `window_size`.
            If `stride == window_size` along any dimension, it is equivalent to blocked / windowed
            attention (from works such as Swin Transformer, SAM, ViTDet, etc) along that dimension,
            meaning no overlap between windows.
            For more details, please refer to the GNA paper:
            https://arxiv.org/abs/2504.16922

        * `dilation`: introduces gaps between tokens in a sliding window, similarly to dilated
            convolution.
            Default is 1, indicating no gaps.
            Maximum value is the largest positive integer that satisfies
            `window_size * dilation <= token_layout_shape` along that dimension.
            Higher dilation means more sparse and global context. Lower dilation means more
            locality.
            For more details, please refer to the DiNAT paper:
            https://arxiv.org/abs/2209.15001

        * `is_causal`: per-dimension causal mask.

    Parameters:
        query (Tensor): 4-D, 5-D, or 6-D query tensor, with the heads-last contiguous layout
            (`[batch, *token_layout_shape, heads, head_dim]`)

        key (Tensor): 4-D, 5-D, or 6-D key tensor, with the heads-last contiguous layout
            (`[batch, *token_layout_shape, heads_kv, head_dim]`)

        value (Tensor): 4-D, 5-D, or 6-D value tensor, with heads-last contiguous layout
            (`[batch, *token_layout_shape, heads_kv, head_dim_v]`)

        window_size (tuple | int): Attention window (kernel) size / shape. If an
            integer, it will be repeated for all dimensions. For example `window_size=3`, when
            `len(token_layout_shape) == 3`, is interpreted as `window_size=(3, 3, 3)`.
            `-1`s are replaced with the corresponding `token_layout_shape`.
            Final window size must satisfy `2 <= window_size <= token_layout_shape`.
            Default is -1 (no sparsity).

        stride (tuple | int): Sliding window step size/shape. If an integer, it will be repeated
            for all dimensions. For example `stride=2`, when `len(token_layout_shape) == 3`, is
            interpreted as `stride=(2, 2, 2)`.
            Final stride must satisfy `1 <= stride <= window_size`.
            Default is 1.

        dilation (tuple | int): Dilation step size/shape. If an integer, it will be repeated for
            all dimensions. For example `dilation=4`, when `len(token_layout_shape) == 3`, is
            interpreted as `dilation=(4, 4, 4)`.
            Final dilation must satisfy `2 <= dilation * window_size <= token_layout_shape`.
            Default is 1.

        is_causal (tuple | bool): Toggle causal masking. If a boolean, it will be repeated for all
            dimensions. For example `is_causal=True`, when `len(token_layout_shape) == 3`, is
            interpreted as `is_causal=(True, True, True)`.
            Default is False.

        scale (float | None): Dot product scale (attention scale). Defaults to head_dim ** -0.5.

    Other Parameters:
        backend (str | None): Backend to run with. If unspecified (default), it will try to
            select the best available.

        return_lse (bool): Whether to return the logsumexp values. Default is False.

        backend_kwargs (dict | None): Key-value pair for passing arguments specific to the backend's
            multi-dim / sparse attention operator, if any. Only valid when a specific backend is
            selected (backend is not None).

    Returns:
        output (Tensor): 4-D, 5-D, or 6-D output tensor, with the heads-last contiguous layout
            (`[batch, *token_layout_shape, heads, head_dim_v]`).

        logsumexp (Tensor): logsumexp tensor, with the heads-last contiguous layout
            (`[batch, *token_layout_shape, heads]`). Only returned when return_lse is True.
    Tr"   )r5   r6   r7   r   c                    s   g | ]
} | d kr|qS )r4   r1   ).0i)token_layout_shaper1   r2   
<listcomp>t  s    z/multi_dimensional_attention.<locals>.<listcomp>r   c                 3        | ]\}}| vr|V  qd S Nr1   r8   r9   stoken_layout_onesr1   r2   	<genexpr>v      z.multi_dimensional_attention.<locals>.<genexpr>c                 3   r<   r=   r1   )r8   r9   wr@   r1   r2   rB   w  rC   c                 3   r<   r=   r1   r>   r@   r1   r2   rB   x  rC   c                 3   r<   r=   r1   )r8   r9   dr@   r1   r2   rB   y  rC   c                 3   r<   r=   r1   )r8   r9   cr@   r1   r2   rB   z  rC   c                 s       | ]}|d kV  qdS    Nr1   r8   xr1   r1   r2   rB   |      c                 s   rG   rH   r1   )r8   rD   r1   r1   r2   rB   }  rL   r$   c                 S      g | ]}|qS r1   r1   rJ   r1   r1   r2   r;         NzvThis Multi-Dimensional Attention problem has 1s in the token layout, which can be simplified from <token_layout_shape=z, window_size=z	, stride=z, dilation=z, is_causal=z> into <token_layout_t=z, window_size_t=z, stride_t=z, dilation_t=z, is_causal_t=z>.r   r   r   r5   r6   r7   r   r   r   r   r    c                 s   s    | ]	\}}||kV  qd S r=   r1   )r8   rK   rD   r1   r1   r2   rB     s    c                 s   s    | ]}|V  qd S r=   r1   )r8   rF   r1   r1   r2   rB     s    r4   zfThis Multi-Dimensional Attention problem is implementable with standard Attention: token_layout_shape=.zIgnoring backend=z and backend args...c                 S   rN   r1   r1   rJ   r1   r1   r2   r;     rO   )r   r   r   r   r%   r&   )r   r   r   r   r'   r(   r)   )
r   r   r   r5   r6   r7   r   r   r   r    )r	   r   lenrangetuple	enumerateallreshaper+   r,   r-   multi_dimensional_attentionr   zipanyflattenr3   r   ZDontCarer   MULTI_DIM_BACKEND_MAPr/   r0   )r   r   r   r5   r6   r7   r   r   r   r   r    num_dimsZtoken_layout_tZwindow_size_tZstride_tZ
dilation_tZis_causal_tquery_tZkey_tvalue_toutput_shapeoutput_tZlse_toutputlseZquery_1dZkey_1dZvalue_1dZis_causal_1dZ	output_1dZlse_1dr1   )rA   r:   r2   rX      s   u,,,$

	

$

	

rX   metadatac           
   	   C   s\   ddl m} | std|dur|dkrtd|dddlm}	 |	| ||||||d	S )
a  
    Runs Variable-Length Multi-Dimensional Attention on sequence-packed QKV tensors.

    This operation performs sparse/multi-dimensional attention on variable-length sequences
    where tokens from different samples with different spatial layouts are concatenated
    along the sequence dimension. Each sample can have its own spatial dimensions
    (e.g., different height/width for 2D layouts).

    The metadata should be pre-computed using `configure_varlen_metadata` and reused
    across forward/backward passes for efficiency.

    **Requires NATTEN >= 0.21.6.dev1 and Blackwell DC-class architecture**

    Parameters:
        query (Tensor): 4-D query tensor with sequence-packed layout
            (`[1, seqlen_total, heads, head_dim]`)

        key (Tensor): 4-D key tensor with sequence-packed layout
            (`[1, seqlen_total, heads_kv, head_dim]`)

        value (Tensor): 4-D value tensor with sequence-packed layout
            (`[1, seqlen_total, heads_kv, head_dim_v]`)

        metadata (dict): Pre-computed varlen metadata from `imaginaire.varlen.generate_multi_dim_varlen_parameters`.

        scale (float | None): Attention scale. Defaults to head_dim ** -0.5.

    Other Parameters:
        backend (str | None): Backend to run with. If unspecified (default), it will try to
            select the best available.

        return_lse (bool): Whether to return logsumexp values. Default is False.

        backend_kwargs (dict | None): Backend-specific arguments.

    Returns:
        output (Tensor): 4-D output tensor with sequence-packed layout
            (`[1, seqlen_total, heads, head_dim_v]`).

        logsumexp (Tensor): logsumexp tensor (`[1, seqlen_total, heads]`).
            Only returned when return_lse is True.
    r   natten_supportedQmerge_attentions requires NATTEN. Please upgrade NATTEN to use attention merging.Nr   zYmulti_dimensional_attention_varlen currently only supports 'natten' backend, got backend=rQ   )!natten_multi_dim_attention_varlen)r   r   r   rd   r   r   r    )0cosmos_predict2._src.imaginaire.attention.nattenrf   RuntimeErrorr/   Z:cosmos_predict2._src.imaginaire.attention.natten.functionsrh   )
r   r   r   rd   r   r   r   r    rf   rh   r1   r1   r2   "multi_dimensional_attention_varlen  s"   5
rk   c
           
      C   s<   |   dkrtd| jdt| |||||d||||	dS )a	  
    Runs Spatio-Temporal Attention on unflattened QKV with the heads-last contiguous layout
    (`[batch, T, H, W, heads, head_dim]`).
    For now, it is always expected that Q, K, and V match in their shapes.

    Parameters:
        query (Tensor): 6-D query tensor, with the heads-last contiguous layout
            (`[batch, T, H, W, heads, head_dim]`)

        key (Tensor): 6-D key tensor, with the heads-last contiguous layout
            (`[batch, T, H, W, heads_kv, head_dim]`)

        value (Tensor): 6-D value tensor, with heads-last contiguous layout
            (`[batch, T, H, W, heads_kv, head_dim_v]`)

        window_size (tuple | int): Attention window (kernel) size / shape. If an
            integer, it will be repeated for all dimensions. For example `window_size=3` is
            interpreted as `window_size=(3, 3, 3)`.
            `-1`s are replaced with the corresponding value in `(T, H, W)`.
            Default is -1 (no sparsity).

        stride (tuple | int): Sliding window step size/shape. If an integer, it will be repeated
            for all dimensions. For example `stride=2` is interpreted as `stride=(2, 2, 2)`.
            Final stride must satisfy `1 <= stride <= window_size`.
            Default is 1.

        dilation (tuple | int): Dilation step size/shape. If an integer, it will be repeated for
            all dimensions. For example `dilation=4` is interpreted as `dilation=(4, 4, 4)`.
            Final dilation must satisfy `2 <= dilation * window_size <= (T, H, W)`.
            Default is 1.

        scale (float | None): Dot product scale (attention scale). Defaults to head_dim ** -0.5.

    Other Parameters:
        backend (str | None): Backend to run with. If unspecified (default), it will try to
            select the best available.

        return_lse (bool): Whether to return the logsumexp values. Default is False.

        backend_kwargs (dict | None): Key-value pair for passing arguments specific to the backend's
            multi-dim / sparse attention operator, if any. Only valid when a specific backend is
            selected (backend is not None).

    Returns:
        output (Tensor): 6-D output tensor, with the heads-last contiguous layout
            (`[batch, T, H, W, heads, head_dim_v]`).

        logsumexp (Tensor): logsumexp tensor, with the heads-last contiguous layout
            (`[batch, T, H, W, heads]`). Only returned when return_lse is True.
       zjSpatio-Temporal Attention requires 6-D input tensors ([batch, T, H, W, heads, head_dim]), got query.shape=z).)TFFrP   )dimr/   r+   rX   )
r   r   r   r5   r6   r7   r   r   r   r    r1   r1   r2   spatio_temporal_attention0  s&   ?rn   outputslse_tensorstorch_compilec                 C   s6   ddl m} | stdddlm} || ||ddS )a%  
    Merges multiple attention outputs that share the same query.

    **NOTE: the user is responsible for ensuring ALL output and LSE tensors have the same data
    pointer as the outputs from the corresponding Attention operations for correct backpropagation!**

    **NOTE: requires NATTEN**

    **NOTE: for backpropagation, only two outputs can be merged for now.**

    Takes multiple attention outputs computed from the same set of query but w.r.t. different
    key/value pairs, and merges them as if all key/value pairs had been concatenated.
    This enables patterns like:
    - Combining local and global attention (e.g., sparse + dense context)
    - Pipelined context parallelism

    The merge operation correctly combines the attention outputs using their logsumexp values
    to produce a result equivalent to attending over the concatenated key/value pairs.

    Parameters:
        outputs (list[Tensor]): List of 4-D attention output tensors, with the heads-last layout
            (`[batch, seqlen, heads, head_dim]`). Must contain at least 2 tensors.

        lse_tensors (list[Tensor]): List of 3-D logsumexp tensors, with the heads-last layout
            (`[batch, seqlen, heads]`). Must match length of `outputs`.

        torch_compile (bool): Attempt to use `torch.compile` to fuse the underlying elementwise
            operations. Default is False.

    Returns:
        output (Tensor): Merged attention output tensor (`[batch, seqlen, heads, head_dim]`).

        logsumexp (Tensor): Updated logsumexp tensor (`[batch, seqlen, heads]`).
    r   re   rg   )merge_attentionsT)ro   rp   rq   Zuse_autograd_fix)ri   rf   rj   Znatten.functionalrr   )ro   rp   rq   rf   Znatten_merge_attentionsr1   r1   r2   rr     s   (rr   )FNNNNNNNNNFN)r$   r4   r4   FNNFN)NNFN)r$   r4   r4   NNFN)F)'__doc__torchr   Z2cosmos_predict2._src.imaginaire.attention.backendsr   r   Z0cosmos_predict2._src.imaginaire.attention.checksr   r   r   r   r	   r
   Z0cosmos_predict2._src.imaginaire.attention.flash2r   Z0cosmos_predict2._src.imaginaire.attention.flash3r   Z/cosmos_predict2._src.imaginaire.attention.masksr   ri   r   r   Z/cosmos_predict2._src.imaginaire.attention.utilsr   r,   r.   r\   boolfloatintstrdictrT   r3   rX   rk   rn   listrr   r1   r1   r1   r2   <module>   s<   		

 B

 x	
Q	

W
