o
    viF                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ d8dededededef
ddZ			
	
	
	d9dedededeej dB deej dB dededededB defddZ			
	
	
	d9dedededeej dB deej dB dededededB defddZ						d:dededededB dedB dedB dedB dedB d edB deddeef eeeeef B fd!d"Zdededed#ed$e
f
d%d&Z			
	
	
	d9dedededeej dB deej dB dededededB defd'd(Zd)ed*ed+ededB fd,d-Z	.	/	/	0d;ded1eeB d2eeB d3eeB d#eeB deeeeeeef fd4d5Zded1ed2ed3ed#ef
d6d7ZdS )<z
Imaginaire4 Attention Subpackage:
Unified implementation for all Attention implementations.

Common, op-specific, and backend-specific checks
    )Sequence)partial)AnyN)Tensor)
CausalType)log_or_raise_error)generate_varlen_parametersTquerykeyvalueraise_errorreturnc                 C   s   t t|d}| js|js|jr|dtd dS | js |js |jr(|dtd dS | j|jks4| j|jkrI|d| jd|jd|jd	td dS | j|jksU| j|jkrj|d
| jd|jd|jd	td dS dS )Nr   z/This operation does not support sparse tensors.	exceptionFz/This operation does not support nested tensors.zCQuery, key, and value must be on the same device, got query.device=z, key.device=z, value.device=.zFQuery, key, and value must assume the same data type, got query.dtype=z, key.dtype=z, value.dtype=T)r   r   	is_sparseNotImplementedError	is_nesteddevice
ValueErrordtype)r	   r
   r   r   	target_fn r   T/data/cameron/vidgen/cosmos-policy/cosmos_policy/_src/imaginaire/attention/checks.py_universal_tensor_checks#   s(   r   supported_dtypes_forwardsupported_dtypes_backwardsupports_mlasupports_gqa_mqabackend_namec	              	   C   s  |pd}t | |||dsdS tt|d}	|  | ks$|  | kr<|	d|  d| d| dtd dS | jd	 |jd	 ksP| jd	 |jd	 krk|	d
| jd	 d|jd	 d|jd	 dtd dS | jd |jd kr|	d| jd d|jd dtd dS |jd |jd kr|	d|jd d|jd dtd dS |s| jd |jd kr|	| d| jd d|jd dtd dS |s| jd |jd ks| jd |jd kr|	| d| jd d|jd d|jd dtd dS |r'| jd }
|jd }|
|k s|
| d	kr'|	d|
d|dtd dS |d urD| j|vrD|	| d| j d| dtd dS |d ure| jre| j|vre|	| d| j d| dtd dS dS )N	Attentionr   Fz5Q, K, and V must have the same rank, got query.dim()=z, key.dim()=z, value.dim()=r   r   r   9Q, K, and V must match in batch size, got query.shape[0]=, key.shape[0]=, value.shape[0]=z2Q and K head dims must match, got query.shape[-1]=z, key.shape[-1]=zDK and V must always have the same number of heads, got key.shape[2]=   z, value.shape[2]=zH does not support different head dims for QK and V, got query.shape[-1]=z, value.shape[-1]=zd does not support GQA/MQA, therefore number of heads in Q, K, and V must match, got query.shape[-2]=z, key.shape[-2]=z, value.shape[-2]=z1KV heads must evenly divide Q heads, got heads_q=z, heads_kv=z: does not support forward pass (inference) with data type z; supported dtypes: z: does not support backward pass (training) with data type T)r   r   r   dimr   shaper   requires_grad)r	   r
   r   r   r   r   r   r   r    r   Zheads_qheads_kvr   r   r   _universal_attention_checks?   s    "((,

r,   c	           
      C   s   |pd}t | |||dsdS t| ||||||||d	sdS tt|d}	|  dkr7|	d|  dtd dS |jd	 |jd	 krV|	d
|jd	 d|jd	 dtd dS dS )Nr!   r   F	r	   r
   r   r   r   r   r   r   r       z9Attention expects 4-D tensors as inputs, got query.dim()=r   r      z8K and V must match in sequence length, got key.shape[1]=z, value.shape[1]=Tr   r,   r   r   r(   r   r)   )
r	   r
   r   r   r   r   r   r   r    r   r   r   r   attention_tensor_checks   s:   r1   	seqlens_Q
seqlens_KVcumulative_seqlen_Qcumulative_seqlen_KVmax_seqlen_Qmax_seqlen_KVc	                 C   s  | j d |j d ks| j d |j d kr+td| j d d|j d d|j d dtdd ||||fD rEtdd ||fD rEd	S |d usM|d urVt| ||||d
S tdd ||||fD sntdd ||fD rrtd| j d dkrtd| j d d|d usJ |d usJ |d usJ |d usJ t|trt|tstdt|dt|d| j d }	|j d }
||	krtd|d|	d||
krtd|d|
d|dk s|dk rtd|d|dt|trt|tstd|j	| j	ks|j	| j	krtd|j	d|j	d| j	d|j
tjks+|j
tjkr8td|j
d|j
d| dksF| dkrUtd| d| d|j d |j d krmtd |j d!|j d|j d d"k rtd#|j d!|j d||||fS )$Nr   r"   r#   r$   r   c                 s       | ]}|d u V  qd S Nr   .0xr   r   r   	<genexpr>   
    
z'varlen_tensor_checks.<locals>.<genexpr>c                 s   s     | ]}|d u p|dkV  qd S )Nr   r   r:   r   r   r   r=      s
    
)NNr   r   )r	   r
   r   r2   r3   c                 s   r8   r9   r   r:   r   r   r   r=      r>   c                 s   s    | ]}|d kV  qdS )r   Nr   r:   r   r   r   r=      r>   zwVariable length Attention requires all 6 of cumulative_seqlen_{Q,KV}, max_seqlen_{Q,KV}, total_seqlen_{Q,KV} to be set.r/   zfVariable length Attention only supports sequence-packed memory layout (batch = 1), got query.shape[0]=zDmax_seqlen_Q and max_seqlen_KV must be ints, got type(max_seqlen_Q)=z, type(max_seqlen_KV)=z>Maximum sequence length cannot exceed total, got max_seqlen_Q=z, total_seqlen_Q=z?Maximum sequence length cannot exceed total, got max_seqlen_KV=z, total_seqlen_KV=z@Maximum sequence length cannot be less than 1, got max_seqlen_Q=z, max_seqlen_KV=zBcumulative_seqlen_Q and cumulative_seqlen_KV must both be tensors.zocumulative_seqlen_Q and cumulative_seqlen_KV must be on the same device as QKV, but cumulative_seqlen_Q.device=z, cumulative_seqlen_KV.device=z, query.device=zmcumulative_seqlen_Q and cumulative_seqlen_KV must both be torch.int32 tensors, got cumulative_seqlen_Q.dtype=z, cumulative_seqlen_KV.dtype=zecumulative_seqlen_Q and cumulative_seqlen_KV must both be 1-D tensors, got cumulative_seqlen_Q.dim()=z, cumulative_seqlen_KV.dim()=z_cumulative_seqlen_Q and cumulative_seqlen_KV must match in size, got cumulative_seqlen_Q.shape=z, cumulative_seqlen_KV.shape=r'   zmcumulative_seqlen_Q and cumulative_seqlen_KV must contain at least 2 elements, got cumulative_seqlen_Q.shape=)r)   r   allr   any
isinstanceinttyper   r   r   torchint32r(   )r	   r
   r   r2   r3   r4   r5   r6   r7   Ztotal_seqlen_QZtotal_seqlen_KVr   r   r   varlen_tensor_checks   s   ((	

rF   	is_causalcausal_typec                 C   s   |r|d u st |tstd|d|  |   kr)|   kr)dks,J  J |jd |jd ks8J |rZ|tjkr\| jd |jd kr^td| jd d|jd dd S d S d S )NznArgument causal_type must be specified as an enum instance of CausalType when is_causal=True, got causal_type=r   r.   r/   zWCausal mask type DontCare is only valid when seqlen_q == seqlen_kv, got query.shape[1]=z, key.shape[1]=)rA   r   r   r(   r)   DontCare)r	   r
   r   rG   rH   r   r   r   attention_param_checksO  s   
2"rJ   c	                 C   s   |pd}t | |||dsdS t| ||||||||d	sdS tt|d}	|  dvr7|	d|  dtd dS |  d	 }
| jd
d
|
  }|jd
d
|
  }|jd
d
|
  }||ks`||krr|	d|d|d|dtd dS dS )NzMulti-Dimensional Attentionr   Fr-   r.         zYMulti-Dimensional Attention supports 4-D, 5-D, or 6-D tensors as inputs, got query.dim()=r   r      r/   zlQ, K and V must match in their token layout shapes in multi-dimensional attention, got q_token_layout_shape=z, k_token_layout_shape=z, v_token_layout_shape=Tr0   )r	   r
   r   r   r   r   r   r   r    r   num_dimsZq_token_layout_shapeZk_token_layout_shapeZv_token_layout_shaper   r   r   !multi_dim_attention_tensor_checksd  sN   rP   paramrO   typenamec                    sX   t  rt fddt|D S t  tr*t |kr*tfdd D r* S d S )Nc                 3   s    | ]} V  qd S r9   r   )r;   _)rQ   r   r   r=         z/check_valid_tuple_or_element.<locals>.<genexpr>c                 3   s    | ]}t | V  qd S r9   )rA   r:   )rR   r   r   r=     s    )rA   tupleranger   lenr?   )rQ   rO   rR   r   )rQ   rR   r   check_valid_tuple_or_element  s
   
,rX   r%   r/   Fwindow_sizestridedilationc                 C   s
  |   dv sJ |   d }tdd | jdd|  D }t||t}|du r3td| d|d	t||t}|du rHtd
| d|d	t||t}	|	du r]td| d|d	t||t}
|
du rrtd| d|d	tdd t||D }||||	|
fS )zF
    Converts all multi-dimensional parameters to standard types.
    rK   rN   c                 s       | ]}|V  qd S r9   r   r;   sr   r   r   r=     rT   z3multi_dim_attention_param_filter.<locals>.<genexpr>r/   Nz:Parameter 'window_size' must be either an int or tuple of z ints, got window_size=r   z5Parameter 'stride' must be either an int or tuple of z ints, got stride=z7Parameter 'dilation' must be either an int or tuple of z ints, got dilation=z;Parameter 'is_causal' must be either a boolean or tuple of z booleans, got is_causal=c                 s   s$    | ]\}}|d kr|n|V  qdS )r%   Nr   )r;   r<   wr   r   r   r=     s   " )r(   rU   r)   rX   rB   r   boolzip)r	   rY   rZ   r[   rG   rO   token_layout_shapeZwindow_size_stride_	dilation_Z
is_causal_r   r   r    multi_dim_attention_param_filter  s*   
 re   c              
   C   s<  |   dv sJ |   d }tdd | jdd|  D }tdd |D r3td|d| jd	td
d |D rDtd|dtdd t|||D rctd|d|d|d| jd		tdd |D rttd|dtdd t||D rtd|d|dtdd |D rtd|ddS )z1
    Validates multi-dimensional parameters.
    rK   rN   c                 s   r\   r9   r   r]   r   r   r   r=     rT   z3multi_dim_attention_param_checks.<locals>.<genexpr>r/   c                 s       | ]}|d kV  qdS r/   Nr   r:   r   r   r   r=         zAToken layout dimensions must all be >= 2, got token_layout_shape=z (query.shape=z).c                 s   rf   rg   r   )r;   r_   r   r   r   r=     rh   zgParameter 'window_size' must be either -1 (no sparsity) or >= 2 along every dimension, got window_size=r   c                 s   s"    | ]\}}}|| |kV  qd S r9   r   )r;   r<   r_   dr   r   r   r=     s     zsThe product of 'window_size' and 'dilation' cannot be greater than the input (token layout shape), got window_size=z, dilation=z, token_layout_shape=c                 s       | ]}|d k V  qdS rg   r   r]   r   r   r   r=     rh   z=Parameter 'stride' allows positive integers only, got stride=c                 s   s    | ]	\}}||kV  qd S r9   r   )r;   r_   r^   r   r   r   r=     s    z[Parameter 'stride' cannot be greater than window size along any dimension, got window_size=z	, stride=c                 s   rj   rg   r   )r;   ri   r   r   r   r=     rh   zAParameter 'dilation' allows positive integers only, got dilation=N)r(   rU   r)   r@   r   ra   )r	   rY   rZ   r[   rG   rO   rb   r   r   r    multi_dim_attention_param_checks  s@   
 rk   )T)NNTTTN)NNNNNN)r%   r/   r/   F)__doc__collections.abcr   	functoolsr   typingr   rD   r   -cosmos_policy._src.imaginaire.attention.masksr   -cosmos_policy._src.imaginaire.attention.utilsr   Z.cosmos_policy._src.imaginaire.attention.varlenr   r`   r   listr   strr,   r1   rB   rU   rF   rJ   rP   rC   rX   re   rk   r   r   r   r   <module>   s>    	

`	

3	

 
	

6
)