o
    ?߱i'M                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ dd	lmZ d?dededededef
ddZ			
	
	
	d@dedededeej dB deej dB dededededB defddZ			
	
	
	d@dedededeej dB deej dB dededededB defddZ						dAdededededB dedB dedB dedB dedB d edB deddeef eeeeef B fd!d"Zdededed#ed$e
f
d%d&Z			
	
	
	d@dedededeej dB deej dB dededededB defd'd(Z	*dBd+ed,ed-eded.ededB fd/d0Z	1	2	2	)dCd3ed4eeB d5eeB d6eeB d#eeB deeeeef fd7d8Zd3ed4ed5ed6ed#ef
d9d:Z	1	2	2	)dCded4eeB d5eeB d6eeB d#eeB deeeeeef fd;d<Zded4ed5ed6ed#ef
d=d>Z dS )Dz
Imaginaire4 Attention Subpackage:
Unified implementation for all Attention implementations.

Common, op-specific, and backend-specific checks
    )Sequence)partial)AnyN)Tensor)
CausalType)log_or_raise_error)generate_varlen_parametersTquerykeyvalueraise_errorreturnc                 C   s   t t|d}| js|js|jr|dtd dS | js |js |jr(|dtd dS | j|jks4| j|jkrI|d| jd|jd|jd	td dS | j|jksU| j|jkrj|d
| jd|jd|jd	td dS dS )Nr   z/This operation does not support sparse tensors.	exceptionFz/This operation does not support nested tensors.zCQuery, key, and value must be on the same device, got query.device=z, key.device=z, value.device=.zFQuery, key, and value must assume the same data type, got query.dtype=z, key.dtype=z, value.dtype=T)r   r   	is_sparseNotImplementedError	is_nesteddevice
ValueErrordtype)r	   r
   r   r   	target_fn r   Z/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/imaginaire/attention/checks.py_universal_tensor_checks#   s(   r   supported_dtypes_forwardsupported_dtypes_backwardsupports_mlasupports_gqa_mqabackend_namec	              	   C   s  |pd}t | |||dsdS tt|d}	|  | ks$|  | kr<|	d|  d| d| dtd dS | jd	 |jd	 ksP| jd	 |jd	 krk|	d
| jd	 d|jd	 d|jd	 dtd dS | jd |jd kr|	d| jd d|jd dtd dS |jd |jd kr|	d|jd d|jd dtd dS |s| jd |jd kr|	| d| jd d|jd dtd dS |s| jd |jd ks| jd |jd kr|	| d| jd d|jd d|jd dtd dS |r'| jd }
|jd }|
|k s|
| d	kr'|	d|
d|dtd dS |d urD| j|vrD|	| d| j d| dtd dS |d ure| jre| j|vre|	| d| j d| dtd dS dS )N	Attentionr   Fz5Q, K, and V must have the same rank, got query.dim()=z, key.dim()=z, value.dim()=r   r   r   9Q, K, and V must match in batch size, got query.shape[0]=, key.shape[0]=, value.shape[0]=z2Q and K head dims must match, got query.shape[-1]=z, key.shape[-1]=zDK and V must always have the same number of heads, got key.shape[2]=   z, value.shape[2]=zH does not support different head dims for QK and V, got query.shape[-1]=z, value.shape[-1]=zd does not support GQA/MQA, therefore number of heads in Q, K, and V must match, got query.shape[-2]=z, key.shape[-2]=z, value.shape[-2]=z1KV heads must evenly divide Q heads, got heads_q=z, heads_kv=z: does not support forward pass (inference) with data type z; supported dtypes: z: does not support backward pass (training) with data type T)r   r   r   dimr   shaper   requires_grad)r	   r
   r   r   r   r   r   r   r    r   Zheads_qZheads_kvr   r   r   _universal_attention_checks?   s    "((,

r+   c	           
      C   s   |pd}t | |||dsdS t| ||||||||d	sdS tt|d}	|  dkr7|	d|  dtd dS |jd	 |jd	 krV|	d
|jd	 d|jd	 dtd dS dS )Nr!   r   F	r	   r
   r   r   r   r   r   r   r       z9Attention expects 4-D tensors as inputs, got query.dim()=r   r      z8K and V must match in sequence length, got key.shape[1]=z, value.shape[1]=Tr   r+   r   r   r(   r   r)   )
r	   r
   r   r   r   r   r   r   r    r   r   r   r   attention_tensor_checks   s:   r0   	seqlens_Q
seqlens_KVcumulative_seqlen_Qcumulative_seqlen_KVmax_seqlen_Qmax_seqlen_KVc	                 C   s  | j d |j d ks| j d |j d kr+td| j d d|j d d|j d dtdd ||||fD rEtdd ||fD rEd	S |d usM|d urVt| ||||d
S tdd ||||fD sntdd ||fD rrtd| j d dkrtd| j d d|d usJ |d usJ |d usJ |d usJ t|trt|tstdt|dt|d| j d }	|j d }
||	krtd|d|	d||
krtd|d|
d|dk s|dk rtd|d|dt|trt|tstd|j	| j	ks|j	| j	krtd|j	d|j	d| j	d|j
tjks+|j
tjkr8td|j
d|j
d| dksF| dkrUtd| d| d|j d |j d krmtd |j d!|j d|j d d"k rtd#|j d!|j d||||fS )$Nr   r"   r#   r$   r   c                 s       | ]}|d u V  qd S Nr   .0xr   r   r   	<genexpr>   
    
z'varlen_tensor_checks.<locals>.<genexpr>c                 s   s     | ]}|d u p|dkV  qd S )Nr   r   r9   r   r   r   r<      s
    
)NNr   r   )r	   r
   r   r1   r2   c                 s   r7   r8   r   r9   r   r   r   r<      r=   c                 s   s    | ]}|d kV  qdS )r   Nr   r9   r   r   r   r<      r=   zwVariable length Attention requires all 6 of cumulative_seqlen_{Q,KV}, max_seqlen_{Q,KV}, total_seqlen_{Q,KV} to be set.r.   zfVariable length Attention only supports sequence-packed memory layout (batch = 1), got query.shape[0]=zDmax_seqlen_Q and max_seqlen_KV must be ints, got type(max_seqlen_Q)=z, type(max_seqlen_KV)=z>Maximum sequence length cannot exceed total, got max_seqlen_Q=z, total_seqlen_Q=z?Maximum sequence length cannot exceed total, got max_seqlen_KV=z, total_seqlen_KV=z@Maximum sequence length cannot be less than 1, got max_seqlen_Q=z, max_seqlen_KV=zBcumulative_seqlen_Q and cumulative_seqlen_KV must both be tensors.zocumulative_seqlen_Q and cumulative_seqlen_KV must be on the same device as QKV, but cumulative_seqlen_Q.device=z, cumulative_seqlen_KV.device=z, query.device=zmcumulative_seqlen_Q and cumulative_seqlen_KV must both be torch.int32 tensors, got cumulative_seqlen_Q.dtype=z, cumulative_seqlen_KV.dtype=zecumulative_seqlen_Q and cumulative_seqlen_KV must both be 1-D tensors, got cumulative_seqlen_Q.dim()=z, cumulative_seqlen_KV.dim()=z_cumulative_seqlen_Q and cumulative_seqlen_KV must match in size, got cumulative_seqlen_Q.shape=z, cumulative_seqlen_KV.shape=r'   zmcumulative_seqlen_Q and cumulative_seqlen_KV must contain at least 2 elements, got cumulative_seqlen_Q.shape=)r)   r   allr   any
isinstanceinttyper   r   r   torchint32r(   )r	   r
   r   r1   r2   r3   r4   r5   r6   Ztotal_seqlen_QZtotal_seqlen_KVr   r   r   varlen_tensor_checks   s   ((	

rE   	is_causalcausal_typec                 C   s   |r|d u st |tstd|d|  |   kr)|   kr)dks,J  J |jd |jd ks8J |rZ|tjkr\| jd |jd kr^td| jd d|jd dd S d S d S )NznArgument causal_type must be specified as an enum instance of CausalType when is_causal=True, got causal_type=r   r-   r.   zWCausal mask type DontCare is only valid when seqlen_q == seqlen_kv, got query.shape[1]=z, key.shape[1]=)r@   r   r   r(   r)   DontCare)r	   r
   r   rF   rG   r   r   r   attention_param_checksO  s   
2"rI   c	                 C   s   |pd}t | |||dsdS t| ||||||||d	sdS tt|d}	|  dvr7|	d|  dtd dS |  d	 }
| jd
d
|
  }|jd
d
|
  }|jd
d
|
  }||ks`||krr|	d|d|d|dtd dS dS )NzMulti-Dimensional Attentionr   Fr,   r-         zYMulti-Dimensional Attention supports 4-D, 5-D, or 6-D tensors as inputs, got query.dim()=r   r      r.   zlQ, K and V must match in their token layout shapes in multi-dimensional attention, got q_token_layout_shape=z, k_token_layout_shape=z, v_token_layout_shape=Tr/   )r	   r
   r   r   r   r   r   r   r    r   num_dimsZq_token_layout_shapeZk_token_layout_shapeZv_token_layout_shaper   r   r   !multi_dim_attention_tensor_checksd  sN   rO   FunknownparamrN   typename
param_namec                    s   t  rt fddt|D S t  tr1t |kr1tfdd D r1tdd  D S |r>td| d  dd S )Nc                 3   s    | ]} V  qd S r8   r   )r:   _)rQ   r   r   r<         z/check_valid_tuple_or_element.<locals>.<genexpr>c                 3   s    | ]}t | V  qd S r8   )r@   r9   )rR   r   r   r<     s    c                 s       | ]}|V  qd S r8   r   r9   r   r   r   r<     rU   zInvalid value for parameter z: r   )r@   tupleranger   lenr>   r   )rQ   rN   rR   r   rS   r   )rQ   rR   r   check_valid_tuple_or_element  s   
,rZ   r%   r.   token_layout_shapewindow_sizestridedilationc           
      C   s  t | trtdd | D rtd| dt| }|dv s J t||t}|du r5td| d|dt||t}|du rJtd	| d
|dt||t}|du r_td| d|dt||t}	|	du rttd| d|dtdd t| |D }||||	fS )F
    Converts all multi-dimensional parameters to standard types.
    c                 s       | ]	}t |t V  qd S r8   r@   rA   r9   r   r   r   r<         z>multi_dim_attention_param_filter_tensorless.<locals>.<genexpr>Dtoken_layout_shape must be an integer tuple, got token_layout_shape=r   r.   r'   rM   Nz:Parameter 'window_size' must be either an int or tuple of z ints, got window_size=z5Parameter 'stride' must be either an int or tuple of z ints, got stride=z7Parameter 'dilation' must be either an int or tuple of z ints, got dilation=z;Parameter 'is_causal' must be either a boolean or tuple of z booleans, got is_causal=c                 s   s$    | ]\}}|d kr|n|V  qdS )r%   Nr   )r:   r;   wr   r   r   r<     s   " )	r@   rW   r?   r   rY   rZ   rA   boolzip)
r[   r\   r]   r^   rF   rN   window_size_stride_	dilation_
is_causal_r   r   r   +multi_dim_attention_param_filter_tensorless  s,   rl   c                 C   s0  t | trtdd | D rtd| dt| }|dv s J tdd | D r1td| dtdd |D rBtd	|dtd
d t| ||D r]td|d|d| dtdd |D rntd|dtdd t||D rtd|d|dtdd |D rtd|ddS )1
    Validates multi-dimensional parameters.
    c                 s   r`   r8   ra   r9   r   r   r   r<     rb   z>multi_dim_attention_param_checks_tensorless.<locals>.<genexpr>rc   r   rd   c                 s       | ]}|d kV  qdS r.   Nr   r9   r   r   r   r<         zAToken layout dimensions must all be >= 2, got token_layout_shape=c                 s   rn   ro   r   )r:   re   r   r   r   r<     rp   zgParameter 'window_size' must be either -1 (no sparsity) or >= 2 along every dimension, got window_size=c                 s   s"    | ]\}}}|| |kV  qd S r8   r   )r:   r;   re   dr   r   r   r<     s     zsThe product of 'window_size' and 'dilation' cannot be greater than the input (token layout shape), got window_size=z, dilation=z, token_layout_shape=c                 s       | ]}|d k V  qdS ro   r   r:   sr   r   r   r<     rp   z=Parameter 'stride' allows positive integers only, got stride=c                 s   s    | ]	\}}||kV  qd S r8   r   )r:   re   rt   r   r   r   r<     rb   z[Parameter 'stride' cannot be greater than window size along any dimension, got window_size=z	, stride=c                 s   rr   ro   r   )r:   rq   r   r   r   r<     rp   zAParameter 'dilation' allows positive integers only, got dilation=N)r@   rW   r?   r   rY   rg   )r[   r\   r]   r^   rF   rN   r   r   r   +multi_dim_attention_param_checks_tensorless  s>   ru   c                 C   sd   |   dv sJ |   d }tdd | jdd|  D }t|||||d\}}}	}
||||	|
fS )r_   rJ   rM   c                 s   rV   r8   r   rs   r   r   r   r<     rU   z3multi_dim_attention_param_filter.<locals>.<genexpr>r.   r[   r\   r]   r^   rF   )r(   rW   r)   rl   )r	   r\   r]   r^   rF   rN   r[   rh   ri   rj   rk   r   r   r    multi_dim_attention_param_filter  s   
 rw   c                 C   sR   |   dv sJ |   d }tdd | jdd|  D }t|||||d dS )rm   rJ   rM   c                 s   rV   r8   r   rs   r   r   r   r<   $  rU   z3multi_dim_attention_param_checks.<locals>.<genexpr>r.   rv   N)r(   rW   r)   ru   )r	   r\   r]   r^   rF   rN   r[   r   r   r    multi_dim_attention_param_checks  s   
 
rx   )T)NNTTTN)NNNNNN)FrP   )r%   r.   r.   F)!__doc__collections.abcr   	functoolsr   typingr   rC   r   /cosmos_predict2._src.imaginaire.attention.masksr   /cosmos_predict2._src.imaginaire.attention.utilsr   Z0cosmos_predict2._src.imaginaire.attention.varlenr   rf   r   listr   strr+   r0   rA   rW   rE   rI   rO   rB   rZ   rl   ru   rw   rx   r   r   r   r   <module>   s    	

`	

3	

 
	

7

+
.
