o
    viH                     @  sr   d dl mZ d dlmZ d dlmZmZ d dlZd dlm	Z	 eG dd dZ
G dd de	jZG d	d
 d
ZdS )    )annotations)	dataclass)AnyOptionalNc                   @  s>   e Zd ZU dZded< dZded< dZded< dZded< d	S )
KVCacheConfigFboolrun_with_kvstore_kvr   intcurrent_idxrecompute_cross_attn_kvN)__name__
__module____qualname__r   __annotations__r	   r   r    r   r   P/data/cameron/vidgen/cosmos-policy/cosmos_policy/_src/predict2/utils/kv_cache.pyr      s
   
 r   c                      sF   e Zd ZdZdd fddZddddZdddZddddZ  ZS ) AttentionOpWithKVCachea  A thin wrapper that adds K/V caching to an existing attention op.

    This wrapper expects the wrapped op to accept (q, k, v, attn_mask=None)
    and return attention outputs with heads already flattened on the last dim.

    Cache semantics:
    - Cache entries are stored as per-chunk tensors, where each chunk corresponds
      to one latent frame composed of HxW tokens (after patchify).
    - The `max_cache_size` capacity therefore refers to the number of latent
      frames (chunks), NOT the number of individual tokens.
    - When `max_cache_size` is None, the cache grows without an automatic
      rolling window; otherwise, it acts as a rolling window of at most
      `max_cache_size` frames. Upon overflow, the oldest frames are dropped.
    Nattn_opnn.Module | Anymax_cache_sizeOptional[int]c                   s,   t    || _| j|d d| _d| _dS )au  Initialize the KV cache wrapper.

        Args:
            attn_op: The underlying attention operation (q, k, v[, attn_mask]) -> out.
            max_cache_size: Optional capacity measured in number of latent frames
                (chunks). Each chunk is a single frame worth of HxW tokens. If None,
                the cache does not enforce a rolling capacity.
        )r   N)super__init__r   reset_kv_cachepgstream)selfr   r   	__class__r   r   r   1   s
   
	
zAttentionOpWithKVCache.__init__returnNonec                 C  s0   d| _ dg|pd | _dg|pd | _|| _dS )a  Reset/initialize the KV caches.

        Args:
            max_cache_size: Optional capacity measured in number of latent frames
                (chunks). Each chunk is a single frame worth of HxW tokens. If None,
                the cache does not enforce a rolling capacity.
        r   Ni )	start_idxk_cachev_cacher   )r   r   r   r   r   r   @   s   	
z%AttentionOpWithKVCache.reset_kv_cacheqtorch.Tensorkvkv_cache_cfgr   c                K  s  | j d ur
| jd usJ d|jr$t|j}| | j |< | | j|< |jri|jdkri| j | j|j }| j| j|j }tdd |D rIJ tdd |D rTJ t	j
||g dd}	t	j
||g dd}
n|}	|}
|jr| jd urtdt|j| j | _| j||	|
fi |S )Nz9KV cache is not initialized. Call reset_kv_cache() first.r   c                 s      | ]}|d u V  qd S Nr   .0xr   r   r   	<genexpr>e       z1AttentionOpWithKVCache.forward.<locals>.<genexpr>c                 s  r*   r+   r   r,   r   r   r   r/   f   r0      )dim)r#   r$   r	   r
   r   detachr   r"   anytorchcatr   maxr   )r   r%   r'   r(   r)   kwargsindexZ	history_kZ	history_vZk_outZv_outr   r   r   forwardN   s&   	
zAttentionOpWithKVCache.forwardp2pcp_comm_typestrc                 C  s   | j j||||d d S )N)r<   )r   set_context_parallel_group)r   process_groupranksr   r<   r   r   r   r>   t   s   z1AttentionOpWithKVCache.set_context_parallel_groupr+   )r   r   r   r   )r   r   r    r!   )
r%   r&   r'   r&   r(   r&   r)   r   r    r&   )r;   )r<   r=   )	r   r   r   __doc__r   r   r:   r>   __classcell__r   r   r   r   r   !   s    
&r   c                   @  s0   e Zd ZdZddd	d
ZdddZdddZdS )VideoSeqPoszFlattened 3D grid positions for a video clip.

    Stores flattened t/h/w indices of length L = T*H*W to enable constructing
    RoPE frequencies aligned with global positions across sequential chunks.
    NTr
   HWr    r!   c                 C  s   || _ || _|| _|d ur/|d ur/|d ur/|jtjd| _|jtjd| _|jtjd| _d S tj	
 r=tdtj	 ntd}tj| j |tjd}tj| j|tjd}	tj| j|tjd}
tj||	|
dd\}}}|d| _|d| _|d| _d S )N)dtypecudacpu)devicerG   ij)indexing)rD   rE   rF   tor5   longpos_hpos_wpos_trH   is_availablerJ   current_devicearangemeshgridreshape)r   rD   rE   rF   rP   rQ   rR   rJ   thwr   r   r   r      s    &zVideoSeqPos.__init__c                 C  s   t | j S r+   )r
   rP   numel)r   r   r   r   size   s   zVideoSeqPos.sizet_idx'VideoSeqPos'c              	   C  s   t |}|dk s|t | jkrtd| d| j dt | jt | j }|| }|| }tdt | jt | j| j|| | j|| | j|| dS )zReturn a `VideoSeqPos` view for a single frame at absolute index `t_idx`.

        This is useful for streaming / KV-cache inference where the model is run on
        one frame at a time but RoPE positions must reflect global video indices.
        r   zt_idx out of range: z (valid: [0, z))r1   )rD   rE   rF   rP   rQ   rR   )	r
   rD   
IndexErrorrE   rF   rC   rP   rQ   rR   )r   r]   Ztokens_per_framestartendr   r   r   frame   s   zVideoSeqPos.frame)NNN)rD   r
   rE   r
   rF   r
   r    r!   )r    r
   )r]   r
   r    r^   )r   r   r   rA   r   r\   rb   r   r   r   r   rC   x   s
    
rC   )
__future__r   dataclassesr   typingr   r   r5   torch.nnnnr   Moduler   rC   r   r   r   r   <module>   s   W