o
     ݱi!                     @   s   d dl Z d dlZd dlZd dlmZ d dlmZ G dd dejZ	G dd dejZ
G dd dejZG d	d
 d
ejZG dd dejZG dd dejZdddZdddZdd ZdddZdd Zdd ZdS )     N)	rearrangec                       s$   e Zd Z fddZdd Z  ZS )SinusoidalPosEmbc                    s   t    || _d S )N)super__init__dim)selfr   	__class__ ?/data/cameron/vidgen/unified-world-model/models/common/utils.pyr   
   s   

zSinusoidalPosEmb.__init__c                 C   sv   | j d }td|d  }ttj||jd|  }|d d d f |d d d f  }tj| |	 fdd}|S )N   '     )devicer   )
r   mathlogtorchexparanger   catsincos)r   xhalf_dimembr
   r
   r   forward   s   
 zSinusoidalPosEmb.forward)__name__
__module____qualname__r   r   __classcell__r
   r
   r   r   r   	   s    r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )	GaussianFourierEmbz-Gaussian Fourier embeddings for noise levels.   Tc                    sH   t    || _|| _|| _t| jd | j }tj|| jd| _	d S )Nr   )requires_grad)
r   r   r   scale	trainabler   randnnn	ParameterW)r   r   r%   r&   ZW_initr   r
   r   r      s   
zGaussianFourierEmb.__init__c                 C   sF   |d d d f | j d d d f  d tj }tj| | gddS )Nr   r   r   )r*   nppir   r   r   r   )r   r   x_projr
   r
   r   r   $   s   ,zGaussianFourierEmb.forward)r#   Tr   r   r    __doc__r   r   r!   r
   r
   r   r   r"      s    
r"   c                       0   e Zd ZdZ			d	 fdd	Zdd Z  ZS )
PatchEmbed2Dz Convert image to patch embeddingr#         c                    s*   t    tj||||f||fd| _d S Nin_channelsout_channelskernel_sizestride)r   r   r(   Conv2dproj)r   
patch_size	num_chans	embed_dimr   r
   r   r   ,   s   
zPatchEmbed2D.__init__c                 C      |  |ddd}|S Nr   r   r;   flatten	transposer   r   r
   r
   r   r   :      zPatchEmbed2D.forward)r#   r2   r3   r.   r
   r
   r   r   r1   )       r1   c                       r0   )
PatchEmbed3D Convert video to patch embeddingr   r#   r#   r2   r3   c                    s"   t    tj||||d| _d S r4   )r   r   r(   Conv3dr;   )r   patch_shaper=   r>   r   r
   r   r   D   s   
zPatchEmbed3D.__init__c                 C   r?   r@   rA   rD   r
   r
   r   r   R   rE   zPatchEmbed3D.forward)rI   r2   r3   r.   r
   r
   r   r   rG   A   rF   rG   c                       s8   e Zd ZdZdddgddgf fdd	Zd	d
 Z  ZS )MultiLayerPatchEmbed3DrH   r2   i  r3   )r      rM   )r      rN   c           	   	      sz   t    t }|}tt||D ]#\}\}}|tj||||d |t|d k r2|t	  |}qtj
| | _d S )Nr5   r   )r   r   list	enumeratezipappendr(   rJ   lenGELU
Sequentialr;   )	r   r=   
embed_dimsZpatch_shapeslayersin_chansirK   Z	out_chansr   r
   r   r   \   s    
zMultiLayerPatchEmbed3D.__init__c                 C   r?   r@   rA   rD   r
   r
   r   r   s   rE   zMultiLayerPatchEmbed3D.forwardr.   r
   r
   r   r   rL   Y   s    rL   c                       s2   e Zd ZdZ				d
 fdd	Zdd	 Z  ZS )MultiviewPatchEmbed3Dz4Convert video with multiple views to patch embeddingrI   r   r2   r3   c                    s2   t    tj|| || |||d| _|| _d S )N)r6   r7   r8   r9   groups)r   r   r(   rJ   r;   	num_views)r   rK   r\   r=   r>   r   r
   r   r   }   s   

zMultiviewPatchEmbed3D.__init__c                 C   s(   t |d}| |}t |d| jd}|S )NzB V C T H W -> B (V C) T H WzB (V D) t h w -> B (V t h w) D)V)r   r;   r\   rD   r
   r
   r   r      s   

zMultiviewPatchEmbed3D.forward)rI   r   r2   r3   r.   r
   r
   r   r   rZ   z   s    rZ   Fr   c           	         st   t |}| | dksJ | |   d dksJ dd |D }tj|ddi} fdd|D }tj|dd	}|S )
ar  Create n-dimensional rotary positional embeddings.

    Args:
        dim: an int of the embedding dimension.
        grid_shape: a sequence of int of the length along each axis.
        base: the base from which to calculate the rotation angles.

    Returns:
        pos_embed: a tensor of shape (grid_shape[0]*...*grid_shape[N-1], dim) of positional embeddings.

    r   r   c                 S   s   g | ]	}t | qS r
   )r   r   float.0lengthr
   r
   r   
<listcomp>       z'get_nd_rotary_embed.<locals>.<listcomp>indexingijc                    s   g | ]
}t  | qS r
   )get_1d_rotary_embedrB   )r`   Z	axis_gridaxis_dimbaser
   r   rb      s    r   r   )rS   r   meshgridr   )	r   
grid_shape	cls_tokenri   num_axis
axis_ticks
axis_gridsZaxis_thetasthetasr
   rg   r   get_nd_rotary_embed   s   rq   c                 C   sJ   | d dksJ d|t d| d |    }t ||}|dd}|S )a)  Create 1D rotary positional embeddings from a grid of positions.

    Args:
        dim: the output dimension for each position.
        pos: a tensor of size (seq_len,) of positions to be encoded.

    Returns:
        thetas: a tensor of size (seq_len, dim) of rotary positional embeddings.
    r   r         ?r   )r   r   r^   outerrepeat)r   posri   rp   r
   r
   r   rf      s
   
rf   c                 C   s^   | j dd |j dd ksJ | jddd\}}tj| |gdd}| |  ||   S )a&  Rotates the input tensors by the positional embeddings.

    Args:
        x: a tensor of shape (..., seq_len, dim).
        thetas: a tensor of shape (..., seq_len, dim) of positional embeddings.

    Returns:
        x: a tensor of shape (..., seq_len, dim) of the rotated input tensors.
    Nr   r   r   )shapechunkr   r   r   r   )r   rp   x1x2Zx_rotate_halfr
   r
   r   apply_rotary_embed   s    
r{   c                    s   t |}tt| d|  d  dd |D }tj|ddi} fdd|D }tj|dd}|d	d	d	| f }|rJtjtd| f|gd
d}|S )a2  Create n-dimensional sinusoidal positional embeddings.

    Args:
        dim: an int of the embedding dimension.
        grid_shape: a sequence of int of the length along each axis.

    Returns:
        pos_embed: an array of shape (grid_shape[0]*...*grid_shape[N-1], dim) of positional embeddings.

    r   c                 S   s   g | ]	}t j|td qS )dtype)r+   r   r^   r_   r
   r
   r   rb      rc   z+get_nd_sinusoidal_embed.<locals>.<listcomp>rd   re   c                    s   g | ]
}t  |d qS )r   )get_1d_sinusoidal_embedreshape)r`   axisrh   r
   r   rb      s    r   r   Nr   )rS   intr+   ceilrj   concatenatezeros)r   rk   rl   rm   rn   ro   Zaxis_embeds	pos_embedr
   r   r   get_nd_sinusoidal_embed   s   
r   c                 C   sr   | d dksJ t j| d td}|| d  }dd|  }t d||}t |}t |}t j||gdd	}|S )
a!  Create 1D sinusoidal positional embeddings from a grid of positions.

    Args:
        dim: the output dimension for each position.
        pos: a list of size (seq_len,) of positions to be encoded.

    Returns:
        emb: an array of size (seq_len, dim) of positional embeddings.
    r   r   r|   g       @rr   r   zm,d->mdr   r   )r+   r   r^   einsumr   r   r   )r   ru   omegaoutemb_sinemb_cosr   r
   r
   r   r~      s   


r~   c                 C   s   t | tjr)tjj| jddd t | tjr%| jd ur'tj| jd d S d S d S t | tjrI| j	rKtj| jd | jd urMtj| jd d S d S d S d S )Ng        g{Gz?)meanstdr   rr   )

isinstancer(   Linearinitnormal_weightbias	constant_	LayerNormelementwise_affine)mr
   r
   r   init_weights  s   
r   )Fr   )r   )F)r   numpyr+   r   torch.nnr(   einopsr   Moduler   r"   r1   rG   rL   rZ   rq   rf   r{   r   r~   r   r
   r
   r
   r   <module>   s"    !


 