
    vh2                     V   d dl Z d dlZd dlZd dlmZ d dlmZmZm	Z	 d dlmZ d dl
mZ  G d dej                        Zd Z G d dej                        Z G d	 d
ej                        Z G d dej                        Z G d dej                        Z G d dej                        Zy)    N)	AttentionMlp
PatchEmbed)memory_efficient_attentionc                   F     e Zd Zdddddej                  f fd	Zd Z xZS )MEAttention   Fg        c                    t         |           ||z  dk(  sJ d       || _        ||z  | _        | j                  dz  | _        t        j                  ||dz  |      | _        |r || j                        nt        j                         | _	        |r || j                        nt        j                         | _
        t        j                  |      | _        t        j                  ||      | _        t        j                  |      | _        y )Nr   z$dim should be divisible by num_headsg         bias)super__init__	num_headshead_dimscalennLinearqkvIdentityq_normk_normDropout	attn_dropproj	proj_drop)	selfdimr   qkv_biasqk_normr   r   
norm_layer	__class__s	           4/home/cameronsmith/repos/canon_controll3r/old/dit.pyr   zMEAttention.__init__   s     	Y!#K%KK"y(]]D(
99S#'93:j/3:j/I.IIc3'	I.    c                 &   |j                   \  }}}| j                  |      j                  ||d| j                  | j                        j                  ddddd      }|j                  d      \  }}}| j                  |      | j                  |      }}t        |j                  dd      |j                  dd      |j                  dd      | j                        }|j                  |||      }| j                  |      }| j                  |      }|S )Nr      r         )r   )shaper   reshaper   r   permuteunbindr   r   r   	transposer   r   r   )	r   xBNCr   qkvs	            r#   forwardzMEAttention.forward'   s    ''1aHHQKWQ1dnndmm<WQ1a# 	
 **Q-1a{{1~t{{1~1 'KK1KK1KK1**	
 IIaA IIaLNN1r$   )__name__
__module____qualname__r   	LayerNormr   r5   __classcell__r"   s   @r#   r   r      s%     <</.r$   r   c                 T    | d|j                  d      z   z  |j                  d      z   S )Nr'   )	unsqueeze)r.   shiftr   s      r#   modulater?   H   s)    EOOA&&'%//!*<<<r$   c                   <     e Zd ZdZd fd	Zedd       Zd Z xZS )TimestepEmbedderz>
    Embeds scalar timesteps into vector representations.
    c           	          t         |           t        j                  t        j                  ||d      t        j
                         t        j                  ||d            | _        || _        y )NTr   )r   r   r   
Sequentialr   SiLUmlpfrequency_embedding_size)r   hidden_sizerF   r"   s      r#   r   zTimestepEmbedder.__init__Q   sS    ==II.$GGGIIIk;T:

 )A%r$   c           	         |dz  }t        j                  t        j                  |       t        j                  d|t         j
                        z  |z        j                  | j                        }| dddf   j                         |d   z  }t        j                  t        j                  |      t        j                  |      gd      }|dz  r5t        j                  |t        j                  |ddddf         gd      }|S )	ai  
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                          These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        r&   r   )startenddtype)deviceN)r   r'   )torchexpmathlogarangefloat32torL   floatcatcossin
zeros_like)tr   
max_periodhalffreqsargs	embeddings          r#   timestep_embeddingz#TimestepEmbedder.timestep_embeddingZ   s     ax		XXj!!llEMMBC
 "AHH"
	 	
 Dz!E$K/IIuyy		$@bI	7		E,,Yq"1"u-=>?RI r$   c                 `    | j                  || j                        }| j                  |      }|S N)r`   rF   rE   )r   rZ   t_freqt_embs       r#   r5   zTimestepEmbedder.forwards   s-    ((D,I,IJ r$   )   )'  )	r6   r7   r8   __doc__r   staticmethodr`   r5   r:   r;   s   @r#   rA   rA   L   s'    A  0r$   rA   c                   .     e Zd ZdZ	 	 d fd	Zd Z xZS )DiTBlockzN
    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
    c                    t         	|           t        j                  |dd      | _        |rt
        }nt        } ||f|dd|| _        t        j                  |dd      | _        t        ||z        }d }t        |||d      | _        t        j                  t        j                         t        j                  |d	|z  d
            | _        y )NFư>elementwise_affineepsT)r   r   c                  .    t        j                  d      S )Ntanh)approximate)r   GELU r$   r#   approx_geluz&DiTBlock.__init__.<locals>.approx_gelu   s    77v..r$   r   )in_featureshidden_features	act_layerdrop   r   )r   r   r   r9   norm1r   r   attnnorm2intr   rE   rC   rD   r   adaLN_modulation)
r   rG   r   	mlp_ratiouse_xformers_attentionblock_kwargsr|   mlp_hidden_dimru   r"   s
            r#   r   zDiTBlock.__init__~   s     	\\+%TR
! DD
#,t
?K
	 \\+%TR
[945	/ #*!	
 !#GGIryya+oDI!
r$   c                     |t        d       || j                  | j                  |            z   }|| j                  | j	                  |            z   }|S Nzignoring timestep)printr|   r{   rE   r}   r   r.   cs      r#   r5   zDiTBlock.forward   sL    =% 34 		4::a=**$**Q-))r$   )      @Fr6   r7   r8   rg   r   r5   r:   r;   s   @r#   rj   rj   y   s     $!
Fr$   rj   c                   (     e Zd ZdZ fdZd Z xZS )
FinalLayerz!
    The final layer of DiT.
    c                 .   t         |           t        j                  |dd      | _        t        j
                  |||z  |z  d      | _        t        j                  t        j                         t        j
                  |d|z  d            | _	        y )NFrl   rm   Tr   r&   )
r   r   r   r9   
norm_finalr   linearrC   rD   r   )r   rG   
patch_sizeout_channelsr"   s       r#   r   zFinalLayer.__init__   ss    ,,{uRVWiij0<?d
 !#GGIryya+oDI!
r$   c                 d    |t        d       | j                  |      }| j                  |      }|S r   )r   r   r   r   s      r#   r5   zFinalLayer.forward   s1    =% 34 OOAKKNr$   r   r;   s   @r#   r   r      s    
r$   r   c                   J     e Zd ZdZ	 	 	 	 	 	 	 	 	 d fd	Zd Zd ZddZ xZS )DiTz6
    Diffusion model with a Transformer backbone.
    c
                 4   t         |           || _        || _        || _        || _        || _        || _        |	| _        t        | j
                  | j                  ||dd      | _
        t        |||dz  | j                        | _        t        |      | _        	 dd l}
d}t#        j$                  t'        |      D cg c]  }t)        ||||       c}      | _        t-        ||	|      | _        | j1                          y # t         $ r d}Y nw xY wc c}w )NTF)img_sizer   in_chans	embed_dimr   flattenr&   )Pr   )r   r   )r   r   r   in_channelsr   widthrG   max_num_imagesr   r   
x_embedderFeaturePositionalEncoding	x_pos_encrA   
t_embedderxformersImportErrorr   
ModuleListrangerj   blocksr   final_layerinitialize_weights)r   r   r   r   rG   depthr   r   r   r   r   r   _r"   s                r#   r   zDiT.__init__   s!    	"&(
&,$ZZvv !
 3KTVV
 +;7	+%)"
 mm u  '+A	

 &k1lC!!  	+%*"	+
s   D ?DDDc                 |   d }| j                  |       | j                  j                  j                  j                  }t
        j                  j                  |j                  |j                  d   dg             t
        j                  j                  | j                  j                  j                  d       t
        j                  j                  | j                  j                  d   j                  d       t
        j                  j                  | j                  j                  d   j                  d       | j                  D ]p  }t
        j                  j                  |j                   d   j                  d       t
        j                  j                  |j                   d   j                  d       r y )Nc                    t        | t        j                        rkt        j                  j                  j                  | j                         | j                  +t        j                  j                  | j                  d       y y y )Nr   )	
isinstancer   r   rN   initxavier_uniform_weightr   	constant_)modules    r#   _basic_initz+DiT.initialize_weights.<locals>._basic_init  sV    &")),--fmm<;;*GG%%fkk15 + -r$   r   rM   g{Gz?)stdr&   )applyr   r   r   datar   r   r   viewr)   r   r   normal_r   rE   r   r   )r   r   wblocks       r#   r   zDiT.initialize_weights  s3   	6 	

; OO  '',,

B'7 89
$//..33Q7 	++A.554@
++A.554@ [[ 	BEGGe44R8??CGGe44R8==qA	Br$   c                 R   | j                   }| j                  j                  d   }t        |j                  d   dz        x}}|j                  |j                  d   |||||f      }t        j                  d|      }|j                  |j                  d   ||z  ||z  |f      }|S )zI
        x: (N, T, patch_size**2 * C)
        imgs: (N, H, W, C)
        r   r'   g      ?)r)   znhwpqc->nhpwqc)r   r   r   r~   r)   r*   rN   einsum)r   r.   r   phr   imgss          r#   
unpatchifyzDiT.unpatchify#  s    
 OO&&q)AGGAJ#%&&A
 IIQWWQZAq!Q7I8LL)1-yy
AE1q5!<y=r$   c                    |j                   \  }}}}}| j                  }|j                  ||z  |||f      }| j                  |      }|j	                  dddd      }|j                  ||||z  ||z  | j
                  f      }| j                  |      }t        | j                        D ]8  \  }	}
|j                  |||z  |z  |dz  z  | j
                  f      } |
||      }: | j                  ||      }|j                  ||z  ||z  |dz  z  | j                  |dz  z  f      }| j                  |      }|j                  ||f|j                   dd z         }|j	                  ddddd      }|S )z

        Args:
            x: Image/Ray features (B, N, C, H, W).
            t: Timesteps (N,).

        Returns:
            (B, N, D, H, W)
        r   r&   r   r'   Nr(   )r)   r   r*   r   r+   rG   r   	enumerater   r   r   r   )r   r.   rZ   r/   r0   r   r   r   r   ir   s              r#   r5   zDiT.forward4  s~    1aAFFIIq1uaA&'OOAIIaAq!IIq!Q!VQ!VT-=-=>?NN1 "$++. 	HAu		1a!eai1a4/1A1ABCAaA	
 q
 IIq1ua!eq!tmT->->A-EFGOOAIIq!fqwwqr{*+IIaAq!$r$   )	i  rz        r	   r   r   r	   r'   rb   )	r6   r7   r8   rg   r   r   r   r5   r:   r;   s   @r#   r   r      s;     
6"pB0"'r$   r   c                   ,     e Zd Zd Zd fd	Zd Z xZS )r   c                 l   fd}t        j                  t        |      D cg c]
  } ||       c}      }t        j                  |dddddf         |dddddf<   t        j                  |dddddf         |dddddf<   t        j                  |      j                  d      S c c}w )z Sinusoid position encoding tablec           
          t              D cg c]$  }| t        j                  d|dz  z  z        z  & c}S c c}w )Nr&   )r   nppower)positionhid_jbased_hids     r#   get_position_angle_veczVFeaturePositionalEncoding._get_sinusoid_encoding_table.<locals>.get_position_angle_vecb  sG     #5\ 288D!uz*:U*BCC  s   );Nr   r&   r'   )r   arrayr   rX   rW   rN   FloatTensorr=   )r   
n_positionr   r   r   pos_isinusoid_tables     ``   r#   _get_sinusoid_encoding_tablez6FeaturePositionalEncoding._get_sinusoid_encoding_table_  s    	 8=j8IJu#E*J
 #%&&14a4)@"Aq!$Q$w"$&&14a4)@"Aq!$Q$w  0::1== Ks   B1c                 V   t         |           || _        || _        || _        || j                  dz  z  | _        | j                  d| j                  | j                  | j                  d             | j                  d| j                  | j
                  | j                  d             y )Nr&   image_pos_tablerf   token_pos_tableiw )r   r   r   feature_dimr   num_patchesregister_bufferr   )r   r   r   r   r   r"   s        r#   r   z"FeaturePositionalEncoding.__init__p  s    ,&&$&&!)3--##T%5%5u	
 	--  $"2"2E	
r$   c                    |j                   d   }|j                   d   }|j                  ||| j                  | j                        }| j                  d d d |f   j                         j                         }|j                  d|d| j                  f      }|j                  |d| j                  df      }| j                  j                         j                         }|j                  dd| j                  | j                  f      }|j                  ||ddf      }||z   |z   }|j                  ||| j                  z  | j                  f      }|S )Nr   r'   )	r)   r*   r   r   r   clonedetachrepeatr   )r   r.   
batch_size
num_imagespe1pe2x_pes          r#   r5   z!FeaturePositionalEncoding.forward  s8   WWQZ
WWQZ
IIj*d.>.>@P@PQ ""1kzk>288:AACkk1j!T-=-=>?jj*a)9)91=> ""((*113kk1a!1!143C3CDEjj*j!Q783w}||d&6&668H8HI
 r$   )r	   r   re   r'   )r6   r7   r8   r   r   r5   r:   r;   s   @r#   r   r   ^  s    >"
*r$   r   )rP   numpyr   rN   torch.nnr   timm.models.vision_transformerr   r   r   xformers.opsr   Moduler   r?   rA   rj   r   r   r   rt   r$   r#   <module>r      s        E E  35")) 5r=*ryy *Z:ryy :z 0M")) M`<		 <r$   