
    f                     6   d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dl	mc m
Z d dlZd dlmZmZ d dlmZ d dlZdZ G d dej&                        Zd Zd	 Zd
 Zd.dZd Zd Zd/dZd Z G d dej:                  j<                        Z G d dej@                        Z! G d dej@                        Z" G d dejF                  e"      Z$ G d dej@                        Z% G d dej@                        Z& G d dej@                        Z' G d de"      Z( G d  d!ej@                        Z)d" Z* G d# d$ej@                        Z+ G d% d&ej@                        Z, G d' d(ej@                        Z- G d) d*ej@                        Z.	 	 	 	 d0d+Z/	 	 	 	 d1d,Z0	 	 	 	 d2d-Z1y)3    )abstractmethodN)	rearrangerepeat)einsumTc                        e Zd Z fdZ xZS )GroupNorm32c                 p    t         |   |j                               j                  |j                        S N)superforwardfloattypedtype)selfx	__class__s     4/home/cameronsmith/repos/minimal-diffusion/models.pyr   zGroupNorm32.forward   s'    wqwwy)..qww77    )__name__
__module____qualname__r   __classcell__r   s   @r   r   r      s    8 8r   r   c                     | dk(  rt        j                  |i |S | dk(  rt        j                  |i |S | dk(  rt        j                  |i |S t	        d|        )z4
    Create a 1D, 2D, or 3D convolution module.
             unsupported dimensions: )nnConv1dConv2dConv3d
ValueErrordimsargskwargss      r   conv_ndr(      sh     qyyy$)&))	yy$)&))	yy$)&))
/v6
77r   c                  ,    t        j                  | i |S )z!
    Create a linear module.
    )r   Linear)r&   r'   s     r   linearr+   #   s     99d%f%%r   c                     | dk(  rt        j                  |i |S | dk(  rt        j                  |i |S | dk(  rt        j                  |i |S t	        d|        )z8
    Create a 1D, 2D, or 3D average pooling module.
    r   r   r   r   )r   	AvgPool1d	AvgPool2d	AvgPool3dr#   r$   s      r   avg_pool_ndr0   *   sh     qy||T,V,,	||T,V,,	||T,V,,
/v6
77r   c                     t        | |      D ]8  \  }}|j                         j                  |      j                  |d|z
         : y)a#  
    Update target parameters to be closer to those of source parameters using
    an exponential moving average.

    :param target_params: the target parameter sequence.
    :param source_params: the source parameter sequence.
    :param rate: the EMA rate (closer to 1 means slower).
    r   )alphaN)zipdetachmul_add_)target_paramssource_paramsratetargsrcs        r   
update_emar<   7   sF     6 ;	c4 %%cT%:;r   c                 l    | j                         D ]   }|j                         j                          " | S )z<
    Zero out the parameters of a module and return it.
    )
parametersr4   zero_)moduleps     r   zero_modulerB   D   s3       	
Mr   c                     t        d|       S )z
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: an nn.Module for normalization.
        )r   )channelss    r   normalizationrF   M   s     r8$$r   c           	         |dz  }t        j                  t        j                  |       t        j                  d|t         j
                        z  |z        j                  | j                        }| dddf   j                         |d   z  }t        j                  t        j                  |      t        j                  |      gd      }|dz  r5t        j                  |t        j                  |ddddf         gd      }|S )	aY  
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    r   r   )startendr   )deviceNdimr   )thexpmathlogarangefloat32torJ   r   catcossin
zeros_like)	timestepsrM   
max_periodhalffreqsr&   	embeddings          r   timestep_embeddingr^   W   s     !8DFF	*			t2:: NNQUUb	  b! 
 QW##%d3DtbffTl3<I
QwFFIr}}Yq"1"u5E'FGRP	r   c                 ~    |r7t        |      t        |      z   }t        j                  | t        |      g| S  | | S )a  
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    )tupleCheckpointFunctionapplylen)funcinputsparamsflagr&   s        r   
checkpointrh   l   s?     V}uV},!''c&kADAAV}r   c                   ,    e Zd Zed        Zed        Zy)ra   c                     || _         t        |d |       | _        t        ||d        | _        t	        j
                         5   | j                   | j                   }d d d        |S # 1 sw Y   S xY wr
   )run_functionlistinput_tensorsinput_paramsrN   no_grad)ctxrk   lengthr&   output_tensorss        r   r   zCheckpointFunction.forward   sp    ' gv/VW.ZZ\ 	B-S--s/@/@AN	B	Bs   A&&A0c                    | j                   D cg c]!  }|j                         j                  d      # c}| _         t        j                         5  | j                   D cg c]  }|j                  |       }} | j                  | }d d d        t        j                  j                  | j                   | j                  z   |d      }| ` | `	~d|z   S c c}w c c}w # 1 sw Y   WxY w)NT)allow_unusedNN)
rm   r4   requires_grad_rN   enable_gradview_asrk   autogradgradrn   )rp   output_gradsr   shallow_copiesrr   input_gradss         r   backwardzCheckpointFunction.backward   s    FIFWFWXQXXZ66t<X^^ 	? 584E4EFqaiilFNF-S--~>N	? kk&& 0 00	 ' 
 k))! Y
 G		? 	?s#   &CC C8CCC(N)r   r   r   staticmethodr   r~    r   r   ra   ra   ~   s(      * *r   ra   c            	       >     e Zd ZdZ	 ddedededef fdZd Z xZS )	AttentionPool2dzS
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    spacial_dim	embed_dimnum_heads_channels
output_dimc                 8   t         |           t        j                  t	        j
                  ||dz  dz         |dz  z        | _        t        d|d|z  d      | _        t        d||xs |d      | _	        ||z  | _
        t        | j                        | _        y )Nr   r   g      ?r   )r   __init__r   	ParameterrN   randnpositional_embeddingr(   qkv_projc_proj	num_headsQKVAttention	attention)r   r   r   r   r   r   s        r   r   zAttentionPool2d.__init__   s     	$&LLHHYq 01 45	S8HH%
!  9a)mQ?aJ,C)QG"&88%dnn5r   c                 |   |j                   ^}}}|j                  ||d      }t        j                  |j	                  dd      |gd      }|| j
                  d d d d d f   j                  |j                        z   }| j                  |      }| j                  |      }| j                  |      }|d d d d df   S )NrK   T)rM   keepdimrL   r   )shapereshaperN   rU   meanr   rT   r   r   r   r   )r   r   bc_spatials        r   r   zAttentionPool2d.forward   s    ''1xIIaBFFAFFr4F0!4"=))$1*588AAMM!NN1KKNAqzr   r
   )r   r   r   __doc__intr   r   r   r   s   @r   r   r      s=     66 6  	6
 6 r   r   c                        e Zd ZdZed        Zy)TimestepBlockzT
    Any module where forward() takes timestep embeddings as a second argument.
    c                      y)zJ
        Apply the module to `x` given `emb` timestep embeddings.
        Nr   r   r   emb
scene_infos       r   r   zTimestepBlock.forward   s    r   N)r   r   r   r   r   r   r   r   r   r   r      s      r   r   c                       e Zd ZdZd Zy)TimestepEmbedSequentialzt
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    c           	         |=t        j                  t        j                  |d d d df   |d d d df   fd            }| D ]g  }t        |t              r |||t
        s|d d d df   n|      }/|j                         dk(  r ||t
        s|d d d df   n|      \  }}` ||      }i ||j                  d      dk(  r|fS |dd |j                  d      dz  f   fS )Nr   rK   AttentionBlock   .r   )torchrX   rU   
isinstancer   use_cond	_get_namesize)r   r   r   r   layers        r   r   zTimestepEmbedSequential.forward   s    %*:*:599c!BQB$iPSTUVXWXVXTXPYEZ[];^*_z 	E%/!S(*QrrT"2
S"$44"1XZ"1"%5:V* !H	 zr2C7ddZLcZ__]_M`bcMcLcHc=dddr   N)r   r   r   r   r   r   r   r   r   r      s    

er   r   c                   *     e Zd ZdZd fd	Zd Z xZS )UpsampleaA  
    An upsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    c                     t         |           || _        |xs || _        || _        || _        |r*t        || j                  | j                  dd      | _        y y )Nr   r   padding)r   r   rE   out_channelsuse_convr%   r(   conv)r   rE   r   r%   r   r   s        r   r   zUpsample.__init__   sW     (4H 	dmmT5F5FSTUDI r   c                    |j                   d   | j                  k(  sJ | j                  dk(  rLt        j                  ||j                   d   dz  |j                   d   dz  |j                   d   dz  fd      }nt        j                  |dd      }|j                   d   |j                   d	   cxk(  rdk(  rn nt        j
                  |d
      }| j                  r| j                  |      }|S )Nr   r   r      nearest)mode)scale_factorr   rK   )r   r   r   r   )r   rE   r%   Finterpolatepadr   r   )r   r   outs      r   r   zUpsample.forward   s    wwqzT]]**99>--QWWQZ!^QWWQZ!^QWWQZRS^$T[deC --	BC772;!''"+**%%\*C==))C.C
r   r   Nr   r   r   r   r   r   r   r   s   @r   r   r      s    Vr   r   c                   *     e Zd ZdZd fd	Zd Z xZS )
DownsampleaD  
    A downsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    c                 <   t         |           || _        |xs || _        || _        || _        |dk7  rdnd}|dk7  rdnd}|r+t        || j                  | j                  ||d      | _        y | j                  | j                  k(  sJ t        |||      | _        y )Nr   r   )r   r   r   r   )r   r   r   )strider   )kernel_sizer   )	r   r   rE   r   r   r%   r(   opr0   )r   rE   r   r%   r   r   r   r   s          r   r   zDownsample.__init__  s     (4H 	aiY!8admmT%6%6F\]DG ==D$5$555!$F6JDGr   c                 `    |j                   d   | j                  k(  sJ | j                  |      S Nr   )r   rE   r   )r   r   s     r   r   zDownsample.forward  s'    wwqzT]]**wwqzr   r   r   r   s   @r   r   r      s    K r   r   c                   $     e Zd Z fdZd Z xZS )SimpleResMLPc                 `   t         |           d}t        j                  t        j                         t        j
                  ||            | _        t        j                  t        j                         t        j
                  ||            | _        t        j
                  ||      | _        y r   )	r   r   r   
SequentialSiLUr*   	in_layers
out_layersskip_connection)r   rE   r%   r   s      r   r   zSimpleResMLP.__init__  sx    GGIIIhx(

 --GGIIIhx(

  "yy(;r   c                 n    | j                  |      }| j                  |      }| j                  |      |z   S r
   )r   r   r   )r   r   hs      r   r   zSimpleResMLP.forward,  s4    NN1OOA##A&**r   )r   r   r   r   r   r   r   s   @r   r   r     s    <+r   r   c                   B     e Zd ZdZ	 	 	 	 	 	 	 d fd	ZddZddZ xZS )ResBlocka  
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    c                    t         |           || _        || _        || _        |xs || _        || _        || _        || _        t        j                  t        |      t        j                         t        ||| j
                  dd            | _        |	xs |
| _        |	r%t!        |d|      | _        t!        |d|      | _        nG|
r%t'        |d|      | _        t'        |d|      | _        n t        j(                         x| _        | _        t        j                  t        j                         t+        ||rd| j
                  z  n| j
                              | _        t        j                  t        j                         t+        ||rd| j
                  z  n| j
                              | _        t        j                  t        | j
                        t        j                         t        j0                  |      t3        t        || j
                  | j
                  dd                  | _        | j
                  |k(  rt        j(                         | _        n?|r t        ||| j
                  dd      | _        nt        ||| j
                  d      | _        t9        d| j                  dd      | _        t        j<                  d| j                        | _        y )	Nr   r   r   Fr   )rA      r   ) r   r   rE   emb_channelsdropoutr   r   use_checkpointuse_scale_shift_normr   r   rF   r   r(   r   updownr   h_updx_updr   Identityr+   
emb_layersemb_layers_scDropoutrB   r   r   SimpleCrossAttentionscene_info_attnr*   spatial_lin)r   rE   r   r   r   r   r   r%   r   updownr   s              r   r   zResBlock.__init__A  s@    	 ((4H ,$8!(#GGID(D$5$5q!D
 jD!(E48DJ!(E48DJ#HeT:DJ#HeT:DJ&(kkm3DJ--GGIL;O!d&7&7"7UYUfUfh
  ]]GGIH7Ka$"3"33QUQbQbd

 --$++,GGIJJ!d//1B1BAqQ	
 (#%;;=D #*h 1 11a$D  $+44;L;La#PD 3CaL99Qt}}5r   c                 h    t        | j                  |||f| j                         | j                        S )a  
        Apply the block to a Tensor, conditioned on a timestep embedding.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        )rh   _forwardr>   r   r   s       r   r   zResBlock.forward  s2     MMAs:.0A4CVCV
 	
r   c                     | j                   r| j                  d d | j                  d   }} ||      }|j                  d      dk(  r1	 t        j                  ||fd      }t        j                  ||fd      }| j                  |      }| j                  |      } ||      }n| j                  |      }| j                  |      j                  |j                        }t        |j                        t        |j                        k  r1|d   }t        |j                        t        |j                        k  r1|j                  ddddd      }t        j                  ||j                  dd        }|j                  d      d	k  r	 | j0                  rS| j2                  d   | j2                  dd  }}t        j4                  |dd
      \  }} ||      d|z   z  |z   } ||      }n||z   }| j3                  |      }| j7                  |      |z   S )NrK   r   r   ).Nr   r   r      rL   )r   r   r   rN   rU   r   r   r   r   r   rc   r   permuter   r   r   stackmeshgridlinspacecudar   r   r   r   r   r   chunkr   )r   r   r   r   in_restin_convr   emb_outuv_emb	pos_emb_x	scene_embout_normout_restscaleshifts                  r   r   zResBlock._forward  s   ;;#~~cr2DNN24FWG
Avvay!|&&!Aq/&&!Aq/

1A

1A
Aq!A//#&++AGG4'-- 3qww</79;Mc'-- 3qww<///!Aa!,--5 66":r>a $$!%!3T__QR5HhH88GQA6LE5q5y)E1AAGA"A##A&**r   )NFFr   FFFr
   r   r   r   r   r   r   r   r   r   s   @r   r   r   1  s0    ( "C6J	
$+r   r   c                   :     e Zd ZdZ	 	 	 	 	 d fd	Zd Zd Z xZS )r   a  
    An attention block that allows spatial positions to attend to each other.
    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    c                 .   t         |           || _        ||n|| _        |dk(  r|| _        n||z  dk(  sJ d| d|        ||z  | _        || _        t        |      | _        t        d||dz  d      | _	        |rt        | j                        | _        nt        | j                        | _        t        t        d||d            | _        t        j                   d| j                        | _        t        j                   | j                  d      | _        t        j                   d	| j                        | _        t)        | j                        | _        t)        | j                        | _        t        j                   d| j                        | _        y )
NrK   r   zq,k,v channels z' is not divisible by num_head_channels r   r   r   r   r   )r   r   rE   channels_outr   r   rF   normr(   qkvr   r   QKVAttentionLegacyrB   proj_outr   r*   scene_info_proj_inscene_info_proj_outr   r   mlp_inmlp_outreg_pos_emb)r   rE   r   num_head_channelsr   use_new_attention_orderr   r   s          r   r   zAttentionBlock.__init__  sU    	 (4(<H,"&DN ,,1f 
*QRcQdef &)::DN,!(+	1h1a8")$..9DN 0?DN#GAx1$EF#%99S#?#%99T]]3#? 1T]]3#DMM2#DMM299Qt}}5r   c                 R    t        | j                  ||f| j                         d      S )NT)rh   r   r>   )r   r   r   s      r   r   zAttentionBlock.forward  s"    $--!J9JDQQr   c                 <   t        |j                        dk(  }|r|j                  ddd      dd d d d f   }|j                  ^}}}|j                  d      dk7  }|r^| j	                  t        j                  t        j                  t        j                  dd|j                  d            t        j                  dd|j                  d                  d      j                               }||j                  ddd      d d d d f   z   }|j                  ||d      }| j                  |      j                  ddd      }| j                  t        j                  dd|j                  d            d d d f   j                               d    j                  ddd      }	||	z   }t        j                  ||fd      }n| j                  |      j                  ddd      }| j                  t        j                  dd|j                  d            d d d f   j                               d    j                  ddd      }	||	z   }| j                  |j                  ddd            j                  ddd      }| j                  | j!                  |            }
| j#                  |
      }| j%                  |      }||z   }| j'                  |j                  ddd            j                  ddd      }|j                  d      dk7  rR|dd |j                  d       f   | j)                  |d|j                  d       d f   j                  ddd            }} |j                  ||g| }|r1|j+                  d      j+                  d      j                  ddd      }||fS )Nr   r   r   r   .rK   r   )rc   r   r   r   r   rN   r   r   r   r   r   r  r  rU   r  r   r   r   r  r  r  squeeze)r   r   r   not_spatialr   r   spatialusing_xr   r  r   r   r   s                r   r   zAttentionBlock._forward  s   L!O!))Aa*3tAd?;1w&&)Q,%%bhhr{{2;;r!AFFSUJ;WXZXcXcdfghijioioprisXt/uvx&y&~&~  'A  BFFNN1Qq)$q+66A 		!Q#A00<DDQqKJ**2;;r!JOOB<O+PQRSWQW+X+]+]+_`aefnnopqrstuK!+-J*~a(A''
3;;AaBA**2;;r!JOOB<O+PQRSWQW+X+]+]+_`aefnnopqrstuKKAKK		!Aa()11!Aa8hhtyy|$NN3MM!1ull3;;q1-.66q1=??2!C<Qjoob>Q=Q<Q8Q4RTXTlTlmpqt  wA  wF  wF  GI  wJ  vJ  vK  rK  nL  nT  nT  UV  WX  YZ  n[  U\z3ckk!Q))S[[_44R8@@1QG:~r   )r   rK   FFNr   r   s   @r   r   r     s*      %&6PR
'r   r   c                     |d   j                   ^}}}t        t        j                  |            }d|z  |dz  z  |z  }| xj                  t        j                  |g      z  c_        y)a(  
    A counter for the `thop` package to count the operations in an
    attention operation.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    r   r   N)r   r   npprod	total_opsrN   DoubleTensor)model_xyr   r   r  num_spatial
matmul_opss           r   count_flops_attnr    s]     qTZZNAq7bggg&'K Q+*+a/J	OOr
|44Or   c                   8     e Zd ZdZ fdZd Zed        Z xZS )r  zh
    A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
    c                 0    t         |           || _        y r
   r   r   n_headsr   r  r   s     r   r   zQKVAttentionLegacy.__init__.      r   c                 D   |j                   \  }}}|d| j                  z  z  dk(  sJ |d| j                  z  z  }|j                  || j                  z  |dz  |      j                  |d      \  }}}dt	        j
                  t	        j
                  |            z  }	t        j                  d||	z  ||	z        }
t        j                  |
j                         d      j                  |
j                        }
t        j                  d|
|      }|j                  |d|      S )z
        Apply QKV attention.
        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        r   r   r   rL   bct,bcs->btsrK   bts,bcs->bct)r   r  r   splitrP   sqrtrN   r   softmaxr   r   r   r   r   bswidthrq   chqkvr   weightas               r   r   zQKVAttentionLegacy.forward2  s      IIE6DLL()Q..q4<<'(++b4<</a@FFrqFQ1aDIIdiim,,AIq5y
 FLLN388FIInfa0yyR((r   c                     t        | ||      S r
   r  r  r  r  s      r   count_flopszQKVAttentionLegacy.count_flopsD      r1--r   	r   r   r   r   r   r   r   r3  r   r   s   @r   r  r  )  s&    )$ . .r   r  c                   ,     e Zd ZdZd fd	ZddZ xZS )r   z(
    Simple vanilla cross attention
    c           
      F   t         |           ||z  }|dz  | _        || _        t	        j
                  ||d      | _        t	        j
                  ||dz  d      | _        t	        j
                  ||      | _        t	        j                  t	        j
                  |t        d|z              t	        j                         t	        j
                  t        d|z        |            | _        t	        j                  |g      | _        t	        j                  |g      | _        y )Ng      F)biasr   r   )r   r   r   headsr   r*   to_qto_kvprojr   r   GELUr   	LayerNormln_1ln_2)r   ch_kvch_qr9  dim_head	inner_dimr   s         r   r   zSimpleCrossAttention.__init__L  s    u$	%

 IIdIE:	YYui!m%@
IIi.	==IIdC$K(GGIIIc!D&k4(
 LL%)	LL$(	r   c                    t        |j                        dkD  rG | |j                  dd      |j                  dd      |      j                  d|j                  d d       S | j	                  |      }| j                  |      }| j                  | j                  |      }| j                  |      j                  dd      \  }}	t        fd|||	f      \  }}}	t        d	||      | j                  z  }
|
j                  |      }t        d
||	      }t        |d      }| j                  |      }|r||z   }| j!                  | j                  |            |z   }|S )Nr   r   r   r   r   rK   rL   c                      t        | d      S )Nzb n (h d) -> (b h) n dr   )r   )tr   s    r   <lambda>z.SimpleCrossAttention.forward.<locals>.<lambda>n  s    	!-E K r   zb i d, b j d -> b i jzb i j, b j d -> b i dz(b h) n d -> b n (h d)rG  )rc   r   flatten	unflattenr?  r@  r9  r:  r;  r   mapr   r   r&  r   r<  r   )r   r   r  softmax_axisuse_skipx_lny_lnr+  r,  r-  simattnr   r   s                @r   r   zSimpleCrossAttention.forwardb  s@   qww<>		!B		!BFPPQRSTSZSZ[^\^S_``yy|yy|JJIIdOzz$%%aR%01KaQRTUYW1a ,a3djj@{{|{,,dA65;iin37Shhtyy~&,
r   )   @   )rK   Tr   r   s   @r   r   r   H  s    ),r   r   c                   8     e Zd ZdZ fdZd Zed        Z xZS )r   zP
    A module which performs QKV attention and splits in a different order.
    c                 0    t         |           || _        y r
   r  r  s     r   r   zQKVAttention.__init__  r   r   c           	         |j                   \  }}}|d| j                  z  z  dk(  sJ |d| j                  z  z  }|j                  dd      \  }}}dt        j                  t        j                  |            z  }	t        j                  d||	z  j                  || j                  z  ||      ||	z  j                  || j                  z  ||            }
t        j                  |
j                         d      j                  |
j                        }
t        j                  d|
|j                  || j                  z  ||            }|j                  |d|      S )z
        Apply QKV attention.
        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        r   r   r   rL   r"  rK   r#  )r   r  r   rP   r%  rN   r   viewr&  r   r   r   r   r'  s               r   r   zQKVAttention.forward  s$     IIE6DLL()Q..q4<<'())A1)%1aDIIdiim,,YR$,,.F;YR$,,.F;

 FLLN388FIInfaiiT\\8I2v.VWyyR((r   c                     t        | ||      S r
   r1  r2  s      r   r3  zQKVAttention.count_flops  r4  r   r5  r   s   @r   r   r   ~  s&    )( . .r   r   c                   f     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Zd Zd	dZd	dZd
dZddZ xZ	S )	UNetModela  
    The full UNet model with attention and timestep embedding.
    :param in_channels: channels in the input Tensor.
    :param emb_dim: base dimension of timestep embedding.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_heads_channels: if specified, ignore num_heads and instead use
                               a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
                               of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
                                    increased efficiency.
    c                    t         !|           d\  }}|dk(  r|}|| _        || _        || _        || _        || _        || _        || _        |	| _	        |
| _
        || _        || _        |rt        j                  nt        j                  | _        || _        || _        || _        d}t)        j*                  t-        ||      t)        j.                         t-        ||            | _        t)        j*                  t-        ||      t)        j.                         t-        ||            | _        t-        d|      | _        t)        j*                  t-        ||      t)        j.                         t-        ||      t)        j.                         t-        |d            | _        | j                  t)        j8                  ||      | _        t=        |	d   |z        x}}t)        j>                  tA        tC        |||dd            g      | _"        || _#        |g}d}tI        |	      D ]4  \  }}tK        |      D ]  }tM        |||t=        ||z        |||	      g}t=        ||z        }||v r|jO                  tQ        |||||
             | jD                  jO                  tA        |        | xjF                  |z  c_#        |jO                  |        |tS        |	      dz
  k7  s|}| jD                  jO                  tA        |rtM        |||||||d      ntU        ||
||                   |}|jO                  |       |dz  }| xjF                  |z  c_#        7 tA        tM        ||||||      tQ        |||||
      tM        ||||||            | _+        | xjF                  |z  c_#        t)        j>                  g       | _,        t[        tI        |	            d d d   D ]  \  }}tK        |dz         D ]  }|j]                         } tM        || z   ||t=        ||z        |||	      g}t=        ||z        }||v r|jO                  tQ        |||||
             |r?||k(  r:|}|jO                  |rtM        |||||||d      nt_        ||
||             |dz  }| jX                  jO                  tA        |        | xjF                  |z  c_#          t)        j*                  ta        |      t)        j.                         tc        tC        |||dd                  | _2        t-        dd      | _3        t-        dd      | _4        t-        dd      | _5        t-        dd      | _6        t-        dd      | _7        t)        j>                  tK        d      D cg c]  }tQ        |dz  |||||       c}      | _8        t)        j>                  | jp                  D cg c]!  }tc        t)        jr                  dd            # c}      | _:        t)        j>                  | jp                  D cg c]  }tc        tC        ddddd             c}      | _;        tc        t)        jr                  dd            | _<        t)        j*                  t{        d      gt)        jr                  dd      gz    | _>        t)        j*                  t)        jr                  dd      gt{        d      gz   t)        jr                  dd      gz    | _?        t)        j*                  t{        d      gt)        jr                  dd      gz    | _@        t)        jr                  dd      | _A        t)        jr                  dd      | _B        t)        j8                  dd      j                         | _D        t)        j                  dddd      | _F        t        j                         | _I        y c c}w c c}w c c}w )N)r   rS  rK   r      r   r   r   r   )r   r%   r   r   )r   r   r  r	  T)r   r%   r   r   r   )r%   r   r   )r%   r   r   )r   r%   r   r   r            
   )r   r   r   r  r	     r      )Jr   r   
image_sizein_channelsmodel_channelsr   num_res_blocksattention_resolutionsr   channel_multconv_resamplenum_classesr   rN   float16rS   r   r   r  num_heads_upsampler   r   r+   r   
time_embedframe_embedcam_emb
cam_decode	Embedding	label_embr   
ModuleListr   r(   input_blocks_feature_size	enumerateranger   appendr   rc   r   middle_blockoutput_blocksrl   popr   rF   rB   r   cameras_proj_incameras_proj_outscene_graph_proj_inscene_graph_proj_outscene_emb_projmiddle_attnsr*   sg_attn_emb_projs_globalsg_attn_emb_projs_spatial
img_to_embr   scene_info_embed
bbox_embed
clip_embedglobal_clip_embglobal_bbox_embr   non_embeddingr"   img_tok_downproj	diffusionGaussianDiffusionfwd_diffuser)"r   rd  re  rf  r   rg  rh  time_emb_factorr   ri  rj  r%   rk  r   use_fp16r   r  rm  r   resblock_updownr	  time_embed_dimr*  input_chinput_block_chansdslevelmult_layersout_chiichr   s"                                    r   r   zUNetModel.__init__  s   . 	!$ L#!*$&,(,%:"(*&,#+RZZ
"!2"4-->>2GGI>>2

 ==>>2GGI>>2
 a0-->>2GGI>>2GGI>1%
 '\\+~FDNLOn<==XMM$WT;Aq%QRS
  D$\2 	)KE4>* -b.'DSaLaHbim  M  dx  z ./..MM&>U^  sD  ^u  w !!(()@&)IJ""b("!((,- L)A--!!((+* !"ngFY]n|  Th  os  t']TZ[ !((,a""b("3	)6 4b.'^  sG  IB~fw  Ri  kb.'^  sG  I

 	b ]]2.	, 782> 	)KE4>A-. )'++-b3hcR`cgRgNhos  ES  j~  @ $./..MM&>Ug  |M  g~  @ Q.0FMM* !"ngFY]n|  Th  mq  r%b-dQWX
 1HB""))*A6*JK""b("')	), ==-"3RWWYGTXZbdprs}~L  AA  C%a~ &s3#)"c? $*3t$4!$So MM #()+-  +BqDbQ_k}  Rc  }T  V+- . )+`d`q`q6r[\{299SQTCU7V6r(s%)+rv  sD  sD  8EmnWQPSUXZ[efEg8h  8E  *F&%biiC&89 "c1B0CRYYsSVEWDX0X Z--299Qs+;*<l3>O=P*PRTR[R[\_`cRdQe*eg--<+<*=ryyS?Q>R*RT!yyS1 iiC0\\!S1668 !#		#c!A 6%7793+- 7s 8Es   _&_% _c                 j   | j                  |d         }| j                  |d         }||z   x}}| j                  t        || j                              d d d f   x}}	 d|v rK|d   }	|| j                  |	j                  dd      j                  ddd            j                  dd	      z   }t        | j                  | j                  | j                        D ]  \  }
}}t        j                  ||fd      }t        j                  |t        j                  t        j                  |d d d df         |fd      fd
      }d|v r	  |
t        j                   ddddd      j#                  |      |      d   }|d d dd f   } | j%                  |d d |d   j'                  d       d f         }| j)                  |      }|dd |j'                  d
      dz  f   |d|j'                  d
      dz  d f   |dd |j'                  d
      dz  f   |d|j'                  d
      dz  d f   d}|||ffS )Nnoised_scene_graphnoised_camerasimg_tokr   r   r   r   Tr   rK   .)eps_scene_graphscene_grapheps_camerascameras)r  r}  rn  r^   rf  r  rJ  r   sumr3   r  r  r  rN   rU   rX   zerosrT   r~  r   r  )r   sg_inputrY   r  camera_infoorg_scene_infor   r   emb_timeimg_tok_condrR  global_img_emb_projlocal_img_emb_projcameras_predscene_graph_predout_dicts                   r   denoise_scene_graphzUNetModel.denoise_scene_graphk  s   ..x8L/MN**84D+EF&1K&??);ItGZGZ)[\]^_c]cddh	  !),L(<(<Qq(A(I(I!Aa(PQUUVW`dUeeC=@ARARSWSpSpqu  rP  rP  >Q 	*9D%'9Z 03JRVVR]]>RSTVUVTVRVCW5XYg4hij-k mnpqJH$ ""((1Qq1"5"8"8"DZPQRS
#AabD)J	* ,,jHEU<V<[<[\]<^;^;_9_.`b44jB
 '7s;X<L<Q<QRT<UWX<X;X7X&Yhxy|  ~N  ~S  ~S  TV  ~W  YZ  ~Z  ~[  z[  i\".s3LL4E4Eb4I14L3L/L"MXdehiuiziz{}i~  AB  jB  jC  fC  YDE),777r   c
                 \   d}	 i }
d\  }}d|vrt        j                  |d         |d<   |s|r|d   }|j                  ddddd	      }|s|rt        j                        }g }| j	                  t        || j                              d d d f   x}}| j                  |d
   j                  dd            }| j                  |d         }||z   x}}| j                  |      | j                  |      z   j                  dd      }||z   }|rj                  | j                        }| j                  D ]:  } |||t        j                   ||fd            \  }}|r|}|j#                  |       < | j%                  ||t        j                   ||fd            \  }}|r|}|st        j                  |      }|r| j&                  D ]  }|j)                         }j+                  d      |j+                  d      k7  r|d d d d d |j+                  d      f   }|st        j                  |      }t        j                   ||gd      }|j+                  d      dk(  r|} |||t        j                   ||fd            \  }} j                  j                        }| j-                  |      }|
|d d g df   |d d g df   j/                         dz  }
|
j1                         D ci c]5  \  }}|t3        |j4                        dk7  r|n|j                  ddddd	      7 c}}S c c}}w )NTTT
noised_rgbrgbr   r   r   r   r   bboxsr   rK   	clip_embsr  rL   r   r   r   r   r      )eps_rgbr  r  )r   rX   r   rN   rn  r^   rf  r  rJ  r  r  r  r  r   r   ru  rU   ry  rz  r{  r|  r   r   tanhitemsrc   r   )r   model_inputrY   r  autodecoderrN  clip_global_latentsample_rand_globaluse_embteacher_forcingr  use_encoderuse_decoderr   hsr   r  	bbox_embsr  r  r   
emb_globalr   r@   scene_info_hpr   r,  r-  s                                r   r   zUNetModel.forward  s   	  )K {*UEUEUVabgVhEi;|+D +,'Aii!Aa"A{mmAA);ItGZGZ)[\]^_c]cddh OOK$8$@$@B$GH	OOK$<=	&/)&;;**95d6J6J96UUZZ[\eiZj
Jtzz"A++  &q#rvvz>6RSU/V W+Z		!	
 !--abffj.=YZ\6]^MAkKr}}Q/Q, ,, VVVX66!9bggaj(Aa+2771:+o,>!BMM"$5FFAr7*??2&)^*%aRVVZ4PQS-TU*V qwwA((1+C "1W9~qy>..0 H OWnnN^_sqAGGa1QYYq1Qq-AA___s   *:L(c
                    d}	 i }
d\  }}|s|r|d   }|j                  ddddd      }|s|rt        j                        }g }| j                  t	        || j
                              d d d f   x}}|rwj                  | j                        }| j                  D ]   } |||d       d   }|j                  |       " | j                  ||d       d   }|st        j                  |      }|r| j                  |d	         }	 |j                         D ci c]  \  }}d
|v s|| }}}d|vr||t        j                  t              dd      j!                         j#                         t        j                  t        |      dd      j!                         j#                         dz  }| j$                  j'                  | |ddd idd|d   |rdini d||	      \  }}}\  }}|
||j                         D ci c]  \  }}|dz   | c}}z  |z  z  }
||fD cg c]  }|d|j)                  d      dz  d f    c}\  }}|d   |d   }}d}|rd|v rt-        d       |d   }|d   }| j/                  |      }| j1                  |      }||z   x} x}!}"| x}"}!|| j3                  |       z   }|rL| j4                  D ]  }|j7                         }#j)                  d      |#j)                  d      k7  r|d d d d d |#j)                  d      f   }|st        j                  |#      }#t        j8                  ||#gd      }|"j)                  d      dk(  r|!}" |||t        j8                  |"|!fd            \  }}" j                  j                        }| j;                  |      }$|
|$d d g df   |$d d g df   j=                         |$d d dgf   j?                         |$d d dgf   j?                         dz  }
|
j                         D ci c]5  \  }}|t        |j@                        dk7  r|n|j                  ddddd      7 c}}S c c}}w c c}}w c c}w c c}}w ) NTr  r  r   r   r   r   r   noisedr  r]  r_  )r  r  r  r  Fr  r  )r  r  conditioning
use_directr  r  	_intermed.rK   r  r  zusing sg as inputrL   r  r     )r  r  seginvdepth)!r   rN   rX   rn  r^   rf  r   r   ru  ry  rz  r  r  r   r   rc   r   r   r   diff_sample_from_reverse_processr   r  printr  r}  r  r{  r|  rU   r   r  sigmoidr   )%r   r  rY   r  r  rN  r  r  r  r  r  r  r  r   r  r   r  r   r@   r  r,  r-  xTgen_sggen_sg_intermedslatent_eps_lossscene_incam_indenoised_sgclean_sanityr  r  scene_latentr  r   r  r   s%                                        r   forward_zUNetModel.forward_  s   	  )K +,'Aii!Aa"A{mmAA );ItGZGZ)[\]^_c]cddh  tzz"A++ 1c4(+		!	   C-a0Ar}}Q/Q t'<'<RV'D !,!2!2!4F#!AA!A#FBF#2-EKKA",E,K,K,M,R,R,Tfkfqfqruvwrxz{|}f~  gE  gE  gG  gL  gL  gN  O  OHLHYHYHzHz{  BD  FG  JM  OS  IT$r:J7K  vAZcdpYq  MO#(kRa I{ IcFF#O5Fhv =M=S=S=U!Vca!K-/!VVYhhhH=Ef<MNqQs166":q=>12NOHV }-fY.?M[8%& +F"=1H..x8**625@;5NNN~
L #/.
>D''55,, VVVX66!9bggaj(Aa+2771:+o,>!BMM"$5FFAr7*??2&)^*%aRVVZ4PQS-TU*V qwwA((1+C "1W9~qy>..0q!u:--/ #AqcE
 2 2 4	 H OWnnN^_sqAGGa1QYYq1Qq-AA__i G "WNX `s   Q"Q8Q
"Q:Qc                    |d   }|j                  ddddd      }|rt        j                  |      }|du| j                  duk(  sJ d       d	|vrM t        j                  d      j                         d
   j                  g |d   j                  dd dd |d	<   g }	| j                  t        || j                              dddf   j                  d|j                  d      d      x}
}t        j                  |d	   ddddf   t        j                  j                  j!                  |d	   dddddf         fd      |d<   | j#                  t        t        j$                  |j                  d            j                         | j                              d   }||z   }
|j'                  | j(                        }| j*                  D ]  } |||
      }|	j-                  |        | j/                  ||
      }|j1                  dd      j3                  d      dddf   }|st        j                  |      }|rt        j4                  |      dz  dz
  dz  }|r|j7                  dd      }|
|z   }
| j8                  D ]  }|	j;                         }|j                  d      |j                  d      k7  r|ddddd|j                  d      f   }|st        j                  |      }t        j                  ||gd      } |||
      } |j'                  |j(                        }| j=                  |      }| j?                  ||z         }|ddg df   |ddg df   jA                         |dddgf   jC                         |dddgf   jC                         ||d}|jE                         D ci c]5  \  }}|tG        |j                        dk7  r|n|j                  ddddd      7 c}}S c c}}w )  
        Apply the model to an input batch.
        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param y: an [N] Tensor of labels, if class-conditional.
        :return: an [N x C x ...] Tensor of outputs.
        r  r   r   r   r   r   N<must specify y if and only if the model is class-conditionalr  ru   rK   .camsrL   ra  g皙?r  r  r]  r  )epsr  r  r  global_latentr  r  )$r   rN   rX   rk  eyer   expandr   rn  r^   rf  r   rU   korniageometryconversionsrotation_matrix_to_axis_anglero  rR   r   r   ru  ry  rz  rJ  r   	rand_likeclipr{  r|  r   rq  r  r  r  rc   )r   r  rY   r  r  rN  r  r  r   r  r   r  	emb_framer   r@   r  r  r   cam_predr  r,  r-  s                         r   r  zUNetModel.forward_  s$   * l
#
))Aa!A
"--*D(
 	JI	J K'#E266!9>>#3I#>#E#E#q{S_G`GfGfgihiGj#qkm#qnp#qK	");ItGZGZ)[\]^_c]cdkklnopououvwoxy{||h !ffk)&<S!BY&GHcHc  IB  IB  CN  OX  CY  Z]  ^`  _`  ^`  ac  bc  ac  Zc  Cd  Ie  &f  gi  jF %%&8166!99M9R9R9TVZViVi&jklpq	" FF4::'' 	Fq#AIIaL	 a% 		!B,,,4QtV<2==+]0KA0Ma0OQS/S}}/A/A#b/I} M!(( 	FBvvay"''!*$!Akrwwqzk/(:ab 1R2wA&Aq#A	 FF177Ohhqk !:; 1W9~1W9~**,1aS5z))+1#J..0 -
 OWnnN^_sqAGGa1QYYq1Qq-AA___s   7:O5c                    |d   }|j                  ddddd      }|du| j                  duk(  sJ d       g }| j                  t        || j                              dddf   j                  d	|j                  d      d	      x}}d
|vrM t        j                  d      j                         d   j
                  g |d   j                  dd d	d	 |d
<   || j                  t        j                  |d
   dddd	f   t        j                  j                  j!                  |d
   dddddf         fd	            z   }|j#                  | j$                        }| j&                  D ]  }	 |	||      }|j)                  |        | j+                  ||      }| j,                  D ]t  }	|j/                         }
|j                  d      |
j                  d      k7  r|ddddd|
j                  d      f   }t        j                  ||
gd      } |	||      }v |j#                  |j$                        }| j1                  |      }|ddg df   |ddg df   j3                         |dddgf   j5                         |dddgf   j5                         d}|j7                         D ci c]  \  }}||j                  ddddd       c}}S c c}}w )r  r  r   r   r   r   r   Nr  rK   r  ru   .rL   r  r  r]  r  )r  r  r  r  )r   rk  rn  r^   rf  r  r   rN   r  r   r   rp  rU   r  r  r  r  r   r   ru  ry  rz  r{  r|  r   r  r  r  )r   r  rY   r  r   r  r   r  r   r@   r  r   r  r,  r-  s                  r   forward_fullzUNetModel.forward_full  sO    l
#
))Aa!A
D(
 	JI	J );ItGZGZ)[\]^_c]cdkklnopououvwoxy{||hK'@bq	@PQZ@[@b@b  AOdop|d}  eD  eD  EG  FG  eH  AO  IK  AO  LN  AOY)?DLLY)?BQBr	)J6??KfKf  LE  LE  FQ  R[  F\  ]`  ac  bc  ac  df  ef  df  ]f  Fg  Lh  )i  jl  "m  n  n FF4::'' 	Fq#AIIaL	 a%(( 	FBvvay"''!*$!Akrwwqzk/(:a2wA&Aq#A		
 FF177Ohhqk 1W9~1W9~**,1aS5z))+1#J..0	
 3;..2BC31Q!))Aa!A&&CCCs   , K)r   r   )r   r   r   rS  Tr   NFFr   rK   rK   FFF)NFFFFFT)NFFFFr
   )
r   r   r   r   r   r  r   r  r  r   r   s   @r   r[  r[    s`    L !" %+i:V(8VZ`vZ`x^`@4Dr   r[  c                 f   | dk(  rd}n&| dk(  rd}n| dk(  rd}n| dk(  rd}nt        d|        g }| dk(  rd	}nd
}|j                  d      D ]  }|j                  | t        |      z         ! t	        d#i d| d|d|ddd|dt        |      ddd|d|dddddddddddd d!d d"d S )$Nrb  r   r   r   r   r   rT  r   r   r   r   rD   r   r   r   r      unsupported image size: 28,14,732,16,8,rd  re  r   rg  r   rf  rh  r   r  ri  rk  r   Fr  r   r   r  rm  rK   r   Tr  r	  r   r#   r$  ry  r   r[  r`   	rd  re  r   
base_widthrk  ri  attention_dsrh  ress	            r   UNetBigr  $  sO    S&	r	#	r	#	r	#3J<@AALR ) )$**3/ 4J#c(234   " 	
 " $L1  "        "  !" !%# r   c                 f   | dk(  rd}n&| dk(  rd}n| dk(  rd}n| dk(  rd}nt        d|        g }| dk(  rd	}nd
}|j                  d      D ]  }|j                  | t        |      z         ! t	        d#i d| d|d|d|dddt        |      ddd|d|dddddddddddd d!d d"d S )$Nrb  r  rT  r  rD   r  r  r  r  r  r  rd  re  rf  r   rg  r   rh  r   r  ri  rk  r   Fr  r   r   r  rm  rK   r   Tr  r	  r   r  r   s	            r   UNetr  S  sO    S&	r	#	r	#	r	#3J<@AALR ) )$**3/ 4J#c(234   " "	
  $L1  "        "  !" !%# r   c                 l   | dk(  rd}n&| dk(  rd}n| dk(  rd}n| dk(  rd}nt        d|        g }| dk(  rd	}nd
}|j                  d      D ]  }|j                  | t        |      z         ! t	        d$i d| d|d|d|dddt        |      ddddd|d|ddddddddddd d!d"d!d#d!S )%Nrb  r  rT  r  rD   r  r  r  r  r  r  rd  re  rf  r   rg  r   rh  r  r   r  ri  rk  r   Fr  r   r   r  rm  rK   r   Tr  r	  r   r  r   s	            r   	UNetSmallr    sY    S&	r	#	r	#	r	#3J<@AALR ) )$**3/ 4J#c(234   " "	
  $L1   "         "!" #$ !%% r   )gGz?)i'  )r   r   rc  N)r   r   rT  N)r   r   rD   N)2abcr   rP   numpyr  r   rN   torch.nnr   torch.nn.functional
functionalr   r  einopsr   r   r   r  r   	GroupNormr   r(   r+   r0   r<   rB   rF   r^   rh   ry   Functionra   Moduler   r   r   r   r   r   r   r   r   r  r  r   r   r[  r  r  r  r   r   r   <module>r     s            #  8",, 8

8&
8
;%*$*-- *>bii @	BII 	ebmm] e&ryy D <+299 +*D+} D+L[RYY [|5(. .>4299 4l.299 .F@
D		 @
DJ ,b ,b -r   