
    ܱi]                        d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZmZ d dlmZmZmZ d dlmZmZ d d	lmZ d d
lmZmZmZ  ej        e           Z!e G d de                      Z" G d deee          Z#dS )    )	dataclass)DictOptionalTupleUnionN)ConfigMixinregister_to_config)UNet2DConditionLoadersMixin)
BaseOutputlogging)CROSS_ATTENTION_PROCESSORSAttentionProcessorAttnProcessor)TimestepEmbedding	Timesteps)
ModelMixin)UNetMidBlockSpatioTemporalget_down_blockget_up_blockc                   ,    e Zd ZU dZdZej        ed<   dS )!UNetSpatioTemporalConditionOutputa  
    The output of [`UNetSpatioTemporalConditionModel`].

    Args:
        sample (`torch.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    Nsample)__name__
__module____qualname____doc__r   torchTensor__annotations__     H/data/cameron/vidgen/Ctrl-World/models/unet_spatio_temporal_condition.pyr   r      s0            FELr!   r   c                   &    e Zd ZdZdZe	 	 	 	 	 	 	 	 	 	 	 	 	 d.dee         dededee	         dee	         dee         dedede
eee         f         de
eee         f         de
eee         ee         f         de
eee         f         def fd            Zedee	ef         fd            Zd e
eee	ef         f         fd!Zd" Zd/d$ee         d%eddfd&Z	 	 d0d(ej        d)e
ej        eef         d*ej        d+ej        d,ede
eef         fd-Z xZS )1 UNetSpatioTemporalConditionModela	  
    A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and
    returns a sample shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        addition_time_embed_dim: (`int`, defaults to 256):
            Dimension to to encode the additional time ids.
        projection_class_embeddings_input_dim (`int`, defaults to 768):
            The dimension of the projection of encoded `added_time_ids`.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
            The dimension of the cross attention features.
        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unets.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
            [`~models.unets.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
            [`~models.unets.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
        num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
            The number of attention heads.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    TN       CrossAttnDownBlockSpatioTemporalr(   r(   DownBlockSpatioTemporalUpBlockSpatioTemporalCrossAttnUpBlockSpatioTemporalr,   r,   i@  i     r.                     
      r7      sample_sizein_channelsout_channelsdown_block_typesup_block_typesblock_out_channelsaddition_time_embed_dim%projection_class_embeddings_input_dimlayers_per_blockcross_attention_dimtransformer_layers_per_blocknum_attention_heads
num_framesc                 
   t                                                       || _        t          |          t          |          k    rt	          d| d| d          t          |          t          |          k    rt	          d| d| d          t          |t                    s6t          |          t          |          k    rt	          d| d| d          t          |
t                    r6t          |
          t          |          k    rt	          d|
 d| d          t          |	t                    s6t          |	          t          |          k    rt	          d|	 d| d          t          j	        ||d	         d
d          | _
        |d	         dz  }t          |d	         dd	          | _        |d	         }t          ||          | _        t          |dd	          | _        t          ||          | _        t          j        g           | _        t          j        g           | _        t          |t                    r|ft          |          z  }t          |
t                    r|
ft          |          z  }
t          |	t                    r|	gt          |          z  }	t          |t                    r|gt          |          z  }|}|d	         }t)          |          D ]r\  }}|}||         }|t          |          dz
  k    }t+          ||	|         ||         |||| d|
|         ||         d          }| j                            |           st/          |d         ||d         |
d         |d                   | _        d	| _        t          t5          |                    }t          t5          |                    }t          t5          |	                    }t          t5          |
                    }t          t5          |                    }|d	         }t)          |          D ]\  }}|t          |          dz
  k    }|}||         }|t7          |dz   t          |          dz
                     }|sd}| xj        dz  c_        nd}t9          |||         dz   ||         |||||d|||         ||         d          }| j                            |           |}t          j        |d	         dd          | _        t          j                    | _         t          j	        |d	         |d
d          | _!        d S )Nz\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: .zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: zdMust provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: z^Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: r      r3   )kernel_sizepaddingr&   T)downscale_freq_shiftgh㈵>silu)

num_layersrC   r:   r;   temb_channelsadd_downsample
resnet_epsrB   rD   resnet_act_fn)rN   rC   rB   rD   F)rM   rC   r:   r;   prev_output_channelrN   add_upsamplerP   resolution_idxrB   rD   rQ       )num_channels
num_groupseps)"super__init__r9   len
ValueError
isinstanceintlistnnConv2dconv_inr   	time_projr   time_embeddingadd_time_projadd_embedding
ModuleListdown_blocks	up_blocks	enumerater   appendr   	mid_blocknum_upsamplersreversedminr   	GroupNormconv_norm_outSiLUconv_actconv_out)!selfr9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   time_embed_dimtimestep_input_dimblocks_time_embed_dimoutput_channelidown_block_typeinput_channelis_final_block
down_blockreversed_block_out_channelsreversed_num_attention_headsreversed_layers_per_blockreversed_cross_attention_dim%reversed_transformer_layers_per_blockup_block_typerS   rT   up_block	__class__s!                                   r"   r[   z)UNetSpatioTemporalConditionModel.__init__F   s   6 	&   C$7$777 go  g  g  Vd  g  g  g   !""c*:&;&;;; s  vH  s  s  `p  s  s  s   -s33 	<O8P8PTWXhTiTi8i8i v  xK  v  v  cs  v  v  v   )400 	S9L5M5MQTUeQfQf5f5f v  xK  v  v  cs  v  v  v   *C00 	S9I5J5JcRbNcNc5c5c m  rB  m  m  Zj  m  m  m  
 yq!	
 
 
 ,A.2"#5a#8$UVWWW/2/0BNSS&'>[\]]]./TVdee=,,r**)3// 	Q#6"83?O;P;P"P)3// 	Q#6"83?O;P;P"P&,, 	J 01C8H4I4II2C88 	b,H+ICP`LaLa+a( . ,A."+,<"="= 	0 	0A*M/2N#&8"9"9A"==N'+A.-I!-L)+3#11$7$:$7$:$  J ##J//// 4r"/)Eb)I 3B 7 3B 7
 
 
   '+84F+G+G&H&H#'+H5H,I,I'J'J$$(2B)C)C$D$D!'+H5H,I,I'J'J$04X>Z5[5[0\0\-4Q7 ). 9 9 	1 	1A}#&8"9"9A"==N"08;N7AE3GYCZCZ]^C^8_8_`M " %###q(###$#4Q7!;-RST-U)+$73) $@$C$@$C$  H N!!(+++"0  \7I!7LY[aefff			q!	
 
 
r!   returnc                     i }dt           dt          j        j        dt          t           t
          f         ffd|                                 D ]\  }} |||           |S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namemodule
processorsc                     t          |d          r|                                ||  d<   |                                D ]\  }} |  d| ||           |S )Nget_processor
.processorrG   )hasattrr   named_children)r   r   r   sub_namechildfn_recursive_add_processorss        r"   r   zUUNetSpatioTemporalConditionModel.attn_processors.<locals>.fn_recursive_add_processors  s    
 v// I282F2F2H2H
d.../#)#8#8#:#: U U%++t,@,@h,@,@%TTTTr!   )strr   ra   Moduler   r   r   )rv   r   r   r   r   s       @r"   attn_processorsz0UNetSpatioTemporalConditionModel.attn_processors   s     
		HO	 S"445	 	 	 	 	 	 !//11 	B 	BLD&''fjAAAAr!   	processorc           	      z   t          | j                                                  }t          |t                    r9t          |          |k    r&t          dt          |           d| d| d          dt          dt          j        j	        ffd| 
                                D ]\  }} |||           dS )	a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                 ,   t          |d          rVt          |t                    s|                    |           n+|                    |                    |  d                     |                                D ]\  }} |  d| ||           d S )Nset_processorr   rG   )r   r^   dictr   popr   )r   r   r   r   r   fn_recursive_attn_processors        r"   r   zXUNetSpatioTemporalConditionModel.set_attn_processor.<locals>.fn_recursive_attn_processor)  s    v// M!)T22 M((3333(($7J7J7J)K)KLLL#)#8#8#:#: T T%++t,@,@h,@,@%SSSST Tr!   N)r\   r   keysr^   r   r]   r   r   ra   r   r   )rv   r   countr   r   r   s        @r"   set_attn_processorz3UNetSpatioTemporalConditionModel.set_attn_processor  s    D(--//00i&& 	3y>>U+B+BmQTU^Q_Q_ m m05m mRWm m m  
	Tc 	T58? 	T 	T 	T 	T 	T 	T !//11 	A 	ALD&''fi@@@@	A 	Ar!   c           	      4   t          d | j                                        D                       rt                      }nCt	          dt          t          | j                                                                       |                     |           dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c              3   2   K   | ]}|j         t          v V  d S )N)r   r   ).0procs     r"   	<genexpr>zNUNetSpatioTemporalConditionModel.set_default_attn_processor.<locals>.<genexpr>:  s*      fft~!;;ffffffr!   zOCannot call `set_default_attn_processor` when attention processors are of type N)allr   valuesr   r]   nextiterr   )rv   r   s     r"   set_default_attn_processorz;UNetSpatioTemporalConditionModel.set_default_attn_processor6  s     ffH\HcHcHeHefffff 	%II Nbfgklp  mA  mH  mH  mJ  mJ  hK  hK  cL  cL  N  N   		*****r!   r   
chunk_sizedimc                     |dvrt          d|           |pd}dt          j        j        dt          dt          ffd|                                 D ]} |||           dS )	aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r3   z-Make sure to set `dim` to either 0 or 1, not r3   r   r   r   c                     t          | d          r|                     ||           |                                 D ]} |||           d S )Nset_chunk_feed_forward)r   r   )r   r   children)r   r   r   r   fn_recursive_feed_forwards       r"   r   z[UNetSpatioTemporalConditionModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardW  so    v788 N---MMM** B B))%SAAAAB Br!   N)r]   r   ra   r   r_   r   )rv   r   r   r   r   s       @r"   enable_forward_chunkingz8UNetSpatioTemporalConditionModel.enable_forward_chunkingD  s     fRSRRSSS  _1
	Beho 	B3 	BUX 	B 	B 	B 	B 	B 	B mmoo 	? 	?F%%fj#>>>>	? 	?r!   Fr   timestepencoder_hidden_statesadded_time_idsreturn_dictc           	         d| j         z  d}d}t          fd|j        dd         D                       rt                              d           d}|}	t          j        |	          s|j        j        dk    }
|j        j        d	k    }t          |t                    r|
s|rt
          j        nt
          j        }n|
s|rt
          j        nt
          j        }t          j        |	g||j        
          }	n8t!          |	j                  dk    r |	d                             |j                  }	|j        dd         \  }}|	                    |          }	|                     |	          }|                    |j                  }|                     |          }|                     |                                          }|                    |df          }|                    |j                  }|                     |          }||z   }|                    dd          }|                    |d|j        d         |z            }|s|                    |d          }n%|                    ||z  d|j        d                   }|                     |          }t          j        |||j        |j        
          }|f}| j        D ]B}t=          |d          r|j        r |||||          \  }}n ||||          \  }}||z  }C|                      ||||          }tC          | j"                  D ]\  }}|t!          | j"                  dz
  k    }|t!          |j#                   d         }|dt!          |j#                            }|s|r|d         j        dd         }t=          |d          r|j        r |||||||          } ||||||          }| $                    |          }| %                    |          }| &                    |          } |j        ||g|j        dd         R  }|s|fS tO          |          S )a  
        The [`UNetSpatioTemporalConditionModel`] forward method.

        Args:
            sample (`torch.Tensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.Tensor`):
                The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
            added_time_ids: (`torch.Tensor`):
                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
                embeddings and added to the time embeddings.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead
                of a plain tuple.
        Returns:
            [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is
                returned, otherwise a `tuple` is returned where the first element is the sample tensor.
        r1   FNc              3   *   K   | ]}|z  d k    V  dS )r   Nr    )r   sdefault_overall_up_factors     r"   r   z;UNetSpatioTemporalConditionModel.forward.<locals>.<genexpr>  s.      MMaq,,1MMMMMMr!   z9Forward upsample size to force interpolation output size.Tmpsnpu)dtypedevicer   )r   rR   r3   )r   output_size)r   has_cross_attention)hidden_statestembr   image_only_indicator)r   r   r   )r   r   res_hidden_states_tupler   upsample_sizer   )r   r   r   r   r   )r   )(rn   anyshapeloggerinfor   	is_tensorr   typer^   floatfloat32float64int32int64tensorr\   toexpandrd   r   re   rf   flattenreshaperg   repeat_interleaverc   zerosri   r   r   rm   rk   rj   resnetsrr   rt   ru   r   )rv   r   r   r   r   r   frame_level_condforward_upsample_sizer   	timestepsis_mpsis_npur   
batch_sizerE   t_embembtime_embedsaug_embr   down_block_res_samplesdownsample_blockres_samplesr{   upsample_blockr~   r   s                             @r"   forwardz(UNetSpatioTemporalConditionModel.forwarda  s   B %&t':$:! !&MMMM6<;LMMMMM 	)KKSTTT$(! 	y)) 	: ]'50F]'50F(E** K*0NFN(.J&Ju{i[fmTTTII!!Q&&!$**6=99I "(bqb!1
J$$Z00	y))
 v|,,!!%(((()?)?)A)ABB!)):r*:;;!nnSY//$$[11Gm 1%% ##JA39Q<R\C\#]]   	@$9$K$KJ\]$K$^$^!!$9$A$A*zBY[]_t_z{}_~$$! f%%${:z^d^klll"( $ 0 	2 	2')>?? DTDh &6&6"(*?)=	' ' '# '7&6"()=' ' '# #k1""  "7!5	   
 
 "+4>!:!: 	 	A~#dn"5"5"99N0#n6L2M2M1M1O1OPK%;<Zs>CY?Z?Z>Z<Z%[" " E&; E 6r : @ D~'<== .Bd '"(,7*?"/)=   ("(,7"/)=   ##F++v&&v&&  
JJabb9IJJJ 	90????r!   )Nr%   r&   r'   r*   r-   r/   r0   r1   r2   r3   r4   r8   )Nr   )TF)r   r   r   r    _supports_gradient_checkpointingr	   r   r_   r   r   r   r[   propertyr   r   r   r   r   r   r   r   r   boolr   r   __classcell__)r   s   @r"   r$   r$       s       ! !F (,$ &*(
&
 *@'*58346:MN6E1o
 o
c]o
 o
 	o

  *o
 c
o
" "#J#o
$ "%%o
& 03'o
(  U3Z0)o
* #3c
?3+o
, ',CsU5\,I&J-o
. #3c
?3/o
0 1o
 o
 o
 o
 o
 o
b c+=&=!>    X6 AE2Dd3PbKbFc2c,d  A  A  A  AD+ + +? ?(3- ?S ?Y] ? ? ? ?F !i@ i@i@ eS01i@  %|	i@
 i@ i@ 
0%7	8i@ i@ i@ i@ i@ i@ i@ i@r!   r$   )$dataclassesr   typingr   r   r   r   r   torch.nnra   diffusers.configuration_utilsr   r	   diffusers.loadersr
   diffusers.utilsr   r   $diffusers.models.attention_processorr   r   r   diffusers.models.embeddingsr   r   diffusers.models.modeling_utilsr   %diffusers.models.unets.unet_3d_blocksr   r   r   
get_loggerr   r   r   r$   r    r!   r"   <module>r      s   ! ! ! ! ! ! / / / / / / / / / / / /        I I I I I I I I 9 9 9 9 9 9 / / / / / / / / n n n n n n n n n n D D D D D D D D 6 6 6 6 6 6 j j j j j j j j j j 
	H	%	% 	  	  	  	  	 
 	  	  	 j@ j@ j@ j@ j@z;@[ j@ j@ j@ j@ j@r!   