o
    2AiV                     @   s   d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZ d dlmZmZ d dlmZmZmZ d dlmZmZ d d	lmZ d d
lmZmZmZ d dlm Z  e!e"Z#eG dd deZ$G dd deeee Z%dS )    )	dataclass)DictOptionalTupleUnionN)ConfigMixinregister_to_config)UNet2DConditionLoadersMixin)
BaseOutputlogging)CROSS_ATTENTION_PROCESSORSAttentionProcessorAttnProcessor)TimestepEmbedding	Timesteps)
ModelMixin)get_down_blockget_up_blockUNetMidBlockSpatioTemporal)PeftAdapterMixinc                   @   s    e Zd ZU dZdZejed< dS )!UNetSpatioTemporalConditionOutputa!  
    The output of [`UNetSpatioTemporalConditionModel`].

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    Nsample)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__ r   r   G/data/cameron/vidgen/svd_motion_lora/Motion-LoRA/svd/models/svd_unet.pyr      s   
 r   c                       sf  e Zd ZdZdZe									
					d6dee dededee	 dee	 dee dedede
eee f de
eee f de
eee ee f de
eee f def fddZedee	ef fd d!Zd"e
eee	ef f fd#d$Zd%d& Zd7d(d)Zd8d+ee d,eddfd-d.Z	d9d/ejd0e
ejeef d1ejd2ejd3ede
eef fd4d5Z  ZS ): UNetSpatioTemporalConditionModela	  
    A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and returns a sample
    shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
    for all models (such as downloading or saving).

    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
            The tuple of downsample blocks to use.
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        addition_time_embed_dim: (`int`, defaults to 256):
            Dimension to to encode the additional time ids.
        projection_class_embeddings_input_dim (`int`, defaults to 768):
            The dimension of the projection of encoded `added_time_ids`.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
            The dimension of the cross attention features.
        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
            [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
        num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
            The number of attention heads.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    TN       CrossAttnDownBlockSpatioTemporalr%   r%   DownBlockSpatioTemporalUpBlockSpatioTemporalCrossAttnUpBlockSpatioTemporalr)   r)   i@  i     r+                     
      r4      sample_sizein_channelsout_channelsdown_block_typesup_block_typesblock_out_channelsaddition_time_embed_dim%projection_class_embeddings_input_dimlayers_per_blockcross_attention_dimtransformer_layers_per_blocknum_attention_heads
num_framesc                     s  t    || _t|t|krtd| d| dt|t|kr.td| d| dt|tsFt|t|krFtd| d| dt|
tr^t|
t|kr^td|
 d| dt|	tsvt|	t|krvtd|	 d| dtj	||d	 d
dd| _
|d	 d }t|d	 dd	d| _|d	 }t||| _t|dd	d| _t||| _tg | _tg | _t|tr|ft| }t|
tr|
ft| }
t|	tr|	gt| }	t|tr|gt| }|}|d	 }t|D ]0\}}|}|| }|t|d k}t||	| || |||| d|
| || dd}| j| qt|d ||d |
d |d d| _d	| _tt|}tt|}tt|	}tt|
}tt|}|d	 }t|D ]R\}}|t|d k}|}|| }|t|d t|d  }|sd}|  jd7  _nd}t||| d || |||||d||| || dd}| j| |}q\tj|d	 ddd| _t | _ tj	|d	 |d
dd| _!d S )Nz\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: .zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: zdMust provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: z^Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: r      r0   )kernel_sizepaddingr#   T)downscale_freq_shiftgh㈵>silu)

num_layersr@   r7   r8   temb_channelsadd_downsample
resnet_epsr?   rA   resnet_act_fn)rJ   r@   r?   rA   F)rI   r@   r7   r8   prev_output_channelrJ   add_upsamplerL   resolution_idxr?   rA   rM       )num_channels
num_groupseps)"super__init__r6   len
ValueError
isinstanceintlistnnConv2dconv_inr   	time_projr   time_embeddingadd_time_projadd_embedding
ModuleListdown_blocks	up_blocks	enumerater   appendr   	mid_blocknum_upsamplersreversedminr   	GroupNormconv_norm_outSiLUconv_actconv_out) selfr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   time_embed_dimtimestep_input_dimblocks_time_embed_dimoutput_channelidown_block_typeinput_channelis_final_block
down_blockreversed_block_out_channelsreversed_num_attention_headsreversed_layers_per_blockreversed_cross_attention_dim%reversed_transformer_layers_per_blockup_block_typerO   rP   up_block	__class__r   r    rW   F   s   




	

z)UNetSpatioTemporalConditionModel.__init__returnc                    sL   i }dt dtjjdtt tf f fdd |  D ]
\}} ||| q|S )z
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        namemodule
processorsc                    sL   t |dr|jdd||  d< | D ]\}} |  d| || q|S )Nget_processorT)return_deprecated_lora
.processorrC   )hasattrr   named_children)r   r   r   sub_namechildfn_recursive_add_processorsr   r    r     s
   
zUUNetSpatioTemporalConditionModel.attn_processors.<locals>.fn_recursive_add_processors)strr   r]   Moduler   r   r   )rr   r   r   r   r   r   r    attn_processors   s   
z0UNetSpatioTemporalConditionModel.attn_processors	processorc                    s   t | j }t|tr"t ||kr"tdt | d| d| ddtdtjj	f fdd | 
 D ]
\}} ||| q3d	S )
a4  
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                    sb   t |drt|ts|| n|||  d | D ]\}} |  d| || qd S )Nset_processorr   rC   )r   rZ   dictr   popr   )r   r   r   r   r   fn_recursive_attn_processorr   r    r   )  s   

zXUNetSpatioTemporalConditionModel.set_attn_processor.<locals>.fn_recursive_attn_processorN)rX   r   keysrZ   r   rY   r   r   r]   r   r   )rr   r   countr   r   r   r   r    set_attn_processor  s   
z3UNetSpatioTemporalConditionModel.set_attn_processorc                 C   sJ   t dd | j D rt }ntdtt| j  | | dS )ze
        Disables custom attention processors and sets the default attention implementation.
        c                 s   s    | ]}|j tv V  qd S )N)r   r   ).0procr   r   r    	<genexpr>:  s    zNUNetSpatioTemporalConditionModel.set_default_attn_processor.<locals>.<genexpr>zOCannot call `set_default_attn_processor` when attention processors are of type N)allr   valuesr   rY   nextiterr   )rr   r   r   r   r    set_default_attn_processor6  s   z;UNetSpatioTemporalConditionModel.set_default_attn_processorFc                 C   s   t |dr
||_d S d S )Ngradient_checkpointing)r   r   )rr   r   valuer   r   r    _set_gradient_checkpointingC  s   

z<UNetSpatioTemporalConditionModel._set_gradient_checkpointingr   
chunk_sizedimc                    sZ   |dvrt d| |pd}dtjjdtdtf fdd |  D ]} ||| q"d	S )
aX  
        Sets the attention processor to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        )r   r0   z-Make sure to set `dim` to either 0 or 1, not r0   r   r   r   c                    s6   t | dr| j||d |  D ]} ||| qd S )Nset_chunk_feed_forward)r   r   )r   r   children)r   r   r   r   fn_recursive_feed_forwardr   r    r   [  s
   
z[UNetSpatioTemporalConditionModel.enable_forward_chunking.<locals>.fn_recursive_feed_forwardN)rY   r   r]   r   r[   r   )rr   r   r   r   r   r   r    enable_forward_chunkingH  s   z8UNetSpatioTemporalConditionModel.enable_forward_chunkingr   timestepencoder_hidden_statesadded_time_idsreturn_dictc                 C   s`  |}t |s.|jjdk}t|tr|rt jnt j}n|r t jnt j	}t j
|g||jd}nt|jdkr=|d |j}|jdd \}	}
||	}| |}|j|jd}| |}| | }||	df}||j}| |}|| }|dd}|j|
dd	}|j|
dd	}| |}t j|	|
|j|jd}|f}| jD ]"}t|d
r|jr|||||d\}}n	||||d\}}||7 }q| j||||d}t| jD ]2\}}|t|j d }|dt|j  }t|d
r|jr||||||d}q|||||d}q|  |}| !|}| "|}|j|	|
g|jdd R  }|s+|fS t#|dS )a  
        The [`UNetSpatioTemporalConditionModel`] forward method.

        Args:
            sample (`torch.FloatTensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
            added_time_ids: (`torch.FloatTensor`):
                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
                embeddings and added to the time embeddings.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead of a plain
                tuple.
        Returns:
            [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, otherwise
                a `tuple` is returned where the first element is the sample tensor.
        mps)dtypedevicer   Nr.   )r   rN   r0   )r   has_cross_attention)hidden_statestembr   image_only_indicator)r   r   r   )r   r   res_hidden_states_tupler   r   )r   r   r   r   )r   )$r   	is_tensorr   typerZ   floatfloat32float64int32int64tensorrX   shapetoexpandr`   r   ra   rb   flattenreshaperc   repeat_interleaver_   zerosre   r   r   ri   rg   rf   resnetsrn   rp   rq   r   )rr   r   r   r   r   r   	timestepsis_mpsr   
batch_sizerB   t_embembtime_embedsaug_embr   down_block_res_samplesdownsample_blockres_samplesrw   upsample_blockr   r   r    forwarde  s   













z(UNetSpatioTemporalConditionModel.forward)Nr"   r#   r$   r'   r*   r,   r-   r.   r/   r0   r1   r5   )F)Nr   )T)r   r   r   r    _supports_gradient_checkpointingr   r   r[   r   r   r   rW   propertyr   r   r   r   r   r   r   r   r   Tensorr   boolr   r   __classcell__r   r   r   r    r!   !   s    " 2"
#
r!   )&dataclassesr   typingr   r   r   r   r   torch.nnr]   Zdiffusers.configuration_utilsr   r   Zdiffusers.loadersr	   diffusers.utilsr
   r   Z$diffusers.models.attention_processorr   r   r   Zdiffusers.models.embeddingsr   r   Zdiffusers.models.modeling_utilsr   Zsvd.models.svd_unet_blocksr   r   r   Z	LoRA.peftr   
get_loggerr   loggerr   r!   r   r   r   r    <module>   s     
