
    ܱi#                        d dl mZ d dlmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlZd dlmZ d dlZd dlZd dlmZ d dlmZ d dlZd dlmZmZ d dlZd dlZd dlZdd
Zd Zd Z G d de
j                  Z G d de
j                  Z dS )    )StableVideoDiffusionPipeline)CtrlWorldDiffusionPipeline) UNetSpatioTemporalConditionModelN)Accelerator)
get_logger)tqdm)VideoReadercpuFc                    t          j        |t           j                  }t          j        |t           j                  }t          j        ||          }t          j        |d          }|                    dd||g          }t          | |          }|r2|dk    r,t          j        t          j        || g          |gd          }|S )z
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    dtyper   axis      )	nparangefloat32meshgridstackreshape!get_2d_sincos_pos_embed_from_gridconcatenatezeros)	embed_dim	grid_size	cls_tokenextra_tokensgrid_hgrid_wgrid	pos_embeds           4/data/cameron/vidgen/Ctrl-World/models/ctrl_world.pyget_2d_sincos_pos_embedr$      s     Yy
333FYy
333F;vv&&D8Dq!!!D<<Ay)455D1)TBBI ]\A%%NBHlI-F$G$G#SZ[\\\	    c                     | dz  dk    sJ t          | dz  |d                   }t          | dz  |d                   }t          j        ||gd          }|S )Nr   r   r   r   )!get_1d_sincos_pos_embed_from_gridr   r   )r   r!   emb_hemb_wembs        r#   r   r   (   sd    q=A .i1nd1gFFE-i1nd1gFFE
.%a
0
0
0CJr%   c                 X   | dz  dk    sJ t          j        | dz  t           j                  }|| dz  z  }dd|z  z  }|                    d          }t          j        d||          }t          j        |          }t          j        |          }t          j        ||gd	
          }|S )z}
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    r   r   r   g       @g      ?i'  zm,d->mdr   r   )r   r   float64r   einsumsincosr   )r   posomegaoutemb_sinemb_cosr*   s          r#   r'   r'   3   s     q=AIi1nBJ777E	Y^EE
++b//C
)IsE
*
*CfSkkGfSkkG
.'7+!
4
4
4CJr%   c                   (     e Zd Zd fd	ZddZ xZS )Action_encoder2Tc           
      V   t                                                       || _        || _        || _        || _        t          |          }t          j        t          j	        |d          t          j
                    t          j	        dd          t          j
                    t          j	        dd                    | _        t          j                            | j        d         j        dd           t          j                            | j        d         j        dd           d S )N   r   fan_inrelu)modenonlinearityr   )super__init__
action_dim
action_numhidden_size	text_condintnn
SequentialLinearSiLUaction_encodeinitkaiming_normal_weight)selfr@   rA   rB   rC   	input_dim	__class__s         r#   r?   zAction_encoder2.__init__H   s    $$&"
OO	]Ii&&GIIIdD!!GIIIdD!!
 
 	 21 5 <8Z`aaa
 21 5 <8Z`aaaaar%   Nc                    |j         \  }}}|st          j        |d          }|                     |          }|| j        r}t          j                    5   ||ddd                              |j                  }	 |d	i |	}
|
j	        }t          j
        |dd          }d d d            n# 1 swxY w Y   ||z   }|S )
Nzb t d -> b 1 (t d)
max_lengthptT)paddingreturn_tensors
truncationzb c -> b 1 (n c)r   )n )shapeeinops	rearrangerI   rC   torchno_gradtodevicetext_embedsrepeat)rM   actiontextstext_tokinizertext_encoderframe_level_condBTDinputsoutputshidden_texts               r#   forwardzAction_encoder2.forward[   s8   !A 	D%f.BCCF##F++ R R'|TXeijjjmmnz  oB  C  C&,0000%1$mK9KqQQQ	R R R R R R R R R R R R R R R k)Fs   AB..B25B2)T)NNNT__name__
__module____qualname__r?   rl   __classcell__rO   s   @r#   r7   r7   G   sW        b b b b b b&       r%   r7   c                   $     e Zd Z fdZd Z xZS )	CrtlWorldc                    t          t          |                                            || _        t	          j        |j                  | _        t          d           t                      }|
                    | j        j                                        d           || j        _        | j        j        | _        | j        j        | _        | j        j        | _        | j        j        | _        | j                            d           | j                            d           | j                            d           | j                                         ddlm}m} |                    |j                  | _        |                    |j        d          | _        | j                            d           t1          |j        t5          |j        |j        z             d|j        	          | _        d S )
NzBreplace the unet to support action condition and frame_level pose!F)strictTr   )AutoTokenizerCLIPTextModelWithProjection)use_fastr9   )r@   rA   rB   rC   )r>   rt   r?   argsr   from_pretrainedsvd_model_pathpipelineprintr   load_state_dictunet
state_dictvaeimage_encoder	schedulerrequires_grad_enable_gradient_checkpointingtransformersrw   rx   clip_model_pathrd   	tokenizerr7   r@   rD   num_history
num_framesrC   action_encoder)rM   rz   r   rw   rx   rO   s        r#   r?   zCrtlWorld.__init__o   s   i'')))	 5DTEXYYRSSS/11T]/::<<UKKK!M&	=$!]80 	&&&))%000	  &&&	//111 	LKKKKKKK7GGH\]]&66t7KUZ6[[((/// .UXY]YijnjyYyUzUz  IM  Y]  Yg  h  h  hr%   c           	         |d         }|d         }| j         j        }| j         j        }d}d}d}| j        j        }	|                    |          }|d d |	|	dz   f         }
|j        d d         \  }}|
d d df         }
t          j        |dddg|	          d
z  }d|dz  dz   dz  z  }||
t          j	        |
          |z  z   z  }
t          j        |
d|          }| j        j        rd|d d d |	f<   |d         }|                    |          }|                     ||| j        | j        | j        j                  }t          j        |          }t          j        |j        d         |	          dk                        d                              d          }||z  || z  z   }t          j        |ddddg|	          }||z  |z                                   }d|dz  dz   z  }| |dz  dz   dz  z  }d|dz  dz   dz  z  }|                                dz                      |g          }|dz  dz   |dz  z  }|t          j	        |          |z  z   }t          j        ||	dddg|	          dz  }|d d d |	f         }d|dz  dz   dz  z  ||t          j	        |          z  z   z  }t          j        |||d d |	d f         z  gd          }t          j        ||| j        j        j        z  gd          }| j        j        }| j        j        }| j                            ||||j        |dd          } |                     |          } d}!|                      |||| | j        j                  j        }"||"z  ||z  z   }#|!|#d d |	d f         |d d |	d f         z
  dz  |z                                   z  }!|!t          j!        d||          fS )Nlatenttextgffffff?g?g        r   r   r   )r^   g?g      ?zb c h w -> b f c h w)fra   )re   g?   g333333?)dimF)encoder_hidden_statesadded_time_idsre   )r^   r   )"r   r   r^   rz   r   r]   rX   r[   rand
randn_likerY   r`   his_cond_zeror   r   rd   re   
zeros_like	unsqueezerandnexplogr   catr   configscaling_factormotion_bucket_idfpsr}   _get_add_time_idssamplemeantensor)$rM   batchlatentsrb   r   r^   P_meanP_stdnoise_aug_strengthr   current_imgbszr   sigmac_incondition_latentra   action_hiddenuncond_hidden_states	text_mask
rnd_normalc_skipc_outc_noiseloss_weightnoisy_latentssigma_hhistorynoisy_historyinput_latentsr   r   r   loss
model_pred
predict_x0s$                                       r#   rl   zCrtlWorld.forward   s   /f	! y,**V$$ aaa[] ;;< rr*J!!!!A#&
CAq>&999C?E1HqLS((K%*:;*G*G%*OOP!=6LPZ[[[9" 	403QQQ_- x6""++FE4>4K\osox  pJ+  K  K  %/>>Z 3A 6vFFFtKVVWXYYccdeff	%i/0Dyj0QQ [#q!Q!26BBB
e#f,1133eQhl#5!8a<C//E1HqLS((99;;?++SE22zA~!3 5#3G#<#<u#DD +sKAq9&IIICO!!!L[L.)7A:a<#--'EDTU\D]D]:]0]^	=$}QQQ{||^7T2T"U[\]]] 	=2B48?Ca2a"bhijjj95im88>NPbdqdwy|~  BG  H  H'**622 YY}g]ky  LP  LU  LfY  g  g  n
Z'&=*@@
 	*QQQ{||^,wqqq~/FFJ[X^^```U\#f5AAAAAr%   rm   rr   s   @r#   rt   rt   n   s[        h h h h hFAB AB AB AB AB AB ABr%   rt   )Fr   )!&models.pipeline_stable_video_diffusionr   models.pipeline_ctrl_worldr   %models.unet_spatio_temporal_conditionr   numpyr   r[   torch.nnrE   rY   
accelerater   datetimeosaccelerate.loggingr   	tqdm.autor   jsondecordr	   r
   wandbswanlabmediapyr$   r   r'   Moduler7   rt   rW   r%   r#   <module>r      s   O O O O O O A A A A A A R R R R R R             " " " " " "  				 ) ) ) ) ) )        # # # # # # # #      $    ($ $ $ $ $bi $ $ $NeB eB eB eB eB	 eB eB eB eB eBr%   