o
     ݱi=                     @   s   d dl Z d dlmZ d dlm  mZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZmZ ddlmZ G dd	 d	ejZG d
d dejZG dd dejZG dd dejZdS )    N)DDIMScheduler)	rearrange)AdaLNAttentionBlockAdaLNFinalLayer)SinusoidalPosEmbinit_weights   )UWMObservationEncoderc                       sp   e Zd Z				ddedeedf deedf d	ed
ef
 fddZdd Zdd Zdd Ze	dd Z
  ZS )MultiViewVideoPatchifier      r      r   r         	num_viewsinput_shape.patch_shape	num_chans	embed_dimc                    s   t    || _|\}}}|\}	}
}||	 ||
 || | _| _| _|	|
|| _| _| _t	j
||||d| _t	|||	 |
 | | _d S )N)in_channelsout_channelskernel_sizestride)super__init__r   THWpTpHpWnnConv3dpatch_encoderLinearpatch_decoder)selfr   r   r   r   r   ZiTiHiWr    r!   r"   	__class__ :/data/cameron/vidgen/unified-world-model/models/uwm/uwm.pyr      s   


"z!MultiViewVideoPatchifier.__init__c                 C   
   |  |S N)patchify)r(   imgsr-   r-   r.   forward%   s   
z MultiViewVideoPatchifier.forwardc                 C   s(   t |d}| |}t |d| jd}|S )Nzb v c t h w -> (b v) c t h wz(b v) c t h w -> b (v t h w) c)v)r   r%   r   )r(   r2   featsr-   r-   r.   r1   (   s   

z!MultiViewVideoPatchifier.patchifyc                 C   s6   |  |}t|d| j| j| j| j| j| j| jd	}|S )Nz6b (v t h w) (c pt ph pw) -> b v c (t pt) (h ph) (w pw))r4   thwptphpw)	r'   r   r   r   r   r   r    r!   r"   )r(   r5   r2   r-   r-   r.   
unpatchify.   s   
z#MultiViewVideoPatchifier.unpatchifyc                 C   s   | j | j | j | j S r0   )r   r   r   r   )r(   r-   r-   r.   num_patches=   s   z$MultiViewVideoPatchifier.num_patches)r   r   r   r   )__name__
__module____qualname__inttupler   r3   r1   r<   propertyr=   __classcell__r-   r-   r+   r.   r
      s*    

r
   c                       s0   e Zd Zd	dedef fddZdd Z  ZS )
DualTimestepEncoder         @r   	mlp_ratioc                    sL   t    t|| _t|| }tt|d |t t||| _	d S )Nr   )
r   r   r   sinusoidal_pos_embrA   r#   
Sequentialr&   Mishproj)r(   r   rH   
hidden_dimr+   r-   r.   r   C   s   



zDualTimestepEncoder.__init__c                 C   s0   |  |}|  |}tj||gdd}| |S )Ndim)rI   torchcatrL   )r(   t1t2Ztemb1Ztemb2tembr-   r-   r.   r3   M   s   


zDualTimestepEncoder.forward)rF   rG   )r>   r?   r@   rA   floatr   r3   rD   r-   r-   r+   r.   rE   B   s    
rE   c                       s   e Zd Z							ddedeed	f d
eed	f dededededededededededef fddZdd Zdd Z	  Z
S )DualNoisePredictionNetr   rF      rG   Tr   global_cond_dimimage_shape.r   r   r   
action_len
action_dimr   timestep_embed_dimdepth	num_headsrH   qkv_biasnum_registersc                    s2  t    t||||d| _| jj}tt| }tt	||t
 t	|| _tt	|t
 t	||| _t|	| _ttd|jdd| _|| | }ttd|jdd| _||	  t fddt|
D | _t d| _d|f| _||| f| _|   d S )	N)r   r   r   r   r   r   {Gz?)stdc              	      s   g | ]}t  d qS ))rP   cond_dimr_   rH   r`   )r   ).0_rd   r   rH   r_   r`   r-   r.   
<listcomp>   s    z3DualNoisePredictionNet.__init__.<locals>.<listcomp>)rP   rd   r   )r   r   r
   obs_patchifierr=   rA   maxr#   rJ   r&   rK   action_encoderaction_decoderrE   timestep_embedding	ParameterrQ   emptynormal_	registers	pos_embed
ModuleListrangeblocksr   headaction_indsnext_obs_indsinitialize_weights)r(   rY   rZ   r   r   r   r[   r\   r   r]   r^   r_   rH   r`   ra   Zobs_lenrM   	total_lenr+   rg   r.   r   U   sL   






zDualNoisePredictionNet.__init__c                 C   s   |  t | jjjj}tjj|	|j
d dgddd tj| jjjd | jD ]}tj|jd jd tj|jd jd q)tj| jjd jd tj| jjd jd tj| jjjd tj| jjjd d S )Nr   rN   g        rb   )meanrc   )applyr   ri   r%   weightdatar#   initrp   viewshape	constant_biasru   ZadaLN_modulationrv   linear)r(   r8   blockr-   r-   r.   ry      s   
"
z)DualNoisePredictionNet.initialize_weightsc                 C   s>  |  |}| |}t|jdkr ||jd jtj|jd}t|jdkr6||jd jtj|jd}| 	||}| j
|jd dd}	tj|||	fdd}
|
| j }
tj||fdd}| jD ]}||
|}
qb| |
|}
|
d d | jd | jd f }|
d d | jd | jd f }| |}| j|}||fS )Nr   )dtypedevicerN   r   rO   )rk   ri   lenr   expandtorQ   longr   rm   rq   rR   rr   ru   rv   rw   rx   rl   r<   )r(   Zglobal_condactionaction_tnext_obs
next_obs_tZaction_embedZnext_obs_embedrU   rq   xcondr   action_noise_prednext_obs_noise_predr-   r-   r.   r3      s.   



  
zDualNoisePredictionNet.forward)r   rF   rX   rX   rG   Tr   )r>   r?   r@   rA   rB   rV   boolr   ry   r3   rD   r-   r-   r+   r.   rW   T   sN    


	
LrW   c                       s   e Zd Z												
	d*dedededededeedf dededededededef fddZd+ddZe	
 dd Ze	
 d d! Ze	
 d"d# Ze	
 d$d% Ze	
 d&d' Ze	
 d(d) Z  ZS ),UnifiedWorldModelr   rF   r      r   rX   r   Tr   d   
   squaredcos_cap_v2r[   r\   obs_encoderr   r]   latent_patch_shape.r^   r_   rH   r`   ra   num_train_stepsnum_inference_stepsc                    s   t    || _|| _||f| _|| _| j | _| j }| jdd }| jdd \}}t||||||||||||	|
|d| _	|| _
|| _t|||d| _dS )zi
        Assumes rgb input: (B, T, H, W, C) uint8 image
        Assumes low_dim input: (B, T, D)
        r   N)rY   rZ   r   r   r   r[   r\   r   r]   r^   r_   rH   r`   ra   )num_train_timestepsbeta_scheduleclip_sample)r   r   r[   r\   action_shaper   latent_img_shapefeat_dimrW   noise_pred_netr   r   r   noise_scheduler)r(   r[   r\   r   r   r]   r   r^   r_   rH   r`   ra   r   r   r   r   rY   rZ   r   r   r+   r-   r.   r      s@   


zUnifiedWorldModel.__init__Nc                 C   s   |j d |j}}| j||\}}t|}	tjd| j|f|d }
|d ur0| jd |
| < | j	
||	|
}t|}tjd| j|f|d }| j	
|||}| |||
||\}}t||	}t||}|| }| | | d}||fS )Nr   )lowhighsizer   r   )lossaction_lossdynamics_loss)r   r   r   encode_curr_and_next_obsrQ   
randn_likerandintr   r   r   	add_noiser   Fmse_lossitem)r(   obs_dictnext_obs_dictr   action_mask
batch_sizer   obsr   Zaction_noiser   Znoisy_actionZnext_obs_noiser   Znoisy_next_obsr   r   r   r   r   infor-   r-   r.   r3     sB   


zUnifiedWorldModel.forwardc                 C   r/   r0   )sample_marginal_action)r(   r   r-   r-   r.   sampleA  s   
zUnifiedWorldModel.samplec           	      C   s~   | j |}tj|jd f| j |jd}| j| j	 | jj
d }| jj
D ]}| |||||\}}| j|||j}q&|S Nr   r   rN   )r   encode_curr_obsrQ   randnr   r   r   r   set_timestepsr   	timestepsr   stepprev_sample)	r(   r   r   r   next_obs_sampler   r   rf   r   r-   r-   r.   sample_forward_dynamicsE  s    
z)UnifiedWorldModel.sample_forward_dynamicsc           
      C   s   | j ||\}}tj|jd f| j |jd}| j| j	 | jj
d }| jj
D ]}| |||||\}}	| j|||j}q)|S r   )r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   )
r(   r   r   obs_featr   action_sampler   r   r   rf   r-   r-   r.   sample_inverse_dynamics[  s$   
z)UnifiedWorldModel.sample_inverse_dynamicsc           	      C   s   | j |}tj|jd f| j |jd}tj|jd f| j |jd}| j	| j
 | jjd }| jjD ]}| |||||\}}| j|||j}q5|S Nr   r   r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r   )	r(   r   r   r   r   r   r6   rf   r   r-   r-   r.   sample_marginal_next_obss  &   
z*UnifiedWorldModel.sample_marginal_next_obsc           	      C   s   | j |}tj|jd f| j |jd}tj|jd f| j |jd}| j	| j
 | jjd }| jjD ]}| |||||\}}| j|||j}q5|S r   r   )	r(   r   r   r   r   r   r6   r   rf   r-   r-   r.   r     r   z(UnifiedWorldModel.sample_marginal_actionc                 C   s   | j |}tj|jd f| j |jd}tj|jd f| j |jd}| j	| j
 | jjD ]}| |||||\}}| j|||j}| j|||j}q/||fS r   r   )r(   r   r   r   r   r6   r   r   r-   r-   r.   sample_joint  s.   
zUnifiedWorldModel.sample_joint)r   rF   r   rX   rX   r   Tr   r   r   r   Tr0   )r>   r?   r@   rA   r	   rB   r   r   r3   rQ   no_gradr   r   r   r   r   r   rD   r-   r-   r+   r.   r      sj    
	

>*




r   )rQ   torch.nnr#   torch.nn.functional
functionalr   Z$diffusers.schedulers.scheduling_ddimr   einopsr   Zmodels.common.adaln_attentionr   r   Zmodels.common.utilsr   r   r   r	   Moduler
   rE   rW   r   r-   r-   r-   r.   <module>   s    5 