o
    #'µi _  ã                   @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZ dd„ ZG dd„ dej	ƒZ
G dd	„ d	ej	ƒZG d
d„ dej	ƒZG dd„ dej	ƒZG dd„ dej	ƒZd dlmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ dd„ ZG dd„ dej	ƒZdd„ ZdS )é    N)Ú
checkpoint)Úcreate_diffusionc                 C   s   | d|  | S )Né   © )ÚxÚshiftÚscaler   r   ú=/data/cameron/vidgen/unified_video_action/simple_uva/model.pyÚmodulate
   s   r
   c                       s4   e Zd Zd	‡ fdd„	Zed
dd„ƒZdd„ Z‡  ZS )ÚTimestepEmbedderé   c              	      s@   t ƒ  ¡  t tj||ddt ¡ tj||dd¡| _|| _d S )NT©Úbias)ÚsuperÚ__init__ÚnnÚ
SequentialÚLinearÚSiLUÚmlpÚfrequency_embedding_size)ÚselfÚhidden_sizer   ©Ú	__class__r   r	   r      s   
ý
zTimestepEmbedder.__init__é'  c                 C   s¨   |d }t  t |¡ t jd|t jd | ¡j| jd}| d d …d f  ¡ |d   }t j	t  
|¡t  |¡gdd}|d rRt j	|t  |d d …d d…f ¡gdd}|S )Né   r   )ÚstartÚendÚdtype©Údeviceéÿÿÿÿ©Údimr   )ÚtorchÚexpÚmathÚlogÚarangeÚfloat32Útor!   ÚfloatÚcatÚcosÚsinÚ
zeros_like)Útr$   Z
max_periodÚhalfZfreqsÚargsÚ	embeddingr   r   r	   Útimestep_embedding   s"   
ÿþÿüÿz#TimestepEmbedder.timestep_embeddingc                 C   s   |   || j¡}|  |¡}|S ©N)r5   r   r   )r   r1   Zt_freqZt_embr   r   r	   Úforward(   s   
zTimestepEmbedder.forward)r   )r   )Ú__name__Ú
__module__Ú__qualname__r   Ústaticmethodr5   r7   Ú__classcell__r   r   r   r	   r      s
    	r   c                       ó$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )ÚResBlockc              	      sr   t ƒ  ¡  || _tj|dd| _t tj||ddt ¡ tj||dd¡| _	t t ¡ tj|d| dd¡| _
d S )Nçíµ ÷Æ°>©ÚepsTr   é   )r   r   Úchannelsr   Ú	LayerNormÚin_lnr   r   r   r   ÚadaLN_modulation)r   rC   r   r   r	   r   /   s   
ý
ÿzResBlock.__init__c                 C   sB   |   |¡jddd\}}}t|  |¡||ƒ}|  |¡}|||  S )NrB   r"   r#   )rF   Úchunkr
   rE   r   )r   r   ÚyZ	shift_mlpZ	scale_mlpZgate_mlpÚhr   r   r	   r7   <   s   
zResBlock.forward©r8   r9   r:   r   r7   r<   r   r   r   r	   r>   .   s    r>   c                       r=   )Ú
FinalLayerc                    sT   t ƒ  ¡  tj|ddd| _tj||dd| _t t ¡ tj|d| dd¡| _	d S )NFr?   )Úelementwise_affinerA   Tr   r   )
r   r   r   rD   Ú
norm_finalr   Úlinearr   r   rF   )r   Úmodel_channelsÚout_channelsr   r   r	   r   D   s   
ÿ
ÿzFinalLayer.__init__c                 C   s8   |   |¡jddd\}}t|  |¡||ƒ}|  |¡}|S )Nr   r"   r#   )rF   rG   r
   rM   rN   )r   r   Úcr   r   r   r   r	   r7   N   s   
zFinalLayer.forwardrJ   r   r   r   r	   rK   C   s    
rK   c                       s8   e Zd Z	d
‡ fdd„	Zdd„ Zdd„ Zdd	„ Z‡  ZS )ÚSimpleMLPAdaLNFc                    s„   t ƒ  ¡  || _ˆ | _|| _|| _|| _tˆ ƒ| _t	 
|ˆ ¡| _t	 
|ˆ ¡| _t	 ‡ fdd„t|ƒD ƒ¡| _tˆ |ƒ| _|  ¡  d S )Nc                    s   g | ]}t ˆ ƒ‘qS r   )r>   ©Ú.0Ú_©rO   r   r	   Ú
<listcomp>i   s    z+SimpleMLPAdaLN.__init__.<locals>.<listcomp>)r   r   Úin_channelsrO   rP   Únum_res_blocksÚgrad_checkpointingr   Ú
time_embedr   r   Ú
cond_embedÚ
input_projÚ
ModuleListÚrangeÚ
res_blocksrK   Úfinal_layerÚinitialize_weights)r   rX   rO   rP   Ú
z_channelsrY   rZ   r   rV   r	   r   V   s   
	
ÿzSimpleMLPAdaLN.__init__c                 C   sÚ   dd„ }|   |¡ tjj| jjd jdd tjj| jjd jdd | jD ]}tj |j	d jd¡ tj |j	d j
d¡ q&tj | jj	d jd¡ tj | jj	d j
d¡ tj | jjjd¡ tj | jjj
d¡ d S )Nc                 S   sB   t | tjƒrtjj | j¡ | jd urtj | jd¡ d S d S d S )Nr   )	Ú
isinstancer   r   r%   ÚinitÚxavier_uniform_Úweightr   Ú	constant_)Úmoduler   r   r	   Ú_basic_inito   s   
ýz6SimpleMLPAdaLN.initialize_weights.<locals>._basic_initr   ç{®Gáz”?©Ústdr   r"   )Úapplyr   re   Únormal_r[   r   rg   r`   rh   rF   r   ra   rN   )r   rj   Úblockr   r   r	   rb   n   s   

z!SimpleMLPAdaLN.initialize_weightsc                 C   sr   |   |¡}|  |¡}|  |¡}|| }| jr(tj ¡ s(| jD ]}t|||ƒ}qn| jD ]}|||ƒ}q+|  	||¡S r6   )
r]   r[   r\   rZ   r%   ÚjitÚis_scriptingr`   r   ra   )r   r   r1   rQ   rH   rp   r   r   r	   r7      s   



ÿ
zSimpleMLPAdaLN.forwardc                 C   s²   |d t |ƒd … }tj||gdd}|  |||¡}|d d …d | j…f |d d …| jd …f }}	tj|t |ƒd dd\}
}|||
|   }tj||gdd}tj||	gddS )Nr   r   r#   r   )Úlenr%   r-   r7   rX   Úsplit)r   r   r1   rQ   Ú	cfg_scaler2   ÚcombinedZ	model_outrA   ÚrestZcond_epsZ
uncond_epsZhalf_epsr   r   r	   Úforward_with_cfgŒ   s   .zSimpleMLPAdaLN.forward_with_cfg©F)r8   r9   r:   r   rb   r7   rx   r<   r   r   r   r	   rR   U   s    ùrR   c                       s4   e Zd Z	d
‡ fdd„	Zddd„Zddd	„Z‡  ZS )ÚDiffLossFc                    sX   t t| ƒ ¡  |d | _|| _t|||d |||d| _tddd| _t|dd| _	d S )NÚn_framesr   )rX   rO   rP   rc   rY   rZ   Ú Úcosine)Ztimestep_respacingZnoise_schedule)
r   rz   r   r{   rX   rR   Únetr   Útrain_diffusionÚgen_diffusion)r   Útarget_channelsrc   ÚdepthÚwidthÚnum_sampling_stepsrZ   Úkwargsr   r   r	   r   ˜   s"   

úÿÿzDiffLoss.__init__Nc                 C   s¤   |j \}}}| || d¡}| || d¡}| || ¡}tjd| jj|j d f|jd}	t|d}
| j | j	||	|
¡}|d }|d urN||  
¡ | 
¡  }| ¡ S )Nr"   r   r    ©rQ   Úloss)ÚshapeÚreshaper%   Úrandintr   Znum_timestepsr!   ÚdictZtraining_lossesr~   ÚsumÚmean)r   ÚtargetÚzÚmaskZ
conf_scoreÚtext_latentsÚbszÚseq_lenrU   r1   Úmodel_kwargsZ	loss_dictr‡   r   r   r	   r7   ´   s$   
ü

ÿzDiffLoss.forwardç      ð?c           	   	   C   s–   |dks&t  |jd d | j¡ ¡ }t j||gdd}t||d}| jj}nt  |jd | j¡ ¡ }t|d}| jj	}| j
j||j|d|d|d}|S )	Nr•   r   r   r#   )rQ   ru   r†   F)Zclip_denoisedr”   ÚprogressÚtemperature)r%   Úrandnrˆ   rX   Úcudar-   r‹   r~   rx   r7   r€   Zp_sample_loop)	r   r   r—   Úcfgr‘   Únoiser”   Z	sample_fnÚsampled_token_latentr   r   r	   ÚsampleÈ   s$   

ù	zDiffLoss.samplery   )NNN)r•   r•   N)r8   r9   r:   r   r7   r   r<   r   r   r   r	   rz   —   s
    ù
rz   )Úpartial)Útqdm)Ú	rearrange©ÚBlockc                 C   sN   t  ||¡ |¡}t j|d|d d …d |  ¡ …f t  ||¡ |¡d ¡ }|S )Nr"   ©r$   ÚindexÚsrc)r%   Úzerosr+   ÚscatterÚlongÚonesÚbool)Úmask_lenÚorderr’   r“   r!   Zmaskingr   r   r	   Úmask_by_orderè   s   üûr­   c                       sÄ   e Zd ZdZddddddddddejdddddd	dd
df‡ fdd„	Zdd„ Zdd„ Zdd„ Z	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ Z			 		d,d!d"„Zd-d$d%„Zd-d&d'„Zd(d)„ Zd*d+„ Z‡  ZS ).ÚMARVideoOnlyzHMAR for video-from-first-frame only: no text, proprio, action, or wrist.r   é   r   i   g      @gffffffæ?gš™™™™™¹?rB   Ú100Fc                    sÂ  t ƒ  ¡  d| _d| _d| _|| _|| _|| _|| |  | _| _	| j| j	 | _
||d  | _|| _|| _tj|d d dddd| _tj| jˆdd	| _tj| jˆdd	| _t t d
ˆ¡¡| _t t d
ˆ¡¡| _tjdˆ ˆdd	| _t t d
ˆ¡¡| _t t d
| jˆ¡¡| _t t d
| jˆ¡¡| _t t d
| j
ˆ¡¡| _tjˆdd| _ t !‡ ‡‡‡‡‡fdd„t"|ƒD ƒ¡| _#ˆˆƒ| _$tjˆˆdd	| _%t t d
| jˆ¡¡| _&t t d
| j
ˆ¡¡| _'t t d
| jˆ¡¡| _(t !‡ ‡‡‡‡‡fdd„t"|ƒD ƒ¡| _)ˆˆƒ| _*t t d
| jˆ¡¡| _+t t d
| j
ˆ¡¡| _,|  -¡  t.| jˆ||||| jdd
d	| _/| 0dd¡| _1d | _2| j1r_ddl3m4} |ˆ| 0dd¡| j| 0dd¡d| _2d S d S )Né   é@   r   r•   g      Ð?r   )Úlocr   Tr   r   rB   r?   r@   c                    ó"   g | ]}t ˆˆˆd ˆˆˆ d‘qS ©T)Zqkv_biasÚ
norm_layerZ	proj_dropZ	attn_dropr¡   rS   )Úattn_dropoutÚencoder_embed_dimÚencoder_num_headsÚ	mlp_ratior¶   Úproj_dropoutr   r	   rW   5  ó    
÷ùÿz)MARVideoOnly.__init__.<locals>.<listcomp>c                    r´   rµ   r¡   rS   )r·   Údecoder_embed_dimÚdecoder_num_headsrº   r¶   r»   r   r	   rW   O  r¼   Úclip)	r   rc   rƒ   r‚   r„   rZ   r{   Zlanguage_emb_modelZlanguage_emb_model_typeÚpredict_paraF)ÚParaHeadZpara_n_binsé    Zpara_out_size)r½   Ún_binsZin_grid_sizeÚout_size)5r   r   r{   Úbuffer_size_textÚbuffer_size_actionÚimg_sizeÚ
vae_strideÚ
patch_sizeÚseq_hÚseq_wr“   Útoken_embed_dimÚvae_embed_dimrZ   ÚstatsZ	truncnormÚmask_ratio_generatorr   r   Úz_proj_condÚz_projÚ	Parameterr%   r¦   Úfake_latent_xÚfake_action_latentÚproj_cond_x_layerÚfake_latentÚtext_pos_embedÚtemporal_pos_embedÚspatial_pos_embedrD   Ú	z_proj_lnr^   r_   Úencoder_blocksÚencoder_normÚdecoder_embedÚdecoder_temporal_pos_embedÚdecoder_spatial_pos_embedÚdecoder_text_pos_embedÚdecoder_blocksÚdecoder_normÚdiffusion_temporal_embedÚdiffusion_spatial_embedrb   rz   ÚdifflossÚgetrÀ   Ú	para_headZsimple_uva.para_headrÁ   )r   rÇ   rÈ   rÉ   r¸   Úencoder_depthr¹   r½   Údecoder_depthr¾   rº   r¶   rÍ   Úmask_ratio_minÚlabel_drop_probr·   r»   Ú
diffloss_dÚ
diffloss_wr„   rZ   r…   rÁ   r   )r·   r½   r¾   r¸   r¹   rº   r¶   r»   r	   r   ö   s¨   
ÿ
ÿÿÿÿ
öÿ
ÿÿÿ
öÿ
ÿÿ÷

üþzMARVideoOnly.__init__c                 C   sì   t jjj| jdd t jjj| jdd t jjj| jdd t jjj| jdd t jjj| jdd t jjj| j	dd t jjj| j
dd t jjj| jdd t jjj| jdd t jjj| jdd t jjj| jdd |  | j¡ d S )Nrk   rl   )r%   r   re   ro   rÓ   rÔ   rÖ   rØ   rÙ   rÞ   rß   rã   rä   r×   rà   rn   Ú_init_weights)r   r   r   r	   rb   }  s   zMARVideoOnly.initialize_weightsc                 C   sŠ   t |tjƒrtjj |j¡ |jd urtj |jd¡ d S d S t |tj	ƒrA|jd ur2tj |jd¡ |jd urCtj |jd¡ d S d S d S )Nr   r•   )
rd   r   r   r%   re   rf   rg   r   rh   rD   )r   Úmr   r   r	   rî   ‹  s   
ÿ

üzMARVideoOnly._init_weightsc           	      C   sd   |j \}}}}| j}|| || }}| ||||||¡}t d|¡}| ||| ||d  ¡}|S )Nznchpwq->nhwcpqr   )rˆ   rÉ   r‰   r%   Úeinsum)	r   r   r’   rQ   rI   ÚwÚpÚh_Úw_r   r   r	   Úpatchify–  s   zMARVideoOnly.patchifyc                 C   s`   |j d }| j}| j}| j| j}}| ||||||¡}t d|¡}| |||| || ¡}|S )Nr   znhwcpq->nchpwq)rˆ   rÉ   rÍ   rÊ   rË   r‰   r%   rð   )r   r   r’   rò   rQ   ró   rô   r   r   r	   Ú
unpatchifyŸ  s   
zMARVideoOnly.unpatchifyc                 C   sX   g }t |ƒD ]}t tt | jƒƒ¡}tj |¡ | |¡ qt 	t |¡¡ 
| j¡ ¡ S r6   )r_   ÚnpÚarrayÚlistr“   ÚrandomÚshuffleÚappendr%   ÚTensorr+   r!   r¨   )r   r’   ÚordersrU   r¬   r   r   r	   Úsample_orders©  s   zMARVideoOnly.sample_ordersc              	   C   sŠ   |j \}}}}| j d¡d }tt || ¡ƒ}tj|||jd}	tj	|	d|d d …d |…f tj
|||jdd}	|	 d¡ d|d¡}
|
S )Nr   r   r    r"   r£   )rˆ   rÏ   ZrvsÚintr÷   Úceilr%   r¦   r!   r§   r©   Ú	unsqueezeÚexpand)r   r   rþ   r’   r1   r“   Ú	embed_dimZ	mask_rateZnum_masked_tokensZspatial_maskr   r   r   r	   Úrandom_masking±  s   üzMARVideoOnly.random_maskingc                 C   s¨  |  ¡ \}}}}t|dƒ}|  |¡}t|dƒ}|  |¡}t|dƒ}| j d¡ ||  d¡d¡}||dk  |j¡||dk< | j	 d¡ 
|dd¡}	|	j| jdd}
tj|||
gdd}|  |¡}|  d¡}| j d¡ dd|d¡}| j d¡ d|dd¡}||  d|| |¡}|| }| j d¡ 
d| jd¡ |dd¡}|| j }tj||gdd}|  |¡}| jrÃtj ¡ sÃ| jD ]}t||ƒ}qºn
| jD ]}||ƒ}qÆ|  |¡}|S )	Núb t s -> b (t s)úb t s c -> b (t s) cr   r"   r   r¯   r#   r   )Úsizer    rÐ   rÑ   rÓ   r  r  r+   r   rÔ   ÚrepeatÚrepeat_interleaverÆ   r%   r-   rÕ   rØ   rÙ   r‰   rÖ   rÅ   r×   rÚ   rZ   rq   rr   rÛ   r   rÜ   )r   r   r   ÚcondÚBÚTÚSrU   Zfake_latent_expandedZaction_latentsZaction_latents_expandr  Ztemporal_pos_embed_expandedZspatial_pos_embed_expandedÚcombined_pos_embedr‘   rp   r   r   r	   Úforward_mae_encoder¿  sP   




ÿ

ÿÿþ

ý


ÿ


z MARVideoOnly.forward_mae_encoderc                 C   s>  |  ¡ \}}}t|dƒ}|  |¡}|j\}}}| j d¡ dd|d¡}| j d¡ d|dd¡}	||	  d|| |¡}
t	j
| j|
gdd}|| }| jr\t	j ¡ s\| jD ]}t||ƒ}qSn
| jD ]}||ƒ}q_|  |¡}|d d …| jd …f }| j d¡ dd|d¡}| j d¡ d|dd¡}||  d|| |¡}|| }|S )Nr  r   r"   r   r#   )r  r    rÝ   rˆ   rÞ   r  r  rß   r‰   r%   r-   rà   rZ   rq   rr   rá   r   râ   rÅ   rã   rä   )r   r   r   r  r  r  rU   r  Z#decoder_temporal_pos_embed_expandedZ"decoder_spatial_pos_embed_expandedZdecoder_combined_pos_embedr  rp   Z%diffusion_temporal_pos_embed_expandedZ$diffusion_spatial_pos_embed_expandedZdiffusion_combined_pos_embedr   r   r	   Úforward_mae_decoderî  sX   

ÿþÿþþ
ÿ
ÿ


ÿþÿþþz MARVideoOnly.forward_mae_decoderr²   r•   rN   c           !      K   s®  |j | _ | ¡ \}	}
}}}t|dƒ}|  |¡}t|d|	d}tj|| j| j| j| j d}tj	|| j| j| j d}|  
|¡}tt|ƒƒ}|rIt|ƒ}|D ]ý}| ¡ }|  |||¡}|  ||¡}t tjd |d  | ¡}t t | j| ¡g¡ | j ¡}|d d …df }t t dg¡ | j ¡t tj|dd	d
d |¡¡}t|d ||| j| j ƒ}| d¡ d|
d¡}t|dƒ}t|dƒ}||d krÍ|d |…  ¡ }nt |d |…  ¡ | ¡ ¡}t|d| jd}|dksðtj ||gdd}||j!d	d }|dkrd|d | j|d   | j  }n|}| j"j#|||d d}|dks1|j$ddd\}}|j$ddd\}}t|dƒ} || |j!d	d< t| d| jd ¡ }qKt|dƒ}|  %|¡}|d fS )Nzb t c h w -> (b t) c h wz (b t) seq_len c -> b t seq_len c)Úbr    g       @r   r   r"   T)r$   Úkeepdimsr  zb (t s) -> b t s)r1   r•   r#   )Úas_tuplerN   )r‘   r   r  úb (t s) c -> b t s czb t s c -> (b t) s c)&r!   r  r    rõ   r%   r¦   r{   r“   rÌ   r©   rÿ   rù   r_   rŸ   Úcloner  r  r÷   r.   r'   Úpirý   Úfloorr+   ÚmaximumÚminimumrŒ   r­   r  r  rª   Úlogical_xorr-   Únonzerorå   r   rG   rö   )!r   r’   r  Únum_iterrš   Zcfg_scheduler—   r–   r…   r  r  ÚCÚHÚWÚtokensr   rþ   ÚindicesÚstepZ
cur_tokensr   r   Z
mask_ratior«   Zmask_Z	mask_nextÚ	mask_flatZmask_to_predZz_predZcfg_iterrœ   rU   Zcur_tokens_flatr   r   r	   Úsample_tokens  s|   

ÿ
ÿÿþÿ

ÿ
"ÿ

ÿþ

zMARVideoOnly.sample_tokensNc           
      C   sf   |j | _ | ¡ \}}}}|du rtj||||j |jd}|  |||¡}|  ||¡}	t|	d||d}	|	S )zgRun encoder and decoder, return decoder tokens (B, T, S, C). If mask is None, no masking (all visible).N)r!   r   r  )r1   Ús)r!   r  r%   r¦   r   r  r  r    )
r   r   r  r   r  r  r  rU   rI   r   r   r   r	   Úforward_decode_tokensg  s   z"MARVideoOnly.forward_decode_tokensc                 C   s   | j |||d}|  |¡S )zRReturn volume_logits (B, T, n_bins, H_out, W_out) from decoder tokens + PARA head.)r   )r'  rç   )r   r   r  r   r   r   r   r	   Úforward_parar  s   
zMARVideoOnly.forward_parac                 C   s   |   ||¡S )zDForward for DataParallel: same as compute_loss. Returns scalar loss.)Úcompute_loss)r   r   r  r   r   r	   r7   w  s   zMARVideoOnly.forwardc                 C   sp   |j | _ | ¡ \}}}}|  |¡}|  ||¡}|  |||¡}	|  |	|¡}
t|dƒ}t|dƒ}| j||
|d}|S )zETraining loss: x, cond (B, T, S, C) token space. Returns scalar loss.r  r  )rŽ   r   r   )r!   r  rÿ   r  r  r  r    rå   )r   r   r  r  r  r  rU   rþ   r   rI   r   Zgt_flatr$  r‡   r   r   r	   r)  {  s   


zMARVideoOnly.compute_loss)r²   r•   rN   r•   Fr6   )r8   r9   r:   Ú__doc__r   rD   r   rb   rî   rõ   rö   rÿ   r  r  r  r%  r'  r(  r7   r)  r<   r   r   r   r	   r®   ó   sT    ë 	
/,
ø
Q
r®   c                  K   s*   t ddddddddttjdddœ| ¤ŽS )	z9Video-only MAR with base architecture (same as mar_base).i   é   r±   r?   r@   )r¸   rè   r¹   r½   ré   r¾   rº   r¶   Nr   )r®   rž   r   rD   )r…   r   r   r	   Úmar_base_video_only‰  s   ø	÷r,  )r'   r%   Útorch.nnr   Ztorch.utils.checkpointr   Zsimple_uva.diffusionr   r
   ÚModuler   r>   rK   rR   rz   Ú	functoolsrž   Únumpyr÷   rŸ   Zscipy.statsrÎ   Úeinopsr    Ztimm.models.vision_transformerr¢   r­   r®   r,  r   r   r   r	   Ú<module>   s.    BI   