o
    ?߱iz                     @   s@  d Z ddlZddlZddlmZ ddlmZmZ ddlZddl	Z	ddl
mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ dd
lmZ g dZdgZdZeddG dd dZdejde e! fddZ"d#de#de e! de!de$fddZ%		d$de#de e! de!de!de$f
dd Z&G d!d" d"Z'dS )%u8
  
# Script for generating I2W videos in s3
PYTHONPATH=. python cosmos_predict2/_src/predict2/inference/video2world.py --experiment=Stage-c_pt_4-reason_embeddings-Index-26-Size-2B-Res-720-Fps-16-Note-HQ_V6_from_22_qwen_concat_resume4 --ckpt_path s3://bucket/cosmos_diffusion_v2/official_runs_vid2vid/Stage-c_pt_4-reason_embeddings-Index-26-Size-2B-Res-720-Fps-16-Note-HQ_V6_from_22_qwen_concat_resume4/checkpoints/iter_000045000 --save_root results/cli_debug_from_s3 --input_root /project/cosmos/ybalaji/data/internal_val_set_clean

# Script for text2world generation
export EXPERIMENT=Stage-c_pt_4-reason_embeddings-Index-26-Size-2B-Res-720-Fps-16-Note-T2V_high_sigma_loss_reweighted
CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python cosmos_predict2/_src/predict2/inference/video2world.py --experiment=${EXPERIMENT} --ckpt_path s3://bucket/cosmos_diffusion_v2/official_runs_vid2vid/${EXPERIMENT}/checkpoints/iter_000025000 --save_root results/base_model/${EXPERIMENT}_025k_seed0_t2w --num_latent_conditional_frames=0 --seed=0 --input_root /project/cosmos/fangyinw/data/pbench/v0

# I2W with context parallel with 8 GPUs:
PYTHONPATH=. torchrun --nproc_per_node=8 cosmos_predict2/_src/predict2/inference/video2world.py --experiment=Stage-c_pt_4-reason_embeddings-Index-26-Size-2B-Res-720-Fps-16-Note-HQ_V6_from_22_qwen_concat_resume4 --ckpt_path s3://bucket/cosmos_diffusion_v2/official_runs_vid2vid/Stage-c_pt_4-reason_embeddings-Index-26-Size-2B-Res-720-Fps-16-Note-HQ_V6_from_22_qwen_concat_resume4/checkpoints/iter_000045000 --save_root results/cli_debug_from_s3 --input_root /project/cosmos/ybalaji/data/internal_val_set_clean --context_parallel_size 8

# V2W with context parallel with 8 GPUs:
PYTHONPATH=. torchrun --nproc_per_node=8 cosmos_predict2/_src/predict2/inference/video2world.py --experiment=Stage-c_pt_4-reason_embeddings-Index-26-Size-2B-Res-720-Fps-16-Note-HQ_V6_from_22_qwen_concat_resume4 --ckpt_path s3://bucket/cosmos_diffusion_v2/official_runs_vid2vid/Stage-c_pt_4-reason_embeddings-Index-26-Size-2B-Res-720-Fps-16-Note-HQ_V6_from_22_qwen_concat_resume4/checkpoints/iter_000045000 --save_root results/cli_debug_from_s3 --input_root pbench_upsampled_prompts --num_latent_conditional_frames=2 --context_parallel_size=8


Folder structure:
We assume the input root contains images and prompts in the following format:
input_root/
 ├── image_1.jpg
 ├── image_1.txt
 ├── image_2.jpg
 └── image_2.txt
 └── ...

or videos and prompts in the following format:
input_root/
 ├── video_1.mp4
 ├── video_1.txt
 ├── video_2.mp4
 └── video_2.txt
 └── ...
    N)	dataclass)TYPE_CHECKINGAnyparallel_state)Image)INTERNAL)distributedlog)easy_io)load_model_from_checkpoint)get_text_embedding)z.pngz.jpgz.jpegz.webpz.mp4a  The video captures a series of frames showing ugly scenes, static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. Overall, the video is of poor quality.T)slotsc                   @   st   e Zd ZU dZejeB eB ed< ejeB eB ed< dZ	ejee
e
f B ee
 B dB ed< dZeeef dB ed< dS )CameraConditionInputsz1Typed container for camera conditioning payloads.
extrinsics
intrinsicsN
image_sizemetadata)__name__
__module____qualname____doc__torchTensorlisttuple__annotations__r   intr   dictstrr    r    r    ]/data/cameron/vidgen/cosmos-predict2.5/cosmos_predict2/_src/predict2/inference/video2world.pyr   O   s   
 &r   video
resolutionc           
      C   sx   | j d | j d }}|\}}t|| || }tt|| tt|| f}tjj| |}tjj	||}	|	S )a  
    Resizes and crops the input video tensor while preserving aspect ratio.

    The video is first resized so that the smaller dimension matches the target resolution,
    preserving the aspect ratio. Then, it's center-cropped to the target resolution.

    Args:
        video (torch.Tensor): Input video tensor of shape (T, C, H, W).
        resolution (list[int]): Target resolution [H, W].

    Returns:
        torch.Tensor: Resized and cropped video tensor of shape (T, C, target_H, target_W).
          )
shapemaxr   mathceiltorchvision
transforms
functionalresizecenter_crop)
r"   r#   Zorig_horig_wZtarget_hZtarget_wscaling_ratioZresizing_shapeZvideo_resizedZvideo_croppedr    r    r!   resize_inputY   s   $r1   img_pathnum_video_framesr-   c                 C   s   t j| d }|tvrtd| t| }tjj	
|}|d}tj|t||d dddgdd}|d tj}|rGt||}|dddddd}|S )	a  
    Reads an image, converts it to a video tensor, and processes it for model input.

    The image is loaded, converted to a tensor, and replicated to match the
    `num_video_frames`. It's then optionally resized and permuted to the
    standard video format (B, C, T, H, W).

    Args:
        img_path (str): Path to the input image file.
        resolution (list[int]): Target resolution [H, W] for resizing.
        num_video_frames (int): The number of frames the output video tensor should have.
        resize (bool, optional): Whether to resize the image to the target resolution. Defaults to True.

    Returns:
        torch.Tensor: Processed video tensor of shape (1, C, T, H, W).

    Raises:
        ValueError: If the image extension is not one of the supported types.
       zInvalid image extension: r   dim     o@r$   r%      )ospathsplitext_IMAGE_EXTENSIONS
ValueErrorr   openr*   r+   r,   	to_tensor	unsqueezer   cat
zeros_likerepeattouint8r1   permute)r2   r#   r3   r-   extimg	vid_inputr    r    r!   read_and_process_imager   s   

(
rJ   r$   
video_pathnum_latent_conditional_framesc              
   C   sT  t j| d }| tvrtd| zt| \}}t	d|j
 d|  W n ty@ } z
td|  d| d}~ww t| d }	|	d	d
dd}	|	j
d }
d|d  d }t	d| d|  |dvrutd| |	j
\}}}}t||||}|
|k rtd|
 d| d| |
| }|	dd|dddddf }||ddd|ddddf< t	d| d| d|
d  d ||k r|ddddddddf }|| }|d|dd}||dd|dddddf< t	d| d |dd
dd	}|d tj}|rt||}|d
d
ddd	d}|S )a  
    Reads a video, processes it for model input.

    The video is loaded using easy_io, and uses the last 4x(num_latent_conditional_frames - 1) + 1 from the video.
    If the video is shorter than num_video_frames, it pads with the last frame repeated.
    The first num_latent_conditional_frames are marked as conditioning frames.

    Args:
        video_path (str): Path to the input video file.
        resolution (list[int]): Target resolution [H, W] for resizing.
        num_video_frames (int): Number of frames needed by the model (should equal model.tokenizer.get_pixel_num_frames(model.config.state_t)).
        num_latent_conditional_frames (int): Number of latent conditional frames from the input video (1 or 2).
        resize (bool, optional): Whether to resize the video to the target resolution. Defaults to True.

    Returns:
        torch.Tensor: Processed video tensor of shape (1, C, T, H, W) where T equals num_video_frames.

    Raises:
        ValueError: If the video extension is not supported or other validation errors.

    Note:
        Uses the last 4x(num_latent_conditional_frames - 1) + 1 frames from the video. If video is shorter, pads with last frame repeated.
    r4   zInvalid video extension: zLoaded video with shape z, metadata: zFailed to load video z: Nr7   r%   r   r$   r8   zWill extract z$ frames from input video and pad to )r4   r$   z6num_latent_conditional_frames must be 1 or 2, but got Video has only  frames but needs at least z* frames for num_latent_conditional_frames=zExtracted last z frames from video (frames z to )zPadded z! frames with last extracted frame)r9   r:   r;   lower_VIDEO_EXTENSIONSr=   r   loadr
   infor&   	Exceptionr   
from_numpyfloatrF   zerosrC   rD   rE   r1   r@   )rK   r#   r3   rL   r-   rG   video_framesvideo_metadataevideo_tensoravailable_framesframes_to_extractC_HW
full_video	start_idxextracted_frames
last_framepadding_frameslast_frame_repeatedr    r    r!   read_and_process_video   sN   
  "
  
ri   c                   @   sx  e Zd ZdZ						d4dededed	ed
edee dB dededefddZdd Z	de
dddfdejdedededededB dejdB fddZddddddde
ddd fded!eejB dB d"ed#ed$ed%ed&ed'ed(edededB dejdB d)efd*d+Zdddde
ddd fded!eejB dB d,ed-ed.ed"ed$ed'ed(ededejdB dejdB d)ed/ejfd0d1Zd2d3 ZdS )5Video2WorldInferencez
    Handles the Video2World inference process, including model loading, data preparation,
    and video generation from an image/video and text prompt. Now supports context parallelism.
    r4   ;cosmos_predict2/_src/predict2/configs/video2world/config.pyNFexperiment_name	ckpt_paths3_credential_pathcontext_parallel_sizeconfig_fileexperiment_optsoffload_diffusion_modeloffload_text_encoderoffload_tokenizerc
                 C   s  || _ || _|| _|| _d| _|| _|| _|	| _|rdnd}
| jdkr'|   |du r-g }t	s4|
d | jr<dtjd< |rPd|v rPt| j | j|d|d	\}}nt| j | j|d||
d
\}}| jrxtd t|drw|jdurw|jd|_n|jd | jrtd t|jdr|jjdur|jjd|j_t|jdr|jjdur|jjd|j_tj  | jr	 trddlm} |}| jdkr|j| j || _|| _ d| _!d| _"dS )a  
        Initializes the Video2WorldInference class.

        Loads the diffusion model and its configuration based on the provided
        experiment name and checkpoint path. Sets up distributed processing if needed.

        Args:
            experiment_name (str): Name of the experiment configuration.
            ckpt_path (str): Path to the model checkpoint (local or S3).
            s3_credential_path (str): Path to S3 credentials file (if loading from S3).
            context_parallel_size (int): Number of GPUs for context parallelism.
            config_file (str): Path to the config file.
            experiment_opts (list[str]): List of experiment options.
            offload_diffusion_model (bool): Whether to offload the diffusion model to CPU.
            offload_text_encoder (bool): Whether to offload the text encoder to CPU.
            offload_tokenizer (bool): Whether to offload the tokenizer to CPU.

        Returns:
            None
        Ncudar4   z~data_train1ZCOSMOS_PREDICT2_OFFLOAD_DITinteractiveT)rl   s3_checkpoint_dirrp   load_ema_to_regrq   )rl   rx   rp   ry   rq   	to_devicez7[Memory Optimization] Offloading DiT conditioner to CPUconditionercpuzC[Memory Optimization] Offloading tokenizer encoder & decoder to CPUencoderdecoderr   )Video2WorldModelRectifiedFlow)#rl   rm   rn   ro   process_grouprr   rs   rt   _init_distributedr   appendr9   environ$load_distilled_model_from_checkpointr   r
   rT   hasattrr{   rD   net	tokenizerr}   r~   r   ru   empty_cacher   ZEcosmos_predict2._src.predict2.models.video2world_model_rectified_flowr   Zenable_context_parallelmodelconfig
batch_sizeZneg_t5_embeddings)selfrl   rm   rn   ro   rp   rq   rr   rs   rt   model_devicer   r   r   r    r    r!   __init__   sr    








zVideo2WorldInference.__init__c                 C   sT   t   tj| jd t | _td| j  tdt 	  dt 
   dS )z:Initialize distributed processing for context parallelism.)ro   z'Initialized context parallel with size zCurrent rank: z, World size: N)r	   initr   initialize_model_parallelro   get_context_parallel_groupr   r
   rT   get_rankget_world_size)r   r    r    r!   r   z  s   
"z&Video2WorldInference._init_distributedTr"   promptnum_conditional_framesnegative_promptuse_neg_promptcameraactionc              
   C   sf  |j \}}	}
}}d||dur|dndtdd| jf t| jd|||d}|durL|j}|du rAtj||||g|j	d}|
|j|j|d	 |rV|dusVJ d
| jjdur|g|d< | jjj|gdddd|d< |r| jjj|gdddd|d< nt||d< |rt||d< | D ]\}}t|tjrt|| r| jtjd||< q|S )a  
        Prepares the input data batch for the diffusion model.

        Constructs a dictionary containing the video tensor, text embeddings,
        and other necessary metadata required by the model's forward pass.
        Optionally includes negative text embeddings.

        Args:
            video (torch.Tensor): The input video tensor (B, C, T, H, W).
            prompt (str): The text prompt for conditioning.
            num_conditional_frames (int): Number of conditional frames to use.
            negative_prompt (str, optional): Custom negative prompt.
            use_neg_prompt (bool, optional): Whether to include negative prompt embeddings. Defaults to True.
            camera (CameraConditionInputs | None): Optional typed camera metadata container.
            action: (torch.Tensor, optional) Target robot action for the K output videos, must be provided for action conditioned model.

        Returns:
            dict: A dictionary containing the prepared data batch, moved to the correct device and dtype.
        
video_dataNr          r4   )dataset_namer"   r   fpspadding_maskr   device)r   r   r   z7Negative prompt is required when use_neg_prompt is True
ai_caption)r   images)
data_batchZinput_caption_keyZt5_text_embeddingsZneg_t5_text_embeddings)dtype)r&   r@   r   randintr   rW   rX   r   tensorr   updater   r   r   text_encoderZcompute_text_embeddings_onliner   items
isinstancer   is_floating_pointru   rD   bfloat16)r   r"   r   r   r   r   r   r   Br_   Tra   rb   r   r   kvr    r    r!   _get_data_batch_input  sP   




z*Video2WorldInference._get_data_batch_input   M   z192,320#   
input_pathguidancer3   rL   num_input_videonum_output_videor#   seed	num_stepsc              	   C   s"  |dus|dus|dkr|dksJ d|dkr$| j  \}}||f}n|d}tdd |D }t|dks<J d	| j j| j jj}|du sN|d
kr`t	
dd||d
 |d t	j}nXt|trtj|d  }|tv rtd|  t|||dd}n4|tv rtd|  t||||dd}ntd| dtt  t|t	jr|}n	tdt| | j||||||
dd}t	jjt	t	j  rdndd}td|d dd | j!r| j j"durtd t#| j j"dr| j j"j dur| j j"j d| j j"_ t	j$  | j%r:td t#| j jdr5| j jj&dur5| j jj&d| j j_&t	j$  | j'rhtd  | j j(d| j _(t#| j d!rc| j j)durc| j j)d| j _)t	j$  i }|durt||d"}td# t*| j jd$d%r| j j+}n| j j,}||fd||	d|d&|}| j'rtd' | j j(d| j _(t#| j d!r| j j)dur| j j)d| j _)| j%rt#| j jdr| j jj&dur| j jj&d| j j_&t	j$  | j%rtd( t#| j jd)r
| j jj-dur
| j jj-d| j j_-t	j$  t|t.r0g }|D ]}| j /|}|0| qt	j1|dd*}n| j /|}| j%r_td+ t#| j jd)rZ| j jj-durZ| j jj-d| j j_-t	j$  | j!r| j j"durtd, t#| j j"dr| j j"j dur| j j"j d| j j"_ t	j$  |S )-ah  
        Generates a video based on an input image or video and text prompt.

        Processes the input, prepares the data batch, runs the diffusion
        model sampling, and decodes the result into a video tensor.

        Args:
            prompt: The text prompt describing the desired video content/style.
            input_path: Path to the input image or video file or a torch.Tensor.
            guidance: Classifier-free guidance scale. Defaults to 7.
            num_video_frames: Number of video frames to generate. Defaults to 77.
            num_latent_conditional_frames : Number of latent conditional frames. Defaults to 1.
            resolution: Target video resolution in "H,W" format. Defaults to "192,320".
            seed: Random seed for reproducibility. Defaults to 1.
            negative_prompt: Custom negative prompt. Defaults to the predefined default negative prompt.
            camera: CameraConditionInputs containing extrinsics, intrinsics, and optional image size metadata.
            action: Target robot action for the K output videos. Must be provided if model is action conditioned.
            num_steps: Number of generation steps. Defaults to 35.
            offload_diffusion_model: If True, offload diffusion model to CPU to save GPU memory. Defaults to False.
            offload_text_encoder: If True, offload text encoder to CPU to save GPU memory. Defaults to False.
            offload_tokenizer: If True, offload tokenizer to CPU to save GPU memory. Defaults to False.

        Returns:
            torch.Tensor: The generated video tensor (B, C, T, H, W) in the range [-1, 1].
        Nr4   zfexpected num_output_video==1 and num_output_video==1 for no camera conditioning or action conditioningnone,c                 S      g | ]}t |qS r    r   .0xr    r    r!   
<listcomp>      z;Video2WorldInference.generate_vid2world.<locals>.<listcomp>r$   "Resolution must be in 'H,W' formatr   r%   zProcessing image input: T)r2   r#   r3   r-   zProcessing video input: )rK   r#   r3   rL   r-   Unsupported file extension: z. Supported extensions: Unsupported input_path type: )r"   r   r   r   r   r   r   ru   r|   r   z+GPU memory usage after getting data_batch: i   @z.2fz GBz4[Memory Optimization] Offloading text encoder to CPUr   z6[Memory Optimization] Loading tokenizer encoder to GPUr}   z6[Memory Optimization] Loading diffusion network to GPUr{   )r   r   z7[Memory Optimization] Starting latent sample generationZuse_loraF)n_sampler   r   Zis_negative_promptr   z9[Memory Optimization] Offloading diffusion network to CPUz6[Memory Optimization] Loading tokenizer decoder to GPUr~   r5   z9[Memory Optimization] Offloading tokenizer decoder to CPUz.[Memory Optimization] Load text encoder to GPU)2r   get_video_height_widthsplitr   lenr   get_pixel_num_framesr   state_tr   rX   rD   rE   r   r   r9   r:   r;   rQ   r<   r
   rT   rJ   rR   ri   r=   r   typer   ru   memory_allocatedr   is_availablers   r   r   r   rt   r}   rr   r   r{   getattrZ generate_samples_from_batch_loraZgenerate_samples_from_batchr~   r   decoder   rA   )r   r   r   r   r3   rL   r   r   r#   r   r   r   r   r   hwvideo_resolutionmodel_required_framesrI   rG   r   	mem_bytesextra_kwargsZgenerate_samplessample
video_listZsample_chunkZvideo_chunkr"   r    r    r!   generate_vid2world  s   ")


"

 

 






 

 

 

 
z'Video2WorldInference.generate_vid2worldnum_output_frames
chunk_sizechunk_overlapreturnc           4      C   s  |dkr| j  \}}||f}n|d}tdd |D }t|dks(J d| j j| j jj}|du s:|dkrMt	
d	d
||d |d	 t	j}nrt|tr{tj|d	  }|tv rtd|  t|}tjj|}|d}|d t	j}|rt||}t	j|t	| |d	 d	d	d	gdd}|d!ddd	d
d}n|t"v rttd|  t#$|\}}t	%|& d }|!d
dd	d}|j'd	 }d|d	  d	 }||k rt(d| d| || }|dd|dddddf }|j'\}}}}t	
||||}||ddd|ddddf< ||k rO|ddddddddf } || }!|  d	|!d	d	}"|"|dd|dddddf< |!d	ddd
}|d t	j}|rgt||}|d!ddd	d
d}nKt(d| t|t	j)r|}|j'd |k r||j'd  }!t	
|j'd |j'd	 |!|j'd
 |j'd |j*}#t	j||#gdd}n	t(dt+| g }$|| }%|| }&|&dkrd	}'n
d	|&|% d	 |%  }'td|' d| d| d| d	 |, }(t-|'D ]})|)|% }*t.|*| |}+|+|* },|*|kr ntd|)d	  d|' d|* d|+  |(dddd|*|+ddddf }-|,|k r^||, }!t	
|-j'd |-j'd	 |!|-j'd
 |-j'd |-j*}#t	j|-|#gdd}-|)dkrf|}.n|}.| j/||-|||.||	|) |
|||d}/|/ddddd|,ddddf }/|)dkr|$0|/ n|$0|/dddd|dddddf  |)|'d	 k r|/d d  1d!d"d t	j}0|*|. }1|+}2|0dddd|.dddddf |(dddd|1|2ddddf< qt	j|$dd}3td#|3j'  |3S )$aP  
        Generate video using autoregressive sliding window approach.

        Args:
            prompt: The text prompt describing the desired video content/style.
            input_path: Path to the input image or video file or a torch.Tensor.
            num_output_frames: Total number of frames to generate in the final output.
            chunk_size: Number of frames per chunk (model's native capacity).
            chunk_overlap: Number of overlapping frames between chunks.
            guidance: Classifier-free guidance scale.
            num_latent_conditional_frames: Number of latent conditional frames.
            resolution: Target video resolution in "H,W" format.
            seed: Random seed for reproducibility.
            negative_prompt: Custom negative prompt.
            camera: Target camera extrinsics and intrinsics for the K output videos.
            action: Target robot action for the K output videos.
            num_steps: Number of generation steps.

        Returns:
            torch.Tensor: The generated video tensor (B, C, T, H, W) in the range [-1, 1].
        r   r   c                 S   r   r    r   r   r    r    r!   r     r   zKVideo2WorldInference.generate_autoregressive_from_batch.<locals>.<listcomp>r$   r   Nr   r4   r%   z+Processing image input for autoregressive: r7   r5   r8   z+Processing video input for autoregressive: rM   rN   rP   r   r   zGenerating z chunks with chunk_size=z, chunk_overlap=z for z total frameszProcessing chunk /z	, frames -)r   r   r   r3   rL   r#   r   r   r   r   r   g       @g      ?g        g      ?z!Generated final video with shape )2r   r   r   r   r   r   r   r   r   r   rX   rD   rE   r   r   r9   r:   r;   rQ   r<   r
   rT   r   r>   r*   r+   r,   r?   r@   r1   rA   rB   rC   rF   rR   r   rS   rV   rW   r&   r=   r   r   r   clonerangeminr   r   clamp)4r   r   r   r   r   r   r   rL   r#   r   r   r   r   r   r   r   r   r   Zfull_input_videorG   rH   rY   r`   r\   r]   r^   rd   re   r_   ra   rb   rc   rf   rg   rh   paddingZgenerated_chunksZeffective_chunk_sizeZremaining_after_first
num_chunksZcurrent_input_video	chunk_idxZstart_frameZ	end_frameZactual_chunk_sizeZchunk_inputZchunk_num_conditionalZchunk_videoZchunk_video_uint8Zupdate_startZ
update_endZfinal_videor    r    r!   "generate_autoregressive_from_batch  s   &




(

  
  


&&

&
,  &z7Video2WorldInference.generate_autoregressive_from_batchc                 C   sB   | j dkrddlm} ddlm} | r|  |  dS dS )zClean up distributed resources.r4   r   Nr   )ro   torch.distributedr	   megatron.corer   is_initializeddestroy_model_paralleldestroy_process_group)r   distr   r    r    r!   cleanup  s   
zVideo2WorldInference.cleanup)r4   rk   NFFF)r   r   r   r   r   r   r   boolr   r   _DEFAULT_NEGATIVE_PROMPTr   r   r   r   r   r   r   r    r    r    r!   rj      s    

	

z
R	

 W	

 ^rj   )T)r$   T)(r   r(   r9   dataclassesr   typingr   r   r   r*   r   r   PILr   %cosmos_predict2._src.imaginaire.flagsr   %cosmos_predict2._src.imaginaire.utilsr	   r
   -cosmos_predict2._src.imaginaire.utils.easy_ior   Z3cosmos_predict2._src.interactive.utils.model_loaderr   r   Z2cosmos_predict2._src.predict2.inference.get_t5_embr   Z0cosmos_predict2._src.predict2.utils.model_loaderr<   rR   r   r   r   r   r   r1   r   r   rJ   ri   rj   r    r    r    r!   <module>   sH   &	 1
[