import json
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def download_episode(episode_info):
    """Sync a single episode directory from S3 into the local dataset root.

    Reads the module-level ``src_s3_path`` and ``local_path`` settings.

    Args:
        episode_info: ``(episode_id, episode_meta)`` pair. ``episode_meta``
            must be a mapping with a ``"relative_path"`` entry locating the
            episode relative to the dataset root.

    Returns:
        The ``subprocess.CompletedProcess`` of the ``aws s3 sync`` call.

    Raises:
        subprocess.CalledProcessError: if ``aws s3 sync`` exits non-zero.
    """
    _episode_id, episode_meta = episode_info
    # Drop leading slashes so the path is treated as relative to both roots.
    rel_episode_path = episode_meta["relative_path"].lstrip("/")

    # S3 URIs always use "/" as separator, so join explicitly instead of using
    # os.path.join, whose separator depends on the host OS.
    src_episode_path = f"{src_s3_path.rstrip('/')}/{rel_episode_path}"
    local_episode_path = os.path.join(local_path, rel_episode_path)

    # Create the parent directory of the episode if it doesn't exist
    os.makedirs(os.path.dirname(local_episode_path), exist_ok=True)

    # Skip the *.svo, *-stereo.mp4 and *_im128.h5 files, and pass --size-only
    # to the sync.
    sync_options = [
        "--exclude", "*.svo",
        "--exclude", "*-stereo.mp4",
        "--exclude", "*_im128.h5",
        "--size-only",
    ]

    return subprocess.run(
        ["aws", "s3", "sync", src_episode_path, local_episode_path] + sync_options,
        check=True,  # surface a failed sync as CalledProcessError
    )

# Local dataset root and the S3 prefix it mirrors.
local_path = "/home/junjieye/datasets/droid_raw/1.0.1"
src_s3_path = "s3://scratch-tri-global/zubair.irshad/droid_raw/1.0.1" # "s3://robotics-manip-lbm/junjie/droid_raw_subset/1.0.1"

# Mapping of episode id -> episode metadata (each entry carries "relative_path").
episodes_json_path = "reformatted_episodes_with_good_extrinsics.json"
with open(episodes_json_path, "r") as f:
    episodes_with_good_extrinsics = json.load(f)
# To download every episode instead of only the failed ones:
# episodes = list(episodes_with_good_extrinsics.items())

# One episode id per line. Blank lines are ignored and duplicates are dropped
# (order preserved) so the same directory is never synced twice concurrently.
with open('failed_episodes.txt', 'r') as f:
    stripped_lines = (line.strip() for line in f)
    failed_episodes = list(dict.fromkeys(eid for eid in stripped_lines if eid))

# Fail early, listing the offending ids, rather than with a bare KeyError on
# the first unknown id.
unknown_episode_ids = [
    episode_id
    for episode_id in failed_episodes
    if episode_id not in episodes_with_good_extrinsics
]
if unknown_episode_ids:
    raise KeyError(
        f"{len(unknown_episode_ids)} episode id(s) from failed_episodes.txt are "
        f"missing from {episodes_json_path}; first few: {unknown_episode_ids[:10]}"
    )

episodes = [(episode_id, episodes_with_good_extrinsics[episode_id]) for episode_id in failed_episodes]

# ## test
# result = download_episode(episodes[0])
# breakpoint()

# Run the downloads on 8 worker threads. Each future's result is retrieved so
# that an exception raised inside download_episode is reported instead of
# being silently dropped.
still_failed = []
with ThreadPoolExecutor(max_workers=8) as ex:
    future_to_episode_id = {ex.submit(download_episode, it): it[0] for it in episodes}
    futures = list(future_to_episode_id)
    for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading episodes"):
        try:
            future.result()
        except Exception as exc:  # top-level boundary: record and keep going
            failed_id = future_to_episode_id[future]
            still_failed.append(failed_id)
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f"Download failed for episode {failed_id}: {exc!r}")

if still_failed:
    # Exit non-zero so callers/shell scripts can detect the partial failure.
    raise SystemExit(
        f"{len(still_failed)} of {len(futures)} episode downloads failed:\n"
        + "\n".join(still_failed)
    )
