"""Generate aria2c input file and download DROID dataset via direct URLs.

No HF API calls — constructs URLs directly from known naming convention.
aria2c handles parallel downloads, resumption, and retries natively.
"""

import os
import subprocess
from pathlib import Path

# Root directory on local disk; every downloaded file is placed beneath it
# using the same relative path it has in the remote repository.
LOCAL_DIR = Path("/data/cameron/droid")
# URL prefix; a file's repo-relative path is appended to form its download URL.
BASE_URL = "https://huggingface.co/datasets/cadene/droid_1.0.1/resolve/main"
# Number of episodes to enumerate (indices 0 .. TOTAL_EPISODES - 1).
TOTAL_EPISODES = 95600
# Camera names; one .mp4 per camera is requested for every episode.
CAMERAS = ["exterior_1_left", "exterior_2_left"]

# Get HF token for authenticated (faster) downloads
token_path = Path.home() / ".cache" / "huggingface" / "token"


def load_hf_token(path=token_path):
    """Return the Hugging Face access token, or None if none is configured.

    The ``HF_TOKEN`` environment variable is consulted first; if it is unset
    or blank, the token file at *path* is read instead.

    Args:
        path: Location of the token file to fall back to.

    Returns:
        The token with surrounding whitespace removed, or ``None`` when no
        non-empty token is available.
    """
    env_token = os.environ.get("HF_TOKEN", "").strip()
    if env_token:
        return env_token
    try:
        file_token = Path(path).read_text().strip()
    except OSError:
        # Missing or unreadable token file: fall back to anonymous downloads.
        return None
    # A blank token file is treated the same as no token at all.
    return file_token or None


token = load_hf_token()
if token:
    # Deliberately do not echo any part of the secret to the console/logs.
    print("Using HF token for authenticated downloads")


def build_aria2_input(local_dir=None, base_url=None, total_episodes=None, cameras=None):
    """Generate aria2c input-file text for every file not yet fully downloaded.

    For each episode index the parquet file and one video per camera are
    considered. A file is skipped only when it exists locally AND has no
    sibling ``<name>.aria2`` control file; aria2c keeps that control file next
    to a download while it is in progress, so its presence marks a partial
    file that still needs to be resumed.

    Args:
        local_dir: Root directory for downloaded files (default ``LOCAL_DIR``).
        base_url: URL prefix the relative path is appended to
            (default ``BASE_URL``).
        total_episodes: Number of episode indices to enumerate, starting at 0
            (default ``TOTAL_EPISODES``).
        cameras: Iterable of camera names (default ``CAMERAS``).

    Returns:
        The input-file content as a single string: for each pending file an
        unindented URL line, indented ``dir=`` and ``out=`` option lines, and
        a blank separator line. Empty string when nothing is pending.
    """
    local_dir = LOCAL_DIR if local_dir is None else Path(local_dir)
    base_url = BASE_URL if base_url is None else base_url
    total_episodes = TOTAL_EPISODES if total_episodes is None else total_episodes
    cameras = CAMERAS if cameras is None else cameras

    def is_complete(path):
        # An existing file with a leftover control file is an interrupted download.
        control_file = path.with_name(path.name + ".aria2")
        return path.exists() and not control_file.exists()

    lines = []
    for ep in range(total_episodes):
        # Episodes are grouped in directories of 1000 (chunk-000, chunk-001, ...).
        chunk = f"chunk-{ep // 1000:03d}"
        ep_str = f"episode_{ep:06d}"

        # Parquet first, then one video per camera.
        rel_paths = [f"data/{chunk}/{ep_str}.parquet"]
        rel_paths.extend(
            f"videos/{chunk}/observation.images.{cam}/{ep_str}.mp4" for cam in cameras
        )

        for rel in rel_paths:
            local = local_dir / rel
            if is_complete(local):
                continue
            lines.extend(
                [
                    f"{base_url}/{rel}",
                    f"  dir={local.parent}",
                    f"  out={local.name}",
                    "",
                ]
            )

    return "\n".join(lines)


def main():
    """Write the aria2c input file and run aria2c on it.

    Generates the list of pending files, saves it to
    ``LOCAL_DIR / "aria2_input.txt"``, and, if anything is pending, launches
    aria2c with that input file. When a token is available it is passed as an
    ``Authorization: Bearer`` header.

    Raises:
        SystemExit: If the aria2c executable cannot be found, or if aria2c
            exits with a non-zero status (the same status is propagated).
    """
    # The input file lives in LOCAL_DIR, so it must exist before writing.
    LOCAL_DIR.mkdir(parents=True, exist_ok=True)
    input_file = LOCAL_DIR / "aria2_input.txt"

    print("Generating URL list...")
    content = build_aria2_input()
    # Each pending file contributes exactly one unindented URL line; its
    # option lines are indented and entries are separated by blank lines.
    n_files = sum(
        1 for line in content.splitlines() if line and not line[0].isspace()
    )
    input_file.write_text(content)
    print(f"Files to download: {n_files}")

    if n_files == 0:
        print("All files already downloaded!")
        return

    # Build aria2c command
    cmd = [
        "aria2c",
        f"--input-file={input_file}",
        "--max-concurrent-downloads=16",
        "--max-connection-per-server=4",
        "--min-split-size=1M",
        "--split=4",
        "--retry-wait=10",
        "--max-tries=5",
        "--continue=true",
        "--auto-file-renaming=false",
        "--console-log-level=warn",
        "--summary-interval=30",
    ]
    if token:
        cmd.append(f"--header=Authorization: Bearer {token}")

    print("Launching aria2c with 16 concurrent downloads...")
    # Only the first few arguments are shown so the auth header is never printed.
    print(f"Command: {' '.join(cmd[:5])} ...")
    try:
        result = subprocess.run(cmd)
    except FileNotFoundError:
        raise SystemExit("aria2c not found on PATH; install aria2 and retry") from None

    if result.returncode != 0:
        # Do not report success when aria2c signalled that something failed.
        print(f"aria2c exited with status {result.returncode}; some downloads may be incomplete")
        raise SystemExit(result.returncode)
    print("Done!")


if __name__ == "__main__":
    main()
