#!/bin/bash
# Download DROID dataset chunk-by-chunk to avoid HF API rate limits.
# Each chunk has ~1000 episodes (~3000 files: parquet + ext1 + ext2 videos).
# Runs N_PARALLEL chunks concurrently for speed.
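#
# Prerequisites (not installed by this script): the Hugging Face CLI plus the
# hf_transfer backend, e.g. `pip install -U "huggingface_hub[cli]" hf_transfer`.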

set -e

LOCAL_DIR="/data/cameron/droid"
REPO="cadene/droid_1.0.1"
N_CHUNKS=96        # chunk-000 through chunk-095
N_PARALLEL=4       # concurrent downloads
MAX_RETRIES=5

export HF_HUB_ENABLE_HF_TRANSFER=1
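# hf_transfer is a faster Rust-based download backend; with this flag set,
# huggingface_hub errors out unless the hf_transfer package is installed.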

download_chunk() {
    local chunk_id=$1
    local chunk=$(printf "chunk-%03d" "$chunk_id")
    local logfile="$LOCAL_DIR/logs/${chunk}.log"
    mkdir -p "$LOCAL_DIR/logs"

    # Check if chunk is already complete
    local n_pq=$(ls "$LOCAL_DIR/data/$chunk/"*.parquet 2>/dev/null | wc -l)
    local n_ext2=$(ls "$LOCAL_DIR/videos/$chunk/observation.images.exterior_2_left/"*.mp4 2>/dev/null | wc -l)
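    # NOTE: ext1 videos are not counted; the ext2 count serves as a proxy since
    # both camera patterns are fetched in the same call. Add an ext1 count here
    # if you need a strict guarantee.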

    # Last chunk (chunk-095) has 600 episodes, all others have 1000
    local expected=1000
    if [ "$chunk_id" -eq 95 ]; then expected=600; fi

    if [ "$n_pq" -ge "$expected" ] && [ "$n_ext2" -ge "$expected" ]; then
        echo "[$chunk] Already complete ($n_pq pq, $n_ext2 ext2)"
        return 0
    fi

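    # huggingface-cli skips files already present on disk, so each retry
    # should only fetch whatever is still missing.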
    for attempt in $(seq 1 $MAX_RETRIES); do
        echo "[$chunk] Attempt $attempt (have $n_pq pq, $n_ext2 ext2)..."
        huggingface-cli download "$REPO" \
            --repo-type dataset \
            --local-dir "$LOCAL_DIR" \
            --include "data/$chunk/**" \
                      "videos/$chunk/observation.images.exterior_1_left/**" \
                      "videos/$chunk/observation.images.exterior_2_left/**" \
            >> "$logfile" 2>&1

        n_pq=$(ls "$LOCAL_DIR/data/$chunk/"*.parquet 2>/dev/null | wc -l)
        n_ext2=$(ls "$LOCAL_DIR/videos/$chunk/observation.images.exterior_2_left/"*.mp4 2>/dev/null | wc -l)

        if [ "$n_pq" -ge "$expected" ] && [ "$n_ext2" -ge "$expected" ]; then
            echo "[$chunk] Complete ($n_pq pq, $n_ext2 ext2)"
            return 0
        fi

        echo "[$chunk] Incomplete after attempt $attempt. Sleeping 30s..."
        sleep 30
    done

    echo "[$chunk] FAILED after $MAX_RETRIES attempts ($n_pq pq, $n_ext2 ext2)"
    return 1
}

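# Export the function and the variables it reads so the bash children
# spawned by xargs can see them.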
export -f download_chunk
export LOCAL_DIR REPO MAX_RETRIES

echo "Downloading DROID dataset chunk-by-chunk ($N_PARALLEL parallel)..."
echo "Target: $LOCAL_DIR"
echo ""

# Meta files already downloaded separately (skip the 966MB stats file)
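# A sketch of that separate step (the excluded filename is an assumption;
# adjust to the actual repo layout):
#   huggingface-cli download "$REPO" --repo-type dataset --local-dir "$LOCAL_DIR" \
#       --include "meta/**" --exclude "meta/episodes_stats.jsonl"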

# Run chunks in parallel using xargs; pass the chunk id as a positional
# argument rather than splicing {} into the command string. A failed chunk
# must not abort the script (set -e) before the summary prints.
seq 0 $((N_CHUNKS - 1)) | xargs -P "$N_PARALLEL" -I {} bash -c 'download_chunk "$@"' _ {} \
    || echo "WARNING: some chunks failed after $MAX_RETRIES attempts; check $LOCAL_DIR/logs/"
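
# The completeness check makes this idempotent: re-running the script skips
# finished chunks, so an interrupted run can simply be restarted.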

echo ""
echo "=== Download Summary ==="
echo "Parquets: $(find $LOCAL_DIR/data -name '*.parquet' | wc -l)"
echo "Ext1 videos: $(find $LOCAL_DIR/videos -path '*/exterior_1_left/*.mp4' | wc -l)"
echo "Ext2 videos: $(find $LOCAL_DIR/videos -path '*/exterior_2_left/*.mp4' | wc -l)"
echo "Total size: $(du -sh $LOCAL_DIR | cut -f1)"
echo "Done!"
