#!/usr/bin/env python3
"""Build the canonical English-only IIW S7 source episode manifest.

The IIW sync contains two title/language variants per production episode.
For training we start from one English-title master per episode and keep the
existing YouTube/bible data only as a cross-reference for the 13 episodes that
were previously available.
"""
from __future__ import annotations

import argparse
import csv
import json
import re
import subprocess
from collections import Counter
from pathlib import Path
from typing import Any

# Base directory for every default path below: the parent of the directory
# that contains this file (parents[0] is the file's own directory).
ROOT = Path(__file__).resolve().parents[1]
# Default value of --iiw-root; build_manifest() looks for the masters beneath it.
DEFAULT_IIW_ROOT = ROOT / "iiw-totallyspies"
# Default destinations for --output-json and --output-csv.
DEFAULT_OUTPUT_JSON = ROOT / "materials/training-data/iiw_english_source_manifest.json"
DEFAULT_OUTPUT_CSV = ROOT / "docs/internal/iiw-english-episode-source-manifest.csv"
# Optional cross-reference inputs; load_existing_crossrefs() reads each one
# only if the file exists.
MASTER_INDEX = ROOT / "materials/benchmark/youtube-s7-validation/bible/master-index.json"
TRAINING_MANIFEST = ROOT / "materials/training-data/manifest.json"

# English-title production masters selected as canonical visual source.
#
# Keys are the two-digit episode numbers captured from the master file name;
# values are the title part of that file name with underscores replaced by
# spaces. build_manifest() compares them with ``==``, so spelling, punctuation
# and suffixes must match the file name exactly.
# NOTE(review): entries such as "PUMPKIN PARTICLE PERIL V2", "MANDYS ..." and
# "SOMETHINGS FISHY" presumably mirror the file names on disk -- confirm
# against the sync before "correcting" them.
ENGLISH_TITLES = {
    "01": "PANDAPOCALYPSE",
    "02": "IT TAKES A SLOB",
    "03": "TOTALLY VINTAGE",
    "04": "STINK-O-RAMA",
    "05": "CREEPY CRAWLY CREATURE CATCHER",
    "06": "TOTALLY TROLLING MUCH",
    "07": "OVER-SIMULATED",
    "08": "IT'S TOTALLY A TEST",
    "09": "TERRIBLE TODDLER TOYS",
    "10": "TOTALLY TALENTED",
    "11": "THE DAH WHO",
    "12": "MEGA MOON CHEESE",
    "13": "THE WILD LIFE",
    "14": "WHAT WOOLLY MAMMOTH",
    "15": "MYSTERY ON THE WOOHP EXPRESS",
    "16": "PUMPKIN PARTICLE PERIL V2",
    "17": "UNDERCOVER SUPERVILLAINS",
    "18": "MANDYS MIND-BLOWING MAINFRAME",
    "19": "OLDIES AND GOODIES",
    "20": "TOTALLY PAWSOME",
    "21": "A DOG GONE DAY",
    "22": "SOMETHINGS FISHY",
    "23": "FOREVER LIPTASTIC",
    "24": "GLITTERSPY",
    "25": "LOCKED IN SPACE PERIL",
    "26": "CYBER SWEETHEART",
}

# Existing bible/training episode name -> production episode/title mapping.
#
# Keys are looked up verbatim against the "name" field of master-index entries
# and the "episode" field of training-manifest clips; values are
# (production episode number, production title) pairs.
# NOTE(review): "Over" and "The DAH" look truncated relative to their
# production titles -- presumably that is how the names are stored in the
# existing data; confirm before changing them.
EXISTING_BIBLE_TO_PRODUCTION = {
    "Frankenpanda": ("01", "PANDAPOCALYPSE"),
    "It Takes A Slob": ("02", "IT TAKES A SLOB"),
    "Totally Vintage": ("03", "TOTALLY VINTAGE"),
    "Creepy Crawly Creature Catcher": ("05", "CREEPY CRAWLY CREATURE CATCHER"),
    "Totally Trolling, Much?": ("06", "TOTALLY TROLLING MUCH"),
    "Over": ("07", "OVER-SIMULATED"),
    "It's Totally a Test": ("08", "IT'S TOTALLY A TEST"),
    "Totally Talented": ("10", "TOTALLY TALENTED"),
    "The DAH": ("11", "THE DAH WHO"),
    "Mega Moon Cheese": ("12", "MEGA MOON CHEESE"),
    "What Woolly Mammoth": ("14", "WHAT WOOLLY MAMMOTH"),
    "Undercover Supervillains": ("17", "UNDERCOVER SUPERVILLAINS"),
    "Totally Pawsome": ("20", "TOTALLY PAWSOME"),
}


def ffprobe(path: Path) -> dict[str, Any]:
    """Probe *path* with the ``ffprobe`` CLI and return its parsed JSON report.

    The report is requested with both the container-level ``format`` section
    and the per-stream ``streams`` section. Because the process is run with
    ``check=True``, a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    command = ["ffprobe", "-v", "error"]
    command += ["-show_format", "-show_streams"]
    command += ["-of", "json", str(path)]
    completed = subprocess.run(command, check=True, capture_output=True, text=True)
    return json.loads(completed.stdout)


def fps_text(rate: str) -> str:
    """Render a rational frame rate such as ``"30000/1001"`` as a short decimal.

    The quotient is formatted with three decimals and trailing zeros (and a
    trailing dot) are stripped, so ``"25/1"`` becomes ``"25"`` and
    ``"30000/1001"`` becomes ``"29.97"``.

    Inputs that cannot be evaluated are returned unchanged instead of raising:
    values without a ``/``, non-numeric parts (``"N/A"`` contains a slash),
    and any denominator that evaluates to zero (``"0"``, ``"00"``, ``"0.0"``).
    An empty or falsy *rate* yields ``""``.
    """
    if not rate or "/" not in rate:
        return rate or ""
    num, den = rate.split("/", 1)
    try:
        value = float(num) / float(den)
    except (ValueError, ZeroDivisionError):
        # Unparseable or undefined rate: keep the raw text so the manifest
        # still records what the probe reported.
        return rate
    return f"{value:.3f}".rstrip("0").rstrip(".")


def rel(path: Path) -> str:
    """Return *path* as a POSIX-style string, relative to ``ROOT`` when possible.

    If *path* cannot be expressed relative to ``ROOT`` it is rendered
    unchanged (still with forward slashes).
    """
    try:
        shown = path.relative_to(ROOT)
    except ValueError:
        shown = path
    return shown.as_posix()


def parse_master_name(path: Path) -> tuple[str, str] | None:
    """Split a master file name into ``(episode, title)``.

    Only the final path component is examined. It must look like
    ``Totally_Spies_S7_EP<two digits>_<title>.mov``; underscores in the title
    part are turned into spaces. Returns ``None`` for any other name.
    """
    found = re.match(r"Totally_Spies_S7_EP(\d{2})_(.*)\.mov$", path.name)
    if found is None:
        return None
    number, raw_title = found.groups()
    return number, raw_title.replace("_", " ")


def load_existing_crossrefs() -> dict[str, dict[str, Any]]:
    """Collect existing bible and training-clip data keyed by production episode.

    Two optional files are consulted; each is skipped when it does not exist:

    * ``MASTER_INDEX`` -- iterated as a sequence of entries. Entries whose
      ``name`` appears in ``EXISTING_BIBLE_TO_PRODUCTION`` contribute the
      ``existing_bible_*`` / ``existing_youtube_title`` fields plus
      ``mapped_production_title``.
    * ``TRAINING_MANIFEST`` -- either an object with a ``clips`` list or a bare
      list of clips. Clips whose ``episode`` maps to a production episode are
      counted into ``existing_training_clip_count``.

    Returns:
        A dict from two-digit production episode number to the collected
        fields. An episode that only has training clips still gets an entry,
        containing just the clip count.
    """
    bible_by_prod: dict[str, dict[str, Any]] = {}
    if MASTER_INDEX.exists():
        # JSON is read as UTF-8 explicitly rather than with the locale default.
        for entry in json.loads(MASTER_INDEX.read_text(encoding="utf-8")):
            mapped = EXISTING_BIBLE_TO_PRODUCTION.get(entry.get("name", ""))
            if not mapped:
                continue
            prod_ep, prod_title = mapped
            bible_by_prod[prod_ep] = {
                "existing_bible_episode_id": entry.get("episode_id", ""),
                "existing_bible_name": entry.get("name", ""),
                "existing_youtube_title": entry.get("title", ""),
                "existing_bible_duration_s": entry.get("duration", ""),
                "existing_bible_shots": entry.get("n_shots", ""),
                "existing_bible_frames": entry.get("n_frames", ""),
                "existing_bible_segments": entry.get("n_segments", ""),
                "mapped_production_title": prod_title,
            }

    clip_counts: Counter[str] = Counter()
    if TRAINING_MANIFEST.exists():
        payload = json.loads(TRAINING_MANIFEST.read_text(encoding="utf-8"))
        # Decide on the payload shape before touching it: calling ``.get`` on
        # a bare list would raise AttributeError before any fallback applied.
        if isinstance(payload, dict):
            clips = payload.get("clips", [])
        elif isinstance(payload, list):
            clips = payload
        else:
            clips = []
        for clip in clips:
            mapped = EXISTING_BIBLE_TO_PRODUCTION.get(clip.get("episode", ""))
            if mapped:
                clip_counts[mapped[0]] += 1
    for prod_ep, count in clip_counts.items():
        bible_by_prod.setdefault(prod_ep, {})["existing_training_clip_count"] = count
    return bible_by_prod


def build_manifest(iiw_root: Path) -> list[dict[str, Any]]:
    """Probe every English master under *iiw_root* and return one row per episode.

    Masters are the ``*.mov`` files in ``02_Elements/MASTER EP/H264`` whose
    parsed title equals the ``ENGLISH_TITLES`` entry for their episode number.
    Each selected file is inspected with ``ffprobe`` and merged with whatever
    ``load_existing_crossrefs()`` knows about that episode. Rows are returned
    in ascending episode order.

    Raises:
        SystemExit: if the master directory does not exist, or if any episode
            listed in ``ENGLISH_TITLES`` has no matching master file.
    """
    master_dir = iiw_root / "02_Elements/MASTER EP/H264"
    if not master_dir.exists():
        raise SystemExit(f"Missing master directory: {master_dir}")

    crossrefs = load_existing_crossrefs()

    # Pick the English-title file per episode; other variants are ignored.
    english_masters: dict[str, Path] = {}
    for candidate in sorted(master_dir.glob("*.mov")):
        parsed = parse_master_name(candidate)
        if parsed is None:
            continue
        number, title = parsed
        if ENGLISH_TITLES.get(number) == title:
            english_masters[number] = candidate

    missing = sorted(number for number in ENGLISH_TITLES if number not in english_masters)
    if missing:
        raise SystemExit(f"Missing English masters for production episodes: {', '.join(missing)}")

    rows: list[dict[str, Any]] = []
    for episode, path in sorted(english_masters.items()):
        probe = ffprobe(path)
        streams = probe.get("streams", [])
        container = probe.get("format", {})

        # First video stream (or an empty dict) and every audio stream.
        video: dict[str, Any] = {}
        for stream in streams:
            if stream.get("codec_type") == "video":
                video = stream
                break
        audio_tracks = [stream for stream in streams if stream.get("codec_type") == "audio"]
        audio_parts = [
            f"{track.get('codec_name','')} {track.get('channels','')}ch {track.get('sample_rate','')}Hz {track.get('bit_rate','')}bps"
            for track in audio_tracks
        ]

        frame_rate = video.get("r_frame_rate", "")
        if episode in crossrefs:
            bible_status = "mapped_existing_13"
        else:
            bible_status = "new_needs_bible_metadata"

        row: dict[str, Any] = {
            "production_episode": episode,
            "production_code": f"7{episode}",
            "canonical_title": ENGLISH_TITLES[episode],
            "source_master_path": rel(path),
            "source_master_basename": path.name,
            "width": video.get("width", ""),
            "height": video.get("height", ""),
            "fps": fps_text(str(frame_rate)),
            "r_frame_rate": frame_rate,
            "video_codec": video.get("codec_name", ""),
            "pixel_format": video.get("pix_fmt", ""),
            # Missing or empty numeric fields fall back to 0.
            "video_bit_rate_bps": int(video.get("bit_rate") or 0),
            "duration_s": round(float(container.get("duration") or 0), 3),
            "size_bytes": int(container.get("size") or 0),
            "format_bit_rate_bps": int(container.get("bit_rate") or 0),
            "audio_stream_count": len(audio_tracks),
            "audio_summary": " | ".join(audio_parts),
            "source_status": "canonical_english_master",
            "existing_bible_status": bible_status,
        }

        # Layer the cross-reference fields on top, then make sure every row
        # carries the same cross-reference keys even when nothing is known.
        row.update(crossrefs.get(episode, {}))
        for blank_key in (
            "existing_bible_episode_id",
            "existing_bible_name",
            "existing_youtube_title",
            "existing_bible_duration_s",
            "existing_bible_shots",
            "existing_bible_frames",
            "existing_bible_segments",
        ):
            row.setdefault(blank_key, "")
        row.setdefault("existing_training_clip_count", 0)
        rows.append(row)
    return rows


def write_json(rows: list[dict[str, Any]], output: Path) -> None:
    """Write *rows* to *output* as the JSON manifest document.

    The document wraps the rows in an envelope carrying the schema id, the
    source directory label, the selection rule and the row count. Parent
    directories of *output* are created as needed and the file ends with a
    newline.

    The file is always encoded as UTF-8. Because ``ensure_ascii=False`` emits
    non-ASCII characters verbatim, relying on the platform's default encoding
    could fail or produce JSON that other tools cannot decode.
    """
    output.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "schema": "iiw_english_source_manifest/v1",
        # NOTE(review): fixed label; it does not follow a custom --iiw-root.
        "source_root": "iiw-totallyspies/02_Elements/MASTER EP/H264",
        "selection_rule": "English-title production master only, one canonical visual source per production episode.",
        "count": len(rows),
        "episodes": rows,
    }
    output.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )


def write_csv(rows: list[dict[str, Any]], output: Path) -> None:
    """Write a fixed subset of each row's fields to *output* as UTF-8 CSV.

    A header line is always written. Row keys outside the column list are
    dropped, and listed columns missing from a row are left empty. Parent
    directories of *output* are created as needed.
    """
    columns = [
        # Identity of the episode and its source file.
        "production_episode",
        "production_code",
        "canonical_title",
        "source_master_path",
        # Video and container properties.
        "width",
        "height",
        "fps",
        "video_codec",
        "pixel_format",
        "video_bit_rate_bps",
        "duration_s",
        "size_bytes",
        "audio_stream_count",
        # Cross-reference to the existing bible / training data.
        "existing_bible_status",
        "existing_bible_episode_id",
        "existing_bible_name",
        "existing_training_clip_count",
        "existing_bible_shots",
        "existing_bible_segments",
        "audio_summary",
    ]
    output.parent.mkdir(parents=True, exist_ok=True)
    with output.open("w", newline="", encoding="utf-8") as sink:
        table = csv.DictWriter(sink, fieldnames=columns, extrasaction="ignore")
        table.writeheader()
        for record in rows:
            table.writerow(record)


def main() -> None:
    """Parse the command line, build the manifest and write both output files.

    Prints a short summary of what was written, including how many episodes
    were matched to existing bible data.
    """
    cli = argparse.ArgumentParser()
    for flag, fallback in (
        ("--iiw-root", DEFAULT_IIW_ROOT),
        ("--output-json", DEFAULT_OUTPUT_JSON),
        ("--output-csv", DEFAULT_OUTPUT_CSV),
    ):
        cli.add_argument(flag, type=Path, default=fallback)
    options = cli.parse_args()

    episodes = build_manifest(options.iiw_root)
    write_json(episodes, options.output_json)
    write_csv(episodes, options.output_csv)

    already_mapped = len(
        [row for row in episodes if row["existing_bible_status"] == "mapped_existing_13"]
    )
    print(f"Wrote {options.output_json} ({len(episodes)} English masters, {already_mapped} mapped to existing bible)")
    print(f"Wrote {options.output_csv}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()