#!/usr/bin/env python3
"""Build the current Wan2.2 pilot metadata package from IIW pilot clips.

This does not duplicate the 2.2 GB clip folder unless --zip is requested. It
rewrites DiffSynth/Wan metadata from the current manifest, filtering out clips
that have no usable caption or are explicitly marked training_usable=false by
the VLM caption pass, and records the reviewed character identity anchor
manifest to use alongside the video clips.
"""
from __future__ import annotations

import argparse
import json
import subprocess
from collections import Counter
from pathlib import Path
from typing import Any

# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# Default locations, both anchored at ROOT; overridable from the command line.
DEFAULT_PILOT_DIR = ROOT.joinpath("materials/training-data/iiw-english-pilot")
DEFAULT_IDENTITY_MANIFEST = ROOT.joinpath(
    "materials/training-data/iiw-character-identity/review/usable_identity_manifest.vlm_reviewed.json"
)


def load_json(path: Path) -> Any:
    """Parse the JSON document stored at *path* and return the decoded value.

    The file is always decoded as UTF-8 rather than the locale's preferred
    encoding, so manifests containing non-ASCII captions load the same way on
    every platform.

    Raises:
        OSError: if the file cannot be read.
        UnicodeDecodeError: if the file is not valid UTF-8.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8"))


def write_json(path: Path, payload: Any) -> None:
    """Serialize *payload* as indented JSON to *path*, ending with a newline.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file is explicitly encoded as UTF-8; relying on the locale encoding could
    fail or produce a file that is not valid UTF-8 JSON.

    Raises:
        OSError: if the file cannot be written.
        TypeError: if *payload* contains a value that is not JSON serializable.
    """
    text = json.dumps(payload, indent=2, ensure_ascii=False) + "\n"
    path.write_text(text, encoding="utf-8")


def rel(path: Path) -> str:
    """Return *path* as a POSIX-style string relative to ROOT when possible.

    Paths that do not lie under ROOT are returned unchanged via ``str(path)``.
    """
    try:
        inside_root = path.relative_to(ROOT)
    except ValueError:
        # Not located under ROOT: keep the path exactly as given.
        return str(path)
    return inside_root.as_posix()


def training_prompt(clip: dict[str, Any]) -> str:
    """Return the caption text to train on for *clip*, or ``""`` if none.

    ``training_caption`` is preferred and ``caption`` is the fallback. Each
    candidate is stripped before it is tested, so a whitespace-only
    ``training_caption`` no longer hides a usable ``caption``. Values that are
    not strings are ignored instead of raising on ``.strip()``.
    """
    for key in ("training_caption", "caption"):
        candidate = clip.get(key)
        if not isinstance(candidate, str):
            continue
        text = candidate.strip()
        if text:
            return text
    return ""


def build_rows(clips: list[dict[str, Any]]) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
    """Split manifest clips into training metadata rows and rejection records.

    A clip is rejected with reason ``empty_prompt`` when ``training_prompt``
    yields nothing for it, or with reason ``training_usable_false`` when its
    ``training_usable`` field is exactly ``False``. Every other clip produces
    one DiffSynth row and one Wan row, in manifest order.

    Returns:
        A ``(diffsynth_rows, wan_rows, rejected)`` tuple.

    Raises:
        KeyError: if a kept clip has no ``clip`` or ``first_frame`` entry.
    """
    # Phase 1: classify every clip before building any row.
    kept: list[dict[str, Any]] = []
    rejected: list[dict[str, Any]] = []
    for entry in clips:
        if not training_prompt(entry):
            reason = "empty_prompt"
        elif entry.get("training_usable") is False:
            reason = "training_usable_false"
        else:
            kept.append(entry)
            continue
        rejected.append({"clip": entry.get("clip", ""), "reason": reason})

    # Phase 2: emit one row per kept clip for each metadata format.
    diffsynth_rows = []
    wan_rows = []
    for entry in kept:
        video_path = f"clips/{entry['clip']}"
        frame_path = f"first_frames/{entry['first_frame']}"
        caption = training_prompt(entry)

        # Keys are inserted in the order they must appear in the output file.
        diffsynth_row: dict[str, Any] = {
            "prompt": caption,
            "video": video_path,
            "input_image": frame_path,
        }
        for key in ("episode", "production_episode", "production_code", "source_master_path"):
            diffsynth_row[key] = entry.get(key, "")
        # Timing fields stay None (JSON null) when the manifest omits them.
        for key in ("start_s", "duration"):
            diffsynth_row[key] = entry.get(key)
        for key in ("location", "scene_type"):
            diffsynth_row[key] = entry.get(key, "")
        diffsynth_row["characters"] = entry.get("characters", [])
        diffsynth_row["caption_source"] = entry.get("caption_source", "")
        diffsynth_rows.append(diffsynth_row)

        wan_row: dict[str, Any] = {
            "media_path": video_path,
            "first_frame": frame_path,
            "caption": caption,
        }
        for key in ("duration", "production_episode", "caption_source"):
            wan_row[key] = entry.get(key, "")
        wan_rows.append(wan_row)

    return diffsynth_rows, wan_rows, rejected


def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write *rows* to *path* as UTF-8 JSON Lines, one object per line.

    Non-ASCII characters are kept as-is rather than escaped. An empty *rows*
    list produces an empty file.
    """
    serialized = (json.dumps(record, ensure_ascii=False) + "\n" for record in rows)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(serialized)


def zip_package(pilot_dir: Path, output_zip: Path) -> None:
    """Archive the pilot media and metadata into *output_zip* with the ``zip`` CLI.

    The archive members are named relative to *pilot_dir*: the ``clips`` and
    ``first_frames`` folders plus the manifest, extraction plan and generated
    metadata files.

    Args:
        pilot_dir: Directory holding the files to archive; used as the working
            directory of the ``zip`` process.
        output_zip: Destination archive. A relative path is interpreted
            against the caller's current working directory.

    Raises:
        FileNotFoundError: if the ``zip`` executable is not available.
        subprocess.CalledProcessError: if ``zip`` exits with a non-zero status.
    """
    # zip runs with cwd=pilot_dir, so a relative output path would otherwise be
    # re-interpreted against pilot_dir. Make it absolute first so the parent
    # creation, the stale-archive removal and zip itself all target one file.
    output_zip = output_zip.resolve()
    output_zip.parent.mkdir(parents=True, exist_ok=True)
    # Start from a clean archive: zip adds to an existing archive instead of
    # replacing it, which would keep entries for files that no longer exist.
    output_zip.unlink(missing_ok=True)
    subprocess.run(
        [
            "zip", "-r", str(output_zip),
            "clips", "first_frames", "manifest.json", "extraction_plan.json",
            "diffsynth_metadata.jsonl", "wan21_metadata.json", "wan2.1_metadata.json",
            "wan22_pilot_package_manifest.json",
        ],
        cwd=pilot_dir,
        check=True,
    )


def main() -> None:
    """Regenerate the pilot training metadata and the package manifest.

    Reads ``manifest.json`` from ``--pilot-dir`` and writes
    ``diffsynth_metadata.jsonl``, ``wan21_metadata.json`` and
    ``wan2.1_metadata.json`` next to it. When the ``--identity-manifest`` file
    exists, its items are counted by their ``use`` value. A summary is then
    written to ``wan22_pilot_package_manifest.json`` and printed. With
    ``--zip`` the media and metadata are also archived inside the pilot
    directory.
    """
    # Single source for the archive name so the --zip help text cannot drift
    # from the file that is actually written.
    zip_name = "wan22_iiw_english_pilot.zip"

    parser = argparse.ArgumentParser()
    parser.add_argument("--pilot-dir", type=Path, default=DEFAULT_PILOT_DIR)
    parser.add_argument("--identity-manifest", type=Path, default=DEFAULT_IDENTITY_MANIFEST)
    parser.add_argument("--zip", action="store_true", help=f"Also create {zip_name}; duplicates ~2.2GB of media.")
    args = parser.parse_args()

    manifest_path = args.pilot_dir / "manifest.json"
    manifest = load_json(manifest_path)
    clips = manifest.get("clips", [])
    diffsynth_rows, wan_rows, rejected = build_rows(clips)

    write_jsonl(args.pilot_dir / "diffsynth_metadata.jsonl", diffsynth_rows)
    # The same Wan rows are written under both file names.
    write_json(args.pilot_dir / "wan21_metadata.json", wan_rows)
    write_json(args.pilot_dir / "wan2.1_metadata.json", wan_rows)

    # Identity anchors are optional: all counts stay at zero when the file is absent.
    identity_count = 0
    identity_train = 0
    identity_eval = 0
    has_identity_manifest = args.identity_manifest.exists()
    if has_identity_manifest:
        identity = load_json(args.identity_manifest)
        identity_items = identity.get("items", [])
        identity_count = len(identity_items)
        identity_use_counts = Counter(item.get("use", "") for item in identity_items)
        identity_train = identity_use_counts.get("train_identity_anchor", 0)
        identity_eval = identity_use_counts.get("eval_or_low_weight_reference", 0)

    # Computed once; used by both the package manifest and the console summary.
    rejection_counts = dict(sorted(Counter(row["reason"] for row in rejected).items()))

    package_manifest = {
        "schema": "iiw_wan22_pilot_package/v1",
        "pilot_dir": rel(args.pilot_dir),
        "video_clip_count_manifest": len(clips),
        "video_clip_count_training_metadata": len(diffsynth_rows),
        "video_clip_count_rejected_from_training_metadata": len(rejected),
        "rejection_counts": rejection_counts,
        "episode_counts_training_metadata": dict(sorted(Counter(row["production_episode"] for row in diffsynth_rows).items())),
        "caption_source_counts_training_metadata": dict(sorted(Counter(row["caption_source"] for row in diffsynth_rows).items())),
        "identity_anchor_manifest": rel(args.identity_manifest) if has_identity_manifest else "",
        "identity_anchor_count_usable": identity_count,
        "identity_anchor_count_train": identity_train,
        "identity_anchor_count_eval_low_weight": identity_eval,
        "outputs": {
            "diffsynth_metadata": "diffsynth_metadata.jsonl",
            "wan21_metadata": "wan21_metadata.json",
            "wan2_1_metadata": "wan2.1_metadata.json",
        },
        "notes": [
            "Video metadata excludes clips explicitly marked training_usable=false by the VLM caption pass.",
            "Identity anchors are referenced as a companion reviewed manifest; current DiffSynth video metadata remains video-first.",
            "Old YouTube-derived clips remain untouched and are not part of this package.",
        ],
        "rejected_clips": rejected,
    }
    package_manifest_path = args.pilot_dir / "wan22_pilot_package_manifest.json"
    write_json(package_manifest_path, package_manifest)

    if args.zip:
        zip_package(args.pilot_dir, args.pilot_dir / zip_name)

    print(f"Training metadata rows: {len(diffsynth_rows)}")
    print(f"Rejected rows: {len(rejected)} {rejection_counts}")
    print(f"Identity companion: {identity_count} usable ({identity_train} train, {identity_eval} eval/low-weight)")
    print(f"Wrote {package_manifest_path}")


# Build the package only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()