#!/usr/bin/env python3
"""Build a small character-identity image dataset from exported IIW PNG plates."""
from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any

# Directory two levels above this file (the parent of the directory holding
# this script). NOTE(review): presumably the repository root — confirm.
ROOT = Path(__file__).resolve().parents[1]
# Derived manifests read by main() when no --derived-manifest option is given.
DEFAULT_DERIVED_MANIFESTS = [
    ROOT / "materials/training-data/iiw-character-plates-pilot/png_2048/derived_manifest.json",
    ROOT / "materials/training-data/iiw-character-plates-secondary/png_2048/derived_manifest.json",
]
# Default value of --output-dir; manifest.json and identity_metadata.jsonl are
# written here.
DEFAULT_OUTPUT_DIR = ROOT / "materials/training-data/iiw-character-identity"

# Per-character weight stored on each dataset entry and echoed under the
# manifest's "weights" key. Characters not listed here get 0.25 in main().
WEIGHTS = {
    "Alex": 1.0,
    "Clover": 1.0,
    "Sam": 1.0,
    "Zerlina": 0.55,
    "Toby": 0.45,
    "Jerry": 0.40,
    "Mandy": 0.40,
    "Glitterstar": 0.35,
    "Cyberchac": 0.35,
    "WOOHP agents": 0.30,
}


def load_derivatives(paths: list[Path]) -> list[dict[str, Any]]:
    """Collect derivative rows whose derived image exists on disk.

    Args:
        paths: Derived-manifest JSON files to read. Manifests that do not
            exist are skipped silently so optional plate sets can be absent.

    Returns:
        The rows of every manifest's ``"derivatives"`` list, in input order,
        limited to rows whose ``"derived_path"`` (resolved against ``ROOT``)
        is an existing regular file.

    Raises:
        KeyError: If a row has no ``"derived_path"`` key.
        json.JSONDecodeError: If a manifest is not valid JSON.
    """
    rows: list[dict[str, Any]] = []
    for path in paths:
        if not path.exists():
            continue
        # JSON is UTF-8 by specification; do not depend on the locale's
        # preferred encoding, which differs across platforms.
        payload = json.loads(path.read_text(encoding="utf-8"))
        for row in payload.get("derivatives", []):
            derived = ROOT / row["derived_path"]
            # Require a regular file: a directory (e.g. an empty
            # derived_path, which resolves to ROOT itself) is not an image.
            if derived.is_file():
                rows.append(row)
    return rows


# Weight recorded for characters that have no entry in WEIGHTS.
_DEFAULT_WEIGHT = 0.25
# Asset types tagged as evaluation / low-weight references instead of
# identity anchors.
_REFERENCE_ASSET_TYPES = frozenset({"lineup_sheet", "detail_reference"})


def _build_entry(row: dict[str, Any]) -> dict[str, Any]:
    """Convert one derived-manifest row into a dataset entry dict."""
    character = row.get("character", "")
    asset_type = row.get("asset_type", "")
    if asset_type in _REFERENCE_ASSET_TYPES:
        use = "eval_or_low_weight_reference"
    else:
        use = "train_identity_anchor"
    # Key order is kept stable because it determines the emitted JSON layout.
    return {
        "image_path": row["derived_path"],
        "caption": row.get("caption", ""),
        "character": character,
        "asset_type": asset_type,
        "outfit_hint": row.get("outfit_hint", ""),
        "production_code": row.get("production_code", ""),
        "source_path": row.get("source_path", ""),
        # Prefer the derived image's dimensions; fall back to the row's own.
        "width": row.get("derived_width", row.get("width", "")),
        "height": row.get("derived_height", row.get("height", "")),
        "weight": WEIGHTS.get(character, _DEFAULT_WEIGHT),
        "use": use,
        "source": "iiw-character-plate-png",
    }


def main() -> None:
    """Build the identity dataset and write it to the output directory.

    Reads the derived manifests given with ``--derived-manifest`` (or the
    defaults when none are given), converts every usable row into a dataset
    entry, and writes ``manifest.json`` plus ``identity_metadata.jsonl``
    (one entry per line) into ``--output-dir``.

    Raises:
        SystemExit: If no derived character plates were found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--derived-manifest",
        type=Path,
        action="append",
        default=[],
        help="derived manifest JSON to read; may be repeated "
        "(default: the built-in pilot and secondary manifests)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=DEFAULT_OUTPUT_DIR,
        help="directory that receives manifest.json and identity_metadata.jsonl",
    )
    args = parser.parse_args()

    manifests = args.derived_manifest or DEFAULT_DERIVED_MANIFESTS
    rows = load_derivatives(manifests)
    if not rows:
        raise SystemExit("No derived character plates found")

    dataset = [_build_entry(row) for row in rows]

    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = output_dir / "manifest.json"
    metadata_path = output_dir / "identity_metadata.jsonl"

    manifest = {
        "schema": "iiw_character_identity_dataset/v1",
        "count": len(dataset),
        "notes": [
            "Small PNG identity-anchor dataset built from IIW production character plates.",
            "Use with controlled sampling weights. Do not over-weight against episode frames.",
            "VLM visual review was attempted but unavailable; this dataset is selected from filenames/metadata and still needs human visual review.",
        ],
        "weights": WEIGHTS,
        "items": dataset,
    }
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly; the locale default may be unable to encode
    # them. This also matches the JSONL writer below.
    manifest_path.write_text(
        json.dumps(manifest, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    with metadata_path.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in dataset
        )
    print(f"Wrote {manifest_path} ({len(dataset)} items)")
    print(f"Wrote {metadata_path}")


# Run the builder only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
