#!/usr/bin/env python3
"""Review IIW character identity PNGs with an Ollama vision model.

This intentionally uses the Ollama HTTP API with base64 images because the local
Ollama CLI in this environment can pass redirected PNG bytes as prompt text for
some cloud models. The API path preserves proper image inputs.
"""
from __future__ import annotations

import argparse
import base64
import csv
import json
import re
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Any

ROOT = Path(__file__).resolve().parents[1]
DEFAULT_MANIFEST = ROOT / "materials/training-data/iiw-character-identity/manifest.json"
DEFAULT_OUTPUT_DIR = ROOT / "materials/training-data/iiw-character-identity/review"
DEFAULT_MODEL = "qwen3-vl:235b-cloud"
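
# Example invocation (the script path here is illustrative; adjust to wherever this
# file lives in the repo):
#   python scripts/review_iiw_identity_plates.py --limit 10
#   python scripts/review_iiw_identity_plates.py --model qwen3-vl:235b-cloud --force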

PROMPT = """Review this single animation production character plate for model-training use.
Return JSON only with keys decision, confidence, visual_description, issues.
decision must be TRAIN, EVAL_ONLY, or EXCLUDE.
Criteria:
- TRAIN: clean character identity or outfit plate with useful character/body/outfit information and minimal distracting text.
- EVAL_ONLY: usable reference sheet, turnaround, expression/detail sheet, heavy title/text, lineup, or unusual layout that should not dominate training.
- EXCLUDE: wrong character, blank, corrupted, duplicate-looking non-informative sheet, mostly text/notes, line-only when a colour version exists, or otherwise unsuitable.
Be concise and describe visible content, not filename assumptions.
"""


def load_json(path: Path) -> dict[str, Any]:
    return json.loads(path.read_text())


def clean_json_text(text: str) -> str:
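    """Strip Markdown code fences and trim to the outermost JSON object in the reply."""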
    text = text.strip()
    fence = re.search(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.S | re.I)
    if fence:
        text = fence.group(1).strip()
    start = text.find("{")
    end = text.rfind("}")
    if start >= 0 and end >= start:
        text = text[start:end + 1]
    return text


def parse_review(text: str) -> dict[str, Any]:
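    """Parse the model reply as JSON and validate/normalise the decision value."""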
    cleaned = clean_json_text(text)
    data = json.loads(cleaned)
    decision = str(data.get("decision", "")).upper().strip()
    if decision not in {"TRAIN", "EVAL_ONLY", "EXCLUDE"}:
        raise ValueError(f"Invalid decision: {decision!r}")
    data["decision"] = decision
    return data


def ollama_generate(
    *,
    url: str,
    model: str,
    image_path: Path,
    timeout: float,
    num_predict: int,
) -> dict[str, Any]:
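    """POST the prompt plus one base64-encoded image to Ollama's /api/generate
    endpoint (non-streaming, JSON-format output) and return the decoded response."""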
    payload = {
        "model": model,
        "prompt": PROMPT,
        "images": [base64.b64encode(image_path.read_bytes()).decode("ascii")],
        "stream": False,
        "format": "json",
        "options": {"num_predict": num_predict},
    }
    req = urllib.request.Request(
        url.rstrip("/") + "/api/generate",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return json.loads(response.read())


def review_item(
    item: dict[str, Any],
    *,
    url: str,
    model: str,
    timeout: float,
    num_predict: int,
    retries: int,
    retry_sleep: float,
) -> dict[str, Any]:
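    """Review one manifest item, retrying on failure.

    Returns an "ok" row carrying the parsed review and Ollama response metadata,
    or an "error" row listing every failure encountered across the attempts.
    """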
    image_path = ROOT / item["image_path"]
    errors: list[str] = []
    for attempt in range(retries + 1):
        try:
            raw = ollama_generate(
                url=url,
                model=model,
                image_path=image_path,
                timeout=timeout,
                num_predict=num_predict,
            )
            review = parse_review(raw.get("response", ""))
            return {
                "image_path": item["image_path"],
                "source_path": item.get("source_path", ""),
                "character": item.get("character", ""),
                "caption": item.get("caption", ""),
                "asset_type": item.get("asset_type", ""),
                "outfit_hint": item.get("outfit_hint", ""),
                "weight": item.get("weight"),
                "review": review,
                "review_model": raw.get("model", model),
                "review_created_at": raw.get("created_at"),
                "review_done_reason": raw.get("done_reason"),
                "review_prompt_eval_count": raw.get("prompt_eval_count"),
                "review_eval_count": raw.get("eval_count"),
                "review_total_duration": raw.get("total_duration"),
                "review_status": "ok",
            }
        except urllib.error.HTTPError as exc:
            detail = exc.read().decode("utf-8", "replace")[:1000]
            errors.append(f"HTTP {exc.code}: {detail}")
        except Exception as exc:  # noqa: BLE001 - record all review failures.
            errors.append(f"{type(exc).__name__}: {exc}")
        if attempt < retries:
            time.sleep(retry_sleep)
    return {
        "image_path": item["image_path"],
        "source_path": item.get("source_path", ""),
        "character": item.get("character", ""),
        "caption": item.get("caption", ""),
        "asset_type": item.get("asset_type", ""),
        "outfit_hint": item.get("outfit_hint", ""),
        "weight": item.get("weight"),
        "review_status": "error",
        "review_errors": errors,
        "review_model": model,
    }


def write_csv(path: Path, rows: list[dict[str, Any]]) -> None:
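    """Write a flat CSV summary of the review rows for quick human triage."""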
    fields = [
        "decision",
        "confidence",
        "character",
        "asset_type",
        "outfit_hint",
        "image_path",
        "source_path",
        "visual_description",
        "issues",
        "status",
    ]
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fields)
        writer.writeheader()
        for row in rows:
            review = row.get("review", {})
            issues = review.get("issues", "; ".join(row.get("review_errors", [])))
            if isinstance(issues, (list, tuple)):
                # Some model replies return issues as a list; flatten it for the CSV cell.
                issues = "; ".join(str(issue) for issue in issues)
            writer.writerow({
                "decision": review.get("decision", "ERROR" if row.get("review_status") == "error" else ""),
                "confidence": review.get("confidence", ""),
                "character": row.get("character", ""),
                "asset_type": row.get("asset_type", ""),
                "outfit_hint": row.get("outfit_hint", ""),
                "image_path": row.get("image_path", ""),
                "source_path": row.get("source_path", ""),
                "visual_description": review.get("visual_description", ""),
                "issues": issues,
                "status": row.get("review_status", ""),
            })


def reviewed_item(item: dict[str, Any], row: dict[str, Any], effective_weight: float) -> dict[str, Any]:
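    """Merge a source manifest item with its review fields, keeping both the original and effective weights."""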
    review = row["review"]
    return {
        **item,
        "weight_original": item.get("weight"),
        "weight": effective_weight,
        "visual_review_decision": review["decision"],
        "visual_review_confidence": review.get("confidence"),
        "visual_review_description": review.get("visual_description", ""),
        "visual_review_issues": review.get("issues", ""),
        "visual_review_model": row.get("review_model"),
    }


def build_train_manifest(source_manifest: dict[str, Any], rows: list[dict[str, Any]]) -> dict[str, Any]:
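    """Build a manifest containing only the items the review marked TRAIN."""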
    by_path = {row["image_path"]: row for row in rows if row.get("review_status") == "ok"}
    reviewed_items: list[dict[str, Any]] = []
    for item in source_manifest.get("items", []):
        row = by_path.get(item["image_path"])
        if not row:
            continue
        review = row["review"]
        if review["decision"] != "TRAIN":
            continue
        reviewed_items.append(reviewed_item(item, row, float(item.get("weight") or 0)))
    return {
        "schema": "iiw_character_identity_dataset/v1",
        "source_manifest": "materials/training-data/iiw-character-identity/manifest.json",
        "review_source": "ollama_vlm",
        "review_model_requested": DEFAULT_MODEL,
        "count": len(reviewed_items),
        "notes": [
            "TRAIN-only subset from Ollama VLM visual review.",
            "Still recommended for human spot-check before final training approval.",
            "EVAL_ONLY and EXCLUDE items remain in review JSON/CSV for audit.",
        ],
        "items": reviewed_items,
    }


def build_usable_manifest(source_manifest: dict[str, Any], rows: list[dict[str, Any]]) -> dict[str, Any]:
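    """Build a manifest of TRAIN plus down-weighted EVAL_ONLY items; EXCLUDE items are dropped."""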
    by_path = {row["image_path"]: row for row in rows if row.get("review_status") == "ok"}
    reviewed_items: list[dict[str, Any]] = []
    for item in source_manifest.get("items", []):
        row = by_path.get(item["image_path"])
        if not row:
            continue
        decision = row["review"]["decision"]
        if decision == "EXCLUDE":
            continue
        original_weight = float(item.get("weight") or 0)
        if decision == "TRAIN":
            effective_weight = original_weight
            use = "train_identity_anchor"
        else:
            effective_weight = round(original_weight * 0.25, 4)
            use = "eval_or_low_weight_reference"
        reviewed = reviewed_item(item, row, effective_weight)
        reviewed["use"] = use
        reviewed_items.append(reviewed)
    return {
        "schema": "iiw_character_identity_dataset/v1",
        "source_manifest": "materials/training-data/iiw-character-identity/manifest.json",
        "review_source": "ollama_vlm",
        "review_model_requested": DEFAULT_MODEL,
        "count": len(reviewed_items),
        "notes": [
            "Usable TRAIN + EVAL_ONLY subset from Ollama VLM visual review; EXCLUDE rows removed.",
            "EVAL_ONLY rows are retained only as low-weight references/eval material with 0.25x original weight.",
            "Human spot-check is still recommended before final training approval.",
        ],
        "items": reviewed_items,
    }


def main() -> None:
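    """Review every manifest item, checkpointing JSON/CSV after each image, then write the filtered manifests."""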
    parser = argparse.ArgumentParser()
    parser.add_argument("--manifest", type=Path, default=DEFAULT_MANIFEST)
    parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--ollama-url", default="http://127.0.0.1:11434")
    parser.add_argument("--timeout", type=float, default=180)
    parser.add_argument("--num-predict", type=int, default=180)
    parser.add_argument("--retries", type=int, default=2)
    parser.add_argument("--retry-sleep", type=float, default=3)
    parser.add_argument("--limit", type=int, default=0)
    parser.add_argument("--force", action="store_true")
    args = parser.parse_args()

    source_manifest = load_json(args.manifest)
    items = source_manifest.get("items", [])
    if args.limit:
        items = items[:args.limit]
    args.output_dir.mkdir(parents=True, exist_ok=True)
    review_path = args.output_dir / "ollama_vlm_identity_plate_review.json"
    existing: list[dict[str, Any]] = []
    if review_path.exists() and not args.force:
        existing_payload = load_json(review_path)
        existing = existing_payload.get("reviews", [])
    existing_by_path = {row.get("image_path"): row for row in existing}

    rows: list[dict[str, Any]] = []
    for index, item in enumerate(items, start=1):
        if item["image_path"] in existing_by_path and not args.force:
            row = existing_by_path[item["image_path"]]
            print(f"[{index}/{len(items)}] SKIP {item['image_path']} -> {row.get('review', {}).get('decision', row.get('review_status'))}")
        else:
            print(f"[{index}/{len(items)}] REVIEW {item['image_path']}", flush=True)
            row = review_item(
                item,
                url=args.ollama_url,
                model=args.model,
                timeout=args.timeout,
                num_predict=args.num_predict,
                retries=args.retries,
                retry_sleep=args.retry_sleep,
            )
            decision = row.get("review", {}).get("decision", row.get("review_status"))
            print(f"    -> {decision}", flush=True)
        rows.append(row)
        payload = {
            "schema": "iiw_character_identity_vlm_review/v1",
            "source_manifest": args.manifest.relative_to(ROOT).as_posix() if args.manifest.is_relative_to(ROOT) else str(args.manifest),
            "model_requested": args.model,
            "ollama_url": args.ollama_url,
            "prompt": PROMPT,
            "count": len(rows),
            "reviews": rows,
        }
        review_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n")
        write_csv(args.output_dir / "ollama_vlm_identity_plate_review.csv", rows)

    train_manifest = build_train_manifest(source_manifest, rows)
    train_manifest["review_model_requested"] = args.model
    (args.output_dir / "train_identity_manifest.vlm_reviewed.json").write_text(
        json.dumps(train_manifest, indent=2, ensure_ascii=False) + "\n"
    )
    usable_manifest = build_usable_manifest(source_manifest, rows)
    usable_manifest["review_model_requested"] = args.model
    (args.output_dir / "usable_identity_manifest.vlm_reviewed.json").write_text(
        json.dumps(usable_manifest, indent=2, ensure_ascii=False) + "\n"
    )
    print(f"Wrote {review_path}")
    print(f"Wrote {args.output_dir / 'ollama_vlm_identity_plate_review.csv'}")
    print(f"Wrote {args.output_dir / 'train_identity_manifest.vlm_reviewed.json'}")
    print(f"Wrote {args.output_dir / 'usable_identity_manifest.vlm_reviewed.json'}")


if __name__ == "__main__":
    main()
