#!/usr/bin/env python3
"""Rebuild manifest-derived metadata from an existing training-data directory."""

import argparse
import json
import subprocess
import sys
from pathlib import Path


def ffprobe_video(path: Path) -> dict:
    data = json.loads(subprocess.run([
        "ffprobe", "-v", "error", "-select_streams", "v:0",
        "-show_entries", "stream=width,height,r_frame_rate,duration",
        "-of", "json", str(path)
    ], capture_output=True, text=True, check=True).stdout)
    s = data["streams"][0]
    num, den = map(int, s["r_frame_rate"].split("/"))
    return {
        "width": int(s["width"]),
        "height": int(s["height"]),
        "fps": round(num / den, 3),
        "duration": round(float(s.get("duration", 0)), 3),
    }


def rebuild(training_data_dir: Path, source_video: Path | None = None):
    clips_dir = training_data_dir / "clips"
    frames_dir = training_data_dir / "first_frames"
    if not clips_dir.exists():
        raise SystemExit(f"No clips dir at {clips_dir}")

    entries = []
    for clip in sorted(clips_dir.glob("clip_*.mp4")):
        idx = int(clip.stem.split("_")[1])
        frame_name = f"clip_{idx:03d}.png"
        frame_path = frames_dir / frame_name
        if not frame_path.exists():
            frames_dir.mkdir(parents=True, exist_ok=True)
            subprocess.run(["ffmpeg", "-y", "-i", str(clip), "-vframes", "1", str(frame_path)], capture_output=True, check=True)
        info = ffprobe_video(clip)
        entries.append({
            "clip": clip.name,
            "first_frame": frame_name,
            "duration": info["duration"],
            "width": info["width"],
            "height": info["height"],
            "fps": info["fps"],
            "caption": "",
        })

    manifest_path = training_data_dir / "manifest.json"
    manifest = json.loads(manifest_path.read_text()) if manifest_path.exists() else {"clips": []}
    existing = {c["clip"]: c for c in manifest.get("clips", [])}
    for e in entries:
        if e["clip"] in existing:
            prev = existing[e["clip"]]
            e["caption"] = prev.get("caption", "")
            if "start" in prev:
                e["start"] = prev["start"]
            for key in [
                "transcript",
                "candidate_entities",
                "story_context",
                "shot_prompt_summary",
                "shot_annotation",
                "caption_entities",
                "confidence_notes",
            ]:
                if key in prev:
                    e[key] = prev[key]

    if source_video:
        src_info = ffprobe_video(source_video)
        source = {
            "file": source_video.name,
            "width": src_info["width"],
            "height": src_info["height"],
            "fps": src_info["fps"],
            "duration": src_info["duration"],
        }
    else:
        source = manifest.get("source", {})

    manifest = {**{k: v for k, v in manifest.items() if k != "clips"}, "source": source, "clips": entries}
    manifest_path.write_text(json.dumps(manifest, indent=2) + "\n")

    wan = [{
        "media_path": f"clips/{c['clip']}",
        "first_frame": f"first_frames/{c['first_frame']}",
        "caption": c["caption"],
        "duration": c["duration"],
    } for c in entries]
    (training_data_dir / "wan2.1_metadata.json").write_text(json.dumps(wan, indent=2) + "\n")

    ltx = [{"caption": c["caption"], "media_path": f"clips/{c['clip']}"} for c in entries]
    (training_data_dir / "ltx2_dataset.json").write_text(json.dumps(ltx, indent=2) + "\n")
    with open(training_data_dir / "ltx2_dataset.jsonl", "w") as f:
        for e in ltx:
            f.write(json.dumps(e) + "\n")

    print(f"Rebuilt manifest + metadata for {len(entries)} clips in {training_data_dir}")


def main():
    """Parse the command line and rebuild metadata for the given directory.

    ``--training-data-dir`` is required; ``--source-video`` is optional and is
    ``None`` when omitted. argparse converts both values to ``Path``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--training-data-dir", type=Path, required=True)
    cli.add_argument("--source-video", type=Path)
    options = cli.parse_args()
    rebuild(
        training_data_dir=options.training_data_dir,
        source_video=options.source_video,
    )


# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
