#!/usr/bin/env python3
"""
Migrate existing SQLite (data/osint.db) → PostgreSQL (cultguard).

Mostly safe to re-run: keyed tables use INSERT ... ON CONFLICT DO NOTHING,
but the http_log and annotations inserts have no conflict clause and will
append duplicate rows on a second run.

Usage:
    devenv shell -- python3 ingestion/migrate_to_pg.py
    # or inside devenv shell:
    db-import-sqlite
"""

import json
import sqlite3
from datetime import datetime, timezone
from pathlib import Path

import psycopg2.extras

from db import connect as pg_connect

# Repo root is two directories up from this file (ingestion/ → repo).
REPO_ROOT  = Path(__file__).parent.parent
# Legacy SQLite database produced by ingestion/migrate_existing.py.
SQLITE_DB  = REPO_ROOT / "data" / "osint.db"
DEFAULT_INV = "lebanon-liberates-2026"   # default investigation ID


def _parse_ts(value: str | None):
    """Coerce ISO8601 string / date string to aware datetime, or None."""
    if not value:
        return None
    for fmt in ("%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S",
                "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            dt = datetime.strptime(value.strip(), fmt)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            return dt
        except ValueError:
            continue
    return None


def _parse_json(value: str | None):
    """Parse JSON string → dict/list, or return None."""
    if not value:
        return None
    try:
        return json.loads(value)
    except (json.JSONDecodeError, TypeError):
        return None


def migrate_investigation(pg):
    """Insert the default investigation row if it is not already present."""
    params = (
        DEFAULT_INV,
        "لبنان يتحرر (Lebanon Liberates Itself)",
        "Facebook Page ID 61585153052901 — suspected coordinated IO network. "
        "Palestinian phone infrastructure, anti-Hezbollah messaging.",
        "https://www.facebook.com/people/لبنان-يتحرر/61585153052901/",
    )
    with pg.cursor() as cur:
        cur.execute("""
            INSERT INTO investigations (id, name, description, target_url, status)
            VALUES (%s, %s, %s, %s, 'active')
            ON CONFLICT (id) DO NOTHING
        """, params)
    pg.commit()
    print(f"  investigation: {DEFAULT_INV}")


def migrate_entities(sqlite_con, pg):
    """Copy all SQLite `entities` rows into PG under the default investigation.

    Idempotent: ON CONFLICT (id) DO NOTHING skips previously migrated rows.
    """
    rows = sqlite_con.execute("SELECT * FROM entities").fetchall()
    with pg.cursor() as cur:
        for r in rows:
            raw = _parse_json(r["raw_json"])
            cur.execute("""
                INSERT INTO entities
                    (id, investigation_id, type, name, url, platform, country,
                     created_at, collected_at, confidence, is_sock, notes, raw_json)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                ON CONFLICT (id) DO NOTHING
            """, (
                r["id"], DEFAULT_INV, r["type"], r["name"], r["url"],
                r["platform"] or "facebook", r["country"],
                _parse_ts(r["created_at"]),
                _parse_ts(r["collected_at"]) or datetime.now(timezone.utc),
                r["confidence"] or 50,
                bool(r["is_sock"]),
                r["notes"],
                # psycopg2 cannot adapt a plain dict/list query parameter;
                # wrap in Json so it serializes to json/jsonb correctly.
                psycopg2.extras.Json(raw) if raw is not None else None,
            ))
    pg.commit()
    print(f"  entities: {len(rows)} rows")


def migrate_content(sqlite_con, pg):
    """Copy all SQLite `content` rows into PG under the default investigation.

    Idempotent: ON CONFLICT (id) DO NOTHING skips previously migrated rows.
    """
    rows = sqlite_con.execute("SELECT * FROM content").fetchall()
    with pg.cursor() as cur:
        for r in rows:
            raw = _parse_json(r["raw_json"])
            cur.execute("""
                INSERT INTO content
                    (id, investigation_id, entity_id, type, text, text_lang,
                     text_dialect, published_at, collected_at, likes, shares,
                     comments, url, raw_json)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                ON CONFLICT (id) DO NOTHING
            """, (
                r["id"], DEFAULT_INV, r["entity_id"], r["type"], r["text"],
                r["text_lang"], r["text_dialect"],
                _parse_ts(r["published_at"]),
                _parse_ts(r["collected_at"]) or datetime.now(timezone.utc),
                r["likes"], r["shares"], r["comments"],
                r["url"],
                # psycopg2 cannot adapt a plain dict/list query parameter;
                # wrap in Json so it serializes to json/jsonb correctly.
                psycopg2.extras.Json(raw) if raw is not None else None,
            ))
    pg.commit()
    print(f"  content: {len(rows)} rows")


def migrate_media(sqlite_con, pg):
    """Copy all SQLite `media` rows into PG under the default investigation.

    Maps the SQLite `exif_json` column onto the PG `exif_data` jsonb column.
    Idempotent: ON CONFLICT (id) DO NOTHING skips previously migrated rows.
    """
    rows = sqlite_con.execute("SELECT * FROM media").fetchall()
    with pg.cursor() as cur:
        for r in rows:
            exif = _parse_json(r["exif_json"])
            cur.execute("""
                INSERT INTO media
                    (id, investigation_id, entity_id, content_id, type,
                     file_path, url_original, sha256, width, height,
                     exif_data, is_ai_generated, ai_score, collected_at, notes)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                ON CONFLICT (id) DO NOTHING
            """, (
                r["id"], DEFAULT_INV, r["entity_id"], r["content_id"], r["type"],
                r["file_path"], r["url_original"], r["sha256"],
                r["width"], r["height"],
                # psycopg2 cannot adapt a plain dict/list query parameter;
                # wrap in Json so it serializes to json/jsonb correctly.
                psycopg2.extras.Json(exif) if exif is not None else None,
                # Preserve tri-state: NULL means "not yet analyzed".
                bool(r["is_ai_generated"]) if r["is_ai_generated"] is not None else None,
                r["ai_score"],
                _parse_ts(r["collected_at"]) or datetime.now(timezone.utc),
                r["notes"],
            ))
    pg.commit()
    print(f"  media: {len(rows)} rows")


def migrate_http_log(sqlite_con, pg):
    """Copy all SQLite `http_log` rows into PG under the default investigation.

    NOTE(review): unlike the other tables, this INSERT has no ON CONFLICT
    clause, so re-running the migration appends duplicate rows — confirm
    whether that is acceptable or a unique constraint should be added.
    """
    rows = sqlite_con.execute("SELECT * FROM http_log").fetchall()
    with pg.cursor() as cur:
        for r in rows:
            req_headers = _parse_json(r["request_headers"])
            resp_headers = _parse_json(r["response_headers"])
            cur.execute("""
                INSERT INTO http_log
                    (investigation_id, url, method, request_headers, request_body,
                     response_status, response_headers, response_body,
                     file_path, collected_at, session_id, notes)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """, (
                DEFAULT_INV,
                r["url"], r["method"] or "GET",
                # psycopg2 cannot adapt a plain dict/list query parameter;
                # wrap in Json so headers serialize to json/jsonb correctly.
                psycopg2.extras.Json(req_headers) if req_headers is not None else None,
                r["request_body"],
                r["response_status"],
                psycopg2.extras.Json(resp_headers) if resp_headers is not None else None,
                r["response_body"], r["file_path"],
                _parse_ts(r["collected_at"]) or datetime.now(timezone.utc),
                r["session_id"], r["notes"],
            ))
    pg.commit()
    print(f"  http_log: {len(rows)} rows")


def migrate_relationships(sqlite_con, pg):
    """Copy relationship edges, skipping any whose endpoints were not migrated."""
    rows = sqlite_con.execute("SELECT * FROM relationships").fetchall()
    skipped = 0
    with pg.cursor() as cur:
        # NOTE(review): indexing pg rows by name assumes db.connect() sets a
        # dict-like cursor factory (e.g. RealDictCursor) — verify.
        cur.execute("SELECT id FROM entities WHERE investigation_id = %s", (DEFAULT_INV,))
        known_ids = {row["id"] for row in cur.fetchall()}
        for rel in rows:
            endpoints_exist = (rel["source_id"] in known_ids
                               and rel["target_id"] in known_ids)
            if not endpoints_exist:
                skipped += 1
                continue
            cur.execute("""
                INSERT INTO relationships
                    (investigation_id, source_id, target_id, type, strength,
                     evidence, collected_at)
                VALUES (%s,%s,%s,%s,%s,%s,%s)
                ON CONFLICT DO NOTHING
            """, (
                DEFAULT_INV,
                rel["source_id"], rel["target_id"], rel["type"],
                rel["strength"] or 50, rel["evidence"],
                _parse_ts(rel["collected_at"]) or datetime.now(timezone.utc),
            ))
    pg.commit()
    print(f"  relationships: {len(rows) - skipped} rows  ({skipped} skipped — dangling FKs)")


def migrate_identifiers(sqlite_con, pg):
    """Copy identifiers (phones, emails, …), skipping rows with unknown entities."""
    rows = sqlite_con.execute("SELECT * FROM identifiers").fetchall()
    with pg.cursor() as cur:
        # NOTE(review): indexing pg rows by name assumes db.connect() sets a
        # dict-like cursor factory (e.g. RealDictCursor) — verify.
        cur.execute("SELECT id FROM entities WHERE investigation_id = %s", (DEFAULT_INV,))
        known_ids = {r["id"] for r in cur.fetchall()}
        skipped = 0
        for r in rows:
            if r["entity_id"] not in known_ids:
                skipped += 1
                continue
            lookup = _parse_json(r["lookup_result"])
            cur.execute("""
                INSERT INTO identifiers
                    (investigation_id, entity_id, type, value, country_code,
                     carrier, lookup_result, collected_at)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
                ON CONFLICT DO NOTHING
            """, (
                DEFAULT_INV,
                r["entity_id"], r["type"], r["value"],
                r["country_code"], r["carrier"],
                # psycopg2 cannot adapt a plain dict/list query parameter;
                # wrap in Json so it serializes to json/jsonb correctly.
                psycopg2.extras.Json(lookup) if lookup is not None else None,
                _parse_ts(r["collected_at"]) or datetime.now(timezone.utc),
            ))
    pg.commit()
    print(f"  identifiers: {len(rows) - skipped} rows  ({skipped} skipped — dangling FKs)")


def migrate_annotations(sqlite_con, pg):
    """Copy analyst annotations into PG under the default investigation.

    NOTE(review): this INSERT has no ON CONFLICT clause, so re-running the
    migration appends duplicate annotation rows — confirm this is intended.
    """
    rows = sqlite_con.execute("SELECT * FROM annotations").fetchall()
    with pg.cursor() as cur:
        for row in rows:
            params = (
                DEFAULT_INV,
                row["ref_type"], row["ref_id"],
                row["analyst"] or "human",
                row["note"],
                row["confidence"],
                _parse_ts(row["created_at"]) or datetime.now(timezone.utc),
            )
            cur.execute("""
                INSERT INTO annotations
                    (investigation_id, ref_type, ref_id, analyst, note,
                     confidence, created_at)
                VALUES (%s,%s,%s,%s,%s,%s,%s)
            """, params)
    pg.commit()
    print(f"  annotations: {len(rows)} rows")


def main():
    """Run the full SQLite → PostgreSQL migration, table by table."""
    if not SQLITE_DB.exists():
        print(f"SQLite DB not found: {SQLITE_DB}")
        print("Run ingestion/migrate_existing.py first.")
        return

    print(f"Source:      {SQLITE_DB}")
    print("Destination: PostgreSQL (cultguard)")
    print()

    source = sqlite3.connect(SQLITE_DB)
    source.row_factory = sqlite3.Row  # name-based column access for migrators

    dest = pg_connect()
    try:
        migrate_investigation(dest)
        # Order matters: entities first, so FK checks in the later
        # steps can resolve against already-inserted entity ids.
        steps = (migrate_entities, migrate_content, migrate_media,
                 migrate_http_log, migrate_relationships,
                 migrate_identifiers, migrate_annotations)
        for step in steps:
            step(source, dest)
    finally:
        dest.close()
        source.close()

    print("\n✓ Migration complete.")
    print("  Next steps:")
    print("    db-migrate   → ensure latest schema applied")
    print("    db-embed     → generate embeddings into text_embeddings / image_embeddings")


# Script entry point: run the migration when executed directly.
if __name__ == "__main__":
    main()
