#!/usr/bin/env python3
"""
Create a clean, query-friendly database from raw Fandom wiki data.
Extracts only essential character, gadget, location info.
"""

import json
import re
from pathlib import Path

def load_raw_database(path=None):
    """Load the raw scraped database.

    Args:
        path: Optional location of the raw JSON dump (``str`` or ``Path``).
            When omitted, the project's default
            ``data/fandom_wiki/database.json`` is used.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if path is None:
        path = "/home/mnm/workspaces/totally-spies-cultshot/data/fandom_wiki/database.json"
    # Decode explicitly as UTF-8 (the JSON interchange encoding) so that
    # non-ASCII wiki text does not depend on the platform's locale default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Navigation/UI strings that the scraped pages carry around the article text.
_WIKI_CRUFT = (
    "Sign In to Save", "Sign In", "Create a Free Account",
    "Edit", "Talk", "Read", "View history", "Upload file",
    "Discuss", "Switch to Dark Theme", "Category:",
    "FandomShop", "Advertisement", "Fan Feed", "Explore properties",
    "Fandom Muthead", "Spies around the world", "Community content",
)


def _build_cruft_regex(phrases):
    """Compile one regex that matches any cruft phrase, longest phrase first."""
    parts = []
    for phrase in sorted(phrases, key=len, reverse=True):
        part = re.escape(phrase)
        if phrase[-1].isalpha():
            # Do not match when the phrase is merely the start of a longer
            # word ("Reading", "Edited", "Discussion"); back-to-back UI labels
            # such as "EditTalk" are still removed.
            part += r'(?![a-z])'
        parts.append(part)
    return re.compile('|'.join(parts))


_CRUFT_RE = _build_cruft_regex(_WIKI_CRUFT)


def clean_text(text):
    """Clean up wiki text by removing image references, file links, etc.

    Args:
        text: Raw scraped text. ``None`` and the empty string are accepted.

    Returns:
        The cleaned text with surrounding whitespace stripped, or ``""`` when
        there was nothing to clean.
    """
    if not text:
        return ""

    # Remove image references
    text = re.sub(r'\[.*?\([^)]+\)\]', '', text)
    # Remove file references
    text = re.sub(r'\[.*?\d+ KB.*?\]', '', text)
    text = re.sub(r'\[.*?\d+\.\d+ MB.*?\]', '', text)
    # Remove URLs with http
    text = re.sub(r'http[^\s\]]+', '', text)
    # Remove wiki internal links but keep text
    text = re.sub(r'\[([^\]]*?)\]', r'\1', text)
    # Remove parenthetical content with numbers
    text = re.sub(r'\([^)]*\d[^)]*\)', '', text)
    # Remove common wiki cruft
    text = _CRUFT_RE.sub('', text)
    # Clean up whitespace. Newlines are collapsed last so that lines emptied
    # by the removals above do not survive as blank lines.
    text = re.sub(r' +', ' ', text)
    text = re.sub(r' ?\n ?', '\n', text)
    text = re.sub(r'\n+', '\n', text)
    return text.strip()


# Personality keywords looked for (as whole words) in a character description.
_PERSONALITY_TRAITS = (
    'intelligent', 'athletic', 'fashionable', 'determined', 'responsible',
    'mischievous', 'loyal', 'clumsy', 'sarcastic', 'organized', 'ambitious',
    'diligent', 'headstrong', 'spontaneous', 'cheerful', 'confident',
)


def _find_phrase(keyword_pattern, text):
    """Return the same-sentence snippet around the first keyword hit, or None."""
    match = re.search(
        r'[^\n.]{0,30}' + keyword_pattern + r'[^\n.]{0,40}', text, re.IGNORECASE
    )
    return match.group(0).strip() if match else None


def extract_character_summary(data):
    """Extract clean summary for a character.

    Args:
        data: Raw character record. Reads the optional ``full_text`` and
            ``sections`` entries and the required ``name`` and ``url``.

    Returns:
        A dict with the keys ``name``, ``type``, ``description``,
        ``hair_color``, ``eye_color``, ``personality``, ``organization``,
        ``first_appearance`` and ``url``. Fields that could not be derived
        are ``None`` (``personality`` is an empty list).

    Raises:
        KeyError: If ``name`` or ``url`` is missing from ``data``.
    """
    # ``or`` also covers keys that are present but null in the scraped JSON.
    full_text = data.get("full_text") or ""
    sections = data.get("sections") or {}
    if not isinstance(sections, dict):
        sections = {}

    # Look for description in Overview or Biography sections
    description = sections.get("Overview") or ""
    if len(description) < 50:
        description = sections.get("Biography") or ""
    if len(description) < 50:
        # Get first substantial paragraph from full text
        paragraphs = [p for p in full_text.split('\n\n') if len(p.strip()) > 50]
        if paragraphs:
            description = paragraphs[0][:500]

    description = clean_text(description)

    # Extract hair/eye color phrases. The word boundaries keep unrelated words
    # that merely contain the letters ("chair", "conveyed") from matching.
    hair_color = _find_phrase(r'\bhair', description)
    eye_color = _find_phrase(r'\beye(?:s|d)?\b', description)

    # Look for personality keywords as whole words, so that "disloyal" or
    # "disorganized" do not count as "loyal" / "organized".
    words = set(re.findall(r'[a-z]+', description.lower()))
    personality = [trait for trait in _PERSONALITY_TRAITS if trait in words]

    # Look for WOOHP affiliation
    organization = "WOOHP" if 'WOOHP' in full_text else None

    # Try to find first appearance. Accepts "first appear(s|ed) in/on" and
    # stops at the end of the line so the capture cannot run across
    # paragraphs when the sentence has no closing period.
    first_appearance = None
    appearance_match = re.search(
        r'\bfirst appear(?:s|ed)? (?:in|on) ([^.\n]+)', full_text, re.IGNORECASE
    )
    if appearance_match:
        first_appearance = appearance_match.group(1).strip()

    return {
        "name": data["name"],
        "type": "character",
        "description": description[:800] if description else None,
        "hair_color": hair_color,
        "eye_color": eye_color,
        "personality": personality[:5],
        "organization": organization,
        "first_appearance": first_appearance,
        "url": data["url"]
    }


# Phrases that introduce a villain's goal; the capture is the goal itself.
# Each verb is accepted in base, third-person and past form.
_MOTIVE_PATTERNS = (
    r'\bplot(?:s|ted)? to ([^.\n]+)',
    r'\bplan(?:s|ned)? to ([^.\n]+)',
    r'\bwant(?:s|ed)? to ([^.\n]+)',
    r'\bseeks? to ([^.\n]+)',
)


def extract_villain_summary(data):
    """Extract clean summary for a villain.

    Args:
        data: Raw villain record. Reads the optional ``full_text`` entry and
            the required ``name`` and ``url``.

    Returns:
        A dict with the keys ``name``, ``type``, ``description``, ``motive``,
        ``episodes`` and ``url``. ``description`` and ``motive`` are ``None``
        when nothing could be extracted; ``episodes`` holds at most five
        entries.

    Raises:
        KeyError: If ``name`` or ``url`` is missing from ``data``.
    """
    # ``or`` also covers a key that is present but null in the scraped JSON.
    full_text = data.get("full_text") or ""

    description = clean_text(full_text[:600])

    # Look for their evil plan/motive; the first pattern that matches wins.
    motive = None
    for pattern in _MOTIVE_PATTERNS:
        match = re.search(pattern, description, re.IGNORECASE)
        if match:
            motive = match.group(1).strip()
            break

    # Look for episode appearances. finditer is used instead of findall
    # because findall would return only the capture group, i.e. an empty
    # string for every "Season X Episode Y" hit.
    episodes = [
        match.group(1) or match.group(0)
        for match in re.finditer(r'Season \d+ Episode \d+|"([^"]+)"', description[:300])
    ]

    return {
        "name": data["name"],
        "type": "villain",
        "description": description[:800] if description else None,
        "motive": motive,
        "episodes": episodes[:5],
        "url": data["url"]
    }


def extract_gadget_summary(data):
    """Build the summary record for a single gadget.

    The description is the cleaned first 600 characters of ``full_text``.
    The Compowder gets a fixed function string; for every other gadget the
    function is taken from the first "used for" / "used to" /
    "allow(s) user(s) to" phrase found in the description, or ``None``.
    """
    raw_text = data.get("full_text", "")
    description = clean_text(raw_text[:600] if raw_text else "")

    gadget_name = data["name"]
    if gadget_name.lower() == "compowder":
        # For Compowder specifically
        function = "Portable computer disguised as makeup compact; communication, data access, gadget deployment"
    else:
        function = None
        hit = re.search(
            r'used for ([^\.]+)|used to ([^\.]+)|allows? users? to ([^\.]+)',
            description,
            re.IGNORECASE,
        )
        if hit is not None:
            # Only one alternative can have matched; take its capture.
            for captured in hit.groups():
                if captured:
                    function = captured
                    break

    return {
        "name": gadget_name,
        "type": "gadget",
        "description": description[:800],
        "function": function,
        "url": data["url"],
    }


def extract_location_summary(data):
    """Extract clean summary for a location.

    Args:
        data: Raw location record. Reads the optional ``full_text`` entry and
            the required ``name`` and ``url``.

    Returns:
        A dict with the keys ``name``, ``type``, ``sub_type``,
        ``description`` and ``url``. ``sub_type`` is ``"headquarters"``,
        ``"education"``, ``"city/residential"`` or ``None``.

    Raises:
        KeyError: If ``name`` or ``url`` is missing from ``data``.
    """
    # ``or`` also covers a key that is present but null in the scraped JSON.
    full_text = data.get("full_text") or ""

    description = clean_text(full_text[:600])
    lowered = description.lower()
    name = data["name"]

    # Determine location type; the first rule that applies wins.
    location_type = None
    if 'headquarters' in lowered or 'WOOHP' in name:
        location_type = "headquarters"
    elif 'university' in lowered or 'school' in lowered:
        location_type = "education"
    # "city" is matched as a whole word so that "capacity", "electricity"
    # and similar words do not classify a place as a city.
    elif 'hills' in name.lower() or re.search(r'\bcit(?:y|ies)\b', lowered):
        location_type = "city/residential"

    return {
        "name": name,
        "type": "location",
        "sub_type": location_type,
        "description": description[:800],
        "url": data["url"]
    }


def extract_season_summary(data):
    """Build the summary record for a single season.

    ``episode_mentions`` counts case-insensitive occurrences of
    "episode <number>", "episode:" or "episodes:" in the full text; the
    description is the cleaned first 800 characters of that text.
    """
    raw_text = data.get("full_text", "")

    # Count episodes
    mention_re = re.compile(r'episode \d+|episodes?:', re.IGNORECASE)
    mentions = sum(1 for _ in mention_re.finditer(raw_text))

    cleaned = clean_text(raw_text[:800])

    summary = dict(name=data["name"], type="season")
    summary["description"] = cleaned
    summary["episode_mentions"] = mentions
    summary["url"] = data["url"]
    return summary


def create_character_facts():
    """Return the hand-curated fact sheets for the four main characters.

    The result maps a character key to a freshly built profile dict, so
    callers may mutate what they receive.
    """
    agent_role = "Main protagonist - WOOHP agent"
    debut = "Season 1, Episode 1"

    roster = {}

    roster["Samantha"] = dict(
        name="Samantha",
        alias="Sam",
        type="character",
        role=agent_role,
        hair_color="Orange/Red",
        eye_color="Emerald Green",
        personality=["Strategic", "Level-headed", "Leader of the trio", "Intelligent", "Organized"],
        signature_traits=["Emerald green eyes", "Orange/red hair", "Purple catsuit"],
        family=["Gabriella (mother)"],
        organization="WOOHP",
        voiced_by="Jennifer Hale (Seasons 1-6), Kira Riley (Season 7)",
        first_appearance=debut,
    )

    roster["Clover"] = dict(
        name="Clover",
        type="character",
        role=agent_role,
        hair_color="Blonde",
        eye_color="Blue",
        personality=["Fashion-conscious", "Boy-crazy", "Impulsive", "Confident", "Social"],
        signature_traits=["Blonde hair", "Blue eyes", "Red catsuit", "Fashion shopping obsession"],
        family=["Unnamed mother", "Norman (cousin)"],
        organization="WOOHP",
        voiced_by="Andrea Baker",
        first_appearance=debut,
    )

    roster["Alexandra"] = dict(
        name="Alexandra",
        alias="Alex",
        type="character",
        role=agent_role,
        hair_color="Black/Dark hair with purple highlights",
        eye_color="Amber/Hazel",
        personality=["Spontaneous", "Athletic", "Tomboyish", "Loyal", "Protective"],
        signature_traits=["Dark hair with purple highlights", "Amber eyes", "Yellow/amber catsuit", "Athletic ability"],
        family=["Unnamed father"],
        organization="WOOHP",
        voiced_by="Katie Leigh (Seasons 1-2), Katie Griffin (Seasons 3-6), Lori Felipe-Barkin (Season 7)",
        first_appearance=debut,
    )

    # Jerry's sheet carries neither an alias nor an eye colour.
    roster["Jerry"] = dict(
        name="Jerry Lewis",
        type="character",
        role="Founder and director of WOOHP",
        hair_color="Bald/Gray",
        personality=["Professional", "Mysterious", "Supportive", "Occasional field agent"],
        signature_traits=["WOOHP founder", "Mission briefings", "Gadget distribution"],
        family=["Zerlina (daughter, Season 7)", "Terrence (twin brother, villain)"],
        organization="WOOHP (founder/director)",
        voiced_by="Jess Harnell (Season 1-2), Adrian Truss (Season 3-present)",
        first_appearance=debut,
    )

    return roster


def create_gadget_facts():
    """Return the hand-curated fact sheets for the signature gadgets.

    The result maps a gadget key to a freshly built dict on every call.
    """
    compowder = dict(
        name="Compowder",
        type="gadget",
        description="Portable computer disguised as a makeup compact. The iconic communication and gadget deployment device carried by every WOOHP agent.",
        function="Communication, mission briefings, gadget deployment, data access",
        appearance="Pink/purple makeup compact with digital screen",
        users=["Sam", "Clover", "Alex"],
        first_appearance="Season 1",
    )

    # Catsuit colour per spy, keyed by nickname.
    suit_colors = dict(Sam="Purple", Clover="Red", Alex="Yellow/Amber")
    catsuit = dict(
        name="Catsuit",
        type="gadget/wardrobe",
        description="The signature spy uniform. Form-fitting bodysuit worn during missions.",
        colors=suit_colors,
        features=["Utility belt", "Communication access", "Combat ready"],
    )

    return {"Compowder": compowder, "Catsuit": catsuit}


# Curated descriptions that replace the scraped text for well-known locations.
_LOCATION_DESCRIPTIONS = {
    "WOOHP": "World Organization of Human Protection - The spy agency founded by Jerry Lewis. Global organization with headquarters and various facilities.",
    "WOOHP Tower": "Main headquarters of WOOHP, featuring mission control, training facilities, and Jerry's office.",
    "Beverly Hills": "California city where the spies live and attend high school (later Malibu University).",
    "Malibu University": "University attended by the spies in later seasons. Features dorms, classes, and campus locations.",
    "WOOHP Express": "Specialized train used by WOOHP for transportation and mobile operations."
}

# Hand-written Season 7 villain facts layered on top of the scraped villains.
_SEASON7_VILLAINS = {
    "Cyberchac": {
        "name": "Cyberchac",
        "type": "villain",
        "season": 7,
        "episode_appearances": "Multiple episodes",
        "threat_level": "High - recurring villain",
        "motive": "Digital/technology-based crimes"
    },
    "The Curator": {
        "name": "The Curator",
        "type": "villain",
        "season": 7,
        "episode_appearances": "Featured in artifact/collection episodes",
        "threat_level": "Medium",
        "motive": "Historical artifact theft"
    },
    "Schamagy": {
        "name": "Schamagy",
        "type": "villain",
        "season": 7,
        "status": "Confirmed villain name from Season 7 materials",
        "note": "Spelling verified from promotional sources"
    }
}

# Source of the generated query tool. This MUST be a raw string: in a normal
# string literal every "\n" below would be expanded into a real line break,
# splitting the generated print("...") calls across lines and producing a
# script that fails with a SyntaxError.
_QUERY_SCRIPT = r'''#!/usr/bin/env python3
import json
import sys

def query_database():
    with open('/home/mnm/workspaces/totally-spies-cultshot/data/fandom_wiki/clean_database.json', 'r', encoding='utf-8') as f:
        db = json.load(f)
    
    print("Totally Spies Fandom Database Query Tool")
    print("=" * 50)
    
    while True:
        print("\nCommands:")
        print("  chars - List all characters")
        print("  main  - Show main characters details")
        print("  villains - List villains")
        print("  gadgets - List gadgets")
        print("  locations - List locations")
        print("  find [name] - Search for specific entry")
        print("  quit  - Exit")
        
        cmd = input("\n> ").strip().lower()
        
        if cmd == 'quit':
            break
        elif cmd == 'chars':
            print("\nMain Characters:")
            for name in db['characters']:
                print(f"  - {name}")
            print("\nSupporting Characters:")
            for name in db.get('supporting_characters', {}):
                print(f"  - {name}")
        elif cmd == 'main':
            for name, data in db['characters'].items():
                print(f"\n{name}:")
                print(f"  Role: {data.get('role', 'N/A')}")
                print(f"  Hair: {data.get('hair_color', 'N/A')}")
                print(f"  Personality: {', '.join(data.get('personality', []))}")
        elif cmd == 'villains':
            print("\nVillains:")
            for name, data in db['villains'].items():
                print(f"  - {name} ({data.get('type', 'villain')})")
        elif cmd == 'gadgets':
            print("\nGadgets:")
            for name, data in db['gadgets'].items():
                print(f"  - {name}: {data.get('description', 'N/A')[:60]}...")
        elif cmd == 'locations':
            print("\nLocations:")
            for name, data in db['locations'].items():
                print(f"  - {name}: {data.get('description', 'N/A')[:100]}...")
        elif cmd.startswith('find '):
            term = cmd[5:]
            found = False
            for category in ['characters', 'supporting_characters', 'villains', 'gadgets', 'locations']:
                for name, data in db.get(category, {}).items():
                    if term.lower() in name.lower():
                        print(f"\n{name} ({category}):")
                        print(json.dumps(data, indent=2))
                        found = True
            if not found:
                print(f"No results for '{term}'")
        else:
            print("Unknown command")

if __name__ == '__main__':
    query_database()
'''


def _main_character_names(characters):
    """Return every lower-cased key, name and alias of the main characters."""
    names = set()
    for key, facts in characters.items():
        for candidate in (key, facts.get("name"), facts.get("alias")):
            if candidate:
                names.add(candidate.lower())
    return names


def main():
    """Build the clean database and the query tool, then print a summary.

    Reads the raw dump via ``load_raw_database``, combines the hard-coded
    main-character and gadget facts with summaries extracted from the raw
    characters, villains and locations, writes the result to
    ``data/fandom_wiki/clean_database.json`` and writes an executable
    ``tools/query_fandom.py`` next to it.
    """
    print("Loading raw database...")
    raw = load_raw_database()

    # Create clean database
    main_characters = create_character_facts()
    clean_db = {
        "meta": {
            "version": "3.0",
            "source": "totallyspies.fandom.com",
            # NOTE(review): fixed date, not the time of this run - confirm
            # whether it should be generated instead.
            "cleaned_at": "2026-04-13",
            "description": "Query-friendly character, gadget, and location database"
        },
        "characters": main_characters,
        "supporting_characters": {},
        "villains": {},
        "gadgets": create_gadget_facts(),
        "locations": {}
    }

    # Process supporting characters. Main characters are recognised by key,
    # full name or alias (case-insensitively) so that a scraped "Sam" or
    # "Alex" page is not listed a second time as a supporting character.
    print("Processing supporting characters...")
    main_names = _main_character_names(main_characters)
    for name, data in raw.get("characters", {}).items():
        if name.lower() in main_names:
            continue
        summary = extract_character_summary(data)
        if summary["description"]:
            clean_db["supporting_characters"][name] = summary

    # Process villains
    print("Processing villains...")
    for name, data in raw.get("villains", {}).items():
        summary = extract_villain_summary(data)
        if summary["description"]:
            clean_db["villains"][name] = summary

    # Process locations
    print("Processing locations...")
    for name, data in raw.get("locations", {}).items():
        summary = extract_location_summary(data)
        if name in _LOCATION_DESCRIPTIONS:
            summary["description"] = _LOCATION_DESCRIPTIONS[name]
        clean_db["locations"][name] = summary

    # Add specific Season 7 villains. The curated fields win, but anything
    # scraped for the same villain (description, url, ...) is kept.
    for name, facts in _SEASON7_VILLAINS.items():
        merged = dict(clean_db["villains"].get(name, {}))
        merged.update(facts)
        clean_db["villains"][name] = merged

    # Save clean database
    output_path = Path("/home/mnm/workspaces/totally-spies-cultshot/data/fandom_wiki/clean_database.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(clean_db, f, indent=2, ensure_ascii=False)

    print(f"Clean database saved to: {output_path}")

    # Create summary
    summary = {
        "main_characters": len(clean_db["characters"]),
        "supporting_characters": len(clean_db["supporting_characters"]),
        "villains": len(clean_db["villains"]),
        "gadgets": len(clean_db["gadgets"]),
        "locations": len(clean_db["locations"])
    }

    print("\nSummary:")
    print(f"  Main Characters: {summary['main_characters']}")
    print(f"  Supporting: {summary['supporting_characters']}")
    print(f"  Villains: {summary['villains']}")
    print(f"  Gadgets: {summary['gadgets']}")
    print(f"  Locations: {summary['locations']}")

    # Also create a query script. The tools directory may not exist yet.
    query_path = Path("/home/mnm/workspaces/totally-spies-cultshot/tools/query_fandom.py")
    query_path.parent.mkdir(parents=True, exist_ok=True)
    with open(query_path, 'w', encoding='utf-8') as f:
        f.write(_QUERY_SCRIPT)
    query_path.chmod(0o755)

    print(f"\nQuery tool created: {query_path}")
    print("Run: python3 tools/query_fandom.py")


# Run the database build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()