#!/usr/bin/env python3
"""
Totally Spies Fandom Wiki Scraper using jina.ai for clean text extraction
Scrapes character, gadget, location, and episode data from totallyspies.fandom.com
Stores in structured JSON format for query-friendly access.
"""

import datetime
import json
import re
import subprocess
import sys
from pathlib import Path


def fetch_page_jina(url):
    """Fetch page content as markdown via the jina.ai reader service."""
    # r.jina.ai takes the target URL (scheme included) appended to its own
    jina_url = f"https://r.jina.ai/{url}"
    try:
        result = subprocess.run(
            ['curl', '-s', '-L', '--max-time', '30', jina_url],
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode != 0:
            print(f"curl exited with {result.returncode} for {url}", file=sys.stderr)
            return None
        return result.stdout
    except Exception as e:
        print(f"Error fetching {url}: {e}", file=sys.stderr)
        return None
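

def fetch_page_jina_with_retry(url, attempts=3):
    """Retry wrapper around fetch_page_jina (a minimal sketch; the attempt
    count and exponential backoff are assumptions, not tuned to jina.ai's
    actual rate limits)."""
    import time
    for attempt in range(attempts):
        text = fetch_page_jina(url)
        if text:
            return text
        time.sleep(2 ** attempt)  # back off 1s, 2s, 4s between attempts
    return None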


def parse_jina_content(text):
    """Parse content returned from jina.ai."""
    lines = text.split('\n')
    
    # Remove jina.ai metadata lines
    content_lines = []
    in_content = False
    
    for line in lines:
        # Skip header metadata
        if line.startswith('Title:') or line.startswith('URL Source:') or line.startswith('Published Time:'):
            continue
        if line.startswith('Markdown Content:'):
            in_content = True
            continue
        if not in_content:
            continue
        
        # Skip Fandom navigation cruft
        if any(skip in line for skip in [
            '[Totally Spies Wiki]', '[Sign In]', '[Create a Free Account]',
            '[Edit]', '[Talk]', '[Read]', '[View history]', '[Upload file]',
            'Navigation menu', 'Personal tools', 'Namespaces', 'Variants',
            'Views', 'More', 'Search', 'Categories:', 'Community content',
            'Advertisement', 'Fan Feed', 'Explore properties', 'Fandom Muthead',
            'FandomShop', 'Spies around the world'
        ]):
            continue
        
        content_lines.append(line)
    
    return '\n'.join(content_lines)
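

# Illustrative shape of the raw r.jina.ai response that parse_jina_content
# strips (the field names match the prefixes checked above; the page body
# below "Markdown Content:" is what we keep):
#
#   Title: Clover | Totally Spies Wiki | Fandom
#   URL Source: https://totallyspies.fandom.com/wiki/Clover
#   Markdown Content:
#   ...page body as markdown...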


def extract_sections(text):
    """Extract sections from markdown content."""
    sections = {}
    current_section = "Overview"
    current_content = []
    
    for line in text.split('\n'):
        # Check for markdown headers
        header_match = re.match(r'^(#{1,4})\s+(.+)$', line)
        if header_match:
            # Save previous section
            if current_content:
                sections[current_section] = '\n'.join(current_content).strip()
            # Start new section
            current_section = header_match.group(2).strip()
            current_content = []
        else:
            current_content.append(line)
    
    # Save final section
    if current_content:
        sections[current_section] = '\n'.join(current_content).strip()
    
    return sections
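

# Usage sketch for extract_sections (illustrative input):
#   extract_sections("Intro line\n## Personality\nUpbeat and loyal.")
#   -> {"Overview": "Intro line", "Personality": "Upbeat and loyal."}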


def extract_character_info(text, name):
    """Extract structured info about a character."""
    info = {
        "name": name,
        "aliases": [],
        "hair_color": None,
        "eye_color": None,
        "occupation": None,
        "organization": None,
        "personality": [],
        "gadgets": [],
        "relationships": {},
        "appearances": [],
        "voiced_by": None,
        "first_appearance": None
    }
    
    # Extract hair color
    # Extract the line mentioning hair (heuristic: keeps the whole line,
    # which usually includes the color)
    hair_match = re.search(r'[^\n]*\bhair\b[^\n]*', text, re.IGNORECASE)
    if hair_match:
        info["hair_color"] = hair_match.group(0).strip()
    
    # Extract the line mentioning eye color (same whole-line heuristic)
    eye_match = re.search(r'[^\n]*\beyes?\b[^\n]*', text, re.IGNORECASE)
    if eye_match:
        info["eye_color"] = eye_match.group(0).strip()
    
    # Look for WOOHP mentions
    if 'WOOHP' in text or 'World Organization of Human Protection' in text:
        info["organization"] = "WOOHP"
    
    # Extract aliases, but only scan if the page uses alias-style phrasing
    if re.search(r'\baka\b|also known as|\balias\b', text, re.IGNORECASE):
        # Look for quoted names or patterns like "aka X"
        alias_matches = re.findall(r'"([^"]+)"|\baka\s+([^,.\n]+)', text, re.IGNORECASE)
        for match in alias_matches:
            alias = (match[0] or match[1]).strip()
            # Keep short, plausible names and skip duplicates
            if alias and len(alias) < 50 and alias not in info["aliases"]:
                info["aliases"].append(alias)
    
    # Look for personality traits as whole words (case-insensitive)
    personality_keywords = ['intelligent', 'athletic', 'fashionable', 'determined', 'responsible', 'mischievous', 'loyal', 'clumsy', 'sarcastic']
    for keyword in personality_keywords:
        if re.search(rf'\b{keyword}\b', text, re.IGNORECASE):
            info["personality"].append(keyword)
    
    return info
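

# Usage sketch for extract_character_info (illustrative input, not real
# wiki text):
#   extract_character_info("Sam has long red hair and green eyes. WOOHP agent.", "Samantha")
#   -> name == "Samantha", organization == "WOOHP", and hair_color /
#      eye_color hold the matching line from the text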


def extract_gadget_info(text, name):
    """Extract structured info about a gadget."""
    info = {
        "name": name,
        "type": None,
        "function": None,
        "appearances": [],
        "related_gadgets": []
    }
    
    # Function description
    function_patterns = [
        r'used to ([^\.]+)',
        r'allows? (?:the user )?to ([^\.]+)',
        r'functions? as ([^\.]+)',
        r'capable of ([^\.]+)'
    ]
    for pattern in function_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            info["function"] = match.group(1).strip()
            break
    
    return info
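

# Usage sketch for extract_gadget_info (illustrative input):
#   extract_gadget_info("The Compowder is used to communicate with WOOHP.", "Compowder")
#   -> info["function"] == "communicate with WOOHP"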


def scrape_entity(name, path, entity_type, base_url="https://totallyspies.fandom.com/wiki"):
    """Scrape a specific entity (character, gadget, etc.)."""
    if not path.startswith('http'):
        url = f"{base_url}/{path}"
    else:
        url = path
    
    print(f"Fetching: {name} ({entity_type})...", file=sys.stderr)
    
    text = fetch_page_jina(url)
    # Crude failure heuristic: an 'Error' marker near the top means a failed fetch
    if not text or 'Error' in text[:100]:
        print(f"  Failed to fetch {name}", file=sys.stderr)
        return None
    
    content = parse_jina_content(text)
    sections = extract_sections(content)
    
    # Extract type-specific info
    if entity_type == "character":
        metadata = extract_character_info(content, name)
    elif entity_type == "gadget":
        metadata = extract_gadget_info(content, name)
    else:
        metadata = {"name": name}
    
    return {
        "name": name,
        "type": entity_type,
        "url": url,
        "metadata": metadata,
        "sections": sections,
        "full_text": content[:5000]
    }


def main():
    output_dir = Path("/home/mnm/workspaces/totally-spies-cultshot/data/fandom_wiki")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    database = {
        "meta": {
            "source": "totallyspies.fandom.com (via jina.ai)",
            "scraped_at": None,
            "version": "2.0"
        },
        "characters": {},
        "gadgets": {},
        "locations": {},
        "villains": {},
        "seasons": {}
    }
    
    # Main characters
    characters = [
        ("Samantha", "Samantha"),
        ("Clover", "Clover"),
        ("Alexandra", "Alexandra"),
        ("Jerry Lewis", "Jerry_Lewis"),
        ("Mandy", "Mandy"),
        ("Zerlina Lewis", "Zerlina_Lewis"),
        ("Toby", "Toby"),
        ("Glitterstar", "Glitterstar"),
        ("Britney", "Britney"),
        ("David", "David"),
        ("Arnold Jackson", "Arnold_Jackson"),
        ("Blaine", "Blaine"),
    ]
    
    for name, path in characters:
        data = scrape_entity(name, path, "character")
        if data:
            database["characters"][name] = data
    
    # Gadgets
    gadgets = [
        ("Compowder", "Compowder"),
    ]
    
    for name, path in gadgets:
        data = scrape_entity(name, path, "gadget")
        if data:
            database["gadgets"][name] = data
    
    # Try to get the gadget list separately
    print("Fetching gadget list...", file=sys.stderr)
    gadget_list = scrape_entity("Gadget List", "List_of_Gadgets", "gadget_list")
    if gadget_list:
        database["gadgets"]["_list"] = gadget_list
    
    # Locations
    locations = [
        ("WOOHP", "WOOHP"),
        ("WOOHP Tower", "WOOHP_Tower"),
        ("Beverly Hills", "Beverly_Hills"),
        ("Malibu University", "Malibu_University"),
    ]
    
    for name, path in locations:
        data = scrape_entity(name, path, "location")
        if data:
            database["locations"][name] = data
    
    # Villains
    villains = [
        ("Cyberchac", "Cyberchac"),
        ("The Curator", "The_Curator"),
        ("Schamagy", "Schamagy"),
        ("Terrence Lewis", "Terrence_Lewis"),
        ("Tim Scam", "Tim_Scam"),
    ]
    
    for name, path in villains:
        data = scrape_entity(name, path, "villain")
        if data:
            database["villains"][name] = data
    
    # Seasons 1 through 7
    for season_num in range(1, 8):
        season_name = f"Season {season_num}"
        season_path = f"Season_{season_num}"
        data = scrape_entity(season_name, season_path, "season")
        if data:
            database["seasons"][season_name] = data
    
    # Record scrape timestamp
    database["meta"]["scraped_at"] = datetime.datetime.now().isoformat()
    
    # Save database
    with open(output_dir / "database.json", 'w', encoding='utf-8') as f:
        json.dump(database, f, indent=2, ensure_ascii=False)
    
    # Create summary
    summary = {
        "characters": len(database["characters"]),
        "gadgets": len([k for k in database["gadgets"] if not k.startswith('_')]),
        "locations": len(database["locations"]),
        "villains": len(database["villains"]),
        "seasons": len(database["seasons"])
    }
    
    with open(output_dir / "summary.json", 'w') as f:
        json.dump(summary, f, indent=2)
    
    print(f"\nScraping complete!", file=sys.stderr)
    print(f"Summary: {summary}", file=sys.stderr)
    
    return database
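

def load_database(output_dir="/home/mnm/workspaces/totally-spies-cultshot/data/fandom_wiki"):
    """Load a previously scraped database.json for querying (a convenience
    sketch; the default path mirrors the directory main() writes to)."""
    with open(Path(output_dir) / "database.json", encoding='utf-8') as f:
        return json.load(f)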


if __name__ == "__main__":
    main()
