#!/usr/bin/env python3
"""
Totally Spies Fandom Wiki Scraper
Scrapes character, gadget, location, and episode data from totallyspies.fandom.com
Stores in structured JSON format for query-friendly access.
"""

import datetime
import json
import re
import subprocess
import sys
from pathlib import Path
from html.parser import HTMLParser

class SimpleHTMLTextExtractor(HTMLParser):
    """Extract text from HTML, ignoring tags but preserving structure hints."""
    def __init__(self):
        super().__init__()
        self.text = []
        self.in_script = False
        self.in_style = False
        
    def handle_starttag(self, tag, attrs):
        if tag == 'script':
            self.in_script = True
        elif tag == 'style':
            self.in_style = True
        elif tag in ('h1', 'h2', 'h3', 'h4'):
            # Mark headings in wiki style so extract_sections() can recognize them.
            self.text.append('\n== ')
        elif tag in ('p', 'br', 'li', 'tr'):
            self.text.append('\n')
        elif tag == 'td':
            self.text.append(' | ')

    def handle_endtag(self, tag):
        if tag == 'script':
            self.in_script = False
        elif tag == 'style':
            self.in_style = False
        elif tag in ('h1', 'h2', 'h3', 'h4'):
            self.text.append(' ==\n')
        elif tag in ('p', 'li', 'tr'):
            self.text.append('\n')
            
    def handle_data(self, data):
        if not self.in_script and not self.in_style:
            self.text.append(data)
            
    def get_text(self):
        return ' '.join(self.text)

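# Example (hypothetical input): feeding "<h2>Biography</h2><p>A WOOHP spy.</p>"
# through this extractor and the whitespace cleanup in extract_text_from_html()
# gives roughly "== Biography ==\nA WOOHP spy.", which extract_sections() below
# can then split into named sections.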

def fetch_page(url):
    """Fetch page content using curl (-s silent, -L follow redirects, -A user agent)."""
    try:
        result = subprocess.run(
            ['curl', '-s', '-L', '-A', 'Mozilla/5.0 (research purposes)', url],
            capture_output=True,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            print(f"curl exited with {result.returncode} for {url}", file=sys.stderr)
            return None
        return result.stdout
    except Exception as e:
        print(f"Error fetching {url}: {e}", file=sys.stderr)
        return None


def extract_text_from_html(html):
    """Extract readable text from HTML."""
    extractor = SimpleHTMLTextExtractor()
    try:
        extractor.feed(html)
        text = extractor.get_text()
        # Clean up whitespace
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r' +', ' ', text)
        return text.strip()
    except Exception:
        return ""


def extract_infobox_data(html, page_type):
    """Extract data from Fandom portable infoboxes (page_type is currently unused)."""
    data = {}
    
    # Look for infobox patterns
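    # Fandom "portable infobox" fields typically render roughly like this
    # (simplified, illustrative markup):
    #   <div class="pi-item pi-data" data-source="occupation">
    #     <h3 class="pi-data-label">Occupation</h3>
    #     <div class="pi-data-value">WOOHP spy</div>
    #   </div>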
    infobox_pattern = r'data-source="([^"]+)"[^>]*>([^<]+)'
    matches = re.findall(infobox_pattern, html)
    
    for key, value in matches:
        clean_key = key.strip().lower().replace('_', ' ')
        clean_value = value.strip()
        if clean_value:
            data[clean_key] = clean_value
    
    # Alternative pattern for pi-data
    alt_pattern = r'class="pi-data"[^>]*>.*?class="pi-data-label"[^>]*>([^<]+).*?class="pi-data-value"[^>]*>([^<]+)'
    alt_matches = re.findall(alt_pattern, html, re.DOTALL)
    for key, value in alt_matches:
        clean_key = key.strip().lower().replace('_', ' ')
        clean_value = re.sub(r'<[^>]+>', '', value).strip()
        if clean_value and clean_key not in data:
            data[clean_key] = clean_value
    
    return data


def extract_sections(text):
    """Extract sections from wiki text."""
    sections = {}
    current_section = "Overview"
    current_content = []
    
    for line in text.split('\n'):
        # Check for section headers
        if line.strip().startswith('==') and line.strip().endswith('=='):
            # Save previous section
            if current_content:
                sections[current_section] = '\n'.join(current_content).strip()
            # Start new section
            current_section = line.strip().strip('=').strip()
            current_content = []
        elif line.strip():
            current_content.append(line)
    
    # Save final section
    if current_content:
        sections[current_section] = '\n'.join(current_content).strip()
    
    return sections

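# For example (hypothetical input), text containing the lines
#     == Personality ==
#     Outgoing and fashion-obsessed.
# yields {"Personality": "Outgoing and fashion-obsessed."}, with any text before
# the first header collected under "Overview".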

# URLs to scrape
# Base article URL; the page paths below are appended directly (e.g. BASE_URL + "/Sam")
BASE_URL = "https://totallyspies.fandom.com/wiki"

# Character pages
CHARACTER_PAGES = [
    ("Sam", "/Sam"),
    ("Clover", "/Clover"),
    ("Alexandra" ,"/Alexandra"),
    ("Jerry Lewis", "/Jerry_Lewis"),
    ("Mandy", "/Mandy"),
    ("Zerlina", "/Zerlina_Lewis"),
    ("Toby", "/Toby"),
    ("Glitterstar", "/Glitterstar"),
    ("Britney", "/Britney"),
    ("David", "/David"),
    ("Arnold Jackson", "/Arnold_Jackson"),
]

# Villain categories
VILLAIN_LISTS = [
    "/Category:Villains",
    "/Category:Season_7_Villains",
]

# Gadget/tech pages
GADGET_PAGES = [
    ("Compowder", "/Compowder"),
    ("Gadgets", "/List_of_Gadgets"),
]

# Location pages  
LOCATION_PAGES = [
    ("WOOHP", "/WOOHP"),
    ("WOOHP Tower", "/WOOHP_Tower"),
    ("Beverly Hills", "/Beverly_Hills"),
    ("Malibu University", "/Malibu_University"),
]

# Season 7 specific
SEASON7_PAGES = [
    ("Season 7", "/Season_7"),
    ("Cyberchac", "/Cyberchac"),
    ("The Curator", "/The_Curator"),
    ("Schamagy", "/Schamagy"),
]


def scrape_character(name, path):
    """Scrape a character page."""
    url = BASE_URL + path  # plain concatenation; urljoin would drop the /wiki segment for absolute paths
    print(f"Scraping character: {name}...", file=sys.stderr)
    
    html = fetch_page(url)
    if not html:
        return None
    
    text = extract_text_from_html(html)
    infobox = extract_infobox_data(html, "character")
    sections = extract_sections(text)
    
    return {
        "name": name,
        "url": url,
        "type": "character",
        "infobox": infobox,
        "sections": sections,
        "full_text": text[:5000]  # Truncated for size
    }


def scrape_villain_list(path):
    """Scrape a villain category/list page."""
    url = BASE_URL + path
    print(f"Scraping villains: {path}...", file=sys.stderr)
    
    html = fetch_page(url)
    if not html:
        return []
    
    # Extract villain links
    villains = []
    # Pattern for category pages
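    # (member links on Fandom category listings look roughly like
    #  <a href="/wiki/Tim_Scam" class="category-page__member-link">Tim Scam</a>)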
    villain_pattern = r'class="category-page__member-link"[^>]*>([^<]+)'
    matches = re.findall(villain_pattern, html)
    
    for match in matches:
        name = match.strip()
        if name and not any(x in name.lower() for x in ['category:', 'template:', 'file:']):
            villains.append({
                "name": name,
                "source_page": url,
                "scraped_at": None
            })
    
    return villains


def scrape_gadget(name, path):
    """Scrape a gadget/tech page."""
    url = BASE_URL + path
    print(f"Scraping gadget: {name}...", file=sys.stderr)
    
    html = fetch_page(url)
    if not html:
        return None
    
    text = extract_text_from_html(html)
    infobox = extract_infobox_data(html, "gadget")
    sections = extract_sections(text)
    
    return {
        "name": name,
        "url": url,
        "type": "gadget",
        "infobox": infobox,
        "sections": sections,
        "full_text": text[:3000]
    }


def scrape_location(name, path):
    """Scrape a location page."""
    url = BASE_URL + path
    print(f"Scraping location: {name}...", file=sys.stderr)
    
    html = fetch_page(url)
    if not html:
        return None
    
    text = extract_text_from_html(html)
    infobox = extract_infobox_data(html, "location")
    sections = extract_sections(text)
    
    return {
        "name": name,
        "url": url,
        "type": "location",
        "infobox": infobox,
        "sections": sections,
        "full_text": text[:3000]
    }


def main():
    output_dir = Path("/home/mnm/workspaces/totally-spies-cultshot/data/fandom_wiki")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    database = {
        "meta": {
            "source": "totallyspies.fandom.com",
            "scraped_at": None,  # Will be filled
            "version": "1.0"
        },
        "characters": {},
        "villains": [],
        "gadgets": {},
        "locations": {},
        "season7": {}
    }
    
    # Scrape main characters
    for name, path in CHARACTER_PAGES:
        data = scrape_character(name, path)
        if data:
            database["characters"][name] = data
    
    # Scrape villains
    for path in VILLAIN_LISTS:
        villains = scrape_villain_list(path)
        database["villains"].extend(villains)
    
    # Scrape gadgets
    for name, path in GADGET_PAGES:
        data = scrape_gadget(name, path)
        if data:
            database["gadgets"][name] = data
    
    # Scrape locations
    for name, path in LOCATION_PAGES:
        data = scrape_location(name, path)
        if data:
            database["locations"][name] = data
    
    # Scrape Season 7 content
    for name, path in SEASON7_PAGES:
        if "Season" in name:
            data = scrape_location(name, path)  # Reuse location scraper for season page
        else:
            data = scrape_character(name, path)  # Reuse character scraper
        if data:
            database["season7"][name] = data
    
    # Record the scrape time, then save the main database
    database["meta"]["scraped_at"] = datetime.datetime.now().isoformat()
    db_path = output_dir / "database.json"
    with open(db_path, 'w', encoding='utf-8') as f:
        json.dump(database, f, indent=2, ensure_ascii=False)
    
    print(f"Database saved to: {db_path}", file=sys.stderr)
    
    # Create summary report
    summary = {
        "characters": len(database["characters"]),
        "villains_listed": len(database["villains"]),
        "gadgets": len(database["gadgets"]),
        "locations": len(database["locations"]),
        "season7_entries": len(database["season7"])
    }
    
    summary_path = output_dir / "summary.json"
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)
    
    print(f"Summary: {summary}", file=sys.stderr)
    
    # Also create a flat searchable index
    index = []
    for char_name, char_data in database["characters"].items():
        index.append({
            "name": char_name,
            "type": "character",
            "url": char_data["url"],
            "keywords": extract_keywords(char_data["full_text"])
        })
    
    for gad_name, gad_data in database["gadgets"].items():
        index.append({
            "name": gad_name,
            "type": "gadget",
            "url": gad_data["url"],
            "keywords": extract_keywords(gad_data["full_text"])
        })
    
    index_path = output_dir / "search_index.json"
    with open(index_path, 'w', encoding='utf-8') as f:
        json.dump(index, f, indent=2, ensure_ascii=False)
    
    return database


def extract_keywords(text):
    """Extract key terms from text for search indexing."""
    # Simple heuristic: capitalized words and acronyms (e.g. "Clover", "WOOHP"),
    # skipping short words and common sentence starters.
    stopwords = {'This', 'That', 'With', 'From', 'They', 'Have', 'Were', 'Been'}
    words = re.findall(r'\b[A-Z][A-Za-z]+\b', text)
    word_freq = {}
    for word in words:
        if len(word) > 3 and word not in stopwords:
            word_freq[word] = word_freq.get(word, 0) + 1

    # Return the ten most frequent keywords
    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    return [w[0] for w in sorted_words[:10]]


if __name__ == "__main__":
    main()
    print("Scraping complete.", file=sys.stderr)
