#!/usr/bin/env bash
# Wayback Machine Brand Validator — v2.6 (robust, non-id_ URL, HTML filter)
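#
# Usage: ./<this-script> DOMAIN          (script name illustrative)
#   e.g. ./<this-script> example.com
# Prints one JSON object to stdout; illustrative shape (values vary):
#   {"domain":"example.com","status":"ok","snapshots":5,"latestTs":"20220101000000", ... ,"score":85}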
set -euo pipefail

DOMAIN="${1:-}"
[[ -z "$DOMAIN" ]] && { echo '{"domain":"","status":"error","detail":"no domain"}'; exit 1; }

CACHE_ROOT="data/cache/wayback"
mkdir -p "$CACHE_ROOT"
ROOT="${DOMAIN%.com}"      # cache key drops a trailing ".com"; other TLDs keep the full domain
ROOT="${ROOT//\//_}"       # a slash in the argument must not escape the cache directory
CACHE_FILE="$CACHE_ROOT/${ROOT}_wayback.txt"
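# A cache hit is returned verbatim and never expires; delete the file to force a refresh.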

if [[ -s "$CACHE_FILE" ]]; then cat "$CACHE_FILE"; exit 0; fi

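# Probe a spread of years: web.archive.org/web/<YEAR>/<url> redirects (via a
# Location header) to the snapshot closest to that timestamp, so each probe
# can surface a different capture.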
YEARS="2005 2010 2012 2015 2018 2020 2022"
BEST_LEN=0
BEST_HTML=""
BEST_TS=""
SNAP_COUNT=0

for YEAR in $YEARS; do
  set +e
  HEAD=$(curl -sI --max-time 5 "https://web.archive.org/web/${YEAR}/${DOMAIN}" 2>/dev/null)
  CURL_STATUS=$?
  set -e
  [[ "$CURL_STATUS" -ne 0 ]] && continue
  [[ -z "$HEAD" ]] && continue

  LOWER=$(echo "$HEAD" | tr '[:upper:]' '[:lower:]')
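  # Illustrative redirect target (14-digit timestamp, then the original URL):
  #   location: https://web.archive.org/web/20150301000000/http://example.com/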
  LOC=$(echo "$LOWER" | sed -n 's/^location: *//p' | sed 's/\r//' | head -1)
  [[ -z "$LOC" ]] && continue
  # Wayback normally sends an absolute Location; tolerate a relative one
  [[ "$LOC" == /* ]] && LOC="https://web.archive.org${LOC}"

  SNAP_COUNT=$((SNAP_COUNT + 1))

  # Use wrapper URL (not id_) — more content available
  # The LOC already contains the full wayback URL
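  # (The raw variant appends id_ to the timestamp — .../web/20150301000000id_/... —
  #  which skips Wayback's toolbar rewriting but often returns less complete markup.)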
  set +e
  HTML=$(curl -sL --max-time 10 "$LOC" 2>/dev/null | tr '\n' ' ' | tr -d '\0')
  CURL_STATUS=$?
  set -e
  [[ "$CURL_STATUS" -ne 0 ]] && continue
  LEN=${#HTML}

  # Must be HTML-like content
  if ! echo "$HTML" | grep -qiE '^[[:space:]]*<(html|!doctype|head|title|body|div|span|a|img|script|link|meta)([[:space:]>/]|$)'; then
    continue
  fi

  # Skip Amazon CAPTCHA placeholders
  if echo "$HTML" | grep -Eiq 'captcha.*amazon|enter.*characters|type.*characters|sorry.*robot|amazon.*validatecaptcha|type the characters'; then
    continue
  fi

  if [[ "$LEN" -gt "$BEST_LEN" ]]; then
    BEST_LEN=$LEN
    BEST_HTML="$HTML"
    BEST_TS=$(echo "$LOC" | grep -oE '[0-9]{14}' | head -1 || true)
  fi
done
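# BEST_HTML now holds the longest HTML-looking, non-CAPTCHA snapshot found and
# BEST_TS its 14-digit Wayback timestamp (both empty if nothing qualified).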

TITLE=""; META=""; IS_P="false"; HAS_LI="false"; LI_URL=""; HAS_TW="false"; HAS_FB="false"; BRAND="false"; SCORE=0

if [[ -n "$BEST_HTML" ]]; then
  set +e
  # Title extraction — PCRE lookbehind first (GNU grep), then a plain fallback
  TITLE=$(echo "$BEST_HTML" | grep -oiP '(?<=<title>)[^<]+' | head -1 | sed 's/"/“/g' | awk '{$1=$1;print}' | cut -c1-200)
  if [[ -z "$TITLE" ]]; then
    TITLE=$(echo "$BEST_HTML" | grep -oi '<title>[^<]*</title>' | sed 's/<title>//;s/<\/title>//' | head -1 | sed 's/"/“/g' | awk '{$1=$1;print}' | cut -c1-200)
  fi
  META=$(echo "$BEST_HTML" | grep -oi 'meta name="description" content="[^"]*"' | sed 's/.*content="\([^"]*\)".*/\1/' | head -1 | sed 's/"/“/g' | awk '{$1=$1;print}' | cut -c1-300)
  set -e

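  # Parking heuristics: lander copy used by aftermarket services (Sedo,
  # Afternic) plus generic "domain for sale" phrasing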
  echo "$BEST_HTML" | grep -Eiq 'domain is for sale|buy this domain|parked|sedo|afternic|undeveloped|domain parking|available for|domain name is for sale|make offer|buy now' && IS_P="true"
  
  # Extract social URLs from href attributes only — prevents false positives
  # from body text (a bare 'x.com' pattern would also match e.g. "wix.com")
  LI_URL=$(echo "$BEST_HTML" | grep -oiE 'href="https?://([^"/]*\.)?linkedin\.com/(company|in)(/[^"]*)?"' | head -1 | sed 's/^href="//;s/"$//' || true)
  [[ -n "$LI_URL" ]] && HAS_LI="true"
  TW_URL=$(echo "$BEST_HTML" | grep -oiE 'href="https?://([^"/]*\.)?twitter\.com/[^"]*"' | head -1 | sed 's/^href="//;s/"$//' || true)
  [[ -z "$TW_URL" ]] && TW_URL=$(echo "$BEST_HTML" | grep -oiE 'href="https?://(www\.)?x\.com/[^"]*"' | head -1 | sed 's/^href="//;s/"$//' || true)
  [[ -n "$TW_URL" ]] && HAS_TW="true"
  FB_URL=$(echo "$BEST_HTML" | grep -oiE 'href="https?://([^"/]*\.)?facebook\.com/[^"]*"' | head -1 | sed 's/^href="//;s/"$//' || true)
  [[ -n "$FB_URL" ]] && HAS_FB="true"

  # Clean trailing junk (quotes, commas, brackets) that occasionally survives
  # inside href values
  JUNK='s/["'\''<>,})]*$//'
  LI_URL=$(echo "$LI_URL" | sed "$JUNK")
  TW_URL=$(echo "$TW_URL" | sed "$JUNK")
  FB_URL=$(echo "$FB_URL" | sed "$JUNK")

  (echo "$TITLE"; echo "$META") | grep -Eiq '(inc|llc|ltd|corp|company|limited|solutions|technologies|tech|software|digital|media|group|services|studio|labs|ventures|consulting|agency|partners|platform)' && BRAND="true"

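  # Scoring rubric (capped at 100): not parked +40; LinkedIn +35; Twitter/X +10;
  # Facebook +10; brand keyword +15; non-empty title +5.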
  [[ "$IS_P" == "false" ]] && SCORE=$((SCORE+40))
  
  # Social bonuses only unlock when at least two platforms are detected;
  # a single link is weak evidence of an operating brand
  SOCIAL_COUNT=0
  [[ "$HAS_LI" == "true" ]] && SOCIAL_COUNT=$((SOCIAL_COUNT+1))
  [[ "$HAS_TW" == "true" ]] && SOCIAL_COUNT=$((SOCIAL_COUNT+1))
  [[ "$HAS_FB" == "true" ]] && SOCIAL_COUNT=$((SOCIAL_COUNT+1))
  
  if [[ "$SOCIAL_COUNT" -ge 2 ]]; then
    [[ "$HAS_LI" == "true" ]] && SCORE=$((SCORE+35))
    [[ "$HAS_TW" == "true" ]] && SCORE=$((SCORE+10))
    [[ "$HAS_FB" == "true" ]] && SCORE=$((SCORE+10))
  fi
  
  [[ "$BRAND" == "true" ]] && SCORE=$((SCORE+15))
  [[ -n "$TITLE" && "$TITLE" != " " ]] && SCORE=$((SCORE+5))
  [[ $SCORE -gt 100 ]] && SCORE=100
fi

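# Emit one JSON line and cache it (note: not_found results are cached too).
# Double quotes in title/meta were swapped for curly quotes above, which keeps
# this hand-built JSON parseable; backslashes and control characters are not
# escaped, so treat those fields as best-effort.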
printf '{"domain":"%s","status":"%s","snapshots":%d,"latestTs":"%s","title":"%s","metaDescription":"%s","isParking":%s,"hasLinkedIn":%s,"linkedinUrl":"%s","hasTwitter":%s,"twitterUrl":"%s","hasFacebook":%s,"facebookUrl":"%s","brandSignals":%s,"score":%d}\n' \
  "$DOMAIN" \
  "$([[ -n "$BEST_HTML" ]] && echo "ok" || echo "not_found")" \
  "$SNAP_COUNT" \
  "$BEST_TS" \
  "$TITLE" \
  "$META" \
  "$IS_P" \
  "$HAS_LI" \
  "$LI_URL" \
  "$HAS_TW" \
  "$TW_URL" \
  "$HAS_FB" \
  "$FB_URL" \
  "$BRAND" \
  "$SCORE" | tee "$CACHE_FILE"
