#!/usr/bin/env bash
#
# process_domains.sh
#
# Fetch (or use local) Namecheap market CSV, filter for clean brandable domains,
# score by appeal/value/risk, and output a ranked shortlist.
#
# Usage:
#   ./process_domains.sh [URL]
#
# If URL is omitted, falls back to local Namecheap_Market_Sales.csv.
#
# Env:
#   CUTOFF_YEARS    - age gate (default 3)
#   TMPDIR          - working temp space
#   NAMECHEAP_CSV   - path to source CSV
#   LINKEDIN_CACHE  - cache directory
#   RESULTS_DIR     - output directory
#   DO_MX_CHECK     - set to "1" to SKIP the MX check for disposable infra
#                     (default 0 = the check runs)
#   TLD_FILTER      - comma-separated TLDs to allow (default "com")
#
set -euo pipefail

# --- Configuration (env-overridable; see the header comment) ---
CSV_URL="${1:-}"
LOCAL_CSV="${NAMECHEAP_CSV:-data/raw/Namecheap_Market_Sales.csv}"
CUTOFF_YEARS="${CUTOFF_YEARS:-3}"
TMPDIR="${TMPDIR:-/tmp}"
# mktemp -d instead of a predictable "$(date +%s)" suffix: avoids collisions
# when two runs start in the same second, and avoids predictable-name attacks
# in a shared /tmp. (Later `mkdir -p "$WORKDIR"` stays a harmless no-op.)
WORKDIR="$(mktemp -d "${TMPDIR}/strikefuse_domains_XXXXXXXX")"
LINKEDIN_CACHE_DIR="${LINKEDIN_CACHE:-data/cache/linkedin}"
RESULTS_DIR="${RESULTS_DIR:-data/results}"

DO_MX_CHECK="${DO_MX_CHECK:-0}"
TLD_FILTER="${TLD_FILTER:-com}"

# Resolve linkedin_check.sh location (bin/ next to this script)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LINKEDIN_CHECK="$SCRIPT_DIR/linkedin_check.sh"

# Print an error to stderr and abort the script.
die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; }

# Timestamped progress line on stdout: "[HH:MM:SS] message".
log() { printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"; }

# Compute cutoff date N years ago, portable Linux/macOS
# Print the date N years ago as YYYY-MM-DD.
# Tries, in order: GNU gdate (coreutils on macOS), GNU date -d, BSD date -v.
# Arguments: $1 - number of years (default 3); must be a non-negative integer.
# Returns:   non-zero (and prints to stderr) for a non-numeric argument, which
#            aborts the caller under `set -e`.
calc_cutoff() {
  local n="${1:-3}"
  if ! [[ "$n" =~ ^[0-9]+$ ]]; then
    echo "calc_cutoff: invalid year count: $n" >&2
    return 1
  fi
  if command -v gdate &>/dev/null; then
    gdate -d "$n years ago" +%Y-%m-%d
  elif date -d "$n years ago" +%Y-%m-%d &>/dev/null; then
    date -d "$n years ago" +%Y-%m-%d
  else
    # BSD/macOS date; quote the expansion (was unquoted).
    date -v-"${n}"y +%Y-%m-%d
  fi
}

CUTOFF="$(calc_cutoff "$CUTOFF_YEARS")"
log "Age cutoff: registered before $CUTOFF"

# ------------------------------------------------------------------
# 1. Get CSV
# ------------------------------------------------------------------
# Prefer an explicit URL argument; otherwise fall back to the local copy.
mkdir -p "$WORKDIR"
SRC="$WORKDIR/input.csv"

if [[ -n "$CSV_URL" ]]; then
  log "Downloading CSV from $CSV_URL ..."
  curl -fsSL -o "$SRC" "$CSV_URL" || die "Download failed"
elif [[ -f "$LOCAL_CSV" ]]; then
  log "Using local $LOCAL_CSV"
  cp "$LOCAL_CSV" "$SRC"
else
  die "No URL provided and $LOCAL_CSV not found"
fi

# Sanity-check the expected header before doing any parsing.
head -n 1 "$SRC" | grep -q 'url,name,startDate,endDate,price' || die "CSV header looks wrong"

# ------------------------------------------------------------------
# 2. Hard-filter
# ------------------------------------------------------------------
log "Filtering candidates (TLDs: $TLD_FILTER, <\$100, ${CUTOFF_YEARS} years, no random/gibberish) ..."

# Source CSV columns used below: $2=name, $4=endDate, $5=price, $7=renewPrice,
# $15=registration date (ISO timestamp) — assumed Namecheap layout.
awk -F, -v cutoff="$CUTOFF" -v tld_filter="$TLD_FILTER" '
BEGIN {
  OFS=",";
  # The allowlist is loop-invariant: build it once here (it was previously
  # re-split for every record) and use a hash lookup per row.
  ntld = split(tld_filter, tld_list, ",");
  for (i = 1; i <= ntld; i++) allow[tolower(tld_list[i])] = 1;
}
NR==1 { next }
{
  full=$2;
  sld=full; sub(/\.[a-zA-Z0-9-]+$/,"",sld);
  tld=full; sub(/^.*\./,"",tld); tld=tolower(tld);
  l=length(sld);
  tmp=sld; digits=gsub(/[0-9]/,"",tmp);
  tmp=sld; nhyp=gsub(/-/,"",tmp);
  low=tolower(sld);

  # TLD gate
  if (!(tld in allow)) next;
  # Age filter: keep only domains registered BEFORE cutoff (older domains)
  if ($15 == "" || substr($15,1,10) >= cutoff) next;
  # Price gate (<$100) and structural gates: <=15 chars, no digits, no hyphens
  if (($5+0) >= 100) next;
  if (l > 15 || digits > 0 || nhyp > 0) next;

  # Bad-word gate
  if (low ~ /xxx|sex|porn|erotic|hustler|cock|cum|fuck|shit|damn|slut|whore|nude|naked|adult|escort|casino|poker|bet|loan|debt|insurance|viagra|cialis|diet|pill/) next;

  # Random/gibberish gate: must contain a vowel, no 4+ consonant cluster,
  # and not start AND end with a 3-consonant cluster.
  tmp=sld; vowels=gsub(/[aeiou]/,"",tmp);
  if (vowels==0 || low ~ /[bcdfghjklmnpqrstvwxz]{4,}/) next;
  if (low ~ /^[bcdfghjklmnpqrstvwxz]{3}/ && low ~ /[bcdfghjklmnpqrstvwxz]{3}$/) next;

  # Output: domain,registeredDate,price,renewPrice,endDate
  gsub(/T.*/,"",$15); gsub(/T.*/,"",$4);
  print full, $15, $5, $7, $4;
}
' "$SRC" > "$WORKDIR/filtered.csv"

FILTERED_COUNT=$(wc -l < "$WORKDIR/filtered.csv" | tr -d ' ')
log "Filtered candidates: $FILTERED_COUNT"

[[ "$FILTERED_COUNT" -eq 0 ]] && die "No candidates after filtering"

# ------------------------------------------------------------------
# 3. Parallel MX taint check — reject only known disposable/bulk email infra
# ------------------------------------------------------------------
log "Checking MX records for disposable/bulk email infra ..."

if [[ "$DO_MX_CHECK" == "1" ]]; then
  # Honor the documented env knob (it was previously read but never checked):
  # DO_MX_CHECK=1 skips the taint filter and passes all candidates through.
  log "DO_MX_CHECK=1 - skipping MX taint check"
  cp "$WORKDIR/filtered.csv" "$WORKDIR/clean_mx.csv"
else
  # Controlled parallelism: xargs with a bounded worker pool to avoid
  # saturating the system / the resolver.
  MX_WORKERS=20

  # Batch worker: a domain is KEPT when it has no MX record at all, or its MX
  # does not point at known disposable/bulk email infrastructure.
  MX_SCRIPT="$WORKDIR/mx_check.sh"
  cat > "$MX_SCRIPT" <<'MXEOF'
#!/usr/bin/env bash
while IFS=, read -r d reg price renew end; do
  mx=$(timeout 5 dig +short MX "$d" 2>/dev/null | head -1)
  if [[ -z "$mx" ]] || ! echo "$mx" | grep -qiE '(eye-mail|emailsendhub|plingest)'; then
    echo "$d,$reg,$price,$renew,$end"
  fi
done < "$1" > "$1.clean"
MXEOF
  chmod +x "$MX_SCRIPT"

  mkdir -p "$WORKDIR/mx_batches"
  split -l 200 "$WORKDIR/filtered.csv" "$WORKDIR/mx_batches/batch_"

  # NUL-delimited find instead of parsing ls(1) output; robust to unusual
  # TMPDIR paths (spaces etc.).
  find "$WORKDIR/mx_batches" -name 'batch_*' ! -name '*.clean' -print0 \
    | xargs -0 -P "$MX_WORKERS" -n 1 "$MX_SCRIPT"

  cat "$WORKDIR"/mx_batches/batch_*.clean > "$WORKDIR/clean_mx.csv"
fi

CLEAN_COUNT=$(wc -l < "$WORKDIR/clean_mx.csv" | tr -d ' ')
log "Domains after MX taint filter: $CLEAN_COUNT"

[[ "$CLEAN_COUNT" -eq 0 ]] && die "No clean domains after MX check"

# ------------------------------------------------------------------
# 4. Score
# ------------------------------------------------------------------
# Generate the Perl scorer. The heredoc delimiter is quoted ('PERL'), so the
# body is written out verbatim with no shell expansion.
SCORER="$WORKDIR/score.pl"
cat > "$SCORER" <<'PERL'
use strict;
use warnings;
use Time::Piece;

# argv: 1) clean_mx.csv (domains that survived filtering + MX check),
#       2) the original raw source CSV (re-read for the extra metric columns).
my $clean = shift;
my $src   = shift;

# %want: set membership of surviving domains, lowercased. Only these rows of
# the raw CSV get scored.
# NOTE(review): split /,/ assumes no quoted/embedded commas in fields — TODO
# confirm against the feed.
my %want;
open my $cf, '<', $clean or die $!;
while (<$cf>) {
  chomp;
  next unless $_;
  my ($d,$reg,$price,$renew,$end) = split /,/;
  $want{lc $d} = 1;
}
close $cf;

# Domains with a known-positive prior history get a higher "hist" base (24 vs 16).
my %positive_history = map { $_ => 1 } qw(
  askakorean.com encaustics.com asfans.com clydeauto.com
);

# Approximate domain age in whole years from a YYYY-MM-DD(-ish) string.
# Only the year is used; $m and $d are captured but intentionally ignored.
sub age_years {
  my ($date)=@_;
  return 0 unless $date =~ /(\d{4})-(\d{2})-(\d{2})/;
  my ($y,$m,$d)=($1,$2,$3);
  my $years = (localtime)[5]+1900 - $y;
  # rough, close enough
  return $years;
}

# Clamp $x into [$lo, $hi].
sub clamp { my ($x,$lo,$hi)=@_; return $lo if $x<$lo; return $hi if $x>$hi; return $x; }

# Brandability of the SLD, clamped to 0..25: length sweet spot, recognizable
# word fragments, pronounceability bonus, penalties for consonant clusters and
# spam/pinyin-looking patterns.
sub name_score {
  my ($sld)=@_;
  my $s=lc $sld;
  my $len=length($s);
  my $score=0;

  # length
  $score += $len<=6 ? 8 : $len<=8 ? 10 : $len<=10 ? 9 : $len<=12 ? 7 : $len<=15 ? 4 : 1;

  # strong word parts
  my @strong = qw(reliable webhook open snip multi peers captain talk talks hub memo robot zebra basics basic ai iot cure wedding luxury save art sell door home site mem box drop cloud net web host send post relay safe core base node link ping sync byte data wire signal tap port clear clean pure true sure real main fast swift smart mail);
  my @medium = qw(fan fans blue print brain jazz flit event events lite long grape old pet bene dorm waldorf idare);
  # index() does substring matching, so short entries like "ai"/"mem" also hit
  # inside longer words (e.g. "maid", "member").
  my $parts=0;
  for my $w (@strong)   { $parts += 3   if index($s,$w) >= 0; }
  for my $w (@medium)   { $parts += 1.5 if index($s,$w) >= 0; }
  $score += clamp($parts,0,13);

  # pronounceable short brandable bonus
  if ($len <= 7 && $s =~ /[aeiou].*[aeiou]/ && $s !~ /[bcdfghjklmnpqrstvwxz]{4,}/) { $score += 2; }

  # penalties
  $score -= 12 if $s =~ /[bcdfghjklmnpqrstvwxz]{4,}/;
  $score -= 8  if $s !~ /[aeiou]/;
  $score -= 8  if $s =~ /^[bcdfghjklmnpqrstvwxz]{3}/ && $s =~ /[bcdfghjklmnpqrstvwxz]{3}$/;
  $score -= 6  if $s =~ /(x[^aeiou]|q[^u]|[^aeiou]z[^aeiou]|[^aeiou]v[^rl]|kj|gw|dz|pf|xh|kr|vr|wl|wm|wz)/;
  $score -= 8  if $s =~ /(afy$|tix|pex$|mob$|faka|yutian|fuli|cny|xue|zhang|liuyi|guifu|hdy|hbhang|ciyuan|furniture|barbermx|abqary|kidz)/;
  $score -= 10 if $s =~ /(sex|xxx|porn|adult|escort|casino|poker|bet|loan|debt|viagra|cialis|pill|hack|crack|warez|scam|spam|adware|malware|orgasm|panty|pantys)/;
  $score -= 5  if $s =~ /(dancewear|charcoal|medical|nurse|clinic|latex|meme|blog$)/;
  $score -= 3  if $s =~ /(delgado|sarria|wibowo|lohr|andujar|yangon|sino|daqian|bespoke|china)/;

  return clamp(int($score+0.5),0,25);
}

# Keyword relevance of the SLD, clamped to 0..20. Tiers are checked top-down
# and are additive.
# NOTE(review): several alternations are redundant ("webhook|webhook",
# "nano|nano", repeated "prime"/"true"/"trade"), and spaced forms like "ai "
# can never match an SLD with no spaces — harmless but dead pattern branches.
sub relevance_score {
  my ($sld)=@_;
  my $s=lc $sld;
  my $score=0;
  # Strong tech/infra signals
  $score += 20 if $s =~ /(webhook|webhook)/;
  $score += 18 if $s =~ /(multi.*peer|peer|peers)/;
  $score += 17 if $s =~ /(ai|ai |api|api |ml |ml\b)/;
  $score += 16 if $s =~ /(data|data |data-)/;
  # General business/productivity words
  $score += 14 if $s =~ /(hub|cloud|node|link|sync|net|web|site|app|platform|base|core|direct|prime|fast|swift|pro|smart|global|shop|market|store|sell|buy|trade|pay|bank|money|cash|fund|trade)/;
  # Tech/dev signals
  $score += 12 if $s =~ /(code|dev|lab|io|sys|ware|tech|bot|nano|nano|micro|macro|meta|cyber|crypto|block|chain|token|nft|game|play|vr|ar|xr)/;
  # Communication/collaboration
  $score += 10 if $s =~ /(talk|chat|meet|team|work|crew|group|club|comm|social|mail|post|memo|note|task)/;
  # Generic positive business words
  $score += 8  if $s =~ /(green|blue|red|gold|silver|bright|clear|pure|true|sure|real|solid|bold|prime|true|safe|trust|elite|prime|pro|max|top|best|first)/;
  $score += 6  if $score==0 && $s =~ /(art|fan|luxury|wedding|lime|wild|pet|auto|home|life|world|planet|city|nation)/;
  return clamp($score,0,20);
}

# Commercial appeal, clamped to 0..15: length, age, and third-party metrics
# (estibot appraisal, search volume, DR, auction bids) when present.
sub commercial_score {
  my ($sld,$age,$est,$search,$dr,$bids)=@_;
  my $len=length($sld);
  my $score=0;
  $score += $len<=6 ? 4 : $len<=8 ? 5 : $len<=10 ? 4 : $len<=12 ? 3 : $len<=15 ? 1 : 0;
  $score += $age>=20 ? 4 : $age>=10 ? 3 : $age>=5 ? 2 : 1;
  $score += ($est && $est>=1000) ? 3 : ($est && $est>=100) ? 2 : ($est && $est>0) ? 1 : 0;
  $score += ($search && $search>=10000) ? 2 : ($search && $search>=1000) ? 1.5 : ($search && $search>0) ? 1 : 0;
  $score += ($dr && $dr>=20) ? 1 : ($dr && $dr>=5) ? 0.5 : 0;
  $score += ($bids && $bids>0) ? 1 : 0;
  return clamp(int($score+0.5),0,15);
}

# Cost attractiveness, clamped to 0..10: total acquisition cost, renewal
# price, and days remaining until the listing ends (more lead time = better).
sub cost_score {
  my ($price,$renew,$end)=@_;
  my $total=$price+$renew;
  my $score=0;
  $score += $total<=20 ? 4 : $total<=21 ? 3 : $total<=24 ? 2 : 1;
  $score += $renew<=19 ? 2 : $renew<=25 ? 1 : 0;
  my $days=0;
  my $ref = localtime;
  if ($end =~ /(\d{4})-(\d{2})-(\d{2})/) {
    # Time::Piece subtraction yields seconds; convert to whole days.
    my $t = Time::Piece->strptime("$1-$2-$3", "%Y-%m-%d");
    $days = int(($t-$ref)/(24*3600));
  }
  $score += $days>=14 ? 3 : $days>=7 ? 2 : $days>=3 ? 1 : 0;
  $score += 1;
  return clamp($score,0,10);
}

# Trademark/legal risk tier: 1 = famous mark collision, 2 = known personal or
# prior-owner names, 3 = regulated-industry terms, 5 = no flag.
sub legal_score {
  my ($sld)=@_;
  my $s=lc $sld;
  return 1 if $s =~ /(sanyo|google|facebook|apple|microsoft|amazon|paypal|bilibili)/;
  return 2 if $s =~ /(askakorean|clydeauto|blakeandujar|armerfamily|ariewibowo|arilohr|delgadosarria)/;
  return 3 if $s =~ /(medical|clinic|nurse|law|auto)/;
  return 5;
}

# Main pass over the raw CSV: score every row whose domain survived filtering.
# Column indices (0-based) assume the Namecheap layout: 1=name, 3=endDate,
# 4=price, 6=renewPrice, 7=bids, 8=DR, 11=estibot, 13=search volume,
# 14=registration date — TODO confirm 7/8/11/13 against the actual feed
# (only 1/3/4/6/14 are corroborated by the earlier awk filter).
my @rows;
open my $sf, '<', $src or die $!;
my $header=<$sf>;
while (<$sf>) {
  chomp;
  my @f=split /,/, $_, -1;
  my $domain=lc $f[1];
  next unless $want{$domain};
  my $sld=$domain; $sld =~ s/\.com$//;
  my $reg=$f[14]; my $end=$f[3];
  my $price=$f[4]+0; my $renew=$f[6]+0; my $bid=$f[7]+0;
  my $dr=($f[8] ne '' ? $f[8]+0 : 0);
  my $est=($f[11] ne '' ? $f[11]+0 : 0);
  my $search=($f[13] ne '' ? $f[13]+0 : 0);
  my $age=age_years($reg);

  # Component scores; total is a straight sum (max ~119 with history bonus).
  my $hist = $positive_history{$domain} ? 24 : 16;
  my $name=name_score($sld);
  my $rel=relevance_score($sld);
  my $comm=commercial_score($sld,$age,$est,$search,$dr,$bid);
  my $cost=cost_score($price,$renew,$end);
  my $legal=legal_score($sld);
  my $total=$hist+$name+$rel+$comm+$cost+$legal;

  # Strip ISO time suffixes for display.
  (my $endd=$end)=~s/T.*//; (my $regd=$reg)=~s/T.*//;
  push @rows, [$total,$domain,$price+$renew,$price,$renew,$endd,$age,$hist,$name,$rel,$comm,$cost,$legal,$est,$search,$dr,$bid];
}
close $sf;

# Rank: score desc, then total price asc, then domain name as a stable tiebreak.
@rows = sort { $b->[0] <=> $a->[0] || $a->[2] <=> $b->[2] || $a->[1] cmp $b->[1] } @rows;

print join(',', qw(score domain total_price price renew expires age hist name relevance commercial cost legal estibot search dr bids)),"\n";
for my $r (@rows) {
  printf "%d,%s,%.2f,%.2f,%.2f,%s,%d,%d,%d,%d,%d,%d,%d,%.0f,%.0f,%.0f,%d\n", @$r;
}
PERL

log "Scoring ..."
perl "$SCORER" "$WORKDIR/clean_mx.csv" "$SRC" > "$WORKDIR/scored_raw.csv"

# ------------------------------------------------------------------
# 5. Post-filter: remove obvious random/pinyin/gibberish after scoring
# ------------------------------------------------------------------

# Drop scored rows whose SLD is on a hand-curated denylist of pinyin/gibberish
# /spam names that slipped through, then re-apply the structural gates.
# NOTE(review): sub(/\.com$/) only strips .com — with a non-default TLD_FILTER
# other TLDs keep their suffix, so the $-anchored denylist cannot match them.
awk -F, 'BEGIN {OFS=","}
NR==1 {print; next}
{
  d=$2; s=d; sub(/\.com$/,"",s);
  # Exact-match denylist of known-bad SLDs.
  if (s ~ /^(afbaby|ahgree|cosseppa|liuyishe|guifuyuan|chongxinpet|lixulawfirm|scamlover|wmbaby|sanycousa|rygasesorias|cocukbahcesi|nmgrenshi|dztygs|jsqmtz|rcmygs|xmyxys|yzsjgl|hhhlgygs|xsznlyj|naturalrxnm|ahfeiyan|avloca|azhj|fulifaka|hbhangmai|hdyutian|menghuijx|shuhuajiqi|xuelianjiaoyu|yanbeijinshu|yiranmiamhua|amateurorgasm|ciyuanhai|carathealth|kvdigitalmedia|untuckiy|winzhi|flitmob|flithop|oaraba|onzep|praeu|wierli|hikevi|paxkers|nstexxa|synogute|tixflit|litu|metatreat|technozire|xpenseco)$/) next;
  # 5+ consecutive consonants: almost certainly gibberish.
  if (s ~ /[bcdfghjklmnpqrstvwxz]{5,}/) next;
  # Starts AND ends with a 3-consonant cluster.
  if (s ~ /^[bcdfghjklmnpqrstvwxz]{3}/ && s ~ /[bcdfghjklmnpqrstvwxz]{3}$/) next;
  # No vowels at all.
  if (s !~ /[aeiou]/) next;
  # Spam/adult/malware vocabulary.
  if (s ~ /panty|adware|spyware|malware|virus|hack|crack|warez|pill|viagra|cialis|loan|debt|bet|poker|casino|escort|adult|sex|xxx|porn|orgasm/) next;
  print;
}' "$WORKDIR/scored_raw.csv" > "$WORKDIR/scored_reviewed.csv"

# ------------------------------------------------------------------
# 6. LinkedIn company match check (cached)
# ------------------------------------------------------------------
LINKEDIN_LIMIT=500
mkdir -p "$LINKEDIN_CACHE_DIR"

log "Checking LinkedIn company profiles for top $LINKEDIN_LIMIT candidates ..."

# One LinkedIn result line per candidate domain, in candidate order.
: > "$WORKDIR/linkedin_lookup.csv"

# Top-N domain names, header skipped. head(1) may close the pipe early, so
# pipefail is suspended around this pipeline.
set +o pipefail
tail -n +2 "$WORKDIR/scored_reviewed.csv" | head -n "$LINKEDIN_LIMIT" | cut -d, -f2 > "$WORKDIR/linkedin_domains.txt"
set -o pipefail

while IFS= read -r candidate; do
  lookup="$("$LINKEDIN_CHECK" "$candidate" 2>/dev/null || true)"
  [[ -n "$lookup" ]] || lookup="none,,,,,,,"
  # Flatten any stray newlines, then trim trailing whitespace.
  lookup="$(tr '\n' ' ' <<<"$lookup" | sed 's/[[:space:]]*$//')"
  printf '%s\n' "$lookup" >> "$WORKDIR/linkedin_lookup.csv"
done < "$WORKDIR/linkedin_domains.txt"

# Prefix each result with its domain so the merge step can key on it.
paste -d ',' "$WORKDIR/linkedin_domains.txt" "$WORKDIR/linkedin_lookup.csv" > "$WORKDIR/linkedin_map.csv"

# Merge LinkedIn columns into scored_reviewed.csv
# linkedin_check.sh outputs: match, company, url, industry, size, hq, website, desc
# Map file layout: domain,match,company,url,industry,size,hq,website,desc.
# NOTE(review): the map is split naively on "," — a desc containing commas is
# truncated at its first comma (a[9] keeps only the first piece). Confirm
# linkedin_check.sh never emits commas inside fields.
awk -v map="$WORKDIR/linkedin_map.csv" '
BEGIN { FS=","; OFS=",";
  # Preload per-domain LinkedIn lookup tables before processing any rows.
  while ((getline line < map) > 0) {
    split(line, a, ",");
    dom=a[1]; lm=a[2]; comp=a[3]; url=a[4]; ind=a[5]; sz=a[6]; hq=a[7]; web=a[8]; desc=a[9];
    m[dom]=lm; c[dom]=comp; u[dom]=url; i[dom]=ind; s[dom]=sz; h[dom]=hq; w[dom]=web; d[dom]=desc;
  }
  close(map);
}
NR==1 {
  # Extend the header with the LinkedIn columns plus the adjusted final score.
  print $0, "linkedin_match", "linkedin_company", "linkedin_url", "linkedin_industry", "linkedin_size", "linkedin_hq", "linkedin_website", "linkedin_desc", "linkedin_adj", "final_score";
  next;
}
{
  dom=$2;
  # Rows outside the top-N lookup fall back to "none" / empty fields / adj 0.
  lm = (dom in m && m[dom] != "" ? m[dom] : "none");
  lc = (dom in c ? c[dom] : "");
  lu = (dom in u ? u[dom] : "");
  li = (dom in i ? i[dom] : "");
  ls = (dom in s ? s[dom] : "");
  lh = (dom in h ? h[dom] : "");
  lw = (dom in w ? w[dom] : "");
  ld = (dom in d ? d[dom] : "");
  # Score bonus: +25 for an exact company-name match, +20 for a strong match.
  adj = 0;
  if (lm == "exact") adj = 25;
  else if (lm == "strong") adj = 20;
  else adj = 0;
  # Strip commas from free-text LinkedIn fields so the output stays valid CSV.
  gsub(/,/, " ", lc); gsub(/,/, " ", lu); gsub(/,/, " ", li);
  gsub(/,/, " ", ls); gsub(/,/, " ", lh); gsub(/,/, " ", lw); gsub(/,/, " ", ld);
  print $0, lm, lc, lu, li, ls, lh, lw, ld, adj, $1+adj;
}
' "$WORKDIR/scored_reviewed.csv" > "$WORKDIR/scored_final.csv"

# Re-sort: exact matches first, then strong, then none; within each tier by
# final_score (col 27) desc, then total_price (col 3) asc.
#
# BUG FIX: a plain lexical sort on the match column orders
# exact < none < strong, which ranked "strong" matches BELOW "none". Map each
# tier to a numeric rank, sort on the rank, then strip it again (after the
# rank is prepended, final_score is col 28 and total_price is col 4).
resort_by_linkedin_match() {
  local csv="$1"
  local tmp="${csv}.sorted"
  ( head -n 1 "$csv" && \
    tail -n +2 "$csv" \
      | awk -F, 'BEGIN{OFS=","} { rank = ($18=="exact" ? 0 : ($18=="strong" ? 1 : 2)); print rank, $0 }' \
      | sort -t, -k1,1n -k28,28nr -k4,4n \
      | cut -d, -f2- ) > "$tmp" && mv "$tmp" "$csv"
}
resort_by_linkedin_match "$WORKDIR/scored_final.csv"

# Handle domains not in the top N: they still get processed by the awk above with empty lookups
# No need for a separate append step.

# ------------------------------------------------------------------
# 7. Output
# ------------------------------------------------------------------
# Persist the final CSV and print human-readable summaries.
mkdir -p "$RESULTS_DIR"
run_stamp="$(date +%Y%m%d_%H%M%S)"
OUTFILE="$RESULTS_DIR/domain_results_${run_stamp}.csv"
cp "$WORKDIR/scored_final.csv" "$OUTFILE"
log "Results written to: $OUTFILE"

printf '\n%s\n\n' "=== Top 30 by final score (LinkedIn matches rank first) ==="
head -n 31 "$WORKDIR/scored_final.csv" | column -t -s,

printf '\n%s\n' "=== LinkedIn exact matches ==="
awk -F, 'NR==1 || $18=="exact"' "$WORKDIR/scored_final.csv" | column -t -s,

printf '\n%s\n' "=== LinkedIn strong matches ==="
awk -F, 'NR==1 || $18=="strong"' "$WORKDIR/scored_final.csv" | column -t -s,

printf '\n%s\n' "=== Shortlist (final_score >= 60) ==="
awk -F, 'NR==1 || $NF>=60' "$WORKDIR/scored_final.csv" | column -t -s, | head -n 40

printf '\n'
printf 'Workdir: %s\n' "$WORKDIR"
printf 'LinkedIn cache: %s\n' "$LINKEDIN_CACHE_DIR"
printf 'Results: %s\n' "$RESULTS_DIR"

