/** HTML parsing utilities for scraper modules */

/**
 * Returns the trimmed text of the first element matching a bare tag selector.
 *
 * Only simple tag-name selectors (e.g. `title`, `h1`) are supported; any other
 * selector syntax yields `null`. The element must contain plain text only:
 * an element with nested markup does not match.
 *
 * @param html - Markup to search.
 * @param selector - A bare tag name made of word characters.
 * @returns The trimmed inner text, or `null` when nothing matches.
 */
export function extractText(html: string, selector: string): string | null {
  const tagMatch = selector.match(/^(\w+)$/);
  if (!tagMatch) return null;

  const tag = tagMatch[1];
  // The lookahead pins the end of the tag name so that selector `b` does not
  // match `<br>` / `<body>` and `p` does not match `<pre>`.
  const re = new RegExp(`<${tag}(?=[\\s/>])[^>]*>([^<]*)</${tag}\\s*>`, 'i');
  const m = re.exec(html);
  return m?.[1]?.trim() ?? null;
}

/**
 * Collects every match of `pattern` in `html`.
 *
 * For each match the first capture group is returned when it participated in
 * the match; otherwise the whole match is returned. The caller's regex is
 * never mutated: a copy is made with the global flag added if it is missing.
 *
 * @param html - Text to scan.
 * @param pattern - Pattern to apply repeatedly; the `g` flag is optional.
 * @returns Matched strings in document order (empty array when none).
 */
export function extractAll(html: string, pattern: RegExp): string[] {
  const flags = pattern.flags.includes('g') ? pattern.flags : pattern.flags + 'g';
  const re = new RegExp(pattern.source, flags);
  // matchAll advances past zero-length matches on its own; a hand-written
  // exec loop would spin forever on patterns that can match the empty string.
  return Array.from(html.matchAll(re), (m) => m[1] ?? m[0]);
}

/**
 * Returns the value of a quoted attribute on the first matching tag.
 *
 * `tag` and `attr` are interpolated into a regular expression without
 * escaping, so they should be plain names. Matching is case-insensitive.
 * Only quoted values (single or double) are recognised.
 *
 * @param html - Markup to search.
 * @param tag - Tag name, e.g. `a` or `img`.
 * @param attr - Attribute name, e.g. `href`.
 * @returns The attribute value, or `null` when no such tag/attribute is found.
 */
export function extractAttr(html: string, tag: string, attr: string): string | null {
  // - `\s` right after the tag name stops `a` from matching `<abbr ...>`.
  // - The whitespace required before the attribute name stops `href` from
  //   matching inside `data-href`.
  // - Quotes are paired per alternative so a value may contain the other quote
  //   character (title="it's").
  const re = new RegExp(
    `<${tag}\\s(?:[^>]*\\s)?${attr}\\s*=\\s*(?:"([^"]*)"|'([^']*)')[^>]*>`,
    'i',
  );
  const m = re.exec(html);
  if (!m) return null;
  return m[1] ?? m[2] ?? null;
}

/**
 * Removes tags from `html` and decodes a small set of character entities.
 *
 * Tags are stripped first, then `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`
 * and `&nbsp;` are decoded (the latter to a regular space). The result is
 * trimmed. Other entities are left as-is.
 *
 * @param html - Markup to convert to plain text.
 * @returns The plain-text content.
 */
export function stripHtml(html: string): string {
  const entities: Record<string, string> = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&nbsp;': ' ',
  };
  return html
    .replace(/<[^>]+>/g, '')
    // Decode in a single pass so each entity is decoded exactly once.
    // Sequential replaces that handle `&amp;` first would turn `&amp;lt;`
    // into `<` instead of the literal text `&lt;`.
    .replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (entity) => entities[entity] ?? entity)
    .trim();
}

/**
 * Converts free text into a lowercase, hyphen-separated slug.
 *
 * Characters other than ASCII word characters (`\w`), whitespace and hyphens
 * are removed, so non-ASCII text is dropped entirely. Whitespace runs become
 * a single hyphen, repeated hyphens are collapsed, and one leading and one
 * trailing hyphen are trimmed.
 *
 * @param text - Text to convert.
 * @returns The slug; may be empty.
 */
export function slugify(text: string): string {
  const lowered = text.toLowerCase();
  // Keep only word characters, whitespace and hyphens.
  const cleaned = lowered.replace(/[^\w\s-]/g, '');
  // Whitespace runs become separators, then separator runs are collapsed.
  const hyphenated = cleaned.replace(/\s+/g, '-');
  const collapsed = hyphenated.replace(/-+/g, '-');
  // Drop a separator left at either edge.
  return collapsed.replace(/^-|-$/g, '');
}

/**
 * Parses a human-readable duration into a number of minutes.
 *
 * Recognised forms, checked in this order:
 * 1. Hours with optional minutes: `1h 30m`, `1h30`, `2 hours`, `1 hr 5 min`, `1:45`.
 * 2. A bare number, optionally followed by a minutes unit: `90`, `90 min`,
 *    `120 دقيقة`.
 *
 * @param text - Text containing a duration.
 * @returns Total minutes, or `undefined` when the text contains no digits.
 */
export function parseDuration(text: string): number | undefined {
  // Hour forms must be tried first: the minutes pattern below has an optional
  // unit and therefore matches the first digit run of any string, which would
  // turn "1h 30m" into 1.
  const hm = text.match(/(\d+)\s*(?:h(?:(?:ou)?rs?)?(?![a-z])|:(?=\s*\d))\s*(\d+)?/i);
  if (hm) {
    const hours = parseInt(hm[1], 10);
    // Minutes are absent for hours-only values such as "2h".
    const minutes = hm[2] !== undefined ? parseInt(hm[2], 10) : 0;
    return hours * 60 + minutes;
  }

  const minMatch = text.match(/(\d+)\s*(?:min|minutes?|دقيقة)?/i);
  if (minMatch) return parseInt(minMatch[1], 10);

  return undefined;
}
