graph_explorer/enrichers/web_search/run.py

#!/usr/bin/env python3
"""Enricher web_search — busca en DuckDuckGo HTML y crea nodos Url.

Wire protocol estandar (issue 0026):
  - stdin:  JSON con node_id, node_name, metadata, ops_db_path, app_dir,
            cache_dir, registry_root, params.
  - stderr: lineas `PROGRESS:<float> <stage>` para feedback de UI.
  - stdout: una linea JSON al final con resumen.
  - exit code 0 = ok, !=0 = error.

DDG endpoints usados:
  1. https://lite.duckduckgo.com/lite/ (POST) — endpoint primario.
     HTML minimo (ano 2009-style), tabla con `<a class='result-link'>` y
     `<td class='result-snippet'>`. Es el menos agresivo con bot
     detection; suele responder 200 cuando el endpoint `html.` ya
     devuelve un challenge "anomaly" desde IPs residenciales/Windows.
  2. https://html.duckduckgo.com/html/ (POST) — fallback. Su parser
     usa `result__a` / `result__snippet`. DDG envuelve los enlaces en
     `//duckduckgo.com/l/?uddg=<encoded>` que hay que decodificar.

Si ambos endpoints devuelven la pagina anti-bot ("anomaly", challenge
captcha), el enricher emite un error claro indicando que se necesita
`web_search_cdp` (issue 0029) — el fallback simple zero-infra no puede
resolver el challenge.
"""
from __future__ import annotations

import html
import json
import os
import re
import sqlite3
import sys
import time
import uuid
from datetime import datetime, timezone
from html.parser import HTMLParser
from urllib.parse import parse_qs, unquote, urlparse


# Issue 0035c — automatic grouping of results.
#
# When an enricher produces >= GROUP_THRESHOLD results, the first
# GROUP_PREVIEW_K stay loose, hanging off the source node (Twitter/Reddit
# timeline style), and the remaining N-K go into a square Group node.
# The manifest may declare `auto_group_threshold` to override the
# default; later on a settings UI will allow a global override. For now
# it is hardcoded.
DEFAULT_GROUP_THRESHOLD = 50
GROUP_PREVIEW_K         = 10


def _coerce_threshold(raw, default: int) -> int:
    """Coerce a grouping threshold to a positive int, or return `default`.

    Issue 0035e: the manifest may declare `auto_group_threshold: <int>`
    and the value is forwarded to this subprocess. Accepted inputs are
    ints and numeric strings. Anything else (None, "", booleans, zero,
    negatives, non-finite floats, unparseable values) yields `default`.

    Args:
        raw: The value as received; may be of any type.
        default: Value returned when `raw` is not a usable threshold.

    Returns:
        A strictly positive integer threshold, or `default`.
    """
    # bool is a subclass of int: a JSON `true` would otherwise become a
    # threshold of 1, which is never what the manifest author meant.
    if raw is None or raw == "" or isinstance(raw, bool):
        return default
    try:
        value = int(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")); json.loads accepts the
        # literal `Infinity`, so that value can reach this function.
        return default
    return value if value > 0 else default


def progress(p: float, stage: str = "") -> None:
    """Write one `PROGRESS:<float> <stage>` line to stderr and flush it.

    `p` is formatted with two decimals; `stage` is appended verbatim.
    """
    err = sys.stderr
    err.write(f"PROGRESS:{p:.2f} {stage}\n")
    err.flush()


def log(msg: str) -> None:
    """Write `msg` followed by a newline to stderr and flush immediately."""
    err = sys.stderr
    err.write(f"{msg}\n")
    err.flush()


def now_iso() -> str:
    """Return the current UTC time as `YYYY-MM-DDTHH:MM:SSZ`."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")


def now_ms() -> int:
    """Return the current Unix time in whole milliseconds."""
    seconds = time.time()
    return int(seconds * 1000)


def _ddg_post(url: str, params: dict, headers: dict, timeout: int) -> str:
    """POST `params` as form data to `url` and return the body as text.

    Uses `requests` when it can be imported. On ImportError it falls
    back to `urllib.request`, decoding the body as UTF-8 with undecodable
    bytes replaced.
    """
    try:
        import requests  # type: ignore
        response = requests.post(url, data=params, headers=headers,
                                 timeout=timeout)
        return response.text
    except ImportError:
        from urllib.parse import urlencode
        from urllib.request import Request, urlopen
        payload = urlencode(params).encode()
        request = Request(url, data=payload, headers=headers)
        with urlopen(request, timeout=timeout) as reply:  # type: ignore
            raw_body = reply.read()
        return raw_body.decode("utf-8", errors="replace")


def is_anomaly_page(htmltxt: str) -> bool:
    """Return True when the page looks like DDG's anti-bot challenge.

    The check is case-insensitive and requires both the words "anomaly"
    and "challenge" to appear somewhere in the text.
    """
    lowered = htmltxt.lower()
    return all(marker in lowered for marker in ("anomaly", "challenge"))


def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> tuple[str, str]:
    """Download a DuckDuckGo results page for `query`.

    The `lite.duckduckgo.com/lite/` endpoint is tried first. If its
    response is the anti-bot page, the `html.duckduckgo.com/html/`
    endpoint is requested instead and its response is returned without
    further checks.

    Args:
        query: Search text, sent as the `q` form field.
        timeout: Timeout passed to each HTTP request.
        region: Sent as `kl` when non-empty.
        safe: One of "strict", "moderate", "off"; mapped to `kp`. Any
            other value sends no `kp` field.

    Returns:
        `(html, source)` where source is "lite" or "html".
    """
    form = {"q": query}
    if region:
        form["kl"] = region
    kp_value = {"strict": "1", "moderate": "-1", "off": "-2"}.get(safe)
    if kp_value is not None:
        form["kp"] = kp_value

    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120 Safari/537.36"
    )
    headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.7",
    }

    page = _ddg_post("https://lite.duckduckgo.com/lite/", form,
                     headers, timeout)
    if is_anomaly_page(page):
        log("lite endpoint devolvio challenge — fallback a html endpoint")
        page = _ddg_post("https://html.duckduckgo.com/html/", form,
                         headers, timeout)
        return page, "html"
    return page, "lite"


def decode_ddg_href(href: str) -> str:
    """Unwrap a DuckDuckGo redirect link and return the real target URL.

    Recognised input shapes:
      //duckduckgo.com/l/?uddg=https%3A...&rut=...
      /l/?uddg=https%3A...
      https://example.com/...   (already direct: ads, or unwrapped links)

    Args:
        href: The raw `href` attribute value; may be empty.

    Returns:
        "" for an empty href; the decoded `uddg` target for a DDG
        redirect link; otherwise the href itself (made absolute when it
        was scheme-relative or started with `/l/`).
    """
    if not href:
        return ""
    if href.startswith("//"):
        href = "https:" + href
    elif href.startswith("/l/"):
        href = "https://duckduckgo.com" + href

    try:
        parts = urlparse(href)
        host = (parts.hostname or "").lower()
        # Exact host or a real subdomain only: a plain suffix test would
        # also accept look-alike hosts such as "notduckduckgo.com".
        is_ddg = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
        if is_ddg and parts.path == "/l/":
            # parse_qs already percent-decodes the value. Decoding it a
            # second time would corrupt escapes that belong to the target
            # URL itself (e.g. "%26" inside its query string).
            target = parse_qs(parts.query).get("uddg", [""])[0]
            if target:
                return target
    except ValueError:
        # urlparse rejects some malformed hrefs; fall back to the raw value.
        pass
    return href


class _DDGParser(HTMLParser):
    """Collect results (href, title, snippet) from DDG `html.` markup.

    Only `<a>` tags are inspected: an anchor with class `result__a`
    starts a new result and supplies its href and title; a following
    anchor with class `result__snippet` supplies the snippet text. If
    DDG renames these classes the parser simply yields no results.

    Parsed results accumulate in `self.results` as dicts with the keys
    `href`, `title` and `snippet`. Results with an empty href are dropped.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.results: list[dict] = []
        # Result currently being assembled, or None between results.
        self._cur: dict | None = None
        self._in_title = False
        self._in_snippet = False
        self._title_buf: list[str] = []
        self._snippet_buf: list[str] = []

    def _classes(self, attrs: list[tuple[str, str | None]]) -> set[str]:
        """Return the class names of the first non-empty `class` attribute."""
        raw = next((v for k, v in attrs if k == "class" and v), "")
        return set(raw.split())

    def _href(self, attrs: list[tuple[str, str | None]]) -> str:
        """Return the first non-empty `href` attribute, or ""."""
        return next((v for k, v in attrs if k == "href" and v), "")

    def handle_starttag(self, tag: str, attrs):
        if tag == "a":
            names = self._classes(attrs)
            if "result__a" in names:
                # A new title anchor closes whatever result was pending.
                if self._cur:
                    self._flush()
                self._cur = {
                    "href": self._href(attrs),
                    "title": "",
                    "snippet": "",
                }
                self._in_title = True
                self._title_buf = []
            elif "result__snippet" in names and self._cur is not None:
                self._in_snippet = True
                self._snippet_buf = []

    def handle_endtag(self, tag: str):
        if tag == "a":
            if self._in_title:
                if self._cur:
                    # Collapse all whitespace runs into single spaces.
                    joined = "".join(self._title_buf)
                    self._cur.update(title=" ".join(joined.split()))
                self._in_title = False
            elif self._in_snippet:
                if self._cur:
                    joined = "".join(self._snippet_buf)
                    self._cur.update(snippet=" ".join(joined.split()))
                self._in_snippet = False

    def handle_data(self, data: str):
        if self._in_title:
            self._title_buf.append(data)
        elif self._in_snippet:
            self._snippet_buf.append(data)

    def _flush(self):
        """Store the pending result if it has an href, then clear it."""
        pending = self._cur
        if pending and pending.get("href"):
            self.results.append(pending)
        self._cur = None

    def close(self) -> None:
        # The last result has no following title anchor to flush it.
        if self._cur:
            self._flush()
        super().close()


def parse_ddg_html(htmltxt: str) -> list[dict]:
    """Parse the markup of the `html.duckduckgo.com` endpoint.

    Parser exceptions are logged and whatever was collected so far is
    still used. Each href is unwrapped with `decode_ddg_href`; entries
    that are not http(s) URLs, or that repeat an earlier URL, are
    skipped.

    Returns:
        Dicts with `url`, `title`, `snippet` and a 1-based `rank` that
        counts only the entries kept.
    """
    parser = _DDGParser()
    try:
        parser.feed(htmltxt)
        parser.close()
    except Exception as e:
        log(f"DDG parser failed: {e}")

    # Insertion-ordered dict doubles as the "seen" set and the output.
    kept: dict[str, dict] = {}
    for hit in parser.results:
        target = decode_ddg_href(hit.get("href") or "")
        if not target.startswith(("http://", "https://")):
            continue
        if target in kept:
            continue
        kept[target] = {
            "url":     target,
            "title":   hit.get("title") or "",
            "snippet": hit.get("snippet") or "",
            "rank":    len(kept) + 1,
        }
    return list(kept.values())


class _DDGLiteParser(HTMLParser):
    """Parser for `lite.duckduckgo.com/lite/` markup.

    Typical structure:
      <a rel="nofollow" href="<URL>" class='result-link'>title</a>
      ...
      <td class='result-snippet'>snippet text</td>
    Snippets come AFTER the link (not as its child), so they are paired
    by order: each `result-link` takes the next `result-snippet`.

    Parsed results accumulate in `self.results` as dicts with the keys
    `href`, `title` and `snippet`.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.results: list[dict] = []
        self._in_link = False
        self._in_snippet = False
        self._cur_href = ""
        self._title_buf: list[str] = []
        self._snippet_buf: list[str] = []
        # Index in `results` of the link still waiting for its snippet.
        self._pending_snippet_for: int | None = None

    def _attrs_dict(self, attrs):
        """Map attribute names to values, turning missing values into ""."""
        return dict((key, value if value else "") for key, value in attrs)

    def handle_starttag(self, tag: str, attrs):
        lookup = self._attrs_dict(attrs)
        css = lookup.get("class", "")
        if tag == "a" and "result-link" in css:
            self._cur_href = lookup.get("href", "")
            self._title_buf = []
            self._in_link = True
            return
        if tag == "td" and "result-snippet" in css:
            self._snippet_buf = []
            self._in_snippet = True

    def handle_endtag(self, tag: str):
        if tag == "a" and self._in_link:
            self._in_link = False
            text = " ".join("".join(self._title_buf).split())
            self.results.append({
                "href":    self._cur_href,
                "title":   text,
                "snippet": "",
            })
            self._pending_snippet_for = len(self.results) - 1
            return
        if tag == "td" and self._in_snippet:
            self._in_snippet = False
            text = " ".join("".join(self._snippet_buf).split())
            slot = self._pending_snippet_for
            if slot is not None:
                self.results[slot]["snippet"] = text
                self._pending_snippet_for = None

    def handle_data(self, data: str):
        if self._in_link:
            self._title_buf.append(data)
        elif self._in_snippet:
            self._snippet_buf.append(data)


def parse_ddg_lite(htmltxt: str) -> list[dict]:
    """Parse the markup of the `lite.duckduckgo.com/lite/` endpoint.

    Parser exceptions are logged and whatever was collected so far is
    still used. Entries that are not http(s) URLs, that point at DDG's
    own help pages, or that repeat an earlier URL are skipped.

    Returns:
        Dicts with `url`, `title`, `snippet` and a 1-based `rank` that
        counts only the entries kept.
    """
    parser = _DDGLiteParser()
    try:
        parser.feed(htmltxt)
        parser.close()
    except Exception as e:
        log(f"DDG lite parser failed: {e}")

    # Insertion-ordered dict doubles as the "seen" set and the output.
    kept: dict[str, dict] = {}
    for hit in parser.results:
        # lite sends direct absolute URLs; decode_ddg_href is applied
        # anyway in case DDG wraps one of them.
        target = decode_ddg_href(hit.get("href") or "")
        if not target.startswith(("http://", "https://")):
            continue
        # Drop DDG self-promotion (help pages).
        if "duckduckgo.com/duckduckgo-help-pages/" in target:
            continue
        if target in kept:
            continue
        kept[target] = {
            "url":     target,
            "title":   hit.get("title") or "",
            "snippet": hit.get("snippet") or "",
            "rank":    len(kept) + 1,
        }
    return list(kept.values())


def find_url_entity(conn: sqlite3.Connection, url: str) -> str | None:
    """Return the id of an existing `Url` entity whose metadata has `url`.

    Every row with `type_ref='Url'` is scanned and its `metadata` column
    parsed as JSON. Rows whose metadata fails to parse or is not a JSON
    object are ignored. Returns None when no row matches.
    """
    rows = conn.execute(
        "SELECT id, metadata FROM entities WHERE type_ref='Url'"
    )
    for entity_id, raw_meta in rows:
        try:
            parsed = json.loads(raw_meta or "{}")
        except Exception:
            continue
        if not isinstance(parsed, dict):
            continue
        if parsed.get("url") == url:
            return entity_id
    return None


def has_group_id_column(conn: sqlite3.Connection) -> bool:
    """Report whether the `entities` table has a `group_id` column.

    The column is added by a migration (issue 0035a), but this script
    may run against an older database. When it is absent, callers insert
    without it (results stay loose, still tagged with `batch_id` in
    their metadata). Any sqlite error while inspecting the table is
    treated as "column absent".
    """
    try:
        columns = conn.execute("PRAGMA table_info(entities)")
        # Index 1 of each table_info row is the column name.
        return any(col[1] == "group_id" for col in columns)
    except sqlite3.Error:
        return False


def insert_url_entity(conn: sqlite3.Connection, url: str, title: str,
                      snippet: str, rank: int, query: str,
                      batch_id: str = "",
                      group_id: str | None = None,
                      has_group_col: bool = False) -> str:
    """Create a `Url` entity, or refresh the existing one, and return its id.

    When an entity with the same url already exists, only its `metadata`
    and `updated_at` are rewritten; its current `group_id` is left alone
    because the same Url can show up in several searches and the first
    Group that captured it wins.

    Args:
        conn: Open connection to the operations database.
        url, title, snippet, rank, query: Stored in the metadata JSON.
        batch_id: Added to the metadata when non-empty.
        group_id: Written to the `group_id` column for new rows when
            `has_group_col` is true; None leaves the node loose.
        has_group_col: Whether the schema has the `group_id` column.

    Returns:
        The id of the existing or newly inserted entity.
    """
    stamp = now_iso()
    payload = {
        "url":     url,
        "title":   title,
        "snippet": snippet,
        "rank":    rank,
        "query":   query,
        "engine":  "duckduckgo",
        "found_at": stamp,
    }
    if batch_id:
        payload["batch_id"] = batch_id
    payload_json = json.dumps(payload, ensure_ascii=False)

    known_id = find_url_entity(conn, url)
    if known_id:
        conn.execute(
            "UPDATE entities SET metadata=?, updated_at=? WHERE id=?",
            (payload_json, stamp, known_id),
        )
        return known_id

    new_id = f"Url_{now_ms()}_{rank}_{abs(hash(url)) % 100000}"
    # Display name: the title when there is one, else the url; max 200 chars.
    label = (title or url)[:200]
    if not has_group_col:
        conn.execute(
            "INSERT INTO entities (id, name, type_ref, source, metadata, "
            " created_at, updated_at) "
            "VALUES (?, ?, 'Url', 'enricher:web_search', ?, ?, ?)",
            (new_id, label, payload_json, stamp, stamp),
        )
    else:
        conn.execute(
            "INSERT INTO entities (id, name, type_ref, source, metadata, "
            " group_id, created_at, updated_at) "
            "VALUES (?, ?, 'Url', 'enricher:web_search', ?, ?, ?, ?)",
            (new_id, label, payload_json, group_id, stamp, stamp),
        )
    return new_id


def insert_group_entity(conn: sqlite3.Connection, *, query: str,
                         count: int, batch_id: str) -> str:
    """Insert a `Group` entity for one search batch and return its id.

    The node is named `web_search: <query> (<count>)` and its metadata
    records the enricher name, query, count and batch id.
    """
    stamp = now_iso()
    group_key = f"Group_{now_ms()}_{abs(hash(query + batch_id)) % 100000}"
    label = f"web_search: {query} ({count})"
    payload_json = json.dumps(
        {
            "enricher": "web_search",
            "query":    query,
            "count":    count,
            "batch_id": batch_id,
        },
        ensure_ascii=False,
    )
    conn.execute(
        "INSERT INTO entities (id, name, type_ref, source, metadata, "
        " created_at, updated_at) "
        "VALUES (?, ?, 'Group', 'enricher:web_search', ?, ?, ?)",
        (group_key, label, payload_json, stamp, stamp),
    )
    return group_key


def relation_exists(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    """Return True if a relation `name` from `from_id` to `to_id` exists."""
    hit = conn.execute(
        "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
        "AND name=? LIMIT 1",
        (from_id, to_id, name),
    ).fetchone()
    return hit is not None


# Per-process sequence number mixed into relation ids so that several
# relations created within the same millisecond get distinct ids.
_REL_COUNTER = 0


def insert_relation(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    """Insert a relation unless an identical one is already stored.

    Returns:
        True when a row was inserted, False when the relation
        (same endpoints and name) already existed.
    """
    global _REL_COUNTER
    if not relation_exists(conn, from_id, to_id, name):
        stamp = now_iso()
        _REL_COUNTER += 1
        rel_id = f"rel_{now_ms()}_{_REL_COUNTER}_{name.lower()}"
        conn.execute(
            "INSERT INTO relations (id, name, from_entity, to_entity, "
            " created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)",
            (rel_id, name, from_id, to_id, stamp, stamp),
        )
        return True
    return False


def _int_param(params: dict, key: str, default: int) -> int:
    """Read `params[key]` as an int; use `default` if missing, null or invalid."""
    try:
        return int(params.get(key, default))
    except (TypeError, ValueError, OverflowError):
        return default


def main() -> int:
    """Run the enricher: read the job from stdin, search, write to the DB.

    Reads one JSON object from stdin, fetches DuckDuckGo results for the
    node's query, stores them as `Url` entities (optionally under a
    `Group`) linked to the source node with `SEARCH_RESULT_OF`, and
    prints a one-line JSON summary on stdout.

    Returns:
        Process exit code:
          0  success
          2  invalid input (stdin not a JSON object, missing node_id /
             ops_db_path, or no query could be derived)
          4  fetch failure or DDG bot challenge
          7  ops_db_path does not exist
          8  database has no `entities` table
          9  sqlite error (opening, inspecting or writing the database)
    """
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except Exception as e:
        log(f"stdin not valid JSON: {e}")
        return 2
    if not isinstance(ctx, dict):
        # Valid JSON but not an object (list, number, ...): nothing to read.
        log("stdin JSON is not an object")
        return 2

    node_id      = ctx.get("node_id") or ""
    node_name    = str(ctx.get("node_name") or "").strip()
    metadata     = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except Exception:
            metadata = {}
    if not isinstance(metadata, dict):
        # Metadata that decodes to a non-object carries no usable keys.
        metadata = {}
    ops_db_path  = str(ctx.get("ops_db_path") or "")
    params       = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}
    # Null or non-numeric values fall back to the defaults instead of
    # aborting the whole job with a traceback.
    limit        = _int_param(params, "limit", 10)
    region       = str(params.get("region") or "").strip()
    safe         = str(params.get("safe") or "moderate").strip()
    timeout_s    = _int_param(params, "timeout_s", 15)

    if not node_id or not ops_db_path:
        log("missing node_id / ops_db_path")
        return 2

    # Normalize backslashes to forward slashes: the path may arrive with
    # mixed separators from the C++ side (cross-platform build, copies
    # between Windows and WSL, etc.).
    ops_db_path = ops_db_path.replace("\\", "/")
    app_dir_raw = str(ctx.get("app_dir") or "").replace("\\", "/")

    # Resolve a relative path to absolute, anchored on app_dir with cwd
    # as fallback. Otherwise sqlite3 would create an empty file when the
    # subprocess cwd differs from the parent's.
    if not os.path.isabs(ops_db_path):
        if app_dir_raw and os.path.isdir(app_dir_raw):
            cand = os.path.normpath(os.path.join(app_dir_raw, ops_db_path))
            if os.path.exists(cand):
                ops_db_path = cand
        if not os.path.isabs(ops_db_path):
            ops_db_path = os.path.abspath(ops_db_path)

    if not os.path.exists(ops_db_path):
        log(f"ops_db_path no existe: {ops_db_path} (cwd={os.getcwd()})")
        print(json.dumps({"error": "ops_db not found",
                          "ops_db_path": ops_db_path,
                          "cwd": os.getcwd(),
                          "entities_added": 0, "relations_added": 0}))
        return 7

    # Schema check: without an `entities` table the path is wrong or the
    # operations.db has not been bootstrapped.
    try:
        _c = sqlite3.connect(ops_db_path)
        try:
            row = _c.execute(
                "SELECT name FROM sqlite_master "
                "WHERE type='table' AND name='entities'"
            ).fetchone()
        finally:
            _c.close()
        if not row:
            log(f"sin tabla 'entities' en {ops_db_path}")
            print(json.dumps({
                "error": "operations.db sin tabla 'entities' — "
                         "verifica que graph_explorer haya cargado un "
                         "proyecto valido antes de lanzar el enricher",
                "ops_db_path": ops_db_path,
                "entities_added": 0, "relations_added": 0}))
            return 8
    except sqlite3.Error as e:
        log(f"sqlite open failed: {e}")
        return 9

    # Query priority: metadata.query > metadata.text > node_name.
    query = str(
        metadata.get("query") or metadata.get("text") or node_name
    ).strip()
    if not query:
        log("nodo sin query (metadata.query / metadata.text / name)")
        return 2

    progress(0.10, "fetching")
    try:
        htmltxt, source = fetch_ddg(query, timeout=timeout_s,
                                     region=region, safe=safe)
    except Exception as e:
        log(f"DDG fetch failed: {e}")
        print(json.dumps({"error": str(e), "query": query,
                          "entities_added": 0, "relations_added": 0}))
        return 4

    if is_anomaly_page(htmltxt):
        log("DDG devolvio challenge captcha en ambos endpoints — "
            "usar web_search_cdp (issue 0029) para resolver")
        print(json.dumps({
            "error":            "DDG bot challenge — captcha required",
            "query":            query,
            "engine":           "duckduckgo",
            "source":           source,
            "results":          0,
            "entities_added":   0,
            "relations_added":  0,
        }, ensure_ascii=False))
        return 4

    progress(0.55, "parsing")
    # The parser is chosen by content, not by endpoint: if the two do not
    # match (test stubs, or a future DDG markup change) results are still
    # extracted. Both are tried and the one yielding more results wins.
    results_lite = parse_ddg_lite(htmltxt) if "result-link" in htmltxt else []
    results_html = parse_ddg_html(htmltxt) if "result__a"   in htmltxt else []
    results = results_lite if len(results_lite) >= len(results_html) else results_html
    if limit > 0:
        results = results[:limit]
    log(f"DDG ({source}) returned {len(results)} results "
        f"(lite_parsed={len(results_lite)} html_parsed={len(results_html)})")

    progress(0.80, "applying")
    entities_added = 0
    relations_added = 0
    group_id: str | None = None
    batch_id = uuid.uuid4().hex
    conn = None
    try:
        conn = sqlite3.connect(ops_db_path)
        conn.execute("PRAGMA foreign_keys=OFF")
        has_group_col = has_group_id_column(conn)
        n_total = len(results)
        # Threshold: the manifest may declare `auto_group_threshold`,
        # forwarded through stdin (issue 0035e). When absent, the
        # enricher's internal default applies.
        threshold = _coerce_threshold(ctx.get("auto_group_threshold"),
                                       DEFAULT_GROUP_THRESHOLD)

        # Grouped mode: the first GROUP_PREVIEW_K results stay loose and
        # the rest go under a Group. Requiring more results than the
        # preview size avoids creating an empty Group when a manifest
        # sets a threshold at or below GROUP_PREVIEW_K.
        if (has_group_col and n_total >= threshold
                and n_total > GROUP_PREVIEW_K):
            group_id = insert_group_entity(
                conn, query=query, count=n_total, batch_id=batch_id,
            )
            entities_added += 1
            if insert_relation(conn, group_id, node_id, "SEARCH_RESULT_OF"):
                relations_added += 1

        for idx, r in enumerate(results):
            # Only results past the preview window belong to the Group.
            in_group = group_id is not None and idx >= GROUP_PREVIEW_K
            existed = find_url_entity(conn, r["url"]) is not None
            url_id = insert_url_entity(
                conn,
                url=r["url"],
                title=r["title"],
                snippet=r["snippet"],
                rank=r["rank"],
                query=query,
                batch_id=batch_id,
                group_id=group_id if in_group else None,
                has_group_col=has_group_col,
            )
            if not existed:
                entities_added += 1
            # Provenance is always the relation to the original source
            # node, never to the group: the group is only a visual
            # container.
            if insert_relation(conn, url_id, node_id, "SEARCH_RESULT_OF"):
                relations_added += 1
        conn.commit()
    except sqlite3.Error as e:
        # Nothing was committed, so closing the connection discards the
        # partial batch; report zero additions with a structured error.
        log(f"sqlite write failed: {e}")
        print(json.dumps({
            "error":           f"sqlite write failed: {e}",
            "query":           query,
            "engine":          "duckduckgo",
            "results":         len(results),
            "entities_added":  0,
            "relations_added": 0,
        }, ensure_ascii=False))
        return 9
    finally:
        if conn is not None:
            conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "query":           query,
        "engine":          "duckduckgo",
        "results":         len(results),
        "entities_added":  entities_added,
        "relations_added": relations_added,
        "batch_id":        batch_id,
        "group_id":        group_id or "",
        "grouped":         bool(group_id),
    }, ensure_ascii=False))
    return 0


# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())