graph_explorer/enrichers/web_search/run.py

#!/usr/bin/env python3
"""Enricher web_search — busca en DuckDuckGo HTML y crea nodos Url.

Wire protocol estandar (issue 0026):
  - stdin:  JSON con node_id, node_name, metadata, ops_db_path, app_dir,
            cache_dir, registry_root, params.
  - stderr: lineas `PROGRESS:<float> <stage>` para feedback de UI.
  - stdout: una linea JSON al final con resumen.
  - exit code 0 = ok, !=0 = error.

DDG endpoint usado: https://html.duckduckgo.com/html/?q=<query>
Devuelve HTML estatico, sin JavaScript. Los enlaces vienen envueltos en
redireccion `//duckduckgo.com/l/?uddg=<encoded>` que hay que decodificar.

Para automatizar busquedas masivas en el futuro (sesion persistente,
cookies, JS, captchas) la fase 2 introducira un enricher `web_search_cdp`
que controle un Chromium remoto via DevTools Protocol. Este es el
fallback simple zero-infra.
"""
from __future__ import annotations

import html
import json
import os
import re
import sqlite3
import sys
import time
from datetime import datetime, timezone
from html.parser import HTMLParser
from urllib.parse import parse_qs, unquote, urlparse


def progress(p: float, stage: str = "") -> None:
    sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
    sys.stderr.flush()


def log(msg: str) -> None:
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()


def now_iso() -> str:
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def now_ms() -> int:
    return int(time.time() * 1000)


def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> str:
    """Descarga la pagina HTML de resultados de DuckDuckGo.

    El endpoint `html.duckduckgo.com` no requiere JS y respeta los
    parametros `kl` (region) y `kp` (safe search: 1 strict, -1 off,
    -2 moderate). Inyecta cookie para que el "moderate" se aplique sin
    pantalla intermedia.
    """
    params = {"q": query}
    if region:
        params["kl"] = region
    safe_map = {"strict": "1", "moderate": "-1", "off": "-2"}
    if safe in safe_map:
        params["kp"] = safe_map[safe]

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.7",
    }
    try:
        import requests  # type: ignore
        r = requests.post(
            "https://html.duckduckgo.com/html/",
            data=params,
            headers=headers,
            timeout=timeout,
        )
        return r.text
    except ImportError:
        from urllib.parse import urlencode
        from urllib.request import Request, urlopen
        body = urlencode(params).encode()
        req = Request("https://html.duckduckgo.com/html/", data=body,
                      headers=headers)
        with urlopen(req, timeout=timeout) as resp:  # type: ignore
            return resp.read().decode("utf-8", errors="replace")


def decode_ddg_href(href: str) -> str:
    """Decodifica el href de DDG, que envuelve la URL real en `uddg=`.

    Formatos posibles:
      //duckduckgo.com/l/?uddg=https%3A...&rut=...
      /l/?uddg=https%3A...
      https://example.com/...   (raro, pero ocurre con anuncios o cuando DDG
                                  no envuelve)
    """
    if not href:
        return ""
    if href.startswith("//"):
        href = "https:" + href
    elif href.startswith("/l/"):
        href = "https://duckduckgo.com" + href

    try:
        u = urlparse(href)
        if u.netloc.endswith("duckduckgo.com") and u.path == "/l/":
            qs = parse_qs(u.query)
            target = qs.get("uddg", [""])[0]
            if target:
                return unquote(target)
    except Exception:
        pass
    return href


class _DDGParser(HTMLParser):
    """Extrae resultados (anchor + snippet + rank) del HTML de DDG.

    No intenta ser completo — solo busca `<a class="result__a">` para el
    titulo/url y `<a class="result__snippet">` (o el div equivalente)
    para el texto. Es robusto a cambios menores: si DDG renombra clases,
    el enricher devolvera 0 resultados pero no peta.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.results: list[dict] = []
        self._cur: dict | None = None
        self._in_title = False
        self._in_snippet = False
        self._title_buf: list[str] = []
        self._snippet_buf: list[str] = []

    def _classes(self, attrs: list[tuple[str, str | None]]) -> set[str]:
        for k, v in attrs:
            if k == "class" and v:
                return set(v.split())
        return set()

    def _href(self, attrs: list[tuple[str, str | None]]) -> str:
        for k, v in attrs:
            if k == "href" and v:
                return v
        return ""

    def handle_starttag(self, tag: str, attrs):
        if tag != "a":
            return
        cls = self._classes(attrs)
        if "result__a" in cls:
            if self._cur:
                self._flush()
            self._cur = {"href": self._href(attrs), "title": "", "snippet": ""}
            self._in_title = True
            self._title_buf = []
        elif "result__snippet" in cls and self._cur is not None:
            self._in_snippet = True
            self._snippet_buf = []

    def handle_endtag(self, tag: str):
        if tag != "a":
            return
        if self._in_title:
            self._cur and self._cur.update(
                title=" ".join("".join(self._title_buf).split())
            )
            self._in_title = False
        elif self._in_snippet:
            self._cur and self._cur.update(
                snippet=" ".join("".join(self._snippet_buf).split())
            )
            self._in_snippet = False

    def handle_data(self, data: str):
        if self._in_title:
            self._title_buf.append(data)
        elif self._in_snippet:
            self._snippet_buf.append(data)

    def _flush(self):
        if self._cur and self._cur.get("href"):
            self.results.append(self._cur)
        self._cur = None

    def close(self) -> None:
        if self._cur:
            self._flush()
        super().close()


def parse_ddg_html(htmltxt: str) -> list[dict]:
    """Parsea el HTML de DDG y devuelve [{url, title, snippet, rank}]."""
    p = _DDGParser()
    try:
        p.feed(htmltxt)
        p.close()
    except Exception as e:
        log(f"DDG parser failed: {e}")

    out: list[dict] = []
    seen: set[str] = set()
    for i, r in enumerate(p.results):
        url = decode_ddg_href(r.get("href") or "")
        if not url or not url.startswith(("http://", "https://")):
            continue
        if url in seen:
            continue
        seen.add(url)
        out.append({
            "url":     url,
            "title":   r.get("title") or "",
            "snippet": r.get("snippet") or "",
            "rank":    len(out) + 1,
        })
    return out


def find_url_entity(conn: sqlite3.Connection, url: str) -> str | None:
    """Busca un nodo Url existente con la misma url en metadata."""
    cur = conn.execute(
        "SELECT id, metadata FROM entities WHERE type_ref='Url'"
    )
    for row in cur:
        meta_raw = row[1] or "{}"
        try:
            meta = json.loads(meta_raw)
        except Exception:
            continue
        if isinstance(meta, dict) and meta.get("url") == url:
            return row[0]
    return None


def insert_url_entity(conn: sqlite3.Connection, url: str, title: str,
                      snippet: str, rank: int, query: str) -> str:
    """Crea un nodo Url y devuelve su id. Si ya existe, lo reusa y refresca."""
    existing = find_url_entity(conn, url)
    ts = now_iso()
    meta = {
        "url":     url,
        "title":   title,
        "snippet": snippet,
        "rank":    rank,
        "query":   query,
        "engine":  "duckduckgo",
        "found_at": ts,
    }
    meta_json = json.dumps(meta, ensure_ascii=False)
    if existing:
        conn.execute(
            "UPDATE entities SET metadata=?, updated_at=? WHERE id=?",
            (meta_json, ts, existing),
        )
        return existing

    new_id = f"Url_{now_ms()}_{rank}_{abs(hash(url)) % 100000}"
    name = title[:200] if title else url[:200]
    conn.execute(
        "INSERT INTO entities (id, name, type_ref, source, metadata, "
        " created_at, updated_at) "
        "VALUES (?, ?, 'Url', 'enricher:web_search', ?, ?, ?)",
        (new_id, name, meta_json, ts, ts),
    )
    return new_id


def relation_exists(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    cur = conn.execute(
        "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
        "AND name=? LIMIT 1",
        (from_id, to_id, name),
    )
    return cur.fetchone() is not None


_REL_COUNTER = 0


def insert_relation(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    global _REL_COUNTER
    if relation_exists(conn, from_id, to_id, name):
        return False
    ts = now_iso()
    _REL_COUNTER += 1
    rel_id = f"rel_{now_ms()}_{_REL_COUNTER}_{name.lower()}"
    conn.execute(
        "INSERT INTO relations (id, name, from_entity, to_entity, "
        " created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)",
        (rel_id, name, from_id, to_id, ts, ts),
    )
    return True


def main() -> int:
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except Exception as e:
        log(f"stdin not valid JSON: {e}")
        return 2

    node_id      = ctx.get("node_id") or ""
    node_name    = (ctx.get("node_name") or "").strip()
    metadata     = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except Exception:
            metadata = {}
    ops_db_path  = ctx.get("ops_db_path") or ""
    params       = ctx.get("params") or {}
    limit        = int(params.get("limit", 10))
    region       = (params.get("region") or "").strip()
    safe         = (params.get("safe")   or "moderate").strip()
    timeout_s    = int(params.get("timeout_s", 15))

    if not node_id or not ops_db_path:
        log("missing node_id / ops_db_path")
        return 2

    # Normalizar backslashes a forward slashes — el path puede llegar
    # con separadores mezclados desde el lado C++ si fs::path se
    # construyo en otro contexto (build cross-platform, copy entre
    # Windows y WSL, etc.).
    ops_db_path = ops_db_path.replace("\\", "/")
    app_dir_raw = (ctx.get("app_dir") or "").replace("\\", "/")

    # Resolver a absoluto si llega relativo, usando app_dir como
    # ancla y cwd como fallback. Sin esto sqlite3 crea un fichero
    # vacio si el cwd del subprocess no coincide con el del padre.
    if not os.path.isabs(ops_db_path):
        if app_dir_raw and os.path.isdir(app_dir_raw):
            cand = os.path.normpath(os.path.join(app_dir_raw, ops_db_path))
            if os.path.exists(cand):
                ops_db_path = cand
        if not os.path.isabs(ops_db_path):
            ops_db_path = os.path.abspath(ops_db_path)

    if not os.path.exists(ops_db_path):
        log(f"ops_db_path no existe: {ops_db_path} (cwd={os.getcwd()})")
        print(json.dumps({"error": "ops_db not found",
                          "ops_db_path": ops_db_path,
                          "cwd": os.getcwd(),
                          "entities_added": 0, "relations_added": 0}))
        return 7

    # Schema check — si no hay tabla entities, el path es incorrecto
    # o la operations.db esta sin bootstrappear.
    try:
        _c = sqlite3.connect(ops_db_path)
        try:
            row = _c.execute(
                "SELECT name FROM sqlite_master "
                "WHERE type='table' AND name='entities'"
            ).fetchone()
        finally:
            _c.close()
        if not row:
            log(f"sin tabla 'entities' en {ops_db_path}")
            print(json.dumps({
                "error": "operations.db sin tabla 'entities' — "
                         "verifica que graph_explorer haya cargado un "
                         "proyecto valido antes de lanzar el enricher",
                "ops_db_path": ops_db_path,
                "entities_added": 0, "relations_added": 0}))
            return 8
    except sqlite3.Error as e:
        log(f"sqlite open failed: {e}")
        return 9

    # Query: prioridad metadata.query > metadata.text > node_name.
    query = (metadata.get("query") or metadata.get("text") or node_name).strip()
    if not query:
        log("nodo sin query (metadata.query / metadata.text / name)")
        return 2

    progress(0.10, "fetching")
    try:
        htmltxt = fetch_ddg(query, timeout=timeout_s, region=region, safe=safe)
    except Exception as e:
        log(f"DDG fetch failed: {e}")
        print(json.dumps({"error": str(e), "query": query,
                          "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.55, "parsing")
    results = parse_ddg_html(htmltxt)
    if limit > 0:
        results = results[:limit]
    log(f"DDG returned {len(results)} results")

    progress(0.80, "applying")
    conn = sqlite3.connect(ops_db_path)
    conn.execute("PRAGMA foreign_keys=OFF")
    entities_added = 0
    relations_added = 0
    try:
        for r in results:
            existed = find_url_entity(conn, r["url"]) is not None
            url_id = insert_url_entity(
                conn,
                url=r["url"],
                title=r["title"],
                snippet=r["snippet"],
                rank=r["rank"],
                query=query,
            )
            if not existed:
                entities_added += 1
            if insert_relation(conn, url_id, node_id, "SEARCH_RESULT_OF"):
                relations_added += 1
        conn.commit()
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "query":           query,
        "engine":          "duckduckgo",
        "results":         len(results),
        "entities_added":  entities_added,
        "relations_added": relations_added,
    }, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    sys.exit(main())