#!/usr/bin/env python3
"""web_search enricher - searches DuckDuckGo's HTML endpoint and creates Url nodes.

Standard wire protocol (issue 0026):
  - stdin:  JSON with node_id, node_name, metadata, ops_db_path, app_dir,
            cache_dir, registry_root, params.
  - stderr: `PROGRESS:<fraction> <stage>` lines for UI feedback.
  - stdout: a single JSON line at the end with the summary.
  - exit code 0 = ok, != 0 = error.

DDG endpoint used: https://html.duckduckgo.com/html/ (query sent as `q`).
It returns static HTML, without JavaScript. Result links come wrapped in a
`//duckduckgo.com/l/?uddg=<encoded-url>` redirect that has to be decoded.

To automate bulk searches in the future (persistent session, cookies, JS,
captchas), phase 2 will introduce a `web_search_cdp` enricher that drives a
remote Chromium through the DevTools Protocol. This one is the simple,
zero-infrastructure fallback.
"""
from __future__ import annotations

import html
import json
import os
import re
import sqlite3
import sys
import time
from datetime import datetime, timezone
from html.parser import HTMLParser
from urllib.parse import parse_qs, unquote, urlparse

_DDG_HTML_ENDPOINT = "https://html.duckduckgo.com/html/"

# Maps the `safe` parameter to the value sent in DDG's `kp` form field.
_SAFE_SEARCH_KP = {"strict": "1", "moderate": "-1", "off": "-2"}


def progress(p: float, stage: str = "") -> None:
    """Write a `PROGRESS:<p> <stage>` line to stderr and flush it."""
    sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
    sys.stderr.flush()


def log(msg: str) -> None:
    """Write a free-form diagnostic line to stderr and flush it."""
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()


def now_iso() -> str:
    """Return the current UTC time as `YYYY-MM-DDTHH:MM:SSZ`."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def now_ms() -> int:
    """Return the current Unix time in whole milliseconds."""
    return int(time.time() * 1000)


def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> str:
    """Download the DuckDuckGo HTML results page for *query*.

    The request is a form POST to `html.duckduckgo.com`. `region` is sent as
    `kl` when non-empty; `safe` is translated to `kp` (strict -> 1,
    moderate -> -1, off -> -2) and omitted for any other value.

    Uses `requests` when it is installed and falls back to `urllib`
    otherwise. Both paths raise on HTTP error statuses and on network
    failures, so the caller can handle them uniformly.

    Returns:
        The response body as text.
    """
    params = {"q": query}
    if region:
        params["kl"] = region
    kp = _SAFE_SEARCH_KP.get(safe)
    if kp is not None:
        params["kp"] = kp

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.7",
    }

    # Keep the try body to the import alone so an ImportError raised while
    # performing the request is never mistaken for "requests not installed".
    try:
        import requests  # type: ignore
    except ImportError:
        requests = None

    if requests is not None:
        resp = requests.post(
            _DDG_HTML_ENDPOINT,
            data=params,
            headers=headers,
            timeout=timeout,
        )
        # urlopen() raises on 4xx/5xx; do the same here so an error page is
        # never parsed as if it were a (possibly empty) result list.
        resp.raise_for_status()
        return resp.text

    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    body = urlencode(params).encode()
    req = Request(_DDG_HTML_ENDPOINT, data=body, headers=headers)
    with urlopen(req, timeout=timeout) as resp:  # type: ignore
        return resp.read().decode("utf-8", errors="replace")


def decode_ddg_href(href: str) -> str:
    """Decode a DDG result href, which wraps the real URL in `uddg=`.

    Accepted shapes:
      //duckduckgo.com/l/?uddg=https%3A...&rut=...
      /l/?uddg=https%3A...
      https://example.com/...   (not wrapped; returned unchanged)

    Returns:
        The target URL, the (scheme-completed) href when it is not a DDG
        redirect, or "" for an empty href.
    """
    if not href:
        return ""
    if href.startswith("//"):
        href = "https:" + href
    elif href.startswith("/l/"):
        href = "https://duckduckgo.com" + href

    try:
        parsed = urlparse(href)
    except ValueError:
        # Malformed href (e.g. broken IPv6 literal): hand it back untouched.
        return href

    host = parsed.hostname or ""
    is_ddg = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
    if is_ddg and parsed.path == "/l/":
        # parse_qs already percent-decodes the value. Decoding it a second
        # time would corrupt targets that legitimately contain escapes
        # (".../a%20b" would turn into ".../a b").
        target = parse_qs(parsed.query).get("uddg", [""])[0]
        if target:
            return target
    return href


class _DDGParser(HTMLParser):
    """Extract results (anchor + snippet) from DDG's HTML.

    Deliberately incomplete: it only looks at `<a>` tags whose class list
    contains `result__a` (title/url) or `result__snippet` (text). If DDG
    renames those classes the parser simply yields no results instead of
    failing.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.results: list[dict] = []
        # Result currently being assembled, or None between results.
        self._cur: dict | None = None
        self._in_title = False
        self._in_snippet = False
        self._title_buf: list[str] = []
        self._snippet_buf: list[str] = []

    def _classes(self, attrs: list[tuple[str, str | None]]) -> set[str]:
        """Return the tag's class names as a set (empty if none)."""
        for k, v in attrs:
            if k == "class" and v:
                return set(v.split())
        return set()

    def _href(self, attrs: list[tuple[str, str | None]]) -> str:
        """Return the tag's non-empty `href` attribute, or ""."""
        for k, v in attrs:
            if k == "href" and v:
                return v
        return ""

    def handle_starttag(self, tag: str, attrs) -> None:
        if tag != "a":
            return
        cls = self._classes(attrs)
        if "result__a" in cls:
            # A new title anchor closes whatever result was in progress.
            if self._cur:
                self._flush()
            self._cur = {"href": self._href(attrs), "title": "", "snippet": ""}
            self._in_title = True
            self._title_buf = []
        elif "result__snippet" in cls and self._cur is not None:
            self._in_snippet = True
            self._snippet_buf = []

    def handle_endtag(self, tag: str) -> None:
        if tag != "a":
            return
        if self._in_title:
            if self._cur is not None:
                # Collapse runs of whitespace into single spaces.
                self._cur["title"] = " ".join("".join(self._title_buf).split())
            self._in_title = False
        elif self._in_snippet:
            if self._cur is not None:
                self._cur["snippet"] = " ".join(
                    "".join(self._snippet_buf).split()
                )
            self._in_snippet = False

    def handle_data(self, data: str) -> None:
        if self._in_title:
            self._title_buf.append(data)
        elif self._in_snippet:
            self._snippet_buf.append(data)

    def _flush(self) -> None:
        """Store the in-progress result if it has an href, then reset it."""
        if self._cur and self._cur.get("href"):
            self.results.append(self._cur)
        self._cur = None

    def close(self) -> None:
        # The last result has no following title anchor to trigger a flush.
        if self._cur:
            self._flush()
        super().close()


def parse_ddg_html(htmltxt: str) -> list[dict]:
    """Parse DDG's HTML and return `[{url, title, snippet, rank}]`.

    Hrefs are decoded with `decode_ddg_href`; entries that are not http(s)
    URLs and duplicate URLs are dropped. `rank` is the 1-based position in
    the returned list. A parser failure is logged and whatever was
    collected up to that point is still returned.
    """
    parser = _DDGParser()
    try:
        parser.feed(htmltxt)
        parser.close()
    except Exception as e:
        log(f"DDG parser failed: {e}")

    out: list[dict] = []
    seen: set[str] = set()
    for r in parser.results:
        url = decode_ddg_href(r.get("href") or "")
        if not url or not url.startswith(("http://", "https://")):
            continue
        if url in seen:
            continue
        seen.add(url)
        out.append({
            "url": url,
            "title": r.get("title") or "",
            "snippet": r.get("snippet") or "",
            "rank": len(out) + 1,
        })
    return out


def find_url_entity(conn: sqlite3.Connection, url: str) -> str | None:
    """Return the id of an existing Url entity whose metadata has this url.

    Scans every row with `type_ref='Url'` and compares the `url` key of its
    JSON metadata. Rows whose metadata is not valid JSON are skipped.
    """
    cur = conn.execute(
        "SELECT id, metadata FROM entities WHERE type_ref='Url'"
    )
    for row in cur:
        meta_raw = row[1] or "{}"
        try:
            meta = json.loads(meta_raw)
        except (TypeError, ValueError):
            continue
        if isinstance(meta, dict) and meta.get("url") == url:
            return row[0]
    return None


def insert_url_entity(conn: sqlite3.Connection, url: str, title: str,
                      snippet: str, rank: int, query: str) -> str:
    """Create a Url entity and return its id.

    If an entity with the same url already exists it is reused: its
    metadata and `updated_at` are refreshed and its id is returned. The
    caller is responsible for committing.
    """
    existing = find_url_entity(conn, url)
    ts = now_iso()
    meta = {
        "url": url,
        "title": title,
        "snippet": snippet,
        "rank": rank,
        "query": query,
        "engine": "duckduckgo",
        "found_at": ts,
    }
    meta_json = json.dumps(meta, ensure_ascii=False)
    if existing:
        conn.execute(
            "UPDATE entities SET metadata=?, updated_at=? WHERE id=?",
            (meta_json, ts, existing),
        )
        return existing

    new_id = f"Url_{now_ms()}_{rank}_{abs(hash(url)) % 100000}"
    name = title[:200] if title else url[:200]
    conn.execute(
        "INSERT INTO entities (id, name, type_ref, source, metadata, "
        "                      created_at, updated_at) "
        "VALUES (?, ?, 'Url', 'enricher:web_search', ?, ?, ?)",
        (new_id, name, meta_json, ts, ts),
    )
    return new_id


def relation_exists(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    """Return True if a relation with this name already links the two ids."""
    cur = conn.execute(
        "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
        "AND name=? LIMIT 1",
        (from_id, to_id, name),
    )
    return cur.fetchone() is not None


# Per-process counter that keeps relation ids distinct within one millisecond.
_REL_COUNTER = 0


def insert_relation(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    """Insert the relation unless it already exists.

    Returns:
        True if a row was inserted, False if the relation was already there.
        The caller is responsible for committing.
    """
    global _REL_COUNTER
    if relation_exists(conn, from_id, to_id, name):
        return False
    ts = now_iso()
    _REL_COUNTER += 1
    rel_id = f"rel_{now_ms()}_{_REL_COUNTER}_{name.lower()}"
    conn.execute(
        "INSERT INTO relations (id, name, from_entity, to_entity, "
        "                       created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)",
        (rel_id, name, from_id, to_id, ts, ts),
    )
    return True


def _int_param(params: dict, key: str, default: int) -> int:
    """Read an integer param, falling back to *default* if it is unusable."""
    value = params.get(key, default)
    try:
        return int(value)
    except (TypeError, ValueError):
        log(f"invalid {key!r} param {value!r}; using default {default}")
        return default


def _resolve_ops_db_path(ops_db_path: str, app_dir: str) -> str:
    """Normalize separators and make the operations DB path absolute."""
    # The path may arrive with mixed separators from the C++ side if the
    # fs::path was built in another context (cross-platform build, copy
    # between Windows and WSL, etc.).
    ops_db_path = ops_db_path.replace("\\", "/")
    app_dir = app_dir.replace("\\", "/")
    if os.path.isabs(ops_db_path):
        return ops_db_path
    # Relative path: anchor it on app_dir when that yields an existing
    # file, otherwise on the cwd. Without this sqlite3 creates an empty
    # file if the subprocess cwd differs from the parent's.
    if app_dir and os.path.isdir(app_dir):
        cand = os.path.normpath(os.path.join(app_dir, ops_db_path))
        if os.path.exists(cand):
            ops_db_path = cand
    if not os.path.isabs(ops_db_path):
        ops_db_path = os.path.abspath(ops_db_path)
    return ops_db_path


def _has_entities_table(ops_db_path: str) -> bool:
    """Return True if the database contains a table named `entities`."""
    conn = sqlite3.connect(ops_db_path)
    try:
        row = conn.execute(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name='entities'"
        ).fetchone()
    finally:
        conn.close()
    return bool(row)


def main() -> int:
    """Run the enricher: read the context from stdin, search, store results.

    Exit codes: 0 ok; 2 invalid input (bad JSON, missing node_id /
    ops_db_path, empty query); 4 fetch failed; 7 database file not found;
    8 database has no `entities` table; 9 database could not be opened.
    """
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except Exception as e:
        log(f"stdin not valid JSON: {e}")
        return 2
    if not isinstance(ctx, dict):
        log("stdin not valid JSON: expected an object")
        return 2

    node_id = ctx.get("node_id") or ""
    node_name = str(ctx.get("node_name") or "").strip()
    metadata = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except Exception:
            metadata = {}
    if not isinstance(metadata, dict):
        # Metadata that decodes to a list/number carries no usable keys.
        metadata = {}
    ops_db_path = str(ctx.get("ops_db_path") or "")
    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}

    limit = _int_param(params, "limit", 10)
    region = str(params.get("region") or "").strip()
    safe = str(params.get("safe") or "moderate").strip()
    timeout_s = _int_param(params, "timeout_s", 15)

    if not node_id or not ops_db_path:
        log("missing node_id / ops_db_path")
        return 2

    ops_db_path = _resolve_ops_db_path(
        ops_db_path, str(ctx.get("app_dir") or "")
    )
    if not os.path.exists(ops_db_path):
        log(f"ops_db_path no existe: {ops_db_path} (cwd={os.getcwd()})")
        print(json.dumps({"error": "ops_db not found",
                          "ops_db_path": ops_db_path,
                          "cwd": os.getcwd(),
                          "entities_added": 0, "relations_added": 0}))
        return 7

    # Schema check: without an `entities` table the path is wrong or the
    # operations.db has not been bootstrapped.
    try:
        has_entities = _has_entities_table(ops_db_path)
    except sqlite3.Error as e:
        log(f"sqlite open failed: {e}")
        return 9
    if not has_entities:
        log(f"sin tabla 'entities' en {ops_db_path}")
        print(json.dumps({
            "error": "operations.db sin tabla 'entities' — "
                     "verifica que graph_explorer haya cargado un "
                     "proyecto valido antes de lanzar el enricher",
            "ops_db_path": ops_db_path,
            "entities_added": 0, "relations_added": 0}))
        return 8

    # Query priority: metadata.query > metadata.text > node_name.
    raw_query = metadata.get("query") or metadata.get("text") or node_name
    query = str(raw_query).strip()
    if not query:
        log("nodo sin query (metadata.query / metadata.text / name)")
        return 2

    progress(0.10, "fetching")
    try:
        htmltxt = fetch_ddg(query, timeout=timeout_s, region=region, safe=safe)
    except Exception as e:
        log(f"DDG fetch failed: {e}")
        print(json.dumps({"error": str(e), "query": query,
                          "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.55, "parsing")
    results = parse_ddg_html(htmltxt)
    if limit > 0:
        results = results[:limit]
    log(f"DDG returned {len(results)} results")

    progress(0.80, "applying")
    conn = sqlite3.connect(ops_db_path)
    conn.execute("PRAGMA foreign_keys=OFF")
    entities_added = 0
    relations_added = 0
    try:
        for r in results:
            # Checked before the upsert so refreshed entities are not
            # counted as newly added.
            existed = find_url_entity(conn, r["url"]) is not None
            url_id = insert_url_entity(
                conn,
                url=r["url"],
                title=r["title"],
                snippet=r["snippet"],
                rank=r["rank"],
                query=query,
            )
            if not existed:
                entities_added += 1
            if insert_relation(conn, url_id, node_id, "SEARCH_RESULT_OF"):
                relations_added += 1
        conn.commit()
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "query": query,
        "engine": "duckduckgo",
        "results": len(results),
        "entities_added": entities_added,
        "relations_added": relations_added,
    }, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    sys.exit(main())