Files
graph_explorer/enrichers/web_search/run.py
T
egutierrez 6919ebfe9c feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace
POST a html.duckduckgo.com con la query del nodo, parsea resultados
con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos
Url con relacion SEARCH_RESULT_OF apuntando al nodo origen.

Encadenable: tras web_search, fetch_webpage sobre cada Url completa
el pipeline search -> fetch -> extract.

Defensa contra ops_db_path mal resuelto: normaliza backslashes,
resuelve relativo contra app_dir, valida que la tabla entities
exista antes de tocar nada (exit codes 7/8/9 con JSON resumen).

Tests pytest (16/16 verde): conftest con operations.db temp +
schema minimo, stub de requests via PYTHONPATH para mockear red.
Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links,
extract_text_entities, web_search) + sanity check de manifests.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 16:10:13 +02:00

437 lines
14 KiB
Python
Executable File

#!/usr/bin/env python3
"""Enricher web_search — busca en DuckDuckGo HTML y crea nodos Url.
Wire protocol estandar (issue 0026):
- stdin: JSON con node_id, node_name, metadata, ops_db_path, app_dir,
cache_dir, registry_root, params.
- stderr: lineas `PROGRESS:<float> <stage>` para feedback de UI.
- stdout: una linea JSON al final con resumen.
- exit code 0 = ok, !=0 = error.
DDG endpoint usado: https://html.duckduckgo.com/html/?q=<query>
Devuelve HTML estatico, sin JavaScript. Los enlaces vienen envueltos en
redireccion `//duckduckgo.com/l/?uddg=<encoded>` que hay que decodificar.
Para automatizar busquedas masivas en el futuro (sesion persistente,
cookies, JS, captchas) la fase 2 introducira un enricher `web_search_cdp`
que controle un Chromium remoto via DevTools Protocol. Este es el
fallback simple zero-infra.
"""
from __future__ import annotations
import html
import json
import os
import re
import sqlite3
import sys
import time
from datetime import datetime, timezone
from html.parser import HTMLParser
from urllib.parse import parse_qs, unquote, urlparse
def progress(p: float, stage: str = "") -> None:
    """Report progress to the host UI as a ``PROGRESS:<p> <stage>`` stderr line.

    Args:
        p: Completion fraction; rendered with two decimals.
        stage: Optional short label for the current phase.
    """
    # flush=True so the parent process sees the update immediately.
    print(f"PROGRESS:{p:.2f} {stage}", file=sys.stderr, flush=True)
def log(msg: str) -> None:
    """Write one diagnostic line to stderr and flush it right away."""
    print(msg, file=sys.stderr, flush=True)
def now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
def now_ms() -> int:
    """Return the current Unix time in whole milliseconds (truncated)."""
    seconds = time.time()
    return int(seconds * 1000)
def fetch_ddg(query: str, timeout: int, region: str, safe: str) -> str:
    """Download the DuckDuckGo HTML results page for ``query``.

    The search is sent as a form POST to ``https://html.duckduckgo.com/html/``
    with a browser-like User-Agent. ``requests`` is used when it can be
    imported; otherwise the function falls back to ``urllib.request``.

    Args:
        query: Search terms, sent as the ``q`` form field.
        timeout: Network timeout in seconds, passed to the HTTP client.
        region: Value for DuckDuckGo's ``kl`` (region) parameter; omitted
            when empty.
        safe: One of ``"strict"``, ``"moderate"`` or ``"off"``. It is mapped
            to DuckDuckGo's ``kp`` parameter (1 = strict, -1 = moderate,
            -2 = off). Any other value leaves ``kp`` unset.

    Returns:
        The response body as text.

    Raises:
        Whatever the underlying HTTP client raises (timeouts, connection
        errors). Note the asymmetry between the two transports: ``urlopen``
        raises on HTTP error statuses, while the ``requests`` path returns
        the body without checking the status code.
    """
    endpoint = "https://html.duckduckgo.com/html/"
    params = {"q": query}
    if region:
        params["kl"] = region
    # DuckDuckGo safe-search codes for `kp`: 1 strict, -1 moderate, -2 off.
    safe_map = {"strict": "1", "moderate": "-1", "off": "-2"}
    if safe in safe_map:
        params["kp"] = safe_map[safe]
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.7",
    }
    # Only the import itself decides which transport is used; an ImportError
    # raised while performing the request must not silently switch transports.
    try:
        import requests  # type: ignore
    except ImportError:
        requests = None
    if requests is not None:
        r = requests.post(
            endpoint,
            data=params,
            headers=headers,
            timeout=timeout,
        )
        return r.text
    # Zero-dependency fallback: same POST via the standard library.
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen
    body = urlencode(params).encode()
    req = Request(endpoint, data=body, headers=headers)
    with urlopen(req, timeout=timeout) as resp:  # type: ignore
        return resp.read().decode("utf-8", errors="replace")
def decode_ddg_href(href: str) -> str:
    """Unwrap a DuckDuckGo result link and return the real target URL.

    DuckDuckGo wraps result links in a redirect that carries the destination
    in the ``uddg`` query parameter. Recognised input shapes:

        //duckduckgo.com/l/?uddg=https%3A...&rut=...
        /l/?uddg=https%3A...
        https://example.com/...   (already direct, e.g. ads or unwrapped links)

    Args:
        href: Raw ``href`` attribute value taken from the results page.

    Returns:
        The decoded destination when ``href`` is a DuckDuckGo ``/l/`` redirect
        with a non-empty ``uddg`` value; otherwise ``href`` itself, made
        absolute when it was scheme-relative or started with ``/l/``. An empty
        or missing ``href`` yields ``""``.
    """
    if not href:
        return ""
    if href.startswith("//"):
        href = "https:" + href
    elif href.startswith("/l/"):
        href = "https://duckduckgo.com" + href
    try:
        parts = urlparse(href)
        # parse_qs already percent-decodes values; decoding a second time
        # would corrupt targets that legitimately contain escapes
        # (e.g. "%2520" must become "%20", not a space).
        targets = parse_qs(parts.query).get("uddg")
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): hand it back untouched.
        return href
    host = (parts.hostname or "").lower()
    # Match the exact host or a real subdomain, not any host that merely
    # ends with the same characters (e.g. "notduckduckgo.com").
    is_ddg_host = host == "duckduckgo.com" or host.endswith(".duckduckgo.com")
    if is_ddg_host and parts.path == "/l/" and targets and targets[0]:
        return targets[0]
    return href
class _DDGParser(HTMLParser):
    """Collect search hits (href, title, snippet) from DuckDuckGo's HTML page.

    Only ``<a>`` elements are inspected: an anchor with class ``result__a``
    opens a new hit and supplies its href and title text, and an anchor with
    class ``result__snippet`` supplies the snippet for the hit currently open.
    All other markup is ignored, so if DuckDuckGo renames these classes the
    parser simply ends up with an empty ``results`` list.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Finished hits in document order: {"href", "title", "snippet"}.
        self.results: list[dict] = []
        # Hit being assembled, or None when no result anchor has been seen.
        self._cur: dict | None = None
        self._in_title = False
        self._in_snippet = False
        self._title_buf: list[str] = []
        self._snippet_buf: list[str] = []

    @staticmethod
    def _squash(chunks: list[str]) -> str:
        """Join text chunks and collapse every whitespace run to one space."""
        return " ".join("".join(chunks).split())

    def _classes(self, attrs: list[tuple[str, str | None]]) -> set[str]:
        """Return the class names of the first non-empty ``class`` attribute."""
        return next(
            (set(value.split()) for key, value in attrs
             if key == "class" and value),
            set(),
        )

    def _href(self, attrs: list[tuple[str, str | None]]) -> str:
        """Return the first non-empty ``href`` attribute, or ``""``."""
        return next(
            (value for key, value in attrs if key == "href" and value),
            "",
        )

    def handle_starttag(self, tag: str, attrs):
        if tag == "a":
            css = self._classes(attrs)
            if "result__a" in css:
                # A new result anchor closes whatever hit was in progress;
                # _flush() is a no-op when nothing is pending.
                self._flush()
                self._cur = {
                    "href": self._href(attrs),
                    "title": "",
                    "snippet": "",
                }
                self._title_buf = []
                self._in_title = True
            elif self._cur is not None and "result__snippet" in css:
                self._snippet_buf = []
                self._in_snippet = True

    def handle_endtag(self, tag: str):
        if tag != "a":
            return
        if self._in_title:
            self._in_title = False
            if self._cur:
                self._cur["title"] = self._squash(self._title_buf)
        elif self._in_snippet:
            self._in_snippet = False
            if self._cur:
                self._cur["snippet"] = self._squash(self._snippet_buf)

    def handle_data(self, data: str):
        # Title capture takes precedence over snippet capture.
        if self._in_title:
            self._title_buf.append(data)
            return
        if self._in_snippet:
            self._snippet_buf.append(data)

    def _flush(self):
        """Store the pending hit (only if it has an href) and clear it."""
        pending = self._cur
        self._cur = None
        if pending and pending.get("href"):
            self.results.append(pending)

    def close(self) -> None:
        # Emit the last hit before the base class finishes parsing.
        self._flush()
        super().close()
def parse_ddg_html(htmltxt: str) -> list[dict]:
    """Parse a DuckDuckGo results page into ``[{url, title, snippet, rank}]``.

    Redirect hrefs are unwrapped, anything that is not an absolute http(s)
    URL is discarded, and repeated URLs keep only their first occurrence.
    ``rank`` is the 1-based position among the entries that were kept. A
    parser failure is logged and whatever was collected so far is still used.
    """
    parser = _DDGParser()
    try:
        parser.feed(htmltxt)
        parser.close()
    except Exception as e:
        log(f"DDG parser failed: {e}")
    hits: list[dict] = []
    emitted: set[str] = set()
    for raw in parser.results:
        target = decode_ddg_href(raw.get("href") or "")
        # An empty string fails the scheme test as well, so one check suffices.
        if not target.startswith(("http://", "https://")):
            continue
        if target in emitted:
            continue
        emitted.add(target)
        entry = {
            "url": target,
            "title": raw.get("title") or "",
            "snippet": raw.get("snippet") or "",
            "rank": len(hits) + 1,
        }
        hits.append(entry)
    return hits
def find_url_entity(conn: sqlite3.Connection, url: str) -> str | None:
    """Return the id of an existing ``Url`` entity whose metadata has ``url``.

    Every row of ``entities`` with ``type_ref='Url'`` is scanned and its
    ``metadata`` column decoded as JSON. Rows whose metadata does not decode,
    or does not decode to an object, are skipped. The first row whose
    ``"url"`` key equals ``url`` wins; ``None`` is returned when none matches.
    """
    rows = conn.execute(
        "SELECT id, metadata FROM entities WHERE type_ref='Url'"
    )
    for entity_id, meta_raw in rows:
        try:
            # NULL / empty metadata is treated as an empty object.
            meta = json.loads(meta_raw or "{}")
        except Exception:
            continue
        if not isinstance(meta, dict):
            continue
        if meta.get("url") == url:
            return entity_id
    return None
def insert_url_entity(conn: sqlite3.Connection, url: str, title: str,
                      snippet: str, rank: int, query: str) -> str:
    """Create (or refresh) the ``Url`` entity for one search result.

    When an entity with the same URL already exists, its metadata is replaced
    with the fresh search data and its id is returned. Otherwise a new row is
    inserted with source ``enricher:web_search`` and a generated id. The
    caller is responsible for committing the transaction.

    Args:
        conn: Open connection to the operations database.
        url: Destination URL of the result.
        title: Result title; also used (truncated to 200 chars) as the entity
            name, falling back to the URL when empty.
        snippet: Result summary text.
        rank: 1-based position of the result.
        query: Search query that produced the result.

    Returns:
        The id of the reused or newly created entity.
    """
    stamp = now_iso()
    payload = json.dumps(
        {
            "url": url,
            "title": title,
            "snippet": snippet,
            "rank": rank,
            "query": query,
            "engine": "duckduckgo",
            "found_at": stamp,
        },
        ensure_ascii=False,
    )
    existing = find_url_entity(conn, url)
    if existing:
        conn.execute(
            "UPDATE entities SET metadata=?, updated_at=? WHERE id=?",
            (payload, stamp, existing),
        )
        return existing
    # The id mixes a timestamp, the rank and a short URL hash. str hashes are
    # salted per process, so the suffix is not stable across runs.
    new_id = f"Url_{now_ms()}_{rank}_{abs(hash(url)) % 100000}"
    display_name = (title or url)[:200]
    conn.execute(
        "INSERT INTO entities (id, name, type_ref, source, metadata, "
        " created_at, updated_at) "
        "VALUES (?, ?, 'Url', 'enricher:web_search', ?, ?, ?)",
        (new_id, display_name, payload, stamp, stamp),
    )
    return new_id
def relation_exists(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    """Tell whether a relation ``name`` from ``from_id`` to ``to_id`` exists."""
    sql = (
        "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
        "AND name=? LIMIT 1"
    )
    match = conn.execute(sql, (from_id, to_id, name)).fetchone()
    return match is not None
# Per-process sequence number that keeps relation ids distinct even when
# several relations are created within the same millisecond.
_REL_COUNTER = 0


def insert_relation(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    """Insert the relation ``from_id -[name]-> to_id`` unless it already exists.

    Returns:
        ``True`` when a row was inserted, ``False`` when an identical relation
        was already present. The caller commits the transaction.
    """
    global _REL_COUNTER
    already_linked = relation_exists(conn, from_id, to_id, name)
    if already_linked:
        return False
    stamp = now_iso()
    _REL_COUNTER = _REL_COUNTER + 1
    relation_id = f"rel_{now_ms()}_{_REL_COUNTER}_{name.lower()}"
    row = (relation_id, name, from_id, to_id, stamp, stamp)
    conn.execute(
        "INSERT INTO relations (id, name, from_entity, to_entity, "
        " created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)",
        row,
    )
    return True
def main() -> int:
    """Run the web_search enricher once, driven by a JSON context on stdin.

    Steps: read and validate the context, resolve and sanity-check the
    operations database path, run the DuckDuckGo search, then create or
    refresh one ``Url`` entity per result and link it to the source node with
    a ``SEARCH_RESULT_OF`` relation. Progress lines go to stderr; a one-line
    JSON summary goes to stdout.

    Returns:
        Process exit code:
          0 - success.
          2 - invalid input (stdin is not a JSON object, bad ``limit`` /
              ``timeout_s``, missing ``node_id`` / ``ops_db_path``, or no
              query could be derived from the node).
          4 - the DuckDuckGo request failed.
          7 - the operations database file does not exist.
          8 - the database has no ``entities`` table.
          9 - SQLite error while checking or writing the database.
    """
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except Exception as e:
        log(f"stdin not valid JSON: {e}")
        return 2
    if not isinstance(ctx, dict):
        log("stdin JSON must be an object")
        return 2

    node_id = ctx.get("node_id") or ""
    node_name = str(ctx.get("node_name") or "").strip()
    metadata = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except Exception:
            metadata = {}
    if not isinstance(metadata, dict):
        # Valid JSON that is not an object (list, number...) carries no keys.
        metadata = {}
    ops_db_path = ctx.get("ops_db_path") or ""
    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}
    try:
        limit = int(params.get("limit", 10))
        timeout_s = int(params.get("timeout_s", 15))
    except (TypeError, ValueError) as e:
        log(f"invalid numeric param (limit / timeout_s): {e}")
        return 2
    region = str(params.get("region") or "").strip()
    safe = str(params.get("safe") or "moderate").strip()

    if not node_id or not ops_db_path:
        log("missing node_id / ops_db_path")
        return 2

    # Normalise backslashes to forward slashes: the path may arrive with
    # mixed separators from the C++ side (cross-platform builds, copies
    # between Windows and WSL, etc.).
    ops_db_path = ops_db_path.replace("\\", "/")
    app_dir_raw = (ctx.get("app_dir") or "").replace("\\", "/")

    # Resolve a relative path against app_dir first and the cwd second.
    # Without this sqlite3 would create an empty file whenever the
    # subprocess cwd differs from the parent's.
    if not os.path.isabs(ops_db_path):
        if app_dir_raw and os.path.isdir(app_dir_raw):
            cand = os.path.normpath(os.path.join(app_dir_raw, ops_db_path))
            if os.path.exists(cand):
                ops_db_path = cand
        if not os.path.isabs(ops_db_path):
            ops_db_path = os.path.abspath(ops_db_path)

    if not os.path.exists(ops_db_path):
        log(f"ops_db_path no existe: {ops_db_path} (cwd={os.getcwd()})")
        print(json.dumps({"error": "ops_db not found",
                          "ops_db_path": ops_db_path,
                          "cwd": os.getcwd(),
                          "entities_added": 0, "relations_added": 0}))
        return 7

    # Schema check: a missing `entities` table means the path is wrong or
    # the operations.db has not been bootstrapped yet.
    try:
        probe = sqlite3.connect(ops_db_path)
        try:
            row = probe.execute(
                "SELECT name FROM sqlite_master "
                "WHERE type='table' AND name='entities'"
            ).fetchone()
        finally:
            probe.close()
    except sqlite3.Error as e:
        log(f"sqlite open failed: {e}")
        print(json.dumps({"error": f"sqlite open failed: {e}",
                          "ops_db_path": ops_db_path,
                          "entities_added": 0, "relations_added": 0}))
        return 9
    if not row:
        log(f"sin tabla 'entities' en {ops_db_path}")
        print(json.dumps({
            # The "; " separator keeps the two sentences from running together.
            "error": "operations.db sin tabla 'entities'; "
                     "verifica que graph_explorer haya cargado un "
                     "proyecto valido antes de lanzar el enricher",
            "ops_db_path": ops_db_path,
            "entities_added": 0, "relations_added": 0}))
        return 8

    # Query priority: metadata.query > metadata.text > node_name.
    query = str(
        metadata.get("query") or metadata.get("text") or node_name
    ).strip()
    if not query:
        log("nodo sin query (metadata.query / metadata.text / name)")
        return 2

    progress(0.10, "fetching")
    try:
        htmltxt = fetch_ddg(query, timeout=timeout_s, region=region, safe=safe)
    except Exception as e:
        log(f"DDG fetch failed: {e}")
        print(json.dumps({"error": str(e), "query": query,
                          "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.55, "parsing")
    results = parse_ddg_html(htmltxt)
    if limit > 0:
        results = results[:limit]
    log(f"DDG returned {len(results)} results")

    progress(0.80, "applying")
    entities_added = 0
    relations_added = 0
    try:
        conn = sqlite3.connect(ops_db_path)
        try:
            conn.execute("PRAGMA foreign_keys=OFF")
            for r in results:
                # Checked before the upsert so only new nodes are counted.
                existed = find_url_entity(conn, r["url"]) is not None
                url_id = insert_url_entity(
                    conn,
                    url=r["url"],
                    title=r["title"],
                    snippet=r["snippet"],
                    rank=r["rank"],
                    query=query,
                )
                if not existed:
                    entities_added += 1
                if insert_relation(conn, url_id, node_id, "SEARCH_RESULT_OF"):
                    relations_added += 1
            conn.commit()
        finally:
            # Closing without a commit discards any partial writes.
            conn.close()
    except sqlite3.Error as e:
        log(f"sqlite write failed: {e}")
        print(json.dumps({"error": f"sqlite write failed: {e}",
                          "query": query,
                          "entities_added": 0, "relations_added": 0}))
        return 9

    progress(1.0, "done")
    print(json.dumps({
        "query": query,
        "engine": "duckduckgo",
        "results": len(results),
        "entities_added": entities_added,
        "relations_added": relations_added,
    }, ensure_ascii=False))
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())