feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,89 @@
|
||||
"""Stub minimo de `requests` para tests de enrichers.
|
||||
|
||||
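Reads the response plan from the JSON file whose path is given by the
`_STUB_REQUESTS_PLAN` environment variable. The plan may list several
responses under `match`; entries are matched against the request URL by
substring (`contains`) and may be restricted to an HTTP method (`method`).
The first matching entry wins; otherwise `default` is used.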
|
||||
|
||||
Formato del plan:
|
||||
{
|
||||
"default": {"text": "<html>...</html>", "status": 200,
|
||||
"headers": {"Content-Type": "text/html; charset=utf-8"}},
|
||||
"match": [
|
||||
{"contains": "duckduckgo.com", "text": "...", "status": 200},
|
||||
{"method": "GET", "contains": "example.com", "text": "..."}
|
||||
]
|
||||
}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
class Response:
    """Minimal stand-in for ``requests.Response`` used by the stub.

    Args:
        text: Response body as text.
        status_code: HTTP status code reported by the response.
        headers: Response headers. ``None`` selects a default HTML
            ``Content-Type`` header; an explicitly empty dict is kept as-is.
        url: URL the response is reported for.
        encoding: Codec used to derive ``content`` from ``text``.
    """

    def __init__(self, text: str = "", status_code: int = 200,
                 headers: dict | None = None, url: str = "",
                 encoding: str = "utf-8") -> None:
        self.text = text
        self.status_code = status_code
        # Test against None rather than truthiness so a plan can model a
        # response without any headers by passing an empty dict.
        if headers is None:
            headers = {"Content-Type": "text/html; charset=utf-8"}
        self.headers = headers
        self.url = url
        self.encoding = encoding
        # Characters the codec cannot represent are replaced, not raised.
        self.content = text.encode(encoding, errors="replace")

    @property
    def ok(self) -> bool:
        """Return True when the status code is below 400."""
        return self.status_code < 400

    def json(self):
        """Parse ``text`` as JSON and return the decoded value."""
        return json.loads(self.text)

    def raise_for_status(self) -> None:
        """Raise ``RuntimeError`` when the status code is 400 or above."""
        if self.status_code >= 400:
            raise RuntimeError(f"HTTP {self.status_code}")
|
||||
|
||||
|
||||
def _load_plan() -> dict:
    """Load the response plan referenced by ``_STUB_REQUESTS_PLAN``.

    Returns:
        The parsed plan, or an empty dict when the environment variable is
        unset or empty, or does not point to an existing path.

    Raises:
        ValueError: If the file's top-level JSON value is not an object
            (malformed JSON also surfaces as ``json.JSONDecodeError``, a
            ``ValueError`` subclass).
    """
    plan_path = os.environ.get("_STUB_REQUESTS_PLAN")
    if not plan_path or not os.path.exists(plan_path):
        return {}
    with open(plan_path, "r", encoding="utf-8") as fh:
        plan = json.load(fh)
    # Reject non-object plans here so a bad fixture fails with a clear
    # message instead of an AttributeError in the code that reads the plan.
    if not isinstance(plan, dict):
        raise ValueError(
            f"_STUB_REQUESTS_PLAN must contain a JSON object, "
            f"got {type(plan).__name__}"
        )
    return plan
|
||||
|
||||
|
||||
def _response_from_spec(spec: dict, url: str) -> Response:
    """Build a ``Response`` for ``url`` from one plan entry (or the default)."""
    return Response(
        text=spec.get("text", ""),
        status_code=int(spec.get("status", 200)),
        headers=spec.get("headers"),
        url=url,
    )


def _resolve(method: str, url: str) -> Response:
    """Pick the planned response for a request.

    Entries under the plan's ``match`` list are tried in order and the first
    match wins. An entry matches when every criterion it declares holds:
    ``method`` equals the request method (case-insensitive) and ``contains``
    is a substring of ``url``. Entries declaring neither criterion are
    skipped. When nothing matches, the plan's ``default`` entry is used; if
    that is missing too, an empty 200 response is returned.

    Args:
        method: HTTP method of the simulated request.
        url: URL of the simulated request.

    Returns:
        The ``Response`` built from the selected plan entry.
    """
    plan = _load_plan()
    wanted_method = method.upper()
    # ``or []`` tolerates an explicit ``"match": null`` in the plan.
    for entry in plan.get("match") or []:
        entry_method = entry.get("method")
        needle = entry.get("contains")
        if not entry_method and not needle:
            # No criteria at all: never let such an entry shadow the rest.
            continue
        if entry_method and entry_method.upper() != wanted_method:
            continue
        if needle and needle not in url:
            continue
        return _response_from_spec(entry, url)
    return _response_from_spec(plan.get("default") or {}, url)
|
||||
|
||||
|
||||
def get(url, *args, **kwargs):
    """Stand-in for ``requests.get``: answer from the plan as a GET.

    Extra positional and keyword arguments are accepted and ignored.
    """
    response = _resolve("GET", url)
    return response
|
||||
|
||||
|
||||
def post(url, *args, **kwargs):
    """Stand-in for ``requests.post``: answer from the plan as a POST.

    Extra positional and keyword arguments are accepted and ignored.
    """
    response = _resolve("POST", url)
    return response
|
||||
|
||||
|
||||
# Compatibilidad con `requests.exceptions.RequestException` si algun
|
||||
# enricher lo importa en el futuro.
|
||||
class RequestException(Exception):
    """Base error type standing in for ``requests.RequestException``."""
|
||||
|
||||
|
||||
class exceptions:  # noqa: N801
    """Namespace mimicking ``requests.exceptions``.

    Every name exposed here is an alias of the module-level
    ``RequestException`` class.
    """

    # The right-hand side is read from module scope before any class-level
    # name is bound, so all three aliases point at the same class.
    RequestException = Timeout = ConnectionError = RequestException
|
||||
Reference in New Issue
Block a user