6919ebfe9c
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
90 lines
2.5 KiB
Python
90 lines
2.5 KiB
Python
"""Stub minimo de `requests` para tests de enrichers.
|
|
|
|
Lee el plan de respuesta de `_STUB_REQUESTS_PLAN` (env var con path a un
|
|
JSON). Soporta multiples respuestas indexadas por metodo o por subcadena de
URL — la primera coincidencia gana.
|
|
|
|
Formato del plan:
|
|
{
|
|
"default": {"text": "<html>...</html>", "status": 200,
|
|
"headers": {"Content-Type": "text/html; charset=utf-8"}},
|
|
"match": [
|
|
{"contains": "duckduckgo.com", "text": "...", "status": 200},
|
|
{"method": "GET", "contains": "example.com", "text": "..."}
|
|
]
|
|
}
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
|
|
|
|
class Response:
    """Minimal stand-in for ``requests.Response`` built from canned data.

    The constructor stores ``text``, ``status_code``, ``headers``, ``url``
    and ``encoding`` as given, and derives ``content`` by encoding ``text``
    with ``encoding``.
    """

    def __init__(self, text: str = "", status_code: int = 200,
                 headers: dict | None = None, url: str = "",
                 encoding: str = "utf-8") -> None:
        # A missing or empty header mapping falls back to an HTML content type.
        if not headers:
            headers = {"Content-Type": "text/html; charset=utf-8"}
        self.text, self.status_code = text, status_code
        self.headers = headers
        self.url, self.encoding = url, encoding
        # Characters the codec cannot represent are replaced instead of raising.
        self.content = text.encode(encoding, errors="replace")

    def json(self):
        """Parse ``text`` as JSON and return the decoded value."""
        body = self.text
        return json.loads(body)

    def raise_for_status(self):
        """Raise ``RuntimeError("HTTP <code>")`` when the status is 400 or above."""
        code = self.status_code
        if code >= 400:
            raise RuntimeError(f"HTTP {code}")
|
|
|
|
|
|
def _load_plan() -> dict:
    """Load the response plan named by the ``_STUB_REQUESTS_PLAN`` env var.

    Returns an empty dict when the variable is unset or empty, or when the
    path it holds does not exist. Otherwise returns the parsed JSON content
    of that file, read as UTF-8.
    """
    plan_path = os.environ.get("_STUB_REQUESTS_PLAN")
    if plan_path and os.path.exists(plan_path):
        with open(plan_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return {}
|
|
|
|
|
|
def _resolve(method: str, url: str) -> Response:
    """Build the canned response for a simulated request from the plan.

    The plan is re-read through ``_load_plan()`` on every call. Entries of
    ``plan["match"]`` are tried in order and the first one that matches wins.
    An entry matches when every criterion it declares is satisfied:

    * ``method`` -- compared case-insensitively against ``method``;
    * ``contains`` -- a non-empty string that must occur anywhere in ``url``.

    An entry declaring only ``method`` therefore matches every URL requested
    with that method (previously such entries could never match). An entry
    declaring neither criterion never matches. When nothing matches, the
    ``plan["default"]`` spec is used, or an empty spec if it is absent.

    Args:
        method: HTTP verb of the simulated request, e.g. ``"GET"``.
        url: URL of the simulated request; also stored on the response.

    Returns:
        A ``Response`` whose text, status and headers come from the selected
        spec, defaulting to ``""``, ``200`` and no explicit headers.
    """
    plan = _load_plan()
    spec = None
    for entry in plan.get("match", []):
        has_method = "method" in entry
        if has_method and entry["method"].upper() != method.upper():
            continue
        needle = entry.get("contains") or ""
        if needle:
            if needle not in url:
                continue
        elif not has_method:
            # No criteria at all: skip instead of acting as a catch-all,
            # which is the role of the "default" spec.
            continue
        spec = entry
        break
    if spec is None:
        spec = plan.get("default") or {}
    # Single construction site for both the matched and the default spec.
    return Response(
        text=spec.get("text", ""),
        status_code=int(spec.get("status", 200)),
        headers=spec.get("headers"),
        url=url,
    )
|
|
|
|
|
|
def get(url, *args, **kwargs):
    """Stand-in for ``requests.get``: resolve ``url`` as a GET request.

    Extra positional and keyword arguments are accepted and ignored.
    """
    response = _resolve("GET", url)
    return response
|
|
|
|
|
|
def post(url, *args, **kwargs):
    """Stand-in for ``requests.post``: resolve ``url`` as a POST request.

    Extra positional and keyword arguments are accepted and ignored.
    """
    response = _resolve("POST", url)
    return response
|
|
|
|
|
|
# Compatibility with `requests.exceptions.RequestException`, in case some
# enricher imports it in the future.
class RequestException(Exception):
    """Base error type exposed by the stub; adds nothing to ``Exception``."""
|
|
|
|
|
|
class exceptions:  # noqa: N801
    """Namespace standing in for the ``requests.exceptions`` module.

    ``Timeout`` and ``ConnectionError`` are plain aliases of the stub's
    ``RequestException``, so all three names refer to the same class.
    """

    RequestException = RequestException
    Timeout = ConnectionError = RequestException
|