feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,72 @@
|
||||
"""Sanity check de los manifests YAML de todos los enrichers.
|
||||
|
||||
Confirma que el set actual cubre los tipos esperados y que cada manifest
|
||||
tiene los campos que `enrichers.cpp` necesita parsear (id, applies_to).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from conftest import ENRICHERS_DIR
|
||||
|
||||
|
||||
# Enricher ids that must exist as subdirectories of ENRICHERS_DIR; checked by
# test_all_expected_enrichers_present.
EXPECTED_IDS = {
    "extract_domain",
    "extract_links",
    "extract_text_entities",
    "fetch_webpage",
    "web_search",
}
|
||||
|
||||
|
||||
def _parse_simple_yaml(text: str) -> dict:
|
||||
"""Parser ad-hoc que replica lo que hace enrichers.cpp."""
|
||||
out: dict = {}
|
||||
in_skip = False
|
||||
for raw in text.splitlines():
|
||||
line = raw.rstrip("\r")
|
||||
s = line.strip()
|
||||
if not s or s.startswith("#"):
|
||||
continue
|
||||
indented = line and line[0].isspace()
|
||||
if not indented:
|
||||
in_skip = False
|
||||
if in_skip:
|
||||
continue
|
||||
if ":" not in s:
|
||||
continue
|
||||
key, _, val = s.partition(":")
|
||||
key = key.strip()
|
||||
val = val.strip()
|
||||
if val and val[0] in ('"', "'") and val[-1] == val[0]:
|
||||
val = val[1:-1]
|
||||
if key == "params" and not val:
|
||||
in_skip = True
|
||||
out[key] = val
|
||||
return out
|
||||
|
||||
|
||||
def test_all_expected_enrichers_present():
    """Check that every id in EXPECTED_IDS is a subdirectory of ENRICHERS_DIR.

    Extra directories are allowed; only missing ones fail the test.
    """
    present = set()
    for entry in ENRICHERS_DIR.iterdir():
        if entry.is_dir():
            present.add(entry.name)
    missing = EXPECTED_IDS.difference(present)
    assert not missing, f"faltan enrichers: {missing}"
|
||||
|
||||
|
||||
def test_each_manifest_has_required_fields():
    """Validate the layout and manifest of every directory in ENRICHERS_DIR.

    Each subdirectory must contain ``manifest.yaml`` and ``run.py``. The
    manifest, parsed with ``_parse_simple_yaml``, must have an ``id`` equal
    to the directory name and non-empty ``applies_to`` and ``description``
    values. Non-directory entries are ignored.
    """
    for d in (entry for entry in ENRICHERS_DIR.iterdir() if entry.is_dir()):
        manifest_path = d / "manifest.yaml"
        script_path = d / "run.py"
        assert manifest_path.exists(), f"falta manifest: {d.name}"
        assert script_path.exists(), f"falta run.py: {d.name}"

        fields = _parse_simple_yaml(manifest_path.read_text(encoding="utf-8"))
        assert fields.get("id") == d.name, f"id no coincide con dir: {d.name}"
        assert fields.get("applies_to"), f"sin applies_to: {d.name}"
        assert fields.get("description"), f"sin description: {d.name}"
|
||||
|
||||
|
||||
def test_web_search_applies_to_text():
    """Check that the web_search manifest lists ``text`` in ``applies_to``.

    The comparison is case-insensitive and substring-based on the raw
    ``applies_to`` value returned by ``_parse_simple_yaml``.
    """
    manifest_path = ENRICHERS_DIR / "web_search" / "manifest.yaml"
    # Decode explicitly as UTF-8, like the other manifest test does: the
    # platform default (e.g. a legacy code page on Windows) can mis-decode or
    # reject non-ASCII characters in the manifest.
    m = _parse_simple_yaml(manifest_path.read_text(encoding="utf-8"))
    # .get() turns a missing key into a readable assertion failure instead of
    # a bare KeyError.
    applies_to = m.get("applies_to", "")
    assert "text" in applies_to.lower(), (
        f"web_search no aplica a text: {applies_to!r}"
    )
|
||||
Reference in New Issue
Block a user