6919ebfe9c
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
73 lines
2.1 KiB
Python
73 lines
2.1 KiB
Python
"""Sanity check de los manifests YAML de todos los enrichers.
|
|
|
|
Confirma que el set actual cubre los tipos esperados y que cada manifest
|
|
tiene los campos que `enrichers.cpp` necesita parsear (id, applies_to).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
from conftest import ENRICHERS_DIR
|
|
|
|
|
|
# Enricher ids (which double as directory names) that the suite expects
# to find under ENRICHERS_DIR.
EXPECTED_IDS = {
    "extract_domain", "extract_links", "extract_text_entities",
    "fetch_webpage", "web_search",
}
|
|
|
|
|
|
def _parse_simple_yaml(text: str) -> dict:
|
|
"""Parser ad-hoc que replica lo que hace enrichers.cpp."""
|
|
out: dict = {}
|
|
in_skip = False
|
|
for raw in text.splitlines():
|
|
line = raw.rstrip("\r")
|
|
s = line.strip()
|
|
if not s or s.startswith("#"):
|
|
continue
|
|
indented = line and line[0].isspace()
|
|
if not indented:
|
|
in_skip = False
|
|
if in_skip:
|
|
continue
|
|
if ":" not in s:
|
|
continue
|
|
key, _, val = s.partition(":")
|
|
key = key.strip()
|
|
val = val.strip()
|
|
if val and val[0] in ('"', "'") and val[-1] == val[0]:
|
|
val = val[1:-1]
|
|
if key == "params" and not val:
|
|
in_skip = True
|
|
out[key] = val
|
|
return out
|
|
|
|
|
|
def test_all_expected_enrichers_present():
    """Check that every id in EXPECTED_IDS has a directory in ENRICHERS_DIR.

    Extra directories are allowed; only missing ones fail the test.
    """
    present = set()
    for entry in ENRICHERS_DIR.iterdir():
        if entry.is_dir():
            present.add(entry.name)

    missing = EXPECTED_IDS.difference(present)
    assert not missing, f"faltan enrichers: {missing}"
|
|
|
|
|
|
def test_each_manifest_has_required_fields():
    """Validate the layout and manifest of every enricher directory.

    For each sub-directory of ENRICHERS_DIR this requires a
    ``manifest.yaml`` and a ``run.py``, and checks that the parsed
    manifest has an ``id`` equal to the directory name plus non-empty
    ``applies_to`` and ``description`` values.
    """
    enricher_dirs = (entry for entry in ENRICHERS_DIR.iterdir() if entry.is_dir())
    for d in enricher_dirs:
        manifest_path = d.joinpath("manifest.yaml")
        script_path = d.joinpath("run.py")
        assert manifest_path.exists(), f"falta manifest: {d.name}"
        assert script_path.exists(), f"falta run.py: {d.name}"

        contents = manifest_path.read_text(encoding="utf-8")
        fields = _parse_simple_yaml(contents)
        assert fields.get("id") == d.name, f"id no coincide con dir: {d.name}"
        assert fields.get("applies_to"), f"sin applies_to: {d.name}"
        assert fields.get("description"), f"sin description: {d.name}"
|
|
|
|
|
|
def test_web_search_applies_to_text():
    """Check that the web_search manifest lists ``text`` in ``applies_to``.

    The comparison is case-insensitive and is a substring match on the
    raw ``applies_to`` value.
    """
    manifest = ENRICHERS_DIR / "web_search" / "manifest.yaml"
    # Decode explicitly as UTF-8, as the other manifest test does, so the
    # result does not depend on the platform's locale encoding.
    m = _parse_simple_yaml(manifest.read_text(encoding="utf-8"))
    assert "text" in m["applies_to"].lower()
|