feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers
Añade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relación SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema mínimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
"""Tests del enricher web_search (DuckDuckGo HTML)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from conftest import (
|
||||
base_ctx, list_entities, list_relations, make_node, run_enricher,
|
||||
stub_requests, TESTS_DIR,
|
||||
)
|
||||
|
||||
|
||||
# HTML fixture whose text the tests below hand to the requests stub as the
# response body for any URL containing "duckduckgo.com".
DDG_FIXTURE = TESTS_DIR / "fixtures" / "ddg_results.html"
|
||||
|
||||
|
||||
def test_web_search_creates_url_results_for_text_node(
        ops_db, app_dir, registry_root, tmp_path):
    """Run web_search on a text node and check the Url entities it creates.

    The stubbed DuckDuckGo response comes from ``DDG_FIXTURE``; the test
    expects three results, each stored as a ``Url`` entity linked to the
    source node through a ``SEARCH_RESULT_OF`` relation.
    """
    wikipedia_url = "https://es.wikipedia.org/wiki/Tomate"
    botanical_url = (
        "https://www.botanical-online.com/alimentos/tomate-propiedades"
    )

    make_node(
        ops_db,
        node_id="t1",
        name="tomate",
        type_ref="text",
        metadata={},
    )

    # DuckDuckGo requests get the fixture page; anything else is a 404.
    ddg_response = {
        "contains": "duckduckgo.com",
        "text": DDG_FIXTURE.read_text(encoding="utf-8"),
        "status": 200,
    }
    fallback_response = {"text": "", "status": 404}
    stub_env = stub_requests(
        tmp_path,
        {"match": [ddg_response], "default": fallback_response},
    )

    context = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="tomate",
        node_type="text",
        params={"limit": 5},
    )

    exit_code, summary, stderr = run_enricher(
        "web_search", context, env=stub_env)
    assert exit_code == 0, f"stderr={stderr}"
    assert summary is not None, stderr
    assert summary["engine"] == "duckduckgo"
    assert summary["results"] == 3, summary
    assert summary["entities_added"] == 3
    assert summary["relations_added"] == 3

    url_entities = list_entities(ops_db, type_ref="Url")
    stored_urls = {
        entity["metadata"].get("url") for entity in url_entities
    }
    assert wikipedia_url in stored_urls
    assert botanical_url in stored_urls

    relations = list_relations(ops_db, name="SEARCH_RESULT_OF")
    assert len(relations) == 3
    assert all(rel["to_entity"] == "t1" for rel in relations)

    # Enriched metadata on the Wikipedia hit.
    wikipedia_entity = next(
        entity for entity in url_entities
        if entity["metadata"].get("url") == wikipedia_url
    )
    wikipedia_meta = wikipedia_entity["metadata"]
    assert wikipedia_meta["query"] == "tomate"
    assert wikipedia_meta["rank"] == 1
    assert "Wikipedia" in wikipedia_meta["title"]
|
||||
|
||||
|
||||
def test_web_search_uses_metadata_query_over_name(ops_db, app_dir,
                                                  registry_root, tmp_path):
    """metadata.query must take priority over node_name."""
    make_node(
        ops_db,
        node_id="t1",
        name="placeholder",
        type_ref="text",
        metadata={"query": "tomate"},
    )

    ddg_response = {
        "contains": "duckduckgo.com",
        "text": DDG_FIXTURE.read_text(encoding="utf-8"),
    }
    stub_env = stub_requests(tmp_path, {"match": [ddg_response]})

    context = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="placeholder",
        node_type="text",
        metadata={"query": "tomate"},
    )

    exit_code, summary, stderr = run_enricher(
        "web_search", context, env=stub_env)
    assert exit_code == 0, stderr
    # The reported query is the metadata value, not the node name.
    assert summary["query"] == "tomate"
|
||||
|
||||
|
||||
def test_web_search_limit_truncates_results(ops_db, app_dir, registry_root,
                                            tmp_path):
    """With ``limit=1`` only one result and one entity are reported."""
    make_node(ops_db, node_id="t1", name="tomate", type_ref="text")

    ddg_response = {
        "contains": "duckduckgo.com",
        "text": DDG_FIXTURE.read_text(encoding="utf-8"),
    }
    stub_env = stub_requests(tmp_path, {"match": [ddg_response]})

    context = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="tomate",
        node_type="text",
        params={"limit": 1},
    )

    exit_code, summary, stderr = run_enricher(
        "web_search", context, env=stub_env)
    assert exit_code == 0, stderr
    assert summary["results"] == 1
    assert summary["entities_added"] == 1
|
||||
|
||||
|
||||
def test_web_search_no_query_fails_clean(ops_db, app_dir, registry_root,
                                         tmp_path):
    """An empty node name and no metadata query yields exit code 2."""
    make_node(ops_db, node_id="t1", name="", type_ref="text", metadata={})

    empty_ok_response = {"text": "", "status": 200}
    stub_env = stub_requests(tmp_path, {"default": empty_ok_response})

    context = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="",
        node_type="text",
    )

    exit_code, _summary, stderr = run_enricher(
        "web_search", context, env=stub_env)
    assert exit_code == 2
    # The enricher's error message is emitted in Spanish.
    assert "sin query" in stderr
|
||||
Reference in New Issue
Block a user