feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers
Añade el enricher web_search, aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea los resultados con el HTMLParser de la stdlib, decodifica el redirect uddg= y crea N nodos Url con relación SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract.

Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve rutas relativas contra app_dir y valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen).

Tests pytest (16/16 en verde): conftest con operations.db temporal + schema mínimo, stub de requests vía PYTHONPATH para mockear la red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de los manifests.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,63 @@
|
||||
"""Tests del enricher extract_links — sin red, lee markdown del cache."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from conftest import (
|
||||
base_ctx, list_entities, list_relations, make_node, run_enricher,
|
||||
)
|
||||
|
||||
|
||||
# Markdown fixture consumed by the extract_links tests. It contains:
#   - one inline link target that appears twice under different labels,
#   - two bare URLs (one with a query string, one ending in "/"),
#   - one e-mail address that, per the fixture text, must not become a Url.
# The literal holds only the markdown itself; stray diff-table lines ("|",
# "||||") must never end up inside it, because the enricher reads this text.
SAMPLE_MD = """# Pagina demo

Aqui hay [un enlace](https://example.com/articulo) interesante y
otro [duplicado](https://example.com/articulo) que no debe contar
dos veces.

Tambien una URL pelada: https://otra.example/path?q=1
y https://tercera.example/

Y un email que NO debe extraer como Url: contact@no.example
"""
|
||||
|
||||
|
||||
def test_extract_links_creates_url_nodes(ops_db, app_dir, registry_root):
    """Run extract_links on a Webpage backed by a cached markdown file.

    The markdown fixture is written under ``app_dir``/cache, a Webpage node
    whose ``metadata.markdown_path`` points at it (relative to ``app_dir``)
    is created, and the enricher is executed. The test then checks the exit
    code, the reported ``entities_added`` count, the presence of two of the
    fixture URLs as ``Url`` entities, and that every ``LINKS_TO`` relation
    starts at the Webpage node.

    NOTE(review): SAMPLE_MD also carries a duplicated link and an e-mail
    address, but nothing here asserts de-duplication or e-mail exclusion;
    consider adding those checks once the enricher contract is confirmed.
    """
    # Step 1: drop the markdown fixture into the cache directory.
    cached_md = Path(app_dir) / "cache" / "ab" / "abc.md"
    cached_md.parent.mkdir(parents=True, exist_ok=True)
    cached_md.write_text(SAMPLE_MD, encoding="utf-8")
    markdown_rel = str(cached_md.relative_to(app_dir))

    # Step 2: create the Webpage whose metadata.markdown_path targets the
    # cached file. Separate dict literals are passed to each helper so that
    # neither call can observe mutations made by the other.
    make_node(
        ops_db,
        node_id="w1",
        name="demo",
        type_ref="Webpage",
        metadata={"markdown_path": markdown_rel},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="w1",
        node_name="demo",
        node_type="Webpage",
        metadata={"markdown_path": markdown_rel},
    )

    exit_code, summary, stderr = run_enricher("extract_links", ctx)
    assert exit_code == 0, stderr
    assert summary is not None, stderr
    assert summary["entities_added"] >= 3, summary

    url_names = [
        entity["name"] for entity in list_entities(ops_db, type_ref="Url")
    ]
    for expected_url in (
        "https://example.com/articulo",
        "https://otra.example/path?q=1",
    ):
        assert expected_url in url_names

    link_relations = list_relations(ops_db, name="LINKS_TO")
    assert len(link_relations) >= 3
    # Every LINKS_TO edge must originate from the Webpage node.
    assert not any(
        relation["from_entity"] != "w1" for relation in link_relations
    )
|
||||
|
||||
|
||||
def test_extract_links_without_markdown_path_errors(
    ops_db,
    app_dir,
    registry_root,
):
    """Check that extract_links fails when the node has no markdown_path.

    A Webpage node with empty metadata is created and the enricher is run
    with a context that carries no metadata. The test expects a non-zero
    exit code and a parsed output whose ``error`` field contains
    ``"missing markdown_path"``.
    """
    make_node(
        ops_db,
        node_id="w1",
        name="demo",
        type_ref="Webpage",
        metadata={},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="w1",
        node_name="demo",
        node_type="Webpage",
    )

    exit_code, summary, _stderr = run_enricher("extract_links", ctx)

    assert exit_code != 0, "deberia fallar sin markdown_path"
    assert summary is not None
    # A missing or None "error" field is treated as an empty message.
    error_text = summary.get("error") or ""
    assert "missing markdown_path" in error_text
|
||||
Reference in New Issue
Block a user