6919ebfe9c
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
78 lines
2.8 KiB
Python
78 lines
2.8 KiB
Python
"""Tests del enricher fetch_webpage con red mockeada via stub de requests."""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from pathlib import Path
|
|
|
|
from conftest import (
|
|
base_ctx, get_entity, list_entities, list_relations,
|
|
make_node, run_enricher, stub_requests,
|
|
)
|
|
|
|
|
|
# Minimal HTML page used as the stubbed response body in these tests.
# The assertions below rely on its <title> ("Acme Demo"); the body text is
# fixture content and is intentionally left in Spanish, unchanged.
SAMPLE_HTML = """<!DOCTYPE html>
<html><head><title>Acme Demo</title></head>
<body>
<h1>Hola</h1>
<p>Esta es la pagina de prueba con un <a href="/x">enlace</a>.</p>
<p>Email de contacto: ops@acme.example</p>
</body></html>
"""
|
|
|
|
|
|
def test_fetch_webpage_creates_domain_and_caches(ops_db, app_dir, registry_root,
                                                 tmp_path):
    """Check the fetch_webpage enricher on a successful (HTTP 200) response.

    A ``Url`` node is seeded, ``requests`` is replaced by a stub that answers
    with ``SAMPLE_HTML``, and the enricher is run. The test then verifies the
    JSON summary, the promotion of the node to ``Webpage``, the HTML cache
    file under ``app_dir`` and the ``Domain`` entity with its ``BELONGS_TO``
    relation.
    """
    url = "https://www.acme.example/"

    # Seed the node the enricher will operate on.
    make_node(
        ops_db,
        node_id="u1",
        name="acme",
        type_ref="Url",
        metadata={"url": url},
    )

    # Every stubbed request is answered with the sample page.
    default_response = {
        "text": SAMPLE_HTML,
        "status": 200,
        "headers": {"Content-Type": "text/html; charset=utf-8"},
    }
    env = stub_requests(tmp_path, {"default": default_response})

    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="u1",
        node_name="acme",
        node_type="Url",
        metadata={"url": url},
    )

    exit_code, summary, stderr = run_enricher("fetch_webpage", ctx, env=env)
    assert exit_code == 0, f"stderr={stderr}"
    assert summary is not None, stderr
    assert summary["status_code"] == 200
    assert summary["title"] == "Acme Demo"
    assert summary["entities_added"] == 1  # Domain
    assert summary["relations_added"] == 1  # BELONGS_TO

    # The Url node is promoted to Webpage.
    entity = get_entity(ops_db, "u1")
    assert entity["type_ref"] == "Webpage", entity
    node_meta = entity["metadata"]
    assert node_meta["title"] == "Acme Demo"
    assert node_meta["status_code"] == 200

    # The cache file exists and holds the fetched HTML.
    cached_html = Path(app_dir) / node_meta["html_path"]
    assert cached_html.exists()
    assert "Acme Demo" in cached_html.read_text(encoding="utf-8")

    # A Domain entity was created together with its relation.
    domain_entities = list_entities(ops_db, type_ref="Domain")
    assert any(
        domain["name"] == "www.acme.example" for domain in domain_entities
    )
    belongs_to = list_relations(ops_db, name="BELONGS_TO")
    assert len(belongs_to) == 1
|
|
|
|
|
|
def test_fetch_webpage_handles_http_error(ops_db, app_dir, registry_root,
                                          tmp_path):
    """Check that an HTTP 404 is reported in the summary, not as a failure.

    A ``Url`` node is seeded and ``requests`` is stubbed to answer 404 with an
    empty HTML document. The enricher is expected to exit with code 0 and to
    expose the 404 through ``status_code`` in its JSON summary.
    """
    make_node(ops_db, node_id="u1", name="bad",
              type_ref="Url", metadata={"url": "https://no.example/"})
    plan = {"default": {"text": "<html></html>", "status": 404}}
    env = stub_requests(tmp_path, plan)

    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="u1", node_name="bad", node_type="Url",
                   metadata={"url": "https://no.example/"})

    rc, out, err = run_enricher("fetch_webpage", ctx, env=env)
    # A 404 is a valid response: exit code 0 with status_code in the summary.
    assert rc == 0, err
    # Guard before indexing so a missing summary fails with the enricher's
    # stderr instead of an opaque TypeError on ``None``.
    assert out is not None, err
    assert out["status_code"] == 404
|