feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers

Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 16:10:13 +02:00
parent 0d2450bac5
commit 6919ebfe9c
20 changed files with 1223 additions and 0 deletions
@@ -0,0 +1,89 @@
+"""Stub minimo de `requests` para tests de enrichers.
+
+Lee el plan de respuesta de `_STUB_REQUESTS_PLAN` (env var con path a un
+JSON). Soporta multiples respuestas indexadas por metodo o por sufijo de
+URL — la primera coincidencia gana.
+
+Formato del plan:
+{
+  "default": {"text": "<html>...</html>", "status": 200,
+              "headers": {"Content-Type": "text/html; charset=utf-8"}},
+  "match": [
+    {"contains": "duckduckgo.com", "text": "...", "status": 200},
+    {"method": "GET", "contains": "example.com", "text": "..."}
+  ]
+}
+"""
+from __future__ import annotations
+
+import json
+import os
+
+
+class Response:
+    """Minimal stand-in for `requests.Response` used by the enricher tests.
+
+    Exposes `text`, `status_code`, `headers`, `url`, `encoding` and `content`
+    (the text encoded with `encoding`, unencodable characters replaced).
+    """
+
+    def __init__(self, text: str = "", status_code: int = 200,
+                 headers: dict | None = None, url: str = "",
+                 encoding: str = "utf-8") -> None:
+        # A missing or empty headers mapping falls back to an HTML content type.
+        if not headers:
+            headers = {"Content-Type": "text/html; charset=utf-8"}
+        self.text = text
+        self.status_code = status_code
+        self.headers = headers
+        self.url = url
+        self.encoding = encoding
+        self.content = text.encode(encoding, errors="replace")
+
+    def json(self):
+        """Parse `text` as JSON and return the decoded value."""
+        body = self.text
+        return json.loads(body)
+
+    def raise_for_status(self):
+        """Raise `RuntimeError` when the status code is 400 or above."""
+        if self.status_code < 400:
+            return
+        raise RuntimeError(f"HTTP {self.status_code}")
+
+
+def _load_plan() -> dict:
+    """Load the response plan named by the `_STUB_REQUESTS_PLAN` env var.
+
+    Returns an empty dict when the variable is unset or empty, or when the
+    path it names does not exist; otherwise the decoded JSON of that file.
+    """
+    plan_path = os.environ.get("_STUB_REQUESTS_PLAN")
+    if plan_path and os.path.exists(plan_path):
+        with open(plan_path, "r", encoding="utf-8") as fh:
+            return json.load(fh)
+    return {}
+
+
+def _resolve(method: str, url: str) -> Response:
+    """Pick the planned response for a request.
+
+    Entries of the plan's `match` list are tried in order and the first one
+    that fits wins. An entry may constrain the HTTP method (`method`,
+    compared case-insensitively), the URL (`contains`, substring test) or
+    both. An entry that constrains neither is ignored. When nothing fits,
+    the plan's `default` entry is used; with no `default` the result is an
+    empty response with status 200.
+
+    Args:
+        method: HTTP verb of the request being stubbed.
+        url: Requested URL; also stored on the returned response.
+
+    Returns:
+        A `Response` built from the chosen entry's `text`, `status` and
+        `headers` keys.
+    """
+    plan = _load_plan()
+    wanted = method.upper()
+    chosen = None
+    for entry in plan.get("match", []):
+        entry_method = entry.get("method")
+        needle = entry.get("contains") or ""
+        if not entry_method and not needle:
+            # No criterion at all: such an entry must not shadow the rest.
+            continue
+        if entry_method and entry_method.upper() != wanted:
+            continue
+        # Method-only entries (no `contains`) match any URL.
+        if needle and needle not in url:
+            continue
+        chosen = entry
+        break
+    if chosen is None:
+        chosen = plan.get("default") or {}
+    return Response(
+        text=chosen.get("text", ""),
+        status_code=int(chosen.get("status", 200)),
+        headers=chosen.get("headers"),
+        url=url,
+    )
+
+
+def get(url, *args, **kwargs):
+    """Stub of `requests.get`; extra arguments are accepted and ignored."""
+    response = _resolve("GET", url)
+    return response
+
+
+def post(url, *args, **kwargs):
+    """Stub of `requests.post`; extra arguments are accepted and ignored."""
+    response = _resolve("POST", url)
+    return response
+
+
+# Compatibility with `requests.exceptions.RequestException` in case some
+# enricher imports it in the future.
+class RequestException(Exception):
+    """Base error type exposed by the stub; adds nothing to `Exception`."""
+
+
+class exceptions:  # noqa: N801
+    # Namespace allowing `requests.exceptions.<Name>` attribute access; every
+    # name below aliases the single stub RequestException class.
+    RequestException = RequestException
+    Timeout = RequestException
+    ConnectionError = RequestException
@@ -0,0 +1,237 @@
+"""Fixtures comunes para tests de enrichers de graph_explorer.
+
+Cada test recibe:
+  - `ops_db`: path a una operations.db con schema minimo en tmp dir
+  - `app_dir`: tmp dir que actua como app_dir (cache_dir = <app_dir>/cache)
+  - `registry_root`: ruta absoluta del registry (para imports en run.py)
+  - `run_enricher(enricher, ctx_overrides)`: helper que invoca run.py via
+    subprocess con el mismo wire protocol que jobs.cpp.
+
+El schema se replica de `fn_operations/project_template/operations.db` —
+solo las columnas que usan los enrichers. Si fn_operations cambia el
+schema, este conftest se actualiza.
+"""
+from __future__ import annotations
+
+import json
+import os
+import sqlite3
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+
+# All paths below are derived from this file's own resolved location.
+REGISTRY_ROOT = Path(__file__).resolve().parents[5]
+APP_DIR_SRC   = Path(__file__).resolve().parents[1]   # graph_explorer/
+ENRICHERS_DIR = APP_DIR_SRC / "enrichers"
+TESTS_DIR     = Path(__file__).resolve().parent
+STUBS_DIR     = TESTS_DIR / "_stubs"
+# Interpreter used by run_enricher to launch each enricher's run.py.
+PYTHON_BIN    = REGISTRY_ROOT / "python" / ".venv" / "bin" / "python3"
+
+
+def stub_requests(tmp_path: Path, plan: dict) -> dict:
+    """Write the response plan and return the env vars that enable the stub.
+
+    The stub lives in tests/_stubs/requests.py and is activated through
+    PYTHONPATH. The plan accepts `default` and/or `match` (a list of
+    {contains, status, text} entries).
+
+    Args:
+        tmp_path: Directory where `_stub_plan.json` is written.
+        plan: JSON-serializable response plan.
+
+    Returns:
+        A dict with `PYTHONPATH` (stubs dir first, then any inherited value)
+        and `_STUB_REQUESTS_PLAN` (path of the written plan file).
+    """
+    plan_file = tmp_path / "_stub_plan.json"
+    plan_file.write_text(json.dumps(plan), encoding="utf-8")
+    # Only keep the inherited PYTHONPATH when it is non-empty: joining with an
+    # empty value would leave a trailing separator, i.e. a stray empty entry
+    # in the child's module search path.
+    search_path = [str(STUBS_DIR)]
+    inherited = os.environ.get("PYTHONPATH", "")
+    if inherited:
+        search_path.append(inherited)
+    return {
+        "PYTHONPATH": os.pathsep.join(search_path),
+        "_STUB_REQUESTS_PLAN": str(plan_file),
+    }
+
+
+# Minimal DDL for the `entities` and `relations` tables; the ops_db fixture
+# runs it against a fresh SQLite file.
+SCHEMA_SQL = """
+CREATE TABLE entities (
+    id          TEXT PRIMARY KEY,
+    name        TEXT NOT NULL,
+    type_ref    TEXT NOT NULL,
+    status      TEXT NOT NULL DEFAULT 'active',
+    description TEXT NOT NULL DEFAULT '',
+    domain      TEXT NOT NULL DEFAULT '',
+    tags        TEXT NOT NULL DEFAULT '[]',
+    source      TEXT NOT NULL,
+    metadata    TEXT NOT NULL DEFAULT '{}',
+    notes       TEXT NOT NULL DEFAULT '',
+    created_at  TEXT NOT NULL,
+    updated_at  TEXT NOT NULL
+);
+CREATE TABLE relations (
+    id          TEXT PRIMARY KEY,
+    name        TEXT NOT NULL,
+    from_entity TEXT NOT NULL DEFAULT '',
+    to_entity   TEXT NOT NULL,
+    via         TEXT NOT NULL DEFAULT '',
+    description TEXT NOT NULL DEFAULT '',
+    purity      TEXT NOT NULL DEFAULT '',
+    direction   TEXT NOT NULL DEFAULT 'unidirectional',
+    weight      REAL,
+    status      TEXT NOT NULL DEFAULT 'designed',
+    started_at  TEXT,
+    ended_at    TEXT,
+    "order"     INTEGER,
+    tags        TEXT NOT NULL DEFAULT '[]',
+    notes       TEXT NOT NULL DEFAULT '',
+    created_at  TEXT NOT NULL,
+    updated_at  TEXT NOT NULL
+);
+"""
+
+
+@pytest.fixture
+def ops_db(tmp_path):
+    """Empty operations.db with the minimal schema, ready for node inserts."""
+    db_file = tmp_path / "operations.db"
+    connection = sqlite3.connect(db_file)
+    connection.executescript(SCHEMA_SQL)
+    connection.commit()
+    connection.close()
+    return db_file
+
+
+@pytest.fixture
+def app_dir(tmp_path):
+    """Root directory of an 'app' for the enrichers, with a cache/ subdir."""
+    root = tmp_path / "app"
+    cache = root / "cache"
+    root.mkdir()
+    cache.mkdir()
+    return root
+
+
+@pytest.fixture
+def registry_root():
+    """Expose the module-level REGISTRY_ROOT path as a fixture."""
+    root = REGISTRY_ROOT
+    return root
+
+
+def make_node(ops_db: Path, *, node_id: str, name: str, type_ref: str,
+              metadata: dict | None = None, source: str = "test") -> None:
+    """Insert one entity of an arbitrary type into operations.db.
+
+    Args:
+        ops_db: Path of the SQLite database holding the `entities` table.
+        node_id: Value for the `id` column.
+        name: Value for the `name` column.
+        type_ref: Value for the `type_ref` column.
+        metadata: Stored as JSON text (non-ASCII kept as-is); a missing or
+            empty value is stored as `{}`.
+        source: Value for the `source` column.
+
+    Raises:
+        sqlite3.Error: Propagated when the INSERT fails (e.g. duplicate id).
+    """
+    conn = sqlite3.connect(ops_db)
+    try:
+        conn.execute(
+            "INSERT INTO entities (id, name, type_ref, source, metadata, "
+            " created_at, updated_at) VALUES (?, ?, ?, ?, ?, "
+            " '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')",
+            (node_id, name, type_ref, source,
+             json.dumps(metadata or {}, ensure_ascii=False)),
+        )
+        conn.commit()
+    finally:
+        # Release the handle even when the INSERT raises, as the read
+        # helpers in this module already do.
+        conn.close()
+
+
+def get_entity(ops_db: Path, entity_id: str) -> dict | None:
+    """Fetch one entity by id, or None when no row has that id.
+
+    The result has id, name, type_ref, source and metadata; metadata is the
+    decoded JSON column, or {} when the column is empty or not valid JSON.
+    """
+    conn = sqlite3.connect(ops_db)
+    try:
+        record = conn.execute(
+            "SELECT id, name, type_ref, source, metadata "
+            "FROM entities WHERE id=?", (entity_id,)).fetchone()
+    finally:
+        conn.close()
+    if not record:
+        return None
+    ident, name, kind, source, raw_metadata = record
+    try:
+        metadata = json.loads(raw_metadata) if raw_metadata else {}
+    except Exception:
+        # Unparseable metadata is reported as an empty mapping.
+        metadata = {}
+    return {"id": ident, "name": name, "type_ref": kind,
+            "source": source, "metadata": metadata}
+
+
+def list_entities(ops_db: Path, type_ref: str | None = None) -> list[dict]:
+    """Return entities ordered by id, optionally filtered by `type_ref`.
+
+    Each item has id, name, type_ref, source and metadata; metadata is the
+    decoded JSON column, or {} when the column is empty or not valid JSON.
+    """
+    if type_ref:
+        sql = ("SELECT id, name, type_ref, source, metadata "
+               "FROM entities WHERE type_ref=? ORDER BY id")
+        args = (type_ref,)
+    else:
+        sql = ("SELECT id, name, type_ref, source, metadata "
+               "FROM entities ORDER BY id")
+        args = ()
+    conn = sqlite3.connect(ops_db)
+    try:
+        records = conn.execute(sql, args).fetchall()
+    finally:
+        conn.close()
+
+    def _metadata(raw):
+        # Empty or unparseable metadata is reported as an empty mapping.
+        try:
+            return json.loads(raw) if raw else {}
+        except Exception:
+            return {}
+
+    return [{"id": ident, "name": name, "type_ref": kind,
+             "source": source, "metadata": _metadata(raw)}
+            for ident, name, kind, source, raw in records]
+
+
+def list_relations(ops_db: Path, name: str | None = None) -> list[dict]:
+    """Return relations ordered by id, optionally filtered by `name`.
+
+    Each item has id, name, from_entity and to_entity.
+    """
+    if name:
+        sql = ("SELECT id, name, from_entity, to_entity FROM relations "
+               "WHERE name=? ORDER BY id")
+        args = (name,)
+    else:
+        sql = ("SELECT id, name, from_entity, to_entity FROM relations "
+               "ORDER BY id")
+        args = ()
+    conn = sqlite3.connect(ops_db)
+    try:
+        records = conn.execute(sql, args).fetchall()
+    finally:
+        conn.close()
+    columns = ("id", "name", "from_entity", "to_entity")
+    return [dict(zip(columns, record)) for record in records]
+
+
+def run_enricher(enricher_id: str, ctx: dict, *, env: dict | None = None,
+                 timeout: int = 30) -> tuple[int, dict | None, str]:
+    """Launch enrichers/<id>/run.py using the standard wire protocol.
+
+    The context is sent as JSON on stdin; the last non-empty stdout line is
+    taken to be the JSON summary.
+
+    Args:
+        enricher_id: Directory name of the enricher under ENRICHERS_DIR.
+        ctx: Context object serialized to stdin.
+        env: Extra environment variables layered over the current ones.
+        timeout: Seconds before the subprocess is aborted.
+
+    Returns:
+        (exit_code, stdout_json_or_None, stderr_text). The JSON element is
+        None when stdout is empty or its last line is not valid JSON.
+    """
+    run_py = ENRICHERS_DIR / enricher_id / "run.py"
+    assert run_py.exists(), f"no existe {run_py}"
+
+    # PYTHON_BIN assumes a POSIX venv layout; when that file is absent, fall
+    # back to the interpreter that is running the tests.
+    python = str(PYTHON_BIN) if PYTHON_BIN.exists() else sys.executable
+
+    full_env = os.environ.copy()
+    if env:
+        full_env.update(env)
+
+    proc = subprocess.run(
+        [python, str(run_py)],
+        input=json.dumps(ctx),
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+        env=full_env,
+    )
+    parsed: dict | None = None
+    # Only the last non-empty line is considered; earlier output is ignored.
+    lines = [ln.strip() for ln in proc.stdout.splitlines() if ln.strip()]
+    if lines:
+        try:
+            parsed = json.loads(lines[-1])
+        except ValueError:
+            # Not JSON: report "no summary" instead of failing the helper.
+            parsed = None
+    return proc.returncode, parsed, proc.stderr
+
+
+def base_ctx(*, ops_db, app_dir, registry_root, node_id, node_name,
+             node_type, metadata=None, params=None) -> dict:
+    """Build the typical ctx that jobs.cpp passes on stdin.
+
+    Path-like arguments are stringified, `cache_dir` is `<app_dir>/cache`,
+    and missing or empty `metadata`/`params` become empty dicts.
+    """
+    cache_dir = Path(app_dir) / "cache"
+    ctx = {
+        "node_id":       node_id,
+        "node_name":     node_name,
+        "node_type":     node_type,
+        "metadata":      metadata if metadata else {},
+        "ops_db_path":   str(ops_db),
+        "app_dir":       str(app_dir),
+        "cache_dir":     str(cache_dir),
+        "registry_root": str(registry_root),
+        "params":        params if params else {},
+    }
+    return ctx
@@ -0,0 +1,22 @@
+<!DOCTYPE html>
+<html><head><title>tomate at DuckDuckGo</title></head>
+<body>
+<div class="serp__results">
+  <div class="result">
+    <a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fes.wikipedia.org%2Fwiki%2FTomate&amp;rut=abc">Tomate - Wikipedia, la enciclopedia libre</a>
+    <a class="result__snippet" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fes.wikipedia.org%2Fwiki%2FTomate">El tomate es el fruto comestible de la planta Solanum lycopersicum, una especie de la familia de las solanaceas.</a>
+  </div>
+  <div class="result">
+    <a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.botanical-online.com%2Falimentos%2Ftomate-propiedades&amp;rut=def">Tomate: propiedades y beneficios</a>
+    <a class="result__snippet" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.botanical-online.com%2Falimentos%2Ftomate-propiedades">Propiedades del tomate, beneficios para la salud y composicion nutricional.</a>
+  </div>
+  <div class="result">
+    <a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.recetasgratis.net%2Fbusqueda%2Ftomate&amp;rut=ghi">Recetas con tomate - RecetasGratis</a>
+    <a class="result__snippet" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.recetasgratis.net%2Fbusqueda%2Ftomate">Encuentra las mejores recetas con tomate paso a paso.</a>
+  </div>
+  <div class="result result--ad">
+    <!-- anuncio sin titulo, no debe contar -->
+    <a href="https://ad.doubleclick.net/x">ad</a>
+  </div>
+</div>
+</body></html>
@@ -0,0 +1,60 @@
+"""Tests del enricher extract_domain.
+
+Pure regex/parsing — sin red. Verifica:
+  - Url con metadata.url crea Domain + BELONGS_TO
+  - Email crea Domain (desde la parte derecha del @)
+  - Si el Domain ya existe se reusa, no se duplica
+"""
+from __future__ import annotations
+
+from conftest import (
+    base_ctx, get_entity, list_entities, list_relations,
+    make_node, run_enricher,
+)
+
+
+def test_url_creates_domain_and_relation(ops_db, app_dir, registry_root):
+    """A Url node with metadata.url yields a Domain plus one BELONGS_TO."""
+    make_node(ops_db, node_id="u1", name="ex",
+              type_ref="Url", metadata={"url": "https://www.example.com/path"})
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="u1", node_name="ex", node_type="Url",
+                   metadata={"url": "https://www.example.com/path"})
+
+    rc, out, err = run_enricher("extract_domain", ctx)
+    assert rc == 0, err
+    assert out and out.get("entities_added", 0) >= 1, out
+
+    # The Domain is expected to keep the full host, `www.` included.
+    domains = list_entities(ops_db, type_ref="Domain")
+    assert any(d["name"] == "www.example.com" for d in domains), domains
+
+    # Exactly one relation, originating at the Url node.
+    rels = list_relations(ops_db, name="BELONGS_TO")
+    assert len(rels) == 1
+    assert rels[0]["from_entity"] == "u1"
+
+def test_email_creates_domain(ops_db, app_dir, registry_root):
+    """An Email node yields a Domain named after the part right of the @."""
+    make_node(ops_db, node_id="e1", name="user@aurgi.com",
+              type_ref="Email", metadata={"address": "user@aurgi.com"})
+    # The ctx carries no metadata here; only node_name holds the address.
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="e1", node_name="user@aurgi.com", node_type="Email")
+    rc, out, err = run_enricher("extract_domain", ctx)
+    assert rc == 0, err
+    domains = list_entities(ops_db, type_ref="Domain")
+    assert any(d["name"] == "aurgi.com" for d in domains), domains
+
+
+def test_existing_domain_is_reused(ops_db, app_dir, registry_root):
+    """A pre-existing Domain with the same name must not be duplicated."""
+    # Pre-create a Domain with the same name.
+    make_node(ops_db, node_id="d1", name="example.com", type_ref="Domain",
+              metadata={})
+    make_node(ops_db, node_id="u1", name="ex", type_ref="Url",
+              metadata={"url": "https://example.com/x"})
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="u1", node_name="ex", node_type="Url",
+                   metadata={"url": "https://example.com/x"})
+    rc, out, err = run_enricher("extract_domain", ctx)
+    assert rc == 0, err
+
+    # Still exactly one Domain called example.com after the run.
+    domains = list_entities(ops_db, type_ref="Domain")
+    names = [d["name"] for d in domains]
+    assert names.count("example.com") == 1, domains
@@ -0,0 +1,63 @@
+"""Tests del enricher extract_links — sin red, lee markdown del cache."""
+from __future__ import annotations
+
+from pathlib import Path
+
+from conftest import (
+    base_ctx, list_entities, list_relations, make_node, run_enricher,
+)
+
+
+# Markdown sample: one link repeated twice, two bare URLs and one email.
+SAMPLE_MD = """# Pagina demo
+
+Aqui hay [un enlace](https://example.com/articulo) interesante y
+otro [duplicado](https://example.com/articulo) que no debe contar
+dos veces.
+
+Tambien una URL pelada: https://otra.example/path?q=1
+y https://tercera.example/
+
+Y un email que NO debe extraer como Url: contact@no.example
+"""
+
+
+def test_extract_links_creates_url_nodes(ops_db, app_dir, registry_root):
+    """extract_links turns the cached markdown's links into Url + LINKS_TO."""
+    # 1) Create the cache file holding the markdown.
+    md_dir = Path(app_dir) / "cache" / "ab"
+    md_dir.mkdir(parents=True, exist_ok=True)
+    md_path = md_dir / "abc.md"
+    md_path.write_text(SAMPLE_MD, encoding="utf-8")
+    rel = md_path.relative_to(app_dir)
+
+    # 2) Create a Webpage whose metadata.markdown_path points at the cache
+    #    (path relative to app_dir).
+    make_node(ops_db, node_id="w1", name="demo",
+              type_ref="Webpage", metadata={"markdown_path": str(rel)})
+
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="w1", node_name="demo", node_type="Webpage",
+                   metadata={"markdown_path": str(rel)})
+
+    rc, out, err = run_enricher("extract_links", ctx)
+    assert rc == 0, err
+    assert out is not None, err
+    assert out["entities_added"] >= 3, out
+
+    urls = [e["name"] for e in list_entities(ops_db, type_ref="Url")]
+    assert "https://example.com/articulo" in urls
+    assert "https://otra.example/path?q=1" in urls
+
+    # Every LINKS_TO relation must start at the Webpage node.
+    rels = list_relations(ops_db, name="LINKS_TO")
+    assert len(rels) >= 3
+    assert all(r["from_entity"] == "w1" for r in rels)
+
+
+def test_extract_links_without_markdown_path_errors(ops_db, app_dir,
+                                                     registry_root):
+    """Without metadata.markdown_path the enricher exits non-zero with an error."""
+    make_node(ops_db, node_id="w1", name="demo",
+              type_ref="Webpage", metadata={})
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="w1", node_name="demo", node_type="Webpage")
+    rc, out, err = run_enricher("extract_links", ctx)
+    assert rc != 0, "deberia fallar sin markdown_path"
+    # The failure must still come with a JSON summary naming the cause.
+    assert out is not None
+    assert "missing markdown_path" in (out.get("error") or "")
@@ -0,0 +1,59 @@
+"""Tests del enricher extract_text_entities — regex IoCs sobre markdown."""
+from __future__ import annotations
+
+from pathlib import Path
+
+from conftest import (
+    base_ctx, list_entities, list_relations, make_node, run_enricher,
+)
+
+
+# Text with several IoCs detectable by extract_iocs (pure regex).
+SAMPLE_MD = """# Reporte
+
+Indicators:
+  - Email: bad@evil.example y otra@victim.example
+  - IP:    192.0.2.55
+  - CVE:   CVE-2024-12345
+  - Hash:  44d88612fea8a8f36de82e1278abb02f
+"""
+
+
+def test_extract_iocs_creates_typed_entities(ops_db, app_dir, registry_root):
+    """IoCs in the cached markdown become typed entities linked to the page."""
+    # Write the markdown into the app cache; the node references it by a
+    # path relative to app_dir.
+    md_dir = Path(app_dir) / "cache" / "cd"
+    md_dir.mkdir(parents=True, exist_ok=True)
+    md_path = md_dir / "ddd.md"
+    md_path.write_text(SAMPLE_MD, encoding="utf-8")
+    rel = md_path.relative_to(app_dir)
+
+    make_node(ops_db, node_id="w1", name="report",
+              type_ref="Webpage", metadata={"markdown_path": str(rel)})
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="w1", node_name="report", node_type="Webpage",
+                   metadata={"markdown_path": str(rel)})
+
+    rc, out, err = run_enricher("extract_text_entities", ctx)
+    assert rc == 0, err
+    assert out is not None
+    assert out["entities_added"] >= 3, out
+
+    types = {e["type_ref"] for e in list_entities(ops_db)
+             if e["type_ref"] != "Webpage"}
+    # Not every type is required (it depends on which patterns extract_iocs
+    # covers), but at least Email and CVE should be present.
+    assert "Email" in types, types
+    assert "CVE" in types, types
+
+    # Every EXTRACTED_FROM relation must point at the Webpage node.
+    rels = list_relations(ops_db, name="EXTRACTED_FROM")
+    assert len(rels) >= 3
+    assert all(r["to_entity"] == "w1" for r in rels)
+
+
+def test_extract_iocs_without_markdown_errors(ops_db, app_dir, registry_root):
+    """Without metadata.markdown_path the enricher exits non-zero with an error."""
+    make_node(ops_db, node_id="w1", name="empty",
+              type_ref="Webpage", metadata={})
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="w1", node_name="empty", node_type="Webpage")
+    rc, out, err = run_enricher("extract_text_entities", ctx)
+    assert rc != 0
+    # A JSON summary naming the cause is still expected on failure.
+    assert out and "missing markdown_path" in (out.get("error") or "")
@@ -0,0 +1,77 @@
+"""Tests del enricher fetch_webpage con red mockeada via stub de requests."""
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from conftest import (
+    base_ctx, get_entity, list_entities, list_relations,
+    make_node, run_enricher, stub_requests,
+)
+
+
+# Page body served by the requests stub in the fetch_webpage tests.
+SAMPLE_HTML = """<!DOCTYPE html>
+<html><head><title>Acme Demo</title></head>
+<body>
+  <h1>Hola</h1>
+  <p>Esta es la pagina de prueba con un <a href="/x">enlace</a>.</p>
+  <p>Email de contacto: ops@acme.example</p>
+</body></html>
+"""
+
+
+def test_fetch_webpage_creates_domain_and_caches(ops_db, app_dir, registry_root,
+                                                  tmp_path):
+    """A 200 response promotes the Url to Webpage, caches it and adds a Domain."""
+    make_node(ops_db, node_id="u1", name="acme",
+              type_ref="Url", metadata={"url": "https://www.acme.example/"})
+    # Every request gets the sample page, whatever the URL.
+    plan = {
+        "default": {"text": SAMPLE_HTML, "status": 200,
+                    "headers": {"Content-Type": "text/html; charset=utf-8"}},
+    }
+    env = stub_requests(tmp_path, plan)
+
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="u1", node_name="acme", node_type="Url",
+                   metadata={"url": "https://www.acme.example/"})
+
+    rc, out, err = run_enricher("fetch_webpage", ctx, env=env)
+    assert rc == 0, f"stderr={err}"
+    assert out is not None, err
+    assert out["status_code"] == 200
+    assert out["title"] == "Acme Demo"
+    assert out["entities_added"] == 1   # Domain
+    assert out["relations_added"] == 1  # BELONGS_TO
+
+    # The Url node is promoted to Webpage.
+    e = get_entity(ops_db, "u1")
+    assert e["type_ref"] == "Webpage", e
+    assert e["metadata"]["title"] == "Acme Demo"
+    assert e["metadata"]["status_code"] == 200
+
+    # The cache file exists (html_path is resolved against app_dir).
+    html_path = Path(app_dir) / e["metadata"]["html_path"]
+    assert html_path.exists()
+    assert "Acme Demo" in html_path.read_text(encoding="utf-8")
+
+    # Domain created together with its relation.
+    domains = list_entities(ops_db, type_ref="Domain")
+    assert any(d["name"] == "www.acme.example" for d in domains)
+    rels = list_relations(ops_db, name="BELONGS_TO")
+    assert len(rels) == 1
+
+
+def test_fetch_webpage_handles_http_error(ops_db, app_dir, registry_root,
+                                           tmp_path):
+    """A 404 from the server is reported in the summary, not as a failure."""
+    make_node(ops_db, node_id="u1", name="bad",
+              type_ref="Url", metadata={"url": "https://no.example/"})
+    plan = {"default": {"text": "<html></html>", "status": 404}}
+    env = stub_requests(tmp_path, plan)
+
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="u1", node_name="bad", node_type="Url",
+                   metadata={"url": "https://no.example/"})
+
+    rc, out, err = run_enricher("fetch_webpage", ctx, env=env)
+    # 404 is a valid response: exit 0 with status_code in the summary.
+    assert rc == 0, err
+    assert out["status_code"] == 404
@@ -0,0 +1,72 @@
+"""Sanity check de los manifests YAML de todos los enrichers.
+
+Confirma que el set actual cubre los tipos esperados y que cada manifest
+tiene los campos que `enrichers.cpp` necesita parsear (id, applies_to).
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+from conftest import ENRICHERS_DIR
+
+
+# Enricher directory names that must exist under ENRICHERS_DIR.
+EXPECTED_IDS = {
+    "extract_domain",
+    "extract_links",
+    "extract_text_entities",
+    "fetch_webpage",
+    "web_search",
+}
+
+
+def _parse_simple_yaml(text: str) -> dict:
+    """Ad-hoc flat `key: value` parser, intended to mirror enrichers.cpp.
+
+    Blank lines and `#` comment lines are dropped. Every other line that
+    contains a colon is split at the first one, and a value wrapped in a
+    matching pair of quotes loses them. Indented lines that follow a
+    valueless `params` key are skipped until the next unindented line.
+    """
+    parsed: dict = {}
+    skipping_params = False
+    for raw_line in text.splitlines():
+        line = raw_line.rstrip("\r")
+        stripped = line.strip()
+        if stripped == "" or stripped[0] == "#":
+            continue
+        if not line[0].isspace():
+            # An unindented line ends any skipped `params` section.
+            skipping_params = False
+        if skipping_params or ":" not in stripped:
+            continue
+        head, _, tail = stripped.partition(":")
+        key, value = head.strip(), tail.strip()
+        if value and value[0] in "\"'" and value.endswith(value[0]):
+            value = value[1:-1]
+        if key == "params" and value == "":
+            skipping_params = True
+        parsed[key] = value
+    return parsed
+
+
+def test_all_expected_enrichers_present():
+    """Every id in EXPECTED_IDS has a directory under ENRICHERS_DIR."""
+    found = {p.name for p in ENRICHERS_DIR.iterdir() if p.is_dir()}
+    # Extra directories are tolerated; only missing ones fail the test.
+    missing = EXPECTED_IDS - found
+    assert not missing, f"faltan enrichers: {missing}"
+
+
+def test_each_manifest_has_required_fields():
+    """Each enricher dir has manifest.yaml + run.py with the required keys."""
+    for d in ENRICHERS_DIR.iterdir():
+        if not d.is_dir():
+            continue
+        manifest = d / "manifest.yaml"
+        runpy    = d / "run.py"
+        assert manifest.exists(), f"falta manifest: {d.name}"
+        assert runpy.exists(),    f"falta run.py:  {d.name}"
+        m = _parse_simple_yaml(manifest.read_text(encoding="utf-8"))
+        # The manifest id must equal the directory name.
+        assert m.get("id") == d.name, f"id no coincide con dir: {d.name}"
+        assert m.get("applies_to"), f"sin applies_to: {d.name}"
+        assert m.get("description"), f"sin description: {d.name}"
+
+
+def test_web_search_applies_to_text():
+    """web_search's manifest must mention `text` in its applies_to field."""
+    manifest = ENRICHERS_DIR / "web_search" / "manifest.yaml"
+    # Decode as UTF-8 explicitly, like the other manifest test, so the result
+    # does not depend on the platform's default encoding.
+    m = _parse_simple_yaml(manifest.read_text(encoding="utf-8"))
+    assert "text" in m["applies_to"].lower()
@@ -0,0 +1,97 @@
+"""Tests del enricher web_search (DuckDuckGo HTML)."""
+from __future__ import annotations
+
+from pathlib import Path
+
+from conftest import (
+    base_ctx, list_entities, list_relations, make_node, run_enricher,
+    stub_requests, TESTS_DIR,
+)
+
+
+# Canned DuckDuckGo HTML results page that the tests feed to the requests stub.
+DDG_FIXTURE = TESTS_DIR / "fixtures" / "ddg_results.html"
+
+
+def test_web_search_creates_url_results_for_text_node(
+        ops_db, app_dir, registry_root, tmp_path):
+    """Each organic result becomes a Url with SEARCH_RESULT_OF to the node."""
+    make_node(ops_db, node_id="t1", name="tomate",
+              type_ref="text", metadata={})
+    # Only DuckDuckGo requests get the fixture; anything else is a 404.
+    plan = {
+        "match": [
+            {"contains": "duckduckgo.com",
+             "text": DDG_FIXTURE.read_text(encoding="utf-8"),
+             "status": 200},
+        ],
+        "default": {"text": "", "status": 404},
+    }
+    env = stub_requests(tmp_path, plan)
+
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="t1", node_name="tomate", node_type="text",
+                   params={"limit": 5})
+
+    rc, out, err = run_enricher("web_search", ctx, env=env)
+    assert rc == 0, f"stderr={err}"
+    assert out is not None, err
+    assert out["engine"] == "duckduckgo"
+    # Three results are expected even though the limit is 5.
+    assert out["results"] == 3, out
+    assert out["entities_added"] == 3
+    assert out["relations_added"] == 3
+
+    # Stored URLs are the decoded targets, not the duckduckgo.com redirects.
+    urls = list_entities(ops_db, type_ref="Url")
+    targets = {e["metadata"].get("url") for e in urls}
+    assert "https://es.wikipedia.org/wiki/Tomate" in targets
+    assert "https://www.botanical-online.com/alimentos/tomate-propiedades" in targets
+
+    rels = list_relations(ops_db, name="SEARCH_RESULT_OF")
+    assert len(rels) == 3
+    assert all(r["to_entity"] == "t1" for r in rels)
+
+    # Enriched metadata on the result node.
+    wiki = next(e for e in urls
+                if e["metadata"].get("url") == "https://es.wikipedia.org/wiki/Tomate")
+    assert wiki["metadata"]["query"] == "tomate"
+    assert wiki["metadata"]["rank"] == 1
+    assert "Wikipedia" in wiki["metadata"]["title"]
+
+
+def test_web_search_uses_metadata_query_over_name(ops_db, app_dir,
+                                                    registry_root, tmp_path):
+    """metadata.query must take priority over node_name."""
+    make_node(ops_db, node_id="t1", name="placeholder",
+              type_ref="text", metadata={"query": "tomate"})
+    plan = {"match": [{"contains": "duckduckgo.com",
+                       "text": DDG_FIXTURE.read_text(encoding="utf-8")}]}
+    env = stub_requests(tmp_path, plan)
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="t1", node_name="placeholder", node_type="text",
+                   metadata={"query": "tomate"})
+    rc, out, err = run_enricher("web_search", ctx, env=env)
+    assert rc == 0, err
+    # The summary must echo the metadata query, not the node name.
+    assert out["query"] == "tomate"
+
+
+def test_web_search_limit_truncates_results(ops_db, app_dir, registry_root,
+                                              tmp_path):
+    """params.limit caps how many results are turned into entities."""
+    make_node(ops_db, node_id="t1", name="tomate", type_ref="text")
+    plan = {"match": [{"contains": "duckduckgo.com",
+                       "text": DDG_FIXTURE.read_text(encoding="utf-8")}]}
+    env = stub_requests(tmp_path, plan)
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="t1", node_name="tomate", node_type="text",
+                   params={"limit": 1})
+    rc, out, err = run_enricher("web_search", ctx, env=env)
+    assert rc == 0, err
+    # With limit=1 only a single result and entity are expected.
+    assert out["results"] == 1
+    assert out["entities_added"] == 1
+
+
+def test_web_search_no_query_fails_clean(ops_db, app_dir, registry_root,
+                                           tmp_path):
+    """An empty node name and no metadata.query must exit with code 2."""
+    make_node(ops_db, node_id="t1", name="", type_ref="text", metadata={})
+    env = stub_requests(tmp_path, {"default": {"text": "", "status": 200}})
+    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
+                   node_id="t1", node_name="", node_type="text")
+    rc, out, err = run_enricher("web_search", ctx, env=env)
+    assert rc == 2
+    # The reason is expected on stderr.
+    assert "sin query" in err