feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,89 @@
|
||||
"""Minimal `requests` stub for enricher tests.

Reads the response plan from `_STUB_REQUESTS_PLAN` (an environment variable
holding the path to a JSON file). Supports multiple responses keyed by HTTP
method or by a substring of the URL — the first match wins.

Plan format:
    {
      "default": {"text": "<html>...</html>", "status": 200,
                  "headers": {"Content-Type": "text/html; charset=utf-8"}},
      "match": [
        {"contains": "duckduckgo.com", "text": "...", "status": 200},
        {"method": "GET", "contains": "example.com", "text": "..."}
      ]
    }
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
class Response:
    """Small stand-in for a `requests` response object.

    Only the attributes and methods defined here are provided: `text`,
    `status_code`, `headers`, `url`, `encoding`, `content`, `json()` and
    `raise_for_status()`.
    """

    def __init__(self, text: str = "", status_code: int = 200,
                 headers: dict | None = None, url: str = "",
                 encoding: str = "utf-8") -> None:
        # A missing (or empty) headers mapping falls back to an HTML content type.
        fallback_headers = {"Content-Type": "text/html; charset=utf-8"}
        self.text = text
        self.status_code = status_code
        self.headers = headers if headers else fallback_headers
        self.url = url
        self.encoding = encoding
        # Characters that cannot be encoded are replaced instead of raising.
        self.content = text.encode(encoding, errors="replace")

    def json(self):
        """Decode the body text as JSON and return the resulting object."""
        decoded = json.loads(self.text)
        return decoded

    def raise_for_status(self):
        """Raise RuntimeError for status codes of 400 and above."""
        if self.status_code < 400:
            return
        raise RuntimeError(f"HTTP {self.status_code}")
|
||||
|
||||
|
||||
def _load_plan() -> dict:
|
||||
p = os.environ.get("_STUB_REQUESTS_PLAN")
|
||||
if not p or not os.path.exists(p):
|
||||
return {}
|
||||
with open(p, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def _response_from(spec: dict, url: str) -> Response:
    """Build a Response from one plan entry (a `match` item or `default`)."""
    return Response(
        text=spec.get("text", ""),
        status_code=int(spec.get("status", 200)),
        headers=spec.get("headers"),
        url=url,
    )


def _resolve(method: str, url: str) -> Response:
    """Pick the planned response for a request.

    Entries under `match` are tried in order and the first one that applies
    wins. An entry applies when:
      - its `method` (if present) equals the request method, compared
        case-insensitively, and
      - its `contains` (if present and non-empty) occurs in `url`.
    An entry that only names a `method` matches every URL for that method.
    An entry with neither criterion is ignored. When nothing applies, the
    `default` entry (or an empty 200 response) is used.

    Args:
        method: HTTP verb of the request, e.g. "GET".
        url: Requested URL; it is echoed back on the returned Response.
    """
    plan = _load_plan()
    wanted_method = method.upper()
    for entry in plan.get("match", []):
        has_method = "method" in entry
        if has_method and entry["method"].upper() != wanted_method:
            continue
        needle = entry.get("contains") or ""
        if needle:
            if needle not in url:
                continue
        elif not has_method:
            # No criteria at all: such an entry never matched before either;
            # catch-all behavior belongs in "default".
            continue
        return _response_from(entry, url)
    return _response_from(plan.get("default") or {}, url)
|
||||
|
||||
|
||||
def get(url, *args, **kwargs):
    """Stub for `requests.get`: return the planned response for a GET to `url`.

    Extra positional and keyword arguments are accepted and ignored.
    """
    planned = _resolve("GET", url)
    return planned
|
||||
|
||||
|
||||
def post(url, *args, **kwargs):
    """Stub for `requests.post`: return the planned response for a POST to `url`.

    Extra positional and keyword arguments are accepted and ignored.
    """
    planned = _resolve("POST", url)
    return planned
|
||||
|
||||
|
||||
# Compatibility with `requests.exceptions.RequestException` in case an
# enricher imports it in the future.
class RequestException(Exception):
    """Error type exposed by the stub under the same name `requests` uses."""
|
||||
|
||||
|
||||
class exceptions:  # noqa: N801
    """Lower-case namespace so `requests.exceptions.<Name>` lookups resolve."""

    # Every name below is bound to the stub's single RequestException class.
    RequestException = RequestException
    ConnectionError = RequestException
    Timeout = RequestException
|
||||
@@ -0,0 +1,237 @@
|
||||
"""Shared fixtures and helpers for the graph_explorer enricher tests.

Fixtures a test can request:
  - `ops_db`: path to an operations.db with a minimal schema in a tmp dir
  - `app_dir`: tmp dir that acts as app_dir (cache_dir = <app_dir>/cache)
  - `registry_root`: absolute path of the registry (for imports in run.py)

Helpers that test modules import explicitly from this file:
  - `run_enricher(enricher_id, ctx, *, env=None, timeout=30)`: invokes
    run.py via subprocess with the same wire protocol as jobs.cpp.
  - `base_ctx`, `make_node`, `get_entity`, `list_entities`,
    `list_relations`, `stub_requests`.

The schema is replicated from `fn_operations/project_template/operations.db` —
only the columns the enrichers use. If fn_operations changes its schema,
this conftest has to be updated to match.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Layout anchors, all derived from the location of this file.
_THIS_FILE = Path(__file__).resolve()
REGISTRY_ROOT = _THIS_FILE.parents[5]
APP_DIR_SRC = _THIS_FILE.parents[1]  # graph_explorer/
ENRICHERS_DIR = APP_DIR_SRC / "enrichers"
TESTS_DIR = _THIS_FILE.parent
STUBS_DIR = TESTS_DIR / "_stubs"


def _find_python(registry_root: Path) -> Path:
    """Return the interpreter used to launch enrichers.

    Prefers the registry virtualenv (POSIX layout first, then the Windows
    layout). When neither exists, falls back to the interpreter running
    pytest instead of pointing at a path that is not there.
    """
    venv = registry_root / "python" / ".venv"
    candidates = (
        venv / "bin" / "python3",
        venv / "Scripts" / "python.exe",
    )
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return Path(sys.executable)


PYTHON_BIN = _find_python(REGISTRY_ROOT)
|
||||
|
||||
|
||||
def stub_requests(tmp_path: Path, plan: dict) -> dict:
    """Write the response plan and return the env vars that enable the stub.

    The stub lives in tests/_stubs/requests.py and is activated by putting
    that directory first on PYTHONPATH. `plan` accepts `default` and/or
    `match` (a list of {contains, status, text} entries).

    Args:
        tmp_path: Directory where the plan file `_stub_plan.json` is written.
        plan: JSON-serializable response plan.

    Returns:
        A dict with `PYTHONPATH` and `_STUB_REQUESTS_PLAN`, meant to be
        merged into the environment of the enricher subprocess.
    """
    plan_file = tmp_path / "_stub_plan.json"
    plan_file.write_text(json.dumps(plan), encoding="utf-8")

    # Only append the inherited PYTHONPATH when there is one; otherwise the
    # value would end with a separator, i.e. contain an empty path entry.
    search_path = [str(STUBS_DIR)]
    inherited = os.environ.get("PYTHONPATH", "")
    if inherited:
        search_path.append(inherited)

    return {
        "PYTHONPATH": os.pathsep.join(search_path),
        "_STUB_REQUESTS_PLAN": str(plan_file),
    }
|
||||
|
||||
|
||||
SCHEMA_SQL = """
|
||||
CREATE TABLE entities (
|
||||
id TEXT PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
type_ref TEXT NOT NULL,
|
||||
status TEXT NOT NULL DEFAULT 'active',
|
||||
description TEXT NOT NULL DEFAULT '',
|
||||
domain TEXT NOT NULL DEFAULT '',
|
||||
tags TEXT NOT NULL DEFAULT '[]',
|
||||
source TEXT NOT NULL,
|
||||
metadata TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
created_at TEXT NOT NULL,
|
||||
updated_at TEXT NOT NULL
|
||||
);
|
||||
CREATE TABLE relations (
|
||||
id TEXT PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
from_entity TEXT NOT NULL DEFAULT '',
|
||||
to_entity TEXT NOT NULL,
|
||||
via TEXT NOT NULL DEFAULT '',
|
||||
description TEXT NOT NULL DEFAULT '',
|
||||
purity TEXT NOT NULL DEFAULT '',
|
||||
direction TEXT NOT NULL DEFAULT 'unidirectional',
|
||||
weight REAL,
|
||||
status TEXT NOT NULL DEFAULT 'designed',
|
||||
started_at TEXT,
|
||||
ended_at TEXT,
|
||||
"order" INTEGER,
|
||||
tags TEXT NOT NULL DEFAULT '[]',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
created_at TEXT NOT NULL,
|
||||
updated_at TEXT NOT NULL
|
||||
);
|
||||
"""
|
||||
|
||||
|
||||
@pytest.fixture
def ops_db(tmp_path):
    """Create an empty operations.db with the minimal schema and return its path."""
    db_file = tmp_path / "operations.db"
    connection = sqlite3.connect(db_file)
    connection.executescript(SCHEMA_SQL)
    connection.commit()
    connection.close()
    return db_file
|
||||
|
||||
|
||||
@pytest.fixture
def app_dir(tmp_path):
    """Create `<tmp_path>/app` with a `cache` subdirectory and return the app path."""
    root = tmp_path / "app"
    cache = root / "cache"
    root.mkdir()
    cache.mkdir()
    return root
|
||||
|
||||
|
||||
@pytest.fixture
def registry_root():
    """Expose the module-level REGISTRY_ROOT path as a fixture."""
    root = REGISTRY_ROOT
    return root
|
||||
|
||||
|
||||
def make_node(ops_db: Path, *, node_id: str, name: str, type_ref: str,
              metadata: dict | None = None, source: str = "test") -> None:
    """Insert one entity row of an arbitrary type into operations.db.

    Args:
        ops_db: Path of the SQLite database that holds the `entities` table.
        node_id: Value for the `id` column.
        name: Value for the `name` column.
        type_ref: Value for the `type_ref` column.
        metadata: Stored as JSON text; `None` is stored as `{}`.
        source: Value for the `source` column.

    Raises:
        sqlite3.Error: Propagated from the insert (for example
            `sqlite3.IntegrityError` for a duplicate id). The connection is
            closed in that case as well.
    """
    conn = sqlite3.connect(ops_db)
    try:
        conn.execute(
            "INSERT INTO entities (id, name, type_ref, source, metadata, "
            " created_at, updated_at) VALUES (?, ?, ?, ?, ?, "
            " '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')",
            (node_id, name, type_ref, source,
             json.dumps(metadata or {}, ensure_ascii=False)),
        )
        conn.commit()
    finally:
        # Close even when the insert fails so the database file is not left
        # open behind a failing test.
        conn.close()
|
||||
|
||||
|
||||
def get_entity(ops_db: Path, entity_id: str) -> dict | None:
    """Fetch one entity by id, or None when no row has that id.

    The `metadata` column is decoded from JSON; empty or undecodable
    metadata yields an empty dict.
    """
    query = ("SELECT id, name, type_ref, source, metadata "
             "FROM entities WHERE id=?")
    conn = sqlite3.connect(ops_db)
    try:
        row = conn.execute(query, (entity_id,)).fetchone()
    finally:
        conn.close()
    if not row:
        return None

    ident, name, type_ref, source, raw_metadata = row
    try:
        metadata = json.loads(raw_metadata) if raw_metadata else {}
    except Exception:
        metadata = {}
    return {"id": ident, "name": name, "type_ref": type_ref,
            "source": source, "metadata": metadata}
|
||||
|
||||
|
||||
def list_entities(ops_db: Path, type_ref: str | None = None) -> list[dict]:
    """Return entities ordered by id, optionally restricted to one `type_ref`.

    Each item has `id`, `name`, `type_ref`, `source` and `metadata`;
    metadata is decoded from JSON and becomes `{}` when empty or invalid.
    """
    sql = "SELECT id, name, type_ref, source, metadata FROM entities"
    args: tuple = ()
    if type_ref:
        sql += " WHERE type_ref=?"
        args = (type_ref,)
    sql += " ORDER BY id"

    conn = sqlite3.connect(ops_db)
    try:
        rows = conn.execute(sql, args).fetchall()
    finally:
        conn.close()

    def _decode(raw):
        # Best effort: unreadable metadata is reported as an empty mapping.
        try:
            return json.loads(raw) if raw else {}
        except Exception:
            return {}

    return [
        {"id": ident, "name": name, "type_ref": kind,
         "source": source, "metadata": _decode(raw)}
        for ident, name, kind, source, raw in rows
    ]
|
||||
|
||||
|
||||
def list_relations(ops_db: Path, name: str | None = None) -> list[dict]:
    """Return relations ordered by id, optionally restricted to one `name`.

    Each item has `id`, `name`, `from_entity` and `to_entity`.
    """
    sql = "SELECT id, name, from_entity, to_entity FROM relations "
    args: tuple = ()
    if name:
        sql += "WHERE name=? "
        args = (name,)
    sql += "ORDER BY id"

    conn = sqlite3.connect(ops_db)
    try:
        rows = conn.execute(sql, args).fetchall()
    finally:
        conn.close()

    columns = ("id", "name", "from_entity", "to_entity")
    return [dict(zip(columns, row)) for row in rows]
|
||||
|
||||
|
||||
def run_enricher(enricher_id: str, ctx: dict, *, env: dict | None = None,
                 timeout: int = 30) -> tuple[int, dict | None, str]:
    """Run enrichers/<enricher_id>/run.py, feeding `ctx` as JSON on stdin.

    Args:
        enricher_id: Directory name of the enricher under ENRICHERS_DIR.
        ctx: Context object serialized to JSON and written to stdin.
        env: Extra environment variables layered over the current environment.
        timeout: Seconds passed to `subprocess.run`.

    Returns:
        (exit_code, summary, stderr_text), where `summary` is the last
        non-blank stdout line parsed as JSON, or None when stdout is blank
        or that line is not valid JSON.
    """
    script = ENRICHERS_DIR / enricher_id / "run.py"
    assert script.exists(), f"no existe {script}"

    child_env = os.environ.copy()
    child_env.update(env or {})

    completed = subprocess.run(
        [str(PYTHON_BIN), str(script)],
        input=json.dumps(ctx),
        capture_output=True,
        text=True,
        timeout=timeout,
        env=child_env,
    )

    # Only the last non-blank line is considered; earlier lines are ignored.
    stripped_lines = [ln.strip() for ln in completed.stdout.splitlines()]
    last_line = next((ln for ln in reversed(stripped_lines) if ln), None)

    summary: dict | None = None
    if last_line is not None:
        try:
            summary = json.loads(last_line)
        except Exception:
            summary = None
    return completed.returncode, summary, completed.stderr
|
||||
|
||||
|
||||
def base_ctx(*, ops_db, app_dir, registry_root, node_id, node_name,
             node_type, metadata=None, params=None) -> dict:
    """Build the context dict the tests send to an enricher on stdin.

    Path-like arguments are converted to strings; `cache_dir` is derived as
    `<app_dir>/cache`. Missing (or empty) `metadata` / `params` become `{}`.
    """
    cache_dir = Path(app_dir).joinpath("cache")
    ctx: dict = {}
    ctx["node_id"] = node_id
    ctx["node_name"] = node_name
    ctx["node_type"] = node_type
    ctx["metadata"] = metadata if metadata else {}
    ctx["ops_db_path"] = str(ops_db)
    ctx["app_dir"] = str(app_dir)
    ctx["cache_dir"] = str(cache_dir)
    ctx["registry_root"] = str(registry_root)
    ctx["params"] = params if params else {}
    return ctx
|
||||
Vendored
+22
@@ -0,0 +1,22 @@
|
||||
<!DOCTYPE html>
|
||||
<html><head><title>tomate at DuckDuckGo</title></head>
|
||||
<body>
|
||||
<div class="serp__results">
|
||||
<div class="result">
|
||||
<a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fes.wikipedia.org%2Fwiki%2FTomate&rut=abc">Tomate - Wikipedia, la enciclopedia libre</a>
|
||||
<a class="result__snippet" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fes.wikipedia.org%2Fwiki%2FTomate">El tomate es el fruto comestible de la planta Solanum lycopersicum, una especie de la familia de las solanaceas.</a>
|
||||
</div>
|
||||
<div class="result">
|
||||
<a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.botanical-online.com%2Falimentos%2Ftomate-propiedades&rut=def">Tomate: propiedades y beneficios</a>
|
||||
<a class="result__snippet" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.botanical-online.com%2Falimentos%2Ftomate-propiedades">Propiedades del tomate, beneficios para la salud y composicion nutricional.</a>
|
||||
</div>
|
||||
<div class="result">
|
||||
<a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.recetasgratis.net%2Fbusqueda%2Ftomate&rut=ghi">Recetas con tomate - RecetasGratis</a>
|
||||
<a class="result__snippet" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.recetasgratis.net%2Fbusqueda%2Ftomate">Encuentra las mejores recetas con tomate paso a paso.</a>
|
||||
</div>
|
||||
<div class="result result--ad">
|
||||
<!-- anuncio sin titulo, no debe contar -->
|
||||
<a href="https://ad.doubleclick.net/x">ad</a>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>
|
||||
@@ -0,0 +1,60 @@
|
||||
"""Tests del enricher extract_domain.
|
||||
|
||||
Pure regex/parsing — sin red. Verifica:
|
||||
- Url con metadata.url crea Domain + BELONGS_TO
|
||||
- Email crea Domain (desde la parte derecha del @)
|
||||
- Si el Domain ya existe se reusa, no se duplica
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from conftest import (
|
||||
base_ctx, get_entity, list_entities, list_relations,
|
||||
make_node, run_enricher,
|
||||
)
|
||||
|
||||
|
||||
def test_url_creates_domain_and_relation(ops_db, app_dir, registry_root):
    """Check that a Url node yields a Domain entity and one BELONGS_TO relation."""
    url_metadata = {"url": "https://www.example.com/path"}
    make_node(ops_db, node_id="u1", name="ex", type_ref="Url",
              metadata=url_metadata)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="u1", node_name="ex", node_type="Url",
                   metadata=url_metadata)

    exit_code, summary, stderr = run_enricher("extract_domain", ctx)
    assert exit_code == 0, stderr
    assert summary and summary.get("entities_added", 0) >= 1, summary

    domains = list_entities(ops_db, type_ref="Domain")
    domain_names = [d["name"] for d in domains]
    assert "www.example.com" in domain_names, domains

    belongs_to = list_relations(ops_db, name="BELONGS_TO")
    assert len(belongs_to) == 1
    assert belongs_to[0]["from_entity"] == "u1"
|
||||
|
||||
|
||||
def test_email_creates_domain(ops_db, app_dir, registry_root):
    """Check that an Email node yields a Domain named after the part right of '@'."""
    address = "user@aurgi.com"
    make_node(ops_db, node_id="e1", name=address, type_ref="Email",
              metadata={"address": address})
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="e1", node_name=address, node_type="Email")

    exit_code, _summary, stderr = run_enricher("extract_domain", ctx)
    assert exit_code == 0, stderr

    domains = list_entities(ops_db, type_ref="Domain")
    domain_names = [d["name"] for d in domains]
    assert "aurgi.com" in domain_names, domains
|
||||
|
||||
|
||||
def test_existing_domain_is_reused(ops_db, app_dir, registry_root):
    """Check that a pre-existing Domain with the same name is not duplicated."""
    # Seed a Domain whose name equals the host of the Url below.
    make_node(ops_db, node_id="d1", name="example.com", type_ref="Domain",
              metadata={})
    url_metadata = {"url": "https://example.com/x"}
    make_node(ops_db, node_id="u1", name="ex", type_ref="Url",
              metadata=url_metadata)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="u1", node_name="ex", node_type="Url",
                   metadata=url_metadata)

    exit_code, _summary, stderr = run_enricher("extract_domain", ctx)
    assert exit_code == 0, stderr

    domains = list_entities(ops_db, type_ref="Domain")
    occurrences = sum(1 for d in domains if d["name"] == "example.com")
    assert occurrences == 1, domains
|
||||
@@ -0,0 +1,63 @@
|
||||
"""Tests del enricher extract_links — sin red, lee markdown del cache."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from conftest import (
|
||||
base_ctx, list_entities, list_relations, make_node, run_enricher,
|
||||
)
|
||||
|
||||
|
||||
SAMPLE_MD = """# Pagina demo
|
||||
|
||||
Aqui hay [un enlace](https://example.com/articulo) interesante y
|
||||
otro [duplicado](https://example.com/articulo) que no debe contar
|
||||
dos veces.
|
||||
|
||||
Tambien una URL pelada: https://otra.example/path?q=1
|
||||
y https://tercera.example/
|
||||
|
||||
Y un email que NO debe extraer como Url: contact@no.example
|
||||
"""
|
||||
|
||||
|
||||
def test_extract_links_creates_url_nodes(ops_db, app_dir, registry_root):
    """Check that links found in the cached markdown become Url nodes with LINKS_TO."""
    # 1) Put the markdown into the cache.
    cache_bucket = Path(app_dir) / "cache" / "ab"
    cache_bucket.mkdir(parents=True, exist_ok=True)
    markdown_file = cache_bucket / "abc.md"
    markdown_file.write_text(SAMPLE_MD, encoding="utf-8")
    page_metadata = {"markdown_path": str(markdown_file.relative_to(app_dir))}

    # 2) Create the Webpage whose metadata.markdown_path points into the cache.
    make_node(ops_db, node_id="w1", name="demo", type_ref="Webpage",
              metadata=page_metadata)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="w1", node_name="demo", node_type="Webpage",
                   metadata=page_metadata)

    exit_code, summary, stderr = run_enricher("extract_links", ctx)
    assert exit_code == 0, stderr
    assert summary is not None, stderr
    assert summary["entities_added"] >= 3, summary

    url_names = [entity["name"] for entity in list_entities(ops_db, type_ref="Url")]
    assert "https://example.com/articulo" in url_names
    assert "https://otra.example/path?q=1" in url_names

    links = list_relations(ops_db, name="LINKS_TO")
    assert len(links) >= 3
    for link in links:
        assert link["from_entity"] == "w1"
|
||||
|
||||
|
||||
def test_extract_links_without_markdown_path_errors(ops_db, app_dir,
                                                    registry_root):
    """Check that a Webpage without markdown_path makes the enricher fail with an error."""
    make_node(ops_db, node_id="w1", name="demo", type_ref="Webpage",
              metadata={})
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="w1", node_name="demo", node_type="Webpage")

    exit_code, summary, _stderr = run_enricher("extract_links", ctx)
    assert exit_code != 0, "deberia fallar sin markdown_path"
    assert summary is not None
    error_text = summary.get("error") or ""
    assert "missing markdown_path" in error_text
|
||||
@@ -0,0 +1,59 @@
|
||||
"""Tests del enricher extract_text_entities — regex IoCs sobre markdown."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from conftest import (
|
||||
base_ctx, list_entities, list_relations, make_node, run_enricher,
|
||||
)
|
||||
|
||||
|
||||
# Text with several IoCs detectable by extract_iocs (pure regex).
SAMPLE_MD = """# Reporte

Indicators:
- Email: bad@evil.example y otra@victim.example
- IP: 192.0.2.55
- CVE: CVE-2024-12345
- Hash: 44d88612fea8a8f36de82e1278abb02f
"""
|
||||
|
||||
|
||||
def test_extract_iocs_creates_typed_entities(ops_db, app_dir, registry_root):
    """Check that IoCs in the cached markdown become typed entities linked to the page."""
    cache_bucket = Path(app_dir) / "cache" / "cd"
    cache_bucket.mkdir(parents=True, exist_ok=True)
    markdown_file = cache_bucket / "ddd.md"
    markdown_file.write_text(SAMPLE_MD, encoding="utf-8")
    page_metadata = {"markdown_path": str(markdown_file.relative_to(app_dir))}

    make_node(ops_db, node_id="w1", name="report", type_ref="Webpage",
              metadata=page_metadata)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="w1", node_name="report", node_type="Webpage",
                   metadata=page_metadata)

    exit_code, summary, stderr = run_enricher("extract_text_entities", ctx)
    assert exit_code == 0, stderr
    assert summary is not None
    assert summary["entities_added"] >= 3, summary

    found_types = set()
    for entity in list_entities(ops_db):
        if entity["type_ref"] != "Webpage":
            found_types.add(entity["type_ref"])
    # Not every type is required (that depends on which patterns extract_iocs
    # covers), but at least Email and CVE should be present.
    assert "Email" in found_types, found_types
    assert "CVE" in found_types, found_types

    extracted = list_relations(ops_db, name="EXTRACTED_FROM")
    assert len(extracted) >= 3
    for relation in extracted:
        assert relation["to_entity"] == "w1"
|
||||
|
||||
|
||||
def test_extract_iocs_without_markdown_errors(ops_db, app_dir, registry_root):
    """Check that a Webpage without markdown_path makes the enricher fail with an error."""
    make_node(ops_db, node_id="w1", name="empty", type_ref="Webpage",
              metadata={})
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="w1", node_name="empty", node_type="Webpage")

    exit_code, summary, _stderr = run_enricher("extract_text_entities", ctx)
    assert exit_code != 0
    assert summary and "missing markdown_path" in (summary.get("error") or "")
|
||||
@@ -0,0 +1,77 @@
|
||||
"""Tests del enricher fetch_webpage con red mockeada via stub de requests."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from conftest import (
|
||||
base_ctx, get_entity, list_entities, list_relations,
|
||||
make_node, run_enricher, stub_requests,
|
||||
)
|
||||
|
||||
|
||||
SAMPLE_HTML = """<!DOCTYPE html>
|
||||
<html><head><title>Acme Demo</title></head>
|
||||
<body>
|
||||
<h1>Hola</h1>
|
||||
<p>Esta es la pagina de prueba con un <a href="/x">enlace</a>.</p>
|
||||
<p>Email de contacto: ops@acme.example</p>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
|
||||
def test_fetch_webpage_creates_domain_and_caches(ops_db, app_dir, registry_root,
                                                 tmp_path):
    """Check the success path: summary, Url-to-Webpage promotion, cache file and Domain."""
    url_metadata = {"url": "https://www.acme.example/"}
    make_node(ops_db, node_id="u1", name="acme", type_ref="Url",
              metadata=url_metadata)

    default_response = {
        "text": SAMPLE_HTML,
        "status": 200,
        "headers": {"Content-Type": "text/html; charset=utf-8"},
    }
    stub_env = stub_requests(tmp_path, {"default": default_response})

    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="u1", node_name="acme", node_type="Url",
                   metadata=url_metadata)

    exit_code, summary, stderr = run_enricher("fetch_webpage", ctx, env=stub_env)
    assert exit_code == 0, f"stderr={stderr}"
    assert summary is not None, stderr
    assert summary["status_code"] == 200
    assert summary["title"] == "Acme Demo"
    assert summary["entities_added"] == 1  # Domain
    assert summary["relations_added"] == 1  # BELONGS_TO

    # The Url node is promoted to Webpage.
    page = get_entity(ops_db, "u1")
    assert page["type_ref"] == "Webpage", page
    page_metadata = page["metadata"]
    assert page_metadata["title"] == "Acme Demo"
    assert page_metadata["status_code"] == 200

    # The cached HTML exists.
    cached_html = Path(app_dir) / page_metadata["html_path"]
    assert cached_html.exists()
    assert "Acme Demo" in cached_html.read_text(encoding="utf-8")

    # The Domain was created together with its relation.
    domain_names = [d["name"] for d in list_entities(ops_db, type_ref="Domain")]
    assert "www.acme.example" in domain_names
    belongs_to = list_relations(ops_db, name="BELONGS_TO")
    assert len(belongs_to) == 1
|
||||
|
||||
|
||||
def test_fetch_webpage_handles_http_error(ops_db, app_dir, registry_root,
                                          tmp_path):
    """Check that a 404 from the stub still exits 0 and is reported in the summary."""
    url_metadata = {"url": "https://no.example/"}
    make_node(ops_db, node_id="u1", name="bad", type_ref="Url",
              metadata=url_metadata)
    not_found = {"default": {"text": "<html></html>", "status": 404}}
    stub_env = stub_requests(tmp_path, not_found)

    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="u1", node_name="bad", node_type="Url",
                   metadata=url_metadata)

    exit_code, summary, stderr = run_enricher("fetch_webpage", ctx, env=stub_env)
    # A 404 is a valid response: exit 0 with status_code in the summary.
    assert exit_code == 0, stderr
    assert summary["status_code"] == 404
|
||||
@@ -0,0 +1,72 @@
|
||||
"""Sanity check de los manifests YAML de todos los enrichers.
|
||||
|
||||
Confirma que el set actual cubre los tipos esperados y que cada manifest
|
||||
tiene los campos que `enrichers.cpp` necesita parsear (id, applies_to).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from conftest import ENRICHERS_DIR
|
||||
|
||||
|
||||
# Enricher directory names that test_all_expected_enrichers_present requires
# to exist under ENRICHERS_DIR.
EXPECTED_IDS = {
    "extract_domain",
    "extract_links",
    "extract_text_entities",
    "fetch_webpage",
    "web_search",
}
|
||||
|
||||
|
||||
def _parse_simple_yaml(text: str) -> dict:
|
||||
"""Parser ad-hoc que replica lo que hace enrichers.cpp."""
|
||||
out: dict = {}
|
||||
in_skip = False
|
||||
for raw in text.splitlines():
|
||||
line = raw.rstrip("\r")
|
||||
s = line.strip()
|
||||
if not s or s.startswith("#"):
|
||||
continue
|
||||
indented = line and line[0].isspace()
|
||||
if not indented:
|
||||
in_skip = False
|
||||
if in_skip:
|
||||
continue
|
||||
if ":" not in s:
|
||||
continue
|
||||
key, _, val = s.partition(":")
|
||||
key = key.strip()
|
||||
val = val.strip()
|
||||
if val and val[0] in ('"', "'") and val[-1] == val[0]:
|
||||
val = val[1:-1]
|
||||
if key == "params" and not val:
|
||||
in_skip = True
|
||||
out[key] = val
|
||||
return out
|
||||
|
||||
|
||||
def test_all_expected_enrichers_present():
    """Check that every id in EXPECTED_IDS has a directory under ENRICHERS_DIR."""
    present = set()
    for entry in ENRICHERS_DIR.iterdir():
        if entry.is_dir():
            present.add(entry.name)
    missing = EXPECTED_IDS.difference(present)
    assert not missing, f"faltan enrichers: {missing}"
|
||||
|
||||
|
||||
def test_each_manifest_has_required_fields():
    """Check each enricher directory for manifest.yaml, run.py and the required keys."""
    enricher_dirs = [entry for entry in ENRICHERS_DIR.iterdir() if entry.is_dir()]
    for folder in enricher_dirs:
        manifest_path = folder / "manifest.yaml"
        script_path = folder / "run.py"
        assert manifest_path.exists(), f"falta manifest: {folder.name}"
        assert script_path.exists(), f"falta run.py: {folder.name}"

        fields = _parse_simple_yaml(manifest_path.read_text(encoding="utf-8"))
        assert fields.get("id") == folder.name, f"id no coincide con dir: {folder.name}"
        assert fields.get("applies_to"), f"sin applies_to: {folder.name}"
        assert fields.get("description"), f"sin description: {folder.name}"
|
||||
|
||||
|
||||
def test_web_search_applies_to_text():
    """Check that the web_search manifest lists `text` in applies_to."""
    manifest_path = ENRICHERS_DIR / "web_search" / "manifest.yaml"
    # Read as UTF-8 explicitly, like the other manifest test, so the result
    # does not depend on the platform's default encoding.
    fields = _parse_simple_yaml(manifest_path.read_text(encoding="utf-8"))
    assert "text" in fields["applies_to"].lower()
|
||||
@@ -0,0 +1,97 @@
|
||||
"""Tests del enricher web_search (DuckDuckGo HTML)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from conftest import (
|
||||
base_ctx, list_entities, list_relations, make_node, run_enricher,
|
||||
stub_requests, TESTS_DIR,
|
||||
)
|
||||
|
||||
|
||||
# Saved DuckDuckGo HTML results page that the requests stub serves in these tests.
DDG_FIXTURE = TESTS_DIR.joinpath("fixtures", "ddg_results.html")
|
||||
|
||||
|
||||
def test_web_search_creates_url_results_for_text_node(
        ops_db, app_dir, registry_root, tmp_path):
    """Check that searching a text node creates three Url results linked back to it."""
    make_node(ops_db, node_id="t1", name="tomate", type_ref="text",
              metadata={})

    ddg_entry = {
        "contains": "duckduckgo.com",
        "text": DDG_FIXTURE.read_text(encoding="utf-8"),
        "status": 200,
    }
    plan = {"match": [ddg_entry], "default": {"text": "", "status": 404}}
    stub_env = stub_requests(tmp_path, plan)

    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="tomate", node_type="text",
                   params={"limit": 5})

    exit_code, summary, stderr = run_enricher("web_search", ctx, env=stub_env)
    assert exit_code == 0, f"stderr={stderr}"
    assert summary is not None, stderr
    assert summary["engine"] == "duckduckgo"
    assert summary["results"] == 3, summary
    assert summary["entities_added"] == 3
    assert summary["relations_added"] == 3

    url_nodes = list_entities(ops_db, type_ref="Url")
    wiki_url = "https://es.wikipedia.org/wiki/Tomate"
    result_urls = {node["metadata"].get("url") for node in url_nodes}
    assert wiki_url in result_urls
    assert "https://www.botanical-online.com/alimentos/tomate-propiedades" in result_urls

    search_links = list_relations(ops_db, name="SEARCH_RESULT_OF")
    assert len(search_links) == 3
    for link in search_links:
        assert link["to_entity"] == "t1"

    # Enriched metadata on the first result.
    wiki_node = next(node for node in url_nodes
                     if node["metadata"].get("url") == wiki_url)
    wiki_metadata = wiki_node["metadata"]
    assert wiki_metadata["query"] == "tomate"
    assert wiki_metadata["rank"] == 1
    assert "Wikipedia" in wiki_metadata["title"]
|
||||
|
||||
|
||||
def test_web_search_uses_metadata_query_over_name(ops_db, app_dir,
                                                  registry_root, tmp_path):
    """metadata.query must take priority over node_name."""
    node_metadata = {"query": "tomate"}
    make_node(ops_db, node_id="t1", name="placeholder", type_ref="text",
              metadata=node_metadata)

    ddg_entry = {"contains": "duckduckgo.com",
                 "text": DDG_FIXTURE.read_text(encoding="utf-8")}
    stub_env = stub_requests(tmp_path, {"match": [ddg_entry]})

    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="placeholder", node_type="text",
                   metadata=node_metadata)

    exit_code, summary, stderr = run_enricher("web_search", ctx, env=stub_env)
    assert exit_code == 0, stderr
    assert summary["query"] == "tomate"
|
||||
|
||||
|
||||
def test_web_search_limit_truncates_results(ops_db, app_dir, registry_root,
                                            tmp_path):
    """Check that params.limit=1 yields one result and one new entity."""
    make_node(ops_db, node_id="t1", name="tomate", type_ref="text")

    ddg_entry = {"contains": "duckduckgo.com",
                 "text": DDG_FIXTURE.read_text(encoding="utf-8")}
    stub_env = stub_requests(tmp_path, {"match": [ddg_entry]})

    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="tomate", node_type="text",
                   params={"limit": 1})

    exit_code, summary, stderr = run_enricher("web_search", ctx, env=stub_env)
    assert exit_code == 0, stderr
    assert summary["results"] == 1
    assert summary["entities_added"] == 1
|
||||
|
||||
|
||||
def test_web_search_no_query_fails_clean(ops_db, app_dir, registry_root,
                                         tmp_path):
    """Check that an empty node name and no metadata.query exits 2 with 'sin query'."""
    make_node(ops_db, node_id="t1", name="", type_ref="text", metadata={})
    empty_ok = {"default": {"text": "", "status": 200}}
    stub_env = stub_requests(tmp_path, empty_ok)

    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="", node_type="text")

    exit_code, _summary, stderr = run_enricher("web_search", ctx, env=stub_env)
    assert exit_code == 2
    assert "sin query" in stderr
|
||||
Reference in New Issue
Block a user