Files
graph_explorer/tests/test_web_search.py
T
egutierrez 67f10a8afd feat(0035c): web_search crea Group cuando excede umbral
Cuando un enricher web_search produce >= 50 resultados, los primeros 10
quedan sueltos colgando del source (preview Twitter/Reddit) y los
restantes entran como hijos de un nuevo nodo Group cuadrado.

Cambios:
- enrichers/web_search/run.py:
  - DEFAULT_GROUP_THRESHOLD=50, GROUP_PREVIEW_K=10 (constantes globales).
  - has_group_id_column(): detecta si el schema soporta agrupacion.
  - insert_group_entity(): crea nodo Group con metadata
    {enricher, query, count, batch_id}.
  - insert_url_entity() acepta batch_id y group_id; los inyecta en
    metadata/columna respectivamente. Nodos existentes mantienen su
    group_id actual (no se machaca).
  - Generacion de batch_id (UUID4 hex) por ejecucion, compartido por
    todos los nodos creados (group + sueltos + agrupados).
  - Cada hijo del grupo conserva su relacion individual SEARCH_RESULT_OF
    al source original — la procedencia es la relacion real, no el
    contenedor.
  - El JSON de salida añade batch_id, group_id, grouped.

- tests/conftest.py: añade columna entities.group_id al SCHEMA_SQL y
  expone group_id en list_entities() para que los tests lo verifiquen.

- tests/test_web_search.py: 3 tests nuevos y un helper
  - below_threshold_no_group: 5 resultados → 0 Groups, comportamiento clasico.
  - above_threshold_creates_group_and_preview: 100 resultados → 1 Group +
    10 sueltos + 90 con group_id, todos con SEARCH_RESULT_OF al source.
  - batch_id_shared_across_outputs: group + preview + hijos comparten
    batch_id.
  - _build_lite_html() genera HTML sintetico con N resultados sin
    necesidad de fixture estatico grande.

Tests: 35 passed (32 previos + 3 nuevos) en WSL.
       24 passed + 11 skipped en Windows.

Refs: issues/0035c-web-search-creates-groups.md
2026-05-03 14:52:29 +02:00

239 lines
9.1 KiB
Python

"""Tests del enricher web_search (DuckDuckGo HTML)."""
from __future__ import annotations
from pathlib import Path
from conftest import (
base_ctx, list_entities, list_relations, make_node, run_enricher,
stub_requests, TESTS_DIR,
)
# Path to the static DuckDuckGo results page that the fixture-based tests
# below read and serve as the stubbed response body.
DDG_FIXTURE = TESTS_DIR / "fixtures" / "ddg_results.html"
def test_web_search_creates_url_results_for_text_node(
        ops_db, app_dir, registry_root, tmp_path):
    """Check that web_search turns fixture results into linked Url entities.

    Runs the enricher on a text node named "tomate" with the static
    DuckDuckGo fixture as the stubbed response body, then verifies the JSON
    summary, the stored ``Url`` entities, their ``SEARCH_RESULT_OF``
    relations to the source node, and the metadata of one known result.
    """
    wiki_url = "https://es.wikipedia.org/wiki/Tomate"
    botanical_url = (
        "https://www.botanical-online.com/alimentos/tomate-propiedades")

    make_node(ops_db, node_id="t1", name="tomate",
              type_ref="text", metadata={})
    ddg_reply = {
        "contains": "duckduckgo.com",
        "text": DDG_FIXTURE.read_text(encoding="utf-8"),
        "status": 200,
    }
    env = stub_requests(
        tmp_path,
        {"match": [ddg_reply], "default": {"text": "", "status": 404}},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="tomate",
        node_type="text",
        params={"limit": 5},
    )

    rc, out, err = run_enricher("web_search", ctx, env=env)

    # Summary emitted by the enricher.
    assert rc == 0, f"stderr={err}"
    assert out is not None, err
    assert out["engine"] == "duckduckgo"
    assert out["results"] == 3, out
    assert out["entities_added"] == 3
    assert out["relations_added"] == 3

    # Known URLs from the fixture must be present among the Url entities.
    url_entities = list_entities(ops_db, type_ref="Url")
    stored_urls = {
        entity["metadata"].get("url") for entity in url_entities
    }
    assert wiki_url in stored_urls
    assert botanical_url in stored_urls

    # Every result points back at the source node.
    relations = list_relations(ops_db, name="SEARCH_RESULT_OF")
    assert len(relations) == 3
    assert [rel["to_entity"] for rel in relations] == ["t1"] * len(relations)

    # Enriched metadata on the Wikipedia hit.
    wiki_entity = next(
        entity for entity in url_entities
        if entity["metadata"].get("url") == wiki_url
    )
    wiki_meta = wiki_entity["metadata"]
    assert wiki_meta["query"] == "tomate"
    assert wiki_meta["rank"] == 1
    assert "Wikipedia" in wiki_meta["title"]
def test_web_search_uses_metadata_query_over_name(ops_db, app_dir,
                                                  registry_root, tmp_path):
    """metadata.query must take priority over node_name."""
    node_metadata = {"query": "tomate"}
    make_node(ops_db, node_id="t1", name="placeholder",
              type_ref="text", metadata={"query": "tomate"})

    fixture_html = DDG_FIXTURE.read_text(encoding="utf-8")
    ddg_reply = {"contains": "duckduckgo.com", "text": fixture_html}
    env = stub_requests(tmp_path, {"match": [ddg_reply]})
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="placeholder",
        node_type="text",
        metadata=node_metadata,
    )

    rc, out, err = run_enricher("web_search", ctx, env=env)

    assert rc == 0, err
    # The query comes from metadata, not from the "placeholder" name.
    assert out["query"] == "tomate"
def test_web_search_limit_truncates_results(ops_db, app_dir, registry_root,
                                            tmp_path):
    """With ``limit=1`` only one result is reported and one entity added."""
    make_node(ops_db, node_id="t1", name="tomate", type_ref="text")

    ddg_reply = {
        "contains": "duckduckgo.com",
        "text": DDG_FIXTURE.read_text(encoding="utf-8"),
    }
    env = stub_requests(tmp_path, {"match": [ddg_reply]})
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="tomate",
        node_type="text",
        params={"limit": 1},
    )

    rc, out, err = run_enricher("web_search", ctx, env=env)

    assert rc == 0, err
    for counter in ("results", "entities_added"):
        assert out[counter] == 1
def test_web_search_no_query_fails_clean(ops_db, app_dir, registry_root,
                                         tmp_path):
    """An empty node name and no metadata query yields exit code 2.

    The enricher is expected to report "sin query" on stderr.
    """
    make_node(ops_db, node_id="t1", name="", type_ref="text", metadata={})

    empty_ok = {"text": "", "status": 200}
    env = stub_requests(tmp_path, {"default": empty_ok})
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="",
        node_type="text",
    )

    rc, _out, err = run_enricher("web_search", ctx, env=env)

    assert rc == 2
    assert "sin query" in err
# ---------------------------------------------------------------------------
# 0035c — agrupacion automatica cuando se excede el threshold
# ---------------------------------------------------------------------------
def _build_lite_html(n: int) -> str:
"""Genera un HTML estilo lite.duckduckgo.com con N resultados.
Estructura minima que `_DDGLiteParser` consume: por cada resultado
un anchor con `class='result-link'` y un `<td class='result-snippet'>`
con el snippet inmediatamente despues. URLs unicas para evitar la
deduplicacion del parser.
"""
rows = []
for i in range(n):
url = f"https://example.com/result-{i:03d}"
rows.append(
f"<a rel='nofollow' href='{url}' class='result-link'>"
f"Result {i:03d}</a>"
f"<td class='result-snippet'>snippet for result {i:03d}</td>"
)
return (
"<!DOCTYPE html><html><body>"
+ "".join(rows)
+ "</body></html>"
)
def test_web_search_below_threshold_no_group(ops_db, app_dir, registry_root,
                                             tmp_path):
    """5 results < threshold: no Group is created, classic behaviour.

    Verifies that the run reports ``grouped`` as False with an empty
    ``group_id``, that no ``Group`` entity exists, that the five ``Url``
    entities have no ``group_id``, and that they all carry one and the same
    non-empty ``batch_id`` in their metadata.
    """
    make_node(ops_db, node_id="t1", name="tomate",
              type_ref="text", metadata={})
    plan = {
        "match": [
            {"contains": "duckduckgo.com",
             "text": _build_lite_html(5),
             "status": 200},
        ],
    }
    env = stub_requests(tmp_path, plan)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="tomate", node_type="text",
                   params={"limit": 0})  # 0 = no truncation
    rc, out, err = run_enricher("web_search", ctx, env=env)
    assert rc == 0, err
    assert out["results"] == 5
    assert out["grouped"] is False
    assert out["group_id"] == ""

    # No Group created.
    groups = list_entities(ops_db, type_ref="Group")
    assert groups == []

    # 5 loose Urls.
    urls = list_entities(ops_db, type_ref="Url")
    assert len(urls) == 5
    assert all(u["group_id"] is None for u in urls)

    # All of them share the same batch_id in metadata.
    batch_ids = {u["metadata"].get("batch_id") for u in urls}
    assert len(batch_ids) == 1, batch_ids
    # A missing key makes .get() return None, which would collapse the set
    # to {None} and satisfy the length check alone; reject both None and "".
    assert all(batch_ids), batch_ids
def test_web_search_above_threshold_creates_group_and_preview(
        ops_db, app_dir, registry_root, tmp_path):
    """100 results: 1 Group + 10 loose Urls + 90 Urls with group_id."""
    make_node(ops_db, node_id="t1", name="tomate",
              type_ref="text", metadata={})

    ddg_reply = {
        "contains": "duckduckgo.com",
        "text": _build_lite_html(100),
        "status": 200,
    }
    env = stub_requests(tmp_path, {"match": [ddg_reply]})
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="tomate",
        node_type="text",
        params={"limit": 0},
    )

    rc, out, err = run_enricher("web_search", ctx, env=env)

    assert rc == 0, err
    assert out["results"] == 100
    assert out["grouped"] is True
    assert out["group_id"]

    # Exactly one Group, carrying the run's descriptive metadata.
    group_entities = list_entities(ops_db, type_ref="Group")
    assert len(group_entities) == 1
    g = group_entities[0]
    group_meta = g["metadata"]
    expected_meta = {
        "count": 100,
        "query": "tomate",
        "enricher": "web_search",
    }
    for key, expected in expected_meta.items():
        assert group_meta[key] == expected
    assert group_meta.get("batch_id")

    # 10 Urls without group_id (preview) + 90 attached to the Group.
    url_entities = list_entities(ops_db, type_ref="Url")
    assert len(url_entities) == 100
    loose_count = sum(1 for u in url_entities if u["group_id"] is None)
    child_count = sum(1 for u in url_entities if u["group_id"] == g["id"])
    assert loose_count == 10
    assert child_count == 90

    # SEARCH_RESULT_OF relations pointing at the original source:
    # 100 Urls + 1 Group = 101.
    relations = list_relations(ops_db, name="SEARCH_RESULT_OF")
    to_source = [rel for rel in relations if rel["to_entity"] == "t1"]
    assert len(to_source) == 101

    # Group -> t1.
    assert g["id"] in [rel["from_entity"] for rel in to_source]
def test_web_search_batch_id_shared_across_outputs(
        ops_db, app_dir, registry_root, tmp_path):
    """After a grouped run, group + preview + children share one batch_id."""
    make_node(ops_db, node_id="t1", name="tomate",
              type_ref="text", metadata={})

    ddg_reply = {
        "contains": "duckduckgo.com",
        "text": _build_lite_html(100),
        "status": 200,
    }
    env = stub_requests(tmp_path, {"match": [ddg_reply]})
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="tomate",
        node_type="text",
        params={"limit": 0},
    )

    rc, out, err = run_enricher("web_search", ctx, env=env)
    assert rc == 0, err

    # The batch id reported in the JSON output is the reference value.
    expected_batch = out["batch_id"]
    assert expected_batch

    group_entities = list_entities(ops_db, type_ref="Group")
    url_entities = list_entities(ops_db, type_ref="Url")
    every_node = [*group_entities, *url_entities]
    assert len(every_node) == 101  # 1 Group + 100 Urls

    batch_ids = {node["metadata"].get("batch_id") for node in every_node}
    assert batch_ids == {expected_batch}, (
        f"batch_ids inconsistentes: {batch_ids}")