graph_explorer/tests/test_web_search.py

"""Tests for the web_search enricher (DuckDuckGo HTML)."""
from __future__ import annotations

from pathlib import Path

from conftest import (
    base_ctx, list_entities, list_relations, make_node, run_enricher,
    stub_requests, TESTS_DIR,
)


# HTML fixture whose text the tests below hand to stub_requests as the
# response body for any request URL containing "duckduckgo.com".
DDG_FIXTURE = TESTS_DIR / "fixtures" / "ddg_results.html"


def test_web_search_creates_url_results_for_text_node(
        ops_db, app_dir, registry_root, tmp_path):
    """Run web_search on a text node against the canned DDG fixture page.

    Verifies the summary reported by the enricher, the Url entities and
    SEARCH_RESULT_OF relations present in the ops DB afterwards, and the
    metadata stored on the Wikipedia result.
    """
    wiki_url = "https://es.wikipedia.org/wiki/Tomate"

    make_node(ops_db, node_id="t1", name="tomate",
              type_ref="text", metadata={})

    # DuckDuckGo requests get the fixture page; anything else gets a 404.
    ddg_response = {
        "contains": "duckduckgo.com",
        "text": DDG_FIXTURE.read_text(encoding="utf-8"),
        "status": 200,
    }
    env = stub_requests(tmp_path, {
        "match": [ddg_response],
        "default": {"text": "", "status": 404},
    })

    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="tomate",
        node_type="text",
        params={"limit": 5},
    )

    rc, out, err = run_enricher("web_search", ctx, env=env)
    assert rc == 0, f"stderr={err}"
    assert out is not None, err
    assert out["engine"] == "duckduckgo"
    assert out["results"] == 3, out
    assert out["entities_added"] == 3
    assert out["relations_added"] == 3

    # Url entities created for the search results.
    url_entities = list_entities(ops_db, type_ref="Url")
    found_urls = set()
    for entity in url_entities:
        found_urls.add(entity["metadata"].get("url"))
    assert wiki_url in found_urls
    assert "https://www.botanical-online.com/alimentos/tomate-propiedades" in found_urls

    # One relation per result, each pointing at the source node.
    search_rels = list_relations(ops_db, name="SEARCH_RESULT_OF")
    assert len(search_rels) == 3
    for rel in search_rels:
        assert rel["to_entity"] == "t1"

    # Enriched metadata on the first entity matching the Wikipedia URL.
    wiki_matches = [entity for entity in url_entities
                    if entity["metadata"].get("url") == wiki_url]
    wiki_meta = wiki_matches[0]["metadata"]
    assert wiki_meta["query"] == "tomate"
    assert wiki_meta["rank"] == 1
    assert "Wikipedia" in wiki_meta["title"]


def test_web_search_uses_metadata_query_over_name(ops_db, app_dir,
                                                    registry_root, tmp_path):
    """metadata.query must take priority over node_name.

    The node is named "placeholder" but carries metadata
    ``{"query": "tomate"}``; the enricher is expected to report "tomate"
    as the query it used.
    """
    make_node(ops_db, node_id="t1", name="placeholder",
              type_ref="text", metadata={"query": "tomate"})
    plan = {"match": [{"contains": "duckduckgo.com",
                       "text": DDG_FIXTURE.read_text(encoding="utf-8")}]}
    env = stub_requests(tmp_path, plan)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="placeholder", node_type="text",
                   metadata={"query": "tomate"})
    rc, out, err = run_enricher("web_search", ctx, env=env)
    assert rc == 0, err
    # Fail with the enricher's error output rather than an opaque TypeError
    # on the subscript below if no result payload came back.
    assert out is not None, err
    assert out["query"] == "tomate"


def test_web_search_limit_truncates_results(ops_db, app_dir, registry_root,
                                              tmp_path):
    """With params ``{"limit": 1}`` the enricher reports a single result.

    The same fixture page is served as in the other fixture-based tests;
    only the limit differs, and both the reported result count and the
    number of added entities must equal 1.
    """
    make_node(ops_db, node_id="t1", name="tomate", type_ref="text")
    plan = {"match": [{"contains": "duckduckgo.com",
                       "text": DDG_FIXTURE.read_text(encoding="utf-8")}]}
    env = stub_requests(tmp_path, plan)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="tomate", node_type="text",
                   params={"limit": 1})
    rc, out, err = run_enricher("web_search", ctx, env=env)
    assert rc == 0, err
    # Fail with the enricher's error output rather than an opaque TypeError
    # on the subscripts below if no result payload came back.
    assert out is not None, err
    assert out["results"] == 1
    assert out["entities_added"] == 1


def test_web_search_no_query_fails_clean(ops_db, app_dir, registry_root,
                                           tmp_path):
    """A node with an empty name and empty metadata yields no query.

    The enricher is expected to exit with code 2 and mention "sin query"
    in its error output.
    """
    make_node(ops_db, node_id="t1", name="", type_ref="text", metadata={})

    # Every stubbed request answers 200 with an empty body.
    empty_ok_plan = {"default": {"text": "", "status": 200}}
    env = stub_requests(tmp_path, empty_ok_plan)

    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="",
        node_type="text",
    )

    exit_code, _payload, error_text = run_enricher("web_search", ctx, env=env)
    assert exit_code == 2
    assert "sin query" in error_text


# ---------------------------------------------------------------------------
# 0035c — automatic grouping when the threshold is exceeded
# ---------------------------------------------------------------------------

def _build_lite_html(n: int) -> str:
    """Genera un HTML estilo lite.duckduckgo.com con N resultados.

    Estructura minima que `_DDGLiteParser` consume: por cada resultado
    un anchor con `class='result-link'` y un `<td class='result-snippet'>`
    con el snippet inmediatamente despues. URLs unicas para evitar la
    deduplicacion del parser.
    """
    rows = []
    for i in range(n):
        url = f"https://example.com/result-{i:03d}"
        rows.append(
            f"<a rel='nofollow' href='{url}' class='result-link'>"
            f"Result {i:03d}</a>"
            f"<td class='result-snippet'>snippet for result {i:03d}</td>"
        )
    return (
        "<!DOCTYPE html><html><body>"
        + "".join(rows)
        + "</body></html>"
    )


def test_web_search_below_threshold_no_group(ops_db, app_dir, registry_root,
                                              tmp_path):
    """5 results < threshold -> no Group, classic behaviour.

    All five Urls must be stored without a group_id and must share one
    non-empty batch_id in their metadata.
    """
    make_node(ops_db, node_id="t1", name="tomate",
              type_ref="text", metadata={})
    plan = {
        "match": [
            {"contains": "duckduckgo.com",
             "text": _build_lite_html(5),
             "status": 200},
        ],
    }
    env = stub_requests(tmp_path, plan)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="tomate", node_type="text",
                   params={"limit": 0})  # 0 = no truncation
    rc, out, err = run_enricher("web_search", ctx, env=env)
    assert rc == 0, err
    # Fail with the enricher's error output rather than an opaque TypeError
    # on the subscripts below if no result payload came back.
    assert out is not None, err
    assert out["results"] == 5
    assert out["grouped"] is False
    assert out["group_id"] == ""

    # No Group created.
    groups = list_entities(ops_db, type_ref="Group")
    assert groups == []
    # 5 loose Urls.
    urls = list_entities(ops_db, type_ref="Url")
    assert len(urls) == 5
    assert all(u["group_id"] is None for u in urls)
    # All of them carry the same batch_id in metadata. The value must be
    # truthy: a batch_id missing from every Url collapses to {None}, which
    # a bare `"" not in batch_ids` check would wrongly accept.
    batch_ids = {u["metadata"].get("batch_id") for u in urls}
    assert len(batch_ids) == 1, batch_ids
    assert all(batch_ids), batch_ids


def test_web_search_above_threshold_creates_group_and_preview(
        ops_db, app_dir, registry_root, tmp_path):
    """100 results -> 1 Group + 10 loose Urls + 90 Urls with group_id.

    Also checks the Group's metadata (count, query, enricher, batch_id)
    and that 101 SEARCH_RESULT_OF relations point at the source node,
    one of them originating from the Group.
    """
    make_node(ops_db, node_id="t1", name="tomate",
              type_ref="text", metadata={})
    plan = {
        "match": [
            {"contains": "duckduckgo.com",
             "text": _build_lite_html(100),
             "status": 200},
        ],
    }
    env = stub_requests(tmp_path, plan)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="tomate", node_type="text",
                   params={"limit": 0})
    rc, out, err = run_enricher("web_search", ctx, env=env)
    assert rc == 0, err
    # Fail with the enricher's error output rather than an opaque TypeError
    # on the subscripts below if no result payload came back.
    assert out is not None, err
    assert out["results"] == 100
    assert out["grouped"] is True
    assert out["group_id"]

    # Exactly 1 Group.
    groups = list_entities(ops_db, type_ref="Group")
    assert len(groups) == 1
    g = groups[0]
    assert g["metadata"]["count"] == 100
    assert g["metadata"]["query"] == "tomate"
    assert g["metadata"]["enricher"] == "web_search"
    assert g["metadata"].get("batch_id")

    # 10 Urls without group_id (preview) + 90 with the Group's id.
    urls = list_entities(ops_db, type_ref="Url")
    assert len(urls) == 100
    loose = [u for u in urls if u["group_id"] is None]
    children = [u for u in urls if u["group_id"] == g["id"]]
    assert len(loose) == 10
    assert len(children) == 90

    # SEARCH_RESULT_OF relations pointing at the original source node:
    # 100 Urls + 1 Group = 101.
    rels = list_relations(ops_db, name="SEARCH_RESULT_OF")
    from_to_t1 = [r for r in rels if r["to_entity"] == "t1"]
    assert len(from_to_t1) == 101
    # Group -> t1.
    assert any(r["from_entity"] == g["id"] for r in from_to_t1)


def test_web_search_batch_id_shared_across_outputs(
        ops_db, app_dir, registry_root, tmp_path):
    """After a run with grouping, group + preview + children share batch_id.

    The batch_id reported in the enricher output must be non-empty and
    must be the only batch_id found in the metadata of the Group and all
    100 Urls.
    """
    make_node(ops_db, node_id="t1", name="tomate",
              type_ref="text", metadata={})
    plan = {
        "match": [
            {"contains": "duckduckgo.com",
             "text": _build_lite_html(100),
             "status": 200},
        ],
    }
    env = stub_requests(tmp_path, plan)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="tomate", node_type="text",
                   params={"limit": 0})
    rc, out, err = run_enricher("web_search", ctx, env=env)
    assert rc == 0, err
    # Fail with the enricher's error output rather than an opaque TypeError
    # on the subscript below if no result payload came back.
    assert out is not None, err

    expected_batch = out["batch_id"]
    assert expected_batch

    groups = list_entities(ops_db, type_ref="Group")
    urls = list_entities(ops_db, type_ref="Url")
    all_nodes = groups + urls
    assert len(all_nodes) == 101  # 1 Group + 100 Urls

    batch_ids = {n["metadata"].get("batch_id") for n in all_nodes}
    assert batch_ids == {expected_batch}, (
        f"batch_ids inconsistentes: {batch_ids}")