"""Tests del enricher web_search (DuckDuckGo HTML).""" from __future__ import annotations from pathlib import Path from conftest import ( base_ctx, list_entities, list_relations, make_node, run_enricher, stub_requests, TESTS_DIR, ) DDG_FIXTURE = TESTS_DIR / "fixtures" / "ddg_results.html" def test_web_search_creates_url_results_for_text_node( ops_db, app_dir, registry_root, tmp_path): make_node(ops_db, node_id="t1", name="tomate", type_ref="text", metadata={}) plan = { "match": [ {"contains": "duckduckgo.com", "text": DDG_FIXTURE.read_text(encoding="utf-8"), "status": 200}, ], "default": {"text": "", "status": 404}, } env = stub_requests(tmp_path, plan) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="tomate", node_type="text", params={"limit": 5}) rc, out, err = run_enricher("web_search", ctx, env=env) assert rc == 0, f"stderr={err}" assert out is not None, err assert out["engine"] == "duckduckgo" assert out["results"] == 3, out assert out["entities_added"] == 3 assert out["relations_added"] == 3 urls = list_entities(ops_db, type_ref="Url") targets = {e["metadata"].get("url") for e in urls} assert "https://es.wikipedia.org/wiki/Tomate" in targets assert "https://www.botanical-online.com/alimentos/tomate-propiedades" in targets rels = list_relations(ops_db, name="SEARCH_RESULT_OF") assert len(rels) == 3 assert all(r["to_entity"] == "t1" for r in rels) # Metadata enriquecida. wiki = next(e for e in urls if e["metadata"].get("url") == "https://es.wikipedia.org/wiki/Tomate") assert wiki["metadata"]["query"] == "tomate" assert wiki["metadata"]["rank"] == 1 assert "Wikipedia" in wiki["metadata"]["title"] def test_web_search_uses_metadata_query_over_name(ops_db, app_dir, registry_root, tmp_path): """metadata.query debe ganar prioridad sobre node_name.""" make_node(ops_db, node_id="t1", name="placeholder", type_ref="text", metadata={"query": "tomate"}) plan = {"match": [{"contains": "duckduckgo.com", "text": DDG_FIXTURE.read_text(encoding="utf-8")}]} env = stub_requests(tmp_path, plan) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="placeholder", node_type="text", metadata={"query": "tomate"}) rc, out, err = run_enricher("web_search", ctx, env=env) assert rc == 0, err assert out["query"] == "tomate" def test_web_search_limit_truncates_results(ops_db, app_dir, registry_root, tmp_path): make_node(ops_db, node_id="t1", name="tomate", type_ref="text") plan = {"match": [{"contains": "duckduckgo.com", "text": DDG_FIXTURE.read_text(encoding="utf-8")}]} env = stub_requests(tmp_path, plan) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="tomate", node_type="text", params={"limit": 1}) rc, out, err = run_enricher("web_search", ctx, env=env) assert rc == 0, err assert out["results"] == 1 assert out["entities_added"] == 1 def test_web_search_no_query_fails_clean(ops_db, app_dir, registry_root, tmp_path): make_node(ops_db, node_id="t1", name="", type_ref="text", metadata={}) env = stub_requests(tmp_path, {"default": {"text": "", "status": 200}}) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="", node_type="text") rc, out, err = run_enricher("web_search", ctx, env=env) assert rc == 2 assert "sin query" in err # --------------------------------------------------------------------------- # 0035c — agrupacion automatica cuando se excede el threshold # --------------------------------------------------------------------------- def _build_lite_html(n: int) -> str: """Genera un HTML estilo lite.duckduckgo.com con N resultados. Estructura minima que `_DDGLiteParser` consume: por cada resultado un anchor con `class='result-link'` y un `` con el snippet inmediatamente despues. URLs unicas para evitar la deduplicacion del parser. """ rows = [] for i in range(n): url = f"https://example.com/result-{i:03d}" rows.append( f"" f"Result {i:03d}" f"snippet for result {i:03d}" ) return ( "" + "".join(rows) + "" ) def test_web_search_below_threshold_no_group(ops_db, app_dir, registry_root, tmp_path): """5 resultados < threshold → ningun Group, comportamiento clasico.""" make_node(ops_db, node_id="t1", name="tomate", type_ref="text", metadata={}) plan = { "match": [ {"contains": "duckduckgo.com", "text": _build_lite_html(5), "status": 200}, ], } env = stub_requests(tmp_path, plan) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="tomate", node_type="text", params={"limit": 0}) # 0 = sin truncar rc, out, err = run_enricher("web_search", ctx, env=env) assert rc == 0, err assert out["results"] == 5 assert out["grouped"] is False assert out["group_id"] == "" # Sin Group creado. groups = list_entities(ops_db, type_ref="Group") assert groups == [] # 5 Urls sueltos. urls = list_entities(ops_db, type_ref="Url") assert len(urls) == 5 assert all(u["group_id"] is None for u in urls) # Todos con la misma batch_id en metadata. batch_ids = {u["metadata"].get("batch_id") for u in urls} assert len(batch_ids) == 1 and "" not in batch_ids def test_web_search_above_threshold_creates_group_and_preview( ops_db, app_dir, registry_root, tmp_path): """100 resultados → 1 Group + 10 sueltos + 90 con group_id.""" make_node(ops_db, node_id="t1", name="tomate", type_ref="text", metadata={}) plan = { "match": [ {"contains": "duckduckgo.com", "text": _build_lite_html(100), "status": 200}, ], } env = stub_requests(tmp_path, plan) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="tomate", node_type="text", params={"limit": 0}) rc, out, err = run_enricher("web_search", ctx, env=env) assert rc == 0, err assert out["results"] == 100 assert out["grouped"] is True assert out["group_id"] # Exactamente 1 Group. groups = list_entities(ops_db, type_ref="Group") assert len(groups) == 1 g = groups[0] assert g["metadata"]["count"] == 100 assert g["metadata"]["query"] == "tomate" assert g["metadata"]["enricher"] == "web_search" assert g["metadata"].get("batch_id") # 10 Urls sin group_id (preview) + 90 con group_id. urls = list_entities(ops_db, type_ref="Url") assert len(urls) == 100 sueltos = [u for u in urls if u["group_id"] is None] children = [u for u in urls if u["group_id"] == g["id"]] assert len(sueltos) == 10 assert len(children) == 90 # Todos los Urls con relacion SEARCH_RESULT_OF al source original. rels = list_relations(ops_db, name="SEARCH_RESULT_OF") # 100 Urls + 1 Group = 101 relaciones al source. from_to_t1 = [r for r in rels if r["to_entity"] == "t1"] assert len(from_to_t1) == 101 # Group → t1. assert any(r["from_entity"] == g["id"] for r in from_to_t1) def test_web_search_batch_id_shared_across_outputs( ops_db, app_dir, registry_root, tmp_path): """Tras un run con grouping, group + preview + hijos comparten batch_id.""" make_node(ops_db, node_id="t1", name="tomate", type_ref="text", metadata={}) plan = { "match": [ {"contains": "duckduckgo.com", "text": _build_lite_html(100), "status": 200}, ], } env = stub_requests(tmp_path, plan) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="tomate", node_type="text", params={"limit": 0}) rc, out, err = run_enricher("web_search", ctx, env=env) assert rc == 0, err expected_batch = out["batch_id"] assert expected_batch groups = list_entities(ops_db, type_ref="Group") urls = list_entities(ops_db, type_ref="Url") all_nodes = groups + urls assert len(all_nodes) == 101 # 1 Group + 100 Urls batch_ids = {n["metadata"].get("batch_id") for n in all_nodes} assert batch_ids == {expected_batch}, ( f"batch_ids inconsistentes: {batch_ids}")