Files
graph_explorer/tests/test_extract_domain.py
T
egutierrez 6919ebfe9c feat(enrichers): web_search DuckDuckGo + tests pytest de los 5 enrichers
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace
POST a html.duckduckgo.com con la query del nodo, parsea resultados
con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos
Url con relacion SEARCH_RESULT_OF apuntando al nodo origen.

Encadenable: tras web_search, fetch_webpage sobre cada Url completa
el pipeline search -> fetch -> extract.

Defensa contra ops_db_path mal resuelto: normaliza backslashes,
resuelve relativo contra app_dir, valida que la tabla entities
exista antes de tocar nada (exit codes 7/8/9 con JSON resumen).

Tests pytest (16/16 verde): conftest con operations.db temp +
schema minimo, stub de requests via PYTHONPATH para mockear red.
Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links,
extract_text_entities, web_search) + sanity check de manifests.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 16:10:13 +02:00

61 lines
2.5 KiB
Python

"""Tests del enricher extract_domain.
Pure regex/parsing — sin red. Verifica:
- Url con metadata.url crea Domain + BELONGS_TO
- Email crea Domain (desde la parte derecha del @)
- Si el Domain ya existe se reusa, no se duplica
"""
from __future__ import annotations
from conftest import (
base_ctx, get_entity, list_entities, list_relations,
make_node, run_enricher,
)
def test_url_creates_domain_and_relation(ops_db, app_dir, registry_root):
    """Run extract_domain on a Url node and check the Domain and its relation.

    Seeds a ``Url`` node whose metadata holds a URL, runs the enricher with a
    matching context, and asserts that:

    - the enricher exits with code 0 and reports at least one added entity,
    - a ``Domain`` entity named ``www.example.com`` exists,
    - exactly one ``BELONGS_TO`` relation exists and it starts at the seeded
      node.
    """
    url = "https://www.example.com/path"
    make_node(
        ops_db,
        node_id="u1",
        name="ex",
        type_ref="Url",
        metadata={"url": url},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="u1",
        node_name="ex",
        node_type="Url",
        metadata={"url": url},
    )

    returncode, payload, errors = run_enricher("extract_domain", ctx)

    assert returncode == 0, errors
    assert payload and payload.get("entities_added", 0) >= 1, payload

    domains = list_entities(ops_db, type_ref="Domain")
    assert any(row["name"] == "www.example.com" for row in domains), domains

    belongs_to = list_relations(ops_db, name="BELONGS_TO")
    assert len(belongs_to) == 1
    assert belongs_to[0]["from_entity"] == "u1"
def test_email_creates_domain(ops_db, app_dir, registry_root):
    """Run extract_domain on an Email node and check the resulting Domain.

    Seeds an ``Email`` node named ``user@aurgi.com``, runs the enricher, and
    asserts that it exits with code 0 and that a ``Domain`` entity named
    ``aurgi.com`` exists afterwards.
    """
    address = "user@aurgi.com"
    make_node(
        ops_db,
        node_id="e1",
        name=address,
        type_ref="Email",
        metadata={"address": address},
    )
    # The context carries only the node identity; no metadata is passed here.
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="e1",
        node_name=address,
        node_type="Email",
    )

    returncode, payload, errors = run_enricher("extract_domain", ctx)

    assert returncode == 0, errors
    domains = list_entities(ops_db, type_ref="Domain")
    assert any(row["name"] == "aurgi.com" for row in domains), domains
def test_existing_domain_is_reused(ops_db, app_dir, registry_root):
    """Check that extract_domain does not duplicate an existing Domain.

    Seeds a ``Domain`` named ``example.com`` and a ``Url`` node pointing at
    that host, runs the enricher, and asserts that it exits with code 0 and
    that exactly one ``Domain`` entity named ``example.com`` exists afterwards.
    """
    url = "https://example.com/x"

    # Pre-create a Domain with the same name the enricher would produce.
    make_node(
        ops_db,
        node_id="d1",
        name="example.com",
        type_ref="Domain",
        metadata={},
    )
    make_node(
        ops_db,
        node_id="u1",
        name="ex",
        type_ref="Url",
        metadata={"url": url},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="u1",
        node_name="ex",
        node_type="Url",
        metadata={"url": url},
    )

    returncode, payload, errors = run_enricher("extract_domain", ctx)

    assert returncode == 0, errors
    domains = list_entities(ops_db, type_ref="Domain")
    matching = [row for row in domains if row["name"] == "example.com"]
    assert len(matching) == 1, domains