6919ebfe9c
Anade enricher web_search aplicable a nodos text/Concept/Topic. Hace POST a html.duckduckgo.com con la query del nodo, parsea resultados con HTMLParser stdlib, decodifica el redirect uddg= y crea N nodos Url con relacion SEARCH_RESULT_OF apuntando al nodo origen. Encadenable: tras web_search, fetch_webpage sobre cada Url completa el pipeline search -> fetch -> extract. Defensa contra ops_db_path mal resuelto: normaliza backslashes, resuelve relativo contra app_dir, valida que la tabla entities exista antes de tocar nada (exit codes 7/8/9 con JSON resumen). Tests pytest (16/16 verde): conftest con operations.db temp + schema minimo, stub de requests via PYTHONPATH para mockear red. Cubre los 5 enrichers (extract_domain, fetch_webpage, extract_links, extract_text_entities, web_search) + sanity check de manifests. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
61 lines
2.5 KiB
Python
61 lines
2.5 KiB
Python
"""Tests del enricher extract_domain.
|
|
|
|
Pure regex/parsing — sin red. Verifica:
|
|
- Url con metadata.url crea Domain + BELONGS_TO
|
|
- Email crea Domain (desde la parte derecha del @)
|
|
- Si el Domain ya existe se reusa, no se duplica
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from conftest import (
|
|
base_ctx, get_entity, list_entities, list_relations,
|
|
make_node, run_enricher,
|
|
)
|
|
|
|
|
|
def test_url_creates_domain_and_relation(ops_db, app_dir, registry_root):
    """A Url node yields a Domain entity plus a single BELONGS_TO relation.

    Seeds a ``Url`` node, runs the ``extract_domain`` enricher against it and
    checks that a ``Domain`` named after the URL host exists and that the
    only ``BELONGS_TO`` relation originates from the seeded node.
    """
    source_id = "u1"
    source_name = "ex"
    page_url = "https://www.example.com/path"

    make_node(
        ops_db,
        node_id=source_id,
        name=source_name,
        type_ref="Url",
        metadata={"url": page_url},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id=source_id,
        node_name=source_name,
        node_type="Url",
        metadata={"url": page_url},
    )

    exit_code, summary, stderr = run_enricher("extract_domain", ctx)
    assert exit_code == 0, stderr
    # Check truthiness first so .get() is never called on a missing summary.
    assert summary, summary
    assert summary.get("entities_added", 0) >= 1, summary

    domain_rows = list_entities(ops_db, type_ref="Domain")
    assert "www.example.com" in (row["name"] for row in domain_rows), domain_rows

    belongs_to = list_relations(ops_db, name="BELONGS_TO")
    assert len(belongs_to) == 1
    only_relation = belongs_to[0]
    assert only_relation["from_entity"] == source_id
|
|
|
|
|
|
def test_email_creates_domain(ops_db, app_dir, registry_root):
    """An Email node yields a Domain named after the part right of the ``@``.

    The node is stored with an ``address`` in its metadata, but the context
    handed to the enricher carries only the node id, name and type.
    """
    address = "user@aurgi.com"

    make_node(
        ops_db,
        node_id="e1",
        name=address,
        type_ref="Email",
        metadata={"address": address},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="e1",
        node_name=address,
        node_type="Email",
    )

    exit_code, _summary, stderr = run_enricher("extract_domain", ctx)
    assert exit_code == 0, stderr

    domain_rows = list_entities(ops_db, type_ref="Domain")
    assert "aurgi.com" in (row["name"] for row in domain_rows), domain_rows
|
|
|
|
|
|
def test_existing_domain_is_reused(ops_db, app_dir, registry_root):
    """Running the enricher must not duplicate an already-stored Domain.

    A ``Domain`` named ``example.com`` is created up front; after enriching a
    ``Url`` node on that same host, exactly one such ``Domain`` may exist.
    """
    host = "example.com"
    page_url = "https://example.com/x"

    # Pre-create a Domain with the same name the enricher would produce.
    make_node(ops_db, node_id="d1", name=host, type_ref="Domain", metadata={})
    make_node(
        ops_db,
        node_id="u1",
        name="ex",
        type_ref="Url",
        metadata={"url": page_url},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="u1",
        node_name="ex",
        node_type="Url",
        metadata={"url": page_url},
    )

    exit_code, _summary, stderr = run_enricher("extract_domain", ctx)
    assert exit_code == 0, stderr

    domain_rows = list_entities(ops_db, type_ref="Domain")
    same_name = [row for row in domain_rows if row["name"] == host]
    assert len(same_name) == 1, domain_rows
|