# graph_explorer/tests/test_extract_links.py

"""Tests del enricher extract_links — sin red, lee markdown del cache."""
from __future__ import annotations

from pathlib import Path

from conftest import (
    base_ctx, list_entities, list_relations, make_node, run_enricher,
)


# Markdown fixture: two inline links that share one target URL, two bare
# URLs, and one e-mail address.
SAMPLE_MD = (
    "# Pagina demo\n"
    "\n"
    "Aqui hay [un enlace](https://example.com/articulo) interesante y\n"
    "otro [duplicado](https://example.com/articulo) que no debe contar\n"
    "dos veces.\n"
    "\n"
    "Tambien una URL pelada: https://otra.example/path?q=1\n"
    "y https://tercera.example/\n"
    "\n"
    "Y un email que NO debe extraer como Url: contact@no.example\n"
)


def test_extract_links_creates_url_nodes(ops_db, app_dir, registry_root):
    """Run extract_links on a cached markdown file and check its output.

    Writes SAMPLE_MD to ``<app_dir>/cache/ab/abc.md``, creates a Webpage
    node ``w1`` whose ``metadata.markdown_path`` is that file's path
    relative to ``app_dir``, runs the ``extract_links`` enricher and
    asserts that it exits with 0, reports at least 3 added entities,
    that two of the fixture URLs exist as ``Url`` entities, and that at
    least 3 ``LINKS_TO`` relations exist, all starting at ``w1``.

    NOTE(review): the fixture text says the duplicated link must not
    count twice and the e-mail must not become a Url, but nothing here
    asserts either; ``https://tercera.example/`` is not checked by name.
    """
    # 1) Write the markdown into the cache directory.
    cached_md = Path(app_dir) / "cache" / "ab" / "abc.md"
    cached_md.parent.mkdir(parents=True, exist_ok=True)
    cached_md.write_text(SAMPLE_MD, encoding="utf-8")
    markdown_path = str(cached_md.relative_to(app_dir))

    # 2) Create the Webpage whose metadata.markdown_path points at the cache.
    make_node(
        ops_db,
        node_id="w1",
        name="demo",
        type_ref="Webpage",
        metadata={"markdown_path": markdown_path},
    )

    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="w1",
        node_name="demo",
        node_type="Webpage",
        metadata={"markdown_path": markdown_path},
    )

    exit_code, result, stderr = run_enricher("extract_links", ctx)
    assert exit_code == 0, stderr
    assert result is not None, stderr
    assert result["entities_added"] >= 3, result

    url_names = [
        entity["name"] for entity in list_entities(ops_db, type_ref="Url")
    ]
    for expected_url in (
        "https://example.com/articulo",
        "https://otra.example/path?q=1",
    ):
        assert expected_url in url_names

    links = list_relations(ops_db, name="LINKS_TO")
    assert len(links) >= 3
    for link in links:
        assert link["from_entity"] == "w1"


def test_extract_links_without_markdown_path_errors(
    ops_db, app_dir, registry_root
):
    """Check that extract_links fails when the node has no markdown_path.

    Creates a Webpage node ``w1`` with empty metadata, runs the
    ``extract_links`` enricher and asserts a non-zero exit code, a
    non-None output, and that the output's ``error`` value contains
    ``missing markdown_path``.
    """
    make_node(
        ops_db,
        node_id="w1",
        name="demo",
        type_ref="Webpage",
        metadata={},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="w1",
        node_name="demo",
        node_type="Webpage",
    )

    exit_code, result, _stderr = run_enricher("extract_links", ctx)
    assert exit_code != 0, "deberia fallar sin markdown_path"
    assert result is not None

    # A missing or None "error" value is treated as an empty message.
    error_text = result.get("error") or ""
    assert "missing markdown_path" in error_text