# graph_explorer/tests/test_split_sentences.py

"""Tests for the split_sentences enricher — regex-based split, no network.

Covered cases:
  - happy path: 5 sentences → 5 Sentence nodes + SENTENCE_OF relations.
  - below threshold: no Group is created.
  - above threshold (>=50): 1 Group + K ungrouped + N-K grouped sentences.
  - no text: exit code 2 with a clear error message.
  - metadata.text takes priority over the node name as the text source.
"""
from __future__ import annotations

from conftest import (
    base_ctx, list_entities, list_relations, make_node, run_enricher,
)


# Five-sentence fixture text; one tuple item per sentence, joined with single
# spaces so each ". " boundary is visible at a glance.
SAMPLE_TEXT = " ".join((
    "El tomate es originario de America.",
    "Su cultivo se extendio por Europa en el siglo XVI.",
    "Hoy se considera una hortaliza basica.",
    "La variedad cherry es popular en ensaladas frescas.",
    "Existen mas de mil variedades registradas en el mundo entero.",
))


def _build_paragraph(n: int) -> str:
    """Genera un texto con N frases unicas, cada una >=20 chars."""
    rows = []
    for i in range(n):
        rows.append(
            f"Esta es la frase numero {i:03d} con suficiente contenido "
            f"para superar el min_length por defecto del enricher."
        )
    return " ".join(rows)


def test_split_sentences_creates_sentence_nodes(ops_db, app_dir, registry_root):
    """A text with 5 distinct sentences yields 5 Sentence + 5 SENTENCE_OF."""
    make_node(
        ops_db,
        node_id="t1",
        name="tomate doc",
        type_ref="text",
        metadata={"text": SAMPLE_TEXT},
    )
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="tomate doc",
        node_type="text",
        metadata={"text": SAMPLE_TEXT},
    )

    exit_code, payload, stderr = run_enricher("split_sentences", ctx)
    assert exit_code == 0, stderr
    assert payload is not None
    assert payload["sentences"] == 5, payload
    assert payload["entities_added"] == 5
    # Below the grouping threshold: no Group is reported.
    assert payload["grouped"] is False
    assert payload["group_id"] == ""

    sentence_nodes = list_entities(ops_db, type_ref="Sentence")
    assert len(sentence_nodes) == 5
    # Ranks cover 1..5 exactly once (checked order-independently).
    found_ranks = [node["metadata"]["rank"] for node in sentence_nodes]
    found_ranks.sort()
    assert found_ranks == [1, 2, 3, 4, 5]
    # Every sentence of the run carries the same batch_id.
    distinct_batches = set()
    for node in sentence_nodes:
        distinct_batches.add(node["metadata"]["batch_id"])
    assert len(distinct_batches) == 1

    links = list_relations(ops_db, name="SENTENCE_OF")
    assert len(links) == 5
    # No relation may point anywhere other than the source node.
    assert not [link for link in links if link["to_entity"] != "t1"]


def test_split_sentences_below_threshold_no_group(ops_db, app_dir,
                                                    registry_root):
    """30 sentences (<50) must not produce any Group node."""
    text = _build_paragraph(30)
    make_node(ops_db, node_id="t1", name="big doc",
              type_ref="text", metadata={"text": text})
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="big doc", node_type="text",
                   metadata={"text": text})
    rc, out, err = run_enricher("split_sentences", ctx)
    assert rc == 0, err
    # Guard before indexing: a missing payload should fail with the
    # enricher's stderr rather than an opaque TypeError on `out[...]`.
    assert out is not None, err
    assert out["sentences"] == 30
    assert out["grouped"] is False
    assert out["group_id"] == ""

    groups = list_entities(ops_db, type_ref="Group")
    assert groups == []
    sentences = list_entities(ops_db, type_ref="Sentence")
    assert len(sentences) == 30
    # With no Group created, no sentence may reference one.
    assert all(s["group_id"] is None for s in sentences)


def test_split_sentences_above_threshold_creates_group(ops_db, app_dir,
                                                        registry_root):
    """100 sentences → 1 Group + 10 ungrouped + 90 carrying the group_id."""
    text = _build_paragraph(100)
    make_node(ops_db, node_id="t1", name="huge doc",
              type_ref="text", metadata={"text": text})
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="huge doc", node_type="text",
                   metadata={"text": text})
    rc, out, err = run_enricher("split_sentences", ctx)
    assert rc == 0, err
    # Guard before indexing: a missing payload should fail with the
    # enricher's stderr rather than an opaque TypeError on `out[...]`.
    assert out is not None, err
    assert out["sentences"] == 100
    assert out["grouped"] is True
    assert out["group_id"]

    groups = list_entities(ops_db, type_ref="Group")
    assert len(groups) == 1
    g = groups[0]
    # The Group records the total sentence count and its provenance.
    assert g["metadata"]["count"] == 100
    assert g["metadata"]["enricher"] == "split_sentences"
    assert g["metadata"]["source_node_id"] == "t1"
    assert g["metadata"].get("batch_id")

    sentences = list_entities(ops_db, type_ref="Sentence")
    assert len(sentences) == 100
    # Partition the sentences into ungrouped ones and children of the Group.
    ungrouped = [s for s in sentences if s["group_id"] is None]
    children = [s for s in sentences if s["group_id"] == g["id"]]
    assert len(ungrouped) == 10
    assert len(children) == 90

    # Group + 100 Sentence = 101 SENTENCE_OF relations to the source node.
    rels = list_relations(ops_db, name="SENTENCE_OF")
    to_t1 = [r for r in rels if r["to_entity"] == "t1"]
    assert len(to_t1) == 101
    assert any(r["from_entity"] == g["id"] for r in to_t1)


def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root):
    """Node without metadata.text/description/query and a short name → exit 2."""
    make_node(ops_db, node_id="t1", name="x", type_ref="text", metadata={})
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id="t1",
        node_name="x",
        node_type="text",
    )
    exit_code, payload, _stderr = run_enricher("split_sentences", ctx)
    assert exit_code == 2
    assert payload is not None
    # A missing or None "error" value is treated as an empty message.
    message = payload.get("error") or ""
    # Either wording of the failure is accepted.
    assert any(
        marker in message for marker in ("demasiado corto", "min_length")
    )


def test_split_sentences_uses_metadata_text_priority(ops_db, app_dir,
                                                      registry_root):
    """metadata.text wins over node_name even when both contain text."""
    make_node(ops_db, node_id="t1", name="placeholder corto",
              type_ref="text", metadata={"text": SAMPLE_TEXT})
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="placeholder corto",
                   node_type="text",
                   metadata={"text": SAMPLE_TEXT})
    rc, out, err = run_enricher("split_sentences", ctx)
    assert rc == 0, err
    # Guard before indexing: a missing payload should fail with the
    # enricher's stderr rather than an opaque TypeError on `out[...]`.
    assert out is not None, err
    # 5 sentences from SAMPLE_TEXT, not 1 from the node name.
    assert out["sentences"] == 5