Files
graph_explorer/tests/test_split_sentences.py
T
egutierrez 2a5127fcaf fix(enrichers): split_sentences y extract_iocs_text leen entities.notes
El campo `notes` es lo que el usuario escribe en el panel Note del
Inspector (doble click sobre el nodo) — sitio canonico para texto
largo. Antes los enrichers leian metadata.text/description/query como
prioridad, dejando notes ignorado y forzando al usuario a inyectar
texto via la UI metadata-extra (poco descubrible).

Cambios:
- Ambos run.py abren la BD y leen `entities.notes` por SQL antes de
  fallback a node_name. metadata.text/description/query ya no se
  consultan (KISS — solo notes y name).
- conftest.make_node admite kwarg `notes` para inyectar contenido
  en la columna notes desde tests.
- Tests actualizados: SAMPLE_TEXT y los IoC dumps van por `notes=`
  en lugar de `metadata={"text": ...}`.
- Renombrado el test que verificaba prioridad: ahora se llama
  `*_uses_notes_priority` y verifica notes > name.

Tests verdes WSL (44) y Windows (33 + 11 skipped).
2026-05-03 15:36:18 +02:00

143 lines
5.6 KiB
Python

"""Tests del enricher split_sentences — split por regex, sin red.
Cubrimos:
- happy path: 5 frases → 5 nodos Sentence + relaciones SENTENCE_OF.
- below threshold: ningun Group.
- above threshold (>=50): 1 Group + K sueltos + N-K agrupados.
- sin texto: exit 2 con mensaje claro.
- prioridad de notes: entities.notes gana sobre node_name.
"""
from __future__ import annotations
from conftest import (
base_ctx, list_entities, list_relations, make_node, run_enricher,
)
# Five-sentence paragraph used as the `notes` content of the source node.
# The pieces below are one sentence each; adjacent literals are concatenated
# into a single string with one space between sentences.
SAMPLE_TEXT = (
    "El tomate es originario de America. "
    "Su cultivo se extendio por Europa en el siglo XVI. "
    "Hoy se considera una hortaliza basica. "
    "La variedad cherry es popular en ensaladas frescas. "
    "Existen mas de mil variedades registradas en el mundo entero."
)
def _build_paragraph(n: int) -> str:
"""Genera un texto con N frases unicas, cada una >=20 chars."""
rows = []
for i in range(n):
rows.append(
f"Esta es la frase numero {i:03d} con suficiente contenido "
f"para superar el min_length por defecto del enricher."
)
return " ".join(rows)
def test_split_sentences_creates_sentence_nodes(ops_db, app_dir, registry_root):
    """Notes with 5 distinct sentences → 5 Sentence nodes + 5 SENTENCE_OF."""
    source_id = "t1"
    source_name = "tomate doc"
    make_node(ops_db, node_id=source_id, name=source_name,
              type_ref="text", notes=SAMPLE_TEXT)
    ctx = base_ctx(
        ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
        node_id=source_id, node_name=source_name, node_type="text",
    )

    rc, out, err = run_enricher("split_sentences", ctx)

    # Summary reported by the enricher run.
    assert rc == 0, err
    assert out is not None
    assert out["sentences"] == 5, out
    assert out["entities_added"] == 5
    assert out["grouped"] is False
    assert out["group_id"] == ""

    sentence_nodes = list_entities(ops_db, type_ref="Sentence")
    assert len(sentence_nodes) == 5
    # The ranks stored in metadata must be exactly 1..5.
    rank_values = [node["metadata"]["rank"] for node in sentence_nodes]
    rank_values.sort()
    assert rank_values == [1, 2, 3, 4, 5]
    # Every Sentence carries the same batch_id.
    distinct_batches = set()
    for node in sentence_nodes:
        distinct_batches.add(node["metadata"]["batch_id"])
    assert len(distinct_batches) == 1

    # One SENTENCE_OF relation per sentence, all pointing at the source node.
    edges = list_relations(ops_db, name="SENTENCE_OF")
    assert len(edges) == 5
    assert all(edge["to_entity"] == "t1" for edge in edges)
def test_split_sentences_below_threshold_no_group(ops_db, app_dir,
                                                  registry_root):
    """30 sentences (<50) → no Group entity is created."""
    text = _build_paragraph(30)
    make_node(ops_db, node_id="t1", name="big doc",
              type_ref="text", notes=text)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="big doc", node_type="text")

    rc, out, err = run_enricher("split_sentences", ctx)

    assert rc == 0, err
    # Same guard the sibling tests use: if `out` is None, fail with a plain
    # assertion instead of a TypeError when subscripting it below.
    assert out is not None
    assert out["sentences"] == 30
    assert out["grouped"] is False
    assert out["group_id"] == ""

    # No Group entity at all.
    groups = list_entities(ops_db, type_ref="Group")
    assert groups == []

    # All 30 Sentence nodes exist and none is attached to a group.
    sentences = list_entities(ops_db, type_ref="Sentence")
    assert len(sentences) == 30
    assert all(s["group_id"] is None for s in sentences)
def test_split_sentences_above_threshold_creates_group(ops_db, app_dir,
                                                       registry_root):
    """100 sentences → 1 Group + 10 loose Sentence nodes + 90 grouped."""
    text = _build_paragraph(100)
    make_node(ops_db, node_id="t1", name="huge doc",
              type_ref="text", notes=text)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="huge doc", node_type="text")

    rc, out, err = run_enricher("split_sentences", ctx)

    assert rc == 0, err
    # Same guard the sibling tests use: if `out` is None, fail with a plain
    # assertion instead of a TypeError when subscripting it below.
    assert out is not None
    assert out["sentences"] == 100
    assert out["grouped"] is True
    assert out["group_id"]

    # Exactly one Group, whose metadata describes this enricher run.
    groups = list_entities(ops_db, type_ref="Group")
    assert len(groups) == 1
    g = groups[0]
    assert g["metadata"]["count"] == 100
    assert g["metadata"]["enricher"] == "split_sentences"
    assert g["metadata"]["source_node_id"] == "t1"
    assert g["metadata"].get("batch_id")

    # 10 sentences have no group; the other 90 belong to the Group.
    sentences = list_entities(ops_db, type_ref="Sentence")
    assert len(sentences) == 100
    sueltos = [s for s in sentences if s["group_id"] is None]
    children = [s for s in sentences if s["group_id"] == g["id"]]
    assert len(sueltos) == 10
    assert len(children) == 90

    # Group + 100 Sentence = 101 SENTENCE_OF relations to the source node.
    rels = list_relations(ops_db, name="SENTENCE_OF")
    to_t1 = [r for r in rels if r["to_entity"] == "t1"]
    assert len(to_t1) == 101
    assert any(r["from_entity"] == g["id"] for r in to_t1)
def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root):
    """Node without notes and with a short name → rc 2 plus an error text."""
    make_node(ops_db, node_id="t1", name="x", type_ref="text", metadata={})
    ctx = base_ctx(
        ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
        node_id="t1", node_name="x", node_type="text",
    )

    rc, out, err = run_enricher("split_sentences", ctx)

    assert rc == 2
    assert out is not None
    # A missing or empty "error" field is treated as an empty message.
    message = out.get("error") or ""
    # Either wording is accepted in the reported error.
    assert "demasiado corto" in message or "min_length" in message
def test_split_sentences_uses_notes_priority(ops_db, app_dir, registry_root):
    """`entities.notes` wins over node_name even when both carry text."""
    make_node(ops_db, node_id="t1", name="placeholder corto",
              type_ref="text", notes=SAMPLE_TEXT)
    ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
                   node_id="t1", node_name="placeholder corto",
                   node_type="text")

    rc, out, err = run_enricher("split_sentences", ctx)

    assert rc == 0, err
    # Same guard the sibling tests use: if `out` is None, fail with a plain
    # assertion instead of a TypeError when subscripting it below.
    assert out is not None
    # 5 sentences come from SAMPLE_TEXT (notes), not 1 from the node name.
    assert out["sentences"] == 5