"""Tests del enricher split_sentences — split por regex, sin red. Cubrimos: - happy path: 5 frases → 5 nodos Sentence + relaciones SENTENCE_OF. - below threshold: ningun Group. - above threshold (>=50): 1 Group + K sueltos + N-K agrupados. - sin texto: exit 2 con mensaje claro. """ from __future__ import annotations from conftest import ( base_ctx, list_entities, list_relations, make_node, run_enricher, ) SAMPLE_TEXT = ( "El tomate es originario de America. Su cultivo se extendio por Europa " "en el siglo XVI. Hoy se considera una hortaliza basica. La variedad " "cherry es popular en ensaladas frescas. Existen mas de mil variedades " "registradas en el mundo entero." ) def _build_paragraph(n: int) -> str: """Genera un texto con N frases unicas, cada una >=20 chars.""" rows = [] for i in range(n): rows.append( f"Esta es la frase numero {i:03d} con suficiente contenido " f"para superar el min_length por defecto del enricher." ) return " ".join(rows) def test_split_sentences_creates_sentence_nodes(ops_db, app_dir, registry_root): """Texto con 5 frases distintas → 5 Sentence + 5 SENTENCE_OF.""" make_node(ops_db, node_id="t1", name="tomate doc", type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="tomate doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out is not None assert out["sentences"] == 5, out assert out["entities_added"] == 5 assert out["grouped"] is False assert out["group_id"] == "" sentences = list_entities(ops_db, type_ref="Sentence") assert len(sentences) == 5 # Todas con metadata.text igual a la frase completa y rank ascendente. ranks = sorted(s["metadata"]["rank"] for s in sentences) assert ranks == [1, 2, 3, 4, 5] # batch_id compartido. batch_ids = {s["metadata"]["batch_id"] for s in sentences} assert len(batch_ids) == 1 rels = list_relations(ops_db, name="SENTENCE_OF") assert len(rels) == 5 assert all(r["to_entity"] == "t1" for r in rels) def test_split_sentences_below_threshold_no_group(ops_db, app_dir, registry_root): """30 frases → ningun Group (<50).""" text = _build_paragraph(30) make_node(ops_db, node_id="t1", name="big doc", type_ref="text", notes=text) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="big doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 30 assert out["grouped"] is False assert out["group_id"] == "" groups = list_entities(ops_db, type_ref="Group") assert groups == [] sentences = list_entities(ops_db, type_ref="Sentence") assert len(sentences) == 30 assert all(s["group_id"] is None for s in sentences) def test_split_sentences_above_threshold_creates_group(ops_db, app_dir, registry_root): """100 frases → 1 Group + 10 sueltos + 90 con group_id.""" text = _build_paragraph(100) make_node(ops_db, node_id="t1", name="huge doc", type_ref="text", notes=text) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="huge doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 100 assert out["grouped"] is True assert out["group_id"] groups = list_entities(ops_db, type_ref="Group") assert len(groups) == 1 g = groups[0] assert g["metadata"]["count"] == 100 assert g["metadata"]["enricher"] == "split_sentences" assert g["metadata"]["source_node_id"] == "t1" assert g["metadata"].get("batch_id") sentences = list_entities(ops_db, type_ref="Sentence") assert len(sentences) == 100 sueltos = [s for s in sentences if s["group_id"] is None] children = [s for s in sentences if s["group_id"] == g["id"]] assert len(sueltos) == 10 assert len(children) == 90 # Group + 100 Sentence = 101 SENTENCE_OF al source. rels = list_relations(ops_db, name="SENTENCE_OF") to_t1 = [r for r in rels if r["to_entity"] == "t1"] assert len(to_t1) == 101 assert any(r["from_entity"] == g["id"] for r in to_t1) def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root): """Nodo sin notes y con name corto → exit 2.""" make_node(ops_db, node_id="t1", name="x", type_ref="text", metadata={}) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="x", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 2 assert out is not None assert "demasiado corto" in (out.get("error") or "") or \ "min_length" in (out.get("error") or "") def test_split_sentences_uses_notes_priority(ops_db, app_dir, registry_root): """`entities.notes` gana sobre node_name aunque ambos tengan texto.""" make_node(ops_db, node_id="t1", name="placeholder corto", type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="placeholder corto", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 5 # 5 frases del SAMPLE_TEXT, no 1 del name