52495af779
Manifest YAML puede declarar 'auto_group_threshold: <int>' a nivel top-level. enrichers.cpp lo parsea y lo guarda en EnricherSpec. jobs.cpp lo inyecta como campo opcional 'auto_group_threshold' en el JSON stdin del subprocess. Los enrichers Python que crean Groups (web_search, split_words, split_sentences, extract_iocs_text) leen el campo y, si viene > 0, lo usan en lugar de su DEFAULT_GROUP_THRESHOLD. Helper _coerce_threshold tolera int / str / None / 0 cayendo al default.
350 lines
12 KiB
Python
350 lines
12 KiB
Python
#!/usr/bin/env python3
"""Enricher split_sentences — splits text into sentences (pure regex, offline).

Standard wire protocol (issue 0026):
  - stdin:  JSON with node_id, node_name, metadata, ops_db_path, app_dir,
            cache_dir, registry_root, params and, optionally,
            auto_group_threshold.
  - stderr: `PROGRESS:<float> <stage>` lines for UI feedback.
  - stdout: a single JSON line at the end with the summary.
  - exit code 0 = ok, !=0 = error.

Text source (in priority order):
  1. `entities.notes` (what the user writes in the Note panel via double
                       click — the canonical place for long text)
  2. node_name        (node title, minimal fallback)

If the resulting text is shorter than min_length, the enricher fails with
exit code 2 and a clear message.

Grouping (issue 0035c, same pattern as web_search):
  - If len(sentences) >= the group threshold (DEFAULT_GROUP_THRESHOLD unless
    the context carries a positive auto_group_threshold) and the DB supports
    group_id:
      * Creates a Group (`type_ref='Group'`) attached to the source node
        with SENTENCE_OF.
      * The first GROUP_PREVIEW_K sentences stay loose (group_id=NULL).
      * The rest get group_id pointing at the newly created Group.
  - Below the threshold: every sentence is loose, no Group is created.
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import sys
|
|
import time
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
# Sentence count from which a Group entity is created; used whenever the
# context does not provide a usable `auto_group_threshold` override.
DEFAULT_GROUP_THRESHOLD = 50
# Number of leading sentences that are inserted without a group_id when
# grouping applies (they stay visible as loose nodes).
GROUP_PREVIEW_K = 10
|
|
|
|
|
|
def _coerce_threshold(raw, default: int) -> int:
|
|
"""Acepta int / str numerico / None, devuelve >0 o el default (issue 0035e)."""
|
|
if raw is None or raw == "":
|
|
return default
|
|
try:
|
|
v = int(raw)
|
|
except (TypeError, ValueError):
|
|
return default
|
|
return v if v > 0 else default
|
|
|
|
# Split por delimitador de oracion (.!?) seguido de whitespace seguido de
|
|
# inicial de oracion en mayusculas (incluye acentos espanoles). Robusto
|
|
# para texto en espanol e ingles. Casos limite (abreviaturas como "Sr.",
|
|
# "Dr.") quedan como falsos negativos aceptables — el split es heuristico.
|
|
_SENT_SPLIT_RE = re.compile(r'(?<=[.!?])\s+(?=[A-ZÁÉÍÓÚÜÑ])')
|
|
|
|
|
|
def progress(p: float, stage: str = "") -> None:
    """Emit one `PROGRESS:<fraction> <stage>` line on stderr and flush it.

    Args:
        p: Progress value, rendered with two decimals.
        stage: Optional label appended after the number.
    """
    line = f"PROGRESS:{p:.2f} {stage}\n"
    err = sys.stderr
    err.write(line)
    # Flush right away so the line is not held back in the stream buffer.
    err.flush()
|
|
|
|
|
|
def log(msg: str) -> None:
    """Write `msg` as a single newline-terminated line to stderr and flush."""
    err = sys.stderr
    err.write(f"{msg}\n")
    err.flush()
|
|
|
|
|
|
def now_iso() -> str:
    """Return the current UTC time formatted as `YYYY-MM-DDTHH:MM:SSZ`."""
    current = datetime.now(timezone.utc)
    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
def now_ms() -> int:
    """Return the current Unix time in whole milliseconds."""
    seconds = time.time()
    return int(seconds * 1000)
|
|
|
|
|
|
def has_group_id_column(conn: sqlite3.Connection) -> bool:
    """Report whether the `entities` table has a `group_id` column.

    The current schema includes the column (issue 0035a), but older
    databases may lack it; callers then insert without that column.

    Returns:
        True when `PRAGMA table_info(entities)` lists `group_id`; False when
        it does not or when the query raises `sqlite3.Error`.
    """
    try:
        columns = conn.execute("PRAGMA table_info(entities)")
        # Index 1 of each table_info row holds the column name.
        return any(info[1] == "group_id" for info in columns)
    except sqlite3.Error:
        return False
|
|
|
|
|
|
def read_text(ops_db_path: str, node_id: str, node_name: str) -> str:
    """Resolve the text this enricher should process.

    Priority:
      1. `entities.notes` of the node (what the user writes in the Note
         panel via double click) — the canonical place for long text.
      2. `node_name` (the node title) as a minimal fallback.

    Any `sqlite3.Error` while reading the notes is treated as "no notes".

    Returns:
        The stripped notes when they contain non-whitespace text, otherwise
        the stripped `node_name` (empty string when that is falsy too).
    """
    stored = ""
    try:
        db = sqlite3.connect(ops_db_path)
        try:
            cursor = db.execute(
                "SELECT notes FROM entities WHERE id=?", (node_id,)
            )
            found = cursor.fetchone()
            # NULL or non-text notes are ignored.
            if found and isinstance(found[0], str):
                stored = found[0]
        finally:
            db.close()
    except sqlite3.Error:
        stored = ""

    cleaned = stored.strip()
    if cleaned:
        return cleaned
    return (node_name or "").strip()
|
|
|
|
|
|
def split_into_sentences(text: str, min_length: int) -> list[str]:
    """Split `text` with the sentence regex and drop pieces that are too short.

    Each piece is stripped of surrounding whitespace first; pieces whose
    stripped length is below `min_length` are discarded. Order is preserved.
    """
    stripped = (piece.strip() for piece in _SENT_SPLIT_RE.split(text))
    return [candidate for candidate in stripped if len(candidate) >= min_length]
|
|
|
|
|
|
def insert_sentence(conn: sqlite3.Connection, *, sentence: str, rank: int,
                    batch_id: str, group_id: str | None,
                    has_group_col: bool) -> str:
    """Insert one Sentence entity and return its id.

    No deduplication is done: every run creates new entities (sentences may
    repeat across runs; rank and batch_id tell them apart).

    Args:
        conn: Open connection to the ops database; the caller commits.
        sentence: Full sentence text, stored under `text` in the metadata.
        rank: Position of the sentence within the run, also part of the id.
        batch_id: Identifier shared by all entities created in this run.
        group_id: Group the sentence belongs to, or None for a loose one.
        has_group_col: Whether `entities.group_id` exists; when False the
            column is left out of the INSERT and `group_id` is ignored.
    """
    stamp = now_iso()
    entity_id = f"Sentence_{now_ms()}_{rank}"

    # The display name is capped at 80 characters plus an ellipsis marker.
    label = sentence[:80]
    if len(sentence) > 80:
        label += "..."

    payload = json.dumps(
        {
            "text": sentence,
            "rank": rank,
            "batch_id": batch_id,
        },
        ensure_ascii=False,
    )

    if not has_group_col:
        statement = (
            "INSERT INTO entities (id, name, type_ref, source, metadata, "
            " created_at, updated_at) "
            "VALUES (?, ?, 'Sentence', 'enricher:split_sentences', ?, ?, ?)"
        )
        values = (entity_id, label, payload, stamp, stamp)
    else:
        statement = (
            "INSERT INTO entities (id, name, type_ref, source, metadata, "
            " group_id, created_at, updated_at) "
            "VALUES (?, ?, 'Sentence', 'enricher:split_sentences', ?, ?, ?, ?)"
        )
        values = (entity_id, label, payload, group_id, stamp, stamp)

    conn.execute(statement, values)
    return entity_id
|
|
|
|
|
|
def insert_group_entity(conn: sqlite3.Connection, *, source_node_id: str,
                        source_node_name: str, count: int,
                        batch_id: str) -> str:
    """Insert the Group entity for one run and return its id.

    Args:
        conn: Open connection to the ops database; the caller commits.
        source_node_id: Id of the node whose text was split; stored in the
            metadata and mixed into the id suffix.
        source_node_name: Human-readable name used in the group title.
        count: Total number of sentences produced by the run.
        batch_id: Identifier shared by all entities created in this run.
    """
    stamp = now_iso()
    millis = now_ms()
    # Five-digit suffix from the built-in hash(); str hashes are salted per
    # process, so the suffix is not reproducible across runs.
    suffix = abs(hash(source_node_id + batch_id)) % 100000
    entity_id = f"Group_{millis}_{suffix}"
    title = f"split_sentences: {source_node_name} ({count})"

    payload = json.dumps(
        {
            "enricher": "split_sentences",
            "count": count,
            "batch_id": batch_id,
            "source_node_id": source_node_id,
        },
        ensure_ascii=False,
    )

    statement = (
        "INSERT INTO entities (id, name, type_ref, source, metadata, "
        " created_at, updated_at) "
        "VALUES (?, ?, 'Group', 'enricher:split_sentences', ?, ?, ?)"
    )
    conn.execute(statement, (entity_id, title, payload, stamp, stamp))
    return entity_id
|
|
|
|
|
|
# Per-process counter mixed into relation ids so that relations created
# within the same millisecond still get distinct ids.
_REL_COUNTER = 0


def insert_relation(conn: sqlite3.Connection, from_id: str, to_id: str,
                    name: str) -> bool:
    """Insert a relation unless an identical one already exists.

    A relation counts as existing when a row with the same `from_entity`,
    `to_entity` and `name` is present.

    Returns:
        True when a new row was inserted, False when the relation was
        already there (nothing is written and the counter is untouched).
    """
    global _REL_COUNTER

    existing = conn.execute(
        "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
        "AND name=? LIMIT 1",
        (from_id, to_id, name),
    ).fetchone()
    if existing is not None:
        return False

    stamp = now_iso()
    _REL_COUNTER += 1
    relation_id = f"rel_{now_ms()}_{_REL_COUNTER}_{name.lower()}"
    statement = (
        "INSERT INTO relations (id, name, from_entity, to_entity, "
        " created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?)"
    )
    conn.execute(statement, (relation_id, name, from_id, to_id, stamp, stamp))
    return True
|
|
|
|
|
|
def main() -> int:
    """Run the split_sentences enricher once and return the process exit code.

    Reads one JSON context object from stdin, resolves the ops database
    path, loads the node text, splits it into sentences and writes Sentence
    entities (plus a Group entity when grouping applies) and their
    SENTENCE_OF relations in a single transaction. Progress lines go to
    stderr; a single JSON summary line goes to stdout.

    Returns:
        0 on success.
        2 on invalid input: stdin is not a JSON object, node_id or
          ops_db_path is missing, params are not usable integers, the text
          is shorter than min_length, or no sentence survives the split.
        7 when the ops database file does not exist.
        1 when writing to the database fails (the same code an unhandled
          exception produced before; now accompanied by a JSON error line).
    """
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except (ValueError, RecursionError) as e:
        log(f"stdin not valid JSON: {e}")
        return 2
    if not isinstance(ctx, dict):
        # Valid JSON that is not an object (a list, a number, ...) cannot
        # carry the context fields; fail cleanly instead of crashing on .get.
        log(f"stdin JSON must be an object, got {type(ctx).__name__}")
        return 2

    node_id = ctx.get("node_id") or ""
    node_name = (ctx.get("node_name") or "").strip()
    ops_db_path = ctx.get("ops_db_path") or ""

    params = ctx.get("params") or {}
    try:
        if not isinstance(params, dict):
            raise TypeError(
                f"params must be an object, got {type(params).__name__}")
        max_sentences = int(params.get("max_sentences", 200))
        min_length = int(params.get("min_length", 20))
    except (TypeError, ValueError, OverflowError) as e:
        msg = f"invalid params: {e}"
        log(msg)
        print(json.dumps({"error": msg, "entities_added": 0,
                          "relations_added": 0}))
        return 2

    if not node_id or not ops_db_path:
        log("missing node_id / ops_db_path")
        return 2

    # Normalize and resolve the path the same way web_search does: a relative
    # path is tried against app_dir first, then against the current directory.
    ops_db_path = ops_db_path.replace("\\", "/")
    app_dir_raw = (ctx.get("app_dir") or "").replace("\\", "/")
    if not os.path.isabs(ops_db_path):
        if app_dir_raw and os.path.isdir(app_dir_raw):
            cand = os.path.normpath(os.path.join(app_dir_raw, ops_db_path))
            if os.path.exists(cand):
                ops_db_path = cand
        if not os.path.isabs(ops_db_path):
            ops_db_path = os.path.abspath(ops_db_path)

    if not os.path.exists(ops_db_path):
        log(f"ops_db_path no existe: {ops_db_path}")
        print(json.dumps({"error": "ops_db not found",
                          "ops_db_path": ops_db_path,
                          "entities_added": 0, "relations_added": 0}))
        return 7

    progress(0.10, "reading")
    text = read_text(ops_db_path, node_id, node_name)
    if len(text) < min_length:
        msg = (f"texto demasiado corto ({len(text)} chars < {min_length}). "
               f"Escribe el texto en el panel Note del nodo (doble click "
               f"para abrir) o pon un name mas largo")
        log(msg)
        print(json.dumps({"error": msg, "entities_added": 0,
                          "relations_added": 0}))
        return 2

    progress(0.30, "splitting")
    sentences = split_into_sentences(text, min_length)
    if max_sentences > 0:
        sentences = sentences[:max_sentences]

    if not sentences:
        msg = (f"sin frases tras split (texto de {len(text)} chars, "
               f"min_length={min_length})")
        log(msg)
        print(json.dumps({"error": msg, "entities_added": 0,
                          "relations_added": 0}))
        return 2

    progress(0.55, "writing")
    entities_added = 0
    relations_added = 0
    group_id: str | None = None
    batch_id = uuid.uuid4().hex
    try:
        conn = sqlite3.connect(ops_db_path)
        try:
            conn.execute("PRAGMA foreign_keys=OFF")
            has_group_col = has_group_id_column(conn)
            n_total = len(sentences)
            # Issue 0035e: honor the manifest override when the context has it.
            threshold = _coerce_threshold(ctx.get("auto_group_threshold"),
                                          DEFAULT_GROUP_THRESHOLD)

            # Group only when something would actually land inside the group:
            # with an override at or below GROUP_PREVIEW_K every sentence is
            # part of the preview and the Group would have no members.
            use_group = (has_group_col and n_total >= threshold
                         and n_total > GROUP_PREVIEW_K)
            if use_group:
                group_id = insert_group_entity(
                    conn,
                    source_node_id=node_id,
                    source_node_name=node_name or "(text)",
                    count=n_total,
                    batch_id=batch_id,
                )
                entities_added += 1
                if insert_relation(conn, group_id, node_id, "SENTENCE_OF"):
                    relations_added += 1
                preview = sentences[:GROUP_PREVIEW_K]
                grouped = sentences[GROUP_PREVIEW_K:]
            else:
                preview = sentences
                grouped = []

            # Loose sentences (preview).
            for i, s in enumerate(preview):
                sid = insert_sentence(
                    conn, sentence=s, rank=i + 1, batch_id=batch_id,
                    group_id=None, has_group_col=has_group_col,
                )
                entities_added += 1
                if insert_relation(conn, sid, node_id, "SENTENCE_OF"):
                    relations_added += 1

            # Grouped sentences — they still hang from the source node via
            # SENTENCE_OF, and their rank continues after the preview.
            for j, s in enumerate(grouped):
                rank = GROUP_PREVIEW_K + j + 1
                sid = insert_sentence(
                    conn, sentence=s, rank=rank, batch_id=batch_id,
                    group_id=group_id, has_group_col=has_group_col,
                )
                entities_added += 1
                if insert_relation(conn, sid, node_id, "SENTENCE_OF"):
                    relations_added += 1

                # Report every 25 sentences so the UI sees steady progress.
                if j % 25 == 0:
                    progress(0.55 + 0.40 * (j / max(1, len(grouped))),
                             "writing")

            conn.commit()
        finally:
            # Closing without a commit discards the partially written batch.
            conn.close()
    except sqlite3.Error as e:
        msg = f"ops_db write failed: {e}"
        log(msg)
        print(json.dumps({"error": msg, "entities_added": 0,
                          "relations_added": 0}))
        return 1

    progress(1.0, "done")
    print(json.dumps({
        "sentences": len(sentences),
        "entities_added": entities_added,
        "relations_added": relations_added,
        "batch_id": batch_id,
        "group_id": group_id or "",
        "grouped": bool(group_id),
    }, ensure_ascii=False))
    return 0
|
|
|
|
|
|
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|