fix(enrichers): split_sentences y extract_iocs_text leen entities.notes
El campo `notes` es lo que el usuario escribe en el panel Note del
Inspector (doble clic sobre el nodo) — sitio canónico para texto
largo. Antes los enrichers leían metadata.text/description/query como
prioridad, dejando notes ignorado y forzando al usuario a inyectar
texto vía la UI metadata-extra (poco descubrible).
Cambios:
- Ambos run.py abren la BD y leen `entities.notes` por SQL; solo si
está vacío recurren a node_name. metadata.text/description/query ya
no se consultan (KISS — solo notes y name).
- conftest.make_node admite kwarg `notes` para inyectar contenido
en la columna notes desde tests.
- Tests actualizados: SAMPLE_TEXT y los IoC dumps van por `notes=`
en lugar de `metadata={"text": ...}`.
- Renombrados los tests que verificaban prioridad: ahora se llaman
`*_uses_notes_priority` y verifican notes > name.
Tests verdes WSL (44) y Windows (33 + 11 skipped).
This commit is contained in:
Binary file not shown.
+11
-4
@@ -191,15 +191,22 @@ def registry_root():
|
||||
|
||||
|
||||
def make_node(ops_db: Path, *, node_id: str, name: str, type_ref: str,
|
||||
metadata: dict | None = None, source: str = "test") -> None:
|
||||
"""Inserta un nodo de tipo arbitrario en operations.db."""
|
||||
metadata: dict | None = None, source: str = "test",
|
||||
notes: str = "") -> None:
|
||||
"""Inserta un nodo de tipo arbitrario en operations.db.
|
||||
|
||||
`notes` se mapea a la columna `entities.notes` — es lo que el
|
||||
panel Note del Inspector edita en la app real, y los enrichers
|
||||
`split_sentences` / `extract_iocs_text` lo leen como fuente de
|
||||
texto canonica.
|
||||
"""
|
||||
conn = sqlite3.connect(ops_db)
|
||||
conn.execute(
|
||||
"INSERT INTO entities (id, name, type_ref, source, metadata, "
|
||||
" created_at, updated_at) VALUES (?, ?, ?, ?, ?, "
|
||||
" notes, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, "
|
||||
" '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')",
|
||||
(node_id, name, type_ref, source,
|
||||
json.dumps(metadata or {}, ensure_ascii=False)),
|
||||
json.dumps(metadata or {}, ensure_ascii=False), notes),
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
@@ -32,10 +32,9 @@ def _ioc_paragraph(n: int) -> str:
|
||||
def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root):
|
||||
"""Texto con emails, IPs, CVE, hash → entidades creadas con tipos correctos."""
|
||||
make_node(ops_db, node_id="t1", name="incident",
|
||||
type_ref="text", metadata={"text": SAMPLE_TEXT})
|
||||
type_ref="text", notes=SAMPLE_TEXT)
|
||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||
node_id="t1", node_name="incident", node_type="text",
|
||||
metadata={"text": SAMPLE_TEXT})
|
||||
node_id="t1", node_name="incident", node_type="text")
|
||||
|
||||
rc, out, err = run_enricher("extract_iocs_text", ctx)
|
||||
assert rc == 0, err
|
||||
@@ -53,17 +52,16 @@ def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root):
|
||||
assert all(r["to_entity"] == "t1" for r in rels)
|
||||
|
||||
|
||||
def test_extract_iocs_text_uses_metadata_text(ops_db, app_dir, registry_root):
|
||||
"""metadata.text se prioriza sobre node_name."""
|
||||
def test_extract_iocs_text_uses_notes_priority(ops_db, app_dir, registry_root):
|
||||
"""`entities.notes` se prioriza sobre node_name."""
|
||||
make_node(ops_db, node_id="t1", name="placeholder",
|
||||
type_ref="text", metadata={"text": SAMPLE_TEXT})
|
||||
type_ref="text", notes=SAMPLE_TEXT)
|
||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||
node_id="t1", node_name="placeholder", node_type="text",
|
||||
metadata={"text": SAMPLE_TEXT})
|
||||
node_id="t1", node_name="placeholder", node_type="text")
|
||||
rc, out, err = run_enricher("extract_iocs_text", ctx)
|
||||
assert rc == 0, err
|
||||
# El name "placeholder" no contiene IoCs; si se hubiese usado, no
|
||||
# habria entidades. Ergo entities_added > 0 demuestra que leyo text.
|
||||
# habria entidades. Ergo entities_added > 0 demuestra que leyo notes.
|
||||
assert out["entities_added"] >= 2, out
|
||||
|
||||
|
||||
@@ -83,10 +81,9 @@ def test_extract_iocs_text_above_threshold_creates_group(ops_db, app_dir,
|
||||
""">=50 IoCs → Group heterogeneo con todos dentro (fase 1)."""
|
||||
text = _ioc_paragraph(180) # ~60 emails + ~60 IPs + ~60 CVEs
|
||||
make_node(ops_db, node_id="t1", name="dump",
|
||||
type_ref="text", metadata={"text": text})
|
||||
type_ref="text", notes=text)
|
||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||
node_id="t1", node_name="dump", node_type="text",
|
||||
metadata={"text": text})
|
||||
node_id="t1", node_name="dump", node_type="text")
|
||||
rc, out, err = run_enricher("extract_iocs_text", ctx)
|
||||
assert rc == 0, err
|
||||
assert out["iocs_found"] >= 50, out
|
||||
|
||||
@@ -35,10 +35,9 @@ def _build_paragraph(n: int) -> str:
|
||||
def test_split_sentences_creates_sentence_nodes(ops_db, app_dir, registry_root):
|
||||
"""Texto con 5 frases distintas → 5 Sentence + 5 SENTENCE_OF."""
|
||||
make_node(ops_db, node_id="t1", name="tomate doc",
|
||||
type_ref="text", metadata={"text": SAMPLE_TEXT})
|
||||
type_ref="text", notes=SAMPLE_TEXT)
|
||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||
node_id="t1", node_name="tomate doc", node_type="text",
|
||||
metadata={"text": SAMPLE_TEXT})
|
||||
node_id="t1", node_name="tomate doc", node_type="text")
|
||||
|
||||
rc, out, err = run_enricher("split_sentences", ctx)
|
||||
assert rc == 0, err
|
||||
@@ -67,10 +66,9 @@ def test_split_sentences_below_threshold_no_group(ops_db, app_dir,
|
||||
"""30 frases → ningun Group (<50)."""
|
||||
text = _build_paragraph(30)
|
||||
make_node(ops_db, node_id="t1", name="big doc",
|
||||
type_ref="text", metadata={"text": text})
|
||||
type_ref="text", notes=text)
|
||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||
node_id="t1", node_name="big doc", node_type="text",
|
||||
metadata={"text": text})
|
||||
node_id="t1", node_name="big doc", node_type="text")
|
||||
rc, out, err = run_enricher("split_sentences", ctx)
|
||||
assert rc == 0, err
|
||||
assert out["sentences"] == 30
|
||||
@@ -89,10 +87,9 @@ def test_split_sentences_above_threshold_creates_group(ops_db, app_dir,
|
||||
"""100 frases → 1 Group + 10 sueltos + 90 con group_id."""
|
||||
text = _build_paragraph(100)
|
||||
make_node(ops_db, node_id="t1", name="huge doc",
|
||||
type_ref="text", metadata={"text": text})
|
||||
type_ref="text", notes=text)
|
||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||
node_id="t1", node_name="huge doc", node_type="text",
|
||||
metadata={"text": text})
|
||||
node_id="t1", node_name="huge doc", node_type="text")
|
||||
rc, out, err = run_enricher("split_sentences", ctx)
|
||||
assert rc == 0, err
|
||||
assert out["sentences"] == 100
|
||||
@@ -122,7 +119,7 @@ def test_split_sentences_above_threshold_creates_group(ops_db, app_dir,
|
||||
|
||||
|
||||
def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root):
|
||||
"""Nodo sin metadata.text/description/query y name corto → exit 2."""
|
||||
"""Nodo sin notes y con name corto → exit 2."""
|
||||
make_node(ops_db, node_id="t1", name="x", type_ref="text", metadata={})
|
||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||
node_id="t1", node_name="x", node_type="text")
|
||||
@@ -133,15 +130,13 @@ def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root):
|
||||
"min_length" in (out.get("error") or "")
|
||||
|
||||
|
||||
def test_split_sentences_uses_metadata_text_priority(ops_db, app_dir,
|
||||
registry_root):
|
||||
"""metadata.text gana sobre node_name aunque ambos tengan texto."""
|
||||
def test_split_sentences_uses_notes_priority(ops_db, app_dir, registry_root):
|
||||
"""`entities.notes` gana sobre node_name aunque ambos tengan texto."""
|
||||
make_node(ops_db, node_id="t1", name="placeholder corto",
|
||||
type_ref="text", metadata={"text": SAMPLE_TEXT})
|
||||
type_ref="text", notes=SAMPLE_TEXT)
|
||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||
node_id="t1", node_name="placeholder corto",
|
||||
node_type="text",
|
||||
metadata={"text": SAMPLE_TEXT})
|
||||
node_type="text")
|
||||
rc, out, err = run_enricher("split_sentences", ctx)
|
||||
assert rc == 0, err
|
||||
assert out["sentences"] == 5 # 5 frases del SAMPLE_TEXT, no 1 del name
|
||||
|
||||
Reference in New Issue
Block a user