Merge issue/text-enrichers-read-notes
This commit is contained in:
@@ -2,9 +2,10 @@
|
|||||||
"""Enricher extract_iocs_text — variante offline de extract_text_entities.
|
"""Enricher extract_iocs_text — variante offline de extract_text_entities.
|
||||||
|
|
||||||
A diferencia de extract_text_entities, este enricher NO depende de un
|
A diferencia de extract_text_entities, este enricher NO depende de un
|
||||||
markdown cacheado (fetch_webpage previo). Lee el texto directamente del
|
markdown cacheado (fetch_webpage previo). Lee el texto directamente
|
||||||
nodo (`metadata.text` > `metadata.description` > `metadata.query` >
|
del nodo (prioridad: `entities.notes` > `node_name`) y aplica el
|
||||||
`node_name`) y aplica el pipeline `extract_iocs` del registry sobre el.
|
pipeline `extract_iocs` del registry sobre el. El campo `notes` es lo
|
||||||
|
que el usuario escribe en el panel Note (doble click sobre el nodo).
|
||||||
|
|
||||||
Sin red, sin dependencias externas — pensado para probar la app
|
Sin red, sin dependencias externas — pensado para probar la app
|
||||||
cuando DDG bloquea con captcha o cuando se trabaja en un entorno
|
cuando DDG bloquea con captcha o cuando se trabaja en un entorno
|
||||||
@@ -76,11 +77,27 @@ def has_group_id_column(conn: sqlite3.Connection) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def read_text(ops_db_path: str, node_id: str, node_name: str) -> str:
    """Return the text to process for a node.

    Priority:
      1. The `notes` column of the `entities` row whose `id` is `node_id`
         (what the user writes in the node's Note panel, opened with a
         double click). It is the canonical place for long text.
      2. `node_name`, used when `notes` is missing, not a string, or blank.

    The lookup is best-effort: a missing database file or any
    `sqlite3.Error` falls back to `node_name` instead of raising.

    Args:
        ops_db_path: Path of the SQLite database holding the `entities` table.
        node_id: Value matched against `entities.id`.
        node_name: Node title used as the fallback text; may be None.

    Returns:
        The chosen text with surrounding whitespace stripped; possibly "".
    """
    import os
    from contextlib import closing

    fallback = (node_name or "").strip()

    # sqlite3.connect() silently creates an empty database file when the
    # path does not exist; a read-only lookup must not leave one behind.
    if not os.path.isfile(ops_db_path):
        return fallback

    try:
        # closing() guarantees the connection is released even if the
        # query raises.
        with closing(sqlite3.connect(ops_db_path)) as conn:
            row = conn.execute(
                "SELECT notes FROM entities WHERE id=?", (node_id,)
            ).fetchone()
    except sqlite3.Error:
        # Unreadable database or missing table/column: use the fallback.
        return fallback

    notes = row[0] if row else None
    if isinstance(notes, str) and notes.strip():
        return notes.strip()
    return fallback
|
||||||
|
|
||||||
|
|
||||||
@@ -130,10 +147,11 @@ def main() -> int:
|
|||||||
return 7
|
return 7
|
||||||
|
|
||||||
progress(0.10, "reading")
|
progress(0.10, "reading")
|
||||||
text = read_text(metadata, node_name)
|
text = read_text(ops_db_path, node_id, node_name)
|
||||||
if not text:
|
if not text:
|
||||||
msg = ("nodo sin texto. Esperaba metadata.text / description / "
|
msg = ("nodo sin texto. Escribe el contenido en el panel Note "
|
||||||
"query, o un name con contenido")
|
"del nodo (doble click para abrir) o pon un name con "
|
||||||
|
"contenido")
|
||||||
log(msg)
|
log(msg)
|
||||||
print(json.dumps({"error": msg, "entities_added": 0,
|
print(json.dumps({"error": msg, "entities_added": 0,
|
||||||
"relations_added": 0}))
|
"relations_added": 0}))
|
||||||
|
|||||||
@@ -9,10 +9,10 @@ Wire protocol estandar (issue 0026):
|
|||||||
- exit code 0 = ok, !=0 = error.
|
- exit code 0 = ok, !=0 = error.
|
||||||
|
|
||||||
Lectura del texto (en orden de prioridad):
|
Lectura del texto (en orden de prioridad):
|
||||||
1. metadata.text (campo canonico de un nodo Text)
|
1. `entities.notes` (lo que el usuario escribe en el panel Note
|
||||||
2. metadata.description
|
via doble click — sitio canonico de texto
|
||||||
3. metadata.query (compatible con nodos creados desde la barra de busqueda)
|
largo)
|
||||||
4. node_name (fallback minimo)
|
2. node_name (titulo del nodo, fallback minimo)
|
||||||
|
|
||||||
Si tras esto el texto es < min_length, falla con exit 2 y mensaje claro.
|
Si tras esto el texto es < min_length, falla con exit 2 y mensaje claro.
|
||||||
|
|
||||||
@@ -79,12 +79,29 @@ def has_group_id_column(conn: sqlite3.Connection) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def read_text(ops_db_path: str, node_id: str, node_name: str) -> str:
    """Resolve the text to process.

    Priority:
      1. `entities.notes` of the node (what the user writes in the Note
         panel via double click). It is the canonical place for long text.
      2. `node_name` (the node title) as a minimal fallback.

    The database lookup is best-effort: when the file does not exist or
    SQLite reports any error, the function falls back to `node_name`
    rather than raising.

    Args:
        ops_db_path: Path of the SQLite database holding the `entities` table.
        node_id: Value matched against `entities.id`.
        node_name: Node title used as the fallback text; may be None.

    Returns:
        The chosen text with surrounding whitespace stripped; possibly "".
    """
    import os
    from contextlib import closing

    fallback = (node_name or "").strip()

    # Connecting to a non-existent path would make SQLite create an empty
    # database file; reading text must not have that side effect.
    if not os.path.isfile(ops_db_path):
        return fallback

    try:
        # The connection is closed by closing() whether or not the query
        # succeeds.
        with closing(sqlite3.connect(ops_db_path)) as conn:
            row = conn.execute(
                "SELECT notes FROM entities WHERE id=?", (node_id,)
            ).fetchone()
    except sqlite3.Error:
        # Unreadable database or missing table/column: use the fallback.
        return fallback

    notes = row[0] if row else None
    if isinstance(notes, str) and notes.strip():
        return notes.strip()
    return fallback
|
||||||
|
|
||||||
|
|
||||||
@@ -223,11 +240,11 @@ def main() -> int:
|
|||||||
return 7
|
return 7
|
||||||
|
|
||||||
progress(0.10, "reading")
|
progress(0.10, "reading")
|
||||||
text = read_text(metadata, node_name)
|
text = read_text(ops_db_path, node_id, node_name)
|
||||||
if len(text) < min_length:
|
if len(text) < min_length:
|
||||||
msg = (f"texto demasiado corto ({len(text)} chars < {min_length}). "
|
msg = (f"texto demasiado corto ({len(text)} chars < {min_length}). "
|
||||||
f"Esperaba metadata.text / description / query, o un name "
|
f"Escribe el texto en el panel Note del nodo (doble click "
|
||||||
f"con mas contenido")
|
f"para abrir) o pon un name mas largo")
|
||||||
log(msg)
|
log(msg)
|
||||||
print(json.dumps({"error": msg, "entities_added": 0,
|
print(json.dumps({"error": msg, "entities_added": 0,
|
||||||
"relations_added": 0}))
|
"relations_added": 0}))
|
||||||
|
|||||||
Binary file not shown.
+11
-4
@@ -191,15 +191,22 @@ def registry_root():
|
|||||||
|
|
||||||
|
|
||||||
def make_node(ops_db: Path, *, node_id: str, name: str, type_ref: str,
|
def make_node(ops_db: Path, *, node_id: str, name: str, type_ref: str,
|
||||||
metadata: dict | None = None, source: str = "test") -> None:
|
metadata: dict | None = None, source: str = "test",
|
||||||
"""Inserta un nodo de tipo arbitrario en operations.db."""
|
notes: str = "") -> None:
|
||||||
|
"""Inserta un nodo de tipo arbitrario en operations.db.
|
||||||
|
|
||||||
|
`notes` se mapea a la columna `entities.notes` — es lo que el
|
||||||
|
panel Note del Inspector edita en la app real, y los enrichers
|
||||||
|
`split_sentences` / `extract_iocs_text` lo leen como fuente de
|
||||||
|
texto canonica.
|
||||||
|
"""
|
||||||
conn = sqlite3.connect(ops_db)
|
conn = sqlite3.connect(ops_db)
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT INTO entities (id, name, type_ref, source, metadata, "
|
"INSERT INTO entities (id, name, type_ref, source, metadata, "
|
||||||
" created_at, updated_at) VALUES (?, ?, ?, ?, ?, "
|
" notes, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, "
|
||||||
" '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')",
|
" '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')",
|
||||||
(node_id, name, type_ref, source,
|
(node_id, name, type_ref, source,
|
||||||
json.dumps(metadata or {}, ensure_ascii=False)),
|
json.dumps(metadata or {}, ensure_ascii=False), notes),
|
||||||
)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|||||||
@@ -32,10 +32,9 @@ def _ioc_paragraph(n: int) -> str:
|
|||||||
def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root):
|
def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root):
|
||||||
"""Texto con emails, IPs, CVE, hash → entidades creadas con tipos correctos."""
|
"""Texto con emails, IPs, CVE, hash → entidades creadas con tipos correctos."""
|
||||||
make_node(ops_db, node_id="t1", name="incident",
|
make_node(ops_db, node_id="t1", name="incident",
|
||||||
type_ref="text", metadata={"text": SAMPLE_TEXT})
|
type_ref="text", notes=SAMPLE_TEXT)
|
||||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||||
node_id="t1", node_name="incident", node_type="text",
|
node_id="t1", node_name="incident", node_type="text")
|
||||||
metadata={"text": SAMPLE_TEXT})
|
|
||||||
|
|
||||||
rc, out, err = run_enricher("extract_iocs_text", ctx)
|
rc, out, err = run_enricher("extract_iocs_text", ctx)
|
||||||
assert rc == 0, err
|
assert rc == 0, err
|
||||||
@@ -53,17 +52,16 @@ def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root):
|
|||||||
assert all(r["to_entity"] == "t1" for r in rels)
|
assert all(r["to_entity"] == "t1" for r in rels)
|
||||||
|
|
||||||
|
|
||||||
def test_extract_iocs_text_uses_notes_priority(ops_db, app_dir, registry_root):
    """`entities.notes` takes priority over node_name."""
    node_id, node_name = "t1", "placeholder"
    make_node(ops_db, node_id=node_id, name=node_name, type_ref="text",
              notes=SAMPLE_TEXT)
    ctx = base_ctx(
        ops_db=ops_db,
        app_dir=app_dir,
        registry_root=registry_root,
        node_id=node_id,
        node_name=node_name,
        node_type="text",
    )

    exit_code, result, stderr = run_enricher("extract_iocs_text", ctx)

    assert exit_code == 0, stderr
    # The name "placeholder" holds no IoCs; had it been used as the text
    # there would be no entities, so entities_added > 0 shows that notes
    # was the text actually read.
    assert result["entities_added"] >= 2, result
|
||||||
|
|
||||||
|
|
||||||
@@ -83,10 +81,9 @@ def test_extract_iocs_text_above_threshold_creates_group(ops_db, app_dir,
|
|||||||
""">=50 IoCs → Group heterogeneo con todos dentro (fase 1)."""
|
""">=50 IoCs → Group heterogeneo con todos dentro (fase 1)."""
|
||||||
text = _ioc_paragraph(180) # ~60 emails + ~60 IPs + ~60 CVEs
|
text = _ioc_paragraph(180) # ~60 emails + ~60 IPs + ~60 CVEs
|
||||||
make_node(ops_db, node_id="t1", name="dump",
|
make_node(ops_db, node_id="t1", name="dump",
|
||||||
type_ref="text", metadata={"text": text})
|
type_ref="text", notes=text)
|
||||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||||
node_id="t1", node_name="dump", node_type="text",
|
node_id="t1", node_name="dump", node_type="text")
|
||||||
metadata={"text": text})
|
|
||||||
rc, out, err = run_enricher("extract_iocs_text", ctx)
|
rc, out, err = run_enricher("extract_iocs_text", ctx)
|
||||||
assert rc == 0, err
|
assert rc == 0, err
|
||||||
assert out["iocs_found"] >= 50, out
|
assert out["iocs_found"] >= 50, out
|
||||||
|
|||||||
@@ -35,10 +35,9 @@ def _build_paragraph(n: int) -> str:
|
|||||||
def test_split_sentences_creates_sentence_nodes(ops_db, app_dir, registry_root):
|
def test_split_sentences_creates_sentence_nodes(ops_db, app_dir, registry_root):
|
||||||
"""Texto con 5 frases distintas → 5 Sentence + 5 SENTENCE_OF."""
|
"""Texto con 5 frases distintas → 5 Sentence + 5 SENTENCE_OF."""
|
||||||
make_node(ops_db, node_id="t1", name="tomate doc",
|
make_node(ops_db, node_id="t1", name="tomate doc",
|
||||||
type_ref="text", metadata={"text": SAMPLE_TEXT})
|
type_ref="text", notes=SAMPLE_TEXT)
|
||||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||||
node_id="t1", node_name="tomate doc", node_type="text",
|
node_id="t1", node_name="tomate doc", node_type="text")
|
||||||
metadata={"text": SAMPLE_TEXT})
|
|
||||||
|
|
||||||
rc, out, err = run_enricher("split_sentences", ctx)
|
rc, out, err = run_enricher("split_sentences", ctx)
|
||||||
assert rc == 0, err
|
assert rc == 0, err
|
||||||
@@ -67,10 +66,9 @@ def test_split_sentences_below_threshold_no_group(ops_db, app_dir,
|
|||||||
"""30 frases → ningun Group (<50)."""
|
"""30 frases → ningun Group (<50)."""
|
||||||
text = _build_paragraph(30)
|
text = _build_paragraph(30)
|
||||||
make_node(ops_db, node_id="t1", name="big doc",
|
make_node(ops_db, node_id="t1", name="big doc",
|
||||||
type_ref="text", metadata={"text": text})
|
type_ref="text", notes=text)
|
||||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||||
node_id="t1", node_name="big doc", node_type="text",
|
node_id="t1", node_name="big doc", node_type="text")
|
||||||
metadata={"text": text})
|
|
||||||
rc, out, err = run_enricher("split_sentences", ctx)
|
rc, out, err = run_enricher("split_sentences", ctx)
|
||||||
assert rc == 0, err
|
assert rc == 0, err
|
||||||
assert out["sentences"] == 30
|
assert out["sentences"] == 30
|
||||||
@@ -89,10 +87,9 @@ def test_split_sentences_above_threshold_creates_group(ops_db, app_dir,
|
|||||||
"""100 frases → 1 Group + 10 sueltos + 90 con group_id."""
|
"""100 frases → 1 Group + 10 sueltos + 90 con group_id."""
|
||||||
text = _build_paragraph(100)
|
text = _build_paragraph(100)
|
||||||
make_node(ops_db, node_id="t1", name="huge doc",
|
make_node(ops_db, node_id="t1", name="huge doc",
|
||||||
type_ref="text", metadata={"text": text})
|
type_ref="text", notes=text)
|
||||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||||
node_id="t1", node_name="huge doc", node_type="text",
|
node_id="t1", node_name="huge doc", node_type="text")
|
||||||
metadata={"text": text})
|
|
||||||
rc, out, err = run_enricher("split_sentences", ctx)
|
rc, out, err = run_enricher("split_sentences", ctx)
|
||||||
assert rc == 0, err
|
assert rc == 0, err
|
||||||
assert out["sentences"] == 100
|
assert out["sentences"] == 100
|
||||||
@@ -122,7 +119,7 @@ def test_split_sentences_above_threshold_creates_group(ops_db, app_dir,
|
|||||||
|
|
||||||
|
|
||||||
def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root):
|
def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root):
|
||||||
"""Nodo sin metadata.text/description/query y name corto → exit 2."""
|
"""Nodo sin notes y con name corto → exit 2."""
|
||||||
make_node(ops_db, node_id="t1", name="x", type_ref="text", metadata={})
|
make_node(ops_db, node_id="t1", name="x", type_ref="text", metadata={})
|
||||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||||
node_id="t1", node_name="x", node_type="text")
|
node_id="t1", node_name="x", node_type="text")
|
||||||
@@ -133,15 +130,13 @@ def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root):
|
|||||||
"min_length" in (out.get("error") or "")
|
"min_length" in (out.get("error") or "")
|
||||||
|
|
||||||
|
|
||||||
def test_split_sentences_uses_metadata_text_priority(ops_db, app_dir,
|
def test_split_sentences_uses_notes_priority(ops_db, app_dir, registry_root):
|
||||||
registry_root):
|
"""`entities.notes` gana sobre node_name aunque ambos tengan texto."""
|
||||||
"""metadata.text gana sobre node_name aunque ambos tengan texto."""
|
|
||||||
make_node(ops_db, node_id="t1", name="placeholder corto",
|
make_node(ops_db, node_id="t1", name="placeholder corto",
|
||||||
type_ref="text", metadata={"text": SAMPLE_TEXT})
|
type_ref="text", notes=SAMPLE_TEXT)
|
||||||
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root,
|
||||||
node_id="t1", node_name="placeholder corto",
|
node_id="t1", node_name="placeholder corto",
|
||||||
node_type="text",
|
node_type="text")
|
||||||
metadata={"text": SAMPLE_TEXT})
|
|
||||||
rc, out, err = run_enricher("split_sentences", ctx)
|
rc, out, err = run_enricher("split_sentences", ctx)
|
||||||
assert rc == 0, err
|
assert rc == 0, err
|
||||||
assert out["sentences"] == 5 # 5 frases del SAMPLE_TEXT, no 1 del name
|
assert out["sentences"] == 5 # 5 frases del SAMPLE_TEXT, no 1 del name
|
||||||
|
|||||||
Reference in New Issue
Block a user