diff --git a/enrichers/extract_iocs_text/run.py b/enrichers/extract_iocs_text/run.py index b5204a1..fed3983 100644 --- a/enrichers/extract_iocs_text/run.py +++ b/enrichers/extract_iocs_text/run.py @@ -2,9 +2,10 @@ """Enricher extract_iocs_text — variante offline de extract_text_entities. A diferencia de extract_text_entities, este enricher NO depende de un -markdown cacheado (fetch_webpage previo). Lee el texto directamente del -nodo (`metadata.text` > `metadata.description` > `metadata.query` > -`node_name`) y aplica el pipeline `extract_iocs` del registry sobre el. +markdown cacheado (fetch_webpage previo). Lee el texto directamente +del nodo (prioridad: `entities.notes` > `node_name`) y aplica el +pipeline `extract_iocs` del registry sobre el. El campo `notes` es lo +que el usuario escribe en el panel Note (doble click sobre el nodo). Sin red, sin dependencias externas — pensado para probar la app cuando DDG bloquea con captcha o cuando se trabaja en un entorno @@ -76,11 +77,27 @@ def has_group_id_column(conn: sqlite3.Connection) -> bool: return False -def read_text(metadata: dict, node_name: str) -> str: - for key in ("text", "description", "query"): - v = metadata.get(key) - if isinstance(v, str) and v.strip(): - return v.strip() +def read_text(ops_db_path: str, node_id: str, node_name: str) -> str: + """Lee `entities.notes` y cae al `node_name` si esta vacio. + + `notes` es el campo donde el usuario escribe via el panel Note + (doble click sobre el nodo). Es el sitio canonico para texto largo. + """ + notes = "" + try: + c = sqlite3.connect(ops_db_path) + try: + row = c.execute( + "SELECT notes FROM entities WHERE id=?", (node_id,) + ).fetchone() + if row and isinstance(row[0], str): + notes = row[0] + finally: + c.close() + except sqlite3.Error: + notes = "" + if notes and notes.strip(): + return notes.strip() return (node_name or "").strip() @@ -130,10 +147,11 @@ def main() -> int: return 7 progress(0.10, "reading") - text = read_text(metadata, node_name) + text = read_text(ops_db_path, node_id, node_name) if not text: - msg = ("nodo sin texto. Esperaba metadata.text / description / " - "query, o un name con contenido") + msg = ("nodo sin texto. Escribe el contenido en el panel Note " + "del nodo (doble click para abrir) o pon un name con " + "contenido") log(msg) print(json.dumps({"error": msg, "entities_added": 0, "relations_added": 0})) diff --git a/enrichers/split_sentences/run.py b/enrichers/split_sentences/run.py index e513e78..e125ec9 100644 --- a/enrichers/split_sentences/run.py +++ b/enrichers/split_sentences/run.py @@ -9,10 +9,10 @@ Wire protocol estandar (issue 0026): - exit code 0 = ok, !=0 = error. Lectura del texto (en orden de prioridad): - 1. metadata.text (campo canonico de un nodo Text) - 2. metadata.description - 3. metadata.query (compatible con nodos creados desde la barra de busqueda) - 4. node_name (fallback minimo) + 1. `entities.notes` (lo que el usuario escribe en el panel Note + via doble click — sitio canonico de texto + largo) + 2. node_name (titulo del nodo, fallback minimo) Si tras esto el texto es < min_length, falla con exit 2 y mensaje claro. @@ -79,12 +79,29 @@ def has_group_id_column(conn: sqlite3.Connection) -> bool: return False -def read_text(metadata: dict, node_name: str) -> str: - """Resuelve el texto a partir del orden de prioridad documentado.""" - for key in ("text", "description", "query"): - v = metadata.get(key) - if isinstance(v, str) and v.strip(): - return v.strip() +def read_text(ops_db_path: str, node_id: str, node_name: str) -> str: + """Resuelve el texto a procesar. + + Prioridad: + 1. `entities.notes` del nodo (lo que el usuario escribe en el panel + Note via doble click). Es el sitio canonico para texto largo. + 2. `node_name` (titulo del nodo) como fallback minimo. + """ + notes = "" + try: + c = sqlite3.connect(ops_db_path) + try: + row = c.execute( + "SELECT notes FROM entities WHERE id=?", (node_id,) + ).fetchone() + if row and isinstance(row[0], str): + notes = row[0] + finally: + c.close() + except sqlite3.Error: + notes = "" + if notes and notes.strip(): + return notes.strip() return (node_name or "").strip() @@ -223,11 +240,11 @@ def main() -> int: return 7 progress(0.10, "reading") - text = read_text(metadata, node_name) + text = read_text(ops_db_path, node_id, node_name) if len(text) < min_length: msg = (f"texto demasiado corto ({len(text)} chars < {min_length}). " - f"Esperaba metadata.text / description / query, o un name " - f"con mas contenido") + f"Escribe el texto en el panel Note del nodo (doble click " + f"para abrir) o pon un name mas largo") log(msg) print(json.dumps({"error": msg, "entities_added": 0, "relations_added": 0})) diff --git a/tests/__pycache__/test_web_search.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_web_search.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index 8fc4bac..0000000 Binary files a/tests/__pycache__/test_web_search.cpython-312-pytest-9.0.2.pyc and /dev/null differ diff --git a/tests/conftest.py b/tests/conftest.py index 5effcf4..fc0c739 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -191,15 +191,22 @@ def registry_root(): def make_node(ops_db: Path, *, node_id: str, name: str, type_ref: str, - metadata: dict | None = None, source: str = "test") -> None: - """Inserta un nodo de tipo arbitrario en operations.db.""" + metadata: dict | None = None, source: str = "test", + notes: str = "") -> None: + """Inserta un nodo de tipo arbitrario en operations.db. + + `notes` se mapea a la columna `entities.notes` — es lo que el + panel Note del Inspector edita en la app real, y los enrichers + `split_sentences` / `extract_iocs_text` lo leen como fuente de + texto canonica. + """ conn = sqlite3.connect(ops_db) conn.execute( "INSERT INTO entities (id, name, type_ref, source, metadata, " - " created_at, updated_at) VALUES (?, ?, ?, ?, ?, " + " notes, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, " " '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')", (node_id, name, type_ref, source, - json.dumps(metadata or {}, ensure_ascii=False)), + json.dumps(metadata or {}, ensure_ascii=False), notes), ) conn.commit() conn.close() diff --git a/tests/test_extract_iocs_text.py b/tests/test_extract_iocs_text.py index e5147ca..93068e7 100644 --- a/tests/test_extract_iocs_text.py +++ b/tests/test_extract_iocs_text.py @@ -32,10 +32,9 @@ def _ioc_paragraph(n: int) -> str: def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root): """Texto con emails, IPs, CVE, hash → entidades creadas con tipos correctos.""" make_node(ops_db, node_id="t1", name="incident", - type_ref="text", metadata={"text": SAMPLE_TEXT}) + type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="incident", node_type="text", - metadata={"text": SAMPLE_TEXT}) + node_id="t1", node_name="incident", node_type="text") rc, out, err = run_enricher("extract_iocs_text", ctx) assert rc == 0, err @@ -53,17 +52,16 @@ def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root): assert all(r["to_entity"] == "t1" for r in rels) -def test_extract_iocs_text_uses_metadata_text(ops_db, app_dir, registry_root): - """metadata.text se prioriza sobre node_name.""" +def test_extract_iocs_text_uses_notes_priority(ops_db, app_dir, registry_root): + """`entities.notes` se prioriza sobre node_name.""" make_node(ops_db, node_id="t1", name="placeholder", - type_ref="text", metadata={"text": SAMPLE_TEXT}) + type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="placeholder", node_type="text", - metadata={"text": SAMPLE_TEXT}) + node_id="t1", node_name="placeholder", node_type="text") rc, out, err = run_enricher("extract_iocs_text", ctx) assert rc == 0, err # El name "placeholder" no contiene IoCs; si se hubiese usado, no - # habria entidades. Ergo entities_added > 0 demuestra que leyo text. + # habria entidades. Ergo entities_added > 0 demuestra que leyo notes. assert out["entities_added"] >= 2, out @@ -83,10 +81,9 @@ def test_extract_iocs_text_above_threshold_creates_group(ops_db, app_dir, """>=50 IoCs → Group heterogeneo con todos dentro (fase 1).""" text = _ioc_paragraph(180) # ~60 emails + ~60 IPs + ~60 CVEs make_node(ops_db, node_id="t1", name="dump", - type_ref="text", metadata={"text": text}) + type_ref="text", notes=text) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="dump", node_type="text", - metadata={"text": text}) + node_id="t1", node_name="dump", node_type="text") rc, out, err = run_enricher("extract_iocs_text", ctx) assert rc == 0, err assert out["iocs_found"] >= 50, out diff --git a/tests/test_split_sentences.py b/tests/test_split_sentences.py index 1870710..baf44ab 100644 --- a/tests/test_split_sentences.py +++ b/tests/test_split_sentences.py @@ -35,10 +35,9 @@ def _build_paragraph(n: int) -> str: def test_split_sentences_creates_sentence_nodes(ops_db, app_dir, registry_root): """Texto con 5 frases distintas → 5 Sentence + 5 SENTENCE_OF.""" make_node(ops_db, node_id="t1", name="tomate doc", - type_ref="text", metadata={"text": SAMPLE_TEXT}) + type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="tomate doc", node_type="text", - metadata={"text": SAMPLE_TEXT}) + node_id="t1", node_name="tomate doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err @@ -67,10 +66,9 @@ def test_split_sentences_below_threshold_no_group(ops_db, app_dir, """30 frases → ningun Group (<50).""" text = _build_paragraph(30) make_node(ops_db, node_id="t1", name="big doc", - type_ref="text", metadata={"text": text}) + type_ref="text", notes=text) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="big doc", node_type="text", - metadata={"text": text}) + node_id="t1", node_name="big doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 30 @@ -89,10 +87,9 @@ def test_split_sentences_above_threshold_creates_group(ops_db, app_dir, """100 frases → 1 Group + 10 sueltos + 90 con group_id.""" text = _build_paragraph(100) make_node(ops_db, node_id="t1", name="huge doc", - type_ref="text", metadata={"text": text}) + type_ref="text", notes=text) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="huge doc", node_type="text", - metadata={"text": text}) + node_id="t1", node_name="huge doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 100 @@ -122,7 +119,7 @@ def test_split_sentences_above_threshold_creates_group(ops_db, app_dir, def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root): - """Nodo sin metadata.text/description/query y name corto → exit 2.""" + """Nodo sin notes y con name corto → exit 2.""" make_node(ops_db, node_id="t1", name="x", type_ref="text", metadata={}) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="x", node_type="text") @@ -133,15 +130,13 @@ def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root): "min_length" in (out.get("error") or "") -def test_split_sentences_uses_metadata_text_priority(ops_db, app_dir, - registry_root): - """metadata.text gana sobre node_name aunque ambos tengan texto.""" +def test_split_sentences_uses_notes_priority(ops_db, app_dir, registry_root): + """`entities.notes` gana sobre node_name aunque ambos tengan texto.""" make_node(ops_db, node_id="t1", name="placeholder corto", - type_ref="text", metadata={"text": SAMPLE_TEXT}) + type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="placeholder corto", - node_type="text", - metadata={"text": SAMPLE_TEXT}) + node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 5 # 5 frases del SAMPLE_TEXT, no 1 del name