From 2a5127fcaf421243f3a0e1a52f4294229a6d15d3 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 3 May 2026 15:36:18 +0200 Subject: [PATCH] fix(enrichers): split_sentences y extract_iocs_text leen entities.notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit El campo `notes` es lo que el usuario escribe en el panel Note del Inspector (doble click sobre el nodo) — sitio canonico para texto largo. Antes los enrichers leian metadata.text/description/query como prioridad, dejando notes ignorado y forzando al usuario a inyectar texto via la UI metadata-extra (poco descubrible). Cambios: - Ambos run.py abren la BD y leen `entities.notes` por SQL antes de fallback a node_name. metadata.text/description/query ya no se consultan (KISS — solo notes y name). - conftest.make_node admite kwarg `notes` para inyectar contenido en la columna notes desde tests. - Tests actualizados: SAMPLE_TEXT y los IoC dumps van por `notes=` en lugar de `metadata={"text": ...}`. - Renombrado el test que verificaba prioridad: ahora se llama `*_uses_notes_priority` y verifica notes > name. Tests verdes WSL (44) y Windows (33 + 11 skipped). --- enrichers/extract_iocs_text/run.py | 40 +++++++++++----- enrichers/split_sentences/run.py | 43 ++++++++++++------ ...st_web_search.cpython-312-pytest-9.0.2.pyc | Bin 13629 -> 0 bytes tests/conftest.py | 15 ++++-- tests/test_extract_iocs_text.py | 21 ++++----- tests/test_split_sentences.py | 27 +++++------ 6 files changed, 90 insertions(+), 56 deletions(-) delete mode 100644 tests/__pycache__/test_web_search.cpython-312-pytest-9.0.2.pyc diff --git a/enrichers/extract_iocs_text/run.py b/enrichers/extract_iocs_text/run.py index b5204a1..fed3983 100644 --- a/enrichers/extract_iocs_text/run.py +++ b/enrichers/extract_iocs_text/run.py @@ -2,9 +2,10 @@ """Enricher extract_iocs_text — variante offline de extract_text_entities. A diferencia de extract_text_entities, este enricher NO depende de un -markdown cacheado (fetch_webpage previo). Lee el texto directamente del -nodo (`metadata.text` > `metadata.description` > `metadata.query` > -`node_name`) y aplica el pipeline `extract_iocs` del registry sobre el. +markdown cacheado (fetch_webpage previo). Lee el texto directamente +del nodo (prioridad: `entities.notes` > `node_name`) y aplica el +pipeline `extract_iocs` del registry sobre el. El campo `notes` es lo +que el usuario escribe en el panel Note (doble click sobre el nodo). Sin red, sin dependencias externas — pensado para probar la app cuando DDG bloquea con captcha o cuando se trabaja en un entorno @@ -76,11 +77,27 @@ def has_group_id_column(conn: sqlite3.Connection) -> bool: return False -def read_text(metadata: dict, node_name: str) -> str: - for key in ("text", "description", "query"): - v = metadata.get(key) - if isinstance(v, str) and v.strip(): - return v.strip() +def read_text(ops_db_path: str, node_id: str, node_name: str) -> str: + """Lee `entities.notes` y cae al `node_name` si esta vacio. + + `notes` es el campo donde el usuario escribe via el panel Note + (doble click sobre el nodo). Es el sitio canonico para texto largo. + """ + notes = "" + try: + c = sqlite3.connect(ops_db_path) + try: + row = c.execute( + "SELECT notes FROM entities WHERE id=?", (node_id,) + ).fetchone() + if row and isinstance(row[0], str): + notes = row[0] + finally: + c.close() + except sqlite3.Error: + notes = "" + if notes and notes.strip(): + return notes.strip() return (node_name or "").strip() @@ -130,10 +147,11 @@ def main() -> int: return 7 progress(0.10, "reading") - text = read_text(metadata, node_name) + text = read_text(ops_db_path, node_id, node_name) if not text: - msg = ("nodo sin texto. Esperaba metadata.text / description / " - "query, o un name con contenido") + msg = ("nodo sin texto. Escribe el contenido en el panel Note " + "del nodo (doble click para abrir) o pon un name con " + "contenido") log(msg) print(json.dumps({"error": msg, "entities_added": 0, "relations_added": 0})) diff --git a/enrichers/split_sentences/run.py b/enrichers/split_sentences/run.py index e513e78..e125ec9 100644 --- a/enrichers/split_sentences/run.py +++ b/enrichers/split_sentences/run.py @@ -9,10 +9,10 @@ Wire protocol estandar (issue 0026): - exit code 0 = ok, !=0 = error. Lectura del texto (en orden de prioridad): - 1. metadata.text (campo canonico de un nodo Text) - 2. metadata.description - 3. metadata.query (compatible con nodos creados desde la barra de busqueda) - 4. node_name (fallback minimo) + 1. `entities.notes` (lo que el usuario escribe en el panel Note + via doble click — sitio canonico de texto + largo) + 2. node_name (titulo del nodo, fallback minimo) Si tras esto el texto es < min_length, falla con exit 2 y mensaje claro. @@ -79,12 +79,29 @@ def has_group_id_column(conn: sqlite3.Connection) -> bool: return False -def read_text(metadata: dict, node_name: str) -> str: - """Resuelve el texto a partir del orden de prioridad documentado.""" - for key in ("text", "description", "query"): - v = metadata.get(key) - if isinstance(v, str) and v.strip(): - return v.strip() +def read_text(ops_db_path: str, node_id: str, node_name: str) -> str: + """Resuelve el texto a procesar. + + Prioridad: + 1. `entities.notes` del nodo (lo que el usuario escribe en el panel + Note via doble click). Es el sitio canonico para texto largo. + 2. `node_name` (titulo del nodo) como fallback minimo. + """ + notes = "" + try: + c = sqlite3.connect(ops_db_path) + try: + row = c.execute( + "SELECT notes FROM entities WHERE id=?", (node_id,) + ).fetchone() + if row and isinstance(row[0], str): + notes = row[0] + finally: + c.close() + except sqlite3.Error: + notes = "" + if notes and notes.strip(): + return notes.strip() return (node_name or "").strip() @@ -223,11 +240,11 @@ def main() -> int: return 7 progress(0.10, "reading") - text = read_text(metadata, node_name) + text = read_text(ops_db_path, node_id, node_name) if len(text) < min_length: msg = (f"texto demasiado corto ({len(text)} chars < {min_length}). " - f"Esperaba metadata.text / description / query, o un name " - f"con mas contenido") + f"Escribe el texto en el panel Note del nodo (doble click " + f"para abrir) o pon un name mas largo") log(msg) print(json.dumps({"error": msg, "entities_added": 0, "relations_added": 0})) diff --git a/tests/__pycache__/test_web_search.cpython-312-pytest-9.0.2.pyc b/tests/__pycache__/test_web_search.cpython-312-pytest-9.0.2.pyc deleted file mode 100644 index 8fc4bacb025c9fcb1615b14f3ed1f658eda03348..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13629 zcmeG@TWlQHbu+ueeGR#Mza&#!nUutpEj~o*Ws$OFQIajIbYn?T!p0aZ_YTRG_Cehl zN+LVk(2kQ(h=H<2nye8OW&cP>Mb&=!)1L2)Y2GSWgBEzg{D=iEE@o_p_{*VVsNRfPn&qW|-&#NSm5!oMM5UmlK#4}Jv5&jlik z3PdEHq?q!IdPKy%N$;q{?iDFt9LiGu(ExiQC4;HZXo%qzNjX(HTA8XEtxAPQ!>Q`g z>Qv2WO{#XZHdQxTCkkpE@y*JDK>P?t>q+1}VYGn+0XC8lz$PLCY@YE(D`$JptD3Gw zh?5LxJ6PdI& z0<`p}F?}-XHv?laO;zIhb+am&&~!yj>j^!fYG#;`R88_aW-t}ItSadYQO!y^od!n! zdb3K?r^ghkUYW+OnZffX&z(P~48Q)C85mDo*Qcqf&DIby0W8{dQrEgC^;9x0aq%*U zli(Eu@xh&16tyne&kQDV;!Oy*rCSxe6ogu#TazOQm#k8|2HuJ9w*QvZ1__YhP49$A zLPRE&H$9~4rY9@uew*L$8qzJRwQPkRbjk%oG(0%r#7n}YIw9V!xnw9p|`m=#-kkn^GhTjNX?t&88khOo-t2htzI&5R5J83ZFagj7)opH}C zOFpDY2bu8eQk{*J?zb%Ts-QKx+HWXv*qGtSgt|GK?B}I>i8C$2`%Aqp#~X4a-F(wC z>w~dYx@xqPTD39KVg!F7{H@2S*E$Q2$Tp)gA+q{a`gWVw)zWizS?O+6*=^YOcSRyx z7qfCUL#FOzG|C~ z?6lTs4SAk)7&X?4t9|ShS8KOq->nrFhE*4W5eG=8HU3?j#{aIXkN*e8@9IsNJ!&|s zknCo=^P7>?p0APBv$9sYp5^iD?e>=(w0!jTUPlYp8Fj;?+QQze} zX|z`WNPbtg$p{%uOoKKX&Dwu$v5FgvX3(Gqu}*0flS7ZaI$K@sm$4u9PP>>6j8UK@@@y9Hm9EeCSsro=4QTJjP( zYSda*MayGb6)kp4_T927LSR)iXPe02dxFt8EZDYwR@V2}b79DK9k4g5wZ9=lp^cmd5sMvhJP@i^Z10tCkc_Jd08w!=MpuNt|SoV4jX;YizX= zxUF&O|F@qr1D2Txf7$wLLiF^A>CyX4pPos@bk&q}^}4<+nSKlvDG6dq=~zlN1NzLA z3L&6zGmujC7>VhzM_~eyD*T(sbjLHPM-|ih_`bu-q8U)r@eE0%C+>X=n=|S)1My5+ zk0sI?t$_m5r$HoZS~G>jPa)2;>|v!8Lj+NeSpnUMPnv$Bj>jP0WL6{-sf2zH14YYz z)0dgj6f$P|V^dQKNl>$jsuK|SqB9E3WORx#7Bk5D&ic(!?7ZoliqTj~Gef0-8}!wy zrms?rAT5iJJVQgv;%wcnuBn;cs1_L-iXhq_)l3hKFMCWclq2Y$^=mp&DII!L6FL^t zG?nU%^#HKK^8-=wkuRY|AYvC4XPfNyc!|@dH#4o9J~cg&NULV3Bul#seWfl8L^b~U zK5TyAQP}m)gZ!NbO}`bOyN9u?*^M1Trm!6wRAPjv#H@CLgI0m}0!>CmimHS*q8z=` zH2J7=QrD-nmwI|st@~Qya$-s)iCA}rPV^wvbDk|55TqxOcK1Ayb_VL#V{}5*VF+{( z+wKRjVF)0$*&+Ki*REab9)nP4IuVa0_hr&a5DzY|o)|0y7(zzt;Y)Sj6wOQ}R1zZ) z?yNa?^7vaPUQ^yWdG5m5^U9m2U?BDI9*}>pHHo1=w~z>*SD4~1ECD%3Ls@&anK+Yb z+LS;%T2!L@vCB@s>B~UQy5)bE%l~lP33~zuP5}&4eDES5ZwMlUMGfKYN^pim1A<)m z*&!mDhCdU3E{ekYVM&R0rSH}?h_f%c)-|koK4XAvnz8+C z#!PQ4nT&eb1mU*WPluqL4by-PvuPLs#F>UR_cUB`PaAOIMMT?nARe3~DjA|RQSC5M zUN%t@+avTO@Y7QOFe>kbWghj=^FW%SYRwk?HnLm<@N^`K+ehmGz$AKK5Nj@q9}FzS z*6SMFK=DgW zF%oZvUYVLgZ~LOcQ}x_^uT&Sh9}?QympTgd-CtDsD}8gr4{HRuB^Oxi zEQa>Xo%lRdm)pIx=cBgu?d?m4KfJoW{rSb?xfdT+c;v3R6AuGILu9dXDSk&=sXzM8 z*|}Hez4PbRYwPk-$Nc_<+8c+~qz?G4a*`7cEMO_8iY0VK>R70Ccorsd@!W-lH=P2G zd?IzMH*L#H?eqN$Wd7wfsU3c+oaBT93s}mjVhLT5+82n!lRM#{9Qj0QUzfx4*KZ8v zrLNpr0P{n+vjw?pO~T(Qr#VqXf}?klnWJt99N5~-jbk8}+_z9T$8z5)$UFF8P&zDX%R3jLySvvU{HDr>&5l;Q(+2UvMgoE!e7T)hw~$lG(L3vy>(>Rk!|$jhBe zfr8w-CgE?@D&SO+Q5?UE{2X;dU|F}DJyno9U~?{B0g#tF7O%LsC@XOGC{WHOUA*Gn zo({{p+?1F4bDg=_!nVCjhYH*Jft){@hr0}-!QZM?z^Niq9K9>`10ToTa7F6>&6nGS z>O&eT!hFT{c)e6Ul|Kj;Y`IiEkw^t3(|xXVzMllPNazP|J3%=j-}Fp~3Gs=O`24+< zWs~?-EQud3lf$V83m84s5rxIK>$|VR( ziR1>BAZ&a}>H8+0zK2ArWCfWjo-oK%!;(48-*AAX#YrT$LdxaoB$9FLRw8+(DY|`j zzz&pjGjkgb{fw!R39v+IWQry-Gy$G#M9Yj(6#@kiYe5^go4I_Z48~?$oy;UL$bcae z`ZfZzuPB};p_qK582zTr2ylwQyT*eTj6Le#WXQ6(2*t271%HOS_jE5}FCaj-lI}x* zeg(H}`w$yIZ~(vveF@Q{2#z5@KX|#CxdU_z3*rbiIslYlnTlW>!32Uy1PKI}5L`x( z1h8DUxzA~QltF-IH=RP@nBDXWk`#f4Ku0i*;3|S^2(BZT0ia=6=&Lp_H+J`JHg=~q z@Nw;Y7ZDpxA!u3$NJxnh0)N)9VN;&wyt7Jk;zMxu3a>+@X(7t+#b9PUfHM zK`XZptX$vMTDi@(mD{`~HLpsH1kC*_U@0#sme3Wc8LeE4a{=tn4V6BTn$cpsc4H7N z#=~ea4(1Mnvg4*7T8uz*0xd?cknj;&j8-Wx;1Epe2OvJ7yxut|mDf9PfDE4Cg$Kxp zZwrm%zg;>45&RuB4jefHP4`EQ^IMZ_%%j?6)FyowS#}~&5Og4T63s|o!5U?>BYhPg`~bmw0Dh|u z+Ot^)v80Eh!%5vZ3%OdceL4>DR!)K8Zh`3k>KcE379$RhQRT(;}R>u~I&#ECm~(^cYxV+i3bcuO8lwPoAOqy~<6gATWjHDgMPU9OhO>9$I{fA-xz&H;zQ z>#|iwl@WksanK0D!L(BU%So~xBq+kcaHvs|q^nN3F*}E%iQAX_{Uq+#LR(7Q;RJ~{ zlBRD+l2zFte6O3W?=_@jxYxp6n+THI5bOXD^=2N_IKe=`%eI+IDFlm@sH@LbmlYWX&kvjKRQ?Fem(#EnPTXT zxf2_Fj{P4tJ@kt5aS;OU_3h;X>T@I9)A;eok4JvUaK!k(cYsFh2iE<)k%e0O8Q$L+ zVS%l-`J=gxyxg`X;cr!9G$)EkaMXc$sVxUS1@gjU8+9H7vLdyyPja-y^eYO<#1+L1 zDa!aX{tp&Kq4-^d|1D!>o0vapwkkY7)y>l-kUHw7>NT2xV-P{72=nT!j4cGykJ+GP zVvM3Y!VWf!lU<=2u|^!!=?@CAbVl|pq(Zf_yn331;- f_{g06rLRjI`10FgmDu@Zq*C1dU`IgQ!Cv*h^=)f? diff --git a/tests/conftest.py b/tests/conftest.py index 5effcf4..fc0c739 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -191,15 +191,22 @@ def registry_root(): def make_node(ops_db: Path, *, node_id: str, name: str, type_ref: str, - metadata: dict | None = None, source: str = "test") -> None: - """Inserta un nodo de tipo arbitrario en operations.db.""" + metadata: dict | None = None, source: str = "test", + notes: str = "") -> None: + """Inserta un nodo de tipo arbitrario en operations.db. + + `notes` se mapea a la columna `entities.notes` — es lo que el + panel Note del Inspector edita en la app real, y los enrichers + `split_sentences` / `extract_iocs_text` lo leen como fuente de + texto canonica. + """ conn = sqlite3.connect(ops_db) conn.execute( "INSERT INTO entities (id, name, type_ref, source, metadata, " - " created_at, updated_at) VALUES (?, ?, ?, ?, ?, " + " notes, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, " " '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z')", (node_id, name, type_ref, source, - json.dumps(metadata or {}, ensure_ascii=False)), + json.dumps(metadata or {}, ensure_ascii=False), notes), ) conn.commit() conn.close() diff --git a/tests/test_extract_iocs_text.py b/tests/test_extract_iocs_text.py index e5147ca..93068e7 100644 --- a/tests/test_extract_iocs_text.py +++ b/tests/test_extract_iocs_text.py @@ -32,10 +32,9 @@ def _ioc_paragraph(n: int) -> str: def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root): """Texto con emails, IPs, CVE, hash → entidades creadas con tipos correctos.""" make_node(ops_db, node_id="t1", name="incident", - type_ref="text", metadata={"text": SAMPLE_TEXT}) + type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="incident", node_type="text", - metadata={"text": SAMPLE_TEXT}) + node_id="t1", node_name="incident", node_type="text") rc, out, err = run_enricher("extract_iocs_text", ctx) assert rc == 0, err @@ -53,17 +52,16 @@ def test_extract_iocs_text_finds_email_and_ip(ops_db, app_dir, registry_root): assert all(r["to_entity"] == "t1" for r in rels) -def test_extract_iocs_text_uses_metadata_text(ops_db, app_dir, registry_root): - """metadata.text se prioriza sobre node_name.""" +def test_extract_iocs_text_uses_notes_priority(ops_db, app_dir, registry_root): + """`entities.notes` se prioriza sobre node_name.""" make_node(ops_db, node_id="t1", name="placeholder", - type_ref="text", metadata={"text": SAMPLE_TEXT}) + type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="placeholder", node_type="text", - metadata={"text": SAMPLE_TEXT}) + node_id="t1", node_name="placeholder", node_type="text") rc, out, err = run_enricher("extract_iocs_text", ctx) assert rc == 0, err # El name "placeholder" no contiene IoCs; si se hubiese usado, no - # habria entidades. Ergo entities_added > 0 demuestra que leyo text. + # habria entidades. Ergo entities_added > 0 demuestra que leyo notes. assert out["entities_added"] >= 2, out @@ -83,10 +81,9 @@ def test_extract_iocs_text_above_threshold_creates_group(ops_db, app_dir, """>=50 IoCs → Group heterogeneo con todos dentro (fase 1).""" text = _ioc_paragraph(180) # ~60 emails + ~60 IPs + ~60 CVEs make_node(ops_db, node_id="t1", name="dump", - type_ref="text", metadata={"text": text}) + type_ref="text", notes=text) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="dump", node_type="text", - metadata={"text": text}) + node_id="t1", node_name="dump", node_type="text") rc, out, err = run_enricher("extract_iocs_text", ctx) assert rc == 0, err assert out["iocs_found"] >= 50, out diff --git a/tests/test_split_sentences.py b/tests/test_split_sentences.py index 1870710..baf44ab 100644 --- a/tests/test_split_sentences.py +++ b/tests/test_split_sentences.py @@ -35,10 +35,9 @@ def _build_paragraph(n: int) -> str: def test_split_sentences_creates_sentence_nodes(ops_db, app_dir, registry_root): """Texto con 5 frases distintas → 5 Sentence + 5 SENTENCE_OF.""" make_node(ops_db, node_id="t1", name="tomate doc", - type_ref="text", metadata={"text": SAMPLE_TEXT}) + type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="tomate doc", node_type="text", - metadata={"text": SAMPLE_TEXT}) + node_id="t1", node_name="tomate doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err @@ -67,10 +66,9 @@ def test_split_sentences_below_threshold_no_group(ops_db, app_dir, """30 frases → ningun Group (<50).""" text = _build_paragraph(30) make_node(ops_db, node_id="t1", name="big doc", - type_ref="text", metadata={"text": text}) + type_ref="text", notes=text) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="big doc", node_type="text", - metadata={"text": text}) + node_id="t1", node_name="big doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 30 @@ -89,10 +87,9 @@ def test_split_sentences_above_threshold_creates_group(ops_db, app_dir, """100 frases → 1 Group + 10 sueltos + 90 con group_id.""" text = _build_paragraph(100) make_node(ops_db, node_id="t1", name="huge doc", - type_ref="text", metadata={"text": text}) + type_ref="text", notes=text) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, - node_id="t1", node_name="huge doc", node_type="text", - metadata={"text": text}) + node_id="t1", node_name="huge doc", node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 100 @@ -122,7 +119,7 @@ def test_split_sentences_above_threshold_creates_group(ops_db, app_dir, def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root): - """Nodo sin metadata.text/description/query y name corto → exit 2.""" + """Nodo sin notes y con name corto → exit 2.""" make_node(ops_db, node_id="t1", name="x", type_ref="text", metadata={}) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="x", node_type="text") @@ -133,15 +130,13 @@ def test_split_sentences_no_text_fails(ops_db, app_dir, registry_root): "min_length" in (out.get("error") or "") -def test_split_sentences_uses_metadata_text_priority(ops_db, app_dir, - registry_root): - """metadata.text gana sobre node_name aunque ambos tengan texto.""" +def test_split_sentences_uses_notes_priority(ops_db, app_dir, registry_root): + """`entities.notes` gana sobre node_name aunque ambos tengan texto.""" make_node(ops_db, node_id="t1", name="placeholder corto", - type_ref="text", metadata={"text": SAMPLE_TEXT}) + type_ref="text", notes=SAMPLE_TEXT) ctx = base_ctx(ops_db=ops_db, app_dir=app_dir, registry_root=registry_root, node_id="t1", node_name="placeholder corto", - node_type="text", - metadata={"text": SAMPLE_TEXT}) + node_type="text") rc, out, err = run_enricher("split_sentences", ctx) assert rc == 0, err assert out["sentences"] == 5 # 5 frases del SAMPLE_TEXT, no 1 del name