#!/usr/bin/env python3
"""Enricher extract_iocs_text — offline variant of extract_text_entities.

Unlike extract_text_entities, this enricher does NOT depend on cached
markdown (a previous fetch_webpage). It reads the text straight from the node
(priority: `entities.notes` > `node_name`) and runs the registry's
`extract_iocs` pipeline over it.

The `notes` field is what the user writes in the Note panel (double click on
the node). No network, no external dependencies — meant for exercising the
app when DDG blocks with a captcha or when working in an offline environment.

Standard wire protocol (issue 0026): a JSON context object arrives on stdin,
progress/log lines go to stderr, and a single JSON result goes to stdout.

Grouping (provisional, phase 1 of issue 0035):
    If the number of unique IoCs reaches the group threshold (and the
    `entities` table has a `group_id` column), ONE heterogeneous Group entity
    is created regardless of IoC type. The first GROUP_PREVIEW_K IoCs are
    written without a group_id; the remaining ones carry the Group's id.
    Decision 6 of issue 0035 ("multi-type -> one Group per type") is phase 2;
    this is simplified to validate the end-to-end flow. When phase 2 lands,
    this logic will be replaced by one that creates N groups (one per
    type_ref exceeding the threshold).
"""
from __future__ import annotations

import json
import os
import sqlite3
import sys
import time
import uuid
from datetime import datetime, timezone

# IoC type reported by the extractor -> (entity type_ref, metadata key that
# stores the IoC value).
_TYPE_MAP = {
    "email": ("Email", "address"),
    "ip_address": ("IPAddress", "address"),
    "domain": ("Domain", "name"),
    "file_hash": ("FileHash", "value"),
    "crypto_wallet": ("CryptoWallet", "address"),
    "cve_id": ("CVE", "id"),
    "mac_address": ("MACAddress", "address"),
    "phone_number": ("Phone", "number"),
}

DEFAULT_GROUP_THRESHOLD = 50
GROUP_PREVIEW_K = 10
DEFAULT_MAX_ENTITIES = 500


def _coerce_threshold(raw, default: int) -> int:
    """Return *raw* as a positive int, or *default* (issue 0035e).

    Accepts an int, a numeric string or None. Anything that is missing,
    empty, non-numeric, non-finite or not strictly positive yields *default*.
    """
    if raw is None or raw == "":
        return default
    try:
        value = int(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers non-finite floats such as JSON `Infinity`.
        return default
    return value if value > 0 else default


def progress(p: float, stage: str = "") -> None:
    """Write a `PROGRESS:<fraction> <stage>` line to stderr and flush."""
    sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
    sys.stderr.flush()


def log(msg: str) -> None:
    """Write *msg* as one line to stderr and flush."""
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()


def now_iso() -> str:
    """Return the current UTC time as `YYYY-MM-DDTHH:MM:SSZ`."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def now_ms() -> int:
    """Return the current Unix time in whole milliseconds."""
    return int(time.time() * 1000)


def has_group_id_column(conn: sqlite3.Connection) -> bool:
    """Return True if the `entities` table exposes a `group_id` column.

    Any SQLite error while inspecting the schema is treated as "no column"
    (deliberate best effort: grouping is simply skipped in that case).
    """
    try:
        rows = conn.execute("PRAGMA table_info(entities)")
        # Column 1 of each table_info row is the column name.
        return any(row[1] == "group_id" for row in rows)
    except sqlite3.Error:
        return False


def read_text(ops_db_path: str, node_id: str, node_name: str) -> str:
    """Read `entities.notes` for the node, falling back to `node_name`.

    `notes` is the field the user fills in through the Note panel (double
    click on the node); it is the canonical place for long text.

    Args:
        ops_db_path: Path of the SQLite database holding `entities`.
        node_id: Id of the entity whose notes are read.
        node_name: Fallback text used when notes are missing or blank.

    Returns:
        The stripped notes when they contain non-whitespace text, otherwise
        the stripped `node_name` (possibly empty). SQLite errors are treated
        as "no notes".
    """
    notes = ""
    try:
        conn = sqlite3.connect(ops_db_path)
        try:
            row = conn.execute(
                "SELECT notes FROM entities WHERE id=?", (node_id,)
            ).fetchone()
            if row and isinstance(row[0], str):
                notes = row[0]
        finally:
            conn.close()
    except sqlite3.Error:
        notes = ""
    stripped = notes.strip()
    if stripped:
        return stripped
    return (node_name or "").strip()


def _ioc_value(it: dict) -> str:
    """Return the IoC's value, whichever of value/address/name carries it."""
    return it.get("value") or it.get("address") or it.get("name") or ""


def _parse_types(raw) -> list[str] | None:
    """Normalize the `types` param (CSV string or list) to a list or None."""
    if isinstance(raw, (list, tuple)):
        items = [str(t).strip() for t in raw]
        return [t for t in items if t] or None
    if not isinstance(raw, str):
        return None
    csv = raw.strip()
    if not csv:
        return None
    return [t.strip() for t in csv.split(",") if t.strip()]


def _resolve_ops_db_path(ops_db_path: str, app_dir_raw: str) -> str:
    """Return an absolute ops-db path, trying `app_dir` for relative ones."""
    ops_db_path = ops_db_path.replace("\\", "/")
    if not os.path.isabs(ops_db_path):
        if app_dir_raw and os.path.isdir(app_dir_raw):
            cand = os.path.normpath(os.path.join(app_dir_raw, ops_db_path))
            if os.path.exists(cand):
                ops_db_path = cand
    if not os.path.isabs(ops_db_path):
        ops_db_path = os.path.abspath(ops_db_path)
    return ops_db_path


def _load_extract_iocs(registry_root: str):
    """Import and return `extract_iocs`, preferring the vendored copy."""
    # Vendoring: prefer _vendored/ when present (distributed binary);
    # otherwise fall back to registry_root for local dev mode.
    vendored = os.path.join(os.path.dirname(__file__), "_vendored")
    if os.path.isdir(vendored):
        if vendored not in sys.path:
            sys.path.insert(0, vendored)
    elif registry_root:
        py_funcs = os.path.join(registry_root, "python", "functions")
        if py_funcs not in sys.path:
            sys.path.insert(0, py_funcs)
    from cybersecurity.extract_iocs import extract_iocs  # type: ignore

    return extract_iocs


def _dedupe_iocs(iocs, max_entities: int) -> list[dict]:
    """Drop IoCs lacking type/value and duplicates by (type, value), capped."""
    seen = set()
    unique: list[dict] = []
    for it in iocs or ():
        ioc_type = it.get("type")
        value = _ioc_value(it)
        if not ioc_type or not value:
            continue
        key = (ioc_type, value)
        if key in seen:
            continue
        seen.add(key)
        unique.append(it)
        if len(unique) >= max_entities:
            break
    return unique


def _write_iocs(conn: sqlite3.Connection, unique: list[dict], *,
                node_id: str, node_name: str, batch_id: str,
                threshold: int):
    """Insert IoC entities, their relations and the optional Group.

    Does not commit; the caller owns the transaction.

    Returns:
        Tuple `(entities_added, relations_added, new_by_type, group_id)`,
        where `group_id` is None when no Group was created.
    """
    entities_added = 0
    relations_added = 0
    new_by_type: dict[str, int] = {}
    group_id: str | None = None

    has_group_col = has_group_id_column(conn)
    n_total = len(unique)

    if n_total >= threshold and has_group_col:
        # Heterogeneous Group (provisional, see module docstring).
        ts = now_iso()
        group_id = (f"Group_{now_ms()}_"
                    f"{abs(hash(node_id + batch_id)) % 100000}")
        group_name = f"iocs: {node_name or '(text)'} ({n_total})"
        group_meta = {
            "enricher": "extract_iocs_text",
            "count": n_total,
            "batch_id": batch_id,
            "source_node_id": node_id,
        }
        conn.execute(
            "INSERT INTO entities (id, name, type_ref, source, metadata, "
            " created_at, updated_at) "
            "VALUES (?, ?, 'Group', 'enricher:extract_iocs_text', ?, ?, ?)",
            (group_id, group_name,
             json.dumps(group_meta, ensure_ascii=False), ts, ts),
        )
        entities_added += 1
        # EXTRACTED_FROM relation from the Group to the source node.
        rel_id = f"rel_{now_ms()}_g_extracted"
        conn.execute(
            "INSERT INTO relations (id, name, from_entity, to_entity, "
            " created_at, updated_at) "
            "VALUES (?, 'EXTRACTED_FROM', ?, ?, ?, ?)",
            (rel_id, group_id, node_id, ts, ts),
        )
        relations_added += 1
        preview = unique[:GROUP_PREVIEW_K]
        grouped = unique[GROUP_PREVIEW_K:]
    else:
        preview = unique
        grouped = []

    def _insert_one(it: dict, idx: int, *, in_group: bool) -> None:
        nonlocal entities_added, relations_added
        ioc_type = it.get("type")
        value = _ioc_value(it)
        if not value:
            return
        # Unknown IoC types keep their own name as type_ref ("Text" if none).
        type_ref, value_field = _TYPE_MAP.get(
            ioc_type, (ioc_type or "Text", "value"),
        )
        existed = conn.execute(
            "SELECT id FROM entities WHERE type_ref=? AND name=? LIMIT 1",
            (type_ref, value),
        ).fetchone()
        ts = now_iso()
        if existed:
            # An existing entity is reused untouched, so an IoC repeated
            # across runs keeps its first grouping.
            target_id = existed[0]
        else:
            target_id = f"{type_ref}_{now_ms()}_{idx}"
            meta = {value_field: value, "batch_id": batch_id}
            if "start" in it:
                meta["text_offset"] = it["start"]
            meta_json = json.dumps(meta, ensure_ascii=False)
            node_group = group_id if in_group else None
            if has_group_col:
                conn.execute(
                    "INSERT INTO entities (id, name, type_ref, source, "
                    " metadata, group_id, created_at, updated_at) "
                    "VALUES (?, ?, ?, 'enricher:extract_iocs_text', ?, "
                    " ?, ?, ?)",
                    (target_id, value, type_ref, meta_json,
                     node_group, ts, ts),
                )
            else:
                conn.execute(
                    "INSERT INTO entities (id, name, type_ref, source, "
                    " metadata, created_at, updated_at) "
                    "VALUES (?, ?, ?, 'enricher:extract_iocs_text', ?, "
                    " ?, ?)",
                    (target_id, value, type_ref, meta_json, ts, ts),
                )
            entities_added += 1
            new_by_type[type_ref] = new_by_type.get(type_ref, 0) + 1
        # Every IoC keeps its own EXTRACTED_FROM to the original source (not
        # to the Group). The Group is only a visual container.
        rel_exists = conn.execute(
            "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
            "AND name='EXTRACTED_FROM' LIMIT 1",
            (target_id, node_id),
        ).fetchone()
        if not rel_exists:
            conn.execute(
                "INSERT INTO relations (id, name, from_entity, to_entity, "
                " created_at, updated_at) "
                "VALUES (?, 'EXTRACTED_FROM', ?, ?, ?, ?)",
                (f"rel_{now_ms()}_{idx}_extracted", target_id, node_id,
                 ts, ts),
            )
            relations_added += 1

    n_preview = len(preview)
    n_grouped = len(grouped)
    for i, it in enumerate(preview):
        _insert_one(it, i, in_group=False)
    for j, it in enumerate(grouped):
        # idx continues after the preview so generated ids stay unique.
        _insert_one(it, n_preview + j, in_group=True)
        if j % 20 == 0:
            progress(0.55 + 0.40 * (j / max(1, n_grouped)), "writing")

    return entities_added, relations_added, new_by_type, group_id


def main() -> int:
    """Run the enricher: read the JSON context from stdin, write IoCs.

    Returns:
        Process exit code: 0 on success; 2 for invalid input (bad JSON,
        non-object JSON, missing node_id/ops_db_path, node without text);
        5 when `extract_iocs` cannot be imported; 7 when the ops database
        file does not exist.
    """
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except Exception as e:
        log(f"stdin not valid JSON: {e}")
        return 2
    if not isinstance(ctx, dict):
        # Valid JSON that is not an object has no fields to read.
        log(f"stdin JSON must be an object, got {type(ctx).__name__}")
        return 2

    node_id = ctx.get("node_id") or ""
    node_name = (ctx.get("node_name") or "").strip()
    ops_db_path = ctx.get("ops_db_path") or ""
    app_dir_raw = (ctx.get("app_dir") or "").replace("\\", "/")
    registry_root = ctx.get("registry_root") or ""
    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}
    types_list = _parse_types(params.get("types"))
    # Invalid or non-positive values fall back to the default instead of
    # aborting the run.
    max_entities = _coerce_threshold(params.get("max_entities"),
                                     DEFAULT_MAX_ENTITIES)

    if not node_id or not ops_db_path:
        log("missing node_id / ops_db_path")
        return 2

    ops_db_path = _resolve_ops_db_path(ops_db_path, app_dir_raw)
    if not os.path.exists(ops_db_path):
        log(f"ops_db_path no existe: {ops_db_path}")
        print(json.dumps({"error": "ops_db not found",
                          "ops_db_path": ops_db_path,
                          "entities_added": 0, "relations_added": 0}))
        return 7

    progress(0.10, "reading")
    text = read_text(ops_db_path, node_id, node_name)
    if not text:
        msg = ("nodo sin texto. Escribe el contenido en el panel Note "
               "del nodo (doble click para abrir) o pon un name con "
               "contenido")
        log(msg)
        print(json.dumps({"error": msg, "entities_added": 0,
                          "relations_added": 0}))
        return 2

    progress(0.30, "extracting iocs")
    try:
        extract_iocs = _load_extract_iocs(registry_root)
    except Exception as e:
        log(f"no se pudo importar extract_iocs: {e}")
        print(json.dumps({"error": f"extract_iocs import failed: {e}",
                          "entities_added": 0, "relations_added": 0}))
        return 5

    iocs = extract_iocs(text, types_list)
    unique = _dedupe_iocs(iocs, max_entities)

    progress(0.55, "writing")
    batch_id = uuid.uuid4().hex
    # Issue 0035e: honour the manifest override when it comes in ctx.
    threshold = _coerce_threshold(ctx.get("auto_group_threshold"),
                                  DEFAULT_GROUP_THRESHOLD)
    conn = sqlite3.connect(ops_db_path)
    try:
        conn.execute("PRAGMA foreign_keys=OFF")
        entities_added, relations_added, new_by_type, group_id = _write_iocs(
            conn, unique, node_id=node_id, node_name=node_name,
            batch_id=batch_id, threshold=threshold,
        )
        conn.commit()
    finally:
        # Closing without a commit discards any partially written batch.
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "iocs_found": len(unique),
        "by_type": new_by_type,
        "entities_added": entities_added,
        "relations_added": relations_added,
        "batch_id": batch_id,
        "group_id": group_id or "",
        "grouped": bool(group_id),
    }, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    sys.exit(main())