#!/usr/bin/env python3
"""Enricher extract_text_entities — issue 0028b.

Reads the cached markdown of a Webpage (``metadata.markdown_path``) and runs
the pure ``extract_iocs`` pipeline (regex only, no cost, no ML models).

For each IoC found:
  - Creates or reuses the entity keyed by (type, name).
  - Creates an EXTRACTED_FROM relation from that entity to the source Webpage.

Supported types (IoC -> registry type_ref mapping):
  email          -> Email
  ip_address     -> IPAddress
  domain         -> Domain
  file_hash      -> FileHash
  crypto_wallet  -> CryptoWallet
  cve_id         -> CVE
  mac_address    -> MACAddress
  phone_number   -> Phone

Future iteration: add GLiNER/GLiREL for Person/Org/Location etc.

I/O contract, as implemented below:
  stdin   one JSON object with ``node_id``, ``ops_db_path``, ``metadata``,
          ``app_dir``, ``registry_root`` and ``params`` (``types`` as a
          comma-separated string, ``max_entities`` as an integer).
  stderr  ``PROGRESS:<fraction> <stage>`` lines and free-form log lines.
  stdout  one JSON object summarising the run (or describing the error).

Exit codes:
  0  success
  1  the context on stdin is not a JSON object
  2  ``node_id`` or ``ops_db_path`` is missing
  3  the node has no ``markdown_path``
  4  the markdown file does not exist
"""
from __future__ import annotations

import json
import os
import sqlite3
import sys
import time
from datetime import datetime, timezone
from typing import Any, Iterable

# IoC type emitted by extract_iocs -> (registry type_ref, metadata field that
# stores the extracted value).
_TYPE_MAP = {
    "email": ("Email", "address"),
    "ip_address": ("IPAddress", "address"),
    "domain": ("Domain", "name"),
    "file_hash": ("FileHash", "value"),
    "crypto_wallet": ("CryptoWallet", "address"),
    "cve_id": ("CVE", "id"),
    "mac_address": ("MACAddress", "address"),
    "phone_number": ("Phone", "number"),
}

# Cap applied when params.max_entities is absent, null or not an integer.
_DEFAULT_MAX_ENTITIES = 200


def progress(p: float, stage: str = "") -> None:
    """Write a ``PROGRESS:<p> <stage>`` line to stderr and flush it."""
    sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
    sys.stderr.flush()


def log(msg: str) -> None:
    """Write ``msg`` as a single line to stderr and flush it."""
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()


def now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def now_ms() -> int:
    """Return the current Unix time in whole milliseconds."""
    return int(time.time() * 1000)


def _load_metadata(raw: Any) -> dict:
    """Return node metadata as a dict; anything unusable becomes ``{}``.

    The metadata may arrive already decoded or as a JSON-encoded string.
    """
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return {}
    return raw if isinstance(raw, dict) else {}


def _parse_max_entities(raw: Any) -> int:
    """Return the entity cap as a non-negative int.

    A missing/null value or one that cannot be converted to int falls back
    to ``_DEFAULT_MAX_ENTITIES``; negative values are clamped to 0.
    """
    if raw is None:
        return _DEFAULT_MAX_ENTITIES
    try:
        limit = int(raw)
    except (TypeError, ValueError):
        log(f"invalid max_entities {raw!r}; using {_DEFAULT_MAX_ENTITIES}")
        return _DEFAULT_MAX_ENTITIES
    return max(limit, 0)


def _ioc_value(ioc: dict) -> str:
    """Return the extracted text of an IoC: first truthy of value/address/name."""
    return ioc.get("value") or ioc.get("address") or ioc.get("name") or ""


def _dedupe_iocs(iocs: Iterable[dict], limit: int) -> list:
    """Return at most ``limit`` IoCs, unique by (type, value), in input order.

    IoCs lacking a type or a value are skipped.
    """
    if limit <= 0:
        return []
    seen = set()
    unique = []
    for ioc in iocs:
        ioc_type = ioc.get("type")
        value = _ioc_value(ioc)
        if not ioc_type or not value:
            continue
        key = (ioc_type, value)
        if key in seen:
            continue
        seen.add(key)
        unique.append(ioc)
        if len(unique) >= limit:
            break
    return unique


def _store_iocs(
    conn: sqlite3.Connection, unique: list, node_id: str
) -> tuple[int, int, dict[str, int]]:
    """Insert entities and EXTRACTED_FROM relations for ``unique`` IoCs.

    An entity is reused when a row with the same (type_ref, name) exists, and
    a relation is only added when the (entity, node) pair has none yet.
    Does not commit; the caller owns the transaction.

    Returns ``(entities_added, relations_added, new_entities_by_type_ref)``.
    """
    entities_added = 0
    relations_added = 0
    new_by_type: dict[str, int] = {}
    total = len(unique)
    for i, ioc in enumerate(unique):
        ioc_type = ioc.get("type")
        value = _ioc_value(ioc)
        # Unmapped IoC types are stored under their own name.
        type_ref, value_field = _TYPE_MAP.get(ioc_type, (ioc_type or "Text", "value"))

        row = conn.execute(
            "SELECT id FROM entities WHERE type_ref=? AND name=? LIMIT 1",
            (type_ref, value),
        ).fetchone()
        if row is not None:
            target_id = row[0]
        else:
            # The loop index keeps ids distinct within a single millisecond.
            target_id = f"{type_ref}_{now_ms()}_{i}"
            ts = now_iso()
            meta = {value_field: value}
            if "start" in ioc:
                meta["text_offset"] = ioc["start"]
            conn.execute(
                "INSERT INTO entities (id, name, type_ref, source, metadata, created_at, updated_at) "
                "VALUES (?, ?, ?, 'enricher:extract_text_entities', ?, ?, ?)",
                (target_id, value, type_ref, json.dumps(meta), ts, ts),
            )
            entities_added += 1
            new_by_type[type_ref] = new_by_type.get(type_ref, 0) + 1

        rel_exists = conn.execute(
            "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name='EXTRACTED_FROM' LIMIT 1",
            (target_id, node_id),
        ).fetchone()
        if not rel_exists:
            ts = now_iso()
            conn.execute(
                "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
                "VALUES (?, 'EXTRACTED_FROM', ?, ?, ?, ?)",
                (f"rel_{now_ms()}_{i}_extracted", target_id, node_id, ts, ts),
            )
            relations_added += 1

        # Report every 20 items; the write phase spans 0.55 -> 0.95.
        if i % 20 == 0:
            progress(0.55 + 0.40 * (i / total), "writing")
    return entities_added, relations_added, new_by_type


def main() -> int:
    """Run the enricher once and return the process exit code.

    Reads the context from stdin, extracts IoCs from the node's cached
    markdown, stores them in the ops database and prints a JSON summary to
    stdout. See the module docstring for the exit codes.
    """
    try:
        ctx = json.loads(sys.stdin.read())
    except ValueError as exc:
        log(f"invalid context JSON on stdin: {exc}")
        return 1
    if not isinstance(ctx, dict):
        log("invalid context: expected a JSON object on stdin")
        return 1

    node_id = ctx.get("node_id") or ""
    metadata = _load_metadata(ctx.get("metadata"))
    ops_db = ctx.get("ops_db_path") or ""
    app_dir = ctx.get("app_dir") or ""
    registry_root = ctx.get("registry_root") or ""
    params = ctx.get("params")
    if not isinstance(params, dict):
        params = {}

    types_csv = (params.get("types") or "").strip()
    # None tells extract_iocs that no type filter was requested.
    types_list = [t.strip() for t in types_csv.split(",") if t.strip()] if types_csv else None
    max_entities = _parse_max_entities(params.get("max_entities"))

    if not node_id or not ops_db:
        log("missing node_id / ops_db_path")
        return 2

    md_path = metadata.get("markdown_path") or ""
    if not md_path:
        log("nodo sin markdown_path — corre fetch_webpage primero")
        print(json.dumps({
            "error": "missing markdown_path. Run fetch_webpage first.",
            "entities_added": 0,
            "relations_added": 0,
        }))
        return 3

    # Relative paths are resolved against the application directory.
    abs_md = md_path if os.path.isabs(md_path) else os.path.join(app_dir, md_path)
    if not os.path.exists(abs_md):
        log(f"markdown not found at {abs_md}")
        print(json.dumps({
            "error": f"markdown not found: {abs_md}",
            "entities_added": 0,
            "relations_added": 0,
        }))
        return 4

    progress(0.10, "reading")
    with open(abs_md, "r", encoding="utf-8", errors="replace") as fh:
        text = fh.read()

    progress(0.30, "extracting iocs")
    # extract_iocs lives in the registry, which is only known at runtime, so
    # it is imported after the registry directory has been put on sys.path.
    py_funcs = os.path.join(registry_root, "python", "functions")
    if py_funcs not in sys.path:
        sys.path.insert(0, py_funcs)
    from cybersecurity.extract_iocs import extract_iocs  # type: ignore

    unique = _dedupe_iocs(extract_iocs(text, types_list), max_entities)

    progress(0.55, "writing")
    conn = sqlite3.connect(ops_db)
    try:
        entities_added, relations_added, new_by_type = _store_iocs(conn, unique, node_id)
        conn.commit()
    finally:
        # If the loop raised, the connection is closed without a commit.
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "iocs_found": len(unique),
        "by_type": new_by_type,
        "entities_added": entities_added,
        "relations_added": relations_added,
    }))
    return 0


if __name__ == "__main__":
    sys.exit(main())