graph_explorer/enrichers/extract_text_entities/run.py

#!/usr/bin/env python3
"""Enricher extract_text_entities — issue 0028b.

Lee la markdown cacheada de un Webpage (metadata.markdown_path) y corre el
pipeline puro `extract_iocs` (regex puro, sin coste, sin modelos ML).

Para cada IoC encontrado:
  - Crea o reusa la entidad por (type, name).
  - Crea la relación EXTRACTED_FROM desde la entidad (nueva o reusada) al
    Webpage origen, si aún no existe.

Tipos soportados (mapeo IoC -> type_ref del registry):
  email         -> Email
  ip_address    -> IPAddress
  domain        -> Domain
  file_hash     -> FileHash
  crypto_wallet -> CryptoWallet
  cve_id        -> CVE
  mac_address   -> MACAddress
  phone_number  -> Phone

Futura iteración: añadir GLiNER/GLiREL para Person/Org/Location, etc.
"""
from __future__ import annotations

import json
import os
import sqlite3
import sys
import time
from datetime import datetime, timezone


# Maps each IoC type name to a pair: (entity type_ref stored in the database,
# metadata key under which the IoC value is saved on a newly created entity).
_TYPE_MAP = {
    "email": ("Email", "address"),
    "ip_address": ("IPAddress", "address"),
    "domain": ("Domain", "name"),
    "file_hash": ("FileHash", "value"),
    "crypto_wallet": ("CryptoWallet", "address"),
    "cve_id": ("CVE", "id"),
    "mac_address": ("MACAddress", "address"),
    "phone_number": ("Phone", "number"),
}


def progress(p: float, stage: str = "") -> None:
    """Write a `PROGRESS:<fraction> <stage>` line to stderr and flush it.

    Args:
        p: Progress value, rendered with two decimal places.
        stage: Optional label appended after the value.
    """
    line = f"PROGRESS:{p:.2f} {stage}\n"
    err = sys.stderr
    err.write(line)
    err.flush()


def log(msg: str) -> None:
    """Write `msg` followed by a newline to stderr and flush immediately."""
    err = sys.stderr
    err.write(f"{msg}\n")
    err.flush()


def now_iso() -> str:
    """Return the current UTC time formatted as `YYYY-MM-DDTHH:MM:SSZ`."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")


def now_ms() -> int:
    """Return the current Unix time in milliseconds, truncated to an int."""
    seconds = time.time()
    return int(seconds * 1000)


def _ioc_value(ioc: dict) -> str:
    """Return the first non-empty of an IoC's value/address/name keys, or ""."""
    return ioc.get("value") or ioc.get("address") or ioc.get("name") or ""


def _parse_types(raw) -> list[str] | None:
    """Normalize the `types` param into a list of IoC type names.

    Accepts a comma-separated string or a list/tuple of names. Returns None
    when nothing was given; the result is handed to `extract_iocs` unchanged.
    """
    if isinstance(raw, str):
        csv = raw.strip()
        if not csv:
            return None
        return [name.strip() for name in csv.split(",") if name.strip()]
    if isinstance(raw, (list, tuple)) and raw:
        return [str(name).strip() for name in raw if str(name).strip()]
    return None


def _parse_max_entities(raw, default: int = 200) -> int:
    """Parse the `max_entities` param, falling back to `default` when unusable.

    Values below 1 are clamped to 1: the dedup loop checks the cap only after
    appending, so it has always kept at least one IoC.
    """
    if raw is None:
        return default
    try:
        limit = int(raw)
    except (TypeError, ValueError, OverflowError):
        log(f"invalid max_entities {raw!r}; using {default}")
        return default
    return max(1, limit)


def _dedupe_iocs(iocs, limit: int) -> list[tuple[str, str, dict]]:
    """Drop IoCs without type/value and duplicates by (type, value).

    Stops once `limit` unique IoCs were collected. Returns a list of
    (ioc_type, value, ioc) tuples in first-seen order.
    """
    seen = set()
    unique = []
    for ioc in iocs or ():
        if not isinstance(ioc, dict):
            continue
        ioc_type = ioc.get("type")
        value = _ioc_value(ioc)
        if not ioc_type or not value:
            continue
        key = (ioc_type, value)
        if key in seen:
            continue
        seen.add(key)
        unique.append((ioc_type, value, ioc))
        if len(unique) >= limit:
            break
    return unique


def _write_iocs(conn, node_id: str, unique) -> tuple[int, int, dict[str, int]]:
    """Insert missing entities and EXTRACTED_FROM relations for each IoC.

    Does not commit; the caller owns the transaction. Returns
    (entities_added, relations_added, new entities counted per type_ref).
    """
    entities_added = 0
    relations_added = 0
    new_by_type: dict[str, int] = {}
    total = len(unique)
    for i, (ioc_type, value, ioc) in enumerate(unique):
        # Unknown IoC types are stored under their own name with a "value" key.
        type_ref, value_field = _TYPE_MAP.get(ioc_type, (ioc_type, "value"))

        existing = conn.execute(
            "SELECT id FROM entities WHERE type_ref=? AND name=? LIMIT 1",
            (type_ref, value),
        ).fetchone()
        if existing:
            target_id = existing[0]
        else:
            # The loop index keeps ids distinct within the same millisecond.
            target_id = f"{type_ref}_{now_ms()}_{i}"
            ts = now_iso()
            meta = {value_field: value}
            if "start" in ioc:
                meta["text_offset"] = ioc["start"]
            conn.execute(
                "INSERT INTO entities (id, name, type_ref, source, metadata, created_at, updated_at) "
                "VALUES (?, ?, ?, 'enricher:extract_text_entities', ?, ?, ?)",
                (target_id, value, type_ref, json.dumps(meta), ts, ts),
            )
            entities_added += 1
            new_by_type[type_ref] = new_by_type.get(type_ref, 0) + 1

        rel_exists = conn.execute(
            "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name='EXTRACTED_FROM' LIMIT 1",
            (target_id, node_id),
        ).fetchone()
        if not rel_exists:
            ts = now_iso()
            conn.execute(
                "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
                "VALUES (?, 'EXTRACTED_FROM', ?, ?, ?, ?)",
                (f"rel_{now_ms()}_{i}_extracted", target_id, node_id, ts, ts),
            )
            relations_added += 1

        if i % 20 == 0:
            progress(0.55 + 0.40 * (i / total), "writing")
    return entities_added, relations_added, new_by_type


def main() -> int:
    """Extract IoCs from a node's cached markdown and store them in the ops DB.

    Reads a JSON context object from stdin (`node_id`, `metadata`,
    `ops_db_path`, `app_dir`, `registry_root`, `params`), runs `extract_iocs`
    over the markdown file referenced by `metadata.markdown_path`, and for
    every unique IoC creates or reuses an entity and links it to the node with
    an EXTRACTED_FROM relation. A JSON summary is printed to stdout.

    Returns:
        0 on success; 2 when the context is invalid or lacks `node_id` /
        `ops_db_path`; 3 when the node has no `markdown_path`; 4 when the
        markdown file does not exist.

    Import errors for `extract_iocs` and SQLite errors are not caught here and
    propagate to the caller.
    """
    try:
        ctx = json.loads(sys.stdin.read())
    except ValueError as exc:
        log(f"invalid context JSON on stdin: {exc}")
        return 2
    if not isinstance(ctx, dict):
        log("invalid context on stdin: expected a JSON object")
        return 2

    node_id = ctx.get("node_id") or ""
    metadata = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        # Metadata may arrive as a JSON-encoded string; decode it best-effort.
        try:
            metadata = json.loads(metadata)
        except (ValueError, RecursionError) as exc:
            log(f"ignoring unparsable metadata: {exc}")
            metadata = {}
    if not isinstance(metadata, dict):
        metadata = {}
    ops_db = ctx.get("ops_db_path") or ""
    app_dir = ctx.get("app_dir") or ""
    registry_root = ctx.get("registry_root") or ""
    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}

    types_list = _parse_types(params.get("types"))
    max_entities = _parse_max_entities(params.get("max_entities", 200))

    if not node_id or not ops_db:
        log("missing node_id / ops_db_path")
        return 2

    md_path = metadata.get("markdown_path") or ""
    if not md_path:
        log("nodo sin markdown_path — corre fetch_webpage primero")
        print(json.dumps({"error": "missing markdown_path. Run fetch_webpage first.",
                          "entities_added": 0, "relations_added": 0}))
        return 3

    abs_md = md_path if os.path.isabs(md_path) else os.path.join(app_dir, md_path)
    # isfile (not exists) so a directory at that path is reported, not opened.
    if not os.path.isfile(abs_md):
        log(f"markdown not found at {abs_md}")
        print(json.dumps({"error": f"markdown not found: {abs_md}",
                          "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.10, "reading")
    with open(abs_md, "r", encoding="utf-8", errors="replace") as fh:
        text = fh.read()

    progress(0.30, "extracting iocs")
    # Prefer the bundled _vendored/ directory (issue 0033b) when present;
    # otherwise fall back to registry_root for local development.
    vendored = os.path.join(os.path.dirname(__file__), "_vendored")
    if os.path.isdir(vendored):
        if vendored not in sys.path:
            sys.path.insert(0, vendored)
    elif registry_root:
        py_funcs = os.path.join(registry_root, "python", "functions")
        if py_funcs not in sys.path:
            sys.path.insert(0, py_funcs)
    from cybersecurity.extract_iocs import extract_iocs  # type: ignore

    unique = _dedupe_iocs(extract_iocs(text, types_list), max_entities)

    progress(0.55, "writing")
    conn = sqlite3.connect(ops_db)
    try:
        entities_added, relations_added, new_by_type = _write_iocs(conn, node_id, unique)
        # Single commit at the end: a failure mid-way leaves nothing committed.
        conn.commit()
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "iocs_found":      len(unique),
        "by_type":         new_by_type,
        "entities_added":  entities_added,
        "relations_added": relations_added,
    }))
    return 0


# Script entry point: run the enricher and use its return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())