52495af779
Manifest YAML puede declarar 'auto_group_threshold: <int>' a nivel top-level. enrichers.cpp lo parsea y lo guarda en EnricherSpec. jobs.cpp lo inyecta como campo opcional 'auto_group_threshold' en el JSON stdin del subprocess. Los enrichers Python que crean Groups (web_search, split_words, split_sentences, extract_iocs_text) leen el campo y, si viene > 0, lo usan en lugar de su DEFAULT_GROUP_THRESHOLD. Helper _coerce_threshold tolera int / str / None / 0 cayendo al default.
352 lines
12 KiB
Python
352 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""Enricher extract_iocs_text — variante offline de extract_text_entities.
|
|
|
|
A diferencia de extract_text_entities, este enricher NO depende de un
|
|
markdown cacheado (fetch_webpage previo). Lee el texto directamente
|
|
del nodo (prioridad: `entities.notes` > `node_name`) y aplica el
|
|
pipeline `extract_iocs` del registry sobre el. El campo `notes` es lo
|
|
que el usuario escribe en el panel Note (doble click sobre el nodo).
|
|
|
|
Sin red, sin dependencias externas — pensado para probar la app
|
|
cuando DDG bloquea con captcha o cuando se trabaja en un entorno
|
|
offline.
|
|
|
|
Wire protocol estandar (issue 0026).
|
|
|
|
Grouping (provisional, fase 1 del issue 0035):
|
|
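Si len(unique_iocs) >= umbral (DEFAULT_GROUP_THRESHOLD, o el override
`auto_group_threshold` del manifest), se crea UN solo Group heterogeneo:
los primeros GROUP_PREVIEW_K IoCs quedan fuera como preview y el resto
va dentro del Group (independientemente de su tipo).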
|
|
La decision 6 del issue 0035 ("multi-tipo → un Group por tipo") es
|
|
fase 2 — aqui simplificamos para validar el flow end-to-end.
|
|
Cuando llegue la fase 2, esta logica se sustituira por una que cree
|
|
N grupos (uno por type_ref que exceda el umbral).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
import time
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
# Maps the IoC kind found in each extracted item's "type" field to a pair:
# (entity type_ref written to the DB, metadata key that stores the raw value).
_TYPE_MAP = dict(
    email=("Email", "address"),
    ip_address=("IPAddress", "address"),
    domain=("Domain", "name"),
    file_hash=("FileHash", "value"),
    crypto_wallet=("CryptoWallet", "address"),
    cve_id=("CVE", "id"),
    mac_address=("MACAddress", "address"),
    phone_number=("Phone", "number"),
)

# Grouping threshold used when the context carries no usable
# `auto_group_threshold` override.
DEFAULT_GROUP_THRESHOLD = 50

# Number of leading IoCs that are inserted outside the Group when grouping
# is applied.
GROUP_PREVIEW_K = 10
|
|
|
|
|
|
def _coerce_threshold(raw, default: int) -> int:
    """Normalize an optional threshold override into a positive int (issue 0035e).

    Args:
        raw: Candidate value; may be an int, a numeric string, a float,
            ``None`` or an empty string.
        default: Value returned whenever ``raw`` cannot be used.

    Returns:
        ``int(raw)`` when the conversion succeeds and the result is strictly
        positive; ``default`` in every other case.
    """
    # bool is a subclass of int: without this guard a `true` would silently
    # turn into a threshold of 1.
    if raw is None or raw == "" or isinstance(raw, bool):
        return default
    try:
        value = int(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")); json.loads accepts the
        # non-standard `Infinity` literal, so it can reach this point.
        return default
    return value if value > 0 else default
|
|
|
|
|
|
def progress(p: float, stage: str = "") -> None:
    """Report progress on stderr as ``PROGRESS:<p with 2 decimals> <stage>``.

    The line is flushed immediately so it is not held back by buffering.
    """
    print(f"PROGRESS:{p:.2f} {stage}", file=sys.stderr, flush=True)
|
|
|
|
|
|
def log(msg: str) -> None:
    """Write ``msg`` followed by a newline to stderr and flush right away."""
    print(f"{msg}", file=sys.stderr, flush=True)
|
|
|
|
|
|
def now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    utc_now = datetime.now(tz=timezone.utc)
    return f"{utc_now:%Y-%m-%dT%H:%M:%SZ}"
|
|
|
|
|
|
def now_ms() -> int:
    """Return the current Unix time in whole milliseconds (truncated)."""
    seconds = time.time()
    return int(seconds * 1000)
|
|
|
|
|
|
def has_group_id_column(conn: sqlite3.Connection) -> bool:
    """Tell whether the ``entities`` table has a column named ``group_id``.

    Any ``sqlite3.Error`` raised while inspecting the schema is swallowed and
    reported as ``False``.
    """
    try:
        columns = conn.execute("PRAGMA table_info(entities)")
        # Field 1 of every table_info row is the column name.
        return any(info[1] == "group_id" for info in columns)
    except sqlite3.Error:
        return False
|
|
|
|
|
|
def read_text(ops_db_path: str, node_id: str, node_name: str) -> str:
    """Return the text to analyse for a node.

    The ``notes`` column of the node's row in ``entities`` wins when it holds
    a non-blank string; otherwise the stripped ``node_name`` is returned.
    SQLite errors while reading are treated as "no notes".
    """
    stored = None
    try:
        db = sqlite3.connect(ops_db_path)
        try:
            record = db.execute(
                "SELECT notes FROM entities WHERE id=?", (node_id,)
            ).fetchone()
        finally:
            db.close()
        if record and isinstance(record[0], str):
            stored = record[0]
    except sqlite3.Error:
        stored = None

    body = stored.strip() if stored else ""
    if body:
        return body
    return (node_name or "").strip()
|
|
|
|
|
|
def main() -> int:
    """Run the enricher: read the JSON context from stdin and persist IoCs.

    The context object is read from stdin. Keys used: ``node_id``,
    ``node_name``, ``ops_db_path``, ``app_dir``, ``registry_root``,
    ``params`` (``types``, ``max_entities``) and the optional
    ``auto_group_threshold``. The node text is obtained with ``read_text``,
    IoCs are extracted with ``extract_iocs``, de-duplicated by (type, value)
    and written to the ops SQLite database together with ``EXTRACTED_FROM``
    relations to the source node. A JSON summary (or a JSON error object) is
    printed to stdout; progress and log lines go to stderr.

    Grouping: a single ``Group`` entity is created only when the ``entities``
    table has a ``group_id`` column, the number of unique IoCs reaches the
    threshold, and there are more IoCs than ``GROUP_PREVIEW_K``. The first
    ``GROUP_PREVIEW_K`` IoCs stay outside the Group; the rest are inserted
    with ``group_id`` set.

    Returns:
        Process exit code: 0 on success; 2 when stdin is not a JSON object,
        when ``node_id`` / ``ops_db_path`` are missing or when the node has no
        text; 7 when the ops database file does not exist; 5 when
        ``extract_iocs`` cannot be imported; 1 when SQLite fails during the
        write phase (nothing is committed in that case).
    """
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except Exception as e:
        log(f"stdin not valid JSON: {e}")
        return 2
    if not isinstance(ctx, dict):
        # Valid JSON that is not an object has no fields to read from.
        log("stdin JSON is not an object")
        return 2

    node_id = ctx.get("node_id") or ""
    node_name = (ctx.get("node_name") or "").strip()
    ops_db_path = ctx.get("ops_db_path") or ""
    app_dir_raw = (ctx.get("app_dir") or "").replace("\\", "/")
    registry_root = ctx.get("registry_root") or ""
    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}

    # `types` is normally a comma-separated string; a JSON list is tolerated
    # by folding it into the same CSV form.
    raw_types = params.get("types") or ""
    if isinstance(raw_types, (list, tuple)):
        raw_types = ",".join(str(t) for t in raw_types)
    types_csv = str(raw_types).strip()
    types_list = ([t.strip() for t in types_csv.split(",") if t.strip()]
                  if types_csv else None)
    try:
        max_entities = int(params.get("max_entities", 500))
    except (TypeError, ValueError, OverflowError):
        # null / non-numeric values fall back to the documented default.
        max_entities = 500

    if not node_id or not ops_db_path:
        log("missing node_id / ops_db_path")
        return 2

    # Resolve the DB path: try it relative to app_dir first, then relative to
    # the current working directory.
    ops_db_path = ops_db_path.replace("\\", "/")
    if not os.path.isabs(ops_db_path):
        if app_dir_raw and os.path.isdir(app_dir_raw):
            cand = os.path.normpath(os.path.join(app_dir_raw, ops_db_path))
            if os.path.exists(cand):
                ops_db_path = cand
        if not os.path.isabs(ops_db_path):
            ops_db_path = os.path.abspath(ops_db_path)
    if not os.path.exists(ops_db_path):
        log(f"ops_db_path no existe: {ops_db_path}")
        print(json.dumps({"error": "ops_db not found",
                          "ops_db_path": ops_db_path,
                          "entities_added": 0, "relations_added": 0}))
        return 7

    progress(0.10, "reading")
    text = read_text(ops_db_path, node_id, node_name)
    if not text:
        msg = ("nodo sin texto. Escribe el contenido en el panel Note "
               "del nodo (doble click para abrir) o pon un name con "
               "contenido")
        log(msg)
        print(json.dumps({"error": msg, "entities_added": 0,
                          "relations_added": 0}))
        return 2

    progress(0.30, "extracting iocs")
    # Vendoring: prefer _vendored/ next to this script when it exists;
    # otherwise fall back to <registry_root>/python/functions.
    vendored = os.path.join(os.path.dirname(__file__), "_vendored")
    if os.path.isdir(vendored):
        if vendored not in sys.path:
            sys.path.insert(0, vendored)
    elif registry_root:
        py_funcs = os.path.join(registry_root, "python", "functions")
        if py_funcs not in sys.path:
            sys.path.insert(0, py_funcs)
    try:
        from cybersecurity.extract_iocs import extract_iocs  # type: ignore
    except Exception as e:
        log(f"no se pudo importar extract_iocs: {e}")
        print(json.dumps({"error": f"extract_iocs import failed: {e}",
                          "entities_added": 0, "relations_added": 0}))
        return 5

    iocs = extract_iocs(text, types_list)

    # De-duplicate by (type, value), keeping first occurrences, and stop once
    # max_entities unique items have been collected.
    seen = set()
    unique: list[dict] = []
    for it in iocs:
        t = it.get("type")
        v = it.get("value") or it.get("address") or it.get("name") or ""
        if not t or not v:
            continue
        key = (t, v)
        if key in seen:
            continue
        seen.add(key)
        unique.append(it)
        if len(unique) >= max_entities:
            break

    progress(0.55, "writing")
    entities_added = 0
    relations_added = 0
    new_by_type: dict[str, int] = {}
    group_id: str | None = None
    batch_id = uuid.uuid4().hex
    conn = None

    try:
        conn = sqlite3.connect(ops_db_path)
        conn.execute("PRAGMA foreign_keys=OFF")
        has_group_col = has_group_id_column(conn)
        n_total = len(unique)
        # Issue 0035e: honour the manifest override when present in ctx.
        threshold = _coerce_threshold(ctx.get("auto_group_threshold"),
                                      DEFAULT_GROUP_THRESHOLD)

        # The first GROUP_PREVIEW_K IoCs always stay outside the Group, so a
        # Group is only worth creating when something is left to put in it.
        # Without the last check a low threshold override produced an empty
        # Group.
        use_group = (has_group_col
                     and n_total >= threshold
                     and n_total > GROUP_PREVIEW_K)

        if use_group:
            # Heterogeneous Group: one container for all IoC types.
            ts = now_iso()
            group_id = (f"Group_{now_ms()}_"
                        f"{abs(hash(node_id + batch_id)) % 100000}")
            group_name = f"iocs: {node_name or '(text)'} ({n_total})"
            group_meta = {
                "enricher": "extract_iocs_text",
                "count": n_total,
                "batch_id": batch_id,
                "source_node_id": node_id,
            }
            conn.execute(
                "INSERT INTO entities (id, name, type_ref, source, metadata, "
                " created_at, updated_at) "
                "VALUES (?, ?, 'Group', 'enricher:extract_iocs_text', ?, ?, ?)",
                (group_id, group_name,
                 json.dumps(group_meta, ensure_ascii=False), ts, ts),
            )
            entities_added += 1
            # EXTRACTED_FROM relation from the Group to the source node.
            rel_id = f"rel_{now_ms()}_g_extracted"
            conn.execute(
                "INSERT INTO relations (id, name, from_entity, to_entity, "
                " created_at, updated_at) "
                "VALUES (?, 'EXTRACTED_FROM', ?, ?, ?, ?)",
                (rel_id, group_id, node_id, ts, ts),
            )
            relations_added += 1
            preview = unique[:GROUP_PREVIEW_K]
            grouped = unique[GROUP_PREVIEW_K:]
        else:
            preview = unique
            grouped = []

        def _insert_one(it: dict, idx: int, *, in_group: bool) -> None:
            """Insert one IoC entity (unless it already exists) and link it."""
            nonlocal entities_added, relations_added
            ioc_type = it.get("type")
            value = (it.get("value") or it.get("address")
                     or it.get("name") or "")
            if not value:
                return
            # Unknown IoC kinds keep their own name as type_ref ("Text" when
            # the kind is empty) and store the value under "value".
            type_ref, value_field = _TYPE_MAP.get(
                ioc_type, (ioc_type or "Text", "value"),
            )
            existed = conn.execute(
                "SELECT id FROM entities WHERE type_ref=? AND name=? LIMIT 1",
                (type_ref, value),
            ).fetchone()
            ts = now_iso()
            if existed:
                # An existing entity is reused untouched, so an IoC repeated
                # across runs keeps whatever group_id it already has.
                target_id = existed[0]
            else:
                target_id = f"{type_ref}_{now_ms()}_{idx}"
                meta = {value_field: value, "batch_id": batch_id}
                if "start" in it:
                    meta["text_offset"] = it["start"]
                node_group = group_id if in_group else None
                if has_group_col:
                    conn.execute(
                        "INSERT INTO entities (id, name, type_ref, source, "
                        " metadata, group_id, created_at, updated_at) "
                        "VALUES (?, ?, ?, 'enricher:extract_iocs_text', ?, "
                        " ?, ?, ?)",
                        (target_id, value, type_ref,
                         json.dumps(meta, ensure_ascii=False),
                         node_group, ts, ts),
                    )
                else:
                    conn.execute(
                        "INSERT INTO entities (id, name, type_ref, source, "
                        " metadata, created_at, updated_at) "
                        "VALUES (?, ?, ?, 'enricher:extract_iocs_text', ?, "
                        " ?, ?)",
                        (target_id, value, type_ref,
                         json.dumps(meta, ensure_ascii=False), ts, ts),
                    )
                entities_added += 1
                new_by_type[type_ref] = new_by_type.get(type_ref, 0) + 1

            # Every IoC keeps its own EXTRACTED_FROM relation to the source
            # node (not to the Group); it is only added when missing.
            rel_exists = conn.execute(
                "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
                "AND name='EXTRACTED_FROM' LIMIT 1",
                (target_id, node_id),
            ).fetchone()
            if not rel_exists:
                conn.execute(
                    "INSERT INTO relations (id, name, from_entity, to_entity, "
                    " created_at, updated_at) "
                    "VALUES (?, 'EXTRACTED_FROM', ?, ?, ?, ?)",
                    (f"rel_{now_ms()}_{idx}_extracted",
                     target_id, node_id, ts, ts),
                )
                relations_added += 1

        n_preview = len(preview)
        n_grouped = len(grouped)
        for i, it in enumerate(preview):
            _insert_one(it, i, in_group=False)
        for j, it in enumerate(grouped):
            # idx continues after the preview so generated ids stay unique
            # within this run.
            _insert_one(it, n_preview + j, in_group=True)
            if n_grouped > 0 and j % 20 == 0:
                progress(0.55 + 0.40 * (j / max(1, n_grouped)), "writing")
        conn.commit()
    except sqlite3.Error as e:
        # Same JSON error shape as the other failure paths. The connection is
        # closed without commit, so nothing from this run is persisted.
        # Exit code 1 matches what an uncaught exception produced before.
        log(f"sqlite error while writing: {e}")
        print(json.dumps({"error": f"sqlite write failed: {e}",
                          "entities_added": 0, "relations_added": 0}))
        return 1
    finally:
        if conn is not None:
            conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "iocs_found": len(unique),
        "by_type": new_by_type,
        "entities_added": entities_added,
        "relations_added": relations_added,
        "batch_id": batch_id,
        "group_id": group_id or "",
        "grouped": bool(group_id),
    }, ensure_ascii=False))
    return 0
|
|
|
|
|
|
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|