Files
graph_explorer/enrichers/extract_text_entities/run.py
T
egutierrez ee0d26ce2d feat(enrichers): vendoring de funciones Python por enricher (issue 0033b)
Cada enricher con `lang: python` y `uses_functions` no vacío ahora
puede empaquetar las funciones del registry que necesita en
`<enricher>/_vendored/`. El run.py importa de ahí en lugar de
`<registry_root>/python/functions/`, lo que hace al binario
distribuible sin dependencia de un fn_registry montado.

Cambios:

1. tools/vendor_enricher_python.sh
   - Lee `uses_functions` del manifest (filtrando IDs `*_py_*`).
   - Resuelve `file_path` desde registry.db.
   - Copia recursivamente con expansión transitiva: si un fichero
     vendorizado importa siblings del mismo dominio, los siblings
     también se copian (resuelve el caso `extract_iocs.py` que
     importa 7 módulos hermanos).
   - Genera `.vendor.lock` con `<id>  <sha256>  <src_path>` por
     función declarada para auditoría.
   - Idempotente — si todos los hashes coinciden, no rehace nada.

2. Manifests actualizados con `uses_functions`:
   - fetch_webpage:        normalize_url + html_to_markdown
   - extract_links:        extract_urls
   - extract_text_entities: extract_iocs

3. run.py de los 3 enrichers afectados: importan de `_vendored/`
   si existe, fallback a `<registry_root>/python/functions/` en
   modo dev (mantiene los tests pytest funcionando).

4. app.md: añade `cryptography` a python_runtime_deps porque el
   blob `cybersecurity.cybersecurity` lo importa al top.

5. Tests:
   - test_vendor_script.py — 6 tests del script: layout correcto,
     transitive siblings, lock con SHA256, idempotencia, módulos
     importables en aislamiento.
   - 16 tests de enrichers existentes pasan via vendoring (no usan
     registry_root porque _vendored/ tiene prioridad).

6. Issue 0033b movido a issues/completed/.

Tests: 32/32 verde (16 enrichers + 6 dispatcher + 4 runtime + 6
vendor).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:20:41 +02:00

195 lines
6.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""Enricher extract_text_entities — issue 0028b.

Reads the cached markdown of a Webpage (metadata.markdown_path) and runs the
pure `extract_iocs` pipeline (regex only, no cost, no ML models).

For each IoC found:
  - Creates or reuses the entity keyed by (type, name).
  - Creates an EXTRACTED_FROM relation from that entity to the source Webpage.

Supported types (IoC -> registry type_ref mapping):
  email         -> Email
  ip_address    -> IPAddress
  domain        -> Domain
  file_hash     -> FileHash
  crypto_wallet -> CryptoWallet
  cve_id        -> CVE
  mac_address   -> MACAddress
  phone_number  -> Phone

Future iteration: add GLiNER/GLiREL for Person/Org/Location, etc.
"""
from __future__ import annotations
import json
import os
import sqlite3
import sys
import time
from datetime import datetime, timezone
# IoC type name -> (entity type_ref, metadata key under which the value is stored).
# main() falls back to (ioc_type, "value") for IoC types not listed here.
_TYPE_MAP = dict(
    email=("Email", "address"),
    ip_address=("IPAddress", "address"),
    domain=("Domain", "name"),
    file_hash=("FileHash", "value"),
    crypto_wallet=("CryptoWallet", "address"),
    cve_id=("CVE", "id"),
    mac_address=("MACAddress", "address"),
    phone_number=("Phone", "number"),
)
def progress(p: float, stage: str = "") -> None:
    """Report progress on stderr.

    Emits a single line ``PROGRESS:<p formatted with two decimals> <stage>``
    and flushes the stream right away.
    """
    print(f"PROGRESS:{p:.2f} {stage}", file=sys.stderr, flush=True)
def log(msg: str) -> None:
    """Write *msg* followed by a newline to stderr and flush the stream."""
    print(msg, file=sys.stderr, flush=True)
def now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
def now_ms() -> int:
    """Return the current Unix time in milliseconds, truncated to an int."""
    seconds = time.time()
    return int(seconds * 1000)
# Cap applied when the "max_entities" param is missing or not an integer.
_DEFAULT_MAX_ENTITIES = 200


def _ioc_value(ioc: dict) -> str:
    """Return the text carried by an IoC record, or "" when it has none.

    The value is looked up under "value", then "address", then "name".
    """
    return ioc.get("value") or ioc.get("address") or ioc.get("name") or ""


def _parse_types(raw):
    """Turn the ``types`` param into a list of IoC type names, or None.

    A comma-separated string is split and each piece stripped; a list or
    tuple is accepted as well and treated the same way. A blank or missing
    value yields None.
    """
    if isinstance(raw, (list, tuple)):
        raw = ",".join(str(item) for item in raw)
    types_csv = (raw or "").strip()
    if not types_csv:
        return None
    return [t.strip() for t in types_csv.split(",") if t.strip()]


def _load_extract_iocs(registry_root: str):
    """Import and return ``cybersecurity.extract_iocs.extract_iocs``.

    Prefers the ``_vendored/`` directory next to this file (issue 0033b) when
    it exists; otherwise falls back to ``<registry_root>/python/functions``
    for local dev mode. The chosen directory is prepended to ``sys.path``.

    Raises:
        ImportError: if the module cannot be found on ``sys.path``.
    """
    vendored = os.path.join(os.path.dirname(__file__), "_vendored")
    if os.path.isdir(vendored):
        search_dir = vendored
    elif registry_root:
        search_dir = os.path.join(registry_root, "python", "functions")
    else:
        search_dir = ""
    if search_dir and search_dir not in sys.path:
        sys.path.insert(0, search_dir)
    from cybersecurity.extract_iocs import extract_iocs  # type: ignore
    return extract_iocs


def _dedupe_iocs(iocs, max_entities: int) -> list:
    """Drop IoCs lacking a type or value and duplicates by (type, value).

    Collection stops as soon as ``max_entities`` unique items were gathered;
    the check runs after each append, so at least one item is kept whenever
    any valid IoC exists.
    """
    seen = set()
    unique = []
    for ioc in iocs:
        ioc_type = ioc.get("type")
        value = _ioc_value(ioc)
        if not ioc_type or not value:
            continue
        key = (ioc_type, value)
        if key in seen:
            continue
        seen.add(key)
        unique.append(ioc)
        if len(unique) >= max_entities:
            break
    return unique


def _persist_iocs(conn: sqlite3.Connection, unique: list, node_id: str):
    """Insert entities and EXTRACTED_FROM relations for each IoC.

    An entity is reused when a row with the same (type_ref, name) already
    exists; otherwise a new one is inserted. A relation from the entity to
    ``node_id`` is inserted unless one already exists. Nothing is committed
    here; the caller owns the transaction.

    Returns:
        ``(entities_added, relations_added, new_by_type)`` where
        ``new_by_type`` counts newly inserted entities per type_ref.
    """
    entities_added = 0
    relations_added = 0
    new_by_type: dict[str, int] = {}
    total = len(unique)
    for i, ioc in enumerate(unique):
        ioc_type = ioc.get("type")
        value = _ioc_value(ioc)
        if not value:
            continue
        type_ref, value_field = _TYPE_MAP.get(ioc_type, (ioc_type or "Text", "value"))
        existed = conn.execute(
            "SELECT id FROM entities WHERE type_ref=? AND name=? LIMIT 1",
            (type_ref, value),
        ).fetchone()
        if existed:
            target_id = existed[0]
        else:
            # The loop index keeps ids distinct within the same millisecond.
            target_id = f"{type_ref}_{now_ms()}_{i}"
            ts = now_iso()
            meta = {value_field: value}
            if "start" in ioc:
                meta["text_offset"] = ioc["start"]
            conn.execute(
                "INSERT INTO entities (id, name, type_ref, source, metadata, created_at, updated_at) "
                "VALUES (?, ?, ?, 'enricher:extract_text_entities', ?, ?, ?)",
                (target_id, value, type_ref, json.dumps(meta), ts, ts),
            )
            entities_added += 1
            new_by_type[type_ref] = new_by_type.get(type_ref, 0) + 1
        rel_exists = conn.execute(
            "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name='EXTRACTED_FROM' LIMIT 1",
            (target_id, node_id),
        ).fetchone()
        if not rel_exists:
            ts = now_iso()
            conn.execute(
                "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
                "VALUES (?, 'EXTRACTED_FROM', ?, ?, ?, ?)",
                (f"rel_{now_ms()}_{i}_extracted", target_id, node_id, ts, ts),
            )
            relations_added += 1
        if i % 20 == 0:
            progress(0.55 + 0.40 * (i / total), "writing")
    return entities_added, relations_added, new_by_type


def main() -> int:
    """Run the enricher: read the JSON context from stdin, extract IoCs from
    the node's cached markdown and store them in the ops database.

    Context keys read: ``node_id``, ``metadata`` (dict or JSON string),
    ``ops_db_path``, ``app_dir``, ``registry_root`` and ``params``
    (``types``, ``max_entities``).

    Returns:
        0 on success (a JSON summary is printed to stdout),
        2 when ``node_id`` or ``ops_db_path`` is missing,
        3 when the node metadata has no ``markdown_path``,
        4 when the markdown file does not exist.
    """
    ctx = json.loads(sys.stdin.read())
    node_id = ctx.get("node_id") or ""
    metadata = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        # Best effort: metadata that cannot be decoded is treated as empty.
        try:
            metadata = json.loads(metadata)
        except (ValueError, RecursionError):
            metadata = {}
    if not isinstance(metadata, dict):
        # e.g. a JSON string holding a list; .get() below needs a mapping.
        metadata = {}
    ops_db = ctx.get("ops_db_path") or ""
    app_dir = ctx.get("app_dir") or ""
    registry_root = ctx.get("registry_root") or ""
    params = ctx.get("params") or {}
    types_list = _parse_types(params.get("types"))
    raw_max = params.get("max_entities", _DEFAULT_MAX_ENTITIES)
    try:
        max_entities = int(raw_max)
    except (TypeError, ValueError):
        log(f"invalid max_entities {raw_max!r}; using {_DEFAULT_MAX_ENTITIES}")
        max_entities = _DEFAULT_MAX_ENTITIES

    if not node_id or not ops_db:
        log("missing node_id / ops_db_path")
        return 2

    md_path = metadata.get("markdown_path") or ""
    if not md_path:
        log("nodo sin markdown_path — corre fetch_webpage primero")
        print(json.dumps({"error": "missing markdown_path. Run fetch_webpage first.",
                          "entities_added": 0, "relations_added": 0}))
        return 3

    abs_md = md_path if os.path.isabs(md_path) else os.path.join(app_dir, md_path)
    # isfile (not exists) so a directory is reported as missing instead of
    # crashing inside open().
    if not os.path.isfile(abs_md):
        log(f"markdown not found at {abs_md}")
        print(json.dumps({"error": f"markdown not found: {abs_md}",
                          "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.10, "reading")
    with open(abs_md, "r", encoding="utf-8", errors="replace") as fh:
        text = fh.read()

    progress(0.30, "extracting iocs")
    extract_iocs = _load_extract_iocs(registry_root)
    iocs = extract_iocs(text, types_list)
    unique = _dedupe_iocs(iocs, max_entities)

    progress(0.55, "writing")
    conn = sqlite3.connect(ops_db)
    try:
        entities_added, relations_added, new_by_type = _persist_iocs(conn, unique, node_id)
        # Single commit at the end: closing without it discards partial work.
        conn.commit()
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "iocs_found": len(unique),
        "by_type": new_by_type,
        "entities_added": entities_added,
        "relations_added": relations_added,
    }))
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())