diff --git a/app.md b/app.md index 0fd230b..b39f9b0 100644 --- a/app.md +++ b/app.md @@ -30,6 +30,7 @@ python_runtime_deps: - requests - certifi - urllib3 + - cryptography --- ## Arquitectura diff --git a/enrichers/extract_links/manifest.yaml b/enrichers/extract_links/manifest.yaml index 625f194..e27e065 100644 --- a/enrichers/extract_links/manifest.yaml +++ b/enrichers/extract_links/manifest.yaml @@ -4,5 +4,7 @@ description: "Lee la markdown cacheada de un Webpage (metadata.markdown_path) y applies_to: [Webpage] emits: [Url] relations: [LINKS_TO] +uses_functions: + - extract_urls_py_cybersecurity params: - { name: max_links, type: int, default: 50 } diff --git a/enrichers/extract_links/run.py b/enrichers/extract_links/run.py index 6519b60..604852f 100755 --- a/enrichers/extract_links/run.py +++ b/enrichers/extract_links/run.py @@ -69,9 +69,16 @@ def main() -> int: text = open(abs_md, "r", encoding="utf-8", errors="replace").read() progress(0.45, "extracting") - py_funcs = os.path.join(registry_root, "python", "functions") - if py_funcs not in sys.path: - sys.path.insert(0, py_funcs) + # Prefiere _vendored/ (issue 0033b) si existe; si no, fallback al + # registry_root para modo dev local. 
+ vendored = os.path.join(os.path.dirname(__file__), "_vendored") + if os.path.isdir(vendored): + if vendored not in sys.path: + sys.path.insert(0, vendored) + elif registry_root: + py_funcs = os.path.join(registry_root, "python", "functions") + if py_funcs not in sys.path: + sys.path.insert(0, py_funcs) from cybersecurity.cybersecurity import extract_urls # type: ignore urls = extract_urls(text) diff --git a/enrichers/extract_text_entities/manifest.yaml b/enrichers/extract_text_entities/manifest.yaml index 8afc75f..e974411 100644 --- a/enrichers/extract_text_entities/manifest.yaml +++ b/enrichers/extract_text_entities/manifest.yaml @@ -4,6 +4,8 @@ description: "Lee la markdown cacheada de un Webpage y extrae IoCs (IPs, emails, applies_to: [Webpage] emits: [Email, IPAddress, Domain, FileHash, CryptoWallet, CVE, MACAddress, Phone] relations: [EXTRACTED_FROM] +uses_functions: + - extract_iocs_py_cybersecurity params: - { name: types, type: string, default: "" } - { name: max_entities, type: int, default: 200 } diff --git a/enrichers/extract_text_entities/run.py b/enrichers/extract_text_entities/run.py index 1d7290d..b742cd3 100755 --- a/enrichers/extract_text_entities/run.py +++ b/enrichers/extract_text_entities/run.py @@ -98,9 +98,16 @@ def main() -> int: text = open(abs_md, "r", encoding="utf-8", errors="replace").read() progress(0.30, "extracting iocs") - py_funcs = os.path.join(registry_root, "python", "functions") - if py_funcs not in sys.path: - sys.path.insert(0, py_funcs) + # Prefiere _vendored/ (issue 0033b) si existe; si no, fallback al + # registry_root para modo dev local. 
+ vendored = os.path.join(os.path.dirname(__file__), "_vendored") + if os.path.isdir(vendored): + if vendored not in sys.path: + sys.path.insert(0, vendored) + elif registry_root: + py_funcs = os.path.join(registry_root, "python", "functions") + if py_funcs not in sys.path: + sys.path.insert(0, py_funcs) from cybersecurity.extract_iocs import extract_iocs # type: ignore iocs = extract_iocs(text, types_list) diff --git a/enrichers/fetch_webpage/manifest.yaml b/enrichers/fetch_webpage/manifest.yaml index d2a7535..b967f0c 100644 --- a/enrichers/fetch_webpage/manifest.yaml +++ b/enrichers/fetch_webpage/manifest.yaml @@ -4,5 +4,8 @@ description: "Descarga HTML de una URL, extrae markdown limpio (readabilipy) y g applies_to: [Url, Webpage] emits: [Domain] relations: [BELONGS_TO] +uses_functions: + - normalize_url_py_cybersecurity + - html_to_markdown_py_core params: - { name: timeout_s, type: int, default: 15 } diff --git a/enrichers/fetch_webpage/run.py b/enrichers/fetch_webpage/run.py index b23d7d3..6d065e5 100755 --- a/enrichers/fetch_webpage/run.py +++ b/enrichers/fetch_webpage/run.py @@ -38,10 +38,16 @@ def log(msg: str) -> None: def load_registry_funcs(registry_root: str): - """Anade el registry al sys.path e importa funciones que usamos.""" - py_funcs = os.path.join(registry_root, "python", "functions") - if py_funcs not in sys.path: - sys.path.insert(0, py_funcs) + """Importa funciones del registry. 
Prefiere `_vendored/` (issue 0033b); + si no existe, fallback a `/python/functions/` (modo dev).""" + vendored = os.path.join(os.path.dirname(__file__), "_vendored") + if os.path.isdir(vendored): + if vendored not in sys.path: + sys.path.insert(0, vendored) + elif registry_root: + py_funcs = os.path.join(registry_root, "python", "functions") + if py_funcs not in sys.path: + sys.path.insert(0, py_funcs) from cybersecurity.cybersecurity import normalize_url # type: ignore from core.html_to_markdown import html_to_markdown # type: ignore return normalize_url, html_to_markdown diff --git a/issues/completed/0001-claude-chat-agent.md b/issues/completed/0001-claude-chat-agent.md index 88fb17c..4eded0f 100644 --- a/issues/completed/0001-claude-chat-agent.md +++ b/issues/completed/0001-claude-chat-agent.md @@ -1,7 +1,7 @@ --- id: 0001 title: Chat con Claude sobre el grafo -status: pending +status: completed priority: high created: 2026-04-30 --- diff --git a/issues/completed/0003-enricher-web-extract.md b/issues/completed/0003-enricher-web-extract.md index 55173cb..22e5d9b 100644 --- a/issues/completed/0003-enricher-web-extract.md +++ b/issues/completed/0003-enricher-web-extract.md @@ -1,7 +1,7 @@ --- id: 0003 title: Enricher web — descargar URL/dominio y extraer texto -status: pending +status: completed priority: medium created: 2026-04-30 --- diff --git a/issues/completed/0026-jobs-system.md b/issues/completed/0026-jobs-system.md index 3e45aa9..efaab75 100644 --- a/issues/completed/0026-jobs-system.md +++ b/issues/completed/0026-jobs-system.md @@ -1,7 +1,7 @@ --- id: 0026 title: Sistema de jobs — enrichers asincronos en background -status: in_progress +status: completed priority: high created: 2026-05-01 blocks: [0027, 0028, 0029, 0030] diff --git a/issues/completed/0027-webpage-type-cache.md b/issues/completed/0027-webpage-type-cache.md index 1591252..5b52427 100644 --- a/issues/completed/0027-webpage-type-cache.md +++ b/issues/completed/0027-webpage-type-cache.md @@ 
-1,7 +1,7 @@ --- id: 0027 title: Tipo Webpage + cache de documentos descargados -status: pending +status: completed priority: high created: 2026-05-01 depends_on: [0026] diff --git a/issues/completed/0028-enricher-fetch-webpage.md b/issues/completed/0028-enricher-fetch-webpage.md index 53ec317..4ea35e9 100644 --- a/issues/completed/0028-enricher-fetch-webpage.md +++ b/issues/completed/0028-enricher-fetch-webpage.md @@ -1,7 +1,7 @@ --- id: 0028 title: Enricher fetch_webpage (MVP end-to-end) -status: pending +status: completed priority: high created: 2026-05-01 depends_on: [0026, 0027] diff --git a/issues/completed/0028b-enrichers-extract-trio.md b/issues/completed/0028b-enrichers-extract-trio.md index 8fe18df..8b69e8d 100644 --- a/issues/completed/0028b-enrichers-extract-trio.md +++ b/issues/completed/0028b-enrichers-extract-trio.md @@ -1,7 +1,7 @@ --- id: 0028b title: Enrichers extract_domain, extract_links, extract_text_entities -status: pending +status: completed priority: high created: 2026-05-01 depends_on: [0028] diff --git a/issues/completed/0031-stable-layout-on-reload.md b/issues/completed/0031-stable-layout-on-reload.md index e5e0a8d..aaf04b5 100644 --- a/issues/completed/0031-stable-layout-on-reload.md +++ b/issues/completed/0031-stable-layout-on-reload.md @@ -1,7 +1,7 @@ --- id: 0031 title: Layout estable al recargar — auto-save, halo placement, sin fit, physics off -status: in_progress +status: completed priority: high created: 2026-05-01 related_to: [0026] diff --git a/issues/0033b-vendor-python-functions.md b/issues/completed/0033b-vendor-python-functions.md similarity index 98% rename from issues/0033b-vendor-python-functions.md rename to issues/completed/0033b-vendor-python-functions.md index 9e0ce07..764ed50 100644 --- a/issues/0033b-vendor-python-functions.md +++ b/issues/completed/0033b-vendor-python-functions.md @@ -1,9 +1,10 @@ --- id: 0033b title: Vendoring de funciones Python por enricher -status: pending +status: completed priority: high 
created: 2026-05-02
+completed: 2026-05-03
 depends_on: [0033]
 ---
diff --git a/tests/test_vendor_script.py b/tests/test_vendor_script.py
new file mode 100644
index 0000000..06d5320
--- /dev/null
+++ b/tests/test_vendor_script.py
@@ -0,0 +1,153 @@
+"""Tests for the tools/vendor_enricher_python.sh script (issue 0033b).
+
+Verifies:
+  - manifest without Python uses_functions -> _vendored/ is not created.
+  - manifest with one uses_functions entry -> the .py + __init__ are copied.
+  - transitive dep (extract_iocs imports siblings) -> siblings are copied.
+  - .vendor.lock holds the SHA256 + source path.
+  - Idempotency: a 2nd call with the same state redoes nothing.
+  - A manifest change invalidates the lock.
+"""
+from __future__ import annotations
+
+import hashlib
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+from conftest import APP_DIR_SRC, REGISTRY_ROOT
+
+
+SCRIPT = APP_DIR_SRC / "tools" / "vendor_enricher_python.sh"
+
+
+def _make_enricher_dir(tmp_path: Path, manifest: str) -> Path:
+    """Create a throwaway enricher dir with `manifest` and a stub run.py."""
+    enr = tmp_path / "test_enricher"
+    enr.mkdir()
+    (enr / "manifest.yaml").write_text(manifest, encoding="utf-8")
+    (enr / "run.py").write_text("# stub\n", encoding="utf-8")
+    return enr
+
+
+def _run_vendor(enr_dir: Path) -> subprocess.CompletedProcess:
+    """Run the vendoring script on `enr_dir` against REGISTRY_ROOT."""
+    return subprocess.run(
+        ["bash", str(SCRIPT), str(enr_dir), str(REGISTRY_ROOT)],
+        capture_output=True, text=True, timeout=20,
+    )
+
+
+def test_no_uses_functions_does_not_create_vendored(tmp_path):
+    enr = _make_enricher_dir(tmp_path,
+        "id: x\nname: x\napplies_to: [text]\n")
+    proc = _run_vendor(enr)
+    assert proc.returncode == 0, proc.stderr
+    assert not (enr / "_vendored").exists()
+    assert not (enr / ".vendor.lock").exists()
+
+
+def test_single_dep_creates_vendored_layout(tmp_path):
+    enr = _make_enricher_dir(tmp_path,
+        "id: x\nname: x\napplies_to: [Url]\n"
+        "uses_functions:\n"
+        " - normalize_url_py_cybersecurity\n")
+    proc = _run_vendor(enr)
+    assert proc.returncode == 0, proc.stderr
+    assert (enr / "_vendored" / "__init__.py").exists()
+    assert (enr / "_vendored" / "cybersecurity" / "__init__.py").exists()
+    assert (enr / "_vendored" / "cybersecurity" / "cybersecurity.py").exists()
+    assert (enr / ".vendor.lock").exists()
+    lock = (enr / ".vendor.lock").read_text()
+    assert "normalize_url_py_cybersecurity" in lock
+
+
+def test_transitive_siblings_are_copied(tmp_path):
+    """extract_iocs.py imports sibling modules; all of them must be copied."""
+    enr = _make_enricher_dir(tmp_path,
+        "id: x\nname: x\napplies_to: [Webpage]\n"
+        "uses_functions:\n"
+        " - extract_iocs_py_cybersecurity\n")
+    proc = _run_vendor(enr)
+    assert proc.returncode == 0, proc.stderr
+    cyb = enr / "_vendored" / "cybersecurity"
+    assert (cyb / "extract_iocs.py").exists()
+    expected_siblings = {
+        "extract_ip_addresses.py", "extract_emails.py",
+        "extract_domains.py", "extract_file_hashes.py",
+        "extract_crypto_wallets.py", "extract_cve_ids.py",
+        "extract_mac_addresses.py", "extract_phone_numbers.py",
+    }
+    found = {p.name for p in cyb.glob("*.py")}
+    missing = expected_siblings - found
+    assert not missing, f"siblings no copiados: {missing}"
+
+
+def test_lock_contains_correct_sha256(tmp_path):
+    enr = _make_enricher_dir(tmp_path,
+        "id: x\nname: x\napplies_to: [Url]\n"
+        "uses_functions:\n - normalize_url_py_cybersecurity\n")
+    proc = _run_vendor(enr)
+    assert proc.returncode == 0, proc.stderr
+
+    src = REGISTRY_ROOT / "python" / "functions" / "cybersecurity" / "cybersecurity.py"
+    expected_sha = hashlib.sha256(src.read_bytes()).hexdigest()
+
+    lock = (enr / ".vendor.lock").read_text()
+    assert expected_sha in lock, lock
+
+
+def test_idempotency_skips_when_unchanged(tmp_path):
+    enr = _make_enricher_dir(tmp_path,
+        "id: x\nname: x\napplies_to: [Url]\n"
+        "uses_functions:\n - normalize_url_py_cybersecurity\n")
+    p1 = _run_vendor(enr)
+    assert p1.returncode == 0
+    p2 = _run_vendor(enr)
+    assert p2.returncode == 0
+    assert "sin cambios" in p2.stdout, p2.stdout
+
+
+def test_manifest_change_invalidates_lock(tmp_path):
+    """Declaring a different function must re-vendor and rewrite the lock."""
+    enr = _make_enricher_dir(tmp_path,
+        "id: x\nname: x\napplies_to: [Url]\n"
+        "uses_functions:\n - normalize_url_py_cybersecurity\n")
+    assert _run_vendor(enr).returncode == 0
+    (enr / "manifest.yaml").write_text(
+        "id: x\nname: x\napplies_to: [Url]\n"
+        "uses_functions:\n - extract_urls_py_cybersecurity\n",
+        encoding="utf-8")
+    p2 = _run_vendor(enr)
+    assert p2.returncode == 0, p2.stderr
+    assert "sin cambios" not in p2.stdout, p2.stdout
+    lock = (enr / ".vendor.lock").read_text()
+    assert "extract_urls_py_cybersecurity" in lock
+    assert "normalize_url_py_cybersecurity" not in lock
+
+
+def test_vendored_module_can_be_imported_in_isolation(tmp_path):
+    """Smoke: the resulting _vendored/ is importable without registry_root."""
+    enr = _make_enricher_dir(tmp_path,
+        "id: x\nname: x\napplies_to: [Webpage]\n"
+        "uses_functions:\n - extract_urls_py_cybersecurity\n")
+    proc = _run_vendor(enr)
+    assert proc.returncode == 0, proc.stderr
+
+    # Spawn a separate interpreter (the one running pytest, not whatever
+    # `python3` resolves to on PATH) with _vendored as the only extra path.
+    code = (
+        "import sys; sys.path.insert(0, sys.argv[1]);"
+        "from cybersecurity.cybersecurity import extract_urls;"
+        "print(len(extract_urls('foo http://x.com bar')))"
+    )
+    proc2 = subprocess.run(
+        [sys.executable, "-c", code, str(enr / "_vendored")],
+        cwd=str(tmp_path), capture_output=True, text=True, timeout=10,
+    )
+    assert proc2.returncode == 0, proc2.stderr
+    assert proc2.stdout.strip() == "1"
diff --git a/tools/vendor_enricher_python.sh b/tools/vendor_enricher_python.sh
new file mode 100755
index 0000000..7bf8ab0
--- /dev/null
+++ b/tools/vendor_enricher_python.sh
@@ -0,0 +1,187 @@
+#!/usr/bin/env bash
+# vendor_enricher_python.sh — copies the registry Python functions that
+# an enricher declares in `uses_functions` into its `_vendored/`
+# directory. run.py then imports from `_vendored/` instead of
+# `<registry_root>/python/functions/`, which makes the binary
+# distributable without access to the fn_registry.
+#
+# Issue 0033b.
+#
+# Usage:
+#   tools/vendor_enricher_python.sh <enricher_dir> [<registry_root>]
+#
+# Reads `uses_functions` from the YAML manifest, keeps the `*_py_*` IDs,
+# resolves `file_path` from registry.db, and copies the .py files plus
+# every sibling import inside the same domain (transitively).
+#
+# Writes `.vendor.lock` with `<id> <sha256> <path>` per line for
+# auditing. Idempotent — if the hashes match, nothing is copied.
+#
+# Output:
+#   <enricher_dir>/_vendored/__init__.py
+#   <enricher_dir>/_vendored/<domain>/__init__.py
+#   <enricher_dir>/_vendored/<domain>/<module>.py
+#   <enricher_dir>/.vendor.lock
+
+set -euo pipefail
+
+# $1 = enricher directory (required); $2 = registry root, falling back to
+# the REGISTRY_ROOT environment variable and then to the current directory.
+ENR_DIR="${1:?enricher_dir requerido}"
+REGISTRY_ROOT="${2:-${REGISTRY_ROOT:-$(pwd)}}"
+
+if [[ ! -f "$ENR_DIR/manifest.yaml" ]]; then
+    echo "ERROR: $ENR_DIR/manifest.yaml no existe" >&2
+    exit 1
+fi
+if [[ ! -f "$REGISTRY_ROOT/registry.db" ]]; then
+    echo "ERROR: $REGISTRY_ROOT/registry.db no existe (REGISTRY_ROOT incorrecto)" >&2
+    exit 2
+fi
+
+VENDOR="$ENR_DIR/_vendored"
+LOCK="$ENR_DIR/.vendor.lock"
+
+# ----------------------------------------------------------------------------
+# Read uses_functions from the manifest (YAML subset supported by the C++
+# indexer). Accepts the inline form `[a, b]` or a list on following lines
+# starting with `- `.
+#
+# Both forms strip single (\047) and double quotes so that `['a', "b"]`
+# and `- 'a'` yield the same bare ids. The trailing awk drops empty
+# tokens and repeated ids (keeping first-seen order) so a duplicated
+# manifest entry does not produce duplicate lock rows. `|| true` keeps
+# `set -e -o pipefail` from aborting when grep finds no `_py_` id.
+# ----------------------------------------------------------------------------
+ids=$(awk '
+    /^uses_functions:[[:space:]]*\[/ {
+        line = $0
+        sub(/^uses_functions:[[:space:]]*\[/, "", line)
+        sub(/\].*$/, "", line)
+        gsub(/[\047",]/, " ", line)
+        print line
+        exit
+    }
+    /^uses_functions:[[:space:]]*$/ { collecting = 1; next }
+    collecting && /^[[:space:]]*-[[:space:]]+/ {
+        sub(/^[[:space:]]*-[[:space:]]+/, "")
+        sub(/[[:space:]]*#.*$/, "")
+        gsub(/[\047"]/, "")
+        print
+        next
+    }
+    collecting && /^[^[:space:]-]/ { collecting = 0 }
+' "$ENR_DIR/manifest.yaml" | tr ' ' '\n' | awk 'NF && !seen[$0]++' | grep '_py_' || true)
+
+if [[ -z "$ids" ]]; then
+    # Nothing to vendor — remove _vendored/ and the lock in case they were
+    # left behind by a previous manifest.
+    rm -rf "$VENDOR" "$LOCK"
+    echo "vendor: $ENR_DIR — sin uses_functions Python"
+    exit 0
+fi
+
+# ----------------------------------------------------------------------------
+# For each ID, look up file_path in registry.db and queue an
+# `id|abs_src_path` entry — the atomic unit of copying.
+# ----------------------------------------------------------------------------
+declare -A SEEN      # absolute paths already processed (dedup)
+
+queue=()
+for id in $ids; do
+    # The id is interpolated into the SQL text below, so only plain
+    # identifier characters are accepted: a stray quote in the manifest
+    # must not be able to alter the query.
+    if [[ ! "$id" =~ ^[A-Za-z0-9_.-]+$ ]]; then
+        echo "WARN: id invalido '$id' (skip)" >&2
+        continue
+    fi
+    fp=$(sqlite3 "$REGISTRY_ROOT/registry.db" \
+        "SELECT file_path FROM functions WHERE id='$id';")
+    if [[ -z "$fp" ]]; then
+        echo "WARN: $id no esta en registry.db (skip)" >&2
+        continue
+    fi
+    abs="$REGISTRY_ROOT/$fp"
+    if [[ ! -f "$abs" ]]; then
+        echo "WARN: file_path '$fp' no existe (skip $id)" >&2
+        continue
+    fi
+    queue+=("$id|$abs")
+done
+
+if [[ ${#queue[@]} -eq 0 ]]; then
+    rm -rf "$VENDOR" "$LOCK"
+    echo "vendor: $ENR_DIR — ninguna funcion Python resoluble"
+    exit 0
+fi
+
+# ----------------------------------------------------------------------------
+# Idempotency: compare the `id|sha256` list of the declared sources with
+# the one recorded in the existing lock. If they match, and _vendored/
+# is still present, exit without touching anything.
+#
+# NOTE: only the declared files are hashed; an edit to a transitively
+# copied sibling is not detected by this check.
+# ----------------------------------------------------------------------------
+state_hash=""
+for entry in "${queue[@]}"; do
+    id="${entry%%|*}"
+    abs="${entry#*|}"
+    sha=$(sha256sum "$abs" | cut -d' ' -f1)
+    state_hash+="$id|$sha"$'\n'
+done
+
+# Require the directory too: a lock left behind after _vendored/ was
+# deleted must not short-circuit the copy.
+if [[ -f "$LOCK" && -d "$VENDOR" ]]; then
+    cur_state=""
+    # lock line format: "<id> <sha256> <path>"
+    while read -r eid esha _; do
+        cur_state+="$eid|$esha"$'\n'
+    done < "$LOCK"
+    if [[ "$cur_state" == "$state_hash" ]]; then
+        echo "vendor: $ENR_DIR — sin cambios (.vendor.lock OK)"
+        exit 0
+    fi
+fi
+
+# ----------------------------------------------------------------------------
+# Copy with transitive expansion of sibling imports inside the same
+# domain. If a vendored .py has `from X import Y` or `import X` where X
+# is a module in the same directory, X.py is copied too (only if it
+# exists next to the original source).
+# ----------------------------------------------------------------------------
+rm -rf "$VENDOR"
+mkdir -p "$VENDOR"
+touch "$VENDOR/__init__.py"
+
+# copy_with_siblings <abs_path>
+#   Copies one registry .py into $VENDOR/<domain>/ (creating the package
+#   __init__.py) and recurses into every module it imports that exists in
+#   the same source directory. SEEN marks files already handled, so a
+#   file is copied once and import cycles terminate.
+copy_with_siblings() {
+    local abs="$1"
+    if [[ -n "${SEEN[$abs]:-}" ]]; then return 0; fi
+    SEEN[$abs]=1
+
+    # Infer the domain from the path: the directory that directly
+    # contains the file, e.g. python/functions/<domain>/<module>.py
+    local rel
+    rel=$(realpath --relative-to="$REGISTRY_ROOT" "$abs")
+    local domain
+    domain=$(echo "$rel" | awk -F/ '{print $(NF-1)}')
+    local fname
+    fname=$(basename "$abs")
+    local dst_dir="$VENDOR/$domain"
+    mkdir -p "$dst_dir"
+    touch "$dst_dir/__init__.py"
+    cp "$abs" "$dst_dir/$fname"
+
+    # Scan for sibling imports: lines `from <name> import` or
+    # `import <name>` where <name>.py exists in the same dir as abs.
+    # `|| true` is required: under `set -euo pipefail` a module without
+    # any import line makes grep exit 1, which would otherwise abort the
+    # whole script here with _vendored/ half populated.
+    local src_dir
+    src_dir=$(dirname "$abs")
+    local sibling_names
+    sibling_names=$(grep -E '^[[:space:]]*(from [a-zA-Z_][a-zA-Z0-9_]* import|import [a-zA-Z_][a-zA-Z0-9_]*)' "$abs" \
+        | sed -E 's/^[[:space:]]*from ([a-zA-Z_][a-zA-Z0-9_]*).*/\1/; s/^[[:space:]]*import ([a-zA-Z_][a-zA-Z0-9_]*).*/\1/' \
+        | sort -u || true)
+    local name sib
+    for name in $sibling_names; do
+        sib="$src_dir/$name.py"
+        if [[ -f "$sib" && "$sib" != "$abs" ]]; then
+            copy_with_siblings "$sib"
+        fi
+    done
+}
+
+# Build the lock in a temp file and move it into place only after every
+# copy succeeded. One row per declared function: "<id> <sha256> <path>",
+# with the path relative to REGISTRY_ROOT.
+: > "$LOCK.tmp"
+for entry in "${queue[@]}"; do
+    id="${entry%%|*}"
+    abs="${entry#*|}"
+    copy_with_siblings "$abs"
+    sha=$(sha256sum "$abs" | cut -d' ' -f1)
+    rel=$(realpath --relative-to="$REGISTRY_ROOT" "$abs")
+    echo "$id $sha $rel" >> "$LOCK.tmp"
+done
+
+mv "$LOCK.tmp" "$LOCK"
+n=$(wc -l < "$LOCK")
+echo "vendor: $ENR_DIR — $n funcs declaradas, $(find "$VENDOR" -name '*.py' | wc -l) archivos copiados"