ee0d26ce2d
Cada enricher con `lang: python` y `uses_functions` no vacio ahora
puede empaquetar las funciones del registry que necesita en
`<enricher>/_vendored/`. El run.py importa de ahi en lugar de
`<registry_root>/python/functions/`, lo que hace al binario
distribuible sin dependencia de un fn_registry montado.
Cambios:
1. tools/vendor_enricher_python.sh
- Lee `uses_functions` del manifest (filtrando IDs `*_py_*`).
- Resuelve `file_path` desde registry.db.
- Copia recursivamente con expansion transitiva: si un fichero
vendorizado importa siblings del mismo dominio, los siblings
tambien se copian (resuelve el caso `extract_iocs.py` que
importa 7 modulos hermanos).
- Genera `.vendor.lock` con `<id> <sha256> <src_path>` por
funcion declarada para auditoria.
- Idempotente — si todos los hashes coinciden, no rehace nada.
2. Manifests actualizados con `uses_functions`:
- fetch_webpage: normalize_url + html_to_markdown
- extract_links: extract_urls
- extract_text_entities: extract_iocs
3. run.py de los 3 enrichers afectados: importan de `_vendored/`
si existe, fallback a `<registry_root>/python/functions/` en
modo dev (mantiene los tests pytest funcionando).
4. app.md: anade `cryptography` a python_runtime_deps porque el
blob `cybersecurity.cybersecurity` lo importa al top.
5. Tests:
- test_vendor_script.py — 6 tests del script: layout correcto,
transitive siblings, lock con SHA256, idempotencia, modulos
importables en aislamiento.
- 16 tests de enrichers existentes pasan via vendoring (no usan
registry_root porque _vendored/ tiene prioridad).
6. Issue 0033b movido a issues/completed/.
Tests: 32/32 verde (16 enrichers + 6 dispatcher + 4 runtime + 6
vendor).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
147 lines
4.7 KiB
Python
Executable File
147 lines
4.7 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Enricher extract_links — issue 0028b.
|
|
|
|
Reads the cached markdown of a Webpage (metadata.markdown_path), extracts all
unique URLs with `extract_urls_py_cybersecurity`, and creates/connects one
Url node per new URL, with a LINKS_TO relation from the source Webpage.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
import time
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
def progress(p: float, stage: str = "") -> None:
    """Write a progress line to stderr and flush it immediately.

    The emitted line has the form ``PROGRESS:<p with 2 decimals> <stage>``.

    Args:
        p: Progress value; formatted with two decimal places.
        stage: Optional label appended after the number.
    """
    sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
    # Flush so the line is visible right away instead of sitting in a buffer.
    sys.stderr.flush()
|
|
|
|
|
|
def log(msg: str) -> None:
    """Write ``msg`` followed by a newline to stderr and flush it.

    Args:
        msg: Text of the diagnostic line.
    """
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()
|
|
|
|
|
|
def now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
def now_ms() -> int:
    """Return the current Unix time in whole milliseconds (truncated)."""
    return int(time.time() * 1000)
|
|
|
|
|
|
def main() -> int:
    """Run the extract_links enricher once and return the process exit code.

    Reads a single JSON context object from stdin. Keys used:
    ``node_id``, ``metadata`` (a dict or a JSON-encoded string),
    ``ops_db_path``, ``app_dir``, ``registry_root`` and ``params``
    (``max_links``, default 50; a value <= 0 disables the cap).

    The markdown file referenced by ``metadata["markdown_path"]`` is read,
    URLs are extracted with ``cybersecurity.cybersecurity.extract_urls``,
    de-duplicated in first-seen order and capped at ``max_links``. For each
    URL, a ``Url`` row is inserted into ``entities`` unless one with that
    name already exists, and a ``LINKS_TO`` row from ``node_id`` is inserted
    into ``relations`` unless it already exists. All writes are committed
    once, after the loop.

    A JSON summary is printed to stdout; progress and diagnostics go to
    stderr.

    Returns:
        0 on success, 2 when ``node_id`` or ``ops_db_path`` is missing,
        3 when the metadata has no ``markdown_path``, 4 when the markdown
        file does not exist.
    """
    ctx = json.loads(sys.stdin.read())
    node_id = ctx.get("node_id") or ""

    metadata = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except (ValueError, RecursionError):
            # Undecodable metadata is treated as absent (best effort).
            metadata = {}
    if not isinstance(metadata, dict):
        # A JSON list/number/string has no .get(); treat it as empty
        # instead of crashing with AttributeError below.
        metadata = {}

    ops_db = ctx.get("ops_db_path") or ""
    app_dir = ctx.get("app_dir") or ""
    registry_root = ctx.get("registry_root") or ""

    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}
    try:
        max_links = int(params.get("max_links", 50))
    except (TypeError, ValueError):
        # e.g. an explicit null or a non-numeric string in the params.
        log("invalid max_links param, using default 50")
        max_links = 50

    if not node_id or not ops_db:
        log("missing node_id / ops_db_path")
        return 2

    md_path = metadata.get("markdown_path") or ""
    if not md_path:
        log("nodo sin markdown_path — corre fetch_webpage primero")
        print(json.dumps({"error": "missing markdown_path. Run fetch_webpage first.",
                          "entities_added": 0, "relations_added": 0}))
        return 3

    # A relative markdown path is resolved against app_dir.
    abs_md = md_path if os.path.isabs(md_path) else os.path.join(app_dir, md_path)
    if not os.path.exists(abs_md):
        log(f"markdown not found at {abs_md}")
        print(json.dumps({"error": f"markdown not found: {abs_md}",
                          "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.20, "reading")
    # Context manager so the file handle is closed deterministically.
    with open(abs_md, "r", encoding="utf-8", errors="replace") as md_file:
        text = md_file.read()

    progress(0.45, "extracting")
    # Prefer _vendored/ (issue 0033b) when it exists; otherwise fall back to
    # registry_root for local dev mode.
    vendored = os.path.join(os.path.dirname(__file__), "_vendored")
    if os.path.isdir(vendored):
        if vendored not in sys.path:
            sys.path.insert(0, vendored)
    elif registry_root:
        py_funcs = os.path.join(registry_root, "python", "functions")
        if py_funcs not in sys.path:
            sys.path.insert(0, py_funcs)
    from cybersecurity.cybersecurity import extract_urls  # type: ignore

    urls = extract_urls(text)
    # De-duplicate while keeping first-seen order (dicts preserve insertion
    # order).
    unique = list(dict.fromkeys(urls))
    if max_links > 0:
        unique = unique[:max_links]

    progress(0.65, "writing")
    conn = sqlite3.connect(ops_db)
    entities_added = 0
    relations_added = 0
    # Loop-invariant denominator; max(1, ...) avoids division by zero.
    total = max(1, len(unique))
    try:
        for i, u in enumerate(unique):
            existed = conn.execute(
                "SELECT id FROM entities WHERE type_ref='Url' AND name=? LIMIT 1",
                (u,),
            ).fetchone()
            if existed:
                target_id = existed[0]
            else:
                # The loop index keeps ids distinct within the same
                # millisecond of one run.
                target_id = f"Url_{now_ms()}_{i}"
                ts = now_iso()
                meta_json = json.dumps({"url": u})
                conn.execute(
                    "INSERT INTO entities (id, name, type_ref, source, metadata, created_at, updated_at) "
                    "VALUES (?, ?, 'Url', 'enricher:extract_links', ?, ?, ?)",
                    (target_id, u, meta_json, ts, ts),
                )
                entities_added += 1

            rel_exists = conn.execute(
                "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name='LINKS_TO' LIMIT 1",
                (node_id, target_id),
            ).fetchone()
            if not rel_exists:
                ts = now_iso()
                conn.execute(
                    "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
                    "VALUES (?, 'LINKS_TO', ?, ?, ?, ?)",
                    (f"rel_{now_ms()}_{i}_links_to", node_id, target_id, ts, ts),
                )
                relations_added += 1

            # Report progress every 10th URL, scaled into the 0.65-0.95 band.
            if i % 10 == 0:
                progress(0.65 + 0.30 * (i / total), "writing")
        conn.commit()
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "links_found": len(unique),
        "entities_added": entities_added,
        "relations_added": relations_added,
    }))
    return 0
|
|
|
|
|
|
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
|