#!/usr/bin/env python3
"""Enricher extract_links — issue 0028b.

Reads the cached markdown of a Webpage node (``metadata.markdown_path``),
extracts every unique URL with ``extract_urls`` (registry function
``extract_urls_py_cybersecurity``) and creates/connects one ``Url`` node per
new URL, with a ``LINKS_TO`` relation from the source Webpage.

Protocol, as implemented below:

* stdin  — one JSON object with ``node_id``, ``metadata``, ``ops_db_path``,
  ``app_dir``, ``registry_root`` and ``params`` (``params.max_links``).
* stderr — ``PROGRESS:<fraction> <stage>`` lines and free-form log lines.
* stdout — one JSON object with the result or an ``error`` field.

Exit codes: 0 success, 2 missing ``node_id``/``ops_db_path``, 3 node has no
``markdown_path``, 4 markdown file not found, 5 URL extractor not importable.
"""

from __future__ import annotations

import json
import os
import sqlite3
import sys
import time
from datetime import datetime, timezone

# Cap on the number of links processed when params.max_links is absent/invalid.
DEFAULT_MAX_LINKS = 50


def progress(p: float, stage: str = "") -> None:
    """Write a ``PROGRESS:<p> <stage>`` line to stderr and flush it."""
    sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
    sys.stderr.flush()


def log(msg: str) -> None:
    """Write *msg* as one line to stderr and flush it."""
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()


def now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def now_ms() -> int:
    """Return the current Unix time in whole milliseconds."""
    return int(time.time() * 1000)


def _as_dict(value: object) -> dict:
    """Return *value* as a dict, decoding JSON strings; anything else -> {}."""
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except ValueError:
            return {}
    # A JSON list/number/None has no .get(); treat it as "no data".
    return value if isinstance(value, dict) else {}


def _parse_max_links(params: dict) -> int:
    """Return params['max_links'] as int, or the default if it is unusable."""
    raw = params.get("max_links", DEFAULT_MAX_LINKS)
    try:
        return int(raw)
    except (TypeError, ValueError):
        log(f"invalid max_links {raw!r}; using default {DEFAULT_MAX_LINKS}")
        return DEFAULT_MAX_LINKS


def _fail(code: int, log_msg: str, error: str) -> int:
    """Log *log_msg*, print the error JSON on stdout and return *code*."""
    log(log_msg)
    print(json.dumps({"error": error,
                      "entities_added": 0, "relations_added": 0}))
    return code


def _load_extract_urls(registry_root: str):
    """Import and return ``cybersecurity.cybersecurity.extract_urls``.

    Prefers the ``_vendored/`` directory next to this script (issue 0033b)
    when it exists; otherwise falls back to ``<registry_root>/python/functions``
    for local dev mode. Raises ImportError if the package cannot be found.
    """
    vendored = os.path.join(os.path.dirname(__file__), "_vendored")
    if os.path.isdir(vendored):
        if vendored not in sys.path:
            sys.path.insert(0, vendored)
    elif registry_root:
        py_funcs = os.path.join(registry_root, "python", "functions")
        if py_funcs not in sys.path:
            sys.path.insert(0, py_funcs)
    from cybersecurity.cybersecurity import extract_urls  # type: ignore
    return extract_urls


def _persist_links(conn: sqlite3.Connection, node_id: str,
                   urls: list) -> tuple[int, int]:
    """Insert missing Url entities and LINKS_TO relations for *urls*.

    Returns ``(entities_added, relations_added)``. Does not commit; the caller
    owns the transaction.
    """
    entities_added = 0
    relations_added = 0
    total = max(1, len(urls))
    for i, url in enumerate(urls):
        row = conn.execute(
            "SELECT id FROM entities WHERE type_ref='Url' AND name=? LIMIT 1",
            (url,),
        ).fetchone()
        if row:
            target_id = row[0]
        else:
            # The loop index keeps ids distinct within the same millisecond.
            target_id = f"Url_{now_ms()}_{i}"
            ts = now_iso()
            conn.execute(
                "INSERT INTO entities (id, name, type_ref, source, metadata, created_at, updated_at) "
                "VALUES (?, ?, 'Url', 'enricher:extract_links', ?, ?, ?)",
                (target_id, url, json.dumps({"url": url}), ts, ts),
            )
            entities_added += 1

        rel_exists = conn.execute(
            "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? "
            "AND name='LINKS_TO' LIMIT 1",
            (node_id, target_id),
        ).fetchone()
        if not rel_exists:
            ts = now_iso()
            conn.execute(
                "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
                "VALUES (?, 'LINKS_TO', ?, ?, ?, ?)",
                (f"rel_{now_ms()}_{i}_links_to", node_id, target_id, ts, ts),
            )
            relations_added += 1

        if i % 10 == 0:
            progress(0.65 + 0.30 * (i / total), "writing")
    return entities_added, relations_added


def main() -> int:
    """Run the enricher once; return the process exit code (see module doc)."""
    ctx = json.loads(sys.stdin.read())
    node_id = ctx.get("node_id") or ""
    metadata = _as_dict(ctx.get("metadata"))
    ops_db = ctx.get("ops_db_path") or ""
    app_dir = ctx.get("app_dir") or ""
    registry_root = ctx.get("registry_root") or ""
    max_links = _parse_max_links(_as_dict(ctx.get("params")))

    if not node_id or not ops_db:
        log("missing node_id / ops_db_path")
        return 2

    md_path = metadata.get("markdown_path") or ""
    if not md_path:
        return _fail(
            3,
            "nodo sin markdown_path — corre fetch_webpage primero",
            "missing markdown_path. Run fetch_webpage first.",
        )

    # A relative markdown_path is resolved against app_dir.
    abs_md = md_path if os.path.isabs(md_path) else os.path.join(app_dir, md_path)
    if not os.path.exists(abs_md):
        return _fail(4, f"markdown not found at {abs_md}",
                     f"markdown not found: {abs_md}")

    progress(0.20, "reading")
    with open(abs_md, "r", encoding="utf-8", errors="replace") as fh:
        text = fh.read()

    progress(0.45, "extracting")
    try:
        extract_urls = _load_extract_urls(registry_root)
    except ImportError as exc:
        return _fail(5, f"cannot import extract_urls: {exc}",
                     f"extract_urls not importable: {exc}")

    # De-duplicate while keeping first-seen order.
    unique = list(dict.fromkeys(extract_urls(text)))
    # max_links <= 0 means "no limit".
    if max_links > 0:
        unique = unique[:max_links]

    progress(0.65, "writing")
    conn = sqlite3.connect(ops_db)
    try:
        entities_added, relations_added = _persist_links(conn, node_id, unique)
        # Single commit: closing without it discards the uncommitted writes.
        conn.commit()
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "links_found": len(unique),
        "entities_added": entities_added,
        "relations_added": relations_added,
    }))
    return 0


if __name__ == "__main__":
    sys.exit(main())