# graph_explorer/enrichers/extract_links/run.py

#!/usr/bin/env python3
"""Enricher extract_links — issue 0028b.

Reads the cached markdown of a Webpage (metadata.markdown_path), extracts all
unique URLs with `extract_urls_py_cybersecurity`, and creates/connects a Url
node for each new URL with a LINKS_TO relation from the source Webpage.
"""
from __future__ import annotations

import json
import os
import sqlite3
import sys
import time
from datetime import datetime, timezone


def progress(p: float, stage: str = "") -> None:
    """Emit a ``PROGRESS:<fraction> <stage>`` line on stderr and flush it.

    Args:
        p: Progress value, rendered with two decimal places.
        stage: Optional label appended after the value.
    """
    stream = sys.stderr
    line = "PROGRESS:{:.2f} {}\n".format(p, stage)
    stream.write(line)
    stream.flush()


def log(msg: str) -> None:
    """Write *msg* followed by a newline to stderr and flush immediately."""
    stream = sys.stderr
    stream.write("{}\n".format(msg))
    stream.flush()


def now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current = datetime.now(tz=timezone.utc)
    return current.strftime("%Y-%m-%dT%H:%M:%SZ")

def now_ms() -> int:
    """Return the current Unix time in whole milliseconds (truncated)."""
    seconds = time.time()
    return int(seconds * 1000)

def _coerce_metadata(raw: object) -> dict:
    """Return node metadata as a dict, decoding it first when it is a JSON string.

    Anything that is not (or does not decode to) a JSON object yields ``{}``,
    so callers can always use ``.get`` on the result.
    """
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return {}
    return raw if isinstance(raw, dict) else {}


def _parse_max_links(params: dict, default: int = 50) -> int:
    """Read ``params["max_links"]`` as an int, falling back to *default* if invalid."""
    raw = params.get("max_links", default)
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        log(f"invalid max_links {raw!r}; using {default}")
        return default


def _store_links(conn: sqlite3.Connection, node_id: str, urls: list) -> tuple:
    """Upsert one Url entity per URL and a LINKS_TO relation from *node_id*.

    Does not commit; the caller owns the transaction.

    Returns:
        ``(entities_added, relations_added)``.
    """
    entities_added = 0
    relations_added = 0
    total = max(1, len(urls))  # avoid division by zero in the progress ratio
    for i, u in enumerate(urls):
        existed = conn.execute(
            "SELECT id FROM entities WHERE type_ref='Url' AND name=? LIMIT 1",
            (u,),
        ).fetchone()
        if existed:
            target_id = existed[0]
        else:
            # Timestamp plus loop index keeps ids unique within a single run.
            target_id = f"Url_{now_ms()}_{i}"
            ts = now_iso()
            meta_json = json.dumps({"url": u})
            conn.execute(
                "INSERT INTO entities (id, name, type_ref, source, metadata, created_at, updated_at) "
                "VALUES (?, ?, 'Url', 'enricher:extract_links', ?, ?, ?)",
                (target_id, u, meta_json, ts, ts),
            )
            entities_added += 1

        rel_exists = conn.execute(
            "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name='LINKS_TO' LIMIT 1",
            (node_id, target_id),
        ).fetchone()
        if not rel_exists:
            ts = now_iso()
            conn.execute(
                "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
                "VALUES (?, 'LINKS_TO', ?, ?, ?, ?)",
                (f"rel_{now_ms()}_{i}_links_to", node_id, target_id, ts, ts),
            )
            relations_added += 1
        if i % 10 == 0:
            progress(0.65 + 0.30 * (i / total), "writing")
    return entities_added, relations_added


def main() -> int:
    """Run the extract_links enricher for the node described on stdin.

    Reads a JSON context from stdin (keys used: ``node_id``, ``metadata``,
    ``ops_db_path``, ``app_dir``, ``registry_root``, ``params``), loads the
    markdown file named by ``metadata["markdown_path"]``, extracts its URLs,
    and records them in the SQLite database at ``ops_db_path``. A JSON summary
    is printed to stdout; progress and log lines go to stderr.

    Returns:
        Exit code: ``0`` on success, ``2`` when ``node_id`` or ``ops_db_path``
        is missing, ``3`` when the node has no ``markdown_path``, ``4`` when
        the markdown file does not exist.
    """
    ctx = json.loads(sys.stdin.read())
    node_id = ctx.get("node_id") or ""
    metadata = _coerce_metadata(ctx.get("metadata"))
    ops_db = ctx.get("ops_db_path") or ""
    app_dir = ctx.get("app_dir") or ""
    registry_root = ctx.get("registry_root") or ""
    params = ctx.get("params")
    if not isinstance(params, dict):
        params = {}
    max_links = _parse_max_links(params)

    if not node_id or not ops_db:
        log("missing node_id / ops_db_path")
        return 2

    md_path = metadata.get("markdown_path") or ""
    if not md_path:
        log("nodo sin markdown_path — corre fetch_webpage primero")
        print(json.dumps({"error": "missing markdown_path. Run fetch_webpage first.",
                          "entities_added": 0, "relations_added": 0}))
        return 3

    # A relative markdown path is resolved against app_dir.
    abs_md = md_path if os.path.isabs(md_path) else os.path.join(app_dir, md_path)
    # isfile (not exists) so a directory is reported as missing instead of
    # crashing inside open().
    if not os.path.isfile(abs_md):
        log(f"markdown not found at {abs_md}")
        print(json.dumps({"error": f"markdown not found: {abs_md}",
                          "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.20, "reading")
    with open(abs_md, "r", encoding="utf-8", errors="replace") as fh:
        text = fh.read()

    progress(0.45, "extracting")
    # The extractor lives under the registry, so it is imported only after
    # its directory has been put on sys.path.
    py_funcs = os.path.join(registry_root, "python", "functions")
    if py_funcs not in sys.path:
        sys.path.insert(0, py_funcs)
    from cybersecurity.cybersecurity import extract_urls  # type: ignore

    urls = extract_urls(text) or []
    # De-duplicate while keeping first-seen order (dicts preserve insertion order).
    unique = list(dict.fromkeys(urls))
    if max_links > 0:  # zero or negative means "no limit"
        unique = unique[:max_links]

    progress(0.65, "writing")
    conn = sqlite3.connect(ops_db)
    try:
        entities_added, relations_added = _store_links(conn, node_id, unique)
        # Single commit at the end: a failure mid-loop leaves the DB untouched.
        conn.commit()
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "links_found":     len(unique),
        "entities_added":  entities_added,
        "relations_added": relations_added,
    }))
    return 0


if __name__ == "__main__":
    sys.exit(main())