Files
graph_explorer/enrichers/extract_links/run.py
T
egutierrez 7ec6c4e09f feat(enrichers): cuatro enrichers web — fetch + extract trio (issues 0028, 0028b)
Cada enricher es un par manifest.yaml + run.py en enrichers/<id>/.

1. fetch_webpage (Url, Webpage):
   HTTP GET (requests, fallback urllib) -> html_to_markdown_py_core ->
   sha256(url) -> guarda HTML+MD en cache/<aa>/<sha>.{html,md}. Convierte
   Url -> Webpage con metadata enriquecida (title/status_code/content_type/
   paths/text_length). Crea Domain con relación BELONGS_TO.

2. extract_domain (Url, Webpage, Email):
   Saca dominio de metadata.url o metadata.address (sin I/O). Crea/conecta
   Domain con BELONGS_TO. Útil cuando el usuario quiere ver el dominio
   antes de fetch.

3. extract_links (Webpage):
   Lee metadata.markdown_path -> extract_urls_py_cybersecurity -> dedup ->
   crea nodo Url por enlace + relación LINKS_TO. Param max_links (50).

4. extract_text_entities (Webpage):
   Lee metadata.markdown_path -> extract_iocs_py_cybersecurity (regex puro,
   sin coste) -> crea entidades por (type, value) tipadas en el registro:
   Email, IPAddress, Domain, FileHash, CryptoWallet, CVE, MACAddress, Phone.
   Cada una con relación EXTRACTED_FROM al Webpage origen. v1 sin GLiNER/
   GLiREL — esos requieren modelos precargados (futura iteración).

Probado end-to-end:
  fetch_webpage  https://httpbin.org/html -> 1 Webpage + 1 Domain
  extract_links  -> 2 Url + 2 LINKS_TO
  extract_text_entities -> 8 IoCs (Email, IP*2, CVE, Domain*2, Wallet, Phone)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 18:24:52 +02:00

140 lines
4.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""Enricher extract_links — issue 0028b.

Reads the cached markdown of a Webpage (metadata.markdown_path), extracts every
unique URL with `extract_urls_py_cybersecurity`, and creates/connects one Url
node per new URL with a LINKS_TO relation from the source Webpage.
"""
from __future__ import annotations
import json
import os
import sqlite3
import sys
import time
from datetime import datetime, timezone
def progress(p: float, stage: str = "") -> None:
    """Report progress on stderr as a ``PROGRESS:<fraction> <stage>`` line.

    Args:
        p: Progress value, rendered with two decimal places.
        stage: Optional label appended after the value.
    """
    line = "PROGRESS:{:.2f} {}\n".format(p, stage)
    stream = sys.stderr
    stream.write(line)
    # Flush right away so the line is not held in the stream buffer.
    stream.flush()
def log(msg: str) -> None:
    """Write *msg* followed by a newline to stderr and flush the stream."""
    stream = sys.stderr
    stream.write("{}\n".format(msg))
    stream.flush()
def now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
def now_ms() -> int:
    """Return the current Unix time truncated to whole milliseconds."""
    seconds = time.time()
    return int(seconds * 1000)
def main() -> int:
    """Run the extract_links enricher for a single Webpage node.

    Reads a JSON context object from stdin with the keys ``node_id``,
    ``metadata`` (a dict or a JSON-encoded string), ``ops_db_path``,
    ``app_dir``, ``registry_root`` and ``params``. The markdown file named by
    ``metadata["markdown_path"]`` (resolved against ``app_dir`` when relative)
    is read and passed to ``extract_urls`` from the registry's
    ``cybersecurity`` package. The resulting URLs are de-duplicated in
    first-seen order and truncated to ``params["max_links"]`` (default 50; a
    value <= 0 disables the cap). For each URL a ``Url`` entity is inserted
    unless one with the same name already exists, and a ``LINKS_TO`` relation
    from ``node_id`` to it is inserted unless already present.

    Progress lines go to stderr; a JSON summary (or error object) is printed
    to stdout.

    Returns:
        0 on success, 2 when ``node_id`` or ``ops_db_path`` is missing,
        3 when the node has no ``markdown_path``, 4 when the markdown file
        does not exist.
    """
    ctx = json.loads(sys.stdin.read())
    node_id = ctx.get("node_id") or ""
    ops_db = ctx.get("ops_db_path") or ""
    app_dir = ctx.get("app_dir") or ""
    registry_root = ctx.get("registry_root") or ""

    metadata = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except (ValueError, RecursionError):
            # Best effort: undecodable metadata is treated as empty.
            metadata = {}
    if not isinstance(metadata, dict):
        # Anything other than a JSON object cannot carry markdown_path.
        metadata = {}

    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}
    raw_max_links = params.get("max_links")
    try:
        max_links = 50 if raw_max_links is None else int(raw_max_links)
    except (TypeError, ValueError):
        log(f"invalid max_links {raw_max_links!r}; using default 50")
        max_links = 50

    if not node_id or not ops_db:
        log("missing node_id / ops_db_path")
        return 2

    md_path = metadata.get("markdown_path") or ""
    if not md_path:
        log("nodo sin markdown_path — corre fetch_webpage primero")
        print(json.dumps({"error": "missing markdown_path. Run fetch_webpage first.",
                          "entities_added": 0, "relations_added": 0}))
        return 3

    # A relative markdown_path is resolved against app_dir.
    abs_md = md_path if os.path.isabs(md_path) else os.path.join(app_dir, md_path)
    # isfile (not exists) so a directory is reported as missing instead of
    # crashing in open().
    if not os.path.isfile(abs_md):
        log(f"markdown not found at {abs_md}")
        print(json.dumps({"error": f"markdown not found: {abs_md}",
                          "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.20, "reading")
    # Context manager closes the file handle even if read() raises.
    with open(abs_md, "r", encoding="utf-8", errors="replace") as fh:
        text = fh.read()

    progress(0.45, "extracting")
    # The extractor lives in the function registry, outside this package, so
    # its directory has to be put on sys.path before importing.
    py_funcs = os.path.join(registry_root, "python", "functions")
    if py_funcs not in sys.path:
        sys.path.insert(0, py_funcs)
    from cybersecurity.cybersecurity import extract_urls  # type: ignore

    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique = list(dict.fromkeys(extract_urls(text)))
    if max_links > 0:
        unique = unique[:max_links]
    total = len(unique)

    progress(0.65, "writing")
    conn = sqlite3.connect(ops_db)
    entities_added = 0
    relations_added = 0
    try:
        # `with conn` commits once at the end, or rolls back on any error.
        with conn:
            for i, url in enumerate(unique):
                existing = conn.execute(
                    "SELECT id FROM entities WHERE type_ref='Url' AND name=? LIMIT 1",
                    (url,),
                ).fetchone()
                if existing:
                    target_id = existing[0]
                else:
                    target_id = f"Url_{now_ms()}_{i}"
                    ts = now_iso()
                    conn.execute(
                        "INSERT INTO entities (id, name, type_ref, source, metadata, created_at, updated_at) "
                        "VALUES (?, ?, 'Url', 'enricher:extract_links', ?, ?, ?)",
                        (target_id, url, json.dumps({"url": url}), ts, ts),
                    )
                    entities_added += 1

                rel_exists = conn.execute(
                    "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name='LINKS_TO' LIMIT 1",
                    (node_id, target_id),
                ).fetchone()
                if not rel_exists:
                    ts = now_iso()
                    conn.execute(
                        "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
                        "VALUES (?, 'LINKS_TO', ?, ?, ?, ?)",
                        (f"rel_{now_ms()}_{i}_links_to", node_id, target_id, ts, ts),
                    )
                    relations_added += 1

                # Report every 10th item, mapped onto the 0.65-0.95 range.
                if i % 10 == 0:
                    progress(0.65 + 0.30 * (i / max(1, total)), "writing")
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "links_found": total,
        "entities_added": entities_added,
        "relations_added": relations_added,
    }))
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())