#!/usr/bin/env python3
"""fetch_webpage enricher — issue 0028.

Reads a JSON context from stdin, downloads the node's URL, converts the HTML
to markdown, stores both blobs as `<cache_dir>/<key[:2]>/<key>.{html,md}`
(where `key` is the SHA-256 hex digest of the URL), patches the node's
metadata (promoting a `Url` node to `Webpage`), and creates/connects the
matching Domain entity.

Wire protocol (issue 0026):
  - stdin:  JSON with node_id, metadata, ops_db_path, app_dir, cache_dir,
            registry_root, params.
  - stderr: `PROGRESS:<fraction> <stage>` lines for UI feedback.
  - stdout: one JSON line at the end with the summary `{entities_added, ...}`.
  - exit code 0 = ok, !=0 = error (captured stderr is shown in the panel).
"""

from __future__ import annotations

import hashlib
import json
import os
import re
import sqlite3
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse


def progress(p: float, stage: str = "") -> None:
    """Write a `PROGRESS:<p> <stage>` line to stderr and flush it."""
    sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
    sys.stderr.flush()


def log(msg: str) -> None:
    """Write a free-form message line to stderr and flush it."""
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()


def load_registry_funcs(registry_root: str):
    """Put the registry on `sys.path` and import the helpers used here.

    Returns:
        The tuple `(normalize_url, html_to_markdown)`.

    Raises:
        ImportError: if the registry modules cannot be imported.
    """
    py_funcs = os.path.join(registry_root, "python", "functions")
    if py_funcs not in sys.path:
        sys.path.insert(0, py_funcs)
    from cybersecurity.cybersecurity import normalize_url  # type: ignore
    from core.html_to_markdown import html_to_markdown  # type: ignore

    return normalize_url, html_to_markdown


def now_iso() -> str:
    """Return the current UTC time as `YYYY-MM-DDTHH:MM:SSZ`."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def now_ms() -> int:
    """Return the current Unix time in whole milliseconds."""
    return int(time.time() * 1000)


def fetch_with_requests(url: str, timeout: int):
    """Download `url` and return `(status_code, content_type, html, encoding)`.

    Uses `requests` when it is importable and falls back to `urllib`
    otherwise. Network errors propagate to the caller.
    """
    try:
        import requests  # type: ignore

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (graph_explorer/0.1; "
                "https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/graph_explorer) "
                "Chrome/120 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }
        r = requests.get(url, timeout=timeout, headers=headers, allow_redirects=True)
        ct = r.headers.get("Content-Type", "text/html")
        # `r.text` is already decoded by `requests`; report the encoding it
        # used, defaulting to utf-8 when none was determined.
        return r.status_code, ct, r.text, r.encoding or "utf-8"
    except ImportError:
        from urllib.request import Request, urlopen

        req = Request(url, headers={"User-Agent": "graph_explorer/0.1"})
        with urlopen(req, timeout=timeout) as resp:  # type: ignore
            data = resp.read()
            ct = resp.headers.get("Content-Type", "text/html")
            enc = "utf-8"
            m = re.search(r"charset=([\w-]+)", ct, re.I)
            if m:
                enc = m.group(1).lower()
            try:
                text = data.decode(enc, errors="replace")
            except LookupError:
                # The server advertised a charset Python does not know;
                # decode as utf-8 instead of failing the whole fetch.
                enc = "utf-8"
                text = data.decode(enc, errors="replace")
            return resp.status, ct, text, enc


# First <title>...</title> element; case-insensitive, and `.` spans newlines.
_TITLE_RE = re.compile(r"<title\b[^>]*>(.*?)</title>", re.I | re.S)


def extract_title(html: str) -> str:
    """Return the text of the first `<title>` element in `html`.

    Whitespace runs are collapsed to single spaces. Titles longer than 300
    characters are cut to 300 and suffixed with an ellipsis. Returns an empty
    string when no title element is present.
    """
    m = _TITLE_RE.search(html)
    if not m:
        return ""
    title = re.sub(r"\s+", " ", m.group(1)).strip()
    if len(title) > 300:
        title = title[:300] + "…"
    return title


def domain_of(url: str) -> str:
    """Return the lowercased hostname of `url`, or `""` if it has none."""
    try:
        host = urlparse(url).hostname or ""
        return host.lower()
    except Exception:
        # Best effort: an unparseable URL simply has no domain.
        return ""


def cache_paths(cache_dir: str, key: str) -> tuple[Path, Path]:
    """Return `(html_path, md_path)` for `key`, creating the parent directory.

    Files are sharded into a sub-directory named after the first two
    characters of `key`.
    """
    sub = key[:2]
    base = Path(cache_dir) / sub
    base.mkdir(parents=True, exist_ok=True)
    return base / f"{key}.html", base / f"{key}.md"


def merge_metadata_json(existing: str, patch: dict) -> str:
    """Merge `patch` over the JSON object in `existing` and return new JSON.

    If `existing` is empty, not valid JSON, or not a JSON object, the merge
    starts from an empty object. Keys in `patch` overwrite existing keys.
    """
    try:
        cur = json.loads(existing) if existing else {}
        if not isinstance(cur, dict):
            cur = {}
    except Exception:
        # Best effort: corrupt stored metadata is replaced rather than fatal.
        cur = {}
    cur.update(patch)
    return json.dumps(cur, ensure_ascii=False)


def upsert_domain(conn: sqlite3.Connection, name: str) -> str:
    """Create or reuse the Domain entity called `name` and return its id.

    The insert is not committed here; the caller owns the transaction.
    """
    cur = conn.execute(
        "SELECT id FROM entities WHERE type_ref='Domain' AND name=? LIMIT 1",
        (name,),
    )
    row = cur.fetchone()
    if row:
        return row[0]
    new_id = f"Domain_{now_ms()}"
    ts = now_iso()
    conn.execute(
        "INSERT INTO entities (id, name, type_ref, source, created_at, updated_at) "
        "VALUES (?, ?, 'Domain', 'enricher:fetch_webpage', ?, ?)",
        (new_id, name, ts, ts),
    )
    return new_id


def relation_exists(conn: sqlite3.Connection, from_id: str, to_id: str, name: str) -> bool:
    """Return True if a relation `name` from `from_id` to `to_id` exists."""
    cur = conn.execute(
        "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name=? LIMIT 1",
        (from_id, to_id, name),
    )
    return cur.fetchone() is not None


def insert_relation(conn: sqlite3.Connection, from_id: str, to_id: str, name: str) -> bool:
    """Insert the relation unless it already exists.

    Returns:
        True if a row was inserted, False if the relation was already there.
        The insert is not committed here; the caller owns the transaction.
    """
    if relation_exists(conn, from_id, to_id, name):
        return False
    ts = now_iso()
    rel_id = f"rel_{now_ms()}_{name.lower()}"
    conn.execute(
        "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
        "VALUES (?, ?, ?, ?, ?, ?)",
        (rel_id, name, from_id, to_id, ts, ts),
    )
    return True


def _record_fetch_failure(ops_db_path: str, node_id: str, url: str) -> None:
    """Patch the node's metadata with `status_code=-1` as evidence of failure."""
    conn = sqlite3.connect(ops_db_path)
    try:
        cur = conn.execute("SELECT metadata FROM entities WHERE id=?", (node_id,))
        row = cur.fetchone()
        existing_meta = row[0] if row and row[0] else "{}"
        patch = {"url": url, "fetched_at": now_iso(), "status_code": -1}
        new_meta = merge_metadata_json(existing_meta, patch)
        conn.execute(
            "UPDATE entities SET metadata=?, updated_at=? WHERE id=?",
            (new_meta, now_iso(), node_id),
        )
        conn.commit()
    finally:
        conn.close()


def main() -> int:
    """Run the enricher once and return the process exit code.

    Exit codes:
        0 success; 2 invalid or incomplete input; 3 registry import failure;
        4 fetch failure; 5 cache write failure; 6 node missing from the DB.
    """
    raw = sys.stdin.read()
    try:
        ctx = json.loads(raw)
    except Exception as e:
        log(f"stdin not valid JSON: {e}")
        return 2
    if not isinstance(ctx, dict):
        log("stdin not valid JSON: expected an object")
        return 2

    node_id = ctx.get("node_id") or ""
    metadata = ctx.get("metadata") or {}
    if isinstance(metadata, str):
        try:
            metadata = json.loads(metadata)
        except Exception:
            metadata = {}
    if not isinstance(metadata, dict):
        # Anything other than an object carries no usable url/address key.
        metadata = {}
    ops_db_path = ctx.get("ops_db_path") or ""
    cache_dir = ctx.get("cache_dir") or ""
    registry_root = ctx.get("registry_root") or ""
    params = ctx.get("params") or {}
    if not isinstance(params, dict):
        params = {}
    try:
        timeout_s = int(params.get("timeout_s", 15))
    except (TypeError, ValueError):
        log("invalid timeout_s param; using default 15")
        timeout_s = 15

    if not node_id or not ops_db_path:
        log("missing node_id / ops_db_path")
        return 2

    # The URL may live in `url` (Url/Webpage) or `address` (legacy Url).
    raw_url = (metadata.get("url") or metadata.get("address") or "").strip()
    if not raw_url:
        # Fallback: when metadata carries no URL, try the node's name.
        raw_url = (ctx.get("node_name") or "").strip()
    if not raw_url:
        log("nodo sin url en metadata ni name")
        return 2

    progress(0.05, "normalize")
    try:
        normalize_url, html_to_markdown = load_registry_funcs(registry_root)
    except Exception as e:
        log(f"registry imports failed: {e}")
        return 3

    try:
        url = normalize_url(raw_url)
    except Exception:
        url = raw_url
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    progress(0.20, "fetching")
    try:
        status, content_type, html, _enc = fetch_with_requests(url, timeout=timeout_s)
    except Exception as e:
        log(f"fetch failed: {e}")
        # Mark the node with status=-1 so the failure is visible.
        _record_fetch_failure(ops_db_path, node_id, url)
        print(json.dumps({"error": str(e), "url": url, "entities_added": 0, "relations_added": 0}))
        return 4

    progress(0.55, "parsing")
    try:
        markdown = html_to_markdown(html)
    except Exception as e:
        log(f"html_to_markdown failed (will save raw): {e}")
        markdown = ""
    title = extract_title(html)
    text_length = len(markdown) if markdown else len(html)

    progress(0.80, "writing")
    key = hashlib.sha256(url.encode("utf-8")).hexdigest()
    try:
        # Directory creation is inside the try so that it also reports as a
        # cache write failure instead of an unhandled traceback.
        html_path, md_path = cache_paths(cache_dir, key)
        html_path.write_text(html, encoding="utf-8", errors="replace")
        if markdown:
            md_path.write_text(markdown, encoding="utf-8")
    except Exception as e:
        log(f"cache write failed: {e}")
        return 5

    # Paths stored in metadata are relative to app_dir for portability.
    base_dir = ctx.get("app_dir") or cache_dir
    rel_html = os.path.relpath(html_path, base_dir)
    rel_md = os.path.relpath(md_path, base_dir)

    progress(0.92, "applying")
    conn = sqlite3.connect(ops_db_path)
    entities_added = 0
    relations_added = 0
    node_updated = False
    try:
        conn.execute("PRAGMA foreign_keys=OFF")

        # 1. Update the node: promote Url -> Webpage if applicable + patch meta.
        cur = conn.execute(
            "SELECT type_ref, metadata FROM entities WHERE id=?", (node_id,)
        )
        row = cur.fetchone()
        if not row:
            log(f"node {node_id} disappeared")
            return 6
        # type_ref may be NULL; treat it like an empty type.
        cur_type, cur_meta = row[0] or "", row[1] or "{}"
        if not cur_type or cur_type.lower() == "url":
            new_type = "Webpage"
        else:
            new_type = cur_type
        patch = {
            "url": url,
            "title": title,
            "status_code": status,
            "content_type": content_type,
            "fetched_at": now_iso(),
            "html_path": rel_html,
            "markdown_path": rel_md if markdown else "",
            "text_length": text_length,
        }
        new_meta = merge_metadata_json(cur_meta, patch)
        conn.execute(
            "UPDATE entities SET type_ref=?, metadata=?, updated_at=? WHERE id=?",
            (new_type, new_meta, now_iso(), node_id),
        )
        node_updated = True

        # 2. Create/connect the Domain.
        dname = domain_of(url)
        if dname:
            existed_before = conn.execute(
                "SELECT 1 FROM entities WHERE type_ref='Domain' AND name=? LIMIT 1",
                (dname,),
            ).fetchone() is not None
            domain_id = upsert_domain(conn, dname)
            if not existed_before:
                entities_added += 1
            if insert_relation(conn, node_id, domain_id, "BELONGS_TO"):
                relations_added += 1

        conn.commit()
    finally:
        conn.close()

    progress(1.0, "done")
    print(json.dumps({
        "url": url,
        "status_code": status,
        "title": title,
        "text_length": text_length,
        "html_path": rel_html,
        "markdown_path": rel_md if markdown else "",
        "entities_added": entities_added,
        "relations_added": relations_added,
        "node_updated": node_updated,
    }, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    sys.exit(main())