#!/usr/bin/env python3 """Enricher extract_domain — issue 0028b. Saca el dominio de un nodo Url/Webpage (campo metadata.url) o Email (campo metadata.address) y crea/conecta una entidad Domain con relacion BELONGS_TO. No hace I/O de red. """ from __future__ import annotations import json import sqlite3 import sys import time from datetime import datetime, timezone from urllib.parse import urlparse def progress(p: float, stage: str = "") -> None: sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n") sys.stderr.flush() def now_iso() -> str: return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def now_ms() -> int: return int(time.time() * 1000) def domain_from_url(url: str) -> str: if not url: return "" if "://" not in url: url = "https://" + url try: return (urlparse(url).hostname or "").lower() except Exception: return "" def domain_from_email(addr: str) -> str: if "@" not in addr: return "" return addr.split("@", 1)[1].strip().lower() def main() -> int: ctx = json.loads(sys.stdin.read()) node_id = ctx.get("node_id") or "" node_type = (ctx.get("node_type") or "").lower() metadata = ctx.get("metadata") or {} if isinstance(metadata, str): try: metadata = json.loads(metadata) except Exception: metadata = {} ops_db = ctx.get("ops_db_path") or "" if not node_id or not ops_db: sys.stderr.write("missing node_id / ops_db_path\n") return 2 progress(0.30, "extracting") dname = "" if node_type == "email": addr = metadata.get("address") or ctx.get("node_name") or "" dname = domain_from_email(addr) else: url = metadata.get("url") or ctx.get("node_name") or "" dname = domain_from_url(url) if not dname: print(json.dumps({"warning": "no domain extractable", "entities_added": 0, "relations_added": 0})) return 0 progress(0.70, "writing") conn = sqlite3.connect(ops_db) entities_added = 0 relations_added = 0 try: existed = conn.execute( "SELECT id FROM entities WHERE type_ref='Domain' AND name=? LIMIT 1", (dname,), ).fetchone() if existed: domain_id = existed[0] else: domain_id = f"Domain_{now_ms()}" ts = now_iso() conn.execute( "INSERT INTO entities (id, name, type_ref, source, created_at, updated_at) " "VALUES (?, ?, 'Domain', 'enricher:extract_domain', ?, ?)", (domain_id, dname, ts, ts), ) entities_added = 1 rel_exists = conn.execute( "SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name='BELONGS_TO' LIMIT 1", (node_id, domain_id), ).fetchone() if not rel_exists: ts = now_iso() conn.execute( "INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) " "VALUES (?, 'BELONGS_TO', ?, ?, ?, ?)", (f"rel_{now_ms()}_belongs_to", node_id, domain_id, ts, ts), ) relations_added = 1 conn.commit() finally: conn.close() progress(1.0, "done") print(json.dumps({ "domain": dname, "entities_added": entities_added, "relations_added": relations_added, })) return 0 if __name__ == "__main__": sys.exit(main())