feat(enrichers): cuatro enrichers web — fetch + extract trio (issues 0028, 0028b)
Cada enricher es un par manifest.yaml + run.py en enrichers/<id>/.
1. fetch_webpage (Url, Webpage):
HTTP GET (requests, fallback urllib) -> html_to_markdown_py_core ->
sha256(url) -> guarda HTML+MD en cache/<aa>/<sha>.{html,md}. Convierte
Url -> Webpage con metadata enriquecida (title/status_code/content_type/
paths/text_length). Crea Domain con relacion BELONGS_TO.
2. extract_domain (Url, Webpage, Email):
Saca dominio de metadata.url o metadata.address (sin I/O). Crea/conecta
Domain con BELONGS_TO. Util cuando el usuario quiere ver el dominio
antes de fetch.
3. extract_links (Webpage):
Lee metadata.markdown_path -> extract_urls_py_cybersecurity -> dedup ->
crea nodo Url por enlace + relacion LINKS_TO. Param max_links (50).
4. extract_text_entities (Webpage):
Lee metadata.markdown_path -> extract_iocs_py_cybersecurity (regex puro,
sin coste) -> crea entidades por (type, value) tipadas en el registro:
Email, IPAddress, Domain, FileHash, CryptoWallet, CVE, MACAddress, Phone.
Cada una con relacion EXTRACTED_FROM al Webpage origen. v1 sin GLiNER/
GLiREL — esos requieren modelos pre-cargados (futura iteracion).
Probado end-to-end:
fetch_webpage https://httpbin.org/html -> 1 Webpage + 1 Domain
extract_links -> 2 Url + 2 LINKS_TO
extract_text_entities -> 8 IoCs (Email, IP*2, CVE, Domain*2, Wallet, Phone)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,7 @@
|
||||
id: extract_domain
|
||||
name: "Extract domain"
|
||||
description: "Saca el dominio de la url/email del nodo y crea/conecta una entidad Domain con relacion BELONGS_TO. No descarga nada."
|
||||
applies_to: [Url, Webpage, Email]
|
||||
emits: [Domain]
|
||||
relations: [BELONGS_TO]
|
||||
params: []
|
||||
Executable
+125
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Enricher extract_domain — issue 0028b.
|
||||
|
||||
Saca el dominio de un nodo Url/Webpage (campo metadata.url) o Email (campo
|
||||
metadata.address) y crea/conecta una entidad Domain con relacion BELONGS_TO.
|
||||
No hace I/O de red.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def progress(p: float, stage: str = "") -> None:
|
||||
sys.stderr.write(f"PROGRESS:{p:.2f} {stage}\n")
|
||||
sys.stderr.flush()
|
||||
|
||||
|
||||
def now_iso() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def now_ms() -> int:
|
||||
return int(time.time() * 1000)
|
||||
|
||||
|
||||
def domain_from_url(url: str) -> str:
|
||||
if not url:
|
||||
return ""
|
||||
if "://" not in url:
|
||||
url = "https://" + url
|
||||
try:
|
||||
return (urlparse(url).hostname or "").lower()
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def domain_from_email(addr: str) -> str:
|
||||
if "@" not in addr:
|
||||
return ""
|
||||
return addr.split("@", 1)[1].strip().lower()
|
||||
|
||||
|
||||
def main() -> int:
|
||||
ctx = json.loads(sys.stdin.read())
|
||||
node_id = ctx.get("node_id") or ""
|
||||
node_type = (ctx.get("node_type") or "").lower()
|
||||
metadata = ctx.get("metadata") or {}
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
metadata = json.loads(metadata)
|
||||
except Exception:
|
||||
metadata = {}
|
||||
ops_db = ctx.get("ops_db_path") or ""
|
||||
if not node_id or not ops_db:
|
||||
sys.stderr.write("missing node_id / ops_db_path\n")
|
||||
return 2
|
||||
|
||||
progress(0.30, "extracting")
|
||||
dname = ""
|
||||
if node_type == "email":
|
||||
addr = metadata.get("address") or ctx.get("node_name") or ""
|
||||
dname = domain_from_email(addr)
|
||||
else:
|
||||
url = metadata.get("url") or ctx.get("node_name") or ""
|
||||
dname = domain_from_url(url)
|
||||
|
||||
if not dname:
|
||||
print(json.dumps({"warning": "no domain extractable",
|
||||
"entities_added": 0, "relations_added": 0}))
|
||||
return 0
|
||||
|
||||
progress(0.70, "writing")
|
||||
conn = sqlite3.connect(ops_db)
|
||||
entities_added = 0
|
||||
relations_added = 0
|
||||
try:
|
||||
existed = conn.execute(
|
||||
"SELECT id FROM entities WHERE type_ref='Domain' AND name=? LIMIT 1",
|
||||
(dname,),
|
||||
).fetchone()
|
||||
if existed:
|
||||
domain_id = existed[0]
|
||||
else:
|
||||
domain_id = f"Domain_{now_ms()}"
|
||||
ts = now_iso()
|
||||
conn.execute(
|
||||
"INSERT INTO entities (id, name, type_ref, source, created_at, updated_at) "
|
||||
"VALUES (?, ?, 'Domain', 'enricher:extract_domain', ?, ?)",
|
||||
(domain_id, dname, ts, ts),
|
||||
)
|
||||
entities_added = 1
|
||||
|
||||
rel_exists = conn.execute(
|
||||
"SELECT 1 FROM relations WHERE from_entity=? AND to_entity=? AND name='BELONGS_TO' LIMIT 1",
|
||||
(node_id, domain_id),
|
||||
).fetchone()
|
||||
if not rel_exists:
|
||||
ts = now_iso()
|
||||
conn.execute(
|
||||
"INSERT INTO relations (id, name, from_entity, to_entity, created_at, updated_at) "
|
||||
"VALUES (?, 'BELONGS_TO', ?, ?, ?, ?)",
|
||||
(f"rel_{now_ms()}_belongs_to", node_id, domain_id, ts, ts),
|
||||
)
|
||||
relations_added = 1
|
||||
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
progress(1.0, "done")
|
||||
print(json.dumps({
|
||||
"domain": dname,
|
||||
"entities_added": entities_added,
|
||||
"relations_added": relations_added,
|
||||
}))
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user