feat: initial scaffold of osint_db (DuckDB source-of-truth service)
This commit is contained in:
@@ -0,0 +1,364 @@
|
||||
"""Ingests del service osint_db: vault Obsidian y servidor DAV (Xandikos).
|
||||
|
||||
Las tablas maestras se reconstruyen por reemplazo completo (DELETE + INSERT en
|
||||
una transacción): el vault y Xandikos son las fuentes de verdad, así que cada
|
||||
ingest deja la base exactamente como el origen. Tras cada ingest se re-enlazan
|
||||
los contactos con sus fichas y se reconstruyen las tablas derivadas.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from . import davparse
|
||||
from .config import Config
|
||||
from .db import write_conn
|
||||
from .derived import rebuild_derived
|
||||
from .registry_bridge import (
|
||||
dav_get_collection,
|
||||
dav_list_calendars,
|
||||
list_obsidian_notes,
|
||||
pass_get_secret,
|
||||
read_obsidian_note,
|
||||
)
|
||||
|
||||
def _norm(value):
    """Map the frontmatter spellings of "no value" to a real ``None``.

    Strings that are empty, whitespace-only, or equal to ``null``/``none``
    (case-insensitive, surrounding whitespace ignored) become ``None``.
    Every other value, including ``None`` itself, is returned unchanged.
    """
    if isinstance(value, str):
        # Only strings can carry a textual null marker.
        is_null_marker = value.strip().lower() in ("null", "none", "")
        return None if is_null_marker else value
    return value
|
||||
|
||||
|
||||
def _as_str(value):
    """Return a frontmatter value as ``str``, or ``None`` when it is null-like.

    The value is first normalized with ``_norm``; anything that survives is
    passed through ``str()``, so non-string scalars such as numbers are
    stringified instead of being dropped.
    """
    normalized = _norm(value)
    if normalized is None:
        return None
    return str(normalized)
|
||||
|
||||
|
||||
def _as_list(value) -> list:
    """Return a frontmatter value as a list.

    Null-like values (as decided by ``_norm``) give an empty list, an existing
    list is returned as is, and any other value is wrapped in a one-element
    list.
    """
    normalized = _norm(value)
    if isinstance(normalized, list):
        return normalized
    if normalized is None:
        return []
    return [normalized]
|
||||
|
||||
|
||||
def _json(value) -> str:
    """Serialize ``value`` to a JSON string without escaping non-ASCII text.

    Objects the ``json`` module cannot encode natively are converted with
    ``str()`` instead of raising.
    """
    encoder = json.JSONEncoder(ensure_ascii=False, default=str)
    return encoder.encode(value)
|
||||
|
||||
|
||||
def _dav_uid_from_fuente(fuente) -> str | None:
|
||||
"""Extrae el UID de Xandikos cuando fuente es 'Xandikos UID <uid>'."""
|
||||
if not fuente:
|
||||
return None
|
||||
m = re.search(r"Xandikos UID\s+(\S+)", str(fuente))
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
def ingest_vault(cfg: Config) -> dict:
    """Scan the whole vault and rebuild ``notes`` plus the entity tables.

    Every note yields one ``notes`` row. A note additionally yields an entity
    row when it sits directly inside one of the folders listed in
    ``cfg.entity_folders`` (``personas/<slug>.md``, not
    ``personas/<slug>/<doc>.md``) and its file name does not start with ``_``.
    All tables are replaced (DELETE + INSERT) inside one transaction, contacts
    are re-linked, and the derived tables are rebuilt after the commit.

    Args:
        cfg: service configuration; this function reads ``vault_dir``,
            ``entity_folders`` (3-tuples whose first item is the folder name
            and whose last item is the table name) and ``db_path``.

    Returns:
        ``{"status": "error", "error": ...}`` when ``cfg.vault_dir`` is not a
        directory; otherwise ``{"status": "ok", "notes": N, "persons": N,
        "organizations": N, "domains": N, "cases": N, "places": N,
        "skipped_unreadable": N, "derived_rebuilt": ...}`` where
        ``derived_rebuilt`` is whatever ``rebuild_derived`` returns.

    Raises:
        Exception: any error raised while writing is re-raised after the
            transaction has been rolled back.
    """
    if not os.path.isdir(cfg.vault_dir):
        return {"status": "error", "error": f"vault no encontrado: {cfg.vault_dir}"}

    note_rows: list = []
    entity_rows: dict = {table: [] for _, _, table in cfg.entity_folders}
    folder_to_table = {folder: table for folder, _, table in cfg.entity_folders}
    skipped = 0

    for abs_path in list_obsidian_notes(cfg.vault_dir):
        rel_path = os.path.relpath(abs_path, cfg.vault_dir)
        base = os.path.splitext(os.path.basename(abs_path))[0]
        try:
            note = read_obsidian_note(abs_path)
            # Stat inside the same guard: a note that vanished or cannot be
            # stat'ed mid-scan is skipped like an unreadable one.
            mtime = datetime.fromtimestamp(os.path.getmtime(abs_path), tz=timezone.utc)
        except Exception:  # noqa: BLE001 - one broken note must not abort the ingest
            skipped += 1
            continue

        fm = note.get("frontmatter") or {}
        slug = _as_str(fm.get("slug")) or base
        nombre = _as_str(fm.get("nombre")) or base
        note_rows.append(
            [rel_path, slug, _as_str(fm.get("tipo")), nombre, mtime, _json(fm)]
        )

        # Structured entity: a level-1 note inside an entity folder that is
        # not a support note (``_`` prefix). Level-1 means the relative path
        # is exactly ``<folder>/<file>``. Comparing only the parent directory
        # name would also accept deeper paths such as
        # ``personas/x/personas/y.md``.
        parts = rel_path.split(os.sep)
        table = folder_to_table.get(parts[0])
        if table is None or len(parts) != 2 or base.startswith("_"):
            continue

        tags_json = _json(_as_list(fm.get("tags")))
        if table == "persons":
            fuente = fm.get("fuente")
            entity_rows[table].append(
                [
                    slug,
                    rel_path,
                    nombre,
                    _json(_as_list(fm.get("aliases"))),
                    _as_str(fm.get("sexo")),
                    _as_str(fm.get("fecha_nacimiento")),
                    _as_str(fm.get("dni")),
                    _as_str(fm.get("telefono")),
                    _as_str(fm.get("email")),
                    _as_str(fm.get("direccion")),
                    _as_str(fm.get("pais")),
                    _as_str(fm.get("contexto")),
                    _as_str(fuente),
                    _dav_uid_from_fuente(fuente),
                    tags_json,
                    mtime,
                ]
            )
        else:
            entity_rows[table].append(
                [slug, rel_path, nombre, tags_json, _json(fm), mtime]
            )

    # Deduplicate once so the inserted rows and the reported counts agree.
    deduped = {table: _dedup_by_slug(rows) for table, rows in entity_rows.items()}

    derived_rebuilt: list = []
    with write_conn(cfg.db_path) as conn:
        conn.execute("BEGIN")
        try:
            conn.execute("DELETE FROM notes")
            if note_rows:  # executemany is only called with at least one row
                conn.executemany(
                    "INSERT INTO notes VALUES (?, ?, ?, ?, ?, ?)", note_rows
                )
            conn.execute("DELETE FROM persons")
            person_rows = deduped.get("persons")
            if person_rows:
                conn.executemany(
                    "INSERT INTO persons VALUES "
                    "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    person_rows,
                )
            for table in ("organizations", "domains", "cases", "places"):
                conn.execute(f"DELETE FROM {table}")
                rows = deduped.get(table)
                if rows:
                    conn.executemany(
                        f"INSERT INTO {table} VALUES (?, ?, ?, ?, ?, ?)",
                        rows,
                    )
            _link_contacts(conn)
            conn.execute("COMMIT")
        except Exception:
            conn.execute("ROLLBACK")
            raise
        # Derived tables are rebuilt only after the master tables committed.
        derived_rebuilt = rebuild_derived(conn)

    return {
        "status": "ok",
        "notes": len(note_rows),
        "persons": len(deduped.get("persons", [])),
        "organizations": len(deduped.get("organizations", [])),
        "domains": len(deduped.get("domains", [])),
        "cases": len(deduped.get("cases", [])),
        "places": len(deduped.get("places", [])),
        "skipped_unreadable": skipped,
        "derived_rebuilt": derived_rebuilt,
    }
|
||||
|
||||
|
||||
def _dedup_by_slug(rows: list) -> list:
    """Drop rows whose slug (first column) repeats, keeping the first one.

    The surviving rows keep their original relative order, so the result can
    be inserted into a table whose primary key is the slug.
    """
    first_by_slug: dict = {}
    for row in rows:
        # setdefault keeps the earliest row seen for each slug.
        first_by_slug.setdefault(row[0], row)
    return list(first_by_slug.values())
|
||||
|
||||
|
||||
def ingest_dav(cfg: Config) -> dict:
    """Download the Xandikos collections and rebuild ``contacts`` and ``events``.

    The contacts collection is fetched first, then the calendar list, then
    each calendar. Nothing is written to the database unless every fetch
    reported ``status == "ok"``. Both tables are then replaced inside one
    transaction, contacts are linked to person notes, and the derived tables
    are rebuilt after the commit.

    Args:
        cfg: service configuration; this function reads ``pass_secret``,
            ``dav_base``, ``dav_user``, ``dav_contacts_collection``,
            ``dav_calendar_home`` and ``db_path``.

    Returns:
        ``{"status": "ok", "contacts": N, "events": N, "calendars": [...],
        "contacts_linked": N, "derived_rebuilt": ...}`` on success, or
        ``{"status": "error", "error": ...}`` when the secret lookup or any
        DAV request fails.

    Raises:
        Exception: any error raised while writing is re-raised after the
            transaction has been rolled back.
    """

    def _failure(message: str) -> dict:
        # Uniform shape for every early-exit error result.
        return {"status": "error", "error": message}

    def _uid_or_href_stem(parsed_uid, resource):
        # The href is consulted only when the payload carries no UID.
        if parsed_uid:
            return parsed_uid
        return os.path.splitext(os.path.basename(resource["href"]))[0]

    secret_reply = pass_get_secret(cfg.pass_secret)
    if secret_reply.get("status") != "ok":
        return _failure(
            f"pass no devolvió el secreto {cfg.pass_secret!r}: "
            f"{secret_reply.get('error')}"
        )
    password = secret_reply["value"]  # sensitive: never log

    card_reply = dav_get_collection(
        cfg.dav_base, cfg.dav_user, password, cfg.dav_contacts_collection, "vcard"
    )
    if card_reply.get("status") != "ok":
        return _failure(
            f"CardDAV: {card_reply.get('error')} (http {card_reply.get('http_status')})"
        )

    # One timestamp shared by every contact and event row of this run.
    fetched_at = datetime.now(tz=timezone.utc)

    contact_rows: list = []
    contact_uids: set = set()
    for resource in card_reply.get("resources", []):
        vcard_text = resource.get("data", "")
        card = davparse.parse_vcard(vcard_text)
        contact_uid = _uid_or_href_stem(card["uid"], resource)
        if contact_uid in contact_uids:
            continue  # first occurrence of a UID wins
        contact_uids.add(contact_uid)
        contact_rows.append(
            [
                contact_uid,
                cfg.dav_contacts_collection,
                resource.get("etag"),
                card["fn"] or None,
                _json(card["tels"]),
                _json(card["emails"]),
                vcard_text,
                None,  # note_path is filled in by the linking step below
                fetched_at,
            ]
        )

    calendars_reply = dav_list_calendars(
        cfg.dav_base, cfg.dav_user, password, cfg.dav_calendar_home
    )
    if calendars_reply.get("status") != "ok":
        return _failure(
            f"CalDAV: {calendars_reply.get('error')} "
            f"(http {calendars_reply.get('http_status')})"
        )

    event_rows: list = []
    event_uids: set = set()
    calendar_names: list = []
    for calendar in calendars_reply.get("calendars", []):
        # Fall back to the last path segment of the href when unnamed.
        cal_name = calendar.get("name")
        if not cal_name:
            cal_name = calendar.get("href", "").strip("/").rsplit("/", 1)[-1]
        calendar_names.append(cal_name)

        ical_reply = dav_get_collection(
            cfg.dav_base, cfg.dav_user, password, calendar["href"], "ical"
        )
        if ical_reply.get("status") != "ok":
            return _failure(
                f"CalDAV {cal_name}: {ical_reply.get('error')} "
                f"(http {ical_reply.get('http_status')})"
            )

        for resource in ical_reply.get("resources", []):
            etag = resource.get("etag")
            for event in davparse.parse_ical_events(resource.get("data", "")):
                event_uid = _uid_or_href_stem(event["uid"], resource)
                if event_uid in event_uids:
                    continue  # UIDs are deduplicated across all calendars
                event_uids.add(event_uid)
                event_rows.append(
                    [
                        event_uid,
                        cal_name,
                        etag,
                        event["dtstart"] or None,
                        event["dtend"] or None,
                        event["all_day"],
                        event["summary"] or None,
                        event["location"],
                        event["rrule"],
                        event["raw"],
                        fetched_at,
                    ]
                )

    # (delete statement, insert statement, rows), applied in this order.
    replacements = (
        (
            "DELETE FROM contacts",
            "INSERT INTO contacts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
            contact_rows,
        ),
        (
            "DELETE FROM events",
            "INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            event_rows,
        ),
    )

    with write_conn(cfg.db_path) as conn:
        conn.execute("BEGIN")
        try:
            for delete_sql, insert_sql, rows in replacements:
                conn.execute(delete_sql)
                if rows:  # executemany is only called with at least one row
                    conn.executemany(insert_sql, rows)
            linked_count = _link_contacts(conn)
            conn.execute("COMMIT")
        except Exception:
            conn.execute("ROLLBACK")
            raise
        # Derived tables are rebuilt only after the master tables committed.
        rebuilt = rebuild_derived(conn)

    return {
        "status": "ok",
        "contacts": len(contact_rows),
        "events": len(event_rows),
        "calendars": calendar_names,
        "contacts_linked": linked_count,
        "derived_rebuilt": rebuilt,
    }
|
||||
|
||||
|
||||
def _link_contacts(conn) -> int:
    """Link ``contacts.note_path`` to the matching ``persons`` note.

    Matching is attempted in decreasing order of reliability:

    1. a contact UID of the form ``osint-<slug>`` whose slug exists in
       ``persons``;
    2. a contact UID equal to a person's ``dav_uid``;
    3. any contact phone equal, after ``davparse.norm_phone``, to a person's
       phone;
    4. any contact email equal (trimmed, case-insensitive) to a person's
       email.

    When several persons share a dav_uid, phone or email, the first one
    returned by the query wins. A contact whose stored ``note_path`` no
    longer matches any person is reset to NULL, so links to notes that were
    removed from ``persons`` do not survive a re-link.

    Args:
        conn: open database connection; only ``execute(sql[, params])`` and
            ``fetchall()`` on its result are used. Transaction control is
            left to the caller.

    Returns:
        The number of contacts that are linked to a note after this pass.
    """
    by_slug, by_dav_uid, by_phone, by_email = {}, {}, {}, {}
    persons = conn.execute(
        "SELECT slug, note_path, telefono, email, dav_uid FROM persons"
    ).fetchall()
    for slug, note_path, telefono, email, dav_uid in persons:
        by_slug[slug] = note_path
        if dav_uid:
            by_dav_uid.setdefault(dav_uid, note_path)
        if telefono:
            phone_key = davparse.norm_phone(telefono)
            if phone_key:  # skip phones that normalize to nothing
                by_phone.setdefault(phone_key, note_path)
        if email:
            by_email.setdefault(str(email).strip().lower(), note_path)

    contacts = conn.execute(
        "SELECT uid, tels, emails, note_path FROM contacts"
    ).fetchall()
    linked = 0
    for uid, tels_json, emails_json, current_path in contacts:
        target = None
        if uid.startswith("osint-"):
            target = by_slug.get(uid[len("osint-"):])
        if target is None:
            target = by_dav_uid.get(uid)
        if target is None:
            for tel in json.loads(tels_json or "[]"):
                hit = by_phone.get(davparse.norm_phone(tel))
                if hit:
                    target = hit
                    break
        if target is None:
            for em in json.loads(emails_json or "[]"):
                hit = by_email.get(str(em).strip().lower())
                if hit:
                    target = hit
                    break

        if target is not None:
            linked += 1
        # Write only when the stored link differs. This also clears a stale
        # link (target is None) left over from a person that no longer exists.
        if target != current_path:
            conn.execute(
                "UPDATE contacts SET note_path = ? WHERE uid = ?", [target, uid]
            )
    return linked
|
||||
Reference in New Issue
Block a user