Files
osint_db/server/ingest.py
T

365 lines
13 KiB
Python

"""Ingests del service osint_db: vault Obsidian y servidor DAV (Xandikos).
Las tablas maestras se reconstruyen por reemplazo completo (DELETE + INSERT en
una transacción): el vault y Xandikos son las fuentes de verdad, así que cada
ingest deja la base exactamente como el origen. Tras cada ingest se re-enlazan
los contactos con sus fichas y se reconstruyen las tablas derivadas.
"""
from __future__ import annotations
import json
import os
import re
from datetime import datetime, timezone
from . import davparse
from .config import Config
from .db import write_conn
from .derived import rebuild_derived
from .registry_bridge import (
dav_get_collection,
dav_list_calendars,
list_obsidian_notes,
pass_get_secret,
read_obsidian_note,
)
def _norm(value):
    """Map frontmatter placeholders to a real ``None``.

    ``None`` itself and strings that are empty or spell ``null``/``none``
    (ignoring case and surrounding whitespace) become ``None``; every other
    value is returned unchanged.
    """
    if isinstance(value, str):
        is_placeholder = value.strip().lower() in ("null", "none", "")
        return None if is_placeholder else value
    # Non-string values (including None) pass straight through.
    return value
def _as_str(value):
    """Return a frontmatter value as ``str``, or ``None`` when it is empty.

    The value is first normalized with ``_norm``; anything that survives
    (numbers included) is converted with ``str()``.
    """
    cleaned = _norm(value)
    if cleaned is None:
        return None
    return str(cleaned)
def _as_list(value) -> list:
    """Return a frontmatter value as a list.

    Values normalized to ``None`` by ``_norm`` yield an empty list, lists are
    returned as they are, and any other value is wrapped in a one-item list.
    """
    cleaned = _norm(value)
    if isinstance(cleaned, list):
        return cleaned
    return [] if cleaned is None else [cleaned]
def _json(value) -> str:
    """Serialize ``value`` to a single-line JSON string.

    Non-ASCII characters are written as-is (no ``\\u`` escapes) and objects the
    JSON encoder does not know are stringified with ``str()``.
    """
    encoded = json.dumps(value, default=str, ensure_ascii=False)
    return encoded
def _dav_uid_from_fuente(fuente) -> str | None:
    """Extract the Xandikos UID from a ``fuente`` value.

    Looks for the text ``Xandikos UID <uid>`` anywhere in ``str(fuente)`` and
    returns the first run of non-whitespace characters after it. Returns
    ``None`` when ``fuente`` is falsy or the marker is absent.
    """
    hit = re.search(r"Xandikos UID\s+(\S+)", str(fuente)) if fuente else None
    if hit is None:
        return None
    return hit.group(1)
def ingest_vault(cfg: Config) -> dict:
    """Scan the whole vault and rebuild ``notes`` plus the entity tables.

    Every readable note yields one row in ``notes``. A note additionally
    yields an entity row when it sits directly inside one of the folders
    listed in ``cfg.entity_folders`` (``personas/<slug>.md``, not
    ``personas/<slug>/<doc>.md``) and its file name does not start with ``_``.
    The tables are replaced wholesale (DELETE + INSERT) inside one
    transaction, contacts are re-linked within that transaction, and the
    derived tables are rebuilt after the commit.

    Args:
        cfg: Service configuration; ``vault_dir``, ``entity_folders``
            (iterable of 3-item tuples whose first item is the folder name and
            third item the table name) and ``db_path`` are read.

    Returns:
        ``{"status": "error", "error": ...}`` when the vault directory does
        not exist. Otherwise ``{"status": "ok", "notes": N, "persons": N,
        "organizations": N, "domains": N, "cases": N, "places": N,
        "skipped_unreadable": N, "derived_rebuilt": [...]}`` where the entity
        counts are taken after slug de-duplication.

    Raises:
        Exception: anything raised while replacing the tables; the
            transaction is rolled back before the exception propagates.
    """
    if not os.path.isdir(cfg.vault_dir):
        return {"status": "error", "error": f"vault no encontrado: {cfg.vault_dir}"}

    note_rows: list = []
    entity_rows: dict = {table: [] for _, _, table in cfg.entity_folders}
    folder_to_table = {folder: table for folder, _, table in cfg.entity_folders}
    skipped = 0

    for abs_path in list_obsidian_notes(cfg.vault_dir):
        rel_path = os.path.relpath(abs_path, cfg.vault_dir)
        base = os.path.splitext(os.path.basename(abs_path))[0]
        try:
            note = read_obsidian_note(abs_path)
            # Stat inside the guard too: a note that disappears between the
            # listing and this point is skipped instead of aborting the run.
            mtime = datetime.fromtimestamp(
                os.path.getmtime(abs_path), tz=timezone.utc
            )
        except Exception:  # noqa: BLE001 — one bad note must not abort the ingest
            skipped += 1
            continue

        fm = note.get("frontmatter") or {}
        slug = _as_str(fm.get("slug")) or base
        name = _as_str(fm.get("nombre")) or base
        note_rows.append(
            [
                rel_path,
                slug,
                _as_str(fm.get("tipo")),
                name,
                mtime,
                _json(fm),
            ]
        )

        # Structured entity: a level-1 note inside an entity folder
        # (personas/<slug>.md, not personas/<slug>/<doc>.md) that is not a
        # support note (leading underscore). Depth is checked on the relative
        # path so a nested folder that merely shares the top folder's name
        # (personas/x/personas/y.md) is not mistaken for level 1.
        path_parts = rel_path.split(os.sep)
        top_folder = path_parts[0]
        is_level1 = len(path_parts) == 2
        if top_folder not in folder_to_table or not is_level1:
            continue
        if base.startswith("_"):
            continue

        table = folder_to_table[top_folder]
        if table == "persons":
            entity_rows[table].append(
                [
                    slug,
                    rel_path,
                    name,
                    _json(_as_list(fm.get("aliases"))),
                    _as_str(fm.get("sexo")),
                    _as_str(fm.get("fecha_nacimiento")),
                    _as_str(fm.get("dni")),
                    _as_str(fm.get("telefono")),
                    _as_str(fm.get("email")),
                    _as_str(fm.get("direccion")),
                    _as_str(fm.get("pais")),
                    _as_str(fm.get("contexto")),
                    _as_str(fm.get("fuente")),
                    _dav_uid_from_fuente(fm.get("fuente")),
                    _json(_as_list(fm.get("tags"))),
                    mtime,
                ]
            )
        else:
            entity_rows[table].append(
                [
                    slug,
                    rel_path,
                    name,
                    _json(_as_list(fm.get("tags"))),
                    _json(fm),
                    mtime,
                ]
            )

    # De-duplicate once; the same lists feed both the INSERTs and the counts.
    unique_rows = {
        table: _dedup_by_slug(rows) for table, rows in entity_rows.items()
    }

    derived_rebuilt: list = []
    with write_conn(cfg.db_path) as conn:
        conn.execute("BEGIN")
        try:
            conn.execute("DELETE FROM notes")
            if note_rows:
                conn.executemany(
                    "INSERT INTO notes VALUES (?, ?, ?, ?, ?, ?)", note_rows
                )

            conn.execute("DELETE FROM persons")
            person_rows = unique_rows.get("persons", [])
            if person_rows:
                conn.executemany(
                    "INSERT INTO persons VALUES "
                    "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    person_rows,
                )

            # Table names come from this fixed tuple, never from note content.
            for table in ("organizations", "domains", "cases", "places"):
                conn.execute(f"DELETE FROM {table}")
                rows = unique_rows.get(table, [])
                if rows:
                    conn.executemany(
                        f"INSERT INTO {table} VALUES (?, ?, ?, ?, ?, ?)",
                        rows,
                    )

            _link_contacts(conn)
            conn.execute("COMMIT")
        except Exception:
            conn.execute("ROLLBACK")
            raise
        derived_rebuilt = rebuild_derived(conn)

    return {
        "status": "ok",
        "notes": len(note_rows),
        "persons": len(unique_rows.get("persons", [])),
        "organizations": len(unique_rows.get("organizations", [])),
        "domains": len(unique_rows.get("domains", [])),
        "cases": len(unique_rows.get("cases", [])),
        "places": len(unique_rows.get("places", [])),
        "skipped_unreadable": skipped,
        "derived_rebuilt": derived_rebuilt,
    }
def _dedup_by_slug(rows: list) -> list:
    """Drop rows whose slug (first column) repeats; the first one wins.

    The original relative order is kept. Intended to keep the slug key unique
    before the rows are inserted.
    """
    first_by_slug: dict = {}
    for candidate in rows:
        # setdefault keeps the earliest row for each slug and ignores the rest.
        first_by_slug.setdefault(candidate[0], candidate)
    return list(first_by_slug.values())
def ingest_dav(cfg: Config) -> dict:
    """Download the Xandikos collections and rebuild ``contacts`` + ``events``.

    The address book and every calendar are fetched before the database is
    opened, so a failed download returns an error without modifying any
    table. Duplicate UIDs are dropped (first occurrence wins). After the
    tables are replaced, contacts are linked to person notes and the derived
    tables are rebuilt.

    Returns:
        ``{status: 'ok', contacts: N, events: N, calendars: [...],
        contacts_linked: N, derived_rebuilt: [...]}`` on success, or
        ``{status: 'error', error}`` when the secret or a DAV request fails.
    """

    def fail(message: str) -> dict:
        return {"status": "error", "error": message}

    def href_stem(resource) -> str:
        # Fallback identifier: the resource file name without its extension.
        return os.path.splitext(os.path.basename(resource["href"]))[0]

    secret = pass_get_secret(cfg.pass_secret)
    if secret.get("status") != "ok":
        return fail(
            f"pass no devolvió el secreto {cfg.pass_secret!r}: "
            f"{secret.get('error')}"
        )
    password = secret["value"]  # sensitive: never log it

    address_book = dav_get_collection(
        cfg.dav_base, cfg.dav_user, password, cfg.dav_contacts_collection, "vcard"
    )
    if address_book.get("status") != "ok":
        return fail(
            f"CardDAV: {address_book.get('error')} (http {address_book.get('http_status')})"
        )

    fetched_at = datetime.now(tz=timezone.utc)

    contacts_by_uid: dict = {}
    for resource in address_book.get("resources", []):
        card = davparse.parse_vcard(resource.get("data", ""))
        uid = card["uid"] or href_stem(resource)
        if uid in contacts_by_uid:
            continue
        contacts_by_uid[uid] = [
            uid,
            cfg.dav_contacts_collection,
            resource.get("etag"),
            card["fn"] or None,
            _json(card["tels"]),
            _json(card["emails"]),
            resource.get("data", ""),
            None,  # note_path is filled in by the linking step below
            fetched_at,
        ]
    contact_rows = list(contacts_by_uid.values())

    cal_listing = dav_list_calendars(
        cfg.dav_base, cfg.dav_user, password, cfg.dav_calendar_home
    )
    if cal_listing.get("status") != "ok":
        return fail(
            f"CalDAV: {cal_listing.get('error')} (http {cal_listing.get('http_status')})"
        )

    events_by_uid: dict = {}
    calendar_names: list = []
    for cal_info in cal_listing.get("calendars", []):
        # Prefer the display name; otherwise use the last segment of the href.
        cal_name = (
            cal_info.get("name")
            or cal_info.get("href", "").strip("/").rsplit("/", 1)[-1]
        )
        calendar_names.append(cal_name)

        cal_items = dav_get_collection(
            cfg.dav_base, cfg.dav_user, password, cal_info["href"], "ical"
        )
        if cal_items.get("status") != "ok":
            return fail(
                f"CalDAV {cal_name}: {cal_items.get('error')} "
                f"(http {cal_items.get('http_status')})"
            )

        for resource in cal_items.get("resources", []):
            for event in davparse.parse_ical_events(resource.get("data", "")):
                uid = event["uid"] or href_stem(resource)
                if uid in events_by_uid:
                    continue
                events_by_uid[uid] = [
                    uid,
                    cal_name,
                    resource.get("etag"),
                    event["dtstart"] or None,
                    event["dtend"] or None,
                    event["all_day"],
                    event["summary"] or None,
                    event["location"],
                    event["rrule"],
                    event["raw"],
                    fetched_at,
                ]
    event_rows = list(events_by_uid.values())

    with write_conn(cfg.db_path) as conn:
        conn.execute("BEGIN")
        try:
            conn.execute("DELETE FROM contacts")
            if contact_rows:
                conn.executemany(
                    "INSERT INTO contacts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    contact_rows,
                )
            conn.execute("DELETE FROM events")
            if event_rows:
                conn.executemany(
                    "INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    event_rows,
                )
            linked = _link_contacts(conn)
            conn.execute("COMMIT")
        except Exception:
            conn.execute("ROLLBACK")
            raise
        derived_rebuilt = rebuild_derived(conn)

    summary = {"status": "ok"}
    summary["contacts"] = len(contact_rows)
    summary["events"] = len(event_rows)
    summary["calendars"] = calendar_names
    summary["contacts_linked"] = linked
    summary["derived_rebuilt"] = derived_rebuilt
    return summary
def _link_contacts(conn) -> int:
    """Link ``contacts.note_path`` to the person notes stored in ``persons``.

    Matching is tried from most to least reliable: a UID of the form
    ``osint-<slug>``, the ``dav_uid`` recorded on the person row, a normalized
    phone number, and finally the e-mail address (case-insensitive, trimmed).
    When several persons share a dav_uid, phone or e-mail, the first row
    returned by the query wins.

    A contact that matches no person has its ``note_path`` reset to NULL, so a
    link written by an earlier run cannot keep pointing at a person row that
    no longer exists. Rows whose stored value is already correct are not
    rewritten.

    Args:
        conn: Open database connection exposing ``execute(sql, params)`` whose
            result offers ``fetchall()``.

    Returns:
        Number of contacts that end up linked to a note.
    """
    prefix = "osint-"
    by_slug, by_dav_uid, by_phone, by_email = {}, {}, {}, {}

    persons = conn.execute(
        "SELECT slug, note_path, telefono, email, dav_uid FROM persons"
    ).fetchall()
    for slug, person_path, telefono, email, dav_uid in persons:
        by_slug[slug] = person_path
        if dav_uid:
            by_dav_uid.setdefault(dav_uid, person_path)
        if telefono:
            phone_key = davparse.norm_phone(telefono)
            if phone_key:
                by_phone.setdefault(phone_key, person_path)
        if email:
            by_email.setdefault(str(email).strip().lower(), person_path)

    contacts = conn.execute(
        "SELECT uid, tels, emails, note_path FROM contacts"
    ).fetchall()
    linked = 0
    for uid, tels_json, emails_json, current_path in contacts:
        note_path = None
        if uid.startswith(prefix):
            note_path = by_slug.get(uid[len(prefix):])
        if note_path is None:
            note_path = by_dav_uid.get(uid)
        if note_path is None:
            for tel in json.loads(tels_json or "[]"):
                hit = by_phone.get(davparse.norm_phone(tel))
                if hit:
                    note_path = hit
                    break
        if note_path is None:
            for em in json.loads(emails_json or "[]"):
                hit = by_email.get(str(em).strip().lower())
                if hit:
                    note_path = hit
                    break

        # Write only when the stored link is wrong; this also clears stale
        # links (note_path is None here when nothing matched).
        if note_path != current_path:
            conn.execute(
                "UPDATE contacts SET note_path = ? WHERE uid = ?", [note_path, uid]
            )
        if note_path is not None:
            linked += 1
    return linked