"""Ingests for the osint_db service: Obsidian vault and DAV server (Xandikos).

The master tables are rebuilt by full replacement (DELETE + INSERT inside a
single transaction): the vault and Xandikos are the sources of truth, so each
ingest leaves the database exactly as the origin. After every ingest the
contacts are re-linked to their person notes and the derived tables are
rebuilt.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
from datetime import datetime, timezone
|
|
|
|
from . import davparse
|
|
from .config import Config
|
|
from .db import write_conn
|
|
from .derived import rebuild_derived
|
|
from .registry_bridge import (
|
|
dav_get_collection,
|
|
dav_list_calendars,
|
|
list_obsidian_notes,
|
|
pass_get_secret,
|
|
read_obsidian_note,
|
|
)
|
|
|
|
def _norm(value):
    """Map the frontmatter spellings of "no value" to a real ``None``.

    ``None`` is returned as is. A string that is empty, ``"null"`` or
    ``"none"`` once stripped and lower-cased becomes ``None``. Any other
    value (including non-blank strings, untouched) is returned unchanged.
    """
    if isinstance(value, str):
        # Only strings can carry a textual null placeholder.
        return None if value.strip().lower() in {"", "null", "none"} else value
    return value
|
|
|
|
|
|
def _as_str(value):
    """Return a frontmatter value as ``str``, or ``None`` when it is null-like.

    The value is first passed through ``_norm``; anything that survives is
    converted with ``str()``, so numbers keep their textual form.
    """
    cleaned = _norm(value)
    if cleaned is None:
        return None
    return str(cleaned)
|
|
|
|
|
|
def _as_list(value) -> list:
    """Return a frontmatter value as a list.

    Null-like values (as decided by ``_norm``) give an empty list, an actual
    list is returned as is, and any other value is wrapped in a one-element
    list.
    """
    cleaned = _norm(value)
    if cleaned is None:
        return []
    if isinstance(cleaned, list):
        return cleaned
    return [cleaned]
|
|
|
|
|
|
def _json(value) -> str:
    """Serialize a value to JSON text without escaping non-ASCII characters.

    Objects that ``json`` cannot encode natively are serialized through
    ``str()``.
    """
    dump_options = {"ensure_ascii": False, "default": str}
    return json.dumps(value, **dump_options)
|
|
|
|
|
|
def _dav_uid_from_fuente(fuente) -> str | None:
    """Extract the Xandikos UID from a ``fuente`` like 'Xandikos UID <uid>'.

    Returns the first run of non-whitespace characters that follows the
    literal ``Xandikos UID`` marker, or ``None`` when ``fuente`` is falsy or
    does not contain the marker.
    """
    if fuente:
        found = re.search(r"Xandikos UID\s+(\S+)", str(fuente))
        if found is not None:
            return found.group(1)
    return None
|
|
|
|
|
|
def ingest_vault(cfg: Config) -> dict:
    """Scan the whole vault and rebuild ``notes`` plus the entity tables.

    Every readable note yields one ``notes`` row. A note additionally yields
    an entity row when it sits directly inside one of the folders declared in
    ``cfg.entity_folders`` and its file name does not start with ``_``.
    All tables are replaced inside one transaction, contacts are re-linked
    against the new ``persons`` rows, and the derived tables are rebuilt
    after the commit.

    Args:
        cfg: Service configuration; ``vault_dir``, ``entity_folders`` and
            ``db_path`` are read here.

    Returns:
        On success ``{status: 'ok', notes: N, persons: N, organizations: N,
        domains: N, cases: N, places: N, skipped_unreadable: N,
        derived_rebuilt: [...]}`` where the entity counts are the rows
        actually inserted (after slug de-duplication). When ``cfg.vault_dir``
        is not a directory, ``{status: 'error', error: <message>}``.

    Raises:
        Exception: Any error raised while writing the tables is re-raised
            after the transaction has been rolled back.
    """
    if not os.path.isdir(cfg.vault_dir):
        return {"status": "error", "error": f"vault no encontrado: {cfg.vault_dir}"}

    note_rows: list = []
    entity_rows: dict = {table: [] for _, _, table in cfg.entity_folders}
    folder_to_table = {folder: table for folder, _, table in cfg.entity_folders}
    skipped = 0

    for abs_path in list_obsidian_notes(cfg.vault_dir):
        rel_path = os.path.relpath(abs_path, cfg.vault_dir)
        base = os.path.splitext(os.path.basename(abs_path))[0]
        try:
            note = read_obsidian_note(abs_path)
        except Exception:  # noqa: BLE001 - a corrupt note must not abort the ingest
            skipped += 1
            continue
        fm = note.get("frontmatter") or {}
        mtime = datetime.fromtimestamp(os.path.getmtime(abs_path), tz=timezone.utc)
        slug = _as_str(fm.get("slug")) or base
        name = _as_str(fm.get("nombre")) or base
        note_rows.append(
            [
                rel_path,
                slug,
                _as_str(fm.get("tipo")),
                name,
                mtime,
                _json(fm),
            ]
        )

        # Structured entity: a level-1 note inside an entity folder
        # (personas/<slug>.md, not personas/<slug>/<doc>.md) that is not a
        # support note (``_`` prefix).
        top_folder = rel_path.split(os.sep)[0]
        # Compare the whole relative parent directory, not just its basename:
        # a nested folder that happens to share the top folder's name
        # (personas/x/personas/y.md) must not count as level 1.
        is_level1 = os.path.dirname(rel_path) == top_folder
        if top_folder not in folder_to_table or not is_level1 or base.startswith("_"):
            continue

        table = folder_to_table[top_folder]
        if table == "persons":
            entity_rows[table].append(
                [
                    slug,
                    rel_path,
                    name,
                    _json(_as_list(fm.get("aliases"))),
                    _as_str(fm.get("sexo")),
                    _as_str(fm.get("fecha_nacimiento")),
                    _as_str(fm.get("dni")),
                    _as_str(fm.get("telefono")),
                    _as_str(fm.get("email")),
                    _as_str(fm.get("direccion")),
                    _as_str(fm.get("pais")),
                    _as_str(fm.get("contexto")),
                    _as_str(fm.get("fuente")),
                    _dav_uid_from_fuente(fm.get("fuente")),
                    _json(_as_list(fm.get("tags"))),
                    mtime,
                ]
            )
        else:
            entity_rows[table].append(
                [
                    slug,
                    rel_path,
                    name,
                    _json(_as_list(fm.get("tags"))),
                    _json(fm),
                    mtime,
                ]
            )

    generic_tables = ("organizations", "domains", "cases", "places")
    # De-duplicate once and reuse the result for both the inserts and the
    # reported counts. ``.get`` keeps a table that is not declared in
    # cfg.entity_folders from raising KeyError: it is simply emptied.
    deduped = {
        table: _dedup_by_slug(entity_rows.get(table, []))
        for table in ("persons", *generic_tables)
    }

    derived_rebuilt: list = []
    with write_conn(cfg.db_path) as conn:
        conn.execute("BEGIN")
        try:
            conn.execute("DELETE FROM notes")
            if note_rows:
                conn.executemany(
                    "INSERT INTO notes VALUES (?, ?, ?, ?, ?, ?)", note_rows
                )
            conn.execute("DELETE FROM persons")
            if deduped["persons"]:
                conn.executemany(
                    "INSERT INTO persons VALUES "
                    "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    deduped["persons"],
                )
            for table in generic_tables:
                conn.execute(f"DELETE FROM {table}")
                if deduped[table]:
                    conn.executemany(
                        f"INSERT INTO {table} VALUES (?, ?, ?, ?, ?, ?)",
                        deduped[table],
                    )
            _link_contacts(conn)
            conn.execute("COMMIT")
        except Exception:
            conn.execute("ROLLBACK")
            raise
        # Derived tables are rebuilt only after the master tables committed.
        derived_rebuilt = rebuild_derived(conn)

    return {
        "status": "ok",
        "notes": len(note_rows),
        "persons": len(deduped["persons"]),
        "organizations": len(deduped["organizations"]),
        "domains": len(deduped["domains"]),
        "cases": len(deduped["cases"]),
        "places": len(deduped["places"]),
        "skipped_unreadable": skipped,
        "derived_rebuilt": derived_rebuilt,
    }
|
|
|
|
|
|
def _dedup_by_slug(rows: list) -> list:
    """Drop rows whose slug (first column) repeats, keeping the first one.

    The result preserves the original row order and is meant to satisfy the
    primary key on the slug column.
    """
    first_by_slug: dict = {}
    for candidate in rows:
        # setdefault keeps the earliest row and ignores later duplicates;
        # dict insertion order preserves the original sequence.
        first_by_slug.setdefault(candidate[0], candidate)
    return list(first_by_slug.values())
|
|
|
|
|
|
def ingest_dav(cfg: Config) -> dict:
    """Download the Xandikos collections and rebuild ``contacts`` and ``events``.

    The password is read through ``pass_get_secret``; the contacts collection
    and every calendar under ``cfg.dav_calendar_home`` are then fetched and
    parsed. Contacts and events are de-duplicated by UID (first one wins,
    falling back to the resource file name when the payload has no UID).
    Both tables are replaced in one transaction, contacts are linked to
    person notes, and the derived tables are rebuilt after the commit.

    Returns:
        ``{status: 'ok', contacts: N, events: N, calendars: [...],
        contacts_linked: N, derived_rebuilt: [...]}`` on success, or
        ``{status: 'error', error: <message>}`` as soon as the secret or any
        DAV request does not report ``status == 'ok'`` (nothing is written
        in that case).

    Raises:
        Exception: Any error raised while writing the tables is re-raised
            after the transaction has been rolled back.
    """

    def failure(message):
        # Uniform shape for every early error return.
        return {"status": "error", "error": message}

    secret = pass_get_secret(cfg.pass_secret)
    if secret.get("status") != "ok":
        return failure(
            f"pass no devolvió el secreto {cfg.pass_secret!r}: "
            f"{secret.get('error')}"
        )
    password = secret["value"]  # sensitive: never log it

    addressbook = dav_get_collection(
        cfg.dav_base, cfg.dav_user, password, cfg.dav_contacts_collection, "vcard"
    )
    if addressbook.get("status") != "ok":
        return failure(
            f"CardDAV: {addressbook.get('error')} (http {addressbook.get('http_status')})"
        )

    # One timestamp shared by every row written in this run.
    stamp = datetime.now(tz=timezone.utc)

    contact_rows: list = []
    contact_uids: set = set()
    for item in addressbook.get("resources", []):
        raw_card = item.get("data", "")
        card = davparse.parse_vcard(raw_card)
        uid = card["uid"] or os.path.splitext(os.path.basename(item["href"]))[0]
        if uid not in contact_uids:
            contact_uids.add(uid)
            contact_rows.append(
                [
                    uid,
                    cfg.dav_contacts_collection,
                    item.get("etag"),
                    card["fn"] or None,
                    _json(card["tels"]),
                    _json(card["emails"]),
                    raw_card,
                    None,  # note_path is filled in by the linking step below
                    stamp,
                ]
            )

    calendars = dav_list_calendars(
        cfg.dav_base, cfg.dav_user, password, cfg.dav_calendar_home
    )
    if calendars.get("status") != "ok":
        return failure(
            f"CalDAV: {calendars.get('error')} (http {calendars.get('http_status')})"
        )

    event_rows: list = []
    event_uids: set = set()
    calendar_names: list = []
    for calendar in calendars.get("calendars", []):
        # Prefer the display name; otherwise use the last segment of the href.
        cal_name = (
            calendar.get("name")
            or calendar.get("href", "").strip("/").rsplit("/", 1)[-1]
        )
        calendar_names.append(cal_name)
        fetched = dav_get_collection(
            cfg.dav_base, cfg.dav_user, password, calendar["href"], "ical"
        )
        if fetched.get("status") != "ok":
            return failure(
                f"CalDAV {cal_name}: {fetched.get('error')} "
                f"(http {fetched.get('http_status')})"
            )
        for item in fetched.get("resources", []):
            for event in davparse.parse_ical_events(item.get("data", "")):
                uid = (
                    event["uid"]
                    or os.path.splitext(os.path.basename(item["href"]))[0]
                )
                if uid not in event_uids:
                    event_uids.add(uid)
                    event_rows.append(
                        [
                            uid,
                            cal_name,
                            item.get("etag"),
                            event["dtstart"] or None,
                            event["dtend"] or None,
                            event["all_day"],
                            event["summary"] or None,
                            event["location"],
                            event["rrule"],
                            event["raw"],
                            stamp,
                        ]
                    )

    with write_conn(cfg.db_path) as conn:
        conn.execute("BEGIN")
        try:
            conn.execute("DELETE FROM contacts")
            if contact_rows:
                conn.executemany(
                    "INSERT INTO contacts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    contact_rows,
                )
            conn.execute("DELETE FROM events")
            if event_rows:
                conn.executemany(
                    "INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    event_rows,
                )
            linked = _link_contacts(conn)
            conn.execute("COMMIT")
        except Exception:
            conn.execute("ROLLBACK")
            raise
        # Derived tables are rebuilt only after the master tables committed.
        derived_rebuilt = rebuild_derived(conn)

    return {
        "status": "ok",
        "contacts": len(contact_rows),
        "events": len(event_rows),
        "calendars": calendar_names,
        "contacts_linked": linked,
        "derived_rebuilt": derived_rebuilt,
    }
|
|
|
|
|
|
def _link_contacts(conn) -> int:
    """Re-link ``contacts.note_path`` against the current ``persons`` rows.

    Matching is tried in order of reliability: a UID of the form
    ``osint-<slug>``, then a ``dav_uid`` recorded on the person, then any
    normalized phone number, and finally any e-mail address (stripped and
    lower-cased). When several persons share a dav_uid, phone or e-mail, the
    first one read wins.

    A contact that matches nobody has its ``note_path`` cleared, so a link
    left by an earlier run to a note that has since been deleted or renamed
    does not survive the re-link.

    Args:
        conn: Open database connection exposing ``execute(...).fetchall()``;
            the caller owns the surrounding transaction.

    Returns:
        The number of contacts that end up linked to a person note.
    """
    by_slug, by_dav_uid, by_phone, by_email = {}, {}, {}, {}
    persons = conn.execute(
        "SELECT slug, note_path, telefono, email, dav_uid FROM persons"
    ).fetchall()
    for slug, person_path, telefono, email, dav_uid in persons:
        by_slug[slug] = person_path
        if dav_uid:
            by_dav_uid.setdefault(dav_uid, person_path)
        if telefono:
            key = davparse.norm_phone(telefono)
            if key:
                by_phone.setdefault(key, person_path)
        if email:
            by_email.setdefault(str(email).strip().lower(), person_path)

    prefix = "osint-"
    contacts = conn.execute(
        "SELECT uid, tels, emails, note_path FROM contacts"
    ).fetchall()
    linked = 0
    for uid, tels_json, emails_json, current_path in contacts:
        match = None
        if uid.startswith(prefix):
            match = by_slug.get(uid[len(prefix):])
        if match is None:
            match = by_dav_uid.get(uid)
        if match is None:
            for tel in json.loads(tels_json or "[]"):
                hit = by_phone.get(davparse.norm_phone(tel))
                if hit:
                    match = hit
                    break
        if match is None:
            for em in json.loads(emails_json or "[]"):
                hit = by_email.get(str(em).strip().lower())
                if hit:
                    match = hit
                    break

        # Write whenever the stored link differs from the computed one; this
        # also clears (NULL) a stale link when nothing matches any more.
        if match != current_path:
            conn.execute(
                "UPDATE contacts SET note_path = ? WHERE uid = ?", [match, uid]
            )
        if match is not None:
            linked += 1
    return linked
|