"""Ingests del service osint_db: vault Obsidian y servidor DAV (Xandikos). Las tablas maestras se reconstruyen por reemplazo completo (DELETE + INSERT en una transacción): el vault y Xandikos son las fuentes de verdad, así que cada ingest deja la base exactamente como el origen. Tras cada ingest se re-enlazan los contactos con sus fichas y se reconstruyen las tablas derivadas. """ from __future__ import annotations import json import os import re from datetime import datetime, timezone from . import davparse from .config import Config from .db import write_conn from .derived import rebuild_derived from .registry_bridge import ( dav_get_collection, dav_list_calendars, list_obsidian_notes, pass_get_secret, read_obsidian_note, ) def _norm(value): """Normaliza 'null'/''/None del frontmatter a None real.""" if value is None: return None if isinstance(value, str) and value.strip().lower() in ("null", "none", ""): return None return value def _as_str(value): """Convierte un valor de frontmatter a str (o None), sin perder números.""" v = _norm(value) return None if v is None else str(v) def _as_list(value) -> list: """Convierte un valor de frontmatter a lista (los escalares se envuelven).""" v = _norm(value) if v is None: return [] return v if isinstance(v, list) else [v] def _json(value) -> str: """Serializa un valor a JSON compacto (sin escapar acentos).""" return json.dumps(value, ensure_ascii=False, default=str) def _dav_uid_from_fuente(fuente) -> str | None: """Extrae el UID de Xandikos cuando fuente es 'Xandikos UID '.""" if not fuente: return None m = re.search(r"Xandikos UID\s+(\S+)", str(fuente)) return m.group(1) if m else None def ingest_vault(cfg: Config) -> dict: """Escanea el vault completo y reconstruye notes + tablas de entidades. Devuelve {status:'ok', notes:N, persons:N, organizations:N, domains:N, cases:N, places:N, skipped_unreadable:N, derived_rebuilt:[...]}. """ if not os.path.isdir(cfg.vault_dir): return {"status": "error", "error": f"vault no encontrado: {cfg.vault_dir}"} note_rows: list = [] entity_rows: dict = {table: [] for _, _, table in cfg.entity_folders} folder_to_table = {folder: table for folder, _, table in cfg.entity_folders} skipped = 0 for abs_path in list_obsidian_notes(cfg.vault_dir): rel_path = os.path.relpath(abs_path, cfg.vault_dir) base = os.path.splitext(os.path.basename(abs_path))[0] try: note = read_obsidian_note(abs_path) except Exception: # noqa: BLE001 — una nota corrupta no aborta el ingest skipped += 1 continue fm = note.get("frontmatter") or {} mtime = datetime.fromtimestamp(os.path.getmtime(abs_path), tz=timezone.utc) slug = _as_str(fm.get("slug")) or base note_rows.append( [ rel_path, slug, _as_str(fm.get("tipo")), _as_str(fm.get("nombre")) or base, mtime, _json(fm), ] ) # Entidad estructurada: ficha de nivel-1 dentro de una carpeta de # entidades (personas/.md, no personas//.md) y que # no sea una nota de soporte (prefijo _). top_folder = rel_path.split(os.sep)[0] is_level1 = os.path.basename(os.path.dirname(abs_path)) == top_folder if top_folder in folder_to_table and is_level1 and not base.startswith("_"): table = folder_to_table[top_folder] if table == "persons": entity_rows[table].append( [ slug, rel_path, _as_str(fm.get("nombre")) or base, _json(_as_list(fm.get("aliases"))), _as_str(fm.get("sexo")), _as_str(fm.get("fecha_nacimiento")), _as_str(fm.get("dni")), _as_str(fm.get("telefono")), _as_str(fm.get("email")), _as_str(fm.get("direccion")), _as_str(fm.get("pais")), _as_str(fm.get("contexto")), _as_str(fm.get("fuente")), _dav_uid_from_fuente(fm.get("fuente")), _json(_as_list(fm.get("tags"))), mtime, ] ) else: entity_rows[table].append( [ slug, rel_path, _as_str(fm.get("nombre")) or base, _json(_as_list(fm.get("tags"))), _json(fm), mtime, ] ) derived_rebuilt: list = [] with write_conn(cfg.db_path) as conn: conn.execute("BEGIN") try: conn.execute("DELETE FROM notes") if note_rows: conn.executemany( "INSERT INTO notes VALUES (?, ?, ?, ?, ?, ?)", note_rows ) conn.execute("DELETE FROM persons") if entity_rows["persons"]: conn.executemany( "INSERT INTO persons VALUES " "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", _dedup_by_slug(entity_rows["persons"]), ) for table in ("organizations", "domains", "cases", "places"): conn.execute(f"DELETE FROM {table}") if entity_rows[table]: conn.executemany( f"INSERT INTO {table} VALUES (?, ?, ?, ?, ?, ?)", _dedup_by_slug(entity_rows[table]), ) _link_contacts(conn) conn.execute("COMMIT") except Exception: conn.execute("ROLLBACK") raise derived_rebuilt = rebuild_derived(conn) return { "status": "ok", "notes": len(note_rows), "persons": len(_dedup_by_slug(entity_rows["persons"])), "organizations": len(_dedup_by_slug(entity_rows["organizations"])), "domains": len(_dedup_by_slug(entity_rows["domains"])), "cases": len(_dedup_by_slug(entity_rows["cases"])), "places": len(_dedup_by_slug(entity_rows["places"])), "skipped_unreadable": skipped, "derived_rebuilt": derived_rebuilt, } def _dedup_by_slug(rows: list) -> list: """Quita filas con slug repetido (gana la primera) para respetar la PK.""" seen, out = set(), [] for row in rows: if row[0] in seen: continue seen.add(row[0]) out.append(row) return out def ingest_dav(cfg: Config) -> dict: """Baja las colecciones de Xandikos y reconstruye contacts + events. Devuelve {status:'ok', contacts:N, events:N, calendars:[...], contacts_linked:N, derived_rebuilt:[...]} o {status:'error', error}. """ secret = pass_get_secret(cfg.pass_secret) if secret.get("status") != "ok": return { "status": "error", "error": f"pass no devolvió el secreto {cfg.pass_secret!r}: " f"{secret.get('error')}", } pwd = secret["value"] # sensible: nunca logear coll = dav_get_collection( cfg.dav_base, cfg.dav_user, pwd, cfg.dav_contacts_collection, "vcard" ) if coll.get("status") != "ok": return { "status": "error", "error": f"CardDAV: {coll.get('error')} (http {coll.get('http_status')})", } now = datetime.now(tz=timezone.utc) contact_rows: list = [] seen_uids: set = set() for res in coll.get("resources", []): parsed = davparse.parse_vcard(res.get("data", "")) uid = parsed["uid"] or os.path.splitext(os.path.basename(res["href"]))[0] if uid in seen_uids: continue seen_uids.add(uid) contact_rows.append( [ uid, cfg.dav_contacts_collection, res.get("etag"), parsed["fn"] or None, _json(parsed["tels"]), _json(parsed["emails"]), res.get("data", ""), None, # note_path se rellena en el enlace posterior now, ] ) cals = dav_list_calendars(cfg.dav_base, cfg.dav_user, pwd, cfg.dav_calendar_home) if cals.get("status") != "ok": return { "status": "error", "error": f"CalDAV: {cals.get('error')} (http {cals.get('http_status')})", } event_rows: list = [] seen_event_uids: set = set() calendar_names: list = [] for cal in cals.get("calendars", []): cal_name = cal.get("name") or cal.get("href", "").strip("/").rsplit("/", 1)[-1] calendar_names.append(cal_name) cal_coll = dav_get_collection( cfg.dav_base, cfg.dav_user, pwd, cal["href"], "ical" ) if cal_coll.get("status") != "ok": return { "status": "error", "error": f"CalDAV {cal_name}: {cal_coll.get('error')} " f"(http {cal_coll.get('http_status')})", } for res in cal_coll.get("resources", []): for ev in davparse.parse_ical_events(res.get("data", "")): uid = ev["uid"] or os.path.splitext(os.path.basename(res["href"]))[0] if uid in seen_event_uids: continue seen_event_uids.add(uid) event_rows.append( [ uid, cal_name, res.get("etag"), ev["dtstart"] or None, ev["dtend"] or None, ev["all_day"], ev["summary"] or None, ev["location"], ev["rrule"], ev["raw"], now, ] ) with write_conn(cfg.db_path) as conn: conn.execute("BEGIN") try: conn.execute("DELETE FROM contacts") if contact_rows: conn.executemany( "INSERT INTO contacts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", contact_rows, ) conn.execute("DELETE FROM events") if event_rows: conn.executemany( "INSERT INTO events VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", event_rows, ) linked = _link_contacts(conn) conn.execute("COMMIT") except Exception: conn.execute("ROLLBACK") raise derived_rebuilt = rebuild_derived(conn) return { "status": "ok", "contacts": len(contact_rows), "events": len(event_rows), "calendars": calendar_names, "contacts_linked": linked, "derived_rebuilt": derived_rebuilt, } def _link_contacts(conn) -> int: """Enlaza contacts.note_path contra las fichas de persons. Orden de matching por fiabilidad: UID estilo osint- (creado por el push del vault), dav_uid registrado en la ficha, teléfono normalizado y por último email. Devuelve el número de contactos enlazados. """ persons = conn.execute( "SELECT slug, note_path, telefono, email, dav_uid FROM persons" ).fetchall() by_slug, by_dav_uid, by_phone, by_email = {}, {}, {}, {} for slug, note_path, telefono, email, dav_uid in persons: by_slug[slug] = note_path if dav_uid: by_dav_uid.setdefault(dav_uid, note_path) if telefono: key = davparse.norm_phone(telefono) if key: by_phone.setdefault(key, note_path) if email: by_email.setdefault(str(email).strip().lower(), note_path) contacts = conn.execute("SELECT uid, tels, emails FROM contacts").fetchall() linked = 0 for uid, tels_json, emails_json in contacts: note_path = None if uid.startswith("osint-") and uid[len("osint-"):] in by_slug: note_path = by_slug[uid[len("osint-"):]] if note_path is None and uid in by_dav_uid: note_path = by_dav_uid[uid] if note_path is None: for tel in json.loads(tels_json or "[]"): hit = by_phone.get(davparse.norm_phone(tel)) if hit: note_path = hit break if note_path is None: for em in json.loads(emails_json or "[]"): hit = by_email.get(str(em).strip().lower()) if hit: note_path = hit break if note_path is not None: conn.execute( "UPDATE contacts SET note_path = ? WHERE uid = ?", [note_path, uid] ) linked += 1 return linked