Files
osint_db/tests/test_osint_db.py
T
egutierrez 63f37257cd feat: DuckDB como fuente de verdad (multi-valor, ownership selectivo, escritura, libretas)
F1 — migraciones: 002_multivalue (persons +telefonos/emails/direcciones/extra_fm JSON,
backfill desde singulares con to_json) + 003_addressbooks (tabla addressbooks + seed
idempotente de la libreta por defecto). Conteos intactos (697/1065/98).

F2 — ingest_vault selectivo (anti-pisado): personas que ya existen en DB solo actualizan
note_path + extra_fm vía duckdb_upsert(update_cols=...), NO pisan los campos OWNED por la
DB; personas nuevas = bootstrap completo. _link_contacts enlaza por listas telefonos[]/
emails[] además del singular. ingest_dav itera todas las libretas de la tabla addressbooks.

F3 — escritura estructurada (server/writes.py + endpoints en main.py): CRUD
/api/person|contact|event, /api/addressbook, /api/calendar, /api/person/{slug}/render
(DB→nota preservando la prosa del cuerpo), /api/push/dav (reconcilia DB→Xandikos). El push
DAV y el render ocurren fuera de la transacción de escritura para no bloquear la DB con
latencia de red. registry_bridge.py importa las funciones nuevas; app.md actualizado.

Verificado: 18 tests verdes; ownership probado sobre datos reales (un centinela DB-owned
sobrevivió a POST /api/ingest/vault sobre las 697 fichas); person CRUD + materialización
de la ficha .md en vivo, con cleanup sin residuo.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 00:44:02 +02:00

544 lines
18 KiB
Python

"""Tests del service osint_db: migraciones, ingest del vault, API y render.
Todo corre contra un vault temporal y una base DuckDB temporal, SIN red: el
ingest DAV no se ejercita aquí (requiere Xandikos + pass). El enlace
contacto→ficha sí se prueba insertando un contacto a mano y relanzando el
ingest del vault, que re-enlaza.
"""
from __future__ import annotations
import json
import os
import sys
from datetime import datetime, timezone
import pytest
from fastapi.testclient import TestClient
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from server.config import Config # noqa: E402
from server.db import apply_migrations, write_conn # noqa: E402
from server.main import create_app # noqa: E402
# Fixture note for a fully populated person. The cfg fixture writes it to
# personas/ana-garcia-perez.md; it carries a singular phone and email in the
# frontmatter plus a prose body ("Ficha de prueba.") that the render test
# expects to survive a frontmatter rewrite.
PERSONA_MD = """---
tipo: persona
nombre: "Ana García Pérez"
slug: ana-garcia-perez
aliases: ["Anita"]
sexo: mujer
fecha_nacimiento: 1990-04-12
dni: 12345678Z
telefono: "+34 600 111 222"
email: ana@example.com
direccion: null
pais: españa
relaciones: []
contexto: familia
fuente: "test fixture"
tags: [persona, osint]
---
## Notas
Ficha de prueba.
"""
# Fixture note for a sparse person: most owned fields are null. The `fuente`
# value embeds a Xandikos UID, which test_ingest_vault_extrae_dav_uid_de_fuente
# expects the ingest to surface as dav_uid == "abc-123".
PERSONA2_MD = """---
tipo: persona
nombre: "Luis Pérez"
slug: luis-perez
aliases: []
sexo: hombre
fecha_nacimiento: null
dni: null
telefono: null
email: null
direccion: null
pais: null
relaciones: []
contexto: movil
fuente: "Xandikos UID abc-123"
tags: [persona, osint, movil]
---
## Notas
"""
# Fixture note for an organization; the cfg fixture writes it to
# organizaciones/acme-sl.md and the ingest test expects it to count as the
# single organization.
ORG_MD = """---
tipo: organizacion
nombre: "Acme S.L."
slug: acme-sl
tags: [organizacion, osint]
---
## Notas
"""
# Fixture sub-note of type "documento", stored under a person's own folder
# (personas/ana-garcia-perez/dni.md). It is counted as a note but must NOT be
# counted as a person record.
DOC_MD = """---
tipo: documento
doc_tipo: dni
---
Sub-nota de documento (NO debe contar como ficha de persona).
"""
@pytest.fixture()
def cfg(tmp_path):
    """Build a temporary vault with fixture notes and a migrated temp DuckDB.

    Returns the ``Config`` that points at both (``port=0``).
    """
    vault = tmp_path / "vault"
    personas_dir = vault / "personas"
    ana_subdir = personas_dir / "ana-garcia-perez"
    orgs_dir = vault / "organizaciones"

    # Creating Ana's sub-folder with parents=True also creates vault/ and
    # vault/personas/, so organizaciones/ can then be created without parents.
    ana_subdir.mkdir(parents=True)
    orgs_dir.mkdir()

    fixture_notes = [
        (personas_dir / "ana-garcia-perez.md", PERSONA_MD),
        (personas_dir / "luis-perez.md", PERSONA2_MD),
        (personas_dir / "_plantilla.md", "---\ntipo: plantilla\n---\n"),
        (ana_subdir / "dni.md", DOC_MD),
        (orgs_dir / "acme-sl.md", ORG_MD),
    ]
    for note_file, text in fixture_notes:
        note_file.write_text(text, encoding="utf-8")

    config = Config(
        vault_dir=str(vault),
        db_path=str(tmp_path / "data" / "osint.duckdb"),
        port=0,
    )
    apply_migrations(config.db_path)
    return config
@pytest.fixture()
def client(cfg):
    """Return a FastAPI ``TestClient`` bound to an app built from ``cfg``."""
    app = create_app(cfg)
    return TestClient(app)
def test_migrations_son_idempotentes(cfg):
    """A second migration pass applies nothing (tracked in _migrations)."""
    reapplied = apply_migrations(cfg.db_path)
    assert reapplied == []
def test_health(client, cfg):
    """The health endpoint reports ok, the configured DB path and >= 8 tables."""
    response = client.get("/api/health")
    health = response.json()
    assert health["status"] == "ok"
    assert health["db_path"] == cfg.db_path
    assert health["tables"] >= 8
def test_ingest_vault_cuenta_entidades(client):
    """Vault ingest reports the expected per-entity counts and derived tables."""
    summary = client.post("/api/ingest/vault").json()
    assert summary["status"] == "ok"
    # 5 notes: 2 persons + template + document sub-note + organization.
    assert summary["notes"] == 5
    # Only level-1 notes without a leading underscore count as persons.
    assert summary["persons"] == 2
    assert summary["organizations"] == 1
    assert summary["domains"] == 0

    expected_derived = [
        "derived.contact_link_quality",
        "derived.event_monthly",
        "derived.person_stats",
    ]
    assert sorted(summary["derived_rebuilt"]) == expected_derived
def test_ingest_vault_extrae_dav_uid_de_fuente(client):
    """The DAV UID embedded in the note's ``fuente`` field lands in dav_uid."""
    client.post("/api/ingest/vault")
    payload = {"sql": "SELECT dav_uid FROM persons WHERE slug = 'luis-perez'"}
    result = client.post("/api/query", json=payload).json()
    assert result["status"] == "ok"
    first_row = result["rows"][0]
    assert first_row["dav_uid"] == "abc-123"
def test_api_query_ok_y_error_siempre_http_200(client):
    """/api/query answers HTTP 200 for both a valid and a failing statement."""
    client.post("/api/ingest/vault")

    # Valid query: the status lives in the JSON body, not in the HTTP code.
    good = client.post(
        "/api/query",
        json={"sql": "SELECT slug, nombre FROM persons ORDER BY slug", "max_rows": 10},
    )
    assert good.status_code == 200
    payload = good.json()
    assert payload["status"] == "ok"
    assert payload["columns"] == ["slug", "nombre"]
    assert payload["row_count"] == 2
    assert payload["truncated"] is False
    assert payload["rows"][0]["slug"] == "ana-garcia-perez"

    # Failing query: still HTTP 200, with status "error" and a non-empty message.
    bad = client.post("/api/query", json={"sql": "SELECT * FROM tabla_que_no_existe"})
    assert bad.status_code == 200
    failure = bad.json()
    assert failure["status"] == "error"
    assert failure["error"]
def test_api_query_es_solo_lectura(client):
    """A mutating statement sent to /api/query is rejected with status error."""
    client.post("/api/ingest/vault")
    destructive = {"sql": "DELETE FROM persons"}
    outcome = client.post("/api/query", json=destructive).json()
    assert outcome["status"] == "error"
def test_catalogo_de_queries_con_nombre(client):
    """The named-query catalog lists the core queries, each with sql + description."""
    catalog = client.get("/api/queries").json()
    assert catalog["status"] == "ok"

    entries = catalog["queries"]
    available = {entry["name"] for entry in entries}
    required = {
        "personas_por_contexto",
        "personas_recientes",
        "eventos_proximos",
        "contactos_sin_nota",
        "stats_personas",
    }
    assert required <= available
    assert all(entry["sql"] and entry["description"] for entry in entries)
def test_query_named_ok_y_desconocida(client):
    """A known named query returns rows; an unknown name returns status error."""
    client.post("/api/ingest/vault")

    known = client.post(
        "/api/query/named", json={"name": "personas_por_contexto"}
    ).json()
    assert known["status"] == "ok"
    per_context = {}
    for row in known["rows"]:
        per_context[row["contexto"]] = row["personas"]
    assert per_context == {"familia": 1, "movil": 1}

    unknown = client.post("/api/query/named", json={"name": "no_existe"}).json()
    assert unknown["status"] == "error"
def test_tables_inventario(client):
    """/api/tables classifies master vs derived tables and hides _migrations."""
    client.post("/api/ingest/vault")
    inventory = client.get("/api/tables").json()
    assert inventory["status"] == "ok"

    # Index the inventory by (schema, name) for direct lookup.
    catalog = {}
    for table in inventory["tables"]:
        catalog[(table["schema"], table["name"])] = table

    persons_table = catalog[("main", "persons")]
    assert persons_table["kind"] == "master"
    assert persons_table["row_count"] == 2
    assert {"name": "note_path", "type": "VARCHAR"} in persons_table["columns"]

    person_stats = catalog[("derived", "person_stats")]
    assert person_stats["kind"] == "derived"

    assert ("main", "_migrations") not in catalog
def test_derivadas_sin_note_path(client):
    """Hard rule: no table in the derived schema references notes."""
    client.post("/api/ingest/vault")

    note_columns_sql = (
        "SELECT table_name, column_name FROM information_schema.columns "
        "WHERE table_schema = 'derived' AND column_name LIKE '%note%'"
    )
    note_columns = client.post("/api/query", json={"sql": note_columns_sql}).json()
    assert note_columns["status"] == "ok"
    assert note_columns["rows"] == []

    # And the three derived tables really exist.
    derived_tables_sql = (
        "SELECT table_name FROM information_schema.tables "
        "WHERE table_schema = 'derived' ORDER BY table_name"
    )
    derived_tables = client.post(
        "/api/query", json={"sql": derived_tables_sql}
    ).json()
    found = [row["table_name"] for row in derived_tables["rows"]]
    assert found == [
        "contact_link_quality",
        "event_monthly",
        "person_stats",
    ]
def test_link_contacts_por_telefono(client, cfg):
    """A contact whose phone matches a person note is linked on re-ingest."""
    client.post("/api/ingest/vault")
    now = datetime.now(tz=timezone.utc)

    insert_sql = "INSERT INTO contacts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"
    collection = "/enmanuel/contacts/addressbook/"
    # (uid, etag, display name, phones JSON): the first phone matches Ana's
    # note, the second matches nobody.
    seeded_contacts = [
        ("uid-movil-1", "etag1", "Ana G.", '["600111222"]'),
        ("uid-movil-2", "etag2", "Desconocido", '["699999999"]'),
    ]
    with write_conn(cfg.db_path) as conn:
        for uid, etag, display_name, phones_json in seeded_contacts:
            conn.execute(
                insert_sql,
                [
                    uid,
                    collection,
                    etag,
                    display_name,
                    phones_json,
                    "[]",
                    "BEGIN:VCARD...",
                    None,
                    now,
                ],
            )

    # The vault ingest re-links contacts and rebuilds the derived tables.
    client.post("/api/ingest/vault")

    linked = client.post(
        "/api/query",
        json={"sql": "SELECT uid, note_path FROM contacts ORDER BY uid"},
    ).json()
    note_by_uid = {}
    for row in linked["rows"]:
        note_by_uid[row["uid"]] = row["note_path"]
    expected_note = os.path.join("personas", "ana-garcia-perez.md")
    assert note_by_uid["uid-movil-1"] == expected_note
    assert note_by_uid["uid-movil-2"] is None

    orphans = client.post(
        "/api/query/named", json={"name": "contactos_sin_nota"}
    ).json()
    assert [row["uid"] for row in orphans["rows"]] == ["uid-movil-2"]

    link_quality = client.post(
        "/api/query/named", json={"name": "calidad_enlace_contactos"}
    ).json()
    assert link_quality["rows"] == [{"total": 2, "linked": 1, "unlinked": 1}]
def test_render_note_crea_bloque_sentinel_y_es_idempotente(client, cfg):
    """Rendering a query into a note creates one sentinel block, idempotently.

    The first render must create the note with begin/end sentinel comments,
    the title heading and the result table. A second render with the same
    body must replace the block in place rather than append a duplicate.
    """
    client.post("/api/ingest/vault")
    body = {
        "note_path": "tableros/personas.md",
        "block_id": "personas",
        "query": "personas_por_contexto",
        "title": "Personas por contexto",
    }
    r = client.post("/api/render/note", json=body).json()
    assert r["status"] == "ok"
    assert r["note_path"] == "tableros/personas.md"
    assert r["rows_rendered"] == 2

    note_file = os.path.join(cfg.vault_dir, "tableros", "personas.md")
    # Read through a context manager so the handle is closed deterministically
    # instead of leaking until garbage collection (ResourceWarning).
    with open(note_file, encoding="utf-8") as fh:
        content = fh.read()
    assert "<!-- osintdb:begin id=personas -->" in content
    assert "<!-- osintdb:end id=personas -->" in content
    assert "### Personas por contexto" in content
    assert "| contexto | personas |" in content
    assert "| familia | 1 |" in content

    # Idempotent: a second render duplicates neither the block nor the table.
    r2 = client.post("/api/render/note", json=body).json()
    assert r2["status"] == "ok"
    with open(note_file, encoding="utf-8") as fh:
        content2 = fh.read()
    assert content2.count("<!-- osintdb:begin id=personas -->") == 1
    assert content2.count("| familia | 1 |") == 1
def test_render_note_valida_inputs(client):
    """/api/render/note rejects missing queries, unknown names and traversal."""
    client.post("/api/ingest/vault")
    endpoint = "/api/render/note"

    # Neither sql nor query supplied.
    missing = client.post(endpoint, json={"note_path": "t.md", "block_id": "x"}).json()
    assert missing["status"] == "error"

    # Unknown named query.
    unknown = client.post(
        endpoint,
        json={"note_path": "t.md", "block_id": "x", "query": "nope"},
    ).json()
    assert unknown["status"] == "error"

    # Path traversal outside the vault.
    escaping = client.post(
        endpoint,
        json={"note_path": "../fuera.md", "block_id": "x", "query": "stats_personas"},
    ).json()
    assert escaping["status"] == "error"
    assert "fuera del vault" in escaping["error"]
# --- F1: migraciones multi-valor + addressbooks ----------------------------
def test_migracion_multivalue_y_addressbooks(client, cfg):
    """002 adds multi-value columns to persons; 003 creates + seeds addressbooks."""
    inventory = client.get("/api/tables").json()
    catalog = {}
    for table in inventory["tables"]:
        catalog[(table["schema"], table["name"])] = table

    column_names = {col["name"] for col in catalog[("main", "persons")]["columns"]}
    multi_value = {"telefonos", "emails", "direcciones", "extra_fm"}
    assert multi_value <= column_names
    # The singular columns still exist (compatibility).
    singular = {"telefono", "email", "direccion"}
    assert singular <= column_names

    # The default address book was seeded.
    assert ("main", "addressbooks") in catalog
    seeded = client.post(
        "/api/query", json={"sql": "SELECT slug, collection_path FROM addressbooks"}
    ).json()
    path_by_slug = {}
    for row in seeded["rows"]:
        path_by_slug[row["slug"]] = row["collection_path"]
    assert path_by_slug["addressbook"] == "/enmanuel/contacts/addressbook/"
# --- F2: ingest selectivo (la DB es dueña de los campos OWNED) --------------
def test_ingest_vault_no_pisa_campo_owned(client, cfg):
    """A value written through the DB survives a vault re-ingest.

    A direct UPDATE stands in for a phone number written by the API; the
    re-ingest must NOT overwrite it with the note's older frontmatter value
    ('+34 600 111 222').
    """
    client.post("/api/ingest/vault")

    # Simulate the API writing a new (multi-value) phone into the DB.
    new_phone = "+34 999 888 777"
    update_sql = "UPDATE persons SET telefonos = ?, telefono = ? WHERE slug = ?"
    with write_conn(cfg.db_path) as conn:
        conn.execute(
            update_sql,
            ['["+34 999 888 777"]', new_phone, "ana-garcia-perez"],
        )

    # Re-ingest: Ana's record ALREADY exists, so only note_path + extra_fm are
    # refreshed, never the DB-owned fields.
    summary = client.post("/api/ingest/vault").json()
    assert summary["status"] == "ok"
    assert summary["persons"] == 2
    assert summary["persons_updated"] == 2  # both records already existed
    assert summary["persons_inserted"] == 0

    lookup = client.post(
        "/api/query",
        json={
            "sql": "SELECT telefono, telefonos FROM persons WHERE slug = 'ana-garcia-perez'"
        },
    ).json()
    ana = lookup["rows"][0]
    assert ana["telefono"] == new_phone  # the API value, NOT the frontmatter one
    assert new_phone in ana["telefonos"]
def test_ingest_vault_bootstrapea_ficha_nueva(client, cfg):
    """A note whose slug is not yet in the DB is inserted whole from frontmatter."""
    # First ingest: only the two fixture person notes.
    client.post("/api/ingest/vault")

    # Add a new note to the vault with a singular phone in its frontmatter.
    note_text = (
        "---\n"
        "tipo: persona\n"
        'nombre: "Marta Ruiz"\n'
        "slug: marta-ruiz\n"
        'telefono: "+34 611 000 111"\n'
        "contexto: trabajo\n"
        "campo_libre: valor_raro\n"
        "tags: [persona]\n"
        "---\n\n## Notas\n"
    )
    note_file = os.path.join(cfg.vault_dir, "personas", "marta-ruiz.md")
    with open(note_file, "w", encoding="utf-8") as out:
        out.write(note_text)

    summary = client.post("/api/ingest/vault").json()
    assert summary["status"] == "ok"
    assert summary["persons"] == 3
    assert summary["persons_inserted"] == 1  # marta-ruiz is new

    lookup = client.post(
        "/api/query",
        json={
            "sql": "SELECT telefono, telefonos, contexto, extra_fm "
            "FROM persons WHERE slug = 'marta-ruiz'"
        },
    ).json()
    marta = lookup["rows"][0]
    assert marta["telefono"] == "+34 611 000 111"  # singular column populated
    assert "+34 611 000 111" in marta["telefonos"]  # list populated from the singular
    assert marta["contexto"] == "trabajo"

    # extra_fm captures non-owned frontmatter (campo_libre), not owned fields.
    leftovers = json.loads(marta["extra_fm"])
    assert leftovers.get("campo_libre") == "valor_raro"
    assert "telefono" not in leftovers
    assert "contexto" not in leftovers
# --- F3: endpoints de escritura estructurada (persons, sin red) ------------
def test_api_person_crud_y_materializa(client, cfg):
    """Exercise person create / update / delete and note materialization.

    POST /api/person with two phones must insert a DB row (list stored in
    ``telefonos``, first element mirrored into ``telefono``) and write the
    person's .md note. PUT must update the row, and DELETE must remove it.
    """
    client.post("/api/ingest/vault")
    body = {
        "slug": "nuevo-contacto",
        "nombre": "Nuevo Contacto",
        "telefonos": ["+34 600 000 001", "+34 600 000 002"],
        "emails": ["nc@example.com"],
        "contexto": "trabajo",
        "tags": ["persona"],
    }
    r = client.post("/api/person", json=body).json()
    assert r["status"] == "ok"
    assert r["inserted"] == 1
    assert r["materialized"] is True

    # DB row: telefonos as a list, singular column = first element.
    q = client.post(
        "/api/query",
        json={
            "sql": "SELECT telefono, telefonos, emails, email FROM persons "
            "WHERE slug = 'nuevo-contacto'"
        },
    ).json()
    row = q["rows"][0]
    assert json.loads(row["telefonos"]) == ["+34 600 000 001", "+34 600 000 002"]
    assert row["telefono"] == "+34 600 000 001"
    assert row["email"] == "nc@example.com"

    # The .md note was materialized with the telefonos list.
    note_file = os.path.join(cfg.vault_dir, "personas", "nuevo-contacto.md")
    assert os.path.exists(note_file)
    # Context manager closes the handle deterministically (no ResourceWarning).
    with open(note_file, encoding="utf-8") as fh:
        content = fh.read()
    assert "telefonos:" in content
    assert "+34 600 000 001" in content
    assert "+34 600 000 002" in content

    # PUT updates the record (a single phone now).
    r = client.put(
        "/api/person/nuevo-contacto",
        json={"slug": "nuevo-contacto", "nombre": "NC", "telefonos": ["+34 600 000 009"]},
    ).json()
    assert r["status"] == "ok"
    assert r["updated"] == 1
    q = client.post(
        "/api/query",
        json={"sql": "SELECT telefono FROM persons WHERE slug = 'nuevo-contacto'"},
    ).json()
    assert q["rows"][0]["telefono"] == "+34 600 000 009"

    # DELETE removes the row from the DB.
    r = client.request("DELETE", "/api/person/nuevo-contacto").json()
    assert r["status"] == "ok"
    q = client.post(
        "/api/query",
        json={"sql": "SELECT COUNT(*) AS n FROM persons WHERE slug = 'nuevo-contacto'"},
    ).json()
    assert q["rows"][0]["n"] == 0
def test_api_person_render_preserva_prosa(client, cfg):
    """POST /api/person/{slug}/render rewrites frontmatter WITHOUT touching prose.

    Ana's fixture note has the body "## Notas / Ficha de prueba.". After her
    phone is changed through the API and the note is re-rendered, the new
    phone must appear in the file and the prose must still be there.
    """
    client.post("/api/ingest/vault")

    note_file = os.path.join(cfg.vault_dir, "personas", "ana-garcia-perez.md")
    # Context managers close the handles deterministically (no ResourceWarning).
    with open(note_file, encoding="utf-8") as fh:
        before = fh.read()
    assert "Ficha de prueba." in before

    # Change the phone through the API, then re-materialize the note.
    put = client.put(
        "/api/person/ana-garcia-perez",
        json={
            "slug": "ana-garcia-perez",
            "nombre": "Ana García Pérez",
            "telefonos": ["+34 622 333 444"],
        },
    ).json()
    # Fail here, with a clear cause, if the update itself was rejected, rather
    # than later as an unexplained content mismatch.
    assert put["status"] == "ok"

    r = client.post("/api/person/ana-garcia-perez/render").json()
    assert r["status"] == "ok"

    with open(note_file, encoding="utf-8") as fh:
        after = fh.read()
    assert "Ficha de prueba." in after  # prose preserved
    assert "+34 622 333 444" in after  # frontmatter updated