Files
osint/tools/person_datapoints.py
egutierrez f771c9b883 chore: auto-commit (6 archivos)
- CONVENTIONS.md
- tools/dedup_persons.py
- tools/extract_entities.py
- tools/migrate_external_orgs.py
- tools/normalize_person_frontmatter.py
- tools/person_datapoints.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-11 00:16:47 +02:00

63 lines
2.7 KiB
Python

#!/usr/bin/env python3
"""Reporte de datapoints y score de fiabilidad/completitud por persona en osint.
Para cada ficha personas/<slug>.md calcula:
- score de completitud: campos de identidad presentes / 7 * 100
(sexo, fecha_nacimiento, dni, telefono, email, direccion, pais)
- datapoints totales: campos de identidad presentes + nº documentos + nº attachments + relaciones
- campos faltantes (cuando el score < 100%)
Salida: tabla ordenada por score asc (las menos fiables primero) + totales globales.
Con --json imprime el detalle como JSON. Read-only.
"""
import sys, os, glob, json
sys.path.insert(0, "/home/enmanuel/fn_registry/python/functions")
from obsidian import read_obsidian_note
# Base directory of the OSINT notes; the report looks for person notes under
# personas/ and for their files under attachments/personas/ inside it.
OSINT = "/home/enmanuel/Obsidian/osint"

# Frontmatter identity fields checked per person. The completeness score is
# the share of these fields that hold a non-empty value.
IDENT = [
    "sexo",
    "fecha_nacimiento",
    "dni",
    "telefono",
    "email",
    "direccion",
    "pais",
]
def main() -> None:
    """Print a completeness/datapoint report for every person note.

    Scans ``{OSINT}/personas/*.md`` (notes whose file name starts with ``_``
    are skipped) and derives, for each note:

    - ``score``: percentage of the ``IDENT`` fields holding a non-empty value
      in the note's frontmatter, rounded to an integer.
    - ``datapoints``: populated identity fields + ``*.md`` files found in
      ``personas/<slug>/`` + entries found in ``attachments/personas/<slug>/``
      + length of the ``relaciones`` frontmatter value.
    - ``faltan``: the ``IDENT`` fields that are absent or empty.

    Rows are ordered by ascending score, ties broken by descending datapoint
    count. When ``--json`` is present in ``sys.argv`` the detail is printed as
    JSON; otherwise a fixed-width text table with summary lines is printed.
    This function itself writes nothing to disk; all output goes to stdout.
    """
    as_json = "--json" in sys.argv
    rows = []
    tot_dp = 0
    for fp in sorted(glob.glob(f"{OSINT}/personas/*.md")):
        slug = os.path.splitext(os.path.basename(fp))[0]
        if slug.startswith("_"):
            continue
        # NOTE(review): assumes read_obsidian_note returns a mapping with a
        # "frontmatter" entry; `or {}` keeps a note whose frontmatter is
        # falsy (e.g. None) in the report with score 0 instead of crashing
        # on `.get` — confirm against the helper.
        fm = read_obsidian_note(fp)["frontmatter"] or {}
        present = [k for k in IDENT if fm.get(k) not in (None, "", [])]
        missing = [k for k in IDENT if k not in present]
        score = round(len(present) / len(IDENT) * 100)
        # The slug comes from a file name and may contain glob
        # metacharacters ([, ], *, ?); escape it so the per-person patterns
        # match that exact directory rather than a character class/wildcard.
        safe_slug = glob.escape(slug)
        ndocs = len(glob.glob(f"{OSINT}/personas/{safe_slug}/*.md"))
        natt = len(glob.glob(f"{OSINT}/attachments/personas/{safe_slug}/*"))
        nrel = len(fm.get("relaciones") or [])
        dp = len(present) + ndocs + natt + nrel
        tot_dp += dp
        rows.append({"slug": slug, "score": score, "datapoints": dp,
                     "ident": len(present), "docs": ndocs, "attachments": natt,
                     "relaciones": nrel, "faltan": missing})
    # Least complete first; among equal scores, the richest notes first.
    rows.sort(key=lambda r: (r["score"], -r["datapoints"]))
    if as_json:
        print(json.dumps({"total_datapoints": tot_dp, "personas": rows}, ensure_ascii=False, indent=2))
        return
    total = len(rows)
    # With no person notes the averages are undefined; report 0 instead of
    # raising ZeroDivisionError.
    avg_score = round(sum(r["score"] for r in rows) / total) if total else 0
    print(f"PERSONAS: {total} | datapoints totales: {tot_dp} | "
          f"score medio: {avg_score}%\n")
    print(f"{'persona':38} {'score':>5} {'dp':>4} {'id':>3} {'doc':>4} {'att':>4} {'rel':>4} faltan")
    print("-" * 100)
    for r in rows:
        # Only incomplete notes get the trailing list of missing fields.
        flag = "" if r["score"] == 100 else " <-- " + ",".join(r["faltan"])
        print(f"{r['slug']:38} {r['score']:>4}% {r['datapoints']:>4} {r['ident']:>3} "
              f"{r['docs']:>4} {r['attachments']:>4} {r['relaciones']:>4}{flag}")
    bajo = [r for r in rows if r["score"] < 100]
    pct_bajo = round(len(bajo) / total * 100) if total else 0
    print(f"\nfichas por debajo del 100%: {len(bajo)}/{total} "
          f"({pct_bajo}%)")
# Script entry point: build and print the report only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()