Files
osint/tools/dedup_persons.py
T
egutierrez f771c9b883 chore: auto-commit (6 archivos)
- CONVENTIONS.md
- tools/dedup_persons.py
- tools/extract_entities.py
- tools/migrate_external_orgs.py
- tools/normalize_person_frontmatter.py
- tools/person_datapoints.py

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-11 00:16:47 +02:00

81 lines
3.1 KiB
Python

#!/usr/bin/env python3
"""Deduplica fichas de persona en osint cuando el slug de una es subconjunto estricto de
tokens de otra (p.ej. manuel-torrubia <= simon-manuel-torrubia = misma persona, nombre largo).
Fusiona la corta en la larga (canonica = nombre mas completo): campos no-null, aliases, docs,
attachments y body. Borra la duplicada. Con --apply ejecuta; sin flag solo lista candidatos.
"""
import sys, os, glob, shutil
# Hard-coded, machine-specific path prepended to the import search path so the
# `obsidian` import below can be resolved (presumably the module lives there —
# TODO confirm and consider making this configurable).
sys.path.insert(0, "/home/enmanuel/fn_registry/python/functions")
from obsidian import read_obsidian_note, create_obsidian_note, delete_obsidian_note
# Root directory of the notes tree. This script reads person notes from
# `personas/*.md` and moves files under `personas/<slug>/` and
# `attachments/personas/<slug>/`, all relative to this root.
OSINT = "/home/enmanuel/Obsidian/osint"
def load():
    """Read every person note under ``{OSINT}/personas`` into a dict keyed by slug.

    The slug is the file name without its extension. Files whose name starts
    with an underscore are skipped. Each value is whatever
    ``read_obsidian_note`` returns for that file; the rest of this script reads
    its ``"frontmatter"`` and ``"body"`` entries.

    Returns:
        dict mapping slug -> parsed note.
    """
    notes = {}
    for path in glob.glob(f"{OSINT}/personas/*.md"):
        slug, _ext = os.path.splitext(os.path.basename(path))
        # Underscore-prefixed files are not treated as person notes.
        if not slug.startswith("_"):
            notes[slug] = read_obsidian_note(path)
    return notes
def main():
    """Find person notes whose slug is a token-subset of a longer slug and merge them.

    A slug ``a`` is a merge candidate for slug ``b`` when the set of
    dash-separated tokens of ``a`` is a strict subset of the tokens of ``b``
    and ``a`` has at least two distinct tokens (so a lone first name never
    matches). Each short slug is merged into its superset with the most
    tokens, which is treated as the canonical note.

    Without ``--apply`` in ``sys.argv`` only the candidate list is printed.
    With ``--apply`` every short note is merged into its target (non-empty
    frontmatter fields, aliases, body, documents and attachments) and the
    short note is deleted.
    """
    apply = "--apply" in sys.argv
    fichas = load()
    best = _pick_targets(fichas)

    print(f"candidatos a fusion: {len(best)}")
    for a, b in best.items():
        print(f" {a} -> {b}")
    if not apply:
        print("\n(dry-run; usa --apply)")
        return

    for a, b in best.items():
        # `fichas` is kept in sync below, so a slug that was already merged
        # away is skipped here instead of being resurrected from stale data.
        if a not in fichas or b not in fichas:
            continue
        fa, fb = fichas[a], fichas[b]
        new, body = _merge_notes(a, fa, fb)

        # Move documents and attachments of the short note to the canonical one.
        _move_children(f"{OSINT}/personas/{a}", f"{OSINT}/personas/{b}", a)
        _move_children(
            f"{OSINT}/attachments/personas/{a}",
            f"{OSINT}/attachments/personas/{b}",
            a,
        )

        create_obsidian_note(OSINT, f"personas/{b}", body=body, frontmatter=new, overwrite=True)
        delete_obsidian_note(f"{OSINT}/personas/{a}.md")

        # Several short slugs can share one canonical note. Without refreshing
        # the in-memory copy, the next merge into `b` would start from the
        # pre-merge content and overwrite what was just written.
        fichas[b] = {**fb, "frontmatter": new, "body": body}
        del fichas[a]

        for empty in (f"{OSINT}/personas/{a}", f"{OSINT}/attachments/personas/{a}"):
            if os.path.isdir(empty) and not os.listdir(empty):
                os.rmdir(empty)
        print(f" fusionado {a} -> {b}")


def _pick_targets(slugs):
    """Map each short slug to the superset slug it should be merged into."""
    tokens = {slug: set(slug.split("-")) for slug in slugs}
    best = {}
    # Sorted iteration makes the candidate list reproducible between runs.
    for a in sorted(tokens):
        ta = tokens[a]
        # Require at least two tokens in the short slug: avoids lone first names.
        if len(ta) < 2:
            continue
        # Strict subset, so a slug is never paired with itself.
        supersets = [b for b, tb in tokens.items() if ta < tb]
        if not supersets:
            continue
        # Rank by token count first: the winner then has no superset of its
        # own, so a canonical note is never itself merged away. Slug length
        # and the slug text only break ties deterministically.
        # NOTE(review): unrelated supersets (e.g. "x-a-b" and "a-b-y") may be
        # different people; the choice between them is only a heuristic.
        best[a] = max(supersets, key=lambda b: (len(tokens[b]), len(b), b))
    return best


def _as_list(value):
    """Normalize a frontmatter value to a list (None/"" -> [], scalar -> [scalar])."""
    if value is None or value == "":
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]


def _merge_notes(a, fa, fb):
    """Return (frontmatter, body) for canonical note `fb` after absorbing `fa` (slug `a`)."""
    src_fm = fa["frontmatter"] or {}
    dst_fm = fb["frontmatter"] or {}
    new = dict(dst_fm)
    # Fill only the fields the canonical note leaves empty.
    for k, v in src_fm.items():
        if v not in (None, "", []) and new.get(k) in (None, "", []):
            new[k] = v
    # Union of both alias lists plus the short note's name, so aliases of the
    # short note survive even when the canonical note already has some.
    aliases = set(_as_list(dst_fm.get("aliases")))
    aliases.update(_as_list(src_fm.get("aliases")))
    aliases.add(src_fm.get("nombre"))
    new["aliases"] = sorted(x for x in aliases if x)
    body = (
        (fb["body"] or "").rstrip()
        + f"\n\n<!-- fusionado desde {a} -->\n"
        + (fa["body"] or "").strip()
    )
    return new, body


def _move_children(src_dir, dst_dir, tag):
    """Move every entry matched by ``src_dir/*`` into `dst_dir` without overwriting."""
    for src in sorted(glob.glob(f"{src_dir}/*")):
        os.makedirs(dst_dir, exist_ok=True)
        name = os.path.basename(src)
        dst = f"{dst_dir}/{name}"
        if os.path.lexists(dst):
            # shutil.move may overwrite an existing file, or nest the source
            # inside an existing directory; pick an unused name instead.
            stem, ext = os.path.splitext(name)
            dst = f"{dst_dir}/{stem}--{tag}{ext}"
            n = 2
            while os.path.lexists(dst):
                dst = f"{dst_dir}/{stem}--{tag}-{n}{ext}"
                n += 1
            print(f" aviso: {name} ya existe en destino; movido como {os.path.basename(dst)}")
        shutil.move(src, dst)
# Run the deduplication only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()