f771c9b883
- CONVENTIONS.md - tools/dedup_persons.py - tools/extract_entities.py - tools/migrate_external_orgs.py - tools/normalize_person_frontmatter.py - tools/person_datapoints.py Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
81 lines
3.1 KiB
Python
81 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Deduplica fichas de persona en osint cuando el slug de una es subconjunto estricto de
|
|
tokens de otra (p.ej. manuel-torrubia <= simon-manuel-torrubia = misma persona, nombre largo).
|
|
|
|
Fusiona la corta en la larga (canonica = nombre mas completo): campos no-null, aliases, docs,
|
|
attachments y body. Borra la duplicada. Con --apply ejecuta; sin flag solo lista candidatos.
|
|
"""
|
|
import sys, os, glob, shutil
|
|
|
|
# Put the helper directory first on the import path so the `obsidian` module
# imported just below can be resolved (presumably it lives in this directory;
# note the path is absolute and machine-specific).
sys.path.insert(0, "/home/enmanuel/fn_registry/python/functions")
|
|
from obsidian import read_obsidian_note, create_obsidian_note, delete_obsidian_note
|
|
|
|
# Root directory of the OSINT notes. This script reads person notes from
# `personas/*.md` beneath it and moves per-person files found under
# `personas/<slug>/` and `attachments/personas/<slug>/`.
OSINT = "/home/enmanuel/Obsidian/osint"
|
|
|
|
|
|
def load():
    """Read every person note under ``OSINT/personas`` into a dict.

    Returns:
        A dict mapping each note's slug (the file name without its ``.md``
        extension) to whatever ``read_obsidian_note`` returns for that file.
        Files whose name starts with ``_`` are left out.
    """
    paths_by_slug = {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in glob.glob(f"{OSINT}/personas/*.md")
    }
    return {
        slug: read_obsidian_note(path)
        for slug, path in paths_by_slug.items()
        if not slug.startswith("_")
    }
|
|
|
|
|
|
def _select_merge_targets(slugs):
    """Map each duplicate slug to the canonical slug it should be merged into.

    A slug is a duplicate when its set of ``-``-separated tokens is a strict
    subset of another slug's tokens and it has at least two tokens (so a lone
    first name never matches). Among several supersets the longest slug wins,
    the first one in ``slugs`` order on ties.

    Chains are collapsed: if the chosen target is itself a duplicate of a
    longer slug, the final target is used, so no note is ever merged into one
    that is about to be deleted.
    """
    # Build each token set once instead of once per compared pair.
    tokens = {slug: set(slug.split("-")) for slug in slugs}

    best = {}
    for short in slugs:
        short_tokens = tokens[short]
        if len(short_tokens) < 2:
            continue
        supersets = [s for s in slugs if short_tokens < tokens[s]]
        if supersets:
            # max() returns the first maximal element, matching a
            # "replace only when strictly longer" scan.
            best[short] = max(supersets, key=len)

    resolved = {}
    for short, target in best.items():
        # Strict-subset is a strict partial order, so this walk terminates.
        while target in best:
            target = best[target]
        resolved[short] = target
    return resolved


def _as_list(value):
    """Return ``value`` as a list: empty -> [], scalar -> [value]."""
    if not value:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]


def _merge_frontmatter(canonical, duplicate):
    """Return the canonical frontmatter enriched with the duplicate's data.

    Fields that are empty (``None``, ``""`` or ``[]``) in the canonical note
    are filled from the duplicate. ``aliases`` becomes the sorted union of
    both notes' aliases plus the duplicate's ``nombre``.
    """
    empty = (None, "", [])
    canonical = canonical or {}
    duplicate = duplicate or {}

    merged = dict(canonical)
    for key, value in duplicate.items():
        if value not in empty and merged.get(key) in empty:
            merged[key] = value

    # Union the aliases of both notes; previously the duplicate's aliases
    # were lost whenever the canonical note already had some.
    aliases = set(_as_list(canonical.get("aliases")))
    aliases.update(_as_list(duplicate.get("aliases")))
    aliases.add(duplicate.get("nombre"))
    merged["aliases"] = sorted(x for x in aliases if x)
    return merged


def _move_children(src_dir, dst_dir):
    """Move every entry of ``src_dir`` into ``dst_dir`` without overwriting.

    ``dst_dir`` is created on demand. An entry whose name already exists in
    ``dst_dir`` is left where it is and reported, because ``shutil.move``
    would otherwise silently replace a file or nest a directory inside the
    existing one.
    """
    for entry in glob.glob(f"{src_dir}/*"):
        os.makedirs(dst_dir, exist_ok=True)
        dest = f"{dst_dir}/{os.path.basename(entry)}"
        if os.path.lexists(dest):
            print(f" aviso: {dest} ya existe; se conserva {entry}")
            continue
        shutil.move(entry, dest)


def main():
    """List duplicate person notes and, with ``--apply``, merge them.

    Without ``--apply`` in ``sys.argv`` only the candidate pairs are printed.
    With it, each duplicate is merged into its canonical note (frontmatter,
    aliases, body, per-person files and attachments), the duplicate note is
    deleted and its now-empty directories are removed.
    """
    apply = "--apply" in sys.argv
    fichas = load()
    best = _select_merge_targets(list(fichas))

    print(f"candidatos a fusion: {len(best)}")
    for a, b in best.items():
        print(f" {a} -> {b}")
    if not apply or not best:
        if not apply:
            print("\n(dry-run; usa --apply)")
        return

    for a, b in best.items():
        if a not in fichas or b not in fichas:
            continue
        fa, fb = fichas[a], fichas[b]
        new = _merge_frontmatter(fb["frontmatter"], fa["frontmatter"])
        body = (
            (fb["body"] or "").rstrip()
            + f"\n\n<!-- fusionado desde {a} -->\n"
            + (fa["body"] or "").strip()
        )

        # Move per-person files and attachments from the duplicate to the
        # canonical note's directories.
        _move_children(f"{OSINT}/personas/{a}", f"{OSINT}/personas/{b}")
        _move_children(
            f"{OSINT}/attachments/personas/{a}",
            f"{OSINT}/attachments/personas/{b}",
        )

        create_obsidian_note(OSINT, f"personas/{b}", body=body, frontmatter=new, overwrite=True)
        delete_obsidian_note(f"{OSINT}/personas/{a}.md")

        # Keep the in-memory view in sync with disk: a later duplicate merged
        # into the same canonical note must build on this result instead of
        # overwriting it with the stale original.
        fichas[b] = {"frontmatter": new, "body": body}
        del fichas[a]

        for empty in (f"{OSINT}/personas/{a}", f"{OSINT}/attachments/personas/{a}"):
            if os.path.isdir(empty) and not os.listdir(empty):
                os.rmdir(empty)
        print(f" fusionado {a} -> {b}")
|
|
|
|
|
|
# Script entry point: run the deduplication only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|