"""vault_dedupe_report — Detect duplicates in vault_index.db and compute reclaimable space."""

from __future__ import annotations

import sqlite3
from pathlib import Path


def _connect_read_only(db_file: Path, shown_path: str) -> sqlite3.Connection:
    """Open ``db_file`` in read-only mode; raise RuntimeError if SQLite refuses."""
    # as_uri() percent-encodes characters such as '?', '#' and '%', which would
    # otherwise be parsed as URI syntax and make SQLite open the wrong file.
    uri = f"{db_file.resolve().as_uri()}?mode=ro"
    try:
        return sqlite3.connect(uri, uri=True)
    except sqlite3.Error as exc:
        raise RuntimeError(f"No se pudo abrir '{shown_path}': {exc}") from exc


def _read_index(conn: sqlite3.Connection, min_size: int) -> tuple[int, list[tuple]]:
    """Return ``(scanned_files, rows)``; rows are ``(sha256, size, rel_path)`` of duplicated files."""
    table = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='files';"
    ).fetchone()
    if table is None:
        raise RuntimeError(
            "vault_index.db sin tabla 'files'. "
            "Corre 'fn vault index ' primero."
        )

    params = {"min_size": min_size}

    # Rows considered by the report: non-empty hash and size >= min_size.
    # (A NULL sha256 is excluded as well, because NULL != '' is not true.)
    counted = conn.execute(
        "SELECT COUNT(*) FROM files WHERE size >= :min_size AND sha256 != '';",
        params,
    ).fetchone()
    scanned_files = counted[0] if counted else 0

    # One row per file rather than GROUP_CONCAT: concatenating paths with ','
    # and splitting afterwards corrupts any path that itself contains a comma.
    rows = conn.execute(
        """
        SELECT f.sha256, f.size, f.rel_path
        FROM files AS f
        WHERE f.size >= :min_size
          AND f.sha256 != ''
          AND f.sha256 IN (
              SELECT sha256
              FROM files
              WHERE size >= :min_size AND sha256 != ''
              GROUP BY sha256
              HAVING COUNT(*) > 1
          );
        """,
        params,
    ).fetchall()
    return scanned_files, rows


def _build_groups(rows: list[tuple]) -> list[dict]:
    """Fold per-file rows into one dict per hash, largest reclaimable first."""
    by_hash: dict = {}
    for sha256, size, rel_path in rows:
        group = by_hash.get(sha256)
        if group is None:
            # ``size`` comes from the first row seen for this hash; copies that
            # share a hash are expected to share a size as well.
            group = by_hash[sha256] = {
                "sha256": sha256,
                "size": size,
                "count": 0,
                "files": [],
            }
        group["count"] += 1
        if rel_path is not None:
            group["files"].append(rel_path)

    groups = list(by_hash.values())
    for group in groups:
        group["files"].sort()
        group["reclaimable_bytes"] = group["size"] * (group["count"] - 1)

    # Ties on reclaimable bytes are broken by hash so the output is stable.
    groups.sort(key=lambda g: (-g["reclaimable_bytes"], g["sha256"]))
    return groups


def vault_dedupe_report(
    vault_path: str,
    min_size: int = 0,
    db_path: str | None = None,
) -> dict:
    """Detect duplicate files in a vault from its ``vault_index.db``.

    Reads the ``files`` table of the SQLite index, groups rows by ``sha256``
    and returns every group holding more than one file, ordered by
    reclaimable bytes from largest to smallest (ties broken by ``sha256``).

    Args:
        vault_path: Root path of the vault. Echoed back in the result and used
            to locate ``vault_index.db`` when ``db_path`` is None.
        min_size: Rows whose ``size`` (bytes) is below this value are ignored.
            The default 0 keeps every file.
        db_path: Absolute or relative path to the SQLite database. If None,
            ``<vault_path>/vault_index.db`` is used.

    Returns:
        dict with the keys:
          - ``vault_path``: the value received.
          - ``groups``: list of dicts, each with:
              - ``sha256``: hash shared by the group
              - ``size``: size in bytes of each copy
              - ``count``: number of rows (copies) found
              - ``files``: rel_paths sorted lexicographically
              - ``reclaimable_bytes``: ``size * (count - 1)``
          - ``total_groups``: number of groups with duplicates
          - ``total_duplicates``: sum of ``(count - 1)`` over all groups
          - ``total_reclaimable_bytes``: total bytes that could be reclaimed
          - ``scanned_files``: number of rows that passed the filters

    Raises:
        RuntimeError: If the database does not exist, cannot be opened, has no
            ``files`` table, or any SQLite error occurs while reading it.
    """
    resolved_db = (
        db_path if db_path is not None else str(Path(vault_path) / "vault_index.db")
    )
    db_file = Path(resolved_db)

    if not db_file.exists():
        raise RuntimeError(
            f"No se encontro vault_index.db en '{resolved_db}'. "
            "Corre 'fn vault index ' primero."
        )

    conn = _connect_read_only(db_file, resolved_db)
    try:
        scanned_files, rows = _read_index(conn, min_size)
    except sqlite3.Error as exc:
        # sqlite3.connect() is lazy: a corrupt or non-database file only fails
        # on the first query, so read errors must be translated here too.
        raise RuntimeError(f"Error leyendo '{resolved_db}': {exc}") from exc
    finally:
        conn.close()

    groups = _build_groups(rows)

    return {
        "vault_path": vault_path,
        "groups": groups,
        "total_groups": len(groups),
        "total_duplicates": sum(g["count"] - 1 for g in groups),
        "total_reclaimable_bytes": sum(g["reclaimable_bytes"] for g in groups),
        "scanned_files": scanned_files,
    }