Files
fn_registry/python/functions/infra/vault_dedupe_report.py
T
egutierrez e3c8979e8d chore: auto-commit (95 archivos)
- cmd/fn/doctor.go
- cmd/fn/main.go
- cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt
- cpp/apps/primitives_gallery/playground/tables/data_table.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.h
- cpp/apps/primitives_gallery/playground/tables/self_test.cpp
- cpp/apps/primitives_gallery/playground/tables/tql.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.h
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 00:50:34 +02:00

123 lines
4.3 KiB
Python

"""vault_dedupe_report — Detecta duplicados en vault_index.db y calcula espacio recuperable."""
from __future__ import annotations
import sqlite3
from pathlib import Path
def vault_dedupe_report(
    vault_path: str,
    min_size: int = 0,
    db_path: str | None = None,
) -> dict:
    """Report duplicate files recorded in a vault's ``vault_index.db``.

    Reads the ``files`` table (columns ``sha256``, ``size``, ``rel_path``),
    groups rows by ``sha256`` and returns every group holding more than one
    file, ordered by reclaimable bytes from largest to smallest.

    Args:
        vault_path: Root path of the vault. Echoed back in the result and
            used to locate ``vault_index.db`` when ``db_path`` is None.
        min_size: Rows whose ``size`` (bytes) is below this value are
            ignored. The default of 0 keeps every row.
        db_path: Absolute or relative path to the SQLite database. When
            None, ``<vault_path>/vault_index.db`` is used.

    Returns:
        dict with the keys:
            - ``vault_path``: str, the value received.
            - ``groups``: list of dicts, each with:
                - ``sha256``: str
                - ``size``: int, size in bytes of each copy
                - ``count``: int, number of copies found
                - ``files``: list[str], rel_paths sorted lexicographically
                - ``reclaimable_bytes``: int, ``size * (count - 1)``
            - ``total_groups``: int, number of duplicate groups
            - ``total_duplicates``: int, sum of ``(count - 1)`` over groups
            - ``total_reclaimable_bytes``: int, total reclaimable bytes
            - ``scanned_files``: int, rows that passed the filters
              (``size >= min_size`` and non-empty ``sha256``)

    Raises:
        RuntimeError: If the database file does not exist, cannot be
            opened, has no ``files`` table, or a query fails.
    """
    # str() keeps the function working when a caller passes a Path object.
    resolved_db = (
        str(db_path) if db_path is not None else str(Path(vault_path) / "vault_index.db")
    )
    if not Path(resolved_db).exists():
        raise RuntimeError(
            f"No se encontro vault_index.db en '{resolved_db}'. "
            "Corre 'fn vault index <name>' primero."
        )

    # The path is embedded in a SQLite URI so the DB can be opened read-only.
    # '%', '?' and '#' are significant in URIs and must be escaped, otherwise
    # a path containing them is truncated or decoded into a different path.
    uri_path = resolved_db.replace("%", "%25").replace("?", "%3f").replace("#", "%23")
    try:
        conn = sqlite3.connect(f"file:{uri_path}?mode=ro", uri=True)
    except sqlite3.Error as exc:
        raise RuntimeError(f"No se pudo abrir '{resolved_db}': {exc}") from exc

    try:
        # Make sure the expected table exists before querying it.
        has_table = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='files';"
        ).fetchone()
        if has_table is None:
            raise RuntimeError(
                "vault_index.db sin tabla 'files'. "
                "Corre 'fn vault index <name>' primero."
            )

        # Rows taken into account: non-empty sha256 and size >= min_size.
        row = conn.execute(
            "SELECT COUNT(*) FROM files WHERE size >= ? AND sha256 != '';",
            (min_size,),
        ).fetchone()
        scanned_files: int = row[0] if row else 0

        # Fetch one row per duplicated file instead of GROUP_CONCAT-ing the
        # paths: a joined string cannot be split back safely when a rel_path
        # itself contains the separator (e.g. a comma).
        rows = conn.execute(
            """
            SELECT sha256, size, rel_path
            FROM files
            WHERE size >= ? AND sha256 != ''
              AND sha256 IN (
                  SELECT sha256
                  FROM files
                  WHERE size >= ? AND sha256 != ''
                  GROUP BY sha256
                  HAVING COUNT(*) > 1
              )
            ORDER BY sha256, rel_path;
            """,
            (min_size, min_size),
        ).fetchall()
    except sqlite3.Error as exc:
        # Query-time failures (e.g. the file is not a SQLite database) are
        # surfaced as RuntimeError, as the Raises section promises.
        raise RuntimeError(f"No se pudo leer '{resolved_db}': {exc}") from exc
    finally:
        conn.close()

    # Group the per-file rows by hash; dicts preserve insertion order.
    by_hash: dict = {}
    for sha256, size, rel_path in rows:
        entry = by_hash.get(sha256)
        if entry is None:
            entry = by_hash[sha256] = {
                "sha256": sha256,
                "size": size,
                "count": 0,
                "files": [],
                "reclaimable_bytes": 0,
            }
        entry["count"] += 1
        if rel_path is not None:
            entry["files"].append(rel_path)

    groups: list[dict] = list(by_hash.values())
    for entry in groups:
        entry["files"].sort()
        entry["reclaimable_bytes"] = entry["size"] * (entry["count"] - 1)

    # Largest saving first; ties are broken by sha256 so output is stable.
    groups.sort(key=lambda g: (-g["reclaimable_bytes"], g["sha256"]))

    return {
        "vault_path": vault_path,
        "groups": groups,
        "total_groups": len(groups),
        "total_duplicates": sum(g["count"] - 1 for g in groups),
        "total_reclaimable_bytes": sum(g["reclaimable_bytes"] for g in groups),
        "scanned_files": scanned_files,
    }