a802f59f55
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
123 lines
4.3 KiB
Python
123 lines
4.3 KiB
Python
"""vault_dedupe_report — Detect duplicates in vault_index.db and compute reclaimable space."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
|
|
def vault_dedupe_report(
    vault_path: str,
    min_size: int = 0,
    db_path: str | None = None,
) -> dict:
    """Report duplicate files in a vault based on its ``vault_index.db``.

    Reads the ``files`` table (columns ``rel_path``, ``sha256``, ``size``),
    groups the rows by ``sha256`` and returns every group holding more than
    one file, ordered by reclaimable bytes from largest to smallest. Groups
    with the same reclaimable bytes are ordered by ``sha256`` so the output
    is deterministic.

    Args:
        vault_path: Root path of the vault. Echoed back in the result and
            used to locate ``vault_index.db`` when ``db_path`` is None.
        min_size: Rows whose ``size`` (bytes) is below this value are
            ignored. The default 0 includes every file.
        db_path: Absolute or relative path to the SQLite database. When
            None, ``<vault_path>/vault_index.db`` is used.

    Returns:
        A dict with the keys:
            - ``vault_path``: the value received.
            - ``groups``: list of dicts, each with:
                - ``sha256``: the shared hash.
                - ``size``: size in bytes of each copy.
                - ``count``: number of copies found.
                - ``files``: rel_paths sorted lexicographically.
                - ``reclaimable_bytes``: ``size * (count - 1)``.
            - ``total_groups``: number of duplicate groups.
            - ``total_duplicates``: sum of ``(count - 1)`` over all groups.
            - ``total_reclaimable_bytes``: total bytes that can be reclaimed.
            - ``scanned_files``: number of rows considered (non-empty
              ``sha256`` and ``size >= min_size``).

    Raises:
        RuntimeError: If the database does not exist, has no ``files``
            table, or cannot be opened or read.
    """
    resolved_db = db_path if db_path is not None else str(Path(vault_path) / "vault_index.db")

    db_file = Path(resolved_db)
    if not db_file.exists():
        raise RuntimeError(
            f"No se encontro vault_index.db en '{resolved_db}'. "
            "Corre 'fn vault index <name>' primero."
        )

    # Build the URI with Path.as_uri() so characters that are special in a
    # URI ('#', '?', '%', spaces) are percent-encoded instead of being
    # interpreted by SQLite as a fragment or query separator.
    db_uri = f"{db_file.resolve().as_uri()}?mode=ro"

    try:
        conn = sqlite3.connect(db_uri, uri=True)
    except sqlite3.Error as exc:
        raise RuntimeError(f"No se pudo abrir '{resolved_db}': {exc}") from exc

    try:
        # Make sure the files table exists before querying it.
        table_row = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='files';"
        ).fetchone()
        if table_row is None:
            raise RuntimeError(
                "vault_index.db sin tabla 'files'. "
                "Corre 'fn vault index <name>' primero."
            )

        # Total rows considered (non-empty sha256, size >= min_size).
        count_row = conn.execute(
            "SELECT COUNT(*) FROM files WHERE size >= ? AND sha256 != '';",
            (min_size,),
        ).fetchone()
        scanned_files: int = count_row[0] if count_row else 0

        # Fetch one row per duplicated file rather than GROUP_CONCAT-ing the
        # paths: joining with ',' and splitting afterwards corrupts any
        # rel_path that itself contains a comma.
        query = """
            SELECT f.sha256, f.size, f.rel_path
            FROM files AS f
            WHERE f.size >= ? AND f.sha256 != ''
              AND f.sha256 IN (
                  SELECT sha256
                  FROM files
                  WHERE size >= ? AND sha256 != ''
                  GROUP BY sha256
                  HAVING COUNT(*) > 1
              )
            ORDER BY f.sha256, f.rel_path;
        """
        rows = conn.execute(query, (min_size, min_size)).fetchall()
    except sqlite3.Error as exc:
        # sqlite3.connect() is lazy: a corrupt or non-database file only
        # fails on the first query, so read errors are wrapped here.
        raise RuntimeError(f"No se pudo leer '{resolved_db}': {exc}") from exc
    finally:
        conn.close()

    # Group rows by hash. Dict insertion order follows the SQL ORDER BY, so
    # the groups start out sorted by sha256.
    by_hash: dict = {}
    for sha256, size, rel_path in rows:
        entry = by_hash.setdefault(sha256, {"size": size, "count": 0, "files": []})
        entry["count"] += 1
        # A NULL rel_path still counts as a copy but cannot be listed.
        if rel_path is not None:
            entry["files"].append(rel_path)

    groups: list[dict] = [
        {
            "sha256": sha256,
            "size": entry["size"],
            "count": entry["count"],
            "files": sorted(entry["files"]),
            "reclaimable_bytes": entry["size"] * (entry["count"] - 1),
        }
        for sha256, entry in by_hash.items()
    ]
    # The sort is stable (also with reverse=True), so ties keep sha256 order.
    groups.sort(key=lambda group: group["reclaimable_bytes"], reverse=True)

    return {
        "vault_path": vault_path,
        "groups": groups,
        "total_groups": len(groups),
        "total_duplicates": sum(group["count"] - 1 for group in groups),
        "total_reclaimable_bytes": sum(group["reclaimable_bytes"] for group in groups),
        "scanned_files": scanned_files,
    }
|