chore: auto-commit (95 archivos)
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,154 @@
|
||||
"""Tests para vault_dedupe_report."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
from vault_dedupe_report import vault_dedupe_report
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_db(tmp_path: Path, rows: list[tuple]) -> Path:
|
||||
"""Crea vault_index.db con la tabla files y las filas dadas.
|
||||
|
||||
rows: lista de (rel_path, size, sha256)
|
||||
"""
|
||||
db_path = tmp_path / "vault_index.db"
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE files (
|
||||
rel_path TEXT PRIMARY KEY,
|
||||
size INTEGER,
|
||||
mtime REAL,
|
||||
sha256 TEXT,
|
||||
mime TEXT,
|
||||
ext TEXT,
|
||||
bucket TEXT,
|
||||
sub_bucket TEXT,
|
||||
indexed_at REAL
|
||||
);
|
||||
"""
|
||||
)
|
||||
conn.executemany(
|
||||
"INSERT INTO files (rel_path, size, sha256) VALUES (?, ?, ?);",
|
||||
rows,
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
return db_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_no_duplicates(tmp_path):
    """Three files with distinct sha256 values produce no duplicate groups."""
    db_file = _make_db(
        tmp_path,
        [
            ("a/file1.txt", 100, "aaa111"),
            ("a/file2.txt", 200, "bbb222"),
            ("a/file3.txt", 300, "ccc333"),
        ],
    )

    report = vault_dedupe_report(str(tmp_path), db_path=str(db_file))

    expected = {
        "groups": [],
        "total_groups": 0,
        "total_duplicates": 0,
        "total_reclaimable_bytes": 0,
        "scanned_files": 3,
        "vault_path": str(tmp_path),
    }
    for key, value in expected.items():
        assert report[key] == value, key
|
||||
|
||||
|
||||
def test_basic_duplicates(tmp_path):
    """Two files sharing one sha256 form a single group; one copy is reclaimable."""
    copies = [
        ("data/orig.jpg", 500, "deadbeef"),
        ("backup/orig.jpg", 500, "deadbeef"),
    ]
    db_file = _make_db(tmp_path, copies)

    report = vault_dedupe_report(str(tmp_path), db_path=str(db_file))

    totals = (
        report["total_groups"],
        report["total_duplicates"],
        report["total_reclaimable_bytes"],
    )
    assert totals == (1, 1, 500)

    group = report["groups"][0]
    assert (group["sha256"], group["size"], group["count"]) == ("deadbeef", 500, 2)
    assert group["reclaimable_bytes"] == 500
    assert sorted(group["files"]) == ["backup/orig.jpg", "data/orig.jpg"]
|
||||
|
||||
|
||||
def test_three_in_group(tmp_path):
    """Three files sharing one sha256 give count=3 and two reclaimable copies."""
    size = 1000
    paths = ["a/f1.bin", "b/f2.bin", "c/f3.bin"]
    db_file = _make_db(tmp_path, [(path, size, "cafebabe") for path in paths])

    report = vault_dedupe_report(str(tmp_path), db_path=str(db_file))

    assert report["total_groups"] == 1
    assert report["total_duplicates"] == 2
    assert report["total_reclaimable_bytes"] == size * 2

    group = report["groups"][0]
    assert group["count"] == 3
    assert group["reclaimable_bytes"] == size * 2
    assert group["files"] == sorted(paths)
|
||||
|
||||
|
||||
def test_min_size_filter(tmp_path):
    """Duplicates of 50 bytes are ignored entirely when min_size is 100."""
    db_file = _make_db(
        tmp_path,
        [
            ("x/small1.txt", 50, "tiny123"),
            ("y/small2.txt", 50, "tiny123"),
        ],
    )

    report = vault_dedupe_report(str(tmp_path), min_size=100, db_path=str(db_file))

    assert report["groups"] == []
    for counter in ("total_groups", "total_reclaimable_bytes", "scanned_files"):
        assert report[counter] == 0, counter
|
||||
|
||||
|
||||
def test_multiple_groups_ordered(tmp_path):
    """Two groups with different savings are returned largest saving first."""
    # Group A: 2 copies of 200 bytes -> 200 reclaimable.
    group_a = [("p/a1.dat", 200, "groupA"), ("q/a2.dat", 200, "groupA")]
    # Group B: 3 copies of 500 bytes -> 1000 reclaimable, so it must come first.
    group_b = [
        ("r/b1.dat", 500, "groupB"),
        ("s/b2.dat", 500, "groupB"),
        ("t/b3.dat", 500, "groupB"),
    ]
    unique = [("u/uniq.dat", 999, "unique1")]
    db_file = _make_db(tmp_path, group_a + group_b + unique)

    report = vault_dedupe_report(str(tmp_path), db_path=str(db_file))

    assert report["total_groups"] == 2
    assert report["total_duplicates"] == 3  # (2-1) + (3-1)
    assert report["total_reclaimable_bytes"] == 1200  # 200 + 1000
    assert report["scanned_files"] == 6  # all 6 rows have a non-empty sha256

    first, second = report["groups"][0], report["groups"][1]
    assert first["sha256"] == "groupB"
    assert first["reclaimable_bytes"] == 1000
    assert second["sha256"] == "groupA"
    assert second["reclaimable_bytes"] == 200
|
||||
@@ -0,0 +1,153 @@
|
||||
"""Tests para vault_knowledge_parse."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
from vault_knowledge_parse import vault_knowledge_parse
|
||||
|
||||
|
||||
def _make_vault(tmp: Path) -> tuple[Path, Path]:
|
||||
"""Crea un vault mínimo con vault_index.db."""
|
||||
db = tmp / "vault_index.db"
|
||||
conn = sqlite3.connect(str(db))
|
||||
conn.executescript(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS files (
|
||||
rowid INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
rel_path TEXT UNIQUE NOT NULL,
|
||||
size_bytes INTEGER,
|
||||
ext TEXT
|
||||
);
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts
|
||||
USING fts5(rel_path, content_text, content='', contentless_delete=1);
|
||||
CREATE TABLE IF NOT EXISTS knowledge_docs (
|
||||
rel_path TEXT PRIMARY KEY,
|
||||
title TEXT,
|
||||
frontmatter_json TEXT,
|
||||
headings_json TEXT,
|
||||
parsed_at INTEGER
|
||||
);
|
||||
"""
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
return tmp, db
|
||||
|
||||
|
||||
def _insert_file_entry(db: Path, rel_path: str):
|
||||
conn = sqlite3.connect(str(db))
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO files(rel_path, size_bytes, ext) VALUES (?, 0, '.md')",
|
||||
(rel_path,),
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_md_with_frontmatter(tmp_path):
    """Frontmatter title and fields are parsed and the document is persisted."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/guia.md"
    source = "---\ntitle: Mi Guía\nauthor: Lucas\n---\n\n# Mi Guía\n\nContenido del documento.\n"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(source, encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    assert parsed["title"] == "Mi Guía"
    assert parsed["frontmatter"]["author"] == "Lucas"
    assert "Contenido del documento" in parsed["content_text"]
    assert parsed["persisted"] is True
|
||||
|
||||
|
||||
def test_md_no_frontmatter(tmp_path):
    """Without frontmatter the dict is empty and the title comes from the H1."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/sin_fm.md"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("# Título\n\nCuerpo sin frontmatter.\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    assert parsed["frontmatter"] == {}
    assert parsed["title"] == "Título"
    assert "Cuerpo sin frontmatter" in parsed["content_text"]
|
||||
|
||||
|
||||
def test_md_title_from_h1(tmp_path):
    """The first H1 becomes the title when no frontmatter title exists."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/title_h1.md"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("# Primer H1\n\nAlgún texto.\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    assert parsed["title"] == "Primer H1"
|
||||
|
||||
|
||||
def test_md_title_from_filename(tmp_path):
    """With neither frontmatter nor headings the file stem is the title."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/nombre_archivo.md"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("Solo texto sin headings ni frontmatter.\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    assert parsed["title"] == "nombre_archivo"
|
||||
|
||||
|
||||
def test_md_headings_levels(tmp_path):
    """Headings are returned in document order with their level and text.

    The whole (level, text) sequence is compared at once, so a wrong order
    or a wrong/missing last heading fails the test; a pure membership check
    would let both slip through.
    """
    vault, db = _make_vault(tmp_path)
    rel = "docs/headings.md"
    md = vault / rel
    md.parent.mkdir(parents=True, exist_ok=True)
    md.write_text(
        "# H1 Título\n\nTexto.\n\n## H2 Sección\n\n### H3 Subsección\n\n## H2 Otra\n",
        encoding="utf-8",
    )
    _insert_file_entry(db, rel)

    result = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    found = [(h["level"], h["text"]) for h in result["headings"]]
    assert found == [
        (1, "H1 Título"),
        (2, "H2 Sección"),
        (3, "H3 Subsección"),
        (2, "H2 Otra"),
    ]
|
||||
|
||||
|
||||
def test_md_persists_to_fts(tmp_path):
    """Parsing a document makes its body searchable through ``files_fts``."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/fts_md.md"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("# Documento FTS\n\nPalabra clave: xenolito.\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    vault_knowledge_parse(str(vault), rel, db_path=str(db))

    # A contentless FTS5 table cannot be read column by column; query via MATCH.
    match_sql = "SELECT rowid FROM files_fts WHERE files_fts MATCH 'xenolito'"
    connection = sqlite3.connect(str(db))
    hit = connection.execute(match_sql).fetchone()
    connection.close()
    assert hit is not None, "FTS no encontró 'xenolito'"
|
||||
@@ -0,0 +1,57 @@
|
||||
---
|
||||
name: vault_dedupe_report
|
||||
kind: function
|
||||
lang: py
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def vault_dedupe_report(vault_path: str, min_size: int = 0, db_path: str | None = None) -> dict"
|
||||
description: "Detecta archivos duplicados en un vault leyendo vault_index.db (agrupando por sha256) y calcula el espacio recuperable. Retorna grupos ordenados por bytes recuperables DESC."
|
||||
tags: [vault, dedupe, duplicates, disk, sha256, sqlite]
|
||||
params:
|
||||
- name: vault_path
|
||||
desc: "Ruta raiz del vault. Usada como clave en el resultado y para localizar vault_index.db cuando db_path es None."
|
||||
- name: min_size
|
||||
desc: "Tamanio minimo en bytes para incluir un archivo en el analisis. Default 0 = todos los archivos."
|
||||
- name: db_path
|
||||
desc: "Override opcional de la ruta a vault_index.db. Si es None se usa <vault_path>/vault_index.db."
|
||||
output: "dict con vault_path, groups (sha256/size/count/files/reclaimable_bytes), total_groups, total_duplicates, total_reclaimable_bytes, scanned_files. groups ordenados por reclaimable_bytes DESC."
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_py_core"
|
||||
imports: ["sqlite3", "pathlib"]
|
||||
tested: true
|
||||
tests:
|
||||
- "test_no_duplicates"
|
||||
- "test_basic_duplicates"
|
||||
- "test_three_in_group"
|
||||
- "test_min_size_filter"
|
||||
- "test_multiple_groups_ordered"
|
||||
test_file_path: "python/functions/infra/tests/test_vault_dedupe_report.py"
|
||||
file_path: "python/functions/infra/vault_dedupe_report.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from infra.vault_dedupe_report import vault_dedupe_report
|
||||
|
||||
report = vault_dedupe_report("/data/vaults/my_vault", min_size=1024)
|
||||
print(f"Grupos duplicados: {report['total_groups']}")
|
||||
print(f"Espacio recuperable: {report['total_reclaimable_bytes'] // (1024**2)} MB")
|
||||
|
||||
for g in report["groups"][:5]:
|
||||
print(f" sha256={g['sha256'][:12]}... size={g['size']} count={g['count']}")
|
||||
for f in g["files"]:
|
||||
print(f" {f}")
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Solo considera filas con `sha256 != ''` (archivos efectivamente hasheados por `vault_inventory_scan_go_infra`).
|
||||
- Abre la BD en modo read-only (`?mode=ro`) para no interferir con escrituras concurrentes.
|
||||
- `GROUP_CONCAT` de SQLite no garantiza orden — los `files` se reordenan lexicograficamente en Python.
|
||||
- Si la BD no existe o le falta la tabla `files`, lanza `RuntimeError` con mensaje orientativo.
|
||||
- Prerequisito: haber corrido `fn vault index <name>` (pipeline `vault_inventory_scan_go_infra` + `vault_index_write_go_infra`) sobre el vault.
|
||||
@@ -0,0 +1,122 @@
|
||||
"""vault_dedupe_report — Detecta duplicados en vault_index.db y calcula espacio recuperable."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def vault_dedupe_report(
    vault_path: str,
    min_size: int = 0,
    db_path: str | None = None,
) -> dict:
    """Report duplicate files in a vault based on its ``vault_index.db``.

    Reads the ``files`` table, groups rows by ``sha256`` and returns every
    group holding more than one file, ordered by reclaimable bytes from
    largest to smallest (ties are broken by ``sha256`` so the output is
    deterministic).

    Args:
        vault_path: Vault root. Echoed back in the result and used to locate
            ``vault_index.db`` when ``db_path`` is None.
        min_size: Rows whose ``size`` (bytes) is below this value are ignored.
            The default 0 includes every file.
        db_path: Absolute or relative path of the SQLite database. When None,
            ``<vault_path>/vault_index.db`` is used.

    Returns:
        dict with the keys:
            - ``vault_path``: str, the value received.
            - ``groups``: list of dicts, each with ``sha256`` (str), ``size``
              (int, bytes of one copy), ``count`` (int, number of copies),
              ``files`` (list[str], rel_paths sorted lexicographically) and
              ``reclaimable_bytes`` (int, ``size * (count - 1)``).
            - ``total_groups``: int, number of duplicate groups.
            - ``total_duplicates``: int, sum of ``count - 1`` over all groups.
            - ``total_reclaimable_bytes``: int, total reclaimable bytes.
            - ``scanned_files``: int, rows with a non-empty ``sha256`` and
              ``size >= min_size``.

    Raises:
        RuntimeError: If the database file does not exist, cannot be opened,
            has no ``files`` table, or a query fails.
    """
    resolved_db = db_path if db_path is not None else str(Path(vault_path) / "vault_index.db")

    db_file = Path(resolved_db)
    if not db_file.exists():
        raise RuntimeError(
            f"No se encontro vault_index.db en '{resolved_db}'. "
            "Corre 'fn vault index <name>' primero."
        )

    # as_uri() percent-encodes characters such as '#', '?', '%' and spaces,
    # which would otherwise be parsed as URI syntax instead of path text.
    db_uri = f"{db_file.resolve().as_uri()}?mode=ro"

    try:
        conn = sqlite3.connect(db_uri, uri=True)
    except sqlite3.Error as exc:
        raise RuntimeError(f"No se pudo abrir '{resolved_db}': {exc}") from exc

    try:
        # Make sure the files table exists before querying it.
        cur = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='files';"
        )
        if cur.fetchone() is None:
            raise RuntimeError(
                "vault_index.db sin tabla 'files'. "
                "Corre 'fn vault index <name>' primero."
            )

        # Rows taken into account: non-empty sha256 and size >= min_size.
        row = conn.execute(
            "SELECT COUNT(*) FROM files WHERE size >= ? AND sha256 != '';",
            (min_size,),
        ).fetchone()
        scanned_files: int = row[0] if row else 0

        # Fetch one row per duplicated file instead of GROUP_CONCAT: joining
        # paths into one string cannot be split back safely when a path
        # itself contains the separator (e.g. a comma).
        query = """
            SELECT f.sha256, f.size, f.rel_path
            FROM files AS f
            JOIN (
                SELECT sha256
                FROM files
                WHERE size >= ? AND sha256 != ''
                GROUP BY sha256
                HAVING COUNT(*) > 1
            ) AS dup ON dup.sha256 = f.sha256
            WHERE f.size >= ? AND f.sha256 != ''
            ORDER BY f.sha256, f.rel_path;
        """
        rows = conn.execute(query, (min_size, min_size)).fetchall()
    except sqlite3.Error as exc:
        # sqlite3 opens lazily, so a corrupt or non-SQLite file only fails here.
        raise RuntimeError(f"No se pudo leer '{resolved_db}': {exc}") from exc
    finally:
        conn.close()

    # sha256 -> (size of the first row seen, list of rel_paths)
    by_hash: dict[str, tuple[int, list[str]]] = {}
    for sha256, size, rel_path in rows:
        by_hash.setdefault(sha256, (size, []))[1].append(rel_path)

    groups: list[dict] = []
    for sha256, (size, paths) in by_hash.items():
        count = len(paths)
        groups.append(
            {
                "sha256": sha256,
                "size": size,
                "count": count,
                "files": sorted(paths),
                "reclaimable_bytes": size * (count - 1),
            }
        )
    groups.sort(key=lambda g: (-g["reclaimable_bytes"], g["sha256"]))

    return {
        "vault_path": vault_path,
        "groups": groups,
        "total_groups": len(groups),
        "total_duplicates": sum(g["count"] - 1 for g in groups),
        "total_reclaimable_bytes": sum(g["reclaimable_bytes"] for g in groups),
        "scanned_files": scanned_files,
    }
|
||||
@@ -0,0 +1,60 @@
|
||||
---
|
||||
name: vault_knowledge_parse
|
||||
kind: function
|
||||
lang: py
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def vault_knowledge_parse(vault_path: str, rel_path: str, db_path: str | None = None) -> dict"
|
||||
description: "Parsea un archivo Markdown del vault: extrae YAML frontmatter, título, headings y cuerpo; persiste en knowledge_docs y actualiza files_fts para búsqueda por contenido."
|
||||
tags: [vault, markdown, knowledge, frontmatter, headings, fts, infra]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_py_core"
|
||||
imports: [json, re, sqlite3, time, pathlib, yaml]
|
||||
params:
|
||||
- name: vault_path
|
||||
desc: "Ruta absoluta a la raiz del vault donde vive el Markdown y vault_index.db."
|
||||
- name: rel_path
|
||||
desc: "Ruta relativa al archivo .md dentro del vault (ej. 'docs/guia.md')."
|
||||
- name: db_path
|
||||
desc: "Override opcional de la ruta a vault_index.db. Por defecto <vault_path>/vault_index.db."
|
||||
output: "Dict con: rel_path (str), title (str), frontmatter (dict), headings (list de {level, text}), content_text (str cuerpo sin frontmatter), persisted (bool)."
|
||||
tested: true
|
||||
tests:
|
||||
- "test_md_with_frontmatter"
|
||||
- "test_md_no_frontmatter"
|
||||
- "test_md_title_from_h1"
|
||||
- "test_md_title_from_filename"
|
||||
- "test_md_headings_levels"
|
||||
- "test_md_persists_to_fts"
|
||||
test_file_path: "python/functions/infra/tests/test_vault_knowledge_parse.py"
|
||||
file_path: "python/functions/infra/vault_knowledge_parse.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```python
|
||||
from vault_knowledge_parse import vault_knowledge_parse
|
||||
|
||||
result = vault_knowledge_parse("/vaults/mi_vault", "docs/guia_operaciones.md")
|
||||
# {
|
||||
# "rel_path": "docs/guia_operaciones.md",
|
||||
# "title": "Guía de Operaciones",
|
||||
# "frontmatter": {"author": "Lucas", "tags": ["ops"]},
|
||||
# "headings": [{"level": 1, "text": "Guía de Operaciones"}, {"level": 2, "text": "Instalación"}],
|
||||
# "content_text": "# Guía de Operaciones\n\n## Instalación\n...",
|
||||
# "persisted": True
|
||||
# }
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Prioridad de título: frontmatter["title"] > primer H1 en el cuerpo > basename sin extensión.
|
||||
- Frontmatter YAML delimitado por `---\n` al inicio del archivo. Si no hay frontmatter, se retorna {}.
|
||||
- content_text es el cuerpo completo sin el bloque frontmatter (incluye los headings H1-H6).
|
||||
- El rowid de files_fts se ancla al rowid de la tabla files para que vault_search funcione correctamente.
|
||||
- Si vault_index.db no existe, retorna el dict sin intentar persistir (persisted=False).
|
||||
- Dependencias: pyyaml (ya instalado en python/.venv).
|
||||
@@ -0,0 +1,142 @@
|
||||
"""vault_knowledge_parse — Parsea un Markdown del vault y persiste en knowledge_docs."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _parse_frontmatter(text: str) -> tuple[dict, str]:
|
||||
"""Separa YAML frontmatter del cuerpo. Retorna (frontmatter_dict, body)."""
|
||||
if not text.startswith("---\n") and not text.startswith("---\r\n"):
|
||||
return {}, text
|
||||
|
||||
# Buscar cierre del frontmatter
|
||||
end = text.find("\n---", 4)
|
||||
if end == -1:
|
||||
return {}, text
|
||||
|
||||
yaml_block = text[4:end].strip()
|
||||
body = text[end + 4:].lstrip("\n\r")
|
||||
|
||||
try:
|
||||
import yaml
|
||||
|
||||
fm = yaml.safe_load(yaml_block) or {}
|
||||
if not isinstance(fm, dict):
|
||||
fm = {}
|
||||
except Exception:
|
||||
fm = {}
|
||||
|
||||
return fm, body
|
||||
|
||||
|
||||
def _extract_headings(body: str) -> list[dict]:
|
||||
"""Extrae headings Markdown (# ... ### ...) del cuerpo."""
|
||||
headings = []
|
||||
for line in body.splitlines():
|
||||
m = re.match(r"^(#{1,6})\s+(.*)", line)
|
||||
if m:
|
||||
headings.append({"level": len(m.group(1)), "text": m.group(2).strip()})
|
||||
return headings
|
||||
|
||||
|
||||
def _extract_title(frontmatter: dict, body: str, basename: str) -> str:
|
||||
"""Extrae título: frontmatter['title'] > primer H1 > basename."""
|
||||
if frontmatter.get("title"):
|
||||
return str(frontmatter["title"])
|
||||
for line in body.splitlines():
|
||||
m = re.match(r"^#\s+(.*)", line)
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
return basename
|
||||
|
||||
|
||||
def vault_knowledge_parse(
    vault_path: str,
    rel_path: str,
    db_path: str | None = None,
) -> dict:
    """Parse a Markdown file of the vault and persist the result.

    Extracts frontmatter, title, headings and body. When the index database
    exists, the metadata is upserted into ``knowledge_docs`` and the body is
    (re)indexed in ``files_fts`` under the rowid of the matching ``files``
    row.

    Args:
        vault_path: Path of the vault root.
        rel_path: Path of the Markdown file relative to the vault root.
        db_path: Optional override for the path of ``vault_index.db``;
            defaults to ``<vault_path>/vault_index.db``.

    Returns:
        Dict with ``rel_path``, ``title``, ``frontmatter``, ``headings``,
        ``content_text`` (body without the frontmatter block) and
        ``persisted`` (False when the database file does not exist).

    Raises:
        RuntimeError: If the file does not exist, is not a regular file, or
            cannot be read.
        sqlite3.Error: If persisting fails; the transaction is rolled back.
    """
    vault = Path(vault_path)
    md_file = vault / rel_path
    # is_file() rather than exists(): a directory must be rejected here
    # instead of failing later inside read_text().
    if not md_file.is_file():
        raise RuntimeError(f"vault_knowledge_parse: archivo no encontrado: {md_file}")

    db = Path(db_path) if db_path else vault / "vault_index.db"

    try:
        try:
            # utf-8-sig drops a leading BOM, which would otherwise hide the
            # "---" frontmatter marker at the start of the text.
            text = md_file.read_text(encoding="utf-8-sig")
        except UnicodeDecodeError:
            # latin-1 maps every byte, so this fallback cannot fail to decode.
            text = md_file.read_text(encoding="latin-1")
    except OSError as exc:
        raise RuntimeError(
            f"vault_knowledge_parse: no se pudo leer {md_file}: {exc}"
        ) from exc

    frontmatter, body = _parse_frontmatter(text)
    headings = _extract_headings(body)
    title = _extract_title(frontmatter, body, md_file.stem)
    content_text = body

    # Persist into vault_index.db when it exists.
    persisted = False
    if db.exists():
        conn = sqlite3.connect(str(db))
        try:
            now = int(time.time())
            conn.execute(
                """
                INSERT INTO knowledge_docs(rel_path, title, frontmatter_json, headings_json, parsed_at)
                VALUES (?, ?, ?, ?, ?)
                ON CONFLICT(rel_path) DO UPDATE SET
                    title=excluded.title,
                    frontmatter_json=excluded.frontmatter_json,
                    headings_json=excluded.headings_json,
                    parsed_at=excluded.parsed_at
                """,
                (
                    rel_path,
                    title,
                    # default=str: YAML dates/timestamps load as date objects,
                    # which json cannot serialize on its own.
                    json.dumps(frontmatter, ensure_ascii=False, default=str),
                    json.dumps(headings, ensure_ascii=False),
                    now,
                ),
            )

            # The FTS entry is keyed by the rowid of the files row.
            file_row = conn.execute(
                "SELECT rowid FROM files WHERE rel_path = ?", (rel_path,)
            ).fetchone()
            if file_row is not None:
                # Delete by rowid: a contentless FTS5 table returns NULL for
                # its columns, so filtering on rel_path never matches and
                # would leave the stale entry in place on re-parse.
                conn.execute("DELETE FROM files_fts WHERE rowid = ?", (file_row[0],))
                conn.execute(
                    "INSERT INTO files_fts(rowid, rel_path, content_text) VALUES (?, ?, ?)",
                    (file_row[0], rel_path, content_text),
                )
            # A file missing from `files` is not indexed in FTS: an
            # auto-assigned rowid would not correspond to any files row.
            conn.commit()
            persisted = True
        except Exception:
            conn.rollback()
            raise
        finally:
            conn.close()

    return {
        "rel_path": rel_path,
        "title": title,
        "frontmatter": frontmatter,
        "headings": headings,
        "content_text": content_text,
        "persisted": persisted,
    }
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
name: vault_profile_dispatch
|
||||
kind: function
|
||||
lang: py
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "def vault_profile_dispatch(vault_path: str, rel_path: str, kind: str, db_path: str | None = None) -> dict"
|
||||
description: "CLI dispatcher que enruta un archivo del vault al profiler correcto segun su tipo (csv/pdf/md). Thin wrapper sobre vault_csv_profile, vault_pdf_extract y vault_knowledge_parse. Usable desde Go via os/exec para procesar archivos en bulk."
|
||||
tags: [vault, profile, dispatch, profiler, csv, pdf, md, infra]
|
||||
uses_functions:
|
||||
- vault_csv_profile_py_datascience
|
||||
- vault_pdf_extract_py_datascience
|
||||
- vault_knowledge_parse_py_infra
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_py_core"
|
||||
imports: []
|
||||
params:
|
||||
- name: vault_path
|
||||
desc: "Ruta absoluta a la raiz del vault."
|
||||
- name: rel_path
|
||||
desc: "Ruta relativa del archivo dentro del vault."
|
||||
- name: kind
|
||||
desc: "Tipo de profiler: csv | pdf | md."
|
||||
- name: db_path
|
||||
desc: "Override de la ruta a vault_index.db. Default: <vault_path>/vault_index.db."
|
||||
output: "Dict con resultado del profiler correspondiente. Para csv: {rel_path, cols, n_rows, encoding, date_min, date_max, persisted}. Para pdf: {rel_path, page_count, text_len, extracted_to, persisted}. Para md: resultado de vault_knowledge_parse."
|
||||
tested: false
|
||||
tests: []
|
||||
test_file_path: ""
|
||||
file_path: "python/functions/infra/vault_profile_dispatch.py"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```bash
|
||||
# Desde CLI
|
||||
python3 python/functions/infra/vault_profile_dispatch.py \
|
||||
--vault /home/lucas/vaults/turismo_spain \
|
||||
--rel-path data/raw/report.csv \
|
||||
--kind csv
|
||||
|
||||
# Desde Go via os/exec (patron usado en fn vault profile)
|
||||
python3 vault_profile_dispatch.py --vault <path> --rel-path <p> --kind csv
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
Disenado para ser invocado desde Go via `os/exec`. Imprime resultado como JSON a stdout.
|
||||
Codigos de salida: 0=exito, 1=args faltantes, 2=kind desconocido, 3=error del profiler.
|
||||
|
||||
Detecta automaticamente el PYTHONPATH mirando `FN_REGISTRY_ROOT` o subiendo desde su propia ubicacion.
|
||||
@@ -0,0 +1,92 @@
|
||||
"""vault_profile_dispatch — CLI dispatcher that routes a single vault file to the right profiler.
|
||||
|
||||
Usage:
|
||||
python3 vault_profile_dispatch.py --vault <path> --rel-path <p> --kind csv|pdf|md [--db-path <p>]
|
||||
|
||||
Exit codes:
|
||||
0 success (result printed as JSON)
|
||||
1 missing required argument
|
||||
2 unknown kind
|
||||
3 profiler raised an error
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _python_path_setup() -> None:
|
||||
"""Ensure the registry python/functions directory is on sys.path."""
|
||||
# Try FN_REGISTRY_ROOT env first, then walk up from this file's location.
|
||||
registry_root = os.environ.get("FN_REGISTRY_ROOT", "")
|
||||
if not registry_root:
|
||||
# This file lives at python/functions/infra/vault_profile_dispatch.py
|
||||
# So the registry root is four levels up from __file__.
|
||||
candidate = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if (candidate / "go.mod").exists():
|
||||
registry_root = str(candidate)
|
||||
|
||||
if registry_root:
|
||||
fn_path = str(Path(registry_root) / "python" / "functions")
|
||||
if fn_path not in sys.path:
|
||||
sys.path.insert(0, fn_path)
|
||||
|
||||
|
||||
def dispatch(vault_path: str, rel_path: str, kind: str, db_path: str | None) -> dict:
    """Run the profiler that matches ``kind`` and return its result.

    Args:
        vault_path: Vault root, forwarded to the profiler.
        rel_path: File path relative to the vault, forwarded to the profiler.
        kind: One of ``"csv"``, ``"pdf"`` or ``"md"``.
        db_path: Index database override, forwarded as third positional argument.

    Raises:
        ValueError: If ``kind`` is none of the supported values.
    """
    if kind not in ("csv", "pdf", "md"):
        raise ValueError(f"unknown kind: {kind!r} (expected csv, pdf, or md)")

    # Import lazily so that only the selected profiler's dependencies load.
    if kind == "csv":
        from datascience.vault_csv_profile import vault_csv_profile as profiler
    elif kind == "pdf":
        from datascience.vault_pdf_extract import vault_pdf_extract as profiler
    else:
        from infra.vault_knowledge_parse import vault_knowledge_parse as profiler

    return profiler(vault_path, rel_path, db_path)
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Parse the command line, run the profiler and print its result as JSON.

    Args:
        argv: Argument list without the program name; None lets argparse
            read ``sys.argv``.

    Returns:
        Process exit code:
            0  success (result printed to stdout as JSON) or ``--help``
            1  missing or malformed command-line arguments
            2  unknown ``--kind``
            3  the profiler raised an error
    """
    _python_path_setup()

    kinds = ("csv", "pdf", "md")

    parser = argparse.ArgumentParser(
        prog="vault_profile_dispatch",
        description="Route a single vault file to the right profiler (csv/pdf/md).",
    )
    parser.add_argument("--vault", required=True, help="Absolute path to vault root")
    parser.add_argument(
        "--rel-path",
        required=True,
        dest="rel_path",
        help="Relative path of file inside vault",
    )
    # No `choices=` here: argparse would exit with its generic code 2 for an
    # invalid value, which cannot be told apart from a missing argument.
    parser.add_argument(
        "--kind",
        required=True,
        metavar="{csv,pdf,md}",
        help="Profiler kind: csv | pdf | md",
    )
    parser.add_argument(
        "--db-path",
        dest="db_path",
        default=None,
        help="Override path to vault_index.db (default: <vault>/vault_index.db)",
    )

    try:
        args = parser.parse_args(argv)
    except SystemExit as exc:
        # argparse already printed usage/help; map its exit to our contract.
        return 0 if exc.code in (0, None) else 1

    if args.kind not in kinds:
        print(
            f"error: unknown kind: {args.kind!r} (expected csv, pdf, or md)",
            file=sys.stderr,
        )
        return 2

    try:
        result = dispatch(args.vault, args.rel_path, args.kind, args.db_path)
    except Exception as exc:
        # The kind is already validated, so any error here (including a
        # ValueError raised by a profiler) is a profiler failure.
        print(f"error: {exc}", file=sys.stderr)
        return 3

    print(json.dumps(result, indent=2, default=str))
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user