chore: auto-commit (95 archivos)

- cmd/fn/doctor.go
- cmd/fn/main.go
- cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt
- cpp/apps/primitives_gallery/playground/tables/data_table.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.h
- cpp/apps/primitives_gallery/playground/tables/self_test.cpp
- cpp/apps/primitives_gallery/playground/tables/tql.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.h
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-13 00:50:34 +02:00
parent ef60449e64
commit a802f59f55
189 changed files with 18964 additions and 330 deletions
@@ -0,0 +1,154 @@
"""Tests para vault_dedupe_report."""
from __future__ import annotations
import os
import sqlite3
import sys
from pathlib import Path
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from vault_dedupe_report import vault_dedupe_report
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_db(tmp_path: Path, rows: list[tuple]) -> Path:
"""Crea vault_index.db con la tabla files y las filas dadas.
rows: lista de (rel_path, size, sha256)
"""
db_path = tmp_path / "vault_index.db"
conn = sqlite3.connect(str(db_path))
conn.execute(
"""
CREATE TABLE files (
rel_path TEXT PRIMARY KEY,
size INTEGER,
mtime REAL,
sha256 TEXT,
mime TEXT,
ext TEXT,
bucket TEXT,
sub_bucket TEXT,
indexed_at REAL
);
"""
)
conn.executemany(
"INSERT INTO files (rel_path, size, sha256) VALUES (?, ?, ?);",
rows,
)
conn.commit()
conn.close()
return db_path
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_no_duplicates(tmp_path):
    """Three files with distinct sha256 values produce an empty report."""
    unique_rows = [
        ("a/file1.txt", 100, "aaa111"),
        ("a/file2.txt", 200, "bbb222"),
        ("a/file3.txt", 300, "ccc333"),
    ]
    db_file = _make_db(tmp_path, unique_rows)

    report = vault_dedupe_report(str(tmp_path), db_path=str(db_file))

    expected = {
        "groups": [],
        "total_groups": 0,
        "total_duplicates": 0,
        "total_reclaimable_bytes": 0,
        "scanned_files": 3,
        "vault_path": str(tmp_path),
    }
    for key, value in expected.items():
        assert report[key] == value, key
def test_basic_duplicates(tmp_path):
    """Two files sharing a sha256 form one group; one copy's size is reclaimable."""
    db_file = _make_db(
        tmp_path,
        [
            ("data/orig.jpg", 500, "deadbeef"),
            ("backup/orig.jpg", 500, "deadbeef"),
        ],
    )

    report = vault_dedupe_report(str(tmp_path), db_path=str(db_file))

    totals = (
        report["total_groups"],
        report["total_duplicates"],
        report["total_reclaimable_bytes"],
    )
    assert totals == (1, 1, 500)

    group = report["groups"][0]
    assert (group["sha256"], group["size"], group["count"]) == ("deadbeef", 500, 2)
    assert group["reclaimable_bytes"] == 500
    assert sorted(group["files"]) == ["backup/orig.jpg", "data/orig.jpg"]
def test_three_in_group(tmp_path):
    """Three copies of one sha256 give count=3 and reclaimable = size * 2."""
    copy_size = 1000
    paths = ["a/f1.bin", "b/f2.bin", "c/f3.bin"]
    db_file = _make_db(tmp_path, [(p, copy_size, "cafebabe") for p in paths])

    report = vault_dedupe_report(str(tmp_path), db_path=str(db_file))

    assert report["total_groups"] == 1
    assert report["total_duplicates"] == 2
    assert report["total_reclaimable_bytes"] == copy_size * 2

    group = report["groups"][0]
    assert group["count"] == 3
    assert group["reclaimable_bytes"] == copy_size * 2
    assert group["files"] == sorted(paths)
def test_min_size_filter(tmp_path):
    """Duplicates of size 50 are ignored entirely when min_size=100."""
    small_rows = [
        ("x/small1.txt", 50, "tiny123"),
        ("y/small2.txt", 50, "tiny123"),
    ]
    db_file = _make_db(tmp_path, small_rows)

    report = vault_dedupe_report(str(tmp_path), min_size=100, db_path=str(db_file))

    assert report["groups"] == []
    # Every counter must be zero: the rows are filtered out before counting.
    for key in ("total_groups", "total_reclaimable_bytes", "scanned_files"):
        assert report[key] == 0, key
def test_multiple_groups_ordered(tmp_path):
    """Two groups with different savings are returned in descending order.

    Group A: 2 copies of 200 bytes -> reclaimable 200.
    Group B: 3 copies of 500 bytes -> reclaimable 1000, so B comes first.
    """
    group_a = [("p/a1.dat", 200, "groupA"), ("q/a2.dat", 200, "groupA")]
    group_b = [
        ("r/b1.dat", 500, "groupB"),
        ("s/b2.dat", 500, "groupB"),
        ("t/b3.dat", 500, "groupB"),
    ]
    singleton = [("u/uniq.dat", 999, "unique1")]
    db_file = _make_db(tmp_path, group_a + group_b + singleton)

    report = vault_dedupe_report(str(tmp_path), db_path=str(db_file))

    assert report["total_groups"] == 2
    assert report["total_duplicates"] == 3  # (2-1) + (3-1)
    assert report["total_reclaimable_bytes"] == 1200  # 200 + 1000
    # All six rows carry a non-empty sha256, the unique file included.
    assert report["scanned_files"] == 6

    first, second = report["groups"][0], report["groups"][1]
    # Largest saving first (B: 1000), then A (200).
    assert (first["sha256"], first["reclaimable_bytes"]) == ("groupB", 1000)
    assert (second["sha256"], second["reclaimable_bytes"]) == ("groupA", 200)
@@ -0,0 +1,153 @@
"""Tests para vault_knowledge_parse."""
from __future__ import annotations
import os
import sqlite3
import sys
from pathlib import Path
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from vault_knowledge_parse import vault_knowledge_parse
def _make_vault(tmp: Path) -> tuple[Path, Path]:
    """Create a minimal vault: a directory holding an empty ``vault_index.db``.

    The database gets the three objects the parser touches: ``files``, the
    contentless FTS5 table ``files_fts`` and ``knowledge_docs``.

    Args:
        tmp: Directory used as the vault root.

    Returns:
        ``(vault_root, db_path)``.
    """
    db = tmp / "vault_index.db"
    conn = sqlite3.connect(str(db))
    # try/finally: close the handle even if the schema script fails (e.g. the
    # SQLite build lacks FTS5), so the temporary directory can be cleaned up.
    try:
        conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS files (
                rowid      INTEGER PRIMARY KEY AUTOINCREMENT,
                rel_path   TEXT UNIQUE NOT NULL,
                size_bytes INTEGER,
                ext        TEXT
            );
            CREATE VIRTUAL TABLE IF NOT EXISTS files_fts
                USING fts5(rel_path, content_text, content='', contentless_delete=1);
            CREATE TABLE IF NOT EXISTS knowledge_docs (
                rel_path         TEXT PRIMARY KEY,
                title            TEXT,
                frontmatter_json TEXT,
                headings_json    TEXT,
                parsed_at        INTEGER
            );
            """
        )
        conn.commit()
    finally:
        conn.close()
    return tmp, db
def _insert_file_entry(db: Path, rel_path: str):
conn = sqlite3.connect(str(db))
conn.execute(
"INSERT OR IGNORE INTO files(rel_path, size_bytes, ext) VALUES (?, 0, '.md')",
(rel_path,),
)
conn.commit()
conn.close()
def test_md_with_frontmatter(tmp_path):
    """Frontmatter title and fields are extracted and the document is persisted."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/guia.md"
    source = "---\ntitle: Mi Guía\nauthor: Lucas\n---\n\n# Mi Guía\n\nContenido del documento.\n"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(source, encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    assert parsed["title"] == "Mi Guía"
    assert parsed["frontmatter"]["author"] == "Lucas"
    assert "Contenido del documento" in parsed["content_text"]
    assert parsed["persisted"] is True
def test_md_no_frontmatter(tmp_path):
    """Without a frontmatter block the dict is empty and the H1 gives the title."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/sin_fm.md"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("# Título\n\nCuerpo sin frontmatter.\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    assert parsed["frontmatter"] == {}
    assert parsed["title"] == "Título"
    assert "Cuerpo sin frontmatter" in parsed["content_text"]
def test_md_title_from_h1(tmp_path):
    """The first H1 becomes the title when frontmatter has none."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/title_h1.md"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("# Primer H1\n\nAlgún texto.\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    assert parsed["title"] == "Primer H1"
def test_md_title_from_filename(tmp_path):
    """With neither frontmatter nor headings the file stem is the title."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/nombre_archivo.md"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("Solo texto sin headings ni frontmatter.\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    assert parsed["title"] == "nombre_archivo"
def test_md_headings_levels(tmp_path):
    """Headings are returned in document order with their nesting level."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/headings.md"
    source = "# H1 Título\n\nTexto.\n\n## H2 Sección\n\n### H3 Subsección\n\n## H2 Otra\n"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(source, encoding="utf-8")
    _insert_file_entry(db, rel)

    parsed = vault_knowledge_parse(str(vault), rel, db_path=str(db))

    headings = parsed["headings"]
    assert len(headings) == 4
    assert [h["level"] for h in headings] == [1, 2, 3, 2]
    texts = [h["text"] for h in headings]
    for expected in ("H1 Título", "H2 Sección", "H3 Subsección"):
        assert expected in texts
def test_md_persists_to_fts(tmp_path):
    """Parsing a document makes its body searchable through ``files_fts``."""
    vault, db = _make_vault(tmp_path)
    rel = "docs/fts_md.md"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("# Documento FTS\n\nPalabra clave: xenolito.\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    vault_knowledge_parse(str(vault), rel, db_path=str(db))

    conn = sqlite3.connect(str(db))
    # A contentless FTS5 table cannot be read back column-wise; go through MATCH.
    hit = conn.execute(
        "SELECT rowid FROM files_fts WHERE files_fts MATCH 'xenolito'"
    ).fetchone()
    conn.close()
    assert hit is not None, "FTS no encontró 'xenolito'"
@@ -0,0 +1,57 @@
---
name: vault_dedupe_report
kind: function
lang: py
domain: infra
version: "1.0.0"
purity: impure
signature: "def vault_dedupe_report(vault_path: str, min_size: int = 0, db_path: str | None = None) -> dict"
description: "Detecta archivos duplicados en un vault leyendo vault_index.db (agrupando por sha256) y calcula el espacio recuperable. Retorna grupos ordenados por bytes recuperables DESC."
tags: [vault, dedupe, duplicates, disk, sha256, sqlite]
params:
- name: vault_path
desc: "Ruta raiz del vault. Usada como clave en el resultado y para localizar vault_index.db cuando db_path es None."
- name: min_size
desc: "Tamanio minimo en bytes para incluir un archivo en el analisis. Default 0 = todos los archivos."
- name: db_path
desc: "Override opcional de la ruta a vault_index.db. Si es None se usa <vault_path>/vault_index.db."
output: "dict con vault_path, groups (sha256/size/count/files/reclaimable_bytes), total_groups, total_duplicates, total_reclaimable_bytes, scanned_files. groups ordenados por reclaimable_bytes DESC."
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_py_core"
imports: ["sqlite3", "pathlib"]
tested: true
tests:
- "test_no_duplicates"
- "test_basic_duplicates"
- "test_three_in_group"
- "test_min_size_filter"
- "test_multiple_groups_ordered"
test_file_path: "python/functions/infra/tests/test_vault_dedupe_report.py"
file_path: "python/functions/infra/vault_dedupe_report.py"
---
## Ejemplo
```python
from infra.vault_dedupe_report import vault_dedupe_report
report = vault_dedupe_report("/data/vaults/my_vault", min_size=1024)
print(f"Grupos duplicados: {report['total_groups']}")
print(f"Espacio recuperable: {report['total_reclaimable_bytes'] // (1024**2)} MB")
for g in report["groups"][:5]:
print(f" sha256={g['sha256'][:12]}... size={g['size']} count={g['count']}")
for f in g["files"]:
print(f" {f}")
```
## Notas
- Solo considera filas con `sha256 != ''` (archivos efectivamente hasheados por `vault_inventory_scan_go_infra`).
- Abre la BD en modo read-only (`?mode=ro`) para no interferir con escrituras concurrentes.
- `GROUP_CONCAT` de SQLite no garantiza orden — los `files` se reordenan lexicograficamente en Python.
- Si la BD no existe o le falta la tabla `files`, lanza `RuntimeError` con mensaje orientativo.
- Prerequisito: haber corrido `fn vault index <name>` (pipeline `vault_inventory_scan_go_infra` + `vault_index_write_go_infra`) sobre el vault.
@@ -0,0 +1,122 @@
"""vault_dedupe_report — Detecta duplicados en vault_index.db y calcula espacio recuperable."""
from __future__ import annotations
import sqlite3
from pathlib import Path
def vault_dedupe_report(
    vault_path: str,
    min_size: int = 0,
    db_path: str | None = None,
) -> dict:
    """Report duplicate files of a vault from its ``vault_index.db``.

    Reads the ``files`` table, groups rows by ``sha256`` and returns every
    group with more than one file, ordered by reclaimable bytes (largest
    first; ties broken by ``sha256`` so the order is deterministic).

    Args:
        vault_path: Root path of the vault. Echoed back in the result and
            used to locate ``vault_index.db`` when ``db_path`` is None.
        min_size: Rows whose ``size`` (bytes) is below this value are
            ignored. Default 0 keeps every file.
        db_path: Absolute or relative path to the SQLite database. When None,
            ``<vault_path>/vault_index.db`` is used.

    Returns:
        dict with the keys:
            - ``vault_path``: str, the value received.
            - ``groups``: list of dicts, each with ``sha256`` (str),
              ``size`` (int, bytes of one copy), ``count`` (int),
              ``files`` (list[str], rel_paths sorted lexicographically) and
              ``reclaimable_bytes`` (int, ``size * (count - 1)``).
            - ``total_groups``: int, number of duplicate groups.
            - ``total_duplicates``: int, sum of ``count - 1`` over groups.
            - ``total_reclaimable_bytes``: int.
            - ``scanned_files``: int, rows that passed the filters.

    Raises:
        RuntimeError: If the database does not exist, lacks the ``files``
            table, or cannot be opened or read.
    """
    resolved_db = db_path if db_path is not None else str(Path(vault_path) / "vault_index.db")
    db_file = Path(resolved_db)
    if not db_file.exists():
        raise RuntimeError(
            f"No se encontro vault_index.db en '{resolved_db}'. "
            "Corre 'fn vault index <name>' primero."
        )

    # Build the URI with as_uri() so that '?', '#', '%' or spaces in the path
    # are percent-encoded instead of being parsed as URI syntax; resolve()
    # is needed because as_uri() only accepts absolute paths.
    db_uri = f"{db_file.resolve().as_uri()}?mode=ro"
    try:
        conn = sqlite3.connect(db_uri, uri=True)
    except sqlite3.Error as exc:
        raise RuntimeError(f"No se pudo abrir '{resolved_db}': {exc}") from exc

    # One row per duplicated file. Paths are fetched individually rather than
    # through GROUP_CONCAT + split(","), which would break any rel_path that
    # itself contains a comma.
    query = """
        SELECT sha256, size, rel_path
        FROM files
        WHERE size >= ? AND sha256 != ''
          AND sha256 IN (
              SELECT sha256
              FROM files
              WHERE size >= ? AND sha256 != ''
              GROUP BY sha256
              HAVING COUNT(*) > 1
          )
        ORDER BY sha256, rel_path;
    """
    try:
        has_table = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='files';"
        ).fetchone()
        if has_table is None:
            raise RuntimeError(
                "vault_index.db sin tabla 'files'. "
                "Corre 'fn vault index <name>' primero."
            )

        # Rows considered by the analysis (non-empty sha256, size >= min_size).
        scanned_files: int = conn.execute(
            "SELECT COUNT(*) FROM files WHERE size >= ? AND sha256 != '';",
            (min_size,),
        ).fetchone()[0]

        rows = conn.execute(query, (min_size, min_size)).fetchall()
    except sqlite3.Error as exc:
        # sqlite3 opens lazily, so a corrupt or non-database file only fails
        # here; surface it as the documented RuntimeError.
        raise RuntimeError(f"Error leyendo '{resolved_db}': {exc}") from exc
    finally:
        conn.close()

    # sha256 -> (size of one copy, list of rel_paths); dicts keep insertion order.
    by_hash: dict = {}
    for sha256, size, rel_path in rows:
        entry = by_hash.get(sha256)
        if entry is None:
            by_hash[sha256] = (size, [rel_path])
        else:
            entry[1].append(rel_path)

    groups: list[dict] = []
    for sha256, (size, paths) in by_hash.items():
        count = len(paths)
        groups.append(
            {
                "sha256": sha256,
                "size": size,
                "count": count,
                "files": sorted(paths),
                "reclaimable_bytes": size * (count - 1),
            }
        )
    groups.sort(key=lambda g: (-g["reclaimable_bytes"], g["sha256"]))

    return {
        "vault_path": vault_path,
        "groups": groups,
        "total_groups": len(groups),
        "total_duplicates": sum(g["count"] - 1 for g in groups),
        "total_reclaimable_bytes": sum(g["reclaimable_bytes"] for g in groups),
        "scanned_files": scanned_files,
    }
@@ -0,0 +1,60 @@
---
name: vault_knowledge_parse
kind: function
lang: py
domain: infra
version: "1.0.0"
purity: impure
signature: "def vault_knowledge_parse(vault_path: str, rel_path: str, db_path: str | None = None) -> dict"
description: "Parsea un archivo Markdown del vault: extrae YAML frontmatter, título, headings y cuerpo; persiste en knowledge_docs y actualiza files_fts para búsqueda por contenido."
tags: [vault, markdown, knowledge, frontmatter, headings, fts, infra]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [json, re, sqlite3, time, pathlib, yaml]
params:
- name: vault_path
desc: "Ruta absoluta a la raiz del vault donde vive el Markdown y vault_index.db."
- name: rel_path
desc: "Ruta relativa al archivo .md dentro del vault (ej. 'docs/guia.md')."
- name: db_path
desc: "Override opcional de la ruta a vault_index.db. Por defecto <vault_path>/vault_index.db."
output: "Dict con: rel_path (str), title (str), frontmatter (dict), headings (list de {level, text}), content_text (str cuerpo sin frontmatter), persisted (bool)."
tested: true
tests:
- "test_md_with_frontmatter"
- "test_md_no_frontmatter"
- "test_md_title_from_h1"
- "test_md_title_from_filename"
- "test_md_headings_levels"
- "test_md_persists_to_fts"
test_file_path: "python/functions/infra/tests/test_vault_knowledge_parse.py"
file_path: "python/functions/infra/vault_knowledge_parse.py"
---
## Ejemplo
```python
from vault_knowledge_parse import vault_knowledge_parse
result = vault_knowledge_parse("/vaults/mi_vault", "docs/guia_operaciones.md")
# {
# "rel_path": "docs/guia_operaciones.md",
# "title": "Guía de Operaciones",
# "frontmatter": {"author": "Lucas", "tags": ["ops"]},
# "headings": [{"level": 1, "text": "Guía de Operaciones"}, {"level": 2, "text": "Instalación"}],
# "content_text": "# Guía de Operaciones\n\n## Instalación\n...",
# "persisted": True
# }
```
## Notas
- Prioridad de título: frontmatter["title"] > primer H1 en el cuerpo > basename sin extensión.
- Frontmatter YAML delimitado por `---\n` al inicio del archivo. Si no hay frontmatter, se retorna {}.
- content_text es el cuerpo completo sin el bloque frontmatter (incluye los headings H1-H6).
- El rowid de files_fts se ancla al rowid de la tabla files para que vault_search funcione correctamente.
- Si vault_index.db no existe, retorna el dict sin intentar persistir (persisted=False).
- Dependencias: pyyaml (ya instalado en python/.venv).
@@ -0,0 +1,142 @@
"""vault_knowledge_parse — Parsea un Markdown del vault y persiste en knowledge_docs."""
from __future__ import annotations
import json
import re
import sqlite3
import time
from pathlib import Path
def _parse_frontmatter(text: str) -> tuple[dict, str]:
"""Separa YAML frontmatter del cuerpo. Retorna (frontmatter_dict, body)."""
if not text.startswith("---\n") and not text.startswith("---\r\n"):
return {}, text
# Buscar cierre del frontmatter
end = text.find("\n---", 4)
if end == -1:
return {}, text
yaml_block = text[4:end].strip()
body = text[end + 4:].lstrip("\n\r")
try:
import yaml
fm = yaml.safe_load(yaml_block) or {}
if not isinstance(fm, dict):
fm = {}
except Exception:
fm = {}
return fm, body
def _extract_headings(body: str) -> list[dict]:
"""Extrae headings Markdown (# ... ### ...) del cuerpo."""
headings = []
for line in body.splitlines():
m = re.match(r"^(#{1,6})\s+(.*)", line)
if m:
headings.append({"level": len(m.group(1)), "text": m.group(2).strip()})
return headings
def _extract_title(frontmatter: dict, body: str, basename: str) -> str:
"""Extrae título: frontmatter['title'] > primer H1 > basename."""
if frontmatter.get("title"):
return str(frontmatter["title"])
for line in body.splitlines():
m = re.match(r"^#\s+(.*)", line)
if m:
return m.group(1).strip()
return basename
def vault_knowledge_parse(
    vault_path: str,
    rel_path: str,
    db_path: str | None = None,
) -> dict:
    """Parse a Markdown file of the vault and index it in ``vault_index.db``.

    Extracts frontmatter, title, headings and body. When the database exists,
    the metadata is upserted into ``knowledge_docs`` and the body is
    (re)indexed in ``files_fts`` under the rowid of the matching ``files``
    row.

    Args:
        vault_path: Absolute path to the vault root.
        rel_path: Path of the Markdown file relative to the vault root.
        db_path: Optional override for the database path; defaults to
            ``<vault_path>/vault_index.db``.

    Returns:
        Dict with ``rel_path``, ``title``, ``frontmatter``, ``headings``,
        ``content_text`` and ``persisted`` (False when the database file does
        not exist).

    Raises:
        RuntimeError: If the file does not exist, is not a regular file, or
            cannot be read.
        sqlite3.Error: If persisting fails (the transaction is rolled back).
    """
    vault = Path(vault_path)
    md_file = vault / rel_path
    # is_file() rather than exists(): a directory would otherwise escape as
    # an OSError from read_text instead of the documented RuntimeError.
    if not md_file.is_file():
        raise RuntimeError(f"vault_knowledge_parse: archivo no encontrado: {md_file}")

    db = Path(db_path) if db_path else vault / "vault_index.db"

    try:
        try:
            text = md_file.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # latin-1 maps every byte, so this fallback cannot fail to decode.
            text = md_file.read_text(encoding="latin-1", errors="replace")
    except OSError as exc:
        raise RuntimeError(
            f"vault_knowledge_parse: no se pudo leer {md_file}: {exc}"
        ) from exc

    frontmatter, body = _parse_frontmatter(text)
    headings = _extract_headings(body)
    title = _extract_title(frontmatter, body, md_file.stem)
    content_text = body

    persisted = False
    if db.exists():
        conn = sqlite3.connect(str(db))
        try:
            now = int(time.time())
            conn.execute(
                """
                INSERT INTO knowledge_docs(rel_path, title, frontmatter_json, headings_json, parsed_at)
                VALUES (?, ?, ?, ?, ?)
                ON CONFLICT(rel_path) DO UPDATE SET
                    title=excluded.title,
                    frontmatter_json=excluded.frontmatter_json,
                    headings_json=excluded.headings_json,
                    parsed_at=excluded.parsed_at
                """,
                (
                    rel_path,
                    title,
                    # default=str: YAML turns values such as "2024-01-01"
                    # into date objects, which json cannot serialise natively.
                    json.dumps(frontmatter, ensure_ascii=False, default=str),
                    json.dumps(headings, ensure_ascii=False),
                    now,
                ),
            )

            # files_fts rows are anchored to files.rowid. Delete by rowid:
            # a contentless FTS5 table returns NULL for its columns, so a
            # "WHERE rel_path = ?" filter never matches and stale tokens
            # would survive a re-parse.
            file_row = conn.execute(
                "SELECT rowid FROM files WHERE rel_path = ?", (rel_path,)
            ).fetchone()
            if file_row is not None:
                file_rowid = file_row[0]
                conn.execute("DELETE FROM files_fts WHERE rowid = ?", (file_rowid,))
                conn.execute(
                    "INSERT INTO files_fts(rowid, rel_path, content_text) VALUES (?, ?, ?)",
                    (file_rowid, rel_path, content_text),
                )
            # No files row: skip the FTS insert. A NULL rowid would get an
            # auto-assigned id that may later collide with an unrelated file.

            conn.commit()
            persisted = True
        except Exception:
            conn.rollback()
            raise
        finally:
            conn.close()

    return {
        "rel_path": rel_path,
        "title": title,
        "frontmatter": frontmatter,
        "headings": headings,
        "content_text": content_text,
        "persisted": persisted,
    }
@@ -0,0 +1,54 @@
---
name: vault_profile_dispatch
kind: function
lang: py
domain: infra
version: "1.0.0"
purity: impure
signature: "def vault_profile_dispatch(vault_path: str, rel_path: str, kind: str, db_path: str | None = None) -> dict"
description: "CLI dispatcher que enruta un archivo del vault al profiler correcto segun su tipo (csv/pdf/md). Thin wrapper sobre vault_csv_profile, vault_pdf_extract y vault_knowledge_parse. Usable desde Go via os/exec para procesar archivos en bulk."
tags: [vault, profile, dispatch, profiler, csv, pdf, md, infra]
uses_functions:
- vault_csv_profile_py_datascience
- vault_pdf_extract_py_datascience
- vault_knowledge_parse_py_infra
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [argparse, json, os, sys, pathlib]
params:
- name: vault_path
desc: "Ruta absoluta a la raiz del vault."
- name: rel_path
desc: "Ruta relativa del archivo dentro del vault."
- name: kind
desc: "Tipo de profiler: csv | pdf | md."
- name: db_path
desc: "Override de la ruta a vault_index.db. Default: <vault_path>/vault_index.db."
output: "Dict con resultado del profiler correspondiente. Para csv: {rel_path, cols, n_rows, encoding, date_min, date_max, persisted}. Para pdf: {rel_path, page_count, text_len, extracted_to, persisted}. Para md: resultado de vault_knowledge_parse."
tested: false
tests: []
test_file_path: ""
file_path: "python/functions/infra/vault_profile_dispatch.py"
---
## Ejemplo
```bash
# Desde CLI
python3 python/functions/infra/vault_profile_dispatch.py \
--vault /home/lucas/vaults/turismo_spain \
--rel-path data/raw/report.csv \
--kind csv
# Desde Go via os/exec (patron usado en fn vault profile)
python3 vault_profile_dispatch.py --vault <path> --rel-path <p> --kind csv
```
## Notas
Disenado para ser invocado desde Go via `os/exec`. Imprime resultado como JSON a stdout.
Codigos de salida: 0=exito, 1=args faltantes, 2=kind desconocido, 3=error del profiler.
Detecta automaticamente el PYTHONPATH mirando `FN_REGISTRY_ROOT` o subiendo desde su propia ubicacion.
@@ -0,0 +1,92 @@
"""vault_profile_dispatch — CLI dispatcher that routes a single vault file to the right profiler.
Usage:
python3 vault_profile_dispatch.py --vault <path> --rel-path <p> --kind csv|pdf|md [--db-path <p>]
Exit codes:
0 success (result printed as JSON)
1 missing required argument
2 unknown kind
3 profiler raised an error
"""
from __future__ import annotations
import argparse
import json
import sys
import os
from pathlib import Path
def _python_path_setup() -> None:
"""Ensure the registry python/functions directory is on sys.path."""
# Try FN_REGISTRY_ROOT env first, then walk up from this file's location.
registry_root = os.environ.get("FN_REGISTRY_ROOT", "")
if not registry_root:
# This file lives at python/functions/infra/vault_profile_dispatch.py
# So the registry root is four levels up from __file__.
candidate = Path(__file__).resolve().parent.parent.parent.parent
if (candidate / "go.mod").exists():
registry_root = str(candidate)
if registry_root:
fn_path = str(Path(registry_root) / "python" / "functions")
if fn_path not in sys.path:
sys.path.insert(0, fn_path)
def dispatch(vault_path: str, rel_path: str, kind: str, db_path: str | None) -> dict:
    """Run the profiler selected by *kind* and return its result.

    The profiler module is imported lazily, so only the dependencies of the
    requested kind need to be importable.

    Args:
        vault_path: Path to the vault root.
        rel_path: Path of the file relative to the vault root.
        kind: One of ``"csv"``, ``"pdf"`` or ``"md"``.
        db_path: Optional override for the path to ``vault_index.db``.

    Returns:
        Whatever the selected profiler returns.

    Raises:
        ValueError: If *kind* is not one of the supported values.
    """
    if kind not in ("csv", "pdf", "md"):
        raise ValueError(f"unknown kind: {kind!r} (expected csv, pdf, or md)")

    if kind == "csv":
        from datascience.vault_csv_profile import vault_csv_profile as profiler
    elif kind == "pdf":
        from datascience.vault_pdf_extract import vault_pdf_extract as profiler
    else:
        from infra.vault_knowledge_parse import vault_knowledge_parse as profiler

    return profiler(vault_path, rel_path, db_path)
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, run the profiler, print JSON.

    Args:
        argv: Argument list without the program name; ``None`` makes argparse
            read ``sys.argv[1:]``.

    Returns:
        Process exit code, matching the module docstring:
        0 success (result printed to stdout as JSON), 1 missing required
        argument, 2 unknown kind, 3 the profiler (or its import) failed.
        Malformed command lines that argparse itself rejects (unknown
        option, option without a value) still terminate via ``SystemExit(2)``.
    """
    parser = argparse.ArgumentParser(
        prog="vault_profile_dispatch",
        description="Route a single vault file to the right profiler (csv/pdf/md).",
    )
    # Required options are validated below rather than with required=True /
    # choices=..., because argparse would exit with status 2 for every such
    # error and the documented codes (1 = missing, 2 = unknown kind) differ.
    parser.add_argument("--vault", default=None, help="Absolute path to vault root (required)")
    parser.add_argument(
        "--rel-path",
        dest="rel_path",
        default=None,
        help="Relative path of file inside vault (required)",
    )
    parser.add_argument("--kind", default=None, help="Profiler kind: csv | pdf | md (required)")
    parser.add_argument(
        "--db-path",
        dest="db_path",
        default=None,
        help="Override path to vault_index.db (default: <vault>/vault_index.db)",
    )
    args = parser.parse_args(argv)

    supplied = (("--vault", args.vault), ("--rel-path", args.rel_path), ("--kind", args.kind))
    missing = [flag for flag, value in supplied if not value]
    if missing:
        parser.print_usage(sys.stderr)
        print(f"error: missing required argument(s): {', '.join(missing)}", file=sys.stderr)
        return 1

    if args.kind not in ("csv", "pdf", "md"):
        print(f"error: unknown kind: {args.kind!r} (expected csv, pdf, or md)", file=sys.stderr)
        return 2

    # sys.path only needs adjusting once a profiler is actually going to run.
    _python_path_setup()

    try:
        result = dispatch(args.vault, args.rel_path, args.kind, args.db_path)
    except Exception as exc:
        # The kind is already validated, so any exception here, including a
        # ValueError raised inside a profiler, is a profiler failure.
        print(f"error: {exc}", file=sys.stderr)
        return 3

    # default=str keeps non-JSON values (dates, paths) printable.
    print(json.dumps(result, indent=2, default=str))
    return 0
if __name__ == "__main__":
    # Script entry point: the integer returned by main() becomes the exit status.
    raise SystemExit(main())