chore: auto-commit (95 archivos)
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,238 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MlEnvCheck is the outcome of one probe run against the ML environment.
type MlEnvCheck struct {
	// Name identifies the probe, e.g. "cuda_toolkit" or "python_venv".
	Name string `json:"name"`
	// Status is one of "ok", "missing", "warning" or "unknown".
	Status string `json:"status"`
	// Version is the detected version string; omitted from JSON when empty.
	Version string `json:"version,omitempty"`
	// Detail is optional human-readable context; omitted from JSON when empty.
	Detail string `json:"detail,omitempty"`
}
|
||||
|
||||
// MlEnvReport is the full ML environment audit result.
|
||||
type MlEnvReport struct {
|
||||
Gpus []GpuInfo `json:"gpus"`
|
||||
Checks []MlEnvCheck `json:"checks"`
|
||||
OverallOK bool `json:"overall_ok"`
|
||||
GeneratedAt int64 `json:"generated_at"`
|
||||
}
|
||||
|
||||
// AuditMlEnv probes the ML environment rooted at registryRoot.
|
||||
// It checks for NVIDIA drivers, CUDA toolkit, Python venv, key Python
|
||||
// packages and optional tools (sd, llama-cli) and a local vault path.
|
||||
// Returns a non-nil MlEnvReport even when individual checks fail; the
|
||||
// function itself only errors if a fundamental system call cannot be
|
||||
// attempted.
|
||||
func AuditMlEnv(registryRoot string) (MlEnvReport, error) {
|
||||
report := MlEnvReport{
|
||||
GeneratedAt: time.Now().Unix(),
|
||||
}
|
||||
|
||||
// --- GPU detection (composes GetGpuInfo) ---
|
||||
gpus, err := GetGpuInfo()
|
||||
if err != nil {
|
||||
// Non-fatal: record absence.
|
||||
gpus = []GpuInfo{}
|
||||
}
|
||||
report.Gpus = gpus
|
||||
|
||||
checks := []MlEnvCheck{}
|
||||
|
||||
// --- nvidia-smi ---
|
||||
checks = append(checks, probeCommand("nvidia_smi", "nvidia-smi", []string{"--version"}, 5))
|
||||
|
||||
// --- nvcc (CUDA toolkit compiler) ---
|
||||
nvcc := probeNvcc()
|
||||
checks = append(checks, nvcc)
|
||||
|
||||
// --- Python venv ---
|
||||
venvCheck := probeVenv(registryRoot)
|
||||
checks = append(checks, venvCheck)
|
||||
|
||||
// Python venv path for subsequent checks.
|
||||
venvPy := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
|
||||
|
||||
// --- Python packages ---
|
||||
for _, pkg := range []string{"torch", "diffusers", "transformers", "huggingface_hub", "stable_diffusion_cpp_python"} {
|
||||
checks = append(checks, probePythonPackage(venvPy, pkg))
|
||||
}
|
||||
|
||||
// --- sd.cpp CLI ---
|
||||
checks = append(checks, probeCommand("sd_cli", "sd", []string{"--version"}, 5))
|
||||
|
||||
// --- llama.cpp CLI ---
|
||||
checks = append(checks, probeCommand("llama_cpp", "llama-cli", []string{"--version"}, 5))
|
||||
|
||||
// --- imagegen_vault ---
|
||||
checks = append(checks, probeImagegenVault())
|
||||
|
||||
report.Checks = checks
|
||||
|
||||
// OverallOK: no "missing" checks (warning is tolerated) and at least 1 GPU.
|
||||
overallOK := len(gpus) > 0
|
||||
for _, c := range checks {
|
||||
if c.Status == "missing" {
|
||||
// stable_diffusion_cpp_python and sd_cli are optional — downgrade to warning-only.
|
||||
if c.Name == "stable_diffusion_cpp_python" || c.Name == "sd_cli" || c.Name == "llama_cpp" {
|
||||
continue
|
||||
}
|
||||
overallOK = false
|
||||
}
|
||||
}
|
||||
report.OverallOK = overallOK
|
||||
|
||||
return report, nil
|
||||
}
|
||||
|
||||
// probeCommand checks whether a binary is available in PATH by running it with
|
||||
// the given args and recording any version output.
|
||||
func probeCommand(name, binary string, args []string, timeoutSec int) MlEnvCheck {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutSec)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
path, err := exec.LookPath(binary)
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: name, Status: "missing", Detail: fmt.Sprintf("%s not found in PATH", binary)}
|
||||
}
|
||||
|
||||
out, err := exec.CommandContext(ctx, path, args...).CombinedOutput()
|
||||
version := strings.TrimSpace(string(out))
|
||||
if len(version) > 120 {
|
||||
version = version[:120]
|
||||
}
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: name, Status: "warning", Version: version, Detail: fmt.Sprintf("exit error: %v", err)}
|
||||
}
|
||||
return MlEnvCheck{Name: name, Status: "ok", Version: version}
|
||||
}
|
||||
|
||||
// probeNvcc extracts the CUDA toolkit version from nvcc --version output.
|
||||
func probeNvcc() MlEnvCheck {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
path, err := exec.LookPath("nvcc")
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "nvcc", Status: "missing", Detail: "nvcc not found in PATH (CUDA toolkit not installed)"}
|
||||
}
|
||||
|
||||
out, err := exec.CommandContext(ctx, path, "--version").CombinedOutput()
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "nvcc", Status: "warning", Detail: fmt.Sprintf("nvcc --version failed: %v", err)}
|
||||
}
|
||||
|
||||
// Extract version from line like: "Cuda compilation tools, release 12.4, V12.4.99"
|
||||
version := ""
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.Contains(line, "release") {
|
||||
parts := strings.Split(line, ",")
|
||||
for _, p := range parts {
|
||||
p = strings.TrimSpace(p)
|
||||
if strings.HasPrefix(p, "release") {
|
||||
version = strings.TrimSpace(strings.TrimPrefix(p, "release"))
|
||||
break
|
||||
}
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if version == "" {
|
||||
version = strings.TrimSpace(string(out))
|
||||
if len(version) > 80 {
|
||||
version = version[:80]
|
||||
}
|
||||
}
|
||||
return MlEnvCheck{Name: "nvcc", Status: "ok", Version: version}
|
||||
}
|
||||
|
||||
// probeVenv checks that the Python venv exists and is functional.
|
||||
func probeVenv(registryRoot string) MlEnvCheck {
|
||||
py := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
|
||||
if _, err := os.Stat(py); os.IsNotExist(err) {
|
||||
return MlEnvCheck{Name: "python_venv", Status: "missing", Detail: fmt.Sprintf("not found: %s", py)}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
out, err := exec.CommandContext(ctx, py, "--version").CombinedOutput()
|
||||
version := strings.TrimSpace(string(out))
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "python_venv", Status: "warning", Version: version, Detail: fmt.Sprintf("python3 --version failed: %v", err)}
|
||||
}
|
||||
return MlEnvCheck{Name: "python_venv", Status: "ok", Version: version}
|
||||
}
|
||||
|
||||
// probePythonPackage imports a package in the venv Python and extracts __version__.
|
||||
func probePythonPackage(venvPy, pkg string) MlEnvCheck {
|
||||
// Map package name → import name (for packages with different import names).
|
||||
importName := pkg
|
||||
switch pkg {
|
||||
case "stable_diffusion_cpp_python":
|
||||
importName = "stable_diffusion_cpp"
|
||||
case "huggingface_hub":
|
||||
importName = "huggingface_hub"
|
||||
}
|
||||
|
||||
// Check that the venv python binary exists first.
|
||||
if _, err := os.Stat(venvPy); os.IsNotExist(err) {
|
||||
return MlEnvCheck{Name: pkg, Status: "unknown", Detail: "python_venv not available"}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
script := fmt.Sprintf("import %s; v = getattr(%s, '__version__', None); print(v or 'unknown')", importName, importName)
|
||||
out, err := exec.CommandContext(ctx, venvPy, "-c", script).CombinedOutput()
|
||||
output := strings.TrimSpace(string(out))
|
||||
|
||||
if err != nil {
|
||||
// Module not found → missing; other errors → warning.
|
||||
detail := output
|
||||
if len(detail) > 200 {
|
||||
detail = detail[:200]
|
||||
}
|
||||
if strings.Contains(output, "ModuleNotFoundError") || strings.Contains(output, "No module named") {
|
||||
return MlEnvCheck{Name: pkg, Status: "missing", Detail: fmt.Sprintf("%s not installed", importName)}
|
||||
}
|
||||
return MlEnvCheck{Name: pkg, Status: "warning", Detail: detail}
|
||||
}
|
||||
return MlEnvCheck{Name: pkg, Status: "ok", Version: output}
|
||||
}
|
||||
|
||||
// probeImagegenVault checks that ~/vaults/imagegen_models exists and lists subdirs.
|
||||
func probeImagegenVault() MlEnvCheck {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "imagegen_vault", Status: "unknown", Detail: "cannot determine home directory"}
|
||||
}
|
||||
vaultPath := filepath.Join(home, "vaults", "imagegen_models")
|
||||
entries, err := os.ReadDir(vaultPath)
|
||||
if os.IsNotExist(err) {
|
||||
return MlEnvCheck{Name: "imagegen_vault", Status: "missing", Detail: fmt.Sprintf("vault not found: %s", vaultPath)}
|
||||
}
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "imagegen_vault", Status: "warning", Detail: fmt.Sprintf("cannot read vault: %v", err)}
|
||||
}
|
||||
|
||||
subdirs := []string{}
|
||||
for _, e := range entries {
|
||||
if e.IsDir() {
|
||||
subdirs = append(subdirs, e.Name())
|
||||
}
|
||||
}
|
||||
detail := fmt.Sprintf("subdirs: %s", strings.Join(subdirs, ", "))
|
||||
if len(subdirs) == 0 {
|
||||
detail = "vault exists but is empty"
|
||||
}
|
||||
return MlEnvCheck{Name: "imagegen_vault", Status: "ok", Detail: detail}
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
---
|
||||
name: audit_ml_env
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func AuditMlEnv(registryRoot string) (MlEnvReport, error)"
|
||||
description: "Audita el entorno ML del sistema: GPUs NVIDIA, toolkit CUDA, venv Python, paquetes clave (torch, diffusers, transformers, huggingface_hub), herramientas CLI (sd, llama-cli) y el vault de modelos. Retorna un MlEnvReport con OverallOK=true solo si hay al menos 1 GPU y los checks criticos estan en ok/warning."
|
||||
tags: [ml, cuda, gpu, nvidia, audit, doctor, infra, torch, diffusers]
|
||||
uses_functions: [get_gpu_info_go_infra]
|
||||
uses_types: [gpu_info_go_infra]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [context, fmt, os, os/exec, path/filepath, strings, time]
|
||||
tested: true
|
||||
tests:
|
||||
- "report no nil y tiene checks"
|
||||
- "generated_at es positivo"
|
||||
- "checks tiene al menos 4 entradas"
|
||||
- "gpus puede ser vacio en CI"
|
||||
test_file_path: "functions/infra/audit_ml_env_test.go"
|
||||
file_path: "functions/infra/audit_ml_env.go"
|
||||
params:
|
||||
- name: registryRoot
|
||||
desc: "Ruta absoluta a la raiz del fn_registry. Se usa para localizar python/.venv/bin/python3 y probar paquetes instalados."
|
||||
output: "MlEnvReport con Gpus (puede estar vacio si no hay NVIDIA), Checks con estado por herramienta/paquete, OverallOK y GeneratedAt (unix timestamp)."
|
||||
---
|
||||
|
||||
## Checks realizados
|
||||
|
||||
| Check | Tipo | Critico |
|
||||
|---|---|---|
|
||||
| `nvidia_smi` | binary in PATH | si (missing => `OverallOK=false`) |
|
||||
| `nvcc` | CUDA toolkit version | si (missing => `OverallOK=false`) |
|
||||
| `python_venv` | exists + `python3 --version` | si |
|
||||
| `torch` | `import torch; __version__` | si |
|
||||
| `diffusers` | `import diffusers; __version__` | si |
|
||||
| `transformers` | `import transformers; __version__` | si |
|
||||
| `huggingface_hub` | `import huggingface_hub; __version__` | si |
|
||||
| `stable_diffusion_cpp_python` | `import stable_diffusion_cpp` | no (opcional) |
|
||||
| `sd_cli` | `sd --version` in PATH | no (opcional) |
|
||||
| `llama_cpp` | `llama-cli --version` in PATH | no (opcional) |
|
||||
| `imagegen_vault` | `~/vaults/imagegen_models` exists | si (missing => `OverallOK=false`) |
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
root := "/home/lucas/fn_registry"
|
||||
report, err := AuditMlEnv(root)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
for _, c := range report.Checks {
|
||||
fmt.Printf("%-40s %s %s\n", c.Name, c.Status, c.Version)
|
||||
}
|
||||
fmt.Printf("OverallOK: %v\n", report.OverallOK)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
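- Cada probe que ejecuta un comando externo (binarios en PATH, `python3 --version`, imports de paquetes Python) tiene timeout de 5 segundos para no bloquear en entornos sin GPU. La deteccion de GPUs via `GetGpuInfo()` ejecuta `nvidia-smi` sin timeout propio.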
|
||||
- `stable_diffusion_cpp_python`, `sd_cli` y `llama_cpp` son opcionales: si estan missing, `OverallOK` no se ve afectado.
|
||||
- `OverallOK` requiere al menos 1 GPU NVIDIA detectada via `GetGpuInfo()`.
|
||||
- No escribe nada en disco. Read-only.
|
||||
- Se expone como `fn doctor ml` via cmd/fn/doctor.go.
|
||||
@@ -0,0 +1,53 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestAuditMlEnv(t *testing.T) {
|
||||
// Use the actual registry root relative to the test binary location.
|
||||
// Tests run from the package directory; go up two levels.
|
||||
registryRoot := "../.."
|
||||
|
||||
t.Run("report no nil y tiene checks", func(t *testing.T) {
|
||||
report, err := AuditMlEnv(registryRoot)
|
||||
if err != nil {
|
||||
t.Fatalf("AuditMlEnv returned error: %v", err)
|
||||
}
|
||||
if report.Checks == nil {
|
||||
t.Fatal("report.Checks is nil")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("generated_at es positivo", func(t *testing.T) {
|
||||
report, err := AuditMlEnv(registryRoot)
|
||||
if err != nil {
|
||||
t.Fatalf("AuditMlEnv returned error: %v", err)
|
||||
}
|
||||
if report.GeneratedAt <= 0 {
|
||||
t.Errorf("GeneratedAt should be positive unix timestamp, got %d", report.GeneratedAt)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("checks tiene al menos 4 entradas", func(t *testing.T) {
|
||||
report, err := AuditMlEnv(registryRoot)
|
||||
if err != nil {
|
||||
t.Fatalf("AuditMlEnv returned error: %v", err)
|
||||
}
|
||||
if len(report.Checks) < 4 {
|
||||
t.Errorf("expected at least 4 checks, got %d", len(report.Checks))
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("gpus puede ser vacio en CI", func(t *testing.T) {
|
||||
report, err := AuditMlEnv(registryRoot)
|
||||
if err != nil {
|
||||
t.Fatalf("AuditMlEnv returned error: %v", err)
|
||||
}
|
||||
// Gpus may be empty in CI without a GPU; that's OK.
|
||||
// Just verify the field is not nil.
|
||||
if report.Gpus == nil {
|
||||
t.Error("report.Gpus should be a non-nil slice (can be empty)")
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GetGpuInfo queries NVIDIA GPUs via nvidia-smi and returns a slice of GpuInfo.
|
||||
// If nvidia-smi is not installed or no NVIDIA GPU is present, returns an empty
|
||||
// slice and a nil error (absence of NVIDIA hardware is not an error).
|
||||
func GetGpuInfo() ([]GpuInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
|
||||
if err != nil {
|
||||
// nvidia-smi not installed or no NVIDIA device — not an error.
|
||||
var exitErr *exec.ExitError
|
||||
if errors.Is(err, exec.ErrNotFound) || errors.As(err, &exitErr) {
|
||||
return []GpuInfo{}, nil
|
||||
}
|
||||
return nil, fmt.Errorf("gpu_info: nvidia-smi: %w", err)
|
||||
}
|
||||
|
||||
r := csv.NewReader(strings.NewReader(strings.TrimSpace(string(out))))
|
||||
r.TrimLeadingSpace = true
|
||||
|
||||
records, err := r.ReadAll()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gpu_info: parse csv: %w", err)
|
||||
}
|
||||
|
||||
gpus := make([]GpuInfo, 0, len(records))
|
||||
for _, rec := range records {
|
||||
if len(rec) < 6 {
|
||||
continue
|
||||
}
|
||||
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(rec[0]))
|
||||
totalMb, _ := strconv.Atoi(strings.TrimSpace(rec[2]))
|
||||
freeMb, _ := strconv.Atoi(strings.TrimSpace(rec[3]))
|
||||
|
||||
gpus = append(gpus, GpuInfo{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(rec[1]),
|
||||
VramTotalMb: totalMb,
|
||||
VramFreeMb: freeMb,
|
||||
DriverVersion: strings.TrimSpace(rec[4]),
|
||||
CudaVersion: strings.TrimSpace(rec[5]),
|
||||
})
|
||||
}
|
||||
|
||||
return gpus, nil
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
name: get_gpu_info
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func GetGpuInfo() ([]GpuInfo, error)"
|
||||
description: "Consulta GPUs NVIDIA via nvidia-smi y retorna un slice de GpuInfo con index, nombre, VRAM total/libre, driver y version CUDA. Si nvidia-smi no esta instalado o no hay GPU NVIDIA, retorna slice vacio y nil (ausencia de hardware no es error)."
|
||||
tags: [gpu, nvidia, cuda, hardware, infra, probe]
|
||||
uses_functions: []
|
||||
uses_types: ["gpu_info_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [encoding/csv, errors, fmt, os/exec, strconv, strings]
|
||||
params:
|
||||
- name: (ninguno)
|
||||
desc: "No toma parametros. Lee el estado del sistema via nvidia-smi."
|
||||
output: "Slice de GpuInfo con una entrada por GPU detectada. Slice vacio si nvidia-smi no esta instalado o termina con codigo de salida distinto de cero (por ejemplo, sin GPUs NVIDIA). Error solo si nvidia-smi no se puede ejecutar por otro motivo o si falla el parseo de la salida CSV."
|
||||
tested: true
|
||||
tests:
|
||||
- "retorna slice vacio y nil cuando no hay GPU NVIDIA"
|
||||
- "linea GPU RTX 3080 tipica"
|
||||
- "dos GPUs en el CSV"
|
||||
- "CSV vacio retorna slice vacio"
|
||||
- "linea con menos de 6 campos se ignora"
|
||||
- "espacios extra en los valores se eliminan"
|
||||
- "campos del struct GpuInfo correctos"
|
||||
test_file_path: "functions/infra/get_gpu_info_test.go"
|
||||
file_path: "functions/infra/get_gpu_info.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
gpus, err := GetGpuInfo()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if len(gpus) == 0 {
|
||||
fmt.Println("No NVIDIA GPUs detected")
|
||||
} else {
|
||||
for _, g := range gpus {
|
||||
fmt.Printf("[%d] %s VRAM: %d/%d MiB Driver: %s CUDA: %s\n",
|
||||
g.Index, g.Name, g.VramFreeMb, g.VramTotalMb,
|
||||
g.DriverVersion, g.CudaVersion)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Salida nvidia-smi
|
||||
|
||||
Ejecuta:
|
||||
```
|
||||
nvidia-smi --query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version --format=csv,noheader,nounits
|
||||
```
|
||||
|
||||
Ejemplo de salida con una GPU:
|
||||
```
|
||||
0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Requiere `nvidia-smi` en PATH (parte del driver NVIDIA).
|
||||
- La columna `cuda_version` en nvidia-smi refleja la version maxima de CUDA soportada por el driver, no la del toolkit instalado.
|
||||
- Para comprobar el toolkit CUDA instalado, usar `cuda_toolkit_check_bash_infra`.
|
||||
- En maquinas sin GPU NVIDIA retorna `([]GpuInfo{}, nil)` — el caller puede tratar esto como "sin GPU disponible".
|
||||
- No ejecutar tests automatizados para esta funcion en CI sin GPU; verificar manualmente o con mock.
|
||||
@@ -0,0 +1,165 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestGetGpuInfoNoGpu verifica que la funcion retorna slice vacio sin error
|
||||
// cuando nvidia-smi no esta instalado o no hay GPU NVIDIA presente.
|
||||
// Este test pasa en cualquier maquina, con o sin GPU.
|
||||
func TestGetGpuInfoNoGpu(t *testing.T) {
|
||||
t.Run("retorna slice vacio y nil cuando no hay GPU NVIDIA", func(t *testing.T) {
|
||||
gpus, err := GetGpuInfo()
|
||||
if err != nil {
|
||||
t.Errorf("GetGpuInfo() error inesperado: %v", err)
|
||||
}
|
||||
// En maquinas sin nvidia-smi el resultado debe ser un slice vacio (no nil)
|
||||
if gpus == nil {
|
||||
t.Error("GetGpuInfo() retorno nil, se esperaba slice vacio []GpuInfo{}")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// parseCsvNvidiaSmi replica la logica de parsing de GetGpuInfo para tests unitarios.
|
||||
// Recibe el output de nvidia-smi --format=csv,noheader,nounits y retorna []GpuInfo.
|
||||
func parseCsvNvidiaSmi(output string) ([]GpuInfo, error) {
|
||||
trimmed := strings.TrimSpace(output)
|
||||
if trimmed == "" {
|
||||
return []GpuInfo{}, nil
|
||||
}
|
||||
lines := strings.Split(trimmed, "\n")
|
||||
gpus := make([]GpuInfo, 0, len(lines))
|
||||
for _, line := range lines {
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
totalMb, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
|
||||
freeMb, _ := strconv.Atoi(strings.TrimSpace(parts[3]))
|
||||
gpus = append(gpus, GpuInfo{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
VramTotalMb: totalMb,
|
||||
VramFreeMb: freeMb,
|
||||
DriverVersion: strings.TrimSpace(parts[4]),
|
||||
CudaVersion: strings.TrimSpace(parts[5]),
|
||||
})
|
||||
}
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
// TestParseCsvNvidiaSmi verifica el parsing de la salida CSV de nvidia-smi
|
||||
// sin requerir GPU real ni nvidia-smi instalado.
|
||||
func TestParseCsvNvidiaSmi(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
csvInput string
|
||||
wantLen int
|
||||
wantIndex int
|
||||
wantName string
|
||||
wantVramTotal int
|
||||
wantVramFree int
|
||||
wantDriver string
|
||||
wantCuda string
|
||||
}{
|
||||
{
|
||||
name: "linea GPU RTX 3080 tipica",
|
||||
csvInput: "0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4",
|
||||
wantLen: 1,
|
||||
wantIndex: 0,
|
||||
wantName: "NVIDIA GeForce RTX 3080",
|
||||
wantVramTotal: 10240,
|
||||
wantVramFree: 8192,
|
||||
wantDriver: "550.54.15",
|
||||
wantCuda: "12.4",
|
||||
},
|
||||
{
|
||||
name: "dos GPUs en el CSV",
|
||||
csvInput: "0, GPU A, 8192, 4096, 525.0, 12.0\n1, GPU B, 24576, 20000, 525.0, 12.0",
|
||||
wantLen: 2,
|
||||
},
|
||||
{
|
||||
name: "CSV vacio retorna slice vacio",
|
||||
csvInput: "",
|
||||
wantLen: 0,
|
||||
},
|
||||
{
|
||||
name: "linea con menos de 6 campos se ignora",
|
||||
csvInput: "0, GPU, 8192",
|
||||
wantLen: 0,
|
||||
},
|
||||
{
|
||||
name: "espacios extra en los valores se eliminan",
|
||||
csvInput: " 1 , NVIDIA RTX 4090 , 24576 , 20000 , 545.0 , 12.6 ",
|
||||
wantLen: 1,
|
||||
wantIndex: 1,
|
||||
wantName: "NVIDIA RTX 4090",
|
||||
wantVramTotal: 24576,
|
||||
wantVramFree: 20000,
|
||||
wantDriver: "545.0",
|
||||
wantCuda: "12.6",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
gpus, err := parseCsvNvidiaSmi(tc.csvInput)
|
||||
if err != nil {
|
||||
t.Fatalf("error inesperado: %v", err)
|
||||
}
|
||||
if len(gpus) != tc.wantLen {
|
||||
t.Fatalf("len(gpus) = %d, quería %d", len(gpus), tc.wantLen)
|
||||
}
|
||||
if tc.wantLen == 1 {
|
||||
g := gpus[0]
|
||||
if g.Index != tc.wantIndex {
|
||||
t.Errorf("Index = %d, quería %d", g.Index, tc.wantIndex)
|
||||
}
|
||||
if g.Name != tc.wantName {
|
||||
t.Errorf("Name = %q, quería %q", g.Name, tc.wantName)
|
||||
}
|
||||
if g.VramTotalMb != tc.wantVramTotal {
|
||||
t.Errorf("VramTotalMb = %d, quería %d", g.VramTotalMb, tc.wantVramTotal)
|
||||
}
|
||||
if g.VramFreeMb != tc.wantVramFree {
|
||||
t.Errorf("VramFreeMb = %d, quería %d", g.VramFreeMb, tc.wantVramFree)
|
||||
}
|
||||
if g.DriverVersion != tc.wantDriver {
|
||||
t.Errorf("DriverVersion = %q, quería %q", g.DriverVersion, tc.wantDriver)
|
||||
}
|
||||
if g.CudaVersion != tc.wantCuda {
|
||||
t.Errorf("CudaVersion = %q, quería %q", g.CudaVersion, tc.wantCuda)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestGpuInfoStruct verifica los campos del tipo GpuInfo.
|
||||
func TestGpuInfoStruct(t *testing.T) {
|
||||
t.Run("campos del struct GpuInfo correctos", func(t *testing.T) {
|
||||
g := GpuInfo{
|
||||
Index: 0,
|
||||
Name: "NVIDIA GeForce GTX 1080",
|
||||
VramTotalMb: 8192,
|
||||
VramFreeMb: 6144,
|
||||
DriverVersion: "470.0",
|
||||
CudaVersion: "11.4",
|
||||
}
|
||||
if g.Index != 0 {
|
||||
t.Errorf("Index = %d", g.Index)
|
||||
}
|
||||
if g.Name != "NVIDIA GeForce GTX 1080" {
|
||||
t.Errorf("Name = %q", g.Name)
|
||||
}
|
||||
if g.VramTotalMb != 8192 {
|
||||
t.Errorf("VramTotalMb = %d", g.VramTotalMb)
|
||||
}
|
||||
if g.VramFreeMb != 6144 {
|
||||
t.Errorf("VramFreeMb = %d", g.VramFreeMb)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
package infra
|
||||
|
||||
// GpuInfo describes one GPU detected on the system: its VRAM figures and the
// driver and CUDA versions reported for it.
type GpuInfo struct {
	// Index is the device index.
	Index int `json:"index"`
	// Name is the device name.
	Name string `json:"name"`
	// VramTotalMb is the total device memory.
	VramTotalMb int `json:"vram_total_mb"`
	// VramFreeMb is the currently free device memory.
	VramFreeMb int `json:"vram_free_mb"`
	// DriverVersion is the driver version string.
	DriverVersion string `json:"driver_version"`
	// CudaVersion is the reported CUDA version; omitted from JSON when empty.
	CudaVersion string `json:"cuda_version,omitempty"`
}
|
||||
@@ -0,0 +1,171 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
)
|
||||
|
||||
// AggregateReport is the summary returned by a VaultAggregateIndex run.
type AggregateReport struct {
	// VaultsProcessed counts the vaults whose rows were committed.
	VaultsProcessed int
	// VaultsSkipped counts the vaults without a vault_index.db.
	VaultsSkipped int
	// TotalFiles is the number of file rows copied from processed vaults.
	TotalFiles int
	// Errors collects non-fatal per-vault error messages.
	Errors []string
}
|
||||
|
||||
// VaultAggregateIndex reads all vault manifests from repoRoot, opens each
|
||||
// vault_index.db and copies all file records into the central registry.db
|
||||
// vault_files table. The table is created if it does not exist (idempotent).
|
||||
//
|
||||
// For each vault the previous rows are deleted and replaced atomically, so
|
||||
// re-running always produces a clean, non-duplicated state.
|
||||
//
|
||||
// Returns an AggregateReport with counts. Per-vault errors are non-fatal
|
||||
// (logged in report.Errors); only fatal errors (e.g. registry.db
|
||||
// unreachable) are returned as the error value.
|
||||
func VaultAggregateIndex(repoRoot string) (AggregateReport, error) {
|
||||
var report AggregateReport
|
||||
|
||||
// 1. Open registry.db
|
||||
registryDB, err := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_aggregate_index: open registry.db: %w", err)
|
||||
}
|
||||
defer registryDB.Close()
|
||||
|
||||
// 2. Idempotent schema migration
|
||||
for _, stmt := range []string{
|
||||
`CREATE TABLE IF NOT EXISTS vault_files (
|
||||
vault_id TEXT NOT NULL,
|
||||
vault_name TEXT NOT NULL,
|
||||
rel_path TEXT NOT NULL,
|
||||
size INTEGER NOT NULL,
|
||||
mtime INTEGER NOT NULL,
|
||||
sha256 TEXT NOT NULL,
|
||||
mime TEXT NOT NULL DEFAULT '',
|
||||
ext TEXT NOT NULL DEFAULT '',
|
||||
bucket TEXT NOT NULL DEFAULT '',
|
||||
sub_bucket TEXT NOT NULL DEFAULT '',
|
||||
indexed_at INTEGER NOT NULL,
|
||||
PRIMARY KEY (vault_id, rel_path)
|
||||
);`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_vault_files_sha256 ON vault_files(sha256);`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_vault_files_vault ON vault_files(vault_id);`,
|
||||
} {
|
||||
if _, err := registryDB.Exec(stmt); err != nil {
|
||||
if !isIdempotentMigrationError(err) {
|
||||
return report, fmt.Errorf("vault_aggregate_index: schema: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Read manifest
|
||||
entries, err := VaultManifestRead(repoRoot)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_aggregate_index: manifest: %w", err)
|
||||
}
|
||||
|
||||
now := time.Now().UTC().Unix()
|
||||
|
||||
for _, entry := range entries {
|
||||
vaultID := vaultIDFromEntry(entry)
|
||||
vaultName := entry.Name
|
||||
vaultPath := entry.Path
|
||||
|
||||
indexPath := filepath.Join(vaultPath, "vault_index.db")
|
||||
if _, statErr := os.Stat(indexPath); statErr != nil {
|
||||
report.VaultsSkipped++
|
||||
continue
|
||||
}
|
||||
|
||||
vaultDB, openErr := VaultIndexOpen(vaultPath)
|
||||
if openErr != nil {
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: open index: %v", vaultName, openErr))
|
||||
continue
|
||||
}
|
||||
|
||||
rows, queryErr := vaultDB.Query(
|
||||
`SELECT rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket FROM files`,
|
||||
)
|
||||
if queryErr != nil {
|
||||
vaultDB.Close()
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: query files: %v", vaultName, queryErr))
|
||||
continue
|
||||
}
|
||||
|
||||
type fileRow struct {
|
||||
RelPath string
|
||||
Size int64
|
||||
Mtime int64
|
||||
Sha256 string
|
||||
Mime string
|
||||
Ext string
|
||||
Bucket string
|
||||
SubBucket string
|
||||
}
|
||||
var fileRows []fileRow
|
||||
for rows.Next() {
|
||||
var r fileRow
|
||||
if scanErr := rows.Scan(&r.RelPath, &r.Size, &r.Mtime, &r.Sha256, &r.Mime, &r.Ext, &r.Bucket, &r.SubBucket); scanErr != nil {
|
||||
continue
|
||||
}
|
||||
fileRows = append(fileRows, r)
|
||||
}
|
||||
rows.Close()
|
||||
vaultDB.Close()
|
||||
|
||||
// Atomic replace in registry.db
|
||||
tx, txErr := registryDB.Begin()
|
||||
if txErr != nil {
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: begin tx: %v", vaultName, txErr))
|
||||
continue
|
||||
}
|
||||
|
||||
if _, delErr := tx.Exec(`DELETE FROM vault_files WHERE vault_id = ?`, vaultID); delErr != nil {
|
||||
tx.Rollback()
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: delete: %v", vaultName, delErr))
|
||||
continue
|
||||
}
|
||||
|
||||
stmt, prepErr := tx.Prepare(`
|
||||
INSERT INTO vault_files
|
||||
(vault_id, vault_name, rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
|
||||
if prepErr != nil {
|
||||
tx.Rollback()
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: prepare: %v", vaultName, prepErr))
|
||||
continue
|
||||
}
|
||||
|
||||
for _, r := range fileRows {
|
||||
if _, insErr := stmt.Exec(vaultID, vaultName, r.RelPath, r.Size, r.Mtime, r.Sha256, r.Mime, r.Ext, r.Bucket, r.SubBucket, now); insErr != nil {
|
||||
stmt.Close()
|
||||
tx.Rollback()
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: insert %s: %v", vaultName, r.RelPath, insErr))
|
||||
continue
|
||||
}
|
||||
}
|
||||
stmt.Close()
|
||||
|
||||
if commitErr := tx.Commit(); commitErr != nil {
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: commit: %v", vaultName, commitErr))
|
||||
continue
|
||||
}
|
||||
|
||||
report.VaultsProcessed++
|
||||
report.TotalFiles += len(fileRows)
|
||||
}
|
||||
|
||||
return report, nil
|
||||
}
|
||||
|
||||
// vaultIDFromEntry constructs the canonical vault ID used in registry.db.
|
||||
// Pattern: "<vault_name>_<project_id>" — consistent with the vaults table.
|
||||
func vaultIDFromEntry(e VaultManifestEntry) string {
|
||||
if e.ProjectID == "" {
|
||||
return e.Name
|
||||
}
|
||||
return e.Name + "_" + e.ProjectID
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
---
|
||||
name: vault_aggregate_index
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultAggregateIndex(repoRoot string) (AggregateReport, error)"
|
||||
description: "Agrega los índices de todos los vaults del registry en la tabla vault_files de registry.db. Lee cada vault_index.db (via VaultIndexOpen) y reemplaza las filas de forma atómica. Idempotente: re-ejecutar limpia y reescribe sin duplicar."
|
||||
tags: [vault, index, aggregate, registry]
|
||||
uses_functions:
|
||||
- "vault_manifest_read_go_infra"
|
||||
- "vault_index_open_go_infra"
|
||||
- "sqlite_open_go_infra"
|
||||
uses_types:
|
||||
- "vault_file_go_infra"
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- "database/sql"
|
||||
- "fmt"
|
||||
- "os"
|
||||
- "path/filepath"
|
||||
- "time"
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultAggregateIndex_NoVaults"
|
||||
- "TestVaultAggregateIndex_VaultWithoutIndex"
|
||||
- "TestVaultAggregateIndex_HappyPath"
|
||||
- "TestVaultAggregateIndex_ReRunReplaces"
|
||||
test_file_path: "functions/infra/vault_aggregate_index_test.go"
|
||||
file_path: "functions/infra/vault_aggregate_index.go"
|
||||
params:
|
||||
- name: repoRoot
|
||||
desc: "Ruta absoluta a la raiz del fn_registry (contiene registry.db y projects/)."
|
||||
output: "AggregateReport con VaultsProcessed, VaultsSkipped (sin vault_index.db), TotalFiles y Errors (errores no fatales por vault). Error fatal solo si registry.db no se puede abrir."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
report, err := infra.VaultAggregateIndex("/home/lucas/fn_registry")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Printf("Processed: %d vaults, %d files\n", report.VaultsProcessed, report.TotalFiles)
|
||||
for _, e := range report.Errors {
|
||||
fmt.Println("warning:", e)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- La tabla `vault_files` está definida en `registry/migrations/012_vault_files.sql` (que el indexer aplica al arrancar), pero no es necesario haberla aplicado antes: la función crea la tabla ella misma de forma idempotente con `CREATE TABLE IF NOT EXISTS`.
|
||||
- Por cada vault: `DELETE WHERE vault_id = ?` + batch `INSERT` dentro de una transacción. Re-run siempre produce el mismo resultado.
|
||||
- Vaults sin `vault_index.db` se cuentan en `VaultsSkipped` y se omiten sin error.
|
||||
- El `vault_id` sigue el patrón `<vault_name>_<project_id>`, consistente con la tabla `vaults` de registry.db.
|
||||
@@ -0,0 +1,175 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// setupAggregateTestRepo creates a minimal repo layout:
|
||||
//
|
||||
// <root>/
|
||||
// registry.db (SQLite, empty)
|
||||
// projects/<project>/vaults/vault.yaml
|
||||
// <vaultPath>/ (optionally with vault_index.db populated)
|
||||
func setupAggregateTestRepo(t *testing.T, vaultName, projectID, vaultPath string, withIndex bool) string {
|
||||
t.Helper()
|
||||
root := t.TempDir()
|
||||
|
||||
// Create registry.db
|
||||
regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
|
||||
if err != nil {
|
||||
t.Fatalf("create registry.db: %v", err)
|
||||
}
|
||||
regDB.Close()
|
||||
|
||||
// Create project vault manifest
|
||||
projVaultsDir := filepath.Join(root, "projects", projectID, "vaults")
|
||||
if err := os.MkdirAll(projVaultsDir, 0755); err != nil {
|
||||
t.Fatalf("mkdir projects: %v", err)
|
||||
}
|
||||
manifestYAML := "vaults:\n - name: " + vaultName + "\n description: test\n path: " + vaultPath + "\n tags: []\n"
|
||||
if err := os.WriteFile(filepath.Join(projVaultsDir, "vault.yaml"), []byte(manifestYAML), 0644); err != nil {
|
||||
t.Fatalf("write vault.yaml: %v", err)
|
||||
}
|
||||
|
||||
// Create vault dir
|
||||
if err := os.MkdirAll(vaultPath, 0755); err != nil {
|
||||
t.Fatalf("mkdir vault: %v", err)
|
||||
}
|
||||
|
||||
if withIndex {
|
||||
// Create a vault_index.db with one file row
|
||||
vdb, err := VaultIndexOpen(vaultPath)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
now := time.Now().UTC().Unix()
|
||||
_, err = vdb.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
"data/raw/sample.csv", 1024, now, "deadbeef", "text/csv", ".csv", "data", "raw", now)
|
||||
if err != nil {
|
||||
t.Fatalf("insert test file: %v", err)
|
||||
}
|
||||
vdb.Close()
|
||||
}
|
||||
|
||||
return root
|
||||
}
|
||||
|
||||
func TestVaultAggregateIndex_NoVaults(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
// No manifests, just registry.db
|
||||
regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
|
||||
if err != nil {
|
||||
t.Fatalf("create registry.db: %v", err)
|
||||
}
|
||||
regDB.Close()
|
||||
|
||||
report, err := VaultAggregateIndex(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if report.VaultsProcessed != 0 {
|
||||
t.Errorf("VaultsProcessed: want 0, got %d", report.VaultsProcessed)
|
||||
}
|
||||
if len(report.Errors) != 0 {
|
||||
t.Errorf("Errors: want empty, got %v", report.Errors)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultAggregateIndex_VaultWithoutIndex(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
root := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultDir, false /* no vault_index.db */)
|
||||
|
||||
report, err := VaultAggregateIndex(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if report.VaultsSkipped != 1 {
|
||||
t.Errorf("VaultsSkipped: want 1, got %d", report.VaultsSkipped)
|
||||
}
|
||||
if report.VaultsProcessed != 0 {
|
||||
t.Errorf("VaultsProcessed: want 0, got %d", report.VaultsProcessed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultAggregateIndex_HappyPath(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
root := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultDir, true)
|
||||
|
||||
report, err := VaultAggregateIndex(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if report.VaultsProcessed != 1 {
|
||||
t.Errorf("VaultsProcessed: want 1, got %d", report.VaultsProcessed)
|
||||
}
|
||||
if report.TotalFiles != 1 {
|
||||
t.Errorf("TotalFiles: want 1, got %d", report.TotalFiles)
|
||||
}
|
||||
|
||||
// Verify row exists in registry.db
|
||||
regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
|
||||
if err != nil {
|
||||
t.Fatalf("open registry.db: %v", err)
|
||||
}
|
||||
defer regDB.Close()
|
||||
|
||||
var count int
|
||||
if err := regDB.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&count); err != nil {
|
||||
t.Fatalf("count vault_files: %v", err)
|
||||
}
|
||||
if count != 1 {
|
||||
t.Errorf("vault_files count: want 1, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultAggregateIndex_ReRunReplaces(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
root := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultDir, true)
|
||||
|
||||
// First run
|
||||
if _, err := VaultAggregateIndex(root); err != nil {
|
||||
t.Fatalf("first run: %v", err)
|
||||
}
|
||||
|
||||
// Add a second file to vault_index.db
|
||||
vdb, err := VaultIndexOpen(vaultDir)
|
||||
if err != nil {
|
||||
t.Fatalf("reopen vault index: %v", err)
|
||||
}
|
||||
now := time.Now().UTC().Unix()
|
||||
_, err = vdb.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
"data/raw/extra.csv", 512, now, "cafebabe", "text/csv", ".csv", "data", "raw", now)
|
||||
if err != nil {
|
||||
t.Fatalf("insert second file: %v", err)
|
||||
}
|
||||
vdb.Close()
|
||||
|
||||
// Second run
|
||||
report, err := VaultAggregateIndex(root)
|
||||
if err != nil {
|
||||
t.Fatalf("second run: %v", err)
|
||||
}
|
||||
if report.TotalFiles != 2 {
|
||||
t.Errorf("TotalFiles: want 2, got %d", report.TotalFiles)
|
||||
}
|
||||
|
||||
// Verify no duplicates — exactly 2 rows
|
||||
regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
|
||||
if err != nil {
|
||||
t.Fatalf("open registry.db: %v", err)
|
||||
}
|
||||
defer regDB.Close()
|
||||
|
||||
var count int
|
||||
if err := regDB.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&count); err != nil {
|
||||
t.Fatalf("count vault_files: %v", err)
|
||||
}
|
||||
if count != 2 {
|
||||
t.Errorf("vault_files count after re-run: want 2, got %d", count)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
package infra
|
||||
|
||||
import "sort"
|
||||
|
||||
// VaultFileChange holds the before/after state of a file whose content changed.
|
||||
type VaultFileChange struct {
|
||||
RelPath string
|
||||
Prev VaultFile
|
||||
Curr VaultFile
|
||||
}
|
||||
|
||||
// VaultDiffReport is the result of comparing two VaultFile slices.
|
||||
type VaultDiffReport struct {
|
||||
Added []VaultFile // in curr but not in prev (by rel_path)
|
||||
Removed []VaultFile // in prev but not in curr
|
||||
Changed []VaultFileChange // same rel_path, different sha256
|
||||
Unchanged int // files present in both with identical sha256
|
||||
}
|
||||
|
||||
// VaultDiff computes the difference between two vault snapshots.
|
||||
// It indexes both slices by RelPath, then classifies each entry as
|
||||
// Added, Removed, Changed, or Unchanged. All output slices are sorted
|
||||
// by RelPath ascending. The function is pure and deterministic.
|
||||
func VaultDiff(prev, curr []VaultFile) VaultDiffReport {
|
||||
prevMap := make(map[string]VaultFile, len(prev))
|
||||
for _, f := range prev {
|
||||
prevMap[f.RelPath] = f
|
||||
}
|
||||
currMap := make(map[string]VaultFile, len(curr))
|
||||
for _, f := range curr {
|
||||
currMap[f.RelPath] = f
|
||||
}
|
||||
|
||||
var report VaultDiffReport
|
||||
|
||||
for _, f := range curr {
|
||||
p, exists := prevMap[f.RelPath]
|
||||
if !exists {
|
||||
report.Added = append(report.Added, f)
|
||||
} else if p.Sha256 != f.Sha256 {
|
||||
report.Changed = append(report.Changed, VaultFileChange{
|
||||
RelPath: f.RelPath,
|
||||
Prev: p,
|
||||
Curr: f,
|
||||
})
|
||||
} else {
|
||||
report.Unchanged++
|
||||
}
|
||||
}
|
||||
|
||||
for _, f := range prev {
|
||||
if _, exists := currMap[f.RelPath]; !exists {
|
||||
report.Removed = append(report.Removed, f)
|
||||
}
|
||||
}
|
||||
|
||||
sort.Slice(report.Added, func(i, j int) bool {
|
||||
return report.Added[i].RelPath < report.Added[j].RelPath
|
||||
})
|
||||
sort.Slice(report.Removed, func(i, j int) bool {
|
||||
return report.Removed[i].RelPath < report.Removed[j].RelPath
|
||||
})
|
||||
sort.Slice(report.Changed, func(i, j int) bool {
|
||||
return report.Changed[i].RelPath < report.Changed[j].RelPath
|
||||
})
|
||||
|
||||
return report
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
---
|
||||
name: vault_diff
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "func VaultDiff(prev, curr []VaultFile) VaultDiffReport"
|
||||
description: "Computes the diff between two vault snapshots (slices of VaultFile). Returns the Added, Removed and Changed entries plus a count of Unchanged files. Pure and deterministic — no I/O."
|
||||
tags: [vault, diff, comparison, pure]
|
||||
uses_functions: []
|
||||
uses_types: ["vault_file_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["sort"]
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultDiff_NoChanges"
|
||||
- "TestVaultDiff_AllAdded"
|
||||
- "TestVaultDiff_AllRemoved"
|
||||
- "TestVaultDiff_ContentChanged"
|
||||
- "TestVaultDiff_Mixed"
|
||||
test_file_path: "functions/infra/vault_diff_test.go"
|
||||
file_path: "functions/infra/vault_diff.go"
|
||||
params:
|
||||
- name: prev
|
||||
desc: "Snapshot anterior — slice de VaultFile del estado previo del vault (puede ser nil para diff desde cero)."
|
||||
- name: curr
|
||||
desc: "Snapshot actual — slice de VaultFile del estado corriente del vault (puede ser nil para diff de borrado total)."
|
||||
output: "VaultDiffReport con Added (nuevos), Removed (eliminados), Changed (mismo rel_path, sha256 distinto) y Unchanged (identicos). Todos los slices ordenados por RelPath ASC."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
prev, _ := infra.VaultInventoryScan(oldPath, "my_vault_proj", "my_vault")
|
||||
curr, _ := infra.VaultInventoryScan(newPath, "my_vault_proj", "my_vault")
|
||||
report := infra.VaultDiff(prev, curr)
|
||||
fmt.Printf("Added: %d, Removed: %d, Changed: %d, Unchanged: %d\n",
|
||||
len(report.Added), len(report.Removed), len(report.Changed), report.Unchanged)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Usa `RelPath` como clave de identidad de archivo (no nombre, no sha256).
|
||||
- Dos archivos con mismo `RelPath` pero diferente `Sha256` se consideran Changed.
|
||||
- Los slices del report se ordenan por `RelPath` ASC para salida deterministica.
|
||||
- Función pura: no toca disco ni BD.
|
||||
@@ -0,0 +1,126 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func makeVF(relPath, sha256 string) VaultFile {
|
||||
return VaultFile{
|
||||
VaultID: "test_vault",
|
||||
VaultName: "test",
|
||||
RelPath: relPath,
|
||||
Sha256: sha256,
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_NoChanges(t *testing.T) {
|
||||
files := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"),
|
||||
makeVF("data/b.csv", "bbb"),
|
||||
}
|
||||
report := VaultDiff(files, files)
|
||||
if len(report.Added) != 0 {
|
||||
t.Errorf("Added: want 0, got %d", len(report.Added))
|
||||
}
|
||||
if len(report.Removed) != 0 {
|
||||
t.Errorf("Removed: want 0, got %d", len(report.Removed))
|
||||
}
|
||||
if len(report.Changed) != 0 {
|
||||
t.Errorf("Changed: want 0, got %d", len(report.Changed))
|
||||
}
|
||||
if report.Unchanged != 2 {
|
||||
t.Errorf("Unchanged: want 2, got %d", report.Unchanged)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_AllAdded(t *testing.T) {
|
||||
curr := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"),
|
||||
makeVF("data/b.csv", "bbb"),
|
||||
}
|
||||
report := VaultDiff(nil, curr)
|
||||
if len(report.Added) != 2 {
|
||||
t.Errorf("Added: want 2, got %d", len(report.Added))
|
||||
}
|
||||
if len(report.Removed) != 0 {
|
||||
t.Errorf("Removed: want 0, got %d", len(report.Removed))
|
||||
}
|
||||
if report.Added[0].RelPath != "data/a.csv" {
|
||||
t.Errorf("Added[0]: want data/a.csv, got %s", report.Added[0].RelPath)
|
||||
}
|
||||
if report.Added[1].RelPath != "data/b.csv" {
|
||||
t.Errorf("Added[1]: want data/b.csv, got %s", report.Added[1].RelPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_AllRemoved(t *testing.T) {
|
||||
prev := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"),
|
||||
makeVF("data/b.csv", "bbb"),
|
||||
}
|
||||
report := VaultDiff(prev, nil)
|
||||
if len(report.Removed) != 2 {
|
||||
t.Errorf("Removed: want 2, got %d", len(report.Removed))
|
||||
}
|
||||
if len(report.Added) != 0 {
|
||||
t.Errorf("Added: want 0, got %d", len(report.Added))
|
||||
}
|
||||
if report.Removed[0].RelPath != "data/a.csv" {
|
||||
t.Errorf("Removed[0]: want data/a.csv, got %s", report.Removed[0].RelPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_ContentChanged(t *testing.T) {
|
||||
prev := []VaultFile{
|
||||
makeVF("data/a.csv", "old_hash"),
|
||||
}
|
||||
curr := []VaultFile{
|
||||
makeVF("data/a.csv", "new_hash"),
|
||||
}
|
||||
report := VaultDiff(prev, curr)
|
||||
if len(report.Changed) != 1 {
|
||||
t.Fatalf("Changed: want 1, got %d", len(report.Changed))
|
||||
}
|
||||
if report.Changed[0].RelPath != "data/a.csv" {
|
||||
t.Errorf("Changed[0].RelPath: want data/a.csv, got %s", report.Changed[0].RelPath)
|
||||
}
|
||||
if report.Changed[0].Prev.Sha256 != "old_hash" {
|
||||
t.Errorf("Changed[0].Prev.Sha256: want old_hash, got %s", report.Changed[0].Prev.Sha256)
|
||||
}
|
||||
if report.Changed[0].Curr.Sha256 != "new_hash" {
|
||||
t.Errorf("Changed[0].Curr.Sha256: want new_hash, got %s", report.Changed[0].Curr.Sha256)
|
||||
}
|
||||
if len(report.Added) != 0 || len(report.Removed) != 0 {
|
||||
t.Errorf("Expected no added/removed, got %d/%d", len(report.Added), len(report.Removed))
|
||||
}
|
||||
if report.Unchanged != 0 {
|
||||
t.Errorf("Unchanged: want 0, got %d", report.Unchanged)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_Mixed(t *testing.T) {
|
||||
prev := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"),
|
||||
makeVF("data/b.csv", "bbb"),
|
||||
makeVF("data/c.csv", "ccc"),
|
||||
}
|
||||
curr := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"), // unchanged
|
||||
makeVF("data/b.csv", "bbb_new"), // changed
|
||||
makeVF("data/d.csv", "ddd"), // added
|
||||
}
|
||||
report := VaultDiff(prev, curr)
|
||||
|
||||
if len(report.Added) != 1 || report.Added[0].RelPath != "data/d.csv" {
|
||||
t.Errorf("Added: want [data/d.csv], got %v", report.Added)
|
||||
}
|
||||
if len(report.Removed) != 1 || report.Removed[0].RelPath != "data/c.csv" {
|
||||
t.Errorf("Removed: want [data/c.csv], got %v", report.Removed)
|
||||
}
|
||||
if len(report.Changed) != 1 || report.Changed[0].RelPath != "data/b.csv" {
|
||||
t.Errorf("Changed: want [data/b.csv], got %v", report.Changed)
|
||||
}
|
||||
if report.Unchanged != 1 {
|
||||
t.Errorf("Unchanged: want 1, got %d", report.Unchanged)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,230 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// VaultDoctorEntry is the health report produced for one vault.
type VaultDoctorEntry struct {
	VaultName string `json:"vault_name"`
	VaultPath string `json:"vault_path"`
	ProjectID string `json:"project_id"`

	// Issues lists the problems found, in human-readable form; empty means
	// the vault is healthy.
	Issues []string `json:"issues"`

	// IndexedFiles is the number of rows in the vault index; 0 if there is
	// no vault_index.db.
	IndexedFiles int `json:"indexed_files"`

	// LastIndexedAt is a unix timestamp in seconds; 0 if not applicable.
	LastIndexedAt int64 `json:"last_indexed_at"`

	// DiskFiles is the file count obtained via WalkDir (no hashing).
	DiskFiles int `json:"disk_files"`

	// Status is one of "ok", "warning" or "error".
	Status string `json:"status"`
}
|
||||
|
||||
// VaultDoctor audits every vault declared in projects/*/vaults/vault.yaml under
|
||||
// repoRoot. For each vault it performs a series of checks (disk presence, layout,
|
||||
// index existence, staleness, drift) and returns a slice of VaultDoctorEntry.
|
||||
//
|
||||
// The function is read-only: it never writes to disk or any database.
|
||||
// Returns an error only if VaultManifestRead fails (manifest parse error).
|
||||
func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error) {
|
||||
entries, err := VaultManifestRead(repoRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_doctor: read manifests: %w", err)
|
||||
}
|
||||
|
||||
results := make([]VaultDoctorEntry, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
result := auditVault(e)
|
||||
results = append(results, result)
|
||||
}
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func auditVault(e VaultManifestEntry) VaultDoctorEntry {
|
||||
entry := VaultDoctorEntry{
|
||||
VaultName: e.Name,
|
||||
VaultPath: e.Path,
|
||||
ProjectID: e.ProjectID,
|
||||
}
|
||||
|
||||
// Resolve symlinks for disk checks
|
||||
realPath, err := filepath.EvalSymlinks(e.Path)
|
||||
if err != nil || realPath == "" {
|
||||
realPath = e.Path
|
||||
}
|
||||
|
||||
// CHECK 1: directory_missing
|
||||
info, statErr := os.Stat(realPath)
|
||||
if statErr != nil || !info.IsDir() {
|
||||
entry.Issues = append(entry.Issues, "directory_missing")
|
||||
entry.Status = "error"
|
||||
return entry
|
||||
}
|
||||
|
||||
// COUNT disk files (cheap walk — no hashing, no mime detection)
|
||||
diskCount := countDiskFiles(realPath)
|
||||
entry.DiskFiles = diskCount
|
||||
|
||||
// CHECK 2: layout_missing / non_standard_layout
|
||||
hasData := dirExists(filepath.Join(realPath, "data"))
|
||||
hasKnowledge := dirExists(filepath.Join(realPath, "knowledge"))
|
||||
if !hasData && !hasKnowledge {
|
||||
// Check if it looks like a non-standard but intentional layout
|
||||
if hasNonStandardLayout(realPath) {
|
||||
entry.Issues = append(entry.Issues, "non_standard_layout")
|
||||
} else {
|
||||
entry.Issues = append(entry.Issues, "layout_missing")
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK 3: index_missing
|
||||
indexPath := filepath.Join(realPath, "vault_index.db")
|
||||
_, indexStatErr := os.Stat(indexPath)
|
||||
if indexStatErr != nil {
|
||||
entry.Issues = append(entry.Issues, "index_missing")
|
||||
entry.setWarningStatus()
|
||||
entry.setFinalStatus()
|
||||
return entry
|
||||
}
|
||||
|
||||
// Open vault index (read-only) for checks 4 and 5
|
||||
vdb, openErr := VaultIndexOpen(realPath)
|
||||
if openErr != nil {
|
||||
entry.Issues = append(entry.Issues, fmt.Sprintf("index_open_error: %v", openErr))
|
||||
entry.setWarningStatus()
|
||||
return entry
|
||||
}
|
||||
defer vdb.Close()
|
||||
|
||||
// Query indexed file count and max indexed_at
|
||||
var indexedCount int
|
||||
var maxIndexedAt int64
|
||||
row := vdb.QueryRow(`SELECT COUNT(*), COALESCE(MAX(indexed_at), 0) FROM files`)
|
||||
if scanErr := row.Scan(&indexedCount, &maxIndexedAt); scanErr != nil {
|
||||
entry.Issues = append(entry.Issues, fmt.Sprintf("index_query_error: %v", scanErr))
|
||||
} else {
|
||||
entry.IndexedFiles = indexedCount
|
||||
entry.LastIndexedAt = maxIndexedAt
|
||||
}
|
||||
|
||||
// CHECK 4: index_stale — any file on disk newer than MAX(indexed_at)
|
||||
if maxIndexedAt > 0 {
|
||||
maxTime := time.Unix(maxIndexedAt, 0)
|
||||
if isIndexStale(realPath, maxTime) {
|
||||
entry.Issues = append(entry.Issues, "index_stale")
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK 5: index_drift — disk file count != indexed count
|
||||
if indexedCount != diskCount {
|
||||
entry.Issues = append(entry.Issues, fmt.Sprintf("index_drift: disk=%d indexed=%d", diskCount, indexedCount))
|
||||
}
|
||||
|
||||
// CHECK 6: empty_vault
|
||||
if diskCount == 0 {
|
||||
entry.Issues = append(entry.Issues, "empty_vault")
|
||||
}
|
||||
|
||||
entry.setFinalStatus()
|
||||
return entry
|
||||
}
|
||||
|
||||
// setWarningStatus sets status to warning if not already error.
|
||||
func (e *VaultDoctorEntry) setWarningStatus() {
|
||||
if e.Status != "error" {
|
||||
e.Status = "warning"
|
||||
}
|
||||
}
|
||||
|
||||
// setFinalStatus derives the final Status from Issues.
|
||||
func (e *VaultDoctorEntry) setFinalStatus() {
|
||||
if e.Status == "error" {
|
||||
return
|
||||
}
|
||||
if len(e.Issues) == 0 {
|
||||
e.Status = "ok"
|
||||
} else {
|
||||
e.Status = "warning"
|
||||
}
|
||||
}
|
||||
|
||||
// countDiskFiles walks realPath and returns the number of non-directory
// entries found, excluding:
//   - hidden files and hidden directories (name starts with ".") at any
//     depth below the root, which also covers .git/;
//   - the index database files vault_index.db, vault_index.db-shm and
//     vault_index.db-wal.
//
// The hidden-name filter is not applied to the root itself, so a vault whose
// own directory name starts with "." (including the path ".") is still
// walked instead of being skipped wholesale.
//
// Entries that cannot be read are ignored rather than aborting the walk, so
// the count is best-effort; a root that cannot be read yields 0.
func countDiskFiles(realPath string) int {
	count := 0
	_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			// Unreadable entry (d may be nil here): ignore and carry on.
			return nil
		}
		name := d.Name()

		// Skip hidden entries below the root; pruning hidden directories
		// also keeps the walk out of .git/.
		if path != realPath && strings.HasPrefix(name, ".") {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		if d.IsDir() {
			return nil
		}

		// The index database and its sidecar files are not vault content.
		switch name {
		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
			return nil
		}
		count++
		return nil
	})
	return count
}
|
||||
|
||||
// isIndexStale reports whether any non-directory entry under realPath has a
// modification time strictly after maxTime. Hidden files and directories
// (name starts with ".") below the root are ignored, as are vault_index.db,
// vault_index.db-shm and vault_index.db-wal.
//
// The hidden-name filter is not applied to the root itself, so a vault whose
// own directory name starts with "." is still inspected.
//
// Entries that cannot be read or stat'ed are ignored, so the answer is
// best-effort. Once a stale file is found the rest of the walk is pruned.
func isIndexStale(realPath string, maxTime time.Time) bool {
	stale := false
	_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			// Unreadable entry (d may be nil here): ignore and carry on.
			return nil
		}
		if stale {
			// Answer already known. SkipDir prunes this directory, or the
			// remaining siblings when returned for a file, so the walk
			// unwinds quickly.
			return filepath.SkipDir
		}
		name := d.Name()

		// Skip hidden entries below the root; this also prunes .git/.
		if path != realPath && strings.HasPrefix(name, ".") {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		if d.IsDir() {
			return nil
		}

		// The index database is rewritten by indexing itself; its mtime
		// says nothing about vault content.
		switch name {
		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
			return nil
		}

		if fi, infoErr := d.Info(); infoErr == nil && fi.ModTime().After(maxTime) {
			stale = true
			return filepath.SkipDir
		}
		return nil
	})
	return stale
}
|
||||
|
||||
// hasNonStandardLayout reports whether the vault root holds at least one
// subdirectory that looks intentional but is not part of the standard
// layout. Heuristic: any direct, non-hidden subdirectory other than data/
// and knowledge/ qualifies. Returns false when realPath cannot be listed.
func hasNonStandardLayout(realPath string) bool {
	children, readErr := os.ReadDir(realPath)
	if readErr != nil {
		return false
	}
	for _, child := range children {
		if !child.IsDir() {
			continue
		}
		name := child.Name()
		switch {
		case name == "data", name == "knowledge":
			// Standard layout directories.
		case strings.HasPrefix(name, "."):
			// Hidden directories (including .git) do not count.
		default:
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,66 @@
|
||||
---
|
||||
name: vault_doctor
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error)"
|
||||
description: "Audita la salud de todos los vaults declarados en projects/*/vaults/vault.yaml. Comprueba existencia del directorio, layout estándar, presencia del índice, staleness y drift entre disco e índice. Read-only."
|
||||
tags: [vault, doctor, health, audit]
|
||||
uses_functions:
|
||||
- "vault_manifest_read_go_infra"
|
||||
- "vault_index_open_go_infra"
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- "fmt"
|
||||
- "os"
|
||||
- "path/filepath"
|
||||
- "strings"
|
||||
- "time"
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultDoctor_OK"
|
||||
- "TestVaultDoctor_MissingDir"
|
||||
- "TestVaultDoctor_NoIndex"
|
||||
- "TestVaultDoctor_LayoutDrift"
|
||||
- "TestVaultDoctor_EmptyVault"
|
||||
test_file_path: "functions/infra/vault_doctor_test.go"
|
||||
file_path: "functions/infra/vault_doctor.go"
|
||||
params:
|
||||
- name: repoRoot
|
||||
desc: "Ruta absoluta a la raiz del fn_registry (donde están projects/ y registry.db)."
|
||||
output: "Slice de VaultDoctorEntry con Status (ok/warning/error), Issues, DiskFiles, IndexedFiles y LastIndexedAt por vault. Error fatal solo si los manifests no se pueden leer."
|
||||
---
|
||||
|
||||
## Checks aplicados
|
||||
|
||||
| Check | Condición | Severidad |
|
||||
|---|---|---|
|
||||
| `directory_missing` | `e.Path` no existe en disco o no es un directorio | error |
|
||||
| `layout_missing` | no hay `data/` ni `knowledge/` en la raíz del vault | warning |
|
||||
| `non_standard_layout` | no hay `data/`/`knowledge/` pero sí otros subdirectorios (ej. imagegen_models) | warning |
|
||||
| `index_missing` | no existe `vault_index.db` | warning |
|
||||
| `index_stale` | algún archivo en disco tiene mtime > MAX(indexed_at) | warning |
|
||||
| `index_drift` | count disco != count en tabla `files` | warning |
|
||||
| `empty_vault` | DiskFiles == 0 | warning |
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
entries, err := infra.VaultDoctor("/home/lucas/fn_registry")
|
||||
for _, e := range entries {
|
||||
fmt.Printf("%-30s %-8s files=%d issues=%v\n",
|
||||
e.VaultName, e.Status, e.DiskFiles, e.Issues)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Función read-only: nunca escribe en disco ni en ninguna base de datos.
|
||||
- `countDiskFiles` usa `filepath.WalkDir` sin hash (cheap) — excluye `vault_index.db*`, `.git/` y ficheros ocultos.
|
||||
- `isIndexStale` también usa WalkDir; compara mtime de archivos con MAX(indexed_at) de la BD.
|
||||
- Antes de abrir el índice se comprueba con `os.Stat` que `vault_index.db` existe; si no existe se reporta `index_missing` y no se llama a `VaultIndexOpen`, de modo que la auditoría nunca crea la BD.
|
||||
@@ -0,0 +1,211 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// setupDoctorRepo creates a temporary repo containing a single project
// manifest that declares one vault, and returns the repo root. vaultPath
// must be absolute; it may point at an existing directory or, for the
// missing-directory tests, at a path that does not exist. The vault
// directory itself is never created here.
func setupDoctorRepo(t *testing.T, vaultName, projectID, vaultPath string) string {
	t.Helper()
	root := t.TempDir()

	manifestDir := filepath.Join(root, "projects", projectID, "vaults")
	if mkErr := os.MkdirAll(manifestDir, 0755); mkErr != nil {
		t.Fatalf("mkdir projects: %v", mkErr)
	}

	yamlDoc := []byte("vaults:\n - name: " + vaultName + "\n description: test vault\n path: " + vaultPath + "\n tags: []\n")
	if writeErr := os.WriteFile(filepath.Join(manifestDir, "vault.yaml"), yamlDoc, 0644); writeErr != nil {
		t.Fatalf("write vault.yaml: %v", writeErr)
	}
	return root
}
|
||||
|
||||
func TestVaultDoctor_OK(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
|
||||
// Proper layout
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "data", "raw"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "knowledge"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create a file with a past mtime so the index is not stale
|
||||
samplePath := filepath.Join(vaultDir, "data", "raw", "sample.csv")
|
||||
if err := os.WriteFile(samplePath, []byte("a,b\n1,2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
pastTime := time.Now().Add(-1 * time.Hour)
|
||||
if err := os.Chtimes(samplePath, pastTime, pastTime); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create vault_index.db with the file indexed after its mtime
|
||||
vdb, err := VaultIndexOpen(vaultDir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
futureIndexed := time.Now().Unix() // indexed_at is now — after file mtime
|
||||
_, err = vdb.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
"data/raw/sample.csv", 8, pastTime.Unix(), "deadbeef", "text/csv", ".csv", "data", "raw", futureIndexed)
|
||||
if err != nil {
|
||||
t.Fatalf("insert: %v", err)
|
||||
}
|
||||
vdb.Close()
|
||||
|
||||
root := setupDoctorRepo(t, "my_vault", "my_proj", vaultDir)
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "ok" {
|
||||
t.Errorf("Status: want ok, got %s (issues: %v)", e.Status, e.Issues)
|
||||
}
|
||||
if len(e.Issues) != 0 {
|
||||
t.Errorf("Issues: want empty, got %v", e.Issues)
|
||||
}
|
||||
if e.DiskFiles != 1 {
|
||||
t.Errorf("DiskFiles: want 1, got %d", e.DiskFiles)
|
||||
}
|
||||
if e.IndexedFiles != 1 {
|
||||
t.Errorf("IndexedFiles: want 1, got %d", e.IndexedFiles)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDoctor_MissingDir(t *testing.T) {
|
||||
missingPath := filepath.Join(t.TempDir(), "does_not_exist")
|
||||
root := setupDoctorRepo(t, "missing_vault", "my_proj", missingPath)
|
||||
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "error" {
|
||||
t.Errorf("Status: want error, got %s", e.Status)
|
||||
}
|
||||
found := false
|
||||
for _, issue := range e.Issues {
|
||||
if issue == "directory_missing" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Expected directory_missing issue, got %v", e.Issues)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDoctor_NoIndex(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
// Proper layout but no vault_index.db
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "data", "raw"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(vaultDir, "data", "raw", "a.csv"), []byte("x"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
root := setupDoctorRepo(t, "no_index_vault", "my_proj", vaultDir)
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "warning" {
|
||||
t.Errorf("Status: want warning, got %s", e.Status)
|
||||
}
|
||||
found := false
|
||||
for _, issue := range e.Issues {
|
||||
if issue == "index_missing" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Expected index_missing issue, got %v", e.Issues)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDoctor_LayoutDrift(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
// No data/ or knowledge/ — just a random file at root
|
||||
if err := os.WriteFile(filepath.Join(vaultDir, "something.txt"), []byte("hi"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
root := setupDoctorRepo(t, "layout_vault", "my_proj", vaultDir)
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "warning" {
|
||||
t.Errorf("Status: want warning, got %s", e.Status)
|
||||
}
|
||||
foundLayout := false
|
||||
for _, issue := range e.Issues {
|
||||
if issue == "layout_missing" || issue == "non_standard_layout" {
|
||||
foundLayout = true
|
||||
}
|
||||
}
|
||||
if !foundLayout {
|
||||
t.Errorf("Expected layout_missing or non_standard_layout, got %v", e.Issues)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDoctor_EmptyVault(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
// data/ and knowledge/ exist but are empty
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "data"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "knowledge"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create vault_index.db (empty)
|
||||
vdb, err := VaultIndexOpen(vaultDir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
vdb.Close()
|
||||
|
||||
root := setupDoctorRepo(t, "empty_vault", "my_proj", vaultDir)
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "warning" {
|
||||
t.Errorf("Status: want warning, got %s (issues: %v)", e.Status, e.Issues)
|
||||
}
|
||||
found := false
|
||||
for _, issue := range e.Issues {
|
||||
if issue == "empty_vault" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Expected empty_vault issue, got %v", e.Issues)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package infra
|
||||
|
||||
// VaultFile is the inventory record for one file stored in a vault.
//
// It groups three kinds of information: identity (vault ID/name and the path
// relative to the vault root), content metadata (size, mtime, sha256, MIME
// type, extension) and structural classification (bucket and sub-bucket).
type VaultFile struct {
	// VaultID identifies the vault, e.g. "turismo_spain_app_turismo".
	VaultID string `json:"vault_id"`
	// VaultName is the vault's short name, e.g. "turismo_spain".
	VaultName string `json:"vault_name"`
	// RelPath is the path relative to the vault root, e.g. "data/raw/foo.csv".
	RelPath string `json:"rel_path"`
	// Size is the file size in bytes.
	Size int64 `json:"size"`
	// Mtime is the modification time in Unix seconds (UTC).
	Mtime int64 `json:"mtime"`
	// Sha256 is the content digest as lowercase hex.
	Sha256 string `json:"sha256"`
	// Mime is the MIME type, e.g. "text/csv".
	Mime string `json:"mime"`
	// Ext is the file extension including the dot, e.g. ".csv".
	Ext string `json:"ext"`
	// Bucket is the top-level classification: "data" or "knowledge".
	Bucket string `json:"bucket"`
	// SubBucket is the second-level directory within the bucket.
	// Known values: raw, processed, exports (data); decisions, domains,
	// models, benchmarks, test_documents (knowledge). It is the empty string
	// for files located directly at the bucket root.
	SubBucket string `json:"sub_bucket"`
}
|
||||
@@ -0,0 +1,49 @@
|
||||
-- Initial schema for vault_index.db.
-- Every statement uses IF NOT EXISTS so the script can be re-applied safely.

-- files: one row per inventoried vault file, keyed by its vault-relative path.
CREATE TABLE IF NOT EXISTS files (
    rel_path TEXT PRIMARY KEY,
    size INTEGER NOT NULL,
    mtime INTEGER NOT NULL,
    sha256 TEXT NOT NULL,
    mime TEXT NOT NULL DEFAULT '',
    ext TEXT NOT NULL DEFAULT '',
    bucket TEXT NOT NULL DEFAULT '',
    sub_bucket TEXT NOT NULL DEFAULT '',
    indexed_at INTEGER NOT NULL
);
-- Lookup by content hash and by (bucket, sub_bucket) classification.
CREATE INDEX IF NOT EXISTS idx_files_sha256 ON files(sha256);
CREATE INDEX IF NOT EXISTS idx_files_bucket ON files(bucket, sub_bucket);

-- files_fts: full-text index over rel_path and content_text.
-- content='' declares a contentless FTS5 table: only the index is stored.
-- NOTE(review): per the SQLite FTS5 documentation, contentless tables return
-- NULL when columns are read and do not support plain DELETE/UPDATE unless
-- created with contentless_delete=1 -- confirm that writers relying on
-- "DELETE FROM files_fts WHERE rel_path = ?" behave as intended.
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
    rel_path,
    content_text,
    content='',
    tokenize='unicode61 remove_diacritics 2'
);

-- csv_profiles: per-file CSV profile; removed together with its files row.
CREATE TABLE IF NOT EXISTS csv_profiles (
    rel_path TEXT PRIMARY KEY,
    cols_json TEXT NOT NULL,
    n_rows INTEGER NOT NULL,
    encoding TEXT NOT NULL DEFAULT '',
    date_min TEXT,
    date_max TEXT,
    profiled_at INTEGER NOT NULL,
    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);

-- pdf_extracts: per-file PDF text-extraction metadata; removed together with
-- its files row.
CREATE TABLE IF NOT EXISTS pdf_extracts (
    rel_path TEXT PRIMARY KEY,
    page_count INTEGER NOT NULL,
    text_len INTEGER NOT NULL,
    extracted_to TEXT,
    extracted_at INTEGER NOT NULL,
    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);

-- knowledge_docs: title, frontmatter and headings of parsed documents; removed
-- together with its files row.
CREATE TABLE IF NOT EXISTS knowledge_docs (
    rel_path TEXT PRIMARY KEY,
    title TEXT NOT NULL DEFAULT '',
    frontmatter_json TEXT NOT NULL DEFAULT '{}',
    headings_json TEXT NOT NULL DEFAULT '[]',
    parsed_at INTEGER NOT NULL,
    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);
|
||||
@@ -0,0 +1,30 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"embed"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
//go:embed vault_index_migrations/*.sql
|
||||
var vaultIndexMigrationsFS embed.FS
|
||||
|
||||
// VaultIndexOpen opens (or creates) the vault_index.db inside vaultPath.
|
||||
// It applies all embedded migrations idempotently and returns a ready-to-use
|
||||
// *sql.DB. The caller is responsible for closing the connection.
|
||||
//
|
||||
// The database is opened with WAL mode and foreign keys enabled via SQLiteOpen.
|
||||
// Migrations are applied from vault_index_migrations/*.sql in lexicographic order.
|
||||
func VaultIndexOpen(vaultPath string) (*sql.DB, error) {
|
||||
dbPath := filepath.Join(vaultPath, "vault_index.db")
|
||||
db, err := SQLiteOpen(dbPath, "")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_index_open: %w", err)
|
||||
}
|
||||
if err := ApplyMigrations(db, vaultIndexMigrationsFS, "vault_index_migrations/*.sql"); err != nil {
|
||||
db.Close()
|
||||
return nil, fmt.Errorf("vault_index_open: apply migrations: %w", err)
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
name: vault_index_open
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultIndexOpen(vaultPath string) (*sql.DB, error)"
|
||||
description: "Abre (o crea) vault_index.db dentro de vaultPath con WAL + FK y aplica las migraciones embebidas idempotentemente. El caller cierra la conexion."
|
||||
tags: [vault, sqlite, index, migration, infra]
|
||||
uses_functions: ["sqlite_open_go_infra", "sqlite_apply_migrations_go_infra"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [database/sql, embed, fmt, path/filepath]
|
||||
params:
|
||||
- name: vaultPath
|
||||
desc: "ruta absoluta o relativa al directorio raiz del vault"
|
||||
output: "*sql.DB apuntando a <vaultPath>/vault_index.db con schema completo aplicado; el caller es responsable de cerrar"
|
||||
tested: true
|
||||
tests:
|
||||
- "crea vault_index.db en tmpdir vacio"
|
||||
- "segunda apertura no falla (idempotente)"
|
||||
- "todas las tablas esperadas existen en sqlite_master"
|
||||
- "fts5 INSERT y MATCH funcionan"
|
||||
test_file_path: "functions/infra/vault_index_open_test.go"
|
||||
file_path: "functions/infra/vault_index_open.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
db, err := VaultIndexOpen("/data/vaults/turismo_spain")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
El archivo de base de datos se crea en `<vaultPath>/vault_index.db`. Las migraciones
|
||||
viven en `vault_index_migrations/*.sql` embebidas via `//go:embed` en el mismo paquete.
|
||||
|
||||
Schema creado por `001_init.sql`:
|
||||
- `files` — inventario de archivos (PK: rel_path)
|
||||
- `files_fts` — tabla FTS5 virtual para busqueda de texto (content_text lo llenan profilers posteriores)
|
||||
- `csv_profiles` — perfil de columnas/filas para .csv (FK → files)
|
||||
- `pdf_extracts` — metadatos de extraccion de texto para .pdf (FK → files)
|
||||
- `knowledge_docs` — headings/frontmatter para .md del bucket knowledge (FK → files)
|
||||
|
||||
`SQLiteOpen` abre con WAL mode + foreign keys. `ApplyMigrations` es idempotente:
|
||||
los errores de "already exists" y "duplicate column" se ignoran silenciosamente.
|
||||
@@ -0,0 +1,107 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestVaultIndexOpen_CreatesDB(t *testing.T) {
|
||||
t.Run("crea vault_index.db en tmpdir vacio", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
dbPath := filepath.Join(dir, "vault_index.db")
|
||||
if _, err := os.Stat(dbPath); os.IsNotExist(err) {
|
||||
t.Fatalf("vault_index.db no fue creado en %s", dir)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexOpen_Idempotent(t *testing.T) {
|
||||
t.Run("segunda apertura no falla (idempotente)", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
db1, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("primera apertura: %v", err)
|
||||
}
|
||||
db1.Close()
|
||||
|
||||
db2, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("segunda apertura: %v", err)
|
||||
}
|
||||
db2.Close()
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexOpen_AppliesAllMigrations(t *testing.T) {
|
||||
t.Run("todas las tablas esperadas existen en sqlite_master", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
expectedTables := []string{
|
||||
"files",
|
||||
"csv_profiles",
|
||||
"pdf_extracts",
|
||||
"knowledge_docs",
|
||||
}
|
||||
for _, tbl := range expectedTables {
|
||||
assertTableExists(t, db, tbl)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexOpen_FTS5Works(t *testing.T) {
|
||||
t.Run("fts5 INSERT y MATCH funcionan", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
// Insert a row into files_fts (content='' table, manual INSERT required)
|
||||
_, err = db.Exec(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`,
|
||||
"data/raw/informe_ventas.csv", "ventas trimestrales empresa")
|
||||
if err != nil {
|
||||
t.Fatalf("INSERT files_fts: %v", err)
|
||||
}
|
||||
|
||||
var count int
|
||||
err = db.QueryRow(
|
||||
`SELECT count(*) FROM files_fts WHERE files_fts MATCH 'ventas'`,
|
||||
).Scan(&count)
|
||||
if err != nil {
|
||||
t.Fatalf("FTS MATCH query: %v", err)
|
||||
}
|
||||
if count != 1 {
|
||||
t.Errorf("FTS MATCH: got %d rows, want 1", count)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// assertTableExists fails the test when sqlite_master has no entry named name.
// The lookup is by name only, so it matches any schema object with that name
// (tables and virtual tables included). A query failure aborts the test; a
// missing entry is reported with t.Errorf so remaining checks still run.
func assertTableExists(t *testing.T, db *sql.DB, name string) {
	t.Helper()

	const lookup = `SELECT count(*) FROM sqlite_master WHERE name = ?`

	var matches int
	if err := db.QueryRow(lookup, name).Scan(&matches); err != nil {
		t.Fatalf("sqlite_master query for %q: %v", name, err)
	}
	if matches == 0 {
		t.Errorf("table/vtable %q not found in sqlite_master", name)
	}
}
|
||||
@@ -0,0 +1,154 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// WriteReport carries the row counts produced by one VaultIndexWrite call.
type WriteReport struct {
	// Inserted is the number of rows newly inserted into files.
	Inserted int
	// Updated is the number of rows in files that were updated (upserted).
	Updated int
	// Pruned is the number of rows deleted from files (only when prune=true).
	Pruned int
	// FTS is the number of rows inserted into files_fts.
	FTS int
}
|
||||
|
||||
// VaultIndexWrite upserts a slice of VaultFile into the vault_index.db opened
|
||||
// as db, updates the files_fts FTS5 table, and optionally prunes stale rows.
|
||||
//
|
||||
// All changes run inside a single transaction.
|
||||
//
|
||||
// Counting strategy: the set of rel_paths already in the DB is read before the
|
||||
// loop. An upsert is counted as Inserted if the rel_path was absent, Updated if
|
||||
// it was present. This avoids N+1 queries while remaining correct.
|
||||
//
|
||||
// FTS5: all affected rows are deleted and re-inserted with rel_path and empty
|
||||
// content_text. Downstream profilers (csv_profiles, pdf_extracts, knowledge_docs)
|
||||
// are responsible for populating content_text with meaningful text.
|
||||
//
|
||||
// Prune: if prune=true, every row in files whose rel_path is NOT in the provided
|
||||
// slice is deleted. Cascades to csv_profiles, pdf_extracts, knowledge_docs via FK.
|
||||
func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error) {
|
||||
var report WriteReport
|
||||
if len(files) == 0 && !prune {
|
||||
return report, nil
|
||||
}
|
||||
|
||||
tx, err := db.Begin()
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: begin tx: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
tx.Rollback() //nolint:errcheck
|
||||
}
|
||||
}()
|
||||
|
||||
// Load existing rel_paths into a set to distinguish insert vs update.
|
||||
existing := make(map[string]struct{})
|
||||
rows, err := tx.Query(`SELECT rel_path FROM files`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: query existing: %w", err)
|
||||
}
|
||||
for rows.Next() {
|
||||
var rp string
|
||||
if err := rows.Scan(&rp); err != nil {
|
||||
rows.Close()
|
||||
return report, fmt.Errorf("vault_index_write: scan existing: %w", err)
|
||||
}
|
||||
existing[rp] = struct{}{}
|
||||
}
|
||||
rows.Close()
|
||||
if err := rows.Err(); err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: rows err: %w", err)
|
||||
}
|
||||
|
||||
now := time.Now().Unix()
|
||||
|
||||
upsertStmt, err := tx.Prepare(`
|
||||
INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(rel_path) DO UPDATE SET
|
||||
size = excluded.size,
|
||||
mtime = excluded.mtime,
|
||||
sha256 = excluded.sha256,
|
||||
mime = excluded.mime,
|
||||
ext = excluded.ext,
|
||||
bucket = excluded.bucket,
|
||||
sub_bucket = excluded.sub_bucket,
|
||||
indexed_at = excluded.indexed_at
|
||||
`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prepare upsert: %w", err)
|
||||
}
|
||||
defer upsertStmt.Close()
|
||||
|
||||
ftsDeleteStmt, err := tx.Prepare(`DELETE FROM files_fts WHERE rel_path = ?`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prepare fts delete: %w", err)
|
||||
}
|
||||
defer ftsDeleteStmt.Close()
|
||||
|
||||
ftsInsertStmt, err := tx.Prepare(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, '')`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prepare fts insert: %w", err)
|
||||
}
|
||||
defer ftsInsertStmt.Close()
|
||||
|
||||
for _, f := range files {
|
||||
_, err = upsertStmt.Exec(
|
||||
f.RelPath, f.Size, f.Mtime, f.Sha256,
|
||||
f.Mime, f.Ext, f.Bucket, f.SubBucket, now,
|
||||
)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: upsert %q: %w", f.RelPath, err)
|
||||
}
|
||||
|
||||
if _, wasExisting := existing[f.RelPath]; wasExisting {
|
||||
report.Updated++
|
||||
} else {
|
||||
report.Inserted++
|
||||
}
|
||||
|
||||
// Refresh FTS row.
|
||||
if _, err = ftsDeleteStmt.Exec(f.RelPath); err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: fts delete %q: %w", f.RelPath, err)
|
||||
}
|
||||
if _, err = ftsInsertStmt.Exec(f.RelPath); err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: fts insert %q: %w", f.RelPath, err)
|
||||
}
|
||||
report.FTS++
|
||||
}
|
||||
|
||||
// Prune rows not present in the incoming slice.
|
||||
if prune && len(files) > 0 {
|
||||
keep := make([]string, len(files))
|
||||
for i, f := range files {
|
||||
keep[i] = "'" + strings.ReplaceAll(f.RelPath, "'", "''") + "'"
|
||||
}
|
||||
inClause := strings.Join(keep, ",")
|
||||
res, err := tx.Exec(fmt.Sprintf(
|
||||
`DELETE FROM files WHERE rel_path NOT IN (%s)`, inClause,
|
||||
))
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prune: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
report.Pruned = int(n)
|
||||
} else if prune && len(files) == 0 {
|
||||
// prune=true with empty slice means delete everything.
|
||||
res, err := tx.Exec(`DELETE FROM files`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prune all: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
report.Pruned = int(n)
|
||||
}
|
||||
|
||||
if err = tx.Commit(); err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: commit: %w", err)
|
||||
}
|
||||
return report, nil
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
---
|
||||
name: vault_index_write
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error)"
|
||||
description: "Upserta un slice de VaultFile en vault_index.db (tabla files + FTS5 files_fts) dentro de una sola transaccion. Cuenta Inserted/Updated/FTS. Con prune=true elimina filas no presentes en el slice."
|
||||
tags: [vault, sqlite, index, write, upsert, fts, infra]
|
||||
uses_functions: []
|
||||
uses_types: ["vault_file_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [database/sql, fmt, strings, time]
|
||||
params:
|
||||
- name: db
|
||||
desc: "*sql.DB abierto sobre vault_index.db (tipicamente retornado por VaultIndexOpen)"
|
||||
- name: files
|
||||
desc: "slice de VaultFile a insertar/actualizar; puede ser vacio"
|
||||
- name: prune
|
||||
desc: "si true, elimina de 'files' todas las filas cuyo rel_path no este en el slice (sincronizacion destructiva)"
|
||||
output: "WriteReport con conteos Inserted/Updated/Pruned/FTS; error si falla la transaccion"
|
||||
tested: true
|
||||
tests:
|
||||
- "N archivos nuevos — Inserted=N"
|
||||
- "re-escritura con mtime distinto — Updated=N"
|
||||
- "prune elimina filas ausentes"
|
||||
- "sin prune, filas previas persisten"
|
||||
- "FTS5 MATCH funciona tras escritura"
|
||||
test_file_path: "functions/infra/vault_index_write_test.go"
|
||||
file_path: "functions/infra/vault_index_write.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
db, _ := VaultIndexOpen("/data/vaults/turismo")
|
||||
defer db.Close()
|
||||
|
||||
files, _ := VaultInventoryScan("/data/vaults/turismo", "turismo_v1", "turismo")
|
||||
report, err := VaultIndexWrite(db, files, true)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Printf("inserted=%d updated=%d pruned=%d fts=%d\n",
|
||||
report.Inserted, report.Updated, report.Pruned, report.FTS)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
### WriteReport
|
||||
Struct local al paquete infra:
|
||||
```go
|
||||
type WriteReport struct {
|
||||
Inserted int
|
||||
Updated int
|
||||
Pruned int
|
||||
FTS int
|
||||
}
|
||||
```
|
||||
|
||||
### Estrategia de conteo Inserted vs Updated
|
||||
Se carga el conjunto de rel_paths existentes en un map antes del loop. Un upsert
|
||||
se clasifica como Inserted si el rel_path no estaba en el map, Updated si estaba.
|
||||
Esto evita N+1 SELECTs y es correcto porque la transaccion serializa los cambios.
|
||||
|
||||
### FTS5
|
||||
`files_fts` usa `content=''` (tabla FTS5 sin contenido almacenado, "contentless"). Para cada archivo
|
||||
se borra la fila FTS existente y se reinserta con `content_text=''`. Los profilers
|
||||
posteriores (csv_profiles, knowledge_docs) son responsables de actualizar
|
||||
`content_text` con texto indexable real.
|
||||
|
||||
### Prune
|
||||
Con `prune=true` se construye un IN clause con los rel_paths del slice. La FK con
|
||||
`ON DELETE CASCADE` propaga el DELETE a csv_profiles, pdf_extracts y knowledge_docs
|
||||
automaticamente. Con slice vacio + prune=true se borra todo (DELETE FROM files).
|
||||
|
||||
### Escapado SQL
|
||||
El IN clause se construye escapando las comillas simples en rel_path (duplicandolas).
|
||||
Evita inyeccion en rutas con apostrofos. Para entornos con rutas controladas
|
||||
(interior de vaults sin apostrofos) esto es suficiente; para entornos adversariales
|
||||
usar parametros binding con VALUES multiples via prepared statement.
|
||||
@@ -0,0 +1,210 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// makeTestVaultFile creates a minimal VaultFile for testing.
|
||||
func makeTestVaultFile(relPath, mime, bucket, subBucket string) VaultFile {
|
||||
return VaultFile{
|
||||
VaultID: "test_vault",
|
||||
VaultName: "test",
|
||||
RelPath: relPath,
|
||||
Size: 100,
|
||||
Mtime: time.Now().Unix(),
|
||||
Sha256: "abc123def456abc123def456abc123def456abc123def456abc123def456abc1",
|
||||
Mime: mime,
|
||||
Ext: ".csv",
|
||||
Bucket: bucket,
|
||||
SubBucket: subBucket,
|
||||
}
|
||||
}
|
||||
|
||||
func openInMemoryVaultIndex(t *testing.T) interface{ Close() error } {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
return db
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_FreshInsert(t *testing.T) {
|
||||
t.Run("N archivos nuevos — Inserted=N", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
files := []VaultFile{
|
||||
makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("knowledge/decisions/x.md", "text/markdown", "knowledge", "decisions"),
|
||||
}
|
||||
|
||||
report, err := VaultIndexWrite(db, files, false)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexWrite: %v", err)
|
||||
}
|
||||
if report.Inserted != 3 {
|
||||
t.Errorf("Inserted = %d, want 3", report.Inserted)
|
||||
}
|
||||
if report.Updated != 0 {
|
||||
t.Errorf("Updated = %d, want 0", report.Updated)
|
||||
}
|
||||
if report.Pruned != 0 {
|
||||
t.Errorf("Pruned = %d, want 0", report.Pruned)
|
||||
}
|
||||
if report.FTS != 3 {
|
||||
t.Errorf("FTS = %d, want 3", report.FTS)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_Upsert(t *testing.T) {
|
||||
t.Run("re-escritura con mtime distinto — Updated=N", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
files := []VaultFile{
|
||||
makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
|
||||
}
|
||||
|
||||
if _, err := VaultIndexWrite(db, files, false); err != nil {
|
||||
t.Fatalf("first write: %v", err)
|
||||
}
|
||||
|
||||
// Modify mtime to simulate file change.
|
||||
files[0].Mtime = time.Now().Unix() + 100
|
||||
files[1].Mtime = time.Now().Unix() + 200
|
||||
|
||||
report, err := VaultIndexWrite(db, files, false)
|
||||
if err != nil {
|
||||
t.Fatalf("second write: %v", err)
|
||||
}
|
||||
if report.Inserted != 0 {
|
||||
t.Errorf("Inserted = %d, want 0", report.Inserted)
|
||||
}
|
||||
if report.Updated != 2 {
|
||||
t.Errorf("Updated = %d, want 2", report.Updated)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_Prune(t *testing.T) {
|
||||
t.Run("prune elimina filas ausentes", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
// Write A and B.
|
||||
ab := []VaultFile{
|
||||
makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
|
||||
}
|
||||
if _, err := VaultIndexWrite(db, ab, false); err != nil {
|
||||
t.Fatalf("first write: %v", err)
|
||||
}
|
||||
|
||||
// Write only A with prune=true — B should be deleted.
|
||||
onlyA := []VaultFile{ab[0]}
|
||||
report, err := VaultIndexWrite(db, onlyA, true)
|
||||
if err != nil {
|
||||
t.Fatalf("prune write: %v", err)
|
||||
}
|
||||
if report.Pruned != 1 {
|
||||
t.Errorf("Pruned = %d, want 1", report.Pruned)
|
||||
}
|
||||
|
||||
// Verify B is gone.
|
||||
var count int
|
||||
err = db.QueryRow(`SELECT count(*) FROM files WHERE rel_path = 'data/raw/b.csv'`).Scan(&count)
|
||||
if err != nil {
|
||||
t.Fatalf("query: %v", err)
|
||||
}
|
||||
if count != 0 {
|
||||
t.Errorf("b.csv still present after prune")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_NoPrune(t *testing.T) {
|
||||
t.Run("sin prune, filas previas persisten", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
ab := []VaultFile{
|
||||
makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
|
||||
}
|
||||
if _, err := VaultIndexWrite(db, ab, false); err != nil {
|
||||
t.Fatalf("first write: %v", err)
|
||||
}
|
||||
|
||||
// Write only A without prune — B must remain.
|
||||
onlyA := []VaultFile{ab[0]}
|
||||
report, err := VaultIndexWrite(db, onlyA, false)
|
||||
if err != nil {
|
||||
t.Fatalf("second write: %v", err)
|
||||
}
|
||||
if report.Pruned != 0 {
|
||||
t.Errorf("Pruned = %d, want 0", report.Pruned)
|
||||
}
|
||||
|
||||
var count int
|
||||
err = db.QueryRow(`SELECT count(*) FROM files`).Scan(&count)
|
||||
if err != nil {
|
||||
t.Fatalf("query: %v", err)
|
||||
}
|
||||
if count != 2 {
|
||||
t.Errorf("files count = %d, want 2", count)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_FTSMatch(t *testing.T) {
|
||||
t.Run("FTS5 MATCH funciona tras escritura", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
files := []VaultFile{
|
||||
makeTestVaultFile("data/raw/foo_report.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/bar_data.csv", "text/csv", "data", "raw"),
|
||||
}
|
||||
if _, err := VaultIndexWrite(db, files, false); err != nil {
|
||||
t.Fatalf("write: %v", err)
|
||||
}
|
||||
|
||||
// FTS5 on rel_path column: MATCH 'foo*'
|
||||
var count int
|
||||
err = db.QueryRow(
|
||||
`SELECT count(*) FROM files_fts WHERE files_fts MATCH 'rel_path:foo*'`,
|
||||
).Scan(&count)
|
||||
if err != nil {
|
||||
t.Fatalf("FTS MATCH query: %v", err)
|
||||
}
|
||||
if count != 1 {
|
||||
t.Errorf("FTS MATCH rel_path:foo* = %d rows, want 1", count)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,174 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// VaultInventoryScan walks vaultPath and returns a VaultFile slice (sorted by RelPath)
|
||||
// for every regular file found, skipping:
|
||||
// - vault_index.db, vault_index.db-shm, vault_index.db-wal
|
||||
// - .git/ directories at any depth
|
||||
// - hidden files/dirs (names starting with ".") at the vault root level only
|
||||
//
|
||||
// For each file it computes: relative path (forward slashes), size, mtime (unix UTC),
|
||||
// sha256 (streaming, hex lowercase), MIME type, extension, bucket and sub-bucket.
|
||||
//
|
||||
// MIME detection priority:
|
||||
// 1. Extension override: .csv → text/csv, .md → text/markdown, .parquet → application/parquet
|
||||
// 2. http.DetectContentType on first 512 bytes (magic bytes, stdlib)
|
||||
//
|
||||
// NOTE: file_validate_type_go_infra (FileValidateType) was not used here because its
|
||||
// signature requires an allowedTypes allowlist and returns (mime, bool) — it is designed
|
||||
// for upload validation, not for open-ended inventory scanning where any MIME is valid.
|
||||
// http.DetectContentType provides the same magic-byte detection without the allowlist
|
||||
// coupling and handles a broader set of formats including text/plain for CSV fallback.
|
||||
func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error) {
|
||||
var files []VaultFile
|
||||
|
||||
err := filepath.WalkDir(vaultPath, func(path string, d os.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
name := d.Name()
|
||||
|
||||
// Skip .git directories at any depth.
|
||||
if d.IsDir() && name == ".git" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
|
||||
// Skip hidden entries (names starting with ".") at vault root only.
|
||||
if strings.HasPrefix(name, ".") {
|
||||
rel, relErr := filepath.Rel(vaultPath, path)
|
||||
if relErr == nil {
|
||||
// At root level the relative path has no separator.
|
||||
if !strings.Contains(filepath.ToSlash(rel), "/") {
|
||||
if d.IsDir() {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Skip vault_index.db and its WAL/SHM sidecar files.
|
||||
if name == "vault_index.db" || name == "vault_index.db-shm" || name == "vault_index.db-wal" {
|
||||
return nil
|
||||
}
|
||||
|
||||
rel, err := filepath.Rel(vaultPath, path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_inventory_scan: rel path for %q: %w", path, err)
|
||||
}
|
||||
rel = filepath.ToSlash(rel)
|
||||
|
||||
info, err := d.Info()
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_inventory_scan: stat %q: %w", path, err)
|
||||
}
|
||||
|
||||
// Compute sha256 by streaming — avoids loading large files into memory.
|
||||
sha, err := fileSha256(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_inventory_scan: sha256 %q: %w", path, err)
|
||||
}
|
||||
|
||||
mime, err := detectVaultFileMime(path, name)
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_inventory_scan: mime %q: %w", path, err)
|
||||
}
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(name))
|
||||
bucket, subBucket := vaultBucketParts(rel)
|
||||
|
||||
files = append(files, VaultFile{
|
||||
VaultID: vaultID,
|
||||
VaultName: vaultName,
|
||||
RelPath: rel,
|
||||
Size: info.Size(),
|
||||
Mtime: info.ModTime().UTC().Unix(),
|
||||
Sha256: sha,
|
||||
Mime: mime,
|
||||
Ext: ext,
|
||||
Bucket: bucket,
|
||||
SubBucket: subBucket,
|
||||
})
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_inventory_scan: walk %q: %w", vaultPath, err)
|
||||
}
|
||||
|
||||
sort.Slice(files, func(i, j int) bool {
|
||||
return files[i].RelPath < files[j].RelPath
|
||||
})
|
||||
return files, nil
|
||||
}
|
||||
|
||||
// fileSha256 streams the file at path through SHA-256 and returns the digest
// as a lowercase hexadecimal string. Errors from opening or reading the file
// are returned unchanged.
func fileSha256(path string) (string, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	digest := sha256.New()
	_, copyErr := io.Copy(digest, src)
	if copyErr != nil {
		return "", copyErr
	}

	sum := digest.Sum(nil)
	return hex.EncodeToString(sum), nil
}
|
||||
|
||||
// detectVaultFileMime returns the MIME type for a vault file.
//
// name is the file's base name. Its lower-cased extension is checked first:
// ".csv", ".md" and ".parquet" map to fixed MIME types and the file is not
// opened at all in that case. For every other extension up to the first 512
// bytes of the file at path are read and handed to http.DetectContentType.
//
// An error is returned only when the file cannot be opened or read.
func detectVaultFileMime(path, name string) (string, error) {
	ext := strings.ToLower(filepath.Ext(name))
	switch ext {
	case ".csv":
		return "text/csv", nil
	case ".md":
		return "text/markdown", nil
	case ".parquet":
		return "application/parquet", nil
	}

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	// io.ReadFull keeps reading until the buffer is full or the file ends.
	// A single Read call is allowed to return fewer bytes than are
	// available, which would shorten the sample used for sniffing.
	buf := make([]byte, 512)
	n, err := io.ReadFull(f, buf)
	// io.EOF means an empty file and io.ErrUnexpectedEOF a file shorter than
	// the buffer; both are expected and leave n bytes of valid data.
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}
	return http.DetectContentType(buf[:n]), nil
}
|
||||
|
||||
// vaultBucketParts extracts the top-level bucket (typically "data" or
// "knowledge") and the second-level sub-bucket from a forward-slash relative
// file path.
//
// Only directory segments count: the final segment of relPath is the file name
// and is never reported as a bucket or sub-bucket.
//
//	"data/raw/a.csv" -> ("data", "raw")
//	"data/a.csv"     -> ("data", "")
//	"README.md"      -> ("", "")
func vaultBucketParts(relPath string) (bucket, subBucket string) {
	// Splitting into at most three pieces is enough: bucket, sub-bucket and
	// "everything else", which includes the file name.
	parts := strings.SplitN(relPath, "/", 3)
	switch len(parts) {
	case 3:
		return parts[0], parts[1]
	case 2:
		// File sits directly inside the bucket; parts[1] is its name.
		return parts[0], ""
	default:
		// File at vault root (or empty path): no bucket at all.
		return "", ""
	}
}
|
||||
@@ -0,0 +1,74 @@
|
||||
---
|
||||
name: vault_inventory_scan
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error)"
|
||||
description: "Recorre vaultPath con filepath.WalkDir y retorna un slice de VaultFile ordenado por RelPath para cada archivo regular, computando sha256 por streaming, MIME por extension/magic y bucket/sub-bucket por posicion en el arbol."
|
||||
tags: [vault, inventory, scan, filesystem, sha256, mime, infra]
|
||||
uses_functions: []
|
||||
uses_types: ["vault_file_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [crypto/sha256, encoding/hex, fmt, io, net/http, os, path/filepath, sort, strings]
|
||||
params:
|
||||
- name: vaultPath
|
||||
desc: "ruta absoluta o relativa al directorio raiz del vault"
|
||||
- name: vaultID
|
||||
desc: "identificador del vault (ej: turismo_spain_app_turismo) — se copia a cada VaultFile"
|
||||
- name: vaultName
|
||||
desc: "nombre legible del vault (ej: turismo_spain) — se copia a cada VaultFile"
|
||||
output: "slice de VaultFile ordenado lexicograficamente por RelPath; slice vacio (no nil) si el vault esta vacio"
|
||||
tested: true
|
||||
tests:
|
||||
- "tmpdir vacio retorna slice vacio"
|
||||
- "data layout — bucket y sub_bucket correctos"
|
||||
- "knowledge layout — bucket y sub_bucket correctos"
|
||||
- "omite vault_index.db y .git"
|
||||
- "sha256 determinista para mismo contenido"
|
||||
- "orden lexicografico del resultado"
|
||||
test_file_path: "functions/infra/vault_inventory_scan_test.go"
|
||||
file_path: "functions/infra/vault_inventory_scan.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
files, err := VaultInventoryScan("/data/vaults/turismo_spain", "turismo_spain_v1", "turismo_spain")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
for _, f := range files {
|
||||
fmt.Printf("%s %s %s/%s\n", f.RelPath, f.Mime, f.Bucket, f.SubBucket)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
### Archivos omitidos
|
||||
- `vault_index.db`, `vault_index.db-shm`, `vault_index.db-wal` (siempre)
|
||||
- `.git/` en cualquier profundidad (SkipDir)
|
||||
- Entradas cuyo nombre empieza por `.` solo en la raiz del vault (nivel 0)
|
||||
|
||||
### Deteccion de MIME
|
||||
`file_validate_type_go_infra` (FileValidateType) no se usa porque su firma
|
||||
requiere una lista blanca de tipos permitidos y retorna (mime, bool) — esta
|
||||
disenada para validacion de uploads, no para escaneo inventarial donde
|
||||
cualquier MIME es valido. Se usan en su lugar:
|
||||
|
||||
1. Override por extension (prioridad alta): `.csv` → `text/csv`, `.md` → `text/markdown`,
|
||||
`.parquet` → `application/parquet`. Necesario porque `http.DetectContentType`
|
||||
clasifica CSV como `text/plain` y no conoce Parquet.
|
||||
2. `http.DetectContentType` sobre primeros 512 bytes (magic bytes, stdlib) para el resto.
|
||||
|
||||
### SHA-256
|
||||
Calculado por streaming con `io.Copy` a `sha256.New()` — no carga el archivo completo
|
||||
a memoria. Valido para archivos de cualquier tamano.
|
||||
|
||||
### Bucket / SubBucket
|
||||
Derivados de la posicion en el arbol:
|
||||
- `bucket` = primer segmento del RelPath (tipicamente "data" o "knowledge")
|
||||
- `subBucket` = segundo segmento si existe; vacio si el archivo esta en la raiz del bucket
|
||||
@@ -0,0 +1,182 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// writeTestFile creates the file at the slash-separated path rel below dir,
// creating any missing parent directories, and fills it with content. Any
// failure aborts the calling test.
func writeTestFile(t *testing.T, dir, rel, content string) {
	t.Helper()

	target := filepath.Join(dir, filepath.FromSlash(rel))
	parent := filepath.Dir(target)

	mkErr := os.MkdirAll(parent, 0o755)
	if mkErr != nil {
		t.Fatalf("mkdir %s: %v", parent, mkErr)
	}

	writeErr := os.WriteFile(target, []byte(content), 0o644)
	if writeErr != nil {
		t.Fatalf("write %s: %v", target, writeErr)
	}
}
|
||||
|
||||
func TestVaultInventoryScan_Empty(t *testing.T) {
|
||||
t.Run("tmpdir vacio retorna slice vacio", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
files, err := VaultInventoryScan(dir, "v1", "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(files) != 0 {
|
||||
t.Errorf("expected 0 files, got %d", len(files))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_DataLayout(t *testing.T) {
|
||||
t.Run("data layout — bucket y sub_bucket correctos", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTestFile(t, dir, "data/raw/a.csv", "col1,col2\n1,2\n")
|
||||
writeTestFile(t, dir, "data/processed/b.parquet", "PAR1fakedata")
|
||||
|
||||
files, err := VaultInventoryScan(dir, "vid", "vname")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(files) != 2 {
|
||||
t.Fatalf("expected 2 files, got %d", len(files))
|
||||
}
|
||||
|
||||
// files are sorted: data/processed/b.parquet < data/raw/a.csv
|
||||
b := files[0]
|
||||
if b.RelPath != "data/processed/b.parquet" {
|
||||
t.Errorf("files[0].RelPath = %q, want data/processed/b.parquet", b.RelPath)
|
||||
}
|
||||
if b.Bucket != "data" {
|
||||
t.Errorf("files[0].Bucket = %q, want data", b.Bucket)
|
||||
}
|
||||
if b.SubBucket != "processed" {
|
||||
t.Errorf("files[0].SubBucket = %q, want processed", b.SubBucket)
|
||||
}
|
||||
if b.Mime != "application/parquet" {
|
||||
t.Errorf("files[0].Mime = %q, want application/parquet", b.Mime)
|
||||
}
|
||||
if b.Ext != ".parquet" {
|
||||
t.Errorf("files[0].Ext = %q, want .parquet", b.Ext)
|
||||
}
|
||||
if b.VaultID != "vid" {
|
||||
t.Errorf("VaultID = %q, want vid", b.VaultID)
|
||||
}
|
||||
|
||||
a := files[1]
|
||||
if a.RelPath != "data/raw/a.csv" {
|
||||
t.Errorf("files[1].RelPath = %q, want data/raw/a.csv", a.RelPath)
|
||||
}
|
||||
if a.Mime != "text/csv" {
|
||||
t.Errorf("files[1].Mime = %q, want text/csv", a.Mime)
|
||||
}
|
||||
if a.Bucket != "data" || a.SubBucket != "raw" {
|
||||
t.Errorf("files[1]: bucket=%q subBucket=%q, want data/raw", a.Bucket, a.SubBucket)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_KnowledgeLayout(t *testing.T) {
|
||||
t.Run("knowledge layout — bucket y sub_bucket correctos", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTestFile(t, dir, "knowledge/decisions/x.md", "# Decision\n\ncontent")
|
||||
|
||||
files, err := VaultInventoryScan(dir, "vid", "vname")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(files) != 1 {
|
||||
t.Fatalf("expected 1 file, got %d", len(files))
|
||||
}
|
||||
f := files[0]
|
||||
if f.RelPath != "knowledge/decisions/x.md" {
|
||||
t.Errorf("RelPath = %q", f.RelPath)
|
||||
}
|
||||
if f.Bucket != "knowledge" {
|
||||
t.Errorf("Bucket = %q, want knowledge", f.Bucket)
|
||||
}
|
||||
if f.SubBucket != "decisions" {
|
||||
t.Errorf("SubBucket = %q, want decisions", f.SubBucket)
|
||||
}
|
||||
if f.Mime != "text/markdown" {
|
||||
t.Errorf("Mime = %q, want text/markdown", f.Mime)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_SkipsIndexAndGit(t *testing.T) {
|
||||
t.Run("omite vault_index.db y .git", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTestFile(t, dir, "vault_index.db", "sqlite data")
|
||||
writeTestFile(t, dir, "vault_index.db-wal", "wal data")
|
||||
writeTestFile(t, dir, ".git/HEAD", "ref: refs/heads/master")
|
||||
writeTestFile(t, dir, "data/raw/real.csv", "a,b\n1,2\n")
|
||||
|
||||
files, err := VaultInventoryScan(dir, "vid", "vname")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(files) != 1 {
|
||||
t.Fatalf("expected 1 file (real.csv), got %d: %v", len(files), relPaths(files))
|
||||
}
|
||||
if files[0].RelPath != "data/raw/real.csv" {
|
||||
t.Errorf("unexpected file: %q", files[0].RelPath)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_Sha256Deterministic(t *testing.T) {
|
||||
t.Run("sha256 determinista para mismo contenido", func(t *testing.T) {
|
||||
dir1 := t.TempDir()
|
||||
dir2 := t.TempDir()
|
||||
content := "deterministic content 123\n"
|
||||
writeTestFile(t, dir1, "data/raw/f.csv", content)
|
||||
writeTestFile(t, dir2, "data/raw/f.csv", content)
|
||||
|
||||
files1, err := VaultInventoryScan(dir1, "v1", "vault1")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
files2, err := VaultInventoryScan(dir2, "v2", "vault2")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if files1[0].Sha256 != files2[0].Sha256 {
|
||||
t.Errorf("sha256 mismatch: %q vs %q", files1[0].Sha256, files2[0].Sha256)
|
||||
}
|
||||
if len(files1[0].Sha256) != 64 {
|
||||
t.Errorf("sha256 length = %d, want 64", len(files1[0].Sha256))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_Sorted(t *testing.T) {
|
||||
t.Run("orden lexicografico del resultado", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTestFile(t, dir, "knowledge/decisions/z.md", "z")
|
||||
writeTestFile(t, dir, "data/raw/a.csv", "a")
|
||||
writeTestFile(t, dir, "data/processed/m.parquet", "m")
|
||||
writeTestFile(t, dir, "knowledge/domains/b.md", "b")
|
||||
|
||||
files, err := VaultInventoryScan(dir, "v", "v")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for i := 1; i < len(files); i++ {
|
||||
if files[i].RelPath < files[i-1].RelPath {
|
||||
t.Errorf("not sorted at index %d: %q < %q", i, files[i].RelPath, files[i-1].RelPath)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// relPaths is a helper for test error messages.
|
||||
func relPaths(files []VaultFile) []string {
|
||||
out := make([]string, len(files))
|
||||
for i, f := range files {
|
||||
out[i] = f.RelPath
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -0,0 +1,252 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// LayoutReport describes what VaultLayoutEnsure did (or, in a dry run, would
// do) to a vault directory. All listed paths are relative to VaultPath.
type LayoutReport struct {
	// VaultPath is the vault root after resolution by VaultLayoutEnsure.
	VaultPath string `json:"vault_path"`
	// Created lists directories that were created.
	Created []string `json:"created"` // dirs created (relative paths)
	// Migrated lists the renames performed, each formatted as "src -> dst".
	Migrated []string `json:"migrated"` // renames executed, format "src -> dst" (relative)
	// AlreadyOK lists targets that already existed at their canonical location.
	AlreadyOK []string `json:"already_ok"` // dirs that already existed at the target location
	// Skipped lists root-level entries that are not part of the managed
	// layout; they are reported but never modified.
	Skipped []string `json:"skipped"` // unrecognized root-level entries, left untouched
	// DryRun echoes the dryRun argument the report was produced with.
	DryRun bool `json:"dry_run"`
}
|
||||
|
||||
// Root-level names recognised by the vault layout migration.
var (
	// dataBuckets names the legacy root directories whose canonical home is
	// under data/.
	dataBuckets = []string{
		"raw",
		"processed",
		"exports",
	}

	// knowledgeBuckets names the legacy root directories whose canonical
	// home is under knowledge/.
	knowledgeBuckets = []string{
		"decisions",
		"domains",
		"models",
		"benchmarks",
		"test_documents",
	}

	// knownRootFiles names the root-level files that get moved into
	// knowledge/.
	knownRootFiles = []string{
		"README.md",
		"README.txt",
	}
)
|
||||
|
||||
// VaultLayoutEnsure ensures a vault directory uses the canonical hybrid layout:
|
||||
//
|
||||
// data/{raw,processed,exports}
|
||||
// knowledge/{decisions,domains,models,benchmarks,test_documents}
|
||||
//
|
||||
// Legacy vaults that have these directories at the root are migrated by renaming
|
||||
// (or merging when both src and dst already exist). The operation is idempotent:
|
||||
// a second run returns everything in AlreadyOK.
|
||||
//
|
||||
// When dryRun is true the function computes the report but does not touch the disk.
|
||||
func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error) {
|
||||
report := LayoutReport{DryRun: dryRun}
|
||||
|
||||
// --- resolve path ---
|
||||
vaultPath = strings.TrimRight(vaultPath, "/\\")
|
||||
|
||||
var err error
|
||||
vaultPath, err = filepath.Abs(vaultPath)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: abs(%q): %w", vaultPath, err)
|
||||
}
|
||||
|
||||
// Follow symlinks for the vault root itself.
|
||||
resolved, err := filepath.EvalSymlinks(vaultPath)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: eval symlinks %q: %w", vaultPath, err)
|
||||
}
|
||||
vaultPath = resolved
|
||||
report.VaultPath = vaultPath
|
||||
|
||||
// --- check that vault exists and is a directory ---
|
||||
info, err := os.Stat(vaultPath)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: stat %q: %w", vaultPath, err)
|
||||
}
|
||||
if !info.IsDir() {
|
||||
return report, fmt.Errorf("vault_layout_ensure: %q is not a directory", vaultPath)
|
||||
}
|
||||
|
||||
// --- ensure top-level containers ---
|
||||
for _, container := range []string{"data", "knowledge"} {
|
||||
dst := filepath.Join(vaultPath, container)
|
||||
if err := ensureDir(dst, dryRun, container, &report); err != nil {
|
||||
return report, err
|
||||
}
|
||||
}
|
||||
|
||||
// --- build migration table: root name -> relative destination ---
|
||||
type migration struct {
|
||||
rootName string // name in vault root (dir or file)
|
||||
dstRel string // relative destination path inside vault
|
||||
isFile bool
|
||||
}
|
||||
|
||||
var migrations []migration
|
||||
for _, b := range dataBuckets {
|
||||
migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("data", b)})
|
||||
}
|
||||
for _, b := range knowledgeBuckets {
|
||||
migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("knowledge", b)})
|
||||
}
|
||||
for _, rf := range knownRootFiles {
|
||||
migrations = append(migrations, migration{rootName: rf, dstRel: filepath.Join("knowledge", "README.md"), isFile: true})
|
||||
}
|
||||
|
||||
// Track which root names are "known" so we can compute Skipped.
|
||||
knownNames := make(map[string]struct{})
|
||||
for _, m := range migrations {
|
||||
knownNames[strings.ToLower(m.rootName)] = struct{}{}
|
||||
}
|
||||
knownNames["data"] = struct{}{}
|
||||
knownNames["knowledge"] = struct{}{}
|
||||
|
||||
// --- apply migrations ---
|
||||
for _, m := range migrations {
|
||||
src := filepath.Join(vaultPath, m.rootName)
|
||||
dst := filepath.Join(vaultPath, m.dstRel)
|
||||
srcRel := m.rootName
|
||||
dstRel := m.dstRel
|
||||
|
||||
srcExists := pathExists(src)
|
||||
dstExists := pathExists(dst)
|
||||
|
||||
switch {
|
||||
case srcExists && dstExists:
|
||||
// Both exist: merge if directory, error on file collision.
|
||||
if m.isFile {
|
||||
return report, fmt.Errorf("vault_layout_ensure: conflict: both %q and %q exist", srcRel, dstRel)
|
||||
}
|
||||
if err := mergeDirs(src, dst, srcRel, dstRel, dryRun, &report); err != nil {
|
||||
return report, err
|
||||
}
|
||||
|
||||
case srcExists && !dstExists:
|
||||
// Only source exists: rename.
|
||||
report.Migrated = append(report.Migrated, fmt.Sprintf("%s -> %s", srcRel, dstRel))
|
||||
if !dryRun {
|
||||
if err := os.Rename(src, dst); err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", src, dst, err)
|
||||
}
|
||||
}
|
||||
|
||||
case !srcExists && dstExists:
|
||||
// Already migrated.
|
||||
report.AlreadyOK = append(report.AlreadyOK, dstRel)
|
||||
|
||||
default:
|
||||
// Neither exists: create empty destination directory (skip for files).
|
||||
if !m.isFile {
|
||||
report.Created = append(report.Created, dstRel)
|
||||
if !dryRun {
|
||||
if err := os.MkdirAll(dst, 0o755); err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: mkdir %q: %w", dst, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- collect skipped (unrecognized root entries) ---
|
||||
entries, err := os.ReadDir(vaultPath)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: readdir %q: %w", vaultPath, err)
|
||||
}
|
||||
for _, e := range entries {
|
||||
if _, known := knownNames[strings.ToLower(e.Name())]; !known {
|
||||
report.Skipped = append(report.Skipped, e.Name())
|
||||
}
|
||||
}
|
||||
|
||||
return report, nil
|
||||
}
|
||||
|
||||
// ensureDir adds the dir to Created (and creates it) if it doesn't exist,
|
||||
// or to AlreadyOK if it does. Used for top-level containers "data" and "knowledge".
|
||||
func ensureDir(path string, dryRun bool, rel string, report *LayoutReport) error {
|
||||
if pathExists(path) {
|
||||
report.AlreadyOK = append(report.AlreadyOK, rel)
|
||||
return nil
|
||||
}
|
||||
report.Created = append(report.Created, rel)
|
||||
if dryRun {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(path, 0o755); err != nil {
|
||||
return fmt.Errorf("vault_layout_ensure: mkdir %q: %w", path, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// mergeDirs moves the contents of src into dst, then removes src if empty.
|
||||
// Returns an error if any file in src already exists in dst (no overwrite policy).
|
||||
func mergeDirs(src, dst, srcRel, dstRel string, dryRun bool, report *LayoutReport) error {
|
||||
children, err := os.ReadDir(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_layout_ensure: readdir %q: %w", src, err)
|
||||
}
|
||||
|
||||
for _, child := range children {
|
||||
childDst := filepath.Join(dst, child.Name())
|
||||
if pathExists(childDst) {
|
||||
return fmt.Errorf("vault_layout_ensure: merge conflict: %q already exists in %q (cannot overwrite %q)",
|
||||
child.Name(), dstRel, filepath.Join(srcRel, child.Name()))
|
||||
}
|
||||
childSrc := filepath.Join(src, child.Name())
|
||||
childSrcRel := filepath.Join(srcRel, child.Name())
|
||||
childDstRel := filepath.Join(dstRel, child.Name())
|
||||
report.Migrated = append(report.Migrated, fmt.Sprintf("%s -> %s", childSrcRel, childDstRel))
|
||||
if !dryRun {
|
||||
if err := os.Rename(childSrc, childDst); err != nil {
|
||||
return fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", childSrc, childDst, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove the now-empty src directory.
|
||||
if !dryRun {
|
||||
// Re-check emptiness after renames.
|
||||
remaining, _ := os.ReadDir(src)
|
||||
if len(remaining) == 0 {
|
||||
if err := os.Remove(src); err != nil {
|
||||
return fmt.Errorf("vault_layout_ensure: remove empty src %q: %w", src, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// pathExists reports whether os.Lstat succeeds for path. Because Lstat does
// not follow symlinks, a symlink counts as existing regardless of its target.
// Any Lstat failure is reported as "does not exist".
func pathExists(path string) bool {
	if _, statErr := os.Lstat(path); statErr != nil {
		return false
	}
	return true
}
|
||||
|
||||
// dirIsEmpty reports whether path can be listed as a directory and contains
// no entries. A path that cannot be read as a directory yields false.
func dirIsEmpty(path string) bool {
	listing, readErr := os.ReadDir(path)
	return readErr == nil && len(listing) == 0
}

// Reference dirIsEmpty so it counts as used even when only tests call it.
var _ = dirIsEmpty
|
||||
|
||||
// vaultLayoutKnownNames returns the set of root-level names managed by this function.
|
||||
// Exported for use in tests.
|
||||
func vaultLayoutKnownNames() map[string]struct{} {
|
||||
known := make(map[string]struct{})
|
||||
for _, b := range dataBuckets {
|
||||
known[b] = struct{}{}
|
||||
}
|
||||
for _, b := range knowledgeBuckets {
|
||||
known[b] = struct{}{}
|
||||
}
|
||||
for _, rf := range knownRootFiles {
|
||||
known[strings.ToLower(rf)] = struct{}{}
|
||||
}
|
||||
known["data"] = struct{}{}
|
||||
known["knowledge"] = struct{}{}
|
||||
return known
|
||||
}
|
||||
|
||||
@@ -0,0 +1,95 @@
|
||||
---
|
||||
name: vault_layout_ensure
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error)"
|
||||
description: "Normaliza el layout de un vault al esquema hibrido canónico data/{raw,processed,exports} + knowledge/{decisions,domains,models,benchmarks,test_documents}. Migra directorios legacy en la raíz del vault a su ubicación correcta; idempotente."
|
||||
tags: [vault, layout, migration, infra, filesystem, idempotent]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- "fmt"
|
||||
- "os"
|
||||
- "path/filepath"
|
||||
- "strings"
|
||||
params:
|
||||
- name: vault_path
|
||||
desc: "Ruta al directorio raíz del vault. Puede ser absoluta, relativa o un symlink — se resuelve con filepath.Abs + filepath.EvalSymlinks. Trailing slashes se ignoran."
|
||||
- name: dry_run
|
||||
desc: "Si true, calcula el reporte completo (qué se crearía, migraría, etc.) pero no modifica el disco. Útil para previsualizar antes de ejecutar."
|
||||
output: "LayoutReport con: VaultPath (ruta resuelta), Created (dirs creados), Migrated (renombres ejecutados, formato 'src -> dst'), AlreadyOK (destinos que ya existían), Skipped (entradas en raíz no reconocidas, no tocadas), DryRun (flag). Error si el path no existe, no es directorio, o hay conflicto de merge (mismo nombre de archivo en src y dst)."
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultLayoutEnsure_DryRun_NoChange"
|
||||
- "TestVaultLayoutEnsure_FreshDir_CreatesLayout"
|
||||
- "TestVaultLayoutEnsure_LegacyDataLayout_Migrates"
|
||||
- "TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates"
|
||||
- "TestVaultLayoutEnsure_AlreadyMigrated_Idempotent"
|
||||
- "TestVaultLayoutEnsure_Mixed_PartialMigration"
|
||||
- "TestVaultLayoutEnsure_MergeConflict_Errors"
|
||||
- "TestVaultLayoutEnsure_UnknownFiles_Skipped"
|
||||
- "TestVaultLayoutEnsure_NotADir_Errors"
|
||||
test_file_path: "functions/infra/vault_layout_ensure_test.go"
|
||||
file_path: "functions/infra/vault_layout_ensure.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
// Previsualizar sin tocar disco:
|
||||
report, err := VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", true)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Printf("Would migrate: %v\n", report.Migrated)
|
||||
fmt.Printf("Would create: %v\n", report.Created)
|
||||
|
||||
// Ejecutar la migración:
|
||||
report, err = VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", false)
|
||||
if err != nil {
|
||||
log.Fatalf("migration failed: %v", err)
|
||||
}
|
||||
fmt.Printf("Migrated: %v\n", report.Migrated)
|
||||
fmt.Printf("Created: %v\n", report.Created)
|
||||
fmt.Printf("Skipped: %v\n", report.Skipped)
|
||||
```
|
||||
|
||||
## Comportamiento detallado
|
||||
|
||||
**Directorios gestionados:**
|
||||
|
||||
| Raíz (legacy) | Destino canónico |
|
||||
|---|---|
|
||||
| `raw/` | `data/raw/` |
|
||||
| `processed/` | `data/processed/` |
|
||||
| `exports/` | `data/exports/` |
|
||||
| `decisions/` | `knowledge/decisions/` |
|
||||
| `domains/` | `knowledge/domains/` |
|
||||
| `models/` | `knowledge/models/` |
|
||||
| `benchmarks/` | `knowledge/benchmarks/` |
|
||||
| `test_documents/` | `knowledge/test_documents/` |
|
||||
| `README.md` / `README.txt` | `knowledge/README.md` |
|
||||
|
||||
**Lógica de migración (por cada entrada conocida):**
|
||||
|
||||
- Solo `src` existe → rename atómico `src` → `dst`, registrado en `Migrated`.
|
||||
- Solo `dst` existe → ya migrado, registrado en `AlreadyOK`.
|
||||
- Ambos existen (dir) → merge: mueve cada hijo de `src/` a `dst/`; error si mismo nombre. Registrado en `Migrated` por hijo.
|
||||
- Ambos existen (archivo README) → error inmediato con paths concretos.
|
||||
- Ninguno existe → crea `dst` vacío, registrado en `Created`.
|
||||
|
||||
**Archivos/dirs no reconocidos** en la raíz (`.git`, `vault_index.db`, archivos custom) se registran en `Skipped` y no se tocan.
|
||||
|
||||
**Idempotencia:** segunda ejecución sobre un vault ya migrado reporta todo en `AlreadyOK` y no toca disco.
|
||||
|
||||
## Notas
|
||||
|
||||
`LayoutReport` es un tipo local de esta función (no un tipo del registry). El struct exportado vive en `functions/infra/vault_layout_ensure.go` junto con la función.
|
||||
|
||||
Para aplicar la migración a múltiples vaults en batch, invocar desde un pipeline que lea los paths de `vault.yaml` (ver `vault_manifest_read_go_infra`) y llame a `VaultLayoutEnsure` en cada uno.
|
||||
@@ -0,0 +1,394 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mkVaultDir creates a temporary directory tree for tests and returns its root.
// entries is a list of slash-separated relative paths to create.
// Paths ending in "/" are directories; others are files with placeholder
// content ("test\n") whose parent directories are created as needed.
// An empty entry is a mistake in the calling test and fails it with a clear
// message instead of panicking on the trailing-character check.
func mkVaultDir(t *testing.T, entries []string) string {
	t.Helper()
	root := t.TempDir()
	for _, e := range entries {
		if e == "" {
			t.Fatalf("mkVaultDir: empty entry")
		}
		full := filepath.Join(root, filepath.FromSlash(e))
		if e[len(e)-1] == '/' {
			if err := os.MkdirAll(full, 0o755); err != nil {
				t.Fatalf("mkVaultDir: mkdir %q: %v", full, err)
			}
			continue
		}
		if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil {
			t.Fatalf("mkVaultDir: mkdir parent %q: %v", full, err)
		}
		if err := os.WriteFile(full, []byte("test\n"), 0o644); err != nil {
			t.Fatalf("mkVaultDir: write %q: %v", full, err)
		}
	}
	return root
}
|
||||
|
||||
func TestVaultLayoutEnsure_DryRun_NoChange(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
"raw/",
|
||||
"raw/file1.csv",
|
||||
"processed/",
|
||||
})
|
||||
|
||||
before := snapshotDir(t, root)
|
||||
report, err := VaultLayoutEnsure(root, true)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !report.DryRun {
|
||||
t.Error("DryRun flag not set in report")
|
||||
}
|
||||
after := snapshotDir(t, root)
|
||||
if !mapEqual(before, after) {
|
||||
t.Errorf("dry-run modified disk: before=%v after=%v", before, after)
|
||||
}
|
||||
// Should have planned a migration for raw and processed.
|
||||
if len(report.Migrated) == 0 {
|
||||
t.Error("expected Migrated to be non-empty in dry-run plan")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_FreshDir_CreatesLayout(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{}) // empty vault
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// All standard dirs should be created.
|
||||
wantCreated := []string{
|
||||
"data", "knowledge",
|
||||
filepath.Join("data", "raw"),
|
||||
filepath.Join("data", "processed"),
|
||||
filepath.Join("data", "exports"),
|
||||
filepath.Join("knowledge", "decisions"),
|
||||
filepath.Join("knowledge", "domains"),
|
||||
filepath.Join("knowledge", "models"),
|
||||
filepath.Join("knowledge", "benchmarks"),
|
||||
filepath.Join("knowledge", "test_documents"),
|
||||
}
|
||||
createdSet := toSet(report.Created)
|
||||
for _, w := range wantCreated {
|
||||
if _, ok := createdSet[w]; !ok {
|
||||
t.Errorf("expected Created to contain %q, got %v", w, report.Created)
|
||||
}
|
||||
}
|
||||
|
||||
// All directories must actually exist on disk.
|
||||
for _, w := range wantCreated {
|
||||
full := filepath.Join(root, w)
|
||||
info, err := os.Stat(full)
|
||||
if err != nil {
|
||||
t.Errorf("expected %q to exist: %v", full, err)
|
||||
continue
|
||||
}
|
||||
if !info.IsDir() {
|
||||
t.Errorf("%q should be a directory", full)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_LegacyDataLayout_Migrates(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
"raw/",
|
||||
"raw/file1.parquet",
|
||||
"raw/file2.parquet",
|
||||
"processed/",
|
||||
"processed/clean.csv",
|
||||
"exports/",
|
||||
})
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// raw and processed should appear in Migrated (as dirs, top-level rename).
|
||||
migratedSet := toSet(report.Migrated)
|
||||
for _, pair := range []string{
|
||||
"raw -> " + filepath.Join("data", "raw"),
|
||||
"processed -> " + filepath.Join("data", "processed"),
|
||||
} {
|
||||
if _, ok := migratedSet[pair]; !ok {
|
||||
t.Errorf("expected Migrated to contain %q, got %v", pair, report.Migrated)
|
||||
}
|
||||
}
|
||||
|
||||
// Files must have moved.
|
||||
for _, f := range []string{
|
||||
filepath.Join("data", "raw", "file1.parquet"),
|
||||
filepath.Join("data", "raw", "file2.parquet"),
|
||||
filepath.Join("data", "processed", "clean.csv"),
|
||||
} {
|
||||
if _, err := os.Stat(filepath.Join(root, f)); err != nil {
|
||||
t.Errorf("expected %q to exist after migration: %v", f, err)
|
||||
}
|
||||
}
|
||||
// Old dirs must be gone.
|
||||
for _, d := range []string{"raw", "processed"} {
|
||||
if pathExists(filepath.Join(root, d)) {
|
||||
t.Errorf("expected legacy dir %q to be removed", d)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
"decisions/",
|
||||
"decisions/2024-01.md",
|
||||
"models/",
|
||||
"models/ner_v1.pkl",
|
||||
"README.md",
|
||||
})
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// decisions and models should appear in Migrated.
|
||||
migratedSet := toSet(report.Migrated)
|
||||
for _, pair := range []string{
|
||||
"decisions -> " + filepath.Join("knowledge", "decisions"),
|
||||
"models -> " + filepath.Join("knowledge", "models"),
|
||||
"README.md -> " + filepath.Join("knowledge", "README.md"),
|
||||
} {
|
||||
if _, ok := migratedSet[pair]; !ok {
|
||||
t.Errorf("expected Migrated to contain %q, got %v", pair, report.Migrated)
|
||||
}
|
||||
}
|
||||
|
||||
// Files must be at new location.
|
||||
for _, f := range []string{
|
||||
filepath.Join("knowledge", "decisions", "2024-01.md"),
|
||||
filepath.Join("knowledge", "models", "ner_v1.pkl"),
|
||||
filepath.Join("knowledge", "README.md"),
|
||||
} {
|
||||
if _, err := os.Stat(filepath.Join(root, f)); err != nil {
|
||||
t.Errorf("expected %q to exist after migration: %v", f, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_AlreadyMigrated_Idempotent(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
"data/",
|
||||
"data/raw/",
|
||||
"data/raw/file.csv",
|
||||
"data/processed/",
|
||||
"data/exports/",
|
||||
"knowledge/",
|
||||
"knowledge/decisions/",
|
||||
"knowledge/domains/",
|
||||
"knowledge/models/",
|
||||
"knowledge/benchmarks/",
|
||||
"knowledge/test_documents/",
|
||||
})
|
||||
|
||||
report1, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("first run error: %v", err)
|
||||
}
|
||||
if len(report1.Migrated) != 0 {
|
||||
t.Errorf("first run on fully-migrated vault should have no migrations, got %v", report1.Migrated)
|
||||
}
|
||||
|
||||
before := snapshotDir(t, root)
|
||||
report2, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("second run error: %v", err)
|
||||
}
|
||||
after := snapshotDir(t, root)
|
||||
|
||||
if !mapEqual(before, after) {
|
||||
t.Error("second run modified disk (not idempotent)")
|
||||
}
|
||||
if len(report2.Migrated) != 0 {
|
||||
t.Errorf("second run should produce no migrations, got %v", report2.Migrated)
|
||||
}
|
||||
if len(report2.AlreadyOK) == 0 {
|
||||
t.Error("second run should report existing dirs as AlreadyOK")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_Mixed_PartialMigration(t *testing.T) {
|
||||
// data/raw already migrated; exports still at root; knowledge dirs in legacy positions.
|
||||
root := mkVaultDir(t, []string{
|
||||
"data/",
|
||||
"data/raw/",
|
||||
"data/raw/already_here.csv",
|
||||
"exports/",
|
||||
"exports/report.pdf",
|
||||
"decisions/",
|
||||
"decisions/2023-note.md",
|
||||
})
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// data/raw should be AlreadyOK.
|
||||
if !sliceContains(report.AlreadyOK, filepath.Join("data", "raw")) {
|
||||
t.Errorf("data/raw should be AlreadyOK, got AlreadyOK=%v", report.AlreadyOK)
|
||||
}
|
||||
// exports should be migrated.
|
||||
exportsMigrated := false
|
||||
for _, m := range report.Migrated {
|
||||
if m == "exports -> "+filepath.Join("data", "exports") {
|
||||
exportsMigrated = true
|
||||
}
|
||||
}
|
||||
if !exportsMigrated {
|
||||
t.Errorf("exports should be migrated, Migrated=%v", report.Migrated)
|
||||
}
|
||||
// decisions should be migrated.
|
||||
decisionsMigrated := false
|
||||
for _, m := range report.Migrated {
|
||||
if m == "decisions -> "+filepath.Join("knowledge", "decisions") {
|
||||
decisionsMigrated = true
|
||||
}
|
||||
}
|
||||
if !decisionsMigrated {
|
||||
t.Errorf("decisions should be migrated, Migrated=%v", report.Migrated)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_MergeConflict_Errors(t *testing.T) {
|
||||
// Both src (raw/) and dst (data/raw/) exist and have a file with the same name.
|
||||
root := mkVaultDir(t, []string{
|
||||
"raw/",
|
||||
"raw/collision.csv",
|
||||
"data/",
|
||||
"data/raw/",
|
||||
"data/raw/collision.csv", // same name -> conflict
|
||||
})
|
||||
|
||||
_, err := VaultLayoutEnsure(root, false)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for merge conflict, got nil")
|
||||
}
|
||||
if !contains(err.Error(), "conflict") && !contains(err.Error(), "collision.csv") {
|
||||
t.Errorf("error should mention conflict or the file name, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_UnknownFiles_Skipped(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
".git/",
|
||||
"vault_index.db",
|
||||
"my_custom_notes.txt",
|
||||
"raw/",
|
||||
})
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
skippedSet := toSet(report.Skipped)
|
||||
for _, name := range []string{".git", "vault_index.db", "my_custom_notes.txt"} {
|
||||
if _, ok := skippedSet[name]; !ok {
|
||||
t.Errorf("expected %q in Skipped, got %v", name, report.Skipped)
|
||||
}
|
||||
}
|
||||
// raw should NOT be in Skipped (it's a known bucket).
|
||||
if _, ok := skippedSet["raw"]; ok {
|
||||
t.Error("raw should not appear in Skipped — it is a known bucket")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_NotADir_Errors(t *testing.T) {
|
||||
t.Run("non-existent path", func(t *testing.T) {
|
||||
_, err := VaultLayoutEnsure("/tmp/does_not_exist_fn_registry_test_xyz", false)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for non-existent path")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("path is a file", func(t *testing.T) {
|
||||
f, err := os.CreateTemp("", "vault_layout_*.txt")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
f.Close()
|
||||
defer os.Remove(f.Name())
|
||||
|
||||
_, err = VaultLayoutEnsure(f.Name(), false)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when vaultPath is a file, not a dir")
|
||||
}
|
||||
if !contains(err.Error(), "not a directory") {
|
||||
t.Errorf("error should mention 'not a directory', got: %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// --- helpers ---
|
||||
|
||||
// snapshotDir walks root and returns the set of paths found under it, keyed
// by path relative to root (root itself appears as "."). Every value is true.
//
// Any error reported by the walk, or a failure to relativize a visited path,
// aborts the calling test via t.Fatalf instead of being silently dropped.
func snapshotDir(t *testing.T, root string) map[string]bool {
	t.Helper()
	snap := make(map[string]bool)
	err := filepath.WalkDir(root, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}
		rel, err := filepath.Rel(root, path)
		if err != nil {
			// Previously ignored; an unrelativizable path would have been
			// recorded under an empty key and hidden the real problem.
			return err
		}
		snap[rel] = true
		return nil
	})
	if err != nil {
		t.Fatalf("snapshotDir: %v", err)
	}
	return snap
}
|
||||
|
||||
// mapEqual reports whether a and b hold exactly the same keys with the same
// values.
//
// A key must be present in both maps and carry the same boolean; looking the
// key up with a bare b[k] would treat "missing" and "false" as the same thing,
// making {x:false} unequal to itself and {x:false} equal to {x:true}.
func mapEqual(a, b map[string]bool) bool {
	if len(a) != len(b) {
		return false
	}
	for k, av := range a {
		bv, ok := b[k]
		if !ok || av != bv {
			return false
		}
	}
	return true
}
|
||||
|
||||
// toSet builds a membership set from ss; duplicates collapse into one key.
// The result is always a non-nil map, even for a nil or empty input.
func toSet(ss []string) map[string]struct{} {
	set := make(map[string]struct{}, len(ss))
	for i := range ss {
		set[ss[i]] = struct{}{}
	}
	return set
}
|
||||
|
||||
// sliceContains reports whether target is an element of ss (exact match).
func sliceContains(ss []string, target string) bool {
	for i := 0; i < len(ss); i++ {
		if ss[i] != target {
			continue
		}
		return true
	}
	return false
}
|
||||
|
||||
// contains reports whether sub occurs anywhere inside s. An empty sub is
// contained in every string, including the empty one.
func contains(s, sub string) bool {
	width := len(sub)
	if width > len(s) {
		return false
	}
	// Slide a window of len(sub) bytes across s and compare at each offset.
	for start := 0; start+width <= len(s); start++ {
		if s[start:start+width] == sub {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,96 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// VaultManifestEntry describes one vault declared in a
// projects/<proj>/vaults/vault.yaml manifest.
type VaultManifestEntry struct {
	// ProjectID is the <proj> directory name, inferred from the manifest path.
	ProjectID string
	// Name is the vault name as declared in vault.yaml.
	Name string
	// Description is the human-readable description from vault.yaml.
	Description string
	// Path is the vault directory path as declared in vault.yaml (intended
	// to be absolute).
	Path string
	// Tags are the tags declared in vault.yaml.
	Tags []string
	// ManifestFile is the path of the vault.yaml this entry was read from.
	ManifestFile string
}
|
||||
|
||||
// vaultYAML is the decoding target for a vault.yaml document. Only the keys
// this package consumes are mapped; the remaining keys are not declared here.
type vaultYAML struct {
	// Vaults holds one element per item of the top-level `vaults:` list.
	Vaults []struct {
		Name        string   `yaml:"name"`
		Description string   `yaml:"description"`
		Path        string   `yaml:"path"`
		Tags        []string `yaml:"tags"`
	} `yaml:"vaults"`
}
|
||||
|
||||
// VaultManifestRead globs all projects/*/vaults/vault.yaml under repoRoot, parses each
|
||||
// manifest and returns a flat slice of VaultManifestEntry.
|
||||
//
|
||||
// Rules:
|
||||
// - If a manifest fails to parse, an error is returned immediately with the file path.
|
||||
// - If no manifests are found, an empty slice is returned (not an error).
|
||||
// - ProjectID is inferred from the directory component between "projects/" and "/vaults/".
|
||||
func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error) {
|
||||
pattern := filepath.Join(repoRoot, "projects", "*", "vaults", "vault.yaml")
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_manifest_read: glob %q: %w", pattern, err)
|
||||
}
|
||||
|
||||
var out []VaultManifestEntry
|
||||
for _, manifestPath := range matches {
|
||||
entries, err := parseVaultManifest(manifestPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, entries...)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func parseVaultManifest(manifestPath string) ([]VaultManifestEntry, error) {
|
||||
data, err := os.ReadFile(manifestPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_manifest_read: read %q: %w", manifestPath, err)
|
||||
}
|
||||
|
||||
var raw vaultYAML
|
||||
if err := yaml.Unmarshal(data, &raw); err != nil {
|
||||
return nil, fmt.Errorf("vault_manifest_read: parse %q: %w", manifestPath, err)
|
||||
}
|
||||
|
||||
projectID := inferProjectID(manifestPath)
|
||||
|
||||
entries := make([]VaultManifestEntry, 0, len(raw.Vaults))
|
||||
for _, v := range raw.Vaults {
|
||||
entries = append(entries, VaultManifestEntry{
|
||||
ProjectID: projectID,
|
||||
Name: v.Name,
|
||||
Description: v.Description,
|
||||
Path: v.Path,
|
||||
Tags: v.Tags,
|
||||
ManifestFile: manifestPath,
|
||||
})
|
||||
}
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// inferProjectID extracts the project directory name from a manifest path of
// the form .../projects/<proj>/vaults/vault.yaml. It returns "" when no
// "projects" component followed by another component can be found.
//
// The lookup is anchored at the END of the path: for the canonical layout the
// project is the component three positions from the end, directly below
// "projects" and directly above "vaults". Anchoring at the end matters because
// the repository itself may live below a directory that is also called
// "projects" (e.g. /home/u/projects/repo/projects/app/vaults/vault.yaml), in
// which case scanning from the front would wrongly return "repo".
func inferProjectID(manifestPath string) string {
	// Normalize separators so the same logic works for OS-native paths.
	parts := strings.Split(filepath.ToSlash(manifestPath), "/")

	// Canonical tail: projects / <proj> / vaults / <file>.
	if n := len(parts); n >= 4 && parts[n-4] == "projects" && parts[n-2] == "vaults" {
		return parts[n-3]
	}

	// Non-canonical path: keep the historical behaviour and return the
	// component following the first "projects" element.
	for i, p := range parts {
		if p == "projects" && i+1 < len(parts) {
			return parts[i+1]
		}
	}
	return ""
}
|
||||
@@ -0,0 +1,59 @@
|
||||
---
|
||||
name: vault_manifest_read
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error)"
|
||||
description: "Lee todos los manifests vault.yaml bajo projects/*/vaults/ del repo y devuelve una lista plana de entradas de vault con su ProjectID inferido del path."
|
||||
tags: [vault, manifest, yaml, infra, projects, storage]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- "fmt"
|
||||
- "os"
|
||||
- "path/filepath"
|
||||
- "strings"
|
||||
- "gopkg.in/yaml.v3"
|
||||
params:
|
||||
- name: repoRoot
|
||||
desc: "Ruta absoluta a la raiz del repositorio fn_registry. Se usa como base para el glob projects/*/vaults/vault.yaml."
|
||||
output: "Slice plano de VaultManifestEntry (ProjectID, Name, Description, Path, Tags, ManifestFile). Vacio si no hay manifests. Error si un yaml no parsea, con el path concreto en el mensaje."
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultManifestRead_HappyPath"
|
||||
- "TestVaultManifestRead_MalformedYAML"
|
||||
- "TestVaultManifestRead_EmptyDir"
|
||||
test_file_path: "functions/infra/vault_manifest_read_test.go"
|
||||
file_path: "functions/infra/vault_manifest_read.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
entries, err := VaultManifestRead("/home/lucas/fn_registry")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
for _, e := range entries {
|
||||
fmt.Printf("%s/%s -> %s\n", e.ProjectID, e.Name, e.Path)
|
||||
}
|
||||
// app_turismo/turismo_spain -> /home/lucas/vaults/turismo_spain
|
||||
// app_finance/finance_data -> /home/lucas/vaults/finance_data
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
`VaultManifestEntry` es un tipo local de esta funcion (no un tipo del registry). Contiene:
|
||||
- `ProjectID` — basename del directorio `projects/<proj>/`, inferido del path del manifest.
|
||||
- `Name`, `Description`, `Path`, `Tags` — copiados del yaml tal cual.
|
||||
- `ManifestFile` — path absoluto al vault.yaml de origen, util para mensajes de error y trazabilidad.
|
||||
|
||||
El parseo usa `gopkg.in/yaml.v3` (ya en go.mod). Si un manifest falla, la funcion devuelve
|
||||
error inmediatamente con el path del fichero problemático. Los manifests sin entradas
|
||||
`vaults:` contribuyen cero entries (no es error). Si no existe ningun `projects/*/vaults/vault.yaml`
|
||||
el resultado es slice vacio sin error.
|
||||
@@ -0,0 +1,113 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestVaultManifestRead_HappyPath writes two project manifests (app_turismo
// with two vaults, app_finance with one) under a temporary repo root and
// checks that VaultManifestRead returns the three entries as one flat list
// with ProjectID, Path, Tags and ManifestFile populated.
//
// NOTE(review): the YAML literals below appear without nesting indentation in
// this view. YAML is whitespace-sensitive, so confirm that the list items and
// their keys are indented under `vaults:` in the real file.
func TestVaultManifestRead_HappyPath(t *testing.T) {
	root := t.TempDir()

	writeManifest(t, root, "app_turismo", `
vaults:
- name: turismo_spain
description: "Datos de turismo en Espana"
path: "/home/lucas/vaults/turismo_spain"
tags: [turismo, espana]
- name: turismo_raw
description: "Datos brutos sin procesar"
path: "/home/lucas/vaults/turismo_raw"
tags: [raw]
`)

	writeManifest(t, root, "app_finance", `
vaults:
- name: finance_data
description: "Datos financieros"
path: "/home/lucas/vaults/finance_data"
tags: [finance]
`)

	entries, err := VaultManifestRead(root)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(entries) != 3 {
		t.Fatalf("got %d entries, want 3", len(entries))
	}

	// Build index by name for order-independent assertions.
	byName := make(map[string]VaultManifestEntry, len(entries))
	for _, e := range entries {
		byName[e.Name] = e
	}

	// Check turismo_spain entry.
	e, ok := byName["turismo_spain"]
	if !ok {
		t.Fatal("missing entry 'turismo_spain'")
	}
	if e.ProjectID != "app_turismo" {
		t.Errorf("turismo_spain.ProjectID = %q, want %q", e.ProjectID, "app_turismo")
	}
	if e.Path != "/home/lucas/vaults/turismo_spain" {
		t.Errorf("turismo_spain.Path = %q, want %q", e.Path, "/home/lucas/vaults/turismo_spain")
	}
	// Only the tag count and the first tag are asserted here.
	if len(e.Tags) != 2 || e.Tags[0] != "turismo" {
		t.Errorf("turismo_spain.Tags = %v, want [turismo espana]", e.Tags)
	}
	if e.ManifestFile == "" {
		t.Error("turismo_spain.ManifestFile is empty")
	}

	// Check finance_data entry belongs to app_finance.
	ef, ok := byName["finance_data"]
	if !ok {
		t.Fatal("missing entry 'finance_data'")
	}
	if ef.ProjectID != "app_finance" {
		t.Errorf("finance_data.ProjectID = %q, want %q", ef.ProjectID, "app_finance")
	}
}
|
||||
|
||||
// TestVaultManifestRead_MalformedYAML writes a manifest whose flow sequence
// is never closed and expects VaultManifestRead to return a non-nil error.
//
// NOTE(review): the YAML literal appears without nesting indentation in this
// view; confirm its exact whitespace against the real file.
func TestVaultManifestRead_MalformedYAML(t *testing.T) {
	root := t.TempDir()

	writeManifest(t, root, "bad_project", `
vaults:
- name: [invalid yaml
path: missing_bracket
`)

	_, err := VaultManifestRead(root)
	if err == nil {
		t.Fatal("expected error for malformed YAML, got nil")
	}
}
|
||||
|
||||
func TestVaultManifestRead_EmptyDir(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
|
||||
// No projects/ directory at all — glob returns no matches.
|
||||
entries, err := VaultManifestRead(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error for empty dir: %v", err)
|
||||
}
|
||||
if len(entries) != 0 {
|
||||
t.Fatalf("got %d entries, want 0", len(entries))
|
||||
}
|
||||
}
|
||||
|
||||
// writeManifest writes content to <root>/projects/<proj>/vaults/vault.yaml,
// creating the intermediate directories (mode 0755) as needed. The file is
// written with mode 0644. Any filesystem error aborts the test.
func writeManifest(t *testing.T, root, proj, content string) {
	t.Helper()

	vaultsDir := filepath.Join(root, "projects", proj, "vaults")
	mkErr := os.MkdirAll(vaultsDir, 0o755)
	if mkErr != nil {
		t.Fatalf("mkdir %s: %v", vaultsDir, mkErr)
	}

	manifest := filepath.Join(vaultsDir, "vault.yaml")
	writeErr := os.WriteFile(manifest, []byte(content), 0o644)
	if writeErr != nil {
		t.Fatalf("write %s: %v", manifest, writeErr)
	}
}
|
||||
@@ -0,0 +1,265 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// VaultSearchHit is one file returned by VaultSearch: the indexed metadata of
// the file plus the vault it was found in.
type VaultSearchHit struct {
	// VaultPath is the vault path exactly as passed to VaultSearch.
	VaultPath string `json:"vault_path"`
	// VaultName is the basename of VaultPath after resolving symlinks.
	VaultName string `json:"vault_name"`
	// RelPath is the file's rel_path value from the index.
	RelPath string `json:"rel_path"`
	// Size is the file's size value from the index.
	Size int64 `json:"size"`
	// Mtime is the file's mtime value from the index.
	Mtime int64 `json:"mtime"`
	// Mime is the file's mime value from the index.
	Mime string `json:"mime"`
	// Bucket is the file's bucket value from the index.
	Bucket string `json:"bucket"`
	// SubBucket is the file's sub_bucket value from the index.
	SubBucket string `json:"sub_bucket"`
	// Snippet is the FTS5 snippet, or empty when the match came from the
	// rel_path fallback.
	Snippet string `json:"snippet"`
}
|
||||
|
||||
// VaultSearch searches vault_index.db inside vaultPath for files matching query.
|
||||
//
|
||||
// Behaviour:
|
||||
// 1. Opens vault_index.db via VaultIndexOpen.
|
||||
// 2. If limit <= 0, defaults to 50.
|
||||
// 3. Runs a FTS5 MATCH query over files_fts to find content matches (when content_text
|
||||
// is populated by profilers). Because the FTS5 table uses content='' (contentless),
|
||||
// column values are not stored; results are correlated back to files via a LIKE
|
||||
// match on rel_path for path tokens, or via an IN clause of matched rowids for
|
||||
// content_text matches.
|
||||
// 4. Also searches files.rel_path with LIKE to find path matches.
|
||||
// 5. Results from both searches are merged (deduplication by rel_path).
|
||||
// 6. If both FTS5 and LIKE queries fail, returns the error.
|
||||
// 7. VaultName is derived from the basename of vaultPath (after resolving symlinks).
|
||||
func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error) {
|
||||
if limit <= 0 {
|
||||
limit = 50
|
||||
}
|
||||
|
||||
db, err := VaultIndexOpen(vaultPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_search: open index: %w", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
vaultName := resolveVaultName(vaultPath)
|
||||
|
||||
hits, err := vaultSearchCombined(db, vaultPath, vaultName, query, limit)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_search: %w", err)
|
||||
}
|
||||
return hits, nil
|
||||
}
|
||||
|
||||
// vaultSearchCombined runs the search using two strategies and merges deduplicated results:
|
||||
// 1. FTS5 MATCH on files_fts (for content_text when populated by profilers).
|
||||
// Correlation back to files uses rowid (reliable for fresh indexes) or falls back.
|
||||
// 2. LIKE on files.rel_path (always reliable for path searching).
|
||||
//
|
||||
// Results are deduplicated by rel_path, up to limit entries.
|
||||
func vaultSearchCombined(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
|
||||
seen := make(map[string]struct{})
|
||||
var hits []VaultSearchHit
|
||||
|
||||
// Strategy 1: FTS5 MATCH on content_text (populated by profilers).
|
||||
// With contentless FTS5 (content=''), column values are NOT retrievable via SELECT.
|
||||
// We get matching rowids from FTS5, then look up files by rowid.
|
||||
// This is reliable for content_text matches because VaultIndexWrite inserts
|
||||
// content_text rows independently of the path rows (profilers update them).
|
||||
// NOTE: for rel_path token matching, strategy 2 (LIKE) is more reliable.
|
||||
ftsQuery := safeFTSQuery(query)
|
||||
ftsHits, ftsErr := vaultSearchFTSContent(db, vaultPath, vaultName, ftsQuery, limit)
|
||||
if ftsErr == nil {
|
||||
for _, h := range ftsHits {
|
||||
if len(hits) >= limit {
|
||||
break
|
||||
}
|
||||
if _, ok := seen[h.RelPath]; !ok {
|
||||
seen[h.RelPath] = struct{}{}
|
||||
hits = append(hits, h)
|
||||
}
|
||||
}
|
||||
}
|
||||
// If FTS5 failed with a syntax error, that's expected for bad queries — continue.
|
||||
// If it failed with a non-syntax error, still continue to LIKE fallback.
|
||||
|
||||
// Strategy 2: LIKE on rel_path — reliable path search.
|
||||
// When query contains FTS5 special chars (e.g. "foo:bar:"), extract the first
|
||||
// word-like token so the LIKE pattern is still useful.
|
||||
likeQuery := simplifyForLike(query)
|
||||
if len(hits) < limit && likeQuery != "" {
|
||||
remaining := limit - len(hits)
|
||||
likeHits, likeErr := vaultSearchLike(db, vaultPath, vaultName, likeQuery, remaining+len(seen))
|
||||
if likeErr != nil && ftsErr != nil {
|
||||
// Both failed — return a combined error.
|
||||
return nil, fmt.Errorf("fts: %v; like: %v", ftsErr, likeErr)
|
||||
}
|
||||
for _, h := range likeHits {
|
||||
if len(hits) >= limit {
|
||||
break
|
||||
}
|
||||
if _, ok := seen[h.RelPath]; !ok {
|
||||
seen[h.RelPath] = struct{}{}
|
||||
hits = append(hits, h)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if hits == nil {
|
||||
hits = []VaultSearchHit{}
|
||||
}
|
||||
return hits, nil
|
||||
}
|
||||
|
||||
// vaultSearchFTSContent queries files_fts with a MATCH and correlates results
|
||||
// back to the files table.
|
||||
//
|
||||
// Design note: with content='' (contentless FTS5), SELECT on columns returns ''.
|
||||
// We get the rowid from the FTS5 match and look up files.rel_path via rowid.
|
||||
// This works correctly when content_text was populated by a profiler that did NOT
|
||||
// delete+reinsert the FTS row (i.e. profilers do direct INSERT/UPDATE of content_text
|
||||
// without changing the rowid). For the current VaultIndexWrite implementation
|
||||
// (which inserts content_text='' and profilers update it in-place), the rowids
|
||||
// remain stable after profiling.
|
||||
func vaultSearchFTSContent(db *sql.DB, vaultPath, vaultName, safeQuery string, limit int) ([]VaultSearchHit, error) {
|
||||
// Get matching rowids from FTS5.
|
||||
const qRowids = `
|
||||
SELECT rowid
|
||||
FROM files_fts
|
||||
WHERE files_fts MATCH ?
|
||||
ORDER BY rank
|
||||
LIMIT ?`
|
||||
|
||||
rows, err := db.Query(qRowids, safeQuery, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var rowids []int64
|
||||
for rows.Next() {
|
||||
var rid int64
|
||||
if err := rows.Scan(&rid); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rowids = append(rowids, rid)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(rowids) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Look up files by rowid. files uses a TEXT PK so its rowid is implicit.
|
||||
// Snippet is empty for contentless FTS5 (snippet() returns NULL there).
|
||||
var hits []VaultSearchHit
|
||||
for _, rid := range rowids {
|
||||
var h VaultSearchHit
|
||||
err := db.QueryRow(`
|
||||
SELECT rel_path, size, mtime, mime, bucket, sub_bucket
|
||||
FROM files WHERE rowid = ?`, rid,
|
||||
).Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket)
|
||||
if err != nil {
|
||||
// rowid mismatch (happens after update cycles) — skip gracefully.
|
||||
continue
|
||||
}
|
||||
h.VaultPath = vaultPath
|
||||
h.VaultName = vaultName
|
||||
h.Snippet = ""
|
||||
hits = append(hits, h)
|
||||
}
|
||||
return hits, nil
|
||||
}
|
||||
|
||||
// vaultSearchLike searches files.rel_path with LIKE, ordered by mtime DESC.
|
||||
func vaultSearchLike(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
|
||||
const qLike = `
|
||||
SELECT rel_path, size, mtime, mime, bucket, sub_bucket
|
||||
FROM files
|
||||
WHERE rel_path LIKE '%' || ? || '%'
|
||||
ORDER BY mtime DESC
|
||||
LIMIT ?`
|
||||
|
||||
rows, err := db.Query(qLike, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var hits []VaultSearchHit
|
||||
for rows.Next() {
|
||||
var h VaultSearchHit
|
||||
if err := rows.Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
h.VaultPath = vaultPath
|
||||
h.VaultName = vaultName
|
||||
h.Snippet = ""
|
||||
hits = append(hits, h)
|
||||
}
|
||||
return hits, rows.Err()
|
||||
}
|
||||
|
||||
// resolveVaultName derives a display name for the vault: the last path
// element of vaultPath once symlinks are resolved. When the symlinks cannot
// be resolved, the last element of vaultPath itself is used.
func resolveVaultName(vaultPath string) string {
	if target, err := filepath.EvalSymlinks(vaultPath); err == nil {
		return filepath.Base(target)
	}
	return filepath.Base(vaultPath)
}
|
||||
|
||||
// safeFTSQuery prepares a user query for an FTS5 MATCH.
//
// The query is trimmed first; an empty result is returned as is. A query that
// contains a colon, or one of the words AND / OR / NOT surrounded by spaces
// (case-insensitive), is treated as deliberate FTS5 syntax and returned
// unchanged after trimming. Any other query is turned into a single quoted
// phrase, with embedded double quotes doubled, so that characters such as the
// hyphen in "hello-world" cannot be misread as FTS5 syntax.
//
// Note that a colon-bearing query such as "foo:bar:" is passed through, not
// quoted, so it can still be rejected by the FTS5 parser.
func safeFTSQuery(query string) string {
	trimmed := strings.TrimSpace(query)
	if trimmed == "" {
		return trimmed
	}

	// Explicit operators or a column prefix: hand the query over untouched.
	if strings.Contains(trimmed, ":") {
		return trimmed
	}
	shouting := strings.ToUpper(trimmed)
	for _, operator := range []string{" AND ", " OR ", " NOT "} {
		if strings.Contains(shouting, operator) {
			return trimmed
		}
	}

	// Plain text: quote it as one phrase, doubling embedded quotes.
	return `"` + strings.ReplaceAll(trimmed, `"`, `""`) + `"`
}
|
||||
|
||||
// isFTSSyntaxError reports whether err looks like a rejection by the FTS5
// query parser. Detection is textual: the lower-cased message is checked for
// "syntax error", "no such column" or "fts5: syntax error". A nil error is
// never a syntax error.
func isFTSSyntaxError(err error) bool {
	if err == nil {
		return false
	}
	lowered := strings.ToLower(err.Error())
	for _, marker := range []string{"syntax error", "no such column", "fts5: syntax error"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// simplifyForLike reduces query to a single token usable in a LIKE pattern:
// the first run of ASCII letters, digits, underscores or hyphens found in the
// trimmed query. Characters before that run are skipped and everything after
// it is dropped, so "foo:bar:" becomes "foo". The result is empty when the
// query holds no such character.
func simplifyForLike(query string) string {
	isTokenRune := func(r rune) bool {
		switch {
		case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return true
		case r == '_', r == '-':
			return true
		}
		return false
	}

	var word strings.Builder
	for _, r := range strings.TrimSpace(query) {
		if isTokenRune(r) {
			word.WriteRune(r)
			continue
		}
		// A separator ends the token only once the token has started.
		if word.Len() > 0 {
			break
		}
	}
	return word.String()
}
|
||||
@@ -0,0 +1,61 @@
|
||||
---
|
||||
name: vault_search
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error)"
|
||||
description: "Busca en vault_index.db de un vault combinando FTS5 MATCH sobre files_fts con LIKE sobre rel_path (que tambien actua como fallback si el query rompe el parser FTS5). Retorna hits deduplicados por rel_path; Snippet queda vacio porque el indice FTS5 es contentless."
|
||||
tags: [vault, search, fts5, sqlite, infra]
|
||||
uses_functions: ["vault_index_open_go_infra"]
|
||||
uses_types: ["vault_file_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [database/sql, fmt, path/filepath, strings]
|
||||
params:
|
||||
- name: vaultPath
|
||||
desc: "ruta absoluta al directorio raiz del vault (puede ser symlink)"
|
||||
- name: query
|
||||
desc: "termino o frase de busqueda; se escapa automaticamente para FTS5 salvo que ya incluya operadores booleanos o prefijos de columna"
|
||||
- name: limit
|
||||
desc: "maximo de resultados; si es <= 0 se usa 50"
|
||||
output: "slice de VaultSearchHit ordenado por rank FTS5 (o mtime DESC en fallback LIKE); slice vacio si no hay resultados"
|
||||
tested: true
|
||||
tests:
|
||||
- "FTS match devuelve hit con snippet"
|
||||
- "query sin resultados retorna slice vacio"
|
||||
- "limit se respeta"
|
||||
- "query FTS invalida activa fallback LIKE"
|
||||
- "limit cero usa 50 por defecto"
|
||||
test_file_path: "functions/infra/vault_search_test.go"
|
||||
file_path: "functions/infra/vault_search.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
hits, err := infra.VaultSearch("/home/lucas/vaults/turismo_spain", "hoteles", 20)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
for _, h := range hits {
|
||||
fmt.Printf("[%s] %s %s\n", h.VaultName, h.RelPath, h.Snippet)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
`VaultSearchHit` es un struct local definido en este archivo (no en `vault_file.go`)
|
||||
porque combina campos de `files` + metadatos de contexto de busqueda (Snippet, VaultPath, VaultName).
|
||||
|
||||
**FTS5 safety:** el helper `safeFTSQuery` envuelve la query en comillas dobles
|
||||
cuando no contiene operadores booleanos ni prefijos de columna. Esto evita errores
|
||||
del parser en tokens como `foo:bar:` o `hello-world`.
|
||||
|
||||
**Busqueda LIKE:** ademas del MATCH (falle o no), mientras no se haya alcanzado `limit` se ejecuta
una busqueda `LIKE` por subcadena sobre `rel_path` usando el primer token de la query (letras, digitos, `_` y `-`). Los hits de esta via tienen `Snippet=""`.
|
||||
|
||||
**VaultName:** se deriva del `filepath.Base(filepath.EvalSymlinks(vaultPath))`.
|
||||
Si `EvalSymlinks` falla (e.g. symlink roto), usa `filepath.Base(vaultPath)`.
|
||||
@@ -0,0 +1,147 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// openTestVaultDB creates a fresh vault_index.db in a temp dir and returns the path.
|
||||
func openTestVaultDir(t *testing.T) string {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
db.Close()
|
||||
return dir
|
||||
}
|
||||
|
||||
// seedVaultFile inserts a row into files + files_fts.
|
||||
func seedVaultFile(t *testing.T, dir, relPath, mime, bucket, subBucket, contentText string, size int64) {
|
||||
t.Helper()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen seed: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
now := time.Now().Unix()
|
||||
_, err = db.Exec(`
|
||||
INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, 'aabbccdd', ?, '', ?, ?, ?)`,
|
||||
relPath, size, now, mime, bucket, subBucket, now,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("seed files: %v", err)
|
||||
}
|
||||
_, err = db.Exec(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`, relPath, contentText)
|
||||
if err != nil {
|
||||
t.Fatalf("seed files_fts: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Tests ---
|
||||
|
||||
func TestVaultSearch_FTSMatch(t *testing.T) {
|
||||
t.Run("FTS match devuelve hit con snippet", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
seedVaultFile(t, dir, "data/raw/informe.csv", "text/csv", "data", "raw",
|
||||
"ventas trimestrales empresa iberica", 1024)
|
||||
seedVaultFile(t, dir, "data/raw/other.csv", "text/csv", "data", "raw",
|
||||
"productos inventario almacen", 512)
|
||||
|
||||
hits, err := VaultSearch(dir, "ventas", 10)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) != 1 {
|
||||
t.Fatalf("got %d hits, want 1", len(hits))
|
||||
}
|
||||
if hits[0].RelPath != "data/raw/informe.csv" {
|
||||
t.Errorf("RelPath = %q, want data/raw/informe.csv", hits[0].RelPath)
|
||||
}
|
||||
if hits[0].VaultName == "" {
|
||||
t.Errorf("VaultName should not be empty")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultSearch_NoMatch(t *testing.T) {
|
||||
t.Run("query sin resultados retorna slice vacio", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
seedVaultFile(t, dir, "data/raw/file.csv", "text/csv", "data", "raw", "some content", 100)
|
||||
|
||||
hits, err := VaultSearch(dir, "zzznomatch", 10)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) != 0 {
|
||||
t.Errorf("got %d hits, want 0", len(hits))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultSearch_LimitRespected(t *testing.T) {
|
||||
t.Run("limit se respeta", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
for i := 0; i < 10; i++ {
|
||||
path := "data/raw/file" + string(rune('a'+i)) + ".csv"
|
||||
seedVaultFile(t, dir, path, "text/csv", "data", "raw", "common keyword everywhere", 100)
|
||||
}
|
||||
|
||||
hits, err := VaultSearch(dir, "common", 3)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) != 3 {
|
||||
t.Errorf("got %d hits, want 3", len(hits))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultSearch_BadFTSQuery_FallbackLike(t *testing.T) {
|
||||
t.Run("query FTS invalida activa fallback LIKE", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
// Insert a file whose rel_path contains "foobar" so LIKE can find it.
|
||||
seedVaultFile(t, dir, "data/raw/foobar_report.csv", "text/csv", "data", "raw", "", 200)
|
||||
|
||||
// "foo:bar:" — colon after a non-column name triggers FTS5 parser error.
|
||||
// safeFTSQuery passes it through unchanged because it contains ":"
|
||||
// → FTS5 "no such column: bar" → fallback LIKE on rel_path.
|
||||
hits, err := VaultSearch(dir, "foo:bar:", 10)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) == 0 {
|
||||
t.Errorf("expected fallback LIKE to find foobar_report.csv, got 0 hits")
|
||||
}
|
||||
for _, h := range hits {
|
||||
if h.Snippet != "" {
|
||||
t.Errorf("fallback hits should have empty Snippet, got %q", h.Snippet)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultSearch_LimitZeroDefaults(t *testing.T) {
|
||||
t.Run("limit cero usa 50 por defecto", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
// Insert 55 files with the same keyword.
|
||||
for i := 0; i < 55; i++ {
|
||||
path := "data/raw/doc" + string(rune('a')) + string(rune(int('0')+i%10)) + ".csv"
|
||||
if i >= 10 {
|
||||
path = "data/raw/doc" + string(rune('b'+i/10-1)) + string(rune(int('0')+i%10)) + ".csv"
|
||||
}
|
||||
seedVaultFile(t, dir, path, "text/csv", "data", "raw", "keyword alpha beta", 100)
|
||||
}
|
||||
|
||||
hits, err := VaultSearch(dir, "keyword", 0)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) != 50 {
|
||||
t.Errorf("got %d hits, want 50 (default limit)", len(hits))
|
||||
}
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user