chore: auto-commit (95 archivos)

- cmd/fn/doctor.go
- cmd/fn/main.go
- cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt
- cpp/apps/primitives_gallery/playground/tables/data_table.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.h
- cpp/apps/primitives_gallery/playground/tables/self_test.cpp
- cpp/apps/primitives_gallery/playground/tables/tql.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.h
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-13 00:50:34 +02:00
parent a2bbf23374
commit e3c8979e8d
189 changed files with 18964 additions and 330 deletions
+155
View File
@@ -0,0 +1,155 @@
package core
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"sync"
"syscall"
"time"
)
// StreamEvent is a single line captured from the subprocess's stdout or
// stderr.
type StreamEvent struct {
	// Stream names the pipe the line came from: "stdout" or "stderr".
	Stream string
	// Line is the captured text without its trailing newline.
	Line string
	// Time is the moment the line was received.
	Time time.Time
}
// StreamResult is the final outcome of the subprocess. It is delivered on the
// results channel once both pipes have reached EOF and the process has
// finished.
type StreamResult struct {
	// ExitCode is the exit status of the process; -1 is used when the pipes
	// or the process itself could not be set up.
	ExitCode int
	// Err carries a setup/wait failure or the context error after a
	// cancellation. A non-zero exit code on its own leaves Err nil.
	Err error
	// DurationMs is the elapsed wall-clock time in milliseconds; it is 0 for
	// setup failures.
	DurationMs int64
}
// SubprocessStream starts name with args as a subprocess and returns two
// channels:
//   - events: receives one StreamEvent per stdout/stderr line and is closed
//     once both pipes have reached EOF and the process has been waited for.
//   - result: receives exactly one StreamResult when the process has finished
//     (or could not be started) and is then closed.
//
// env entries are appended to os.Environ(); when env is empty the child
// inherits the parent environment unchanged. stdin may be nil.
//
// Cancelling ctx sends SIGTERM to the child's process group; if the process
// has not exited 2 seconds later the group receives SIGKILL. The command is
// deliberately NOT built with exec.CommandContext: that would SIGKILL the
// child the instant ctx is cancelled and defeat the grace period.
//
// The caller MUST drain events until it is closed, or cancel ctx, so the
// internal goroutines are not left blocked. Events that cannot be delivered
// once ctx is cancelled are dropped. Lines longer than 1 MB abort line
// splitting for that pipe; the rest of its output is read and discarded so the
// child never blocks on a full pipe.
func SubprocessStream(
	ctx context.Context,
	name string,
	args []string,
	env []string,
	stdin io.Reader,
) (<-chan StreamEvent, <-chan StreamResult) {
	events := make(chan StreamEvent, 64)
	results := make(chan StreamResult, 1)

	go func() {
		defer close(events)
		defer close(results)

		// fail reports a setup error; the process never ran, so duration is 0.
		fail := func(stage string, err error) {
			results <- StreamResult{ExitCode: -1, Err: fmt.Errorf("%s: %w", stage, err), DurationMs: 0}
		}

		// exec.CommandContext refuses to start under an already-cancelled
		// context; keep that behaviour now that plain exec.Command is used.
		if err := ctx.Err(); err != nil {
			fail("start", err)
			return
		}

		start := time.Now()
		cmd := exec.Command(name, args...)

		// Environment: parent environment plus the extra entries.
		if len(env) > 0 {
			cmd.Env = append(os.Environ(), env...)
		}
		if stdin != nil {
			cmd.Stdin = stdin
		}
		// Own process group, so signals sent to -pid also reach grandchildren.
		cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}

		stdoutPipe, err := cmd.StdoutPipe()
		if err != nil {
			fail("stdout pipe", err)
			return
		}
		stderrPipe, err := cmd.StderrPipe()
		if err != nil {
			fail("stderr pipe", err)
			return
		}
		if err := cmd.Start(); err != nil {
			fail("start", err)
			return
		}

		// Supervisor: on ctx cancellation send SIGTERM, wait 2s, then SIGKILL.
		// It stays alive until the process has been reaped (procDone), so a
		// child that closed its pipes but keeps running is still cancellable.
		procDone := make(chan struct{})
		supervisorDone := make(chan struct{})
		go func() {
			defer close(supervisorDone)
			select {
			case <-procDone:
				return
			case <-ctx.Done():
			}
			pgid := cmd.Process.Pid
			_ = syscall.Kill(-pgid, syscall.SIGTERM) // best effort: the group may already be gone
			grace := time.NewTimer(2 * time.Second)
			defer grace.Stop()
			select {
			case <-grace.C:
				_ = syscall.Kill(-pgid, syscall.SIGKILL) // best effort, as above
			case <-procDone:
			}
		}()

		// send delivers one event unless ctx is cancelled first.
		send := func(stream, line string) {
			ev := StreamEvent{Stream: stream, Line: line, Time: time.Now()}
			select {
			case events <- ev:
			case <-ctx.Done():
			}
		}

		// Read stdout and stderr concurrently.
		const bufSize = 1024 * 1024 // 1 MB to tolerate very long lines (sd-cli progress, etc.)
		var wg sync.WaitGroup
		scanPipe := func(r io.Reader, stream string) {
			defer wg.Done()
			sc := bufio.NewScanner(r)
			sc.Buffer(make([]byte, bufSize), bufSize)
			for sc.Scan() {
				send(stream, sc.Text())
			}
			if sc.Err() != nil {
				// The scanner gave up (e.g. bufio.ErrTooLong). Keep draining so
				// the child cannot block on a full pipe, which would also keep
				// the other pipe from ever reaching EOF.
				_, _ = io.Copy(io.Discard, r)
			}
		}
		wg.Add(2)
		go scanPipe(stdoutPipe, "stdout")
		go scanPipe(stderrPipe, "stderr")
		wg.Wait()

		// All pipe reads are finished, so it is now safe to call Wait.
		waitErr := cmd.Wait()
		close(procDone)
		<-supervisorDone // make sure no signal is in flight once the result is out

		exitCode := 0
		if waitErr != nil {
			if exitErr, ok := waitErr.(*exec.ExitError); ok {
				exitCode = exitErr.ExitCode()
				waitErr = nil // a non-zero exit code is not a spawn error
			}
		}
		// A cancelled context is reported as the error of the run.
		if ctx.Err() != nil && waitErr == nil {
			waitErr = ctx.Err()
		}

		results <- StreamResult{
			ExitCode:   exitCode,
			Err:        waitErr,
			DurationMs: time.Since(start).Milliseconds(),
		}
	}()

	return events, results
}
+69
View File
@@ -0,0 +1,69 @@
---
name: subprocess_stream
kind: function
lang: go
domain: core
version: "1.0.0"
purity: impure
signature: "func SubprocessStream(ctx context.Context, name string, args []string, env []string, stdin io.Reader) (<-chan StreamEvent, <-chan StreamResult)"
description: "Lanza un subproceso y retorna dos canales: uno con StreamEvent (linea de stdout/stderr con timestamp) y otro con un unico StreamResult (ExitCode, Err, DurationMs). Cancelar ctx envia SIGTERM al proceso; si no termina en 2s, SIGKILL."
tags: [subprocess, exec, stream, stdout, stderr, process, concurrency, io, primitiva]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [bufio, context, fmt, io, os, os/exec, sync, syscall, time]
params:
- name: ctx
desc: "Contexto de cancelacion. Al cancelar, el proceso recibe SIGTERM; si no muere en 2s, SIGKILL. Usar context.WithTimeout para acotar duracion maxima."
- name: name
desc: "Nombre o path del ejecutable a lanzar (ej. 'echo', '/usr/bin/python3')."
- name: args
desc: "Argumentos del proceso. Puede ser nil o vacio."
- name: env
desc: "Variables de entorno adicionales en formato 'KEY=VALUE'. Se concatenan con os.Environ(). Puede ser nil."
- name: stdin
desc: "Stdin del proceso. Puede ser nil si el proceso no necesita entrada."
output: "Dos canales: events (<-chan StreamEvent) cerrado cuando ambos pipes EOF; result (<-chan StreamResult) con exactamente un valor cuando el proceso termina. El caller DEBE consumir events hasta cierre o cancelar ctx para evitar bloquear goroutines internas."
tested: true
tests:
- "echo stdout llega como evento y ExitCode 0"
- "stderr llega como evento con stream stderr"
- "exit code no-cero se reporta en StreamResult"
- "ctx cancelado termina el proceso"
- "multiples lineas stdout"
test_file_path: "functions/core/subprocess_stream_test.go"
file_path: "functions/core/subprocess_stream.go"
---
## Ejemplo
```go
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
events, results := core.SubprocessStream(ctx, "grep", []string{"-rn", "TODO", "."}, nil, nil)
for ev := range events {
switch ev.Stream {
case "stdout":
fmt.Println(ev.Line)
case "stderr":
fmt.Fprintln(os.Stderr, "[stderr]", ev.Line)
}
}
res := <-results
if res.ExitCode != 0 || res.Err != nil {
log.Printf("grep exit=%d err=%v duration=%dms", res.ExitCode, res.Err, res.DurationMs)
}
```
## Notas
- El canal `events` tiene buffer de 64. Si el caller deja de consumir y el buffer se llena, las goroutines internas se bloquean hasta que haya espacio o el ctx sea cancelado.
- El scanner de cada pipe tiene un buffer de 1 MB para tolerar lineas muy largas (progreso de CLIs tipo sd-cli, barras ANSI largas).
- Los structs `StreamEvent` y `StreamResult` se declaran en el mismo archivo para que el paquete `core` los exporte sin imports adicionales.
- Generaliza el patron de `claude_stream_go_core` desacoplando el lanzamiento de subprocesos del protocolo especifico de claude (NDJSON/stream-json). `claude_stream_go_core` puede reimplementarse internamente usando esta funcion como primitiva.
- `cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}` crea un process group propio; SIGTERM/SIGKILL se envian con `Kill(-pgid, sig)` para matar tambien los procesos hijo del hijo.
+132
View File
@@ -0,0 +1,132 @@
package core
import (
"context"
"testing"
"time"
)
func TestSubprocessStream(t *testing.T) {
t.Run("echo stdout llega como evento y ExitCode 0", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
events, results := SubprocessStream(ctx, "echo", []string{"hola"}, nil, nil)
var got []StreamEvent
for ev := range events {
got = append(got, ev)
}
res := <-results
if res.ExitCode != 0 {
t.Errorf("ExitCode = %d, want 0 (err: %v)", res.ExitCode, res.Err)
}
if res.Err != nil {
t.Errorf("unexpected Err: %v", res.Err)
}
if len(got) != 1 {
t.Fatalf("got %d events, want 1", len(got))
}
if got[0].Stream != "stdout" {
t.Errorf("Stream = %q, want %q", got[0].Stream, "stdout")
}
if got[0].Line != "hola" {
t.Errorf("Line = %q, want %q", got[0].Line, "hola")
}
})
t.Run("stderr llega como evento con stream stderr", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// sh -c "echo msg >&2" escribe a stderr
events, results := SubprocessStream(ctx, "sh", []string{"-c", "echo error_msg >&2"}, nil, nil)
var got []StreamEvent
for ev := range events {
got = append(got, ev)
}
res := <-results
if res.ExitCode != 0 {
t.Errorf("ExitCode = %d, want 0", res.ExitCode)
}
if len(got) != 1 {
t.Fatalf("got %d events, want 1", len(got))
}
if got[0].Stream != "stderr" {
t.Errorf("Stream = %q, want %q", got[0].Stream, "stderr")
}
if got[0].Line != "error_msg" {
t.Errorf("Line = %q, want %q", got[0].Line, "error_msg")
}
})
t.Run("exit code no-cero se reporta en StreamResult", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
events, results := SubprocessStream(ctx, "sh", []string{"-c", "exit 42"}, nil, nil)
for range events {
}
res := <-results
if res.ExitCode != 42 {
t.Errorf("ExitCode = %d, want 42", res.ExitCode)
}
if res.Err != nil {
t.Errorf("unexpected Err: %v", res.Err)
}
})
t.Run("ctx cancelado termina el proceso", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// proceso que dura mucho; cancelamos enseguida
ctxShort, cancelShort := context.WithTimeout(ctx, 100*time.Millisecond)
defer cancelShort()
events, results := SubprocessStream(ctxShort, "sleep", []string{"60"}, nil, nil)
for range events {
}
res := <-results
// Tras cancelacion el proceso debe haber terminado (ExitCode != 0 o Err de ctx)
if res.ExitCode == 0 && res.Err == nil {
t.Error("expected non-zero exit or ctx error after cancellation")
}
if res.DurationMs > 3000 {
t.Errorf("took %d ms, expected < 3000 (should have been killed)", res.DurationMs)
}
})
t.Run("multiples lineas stdout", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
events, results := SubprocessStream(ctx, "sh", []string{"-c", "printf 'a\nb\nc\n'"}, nil, nil)
var lines []string
for ev := range events {
if ev.Stream == "stdout" {
lines = append(lines, ev.Line)
}
}
<-results
if len(lines) != 3 {
t.Fatalf("got %d stdout lines, want 3: %v", len(lines), lines)
}
want := []string{"a", "b", "c"}
for i, w := range want {
if lines[i] != w {
t.Errorf("line[%d] = %q, want %q", i, lines[i], w)
}
}
})
}
+238
View File
@@ -0,0 +1,238 @@
package infra
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// MlEnvCheck is the outcome of one probe of the ML environment.
type MlEnvCheck struct {
	// Name identifies the probe, e.g. "nvcc" or "python_venv".
	Name string `json:"name"`
	// Status is one of "ok", "missing", "warning" or "unknown".
	Status string `json:"status"`
	// Version is the detected version string, when there is one.
	Version string `json:"version,omitempty"`
	// Detail carries extra human-readable information.
	Detail string `json:"detail,omitempty"`
}
// MlEnvReport aggregates everything a single ML environment audit found.
type MlEnvReport struct {
	// Gpus lists the detected GPUs.
	Gpus []GpuInfo `json:"gpus"`
	// Checks holds one entry per probe that was run.
	Checks []MlEnvCheck `json:"checks"`
	// OverallOK is the audit's overall verdict.
	OverallOK bool `json:"overall_ok"`
	// GeneratedAt is a Unix timestamp in seconds.
	GeneratedAt int64 `json:"generated_at"`
}
// AuditMlEnv audits the ML environment of the registry rooted at registryRoot.
//
// It records the detected GPUs and then runs, in order, probes for
// nvidia-smi, nvcc, the Python venv under registryRoot/python/.venv, a fixed
// list of Python packages, the sd and llama-cli binaries, and the local
// image-generation vault.
//
// OverallOK is true only when at least one GPU was detected and no check is
// "missing", with three exceptions that are treated as optional:
// stable_diffusion_cpp_python, sd_cli and llama_cpp. Any other check in
// "missing" state (including nvidia_smi, nvcc and imagegen_vault) clears
// OverallOK. "warning" and "unknown" never do.
//
// Individual probe failures are reported through the checks; the returned
// error is always nil in the current implementation.
func AuditMlEnv(registryRoot string) (MlEnvReport, error) {
	report := MlEnvReport{GeneratedAt: time.Now().Unix()}

	// GPU detection failures are not fatal: they are recorded as "no GPUs".
	detected, gpuErr := GetGpuInfo()
	if gpuErr != nil {
		detected = []GpuInfo{}
	}
	report.Gpus = detected

	// Interpreter used by the Python package probes.
	venvPython := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
	pythonPackages := []string{"torch", "diffusers", "transformers", "huggingface_hub", "stable_diffusion_cpp_python"}

	results := make([]MlEnvCheck, 0, 6+len(pythonPackages))
	results = append(results,
		probeCommand("nvidia_smi", "nvidia-smi", []string{"--version"}, 5),
		probeNvcc(),
		probeVenv(registryRoot),
	)
	for _, pkgName := range pythonPackages {
		results = append(results, probePythonPackage(venvPython, pkgName))
	}
	results = append(results,
		probeCommand("sd_cli", "sd", []string{"--version"}, 5),
		probeCommand("llama_cpp", "llama-cli", []string{"--version"}, 5),
		probeImagegenVault(),
	)
	report.Checks = results

	healthy := len(detected) > 0
	for i := range results {
		if results[i].Status != "missing" {
			continue
		}
		switch results[i].Name {
		case "stable_diffusion_cpp_python", "sd_cli", "llama_cpp":
			// Optional components: their absence does not affect the verdict.
		default:
			healthy = false
		}
	}
	report.OverallOK = healthy

	return report, nil
}
// probeCommand reports whether binary is installed and runnable.
//
// The binary is resolved through PATH; when it cannot be found the check is
// "missing". Otherwise it is executed with args under a timeout of timeoutSec
// seconds, and its combined stdout/stderr (trimmed, capped at 120 bytes) is
// recorded as the version. A failing run yields "warning" with the error in
// Detail; a clean run yields "ok".
//
// name is the check name stored in the result.
func probeCommand(name, binary string, args []string, timeoutSec int) MlEnvCheck {
	path, err := exec.LookPath(binary)
	if err != nil {
		return MlEnvCheck{Name: name, Status: "missing", Detail: fmt.Sprintf("%s not found in PATH", binary)}
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutSec)*time.Second)
	defer cancel()

	out, err := exec.CommandContext(ctx, path, args...).CombinedOutput()
	version := strings.TrimSpace(string(out))
	if len(version) > 120 {
		// Cutting at a fixed byte offset can split a multi-byte rune; drop
		// the invalid tail so the value stays valid UTF-8.
		version = strings.ToValidUTF8(version[:120], "")
	}
	if err != nil {
		return MlEnvCheck{Name: name, Status: "warning", Version: version, Detail: fmt.Sprintf("exit error: %v", err)}
	}
	return MlEnvCheck{Name: name, Status: "ok", Version: version}
}
// probeNvcc reports the CUDA toolkit version as printed by `nvcc --version`.
//
// The check is "missing" when nvcc is not in PATH and "warning" when the
// command fails (5 second timeout). On success the version is taken from the
// first output line containing "release", e.g. "12.4" from
// "Cuda compilation tools, release 12.4, V12.4.99". If no such value is found
// the trimmed output, capped at 80 bytes, is used instead.
func probeNvcc() MlEnvCheck {
	path, err := exec.LookPath("nvcc")
	if err != nil {
		return MlEnvCheck{Name: "nvcc", Status: "missing", Detail: "nvcc not found in PATH (CUDA toolkit not installed)"}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	out, err := exec.CommandContext(ctx, path, "--version").CombinedOutput()
	if err != nil {
		return MlEnvCheck{Name: "nvcc", Status: "warning", Detail: fmt.Sprintf("nvcc --version failed: %v", err)}
	}

	version := ""
	for _, line := range strings.Split(string(out), "\n") {
		if !strings.Contains(line, "release") {
			continue
		}
		for _, part := range strings.Split(line, ",") {
			part = strings.TrimSpace(part)
			if strings.HasPrefix(part, "release") {
				version = strings.TrimSpace(strings.TrimPrefix(part, "release"))
				break
			}
		}
		// Only the first line mentioning "release" is considered.
		break
	}

	if version == "" {
		version = strings.TrimSpace(string(out))
		if len(version) > 80 {
			// Cutting at a fixed byte offset can split a multi-byte rune;
			// drop the invalid tail so the value stays valid UTF-8.
			version = strings.ToValidUTF8(version[:80], "")
		}
	}
	return MlEnvCheck{Name: "nvcc", Status: "ok", Version: version}
}
// probeVenv verifies that the venv interpreter at
// registryRoot/python/.venv/bin/python3 exists and answers `--version`.
//
// A non-existent interpreter yields "missing"; a failing or timed-out
// (5 seconds) run yields "warning"; otherwise the check is "ok" with the
// trimmed command output as version.
func probeVenv(registryRoot string) MlEnvCheck {
	interpreter := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")

	_, statErr := os.Stat(interpreter)
	if os.IsNotExist(statErr) {
		return MlEnvCheck{Name: "python_venv", Status: "missing", Detail: fmt.Sprintf("not found: %s", interpreter)}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	raw, runErr := exec.CommandContext(ctx, interpreter, "--version").CombinedOutput()
	reported := strings.TrimSpace(string(raw))
	if runErr == nil {
		return MlEnvCheck{Name: "python_venv", Status: "ok", Version: reported}
	}
	return MlEnvCheck{
		Name:    "python_venv",
		Status:  "warning",
		Version: reported,
		Detail:  fmt.Sprintf("python3 --version failed: %v", runErr),
	}
}
// probePythonPackage imports pkg with the venv interpreter venvPy and reports
// its __version__.
//
// Results:
//   - "unknown" when venvPy does not exist, so the package cannot be probed.
//   - "missing" when the import fails with a module-not-found error.
//   - "warning" for any other failure (including the 5 second timeout), with
//     the combined output, capped at 200 bytes, as Detail.
//   - "ok" otherwise, with the trimmed combined output as Version ("unknown"
//     is printed by the script when the module has no __version__).
func probePythonPackage(venvPy, pkg string) MlEnvCheck {
	// Only this package is imported under a name different from its own.
	importName := pkg
	if pkg == "stable_diffusion_cpp_python" {
		importName = "stable_diffusion_cpp"
	}

	// Without the venv interpreter there is nothing to run.
	if _, err := os.Stat(venvPy); os.IsNotExist(err) {
		return MlEnvCheck{Name: pkg, Status: "unknown", Detail: "python_venv not available"}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	script := fmt.Sprintf("import %s; v = getattr(%s, '__version__', None); print(v or 'unknown')", importName, importName)
	out, err := exec.CommandContext(ctx, venvPy, "-c", script).CombinedOutput()
	output := strings.TrimSpace(string(out))
	if err != nil {
		if strings.Contains(output, "ModuleNotFoundError") || strings.Contains(output, "No module named") {
			return MlEnvCheck{Name: pkg, Status: "missing", Detail: fmt.Sprintf("%s not installed", importName)}
		}
		detail := output
		if len(detail) > 200 {
			// Cutting at a fixed byte offset can split a multi-byte rune;
			// drop the invalid tail so the value stays valid UTF-8.
			detail = strings.ToValidUTF8(detail[:200], "")
		}
		return MlEnvCheck{Name: pkg, Status: "warning", Detail: detail}
	}
	return MlEnvCheck{Name: pkg, Status: "ok", Version: output}
}
// probeImagegenVault checks for the model vault at ~/vaults/imagegen_models
// and lists its immediate sub-directories in Detail.
//
// The check is "unknown" when the home directory cannot be determined,
// "missing" when the vault does not exist, "warning" when it cannot be read,
// and "ok" otherwise.
func probeImagegenVault() MlEnvCheck {
	homeDir, homeErr := os.UserHomeDir()
	if homeErr != nil {
		return MlEnvCheck{Name: "imagegen_vault", Status: "unknown", Detail: "cannot determine home directory"}
	}

	vaultDir := filepath.Join(homeDir, "vaults", "imagegen_models")
	listing, readErr := os.ReadDir(vaultDir)
	switch {
	case os.IsNotExist(readErr):
		return MlEnvCheck{Name: "imagegen_vault", Status: "missing", Detail: fmt.Sprintf("vault not found: %s", vaultDir)}
	case readErr != nil:
		return MlEnvCheck{Name: "imagegen_vault", Status: "warning", Detail: fmt.Sprintf("cannot read vault: %v", readErr)}
	}

	var dirNames []string
	for _, item := range listing {
		if !item.IsDir() {
			continue
		}
		dirNames = append(dirNames, item.Name())
	}

	summary := "vault exists but is empty"
	if len(dirNames) > 0 {
		summary = fmt.Sprintf("subdirs: %s", strings.Join(dirNames, ", "))
	}
	return MlEnvCheck{Name: "imagegen_vault", Status: "ok", Detail: summary}
}
+67
View File
@@ -0,0 +1,67 @@
---
name: audit_ml_env
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func AuditMlEnv(registryRoot string) (MlEnvReport, error)"
description: "Audita el entorno ML del sistema: GPUs NVIDIA, toolkit CUDA, venv Python, paquetes clave (torch, diffusers, transformers, huggingface_hub), herramientas CLI (sd, llama-cli) y el vault de modelos. Retorna un MlEnvReport con OverallOK=true solo si hay al menos 1 GPU y los checks criticos estan en ok/warning."
tags: [ml, cuda, gpu, nvidia, audit, doctor, infra, torch, diffusers]
uses_functions: [get_gpu_info_go_infra]
uses_types: [gpu_info_go_infra]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [context, fmt, os, os/exec, path/filepath, strings, time]
tested: true
tests:
- "report no nil y tiene checks"
- "generated_at es positivo"
- "checks tiene al menos 4 entradas"
- "gpus puede ser vacio en CI"
test_file_path: "functions/infra/audit_ml_env_test.go"
file_path: "functions/infra/audit_ml_env.go"
params:
- name: registryRoot
desc: "Ruta absoluta a la raiz del fn_registry. Se usa para localizar python/.venv/bin/python3 y probar paquetes instalados."
output: "MlEnvReport con Gpus (puede estar vacio si no hay NVIDIA), Checks con estado por herramienta/paquete, OverallOK y GeneratedAt (unix timestamp)."
---
## Checks realizados
| Check | Tipo | Critico |
|---|---|---|
| `nvidia_smi` | binary in PATH | no (ok si hay GPU) |
| `nvcc` | CUDA toolkit version | no |
| `python_venv` | exists + `python3 --version` | si |
| `torch` | `import torch; __version__` | si |
| `diffusers` | `import diffusers; __version__` | si |
| `transformers` | `import transformers; __version__` | si |
| `huggingface_hub` | `import huggingface_hub; __version__` | si |
| `stable_diffusion_cpp_python` | `import stable_diffusion_cpp` | no (opcional) |
| `sd_cli` | `sd --version` in PATH | no (opcional) |
| `llama_cpp` | `llama-cli --version` in PATH | no (opcional) |
| `imagegen_vault` | `~/vaults/imagegen_models` exists | no |
## Ejemplo
```go
root := "/home/lucas/fn_registry"
report, err := AuditMlEnv(root)
if err != nil {
log.Fatal(err)
}
for _, c := range report.Checks {
fmt.Printf("%-40s %s %s\n", c.Name, c.Status, c.Version)
}
fmt.Printf("OverallOK: %v\n", report.OverallOK)
```
## Notas
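- Cada check que lanza un subproceso (`nvidia_smi`, `nvcc`, `python_venv`, paquetes Python, `sd_cli`, `llama_cpp`) tiene timeout de 5 segundos para no bloquear en entornos sin GPU. La llamada a `GetGpuInfo()` no aplica timeout propio.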
- `stable_diffusion_cpp_python`, `sd_cli` y `llama_cpp` son opcionales: si estan missing, `OverallOK` no se ve afectado.
- `OverallOK` requiere al menos 1 GPU NVIDIA detectada via `GetGpuInfo()`.
- No escribe nada en disco. Read-only.
- Se expone como `fn doctor ml` via cmd/fn/doctor.go.
+53
View File
@@ -0,0 +1,53 @@
package infra
import (
"testing"
)
func TestAuditMlEnv(t *testing.T) {
// Use the actual registry root relative to the test binary location.
// Tests run from the package directory; go up two levels.
registryRoot := "../.."
t.Run("report no nil y tiene checks", func(t *testing.T) {
report, err := AuditMlEnv(registryRoot)
if err != nil {
t.Fatalf("AuditMlEnv returned error: %v", err)
}
if report.Checks == nil {
t.Fatal("report.Checks is nil")
}
})
t.Run("generated_at es positivo", func(t *testing.T) {
report, err := AuditMlEnv(registryRoot)
if err != nil {
t.Fatalf("AuditMlEnv returned error: %v", err)
}
if report.GeneratedAt <= 0 {
t.Errorf("GeneratedAt should be positive unix timestamp, got %d", report.GeneratedAt)
}
})
t.Run("checks tiene al menos 4 entradas", func(t *testing.T) {
report, err := AuditMlEnv(registryRoot)
if err != nil {
t.Fatalf("AuditMlEnv returned error: %v", err)
}
if len(report.Checks) < 4 {
t.Errorf("expected at least 4 checks, got %d", len(report.Checks))
}
})
t.Run("gpus puede ser vacio en CI", func(t *testing.T) {
report, err := AuditMlEnv(registryRoot)
if err != nil {
t.Fatalf("AuditMlEnv returned error: %v", err)
}
// Gpus may be empty in CI without a GPU; that's OK.
// Just verify the field is not nil.
if report.Gpus == nil {
t.Error("report.Gpus should be a non-nil slice (can be empty)")
}
})
}
+60
View File
@@ -0,0 +1,60 @@
package infra
import (
"encoding/csv"
"errors"
"fmt"
"os/exec"
"strconv"
"strings"
)
// GetGpuInfo queries NVIDIA GPUs through nvidia-smi and returns one GpuInfo
// per CSV record of its output.
//
// When nvidia-smi is not installed, or exits with a non-zero status, the
// result is an empty non-nil slice and a nil error: the absence of NVIDIA
// hardware is not treated as an error. An error is returned only when
// nvidia-smi cannot be run for another reason or its output is not valid CSV.
//
// Records with fewer than 6 fields are skipped. Numeric fields that cannot be
// parsed are reported as 0.
//
// NOTE(review): confirm that the installed nvidia-smi accepts cuda_version as
// a --query-gpu field. If it rejects the field and exits non-zero, this
// function would report "no GPUs" on a machine that has them.
func GetGpuInfo() ([]GpuInfo, error) {
	out, err := exec.Command(
		"nvidia-smi",
		"--query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		// nvidia-smi not installed, or it ran and failed: not an error.
		var exitErr *exec.ExitError
		if errors.Is(err, exec.ErrNotFound) || errors.As(err, &exitErr) {
			return []GpuInfo{}, nil
		}
		return nil, fmt.Errorf("gpu_info: nvidia-smi: %w", err)
	}

	r := csv.NewReader(strings.NewReader(strings.TrimSpace(string(out))))
	r.TrimLeadingSpace = true
	// By default the reader requires every record to have as many fields as
	// the first one and fails otherwise. Allow a variable count so that short
	// records are skipped below instead of aborting the whole parse.
	r.FieldsPerRecord = -1
	records, err := r.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("gpu_info: parse csv: %w", err)
	}

	gpus := make([]GpuInfo, 0, len(records))
	for _, rec := range records {
		if len(rec) < 6 {
			continue
		}
		// Conversion errors are deliberately ignored: a non-numeric field
		// falls back to 0 rather than discarding the whole GPU entry.
		idx, _ := strconv.Atoi(strings.TrimSpace(rec[0]))
		totalMb, _ := strconv.Atoi(strings.TrimSpace(rec[2]))
		freeMb, _ := strconv.Atoi(strings.TrimSpace(rec[3]))
		gpus = append(gpus, GpuInfo{
			Index:         idx,
			Name:          strings.TrimSpace(rec[1]),
			VramTotalMb:   totalMb,
			VramFreeMb:    freeMb,
			DriverVersion: strings.TrimSpace(rec[4]),
			CudaVersion:   strings.TrimSpace(rec[5]),
		})
	}
	return gpus, nil
}
+70
View File
@@ -0,0 +1,70 @@
---
name: get_gpu_info
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func GetGpuInfo() ([]GpuInfo, error)"
description: "Consulta GPUs NVIDIA via nvidia-smi y retorna un slice de GpuInfo con index, nombre, VRAM total/libre, driver y version CUDA. Si nvidia-smi no esta instalado o no hay GPU NVIDIA, retorna slice vacio y nil (ausencia de hardware no es error)."
tags: [gpu, nvidia, cuda, hardware, infra, probe]
uses_functions: []
uses_types: ["gpu_info_go_infra"]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [encoding/csv, errors, fmt, os/exec, strconv, strings]
params:
- name: (ninguno)
desc: "No toma parametros. Lee el estado del sistema via nvidia-smi."
output: "Slice de GpuInfo con una entrada por GPU detectada. Slice vacio si nvidia-smi no esta instalado o termina con codigo no-cero (p. ej. sin GPU NVIDIA). Error solo si nvidia-smi no puede ejecutarse por otro motivo o si su salida CSV no se puede parsear."
tested: true
tests:
- "retorna slice vacio y nil cuando no hay GPU NVIDIA"
- "linea GPU RTX 3080 tipica"
- "dos GPUs en el CSV"
- "CSV vacio retorna slice vacio"
- "linea con menos de 6 campos se ignora"
- "espacios extra en los valores se eliminan"
- "campos del struct GpuInfo correctos"
test_file_path: "functions/infra/get_gpu_info_test.go"
file_path: "functions/infra/get_gpu_info.go"
---
## Ejemplo
```go
gpus, err := GetGpuInfo()
if err != nil {
log.Fatal(err)
}
if len(gpus) == 0 {
fmt.Println("No NVIDIA GPUs detected")
} else {
for _, g := range gpus {
fmt.Printf("[%d] %s VRAM: %d/%d MiB Driver: %s CUDA: %s\n",
g.Index, g.Name, g.VramFreeMb, g.VramTotalMb,
g.DriverVersion, g.CudaVersion)
}
}
```
## Salida nvidia-smi
Ejecuta:
```
nvidia-smi --query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version --format=csv,noheader,nounits
```
Ejemplo de salida con una GPU:
```
0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4
```
## Notas
- Requiere `nvidia-smi` en PATH (parte del driver NVIDIA).
- La columna `cuda_version` en nvidia-smi refleja la version maxima de CUDA soportada por el driver, no la del toolkit instalado.
- Para comprobar el toolkit CUDA instalado, usar `cuda_toolkit_check_bash_infra`.
- En maquinas sin GPU NVIDIA retorna `([]GpuInfo{}, nil)` — el caller puede tratar esto como "sin GPU disponible".
- No ejecutar tests automatizados para esta funcion en CI sin GPU; verificar manualmente o con mock.
+165
View File
@@ -0,0 +1,165 @@
package infra
import (
"strconv"
"strings"
"testing"
)
// TestGetGpuInfoNoGpu checks that GetGpuInfo returns no error and a non-nil
// slice. It is meant to pass on any machine, with or without an NVIDIA GPU or
// nvidia-smi installed.
func TestGetGpuInfoNoGpu(t *testing.T) {
	t.Run("retorna slice vacio y nil cuando no hay GPU NVIDIA", func(t *testing.T) {
		detected, callErr := GetGpuInfo()
		if callErr != nil {
			t.Errorf("GetGpuInfo() error inesperado: %v", callErr)
		}
		// Without nvidia-smi the result must be an empty slice, never nil.
		if detected == nil {
			t.Error("GetGpuInfo() retorno nil, se esperaba slice vacio []GpuInfo{}")
		}
	})
}
// parseCsvNvidiaSmi is a test-only re-implementation of the record-to-GpuInfo
// mapping used by GetGpuInfo, based on plain string splitting. It takes the
// output of nvidia-smi --format=csv,noheader,nounits and returns one GpuInfo
// per line with at least 6 comma-separated fields. Blank input yields an empty
// non-nil slice. The returned error is always nil.
func parseCsvNvidiaSmi(output string) ([]GpuInfo, error) {
	body := strings.TrimSpace(output)
	if body == "" {
		return []GpuInfo{}, nil
	}

	rows := strings.Split(body, "\n")
	parsed := make([]GpuInfo, 0, len(rows))
	for _, row := range rows {
		cols := strings.Split(row, ",")
		if len(cols) < 6 {
			continue
		}
		// text returns column i without surrounding whitespace.
		text := func(i int) string { return strings.TrimSpace(cols[i]) }
		// number converts column i to an int; unparsable values become 0.
		number := func(i int) int {
			v, _ := strconv.Atoi(text(i))
			return v
		}
		parsed = append(parsed, GpuInfo{
			Index:         number(0),
			Name:          text(1),
			VramTotalMb:   number(2),
			VramFreeMb:    number(3),
			DriverVersion: text(4),
			CudaVersion:   text(5),
		})
	}
	return parsed, nil
}
// TestParseCsvNvidiaSmi verifica el parsing de la salida CSV de nvidia-smi
// sin requerir GPU real ni nvidia-smi instalado.
func TestParseCsvNvidiaSmi(t *testing.T) {
tests := []struct {
name string
csvInput string
wantLen int
wantIndex int
wantName string
wantVramTotal int
wantVramFree int
wantDriver string
wantCuda string
}{
{
name: "linea GPU RTX 3080 tipica",
csvInput: "0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4",
wantLen: 1,
wantIndex: 0,
wantName: "NVIDIA GeForce RTX 3080",
wantVramTotal: 10240,
wantVramFree: 8192,
wantDriver: "550.54.15",
wantCuda: "12.4",
},
{
name: "dos GPUs en el CSV",
csvInput: "0, GPU A, 8192, 4096, 525.0, 12.0\n1, GPU B, 24576, 20000, 525.0, 12.0",
wantLen: 2,
},
{
name: "CSV vacio retorna slice vacio",
csvInput: "",
wantLen: 0,
},
{
name: "linea con menos de 6 campos se ignora",
csvInput: "0, GPU, 8192",
wantLen: 0,
},
{
name: "espacios extra en los valores se eliminan",
csvInput: " 1 , NVIDIA RTX 4090 , 24576 , 20000 , 545.0 , 12.6 ",
wantLen: 1,
wantIndex: 1,
wantName: "NVIDIA RTX 4090",
wantVramTotal: 24576,
wantVramFree: 20000,
wantDriver: "545.0",
wantCuda: "12.6",
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
gpus, err := parseCsvNvidiaSmi(tc.csvInput)
if err != nil {
t.Fatalf("error inesperado: %v", err)
}
if len(gpus) != tc.wantLen {
t.Fatalf("len(gpus) = %d, quería %d", len(gpus), tc.wantLen)
}
if tc.wantLen == 1 {
g := gpus[0]
if g.Index != tc.wantIndex {
t.Errorf("Index = %d, quería %d", g.Index, tc.wantIndex)
}
if g.Name != tc.wantName {
t.Errorf("Name = %q, quería %q", g.Name, tc.wantName)
}
if g.VramTotalMb != tc.wantVramTotal {
t.Errorf("VramTotalMb = %d, quería %d", g.VramTotalMb, tc.wantVramTotal)
}
if g.VramFreeMb != tc.wantVramFree {
t.Errorf("VramFreeMb = %d, quería %d", g.VramFreeMb, tc.wantVramFree)
}
if g.DriverVersion != tc.wantDriver {
t.Errorf("DriverVersion = %q, quería %q", g.DriverVersion, tc.wantDriver)
}
if g.CudaVersion != tc.wantCuda {
t.Errorf("CudaVersion = %q, quería %q", g.CudaVersion, tc.wantCuda)
}
}
})
}
}
// TestGpuInfoStruct builds a GpuInfo literal and checks that Index, Name,
// VramTotalMb and VramFreeMb read back the values they were given.
func TestGpuInfoStruct(t *testing.T) {
	t.Run("campos del struct GpuInfo correctos", func(t *testing.T) {
		sample := GpuInfo{
			Index:         0,
			Name:          "NVIDIA GeForce GTX 1080",
			VramTotalMb:   8192,
			VramFreeMb:    6144,
			DriverVersion: "470.0",
			CudaVersion:   "11.4",
		}

		if sample.Index != 0 {
			t.Errorf("Index = %d", sample.Index)
		}
		if sample.Name != "NVIDIA GeForce GTX 1080" {
			t.Errorf("Name = %q", sample.Name)
		}
		if sample.VramTotalMb != 8192 {
			t.Errorf("VramTotalMb = %d", sample.VramTotalMb)
		}
		if sample.VramFreeMb != 6144 {
			t.Errorf("VramFreeMb = %d", sample.VramFreeMb)
		}
	})
}
+12
View File
@@ -0,0 +1,12 @@
package infra
// GpuInfo describes one GPU detected on the system: its position, name, VRAM
// figures and the driver and CUDA versions reported for it.
type GpuInfo struct {
	// Index is the GPU's index.
	Index int `json:"index"`
	// Name is the GPU's product name.
	Name string `json:"name"`
	// VramTotalMb is the total video memory.
	VramTotalMb int `json:"vram_total_mb"`
	// VramFreeMb is the video memory currently free.
	VramFreeMb int `json:"vram_free_mb"`
	// DriverVersion is the driver version string.
	DriverVersion string `json:"driver_version"`
	// CudaVersion is the CUDA version string; it is omitted from JSON when
	// empty.
	CudaVersion string `json:"cuda_version,omitempty"`
}
+171
View File
@@ -0,0 +1,171 @@
package infra
import (
"fmt"
"os"
"path/filepath"
"time"
)
// AggregateReport is the summary produced by one VaultAggregateIndex run.
type AggregateReport struct {
	// VaultsProcessed counts the vaults that were processed.
	VaultsProcessed int
	// VaultsSkipped counts the vaults that have no vault_index.db.
	VaultsSkipped int
	// TotalFiles is the total number of files.
	TotalFiles int
	// Errors collects non-fatal, per-vault error messages.
	Errors []string
}
// VaultAggregateIndex copies the file records of every vault declared in the
// manifests under repoRoot into the vault_files table of
// <repoRoot>/registry.db. The table and its indexes are created when missing,
// so the call is safe to repeat.
//
// Vaults are handled one at a time:
//   - a vault whose vault_index.db cannot be stat'ed is counted in
//     VaultsSkipped and otherwise ignored;
//   - for every other vault, all rows of its files table are read into memory
//     first; then, inside a single registry.db transaction, the vault's
//     previous vault_files rows are deleted and the fresh rows inserted.
//
// Any per-vault failure (open, query, row read, begin, delete, prepare,
// insert, commit) is appended once to report.Errors; an open transaction is
// rolled back and processing continues with the next vault. A vault whose
// index could not be read completely is not written at all, so a partial read
// never replaces its existing rows.
//
// Only three failures are returned as the error value: registry.db cannot be
// opened, the schema statements fail with a non-idempotent error, or the
// manifests cannot be read. The report accumulated so far is returned
// alongside the error.
func VaultAggregateIndex(repoRoot string) (AggregateReport, error) {
	var report AggregateReport

	// 1. Open registry.db
	registryDB, err := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
	if err != nil {
		return report, fmt.Errorf("vault_aggregate_index: open registry.db: %w", err)
	}
	defer registryDB.Close()

	// 2. Idempotent schema migration
	for _, stmt := range []string{
		`CREATE TABLE IF NOT EXISTS vault_files (
vault_id TEXT NOT NULL,
vault_name TEXT NOT NULL,
rel_path TEXT NOT NULL,
size INTEGER NOT NULL,
mtime INTEGER NOT NULL,
sha256 TEXT NOT NULL,
mime TEXT NOT NULL DEFAULT '',
ext TEXT NOT NULL DEFAULT '',
bucket TEXT NOT NULL DEFAULT '',
sub_bucket TEXT NOT NULL DEFAULT '',
indexed_at INTEGER NOT NULL,
PRIMARY KEY (vault_id, rel_path)
);`,
		`CREATE INDEX IF NOT EXISTS idx_vault_files_sha256 ON vault_files(sha256);`,
		`CREATE INDEX IF NOT EXISTS idx_vault_files_vault ON vault_files(vault_id);`,
	} {
		if _, err := registryDB.Exec(stmt); err != nil {
			if !isIdempotentMigrationError(err) {
				return report, fmt.Errorf("vault_aggregate_index: schema: %w", err)
			}
		}
	}

	// 3. Read manifest
	entries, err := VaultManifestRead(repoRoot)
	if err != nil {
		return report, fmt.Errorf("vault_aggregate_index: manifest: %w", err)
	}

	// fileRow mirrors the columns selected from a vault's files table.
	type fileRow struct {
		RelPath   string
		Size      int64
		Mtime     int64
		Sha256    string
		Mime      string
		Ext       string
		Bucket    string
		SubBucket string
	}

	// One timestamp for the whole run: every row written gets the same indexed_at.
	now := time.Now().UTC().Unix()

	for _, entry := range entries {
		vaultID := vaultIDFromEntry(entry)
		vaultName := entry.Name
		vaultPath := entry.Path

		indexPath := filepath.Join(vaultPath, "vault_index.db")
		if _, statErr := os.Stat(indexPath); statErr != nil {
			report.VaultsSkipped++
			continue
		}

		vaultDB, openErr := VaultIndexOpen(vaultPath)
		if openErr != nil {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: open index: %v", vaultName, openErr))
			continue
		}

		rows, queryErr := vaultDB.Query(
			`SELECT rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket FROM files`,
		)
		if queryErr != nil {
			vaultDB.Close()
			report.Errors = append(report.Errors, fmt.Sprintf("%s: query files: %v", vaultName, queryErr))
			continue
		}

		// Load the whole vault index before touching registry.db. A scan or
		// iteration error means the snapshot is incomplete, and writing it
		// would delete the vault's existing rows in favour of a partial set.
		var fileRows []fileRow
		var readErr error
		for rows.Next() {
			var r fileRow
			if readErr = rows.Scan(&r.RelPath, &r.Size, &r.Mtime, &r.Sha256, &r.Mime, &r.Ext, &r.Bucket, &r.SubBucket); readErr != nil {
				break
			}
			fileRows = append(fileRows, r)
		}
		if readErr == nil {
			readErr = rows.Err()
		}
		rows.Close()
		vaultDB.Close()
		if readErr != nil {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: read files: %v", vaultName, readErr))
			continue
		}

		// Atomic replace in registry.db
		tx, txErr := registryDB.Begin()
		if txErr != nil {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: begin tx: %v", vaultName, txErr))
			continue
		}

		if _, delErr := tx.Exec(`DELETE FROM vault_files WHERE vault_id = ?`, vaultID); delErr != nil {
			tx.Rollback()
			report.Errors = append(report.Errors, fmt.Sprintf("%s: delete: %v", vaultName, delErr))
			continue
		}

		stmt, prepErr := tx.Prepare(`
INSERT INTO vault_files
(vault_id, vault_name, rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
		if prepErr != nil {
			tx.Rollback()
			report.Errors = append(report.Errors, fmt.Sprintf("%s: prepare: %v", vaultName, prepErr))
			continue
		}

		// Stop at the first failed insert. The statement and transaction are
		// released once, after the loop, and the vault is abandoned as a whole.
		var insErr error
		for _, r := range fileRows {
			if _, insErr = stmt.Exec(vaultID, vaultName, r.RelPath, r.Size, r.Mtime, r.Sha256, r.Mime, r.Ext, r.Bucket, r.SubBucket, now); insErr != nil {
				insErr = fmt.Errorf("insert %s: %v", r.RelPath, insErr)
				break
			}
		}
		stmt.Close()
		if insErr != nil {
			tx.Rollback()
			report.Errors = append(report.Errors, fmt.Sprintf("%s: %v", vaultName, insErr))
			continue
		}

		if commitErr := tx.Commit(); commitErr != nil {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: commit: %v", vaultName, commitErr))
			continue
		}

		report.VaultsProcessed++
		report.TotalFiles += len(fileRows)
	}

	return report, nil
}
// vaultIDFromEntry builds the vault ID stored in registry.db for a manifest
// entry: "<name>_<project_id>", or just "<name>" when the entry carries no
// project ID.
func vaultIDFromEntry(e VaultManifestEntry) string {
	id := e.Name
	if e.ProjectID != "" {
		id += "_" + e.ProjectID
	}
	return id
}
+58
View File
@@ -0,0 +1,58 @@
---
name: vault_aggregate_index
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultAggregateIndex(repoRoot string) (AggregateReport, error)"
description: "Agrega los índices de todos los vaults del registry en la tabla vault_files de registry.db. Lee cada vault_index.db (via VaultIndexOpen) y reemplaza las filas de forma atómica. Idempotente: re-ejecutar limpia y reescribe sin duplicar."
tags: [vault, index, aggregate, registry]
uses_functions:
- "vault_manifest_read_go_infra"
- "vault_index_open_go_infra"
- "sqlite_open_go_infra"
uses_types:
- "vault_file_go_infra"
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- "database/sql"
- "fmt"
- "os"
- "path/filepath"
- "time"
tested: true
tests:
- "TestVaultAggregateIndex_NoVaults"
- "TestVaultAggregateIndex_VaultWithoutIndex"
- "TestVaultAggregateIndex_HappyPath"
- "TestVaultAggregateIndex_ReRunReplaces"
test_file_path: "functions/infra/vault_aggregate_index_test.go"
file_path: "functions/infra/vault_aggregate_index.go"
params:
- name: repoRoot
desc: "Ruta absoluta a la raiz del fn_registry (contiene registry.db y projects/)."
output: "AggregateReport con VaultsProcessed, VaultsSkipped (sin vault_index.db), TotalFiles y Errors (errores no fatales por vault). Error fatal si registry.db no se puede abrir, si falla la migración del esquema o si no se pueden leer los manifests."
---
## Ejemplo
```go
report, err := infra.VaultAggregateIndex("/home/lucas/fn_registry")
if err != nil {
log.Fatal(err)
}
fmt.Printf("Processed: %d vaults, %d files\n", report.VaultsProcessed, report.TotalFiles)
for _, e := range report.Errors {
fmt.Println("warning:", e)
}
```
## Notas
- No requiere que `registry/migrations/012_vault_files.sql` haya sido aplicado previamente: la propia función crea la tabla `vault_files` y sus índices de forma idempotente con `CREATE TABLE IF NOT EXISTS` y `CREATE INDEX IF NOT EXISTS`.
- Por cada vault: `DELETE WHERE vault_id = ?` + batch `INSERT` dentro de una transacción. Re-run siempre produce el mismo resultado.
- Vaults sin `vault_index.db` se cuentan en `VaultsSkipped` y se omiten sin error.
- El `vault_id` sigue el patrón `<vault_name>_<project_id>`, consistente con la tabla `vaults` de registry.db.
@@ -0,0 +1,175 @@
package infra
import (
"os"
"path/filepath"
"testing"
"time"
)
// setupAggregateTestRepo builds a throwaway repo for the aggregate tests and
// returns its root:
//
//	<root>/registry.db                           (opened once via SQLiteOpen, then closed)
//	<root>/projects/<projectID>/vaults/vault.yaml (one vault: vaultName -> vaultPath)
//	<vaultPath>/                                  (created if missing)
//
// When withIndex is true, a vault_index.db is opened through VaultIndexOpen
// and a single row ("data/raw/sample.csv") is inserted into its files table.
// Any setup failure aborts the test via t.Fatalf.
func setupAggregateTestRepo(t *testing.T, vaultName, projectID, vaultPath string, withIndex bool) string {
	t.Helper()
	repo := t.TempDir()

	// Registry database.
	registry, err := SQLiteOpen(filepath.Join(repo, "registry.db"), "")
	if err != nil {
		t.Fatalf("create registry.db: %v", err)
	}
	registry.Close()

	// Project manifest declaring the vault.
	manifestDir := filepath.Join(repo, "projects", projectID, "vaults")
	if mkErr := os.MkdirAll(manifestDir, 0755); mkErr != nil {
		t.Fatalf("mkdir projects: %v", mkErr)
	}
	manifest := []byte("vaults:\n - name: " + vaultName + "\n description: test\n path: " + vaultPath + "\n tags: []\n")
	if wrErr := os.WriteFile(filepath.Join(manifestDir, "vault.yaml"), manifest, 0644); wrErr != nil {
		t.Fatalf("write vault.yaml: %v", wrErr)
	}

	// Vault directory itself.
	if mkErr := os.MkdirAll(vaultPath, 0755); mkErr != nil {
		t.Fatalf("mkdir vault: %v", mkErr)
	}
	if !withIndex {
		return repo
	}

	// Vault index holding exactly one file row.
	index, err := VaultIndexOpen(vaultPath)
	if err != nil {
		t.Fatalf("VaultIndexOpen: %v", err)
	}
	stamp := time.Now().UTC().Unix()
	const insertFile = `INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
	if _, insErr := index.Exec(insertFile,
		"data/raw/sample.csv", 1024, stamp, "deadbeef", "text/csv", ".csv", "data", "raw", stamp); insErr != nil {
		t.Fatalf("insert test file: %v", insErr)
	}
	index.Close()

	return repo
}
// TestVaultAggregateIndex_NoVaults runs the aggregation on a repo that holds
// only registry.db (no manifests) and expects no error, no processed vaults
// and an empty Errors list.
func TestVaultAggregateIndex_NoVaults(t *testing.T) {
	repo := t.TempDir()

	// Only registry.db exists; there are no project manifests.
	registry, err := SQLiteOpen(filepath.Join(repo, "registry.db"), "")
	if err != nil {
		t.Fatalf("create registry.db: %v", err)
	}
	registry.Close()

	got, err := VaultAggregateIndex(repo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := got.VaultsProcessed; n != 0 {
		t.Errorf("VaultsProcessed: want 0, got %d", n)
	}
	if len(got.Errors) > 0 {
		t.Errorf("Errors: want empty, got %v", got.Errors)
	}
}
// TestVaultAggregateIndex_VaultWithoutIndex declares one vault that has no
// vault_index.db and expects it to be counted as skipped, not processed.
func TestVaultAggregateIndex_VaultWithoutIndex(t *testing.T) {
	vaultPath := t.TempDir()
	const withIndex = false // the vault directory exists but holds no vault_index.db
	repo := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultPath, withIndex)

	got, err := VaultAggregateIndex(repo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := got.VaultsSkipped; n != 1 {
		t.Errorf("VaultsSkipped: want 1, got %d", n)
	}
	if n := got.VaultsProcessed; n != 0 {
		t.Errorf("VaultsProcessed: want 0, got %d", n)
	}
}
// TestVaultAggregateIndex_HappyPath aggregates one vault whose index holds a
// single file and checks both the report and the resulting vault_files row
// count in registry.db.
func TestVaultAggregateIndex_HappyPath(t *testing.T) {
	vaultPath := t.TempDir()
	repo := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultPath, true)

	got, err := VaultAggregateIndex(repo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := got.VaultsProcessed; n != 1 {
		t.Errorf("VaultsProcessed: want 1, got %d", n)
	}
	if n := got.TotalFiles; n != 1 {
		t.Errorf("TotalFiles: want 1, got %d", n)
	}

	// The row must have landed in the central registry.
	registry, err := SQLiteOpen(filepath.Join(repo, "registry.db"), "")
	if err != nil {
		t.Fatalf("open registry.db: %v", err)
	}
	defer registry.Close()

	var rowCount int
	if qErr := registry.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&rowCount); qErr != nil {
		t.Fatalf("count vault_files: %v", qErr)
	}
	if rowCount != 1 {
		t.Errorf("vault_files count: want 1, got %d", rowCount)
	}
}
// TestVaultAggregateIndex_ReRunReplaces aggregates a one-file vault, adds a
// second file to the vault index, aggregates again, and expects exactly two
// vault_files rows: the second run replaces the first instead of appending.
func TestVaultAggregateIndex_ReRunReplaces(t *testing.T) {
	vaultPath := t.TempDir()
	repo := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultPath, true)

	// Run 1: one file.
	if _, runErr := VaultAggregateIndex(repo); runErr != nil {
		t.Fatalf("first run: %v", runErr)
	}

	// Grow the vault index by one more file.
	index, err := VaultIndexOpen(vaultPath)
	if err != nil {
		t.Fatalf("reopen vault index: %v", err)
	}
	stamp := time.Now().UTC().Unix()
	const insertFile = `INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
	if _, insErr := index.Exec(insertFile,
		"data/raw/extra.csv", 512, stamp, "cafebabe", "text/csv", ".csv", "data", "raw", stamp); insErr != nil {
		t.Fatalf("insert second file: %v", insErr)
	}
	index.Close()

	// Run 2: two files.
	got, err := VaultAggregateIndex(repo)
	if err != nil {
		t.Fatalf("second run: %v", err)
	}
	if n := got.TotalFiles; n != 2 {
		t.Errorf("TotalFiles: want 2, got %d", n)
	}

	// No duplicates: registry.db must hold exactly two rows.
	registry, err := SQLiteOpen(filepath.Join(repo, "registry.db"), "")
	if err != nil {
		t.Fatalf("open registry.db: %v", err)
	}
	defer registry.Close()

	var rowCount int
	if qErr := registry.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&rowCount); qErr != nil {
		t.Fatalf("count vault_files: %v", qErr)
	}
	if rowCount != 2 {
		t.Errorf("vault_files count after re-run: want 2, got %d", rowCount)
	}
}
+68
View File
@@ -0,0 +1,68 @@
package infra
import "sort"
// VaultFileChange holds the before/after state of a file whose content changed.
// VaultDiff emits one for each RelPath present in both snapshots with a
// different Sha256.
type VaultFileChange struct {
// RelPath is the path shared by Prev and Curr.
RelPath string
// Prev is the file as recorded in the previous snapshot.
Prev VaultFile
// Curr is the file as recorded in the current snapshot.
Curr VaultFile
}
// VaultDiffReport is the result of comparing two VaultFile slices.
// Files are matched by RelPath; VaultDiff sorts the three slices by RelPath
// ascending.
type VaultDiffReport struct {
Added []VaultFile // in curr but not in prev (by rel_path)
Removed []VaultFile // in prev but not in curr
Changed []VaultFileChange // same rel_path, different sha256
Unchanged int // count of curr files also in prev with identical sha256
}
// VaultDiff compares a previous and a current vault snapshot, matching files
// by RelPath.
//
// Each file in curr is classified as Added (RelPath absent from prev), Changed
// (present in prev with a different Sha256) or Unchanged (same Sha256). Each
// file in prev whose RelPath is absent from curr is reported as Removed.
// Added, Removed and Changed are sorted by RelPath ascending. Either argument
// may be nil. The function performs no I/O.
func VaultDiff(prev, curr []VaultFile) VaultDiffReport {
	// Look-up tables keyed by RelPath; for prev the last duplicate wins.
	before := make(map[string]VaultFile, len(prev))
	for i := range prev {
		before[prev[i].RelPath] = prev[i]
	}
	after := make(map[string]struct{}, len(curr))
	for i := range curr {
		after[curr[i].RelPath] = struct{}{}
	}

	var out VaultDiffReport

	for _, file := range curr {
		old, known := before[file.RelPath]
		switch {
		case !known:
			out.Added = append(out.Added, file)
		case old.Sha256 == file.Sha256:
			out.Unchanged++
		default:
			out.Changed = append(out.Changed, VaultFileChange{RelPath: file.RelPath, Prev: old, Curr: file})
		}
	}

	for _, file := range prev {
		if _, kept := after[file.RelPath]; kept {
			continue
		}
		out.Removed = append(out.Removed, file)
	}

	byRelPath := func(files []VaultFile) {
		sort.Slice(files, func(i, j int) bool { return files[i].RelPath < files[j].RelPath })
	}
	byRelPath(out.Added)
	byRelPath(out.Removed)
	sort.Slice(out.Changed, func(i, j int) bool {
		return out.Changed[i].RelPath < out.Changed[j].RelPath
	})

	return out
}
+49
View File
@@ -0,0 +1,49 @@
---
name: vault_diff
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: pure
signature: "func VaultDiff(prev, curr []VaultFile) VaultDiffReport"
description: "Computes the diff between two vault snapshots (slices of VaultFile). Returns the Added, Removed and Changed entries plus the count of Unchanged files. Pure and deterministic — no I/O."
tags: [vault, diff, comparison, pure]
uses_functions: []
uses_types: ["vault_file_go_infra"]
returns: []
returns_optional: false
error_type: ""
imports: ["sort"]
tested: true
tests:
- "TestVaultDiff_NoChanges"
- "TestVaultDiff_AllAdded"
- "TestVaultDiff_AllRemoved"
- "TestVaultDiff_ContentChanged"
- "TestVaultDiff_Mixed"
test_file_path: "functions/infra/vault_diff_test.go"
file_path: "functions/infra/vault_diff.go"
params:
- name: prev
desc: "Snapshot anterior — slice de VaultFile del estado previo del vault (puede ser nil para diff desde cero)."
- name: curr
desc: "Snapshot actual — slice de VaultFile del estado corriente del vault (puede ser nil para diff de borrado total)."
output: "VaultDiffReport con Added (nuevos), Removed (eliminados), Changed (mismo rel_path, sha256 distinto) y Unchanged (identicos). Todos los slices ordenados por RelPath ASC."
---
## Ejemplo
```go
prev, _ := infra.VaultInventoryScan(oldPath, "my_vault_proj", "my_vault")
curr, _ := infra.VaultInventoryScan(newPath, "my_vault_proj", "my_vault")
report := infra.VaultDiff(prev, curr)
fmt.Printf("Added: %d, Removed: %d, Changed: %d, Unchanged: %d\n",
len(report.Added), len(report.Removed), len(report.Changed), report.Unchanged)
```
## Notas
- Usa `RelPath` como clave de identidad de archivo (no nombre, no sha256).
- Dos archivos con mismo `RelPath` pero diferente `Sha256` se consideran Changed.
- Los slices del report se ordenan por `RelPath` ASC para salida deterministica.
- Función pura: no toca disco ni BD.
+126
View File
@@ -0,0 +1,126 @@
package infra
import (
"testing"
)
// makeVF returns a VaultFile fixture for the fixed vault "test_vault"/"test"
// with the given relative path and hash; all other fields stay zero.
func makeVF(relPath, sha256 string) VaultFile {
	var vf VaultFile
	vf.VaultID = "test_vault"
	vf.VaultName = "test"
	vf.RelPath = relPath
	vf.Sha256 = sha256
	return vf
}
// TestVaultDiff_NoChanges diffs a snapshot against itself and expects every
// file to be Unchanged and the three slices to be empty.
func TestVaultDiff_NoChanges(t *testing.T) {
	snapshot := []VaultFile{
		makeVF("data/a.csv", "aaa"),
		makeVF("data/b.csv", "bbb"),
	}

	diff := VaultDiff(snapshot, snapshot)

	if n := len(diff.Added); n != 0 {
		t.Errorf("Added: want 0, got %d", n)
	}
	if n := len(diff.Removed); n != 0 {
		t.Errorf("Removed: want 0, got %d", n)
	}
	if n := len(diff.Changed); n != 0 {
		t.Errorf("Changed: want 0, got %d", n)
	}
	if n := diff.Unchanged; n != 2 {
		t.Errorf("Unchanged: want 2, got %d", n)
	}
}
// TestVaultDiff_AllAdded diffs a nil previous snapshot against two current
// files and expects both to be reported as Added, in RelPath order, with
// nothing Removed.
func TestVaultDiff_AllAdded(t *testing.T) {
	curr := []VaultFile{
		makeVF("data/a.csv", "aaa"),
		makeVF("data/b.csv", "bbb"),
	}
	report := VaultDiff(nil, curr)

	// Fatalf rather than Errorf: Added[0] and Added[1] are indexed below, and
	// continuing with a shorter slice would panic instead of failing cleanly.
	if len(report.Added) != 2 {
		t.Fatalf("Added: want 2, got %d", len(report.Added))
	}
	if len(report.Removed) != 0 {
		t.Errorf("Removed: want 0, got %d", len(report.Removed))
	}
	if report.Added[0].RelPath != "data/a.csv" {
		t.Errorf("Added[0]: want data/a.csv, got %s", report.Added[0].RelPath)
	}
	if report.Added[1].RelPath != "data/b.csv" {
		t.Errorf("Added[1]: want data/b.csv, got %s", report.Added[1].RelPath)
	}
}
// TestVaultDiff_AllRemoved diffs two previous files against a nil current
// snapshot and expects both to be reported as Removed, sorted by RelPath, with
// nothing Added.
func TestVaultDiff_AllRemoved(t *testing.T) {
	prev := []VaultFile{
		makeVF("data/a.csv", "aaa"),
		makeVF("data/b.csv", "bbb"),
	}
	report := VaultDiff(prev, nil)

	// Fatalf rather than Errorf: Removed[0] is indexed below, and continuing
	// with an empty slice would panic instead of failing cleanly.
	if len(report.Removed) != 2 {
		t.Fatalf("Removed: want 2, got %d", len(report.Removed))
	}
	if len(report.Added) != 0 {
		t.Errorf("Added: want 0, got %d", len(report.Added))
	}
	if report.Removed[0].RelPath != "data/a.csv" {
		t.Errorf("Removed[0]: want data/a.csv, got %s", report.Removed[0].RelPath)
	}
}
func TestVaultDiff_ContentChanged(t *testing.T) {
prev := []VaultFile{
makeVF("data/a.csv", "old_hash"),
}
curr := []VaultFile{
makeVF("data/a.csv", "new_hash"),
}
report := VaultDiff(prev, curr)
if len(report.Changed) != 1 {
t.Fatalf("Changed: want 1, got %d", len(report.Changed))
}
if report.Changed[0].RelPath != "data/a.csv" {
t.Errorf("Changed[0].RelPath: want data/a.csv, got %s", report.Changed[0].RelPath)
}
if report.Changed[0].Prev.Sha256 != "old_hash" {
t.Errorf("Changed[0].Prev.Sha256: want old_hash, got %s", report.Changed[0].Prev.Sha256)
}
if report.Changed[0].Curr.Sha256 != "new_hash" {
t.Errorf("Changed[0].Curr.Sha256: want new_hash, got %s", report.Changed[0].Curr.Sha256)
}
if len(report.Added) != 0 || len(report.Removed) != 0 {
t.Errorf("Expected no added/removed, got %d/%d", len(report.Added), len(report.Removed))
}
if report.Unchanged != 0 {
t.Errorf("Unchanged: want 0, got %d", report.Unchanged)
}
}
// TestVaultDiff_Mixed diffs snapshots that contain one unchanged, one changed,
// one removed and one added file, and checks each category.
func TestVaultDiff_Mixed(t *testing.T) {
	before := []VaultFile{
		makeVF("data/a.csv", "aaa"),
		makeVF("data/b.csv", "bbb"),
		makeVF("data/c.csv", "ccc"), // removed
	}
	after := []VaultFile{
		makeVF("data/a.csv", "aaa"),     // unchanged
		makeVF("data/b.csv", "bbb_new"), // changed
		makeVF("data/d.csv", "ddd"),     // added
	}

	diff := VaultDiff(before, after)

	if !(len(diff.Added) == 1 && diff.Added[0].RelPath == "data/d.csv") {
		t.Errorf("Added: want [data/d.csv], got %v", diff.Added)
	}
	if !(len(diff.Removed) == 1 && diff.Removed[0].RelPath == "data/c.csv") {
		t.Errorf("Removed: want [data/c.csv], got %v", diff.Removed)
	}
	if !(len(diff.Changed) == 1 && diff.Changed[0].RelPath == "data/b.csv") {
		t.Errorf("Changed: want [data/b.csv], got %v", diff.Changed)
	}
	if diff.Unchanged != 1 {
		t.Errorf("Unchanged: want 1, got %d", diff.Unchanged)
	}
}
+230
View File
@@ -0,0 +1,230 @@
package infra
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
)
// VaultDoctorEntry holds the health report for a single vault.
// The struct tags define the JSON field names used when a report is encoded.
type VaultDoctorEntry struct {
// VaultName, VaultPath and ProjectID are copied from the vault's manifest entry.
VaultName string `json:"vault_name"`
VaultPath string `json:"vault_path"`
ProjectID string `json:"project_id"`
Issues []string `json:"issues"` // human-readable issues; empty = healthy
IndexedFiles int `json:"indexed_files"` // row count of the index's files table; 0 if no vault_index.db
LastIndexedAt int64 `json:"last_indexed_at"` // MAX(indexed_at) of the index, unix seconds; 0 if N/A
DiskFiles int `json:"disk_files"` // count via WalkDir (no hashing)
Status string `json:"status"` // "ok" | "warning" | "error"
}
// VaultDoctor audits every vault that VaultManifestRead finds under repoRoot
// (the manifests live in projects/*/vaults/vault.yaml) and returns one
// VaultDoctorEntry per vault, in manifest order. Each entry is produced by
// auditVault, which checks disk presence, layout, index existence, staleness
// and drift.
//
// The only error returned is a failure of VaultManifestRead, wrapped with the
// "vault_doctor: read manifests" prefix; in that case the slice is nil.
func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error) {
	manifest, err := VaultManifestRead(repoRoot)
	if err != nil {
		return nil, fmt.Errorf("vault_doctor: read manifests: %w", err)
	}

	report := make([]VaultDoctorEntry, len(manifest))
	for i, vault := range manifest {
		report[i] = auditVault(vault)
	}
	return report, nil
}
// auditVault runs the health checks for one manifest entry and returns the
// resulting report. Checks, in order:
//
//  1. directory_missing: e.Path (after symlink resolution, when that works)
//     does not stat as a directory. Status becomes "error" and nothing else
//     is checked.
//  2. layout_missing / non_standard_layout: neither data/ nor knowledge/
//     exists at the vault root.
//  3. index_missing: vault_index.db cannot be stat'ed. Index checks are skipped.
//  4. index_open_error: VaultIndexOpen failed. Index checks are skipped.
//  5. index_query_error: the COUNT/MAX query on the files table failed.
//     Staleness and drift are not evaluated, because there is no indexed
//     count or timestamp to compare against.
//  6. index_stale: some file on disk has an mtime after MAX(indexed_at).
//  7. index_drift: the disk file count differs from the indexed row count.
//  8. empty_vault: no countable file on disk. This is reported on every path
//     past check 1, including when the index is missing or unreadable.
//
// Status is "error" for check 1, otherwise "ok" when Issues is empty and
// "warning" when it is not.
func auditVault(e VaultManifestEntry) VaultDoctorEntry {
	entry := VaultDoctorEntry{
		VaultName: e.Name,
		VaultPath: e.Path,
		ProjectID: e.ProjectID,
	}

	// Resolve symlinks for disk checks; fall back to the declared path.
	realPath, err := filepath.EvalSymlinks(e.Path)
	if err != nil || realPath == "" {
		realPath = e.Path
	}

	// CHECK 1: directory_missing
	info, statErr := os.Stat(realPath)
	if statErr != nil || !info.IsDir() {
		entry.Issues = append(entry.Issues, "directory_missing")
		entry.Status = "error"
		return entry
	}

	// COUNT disk files (cheap walk — no hashing, no mime detection)
	diskCount := countDiskFiles(realPath)
	entry.DiskFiles = diskCount

	// finish appends the empty_vault issue when applicable and derives the
	// final status; every return path below the directory check goes through it.
	finish := func() VaultDoctorEntry {
		if diskCount == 0 {
			entry.Issues = append(entry.Issues, "empty_vault")
		}
		entry.setFinalStatus()
		return entry
	}

	// CHECK 2: layout_missing / non_standard_layout
	hasData := dirExists(filepath.Join(realPath, "data"))
	hasKnowledge := dirExists(filepath.Join(realPath, "knowledge"))
	if !hasData && !hasKnowledge {
		// Distinguish an intentional custom layout from no layout at all.
		if hasNonStandardLayout(realPath) {
			entry.Issues = append(entry.Issues, "non_standard_layout")
		} else {
			entry.Issues = append(entry.Issues, "layout_missing")
		}
	}

	// CHECK 3: index_missing
	if _, indexStatErr := os.Stat(filepath.Join(realPath, "vault_index.db")); indexStatErr != nil {
		entry.Issues = append(entry.Issues, "index_missing")
		return finish()
	}

	// Open the vault index for the remaining checks.
	vdb, openErr := VaultIndexOpen(realPath)
	if openErr != nil {
		entry.Issues = append(entry.Issues, fmt.Sprintf("index_open_error: %v", openErr))
		return finish()
	}
	defer vdb.Close()

	// Query indexed file count and max indexed_at
	var indexedCount int
	var maxIndexedAt int64
	row := vdb.QueryRow(`SELECT COUNT(*), COALESCE(MAX(indexed_at), 0) FROM files`)
	if scanErr := row.Scan(&indexedCount, &maxIndexedAt); scanErr != nil {
		// Without a reliable count or timestamp, staleness and drift cannot be
		// judged; reporting them from zero values would be a false positive.
		entry.Issues = append(entry.Issues, fmt.Sprintf("index_query_error: %v", scanErr))
		return finish()
	}
	entry.IndexedFiles = indexedCount
	entry.LastIndexedAt = maxIndexedAt

	// CHECK: index_stale — any file on disk newer than MAX(indexed_at)
	if maxIndexedAt > 0 && isIndexStale(realPath, time.Unix(maxIndexedAt, 0)) {
		entry.Issues = append(entry.Issues, "index_stale")
	}

	// CHECK: index_drift — disk file count != indexed count
	if indexedCount != diskCount {
		entry.Issues = append(entry.Issues, fmt.Sprintf("index_drift: disk=%d indexed=%d", diskCount, indexedCount))
	}

	// CHECK: empty_vault + final status
	return finish()
}
// setWarningStatus marks the entry as "warning" unless it is already "error",
// in which case the status is left untouched.
func (e *VaultDoctorEntry) setWarningStatus() {
	if e.Status == "error" {
		return
	}
	e.Status = "warning"
}
// setFinalStatus derives Status from Issues: "ok" when there are none,
// "warning" otherwise. An entry already marked "error" keeps that status.
func (e *VaultDoctorEntry) setFinalStatus() {
	switch {
	case e.Status == "error":
		// error is sticky
	case len(e.Issues) == 0:
		e.Status = "ok"
	default:
		e.Status = "warning"
	}
}
// countDiskFiles walks realPath and returns the number of non-directory
// entries found, ignoring:
//   - hidden entries (name starting with "."), at any depth below the root;
//     hidden directories such as .git are not descended into;
//   - the index files vault_index.db, vault_index.db-shm and vault_index.db-wal.
//
// The root itself is always walked, even when its own name starts with "."
// (for example "." or a vault kept in a dot-directory). Entries that cannot be
// read are skipped silently, so the result is a best-effort count; no file
// content is read.
func countDiskFiles(realPath string) int {
	count := 0
	_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			// Unreadable entry: ignore it and keep walking.
			return nil
		}
		name := d.Name()

		// Skip hidden entries, but never the walk root: pruning it would hide
		// the whole vault and report zero files.
		if path != realPath && strings.HasPrefix(name, ".") {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		if d.IsDir() {
			return nil
		}

		// The index database and its -shm/-wal companions are not vault content.
		switch name {
		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
			return nil
		}

		count++
		return nil
	})
	return count
}
// isIndexStale reports whether any non-directory entry under realPath has a
// modification time strictly after maxTime.
//
// Hidden entries (name starting with ".") below the root and the index files
// vault_index.db, vault_index.db-shm and vault_index.db-wal are ignored; hidden
// directories are not descended into. The root itself is always walked, even
// when its own name starts with ".". Entries that cannot be read or stat'ed
// are skipped. The walk is pruned as soon as one newer file is found.
func isIndexStale(realPath string, maxTime time.Time) bool {
	stale := false
	_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
		if stale {
			// Answer already known: skip whatever is left of the tree.
			return filepath.SkipDir
		}
		if err != nil {
			return nil
		}
		name := d.Name()

		// Skip hidden entries, but never the walk root: pruning it would make
		// every vault kept in a dot-directory look fresh.
		if path != realPath && strings.HasPrefix(name, ".") {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		if d.IsDir() {
			return nil
		}

		// The index database is rewritten by indexing itself; its mtime says
		// nothing about vault content.
		switch name {
		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
			return nil
		}

		if fi, statErr := d.Info(); statErr == nil && fi.ModTime().After(maxTime) {
			stale = true
			return filepath.SkipDir
		}
		return nil
	})
	return stale
}
// hasNonStandardLayout reports whether the vault root contains at least one
// visible subdirectory other than data/ and knowledge/. Hidden directories
// (including .git) and plain files do not count. It returns false when the
// directory cannot be read.
func hasNonStandardLayout(realPath string) bool {
	children, err := os.ReadDir(realPath)
	if err != nil {
		return false
	}
	for _, child := range children {
		if !child.IsDir() {
			continue
		}
		switch name := child.Name(); {
		case name == "data", name == "knowledge", name == ".git":
			continue
		case strings.HasPrefix(name, "."):
			continue
		}
		return true
	}
	return false
}
+66
View File
@@ -0,0 +1,66 @@
---
name: vault_doctor
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error)"
description: "Audita la salud de todos los vaults declarados en projects/*/vaults/vault.yaml. Comprueba existencia del directorio, layout estándar, presencia del índice, staleness y drift entre disco e índice. Read-only."
tags: [vault, doctor, health, audit]
uses_functions:
- "vault_manifest_read_go_infra"
- "vault_index_open_go_infra"
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- "fmt"
- "os"
- "path/filepath"
- "strings"
- "time"
tested: true
tests:
- "TestVaultDoctor_OK"
- "TestVaultDoctor_MissingDir"
- "TestVaultDoctor_NoIndex"
- "TestVaultDoctor_LayoutDrift"
- "TestVaultDoctor_EmptyVault"
test_file_path: "functions/infra/vault_doctor_test.go"
file_path: "functions/infra/vault_doctor.go"
params:
- name: repoRoot
desc: "Ruta absoluta a la raiz del fn_registry (donde están projects/ y registry.db)."
output: "Slice de VaultDoctorEntry con Status (ok/warning/error), Issues, DiskFiles, IndexedFiles y LastIndexedAt por vault. Error fatal solo si los manifests no se pueden leer."
---
## Checks aplicados
| Check | Condición | Severidad |
|---|---|---|
| `directory_missing` | `e.Path` no existe en disco o no es un directorio | error |
| `layout_missing` | no hay `data/` ni `knowledge/` en la raíz del vault | warning |
| `non_standard_layout` | no hay `data/`/`knowledge/` pero sí otros subdirectorios (ej. imagegen_models) | warning |
| `index_missing` | no existe `vault_index.db` | warning |
| `index_stale` | algún archivo en disco tiene mtime > MAX(indexed_at) | warning |
| `index_drift` | count disco != count en tabla `files` | warning |
| `empty_vault` | DiskFiles == 0 | warning |
## Ejemplo
```go
entries, err := infra.VaultDoctor("/home/lucas/fn_registry")
for _, e := range entries {
fmt.Printf("%-30s %-8s files=%d issues=%v\n",
e.VaultName, e.Status, e.DiskFiles, e.Issues)
}
```
## Notas
- Función read-only: nunca escribe en disco ni en ninguna base de datos.
- `countDiskFiles` usa `filepath.WalkDir` sin hash (cheap) — excluye `vault_index.db*`, `.git/` y ficheros ocultos.
- `isIndexStale` también usa WalkDir; compara mtime de archivos con MAX(indexed_at) de la BD.
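- `index_missing` se detecta con `os.Stat` sobre `vault_index.db` antes de abrir el índice; si después `VaultIndexOpen` falla, se reporta `index_open_error`, y si falla la consulta sobre la tabla `files`, `index_query_error`.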
+211
View File
@@ -0,0 +1,211 @@
package infra
import (
"os"
"path/filepath"
"testing"
"time"
)
// setupDoctorRepo creates a temporary repo root containing a single project
// manifest, projects/<projectID>/vaults/vault.yaml, that declares one vault
// named vaultName at vaultPath, and returns the root. vaultPath is written
// into the manifest verbatim and is not created or checked here, so it may
// point at a directory that does not exist.
func setupDoctorRepo(t *testing.T, vaultName, projectID, vaultPath string) string {
	t.Helper()
	repo := t.TempDir()

	manifestDir := filepath.Join(repo, "projects", projectID, "vaults")
	if mkErr := os.MkdirAll(manifestDir, 0755); mkErr != nil {
		t.Fatalf("mkdir projects: %v", mkErr)
	}

	yamlBody := []byte("vaults:\n - name: " + vaultName + "\n description: test vault\n path: " + vaultPath + "\n tags: []\n")
	if wrErr := os.WriteFile(filepath.Join(manifestDir, "vault.yaml"), yamlBody, 0644); wrErr != nil {
		t.Fatalf("write vault.yaml: %v", wrErr)
	}
	return repo
}
// TestVaultDoctor_OK audits a vault with the standard data/ and knowledge/
// layout, one file on disk and one matching row in the index whose indexed_at
// is later than the file's mtime. It expects status "ok" with no issues.
func TestVaultDoctor_OK(t *testing.T) {
	vaultPath := t.TempDir()

	// Standard layout.
	for _, dir := range []string{
		filepath.Join(vaultPath, "data", "raw"),
		filepath.Join(vaultPath, "knowledge"),
	} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			t.Fatal(err)
		}
	}

	// One file, back-dated by an hour so the index (stamped "now") is newer.
	sample := filepath.Join(vaultPath, "data", "raw", "sample.csv")
	if err := os.WriteFile(sample, []byte("a,b\n1,2\n"), 0644); err != nil {
		t.Fatal(err)
	}
	anHourAgo := time.Now().Add(-1 * time.Hour)
	if err := os.Chtimes(sample, anHourAgo, anHourAgo); err != nil {
		t.Fatal(err)
	}

	// Index with that file recorded.
	index, err := VaultIndexOpen(vaultPath)
	if err != nil {
		t.Fatalf("VaultIndexOpen: %v", err)
	}
	indexedAt := time.Now().Unix() // later than the file's mtime
	const insertFile = `INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
	if _, insErr := index.Exec(insertFile,
		"data/raw/sample.csv", 8, anHourAgo.Unix(), "deadbeef", "text/csv", ".csv", "data", "raw", indexedAt); insErr != nil {
		t.Fatalf("insert: %v", insErr)
	}
	index.Close()

	repo := setupDoctorRepo(t, "my_vault", "my_proj", vaultPath)
	report, err := VaultDoctor(repo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(report); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := report[0]
	if got.Status != "ok" {
		t.Errorf("Status: want ok, got %s (issues: %v)", got.Status, got.Issues)
	}
	if len(got.Issues) > 0 {
		t.Errorf("Issues: want empty, got %v", got.Issues)
	}
	if got.DiskFiles != 1 {
		t.Errorf("DiskFiles: want 1, got %d", got.DiskFiles)
	}
	if got.IndexedFiles != 1 {
		t.Errorf("IndexedFiles: want 1, got %d", got.IndexedFiles)
	}
}
// TestVaultDoctor_MissingDir declares a vault whose path does not exist and
// expects status "error" with a directory_missing issue.
func TestVaultDoctor_MissingDir(t *testing.T) {
	absent := filepath.Join(t.TempDir(), "does_not_exist")
	repo := setupDoctorRepo(t, "missing_vault", "my_proj", absent)

	report, err := VaultDoctor(repo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(report); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := report[0]
	if got.Status != "error" {
		t.Errorf("Status: want error, got %s", got.Status)
	}

	reported := false
	for _, issue := range got.Issues {
		if issue == "directory_missing" {
			reported = true
			break
		}
	}
	if !reported {
		t.Errorf("Expected directory_missing issue, got %v", got.Issues)
	}
}
// TestVaultDoctor_NoIndex audits a vault with a standard layout and one file
// but no vault_index.db, and expects status "warning" with an index_missing
// issue.
func TestVaultDoctor_NoIndex(t *testing.T) {
	vaultPath := t.TempDir()

	// Standard layout with content, but the index is never created.
	rawDir := filepath.Join(vaultPath, "data", "raw")
	if err := os.MkdirAll(rawDir, 0755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(rawDir, "a.csv"), []byte("x"), 0644); err != nil {
		t.Fatal(err)
	}

	repo := setupDoctorRepo(t, "no_index_vault", "my_proj", vaultPath)
	report, err := VaultDoctor(repo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(report); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := report[0]
	if got.Status != "warning" {
		t.Errorf("Status: want warning, got %s", got.Status)
	}

	reported := false
	for _, issue := range got.Issues {
		if issue == "index_missing" {
			reported = true
			break
		}
	}
	if !reported {
		t.Errorf("Expected index_missing issue, got %v", got.Issues)
	}
}
// TestVaultDoctor_LayoutDrift audits a vault that has neither data/ nor
// knowledge/, only a loose file at its root, and expects status "warning"
// with either a layout_missing or a non_standard_layout issue.
func TestVaultDoctor_LayoutDrift(t *testing.T) {
	vaultPath := t.TempDir()

	// A single stray file; no bucket directories.
	stray := filepath.Join(vaultPath, "something.txt")
	if err := os.WriteFile(stray, []byte("hi"), 0644); err != nil {
		t.Fatal(err)
	}

	repo := setupDoctorRepo(t, "layout_vault", "my_proj", vaultPath)
	report, err := VaultDoctor(repo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(report); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := report[0]
	if got.Status != "warning" {
		t.Errorf("Status: want warning, got %s", got.Status)
	}

	layoutFlagged := false
	for _, issue := range got.Issues {
		switch issue {
		case "layout_missing", "non_standard_layout":
			layoutFlagged = true
		}
	}
	if !layoutFlagged {
		t.Errorf("Expected layout_missing or non_standard_layout, got %v", got.Issues)
	}
}
// TestVaultDoctor_EmptyVault audits a vault whose data/ and knowledge/
// directories exist but hold no files, with an index that has no rows, and
// expects status "warning" with an empty_vault issue.
func TestVaultDoctor_EmptyVault(t *testing.T) {
	vaultPath := t.TempDir()

	// Bucket directories without content.
	for _, bucket := range []string{"data", "knowledge"} {
		if err := os.MkdirAll(filepath.Join(vaultPath, bucket), 0755); err != nil {
			t.Fatal(err)
		}
	}

	// Index opened and closed without inserting any row.
	index, err := VaultIndexOpen(vaultPath)
	if err != nil {
		t.Fatalf("VaultIndexOpen: %v", err)
	}
	index.Close()

	repo := setupDoctorRepo(t, "empty_vault", "my_proj", vaultPath)
	report, err := VaultDoctor(repo)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(report); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := report[0]
	if got.Status != "warning" {
		t.Errorf("Status: want warning, got %s (issues: %v)", got.Status, got.Issues)
	}

	reported := false
	for _, issue := range got.Issues {
		if issue == "empty_vault" {
			reported = true
			break
		}
	}
	if !reported {
		t.Errorf("Expected empty_vault issue, got %v", got.Issues)
	}
}
+21
View File
@@ -0,0 +1,21 @@
package infra
// VaultFile is the inventory record for one file stored inside a vault
// directory. It groups three kinds of information: identity (vault plus
// relative path), content metadata (size, mtime, sha256, mime, extension) and
// structural classification (bucket and sub-bucket).
type VaultFile struct {
	// VaultID identifies the vault, e.g. "turismo_spain_app_turismo".
	VaultID string `json:"vault_id"`
	// VaultName is the vault's name, e.g. "turismo_spain".
	VaultName string `json:"vault_name"`
	// RelPath is the path relative to the vault root, e.g. "data/raw/foo.csv".
	RelPath string `json:"rel_path"`
	// Size is the file size in bytes.
	Size int64 `json:"size"`
	// Mtime is the modification time in unix seconds (UTC).
	Mtime int64 `json:"mtime"`
	// Sha256 is the content hash, hex encoded in lowercase.
	Sha256 string `json:"sha256"`
	// Mime is the MIME type, e.g. "text/csv".
	Mime string `json:"mime"`
	// Ext is the file extension including the dot, e.g. ".csv".
	Ext string `json:"ext"`
	// Bucket is the top-level classification: "data" or "knowledge".
	Bucket string `json:"bucket"`
	// SubBucket is the second-level directory within the bucket.
	// Known values: raw, processed, exports (data); decisions, domains,
	// models, benchmarks, test_documents (knowledge). It is the empty string
	// for files at the bucket root.
	SubBucket string `json:"sub_bucket"`
}
@@ -0,0 +1,49 @@
-- files holds one row per indexed file, keyed by its path relative to the vault root.
-- size is in bytes and sha256 is the content hash as text.
-- mtime and indexed_at are integer timestamps.
CREATE TABLE IF NOT EXISTS files (
rel_path TEXT PRIMARY KEY,
size INTEGER NOT NULL,
mtime INTEGER NOT NULL,
sha256 TEXT NOT NULL,
mime TEXT NOT NULL DEFAULT '',
ext TEXT NOT NULL DEFAULT '',
bucket TEXT NOT NULL DEFAULT '',
sub_bucket TEXT NOT NULL DEFAULT '',
indexed_at INTEGER NOT NULL
);
-- Secondary indexes for lookups by content hash and by bucket and sub_bucket.
CREATE INDEX IF NOT EXISTS idx_files_sha256 ON files(sha256);
CREATE INDEX IF NOT EXISTS idx_files_bucket ON files(bucket, sub_bucket);
-- files_fts is the FTS5 full-text index over rel_path and content_text.
-- The empty content option declares it as a contentless FTS5 table.
-- NOTE(review): contentless tables do not return stored column values, so
-- confirm that writers which filter on rel_path behave as intended.
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
rel_path,
content_text,
content='',
tokenize='unicode61 remove_diacritics 2'
);
-- csv_profiles stores per-file CSV profiling results.
-- Each row references files and is removed with its parent row through the
-- cascading foreign key, provided foreign key enforcement is enabled.
CREATE TABLE IF NOT EXISTS csv_profiles (
rel_path TEXT PRIMARY KEY,
cols_json TEXT NOT NULL,
n_rows INTEGER NOT NULL,
encoding TEXT NOT NULL DEFAULT '',
date_min TEXT,
date_max TEXT,
profiled_at INTEGER NOT NULL,
FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);
-- pdf_extracts stores per-file PDF text extraction metadata.
-- Rows cascade with their parent files row in the same way as csv_profiles.
CREATE TABLE IF NOT EXISTS pdf_extracts (
rel_path TEXT PRIMARY KEY,
page_count INTEGER NOT NULL,
text_len INTEGER NOT NULL,
extracted_to TEXT,
extracted_at INTEGER NOT NULL,
FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);
-- knowledge_docs stores the title, front matter and headings of parsed documents as JSON text.
-- Rows cascade with their parent files row in the same way as csv_profiles.
CREATE TABLE IF NOT EXISTS knowledge_docs (
rel_path TEXT PRIMARY KEY,
title TEXT NOT NULL DEFAULT '',
frontmatter_json TEXT NOT NULL DEFAULT '{}',
headings_json TEXT NOT NULL DEFAULT '[]',
parsed_at INTEGER NOT NULL,
FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);
+30
View File
@@ -0,0 +1,30 @@
package infra
import (
"database/sql"
"embed"
"fmt"
"path/filepath"
)
// vaultIndexMigrationsFS holds the SQL files matched by the embed pattern
// below (vault_index_migrations/*.sql), compiled into the binary at build time.
//
//go:embed vault_index_migrations/*.sql
var vaultIndexMigrationsFS embed.FS
// VaultIndexOpen returns a database handle for <vaultPath>/vault_index.db,
// opening the file through SQLiteOpen and then running the embedded
// vault_index_migrations/*.sql files through ApplyMigrations.
//
// On success the caller owns the returned *sql.DB and must close it. If the
// migrations fail, the handle is closed here and an error is returned.
//
// NOTE(review): WAL mode, foreign-key enforcement, idempotence and the
// lexicographic migration order are properties of SQLiteOpen and
// ApplyMigrations, which are defined elsewhere — confirm there.
func VaultIndexOpen(vaultPath string) (*sql.DB, error) {
	conn, openErr := SQLiteOpen(filepath.Join(vaultPath, "vault_index.db"), "")
	if openErr != nil {
		return nil, fmt.Errorf("vault_index_open: %w", openErr)
	}

	migrateErr := ApplyMigrations(conn, vaultIndexMigrationsFS, "vault_index_migrations/*.sql")
	if migrateErr == nil {
		return conn, nil
	}

	// Do not hand out a handle whose schema could not be brought up to date.
	conn.Close()
	return nil, fmt.Errorf("vault_index_open: apply migrations: %w", migrateErr)
}
+54
View File
@@ -0,0 +1,54 @@
---
name: vault_index_open
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultIndexOpen(vaultPath string) (*sql.DB, error)"
description: "Abre (o crea) vault_index.db dentro de vaultPath con WAL + FK y aplica las migraciones embebidas idempotentemente. El caller cierra la conexion."
tags: [vault, sqlite, index, migration, infra]
uses_functions: ["sqlite_open_go_infra", "sqlite_apply_migrations_go_infra"]
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [database/sql, embed, fmt, path/filepath]
params:
- name: vaultPath
desc: "ruta absoluta o relativa al directorio raiz del vault"
output: "*sql.DB apuntando a <vaultPath>/vault_index.db con schema completo aplicado; el caller es responsable de cerrar"
tested: true
tests:
- "crea vault_index.db en tmpdir vacio"
- "segunda apertura no falla (idempotente)"
- "todas las tablas esperadas existen en sqlite_master"
- "fts5 INSERT y MATCH funcionan"
test_file_path: "functions/infra/vault_index_open_test.go"
file_path: "functions/infra/vault_index_open.go"
---
## Ejemplo
```go
db, err := VaultIndexOpen("/data/vaults/turismo_spain")
if err != nil {
log.Fatal(err)
}
defer db.Close()
```
## Notas
El archivo de base de datos se crea en `<vaultPath>/vault_index.db`. Las migraciones
viven en `vault_index_migrations/*.sql` embebidas via `//go:embed` en el mismo paquete.
Schema creado por `001_init.sql`:
- `files` — inventario de archivos (PK: rel_path)
- `files_fts` — tabla FTS5 virtual para busqueda de texto (content_text lo llenan profilers posteriores)
- `csv_profiles` — perfil de columnas/filas para .csv (FK → files)
- `pdf_extracts` — metadatos de extraccion de texto para .pdf (FK → files)
- `knowledge_docs` — headings/frontmatter para .md del bucket knowledge (FK → files)
`SQLiteOpen` abre con WAL mode + foreign keys. `ApplyMigrations` es idempotente:
los errores de "already exists" y "duplicate column" se ignoran silenciosamente.
+107
View File
@@ -0,0 +1,107 @@
package infra
import (
"database/sql"
"os"
"path/filepath"
"testing"
)
// TestVaultIndexOpen_CreatesDB checks that opening the index in an empty
// temporary directory leaves a vault_index.db file on disk.
func TestVaultIndexOpen_CreatesDB(t *testing.T) {
	t.Run("crea vault_index.db en tmpdir vacio", func(t *testing.T) {
		dir := t.TempDir()
		db, err := VaultIndexOpen(dir)
		if err != nil {
			t.Fatalf("VaultIndexOpen: %v", err)
		}
		defer db.Close()

		// Any Stat failure means the file could not be confirmed. Checking
		// only for "does not exist" would let other errors (for example a
		// permission problem) pass as success.
		dbPath := filepath.Join(dir, "vault_index.db")
		if _, statErr := os.Stat(dbPath); statErr != nil {
			if os.IsNotExist(statErr) {
				t.Fatalf("vault_index.db no fue creado en %s", dir)
			}
			t.Fatalf("stat %s: %v", dbPath, statErr)
		}
	})
}
// TestVaultIndexOpen_Idempotent opens the same vault directory twice in a row,
// closing the handle in between, and expects both calls to succeed.
func TestVaultIndexOpen_Idempotent(t *testing.T) {
	t.Run("segunda apertura no falla (idempotente)", func(t *testing.T) {
		vault := t.TempDir()

		first, firstErr := VaultIndexOpen(vault)
		if firstErr != nil {
			t.Fatalf("primera apertura: %v", firstErr)
		}
		first.Close()

		second, secondErr := VaultIndexOpen(vault)
		if secondErr != nil {
			t.Fatalf("segunda apertura: %v", secondErr)
		}
		second.Close()
	})
}
// TestVaultIndexOpen_AppliesAllMigrations opens a fresh index and checks that
// the files, csv_profiles, pdf_extracts and knowledge_docs tables are listed
// in sqlite_master.
func TestVaultIndexOpen_AppliesAllMigrations(t *testing.T) {
	t.Run("todas las tablas esperadas existen en sqlite_master", func(t *testing.T) {
		index, openErr := VaultIndexOpen(t.TempDir())
		if openErr != nil {
			t.Fatalf("VaultIndexOpen: %v", openErr)
		}
		defer index.Close()

		for _, table := range []string{
			"files",
			"csv_profiles",
			"pdf_extracts",
			"knowledge_docs",
		} {
			assertTableExists(t, index, table)
		}
	})
}
// TestVaultIndexOpen_FTS5Works inserts one row into files_fts and expects a
// MATCH query for a word of its content_text to find exactly that row.
func TestVaultIndexOpen_FTS5Works(t *testing.T) {
	t.Run("fts5 INSERT y MATCH funcionan", func(t *testing.T) {
		index, openErr := VaultIndexOpen(t.TempDir())
		if openErr != nil {
			t.Fatalf("VaultIndexOpen: %v", openErr)
		}
		defer index.Close()

		// files_fts is declared with content='', so rows are inserted by hand.
		const insertSQL = `INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`
		if _, insErr := index.Exec(insertSQL,
			"data/raw/informe_ventas.csv", "ventas trimestrales empresa"); insErr != nil {
			t.Fatalf("INSERT files_fts: %v", insErr)
		}

		var hits int
		row := index.QueryRow(
			`SELECT count(*) FROM files_fts WHERE files_fts MATCH 'ventas'`,
		)
		if scanErr := row.Scan(&hits); scanErr != nil {
			t.Fatalf("FTS MATCH query: %v", scanErr)
		}
		if hits != 1 {
			t.Errorf("FTS MATCH: got %d rows, want 1", hits)
		}
	})
}
// assertTableExists reports a test error when sqlite_master contains no entry
// called name, and aborts the test when the lookup query itself fails.
func assertTableExists(t *testing.T, db *sql.DB, name string) {
	t.Helper()

	const lookup = `SELECT count(*) FROM sqlite_master WHERE name = ?`
	var matches int
	if err := db.QueryRow(lookup, name).Scan(&matches); err != nil {
		t.Fatalf("sqlite_master query for %q: %v", name, err)
	}
	if matches == 0 {
		t.Errorf("table/vtable %q not found in sqlite_master", name)
	}
}
+154
View File
@@ -0,0 +1,154 @@
package infra
import (
"database/sql"
"fmt"
"strings"
"time"
)
// WriteReport carries the row counts produced by one VaultIndexWrite call.
type WriteReport struct {
	// Inserted is the number of rows newly inserted into files.
	Inserted int
	// Updated is the number of rows updated (upserted) in files.
	Updated int
	// Pruned is the number of rows deleted from files (only when prune=true).
	Pruned int
	// FTS is the number of rows inserted into files_fts.
	FTS int
}
// VaultIndexWrite upserts files into the files table of the vault index opened
// as db, refreshes their files_fts entries and, when prune is true, deletes
// every files row whose rel_path is not present in files.
//
// All changes run inside a single transaction. If any step fails the
// transaction is rolled back, nothing is persisted, and a zero WriteReport is
// returned together with the error.
//
// Parameters:
//   - db: open handle on a vault index database.
//   - files: records to insert or update; may be empty.
//   - prune: when true, rows of files that are absent from the slice are
//     deleted; with an empty slice this deletes every row.
//
// Counting: the rel_paths already stored are loaded once before the loop. An
// upsert counts as Updated when its rel_path was already stored (or appeared
// earlier in the same slice) and as Inserted otherwise. FTS counts the rows
// inserted into files_fts, and Pruned the rows deleted from files.
func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error) {
	if len(files) == 0 && !prune {
		return WriteReport{}, nil
	}

	tx, err := db.Begin()
	if err != nil {
		return WriteReport{}, fmt.Errorf("vault_index_write: begin tx: %w", err)
	}
	// Roll back on every exit path. After a successful Commit this call is a
	// no-op that only returns sql.ErrTxDone, so it is safe to defer
	// unconditionally and cannot be bypassed by a shadowed error variable.
	defer tx.Rollback() //nolint:errcheck

	existing, err := vaultIndexExistingPaths(tx)
	if err != nil {
		return WriteReport{}, err
	}

	upsertStmt, err := tx.Prepare(`
		INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
		ON CONFLICT(rel_path) DO UPDATE SET
			size = excluded.size,
			mtime = excluded.mtime,
			sha256 = excluded.sha256,
			mime = excluded.mime,
			ext = excluded.ext,
			bucket = excluded.bucket,
			sub_bucket = excluded.sub_bucket,
			indexed_at = excluded.indexed_at
	`)
	if err != nil {
		return WriteReport{}, fmt.Errorf("vault_index_write: prepare upsert: %w", err)
	}
	defer upsertStmt.Close()

	// NOTE(review): the schema migration declares files_fts with content=''
	// (a contentless FTS5 table). Confirm that this DELETE filtered by
	// rel_path really removes the previous entry; if it does not, repeated
	// writes accumulate duplicate FTS rows.
	ftsDeleteStmt, err := tx.Prepare(`DELETE FROM files_fts WHERE rel_path = ?`)
	if err != nil {
		return WriteReport{}, fmt.Errorf("vault_index_write: prepare fts delete: %w", err)
	}
	defer ftsDeleteStmt.Close()

	ftsInsertStmt, err := tx.Prepare(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, '')`)
	if err != nil {
		return WriteReport{}, fmt.Errorf("vault_index_write: prepare fts insert: %w", err)
	}
	defer ftsInsertStmt.Close()

	var report WriteReport
	now := time.Now().Unix() // one indexed_at value shared by the whole batch
	for i := range files {
		f := &files[i]
		if _, err := upsertStmt.Exec(
			f.RelPath, f.Size, f.Mtime, f.Sha256,
			f.Mime, f.Ext, f.Bucket, f.SubBucket, now,
		); err != nil {
			return WriteReport{}, fmt.Errorf("vault_index_write: upsert %q: %w", f.RelPath, err)
		}
		if _, seen := existing[f.RelPath]; seen {
			report.Updated++
		} else {
			report.Inserted++
			// Remember the path so a duplicate later in the slice is counted
			// as an update of the row just inserted.
			existing[f.RelPath] = struct{}{}
		}

		// Refresh the FTS row with an empty content_text.
		if _, err := ftsDeleteStmt.Exec(f.RelPath); err != nil {
			return WriteReport{}, fmt.Errorf("vault_index_write: fts delete %q: %w", f.RelPath, err)
		}
		if _, err := ftsInsertStmt.Exec(f.RelPath); err != nil {
			return WriteReport{}, fmt.Errorf("vault_index_write: fts insert %q: %w", f.RelPath, err)
		}
		report.FTS++
	}

	if prune {
		pruned, err := vaultIndexPrune(tx, files)
		if err != nil {
			return WriteReport{}, err
		}
		report.Pruned = pruned
	}

	if err := tx.Commit(); err != nil {
		return WriteReport{}, fmt.Errorf("vault_index_write: commit: %w", err)
	}
	return report, nil
}

// vaultIndexExistingPaths returns the set of rel_path values currently stored
// in the files table, read through tx.
func vaultIndexExistingPaths(tx *sql.Tx) (map[string]struct{}, error) {
	rows, err := tx.Query(`SELECT rel_path FROM files`)
	if err != nil {
		return nil, fmt.Errorf("vault_index_write: query existing: %w", err)
	}
	defer rows.Close()

	paths := make(map[string]struct{})
	for rows.Next() {
		var rp string
		if err := rows.Scan(&rp); err != nil {
			return nil, fmt.Errorf("vault_index_write: scan existing: %w", err)
		}
		paths[rp] = struct{}{}
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("vault_index_write: rows err: %w", err)
	}
	return paths, nil
}

// vaultIndexPrune deletes, through tx, every files row whose rel_path is not
// in keepFiles and returns how many rows were deleted. An empty keepFiles
// deletes every row.
func vaultIndexPrune(tx *sql.Tx, keepFiles []VaultFile) (int, error) {
	var (
		res sql.Result
		err error
	)
	if len(keepFiles) == 0 {
		res, err = tx.Exec(`DELETE FROM files`)
		if err != nil {
			return 0, fmt.Errorf("vault_index_write: prune all: %w", err)
		}
	} else {
		// Build the list of quoted literals for NOT IN. Single quotes inside
		// a path are doubled, which is how a quote is written inside a SQL
		// string literal.
		var keep strings.Builder
		for i := range keepFiles {
			if i > 0 {
				keep.WriteByte(',')
			}
			keep.WriteByte('\'')
			keep.WriteString(strings.ReplaceAll(keepFiles[i].RelPath, "'", "''"))
			keep.WriteByte('\'')
		}
		res, err = tx.Exec(fmt.Sprintf(
			`DELETE FROM files WHERE rel_path NOT IN (%s)`, keep.String(),
		))
		if err != nil {
			return 0, fmt.Errorf("vault_index_write: prune: %w", err)
		}
	}

	n, err := res.RowsAffected()
	if err != nil {
		return 0, fmt.Errorf("vault_index_write: prune rows affected: %w", err)
	}
	return int(n), nil
}
+84
View File
@@ -0,0 +1,84 @@
---
name: vault_index_write
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error)"
description: "Upserta un slice de VaultFile en vault_index.db (tabla files + FTS5 files_fts) dentro de una sola transaccion. Cuenta Inserted/Updated/FTS. Con prune=true elimina filas no presentes en el slice."
tags: [vault, sqlite, index, write, upsert, fts, infra]
uses_functions: []
uses_types: ["vault_file_go_infra"]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [database/sql, fmt, strings, time]
params:
- name: db
desc: "*sql.DB abierto sobre vault_index.db (tipicamente retornado por VaultIndexOpen)"
- name: files
desc: "slice de VaultFile a insertar/actualizar; puede ser vacio"
- name: prune
desc: "si true, elimina de 'files' todas las filas cuyo rel_path no este en el slice (sincronizacion destructiva)"
output: "WriteReport con conteos Inserted/Updated/Pruned/FTS; error si falla la transaccion"
tested: true
tests:
- "N archivos nuevos — Inserted=N"
- "re-escritura con mtime distinto — Updated=N"
- "prune elimina filas ausentes"
- "sin prune, filas previas persisten"
- "FTS5 MATCH funciona tras escritura"
test_file_path: "functions/infra/vault_index_write_test.go"
file_path: "functions/infra/vault_index_write.go"
---
## Ejemplo
```go
db, _ := VaultIndexOpen("/data/vaults/turismo")
defer db.Close()
files, _ := VaultInventoryScan("/data/vaults/turismo", "turismo_v1", "turismo")
report, err := VaultIndexWrite(db, files, true)
if err != nil {
log.Fatal(err)
}
fmt.Printf("inserted=%d updated=%d pruned=%d fts=%d\n",
report.Inserted, report.Updated, report.Pruned, report.FTS)
```
## Notas
### WriteReport
Struct local al paquete infra:
```go
type WriteReport struct {
Inserted int
Updated int
Pruned int
FTS int
}
```
### Estrategia de conteo Inserted vs Updated
Se carga el conjunto de rel_paths existentes en un map antes del loop. Un upsert
se clasifica como Inserted si el rel_path no estaba en el map, Updated si estaba.
Esto evita N+1 SELECTs y es correcto porque la transaccion serializa los cambios.
### FTS5
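`files_fts` usa `content=''`, es decir, es una tabla FTS5 sin contenido (contentless):
solo almacena el indice de texto completo, no los valores de las columnas. Para cada
archivo se ejecuta un DELETE de la fila FTS filtrando por `rel_path` y se reinserta con
`content_text=''`. Nota: en una tabla contentless las columnas se leen como NULL, por lo
que conviene verificar que ese DELETE realmente elimina la fila anterior. Los profilers
posteriores (csv_profiles, pdf_extracts, knowledge_docs) son responsables de actualizar
`content_text` con texto indexable real.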
### Prune
Con `prune=true` se construye un IN clause con los rel_paths del slice. La FK con
`ON DELETE CASCADE` propaga el DELETE a csv_profiles, pdf_extracts y knowledge_docs
automaticamente. Con slice vacio + prune=true se borra todo (DELETE FROM files).
### Escapado SQL
El IN clause se construye escapando las comillas simples de cada rel_path (duplicandolas),
lo que evita la inyeccion SQL en rutas con apostrofos. Para entornos con rutas controladas
(interior de vaults sin apostrofos) esto es suficiente; para entornos adversariales es
preferible usar binding de parametros (placeholders `?`) con VALUES multiples via
prepared statement.
+210
View File
@@ -0,0 +1,210 @@
package infra
import (
"testing"
"time"
)
// makeTestVaultFile builds a VaultFile fixture for tests. Only relPath, mime,
// bucket and subBucket come from the caller; the vault identity, the size, the
// sha256 and the extension are fixed values (Ext is always ".csv", whatever
// relPath is), and Mtime is the current unix time.
func makeTestVaultFile(relPath, mime, bucket, subBucket string) VaultFile {
	var fixture VaultFile
	fixture.VaultID = "test_vault"
	fixture.VaultName = "test"
	fixture.RelPath = relPath
	fixture.Size = 100
	fixture.Mtime = time.Now().Unix()
	fixture.Sha256 = "abc123def456abc123def456abc123def456abc123def456abc123def456abc1"
	fixture.Mime = mime
	fixture.Ext = ".csv"
	fixture.Bucket = bucket
	fixture.SubBucket = subBucket
	return fixture
}
// openInMemoryVaultIndex opens a vault index for a test and aborts the test if
// that fails. Despite its name, the index is opened via VaultIndexOpen on a
// directory obtained from t.TempDir(), not in memory. The caller is
// responsible for calling Close on the returned value.
func openInMemoryVaultIndex(t *testing.T) interface{ Close() error } {
	t.Helper()

	index, openErr := VaultIndexOpen(t.TempDir())
	if openErr != nil {
		t.Fatalf("VaultIndexOpen: %v", openErr)
	}
	return index
}
// TestVaultIndexWrite_FreshInsert writes three new files into an empty index
// without pruning and expects Inserted=3, Updated=0, Pruned=0 and FTS=3.
func TestVaultIndexWrite_FreshInsert(t *testing.T) {
	t.Run("N archivos nuevos — Inserted=N", func(t *testing.T) {
		index, openErr := VaultIndexOpen(t.TempDir())
		if openErr != nil {
			t.Fatal(openErr)
		}
		defer index.Close()

		batch := []VaultFile{
			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("knowledge/decisions/x.md", "text/markdown", "knowledge", "decisions"),
		}

		got, writeErr := VaultIndexWrite(index, batch, false)
		if writeErr != nil {
			t.Fatalf("VaultIndexWrite: %v", writeErr)
		}

		if got.Inserted != 3 {
			t.Errorf("Inserted = %d, want 3", got.Inserted)
		}
		if got.Updated != 0 {
			t.Errorf("Updated = %d, want 0", got.Updated)
		}
		if got.Pruned != 0 {
			t.Errorf("Pruned = %d, want 0", got.Pruned)
		}
		if got.FTS != 3 {
			t.Errorf("FTS = %d, want 3", got.FTS)
		}
	})
}
// TestVaultIndexWrite_Upsert writes two files, changes their Mtime and writes
// them again; the second write must report Inserted=0 and Updated=2.
func TestVaultIndexWrite_Upsert(t *testing.T) {
	t.Run("re-escritura con mtime distinto — Updated=N", func(t *testing.T) {
		index, openErr := VaultIndexOpen(t.TempDir())
		if openErr != nil {
			t.Fatal(openErr)
		}
		defer index.Close()

		batch := []VaultFile{
			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
		}
		if _, firstErr := VaultIndexWrite(index, batch, false); firstErr != nil {
			t.Fatalf("first write: %v", firstErr)
		}

		// Move the modification times forward to simulate changed files.
		batch[0].Mtime = time.Now().Unix() + 100
		batch[1].Mtime = time.Now().Unix() + 200

		got, secondErr := VaultIndexWrite(index, batch, false)
		if secondErr != nil {
			t.Fatalf("second write: %v", secondErr)
		}
		if got.Inserted != 0 {
			t.Errorf("Inserted = %d, want 0", got.Inserted)
		}
		if got.Updated != 2 {
			t.Errorf("Updated = %d, want 2", got.Updated)
		}
	})
}
// TestVaultIndexWrite_Prune writes files A and B, then writes only A with
// prune=true and expects Pruned=1 and no remaining row for B.
func TestVaultIndexWrite_Prune(t *testing.T) {
	t.Run("prune elimina filas ausentes", func(t *testing.T) {
		index, openErr := VaultIndexOpen(t.TempDir())
		if openErr != nil {
			t.Fatal(openErr)
		}
		defer index.Close()

		// Seed the index with A and B.
		both := []VaultFile{
			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
		}
		if _, seedErr := VaultIndexWrite(index, both, false); seedErr != nil {
			t.Fatalf("first write: %v", seedErr)
		}

		// Re-write A alone with pruning enabled; B must be removed.
		got, pruneErr := VaultIndexWrite(index, []VaultFile{both[0]}, true)
		if pruneErr != nil {
			t.Fatalf("prune write: %v", pruneErr)
		}
		if got.Pruned != 1 {
			t.Errorf("Pruned = %d, want 1", got.Pruned)
		}

		// B must no longer be in the files table.
		var remaining int
		row := index.QueryRow(`SELECT count(*) FROM files WHERE rel_path = 'data/raw/b.csv'`)
		if scanErr := row.Scan(&remaining); scanErr != nil {
			t.Fatalf("query: %v", scanErr)
		}
		if remaining != 0 {
			t.Errorf("b.csv still present after prune")
		}
	})
}
// TestVaultIndexWrite_NoPrune writes files A and B, then writes only A with
// prune=false and expects Pruned=0 and both rows still present in files.
func TestVaultIndexWrite_NoPrune(t *testing.T) {
	t.Run("sin prune, filas previas persisten", func(t *testing.T) {
		index, openErr := VaultIndexOpen(t.TempDir())
		if openErr != nil {
			t.Fatal(openErr)
		}
		defer index.Close()

		both := []VaultFile{
			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
		}
		if _, seedErr := VaultIndexWrite(index, both, false); seedErr != nil {
			t.Fatalf("first write: %v", seedErr)
		}

		// Re-write A alone with pruning disabled; B must survive.
		got, writeErr := VaultIndexWrite(index, []VaultFile{both[0]}, false)
		if writeErr != nil {
			t.Fatalf("second write: %v", writeErr)
		}
		if got.Pruned != 0 {
			t.Errorf("Pruned = %d, want 0", got.Pruned)
		}

		var total int
		if scanErr := index.QueryRow(`SELECT count(*) FROM files`).Scan(&total); scanErr != nil {
			t.Fatalf("query: %v", scanErr)
		}
		if total != 2 {
			t.Errorf("files count = %d, want 2", total)
		}
	})
}
// TestVaultIndexWrite_FTSMatch writes two files and expects the prefix query
// rel_path:foo* on files_fts to match exactly one of them.
func TestVaultIndexWrite_FTSMatch(t *testing.T) {
	t.Run("FTS5 MATCH funciona tras escritura", func(t *testing.T) {
		index, openErr := VaultIndexOpen(t.TempDir())
		if openErr != nil {
			t.Fatal(openErr)
		}
		defer index.Close()

		batch := []VaultFile{
			makeTestVaultFile("data/raw/foo_report.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/bar_data.csv", "text/csv", "data", "raw"),
		}
		if _, writeErr := VaultIndexWrite(index, batch, false); writeErr != nil {
			t.Fatalf("write: %v", writeErr)
		}

		// Prefix search restricted to the rel_path column.
		var hits int
		row := index.QueryRow(
			`SELECT count(*) FROM files_fts WHERE files_fts MATCH 'rel_path:foo*'`,
		)
		if scanErr := row.Scan(&hits); scanErr != nil {
			t.Fatalf("FTS MATCH query: %v", scanErr)
		}
		if hits != 1 {
			t.Errorf("FTS MATCH rel_path:foo* = %d rows, want 1", hits)
		}
	})
}
+174
View File
@@ -0,0 +1,174 @@
package infra
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"sort"
"strings"
)
// VaultInventoryScan walks vaultPath and returns one VaultFile per file found,
// sorted by RelPath. vaultID and vaultName are copied into every record.
//
// Entries skipped inside the vault:
//   - vault_index.db, vault_index.db-shm, vault_index.db-wal
//   - .git directories at any depth
//   - hidden files and directories (names starting with ".") located directly
//     under the vault root
//
// These rules apply only to entries inside the vault. The root directory
// itself is always walked, even when vaultPath is "." or its base name starts
// with a dot.
//
// For each file it records: relative path (forward slashes), size, mtime (unix
// seconds), sha256 (streamed, hex lowercase), MIME type, lowercase extension,
// bucket and sub-bucket.
//
// MIME detection priority:
//  1. Extension override: .csv → text/csv, .md → text/markdown,
//     .parquet → application/parquet
//  2. http.DetectContentType on the leading bytes of the file
//
// NOTE: file_validate_type_go_infra (FileValidateType) is not used here because
// it requires an allowedTypes allowlist and returns (mime, bool); it targets
// upload validation, whereas an inventory scan accepts any MIME type.
//
// The result is an empty, non-nil slice when no file is found. Any error from
// the walk, or from hashing or reading a file, aborts the scan and is returned
// wrapped.
func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error) {
	// Non-nil from the start so an empty vault yields an empty slice, not nil.
	files := []VaultFile{}

	err := filepath.WalkDir(vaultPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}

		rel, err := filepath.Rel(vaultPath, path)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: rel path for %q: %w", path, err)
		}
		rel = filepath.ToSlash(rel)
		name := d.Name()

		// The root is reported as "." and must never be filtered: its own
		// name may start with a dot, and skipping it would skip the whole vault.
		if rel != "." {
			// Skip .git directories at any depth.
			if d.IsDir() && name == ".git" {
				return filepath.SkipDir
			}
			// Skip hidden entries directly under the root; at that level the
			// relative path contains no separator.
			if strings.HasPrefix(name, ".") && !strings.Contains(rel, "/") {
				if d.IsDir() {
					return filepath.SkipDir
				}
				return nil
			}
		}

		if d.IsDir() {
			return nil
		}

		// Skip vault_index.db and its WAL/SHM sidecar files.
		if name == "vault_index.db" || name == "vault_index.db-shm" || name == "vault_index.db-wal" {
			return nil
		}

		info, err := d.Info()
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: stat %q: %w", path, err)
		}

		// Hash by streaming so large files are never loaded into memory.
		sha, err := fileSha256(path)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: sha256 %q: %w", path, err)
		}

		mime, err := detectVaultFileMime(path, name)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: mime %q: %w", path, err)
		}

		bucket, subBucket := vaultBucketParts(rel)
		files = append(files, VaultFile{
			VaultID:   vaultID,
			VaultName: vaultName,
			RelPath:   rel,
			Size:      info.Size(),
			Mtime:     info.ModTime().UTC().Unix(),
			Sha256:    sha,
			Mime:      mime,
			Ext:       strings.ToLower(filepath.Ext(name)),
			Bucket:    bucket,
			SubBucket: subBucket,
		})
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("vault_inventory_scan: walk %q: %w", vaultPath, err)
	}

	sort.Slice(files, func(i, j int) bool {
		return files[i].RelPath < files[j].RelPath
	})
	return files, nil
}
// fileSha256 returns the SHA-256 digest of the file at path as a lowercase hex
// string. The content is streamed into the hash rather than read into memory.
// An error from opening or reading the file is returned unchanged.
func fileSha256(path string) (string, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	digest := sha256.New()
	_, copyErr := io.Copy(digest, src)
	if copyErr != nil {
		return "", copyErr
	}

	sum := digest.Sum(nil)
	return hex.EncodeToString(sum), nil
}
// detectVaultFileMime returns the MIME type for the vault file at path whose
// base name is name.
//
// The extension of name (compared case-insensitively) decides first:
// .csv → text/csv, .md → text/markdown, .parquet → application/parquet; in
// those cases the file is not opened. Otherwise up to the first 512 bytes of
// the file are passed to http.DetectContentType.
//
// An error is returned only when the file cannot be opened or read.
func detectVaultFileMime(path, name string) (string, error) {
	switch strings.ToLower(filepath.Ext(name)) {
	case ".csv":
		return "text/csv", nil
	case ".md":
		return "text/markdown", nil
	case ".parquet":
		return "application/parquet", nil
	}

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	// io.ReadFull keeps reading until the buffer is full or the file ends,
	// whereas a single Read call is allowed to return fewer bytes than are
	// available. EOF (empty file) and ErrUnexpectedEOF (file shorter than the
	// buffer) are expected outcomes, not failures.
	head := make([]byte, 512)
	n, err := io.ReadFull(f, head)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}
	return http.DetectContentType(head[:n]), nil
}
// vaultBucketParts derives the bucket and sub-bucket of a file from its
// forward-slash relative path.
//
// The last path segment is the file name and is never reported as a bucket or
// sub-bucket:
//   - "data/raw/a.csv" (or any deeper path) → ("data", "raw")
//   - "data/a.csv" (file at the bucket root) → ("data", "")
//   - "a.csv" or "" (file at the vault root) → ("", "")
//
// The bucket name is returned as found; it is not checked against the known
// values "data" and "knowledge".
func vaultBucketParts(relPath string) (bucket, subBucket string) {
	// Three parts are enough: first directory, second directory, remainder.
	parts := strings.SplitN(relPath, "/", 3)
	switch len(parts) {
	case 3:
		return parts[0], parts[1]
	case 2:
		return parts[0], ""
	default:
		return "", ""
	}
}
+74
View File
@@ -0,0 +1,74 @@
---
name: vault_inventory_scan
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error)"
description: "Recorre vaultPath con filepath.WalkDir y retorna un slice de VaultFile ordenado por RelPath para cada archivo regular, computando sha256 por streaming, MIME por extension/magic y bucket/sub-bucket por posicion en el arbol."
tags: [vault, inventory, scan, filesystem, sha256, mime, infra]
uses_functions: []
uses_types: ["vault_file_go_infra"]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [crypto/sha256, encoding/hex, fmt, io, net/http, os, path/filepath, sort, strings]
params:
- name: vaultPath
desc: "ruta absoluta o relativa al directorio raiz del vault"
- name: vaultID
desc: "identificador del vault (ej: turismo_spain_app_turismo) — se copia a cada VaultFile"
- name: vaultName
desc: "nombre legible del vault (ej: turismo_spain) — se copia a cada VaultFile"
output: "slice de VaultFile ordenado lexicograficamente por RelPath; slice vacio (no nil) si el vault esta vacio"
tested: true
tests:
- "tmpdir vacio retorna slice vacio"
- "data layout — bucket y sub_bucket correctos"
- "knowledge layout — bucket y sub_bucket correctos"
- "omite vault_index.db y .git"
- "sha256 determinista para mismo contenido"
- "orden lexicografico del resultado"
test_file_path: "functions/infra/vault_inventory_scan_test.go"
file_path: "functions/infra/vault_inventory_scan.go"
---
## Ejemplo
```go
files, err := VaultInventoryScan("/data/vaults/turismo_spain", "turismo_spain_v1", "turismo_spain")
if err != nil {
log.Fatal(err)
}
for _, f := range files {
fmt.Printf("%s %s %s/%s\n", f.RelPath, f.Mime, f.Bucket, f.SubBucket)
}
```
## Notas
### Archivos omitidos
- `vault_index.db`, `vault_index.db-shm`, `vault_index.db-wal` (siempre)
- `.git/` en cualquier profundidad (SkipDir)
- Entradas cuyo nombre empieza por `.` solo en la raiz del vault (nivel 0)
### Deteccion de MIME
`file_validate_type_go_infra` (FileValidateType) no se usa porque su firma
requiere una lista blanca de tipos permitidos y retorna (mime, bool) — esta
disenada para validacion de uploads, no para escaneo inventarial donde
cualquier MIME es valido. Se usan en su lugar:
1. Override por extension (prioridad alta): `.csv` → `text/csv`, `.md` → `text/markdown`,
`.parquet` → `application/parquet`. Necesario porque `http.DetectContentType`
clasifica CSV como `text/plain` y no conoce Parquet.
2. `http.DetectContentType` sobre primeros 512 bytes (magic bytes, stdlib) para el resto.
### SHA-256
Calculado por streaming con `io.Copy` a `sha256.New()` — no carga el archivo completo
a memoria. Valido para archivos de cualquier tamano.
### Bucket / SubBucket
Derivados de la posicion en el arbol:
- `bucket` = primer segmento del RelPath (tipicamente "data" o "knowledge")
- `subBucket` = segundo segmento si existe; vacio si el archivo esta en la raiz del bucket
@@ -0,0 +1,182 @@
package infra
import (
"os"
"path/filepath"
"testing"
)
// writeTestFile creates the file dir/rel with the given content, creating any
// missing parent directories first. rel uses forward slashes. Any failure
// aborts the calling test.
func writeTestFile(t *testing.T, dir, rel, content string) {
	t.Helper()

	target := filepath.Join(dir, filepath.FromSlash(rel))
	parent := filepath.Dir(target)

	if mkErr := os.MkdirAll(parent, 0o755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", parent, mkErr)
	}
	if wrErr := os.WriteFile(target, []byte(content), 0o644); wrErr != nil {
		t.Fatalf("write %s: %v", target, wrErr)
	}
}
// TestVaultInventoryScan_Empty scans an empty temporary directory and expects
// no error and no files.
func TestVaultInventoryScan_Empty(t *testing.T) {
	t.Run("tmpdir vacio retorna slice vacio", func(t *testing.T) {
		found, scanErr := VaultInventoryScan(t.TempDir(), "v1", "test")
		if scanErr != nil {
			t.Fatalf("unexpected error: %v", scanErr)
		}
		if n := len(found); n != 0 {
			t.Errorf("expected 0 files, got %d", n)
		}
	})
}
// TestVaultInventoryScan_DataLayout scans a vault with one CSV under data/raw
// and one Parquet file under data/processed, then checks path, bucket,
// sub-bucket, MIME type, extension and vault ID of the returned records.
func TestVaultInventoryScan_DataLayout(t *testing.T) {
	t.Run("data layout — bucket y sub_bucket correctos", func(t *testing.T) {
		vault := t.TempDir()
		writeTestFile(t, vault, "data/raw/a.csv", "col1,col2\n1,2\n")
		writeTestFile(t, vault, "data/processed/b.parquet", "PAR1fakedata")

		found, scanErr := VaultInventoryScan(vault, "vid", "vname")
		if scanErr != nil {
			t.Fatalf("unexpected error: %v", scanErr)
		}
		if n := len(found); n != 2 {
			t.Fatalf("expected 2 files, got %d", n)
		}

		// Sorted order puts data/processed/b.parquet before data/raw/a.csv.
		parquet, csv := found[0], found[1]

		if parquet.RelPath != "data/processed/b.parquet" {
			t.Errorf("files[0].RelPath = %q, want data/processed/b.parquet", parquet.RelPath)
		}
		if parquet.Bucket != "data" {
			t.Errorf("files[0].Bucket = %q, want data", parquet.Bucket)
		}
		if parquet.SubBucket != "processed" {
			t.Errorf("files[0].SubBucket = %q, want processed", parquet.SubBucket)
		}
		if parquet.Mime != "application/parquet" {
			t.Errorf("files[0].Mime = %q, want application/parquet", parquet.Mime)
		}
		if parquet.Ext != ".parquet" {
			t.Errorf("files[0].Ext = %q, want .parquet", parquet.Ext)
		}
		if parquet.VaultID != "vid" {
			t.Errorf("VaultID = %q, want vid", parquet.VaultID)
		}

		if csv.RelPath != "data/raw/a.csv" {
			t.Errorf("files[1].RelPath = %q, want data/raw/a.csv", csv.RelPath)
		}
		if csv.Mime != "text/csv" {
			t.Errorf("files[1].Mime = %q, want text/csv", csv.Mime)
		}
		if !(csv.Bucket == "data" && csv.SubBucket == "raw") {
			t.Errorf("files[1]: bucket=%q subBucket=%q, want data/raw", csv.Bucket, csv.SubBucket)
		}
	})
}
// TestVaultInventoryScan_KnowledgeLayout scans a vault holding a single
// Markdown file under knowledge/decisions and checks its path, bucket,
// sub-bucket and MIME type.
func TestVaultInventoryScan_KnowledgeLayout(t *testing.T) {
	t.Run("knowledge layout — bucket y sub_bucket correctos", func(t *testing.T) {
		vault := t.TempDir()
		writeTestFile(t, vault, "knowledge/decisions/x.md", "# Decision\n\ncontent")

		found, scanErr := VaultInventoryScan(vault, "vid", "vname")
		if scanErr != nil {
			t.Fatalf("unexpected error: %v", scanErr)
		}
		if n := len(found); n != 1 {
			t.Fatalf("expected 1 file, got %d", n)
		}

		doc := found[0]
		if doc.RelPath != "knowledge/decisions/x.md" {
			t.Errorf("RelPath = %q", doc.RelPath)
		}
		if doc.Bucket != "knowledge" {
			t.Errorf("Bucket = %q, want knowledge", doc.Bucket)
		}
		if doc.SubBucket != "decisions" {
			t.Errorf("SubBucket = %q, want decisions", doc.SubBucket)
		}
		if doc.Mime != "text/markdown" {
			t.Errorf("Mime = %q, want text/markdown", doc.Mime)
		}
	})
}
// TestVaultInventoryScan_SkipsIndexAndGit checks that the index database, its
// WAL sidecar and the .git directory are left out of the scan result, so only
// the real data file is reported.
func TestVaultInventoryScan_SkipsIndexAndGit(t *testing.T) {
	t.Run("omite vault_index.db y .git", func(t *testing.T) {
		vault := t.TempDir()
		fixtures := []struct{ rel, body string }{
			{"vault_index.db", "sqlite data"},
			{"vault_index.db-wal", "wal data"},
			{".git/HEAD", "ref: refs/heads/master"},
			{"data/raw/real.csv", "a,b\n1,2\n"},
		}
		for _, fx := range fixtures {
			writeTestFile(t, vault, fx.rel, fx.body)
		}

		scanned, err := VaultInventoryScan(vault, "vid", "vname")
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if n := len(scanned); n != 1 {
			t.Fatalf("expected 1 file (real.csv), got %d: %v", n, relPaths(scanned))
		}
		if got := scanned[0].RelPath; got != "data/raw/real.csv" {
			t.Errorf("unexpected file: %q", got)
		}
	})
}
// TestVaultInventoryScan_Sha256Deterministic checks that two vaults holding a
// file with identical content report the same Sha256 value, and that the value
// is 64 characters long.
func TestVaultInventoryScan_Sha256Deterministic(t *testing.T) {
	t.Run("sha256 determinista para mismo contenido", func(t *testing.T) {
		dir1 := t.TempDir()
		dir2 := t.TempDir()
		content := "deterministic content 123\n"
		writeTestFile(t, dir1, "data/raw/f.csv", content)
		writeTestFile(t, dir2, "data/raw/f.csv", content)

		files1, err := VaultInventoryScan(dir1, "v1", "vault1")
		if err != nil {
			t.Fatal(err)
		}
		files2, err := VaultInventoryScan(dir2, "v2", "vault2")
		if err != nil {
			t.Fatal(err)
		}

		// Guard the [0] accesses below: an empty scan must fail the test with a
		// readable message instead of panicking with an index-out-of-range.
		if len(files1) != 1 || len(files2) != 1 {
			t.Fatalf("expected exactly 1 file per vault, got %d and %d", len(files1), len(files2))
		}

		if files1[0].Sha256 != files2[0].Sha256 {
			t.Errorf("sha256 mismatch: %q vs %q", files1[0].Sha256, files2[0].Sha256)
		}
		if len(files1[0].Sha256) != 64 {
			t.Errorf("sha256 length = %d, want 64", len(files1[0].Sha256))
		}
	})
}
// TestVaultInventoryScan_Sorted checks that the scan result is ordered by
// RelPath in ascending lexicographic order.
func TestVaultInventoryScan_Sorted(t *testing.T) {
	t.Run("orden lexicografico del resultado", func(t *testing.T) {
		dir := t.TempDir()
		writeTestFile(t, dir, "knowledge/decisions/z.md", "z")
		writeTestFile(t, dir, "data/raw/a.csv", "a")
		writeTestFile(t, dir, "data/processed/m.parquet", "m")
		writeTestFile(t, dir, "knowledge/domains/b.md", "b")

		files, err := VaultInventoryScan(dir, "v", "v")
		if err != nil {
			t.Fatal(err)
		}

		// Without this check the ordering loop below is vacuously satisfied
		// when the scan returns zero or one file.
		if len(files) != 4 {
			t.Fatalf("expected 4 files, got %d: %v", len(files), relPaths(files))
		}

		for i := 1; i < len(files); i++ {
			if files[i].RelPath < files[i-1].RelPath {
				t.Errorf("not sorted at index %d: %q < %q", i, files[i].RelPath, files[i-1].RelPath)
			}
		}
	})
}
// relPaths projects a slice of VaultFile onto its RelPath values, preserving
// order. It exists so test failure messages can list which files were found.
func relPaths(files []VaultFile) []string {
	paths := make([]string, 0, len(files))
	for i := range files {
		paths = append(paths, files[i].RelPath)
	}
	return paths
}
+252
View File
@@ -0,0 +1,252 @@
package infra
import (
"fmt"
"os"
"path/filepath"
"strings"
)
// LayoutReport describes what VaultLayoutEnsure did (or would do) to a vault directory.
//
// When DryRun is true the slices describe the planned actions; nothing was
// changed on disk. Apart from VaultPath, every path in the report is relative
// to the vault root.
type LayoutReport struct {
	// VaultPath is the vault root after filepath.Abs and filepath.EvalSymlinks.
	VaultPath string `json:"vault_path"`
	Created []string `json:"created"` // dirs created (relative paths)
	Migrated []string `json:"migrated"` // renames executed, format "src -> dst" (relative)
	AlreadyOK []string `json:"already_ok"` // dirs that already existed at the target location
	Skipped []string `json:"skipped"` // unrecognized root-level entries, left untouched
	// DryRun echoes the dryRun argument passed to VaultLayoutEnsure.
	DryRun bool `json:"dry_run"`
}
// Root-level names that VaultLayoutEnsure knows how to relocate.
var (
	// dataBuckets lists the legacy root-level directories that belong under data/.
	dataBuckets = []string{"raw", "processed", "exports"}

	// knowledgeBuckets lists the legacy root-level directories that belong under knowledge/.
	knowledgeBuckets = []string{"decisions", "domains", "models", "benchmarks", "test_documents"}

	// knownRootFiles lists the root-level files that are moved into knowledge/.
	knownRootFiles = []string{"README.md", "README.txt"}
)
// VaultLayoutEnsure ensures a vault directory uses the canonical hybrid layout:
//
//	data/{raw,processed,exports}
//	knowledge/{decisions,domains,models,benchmarks,test_documents}
//
// Legacy vaults that have these directories at the root are migrated by renaming
// (or merging when both src and dst already exist). Root-level README.md and
// README.txt are moved to knowledge/README.md. The operation is idempotent:
// a second run returns everything in AlreadyOK.
//
// vaultPath may be relative or a symlink; it is resolved with filepath.Abs and
// filepath.EvalSymlinks, and trailing separators are ignored. When dryRun is
// true the function computes the report but does not touch the disk.
//
// An error is returned when the path cannot be resolved, does not exist, is not
// a directory, a filesystem operation fails, or a migration would overwrite an
// existing entry. The report returned alongside an error reflects the steps
// recorded up to the point of failure.
func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error) {
	report := LayoutReport{DryRun: dryRun}

	// --- resolve path ---
	// Drop trailing separators, but never trim the path down to nothing or to a
	// bare volume name: "/" would otherwise become "", which filepath.Abs
	// resolves to the current working directory, and the wrong tree would be
	// migrated.
	if trimmed := strings.TrimRight(vaultPath, "/\\"); trimmed != "" && trimmed != filepath.VolumeName(vaultPath) {
		vaultPath = trimmed
	}
	// Keep the input in vaultPath until Abs succeeds so the error message can
	// still show it (Abs returns "" on failure).
	absPath, err := filepath.Abs(vaultPath)
	if err != nil {
		return report, fmt.Errorf("vault_layout_ensure: abs(%q): %w", vaultPath, err)
	}

	// Follow symlinks for the vault root itself.
	resolved, err := filepath.EvalSymlinks(absPath)
	if err != nil {
		return report, fmt.Errorf("vault_layout_ensure: eval symlinks %q: %w", absPath, err)
	}
	vaultPath = resolved
	report.VaultPath = vaultPath

	// --- check that vault exists and is a directory ---
	info, err := os.Stat(vaultPath)
	if err != nil {
		return report, fmt.Errorf("vault_layout_ensure: stat %q: %w", vaultPath, err)
	}
	if !info.IsDir() {
		return report, fmt.Errorf("vault_layout_ensure: %q is not a directory", vaultPath)
	}

	// --- ensure top-level containers ---
	for _, container := range []string{"data", "knowledge"} {
		dst := filepath.Join(vaultPath, container)
		if err := ensureDir(dst, dryRun, container, &report); err != nil {
			return report, err
		}
	}

	// --- build migration table: root name -> relative destination ---
	type migration struct {
		rootName string // name in vault root (dir or file)
		dstRel   string // relative destination path inside vault
		isFile   bool
	}
	var migrations []migration
	for _, b := range dataBuckets {
		migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("data", b)})
	}
	for _, b := range knowledgeBuckets {
		migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("knowledge", b)})
	}
	for _, rf := range knownRootFiles {
		migrations = append(migrations, migration{rootName: rf, dstRel: filepath.Join("knowledge", "README.md"), isFile: true})
	}

	// Track which root names are "known" so we can compute Skipped.
	knownNames := make(map[string]struct{}, len(migrations)+2)
	for _, m := range migrations {
		knownNames[strings.ToLower(m.rootName)] = struct{}{}
	}
	knownNames["data"] = struct{}{}
	knownNames["knowledge"] = struct{}{}

	// claimed holds destinations already filled or reported by an earlier entry
	// of this run. Several sources can share one destination (README.md and
	// README.txt), so this keeps the dry-run plan identical to a real run and
	// stops a destination that was just migrated from also showing up in
	// AlreadyOK.
	claimed := make(map[string]struct{})

	// --- apply migrations ---
	for _, m := range migrations {
		src := filepath.Join(vaultPath, m.rootName)
		dst := filepath.Join(vaultPath, m.dstRel)
		srcRel := m.rootName
		dstRel := m.dstRel

		_, dstClaimed := claimed[dstRel]
		srcExists := pathExists(src)
		dstExists := dstClaimed || pathExists(dst)

		switch {
		case srcExists && dstExists:
			// Both exist: merge if directory, error on file collision.
			if m.isFile {
				return report, fmt.Errorf("vault_layout_ensure: conflict: both %q and %q exist", srcRel, dstRel)
			}
			if err := mergeDirs(src, dst, srcRel, dstRel, dryRun, &report); err != nil {
				return report, err
			}
		case srcExists:
			// Only source exists: rename.
			report.Migrated = append(report.Migrated, fmt.Sprintf("%s -> %s", srcRel, dstRel))
			if !dryRun {
				if err := os.Rename(src, dst); err != nil {
					return report, fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", src, dst, err)
				}
			}
			claimed[dstRel] = struct{}{}
		case dstExists:
			// Already migrated. Report each destination once, and not at all
			// when this very run put it there.
			if !dstClaimed {
				report.AlreadyOK = append(report.AlreadyOK, dstRel)
				claimed[dstRel] = struct{}{}
			}
		default:
			// Neither exists: create empty destination directory (skip for files).
			if !m.isFile {
				report.Created = append(report.Created, dstRel)
				if !dryRun {
					if err := os.MkdirAll(dst, 0o755); err != nil {
						return report, fmt.Errorf("vault_layout_ensure: mkdir %q: %w", dst, err)
					}
				}
			}
		}
	}

	// --- collect skipped (unrecognized root entries) ---
	entries, err := os.ReadDir(vaultPath)
	if err != nil {
		return report, fmt.Errorf("vault_layout_ensure: readdir %q: %w", vaultPath, err)
	}
	for _, e := range entries {
		if _, known := knownNames[strings.ToLower(e.Name())]; !known {
			report.Skipped = append(report.Skipped, e.Name())
		}
	}

	return report, nil
}
// ensureDir records rel in report.AlreadyOK when path already exists; otherwise
// it records rel in report.Created and, unless dryRun is set, creates the
// directory (with parents) using mode 0o755. VaultLayoutEnsure uses it for the
// top-level containers "data" and "knowledge".
func ensureDir(path string, dryRun bool, rel string, report *LayoutReport) error {
	if !pathExists(path) {
		report.Created = append(report.Created, rel)
		if !dryRun {
			if err := os.MkdirAll(path, 0o755); err != nil {
				return fmt.Errorf("vault_layout_ensure: mkdir %q: %w", path, err)
			}
		}
		return nil
	}

	report.AlreadyOK = append(report.AlreadyOK, rel)
	return nil
}
// mergeDirs moves the direct children of src into dst, then removes src if it
// ended up empty. Each move is recorded in report.Migrated as "src -> dst"
// using the relative names srcRel and dstRel.
//
// There is no overwrite policy: if any child of src already exists in dst the
// merge is refused. All children are checked before the first rename, so a
// conflict leaves both directories and the report untouched instead of
// producing a half-merged tree.
//
// With dryRun set, the moves are only recorded; nothing on disk changes.
func mergeDirs(src, dst, srcRel, dstRel string, dryRun bool, report *LayoutReport) error {
	children, err := os.ReadDir(src)
	if err != nil {
		return fmt.Errorf("vault_layout_ensure: readdir %q: %w", src, err)
	}

	// Pass 1: validate. Nothing has been moved yet, so bailing out here is safe.
	for _, child := range children {
		if pathExists(filepath.Join(dst, child.Name())) {
			return fmt.Errorf("vault_layout_ensure: merge conflict: %q already exists in %q (cannot overwrite %q)",
				child.Name(), dstRel, filepath.Join(srcRel, child.Name()))
		}
	}

	// Pass 2: record and perform the moves.
	for _, child := range children {
		name := child.Name()
		report.Migrated = append(report.Migrated,
			fmt.Sprintf("%s -> %s", filepath.Join(srcRel, name), filepath.Join(dstRel, name)))
		if dryRun {
			continue
		}
		childSrc := filepath.Join(src, name)
		childDst := filepath.Join(dst, name)
		if err := os.Rename(childSrc, childDst); err != nil {
			return fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", childSrc, childDst, err)
		}
	}

	if dryRun {
		return nil
	}

	// Re-check emptiness after the renames: entries may have appeared in src in
	// the meantime, in which case the directory is left in place.
	remaining, err := os.ReadDir(src)
	if err != nil {
		return fmt.Errorf("vault_layout_ensure: readdir %q: %w", src, err)
	}
	if len(remaining) == 0 {
		if err := os.Remove(src); err != nil {
			return fmt.Errorf("vault_layout_ensure: remove empty src %q: %w", src, err)
		}
	}
	return nil
}
// pathExists reports whether something exists at path. It uses os.Lstat, so a
// symlink counts as existing even when its target does not. Any Lstat error,
// not only "not found", is reported as false.
func pathExists(path string) bool {
	if _, err := os.Lstat(path); err != nil {
		return false
	}
	return true
}
// dirIsEmpty reports whether path is a readable directory with no entries.
// A path that cannot be read as a directory (missing, a file, no permission)
// yields false.
func dirIsEmpty(path string) bool {
	if entries, err := os.ReadDir(path); err == nil {
		return len(entries) == 0
	}
	return false
}

// Reference dirIsEmpty from non-test code so unused-code linters do not flag it
// when its only callers live in tests.
var _ = dirIsEmpty
// vaultLayoutKnownNames returns the set of root-level names managed by
// VaultLayoutEnsure: every data and knowledge bucket, the known root files
// (lower-cased), and the two containers "data" and "knowledge".
// It is unexported and intended for in-package tests.
func vaultLayoutKnownNames() map[string]struct{} {
	names := map[string]struct{}{
		"data":      {},
		"knowledge": {},
	}
	for _, group := range [][]string{dataBuckets, knowledgeBuckets} {
		for _, bucket := range group {
			names[bucket] = struct{}{}
		}
	}
	for _, file := range knownRootFiles {
		names[strings.ToLower(file)] = struct{}{}
	}
	return names
}
+95
View File
@@ -0,0 +1,95 @@
---
name: vault_layout_ensure
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error)"
description: "Normaliza el layout de un vault al esquema híbrido canónico data/{raw,processed,exports} + knowledge/{decisions,domains,models,benchmarks,test_documents}. Migra directorios legacy en la raíz del vault a su ubicación correcta; idempotente."
tags: [vault, layout, migration, infra, filesystem, idempotent]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- "fmt"
- "os"
- "path/filepath"
- "strings"
params:
- name: vault_path
desc: "Ruta al directorio raíz del vault. Puede ser absoluta, relativa o un symlink — se resuelve con filepath.Abs + filepath.EvalSymlinks. Trailing slashes se ignoran."
- name: dry_run
desc: "Si true, calcula el reporte completo (qué se crearía, migraría, etc.) pero no modifica el disco. Útil para previsualizar antes de ejecutar."
output: "LayoutReport con: VaultPath (ruta resuelta), Created (dirs creados), Migrated (renombres ejecutados, formato 'src -> dst'), AlreadyOK (destinos que ya existían), Skipped (entradas en raíz no reconocidas, no tocadas), DryRun (flag). Error si el path no existe, no es directorio, o hay conflicto de merge (mismo nombre de archivo en src y dst)."
tested: true
tests:
- "TestVaultLayoutEnsure_DryRun_NoChange"
- "TestVaultLayoutEnsure_FreshDir_CreatesLayout"
- "TestVaultLayoutEnsure_LegacyDataLayout_Migrates"
- "TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates"
- "TestVaultLayoutEnsure_AlreadyMigrated_Idempotent"
- "TestVaultLayoutEnsure_Mixed_PartialMigration"
- "TestVaultLayoutEnsure_MergeConflict_Errors"
- "TestVaultLayoutEnsure_UnknownFiles_Skipped"
- "TestVaultLayoutEnsure_NotADir_Errors"
test_file_path: "functions/infra/vault_layout_ensure_test.go"
file_path: "functions/infra/vault_layout_ensure.go"
---
## Ejemplo
```go
// Previsualizar sin tocar disco:
report, err := VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", true)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Would migrate: %v\n", report.Migrated)
fmt.Printf("Would create: %v\n", report.Created)
// Ejecutar la migración:
report, err = VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", false)
if err != nil {
log.Fatalf("migration failed: %v", err)
}
fmt.Printf("Migrated: %v\n", report.Migrated)
fmt.Printf("Created: %v\n", report.Created)
fmt.Printf("Skipped: %v\n", report.Skipped)
```
## Comportamiento detallado
**Directorios gestionados:**
| Raíz (legacy) | Destino canónico |
|---|---|
| `raw/` | `data/raw/` |
| `processed/` | `data/processed/` |
| `exports/` | `data/exports/` |
| `decisions/` | `knowledge/decisions/` |
| `domains/` | `knowledge/domains/` |
| `models/` | `knowledge/models/` |
| `benchmarks/` | `knowledge/benchmarks/` |
| `test_documents/` | `knowledge/test_documents/` |
| `README.md` / `README.txt` | `knowledge/README.md` |
**Lógica de migración (por cada entrada conocida):**
- Solo `src` existe → rename atómico `src` → `dst`, registrado en `Migrated`.
- Solo `dst` existe → ya migrado, registrado en `AlreadyOK`.
- Ambos existen (dir) → merge: mueve cada hijo de `src/` a `dst/`; error si mismo nombre. Registrado en `Migrated` por hijo.
- Ambos existen (archivo README) → error inmediato con paths concretos.
- Ninguno existe → crea `dst` vacío, registrado en `Created`.
**Archivos/dirs no reconocidos** en la raíz (`.git`, `vault_index.db`, archivos custom) se registran en `Skipped` y no se tocan.
**Idempotencia:** segunda ejecución sobre un vault ya migrado reporta todo en `AlreadyOK` y no toca disco.
## Notas
`LayoutReport` es un tipo local de esta función (no un tipo del registry). El struct exportado vive en `functions/infra/vault_layout_ensure.go` junto con la función.
Para aplicar la migración a múltiples vaults en batch, invocar desde un pipeline que lea los paths de `vault.yaml` (ver `vault_manifest_read_go_infra`) y llame a `VaultLayoutEnsure` en cada uno.
+394
View File
@@ -0,0 +1,394 @@
package infra
import (
"os"
"path/filepath"
"testing"
)
// mkVaultDir creates a temporary directory tree for tests and returns its root.
//
// entries is a list of slash-separated paths relative to the root. Paths ending
// in "/" are created as directories; all others are created as files containing
// "test\n", with parent directories created as needed. An empty entry is
// rejected with a fatal test error. The tree lives under t.TempDir().
func mkVaultDir(t *testing.T, entries []string) string {
	t.Helper()
	root := t.TempDir()
	for _, e := range entries {
		// e[len(e)-1] below would panic on an empty string; fail with a
		// message that points at the bad fixture instead.
		if e == "" {
			t.Fatalf("mkVaultDir: empty entry in %q", entries)
		}
		full := filepath.Join(root, filepath.FromSlash(e))
		if e[len(e)-1] == '/' {
			if err := os.MkdirAll(full, 0o755); err != nil {
				t.Fatalf("mkVaultDir: mkdir %q: %v", full, err)
			}
			continue
		}
		if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil {
			t.Fatalf("mkVaultDir: mkdir parent %q: %v", full, err)
		}
		if err := os.WriteFile(full, []byte("test\n"), 0o644); err != nil {
			t.Fatalf("mkVaultDir: write %q: %v", full, err)
		}
	}
	return root
}
// TestVaultLayoutEnsure_DryRun_NoChange checks that a dry run on a legacy vault
// leaves the directory tree untouched while still reporting planned migrations.
func TestVaultLayoutEnsure_DryRun_NoChange(t *testing.T) {
	vault := mkVaultDir(t, []string{"raw/", "raw/file1.csv", "processed/"})
	snapBefore := snapshotDir(t, vault)

	plan, err := VaultLayoutEnsure(vault, true)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if !plan.DryRun {
		t.Error("DryRun flag not set in report")
	}

	if snapAfter := snapshotDir(t, vault); !mapEqual(snapBefore, snapAfter) {
		t.Errorf("dry-run modified disk: before=%v after=%v", snapBefore, snapAfter)
	}

	// raw and processed sit at the root, so the plan must contain migrations.
	if len(plan.Migrated) == 0 {
		t.Error("expected Migrated to be non-empty in dry-run plan")
	}
}
// TestVaultLayoutEnsure_FreshDir_CreatesLayout checks that running on an empty
// vault reports every canonical directory in Created and creates each on disk.
func TestVaultLayoutEnsure_FreshDir_CreatesLayout(t *testing.T) {
	vault := mkVaultDir(t, []string{}) // empty vault

	report, err := VaultLayoutEnsure(vault, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Expected directories: the two containers followed by their buckets.
	expected := []string{"data", "knowledge"}
	for _, bucket := range []string{"raw", "processed", "exports"} {
		expected = append(expected, filepath.Join("data", bucket))
	}
	for _, bucket := range []string{"decisions", "domains", "models", "benchmarks", "test_documents"} {
		expected = append(expected, filepath.Join("knowledge", bucket))
	}

	// Every expected directory must be reported as created...
	for _, rel := range expected {
		if !sliceContains(report.Created, rel) {
			t.Errorf("expected Created to contain %q, got %v", rel, report.Created)
		}
	}

	// ...and must exist on disk as a directory.
	for _, rel := range expected {
		abs := filepath.Join(vault, rel)
		info, statErr := os.Stat(abs)
		if statErr != nil {
			t.Errorf("expected %q to exist: %v", abs, statErr)
			continue
		}
		if !info.IsDir() {
			t.Errorf("%q should be a directory", abs)
		}
	}
}
// TestVaultLayoutEnsure_LegacyDataLayout_Migrates checks that root-level raw/
// and processed/ are renamed under data/, that their files follow, and that the
// legacy directories no longer exist afterwards.
func TestVaultLayoutEnsure_LegacyDataLayout_Migrates(t *testing.T) {
	vault := mkVaultDir(t, []string{
		"raw/",
		"raw/file1.parquet",
		"raw/file2.parquet",
		"processed/",
		"processed/clean.csv",
		"exports/",
	})

	report, err := VaultLayoutEnsure(vault, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Whole-directory renames are reported as "src -> dst".
	wantMoves := []string{
		"raw -> " + filepath.Join("data", "raw"),
		"processed -> " + filepath.Join("data", "processed"),
	}
	for _, move := range wantMoves {
		if !sliceContains(report.Migrated, move) {
			t.Errorf("expected Migrated to contain %q, got %v", move, report.Migrated)
		}
	}

	// The files must be reachable at their new location.
	wantFiles := []string{
		filepath.Join("data", "raw", "file1.parquet"),
		filepath.Join("data", "raw", "file2.parquet"),
		filepath.Join("data", "processed", "clean.csv"),
	}
	for _, rel := range wantFiles {
		if _, statErr := os.Stat(filepath.Join(vault, rel)); statErr != nil {
			t.Errorf("expected %q to exist after migration: %v", rel, statErr)
		}
	}

	// The legacy root directories must be gone.
	for _, legacy := range []string{"raw", "processed"} {
		if pathExists(filepath.Join(vault, legacy)) {
			t.Errorf("expected legacy dir %q to be removed", legacy)
		}
	}
}
// TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates checks that root-level
// decisions/, models/ and README.md are moved under knowledge/ and that their
// contents are reachable at the new location.
func TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates(t *testing.T) {
	vault := mkVaultDir(t, []string{
		"decisions/",
		"decisions/2024-01.md",
		"models/",
		"models/ner_v1.pkl",
		"README.md",
	})

	report, err := VaultLayoutEnsure(vault, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Both directories and the README are reported as "src -> dst".
	wantMoves := []string{
		"decisions -> " + filepath.Join("knowledge", "decisions"),
		"models -> " + filepath.Join("knowledge", "models"),
		"README.md -> " + filepath.Join("knowledge", "README.md"),
	}
	for _, move := range wantMoves {
		if !sliceContains(report.Migrated, move) {
			t.Errorf("expected Migrated to contain %q, got %v", move, report.Migrated)
		}
	}

	// Files must be at the new location.
	wantFiles := []string{
		filepath.Join("knowledge", "decisions", "2024-01.md"),
		filepath.Join("knowledge", "models", "ner_v1.pkl"),
		filepath.Join("knowledge", "README.md"),
	}
	for _, rel := range wantFiles {
		if _, statErr := os.Stat(filepath.Join(vault, rel)); statErr != nil {
			t.Errorf("expected %q to exist after migration: %v", rel, statErr)
		}
	}
}
// TestVaultLayoutEnsure_AlreadyMigrated_Idempotent checks that running twice on
// a vault that already has the canonical layout performs no migrations, leaves
// the disk unchanged and reports the existing directories as AlreadyOK.
func TestVaultLayoutEnsure_AlreadyMigrated_Idempotent(t *testing.T) {
	vault := mkVaultDir(t, []string{
		"data/",
		"data/raw/",
		"data/raw/file.csv",
		"data/processed/",
		"data/exports/",
		"knowledge/",
		"knowledge/decisions/",
		"knowledge/domains/",
		"knowledge/models/",
		"knowledge/benchmarks/",
		"knowledge/test_documents/",
	})

	first, err := VaultLayoutEnsure(vault, false)
	if err != nil {
		t.Fatalf("first run error: %v", err)
	}
	if n := len(first.Migrated); n != 0 {
		t.Errorf("first run on fully-migrated vault should have no migrations, got %v", first.Migrated)
	}

	snapBefore := snapshotDir(t, vault)
	second, err := VaultLayoutEnsure(vault, false)
	if err != nil {
		t.Fatalf("second run error: %v", err)
	}
	if snapAfter := snapshotDir(t, vault); !mapEqual(snapBefore, snapAfter) {
		t.Error("second run modified disk (not idempotent)")
	}

	if n := len(second.Migrated); n != 0 {
		t.Errorf("second run should produce no migrations, got %v", second.Migrated)
	}
	if len(second.AlreadyOK) == 0 {
		t.Error("second run should report existing dirs as AlreadyOK")
	}
}
// TestVaultLayoutEnsure_Mixed_PartialMigration covers a half-migrated vault:
// data/raw is already in place, while exports/ and decisions/ still sit at the
// root and must be migrated.
func TestVaultLayoutEnsure_Mixed_PartialMigration(t *testing.T) {
	vault := mkVaultDir(t, []string{
		"data/",
		"data/raw/",
		"data/raw/already_here.csv",
		"exports/",
		"exports/report.pdf",
		"decisions/",
		"decisions/2023-note.md",
	})

	report, err := VaultLayoutEnsure(vault, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// data/raw was already at its destination.
	if !sliceContains(report.AlreadyOK, filepath.Join("data", "raw")) {
		t.Errorf("data/raw should be AlreadyOK, got AlreadyOK=%v", report.AlreadyOK)
	}

	// exports moves under data/.
	if !sliceContains(report.Migrated, "exports -> "+filepath.Join("data", "exports")) {
		t.Errorf("exports should be migrated, Migrated=%v", report.Migrated)
	}

	// decisions moves under knowledge/.
	if !sliceContains(report.Migrated, "decisions -> "+filepath.Join("knowledge", "decisions")) {
		t.Errorf("decisions should be migrated, Migrated=%v", report.Migrated)
	}
}
// TestVaultLayoutEnsure_MergeConflict_Errors checks that merging raw/ into an
// existing data/raw/ fails when both contain a file with the same name.
func TestVaultLayoutEnsure_MergeConflict_Errors(t *testing.T) {
	vault := mkVaultDir(t, []string{
		"raw/",
		"raw/collision.csv",
		"data/",
		"data/raw/",
		"data/raw/collision.csv", // same name -> conflict
	})

	_, err := VaultLayoutEnsure(vault, false)
	if err == nil {
		t.Fatal("expected error for merge conflict, got nil")
	}

	msg := err.Error()
	if !(contains(msg, "conflict") || contains(msg, "collision.csv")) {
		t.Errorf("error should mention conflict or the file name, got: %v", err)
	}
}
// TestVaultLayoutEnsure_UnknownFiles_Skipped checks that unrecognized root
// entries are listed in Skipped while a known bucket (raw) is not.
func TestVaultLayoutEnsure_UnknownFiles_Skipped(t *testing.T) {
	vault := mkVaultDir(t, []string{
		".git/",
		"vault_index.db",
		"my_custom_notes.txt",
		"raw/",
	})

	report, err := VaultLayoutEnsure(vault, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	for _, unknown := range []string{".git", "vault_index.db", "my_custom_notes.txt"} {
		if !sliceContains(report.Skipped, unknown) {
			t.Errorf("expected %q in Skipped, got %v", unknown, report.Skipped)
		}
	}

	// raw is a known bucket and must never be reported as skipped.
	if sliceContains(report.Skipped, "raw") {
		t.Error("raw should not appear in Skipped — it is a known bucket")
	}
}
// TestVaultLayoutEnsure_NotADir_Errors checks the two invalid-root cases: a path
// that does not exist and a path that points at a regular file.
//
// Both fixtures live under t.TempDir(), so the test does not depend on a /tmp
// directory, cannot collide with a real path, and needs no manual cleanup.
func TestVaultLayoutEnsure_NotADir_Errors(t *testing.T) {
	t.Run("non-existent path", func(t *testing.T) {
		missing := filepath.Join(t.TempDir(), "does_not_exist")
		_, err := VaultLayoutEnsure(missing, false)
		if err == nil {
			t.Fatal("expected error for non-existent path")
		}
	})

	t.Run("path is a file", func(t *testing.T) {
		file := filepath.Join(t.TempDir(), "vault_layout.txt")
		if err := os.WriteFile(file, nil, 0o644); err != nil {
			t.Fatal(err)
		}
		_, err := VaultLayoutEnsure(file, false)
		if err == nil {
			t.Fatal("expected error when vaultPath is a file, not a dir")
		}
		if !contains(err.Error(), "not a directory") {
			t.Errorf("error should mention 'not a directory', got: %v", err)
		}
	})
}
// --- helpers ---
// snapshotDir walks root and returns the set of entries found, keyed by path
// relative to root (the root itself appears as "."). Every value is true.
// A walk error aborts the test.
func snapshotDir(t *testing.T, root string) map[string]bool {
	t.Helper()
	seen := map[string]bool{}
	visit := func(path string, _ os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		// path is always produced by walking root, so Rel is not expected to
		// fail here; its error is deliberately ignored.
		rel, _ := filepath.Rel(root, path)
		seen[rel] = true
		return nil
	}
	if err := filepath.WalkDir(root, visit); err != nil {
		t.Fatalf("snapshotDir: %v", err)
	}
	return seen
}
// mapEqual reports whether a and b hold exactly the same keys with the same
// values. A key that is missing from b is never treated as equal to a key that
// is present with value false.
func mapEqual(a, b map[string]bool) bool {
	if len(a) != len(b) {
		return false
	}
	for k, av := range a {
		// Look the key up explicitly: reading b[k] alone cannot tell "absent"
		// from "present and false", and ignores the value stored in a.
		bv, ok := b[k]
		if !ok || av != bv {
			return false
		}
	}
	return true
}
// toSet converts a slice of strings into a set; duplicates collapse into a
// single key. The result is never nil.
func toSet(ss []string) map[string]struct{} {
	set := make(map[string]struct{}, len(ss))
	for i := range ss {
		set[ss[i]] = struct{}{}
	}
	return set
}
// sliceContains reports whether target is an element of ss.
func sliceContains(ss []string, target string) bool {
	for i := 0; i < len(ss); i++ {
		if ss[i] == target {
			return true
		}
	}
	return false
}
// contains reports whether sub occurs anywhere inside s. An empty sub is
// contained in every string, including the empty one.
func contains(s, sub string) bool {
	n := len(sub)
	// Slide a window of len(sub) bytes over s; the loop does not run at all
	// when sub is longer than s.
	for start := 0; start+n <= len(s); start++ {
		if s[start:start+n] == sub {
			return true
		}
	}
	return false
}
+96
View File
@@ -0,0 +1,96 @@
package infra
import (
"fmt"
"os"
"path/filepath"
"strings"
"gopkg.in/yaml.v3"
)
// VaultManifestEntry is a single vault entry parsed from a projects/<proj>/vaults/vault.yaml.
//
// Name, Description, Path and Tags are copied verbatim from the manifest; this
// package does not validate or normalize them.
type VaultManifestEntry struct {
	ProjectID string // basename of projects/<proj>/, inferred from manifest path
	Name string // vault name as declared in vault.yaml
	Description string // human description
	Path string // vault directory path exactly as written in vault.yaml (expected to be absolute; not checked here)
	Tags []string // tags declared in vault.yaml
	ManifestFile string // path of the vault.yaml this entry came from, built from repoRoot (absolute when repoRoot is)
}
// vaultYAML mirrors the vault.yaml schema (only the fields we care about).
// The document has a single top-level key, "vaults", holding a list of vault
// declarations; each field below maps to the YAML key named in its tag.
type vaultYAML struct {
	Vaults []struct {
		Name string `yaml:"name"`
		Description string `yaml:"description"`
		Path string `yaml:"path"`
		Tags []string `yaml:"tags"`
	} `yaml:"vaults"`
}
// VaultManifestRead finds every projects/*/vaults/vault.yaml under repoRoot,
// parses each manifest and returns a flat slice of VaultManifestEntry.
//
// Rules:
//   - If a manifest fails to read or parse, an error is returned immediately
//     with the file path.
//   - If no manifests are found, an empty (nil) slice is returned, not an error.
//     This includes the case where repoRoot/projects is missing or unreadable.
//   - ProjectID is inferred from the directory component between "projects/"
//     and "/vaults/".
//   - Manifests are visited in the directory order returned by os.ReadDir
//     (sorted by project name).
//
// The projects directory is listed explicitly instead of building a glob
// pattern from repoRoot, so a repository path containing glob metacharacters
// ('[', '*', '?', '\\') is treated literally.
func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error) {
	projectsDir := filepath.Join(repoRoot, "projects")
	projects, err := os.ReadDir(projectsDir)
	if err != nil {
		// Best-effort discovery: a missing or unreadable projects directory
		// means "no manifests", not a failure.
		return nil, nil
	}

	var out []VaultManifestEntry
	for _, project := range projects {
		manifestPath := filepath.Join(projectsDir, project.Name(), "vaults", "vault.yaml")
		if _, err := os.Lstat(manifestPath); err != nil {
			continue // this project declares no vault manifest
		}
		entries, err := parseVaultManifest(manifestPath)
		if err != nil {
			return nil, err
		}
		out = append(out, entries...)
	}
	return out, nil
}
// parseVaultManifest reads and decodes a single vault.yaml and returns one
// VaultManifestEntry per declared vault, in declaration order. Every entry gets
// the ProjectID inferred from manifestPath and manifestPath itself as
// ManifestFile. Read and decode failures are wrapped with the manifest path.
func parseVaultManifest(manifestPath string) ([]VaultManifestEntry, error) {
	content, err := os.ReadFile(manifestPath)
	if err != nil {
		return nil, fmt.Errorf("vault_manifest_read: read %q: %w", manifestPath, err)
	}

	var doc vaultYAML
	if err = yaml.Unmarshal(content, &doc); err != nil {
		return nil, fmt.Errorf("vault_manifest_read: parse %q: %w", manifestPath, err)
	}

	project := inferProjectID(manifestPath)
	parsed := make([]VaultManifestEntry, len(doc.Vaults))
	for i := range doc.Vaults {
		decl := &doc.Vaults[i]
		parsed[i] = VaultManifestEntry{
			ProjectID:    project,
			Name:         decl.Name,
			Description:  decl.Description,
			Path:         decl.Path,
			Tags:         decl.Tags,
			ManifestFile: manifestPath,
		}
	}
	return parsed, nil
}
// inferProjectID extracts the project basename from a path of the form
// .../projects/<proj>/vaults/vault.yaml.
//
// The match is anchored at the END of the path, so a "projects" directory
// higher up (for example a repository cloned under ~/projects/) is not mistaken
// for the registry's own projects/ directory. Paths that do not have the
// canonical shape fall back to the component following the last "projects"
// element. It returns "" when no project can be inferred.
func inferProjectID(manifestPath string) string {
	// Normalize separators and split.
	parts := strings.Split(filepath.ToSlash(manifestPath), "/")
	n := len(parts)

	// Canonical shape: [..., "projects", <proj>, "vaults", <file>].
	if n >= 4 && parts[n-4] == "projects" && parts[n-2] == "vaults" {
		return parts[n-3]
	}

	// Fallback: walk backwards to the last "projects" that has a successor.
	for i := n - 2; i >= 0; i-- {
		if parts[i] == "projects" {
			return parts[i+1]
		}
	}
	return ""
}
+59
View File
@@ -0,0 +1,59 @@
---
name: vault_manifest_read
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error)"
description: "Lee todos los manifests vault.yaml bajo projects/*/vaults/ del repo y devuelve una lista plana de entradas de vault con su ProjectID inferido del path."
tags: [vault, manifest, yaml, infra, projects, storage]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- "fmt"
- "os"
- "path/filepath"
- "strings"
- "gopkg.in/yaml.v3"
params:
- name: repoRoot
desc: "Ruta absoluta a la raiz del repositorio fn_registry. Se usa como base para el glob projects/*/vaults/vault.yaml."
output: "Slice plano de VaultManifestEntry (ProjectID, Name, Description, Path, Tags, ManifestFile). Vacio si no hay manifests. Error si un yaml no parsea, con el path concreto en el mensaje."
tested: true
tests:
- "TestVaultManifestRead_HappyPath"
- "TestVaultManifestRead_MalformedYAML"
- "TestVaultManifestRead_EmptyDir"
test_file_path: "functions/infra/vault_manifest_read_test.go"
file_path: "functions/infra/vault_manifest_read.go"
---
## Ejemplo
```go
entries, err := VaultManifestRead("/home/lucas/fn_registry")
if err != nil {
log.Fatal(err)
}
for _, e := range entries {
fmt.Printf("%s/%s -> %s\n", e.ProjectID, e.Name, e.Path)
}
// app_turismo/turismo_spain -> /home/lucas/vaults/turismo_spain
// app_finance/finance_data -> /home/lucas/vaults/finance_data
```
## Notas
`VaultManifestEntry` es un tipo local de esta funcion (no un tipo del registry). Contiene:
- `ProjectID` — basename del directorio `projects/<proj>/`, inferido del path del manifest.
- `Name`, `Description`, `Path`, `Tags` — copiados del yaml tal cual.
- `ManifestFile` — path absoluto al vault.yaml de origen, util para mensajes de error y trazabilidad.
El parseo usa `gopkg.in/yaml.v3` (ya en go.mod). Si un manifest falla, la funcion devuelve
error inmediatamente con el path del fichero problemático. Los manifests sin entradas
`vaults:` contribuyen cero entries (no es error). Si no existe ningun `projects/*/vaults/vault.yaml`
el resultado es slice vacio sin error.
+113
View File
@@ -0,0 +1,113 @@
package infra
import (
"os"
"path/filepath"
"testing"
)
// TestVaultManifestRead_HappyPath writes two project manifests (three vaults in
// total) and checks that every entry is returned with the ProjectID inferred
// from its manifest path and with the fields declared in the YAML.
func TestVaultManifestRead_HappyPath(t *testing.T) {
	root := t.TempDir()

	writeManifest(t, root, "app_turismo", `
vaults:
  - name: turismo_spain
    description: "Datos de turismo en Espana"
    path: "/home/lucas/vaults/turismo_spain"
    tags: [turismo, espana]
  - name: turismo_raw
    description: "Datos brutos sin procesar"
    path: "/home/lucas/vaults/turismo_raw"
    tags: [raw]
`)
	writeManifest(t, root, "app_finance", `
vaults:
  - name: finance_data
    description: "Datos financieros"
    path: "/home/lucas/vaults/finance_data"
    tags: [finance]
`)

	entries, err := VaultManifestRead(root)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(entries) != 3 {
		t.Fatalf("got %d entries, want 3", len(entries))
	}

	// Build index by name for order-independent assertions.
	byName := make(map[string]VaultManifestEntry, len(entries))
	for _, e := range entries {
		byName[e.Name] = e
	}

	// Check turismo_spain entry.
	e, ok := byName["turismo_spain"]
	if !ok {
		t.Fatal("missing entry 'turismo_spain'")
	}
	if e.ProjectID != "app_turismo" {
		t.Errorf("turismo_spain.ProjectID = %q, want %q", e.ProjectID, "app_turismo")
	}
	if e.Path != "/home/lucas/vaults/turismo_spain" {
		t.Errorf("turismo_spain.Path = %q, want %q", e.Path, "/home/lucas/vaults/turismo_spain")
	}
	// Verify both tags: the failure message promises [turismo espana], so the
	// second element must be checked as well as the first.
	if len(e.Tags) != 2 || e.Tags[0] != "turismo" || e.Tags[1] != "espana" {
		t.Errorf("turismo_spain.Tags = %v, want [turismo espana]", e.Tags)
	}
	if e.ManifestFile == "" {
		t.Error("turismo_spain.ManifestFile is empty")
	}

	// Check finance_data entry belongs to app_finance.
	ef, ok := byName["finance_data"]
	if !ok {
		t.Fatal("missing entry 'finance_data'")
	}
	if ef.ProjectID != "app_finance" {
		t.Errorf("finance_data.ProjectID = %q, want %q", ef.ProjectID, "app_finance")
	}
}
// TestVaultManifestRead_MalformedYAML checks that a manifest that is not valid
// YAML (an unterminated flow sequence) makes VaultManifestRead return an error.
func TestVaultManifestRead_MalformedYAML(t *testing.T) {
	repo := t.TempDir()
	writeManifest(t, repo, "bad_project", `
vaults:
  - name: [invalid yaml
    path: missing_bracket
`)

	if _, err := VaultManifestRead(repo); err == nil {
		t.Fatal("expected error for malformed YAML, got nil")
	}
}
// TestVaultManifestRead_EmptyDir checks that a repository root without any
// projects/ directory yields zero entries and no error.
func TestVaultManifestRead_EmptyDir(t *testing.T) {
	repo := t.TempDir() // no projects/ directory at all

	found, err := VaultManifestRead(repo)
	if err != nil {
		t.Fatalf("unexpected error for empty dir: %v", err)
	}
	if n := len(found); n != 0 {
		t.Fatalf("got %d entries, want 0", n)
	}
}
// writeManifest writes content to <root>/projects/<proj>/vaults/vault.yaml,
// creating the intermediate directories. Any failure aborts the test.
func writeManifest(t *testing.T, root, proj, content string) {
	t.Helper()

	vaultsDir := filepath.Join(root, "projects", proj, "vaults")
	if mkErr := os.MkdirAll(vaultsDir, 0o755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", vaultsDir, mkErr)
	}

	manifest := filepath.Join(vaultsDir, "vault.yaml")
	if wrErr := os.WriteFile(manifest, []byte(content), 0o644); wrErr != nil {
		t.Fatalf("write %s: %v", manifest, wrErr)
	}
}
+265
View File
@@ -0,0 +1,265 @@
package infra
import (
"database/sql"
"fmt"
"path/filepath"
"strings"
)
// VaultSearchHit is a single result returned by VaultSearch.
//
// NOTE(review): the units of Size and Mtime are not established in this file;
// presumably bytes and a Unix timestamp as stored by the indexer — confirm.
type VaultSearchHit struct {
	VaultPath string `json:"vault_path"`
	VaultName string `json:"vault_name"` // basename of VaultPath (after resolving symlinks)
	RelPath string `json:"rel_path"`
	Size int64 `json:"size"`
	Mtime int64 `json:"mtime"`
	Mime string `json:"mime"`
	Bucket string `json:"bucket"`
	SubBucket string `json:"sub_bucket"`
	Snippet string `json:"snippet"` // FTS5 snippet or empty if match is only by rel_path (fallback)
}
// VaultSearch looks up files in the index of the vault rooted at vaultPath
// whose indexed content or relative path matches query.
//
// The index is opened with VaultIndexOpen and closed before returning. A limit
// of zero or less falls back to 50. The search itself is delegated to
// vaultSearchCombined, which merges FTS5 matches with rel_path LIKE matches
// and deduplicates them by rel_path. Every hit carries vaultPath unchanged and
// the VaultName computed by resolveVaultName.
//
// Errors from opening the index or from the combined search are returned
// wrapped with a "vault_search:" prefix.
func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error) {
	const defaultLimit = 50
	if limit < 1 {
		limit = defaultLimit
	}

	index, openErr := VaultIndexOpen(vaultPath)
	if openErr != nil {
		return nil, fmt.Errorf("vault_search: open index: %w", openErr)
	}
	defer index.Close()

	results, searchErr := vaultSearchCombined(index, vaultPath, resolveVaultName(vaultPath), query, limit)
	if searchErr != nil {
		return nil, fmt.Errorf("vault_search: %w", searchErr)
	}
	return results, nil
}
// vaultSearchCombined runs two search strategies against db and merges their
// results, deduplicated by rel_path and capped at limit:
//
//  1. FTS5 MATCH on files_fts (vaultSearchFTSContent), using the query as
//     prepared by safeFTSQuery. A failure here is tolerated.
//  2. LIKE on files.rel_path (vaultSearchLike), using the token extracted by
//     simplifyForLike. It only runs while fewer than limit hits were
//     collected and the token is non-empty.
//
// FTS5 hits come first, followed by LIKE hits not already seen. An error is
// returned only when both strategies ran and both failed. On success the
// returned slice is never nil.
func vaultSearchCombined(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
	seen := make(map[string]struct{})
	hits := []VaultSearchHit{}

	// absorb appends unseen hits from batch until limit is reached.
	absorb := func(batch []VaultSearchHit) {
		for _, h := range batch {
			if len(hits) >= limit {
				return
			}
			if _, dup := seen[h.RelPath]; dup {
				continue
			}
			seen[h.RelPath] = struct{}{}
			hits = append(hits, h)
		}
	}

	// Strategy 1: full-text match. Its error is kept so that it can be
	// reported if the path search fails as well.
	ftsHits, ftsErr := vaultSearchFTSContent(db, vaultPath, vaultName, safeFTSQuery(query), limit)
	if ftsErr == nil {
		absorb(ftsHits)
	}

	// Strategy 2: substring match on rel_path.
	likeQuery := simplifyForLike(query)
	if len(hits) >= limit || likeQuery == "" {
		return hits, nil
	}
	// Over-fetch by the number of paths already seen so that duplicates
	// dropped by absorb cannot leave the result short.
	likeLimit := limit - len(hits) + len(seen)
	likeHits, likeErr := vaultSearchLike(db, vaultPath, vaultName, likeQuery, likeLimit)
	if likeErr != nil && ftsErr != nil {
		return nil, fmt.Errorf("fts: %v; like: %v", ftsErr, likeErr)
	}
	absorb(likeHits)
	return hits, nil
}
// vaultSearchFTSContent runs safeQuery as an FTS5 MATCH against files_fts and
// maps the matching rowids back to rows of the files table.
//
// It first collects up to limit rowids ordered by rank, then loads each file
// row with a separate lookup by rowid. A rowid whose lookup fails for any
// reason is skipped rather than reported. Hits are returned in rank order with
// VaultPath and VaultName filled in and Snippet left empty.
//
// NOTE(review): this relies on files_fts rowids lining up with the implicit
// rowids of files — confirm against the code that writes the index.
//
// It returns (nil, nil) when nothing matches, and an error when the MATCH
// query or the iteration over its rows fails.
func vaultSearchFTSContent(db *sql.DB, vaultPath, vaultName, safeQuery string, limit int) ([]VaultSearchHit, error) {
	const ftsRowidsSQL = `
SELECT rowid
FROM files_fts
WHERE files_fts MATCH ?
ORDER BY rank
LIMIT ?`
	const fileByRowidSQL = `
SELECT rel_path, size, mtime, mime, bucket, sub_bucket
FROM files WHERE rowid = ?`

	matches, err := db.Query(ftsRowidsSQL, safeQuery, limit)
	if err != nil {
		return nil, err
	}
	defer matches.Close()

	var ids []int64
	for matches.Next() {
		var id int64
		if scanErr := matches.Scan(&id); scanErr != nil {
			return nil, scanErr
		}
		ids = append(ids, id)
	}
	if iterErr := matches.Err(); iterErr != nil {
		return nil, iterErr
	}
	if len(ids) == 0 {
		return nil, nil
	}

	var found []VaultSearchHit
	for _, id := range ids {
		hit := VaultSearchHit{VaultPath: vaultPath, VaultName: vaultName}
		row := db.QueryRow(fileByRowidSQL, id)
		if row.Scan(&hit.RelPath, &hit.Size, &hit.Mtime, &hit.Mime, &hit.Bucket, &hit.SubBucket) != nil {
			// No usable files row for this rowid: skip it.
			continue
		}
		found = append(found, hit)
	}
	return found, nil
}
// vaultSearchLike returns up to limit files whose rel_path contains query as
// a literal substring, newest mtime first.
//
// The LIKE metacharacters '%' and '_' (and the escape character '\') in query
// are escaped, so a query such as "foo_bar" matches only paths that really
// contain "foo_bar" instead of treating '_' as "any single character".
//
// Hits carry vaultPath and vaultName and an empty Snippet. On any query, scan
// or iteration error the function returns nil hits together with the error.
func vaultSearchLike(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
	const qLike = `
SELECT rel_path, size, mtime, mime, bucket, sub_bucket
FROM files
WHERE rel_path LIKE '%' || ? || '%' ESCAPE '\'
ORDER BY mtime DESC
LIMIT ?`

	// Escape the escape character first so that the backslashes added for
	// '%' and '_' are not doubled afterwards.
	literal := strings.ReplaceAll(query, `\`, `\\`)
	literal = strings.ReplaceAll(literal, `%`, `\%`)
	literal = strings.ReplaceAll(literal, `_`, `\_`)

	rows, err := db.Query(qLike, literal, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var hits []VaultSearchHit
	for rows.Next() {
		h := VaultSearchHit{VaultPath: vaultPath, VaultName: vaultName}
		if err := rows.Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket); err != nil {
			return nil, err
		}
		hits = append(hits, h)
	}
	if err := rows.Err(); err != nil {
		// Do not hand back a partial result alongside an error.
		return nil, err
	}
	return hits, nil
}
// resolveVaultName derives a vault's display name: the last path element of
// vaultPath once symlinks have been resolved. When the path cannot be resolved
// the last element of vaultPath itself is used.
func resolveVaultName(vaultPath string) string {
	if target, err := filepath.EvalSymlinks(vaultPath); err == nil {
		return filepath.Base(target)
	}
	return filepath.Base(vaultPath)
}
// safeFTSQuery prepares a user-supplied query for an FTS5 MATCH.
//
// The query is trimmed; an empty result is returned as is. A query that
// contains ':' (column-filter syntax) or one of the FTS5 boolean operators
// " AND ", " OR ", " NOT " is passed through unchanged so that callers can use
// FTS5 syntax deliberately. Anything else is wrapped in double quotes, with
// embedded double quotes doubled, which turns it into a single phrase and
// keeps characters such as '-' (as in "hello-world") away from the parser.
//
// FTS5 only recognises the boolean operators in upper case, so lowercase
// "and", "or" and "not" are ordinary words and do not disable quoting.
// Note that a query like "foo:bar:" is NOT protected: it is passed through
// because of the ':' and may be rejected by FTS5.
func safeFTSQuery(query string) string {
	q := strings.TrimSpace(query)
	if q == "" {
		return q
	}
	// Explicit column filter: leave the query to the user.
	if strings.Contains(q, ":") {
		return q
	}
	// Explicit boolean operators (case-sensitive on purpose).
	for _, op := range []string{" AND ", " OR ", " NOT "} {
		if strings.Contains(q, op) {
			return q
		}
	}
	// Escape any double quotes in the query before wrapping.
	return `"` + strings.ReplaceAll(q, `"`, `""`) + `"`
}
// isFTSSyntaxError reports whether err looks like an FTS5 query parser error,
// judged by a case-insensitive search of its message for a few known
// fragments. A nil error is never a syntax error.
func isFTSSyntaxError(err error) bool {
	if err == nil {
		return false
	}
	lowered := strings.ToLower(err.Error())
	for _, marker := range []string{"syntax error", "no such column", "fts5: syntax error"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
// simplifyForLike extracts from query the first word-like token, for use as a
// LIKE substring.
//
// A token is a maximal run of letters, digits, '_' and '-'. Leading characters
// that are not part of a token are skipped, and scanning stops at the first
// such character after the token has started, so "foo:bar:" yields "foo".
// Letters and digits are classified with the unicode package, which keeps
// non-ASCII words intact ("españa" is not cut down to "espa"). The result is
// empty when query contains no token characters.
func simplifyForLike(query string) string {
	var token strings.Builder
	for _, r := range strings.TrimSpace(query) {
		switch {
		case unicode.IsLetter(r), unicode.IsDigit(r), r == '_', r == '-':
			token.WriteRune(r)
		case token.Len() > 0:
			// First separator after the token: the token is complete.
			return token.String()
		}
	}
	return token.String()
}
+61
View File
@@ -0,0 +1,61 @@
---
name: vault_search
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error)"
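description: "Busca en vault_index.db de un vault combinando FTS5 MATCH sobre files_fts con LIKE sobre rel_path. Los resultados de ambas busquedas se fusionan sin duplicados por rel_path hasta limit; si el query rompe el parser FTS5 solo se usan los resultados de LIKE. Snippet se devuelve vacio."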
tags: [vault, search, fts5, sqlite, infra]
uses_functions: ["vault_index_open_go_infra"]
uses_types: ["vault_file_go_infra"]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [database/sql, fmt, path/filepath, strings]
params:
- name: vaultPath
desc: "ruta absoluta al directorio raiz del vault (puede ser symlink)"
- name: query
desc: "termino o frase de busqueda; se escapa automaticamente para FTS5 salvo que ya incluya operadores booleanos o prefijos de columna"
- name: limit
desc: "maximo de resultados; si es <= 0 se usa 50"
output: "slice de VaultSearchHit ordenado por rank FTS5 (o mtime DESC en fallback LIKE); slice vacio si no hay resultados"
tested: true
tests:
- "FTS match devuelve hit con snippet"
- "query sin resultados retorna slice vacio"
- "limit se respeta"
- "query FTS invalida activa fallback LIKE"
- "limit cero usa 50 por defecto"
test_file_path: "functions/infra/vault_search_test.go"
file_path: "functions/infra/vault_search.go"
---
## Ejemplo
```go
hits, err := infra.VaultSearch("/home/lucas/vaults/turismo_spain", "hoteles", 20)
if err != nil {
log.Fatal(err)
}
for _, h := range hits {
fmt.Printf("[%s] %s %s\n", h.VaultName, h.RelPath, h.Snippet)
}
```
## Notas
`VaultSearchHit` es un struct local definido en este archivo (no en `vault_file.go`)
porque combina campos de `files` + metadatos de contexto de busqueda (Snippet, VaultPath, VaultName).
**FTS5 safety:** el helper `safeFTSQuery` envuelve la query en comillas dobles
cuando no contiene operadores booleanos ni `:`. Esto evita errores del parser en
tokens como `hello-world`. Una query con `:` (p. ej. `foo:bar:`) se pasa tal cual
a FTS5; si el parser la rechaza, la busqueda LIKE sobre `rel_path` sigue activa.
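**Busqueda LIKE:** ademas del MATCH, mientras no se haya alcanzado `limit` se ejecuta
`WHERE rel_path LIKE '%' || token || '%'`, donde `token` es la primera palabra de la query
(`simplifyForLike`). Si el MATCH falla, los resultados provienen solo de esta busqueda.
Todos los hits tienen `Snippet=""`.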
**VaultName:** se deriva del `filepath.Base(filepath.EvalSymlinks(vaultPath))`.
Si `EvalSymlinks` falla (e.g. symlink roto), usa `filepath.Base(vaultPath)`.
+147
View File
@@ -0,0 +1,147 @@
package infra
import (
"testing"
"time"
)
// openTestVaultDir returns a fresh temporary directory whose vault index has
// been opened once with VaultIndexOpen and closed again, aborting the test if
// the index cannot be opened.
func openTestVaultDir(t *testing.T) string {
	t.Helper()

	vaultDir := t.TempDir()
	handle, err := VaultIndexOpen(vaultDir)
	if err != nil {
		t.Fatalf("VaultIndexOpen: %v", err)
	}
	handle.Close()
	return vaultDir
}
// seedVaultFile inserts one file into the vault index located in dir: a row in
// files (with a fixed sha256, an empty ext, and the current Unix time as both
// mtime and indexed_at) followed by a matching row in files_fts carrying
// contentText. Any failure aborts the test.
func seedVaultFile(t *testing.T, dir, relPath, mime, bucket, subBucket, contentText string, size int64) {
	t.Helper()

	const insertFileSQL = `
INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, 'aabbccdd', ?, '', ?, ?, ?)`
	const insertFTSSQL = `INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`

	index, err := VaultIndexOpen(dir)
	if err != nil {
		t.Fatalf("VaultIndexOpen seed: %v", err)
	}
	defer index.Close()

	stamp := time.Now().Unix()
	if _, err := index.Exec(insertFileSQL, relPath, size, stamp, mime, bucket, subBucket, stamp); err != nil {
		t.Fatalf("seed files: %v", err)
	}
	if _, err := index.Exec(insertFTSSQL, relPath, contentText); err != nil {
		t.Fatalf("seed files_fts: %v", err)
	}
}
// --- Tests ---
// TestVaultSearch_FTSMatch seeds two files with different content and checks
// that a word present in only one of them returns exactly that file, with a
// non-empty VaultName.
func TestVaultSearch_FTSMatch(t *testing.T) {
	t.Run("FTS match devuelve hit con snippet", func(t *testing.T) {
		vault := openTestVaultDir(t)
		seedVaultFile(t, vault, "data/raw/informe.csv", "text/csv", "data", "raw",
			"ventas trimestrales empresa iberica", 1024)
		seedVaultFile(t, vault, "data/raw/other.csv", "text/csv", "data", "raw",
			"productos inventario almacen", 512)

		hits, err := VaultSearch(vault, "ventas", 10)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if n := len(hits); n != 1 {
			t.Fatalf("got %d hits, want 1", n)
		}

		only := hits[0]
		if only.RelPath != "data/raw/informe.csv" {
			t.Errorf("RelPath = %q, want data/raw/informe.csv", only.RelPath)
		}
		if only.VaultName == "" {
			t.Errorf("VaultName should not be empty")
		}
	})
}
// TestVaultSearch_NoMatch checks that a query matching neither content nor
// path returns no hits and no error.
func TestVaultSearch_NoMatch(t *testing.T) {
	t.Run("query sin resultados retorna slice vacio", func(t *testing.T) {
		vault := openTestVaultDir(t)
		seedVaultFile(t, vault, "data/raw/file.csv", "text/csv", "data", "raw", "some content", 100)

		hits, err := VaultSearch(vault, "zzznomatch", 10)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if n := len(hits); n != 0 {
			t.Errorf("got %d hits, want 0", n)
		}
	})
}
// TestVaultSearch_LimitRespected seeds ten files sharing a keyword and checks
// that a search with limit 3 returns exactly three hits.
func TestVaultSearch_LimitRespected(t *testing.T) {
	t.Run("limit se respeta", func(t *testing.T) {
		const seeded, limit = 10, 3

		vault := openTestVaultDir(t)
		for i := 0; i < seeded; i++ {
			// filea.csv, fileb.csv, ... one distinct path per file.
			suffix := string(rune('a' + i))
			seedVaultFile(t, vault, "data/raw/file"+suffix+".csv", "text/csv", "data", "raw",
				"common keyword everywhere", 100)
		}

		hits, err := VaultSearch(vault, "common", limit)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if n := len(hits); n != 3 {
			t.Errorf("got %d hits, want 3", n)
		}
	})
}
// TestVaultSearch_BadFTSQuery_FallbackLike checks that a query FTS5 is
// expected to reject still produces hits through the rel_path LIKE search, and
// that those hits have an empty Snippet.
func TestVaultSearch_BadFTSQuery_FallbackLike(t *testing.T) {
	t.Run("query FTS invalida activa fallback LIKE", func(t *testing.T) {
		vault := openTestVaultDir(t)
		// The path contains "foo", the token the LIKE search extracts from
		// the query below; the indexed content is empty.
		seedVaultFile(t, vault, "data/raw/foobar_report.csv", "text/csv", "data", "raw", "", 200)

		// safeFTSQuery passes "foo:bar:" through unquoted because it contains
		// ':'; the test expects the resulting FTS5 failure to be absorbed.
		hits, err := VaultSearch(vault, "foo:bar:", 10)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if len(hits) == 0 {
			t.Errorf("expected fallback LIKE to find foobar_report.csv, got 0 hits")
		}
		for i := range hits {
			if snippet := hits[i].Snippet; snippet != "" {
				t.Errorf("fallback hits should have empty Snippet, got %q", snippet)
			}
		}
	})
}
// TestVaultSearch_LimitZeroDefaults seeds 55 files sharing a keyword and
// checks that a search with limit 0 returns 50 hits, the default limit.
func TestVaultSearch_LimitZeroDefaults(t *testing.T) {
	t.Run("limit cero usa 50 por defecto", func(t *testing.T) {
		vault := openTestVaultDir(t)

		// Paths are doc<letter><digit>.csv: the letter advances every ten
		// files (a, b, c, ...) and the digit is i%10, so all 55 are distinct.
		for i := 0; i < 55; i++ {
			letter := string(rune('a' + i/10))
			digit := string(rune('0' + i%10))
			seedVaultFile(t, vault, "data/raw/doc"+letter+digit+".csv", "text/csv", "data", "raw",
				"keyword alpha beta", 100)
		}

		hits, err := VaultSearch(vault, "keyword", 0)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if n := len(hits); n != 50 {
			t.Errorf("got %d hits, want 50 (default limit)", n)
		}
	})
}
+20
View File
@@ -0,0 +1,20 @@
package ml
import "encoding/json"
// GenconfigMarshal serialises a GenerationConfig as JSON indented with two
// spaces per nesting level.
//
// Keys appear in struct declaration order and use the names from the json
// tags; fields tagged omitempty are dropped when they hold their zero value.
// The layout is meant to line up with Python's
// json.dumps(indent=2, sort_keys=False). Note that encoding/json escapes '<',
// '>' and '&' as \u003c, \u003e and \u0026, so for prompts containing those
// characters the output is JSON-equivalent to Python's but not byte-identical.
//
// The returned error is whatever json.MarshalIndent reports.
func GenconfigMarshal(cfg GenerationConfig) ([]byte, error) {
	// Two-space indent is the documented canonical format; a one-space indent
	// would diverge from Python's indent=2 output.
	return json.MarshalIndent(cfg, "", "  ")
}
// GenconfigUnmarshal decodes JSON, compact or indented, into a
// GenerationConfig. Keys are matched through the struct's json tags, which are
// snake_case (negative_prompt, cfg_scale, model_type, ...).
//
// On a decoding error the zero GenerationConfig is returned together with the
// error from json.Unmarshal.
func GenconfigUnmarshal(data []byte) (GenerationConfig, error) {
	parsed := GenerationConfig{}
	err := json.Unmarshal(data, &parsed)
	if err != nil {
		return GenerationConfig{}, err
	}
	return parsed, nil
}
+84
View File
@@ -0,0 +1,84 @@
---
name: genconfig_json_marshal
kind: function
lang: go
domain: ml
version: "1.0.0"
purity: impure
signature: "func GenconfigMarshal(cfg GenerationConfig) ([]byte, error)\nfunc GenconfigUnmarshal(data []byte) (GenerationConfig, error)"
description: "Wrappers json.Marshal/Unmarshal para GenerationConfig con formato canonico (MarshalIndent 2 espacios). Garantiza roundtrip identico al Python: json.dumps(indent=2, sort_keys=False). Campos JSON en snake_case."
tags: [ml, json, marshal, unmarshal, serialization, generation, canonical]
uses_functions: []
uses_types: [generation_config_go_ml]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["encoding/json"]
params:
- name: cfg
desc: "GenerationConfig a serializar. Campos omitempty (negative_prompt, loras, clip_skip) se omiten si son zero/nil/empty."
- name: data
desc: "JSON bytes a deserializar. Acepta formato compacto o con indent. Keys deben ser snake_case (negative_prompt, cfg_scale, model_type, etc.)."
output: "GenconfigMarshal: bytes JSON con indent 2 espacios, orden de campos segun declaracion del struct (prompt, negative_prompt, seed, steps, cfg_scale, sampler, width, height, model, loras, clip_skip). GenconfigUnmarshal: GenerationConfig poblado o error de parsing."
tested: true
tests:
- "roundtrip marshal unmarshal produce config igual"
- "json cross-language snake_case keys se deserializan correctamente"
test_file_path: "functions/ml/genconfig_test.go"
file_path: "functions/ml/genconfig_json_marshal.go"
---
## Ejemplo
```go
cfg := ml.GenerationConfig{
Prompt: "a mountain at sunset",
Seed: 1234,
Steps: 30,
CfgScale: 7.0,
Sampler: "euler",
Width: 768,
Height: 512,
Model: ml.ModelRef{Name: "sdxl-base", ModelType: "sdxl", Quantization: "fp16"},
}
b, err := ml.GenconfigMarshal(cfg)
// b == {
// "prompt": "a mountain at sunset",
// "seed": 1234,
// ...
// }
cfg2, err := ml.GenconfigUnmarshal(b)
// cfg2 == cfg (DeepEqual)
```
## Notas
### Formato canonico y compatibilidad con Python
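`GenconfigMarshal` usa `json.MarshalIndent(cfg, "", "  ")` (indent de 2 espacios). El formato resultante coincide con el que produce Python con `model.model_dump_json(indent=2)` o `json.dumps(data, indent=2)` cuando `sort_keys=False`, salvo en el escapado de caracteres: `encoding/json` escribe `<`, `>` y `&` como `\u003c`, `\u003e` y `\u0026`, y `json.dumps` escapa los caracteres no ASCII por defecto (`ensure_ascii=True`). En esos casos el contenido es equivalente aunque los bytes difieran: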
- Keys en orden de declaracion del struct (no alfabetico).
- Indent de 2 espacios, sin trailing whitespace.
- Campos omitempty ausentes si zero: `negative_prompt` ausente si `""`, `loras` ausente si `[]`, `clip_skip` ausente si `nil`.
### Keys JSON (snake_case obligatorio)
| Campo Go | Key JSON |
|---|---|
| `Prompt` | `"prompt"` |
| `NegativePrompt` | `"negative_prompt"` |
| `Seed` | `"seed"` |
| `Steps` | `"steps"` |
| `CfgScale` | `"cfg_scale"` |
| `Sampler` | `"sampler"` |
| `Width` | `"width"` |
| `Height` | `"height"` |
| `Model.ModelType` | `"model_type"` |
| `Model.Quantization` | `"quantization"` |
| `ClipSkip` | `"clip_skip"` |
### Por que impure
Los errores de `json.Unmarshal` son errores de parsing del input externo, no de I/O, pero se modelan como `(T, error)` para forzar manejo explicito en el caller. Marcado `impure` con `error_type: error_go_core` por convencion del registry.
+260
View File
@@ -0,0 +1,260 @@
package ml
import (
"reflect"
"strings"
"testing"
)
// ---------------------------------------------------------------------------
// TestGenconfigToSdcliArgs
// ---------------------------------------------------------------------------
func TestGenconfigToSdcliArgs(t *testing.T) {
clipSkip := 2
t.Run("config basico sin loras ni clip_skip", func(t *testing.T) {
cfg := GenerationConfig{
Prompt: "a cat",
Seed: 42,
Steps: 20,
CfgScale: 7.5,
Sampler: "euler",
Width: 512,
Height: 512,
Model: ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16"},
}
args := GenconfigToSdcliArgs(cfg)
want := []string{
"--prompt", "a cat",
"--seed", "42",
"--steps", "20",
"--cfg-scale", "7.5",
"--width", "512",
"--height", "512",
"--sampling-method", "euler",
}
if !reflect.DeepEqual(args, want) {
t.Errorf("got %v\nwant %v", args, want)
}
})
t.Run("loras se emiten como pares path:weight", func(t *testing.T) {
cfg := GenerationConfig{
Prompt: "portrait",
Seed: 1,
Steps: 10,
CfgScale: 7.0,
Sampler: "euler",
Width: 512,
Height: 512,
Model: ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16", Path: "/models/v1.safetensors"},
Loras: []LoraRef{
{Path: "/loras/detail.safetensors", Weight: 0.8},
{Path: "/loras/style.safetensors", Weight: 0.5},
},
ClipSkip: &clipSkip,
}
args := GenconfigToSdcliArgs(cfg)
// Verificar que existen los pares --lora para ambas loras
loraIdx := indexAll(args, "--lora")
if len(loraIdx) != 2 {
t.Fatalf("esperaba 2 flags --lora, got %d en %v", len(loraIdx), args)
}
wantLoras := []string{
"/loras/detail.safetensors:0.8",
"/loras/style.safetensors:0.5",
}
for i, idx := range loraIdx {
if idx+1 >= len(args) {
t.Fatalf("--lora[%d] sin valor siguiente", i)
}
if args[idx+1] != wantLoras[i] {
t.Errorf("lora[%d]: got %q, want %q", i, args[idx+1], wantLoras[i])
}
}
// Verificar --model y --clip-skip presentes
if !containsPair(args, "--model", "/models/v1.safetensors") {
t.Errorf("--model no encontrado en %v", args)
}
if !containsPair(args, "--clip-skip", "2") {
t.Errorf("--clip-skip no encontrado en %v", args)
}
})
t.Run("sampler dpm++2m se traduce a dpmpp2m", func(t *testing.T) {
cfg := GenerationConfig{
Prompt: "x",
Seed: 0,
Steps: 1,
CfgScale: 1.0,
Sampler: "dpm++2m",
Width: 64,
Height: 64,
Model: ModelRef{Name: "m", ModelType: "sd15", Quantization: "fp16"},
}
args := GenconfigToSdcliArgs(cfg)
if !containsPair(args, "--sampling-method", "dpmpp2m") {
t.Errorf("sampler no traducido; args=%v", args)
}
})
t.Run("negative_prompt vacio no genera flag", func(t *testing.T) {
cfg := GenerationConfig{
Prompt: "x",
NegativePrompt: "",
Seed: 0,
Steps: 1,
CfgScale: 1.0,
Sampler: "euler",
Width: 64,
Height: 64,
Model: ModelRef{Name: "m", ModelType: "sd15", Quantization: "fp16"},
}
args := GenconfigToSdcliArgs(cfg)
for _, a := range args {
if a == "--negative-prompt" {
t.Errorf("flag --negative-prompt presente aunque NegativePrompt es vacio")
}
}
})
}
// ---------------------------------------------------------------------------
// TestGenconfigMarshalRoundtrip
// ---------------------------------------------------------------------------
// TestGenconfigMarshalRoundtrip checks that a fully populated config survives
// GenconfigMarshal followed by GenconfigUnmarshal unchanged.
func TestGenconfigMarshalRoundtrip(t *testing.T) {
	t.Run("roundtrip marshal unmarshal produce config igual", func(t *testing.T) {
		clip := 2
		original := GenerationConfig{
			Prompt:         "sunset over the mountains",
			NegativePrompt: "blurry, low quality",
			Seed:           99,
			Steps:          30,
			CfgScale:       7.5,
			Sampler:        "dpm++2m",
			Width:          768,
			Height:         512,
			Model: ModelRef{
				Name:         "sdxl-base",
				ModelType:    "sdxl",
				Quantization: "fp16",
				Path:         "/models/sdxl.safetensors",
			},
			Loras: []LoraRef{
				{Path: "/loras/detail.safetensors", Weight: 0.8},
			},
			ClipSkip: &clip,
		}

		encoded, err := GenconfigMarshal(original)
		if err != nil {
			t.Fatalf("GenconfigMarshal: %v", err)
		}
		decoded, err := GenconfigUnmarshal(encoded)
		if err != nil {
			t.Fatalf("GenconfigUnmarshal: %v", err)
		}

		if same := reflect.DeepEqual(original, decoded); !same {
			t.Errorf("roundtrip diverge\norig: %+v\ngot: %+v", original, decoded)
		}
	})
}
// ---------------------------------------------------------------------------
// TestGenconfigCrossLanguageJSON
// ---------------------------------------------------------------------------
// TestGenconfigCrossLanguageJSON decodes a snake_case JSON fixture, checks a
// few decoded fields, and verifies that re-encoding keeps the snake_case keys.
func TestGenconfigCrossLanguageJSON(t *testing.T) {
	// Hand-written fixture imitating what the Python side would emit with
	// json.dumps(config.model_dump(), indent=2): snake_case keys in the
	// declaration order of the Python model.
	const fixture = `{
"prompt": "a dragon",
"negative_prompt": "ugly",
"seed": 1234,
"steps": 25,
"cfg_scale": 7.0,
"sampler": "euler_a",
"width": 512,
"height": 512,
"model": {
"name": "v1-5",
"model_type": "sd15",
"quantization": "fp16"
},
"loras": [
{
"path": "/loras/dragon.safetensors",
"weight": 0.9
}
]
}`

	t.Run("json cross-language snake_case keys se deserializan correctamente", func(t *testing.T) {
		cfg, err := GenconfigUnmarshal([]byte(fixture))
		if err != nil {
			t.Fatalf("GenconfigUnmarshal fixture: %v", err)
		}

		// Spot-check the key fields.
		if got := cfg.Prompt; got != "a dragon" {
			t.Errorf("Prompt: got %q", got)
		}
		if got := cfg.NegativePrompt; got != "ugly" {
			t.Errorf("NegativePrompt: got %q", got)
		}
		if got := cfg.CfgScale; got != 7.0 {
			t.Errorf("CfgScale: got %v", got)
		}
		if got := cfg.Model.ModelType; got != "sd15" {
			t.Errorf("Model.ModelType: got %q", got)
		}
		if !(len(cfg.Loras) == 1 && cfg.Loras[0].Weight == 0.9) {
			t.Errorf("Loras: got %+v", cfg.Loras)
		}

		// Re-encode and make sure the snake_case keys are still there.
		reencoded, err := GenconfigMarshal(cfg)
		if err != nil {
			t.Fatalf("GenconfigMarshal: %v", err)
		}
		out := string(reencoded)
		for _, key := range [...]string{"negative_prompt", "cfg_scale", "model_type", "quantization"} {
			if strings.Contains(out, `"`+key+`"`) {
				continue
			}
			t.Errorf("key %q ausente en JSON re-serializado:\n%s", key, out)
		}
	})
}
// ---------------------------------------------------------------------------
// helpers
// ---------------------------------------------------------------------------
// indexAll returns, in ascending order, every index at which val occurs in
// slice. The result is nil when val does not occur.
func indexAll(slice []string, val string) []int {
	var positions []int
	for i := range slice {
		if slice[i] != val {
			continue
		}
		positions = append(positions, i)
	}
	return positions
}
// containsPair reports whether flag appears in slice immediately followed by
// value.
func containsPair(slice []string, flag, value string) bool {
	for next := 1; next < len(slice); next++ {
		if slice[next-1] != flag {
			continue
		}
		if slice[next] == value {
			return true
		}
	}
	return false
}
+59
View File
@@ -0,0 +1,59 @@
package ml
import (
"fmt"
"strconv"
)
// samplerMap translates the ml domain's canonical sampler names into the
// sampling-method identifiers passed to stable-diffusion.cpp.
// GenconfigToSdcliArgs uses the sampler name verbatim when it has no entry here.
var samplerMap = map[string]string{
"euler": "euler",
"euler_a": "euler_a",
"dpm++2m": "dpmpp2m",
"dpm++2m_v2": "dpmpp2mv2",
"heun": "heun",
"dpm2": "dpm2",
"lcm": "lcm",
}
// GenconfigToSdcliArgs turns a GenerationConfig into the command-line
// arguments for the stable-diffusion.cpp binary. It is described as the Go
// mirror of genconfig_to_sdcpp_args_py_ml.
//
// The flags are emitted in a fixed order: --prompt, --seed, --steps,
// --cfg-scale, --width, --height, then --negative-prompt (only when set),
// --sampling-method, --model (only when Model.Path is set), --clip-skip (only
// when ClipSkip is non-nil) and finally one "--lora" "path:weight" pair per
// LoRA. A sampler without an entry in samplerMap is passed through verbatim.
//
// The function performs no I/O; its output depends only on cfg and samplerMap.
func GenconfigToSdcliArgs(cfg GenerationConfig) []string {
	samplingMethod := cfg.Sampler
	if translated, known := samplerMap[cfg.Sampler]; known {
		samplingMethod = translated
	}

	var args []string
	add := func(flag, value string) {
		args = append(args, flag, value)
	}

	add("--prompt", cfg.Prompt)
	add("--seed", strconv.FormatInt(cfg.Seed, 10))
	add("--steps", strconv.Itoa(cfg.Steps))
	add("--cfg-scale", strconv.FormatFloat(cfg.CfgScale, 'f', -1, 64))
	add("--width", strconv.Itoa(cfg.Width))
	add("--height", strconv.Itoa(cfg.Height))
	if cfg.NegativePrompt != "" {
		add("--negative-prompt", cfg.NegativePrompt)
	}
	add("--sampling-method", samplingMethod)
	if cfg.Model.Path != "" {
		add("--model", cfg.Model.Path)
	}
	if cfg.ClipSkip != nil {
		add("--clip-skip", strconv.Itoa(*cfg.ClipSkip))
	}
	for i := range cfg.Loras {
		add("--lora", fmt.Sprintf("%s:%g", cfg.Loras[i].Path, cfg.Loras[i].Weight))
	}
	return args
}
+59
View File
@@ -0,0 +1,59 @@
---
name: genconfig_to_sdcli_args
kind: function
lang: go
domain: ml
version: "1.0.0"
purity: pure
signature: "func GenconfigToSdcliArgs(cfg GenerationConfig) []string"
description: "Convierte un GenerationConfig en argumentos CLI para stable-diffusion.cpp. Espejo Go de genconfig_to_sdcpp_args_py_ml. Loras se emiten como pares repetidos --lora path:weight. Sampler traducido via samplerMap canonico."
tags: [ml, stable-diffusion, cli, args, generation, pure]
uses_functions: []
uses_types: [generation_config_go_ml]
returns: []
returns_optional: false
error_type: ""
imports: ["fmt", "strconv"]
params:
- name: cfg
desc: "Parametros completos de generacion de imagen. Sampler debe ser uno de los valores de SamplerName. Model.Path se emite como --model si no esta vacio."
output: "Slice de strings listos para pasar a exec.Command o similar. Incluye --prompt, --seed, --steps, --cfg-scale, --width, --height, --sampling-method, opcionales --negative-prompt / --model / --clip-skip, y pares --lora path:weight por cada LoraRef."
tested: true
tests:
- "config basico sin loras ni clip_skip"
- "loras se emiten como pares path:weight"
- "sampler dpm++2m se traduce a dpmpp2m"
- "negative_prompt vacio no genera flag"
test_file_path: "functions/ml/genconfig_test.go"
file_path: "functions/ml/genconfig_to_sdcli_args.go"
---
## Ejemplo
```go
clip := 2
cfg := ml.GenerationConfig{
Prompt: "a cat",
Seed: 42,
Steps: 20,
CfgScale: 7.5,
Sampler: "dpm++2m",
Width: 512,
Height: 512,
Model: ml.ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16", Path: "/models/v1-5.safetensors"},
Loras: []ml.LoraRef{{Path: "/loras/detail.safetensors", Weight: 0.8}},
ClipSkip: &clip,
}
args := ml.GenconfigToSdcliArgs(cfg)
// args == ["--prompt","a cat","--seed","42","--steps","20",
// "--cfg-scale","7.5","--width","512","--height","512",
// "--sampling-method","dpmpp2m","--model","/models/v1-5.safetensors",
// "--clip-skip","2","--lora","/loras/detail.safetensors:0.8"]
```
## Notas
- `samplerMap` traduce nombres canonicos del dominio ml a los identificadores que acepta stable-diffusion.cpp. Si el sampler no esta en el mapa se usa el valor literal.
- El flag de modelo (`--model`) solo se emite si `cfg.Model.Path != ""`.
- `%g` en `fmt.Sprintf` para el peso de la lora elimina ceros insignificantes: `0.800000` → `0.8`.
- Funcion pura: misma entrada, misma salida. Sin I/O ni estado global.
+18
View File
@@ -0,0 +1,18 @@
package ml
// GenerationConfig holds the parameters of an image generation request.
// It is described as the JSON-compatible mirror of GenerationConfig_py_ml: the
// json tags use the snake_case field names of the Python model so that a
// config can round-trip between both languages.
//
// NegativePrompt, Loras and ClipSkip are tagged omitempty and are therefore
// left out of the JSON when empty or nil.
type GenerationConfig struct {
Prompt string `json:"prompt"`
NegativePrompt string `json:"negative_prompt,omitempty"`
Seed int64 `json:"seed"`
Steps int `json:"steps"`
CfgScale float64 `json:"cfg_scale"`
Sampler string `json:"sampler"`
Width int `json:"width"`
Height int `json:"height"`
Model ModelRef `json:"model"`
Loras []LoraRef `json:"loras,omitempty"`
ClipSkip *int `json:"clip_skip,omitempty"`
}
+12
View File
@@ -0,0 +1,12 @@
package ml
// ImageGenResult holds a generated image together with metadata about the run.
// ImageBytes carries the raw image bytes (PNG, per the original note) and is
// excluded from JSON (json:"-"); the original note says it travels over a
// separate binary channel.
type ImageGenResult struct {
ImageBytes []byte `json:"-"`
Format string `json:"format"`
Meta map[string]any `json:"meta"`
DurationMs int64 `json:"duration_ms"`
VramPeakMb *int `json:"vram_peak_mb,omitempty"` // optional: omitted from JSON when nil
}
+9
View File
@@ -0,0 +1,9 @@
package ml
import "context"
// ImageGenerator is the contract implemented by any image generation backend.
// Implementations may be local (ComfyUI, diffusers) or remote (API).
type ImageGenerator interface {
// Generate produces an image for cfg, returning the result or an error.
Generate(ctx context.Context, cfg GenerationConfig) (ImageGenResult, error)
}
+8
View File
@@ -0,0 +1,8 @@
package ml
// LoraRef references a LoRA adapter together with its merge weight and an
// optional scale.
type LoraRef struct {
Path string `json:"path"`
Weight float64 `json:"weight"`
Scale *float64 `json:"scale,omitempty"` // optional: omitted from JSON when nil
}
+10
View File
@@ -0,0 +1,10 @@
package ml
// ModelRef identifies an image generation model by name, type, quantization
// and an optional path on disk.
type ModelRef struct {
Name string `json:"name"`
ModelType string `json:"model_type"` // sd15|sdxl|flux_dev|...
Quantization string `json:"quantization"` // fp16|q8_0|...
Path string `json:"path,omitempty"` // omitted from JSON when empty
}
+78
View File
@@ -0,0 +1,78 @@
package ml
import (
"regexp"
"strconv"
)
// SdcliProgress is the progress state parsed from one stderr line of sd-cli.
type SdcliProgress struct {
	Step       int     `json:"step"`
	TotalSteps int     `json:"total_steps"`
	ItPerSec   float64 `json:"it_per_sec"`
	Percent    float64 `json:"percent"`
}

// Progress line patterns, tried in this order by SdcliParseProgress.
var (
	// reProgress1 matches the compact format: " 3/30 | 0.84it/s | 10%".
	reProgress1 = regexp.MustCompile(`\s*(\d+)\s*/\s*(\d+)\s*\|[^|]*?([\d.]+)\s*it/s[^|]*?\|\s*([\d.]+)\s*%`)

	// reProgress2 matches the verbose format: "sampling: step 3 of 30 (0.84 it/s)".
	reProgress2 = regexp.MustCompile(`step\s+(\d+)\s+of\s+(\d+)\s*\(\s*([\d.]+)\s*it/s\)`)

	// reProgress3 matches the minimal format: "step 3/30" or "progress: 3/30".
	// The "progress" prefix is optional, so any "N/M" pair in a line matches.
	reProgress3 = regexp.MustCompile(`(?:progress[:\s]+)?(\d+)\s*/\s*(\d+)`)
)

// SdcliParseProgress extracts progress information from one stderr line of
// stable-diffusion.cpp / sd-cli.
//
// Three formats are tried in order: compact (step, total, speed and percentage
// all read from the line), verbose (step, total and speed read; percentage
// computed as 100*step/total) and minimal (step and total read; speed reported
// as 0, percentage computed). The computed-percentage formats require a total
// greater than zero. A format whose numbers fail to parse is skipped and the
// next one is tried.
//
// It returns the parsed progress and true, or the zero SdcliProgress and false
// when no format applies. The function performs no I/O and keeps no state.
func SdcliParseProgress(line string) (SdcliProgress, bool) {
	// counts parses the step/total capture groups shared by every format.
	counts := func(m []string) (int, int, bool) {
		step, stepErr := strconv.Atoi(m[1])
		total, totalErr := strconv.Atoi(m[2])
		return step, total, stepErr == nil && totalErr == nil
	}
	percentOf := func(step, total int) float64 {
		return 100.0 * float64(step) / float64(total)
	}

	// Compact: the percentage comes straight from the line.
	if m := reProgress1.FindStringSubmatch(line); m != nil {
		step, total, countsOK := counts(m)
		rate, rateErr := strconv.ParseFloat(m[3], 64)
		pct, pctErr := strconv.ParseFloat(m[4], 64)
		if countsOK && rateErr == nil && pctErr == nil {
			return SdcliProgress{Step: step, TotalSteps: total, ItPerSec: rate, Percent: pct}, true
		}
	}

	// Verbose: speed is present, the percentage is derived.
	if m := reProgress2.FindStringSubmatch(line); m != nil {
		step, total, countsOK := counts(m)
		rate, rateErr := strconv.ParseFloat(m[3], 64)
		if countsOK && rateErr == nil && total > 0 {
			return SdcliProgress{Step: step, TotalSteps: total, ItPerSec: rate, Percent: percentOf(step, total)}, true
		}
	}

	// Minimal: no speed available.
	if m := reProgress3.FindStringSubmatch(line); m != nil {
		step, total, countsOK := counts(m)
		if countsOK && total > 0 {
			return SdcliProgress{Step: step, TotalSteps: total, ItPerSec: 0, Percent: percentOf(step, total)}, true
		}
	}

	return SdcliProgress{}, false
}
+50
View File
@@ -0,0 +1,50 @@
---
name: sdcli_parse_progress
kind: function
lang: go
domain: ml
version: "1.0.0"
purity: pure
signature: "func SdcliParseProgress(line string) (SdcliProgress, bool)"
description: "Parsea una linea de stderr de stable-diffusion.cpp / sd-cli y extrae el estado de progreso. Soporta el formato compacto '3/30 | 0.84it/s | 10%', el formato verbose 'sampling: step 3 of 30 (0.84 it/s)', y el formato minimal 'progress: 3/30'. Retorna (zero, false) si la linea no contiene informacion de progreso reconocible."
tags: [ml, stable-diffusion, sdcli, progress, parser, stderr, pure]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["regexp", "strconv"]
params:
- name: line
desc: "Una linea de stderr emitida por sd-cli / stable-diffusion.cpp durante la fase de sampling. Puede contener espacios al inicio o final."
output: "Par (SdcliProgress, bool). bool=true si se reconocio un patron de progreso; SdcliProgress contiene Step (paso actual), TotalSteps (pasos totales), ItPerSec (iteraciones por segundo, 0 si no disponible) y Percent (porcentaje 0-100 calculado o leido de la linea). bool=false y struct zero si la linea no contiene progreso."
tested: true
tests:
- "formato estandar compacto step/total/itpersec/percent"
- "linea sin patron retorna false"
- "formato sampling verbose con velocidad"
file_path: "functions/ml/sdcli_parse_progress.go"
test_file_path: "functions/ml/sdcli_parse_progress_test.go"
---
## Ejemplo
```go
p, ok := ml.SdcliParseProgress(" 3/30 | 0.84it/s | 10%")
// ok = true
// p = SdcliProgress{Step:3, TotalSteps:30, ItPerSec:0.84, Percent:10.0}
p2, ok2 := ml.SdcliParseProgress("sampling: step 15 of 30 (1.2 it/s)")
// ok2 = true
// p2 = SdcliProgress{Step:15, TotalSteps:30, ItPerSec:1.2, Percent:50.0}
_, ok3 := ml.SdcliParseProgress("loading model...")
// ok3 = false
```
## Notas
- Regexps precompiladas como vars de paquete (se compilan una sola vez al init del paquete).
- Tolerante a variaciones de espaciado gracias a `\s*` en los patrones.
- El campo `Percent` en los formatos verbose y minimal se calcula como `100 * step / total` (no se lee de la linea porque esos formatos no lo emiten); en el formato compacto se lee directamente de la linea.
- Funcion pura: sin I/O, sin estado mutable, determinista.
+103
View File
@@ -0,0 +1,103 @@
package ml
import (
"math"
"testing"
)
// TestSdcliParseProgress_StandardFormat verifies that a compact
// "step/total | it/s | percent" line is parsed into every SdcliProgress field.
func TestSdcliParseProgress_StandardFormat(t *testing.T) {
	const tolerance = 1e-9

	progress, matched := SdcliParseProgress(" 3/30 | 0.84it/s | 10%")
	if !matched {
		t.Fatalf("expected match, got false")
	}

	if step := progress.Step; step != 3 {
		t.Errorf("Step: got %d, want 3", step)
	}
	if total := progress.TotalSteps; total != 30 {
		t.Errorf("TotalSteps: got %d, want 30", total)
	}
	if delta := math.Abs(progress.ItPerSec - 0.84); delta > tolerance {
		t.Errorf("ItPerSec: got %v, want 0.84", progress.ItPerSec)
	}
	if delta := math.Abs(progress.Percent - 10.0); delta > tolerance {
		t.Errorf("Percent: got %v, want 10.0", progress.Percent)
	}
}
// TestSdcliParseProgress_NoMatch verifies that lines without any progress
// pattern (including the empty line) are reported as not matching.
func TestSdcliParseProgress_NoMatch(t *testing.T) {
	for _, input := range []string{
		"loading model...",
		"",
		"error: out of memory",
		"clip model loaded",
		"generating image...",
	} {
		if _, matched := SdcliParseProgress(input); matched {
			t.Errorf("expected no match for %q, but got match", input)
		}
	}
}
// TestSdcliParseProgress_AltFormat covers the non-default inputs: the verbose
// "step N of M (x it/s)" line, the minimal "progress: N/M" line without
// speed, and the compact line with irregular spacing and a two-digit speed.
func TestSdcliParseProgress_AltFormat(t *testing.T) {
	t.Run("formato sampling verbose", func(t *testing.T) {
		line := "sampling: step 3 of 30 (0.84 it/s)"
		got, ok := SdcliParseProgress(line)
		if !ok {
			t.Fatalf("expected match, got false")
		}
		if got.Step != 3 {
			t.Errorf("Step: got %d, want 3", got.Step)
		}
		if got.TotalSteps != 30 {
			t.Errorf("TotalSteps: got %d, want 30", got.TotalSteps)
		}
		if math.Abs(got.ItPerSec-0.84) > 1e-9 {
			t.Errorf("ItPerSec: got %v, want 0.84", got.ItPerSec)
		}
		// The verbose line carries no percentage, so it is derived from step/total.
		expectedPct := 100.0 * 3.0 / 30.0
		if math.Abs(got.Percent-expectedPct) > 1e-6 {
			t.Errorf("Percent: got %v, want %v", got.Percent, expectedPct)
		}
	})

	t.Run("formato step/total sin velocidad", func(t *testing.T) {
		line := "progress: 15/20"
		got, ok := SdcliParseProgress(line)
		if !ok {
			t.Fatalf("expected match, got false")
		}
		if got.Step != 15 {
			t.Errorf("Step: got %d, want 15", got.Step)
		}
		if got.TotalSteps != 20 {
			t.Errorf("TotalSteps: got %d, want 20", got.TotalSteps)
		}
		// No speed is present in this format, so ItPerSec must stay at zero.
		if got.ItPerSec != 0 {
			t.Errorf("ItPerSec: got %v, want 0", got.ItPerSec)
		}
		expectedPct := 75.0
		if math.Abs(got.Percent-expectedPct) > 1e-6 {
			t.Errorf("Percent: got %v, want %v", got.Percent, expectedPct)
		}
	})

	t.Run("formato con espacios variables y mayor velocidad", func(t *testing.T) {
		line := "  20/30  |  12.50it/s  |  66%"
		got, ok := SdcliParseProgress(line)
		if !ok {
			t.Fatalf("expected match, got false")
		}
		if got.Step != 20 {
			t.Errorf("Step: got %d, want 20", got.Step)
		}
		if got.TotalSteps != 30 {
			t.Errorf("TotalSteps: got %d, want 30", got.TotalSteps)
		}
		if math.Abs(got.ItPerSec-12.5) > 1e-9 {
			t.Errorf("ItPerSec: got %v, want 12.5", got.ItPerSec)
		}
		// In the compact format the percentage is read verbatim from the
		// line (66), not recomputed from 20/30; pin that here.
		if math.Abs(got.Percent-66.0) > 1e-9 {
			t.Errorf("Percent: got %v, want 66", got.Percent)
		}
	})
}