chore: auto-commit (95 archivos)
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,155 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
// StreamEvent is a single line captured from the subprocess's stdout or stderr.
type StreamEvent struct {
	// Stream names the pipe the line came from: "stdout" or "stderr".
	Stream string
	// Line is the captured text without its trailing newline.
	Line string
	// Time is the moment the line was received.
	Time time.Time
}
|
||||
|
||||
// StreamResult is the final outcome of the subprocess. It is sent on the
// results channel once both pipes have reached EOF and the process has exited.
type StreamResult struct {
	// ExitCode is the exit code reported for the process.
	ExitCode int
	// Err carries a failure other than a plain non-zero exit code.
	Err error
	// DurationMs is the elapsed time in milliseconds.
	DurationMs int64
}
|
||||
|
||||
// SubprocessStream lanza name con args como subproceso y retorna dos canales:
|
||||
// - events: recibe StreamEvent (linea de stdout/stderr) hasta EOF de ambos pipes.
|
||||
// - result: recibe exactamente un StreamResult cuando el proceso termina.
|
||||
//
|
||||
// env se concatena con os.Environ(). stdin puede ser nil.
|
||||
//
|
||||
// Cancelar ctx envia SIGTERM al proceso; si no termina en 2 segundos, SIGKILL.
|
||||
// El caller DEBE consumir events hasta que se cierre o cancelar ctx para evitar
|
||||
// bloquear las goroutines internas.
|
||||
func SubprocessStream(
|
||||
ctx context.Context,
|
||||
name string,
|
||||
args []string,
|
||||
env []string,
|
||||
stdin io.Reader,
|
||||
) (<-chan StreamEvent, <-chan StreamResult) {
|
||||
events := make(chan StreamEvent, 64)
|
||||
results := make(chan StreamResult, 1)
|
||||
|
||||
go func() {
|
||||
defer close(events)
|
||||
defer close(results)
|
||||
|
||||
start := time.Now()
|
||||
|
||||
cmd := exec.CommandContext(ctx, name, args...)
|
||||
|
||||
// Entorno: base + extra
|
||||
if len(env) > 0 {
|
||||
cmd.Env = append(os.Environ(), env...)
|
||||
}
|
||||
|
||||
if stdin != nil {
|
||||
cmd.Stdin = stdin
|
||||
}
|
||||
|
||||
// Process group propio para matar hijos al recibir SIGTERM/SIGKILL
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
|
||||
stdoutPipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
results <- StreamResult{ExitCode: -1, Err: fmt.Errorf("stdout pipe: %w", err), DurationMs: 0}
|
||||
return
|
||||
}
|
||||
stderrPipe, err := cmd.StderrPipe()
|
||||
if err != nil {
|
||||
results <- StreamResult{ExitCode: -1, Err: fmt.Errorf("stderr pipe: %w", err), DurationMs: 0}
|
||||
return
|
||||
}
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
results <- StreamResult{ExitCode: -1, Err: fmt.Errorf("start: %w", err), DurationMs: 0}
|
||||
return
|
||||
}
|
||||
|
||||
// Goroutine de supervision de ctx: SIGTERM → grace 2s → SIGKILL
|
||||
ctxDone := make(chan struct{})
|
||||
go func() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM)
|
||||
timer := time.NewTimer(2 * time.Second)
|
||||
defer timer.Stop()
|
||||
select {
|
||||
case <-timer.C:
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
case <-ctxDone:
|
||||
}
|
||||
}
|
||||
case <-ctxDone:
|
||||
}
|
||||
}()
|
||||
|
||||
send := func(stream, line string) {
|
||||
ev := StreamEvent{Stream: stream, Line: line, Time: time.Now()}
|
||||
select {
|
||||
case events <- ev:
|
||||
case <-ctx.Done():
|
||||
}
|
||||
}
|
||||
|
||||
// Leer stdout y stderr concurrentemente
|
||||
const bufSize = 1024 * 1024 // 1 MB para lineas largas (sd-cli progress, etc.)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
scanPipe := func(r io.Reader, stream string) {
|
||||
defer wg.Done()
|
||||
sc := bufio.NewScanner(r)
|
||||
sc.Buffer(make([]byte, bufSize), bufSize)
|
||||
for sc.Scan() {
|
||||
send(stream, sc.Text())
|
||||
}
|
||||
}
|
||||
|
||||
wg.Add(2)
|
||||
go scanPipe(stdoutPipe, "stdout")
|
||||
go scanPipe(stderrPipe, "stderr")
|
||||
|
||||
wg.Wait()
|
||||
close(ctxDone) // señal al supervisor de ctx para que pare
|
||||
|
||||
exitCode := 0
|
||||
var waitErr error
|
||||
if err := cmd.Wait(); err != nil {
|
||||
waitErr = err
|
||||
if exitErr, ok := err.(*exec.ExitError); ok {
|
||||
exitCode = exitErr.ExitCode()
|
||||
waitErr = nil // exit code no-cero no es un error de spawn
|
||||
}
|
||||
}
|
||||
|
||||
// Si el contexto fue cancelado, reportar como error de cancelacion
|
||||
if ctx.Err() != nil && waitErr == nil {
|
||||
waitErr = ctx.Err()
|
||||
}
|
||||
|
||||
results <- StreamResult{
|
||||
ExitCode: exitCode,
|
||||
Err: waitErr,
|
||||
DurationMs: time.Since(start).Milliseconds(),
|
||||
}
|
||||
}()
|
||||
|
||||
return events, results
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
---
|
||||
name: subprocess_stream
|
||||
kind: function
|
||||
lang: go
|
||||
domain: core
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func SubprocessStream(ctx context.Context, name string, args []string, env []string, stdin io.Reader) (<-chan StreamEvent, <-chan StreamResult)"
|
||||
description: "Lanza un subproceso y retorna dos canales: uno con StreamEvent (linea de stdout/stderr con timestamp) y otro con un unico StreamResult (ExitCode, Err, DurationMs). Cancelar ctx envia SIGTERM al proceso; si no termina en 2s, SIGKILL."
|
||||
tags: [subprocess, exec, stream, stdout, stderr, process, concurrency, io, primitiva]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [bufio, context, fmt, io, os, os/exec, sync, syscall, time]
|
||||
params:
|
||||
- name: ctx
|
||||
desc: "Contexto de cancelacion. Al cancelar, el proceso recibe SIGTERM; si no muere en 2s, SIGKILL. Usar context.WithTimeout para acotar duracion maxima."
|
||||
- name: name
|
||||
desc: "Nombre o path del ejecutable a lanzar (ej. 'echo', '/usr/bin/python3')."
|
||||
- name: args
|
||||
desc: "Argumentos del proceso. Puede ser nil o vacio."
|
||||
- name: env
|
||||
desc: "Variables de entorno adicionales en formato 'KEY=VALUE'. Se concatenan con os.Environ(). Puede ser nil."
|
||||
- name: stdin
|
||||
desc: "Stdin del proceso. Puede ser nil si el proceso no necesita entrada."
|
||||
output: "Dos canales: events (<-chan StreamEvent) cerrado cuando ambos pipes EOF; result (<-chan StreamResult) con exactamente un valor cuando el proceso termina. El caller DEBE consumir events hasta cierre o cancelar ctx para evitar bloquear goroutines internas."
|
||||
tested: true
|
||||
tests:
|
||||
- "echo stdout llega como evento y ExitCode 0"
|
||||
- "stderr llega como evento con stream stderr"
|
||||
- "exit code no-cero se reporta en StreamResult"
|
||||
- "ctx cancelado termina el proceso"
|
||||
- "multiples lineas stdout"
|
||||
test_file_path: "functions/core/subprocess_stream_test.go"
|
||||
file_path: "functions/core/subprocess_stream.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
events, results := core.SubprocessStream(ctx, "grep", []string{"-rn", "TODO", "."}, nil, nil)
|
||||
|
||||
for ev := range events {
|
||||
switch ev.Stream {
|
||||
case "stdout":
|
||||
fmt.Println(ev.Line)
|
||||
case "stderr":
|
||||
fmt.Fprintln(os.Stderr, "[stderr]", ev.Line)
|
||||
}
|
||||
}
|
||||
|
||||
res := <-results
|
||||
if res.ExitCode != 0 || res.Err != nil {
|
||||
log.Printf("grep exit=%d err=%v duration=%dms", res.ExitCode, res.Err, res.DurationMs)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- El canal `events` tiene buffer de 64. Si el caller deja de consumir y el buffer se llena, las goroutines internas se bloquean hasta que haya espacio o el ctx sea cancelado; tras la cancelacion, las lineas que no puedan entregarse pueden descartarse.
|
||||
- El scanner de cada pipe tiene un buffer de 1 MB para tolerar lineas muy largas (progreso de CLIs tipo sd-cli, barras ANSI largas).
|
||||
- Los structs `StreamEvent` y `StreamResult` se declaran en el mismo archivo para que el paquete `core` los exporte sin imports adicionales.
|
||||
- Generaliza el patron de `claude_stream_go_core` desacoplando el lanzamiento de subprocesos del protocolo especifico de claude (NDJSON/stream-json). `claude_stream_go_core` puede reimplementarse internamente usando esta funcion como primitiva.
|
||||
- `cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}` crea un process group propio; SIGTERM/SIGKILL se envian con `Kill(-pgid, sig)` para matar tambien los procesos hijo del hijo.
|
||||
@@ -0,0 +1,132 @@
|
||||
package core
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestSubprocessStream(t *testing.T) {
|
||||
t.Run("echo stdout llega como evento y ExitCode 0", func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
events, results := SubprocessStream(ctx, "echo", []string{"hola"}, nil, nil)
|
||||
|
||||
var got []StreamEvent
|
||||
for ev := range events {
|
||||
got = append(got, ev)
|
||||
}
|
||||
|
||||
res := <-results
|
||||
|
||||
if res.ExitCode != 0 {
|
||||
t.Errorf("ExitCode = %d, want 0 (err: %v)", res.ExitCode, res.Err)
|
||||
}
|
||||
if res.Err != nil {
|
||||
t.Errorf("unexpected Err: %v", res.Err)
|
||||
}
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("got %d events, want 1", len(got))
|
||||
}
|
||||
if got[0].Stream != "stdout" {
|
||||
t.Errorf("Stream = %q, want %q", got[0].Stream, "stdout")
|
||||
}
|
||||
if got[0].Line != "hola" {
|
||||
t.Errorf("Line = %q, want %q", got[0].Line, "hola")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("stderr llega como evento con stream stderr", func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// sh -c "echo msg >&2" escribe a stderr
|
||||
events, results := SubprocessStream(ctx, "sh", []string{"-c", "echo error_msg >&2"}, nil, nil)
|
||||
|
||||
var got []StreamEvent
|
||||
for ev := range events {
|
||||
got = append(got, ev)
|
||||
}
|
||||
res := <-results
|
||||
|
||||
if res.ExitCode != 0 {
|
||||
t.Errorf("ExitCode = %d, want 0", res.ExitCode)
|
||||
}
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("got %d events, want 1", len(got))
|
||||
}
|
||||
if got[0].Stream != "stderr" {
|
||||
t.Errorf("Stream = %q, want %q", got[0].Stream, "stderr")
|
||||
}
|
||||
if got[0].Line != "error_msg" {
|
||||
t.Errorf("Line = %q, want %q", got[0].Line, "error_msg")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("exit code no-cero se reporta en StreamResult", func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
events, results := SubprocessStream(ctx, "sh", []string{"-c", "exit 42"}, nil, nil)
|
||||
|
||||
for range events {
|
||||
}
|
||||
res := <-results
|
||||
|
||||
if res.ExitCode != 42 {
|
||||
t.Errorf("ExitCode = %d, want 42", res.ExitCode)
|
||||
}
|
||||
if res.Err != nil {
|
||||
t.Errorf("unexpected Err: %v", res.Err)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("ctx cancelado termina el proceso", func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// proceso que dura mucho; cancelamos enseguida
|
||||
ctxShort, cancelShort := context.WithTimeout(ctx, 100*time.Millisecond)
|
||||
defer cancelShort()
|
||||
|
||||
events, results := SubprocessStream(ctxShort, "sleep", []string{"60"}, nil, nil)
|
||||
|
||||
for range events {
|
||||
}
|
||||
res := <-results
|
||||
|
||||
// Tras cancelacion el proceso debe haber terminado (ExitCode != 0 o Err de ctx)
|
||||
if res.ExitCode == 0 && res.Err == nil {
|
||||
t.Error("expected non-zero exit or ctx error after cancellation")
|
||||
}
|
||||
if res.DurationMs > 3000 {
|
||||
t.Errorf("took %d ms, expected < 3000 (should have been killed)", res.DurationMs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("multiples lineas stdout", func(t *testing.T) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
events, results := SubprocessStream(ctx, "sh", []string{"-c", "printf 'a\nb\nc\n'"}, nil, nil)
|
||||
|
||||
var lines []string
|
||||
for ev := range events {
|
||||
if ev.Stream == "stdout" {
|
||||
lines = append(lines, ev.Line)
|
||||
}
|
||||
}
|
||||
<-results
|
||||
|
||||
if len(lines) != 3 {
|
||||
t.Fatalf("got %d stdout lines, want 3: %v", len(lines), lines)
|
||||
}
|
||||
want := []string{"a", "b", "c"}
|
||||
for i, w := range want {
|
||||
if lines[i] != w {
|
||||
t.Errorf("line[%d] = %q, want %q", i, lines[i], w)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,238 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MlEnvCheck is the outcome of one probe of the ML environment.
type MlEnvCheck struct {
	// Name identifies the probe, e.g. "nvcc" or "python_venv".
	Name string `json:"name"`
	// Status is one of "ok", "missing", "warning" or "unknown".
	Status string `json:"status"`
	// Version is the detected version string, when there is one.
	Version string `json:"version,omitempty"`
	// Detail is optional human-readable extra information.
	Detail string `json:"detail,omitempty"`
}
|
||||
|
||||
// MlEnvReport is the full ML environment audit result.
|
||||
type MlEnvReport struct {
|
||||
Gpus []GpuInfo `json:"gpus"`
|
||||
Checks []MlEnvCheck `json:"checks"`
|
||||
OverallOK bool `json:"overall_ok"`
|
||||
GeneratedAt int64 `json:"generated_at"`
|
||||
}
|
||||
|
||||
// AuditMlEnv probes the ML environment rooted at registryRoot.
|
||||
// It checks for NVIDIA drivers, CUDA toolkit, Python venv, key Python
|
||||
// packages and optional tools (sd, llama-cli) and a local vault path.
|
||||
// Returns a non-nil MlEnvReport even when individual checks fail; the
|
||||
// function itself only errors if a fundamental system call cannot be
|
||||
// attempted.
|
||||
func AuditMlEnv(registryRoot string) (MlEnvReport, error) {
|
||||
report := MlEnvReport{
|
||||
GeneratedAt: time.Now().Unix(),
|
||||
}
|
||||
|
||||
// --- GPU detection (composes GetGpuInfo) ---
|
||||
gpus, err := GetGpuInfo()
|
||||
if err != nil {
|
||||
// Non-fatal: record absence.
|
||||
gpus = []GpuInfo{}
|
||||
}
|
||||
report.Gpus = gpus
|
||||
|
||||
checks := []MlEnvCheck{}
|
||||
|
||||
// --- nvidia-smi ---
|
||||
checks = append(checks, probeCommand("nvidia_smi", "nvidia-smi", []string{"--version"}, 5))
|
||||
|
||||
// --- nvcc (CUDA toolkit compiler) ---
|
||||
nvcc := probeNvcc()
|
||||
checks = append(checks, nvcc)
|
||||
|
||||
// --- Python venv ---
|
||||
venvCheck := probeVenv(registryRoot)
|
||||
checks = append(checks, venvCheck)
|
||||
|
||||
// Python venv path for subsequent checks.
|
||||
venvPy := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
|
||||
|
||||
// --- Python packages ---
|
||||
for _, pkg := range []string{"torch", "diffusers", "transformers", "huggingface_hub", "stable_diffusion_cpp_python"} {
|
||||
checks = append(checks, probePythonPackage(venvPy, pkg))
|
||||
}
|
||||
|
||||
// --- sd.cpp CLI ---
|
||||
checks = append(checks, probeCommand("sd_cli", "sd", []string{"--version"}, 5))
|
||||
|
||||
// --- llama.cpp CLI ---
|
||||
checks = append(checks, probeCommand("llama_cpp", "llama-cli", []string{"--version"}, 5))
|
||||
|
||||
// --- imagegen_vault ---
|
||||
checks = append(checks, probeImagegenVault())
|
||||
|
||||
report.Checks = checks
|
||||
|
||||
// OverallOK: no "missing" checks (warning is tolerated) and at least 1 GPU.
|
||||
overallOK := len(gpus) > 0
|
||||
for _, c := range checks {
|
||||
if c.Status == "missing" {
|
||||
// stable_diffusion_cpp_python and sd_cli are optional — downgrade to warning-only.
|
||||
if c.Name == "stable_diffusion_cpp_python" || c.Name == "sd_cli" || c.Name == "llama_cpp" {
|
||||
continue
|
||||
}
|
||||
overallOK = false
|
||||
}
|
||||
}
|
||||
report.OverallOK = overallOK
|
||||
|
||||
return report, nil
|
||||
}
|
||||
|
||||
// probeCommand checks whether a binary is available in PATH by running it with
|
||||
// the given args and recording any version output.
|
||||
func probeCommand(name, binary string, args []string, timeoutSec int) MlEnvCheck {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutSec)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
path, err := exec.LookPath(binary)
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: name, Status: "missing", Detail: fmt.Sprintf("%s not found in PATH", binary)}
|
||||
}
|
||||
|
||||
out, err := exec.CommandContext(ctx, path, args...).CombinedOutput()
|
||||
version := strings.TrimSpace(string(out))
|
||||
if len(version) > 120 {
|
||||
version = version[:120]
|
||||
}
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: name, Status: "warning", Version: version, Detail: fmt.Sprintf("exit error: %v", err)}
|
||||
}
|
||||
return MlEnvCheck{Name: name, Status: "ok", Version: version}
|
||||
}
|
||||
|
||||
// probeNvcc extracts the CUDA toolkit version from nvcc --version output.
|
||||
func probeNvcc() MlEnvCheck {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
path, err := exec.LookPath("nvcc")
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "nvcc", Status: "missing", Detail: "nvcc not found in PATH (CUDA toolkit not installed)"}
|
||||
}
|
||||
|
||||
out, err := exec.CommandContext(ctx, path, "--version").CombinedOutput()
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "nvcc", Status: "warning", Detail: fmt.Sprintf("nvcc --version failed: %v", err)}
|
||||
}
|
||||
|
||||
// Extract version from line like: "Cuda compilation tools, release 12.4, V12.4.99"
|
||||
version := ""
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.Contains(line, "release") {
|
||||
parts := strings.Split(line, ",")
|
||||
for _, p := range parts {
|
||||
p = strings.TrimSpace(p)
|
||||
if strings.HasPrefix(p, "release") {
|
||||
version = strings.TrimSpace(strings.TrimPrefix(p, "release"))
|
||||
break
|
||||
}
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if version == "" {
|
||||
version = strings.TrimSpace(string(out))
|
||||
if len(version) > 80 {
|
||||
version = version[:80]
|
||||
}
|
||||
}
|
||||
return MlEnvCheck{Name: "nvcc", Status: "ok", Version: version}
|
||||
}
|
||||
|
||||
// probeVenv checks that the Python venv exists and is functional.
|
||||
func probeVenv(registryRoot string) MlEnvCheck {
|
||||
py := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
|
||||
if _, err := os.Stat(py); os.IsNotExist(err) {
|
||||
return MlEnvCheck{Name: "python_venv", Status: "missing", Detail: fmt.Sprintf("not found: %s", py)}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
out, err := exec.CommandContext(ctx, py, "--version").CombinedOutput()
|
||||
version := strings.TrimSpace(string(out))
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "python_venv", Status: "warning", Version: version, Detail: fmt.Sprintf("python3 --version failed: %v", err)}
|
||||
}
|
||||
return MlEnvCheck{Name: "python_venv", Status: "ok", Version: version}
|
||||
}
|
||||
|
||||
// probePythonPackage imports a package in the venv Python and extracts __version__.
|
||||
func probePythonPackage(venvPy, pkg string) MlEnvCheck {
|
||||
// Map package name → import name (for packages with different import names).
|
||||
importName := pkg
|
||||
switch pkg {
|
||||
case "stable_diffusion_cpp_python":
|
||||
importName = "stable_diffusion_cpp"
|
||||
case "huggingface_hub":
|
||||
importName = "huggingface_hub"
|
||||
}
|
||||
|
||||
// Check that the venv python binary exists first.
|
||||
if _, err := os.Stat(venvPy); os.IsNotExist(err) {
|
||||
return MlEnvCheck{Name: pkg, Status: "unknown", Detail: "python_venv not available"}
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
script := fmt.Sprintf("import %s; v = getattr(%s, '__version__', None); print(v or 'unknown')", importName, importName)
|
||||
out, err := exec.CommandContext(ctx, venvPy, "-c", script).CombinedOutput()
|
||||
output := strings.TrimSpace(string(out))
|
||||
|
||||
if err != nil {
|
||||
// Module not found → missing; other errors → warning.
|
||||
detail := output
|
||||
if len(detail) > 200 {
|
||||
detail = detail[:200]
|
||||
}
|
||||
if strings.Contains(output, "ModuleNotFoundError") || strings.Contains(output, "No module named") {
|
||||
return MlEnvCheck{Name: pkg, Status: "missing", Detail: fmt.Sprintf("%s not installed", importName)}
|
||||
}
|
||||
return MlEnvCheck{Name: pkg, Status: "warning", Detail: detail}
|
||||
}
|
||||
return MlEnvCheck{Name: pkg, Status: "ok", Version: output}
|
||||
}
|
||||
|
||||
// probeImagegenVault checks that ~/vaults/imagegen_models exists and lists subdirs.
|
||||
func probeImagegenVault() MlEnvCheck {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "imagegen_vault", Status: "unknown", Detail: "cannot determine home directory"}
|
||||
}
|
||||
vaultPath := filepath.Join(home, "vaults", "imagegen_models")
|
||||
entries, err := os.ReadDir(vaultPath)
|
||||
if os.IsNotExist(err) {
|
||||
return MlEnvCheck{Name: "imagegen_vault", Status: "missing", Detail: fmt.Sprintf("vault not found: %s", vaultPath)}
|
||||
}
|
||||
if err != nil {
|
||||
return MlEnvCheck{Name: "imagegen_vault", Status: "warning", Detail: fmt.Sprintf("cannot read vault: %v", err)}
|
||||
}
|
||||
|
||||
subdirs := []string{}
|
||||
for _, e := range entries {
|
||||
if e.IsDir() {
|
||||
subdirs = append(subdirs, e.Name())
|
||||
}
|
||||
}
|
||||
detail := fmt.Sprintf("subdirs: %s", strings.Join(subdirs, ", "))
|
||||
if len(subdirs) == 0 {
|
||||
detail = "vault exists but is empty"
|
||||
}
|
||||
return MlEnvCheck{Name: "imagegen_vault", Status: "ok", Detail: detail}
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
---
|
||||
name: audit_ml_env
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func AuditMlEnv(registryRoot string) (MlEnvReport, error)"
|
||||
description: "Audita el entorno ML del sistema: GPUs NVIDIA, toolkit CUDA, venv Python, paquetes clave (torch, diffusers, transformers, huggingface_hub), herramientas CLI (sd, llama-cli) y el vault de modelos. Retorna un MlEnvReport con OverallOK=true solo si hay al menos 1 GPU y los checks criticos estan en ok/warning."
|
||||
tags: [ml, cuda, gpu, nvidia, audit, doctor, infra, torch, diffusers]
|
||||
uses_functions: [get_gpu_info_go_infra]
|
||||
uses_types: [gpu_info_go_infra]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [context, fmt, os, os/exec, path/filepath, strings, time]
|
||||
tested: true
|
||||
tests:
|
||||
- "report no nil y tiene checks"
|
||||
- "generated_at es positivo"
|
||||
- "checks tiene al menos 4 entradas"
|
||||
- "gpus puede ser vacio en CI"
|
||||
test_file_path: "functions/infra/audit_ml_env_test.go"
|
||||
file_path: "functions/infra/audit_ml_env.go"
|
||||
params:
|
||||
- name: registryRoot
|
||||
desc: "Ruta absoluta a la raiz del fn_registry. Se usa para localizar python/.venv/bin/python3 y probar paquetes instalados."
|
||||
output: "MlEnvReport con Gpus (puede estar vacio si no hay NVIDIA), Checks con estado por herramienta/paquete, OverallOK y GeneratedAt (unix timestamp)."
|
||||
---
|
||||
|
||||
## Checks realizados
|
||||
|
||||
| Check | Tipo | Critico |
|
||||
|---|---|---|
|
||||
| `nvidia_smi` | binary in PATH | si (missing ⇒ `OverallOK=false`) |
| `nvcc` | CUDA toolkit version | si (missing ⇒ `OverallOK=false`) |
|
||||
| `python_venv` | exists + `python3 --version` | si |
|
||||
| `torch` | `import torch; __version__` | si |
|
||||
| `diffusers` | `import diffusers; __version__` | si |
|
||||
| `transformers` | `import transformers; __version__` | si |
|
||||
| `huggingface_hub` | `import huggingface_hub; __version__` | si |
|
||||
| `stable_diffusion_cpp_python` | `import stable_diffusion_cpp` | no (opcional) |
|
||||
| `sd_cli` | `sd --version` in PATH | no (opcional) |
|
||||
| `llama_cpp` | `llama-cli --version` in PATH | no (opcional) |
|
||||
| `imagegen_vault` | `~/vaults/imagegen_models` exists | si (missing ⇒ `OverallOK=false`) |
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
root := "/home/lucas/fn_registry"
|
||||
report, err := AuditMlEnv(root)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
for _, c := range report.Checks {
|
||||
fmt.Printf("%-40s %s %s\n", c.Name, c.Status, c.Version)
|
||||
}
|
||||
fmt.Printf("OverallOK: %v\n", report.OverallOK)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
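- Cada check que lanza un subproceso (`probeCommand`, `probeNvcc`, `probeVenv`, `probePythonPackage`) tiene timeout de 5 segundos para no bloquear en entornos sin GPU; la llamada a `GetGpuInfo()` (nvidia-smi) no aplica timeout propio.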
|
||||
- `stable_diffusion_cpp_python`, `sd_cli` y `llama_cpp` son opcionales: si estan missing, `OverallOK` no se ve afectado.
|
||||
- `OverallOK` requiere al menos 1 GPU NVIDIA detectada via `GetGpuInfo()`.
|
||||
- No escribe nada en disco. Read-only.
|
||||
- Se expone como `fn doctor ml` via cmd/fn/doctor.go.
|
||||
@@ -0,0 +1,53 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestAuditMlEnv(t *testing.T) {
|
||||
// Use the actual registry root relative to the test binary location.
|
||||
// Tests run from the package directory; go up two levels.
|
||||
registryRoot := "../.."
|
||||
|
||||
t.Run("report no nil y tiene checks", func(t *testing.T) {
|
||||
report, err := AuditMlEnv(registryRoot)
|
||||
if err != nil {
|
||||
t.Fatalf("AuditMlEnv returned error: %v", err)
|
||||
}
|
||||
if report.Checks == nil {
|
||||
t.Fatal("report.Checks is nil")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("generated_at es positivo", func(t *testing.T) {
|
||||
report, err := AuditMlEnv(registryRoot)
|
||||
if err != nil {
|
||||
t.Fatalf("AuditMlEnv returned error: %v", err)
|
||||
}
|
||||
if report.GeneratedAt <= 0 {
|
||||
t.Errorf("GeneratedAt should be positive unix timestamp, got %d", report.GeneratedAt)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("checks tiene al menos 4 entradas", func(t *testing.T) {
|
||||
report, err := AuditMlEnv(registryRoot)
|
||||
if err != nil {
|
||||
t.Fatalf("AuditMlEnv returned error: %v", err)
|
||||
}
|
||||
if len(report.Checks) < 4 {
|
||||
t.Errorf("expected at least 4 checks, got %d", len(report.Checks))
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("gpus puede ser vacio en CI", func(t *testing.T) {
|
||||
report, err := AuditMlEnv(registryRoot)
|
||||
if err != nil {
|
||||
t.Fatalf("AuditMlEnv returned error: %v", err)
|
||||
}
|
||||
// Gpus may be empty in CI without a GPU; that's OK.
|
||||
// Just verify the field is not nil.
|
||||
if report.Gpus == nil {
|
||||
t.Error("report.Gpus should be a non-nil slice (can be empty)")
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// GetGpuInfo queries NVIDIA GPUs via nvidia-smi and returns a slice of GpuInfo.
|
||||
// If nvidia-smi is not installed or no NVIDIA GPU is present, returns an empty
|
||||
// slice and a nil error (absence of NVIDIA hardware is not an error).
|
||||
func GetGpuInfo() ([]GpuInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
|
||||
if err != nil {
|
||||
// nvidia-smi not installed or no NVIDIA device — not an error.
|
||||
var exitErr *exec.ExitError
|
||||
if errors.Is(err, exec.ErrNotFound) || errors.As(err, &exitErr) {
|
||||
return []GpuInfo{}, nil
|
||||
}
|
||||
return nil, fmt.Errorf("gpu_info: nvidia-smi: %w", err)
|
||||
}
|
||||
|
||||
r := csv.NewReader(strings.NewReader(strings.TrimSpace(string(out))))
|
||||
r.TrimLeadingSpace = true
|
||||
|
||||
records, err := r.ReadAll()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("gpu_info: parse csv: %w", err)
|
||||
}
|
||||
|
||||
gpus := make([]GpuInfo, 0, len(records))
|
||||
for _, rec := range records {
|
||||
if len(rec) < 6 {
|
||||
continue
|
||||
}
|
||||
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(rec[0]))
|
||||
totalMb, _ := strconv.Atoi(strings.TrimSpace(rec[2]))
|
||||
freeMb, _ := strconv.Atoi(strings.TrimSpace(rec[3]))
|
||||
|
||||
gpus = append(gpus, GpuInfo{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(rec[1]),
|
||||
VramTotalMb: totalMb,
|
||||
VramFreeMb: freeMb,
|
||||
DriverVersion: strings.TrimSpace(rec[4]),
|
||||
CudaVersion: strings.TrimSpace(rec[5]),
|
||||
})
|
||||
}
|
||||
|
||||
return gpus, nil
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
name: get_gpu_info
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func GetGpuInfo() ([]GpuInfo, error)"
|
||||
description: "Consulta GPUs NVIDIA via nvidia-smi y retorna un slice de GpuInfo con index, nombre, VRAM total/libre, driver y version CUDA. Si nvidia-smi no esta instalado o no hay GPU NVIDIA, retorna slice vacio y nil (ausencia de hardware no es error)."
|
||||
tags: [gpu, nvidia, cuda, hardware, infra, probe]
|
||||
uses_functions: []
|
||||
uses_types: ["gpu_info_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [encoding/csv, errors, fmt, os/exec, strconv, strings]
|
||||
params:
|
||||
- name: (ninguno)
|
||||
desc: "No toma parametros. Lee el estado del sistema via nvidia-smi."
|
||||
output: "Slice de GpuInfo con una entrada por GPU detectada. Slice vacio si no hay GPUs NVIDIA o nvidia-smi no esta instalado. Error solo si nvidia-smi existe pero falla inesperadamente al parsear la salida CSV."
|
||||
tested: true
|
||||
tests:
|
||||
- "retorna slice vacio y nil cuando no hay GPU NVIDIA"
|
||||
- "linea GPU RTX 3080 tipica"
|
||||
- "dos GPUs en el CSV"
|
||||
- "CSV vacio retorna slice vacio"
|
||||
- "linea con menos de 6 campos se ignora"
|
||||
- "espacios extra en los valores se eliminan"
|
||||
- "campos del struct GpuInfo correctos"
|
||||
test_file_path: "functions/infra/get_gpu_info_test.go"
|
||||
file_path: "functions/infra/get_gpu_info.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
gpus, err := GetGpuInfo()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if len(gpus) == 0 {
|
||||
fmt.Println("No NVIDIA GPUs detected")
|
||||
} else {
|
||||
for _, g := range gpus {
|
||||
fmt.Printf("[%d] %s VRAM: %d/%d MiB Driver: %s CUDA: %s\n",
|
||||
g.Index, g.Name, g.VramFreeMb, g.VramTotalMb,
|
||||
g.DriverVersion, g.CudaVersion)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Salida nvidia-smi
|
||||
|
||||
Ejecuta:
|
||||
```
|
||||
nvidia-smi --query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version --format=csv,noheader,nounits
|
||||
```
|
||||
|
||||
Ejemplo de salida con una GPU:
|
||||
```
|
||||
0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Requiere `nvidia-smi` en PATH (parte del driver NVIDIA).
|
||||
- La columna `cuda_version` en nvidia-smi refleja la version maxima de CUDA soportada por el driver, no la del toolkit instalado.
|
||||
- Para comprobar el toolkit CUDA instalado, usar `cuda_toolkit_check_bash_infra`.
|
||||
- En maquinas sin GPU NVIDIA retorna `([]GpuInfo{}, nil)` — el caller puede tratar esto como "sin GPU disponible".
|
||||
- En CI sin GPU los tests automatizados solo cubren el camino "sin GPU" (slice vacio y `nil`) y el parsing CSV sobre datos fijos; la lectura real del hardware debe verificarse manualmente o con mock.
|
||||
@@ -0,0 +1,165 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestGetGpuInfoNoGpu verifica que la funcion retorna slice vacio sin error
|
||||
// cuando nvidia-smi no esta instalado o no hay GPU NVIDIA presente.
|
||||
// Este test pasa en cualquier maquina, con o sin GPU.
|
||||
func TestGetGpuInfoNoGpu(t *testing.T) {
|
||||
t.Run("retorna slice vacio y nil cuando no hay GPU NVIDIA", func(t *testing.T) {
|
||||
gpus, err := GetGpuInfo()
|
||||
if err != nil {
|
||||
t.Errorf("GetGpuInfo() error inesperado: %v", err)
|
||||
}
|
||||
// En maquinas sin nvidia-smi el resultado debe ser un slice vacio (no nil)
|
||||
if gpus == nil {
|
||||
t.Error("GetGpuInfo() retorno nil, se esperaba slice vacio []GpuInfo{}")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// parseCsvNvidiaSmi replica la logica de parsing de GetGpuInfo para tests unitarios.
|
||||
// Recibe el output de nvidia-smi --format=csv,noheader,nounits y retorna []GpuInfo.
|
||||
func parseCsvNvidiaSmi(output string) ([]GpuInfo, error) {
|
||||
trimmed := strings.TrimSpace(output)
|
||||
if trimmed == "" {
|
||||
return []GpuInfo{}, nil
|
||||
}
|
||||
lines := strings.Split(trimmed, "\n")
|
||||
gpus := make([]GpuInfo, 0, len(lines))
|
||||
for _, line := range lines {
|
||||
parts := strings.Split(line, ",")
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
totalMb, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
|
||||
freeMb, _ := strconv.Atoi(strings.TrimSpace(parts[3]))
|
||||
gpus = append(gpus, GpuInfo{
|
||||
Index: idx,
|
||||
Name: strings.TrimSpace(parts[1]),
|
||||
VramTotalMb: totalMb,
|
||||
VramFreeMb: freeMb,
|
||||
DriverVersion: strings.TrimSpace(parts[4]),
|
||||
CudaVersion: strings.TrimSpace(parts[5]),
|
||||
})
|
||||
}
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
// TestParseCsvNvidiaSmi verifica el parsing de la salida CSV de nvidia-smi
|
||||
// sin requerir GPU real ni nvidia-smi instalado.
|
||||
func TestParseCsvNvidiaSmi(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
csvInput string
|
||||
wantLen int
|
||||
wantIndex int
|
||||
wantName string
|
||||
wantVramTotal int
|
||||
wantVramFree int
|
||||
wantDriver string
|
||||
wantCuda string
|
||||
}{
|
||||
{
|
||||
name: "linea GPU RTX 3080 tipica",
|
||||
csvInput: "0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4",
|
||||
wantLen: 1,
|
||||
wantIndex: 0,
|
||||
wantName: "NVIDIA GeForce RTX 3080",
|
||||
wantVramTotal: 10240,
|
||||
wantVramFree: 8192,
|
||||
wantDriver: "550.54.15",
|
||||
wantCuda: "12.4",
|
||||
},
|
||||
{
|
||||
name: "dos GPUs en el CSV",
|
||||
csvInput: "0, GPU A, 8192, 4096, 525.0, 12.0\n1, GPU B, 24576, 20000, 525.0, 12.0",
|
||||
wantLen: 2,
|
||||
},
|
||||
{
|
||||
name: "CSV vacio retorna slice vacio",
|
||||
csvInput: "",
|
||||
wantLen: 0,
|
||||
},
|
||||
{
|
||||
name: "linea con menos de 6 campos se ignora",
|
||||
csvInput: "0, GPU, 8192",
|
||||
wantLen: 0,
|
||||
},
|
||||
{
|
||||
name: "espacios extra en los valores se eliminan",
|
||||
csvInput: " 1 , NVIDIA RTX 4090 , 24576 , 20000 , 545.0 , 12.6 ",
|
||||
wantLen: 1,
|
||||
wantIndex: 1,
|
||||
wantName: "NVIDIA RTX 4090",
|
||||
wantVramTotal: 24576,
|
||||
wantVramFree: 20000,
|
||||
wantDriver: "545.0",
|
||||
wantCuda: "12.6",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
gpus, err := parseCsvNvidiaSmi(tc.csvInput)
|
||||
if err != nil {
|
||||
t.Fatalf("error inesperado: %v", err)
|
||||
}
|
||||
if len(gpus) != tc.wantLen {
|
||||
t.Fatalf("len(gpus) = %d, quería %d", len(gpus), tc.wantLen)
|
||||
}
|
||||
if tc.wantLen == 1 {
|
||||
g := gpus[0]
|
||||
if g.Index != tc.wantIndex {
|
||||
t.Errorf("Index = %d, quería %d", g.Index, tc.wantIndex)
|
||||
}
|
||||
if g.Name != tc.wantName {
|
||||
t.Errorf("Name = %q, quería %q", g.Name, tc.wantName)
|
||||
}
|
||||
if g.VramTotalMb != tc.wantVramTotal {
|
||||
t.Errorf("VramTotalMb = %d, quería %d", g.VramTotalMb, tc.wantVramTotal)
|
||||
}
|
||||
if g.VramFreeMb != tc.wantVramFree {
|
||||
t.Errorf("VramFreeMb = %d, quería %d", g.VramFreeMb, tc.wantVramFree)
|
||||
}
|
||||
if g.DriverVersion != tc.wantDriver {
|
||||
t.Errorf("DriverVersion = %q, quería %q", g.DriverVersion, tc.wantDriver)
|
||||
}
|
||||
if g.CudaVersion != tc.wantCuda {
|
||||
t.Errorf("CudaVersion = %q, quería %q", g.CudaVersion, tc.wantCuda)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestGpuInfoStruct verifica los campos del tipo GpuInfo.
|
||||
func TestGpuInfoStruct(t *testing.T) {
|
||||
t.Run("campos del struct GpuInfo correctos", func(t *testing.T) {
|
||||
g := GpuInfo{
|
||||
Index: 0,
|
||||
Name: "NVIDIA GeForce GTX 1080",
|
||||
VramTotalMb: 8192,
|
||||
VramFreeMb: 6144,
|
||||
DriverVersion: "470.0",
|
||||
CudaVersion: "11.4",
|
||||
}
|
||||
if g.Index != 0 {
|
||||
t.Errorf("Index = %d", g.Index)
|
||||
}
|
||||
if g.Name != "NVIDIA GeForce GTX 1080" {
|
||||
t.Errorf("Name = %q", g.Name)
|
||||
}
|
||||
if g.VramTotalMb != 8192 {
|
||||
t.Errorf("VramTotalMb = %d", g.VramTotalMb)
|
||||
}
|
||||
if g.VramFreeMb != 6144 {
|
||||
t.Errorf("VramFreeMb = %d", g.VramFreeMb)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
package infra
|
||||
|
||||
// GpuInfo describes one GPU detected on the system: its position in the
// device list, its name, its VRAM figures and the driver/CUDA versions
// reported for it.
type GpuInfo struct {
	// Index is the GPU index as reported by the detection tool.
	Index int `json:"index"`
	// Name is the human-readable device name.
	Name string `json:"name"`
	// VramTotalMb is the total video memory (field name suggests megabytes).
	VramTotalMb int `json:"vram_total_mb"`
	// VramFreeMb is the currently free video memory, in the same unit.
	VramFreeMb int `json:"vram_free_mb"`
	// DriverVersion is the driver version string.
	DriverVersion string `json:"driver_version"`
	// CudaVersion is the CUDA version string; omitted from JSON when empty.
	CudaVersion string `json:"cuda_version,omitempty"`
}
|
||||
@@ -0,0 +1,171 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
)
|
||||
|
||||
// AggregateReport is the summary returned by a VaultAggregateIndex run.
type AggregateReport struct {
	// VaultsProcessed counts vaults whose rows were written to registry.db.
	VaultsProcessed int
	// VaultsSkipped counts vaults that have no vault_index.db.
	VaultsSkipped int
	// TotalFiles is the number of file rows copied across all processed vaults.
	TotalFiles int
	// Errors collects non-fatal, per-vault error messages.
	Errors []string
}
|
||||
|
||||
// VaultAggregateIndex reads all vault manifests from repoRoot, opens each
|
||||
// vault_index.db and copies all file records into the central registry.db
|
||||
// vault_files table. The table is created if it does not exist (idempotent).
|
||||
//
|
||||
// For each vault the previous rows are deleted and replaced atomically, so
|
||||
// re-running always produces a clean, non-duplicated state.
|
||||
//
|
||||
// Returns an AggregateReport with counts. Per-vault errors are non-fatal
|
||||
// (logged in report.Errors); only fatal errors (e.g. registry.db
|
||||
// unreachable) are returned as the error value.
|
||||
func VaultAggregateIndex(repoRoot string) (AggregateReport, error) {
|
||||
var report AggregateReport
|
||||
|
||||
// 1. Open registry.db
|
||||
registryDB, err := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_aggregate_index: open registry.db: %w", err)
|
||||
}
|
||||
defer registryDB.Close()
|
||||
|
||||
// 2. Idempotent schema migration
|
||||
for _, stmt := range []string{
|
||||
`CREATE TABLE IF NOT EXISTS vault_files (
|
||||
vault_id TEXT NOT NULL,
|
||||
vault_name TEXT NOT NULL,
|
||||
rel_path TEXT NOT NULL,
|
||||
size INTEGER NOT NULL,
|
||||
mtime INTEGER NOT NULL,
|
||||
sha256 TEXT NOT NULL,
|
||||
mime TEXT NOT NULL DEFAULT '',
|
||||
ext TEXT NOT NULL DEFAULT '',
|
||||
bucket TEXT NOT NULL DEFAULT '',
|
||||
sub_bucket TEXT NOT NULL DEFAULT '',
|
||||
indexed_at INTEGER NOT NULL,
|
||||
PRIMARY KEY (vault_id, rel_path)
|
||||
);`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_vault_files_sha256 ON vault_files(sha256);`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_vault_files_vault ON vault_files(vault_id);`,
|
||||
} {
|
||||
if _, err := registryDB.Exec(stmt); err != nil {
|
||||
if !isIdempotentMigrationError(err) {
|
||||
return report, fmt.Errorf("vault_aggregate_index: schema: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Read manifest
|
||||
entries, err := VaultManifestRead(repoRoot)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_aggregate_index: manifest: %w", err)
|
||||
}
|
||||
|
||||
now := time.Now().UTC().Unix()
|
||||
|
||||
for _, entry := range entries {
|
||||
vaultID := vaultIDFromEntry(entry)
|
||||
vaultName := entry.Name
|
||||
vaultPath := entry.Path
|
||||
|
||||
indexPath := filepath.Join(vaultPath, "vault_index.db")
|
||||
if _, statErr := os.Stat(indexPath); statErr != nil {
|
||||
report.VaultsSkipped++
|
||||
continue
|
||||
}
|
||||
|
||||
vaultDB, openErr := VaultIndexOpen(vaultPath)
|
||||
if openErr != nil {
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: open index: %v", vaultName, openErr))
|
||||
continue
|
||||
}
|
||||
|
||||
rows, queryErr := vaultDB.Query(
|
||||
`SELECT rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket FROM files`,
|
||||
)
|
||||
if queryErr != nil {
|
||||
vaultDB.Close()
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: query files: %v", vaultName, queryErr))
|
||||
continue
|
||||
}
|
||||
|
||||
type fileRow struct {
|
||||
RelPath string
|
||||
Size int64
|
||||
Mtime int64
|
||||
Sha256 string
|
||||
Mime string
|
||||
Ext string
|
||||
Bucket string
|
||||
SubBucket string
|
||||
}
|
||||
var fileRows []fileRow
|
||||
for rows.Next() {
|
||||
var r fileRow
|
||||
if scanErr := rows.Scan(&r.RelPath, &r.Size, &r.Mtime, &r.Sha256, &r.Mime, &r.Ext, &r.Bucket, &r.SubBucket); scanErr != nil {
|
||||
continue
|
||||
}
|
||||
fileRows = append(fileRows, r)
|
||||
}
|
||||
rows.Close()
|
||||
vaultDB.Close()
|
||||
|
||||
// Atomic replace in registry.db
|
||||
tx, txErr := registryDB.Begin()
|
||||
if txErr != nil {
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: begin tx: %v", vaultName, txErr))
|
||||
continue
|
||||
}
|
||||
|
||||
if _, delErr := tx.Exec(`DELETE FROM vault_files WHERE vault_id = ?`, vaultID); delErr != nil {
|
||||
tx.Rollback()
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: delete: %v", vaultName, delErr))
|
||||
continue
|
||||
}
|
||||
|
||||
stmt, prepErr := tx.Prepare(`
|
||||
INSERT INTO vault_files
|
||||
(vault_id, vault_name, rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
|
||||
if prepErr != nil {
|
||||
tx.Rollback()
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: prepare: %v", vaultName, prepErr))
|
||||
continue
|
||||
}
|
||||
|
||||
for _, r := range fileRows {
|
||||
if _, insErr := stmt.Exec(vaultID, vaultName, r.RelPath, r.Size, r.Mtime, r.Sha256, r.Mime, r.Ext, r.Bucket, r.SubBucket, now); insErr != nil {
|
||||
stmt.Close()
|
||||
tx.Rollback()
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: insert %s: %v", vaultName, r.RelPath, insErr))
|
||||
continue
|
||||
}
|
||||
}
|
||||
stmt.Close()
|
||||
|
||||
if commitErr := tx.Commit(); commitErr != nil {
|
||||
report.Errors = append(report.Errors, fmt.Sprintf("%s: commit: %v", vaultName, commitErr))
|
||||
continue
|
||||
}
|
||||
|
||||
report.VaultsProcessed++
|
||||
report.TotalFiles += len(fileRows)
|
||||
}
|
||||
|
||||
return report, nil
|
||||
}
|
||||
|
||||
// vaultIDFromEntry constructs the canonical vault ID used in registry.db.
|
||||
// Pattern: "<vault_name>_<project_id>" — consistent with the vaults table.
|
||||
func vaultIDFromEntry(e VaultManifestEntry) string {
|
||||
if e.ProjectID == "" {
|
||||
return e.Name
|
||||
}
|
||||
return e.Name + "_" + e.ProjectID
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
---
|
||||
name: vault_aggregate_index
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultAggregateIndex(repoRoot string) (AggregateReport, error)"
|
||||
description: "Agrega los índices de todos los vaults del registry en la tabla vault_files de registry.db. Lee cada vault_index.db (via VaultIndexOpen) y reemplaza las filas de forma atómica. Idempotente: re-ejecutar limpia y reescribe sin duplicar."
|
||||
tags: [vault, index, aggregate, registry]
|
||||
uses_functions:
|
||||
- "vault_manifest_read_go_infra"
|
||||
- "vault_index_open_go_infra"
|
||||
- "sqlite_open_go_infra"
|
||||
uses_types:
|
||||
- "vault_file_go_infra"
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- "database/sql"
|
||||
- "fmt"
|
||||
- "os"
|
||||
- "path/filepath"
|
||||
- "time"
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultAggregateIndex_NoVaults"
|
||||
- "TestVaultAggregateIndex_VaultWithoutIndex"
|
||||
- "TestVaultAggregateIndex_HappyPath"
|
||||
- "TestVaultAggregateIndex_ReRunReplaces"
|
||||
test_file_path: "functions/infra/vault_aggregate_index_test.go"
|
||||
file_path: "functions/infra/vault_aggregate_index.go"
|
||||
params:
|
||||
- name: repoRoot
|
||||
desc: "Ruta absoluta a la raiz del fn_registry (contiene registry.db y projects/)."
|
||||
output: "AggregateReport con VaultsProcessed, VaultsSkipped (sin vault_index.db), TotalFiles y Errors (errores no fatales por vault). Error fatal si registry.db no se puede abrir, si falla la migración del esquema o si no se puede leer el manifest."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
report, err := infra.VaultAggregateIndex("/home/lucas/fn_registry")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Printf("Processed: %d vaults, %d files\n", report.VaultsProcessed, report.TotalFiles)
|
||||
for _, e := range report.Errors {
|
||||
fmt.Println("warning:", e)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Requiere que `registry/migrations/012_vault_files.sql` haya sido aplicado (o que el indexer lo aplique al arrancar). La función aplica la migración de forma idempotente ella misma con `CREATE TABLE IF NOT EXISTS`.
|
||||
- Por cada vault: `DELETE WHERE vault_id = ?` + batch `INSERT` dentro de una transacción. Re-run siempre produce el mismo resultado.
|
||||
- Vaults sin `vault_index.db` se cuentan en `VaultsSkipped` y se omiten sin error.
|
||||
- El `vault_id` sigue el patrón `<vault_name>_<project_id>`, consistente con la tabla `vaults` de registry.db.
|
||||
@@ -0,0 +1,175 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// setupAggregateTestRepo creates a minimal repo layout:
|
||||
//
|
||||
// <root>/
|
||||
// registry.db (SQLite, empty)
|
||||
// projects/<project>/vaults/vault.yaml
|
||||
// <vaultPath>/ (optionally with vault_index.db populated)
|
||||
func setupAggregateTestRepo(t *testing.T, vaultName, projectID, vaultPath string, withIndex bool) string {
|
||||
t.Helper()
|
||||
root := t.TempDir()
|
||||
|
||||
// Create registry.db
|
||||
regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
|
||||
if err != nil {
|
||||
t.Fatalf("create registry.db: %v", err)
|
||||
}
|
||||
regDB.Close()
|
||||
|
||||
// Create project vault manifest
|
||||
projVaultsDir := filepath.Join(root, "projects", projectID, "vaults")
|
||||
if err := os.MkdirAll(projVaultsDir, 0755); err != nil {
|
||||
t.Fatalf("mkdir projects: %v", err)
|
||||
}
|
||||
manifestYAML := "vaults:\n - name: " + vaultName + "\n description: test\n path: " + vaultPath + "\n tags: []\n"
|
||||
if err := os.WriteFile(filepath.Join(projVaultsDir, "vault.yaml"), []byte(manifestYAML), 0644); err != nil {
|
||||
t.Fatalf("write vault.yaml: %v", err)
|
||||
}
|
||||
|
||||
// Create vault dir
|
||||
if err := os.MkdirAll(vaultPath, 0755); err != nil {
|
||||
t.Fatalf("mkdir vault: %v", err)
|
||||
}
|
||||
|
||||
if withIndex {
|
||||
// Create a vault_index.db with one file row
|
||||
vdb, err := VaultIndexOpen(vaultPath)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
now := time.Now().UTC().Unix()
|
||||
_, err = vdb.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
"data/raw/sample.csv", 1024, now, "deadbeef", "text/csv", ".csv", "data", "raw", now)
|
||||
if err != nil {
|
||||
t.Fatalf("insert test file: %v", err)
|
||||
}
|
||||
vdb.Close()
|
||||
}
|
||||
|
||||
return root
|
||||
}
|
||||
|
||||
func TestVaultAggregateIndex_NoVaults(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
// No manifests, just registry.db
|
||||
regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
|
||||
if err != nil {
|
||||
t.Fatalf("create registry.db: %v", err)
|
||||
}
|
||||
regDB.Close()
|
||||
|
||||
report, err := VaultAggregateIndex(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if report.VaultsProcessed != 0 {
|
||||
t.Errorf("VaultsProcessed: want 0, got %d", report.VaultsProcessed)
|
||||
}
|
||||
if len(report.Errors) != 0 {
|
||||
t.Errorf("Errors: want empty, got %v", report.Errors)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultAggregateIndex_VaultWithoutIndex(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
root := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultDir, false /* no vault_index.db */)
|
||||
|
||||
report, err := VaultAggregateIndex(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if report.VaultsSkipped != 1 {
|
||||
t.Errorf("VaultsSkipped: want 1, got %d", report.VaultsSkipped)
|
||||
}
|
||||
if report.VaultsProcessed != 0 {
|
||||
t.Errorf("VaultsProcessed: want 0, got %d", report.VaultsProcessed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultAggregateIndex_HappyPath(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
root := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultDir, true)
|
||||
|
||||
report, err := VaultAggregateIndex(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if report.VaultsProcessed != 1 {
|
||||
t.Errorf("VaultsProcessed: want 1, got %d", report.VaultsProcessed)
|
||||
}
|
||||
if report.TotalFiles != 1 {
|
||||
t.Errorf("TotalFiles: want 1, got %d", report.TotalFiles)
|
||||
}
|
||||
|
||||
// Verify row exists in registry.db
|
||||
regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
|
||||
if err != nil {
|
||||
t.Fatalf("open registry.db: %v", err)
|
||||
}
|
||||
defer regDB.Close()
|
||||
|
||||
var count int
|
||||
if err := regDB.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&count); err != nil {
|
||||
t.Fatalf("count vault_files: %v", err)
|
||||
}
|
||||
if count != 1 {
|
||||
t.Errorf("vault_files count: want 1, got %d", count)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultAggregateIndex_ReRunReplaces(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
root := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultDir, true)
|
||||
|
||||
// First run
|
||||
if _, err := VaultAggregateIndex(root); err != nil {
|
||||
t.Fatalf("first run: %v", err)
|
||||
}
|
||||
|
||||
// Add a second file to vault_index.db
|
||||
vdb, err := VaultIndexOpen(vaultDir)
|
||||
if err != nil {
|
||||
t.Fatalf("reopen vault index: %v", err)
|
||||
}
|
||||
now := time.Now().UTC().Unix()
|
||||
_, err = vdb.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
"data/raw/extra.csv", 512, now, "cafebabe", "text/csv", ".csv", "data", "raw", now)
|
||||
if err != nil {
|
||||
t.Fatalf("insert second file: %v", err)
|
||||
}
|
||||
vdb.Close()
|
||||
|
||||
// Second run
|
||||
report, err := VaultAggregateIndex(root)
|
||||
if err != nil {
|
||||
t.Fatalf("second run: %v", err)
|
||||
}
|
||||
if report.TotalFiles != 2 {
|
||||
t.Errorf("TotalFiles: want 2, got %d", report.TotalFiles)
|
||||
}
|
||||
|
||||
// Verify no duplicates — exactly 2 rows
|
||||
regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
|
||||
if err != nil {
|
||||
t.Fatalf("open registry.db: %v", err)
|
||||
}
|
||||
defer regDB.Close()
|
||||
|
||||
var count int
|
||||
if err := regDB.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&count); err != nil {
|
||||
t.Fatalf("count vault_files: %v", err)
|
||||
}
|
||||
if count != 2 {
|
||||
t.Errorf("vault_files count after re-run: want 2, got %d", count)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,68 @@
|
||||
package infra
|
||||
|
||||
import "sort"
|
||||
|
||||
// VaultFileChange holds the before/after state of a file whose content changed.
|
||||
type VaultFileChange struct {
|
||||
RelPath string
|
||||
Prev VaultFile
|
||||
Curr VaultFile
|
||||
}
|
||||
|
||||
// VaultDiffReport is the result of comparing two VaultFile slices.
|
||||
type VaultDiffReport struct {
|
||||
Added []VaultFile // in curr but not in prev (by rel_path)
|
||||
Removed []VaultFile // in prev but not in curr
|
||||
Changed []VaultFileChange // same rel_path, different sha256
|
||||
Unchanged int // files present in both with identical sha256
|
||||
}
|
||||
|
||||
// VaultDiff computes the difference between two vault snapshots.
|
||||
// It indexes both slices by RelPath, then classifies each entry as
|
||||
// Added, Removed, Changed, or Unchanged. All output slices are sorted
|
||||
// by RelPath ascending. The function is pure and deterministic.
|
||||
func VaultDiff(prev, curr []VaultFile) VaultDiffReport {
|
||||
prevMap := make(map[string]VaultFile, len(prev))
|
||||
for _, f := range prev {
|
||||
prevMap[f.RelPath] = f
|
||||
}
|
||||
currMap := make(map[string]VaultFile, len(curr))
|
||||
for _, f := range curr {
|
||||
currMap[f.RelPath] = f
|
||||
}
|
||||
|
||||
var report VaultDiffReport
|
||||
|
||||
for _, f := range curr {
|
||||
p, exists := prevMap[f.RelPath]
|
||||
if !exists {
|
||||
report.Added = append(report.Added, f)
|
||||
} else if p.Sha256 != f.Sha256 {
|
||||
report.Changed = append(report.Changed, VaultFileChange{
|
||||
RelPath: f.RelPath,
|
||||
Prev: p,
|
||||
Curr: f,
|
||||
})
|
||||
} else {
|
||||
report.Unchanged++
|
||||
}
|
||||
}
|
||||
|
||||
for _, f := range prev {
|
||||
if _, exists := currMap[f.RelPath]; !exists {
|
||||
report.Removed = append(report.Removed, f)
|
||||
}
|
||||
}
|
||||
|
||||
sort.Slice(report.Added, func(i, j int) bool {
|
||||
return report.Added[i].RelPath < report.Added[j].RelPath
|
||||
})
|
||||
sort.Slice(report.Removed, func(i, j int) bool {
|
||||
return report.Removed[i].RelPath < report.Removed[j].RelPath
|
||||
})
|
||||
sort.Slice(report.Changed, func(i, j int) bool {
|
||||
return report.Changed[i].RelPath < report.Changed[j].RelPath
|
||||
})
|
||||
|
||||
return report
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
---
|
||||
name: vault_diff
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "func VaultDiff(prev, curr []VaultFile) VaultDiffReport"
|
||||
description: "Computes the diff between two vault snapshots (slices of VaultFile). Returns the Added, Removed and Changed entries plus the count of Unchanged files. Pure and deterministic — no I/O."
|
||||
tags: [vault, diff, comparison, pure]
|
||||
uses_functions: []
|
||||
uses_types: ["vault_file_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["sort"]
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultDiff_NoChanges"
|
||||
- "TestVaultDiff_AllAdded"
|
||||
- "TestVaultDiff_AllRemoved"
|
||||
- "TestVaultDiff_ContentChanged"
|
||||
- "TestVaultDiff_Mixed"
|
||||
test_file_path: "functions/infra/vault_diff_test.go"
|
||||
file_path: "functions/infra/vault_diff.go"
|
||||
params:
|
||||
- name: prev
|
||||
desc: "Snapshot anterior — slice de VaultFile del estado previo del vault (puede ser nil para diff desde cero)."
|
||||
- name: curr
|
||||
desc: "Snapshot actual — slice de VaultFile del estado corriente del vault (puede ser nil para diff de borrado total)."
|
||||
output: "VaultDiffReport con Added (nuevos), Removed (eliminados), Changed (mismo rel_path, sha256 distinto) y Unchanged (identicos). Todos los slices ordenados por RelPath ASC."
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
prev, _ := infra.VaultInventoryScan(oldPath, "my_vault_proj", "my_vault")
|
||||
curr, _ := infra.VaultInventoryScan(newPath, "my_vault_proj", "my_vault")
|
||||
report := infra.VaultDiff(prev, curr)
|
||||
fmt.Printf("Added: %d, Removed: %d, Changed: %d, Unchanged: %d\n",
|
||||
len(report.Added), len(report.Removed), len(report.Changed), report.Unchanged)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Usa `RelPath` como clave de identidad de archivo (no nombre, no sha256).
|
||||
- Dos archivos con mismo `RelPath` pero diferente `Sha256` se consideran Changed.
|
||||
- Los slices del report se ordenan por `RelPath` ASC para salida deterministica.
|
||||
- Función pura: no toca disco ni BD.
|
||||
@@ -0,0 +1,126 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func makeVF(relPath, sha256 string) VaultFile {
|
||||
return VaultFile{
|
||||
VaultID: "test_vault",
|
||||
VaultName: "test",
|
||||
RelPath: relPath,
|
||||
Sha256: sha256,
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_NoChanges(t *testing.T) {
|
||||
files := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"),
|
||||
makeVF("data/b.csv", "bbb"),
|
||||
}
|
||||
report := VaultDiff(files, files)
|
||||
if len(report.Added) != 0 {
|
||||
t.Errorf("Added: want 0, got %d", len(report.Added))
|
||||
}
|
||||
if len(report.Removed) != 0 {
|
||||
t.Errorf("Removed: want 0, got %d", len(report.Removed))
|
||||
}
|
||||
if len(report.Changed) != 0 {
|
||||
t.Errorf("Changed: want 0, got %d", len(report.Changed))
|
||||
}
|
||||
if report.Unchanged != 2 {
|
||||
t.Errorf("Unchanged: want 2, got %d", report.Unchanged)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_AllAdded(t *testing.T) {
|
||||
curr := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"),
|
||||
makeVF("data/b.csv", "bbb"),
|
||||
}
|
||||
report := VaultDiff(nil, curr)
|
||||
if len(report.Added) != 2 {
|
||||
t.Errorf("Added: want 2, got %d", len(report.Added))
|
||||
}
|
||||
if len(report.Removed) != 0 {
|
||||
t.Errorf("Removed: want 0, got %d", len(report.Removed))
|
||||
}
|
||||
if report.Added[0].RelPath != "data/a.csv" {
|
||||
t.Errorf("Added[0]: want data/a.csv, got %s", report.Added[0].RelPath)
|
||||
}
|
||||
if report.Added[1].RelPath != "data/b.csv" {
|
||||
t.Errorf("Added[1]: want data/b.csv, got %s", report.Added[1].RelPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_AllRemoved(t *testing.T) {
|
||||
prev := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"),
|
||||
makeVF("data/b.csv", "bbb"),
|
||||
}
|
||||
report := VaultDiff(prev, nil)
|
||||
if len(report.Removed) != 2 {
|
||||
t.Errorf("Removed: want 2, got %d", len(report.Removed))
|
||||
}
|
||||
if len(report.Added) != 0 {
|
||||
t.Errorf("Added: want 0, got %d", len(report.Added))
|
||||
}
|
||||
if report.Removed[0].RelPath != "data/a.csv" {
|
||||
t.Errorf("Removed[0]: want data/a.csv, got %s", report.Removed[0].RelPath)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_ContentChanged(t *testing.T) {
|
||||
prev := []VaultFile{
|
||||
makeVF("data/a.csv", "old_hash"),
|
||||
}
|
||||
curr := []VaultFile{
|
||||
makeVF("data/a.csv", "new_hash"),
|
||||
}
|
||||
report := VaultDiff(prev, curr)
|
||||
if len(report.Changed) != 1 {
|
||||
t.Fatalf("Changed: want 1, got %d", len(report.Changed))
|
||||
}
|
||||
if report.Changed[0].RelPath != "data/a.csv" {
|
||||
t.Errorf("Changed[0].RelPath: want data/a.csv, got %s", report.Changed[0].RelPath)
|
||||
}
|
||||
if report.Changed[0].Prev.Sha256 != "old_hash" {
|
||||
t.Errorf("Changed[0].Prev.Sha256: want old_hash, got %s", report.Changed[0].Prev.Sha256)
|
||||
}
|
||||
if report.Changed[0].Curr.Sha256 != "new_hash" {
|
||||
t.Errorf("Changed[0].Curr.Sha256: want new_hash, got %s", report.Changed[0].Curr.Sha256)
|
||||
}
|
||||
if len(report.Added) != 0 || len(report.Removed) != 0 {
|
||||
t.Errorf("Expected no added/removed, got %d/%d", len(report.Added), len(report.Removed))
|
||||
}
|
||||
if report.Unchanged != 0 {
|
||||
t.Errorf("Unchanged: want 0, got %d", report.Unchanged)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDiff_Mixed(t *testing.T) {
|
||||
prev := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"),
|
||||
makeVF("data/b.csv", "bbb"),
|
||||
makeVF("data/c.csv", "ccc"),
|
||||
}
|
||||
curr := []VaultFile{
|
||||
makeVF("data/a.csv", "aaa"), // unchanged
|
||||
makeVF("data/b.csv", "bbb_new"), // changed
|
||||
makeVF("data/d.csv", "ddd"), // added
|
||||
}
|
||||
report := VaultDiff(prev, curr)
|
||||
|
||||
if len(report.Added) != 1 || report.Added[0].RelPath != "data/d.csv" {
|
||||
t.Errorf("Added: want [data/d.csv], got %v", report.Added)
|
||||
}
|
||||
if len(report.Removed) != 1 || report.Removed[0].RelPath != "data/c.csv" {
|
||||
t.Errorf("Removed: want [data/c.csv], got %v", report.Removed)
|
||||
}
|
||||
if len(report.Changed) != 1 || report.Changed[0].RelPath != "data/b.csv" {
|
||||
t.Errorf("Changed: want [data/b.csv], got %v", report.Changed)
|
||||
}
|
||||
if report.Unchanged != 1 {
|
||||
t.Errorf("Unchanged: want 1, got %d", report.Unchanged)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,230 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// VaultDoctorEntry is the health report produced for one vault.
type VaultDoctorEntry struct {
	// VaultName, VaultPath and ProjectID are copied from the manifest entry.
	VaultName string `json:"vault_name"`
	VaultPath string `json:"vault_path"`
	ProjectID string `json:"project_id"`
	// Issues holds human-readable findings; empty means the vault is healthy.
	Issues []string `json:"issues"`
	// IndexedFiles is the row count of the vault index; 0 if no vault_index.db.
	IndexedFiles int `json:"indexed_files"`
	// LastIndexedAt is in unix seconds; 0 when not available.
	LastIndexedAt int64 `json:"last_indexed_at"`
	// DiskFiles is the number of files found by walking the directory
	// (no hashing).
	DiskFiles int `json:"disk_files"`
	// Status is one of "ok", "warning" or "error".
	Status string `json:"status"`
}
|
||||
|
||||
// VaultDoctor audits every vault declared in projects/*/vaults/vault.yaml under
|
||||
// repoRoot. For each vault it performs a series of checks (disk presence, layout,
|
||||
// index existence, staleness, drift) and returns a slice of VaultDoctorEntry.
|
||||
//
|
||||
// The function is read-only: it never writes to disk or any database.
|
||||
// Returns an error only if VaultManifestRead fails (manifest parse error).
|
||||
func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error) {
|
||||
entries, err := VaultManifestRead(repoRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_doctor: read manifests: %w", err)
|
||||
}
|
||||
|
||||
results := make([]VaultDoctorEntry, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
result := auditVault(e)
|
||||
results = append(results, result)
|
||||
}
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func auditVault(e VaultManifestEntry) VaultDoctorEntry {
|
||||
entry := VaultDoctorEntry{
|
||||
VaultName: e.Name,
|
||||
VaultPath: e.Path,
|
||||
ProjectID: e.ProjectID,
|
||||
}
|
||||
|
||||
// Resolve symlinks for disk checks
|
||||
realPath, err := filepath.EvalSymlinks(e.Path)
|
||||
if err != nil || realPath == "" {
|
||||
realPath = e.Path
|
||||
}
|
||||
|
||||
// CHECK 1: directory_missing
|
||||
info, statErr := os.Stat(realPath)
|
||||
if statErr != nil || !info.IsDir() {
|
||||
entry.Issues = append(entry.Issues, "directory_missing")
|
||||
entry.Status = "error"
|
||||
return entry
|
||||
}
|
||||
|
||||
// COUNT disk files (cheap walk — no hashing, no mime detection)
|
||||
diskCount := countDiskFiles(realPath)
|
||||
entry.DiskFiles = diskCount
|
||||
|
||||
// CHECK 2: layout_missing / non_standard_layout
|
||||
hasData := dirExists(filepath.Join(realPath, "data"))
|
||||
hasKnowledge := dirExists(filepath.Join(realPath, "knowledge"))
|
||||
if !hasData && !hasKnowledge {
|
||||
// Check if it looks like a non-standard but intentional layout
|
||||
if hasNonStandardLayout(realPath) {
|
||||
entry.Issues = append(entry.Issues, "non_standard_layout")
|
||||
} else {
|
||||
entry.Issues = append(entry.Issues, "layout_missing")
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK 3: index_missing
|
||||
indexPath := filepath.Join(realPath, "vault_index.db")
|
||||
_, indexStatErr := os.Stat(indexPath)
|
||||
if indexStatErr != nil {
|
||||
entry.Issues = append(entry.Issues, "index_missing")
|
||||
entry.setWarningStatus()
|
||||
entry.setFinalStatus()
|
||||
return entry
|
||||
}
|
||||
|
||||
// Open vault index (read-only) for checks 4 and 5
|
||||
vdb, openErr := VaultIndexOpen(realPath)
|
||||
if openErr != nil {
|
||||
entry.Issues = append(entry.Issues, fmt.Sprintf("index_open_error: %v", openErr))
|
||||
entry.setWarningStatus()
|
||||
return entry
|
||||
}
|
||||
defer vdb.Close()
|
||||
|
||||
// Query indexed file count and max indexed_at
|
||||
var indexedCount int
|
||||
var maxIndexedAt int64
|
||||
row := vdb.QueryRow(`SELECT COUNT(*), COALESCE(MAX(indexed_at), 0) FROM files`)
|
||||
if scanErr := row.Scan(&indexedCount, &maxIndexedAt); scanErr != nil {
|
||||
entry.Issues = append(entry.Issues, fmt.Sprintf("index_query_error: %v", scanErr))
|
||||
} else {
|
||||
entry.IndexedFiles = indexedCount
|
||||
entry.LastIndexedAt = maxIndexedAt
|
||||
}
|
||||
|
||||
// CHECK 4: index_stale — any file on disk newer than MAX(indexed_at)
|
||||
if maxIndexedAt > 0 {
|
||||
maxTime := time.Unix(maxIndexedAt, 0)
|
||||
if isIndexStale(realPath, maxTime) {
|
||||
entry.Issues = append(entry.Issues, "index_stale")
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK 5: index_drift — disk file count != indexed count
|
||||
if indexedCount != diskCount {
|
||||
entry.Issues = append(entry.Issues, fmt.Sprintf("index_drift: disk=%d indexed=%d", diskCount, indexedCount))
|
||||
}
|
||||
|
||||
// CHECK 6: empty_vault
|
||||
if diskCount == 0 {
|
||||
entry.Issues = append(entry.Issues, "empty_vault")
|
||||
}
|
||||
|
||||
entry.setFinalStatus()
|
||||
return entry
|
||||
}
|
||||
|
||||
// setWarningStatus sets status to warning if not already error.
|
||||
func (e *VaultDoctorEntry) setWarningStatus() {
|
||||
if e.Status != "error" {
|
||||
e.Status = "warning"
|
||||
}
|
||||
}
|
||||
|
||||
// setFinalStatus derives the final Status from Issues.
|
||||
func (e *VaultDoctorEntry) setFinalStatus() {
|
||||
if e.Status == "error" {
|
||||
return
|
||||
}
|
||||
if len(e.Issues) == 0 {
|
||||
e.Status = "ok"
|
||||
} else {
|
||||
e.Status = "warning"
|
||||
}
|
||||
}
|
||||
|
||||
// countDiskFiles walks the tree rooted at realPath and returns the number of
// non-directory entries it finds.
//
// Excluded from the count:
//   - entries whose name starts with "." (hidden files; hidden directories
//     such as .git are not descended into), at any depth below the root;
//   - files named vault_index.db, vault_index.db-shm or vault_index.db-wal.
//
// The root itself is never filtered by name, so a vault whose directory name
// starts with "." (or that is addressed as ".") is still counted.
//
// The count is best-effort: entries that cannot be read are skipped silently.
func countDiskFiles(realPath string) int {
	count := 0
	// The walk callback never returns a real error, so the result is ignored.
	_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			// Unreadable entry: skip it and keep walking.
			return nil
		}
		name := d.Name()
		// WalkDir passes the root path through unchanged on its first call;
		// only entries below it are subject to the hidden-name filter.
		if path != realPath && strings.HasPrefix(name, ".") {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		if d.IsDir() {
			return nil
		}
		switch name {
		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
			// The index and its SQLite side files are not vault content.
			return nil
		}
		count++
		return nil
	})
	return count
}
|
||||
|
||||
// isIndexStale reports whether any non-directory entry under realPath has a
// modification time strictly after maxTime.
//
// Entries whose name starts with "." are ignored (hidden directories are not
// descended into), as are vault_index.db, vault_index.db-shm and
// vault_index.db-wal. The root itself is never filtered by name, so a vault
// whose directory name starts with "." is still inspected.
//
// Entries that cannot be read or stat'ed are skipped silently.
func isIndexStale(realPath string, maxTime time.Time) bool {
	stale := false
	// The walk callback never returns a real error, so the result is ignored.
	_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return nil
		}
		if stale {
			// Already decided: prune whatever is left. SkipDir on a file skips
			// the rest of its directory, on a directory it skips that subtree.
			return filepath.SkipDir
		}
		name := d.Name()
		// Only entries below the root are subject to the hidden-name filter.
		if path != realPath && strings.HasPrefix(name, ".") {
			if d.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		if d.IsDir() {
			return nil
		}
		switch name {
		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
			// The index is rewritten by indexing itself; never treat it as content.
			return nil
		}
		fi, statErr := d.Info()
		if statErr == nil && fi.ModTime().After(maxTime) {
			stale = true
			return filepath.SkipDir
		}
		return nil
	})
	return stale
}
|
||||
|
||||
// hasNonStandardLayout reports whether the vault root holds at least one
// visible subdirectory other than data/ and knowledge/.
//
// It is a heuristic for "intentional but custom" layouts: hidden directories
// (including .git) and plain files are ignored. An unreadable root yields
// false.
func hasNonStandardLayout(realPath string) bool {
	children, readErr := os.ReadDir(realPath)
	if readErr != nil {
		return false
	}
	for _, child := range children {
		if !child.IsDir() {
			continue
		}
		name := child.Name()
		if name == "data" || name == "knowledge" || name == ".git" || strings.HasPrefix(name, ".") {
			continue
		}
		return true
	}
	return false
}
|
||||
@@ -0,0 +1,66 @@
|
||||
---
|
||||
name: vault_doctor
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error)"
|
||||
description: "Audita la salud de todos los vaults declarados en projects/*/vaults/vault.yaml. Comprueba existencia del directorio, layout estándar, presencia del índice, staleness y drift entre disco e índice. Read-only."
|
||||
tags: [vault, doctor, health, audit]
|
||||
uses_functions:
|
||||
- "vault_manifest_read_go_infra"
|
||||
- "vault_index_open_go_infra"
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- "fmt"
|
||||
- "os"
|
||||
- "path/filepath"
|
||||
- "strings"
|
||||
- "time"
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultDoctor_OK"
|
||||
- "TestVaultDoctor_MissingDir"
|
||||
- "TestVaultDoctor_NoIndex"
|
||||
- "TestVaultDoctor_LayoutDrift"
|
||||
- "TestVaultDoctor_EmptyVault"
|
||||
test_file_path: "functions/infra/vault_doctor_test.go"
|
||||
file_path: "functions/infra/vault_doctor.go"
|
||||
params:
|
||||
- name: repoRoot
|
||||
desc: "Ruta absoluta a la raiz del fn_registry (donde están projects/ y registry.db)."
|
||||
output: "Slice de VaultDoctorEntry con Status (ok/warning/error), Issues, DiskFiles, IndexedFiles y LastIndexedAt por vault. Error fatal solo si los manifests no se pueden leer."
|
||||
---
|
||||
|
||||
## Checks aplicados
|
||||
|
||||
| Check | Condición | Severidad |
|
||||
|---|---|---|
|
||||
| `directory_missing` | `e.Path` no existe en disco o no es un directorio | error |
|
||||
| `layout_missing` | no hay `data/` ni `knowledge/` en la raíz del vault | warning |
|
||||
| `non_standard_layout` | no hay `data/`/`knowledge/` pero sí otros subdirectorios (ej. imagegen_models) | warning |
|
||||
| `index_missing` | no existe `vault_index.db` | warning |
|
||||
| `index_stale` | algún archivo en disco tiene mtime > MAX(indexed_at) | warning |
|
||||
| `index_drift` | count disco != count en tabla `files` | warning |
|
||||
| `empty_vault` | DiskFiles == 0 | warning |
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
entries, err := infra.VaultDoctor("/home/lucas/fn_registry")
|
||||
for _, e := range entries {
|
||||
fmt.Printf("%-30s %-8s files=%d issues=%v\n",
|
||||
e.VaultName, e.Status, e.DiskFiles, e.Issues)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Función read-only: nunca escribe en disco ni en ninguna base de datos.
|
||||
- `countDiskFiles` usa `filepath.WalkDir` sin hash (cheap) — excluye `vault_index.db*`, `.git/` y ficheros ocultos.
|
||||
- `isIndexStale` también usa WalkDir; compara mtime de archivos con MAX(indexed_at) de la BD.
|
||||
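- `index_missing` se detecta con `os.Stat` antes de abrir el índice. `VaultIndexOpen` no es de sólo lectura (abre o crea el DB y aplica las migraciones embebidas), por eso sólo se invoca cuando `vault_index.db` ya existe.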
|
||||
@@ -0,0 +1,211 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// setupDoctorRepo creates a throw-away repo layout containing a single
// project manifest (projects/<projectID>/vaults/vault.yaml) that declares one
// vault, and returns the repo root.
//
// vaultPath is written verbatim into the manifest; it should be an absolute
// path and may or may not exist, depending on what the test exercises.
func setupDoctorRepo(t *testing.T, vaultName, projectID, vaultPath string) string {
	t.Helper()
	root := t.TempDir()
	projVaultsDir := filepath.Join(root, "projects", projectID, "vaults")
	if err := os.MkdirAll(projVaultsDir, 0755); err != nil {
		t.Fatalf("mkdir projects: %v", err)
	}
	// Keys of the vault mapping must line up under the sequence item,
	// otherwise the manifest is not a valid YAML list of mappings.
	manifest := "vaults:\n" +
		"  - name: " + vaultName + "\n" +
		"    description: test vault\n" +
		"    path: " + vaultPath + "\n" +
		"    tags: []\n"
	if err := os.WriteFile(filepath.Join(projVaultsDir, "vault.yaml"), []byte(manifest), 0644); err != nil {
		t.Fatalf("write vault.yaml: %v", err)
	}
	return root
}
|
||||
|
||||
func TestVaultDoctor_OK(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
|
||||
// Proper layout
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "data", "raw"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "knowledge"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create a file with a past mtime so the index is not stale
|
||||
samplePath := filepath.Join(vaultDir, "data", "raw", "sample.csv")
|
||||
if err := os.WriteFile(samplePath, []byte("a,b\n1,2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
pastTime := time.Now().Add(-1 * time.Hour)
|
||||
if err := os.Chtimes(samplePath, pastTime, pastTime); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create vault_index.db with the file indexed after its mtime
|
||||
vdb, err := VaultIndexOpen(vaultDir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
futureIndexed := time.Now().Unix() // indexed_at is now — after file mtime
|
||||
_, err = vdb.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
||||
"data/raw/sample.csv", 8, pastTime.Unix(), "deadbeef", "text/csv", ".csv", "data", "raw", futureIndexed)
|
||||
if err != nil {
|
||||
t.Fatalf("insert: %v", err)
|
||||
}
|
||||
vdb.Close()
|
||||
|
||||
root := setupDoctorRepo(t, "my_vault", "my_proj", vaultDir)
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "ok" {
|
||||
t.Errorf("Status: want ok, got %s (issues: %v)", e.Status, e.Issues)
|
||||
}
|
||||
if len(e.Issues) != 0 {
|
||||
t.Errorf("Issues: want empty, got %v", e.Issues)
|
||||
}
|
||||
if e.DiskFiles != 1 {
|
||||
t.Errorf("DiskFiles: want 1, got %d", e.DiskFiles)
|
||||
}
|
||||
if e.IndexedFiles != 1 {
|
||||
t.Errorf("IndexedFiles: want 1, got %d", e.IndexedFiles)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDoctor_MissingDir(t *testing.T) {
|
||||
missingPath := filepath.Join(t.TempDir(), "does_not_exist")
|
||||
root := setupDoctorRepo(t, "missing_vault", "my_proj", missingPath)
|
||||
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "error" {
|
||||
t.Errorf("Status: want error, got %s", e.Status)
|
||||
}
|
||||
found := false
|
||||
for _, issue := range e.Issues {
|
||||
if issue == "directory_missing" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Expected directory_missing issue, got %v", e.Issues)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDoctor_NoIndex(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
// Proper layout but no vault_index.db
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "data", "raw"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(vaultDir, "data", "raw", "a.csv"), []byte("x"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
root := setupDoctorRepo(t, "no_index_vault", "my_proj", vaultDir)
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "warning" {
|
||||
t.Errorf("Status: want warning, got %s", e.Status)
|
||||
}
|
||||
found := false
|
||||
for _, issue := range e.Issues {
|
||||
if issue == "index_missing" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Expected index_missing issue, got %v", e.Issues)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDoctor_LayoutDrift(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
// No data/ or knowledge/ — just a random file at root
|
||||
if err := os.WriteFile(filepath.Join(vaultDir, "something.txt"), []byte("hi"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
root := setupDoctorRepo(t, "layout_vault", "my_proj", vaultDir)
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "warning" {
|
||||
t.Errorf("Status: want warning, got %s", e.Status)
|
||||
}
|
||||
foundLayout := false
|
||||
for _, issue := range e.Issues {
|
||||
if issue == "layout_missing" || issue == "non_standard_layout" {
|
||||
foundLayout = true
|
||||
}
|
||||
}
|
||||
if !foundLayout {
|
||||
t.Errorf("Expected layout_missing or non_standard_layout, got %v", e.Issues)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultDoctor_EmptyVault(t *testing.T) {
|
||||
vaultDir := t.TempDir()
|
||||
// data/ and knowledge/ exist but are empty
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "data"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(vaultDir, "knowledge"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create vault_index.db (empty)
|
||||
vdb, err := VaultIndexOpen(vaultDir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
vdb.Close()
|
||||
|
||||
root := setupDoctorRepo(t, "empty_vault", "my_proj", vaultDir)
|
||||
entries, err := VaultDoctor(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 1 {
|
||||
t.Fatalf("expected 1 entry, got %d", len(entries))
|
||||
}
|
||||
e := entries[0]
|
||||
if e.Status != "warning" {
|
||||
t.Errorf("Status: want warning, got %s (issues: %v)", e.Status, e.Issues)
|
||||
}
|
||||
found := false
|
||||
for _, issue := range e.Issues {
|
||||
if issue == "empty_vault" {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("Expected empty_vault issue, got %v", e.Issues)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package infra
|
||||
|
||||
// VaultFile describes a single file inside a vault directory.
//
// It groups three kinds of information: identity (owning vault and path
// relative to the vault root), content metadata (size, mtime, sha256, mime,
// extension) and structural classification (bucket and sub-bucket).
type VaultFile struct {
	VaultID   string `json:"vault_id"`   // vault identifier, e.g. "turismo_spain_app_turismo"
	VaultName string `json:"vault_name"` // vault name, e.g. "turismo_spain"
	RelPath   string `json:"rel_path"`   // path relative to vault root, e.g. "data/raw/foo.csv"
	Size      int64  `json:"size"`       // size in bytes
	Mtime     int64  `json:"mtime"`      // modification time, unix seconds (UTC)
	Sha256    string `json:"sha256"`     // content hash, lowercase hex
	Mime      string `json:"mime"`       // MIME type, e.g. "text/csv"
	Ext       string `json:"ext"`        // file extension including the dot, e.g. ".csv"
	// Bucket is the top-level classification: "data" or "knowledge".
	Bucket string `json:"bucket"`
	// SubBucket is the second-level directory within the bucket.
	// Known values: raw, processed, exports (data); decisions, domains, models,
	// benchmarks, test_documents (knowledge). Empty string for files at bucket root.
	SubBucket string `json:"sub_bucket"`
}
|
||||
@@ -0,0 +1,49 @@
|
||||
-- Initial schema for vault_index.db. Every statement uses IF NOT EXISTS so
-- the script can be re-applied to an existing database.

-- files: one row per inventoried file, keyed by its path relative to the
-- vault root.
CREATE TABLE IF NOT EXISTS files (
    rel_path    TEXT PRIMARY KEY,
    size        INTEGER NOT NULL,
    mtime       INTEGER NOT NULL,
    sha256      TEXT NOT NULL,
    mime        TEXT NOT NULL DEFAULT '',
    ext         TEXT NOT NULL DEFAULT '',
    bucket      TEXT NOT NULL DEFAULT '',
    sub_bucket  TEXT NOT NULL DEFAULT '',
    indexed_at  INTEGER NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_files_sha256 ON files(sha256);
CREATE INDEX IF NOT EXISTS idx_files_bucket ON files(bucket, sub_bucket);

-- files_fts: full-text index over path and extracted text. content='' makes
-- this a contentless FTS5 table: the index is kept, the column values are not.
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
    rel_path,
    content_text,
    content='',
    tokenize='unicode61 remove_diacritics 2'
);

-- csv_profiles: per-file CSV profile; removed together with its files row.
CREATE TABLE IF NOT EXISTS csv_profiles (
    rel_path    TEXT PRIMARY KEY,
    cols_json   TEXT NOT NULL,
    n_rows      INTEGER NOT NULL,
    encoding    TEXT NOT NULL DEFAULT '',
    date_min    TEXT,
    date_max    TEXT,
    profiled_at INTEGER NOT NULL,
    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);

-- pdf_extracts: per-file PDF extraction metadata; removed together with its
-- files row.
CREATE TABLE IF NOT EXISTS pdf_extracts (
    rel_path     TEXT PRIMARY KEY,
    page_count   INTEGER NOT NULL,
    text_len     INTEGER NOT NULL,
    extracted_to TEXT,
    extracted_at INTEGER NOT NULL,
    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);

-- knowledge_docs: title, frontmatter and headings of a parsed document;
-- removed together with its files row.
CREATE TABLE IF NOT EXISTS knowledge_docs (
    rel_path         TEXT PRIMARY KEY,
    title            TEXT NOT NULL DEFAULT '',
    frontmatter_json TEXT NOT NULL DEFAULT '{}',
    headings_json    TEXT NOT NULL DEFAULT '[]',
    parsed_at        INTEGER NOT NULL,
    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);
|
||||
@@ -0,0 +1,30 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"embed"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
//go:embed vault_index_migrations/*.sql
|
||||
var vaultIndexMigrationsFS embed.FS
|
||||
|
||||
// VaultIndexOpen opens (or creates) the vault_index.db inside vaultPath.
|
||||
// It applies all embedded migrations idempotently and returns a ready-to-use
|
||||
// *sql.DB. The caller is responsible for closing the connection.
|
||||
//
|
||||
// The database is opened with WAL mode and foreign keys enabled via SQLiteOpen.
|
||||
// Migrations are applied from vault_index_migrations/*.sql in lexicographic order.
|
||||
func VaultIndexOpen(vaultPath string) (*sql.DB, error) {
|
||||
dbPath := filepath.Join(vaultPath, "vault_index.db")
|
||||
db, err := SQLiteOpen(dbPath, "")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_index_open: %w", err)
|
||||
}
|
||||
if err := ApplyMigrations(db, vaultIndexMigrationsFS, "vault_index_migrations/*.sql"); err != nil {
|
||||
db.Close()
|
||||
return nil, fmt.Errorf("vault_index_open: apply migrations: %w", err)
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
name: vault_index_open
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultIndexOpen(vaultPath string) (*sql.DB, error)"
|
||||
description: "Abre (o crea) vault_index.db dentro de vaultPath con WAL + FK y aplica las migraciones embebidas idempotentemente. El caller cierra la conexion."
|
||||
tags: [vault, sqlite, index, migration, infra]
|
||||
uses_functions: ["sqlite_open_go_infra", "sqlite_apply_migrations_go_infra"]
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [database/sql, embed, fmt, path/filepath]
|
||||
params:
|
||||
- name: vaultPath
|
||||
desc: "ruta absoluta o relativa al directorio raiz del vault"
|
||||
output: "*sql.DB apuntando a <vaultPath>/vault_index.db con schema completo aplicado; el caller es responsable de cerrar"
|
||||
tested: true
|
||||
tests:
|
||||
- "crea vault_index.db en tmpdir vacio"
|
||||
- "segunda apertura no falla (idempotente)"
|
||||
- "todas las tablas esperadas existen en sqlite_master"
|
||||
- "fts5 INSERT y MATCH funcionan"
|
||||
test_file_path: "functions/infra/vault_index_open_test.go"
|
||||
file_path: "functions/infra/vault_index_open.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
db, err := VaultIndexOpen("/data/vaults/turismo_spain")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
El archivo de base de datos se crea en `<vaultPath>/vault_index.db`. Las migraciones
|
||||
viven en `vault_index_migrations/*.sql` embebidas via `//go:embed` en el mismo paquete.
|
||||
|
||||
Schema creado por `001_init.sql`:
|
||||
- `files` — inventario de archivos (PK: rel_path)
|
||||
- `files_fts` — tabla FTS5 virtual para busqueda de texto (content_text lo llenan profilers posteriores)
|
||||
- `csv_profiles` — perfil de columnas/filas para .csv (FK → files)
|
||||
- `pdf_extracts` — metadatos de extraccion de texto para .pdf (FK → files)
|
||||
- `knowledge_docs` — headings/frontmatter para .md del bucket knowledge (FK → files)
|
||||
|
||||
`SQLiteOpen` abre con WAL mode + foreign keys. `ApplyMigrations` es idempotente:
|
||||
los errores de "already exists" y "duplicate column" se ignoran silenciosamente.
|
||||
@@ -0,0 +1,107 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestVaultIndexOpen_CreatesDB(t *testing.T) {
|
||||
t.Run("crea vault_index.db en tmpdir vacio", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
dbPath := filepath.Join(dir, "vault_index.db")
|
||||
if _, err := os.Stat(dbPath); os.IsNotExist(err) {
|
||||
t.Fatalf("vault_index.db no fue creado en %s", dir)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexOpen_Idempotent(t *testing.T) {
|
||||
t.Run("segunda apertura no falla (idempotente)", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
|
||||
db1, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("primera apertura: %v", err)
|
||||
}
|
||||
db1.Close()
|
||||
|
||||
db2, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("segunda apertura: %v", err)
|
||||
}
|
||||
db2.Close()
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexOpen_AppliesAllMigrations(t *testing.T) {
|
||||
t.Run("todas las tablas esperadas existen en sqlite_master", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
expectedTables := []string{
|
||||
"files",
|
||||
"csv_profiles",
|
||||
"pdf_extracts",
|
||||
"knowledge_docs",
|
||||
}
|
||||
for _, tbl := range expectedTables {
|
||||
assertTableExists(t, db, tbl)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexOpen_FTS5Works(t *testing.T) {
|
||||
t.Run("fts5 INSERT y MATCH funcionan", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
// Insert a row into files_fts (content='' table, manual INSERT required)
|
||||
_, err = db.Exec(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`,
|
||||
"data/raw/informe_ventas.csv", "ventas trimestrales empresa")
|
||||
if err != nil {
|
||||
t.Fatalf("INSERT files_fts: %v", err)
|
||||
}
|
||||
|
||||
var count int
|
||||
err = db.QueryRow(
|
||||
`SELECT count(*) FROM files_fts WHERE files_fts MATCH 'ventas'`,
|
||||
).Scan(&count)
|
||||
if err != nil {
|
||||
t.Fatalf("FTS MATCH query: %v", err)
|
||||
}
|
||||
if count != 1 {
|
||||
t.Errorf("FTS MATCH: got %d rows, want 1", count)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// assertTableExists fails the test when sqlite_master has no entry called
// name. A failing lookup query aborts the test; a missing entry is reported
// as a non-fatal error.
func assertTableExists(t *testing.T, db *sql.DB, name string) {
	t.Helper()

	var hits int
	row := db.QueryRow(`SELECT count(*) FROM sqlite_master WHERE name = ?`, name)
	if err := row.Scan(&hits); err != nil {
		t.Fatalf("sqlite_master query for %q: %v", name, err)
	}
	if hits == 0 {
		t.Errorf("table/vtable %q not found in sqlite_master", name)
	}
}
|
||||
@@ -0,0 +1,154 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// WriteReport summarises the outcome of a VaultIndexWrite call.
// All counters refer to a single call and start at zero.
type WriteReport struct {
	Inserted int // rows newly inserted into files
	Updated  int // rows updated (upserted) in files
	Pruned   int // rows deleted from files (only when prune=true)
	FTS      int // rows inserted into files_fts
}
|
||||
|
||||
// VaultIndexWrite upserts a slice of VaultFile into the vault_index.db opened
|
||||
// as db, updates the files_fts FTS5 table, and optionally prunes stale rows.
|
||||
//
|
||||
// All changes run inside a single transaction.
|
||||
//
|
||||
// Counting strategy: the set of rel_paths already in the DB is read before the
|
||||
// loop. An upsert is counted as Inserted if the rel_path was absent, Updated if
|
||||
// it was present. This avoids N+1 queries while remaining correct.
|
||||
//
|
||||
// FTS5: all affected rows are deleted and re-inserted with rel_path and empty
|
||||
// content_text. Downstream profilers (csv_profiles, pdf_extracts, knowledge_docs)
|
||||
// are responsible for populating content_text with meaningful text.
|
||||
//
|
||||
// Prune: if prune=true, every row in files whose rel_path is NOT in the provided
|
||||
// slice is deleted. Cascades to csv_profiles, pdf_extracts, knowledge_docs via FK.
|
||||
func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error) {
|
||||
var report WriteReport
|
||||
if len(files) == 0 && !prune {
|
||||
return report, nil
|
||||
}
|
||||
|
||||
tx, err := db.Begin()
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: begin tx: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
tx.Rollback() //nolint:errcheck
|
||||
}
|
||||
}()
|
||||
|
||||
// Load existing rel_paths into a set to distinguish insert vs update.
|
||||
existing := make(map[string]struct{})
|
||||
rows, err := tx.Query(`SELECT rel_path FROM files`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: query existing: %w", err)
|
||||
}
|
||||
for rows.Next() {
|
||||
var rp string
|
||||
if err := rows.Scan(&rp); err != nil {
|
||||
rows.Close()
|
||||
return report, fmt.Errorf("vault_index_write: scan existing: %w", err)
|
||||
}
|
||||
existing[rp] = struct{}{}
|
||||
}
|
||||
rows.Close()
|
||||
if err := rows.Err(); err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: rows err: %w", err)
|
||||
}
|
||||
|
||||
now := time.Now().Unix()
|
||||
|
||||
upsertStmt, err := tx.Prepare(`
|
||||
INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(rel_path) DO UPDATE SET
|
||||
size = excluded.size,
|
||||
mtime = excluded.mtime,
|
||||
sha256 = excluded.sha256,
|
||||
mime = excluded.mime,
|
||||
ext = excluded.ext,
|
||||
bucket = excluded.bucket,
|
||||
sub_bucket = excluded.sub_bucket,
|
||||
indexed_at = excluded.indexed_at
|
||||
`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prepare upsert: %w", err)
|
||||
}
|
||||
defer upsertStmt.Close()
|
||||
|
||||
ftsDeleteStmt, err := tx.Prepare(`DELETE FROM files_fts WHERE rel_path = ?`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prepare fts delete: %w", err)
|
||||
}
|
||||
defer ftsDeleteStmt.Close()
|
||||
|
||||
ftsInsertStmt, err := tx.Prepare(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, '')`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prepare fts insert: %w", err)
|
||||
}
|
||||
defer ftsInsertStmt.Close()
|
||||
|
||||
for _, f := range files {
|
||||
_, err = upsertStmt.Exec(
|
||||
f.RelPath, f.Size, f.Mtime, f.Sha256,
|
||||
f.Mime, f.Ext, f.Bucket, f.SubBucket, now,
|
||||
)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: upsert %q: %w", f.RelPath, err)
|
||||
}
|
||||
|
||||
if _, wasExisting := existing[f.RelPath]; wasExisting {
|
||||
report.Updated++
|
||||
} else {
|
||||
report.Inserted++
|
||||
}
|
||||
|
||||
// Refresh FTS row.
|
||||
if _, err = ftsDeleteStmt.Exec(f.RelPath); err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: fts delete %q: %w", f.RelPath, err)
|
||||
}
|
||||
if _, err = ftsInsertStmt.Exec(f.RelPath); err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: fts insert %q: %w", f.RelPath, err)
|
||||
}
|
||||
report.FTS++
|
||||
}
|
||||
|
||||
// Prune rows not present in the incoming slice.
|
||||
if prune && len(files) > 0 {
|
||||
keep := make([]string, len(files))
|
||||
for i, f := range files {
|
||||
keep[i] = "'" + strings.ReplaceAll(f.RelPath, "'", "''") + "'"
|
||||
}
|
||||
inClause := strings.Join(keep, ",")
|
||||
res, err := tx.Exec(fmt.Sprintf(
|
||||
`DELETE FROM files WHERE rel_path NOT IN (%s)`, inClause,
|
||||
))
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prune: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
report.Pruned = int(n)
|
||||
} else if prune && len(files) == 0 {
|
||||
// prune=true with empty slice means delete everything.
|
||||
res, err := tx.Exec(`DELETE FROM files`)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: prune all: %w", err)
|
||||
}
|
||||
n, _ := res.RowsAffected()
|
||||
report.Pruned = int(n)
|
||||
}
|
||||
|
||||
if err = tx.Commit(); err != nil {
|
||||
return report, fmt.Errorf("vault_index_write: commit: %w", err)
|
||||
}
|
||||
return report, nil
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
---
|
||||
name: vault_index_write
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error)"
|
||||
description: "Upserta un slice de VaultFile en vault_index.db (tabla files + FTS5 files_fts) dentro de una sola transaccion. Cuenta Inserted/Updated/FTS. Con prune=true elimina filas no presentes en el slice."
|
||||
tags: [vault, sqlite, index, write, upsert, fts, infra]
|
||||
uses_functions: []
|
||||
uses_types: ["vault_file_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [database/sql, fmt, strings, time]
|
||||
params:
|
||||
- name: db
|
||||
desc: "*sql.DB abierto sobre vault_index.db (tipicamente retornado por VaultIndexOpen)"
|
||||
- name: files
|
||||
desc: "slice de VaultFile a insertar/actualizar; puede ser vacio"
|
||||
- name: prune
|
||||
desc: "si true, elimina de 'files' todas las filas cuyo rel_path no este en el slice (sincronizacion destructiva)"
|
||||
output: "WriteReport con conteos Inserted/Updated/Pruned/FTS; error si falla la transaccion"
|
||||
tested: true
|
||||
tests:
|
||||
- "N archivos nuevos — Inserted=N"
|
||||
- "re-escritura con mtime distinto — Updated=N"
|
||||
- "prune elimina filas ausentes"
|
||||
- "sin prune, filas previas persisten"
|
||||
- "FTS5 MATCH funciona tras escritura"
|
||||
test_file_path: "functions/infra/vault_index_write_test.go"
|
||||
file_path: "functions/infra/vault_index_write.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
db, _ := VaultIndexOpen("/data/vaults/turismo")
|
||||
defer db.Close()
|
||||
|
||||
files, _ := VaultInventoryScan("/data/vaults/turismo", "turismo_v1", "turismo")
|
||||
report, err := VaultIndexWrite(db, files, true)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Printf("inserted=%d updated=%d pruned=%d fts=%d\n",
|
||||
report.Inserted, report.Updated, report.Pruned, report.FTS)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
### WriteReport
|
||||
Struct local al paquete infra:
|
||||
```go
|
||||
type WriteReport struct {
|
||||
Inserted int
|
||||
Updated int
|
||||
Pruned int
|
||||
FTS int
|
||||
}
|
||||
```
|
||||
|
||||
### Estrategia de conteo Inserted vs Updated
|
||||
Se carga el conjunto de rel_paths existentes en un map antes del loop. Un upsert
|
||||
se clasifica como Inserted si el rel_path no estaba en el map, Updated si estaba.
|
||||
Esto evita N+1 SELECTs y es correcto porque la transaccion serializa los cambios.
|
||||
|
||||
### FTS5
|
||||
`files_fts` usa `content=''` (tabla FTS5 contentless: no almacena el contenido). Para cada archivo
|
||||
se borra la fila FTS existente y se reinserta con `content_text=''`. Los profilers
|
||||
posteriores (csv_profiles, knowledge_docs) son responsables de actualizar
|
||||
`content_text` con texto indexable real.
|
||||
|
||||
### Prune
|
||||
Con `prune=true` se construye un IN clause con los rel_paths del slice. La FK con
|
||||
`ON DELETE CASCADE` propaga el DELETE a csv_profiles, pdf_extracts y knowledge_docs
|
||||
automaticamente. Con slice vacio + prune=true se borra todo (DELETE FROM files).
|
||||
|
||||
### Escapado SQL
|
||||
El IN clause se construye escapando las comillas simples en rel_path (duplicandolas).
|
||||
Evita inyeccion en rutas con apostrofos. Para entornos con rutas controladas
|
||||
(interior de vaults sin apostrofos) esto es suficiente; para entornos adversariales
|
||||
usar binding de parametros con VALUES multiples via prepared statement.
|
||||
@@ -0,0 +1,210 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// makeTestVaultFile creates a minimal VaultFile for testing.
|
||||
func makeTestVaultFile(relPath, mime, bucket, subBucket string) VaultFile {
|
||||
return VaultFile{
|
||||
VaultID: "test_vault",
|
||||
VaultName: "test",
|
||||
RelPath: relPath,
|
||||
Size: 100,
|
||||
Mtime: time.Now().Unix(),
|
||||
Sha256: "abc123def456abc123def456abc123def456abc123def456abc123def456abc1",
|
||||
Mime: mime,
|
||||
Ext: ".csv",
|
||||
Bucket: bucket,
|
||||
SubBucket: subBucket,
|
||||
}
|
||||
}
|
||||
|
||||
func openInMemoryVaultIndex(t *testing.T) interface{ Close() error } {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
return db
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_FreshInsert(t *testing.T) {
|
||||
t.Run("N archivos nuevos — Inserted=N", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
files := []VaultFile{
|
||||
makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("knowledge/decisions/x.md", "text/markdown", "knowledge", "decisions"),
|
||||
}
|
||||
|
||||
report, err := VaultIndexWrite(db, files, false)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexWrite: %v", err)
|
||||
}
|
||||
if report.Inserted != 3 {
|
||||
t.Errorf("Inserted = %d, want 3", report.Inserted)
|
||||
}
|
||||
if report.Updated != 0 {
|
||||
t.Errorf("Updated = %d, want 0", report.Updated)
|
||||
}
|
||||
if report.Pruned != 0 {
|
||||
t.Errorf("Pruned = %d, want 0", report.Pruned)
|
||||
}
|
||||
if report.FTS != 3 {
|
||||
t.Errorf("FTS = %d, want 3", report.FTS)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_Upsert(t *testing.T) {
|
||||
t.Run("re-escritura con mtime distinto — Updated=N", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
files := []VaultFile{
|
||||
makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
|
||||
}
|
||||
|
||||
if _, err := VaultIndexWrite(db, files, false); err != nil {
|
||||
t.Fatalf("first write: %v", err)
|
||||
}
|
||||
|
||||
// Modify mtime to simulate file change.
|
||||
files[0].Mtime = time.Now().Unix() + 100
|
||||
files[1].Mtime = time.Now().Unix() + 200
|
||||
|
||||
report, err := VaultIndexWrite(db, files, false)
|
||||
if err != nil {
|
||||
t.Fatalf("second write: %v", err)
|
||||
}
|
||||
if report.Inserted != 0 {
|
||||
t.Errorf("Inserted = %d, want 0", report.Inserted)
|
||||
}
|
||||
if report.Updated != 2 {
|
||||
t.Errorf("Updated = %d, want 2", report.Updated)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_Prune(t *testing.T) {
|
||||
t.Run("prune elimina filas ausentes", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
// Write A and B.
|
||||
ab := []VaultFile{
|
||||
makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
|
||||
}
|
||||
if _, err := VaultIndexWrite(db, ab, false); err != nil {
|
||||
t.Fatalf("first write: %v", err)
|
||||
}
|
||||
|
||||
// Write only A with prune=true — B should be deleted.
|
||||
onlyA := []VaultFile{ab[0]}
|
||||
report, err := VaultIndexWrite(db, onlyA, true)
|
||||
if err != nil {
|
||||
t.Fatalf("prune write: %v", err)
|
||||
}
|
||||
if report.Pruned != 1 {
|
||||
t.Errorf("Pruned = %d, want 1", report.Pruned)
|
||||
}
|
||||
|
||||
// Verify B is gone.
|
||||
var count int
|
||||
err = db.QueryRow(`SELECT count(*) FROM files WHERE rel_path = 'data/raw/b.csv'`).Scan(&count)
|
||||
if err != nil {
|
||||
t.Fatalf("query: %v", err)
|
||||
}
|
||||
if count != 0 {
|
||||
t.Errorf("b.csv still present after prune")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_NoPrune(t *testing.T) {
|
||||
t.Run("sin prune, filas previas persisten", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
ab := []VaultFile{
|
||||
makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
|
||||
}
|
||||
if _, err := VaultIndexWrite(db, ab, false); err != nil {
|
||||
t.Fatalf("first write: %v", err)
|
||||
}
|
||||
|
||||
// Write only A without prune — B must remain.
|
||||
onlyA := []VaultFile{ab[0]}
|
||||
report, err := VaultIndexWrite(db, onlyA, false)
|
||||
if err != nil {
|
||||
t.Fatalf("second write: %v", err)
|
||||
}
|
||||
if report.Pruned != 0 {
|
||||
t.Errorf("Pruned = %d, want 0", report.Pruned)
|
||||
}
|
||||
|
||||
var count int
|
||||
err = db.QueryRow(`SELECT count(*) FROM files`).Scan(&count)
|
||||
if err != nil {
|
||||
t.Fatalf("query: %v", err)
|
||||
}
|
||||
if count != 2 {
|
||||
t.Errorf("files count = %d, want 2", count)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultIndexWrite_FTSMatch(t *testing.T) {
|
||||
t.Run("FTS5 MATCH funciona tras escritura", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
files := []VaultFile{
|
||||
makeTestVaultFile("data/raw/foo_report.csv", "text/csv", "data", "raw"),
|
||||
makeTestVaultFile("data/raw/bar_data.csv", "text/csv", "data", "raw"),
|
||||
}
|
||||
if _, err := VaultIndexWrite(db, files, false); err != nil {
|
||||
t.Fatalf("write: %v", err)
|
||||
}
|
||||
|
||||
// FTS5 on rel_path column: MATCH 'foo*'
|
||||
var count int
|
||||
err = db.QueryRow(
|
||||
`SELECT count(*) FROM files_fts WHERE files_fts MATCH 'rel_path:foo*'`,
|
||||
).Scan(&count)
|
||||
if err != nil {
|
||||
t.Fatalf("FTS MATCH query: %v", err)
|
||||
}
|
||||
if count != 1 {
|
||||
t.Errorf("FTS MATCH rel_path:foo* = %d rows, want 1", count)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,174 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// VaultInventoryScan walks vaultPath and returns a VaultFile slice (sorted by RelPath)
|
||||
// for every regular file found, skipping:
|
||||
// - vault_index.db, vault_index.db-shm, vault_index.db-wal
|
||||
// - .git/ directories at any depth
|
||||
// - hidden files/dirs (names starting with ".") at the vault root level only
|
||||
//
|
||||
// For each file it computes: relative path (forward slashes), size, mtime (unix UTC),
|
||||
// sha256 (streaming, hex lowercase), MIME type, extension, bucket and sub-bucket.
|
||||
//
|
||||
// MIME detection priority:
|
||||
// 1. Extension override: .csv → text/csv, .md → text/markdown, .parquet → application/parquet
|
||||
// 2. http.DetectContentType on first 512 bytes (magic bytes, stdlib)
|
||||
//
|
||||
// NOTE: file_validate_type_go_infra (FileValidateType) was not used here because its
|
||||
// signature requires an allowedTypes allowlist and returns (mime, bool) — it is designed
|
||||
// for upload validation, not for open-ended inventory scanning where any MIME is valid.
|
||||
// http.DetectContentType provides the same magic-byte detection without the allowlist
|
||||
// coupling and handles a broader set of formats including text/plain for CSV fallback.
|
||||
func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error) {
|
||||
var files []VaultFile
|
||||
|
||||
err := filepath.WalkDir(vaultPath, func(path string, d os.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
name := d.Name()
|
||||
|
||||
// Skip .git directories at any depth.
|
||||
if d.IsDir() && name == ".git" {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
|
||||
// Skip hidden entries (names starting with ".") at vault root only.
|
||||
if strings.HasPrefix(name, ".") {
|
||||
rel, relErr := filepath.Rel(vaultPath, path)
|
||||
if relErr == nil {
|
||||
// At root level the relative path has no separator.
|
||||
if !strings.Contains(filepath.ToSlash(rel), "/") {
|
||||
if d.IsDir() {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if d.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Skip vault_index.db and its WAL/SHM sidecar files.
|
||||
if name == "vault_index.db" || name == "vault_index.db-shm" || name == "vault_index.db-wal" {
|
||||
return nil
|
||||
}
|
||||
|
||||
rel, err := filepath.Rel(vaultPath, path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_inventory_scan: rel path for %q: %w", path, err)
|
||||
}
|
||||
rel = filepath.ToSlash(rel)
|
||||
|
||||
info, err := d.Info()
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_inventory_scan: stat %q: %w", path, err)
|
||||
}
|
||||
|
||||
// Compute sha256 by streaming — avoids loading large files into memory.
|
||||
sha, err := fileSha256(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_inventory_scan: sha256 %q: %w", path, err)
|
||||
}
|
||||
|
||||
mime, err := detectVaultFileMime(path, name)
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_inventory_scan: mime %q: %w", path, err)
|
||||
}
|
||||
|
||||
ext := strings.ToLower(filepath.Ext(name))
|
||||
bucket, subBucket := vaultBucketParts(rel)
|
||||
|
||||
files = append(files, VaultFile{
|
||||
VaultID: vaultID,
|
||||
VaultName: vaultName,
|
||||
RelPath: rel,
|
||||
Size: info.Size(),
|
||||
Mtime: info.ModTime().UTC().Unix(),
|
||||
Sha256: sha,
|
||||
Mime: mime,
|
||||
Ext: ext,
|
||||
Bucket: bucket,
|
||||
SubBucket: subBucket,
|
||||
})
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_inventory_scan: walk %q: %w", vaultPath, err)
|
||||
}
|
||||
|
||||
sort.Slice(files, func(i, j int) bool {
|
||||
return files[i].RelPath < files[j].RelPath
|
||||
})
|
||||
return files, nil
|
||||
}
|
||||
|
||||
// fileSha256 returns the SHA-256 digest of the file at path as a lowercase
// hex string. The file is streamed through the hasher with io.Copy rather than
// read into memory in one piece. Open and read errors are returned unwrapped,
// together with an empty string.
func fileSha256(path string) (string, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	hasher := sha256.New()
	_, copyErr := io.Copy(hasher, src)
	if copyErr != nil {
		return "", copyErr
	}

	digest := hasher.Sum(nil)
	return hex.EncodeToString(digest), nil
}
|
||||
|
||||
// detectVaultFileMime returns the MIME type for a vault file.
//
// name is used only for its extension (case-insensitive). The extensions .csv,
// .md and .parquet map to fixed MIME types without opening the file. For any
// other extension the file at path is opened and up to its first 512 bytes are
// passed to http.DetectContentType.
//
// Errors from opening or reading the file are returned unwrapped together with
// an empty string. A file shorter than 512 bytes (including an empty one) is
// not an error.
func detectVaultFileMime(path, name string) (string, error) {
	switch strings.ToLower(filepath.Ext(name)) {
	case ".csv":
		return "text/csv", nil
	case ".md":
		return "text/markdown", nil
	case ".parquet":
		return "application/parquet", nil
	}

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	// io.ReadFull keeps reading until the buffer is full or the file ends; a
	// single Read is allowed to return fewer bytes than are available, which
	// would hand the sniffer a truncated prefix.
	var buf [512]byte
	n, err := io.ReadFull(f, buf[:])
	// io.EOF (empty file) and io.ErrUnexpectedEOF (short file) are expected.
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}
	return http.DetectContentType(buf[:n]), nil
}
|
||||
|
||||
// vaultBucketParts derives the bucket and sub-bucket of a file from its
// forward-slash relative path.
//
// Only directory segments count; the final segment is always the file name:
//
//	"data/raw/a.csv"       -> ("data", "raw")
//	"data/raw/deep/a.csv"  -> ("data", "raw")
//	"data/a.csv"           -> ("data", "")   file at the root of a bucket
//	"README.md"            -> ("", "")       file at the vault root
//
// The bucket name is not validated: any first-level directory is returned as is.
func vaultBucketParts(relPath string) (bucket, subBucket string) {
	// At most three pieces: bucket, sub-bucket, and the remainder of the path.
	parts := strings.SplitN(relPath, "/", 3)
	switch len(parts) {
	case 3:
		return parts[0], parts[1]
	case 2:
		// parts[1] is the file name, not a sub-bucket.
		return parts[0], ""
	default:
		// A single segment is a file sitting directly in the vault root.
		return "", ""
	}
}
|
||||
@@ -0,0 +1,74 @@
|
||||
---
|
||||
name: vault_inventory_scan
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error)"
|
||||
description: "Recorre vaultPath con filepath.WalkDir y retorna un slice de VaultFile ordenado por RelPath para cada archivo regular, computando sha256 por streaming, MIME por extension/magic y bucket/sub-bucket por posicion en el arbol."
|
||||
tags: [vault, inventory, scan, filesystem, sha256, mime, infra]
|
||||
uses_functions: []
|
||||
uses_types: ["vault_file_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [crypto/sha256, encoding/hex, fmt, io, net/http, os, path/filepath, sort, strings]
|
||||
params:
|
||||
- name: vaultPath
|
||||
desc: "ruta absoluta o relativa al directorio raiz del vault"
|
||||
- name: vaultID
|
||||
desc: "identificador del vault (ej: turismo_spain_app_turismo) — se copia a cada VaultFile"
|
||||
- name: vaultName
|
||||
desc: "nombre legible del vault (ej: turismo_spain) — se copia a cada VaultFile"
|
||||
output: "slice de VaultFile ordenado lexicograficamente por RelPath; slice vacio (no nil) si el vault esta vacio"
|
||||
tested: true
|
||||
tests:
|
||||
- "tmpdir vacio retorna slice vacio"
|
||||
- "data layout — bucket y sub_bucket correctos"
|
||||
- "knowledge layout — bucket y sub_bucket correctos"
|
||||
- "omite vault_index.db y .git"
|
||||
- "sha256 determinista para mismo contenido"
|
||||
- "orden lexicografico del resultado"
|
||||
test_file_path: "functions/infra/vault_inventory_scan_test.go"
|
||||
file_path: "functions/infra/vault_inventory_scan.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
files, err := VaultInventoryScan("/data/vaults/turismo_spain", "turismo_spain_v1", "turismo_spain")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
for _, f := range files {
|
||||
fmt.Printf("%s %s %s/%s\n", f.RelPath, f.Mime, f.Bucket, f.SubBucket)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
### Archivos omitidos
|
||||
- `vault_index.db`, `vault_index.db-shm`, `vault_index.db-wal` (siempre)
|
||||
- `.git/` en cualquier profundidad (SkipDir)
|
||||
- Entradas cuyo nombre empieza por `.` solo en la raiz del vault (nivel 0)
|
||||
|
||||
### Deteccion de MIME
|
||||
`file_validate_type_go_infra` (FileValidateType) no se usa porque su firma
|
||||
requiere una lista blanca de tipos permitidos y retorna (mime, bool) — esta
|
||||
disenada para validacion de uploads, no para escaneo inventarial donde
|
||||
cualquier MIME es valido. Se usan en su lugar:
|
||||
|
||||
1. Override por extension (prioridad alta): `.csv` → `text/csv`, `.md` → `text/markdown`,
|
||||
`.parquet` → `application/parquet`. Necesario porque `http.DetectContentType`
|
||||
clasifica CSV como `text/plain` y no conoce Parquet.
|
||||
2. `http.DetectContentType` sobre primeros 512 bytes (magic bytes, stdlib) para el resto.
|
||||
|
||||
### SHA-256
|
||||
Calculado por streaming con `io.Copy` a `sha256.New()` — no carga el archivo completo
|
||||
a memoria. Valido para archivos de cualquier tamano.
|
||||
|
||||
### Bucket / SubBucket
|
||||
Derivados de la posicion en el arbol:
|
||||
- `bucket` = primer segmento del RelPath (tipicamente "data" o "knowledge")
|
||||
- `subBucket` = segundo segmento si existe; vacio si el archivo esta en la raiz del bucket
|
||||
@@ -0,0 +1,182 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// writeTestFile creates the file dir/rel with the given content, creating any
// missing parent directories first. rel uses forward slashes and is converted
// to the OS separator. Directories are created with mode 0o755 and the file
// with 0o644. Any failure aborts the calling test.
func writeTestFile(t *testing.T, dir, rel, content string) {
	t.Helper()

	target := filepath.Join(dir, filepath.FromSlash(rel))
	parent := filepath.Dir(target)

	if mkErr := os.MkdirAll(parent, 0o755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", parent, mkErr)
	}

	writeErr := os.WriteFile(target, []byte(content), 0o644)
	if writeErr != nil {
		t.Fatalf("write %s: %v", target, writeErr)
	}
}
|
||||
|
||||
func TestVaultInventoryScan_Empty(t *testing.T) {
|
||||
t.Run("tmpdir vacio retorna slice vacio", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
files, err := VaultInventoryScan(dir, "v1", "test")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(files) != 0 {
|
||||
t.Errorf("expected 0 files, got %d", len(files))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_DataLayout(t *testing.T) {
|
||||
t.Run("data layout — bucket y sub_bucket correctos", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTestFile(t, dir, "data/raw/a.csv", "col1,col2\n1,2\n")
|
||||
writeTestFile(t, dir, "data/processed/b.parquet", "PAR1fakedata")
|
||||
|
||||
files, err := VaultInventoryScan(dir, "vid", "vname")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(files) != 2 {
|
||||
t.Fatalf("expected 2 files, got %d", len(files))
|
||||
}
|
||||
|
||||
// files are sorted: data/processed/b.parquet < data/raw/a.csv
|
||||
b := files[0]
|
||||
if b.RelPath != "data/processed/b.parquet" {
|
||||
t.Errorf("files[0].RelPath = %q, want data/processed/b.parquet", b.RelPath)
|
||||
}
|
||||
if b.Bucket != "data" {
|
||||
t.Errorf("files[0].Bucket = %q, want data", b.Bucket)
|
||||
}
|
||||
if b.SubBucket != "processed" {
|
||||
t.Errorf("files[0].SubBucket = %q, want processed", b.SubBucket)
|
||||
}
|
||||
if b.Mime != "application/parquet" {
|
||||
t.Errorf("files[0].Mime = %q, want application/parquet", b.Mime)
|
||||
}
|
||||
if b.Ext != ".parquet" {
|
||||
t.Errorf("files[0].Ext = %q, want .parquet", b.Ext)
|
||||
}
|
||||
if b.VaultID != "vid" {
|
||||
t.Errorf("VaultID = %q, want vid", b.VaultID)
|
||||
}
|
||||
|
||||
a := files[1]
|
||||
if a.RelPath != "data/raw/a.csv" {
|
||||
t.Errorf("files[1].RelPath = %q, want data/raw/a.csv", a.RelPath)
|
||||
}
|
||||
if a.Mime != "text/csv" {
|
||||
t.Errorf("files[1].Mime = %q, want text/csv", a.Mime)
|
||||
}
|
||||
if a.Bucket != "data" || a.SubBucket != "raw" {
|
||||
t.Errorf("files[1]: bucket=%q subBucket=%q, want data/raw", a.Bucket, a.SubBucket)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_KnowledgeLayout(t *testing.T) {
|
||||
t.Run("knowledge layout — bucket y sub_bucket correctos", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTestFile(t, dir, "knowledge/decisions/x.md", "# Decision\n\ncontent")
|
||||
|
||||
files, err := VaultInventoryScan(dir, "vid", "vname")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(files) != 1 {
|
||||
t.Fatalf("expected 1 file, got %d", len(files))
|
||||
}
|
||||
f := files[0]
|
||||
if f.RelPath != "knowledge/decisions/x.md" {
|
||||
t.Errorf("RelPath = %q", f.RelPath)
|
||||
}
|
||||
if f.Bucket != "knowledge" {
|
||||
t.Errorf("Bucket = %q, want knowledge", f.Bucket)
|
||||
}
|
||||
if f.SubBucket != "decisions" {
|
||||
t.Errorf("SubBucket = %q, want decisions", f.SubBucket)
|
||||
}
|
||||
if f.Mime != "text/markdown" {
|
||||
t.Errorf("Mime = %q, want text/markdown", f.Mime)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_SkipsIndexAndGit(t *testing.T) {
|
||||
t.Run("omite vault_index.db y .git", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTestFile(t, dir, "vault_index.db", "sqlite data")
|
||||
writeTestFile(t, dir, "vault_index.db-wal", "wal data")
|
||||
writeTestFile(t, dir, ".git/HEAD", "ref: refs/heads/master")
|
||||
writeTestFile(t, dir, "data/raw/real.csv", "a,b\n1,2\n")
|
||||
|
||||
files, err := VaultInventoryScan(dir, "vid", "vname")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(files) != 1 {
|
||||
t.Fatalf("expected 1 file (real.csv), got %d: %v", len(files), relPaths(files))
|
||||
}
|
||||
if files[0].RelPath != "data/raw/real.csv" {
|
||||
t.Errorf("unexpected file: %q", files[0].RelPath)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_Sha256Deterministic(t *testing.T) {
|
||||
t.Run("sha256 determinista para mismo contenido", func(t *testing.T) {
|
||||
dir1 := t.TempDir()
|
||||
dir2 := t.TempDir()
|
||||
content := "deterministic content 123\n"
|
||||
writeTestFile(t, dir1, "data/raw/f.csv", content)
|
||||
writeTestFile(t, dir2, "data/raw/f.csv", content)
|
||||
|
||||
files1, err := VaultInventoryScan(dir1, "v1", "vault1")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
files2, err := VaultInventoryScan(dir2, "v2", "vault2")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if files1[0].Sha256 != files2[0].Sha256 {
|
||||
t.Errorf("sha256 mismatch: %q vs %q", files1[0].Sha256, files2[0].Sha256)
|
||||
}
|
||||
if len(files1[0].Sha256) != 64 {
|
||||
t.Errorf("sha256 length = %d, want 64", len(files1[0].Sha256))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultInventoryScan_Sorted(t *testing.T) {
|
||||
t.Run("orden lexicografico del resultado", func(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
writeTestFile(t, dir, "knowledge/decisions/z.md", "z")
|
||||
writeTestFile(t, dir, "data/raw/a.csv", "a")
|
||||
writeTestFile(t, dir, "data/processed/m.parquet", "m")
|
||||
writeTestFile(t, dir, "knowledge/domains/b.md", "b")
|
||||
|
||||
files, err := VaultInventoryScan(dir, "v", "v")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for i := 1; i < len(files); i++ {
|
||||
if files[i].RelPath < files[i-1].RelPath {
|
||||
t.Errorf("not sorted at index %d: %q < %q", i, files[i].RelPath, files[i-1].RelPath)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// relPaths is a helper for test error messages.
|
||||
func relPaths(files []VaultFile) []string {
|
||||
out := make([]string, len(files))
|
||||
for i, f := range files {
|
||||
out[i] = f.RelPath
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -0,0 +1,252 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// LayoutReport summarises what VaultLayoutEnsure did to a vault directory, or
// would do when DryRun is set. Paths inside the slices are relative to the
// vault root.
type LayoutReport struct {
	// VaultPath is the vault root the report refers to.
	VaultPath string `json:"vault_path"`
	// Created lists directories that were created.
	Created []string `json:"created"`
	// Migrated lists renames that were executed, formatted as "src -> dst".
	Migrated []string `json:"migrated"`
	// AlreadyOK lists directories that already existed at the target location.
	AlreadyOK []string `json:"already_ok"`
	// Skipped lists unrecognized root-level entries, which are left untouched.
	Skipped []string `json:"skipped"`
	// DryRun echoes whether the run was a dry run.
	DryRun bool `json:"dry_run"`
}
|
||||
|
||||
// Root-level names recognised by the vault layout migration.
var (
	// dataBuckets are root-level directories that belong under data/.
	dataBuckets = []string{
		"raw",
		"processed",
		"exports",
	}

	// knowledgeBuckets are root-level directories that belong under knowledge/.
	knowledgeBuckets = []string{
		"decisions",
		"domains",
		"models",
		"benchmarks",
		"test_documents",
	}

	// knownRootFiles are root-level files that should be moved to knowledge/.
	knownRootFiles = []string{
		"README.md",
		"README.txt",
	}
)
|
||||
|
||||
// VaultLayoutEnsure ensures a vault directory uses the canonical hybrid layout:
|
||||
//
|
||||
// data/{raw,processed,exports}
|
||||
// knowledge/{decisions,domains,models,benchmarks,test_documents}
|
||||
//
|
||||
// Legacy vaults that have these directories at the root are migrated by renaming
|
||||
// (or merging when both src and dst already exist). The operation is idempotent:
|
||||
// a second run returns everything in AlreadyOK.
|
||||
//
|
||||
// When dryRun is true the function computes the report but does not touch the disk.
|
||||
func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error) {
|
||||
report := LayoutReport{DryRun: dryRun}
|
||||
|
||||
// --- resolve path ---
|
||||
vaultPath = strings.TrimRight(vaultPath, "/\\")
|
||||
|
||||
var err error
|
||||
vaultPath, err = filepath.Abs(vaultPath)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: abs(%q): %w", vaultPath, err)
|
||||
}
|
||||
|
||||
// Follow symlinks for the vault root itself.
|
||||
resolved, err := filepath.EvalSymlinks(vaultPath)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: eval symlinks %q: %w", vaultPath, err)
|
||||
}
|
||||
vaultPath = resolved
|
||||
report.VaultPath = vaultPath
|
||||
|
||||
// --- check that vault exists and is a directory ---
|
||||
info, err := os.Stat(vaultPath)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: stat %q: %w", vaultPath, err)
|
||||
}
|
||||
if !info.IsDir() {
|
||||
return report, fmt.Errorf("vault_layout_ensure: %q is not a directory", vaultPath)
|
||||
}
|
||||
|
||||
// --- ensure top-level containers ---
|
||||
for _, container := range []string{"data", "knowledge"} {
|
||||
dst := filepath.Join(vaultPath, container)
|
||||
if err := ensureDir(dst, dryRun, container, &report); err != nil {
|
||||
return report, err
|
||||
}
|
||||
}
|
||||
|
||||
// --- build migration table: root name -> relative destination ---
|
||||
type migration struct {
|
||||
rootName string // name in vault root (dir or file)
|
||||
dstRel string // relative destination path inside vault
|
||||
isFile bool
|
||||
}
|
||||
|
||||
var migrations []migration
|
||||
for _, b := range dataBuckets {
|
||||
migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("data", b)})
|
||||
}
|
||||
for _, b := range knowledgeBuckets {
|
||||
migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("knowledge", b)})
|
||||
}
|
||||
for _, rf := range knownRootFiles {
|
||||
migrations = append(migrations, migration{rootName: rf, dstRel: filepath.Join("knowledge", "README.md"), isFile: true})
|
||||
}
|
||||
|
||||
// Track which root names are "known" so we can compute Skipped.
|
||||
knownNames := make(map[string]struct{})
|
||||
for _, m := range migrations {
|
||||
knownNames[strings.ToLower(m.rootName)] = struct{}{}
|
||||
}
|
||||
knownNames["data"] = struct{}{}
|
||||
knownNames["knowledge"] = struct{}{}
|
||||
|
||||
// --- apply migrations ---
|
||||
for _, m := range migrations {
|
||||
src := filepath.Join(vaultPath, m.rootName)
|
||||
dst := filepath.Join(vaultPath, m.dstRel)
|
||||
srcRel := m.rootName
|
||||
dstRel := m.dstRel
|
||||
|
||||
srcExists := pathExists(src)
|
||||
dstExists := pathExists(dst)
|
||||
|
||||
switch {
|
||||
case srcExists && dstExists:
|
||||
// Both exist: merge if directory, error on file collision.
|
||||
if m.isFile {
|
||||
return report, fmt.Errorf("vault_layout_ensure: conflict: both %q and %q exist", srcRel, dstRel)
|
||||
}
|
||||
if err := mergeDirs(src, dst, srcRel, dstRel, dryRun, &report); err != nil {
|
||||
return report, err
|
||||
}
|
||||
|
||||
case srcExists && !dstExists:
|
||||
// Only source exists: rename.
|
||||
report.Migrated = append(report.Migrated, fmt.Sprintf("%s -> %s", srcRel, dstRel))
|
||||
if !dryRun {
|
||||
if err := os.Rename(src, dst); err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", src, dst, err)
|
||||
}
|
||||
}
|
||||
|
||||
case !srcExists && dstExists:
|
||||
// Already migrated.
|
||||
report.AlreadyOK = append(report.AlreadyOK, dstRel)
|
||||
|
||||
default:
|
||||
// Neither exists: create empty destination directory (skip for files).
|
||||
if !m.isFile {
|
||||
report.Created = append(report.Created, dstRel)
|
||||
if !dryRun {
|
||||
if err := os.MkdirAll(dst, 0o755); err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: mkdir %q: %w", dst, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- collect skipped (unrecognized root entries) ---
|
||||
entries, err := os.ReadDir(vaultPath)
|
||||
if err != nil {
|
||||
return report, fmt.Errorf("vault_layout_ensure: readdir %q: %w", vaultPath, err)
|
||||
}
|
||||
for _, e := range entries {
|
||||
if _, known := knownNames[strings.ToLower(e.Name())]; !known {
|
||||
report.Skipped = append(report.Skipped, e.Name())
|
||||
}
|
||||
}
|
||||
|
||||
return report, nil
|
||||
}
|
||||
|
||||
// ensureDir adds the dir to Created (and creates it) if it doesn't exist,
|
||||
// or to AlreadyOK if it does. Used for top-level containers "data" and "knowledge".
|
||||
func ensureDir(path string, dryRun bool, rel string, report *LayoutReport) error {
|
||||
if pathExists(path) {
|
||||
report.AlreadyOK = append(report.AlreadyOK, rel)
|
||||
return nil
|
||||
}
|
||||
report.Created = append(report.Created, rel)
|
||||
if dryRun {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(path, 0o755); err != nil {
|
||||
return fmt.Errorf("vault_layout_ensure: mkdir %q: %w", path, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// mergeDirs moves the contents of src into dst, then removes src if empty.
|
||||
// Returns an error if any file in src already exists in dst (no overwrite policy).
|
||||
func mergeDirs(src, dst, srcRel, dstRel string, dryRun bool, report *LayoutReport) error {
|
||||
children, err := os.ReadDir(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("vault_layout_ensure: readdir %q: %w", src, err)
|
||||
}
|
||||
|
||||
for _, child := range children {
|
||||
childDst := filepath.Join(dst, child.Name())
|
||||
if pathExists(childDst) {
|
||||
return fmt.Errorf("vault_layout_ensure: merge conflict: %q already exists in %q (cannot overwrite %q)",
|
||||
child.Name(), dstRel, filepath.Join(srcRel, child.Name()))
|
||||
}
|
||||
childSrc := filepath.Join(src, child.Name())
|
||||
childSrcRel := filepath.Join(srcRel, child.Name())
|
||||
childDstRel := filepath.Join(dstRel, child.Name())
|
||||
report.Migrated = append(report.Migrated, fmt.Sprintf("%s -> %s", childSrcRel, childDstRel))
|
||||
if !dryRun {
|
||||
if err := os.Rename(childSrc, childDst); err != nil {
|
||||
return fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", childSrc, childDst, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Remove the now-empty src directory.
|
||||
if !dryRun {
|
||||
// Re-check emptiness after renames.
|
||||
remaining, _ := os.ReadDir(src)
|
||||
if len(remaining) == 0 {
|
||||
if err := os.Remove(src); err != nil {
|
||||
return fmt.Errorf("vault_layout_ensure: remove empty src %q: %w", src, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// pathExists reports whether os.Lstat succeeds on path. Because Lstat does
// not follow symlinks, a dangling symlink still counts as existing. Any Lstat
// error (not only "not found") is treated as "does not exist".
func pathExists(path string) bool {
	if _, statErr := os.Lstat(path); statErr != nil {
		return false
	}
	return true
}
|
||||
|
||||
// dirIsEmpty reports whether path can be listed as a directory and contains
// no entries. Any os.ReadDir error (missing path, not a directory, permission
// problem) yields false.
func dirIsEmpty(path string) bool {
	entries, err := os.ReadDir(path)
	return err == nil && len(entries) == 0
}

// Blank reference so dirIsEmpty counts as used even when only tests call it
// (presumably to keep unused-code linters quiet; the Go compiler itself does
// not reject unused package-level functions).
var _ = dirIsEmpty
|
||||
|
||||
// vaultLayoutKnownNames returns the set of root-level names managed by this function.
|
||||
// Exported for use in tests.
|
||||
func vaultLayoutKnownNames() map[string]struct{} {
|
||||
known := make(map[string]struct{})
|
||||
for _, b := range dataBuckets {
|
||||
known[b] = struct{}{}
|
||||
}
|
||||
for _, b := range knowledgeBuckets {
|
||||
known[b] = struct{}{}
|
||||
}
|
||||
for _, rf := range knownRootFiles {
|
||||
known[strings.ToLower(rf)] = struct{}{}
|
||||
}
|
||||
known["data"] = struct{}{}
|
||||
known["knowledge"] = struct{}{}
|
||||
return known
|
||||
}
|
||||
|
||||
@@ -0,0 +1,95 @@
|
||||
---
|
||||
name: vault_layout_ensure
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error)"
|
||||
description: "Normaliza el layout de un vault al esquema hibrido canónico data/{raw,processed,exports} + knowledge/{decisions,domains,models,benchmarks,test_documents}. Migra directorios legacy en la raíz del vault a su ubicación correcta; idempotente."
|
||||
tags: [vault, layout, migration, infra, filesystem, idempotent]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- "fmt"
|
||||
- "os"
|
||||
- "path/filepath"
|
||||
- "strings"
|
||||
params:
|
||||
- name: vault_path
|
||||
desc: "Ruta al directorio raíz del vault. Puede ser absoluta, relativa o un symlink — se resuelve con filepath.Abs + filepath.EvalSymlinks. Trailing slashes se ignoran."
|
||||
- name: dry_run
|
||||
desc: "Si true, calcula el reporte completo (qué se crearía, migraría, etc.) pero no modifica el disco. Útil para previsualizar antes de ejecutar."
|
||||
output: "LayoutReport con: VaultPath (ruta resuelta), Created (dirs creados), Migrated (renombres ejecutados, formato 'src -> dst'), AlreadyOK (destinos que ya existían), Skipped (entradas en raíz no reconocidas, no tocadas), DryRun (flag). Error si el path no existe, no es directorio, o hay conflicto de merge (mismo nombre de archivo en src y dst)."
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultLayoutEnsure_DryRun_NoChange"
|
||||
- "TestVaultLayoutEnsure_FreshDir_CreatesLayout"
|
||||
- "TestVaultLayoutEnsure_LegacyDataLayout_Migrates"
|
||||
- "TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates"
|
||||
- "TestVaultLayoutEnsure_AlreadyMigrated_Idempotent"
|
||||
- "TestVaultLayoutEnsure_Mixed_PartialMigration"
|
||||
- "TestVaultLayoutEnsure_MergeConflict_Errors"
|
||||
- "TestVaultLayoutEnsure_UnknownFiles_Skipped"
|
||||
- "TestVaultLayoutEnsure_NotADir_Errors"
|
||||
test_file_path: "functions/infra/vault_layout_ensure_test.go"
|
||||
file_path: "functions/infra/vault_layout_ensure.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
// Previsualizar sin tocar disco:
|
||||
report, err := VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", true)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
fmt.Printf("Would migrate: %v\n", report.Migrated)
|
||||
fmt.Printf("Would create: %v\n", report.Created)
|
||||
|
||||
// Ejecutar la migración:
|
||||
report, err = VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", false)
|
||||
if err != nil {
|
||||
log.Fatalf("migration failed: %v", err)
|
||||
}
|
||||
fmt.Printf("Migrated: %v\n", report.Migrated)
|
||||
fmt.Printf("Created: %v\n", report.Created)
|
||||
fmt.Printf("Skipped: %v\n", report.Skipped)
|
||||
```
|
||||
|
||||
## Comportamiento detallado
|
||||
|
||||
**Directorios gestionados:**
|
||||
|
||||
| Raíz (legacy) | Destino canónico |
|
||||
|---|---|
|
||||
| `raw/` | `data/raw/` |
|
||||
| `processed/` | `data/processed/` |
|
||||
| `exports/` | `data/exports/` |
|
||||
| `decisions/` | `knowledge/decisions/` |
|
||||
| `domains/` | `knowledge/domains/` |
|
||||
| `models/` | `knowledge/models/` |
|
||||
| `benchmarks/` | `knowledge/benchmarks/` |
|
||||
| `test_documents/` | `knowledge/test_documents/` |
|
||||
| `README.md` / `README.txt` | `knowledge/README.md` |
|
||||
|
||||
**Lógica de migración (por cada entrada conocida):**
|
||||
|
||||
- Solo `src` existe → rename atómico `src` → `dst`, registrado en `Migrated`.
|
||||
- Solo `dst` existe → ya migrado, registrado en `AlreadyOK`.
|
||||
- Ambos existen (dir) → merge: mueve cada hijo de `src/` a `dst/`; error si mismo nombre. Registrado en `Migrated` por hijo.
|
||||
- Ambos existen (archivo README) → error inmediato con paths concretos.
|
||||
- Ninguno existe → crea `dst` vacío, registrado en `Created`.
|
||||
|
||||
**Archivos/dirs no reconocidos** en la raíz (`.git`, `vault_index.db`, archivos custom) se registran en `Skipped` y no se tocan.
|
||||
|
||||
**Idempotencia:** segunda ejecución sobre un vault ya migrado reporta todo en `AlreadyOK` y no toca disco.
|
||||
|
||||
## Notas
|
||||
|
||||
`LayoutReport` es un tipo local de esta función (no un tipo del registry). El struct exportado vive en `functions/infra/vault_layout_ensure.go` junto con la función.
|
||||
|
||||
Para aplicar la migración a múltiples vaults en batch, invocar desde un pipeline que lea los paths de `vault.yaml` (ver `vault_manifest_read_go_infra`) y llame a `VaultLayoutEnsure` en cada uno.
|
||||
@@ -0,0 +1,394 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// mkVaultDir builds a throwaway vault tree under t.TempDir() and returns its
// root.
//
// entries are slash-separated paths relative to the root. An entry ending in
// "/" becomes a directory; any other entry becomes a file containing "test\n"
// (its parent directories are created as needed). Entries must be non-empty.
// Any filesystem error aborts the test via t.Fatalf.
func mkVaultDir(t *testing.T, entries []string) string {
	t.Helper()
	root := t.TempDir()
	for _, entry := range entries {
		target := filepath.Join(root, filepath.FromSlash(entry))
		wantDir := entry[len(entry)-1] == '/'
		if wantDir {
			if err := os.MkdirAll(target, 0o755); err != nil {
				t.Fatalf("mkVaultDir: mkdir %q: %v", target, err)
			}
			continue
		}
		if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
			t.Fatalf("mkVaultDir: mkdir parent %q: %v", target, err)
		}
		if err := os.WriteFile(target, []byte("test\n"), 0o644); err != nil {
			t.Fatalf("mkVaultDir: write %q: %v", target, err)
		}
	}
	return root
}
|
||||
|
||||
func TestVaultLayoutEnsure_DryRun_NoChange(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
"raw/",
|
||||
"raw/file1.csv",
|
||||
"processed/",
|
||||
})
|
||||
|
||||
before := snapshotDir(t, root)
|
||||
report, err := VaultLayoutEnsure(root, true)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if !report.DryRun {
|
||||
t.Error("DryRun flag not set in report")
|
||||
}
|
||||
after := snapshotDir(t, root)
|
||||
if !mapEqual(before, after) {
|
||||
t.Errorf("dry-run modified disk: before=%v after=%v", before, after)
|
||||
}
|
||||
// Should have planned a migration for raw and processed.
|
||||
if len(report.Migrated) == 0 {
|
||||
t.Error("expected Migrated to be non-empty in dry-run plan")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_FreshDir_CreatesLayout(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{}) // empty vault
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// All standard dirs should be created.
|
||||
wantCreated := []string{
|
||||
"data", "knowledge",
|
||||
filepath.Join("data", "raw"),
|
||||
filepath.Join("data", "processed"),
|
||||
filepath.Join("data", "exports"),
|
||||
filepath.Join("knowledge", "decisions"),
|
||||
filepath.Join("knowledge", "domains"),
|
||||
filepath.Join("knowledge", "models"),
|
||||
filepath.Join("knowledge", "benchmarks"),
|
||||
filepath.Join("knowledge", "test_documents"),
|
||||
}
|
||||
createdSet := toSet(report.Created)
|
||||
for _, w := range wantCreated {
|
||||
if _, ok := createdSet[w]; !ok {
|
||||
t.Errorf("expected Created to contain %q, got %v", w, report.Created)
|
||||
}
|
||||
}
|
||||
|
||||
// All directories must actually exist on disk.
|
||||
for _, w := range wantCreated {
|
||||
full := filepath.Join(root, w)
|
||||
info, err := os.Stat(full)
|
||||
if err != nil {
|
||||
t.Errorf("expected %q to exist: %v", full, err)
|
||||
continue
|
||||
}
|
||||
if !info.IsDir() {
|
||||
t.Errorf("%q should be a directory", full)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_LegacyDataLayout_Migrates(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
"raw/",
|
||||
"raw/file1.parquet",
|
||||
"raw/file2.parquet",
|
||||
"processed/",
|
||||
"processed/clean.csv",
|
||||
"exports/",
|
||||
})
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// raw and processed should appear in Migrated (as dirs, top-level rename).
|
||||
migratedSet := toSet(report.Migrated)
|
||||
for _, pair := range []string{
|
||||
"raw -> " + filepath.Join("data", "raw"),
|
||||
"processed -> " + filepath.Join("data", "processed"),
|
||||
} {
|
||||
if _, ok := migratedSet[pair]; !ok {
|
||||
t.Errorf("expected Migrated to contain %q, got %v", pair, report.Migrated)
|
||||
}
|
||||
}
|
||||
|
||||
// Files must have moved.
|
||||
for _, f := range []string{
|
||||
filepath.Join("data", "raw", "file1.parquet"),
|
||||
filepath.Join("data", "raw", "file2.parquet"),
|
||||
filepath.Join("data", "processed", "clean.csv"),
|
||||
} {
|
||||
if _, err := os.Stat(filepath.Join(root, f)); err != nil {
|
||||
t.Errorf("expected %q to exist after migration: %v", f, err)
|
||||
}
|
||||
}
|
||||
// Old dirs must be gone.
|
||||
for _, d := range []string{"raw", "processed"} {
|
||||
if pathExists(filepath.Join(root, d)) {
|
||||
t.Errorf("expected legacy dir %q to be removed", d)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
"decisions/",
|
||||
"decisions/2024-01.md",
|
||||
"models/",
|
||||
"models/ner_v1.pkl",
|
||||
"README.md",
|
||||
})
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// decisions and models should appear in Migrated.
|
||||
migratedSet := toSet(report.Migrated)
|
||||
for _, pair := range []string{
|
||||
"decisions -> " + filepath.Join("knowledge", "decisions"),
|
||||
"models -> " + filepath.Join("knowledge", "models"),
|
||||
"README.md -> " + filepath.Join("knowledge", "README.md"),
|
||||
} {
|
||||
if _, ok := migratedSet[pair]; !ok {
|
||||
t.Errorf("expected Migrated to contain %q, got %v", pair, report.Migrated)
|
||||
}
|
||||
}
|
||||
|
||||
// Files must be at new location.
|
||||
for _, f := range []string{
|
||||
filepath.Join("knowledge", "decisions", "2024-01.md"),
|
||||
filepath.Join("knowledge", "models", "ner_v1.pkl"),
|
||||
filepath.Join("knowledge", "README.md"),
|
||||
} {
|
||||
if _, err := os.Stat(filepath.Join(root, f)); err != nil {
|
||||
t.Errorf("expected %q to exist after migration: %v", f, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_AlreadyMigrated_Idempotent(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
"data/",
|
||||
"data/raw/",
|
||||
"data/raw/file.csv",
|
||||
"data/processed/",
|
||||
"data/exports/",
|
||||
"knowledge/",
|
||||
"knowledge/decisions/",
|
||||
"knowledge/domains/",
|
||||
"knowledge/models/",
|
||||
"knowledge/benchmarks/",
|
||||
"knowledge/test_documents/",
|
||||
})
|
||||
|
||||
report1, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("first run error: %v", err)
|
||||
}
|
||||
if len(report1.Migrated) != 0 {
|
||||
t.Errorf("first run on fully-migrated vault should have no migrations, got %v", report1.Migrated)
|
||||
}
|
||||
|
||||
before := snapshotDir(t, root)
|
||||
report2, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("second run error: %v", err)
|
||||
}
|
||||
after := snapshotDir(t, root)
|
||||
|
||||
if !mapEqual(before, after) {
|
||||
t.Error("second run modified disk (not idempotent)")
|
||||
}
|
||||
if len(report2.Migrated) != 0 {
|
||||
t.Errorf("second run should produce no migrations, got %v", report2.Migrated)
|
||||
}
|
||||
if len(report2.AlreadyOK) == 0 {
|
||||
t.Error("second run should report existing dirs as AlreadyOK")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_Mixed_PartialMigration(t *testing.T) {
|
||||
// data/raw already migrated; exports still at root; knowledge dirs in legacy positions.
|
||||
root := mkVaultDir(t, []string{
|
||||
"data/",
|
||||
"data/raw/",
|
||||
"data/raw/already_here.csv",
|
||||
"exports/",
|
||||
"exports/report.pdf",
|
||||
"decisions/",
|
||||
"decisions/2023-note.md",
|
||||
})
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
// data/raw should be AlreadyOK.
|
||||
if !sliceContains(report.AlreadyOK, filepath.Join("data", "raw")) {
|
||||
t.Errorf("data/raw should be AlreadyOK, got AlreadyOK=%v", report.AlreadyOK)
|
||||
}
|
||||
// exports should be migrated.
|
||||
exportsMigrated := false
|
||||
for _, m := range report.Migrated {
|
||||
if m == "exports -> "+filepath.Join("data", "exports") {
|
||||
exportsMigrated = true
|
||||
}
|
||||
}
|
||||
if !exportsMigrated {
|
||||
t.Errorf("exports should be migrated, Migrated=%v", report.Migrated)
|
||||
}
|
||||
// decisions should be migrated.
|
||||
decisionsMigrated := false
|
||||
for _, m := range report.Migrated {
|
||||
if m == "decisions -> "+filepath.Join("knowledge", "decisions") {
|
||||
decisionsMigrated = true
|
||||
}
|
||||
}
|
||||
if !decisionsMigrated {
|
||||
t.Errorf("decisions should be migrated, Migrated=%v", report.Migrated)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_MergeConflict_Errors(t *testing.T) {
|
||||
// Both src (raw/) and dst (data/raw/) exist and have a file with the same name.
|
||||
root := mkVaultDir(t, []string{
|
||||
"raw/",
|
||||
"raw/collision.csv",
|
||||
"data/",
|
||||
"data/raw/",
|
||||
"data/raw/collision.csv", // same name -> conflict
|
||||
})
|
||||
|
||||
_, err := VaultLayoutEnsure(root, false)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for merge conflict, got nil")
|
||||
}
|
||||
if !contains(err.Error(), "conflict") && !contains(err.Error(), "collision.csv") {
|
||||
t.Errorf("error should mention conflict or the file name, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_UnknownFiles_Skipped(t *testing.T) {
|
||||
root := mkVaultDir(t, []string{
|
||||
".git/",
|
||||
"vault_index.db",
|
||||
"my_custom_notes.txt",
|
||||
"raw/",
|
||||
})
|
||||
|
||||
report, err := VaultLayoutEnsure(root, false)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
skippedSet := toSet(report.Skipped)
|
||||
for _, name := range []string{".git", "vault_index.db", "my_custom_notes.txt"} {
|
||||
if _, ok := skippedSet[name]; !ok {
|
||||
t.Errorf("expected %q in Skipped, got %v", name, report.Skipped)
|
||||
}
|
||||
}
|
||||
// raw should NOT be in Skipped (it's a known bucket).
|
||||
if _, ok := skippedSet["raw"]; ok {
|
||||
t.Error("raw should not appear in Skipped — it is a known bucket")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultLayoutEnsure_NotADir_Errors(t *testing.T) {
|
||||
t.Run("non-existent path", func(t *testing.T) {
|
||||
_, err := VaultLayoutEnsure("/tmp/does_not_exist_fn_registry_test_xyz", false)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for non-existent path")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("path is a file", func(t *testing.T) {
|
||||
f, err := os.CreateTemp("", "vault_layout_*.txt")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
f.Close()
|
||||
defer os.Remove(f.Name())
|
||||
|
||||
_, err = VaultLayoutEnsure(f.Name(), false)
|
||||
if err == nil {
|
||||
t.Fatal("expected error when vaultPath is a file, not a dir")
|
||||
}
|
||||
if !contains(err.Error(), "not a directory") {
|
||||
t.Errorf("error should mention 'not a directory', got: %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// --- helpers ---
|
||||
|
||||
// snapshotDir walks root and returns a set (path -> true) of every entry
// found, keyed by its path relative to root; root itself appears as ".".
// Only names are captured, not contents or metadata. A walk error fails the
// test.
func snapshotDir(t *testing.T, root string) map[string]bool {
	t.Helper()
	seen := map[string]bool{}
	visit := func(path string, _ os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		// Rel error is deliberately ignored, as before: every visited path
		// is located under root.
		rel, _ := filepath.Rel(root, path)
		seen[rel] = true
		return nil
	}
	if err := filepath.WalkDir(root, visit); err != nil {
		t.Fatalf("snapshotDir: %v", err)
	}
	return seen
}
|
||||
|
||||
// mapEqual reports whether a and b have the same number of entries and every
// key of a maps to true in b. The values stored in a are not inspected.
func mapEqual(a, b map[string]bool) bool {
	same := len(a) == len(b)
	if same {
		for key := range a {
			if !b[key] {
				same = false
				break
			}
		}
	}
	return same
}
|
||||
|
||||
// toSet converts ss into a set keyed by its elements; duplicates collapse.
// The result is always a non-nil map, even for nil or empty input.
func toSet(ss []string) map[string]struct{} {
	set := make(map[string]struct{}, len(ss))
	for i := range ss {
		set[ss[i]] = struct{}{}
	}
	return set
}
|
||||
|
||||
// sliceContains reports whether target is an element of ss (exact match).
func sliceContains(ss []string, target string) bool {
	for i := 0; i < len(ss); i++ {
		if ss[i] == target {
			return true
		}
	}
	return false
}
|
||||
|
||||
// contains reports whether sub occurs anywhere in s (byte-wise). The empty
// string is contained in every string. Hand-rolled so this test file does not
// need the strings package.
func contains(s, sub string) bool {
	n := len(sub)
	if n > len(s) {
		return false
	}
	// Slide a window of len(sub) bytes across s.
	for start := 0; start+n <= len(s); start++ {
		if s[start:start+n] == sub {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,96 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// VaultManifestEntry describes one vault declared in a
// projects/<proj>/vaults/vault.yaml manifest.
type VaultManifestEntry struct {
	// ProjectID is the <proj> directory name, inferred from the manifest path.
	ProjectID string
	// Name is the vault name as declared in vault.yaml.
	Name string
	// Description is the human-readable description from vault.yaml.
	Description string
	// Path is the vault directory as declared in vault.yaml (expected to be
	// an absolute path; the value is copied from the manifest as written).
	Path string
	// Tags are the tags declared in vault.yaml.
	Tags []string
	// ManifestFile is the path of the vault.yaml this entry was read from.
	ManifestFile string
}
|
||||
|
||||
// vaultYAML is the decoding target for a vault.yaml document. Only the fields
// this package consumes are mapped: a top-level "vaults" list whose items
// carry name, description, path and tags.
type vaultYAML struct {
	Vaults []struct {
		Name        string   `yaml:"name"`
		Description string   `yaml:"description"`
		Path        string   `yaml:"path"`
		Tags        []string `yaml:"tags"`
	} `yaml:"vaults"`
}
|
||||
|
||||
// VaultManifestRead globs all projects/*/vaults/vault.yaml under repoRoot, parses each
|
||||
// manifest and returns a flat slice of VaultManifestEntry.
|
||||
//
|
||||
// Rules:
|
||||
// - If a manifest fails to parse, an error is returned immediately with the file path.
|
||||
// - If no manifests are found, an empty slice is returned (not an error).
|
||||
// - ProjectID is inferred from the directory component between "projects/" and "/vaults/".
|
||||
func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error) {
|
||||
pattern := filepath.Join(repoRoot, "projects", "*", "vaults", "vault.yaml")
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_manifest_read: glob %q: %w", pattern, err)
|
||||
}
|
||||
|
||||
var out []VaultManifestEntry
|
||||
for _, manifestPath := range matches {
|
||||
entries, err := parseVaultManifest(manifestPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, entries...)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func parseVaultManifest(manifestPath string) ([]VaultManifestEntry, error) {
|
||||
data, err := os.ReadFile(manifestPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_manifest_read: read %q: %w", manifestPath, err)
|
||||
}
|
||||
|
||||
var raw vaultYAML
|
||||
if err := yaml.Unmarshal(data, &raw); err != nil {
|
||||
return nil, fmt.Errorf("vault_manifest_read: parse %q: %w", manifestPath, err)
|
||||
}
|
||||
|
||||
projectID := inferProjectID(manifestPath)
|
||||
|
||||
entries := make([]VaultManifestEntry, 0, len(raw.Vaults))
|
||||
for _, v := range raw.Vaults {
|
||||
entries = append(entries, VaultManifestEntry{
|
||||
ProjectID: projectID,
|
||||
Name: v.Name,
|
||||
Description: v.Description,
|
||||
Path: v.Path,
|
||||
Tags: v.Tags,
|
||||
ManifestFile: manifestPath,
|
||||
})
|
||||
}
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// inferProjectID extracts <proj> from a manifest path of the form
// .../projects/<proj>/vaults/vault.yaml.
//
// The path is matched from its END, so a "projects" directory that merely
// happens to appear earlier in the path (for example a repository checked out
// under /home/user/projects/) is not mistaken for the registry's projects
// directory. For paths that do not have the canonical shape, the component
// following the last "projects" element is returned. If there is no such
// component the result is "".
func inferProjectID(manifestPath string) string {
	// Normalize separators so the same logic works on every OS.
	parts := strings.Split(filepath.ToSlash(manifestPath), "/")
	n := len(parts)

	// Canonical shape: [..., "projects", <proj>, "vaults", <file>].
	if n >= 4 && parts[n-4] == "projects" && parts[n-2] == "vaults" {
		return parts[n-3]
	}

	// Fallback: walk backwards and take the component right after the
	// nearest "projects" element (it must have a successor, hence n-2).
	for i := n - 2; i >= 0; i-- {
		if parts[i] == "projects" {
			return parts[i+1]
		}
	}
	return ""
}
|
||||
@@ -0,0 +1,59 @@
|
||||
---
|
||||
name: vault_manifest_read
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error)"
|
||||
description: "Lee todos los manifests vault.yaml bajo projects/*/vaults/ del repo y devuelve una lista plana de entradas de vault con su ProjectID inferido del path."
|
||||
tags: [vault, manifest, yaml, infra, projects, storage]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports:
|
||||
- "fmt"
|
||||
- "os"
|
||||
- "path/filepath"
|
||||
- "strings"
|
||||
- "gopkg.in/yaml.v3"
|
||||
params:
|
||||
- name: repoRoot
|
||||
desc: "Ruta absoluta a la raiz del repositorio fn_registry. Se usa como base para el glob projects/*/vaults/vault.yaml."
|
||||
output: "Slice plano de VaultManifestEntry (ProjectID, Name, Description, Path, Tags, ManifestFile). Vacio si no hay manifests. Error si un yaml no parsea, con el path concreto en el mensaje."
|
||||
tested: true
|
||||
tests:
|
||||
- "TestVaultManifestRead_HappyPath"
|
||||
- "TestVaultManifestRead_MalformedYAML"
|
||||
- "TestVaultManifestRead_EmptyDir"
|
||||
test_file_path: "functions/infra/vault_manifest_read_test.go"
|
||||
file_path: "functions/infra/vault_manifest_read.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
entries, err := VaultManifestRead("/home/lucas/fn_registry")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
for _, e := range entries {
|
||||
fmt.Printf("%s/%s -> %s\n", e.ProjectID, e.Name, e.Path)
|
||||
}
|
||||
// app_turismo/turismo_spain -> /home/lucas/vaults/turismo_spain
|
||||
// app_finance/finance_data -> /home/lucas/vaults/finance_data
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
`VaultManifestEntry` es un tipo local de esta funcion (no un tipo del registry). Contiene:
|
||||
- `ProjectID` — basename del directorio `projects/<proj>/`, inferido del path del manifest.
|
||||
- `Name`, `Description`, `Path`, `Tags` — copiados del yaml tal cual.
|
||||
- `ManifestFile` — path absoluto al vault.yaml de origen, util para mensajes de error y trazabilidad.
|
||||
|
||||
El parseo usa `gopkg.in/yaml.v3` (ya en go.mod). Si un manifest falla, la funcion devuelve
|
||||
error inmediatamente con el path del fichero problemático. Los manifests sin entradas
|
||||
`vaults:` contribuyen cero entries (no es error). Si no existe ningun `projects/*/vaults/vault.yaml`
|
||||
el resultado es slice vacio sin error.
|
||||
@@ -0,0 +1,113 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestVaultManifestRead_HappyPath(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
|
||||
writeManifest(t, root, "app_turismo", `
|
||||
vaults:
|
||||
- name: turismo_spain
|
||||
description: "Datos de turismo en Espana"
|
||||
path: "/home/lucas/vaults/turismo_spain"
|
||||
tags: [turismo, espana]
|
||||
- name: turismo_raw
|
||||
description: "Datos brutos sin procesar"
|
||||
path: "/home/lucas/vaults/turismo_raw"
|
||||
tags: [raw]
|
||||
`)
|
||||
|
||||
writeManifest(t, root, "app_finance", `
|
||||
vaults:
|
||||
- name: finance_data
|
||||
description: "Datos financieros"
|
||||
path: "/home/lucas/vaults/finance_data"
|
||||
tags: [finance]
|
||||
`)
|
||||
|
||||
entries, err := VaultManifestRead(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entries) != 3 {
|
||||
t.Fatalf("got %d entries, want 3", len(entries))
|
||||
}
|
||||
|
||||
// Build index by name for order-independent assertions.
|
||||
byName := make(map[string]VaultManifestEntry, len(entries))
|
||||
for _, e := range entries {
|
||||
byName[e.Name] = e
|
||||
}
|
||||
|
||||
// Check turismo_spain entry.
|
||||
e, ok := byName["turismo_spain"]
|
||||
if !ok {
|
||||
t.Fatal("missing entry 'turismo_spain'")
|
||||
}
|
||||
if e.ProjectID != "app_turismo" {
|
||||
t.Errorf("turismo_spain.ProjectID = %q, want %q", e.ProjectID, "app_turismo")
|
||||
}
|
||||
if e.Path != "/home/lucas/vaults/turismo_spain" {
|
||||
t.Errorf("turismo_spain.Path = %q, want %q", e.Path, "/home/lucas/vaults/turismo_spain")
|
||||
}
|
||||
if len(e.Tags) != 2 || e.Tags[0] != "turismo" {
|
||||
t.Errorf("turismo_spain.Tags = %v, want [turismo espana]", e.Tags)
|
||||
}
|
||||
if e.ManifestFile == "" {
|
||||
t.Error("turismo_spain.ManifestFile is empty")
|
||||
}
|
||||
|
||||
// Check finance_data entry belongs to app_finance.
|
||||
ef, ok := byName["finance_data"]
|
||||
if !ok {
|
||||
t.Fatal("missing entry 'finance_data'")
|
||||
}
|
||||
if ef.ProjectID != "app_finance" {
|
||||
t.Errorf("finance_data.ProjectID = %q, want %q", ef.ProjectID, "app_finance")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultManifestRead_MalformedYAML(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
|
||||
writeManifest(t, root, "bad_project", `
|
||||
vaults:
|
||||
- name: [invalid yaml
|
||||
path: missing_bracket
|
||||
`)
|
||||
|
||||
_, err := VaultManifestRead(root)
|
||||
if err == nil {
|
||||
t.Fatal("expected error for malformed YAML, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVaultManifestRead_EmptyDir(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
|
||||
// No projects/ directory at all — glob returns no matches.
|
||||
entries, err := VaultManifestRead(root)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error for empty dir: %v", err)
|
||||
}
|
||||
if len(entries) != 0 {
|
||||
t.Fatalf("got %d entries, want 0", len(entries))
|
||||
}
|
||||
}
|
||||
|
||||
// writeManifest writes content to <root>/projects/<proj>/vaults/vault.yaml,
// creating the intermediate directories (mode 0o755) first. Any filesystem
// error fails the test immediately.
func writeManifest(t *testing.T, root, proj, content string) {
	t.Helper()

	vaultsDir := filepath.Join(root, "projects", proj, "vaults")
	if mkErr := os.MkdirAll(vaultsDir, 0o755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", vaultsDir, mkErr)
	}

	manifest := filepath.Join(vaultsDir, "vault.yaml")
	if wErr := os.WriteFile(manifest, []byte(content), 0o644); wErr != nil {
		t.Fatalf("write %s: %v", manifest, wErr)
	}
}
|
||||
@@ -0,0 +1,265 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// VaultSearchHit is one file returned by VaultSearch. All fields are
// serialised to JSON under snake_case keys.
type VaultSearchHit struct {
	// VaultPath is the vault the hit belongs to.
	VaultPath string `json:"vault_path"`
	// VaultName is the basename of VaultPath (after resolving symlinks).
	VaultName string `json:"vault_name"`
	// RelPath identifies the file within the vault; results are
	// de-duplicated on this field.
	RelPath string `json:"rel_path"`
	Size    int64  `json:"size"`
	Mtime   int64  `json:"mtime"`
	Mime    string `json:"mime"`
	Bucket  string `json:"bucket"`
	// SubBucket is the finer-grained bucket within Bucket.
	SubBucket string `json:"sub_bucket"`
	// Snippet is the FTS5 snippet, or empty when the hit matched only by
	// rel_path (fallback search).
	Snippet string `json:"snippet"`
}
|
||||
|
||||
// VaultSearch searches vault_index.db inside vaultPath for files matching query.
|
||||
//
|
||||
// Behaviour:
|
||||
// 1. Opens vault_index.db via VaultIndexOpen.
|
||||
// 2. If limit <= 0, defaults to 50.
|
||||
// 3. Runs a FTS5 MATCH query over files_fts to find content matches (when content_text
|
||||
// is populated by profilers). Because the FTS5 table uses content='' (contentless),
|
||||
// column values are not stored; results are correlated back to files via a LIKE
|
||||
// match on rel_path for path tokens, or via an IN clause of matched rowids for
|
||||
// content_text matches.
|
||||
// 4. Also searches files.rel_path with LIKE to find path matches.
|
||||
// 5. Results from both searches are merged (deduplication by rel_path).
|
||||
// 6. If both FTS5 and LIKE queries fail, returns the error.
|
||||
// 7. VaultName is derived from the basename of vaultPath (after resolving symlinks).
|
||||
func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error) {
|
||||
if limit <= 0 {
|
||||
limit = 50
|
||||
}
|
||||
|
||||
db, err := VaultIndexOpen(vaultPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_search: open index: %w", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
vaultName := resolveVaultName(vaultPath)
|
||||
|
||||
hits, err := vaultSearchCombined(db, vaultPath, vaultName, query, limit)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vault_search: %w", err)
|
||||
}
|
||||
return hits, nil
|
||||
}
|
||||
|
||||
// vaultSearchCombined runs the search using two strategies and merges deduplicated results:
|
||||
// 1. FTS5 MATCH on files_fts (for content_text when populated by profilers).
|
||||
// Correlation back to files uses rowid (reliable for fresh indexes) or falls back.
|
||||
// 2. LIKE on files.rel_path (always reliable for path searching).
|
||||
//
|
||||
// Results are deduplicated by rel_path, up to limit entries.
|
||||
func vaultSearchCombined(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
|
||||
seen := make(map[string]struct{})
|
||||
var hits []VaultSearchHit
|
||||
|
||||
// Strategy 1: FTS5 MATCH on content_text (populated by profilers).
|
||||
// With contentless FTS5 (content=''), column values are NOT retrievable via SELECT.
|
||||
// We get matching rowids from FTS5, then look up files by rowid.
|
||||
// This is reliable for content_text matches because VaultIndexWrite inserts
|
||||
// content_text rows independently of the path rows (profilers update them).
|
||||
// NOTE: for rel_path token matching, strategy 2 (LIKE) is more reliable.
|
||||
ftsQuery := safeFTSQuery(query)
|
||||
ftsHits, ftsErr := vaultSearchFTSContent(db, vaultPath, vaultName, ftsQuery, limit)
|
||||
if ftsErr == nil {
|
||||
for _, h := range ftsHits {
|
||||
if len(hits) >= limit {
|
||||
break
|
||||
}
|
||||
if _, ok := seen[h.RelPath]; !ok {
|
||||
seen[h.RelPath] = struct{}{}
|
||||
hits = append(hits, h)
|
||||
}
|
||||
}
|
||||
}
|
||||
// If FTS5 failed with a syntax error, that's expected for bad queries — continue.
|
||||
// If it failed with a non-syntax error, still continue to LIKE fallback.
|
||||
|
||||
// Strategy 2: LIKE on rel_path — reliable path search.
|
||||
// When query contains FTS5 special chars (e.g. "foo:bar:"), extract the first
|
||||
// word-like token so the LIKE pattern is still useful.
|
||||
likeQuery := simplifyForLike(query)
|
||||
if len(hits) < limit && likeQuery != "" {
|
||||
remaining := limit - len(hits)
|
||||
likeHits, likeErr := vaultSearchLike(db, vaultPath, vaultName, likeQuery, remaining+len(seen))
|
||||
if likeErr != nil && ftsErr != nil {
|
||||
// Both failed — return a combined error.
|
||||
return nil, fmt.Errorf("fts: %v; like: %v", ftsErr, likeErr)
|
||||
}
|
||||
for _, h := range likeHits {
|
||||
if len(hits) >= limit {
|
||||
break
|
||||
}
|
||||
if _, ok := seen[h.RelPath]; !ok {
|
||||
seen[h.RelPath] = struct{}{}
|
||||
hits = append(hits, h)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if hits == nil {
|
||||
hits = []VaultSearchHit{}
|
||||
}
|
||||
return hits, nil
|
||||
}
|
||||
|
||||
// vaultSearchFTSContent queries files_fts with a MATCH and correlates results
|
||||
// back to the files table.
|
||||
//
|
||||
// Design note: with content='' (contentless FTS5), SELECT on columns returns ''.
|
||||
// We get the rowid from the FTS5 match and look up files.rel_path via rowid.
|
||||
// This works correctly when content_text was populated by a profiler that did NOT
|
||||
// delete+reinsert the FTS row (i.e. profilers do direct INSERT/UPDATE of content_text
|
||||
// without changing the rowid). For the current VaultIndexWrite implementation
|
||||
// (which inserts content_text='' and profilers update it in-place), the rowids
|
||||
// remain stable after profiling.
|
||||
func vaultSearchFTSContent(db *sql.DB, vaultPath, vaultName, safeQuery string, limit int) ([]VaultSearchHit, error) {
|
||||
// Get matching rowids from FTS5.
|
||||
const qRowids = `
|
||||
SELECT rowid
|
||||
FROM files_fts
|
||||
WHERE files_fts MATCH ?
|
||||
ORDER BY rank
|
||||
LIMIT ?`
|
||||
|
||||
rows, err := db.Query(qRowids, safeQuery, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var rowids []int64
|
||||
for rows.Next() {
|
||||
var rid int64
|
||||
if err := rows.Scan(&rid); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rowids = append(rowids, rid)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(rowids) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Look up files by rowid. files uses a TEXT PK so its rowid is implicit.
|
||||
// Snippet is empty for contentless FTS5 (snippet() returns NULL there).
|
||||
var hits []VaultSearchHit
|
||||
for _, rid := range rowids {
|
||||
var h VaultSearchHit
|
||||
err := db.QueryRow(`
|
||||
SELECT rel_path, size, mtime, mime, bucket, sub_bucket
|
||||
FROM files WHERE rowid = ?`, rid,
|
||||
).Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket)
|
||||
if err != nil {
|
||||
// rowid mismatch (happens after update cycles) — skip gracefully.
|
||||
continue
|
||||
}
|
||||
h.VaultPath = vaultPath
|
||||
h.VaultName = vaultName
|
||||
h.Snippet = ""
|
||||
hits = append(hits, h)
|
||||
}
|
||||
return hits, nil
|
||||
}
|
||||
|
||||
// vaultSearchLike searches files.rel_path with LIKE, ordered by mtime DESC.
|
||||
func vaultSearchLike(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
|
||||
const qLike = `
|
||||
SELECT rel_path, size, mtime, mime, bucket, sub_bucket
|
||||
FROM files
|
||||
WHERE rel_path LIKE '%' || ? || '%'
|
||||
ORDER BY mtime DESC
|
||||
LIMIT ?`
|
||||
|
||||
rows, err := db.Query(qLike, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var hits []VaultSearchHit
|
||||
for rows.Next() {
|
||||
var h VaultSearchHit
|
||||
if err := rows.Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
h.VaultPath = vaultPath
|
||||
h.VaultName = vaultName
|
||||
h.Snippet = ""
|
||||
hits = append(hits, h)
|
||||
}
|
||||
return hits, rows.Err()
|
||||
}
|
||||
|
||||
// resolveVaultName yields the last path element of vaultPath once symlinks
// have been resolved. When the path cannot be resolved, the last element of
// vaultPath itself is used.
func resolveVaultName(vaultPath string) string {
	target := vaultPath
	if real, err := filepath.EvalSymlinks(vaultPath); err == nil {
		target = real
	}
	return filepath.Base(target)
}
|
||||
|
||||
// safeFTSQuery prepares user input for an FTS5 MATCH expression.
//
// The trimmed query is returned unchanged when it is empty, contains a colon
// (column-filter syntax), or contains AND / OR / NOT surrounded by spaces
// (matched case-insensitively). Anything else is wrapped in double quotes,
// with embedded double quotes doubled, so that punctuation such as the hyphen
// in "hello-world" is not parsed as query syntax.
func safeFTSQuery(query string) string {
	trimmed := strings.TrimSpace(query)
	if trimmed == "" {
		return trimmed
	}

	// Explicit column prefixes are the caller's responsibility.
	if strings.Contains(trimmed, ":") {
		return trimmed
	}

	// So are explicit boolean operators.
	folded := strings.ToUpper(trimmed)
	for _, op := range [...]string{" AND ", " OR ", " NOT "} {
		if strings.Contains(folded, op) {
			return trimmed
		}
	}

	return `"` + strings.ReplaceAll(trimmed, `"`, `""`) + `"`
}
|
||||
|
||||
// isFTSSyntaxError reports whether err reads like a rejection by the FTS5
// query parser. The check is a case-insensitive substring match on the error
// text; a nil error is never a syntax error.
func isFTSSyntaxError(err error) bool {
	if err == nil {
		return false
	}
	text := strings.ToLower(err.Error())
	for _, marker := range [...]string{"syntax error", "no such column", "fts5: syntax error"} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// simplifyForLike reduces query to the first run of ASCII letters, digits,
// underscores and hyphens it contains, for use as a LIKE search term.
// Everything else (spaces, colons, quotes, non-ASCII runes) acts as a
// separator, so "foo:bar:" becomes "foo". The result is empty when query
// holds no such run.
func simplifyForLike(query string) string {
	isSeparator := func(r rune) bool {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			return false
		case r == '_', r == '-':
			return false
		}
		return true
	}

	runs := strings.FieldsFunc(strings.TrimSpace(query), isSeparator)
	if len(runs) == 0 {
		return ""
	}
	return runs[0]
}
|
||||
@@ -0,0 +1,61 @@
|
||||
---
|
||||
name: vault_search
|
||||
kind: function
|
||||
lang: go
|
||||
domain: infra
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error)"
|
||||
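description: "Busca en vault_index.db de un vault combinando FTS5 MATCH sobre files_fts con LIKE sobre files.rel_path; los resultados se fusionan sin duplicados por rel_path. Si el MATCH falla (p. ej. el query rompe el parser FTS5) se devuelven solo los hits de LIKE. El campo Snippet de los hits siempre es cadena vacia."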
|
||||
tags: [vault, search, fts5, sqlite, infra]
|
||||
uses_functions: ["vault_index_open_go_infra"]
|
||||
uses_types: ["vault_file_go_infra"]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: [database/sql, fmt, path/filepath, strings]
|
||||
params:
|
||||
- name: vaultPath
|
||||
desc: "ruta absoluta al directorio raiz del vault (puede ser symlink)"
|
||||
- name: query
|
||||
desc: "termino o frase de busqueda; se escapa automaticamente para FTS5 salvo que ya incluya operadores booleanos o prefijos de columna"
|
||||
- name: limit
|
||||
desc: "maximo de resultados; si es <= 0 se usa 50"
|
||||
output: "slice de VaultSearchHit ordenado por rank FTS5 (o mtime DESC en fallback LIKE); slice vacio si no hay resultados"
|
||||
tested: true
|
||||
tests:
|
||||
- "FTS match devuelve hit con snippet"
|
||||
- "query sin resultados retorna slice vacio"
|
||||
- "limit se respeta"
|
||||
- "query FTS invalida activa fallback LIKE"
|
||||
- "limit cero usa 50 por defecto"
|
||||
test_file_path: "functions/infra/vault_search_test.go"
|
||||
file_path: "functions/infra/vault_search.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
hits, err := infra.VaultSearch("/home/lucas/vaults/turismo_spain", "hoteles", 20)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
for _, h := range hits {
|
||||
fmt.Printf("[%s] %s %s\n", h.VaultName, h.RelPath, h.Snippet)
|
||||
}
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
`VaultSearchHit` es un struct local definido en este archivo (no en `vault_file.go`)
|
||||
porque combina campos de `files` + metadatos de contexto de busqueda (Snippet, VaultPath, VaultName).
|
||||
|
||||
**FTS5 safety:** el helper `safeFTSQuery` envuelve la query en comillas dobles
|
||||
cuando no contiene operadores booleanos ni prefijos de columna. Esto evita errores
|
||||
del parser en tokens como `foo:bar:` o `hello-world`.
|
||||
|
||||
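**Fallback LIKE:** siempre que el MATCH no haya llenado `limit` (incluido el caso en que falla con un error de sintaxis FTS5) se ejecuta tambien
`WHERE rel_path LIKE '%' || token || '%'`, donde `token` es la primera secuencia de letras, digitos, `_` o `-` del query. Los hits del fallback tienen `Snippet=""`.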
|
||||
|
||||
**VaultName:** se deriva del `filepath.Base(filepath.EvalSymlinks(vaultPath))`.
|
||||
Si `EvalSymlinks` falla (e.g. symlink roto), usa `filepath.Base(vaultPath)`.
|
||||
@@ -0,0 +1,147 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// openTestVaultDB creates a fresh vault_index.db in a temp dir and returns the path.
|
||||
func openTestVaultDir(t *testing.T) string {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen: %v", err)
|
||||
}
|
||||
db.Close()
|
||||
return dir
|
||||
}
|
||||
|
||||
// seedVaultFile inserts a row into files + files_fts.
|
||||
func seedVaultFile(t *testing.T, dir, relPath, mime, bucket, subBucket, contentText string, size int64) {
|
||||
t.Helper()
|
||||
db, err := VaultIndexOpen(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultIndexOpen seed: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
now := time.Now().Unix()
|
||||
_, err = db.Exec(`
|
||||
INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
|
||||
VALUES (?, ?, ?, 'aabbccdd', ?, '', ?, ?, ?)`,
|
||||
relPath, size, now, mime, bucket, subBucket, now,
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("seed files: %v", err)
|
||||
}
|
||||
_, err = db.Exec(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`, relPath, contentText)
|
||||
if err != nil {
|
||||
t.Fatalf("seed files_fts: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Tests ---
|
||||
|
||||
func TestVaultSearch_FTSMatch(t *testing.T) {
|
||||
t.Run("FTS match devuelve hit con snippet", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
seedVaultFile(t, dir, "data/raw/informe.csv", "text/csv", "data", "raw",
|
||||
"ventas trimestrales empresa iberica", 1024)
|
||||
seedVaultFile(t, dir, "data/raw/other.csv", "text/csv", "data", "raw",
|
||||
"productos inventario almacen", 512)
|
||||
|
||||
hits, err := VaultSearch(dir, "ventas", 10)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) != 1 {
|
||||
t.Fatalf("got %d hits, want 1", len(hits))
|
||||
}
|
||||
if hits[0].RelPath != "data/raw/informe.csv" {
|
||||
t.Errorf("RelPath = %q, want data/raw/informe.csv", hits[0].RelPath)
|
||||
}
|
||||
if hits[0].VaultName == "" {
|
||||
t.Errorf("VaultName should not be empty")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultSearch_NoMatch(t *testing.T) {
|
||||
t.Run("query sin resultados retorna slice vacio", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
seedVaultFile(t, dir, "data/raw/file.csv", "text/csv", "data", "raw", "some content", 100)
|
||||
|
||||
hits, err := VaultSearch(dir, "zzznomatch", 10)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) != 0 {
|
||||
t.Errorf("got %d hits, want 0", len(hits))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultSearch_LimitRespected(t *testing.T) {
|
||||
t.Run("limit se respeta", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
for i := 0; i < 10; i++ {
|
||||
path := "data/raw/file" + string(rune('a'+i)) + ".csv"
|
||||
seedVaultFile(t, dir, path, "text/csv", "data", "raw", "common keyword everywhere", 100)
|
||||
}
|
||||
|
||||
hits, err := VaultSearch(dir, "common", 3)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) != 3 {
|
||||
t.Errorf("got %d hits, want 3", len(hits))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultSearch_BadFTSQuery_FallbackLike(t *testing.T) {
|
||||
t.Run("query FTS invalida activa fallback LIKE", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
// Insert a file whose rel_path contains "foobar" so LIKE can find it.
|
||||
seedVaultFile(t, dir, "data/raw/foobar_report.csv", "text/csv", "data", "raw", "", 200)
|
||||
|
||||
// "foo:bar:" — colon after a non-column name triggers FTS5 parser error.
|
||||
// safeFTSQuery passes it through unchanged because it contains ":"
|
||||
// → FTS5 "no such column: bar" → fallback LIKE on rel_path.
|
||||
hits, err := VaultSearch(dir, "foo:bar:", 10)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) == 0 {
|
||||
t.Errorf("expected fallback LIKE to find foobar_report.csv, got 0 hits")
|
||||
}
|
||||
for _, h := range hits {
|
||||
if h.Snippet != "" {
|
||||
t.Errorf("fallback hits should have empty Snippet, got %q", h.Snippet)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestVaultSearch_LimitZeroDefaults(t *testing.T) {
|
||||
t.Run("limit cero usa 50 por defecto", func(t *testing.T) {
|
||||
dir := openTestVaultDir(t)
|
||||
// Insert 55 files with the same keyword.
|
||||
for i := 0; i < 55; i++ {
|
||||
path := "data/raw/doc" + string(rune('a')) + string(rune(int('0')+i%10)) + ".csv"
|
||||
if i >= 10 {
|
||||
path = "data/raw/doc" + string(rune('b'+i/10-1)) + string(rune(int('0')+i%10)) + ".csv"
|
||||
}
|
||||
seedVaultFile(t, dir, path, "text/csv", "data", "raw", "keyword alpha beta", 100)
|
||||
}
|
||||
|
||||
hits, err := VaultSearch(dir, "keyword", 0)
|
||||
if err != nil {
|
||||
t.Fatalf("VaultSearch: %v", err)
|
||||
}
|
||||
if len(hits) != 50 {
|
||||
t.Errorf("got %d hits, want 50 (default limit)", len(hits))
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package ml
|
||||
|
||||
import "encoding/json"
|
||||
|
||||
// GenconfigMarshal serializa un GenerationConfig a JSON canonico con indent de 2 espacios.
|
||||
// El formato es identico al de Python json.dumps(indent=2, sort_keys=False):
|
||||
// keys en el orden de declaracion del struct, snake_case, campos omitempty ausentes si zero.
|
||||
func GenconfigMarshal(cfg GenerationConfig) ([]byte, error) {
|
||||
return json.MarshalIndent(cfg, "", " ")
|
||||
}
|
||||
|
||||
// GenconfigUnmarshal deserializa JSON (compacto o con indent) a GenerationConfig.
|
||||
// Los campos JSON deben usar snake_case: negative_prompt, cfg_scale, model_type, etc.
|
||||
func GenconfigUnmarshal(data []byte) (GenerationConfig, error) {
|
||||
var cfg GenerationConfig
|
||||
if err := json.Unmarshal(data, &cfg); err != nil {
|
||||
return GenerationConfig{}, err
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
---
|
||||
name: genconfig_json_marshal
|
||||
kind: function
|
||||
lang: go
|
||||
domain: ml
|
||||
version: "1.0.0"
|
||||
purity: impure
|
||||
signature: "func GenconfigMarshal(cfg GenerationConfig) ([]byte, error)\nfunc GenconfigUnmarshal(data []byte) (GenerationConfig, error)"
|
||||
description: "Wrappers json.Marshal/Unmarshal para GenerationConfig con formato canonico (MarshalIndent 2 espacios). Garantiza roundtrip identico al Python: json.dumps(indent=2, sort_keys=False). Campos JSON en snake_case."
|
||||
tags: [ml, json, marshal, unmarshal, serialization, generation, canonical]
|
||||
uses_functions: []
|
||||
uses_types: [generation_config_go_ml]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: "error_go_core"
|
||||
imports: ["encoding/json"]
|
||||
params:
|
||||
- name: cfg
|
||||
desc: "GenerationConfig a serializar. Campos omitempty (negative_prompt, loras, clip_skip) se omiten si son zero/nil/empty."
|
||||
- name: data
|
||||
desc: "JSON bytes a deserializar. Acepta formato compacto o con indent. Keys deben ser snake_case (negative_prompt, cfg_scale, model_type, etc.)."
|
||||
output: "GenconfigMarshal: bytes JSON con indent 2 espacios, orden de campos segun declaracion del struct (prompt, negative_prompt, seed, steps, cfg_scale, sampler, width, height, model, loras, clip_skip). GenconfigUnmarshal: GenerationConfig poblado o error de parsing."
|
||||
tested: true
|
||||
tests:
|
||||
- "roundtrip marshal unmarshal produce config igual"
|
||||
- "json cross-language snake_case keys se deserializan correctamente"
|
||||
test_file_path: "functions/ml/genconfig_test.go"
|
||||
file_path: "functions/ml/genconfig_json_marshal.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
cfg := ml.GenerationConfig{
|
||||
Prompt: "a mountain at sunset",
|
||||
Seed: 1234,
|
||||
Steps: 30,
|
||||
CfgScale: 7.0,
|
||||
Sampler: "euler",
|
||||
Width: 768,
|
||||
Height: 512,
|
||||
Model: ml.ModelRef{Name: "sdxl-base", ModelType: "sdxl", Quantization: "fp16"},
|
||||
}
|
||||
|
||||
b, err := ml.GenconfigMarshal(cfg)
|
||||
// b == {
|
||||
// "prompt": "a mountain at sunset",
|
||||
// "seed": 1234,
|
||||
// ...
|
||||
// }
|
||||
|
||||
cfg2, err := ml.GenconfigUnmarshal(b)
|
||||
// cfg2 == cfg (DeepEqual)
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
### Formato canonico y compatibilidad con Python
|
||||
|
||||
`GenconfigMarshal` usa `json.MarshalIndent(cfg, "", "  ")` (indent de 2 espacios). El formato resultante sigue las mismas convenciones que produce Python con `model.model_dump_json(indent=2)` o `json.dumps(data, indent=2)` cuando `sort_keys=False`:
|
||||
|
||||
- Keys en orden de declaracion del struct (no alfabetico).
|
||||
- Indent de 2 espacios, sin trailing whitespace.
|
||||
- Campos omitempty ausentes si zero: `negative_prompt` ausente si `""`, `loras` ausente si `[]`, `clip_skip` ausente si `nil`.
|
||||
|
||||
### Keys JSON (snake_case obligatorio)
|
||||
|
||||
| Campo Go | Key JSON |
|
||||
|---|---|
|
||||
| `Prompt` | `"prompt"` |
|
||||
| `NegativePrompt` | `"negative_prompt"` |
|
||||
| `Seed` | `"seed"` |
|
||||
| `Steps` | `"steps"` |
|
||||
| `CfgScale` | `"cfg_scale"` |
|
||||
| `Sampler` | `"sampler"` |
|
||||
| `Width` | `"width"` |
|
||||
| `Height` | `"height"` |
|
||||
| `Model.ModelType` | `"model_type"` |
|
||||
| `Model.Quantization` | `"quantization"` |
|
||||
| `ClipSkip` | `"clip_skip"` |
|
||||
|
||||
### Por que impure
|
||||
|
||||
Los errores de `json.Unmarshal` son errores de parsing del input externo, no de I/O, pero se modelan como `(T, error)` para forzar manejo explicito en el caller. Marcado `impure` con `error_type: error_go_core` por convencion del registry.
|
||||
@@ -0,0 +1,260 @@
|
||||
package ml
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// TestGenconfigToSdcliArgs
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestGenconfigToSdcliArgs(t *testing.T) {
|
||||
clipSkip := 2
|
||||
|
||||
t.Run("config basico sin loras ni clip_skip", func(t *testing.T) {
|
||||
cfg := GenerationConfig{
|
||||
Prompt: "a cat",
|
||||
Seed: 42,
|
||||
Steps: 20,
|
||||
CfgScale: 7.5,
|
||||
Sampler: "euler",
|
||||
Width: 512,
|
||||
Height: 512,
|
||||
Model: ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16"},
|
||||
}
|
||||
args := GenconfigToSdcliArgs(cfg)
|
||||
|
||||
want := []string{
|
||||
"--prompt", "a cat",
|
||||
"--seed", "42",
|
||||
"--steps", "20",
|
||||
"--cfg-scale", "7.5",
|
||||
"--width", "512",
|
||||
"--height", "512",
|
||||
"--sampling-method", "euler",
|
||||
}
|
||||
if !reflect.DeepEqual(args, want) {
|
||||
t.Errorf("got %v\nwant %v", args, want)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("loras se emiten como pares path:weight", func(t *testing.T) {
|
||||
cfg := GenerationConfig{
|
||||
Prompt: "portrait",
|
||||
Seed: 1,
|
||||
Steps: 10,
|
||||
CfgScale: 7.0,
|
||||
Sampler: "euler",
|
||||
Width: 512,
|
||||
Height: 512,
|
||||
Model: ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16", Path: "/models/v1.safetensors"},
|
||||
Loras: []LoraRef{
|
||||
{Path: "/loras/detail.safetensors", Weight: 0.8},
|
||||
{Path: "/loras/style.safetensors", Weight: 0.5},
|
||||
},
|
||||
ClipSkip: &clipSkip,
|
||||
}
|
||||
args := GenconfigToSdcliArgs(cfg)
|
||||
|
||||
// Verificar que existen los pares --lora para ambas loras
|
||||
loraIdx := indexAll(args, "--lora")
|
||||
if len(loraIdx) != 2 {
|
||||
t.Fatalf("esperaba 2 flags --lora, got %d en %v", len(loraIdx), args)
|
||||
}
|
||||
wantLoras := []string{
|
||||
"/loras/detail.safetensors:0.8",
|
||||
"/loras/style.safetensors:0.5",
|
||||
}
|
||||
for i, idx := range loraIdx {
|
||||
if idx+1 >= len(args) {
|
||||
t.Fatalf("--lora[%d] sin valor siguiente", i)
|
||||
}
|
||||
if args[idx+1] != wantLoras[i] {
|
||||
t.Errorf("lora[%d]: got %q, want %q", i, args[idx+1], wantLoras[i])
|
||||
}
|
||||
}
|
||||
|
||||
// Verificar --model y --clip-skip presentes
|
||||
if !containsPair(args, "--model", "/models/v1.safetensors") {
|
||||
t.Errorf("--model no encontrado en %v", args)
|
||||
}
|
||||
if !containsPair(args, "--clip-skip", "2") {
|
||||
t.Errorf("--clip-skip no encontrado en %v", args)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("sampler dpm++2m se traduce a dpmpp2m", func(t *testing.T) {
|
||||
cfg := GenerationConfig{
|
||||
Prompt: "x",
|
||||
Seed: 0,
|
||||
Steps: 1,
|
||||
CfgScale: 1.0,
|
||||
Sampler: "dpm++2m",
|
||||
Width: 64,
|
||||
Height: 64,
|
||||
Model: ModelRef{Name: "m", ModelType: "sd15", Quantization: "fp16"},
|
||||
}
|
||||
args := GenconfigToSdcliArgs(cfg)
|
||||
if !containsPair(args, "--sampling-method", "dpmpp2m") {
|
||||
t.Errorf("sampler no traducido; args=%v", args)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("negative_prompt vacio no genera flag", func(t *testing.T) {
|
||||
cfg := GenerationConfig{
|
||||
Prompt: "x",
|
||||
NegativePrompt: "",
|
||||
Seed: 0,
|
||||
Steps: 1,
|
||||
CfgScale: 1.0,
|
||||
Sampler: "euler",
|
||||
Width: 64,
|
||||
Height: 64,
|
||||
Model: ModelRef{Name: "m", ModelType: "sd15", Quantization: "fp16"},
|
||||
}
|
||||
args := GenconfigToSdcliArgs(cfg)
|
||||
for _, a := range args {
|
||||
if a == "--negative-prompt" {
|
||||
t.Errorf("flag --negative-prompt presente aunque NegativePrompt es vacio")
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// TestGenconfigMarshalRoundtrip
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestGenconfigMarshalRoundtrip(t *testing.T) {
|
||||
t.Run("roundtrip marshal unmarshal produce config igual", func(t *testing.T) {
|
||||
clip := 2
|
||||
cfg := GenerationConfig{
|
||||
Prompt: "sunset over the mountains",
|
||||
NegativePrompt: "blurry, low quality",
|
||||
Seed: 99,
|
||||
Steps: 30,
|
||||
CfgScale: 7.5,
|
||||
Sampler: "dpm++2m",
|
||||
Width: 768,
|
||||
Height: 512,
|
||||
Model: ModelRef{
|
||||
Name: "sdxl-base",
|
||||
ModelType: "sdxl",
|
||||
Quantization: "fp16",
|
||||
Path: "/models/sdxl.safetensors",
|
||||
},
|
||||
Loras: []LoraRef{
|
||||
{Path: "/loras/detail.safetensors", Weight: 0.8},
|
||||
},
|
||||
ClipSkip: &clip,
|
||||
}
|
||||
|
||||
b, err := GenconfigMarshal(cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("GenconfigMarshal: %v", err)
|
||||
}
|
||||
|
||||
got, err := GenconfigUnmarshal(b)
|
||||
if err != nil {
|
||||
t.Fatalf("GenconfigUnmarshal: %v", err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(cfg, got) {
|
||||
t.Errorf("roundtrip diverge\norig: %+v\ngot: %+v", cfg, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// TestGenconfigCrossLanguageJSON
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func TestGenconfigCrossLanguageJSON(t *testing.T) {
|
||||
// Fixture escrito a mano replicando lo que generaria Python:
|
||||
// json.dumps(config.model_dump(), indent=2)
|
||||
// Keys en snake_case, orden de declaracion del dataclass Python.
|
||||
fixture := `{
|
||||
"prompt": "a dragon",
|
||||
"negative_prompt": "ugly",
|
||||
"seed": 1234,
|
||||
"steps": 25,
|
||||
"cfg_scale": 7.0,
|
||||
"sampler": "euler_a",
|
||||
"width": 512,
|
||||
"height": 512,
|
||||
"model": {
|
||||
"name": "v1-5",
|
||||
"model_type": "sd15",
|
||||
"quantization": "fp16"
|
||||
},
|
||||
"loras": [
|
||||
{
|
||||
"path": "/loras/dragon.safetensors",
|
||||
"weight": 0.9
|
||||
}
|
||||
]
|
||||
}`
|
||||
|
||||
t.Run("json cross-language snake_case keys se deserializan correctamente", func(t *testing.T) {
|
||||
cfg, err := GenconfigUnmarshal([]byte(fixture))
|
||||
if err != nil {
|
||||
t.Fatalf("GenconfigUnmarshal fixture: %v", err)
|
||||
}
|
||||
|
||||
// Verificar campos clave
|
||||
if cfg.Prompt != "a dragon" {
|
||||
t.Errorf("Prompt: got %q", cfg.Prompt)
|
||||
}
|
||||
if cfg.NegativePrompt != "ugly" {
|
||||
t.Errorf("NegativePrompt: got %q", cfg.NegativePrompt)
|
||||
}
|
||||
if cfg.CfgScale != 7.0 {
|
||||
t.Errorf("CfgScale: got %v", cfg.CfgScale)
|
||||
}
|
||||
if cfg.Model.ModelType != "sd15" {
|
||||
t.Errorf("Model.ModelType: got %q", cfg.Model.ModelType)
|
||||
}
|
||||
if len(cfg.Loras) != 1 || cfg.Loras[0].Weight != 0.9 {
|
||||
t.Errorf("Loras: got %+v", cfg.Loras)
|
||||
}
|
||||
|
||||
// Re-marshal y verificar que las keys snake_case siguen presentes
|
||||
b, err := GenconfigMarshal(cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("GenconfigMarshal: %v", err)
|
||||
}
|
||||
s := string(b)
|
||||
for _, key := range []string{"negative_prompt", "cfg_scale", "model_type", "quantization"} {
|
||||
if !strings.Contains(s, `"`+key+`"`) {
|
||||
t.Errorf("key %q ausente en JSON re-serializado:\n%s", key, s)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// indexAll returns, in ascending order, every index at which val occurs in
// slice. The result is nil when val does not occur.
func indexAll(slice []string, val string) []int {
	var positions []int
	for i := range slice {
		if slice[i] != val {
			continue
		}
		positions = append(positions, i)
	}
	return positions
}
|
||||
|
||||
// containsPair reports whether flag appears in slice immediately followed by
// value.
func containsPair(slice []string, flag, value string) bool {
	// Walk over the second element of each adjacent pair.
	for next := 1; next < len(slice); next++ {
		if slice[next-1] != flag {
			continue
		}
		if slice[next] == value {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package ml
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// samplerMap translates the canonical sampler names of the ml domain into the
// names stable-diffusion.cpp expects. Most entries map to themselves; only
// the "dpm++" names change spelling.
var samplerMap = map[string]string{
	"euler":      "euler",
	"euler_a":    "euler_a",
	"dpm++2m":    "dpmpp2m",
	"dpm++2m_v2": "dpmpp2mv2",
	"heun":       "heun",
	"dpm2":       "dpm2",
	"lcm":        "lcm",
}
|
||||
|
||||
// GenconfigToSdcliArgs convierte un GenerationConfig en una lista de argumentos
|
||||
// CLI para stable-diffusion.cpp (sd.exe / sd binario).
|
||||
// Espejo Go de genconfig_to_sdcpp_args_py_ml.
|
||||
//
|
||||
// Loras se emiten como pares repetidos "--lora" "path:weight".
|
||||
// Si el sampler no existe en samplerMap se usa el valor literal sin traducir.
|
||||
// La funcion es pura: sin I/O, sin estado, determinista.
|
||||
func GenconfigToSdcliArgs(cfg GenerationConfig) []string {
|
||||
args := []string{
|
||||
"--prompt", cfg.Prompt,
|
||||
"--seed", strconv.FormatInt(cfg.Seed, 10),
|
||||
"--steps", strconv.Itoa(cfg.Steps),
|
||||
"--cfg-scale", strconv.FormatFloat(cfg.CfgScale, 'f', -1, 64),
|
||||
"--width", strconv.Itoa(cfg.Width),
|
||||
"--height", strconv.Itoa(cfg.Height),
|
||||
}
|
||||
|
||||
if cfg.NegativePrompt != "" {
|
||||
args = append(args, "--negative-prompt", cfg.NegativePrompt)
|
||||
}
|
||||
|
||||
sampler := cfg.Sampler
|
||||
if mapped, ok := samplerMap[sampler]; ok {
|
||||
sampler = mapped
|
||||
}
|
||||
args = append(args, "--sampling-method", sampler)
|
||||
|
||||
if cfg.Model.Path != "" {
|
||||
args = append(args, "--model", cfg.Model.Path)
|
||||
}
|
||||
|
||||
if cfg.ClipSkip != nil {
|
||||
args = append(args, "--clip-skip", strconv.Itoa(*cfg.ClipSkip))
|
||||
}
|
||||
|
||||
for _, lora := range cfg.Loras {
|
||||
args = append(args, "--lora", fmt.Sprintf("%s:%g", lora.Path, lora.Weight))
|
||||
}
|
||||
|
||||
return args
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
---
|
||||
name: genconfig_to_sdcli_args
|
||||
kind: function
|
||||
lang: go
|
||||
domain: ml
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "func GenconfigToSdcliArgs(cfg GenerationConfig) []string"
|
||||
description: "Convierte un GenerationConfig en argumentos CLI para stable-diffusion.cpp. Espejo Go de genconfig_to_sdcpp_args_py_ml. Loras se emiten como pares repetidos --lora path:weight. Sampler traducido via samplerMap canonico."
|
||||
tags: [ml, stable-diffusion, cli, args, generation, pure]
|
||||
uses_functions: []
|
||||
uses_types: [generation_config_go_ml]
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["fmt", "strconv"]
|
||||
params:
|
||||
- name: cfg
|
||||
desc: "Parametros completos de generacion de imagen. Sampler debe ser uno de los valores de SamplerName. Model.Path se emite como --model si no esta vacio."
|
||||
output: "Slice de strings listos para pasar a exec.Command o similar. Incluye --prompt, --seed, --steps, --cfg-scale, --width, --height, --sampling-method, opcionales --negative-prompt / --model / --clip-skip, y pares --lora path:weight por cada LoraRef."
|
||||
tested: true
|
||||
tests:
|
||||
- "config basico sin loras ni clip_skip"
|
||||
- "loras se emiten como pares path:weight"
|
||||
- "sampler dpm++2m se traduce a dpmpp2m"
|
||||
- "negative_prompt vacio no genera flag"
|
||||
test_file_path: "functions/ml/genconfig_test.go"
|
||||
file_path: "functions/ml/genconfig_to_sdcli_args.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
clip := 2
|
||||
cfg := ml.GenerationConfig{
|
||||
Prompt: "a cat",
|
||||
Seed: 42,
|
||||
Steps: 20,
|
||||
CfgScale: 7.5,
|
||||
Sampler: "dpm++2m",
|
||||
Width: 512,
|
||||
Height: 512,
|
||||
Model: ml.ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16", Path: "/models/v1-5.safetensors"},
|
||||
Loras: []ml.LoraRef{{Path: "/loras/detail.safetensors", Weight: 0.8}},
|
||||
ClipSkip: &clip,
|
||||
}
|
||||
args := ml.GenconfigToSdcliArgs(cfg)
|
||||
// args == ["--prompt","a cat","--seed","42","--steps","20",
|
||||
// "--cfg-scale","7.5","--width","512","--height","512",
|
||||
// "--sampling-method","dpmpp2m","--model","/models/v1-5.safetensors",
|
||||
// "--clip-skip","2","--lora","/loras/detail.safetensors:0.8"]
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- `samplerMap` traduce nombres canonicos del dominio ml a los identificadores que acepta stable-diffusion.cpp. Si el sampler no esta en el mapa se usa el valor literal.
|
||||
- El flag de modelo (`--model`) solo se emite si `cfg.Model.Path != ""`.
|
||||
- `%g` en `fmt.Sprintf` para el peso de la lora elimina ceros insignificantes: `0.800000` → `0.8`.
|
||||
- Funcion pura: misma entrada, misma salida. Sin I/O ni estado global.
|
||||
@@ -0,0 +1,18 @@
|
||||
package ml
|
||||
|
||||
// GenerationConfig parametriza una solicitud de generacion de imagen.
|
||||
// Espejo JSON-compatible de GenerationConfig_py_ml: los tags json coinciden
|
||||
// con los campos snake_case del dataclass Python para roundtrip sin perdida.
|
||||
type GenerationConfig struct {
|
||||
Prompt string `json:"prompt"`
|
||||
NegativePrompt string `json:"negative_prompt,omitempty"`
|
||||
Seed int64 `json:"seed"`
|
||||
Steps int `json:"steps"`
|
||||
CfgScale float64 `json:"cfg_scale"`
|
||||
Sampler string `json:"sampler"`
|
||||
Width int `json:"width"`
|
||||
Height int `json:"height"`
|
||||
Model ModelRef `json:"model"`
|
||||
Loras []LoraRef `json:"loras,omitempty"`
|
||||
ClipSkip *int `json:"clip_skip,omitempty"`
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
package ml
|
||||
|
||||
// ImageGenResult carries a generated image together with its execution metadata.
//
// ImageBytes holds the raw PNG bytes. It is tagged json:"-" and therefore
// never appears in the JSON form, because the image travels over a separate
// binary channel.
type ImageGenResult struct {
	ImageBytes []byte         `json:"-"`
	Format     string         `json:"format"`
	Meta       map[string]any `json:"meta"`
	DurationMs int64          `json:"duration_ms"`
	VramPeakMb *int           `json:"vram_peak_mb,omitempty"` // nil when unknown; omitted from JSON
}
|
||||
@@ -0,0 +1,9 @@
|
||||
package ml
|
||||
|
||||
import "context"
|
||||
|
||||
// ImageGenerator define el contrato para cualquier backend de generacion de imagenes.
|
||||
// Las implementaciones pueden ser locales (ComfyUI, diffusers) o remotas (API).
|
||||
type ImageGenerator interface {
|
||||
Generate(ctx context.Context, cfg GenerationConfig) (ImageGenResult, error)
|
||||
}
|
||||
@@ -0,0 +1,8 @@
|
||||
package ml
|
||||
|
||||
// LoraRef points at a LoRA adapter and records its merge weight plus an
// optional scale.
type LoraRef struct {
	Path   string   `json:"path"`
	Weight float64  `json:"weight"`
	Scale  *float64 `json:"scale,omitempty"` // nil when no scale is given; omitted from JSON
}
|
||||
@@ -0,0 +1,10 @@
|
||||
package ml
|
||||
|
||||
// ModelRef identifies an image-generation model by name, type and
// quantization, with an optional on-disk path.
type ModelRef struct {
	Name         string `json:"name"`
	ModelType    string `json:"model_type"`     // sd15|sdxl|flux_dev|...
	Quantization string `json:"quantization"`   // fp16|q8_0|...
	Path         string `json:"path,omitempty"` // omitted from JSON when empty
}
|
||||
@@ -0,0 +1,78 @@
|
||||
package ml
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// SdcliProgress is the progress state parsed from one stderr line of sd-cli.
type SdcliProgress struct {
	Step       int     `json:"step"`
	TotalSteps int     `json:"total_steps"`
	ItPerSec   float64 `json:"it_per_sec"` // 0 when the matched format carries no speed
	Percent    float64 `json:"percent"`
}

// Progress patterns, tried in this order by SdcliParseProgress.
var (
	// reProgress1 matches the compact format: " 3/30 | 0.84it/s | 10%".
	reProgress1 = regexp.MustCompile(`\s*(\d+)\s*/\s*(\d+)\s*\|[^|]*?([\d.]+)\s*it/s[^|]*?\|\s*([\d.]+)\s*%`)

	// reProgress2 matches the verbose format: "sampling: step 3 of 30 (0.84 it/s)".
	reProgress2 = regexp.MustCompile(`step\s+(\d+)\s+of\s+(\d+)\s*\(\s*([\d.]+)\s*it/s\)`)

	// reProgress3 matches the minimal format: "step 3/30" or "progress: 3/30".
	// The "progress" prefix is optional, so any "N/M" pair in the line matches.
	reProgress3 = regexp.MustCompile(`(?:progress[:\s]+)?(\d+)\s*/\s*(\d+)`)
)

// SdcliParseProgress parses one stderr line of stable-diffusion.cpp / sd-cli
// and extracts its progress state.
//
// The three patterns are tried in order (compact, verbose, minimal). A pattern
// whose captures fail numeric conversion, or whose total is not positive in
// the two formats that compute the percentage, is skipped and the next pattern
// is tried. It returns (SdcliProgress, true) when a pattern yields a usable
// result and (zero value, false) otherwise.
//
// The function performs no I/O and only reads the package-level regexps.
func SdcliParseProgress(line string) (SdcliProgress, bool) {
	// stepAndTotal converts capture groups 1 and 2 (step and total in all
	// three patterns) to ints; the bool is false if either conversion fails.
	stepAndTotal := func(groups []string) (int, int, bool) {
		cur, errCur := strconv.Atoi(groups[1])
		all, errAll := strconv.Atoi(groups[2])
		return cur, all, errCur == nil && errAll == nil
	}
	// percentOf derives the percentage for formats that do not print one.
	percentOf := func(cur, all int) float64 {
		return 100.0 * float64(cur) / float64(all)
	}

	// Compact format: speed and percentage are both read from the line.
	if groups := reProgress1.FindStringSubmatch(line); groups != nil {
		cur, all, countsOK := stepAndTotal(groups)
		rate, errRate := strconv.ParseFloat(groups[3], 64)
		pct, errPct := strconv.ParseFloat(groups[4], 64)
		if countsOK && errRate == nil && errPct == nil {
			return SdcliProgress{Step: cur, TotalSteps: all, ItPerSec: rate, Percent: pct}, true
		}
	}

	// Verbose format: speed is read, percentage is computed (needs total > 0).
	if groups := reProgress2.FindStringSubmatch(line); groups != nil {
		cur, all, countsOK := stepAndTotal(groups)
		rate, errRate := strconv.ParseFloat(groups[3], 64)
		if countsOK && errRate == nil && all > 0 {
			return SdcliProgress{Step: cur, TotalSteps: all, ItPerSec: rate, Percent: percentOf(cur, all)}, true
		}
	}

	// Minimal format: no speed available, percentage is computed (needs total > 0).
	if groups := reProgress3.FindStringSubmatch(line); groups != nil {
		cur, all, countsOK := stepAndTotal(groups)
		if countsOK && all > 0 {
			return SdcliProgress{Step: cur, TotalSteps: all, ItPerSec: 0, Percent: percentOf(cur, all)}, true
		}
	}

	return SdcliProgress{}, false
}
|
||||
@@ -0,0 +1,50 @@
|
||||
---
|
||||
name: sdcli_parse_progress
|
||||
kind: function
|
||||
lang: go
|
||||
domain: ml
|
||||
version: "1.0.0"
|
||||
purity: pure
|
||||
signature: "func SdcliParseProgress(line string) (SdcliProgress, bool)"
|
||||
description: "Parsea una linea de stderr de stable-diffusion.cpp / sd-cli y extrae el estado de progreso. Soporta el formato compacto '3/30 | 0.84it/s | 10%', el formato verbose 'sampling: step 3 of 30 (0.84 it/s)', y el formato minimal 'progress: 3/30'. Retorna (zero, false) si la linea no contiene informacion de progreso reconocible."
|
||||
tags: [ml, stable-diffusion, sdcli, progress, parser, stderr, pure]
|
||||
uses_functions: []
|
||||
uses_types: []
|
||||
returns: []
|
||||
returns_optional: false
|
||||
error_type: ""
|
||||
imports: ["regexp", "strconv"]
|
||||
params:
|
||||
- name: line
|
||||
desc: "Una linea de stderr emitida por sd-cli / stable-diffusion.cpp durante la fase de sampling. Puede contener espacios al inicio o final."
|
||||
output: "Par (SdcliProgress, bool). bool=true si se reconocio un patron de progreso; SdcliProgress contiene Step (paso actual), TotalSteps (pasos totales), ItPerSec (iteraciones por segundo, 0 si no disponible) y Percent (porcentaje 0-100 calculado o leido de la linea). bool=false y struct zero si la linea no contiene progreso."
|
||||
tested: true
|
||||
tests:
|
||||
- "formato estandar compacto step/total/itpersec/percent"
|
||||
- "linea sin patron retorna false"
|
||||
- "formato sampling verbose con velocidad"
|
||||
file_path: "functions/ml/sdcli_parse_progress.go"
|
||||
test_file_path: "functions/ml/sdcli_parse_progress_test.go"
|
||||
---
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```go
|
||||
p, ok := ml.SdcliParseProgress(" 3/30 | 0.84it/s | 10%")
|
||||
// ok = true
|
||||
// p = SdcliProgress{Step:3, TotalSteps:30, ItPerSec:0.84, Percent:10.0}
|
||||
|
||||
p2, ok2 := ml.SdcliParseProgress("sampling: step 15 of 30 (1.2 it/s)")
|
||||
// ok2 = true
|
||||
// p2 = SdcliProgress{Step:15, TotalSteps:30, ItPerSec:1.2, Percent:50.0}
|
||||
|
||||
_, ok3 := ml.SdcliParseProgress("loading model...")
|
||||
// ok3 = false
|
||||
```
|
||||
|
||||
## Notas
|
||||
|
||||
- Regexps precompiladas como vars de paquete (se compilan una sola vez al init del paquete).
|
||||
- Tolerante a variaciones de espaciado gracias a `\s*` en los patrones.
|
||||
- El campo `Percent` en los formatos verbose y minimal se calcula como `100 * step / total` (no se lee de la linea porque esos formatos no lo emiten); en el formato compacto se lee directamente de la linea.
|
||||
- Funcion pura: sin I/O, sin estado mutable, determinista.
|
||||
@@ -0,0 +1,103 @@
|
||||
package ml
|
||||
|
||||
import (
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSdcliParseProgress_StandardFormat(t *testing.T) {
|
||||
line := " 3/30 | 0.84it/s | 10%"
|
||||
got, ok := SdcliParseProgress(line)
|
||||
if !ok {
|
||||
t.Fatalf("expected match, got false")
|
||||
}
|
||||
if got.Step != 3 {
|
||||
t.Errorf("Step: got %d, want 3", got.Step)
|
||||
}
|
||||
if got.TotalSteps != 30 {
|
||||
t.Errorf("TotalSteps: got %d, want 30", got.TotalSteps)
|
||||
}
|
||||
if math.Abs(got.ItPerSec-0.84) > 1e-9 {
|
||||
t.Errorf("ItPerSec: got %v, want 0.84", got.ItPerSec)
|
||||
}
|
||||
if math.Abs(got.Percent-10.0) > 1e-9 {
|
||||
t.Errorf("Percent: got %v, want 10.0", got.Percent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSdcliParseProgress_NoMatch(t *testing.T) {
|
||||
cases := []string{
|
||||
"loading model...",
|
||||
"",
|
||||
"error: out of memory",
|
||||
"clip model loaded",
|
||||
"generating image...",
|
||||
}
|
||||
for _, line := range cases {
|
||||
_, ok := SdcliParseProgress(line)
|
||||
if ok {
|
||||
t.Errorf("expected no match for %q, but got match", line)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSdcliParseProgress_AltFormat(t *testing.T) {
|
||||
t.Run("formato sampling verbose", func(t *testing.T) {
|
||||
line := "sampling: step 3 of 30 (0.84 it/s)"
|
||||
got, ok := SdcliParseProgress(line)
|
||||
if !ok {
|
||||
t.Fatalf("expected match, got false")
|
||||
}
|
||||
if got.Step != 3 {
|
||||
t.Errorf("Step: got %d, want 3", got.Step)
|
||||
}
|
||||
if got.TotalSteps != 30 {
|
||||
t.Errorf("TotalSteps: got %d, want 30", got.TotalSteps)
|
||||
}
|
||||
if math.Abs(got.ItPerSec-0.84) > 1e-9 {
|
||||
t.Errorf("ItPerSec: got %v, want 0.84", got.ItPerSec)
|
||||
}
|
||||
expectedPct := 100.0 * 3.0 / 30.0
|
||||
if math.Abs(got.Percent-expectedPct) > 1e-6 {
|
||||
t.Errorf("Percent: got %v, want %v", got.Percent, expectedPct)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("formato step/total sin velocidad", func(t *testing.T) {
|
||||
line := "progress: 15/20"
|
||||
got, ok := SdcliParseProgress(line)
|
||||
if !ok {
|
||||
t.Fatalf("expected match, got false")
|
||||
}
|
||||
if got.Step != 15 {
|
||||
t.Errorf("Step: got %d, want 15", got.Step)
|
||||
}
|
||||
if got.TotalSteps != 20 {
|
||||
t.Errorf("TotalSteps: got %d, want 20", got.TotalSteps)
|
||||
}
|
||||
if got.ItPerSec != 0 {
|
||||
t.Errorf("ItPerSec: got %v, want 0", got.ItPerSec)
|
||||
}
|
||||
expectedPct := 75.0
|
||||
if math.Abs(got.Percent-expectedPct) > 1e-6 {
|
||||
t.Errorf("Percent: got %v, want %v", got.Percent, expectedPct)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("formato con espacios variables y mayor velocidad", func(t *testing.T) {
|
||||
line := " 20/30 | 12.50it/s | 66%"
|
||||
got, ok := SdcliParseProgress(line)
|
||||
if !ok {
|
||||
t.Fatalf("expected match, got false")
|
||||
}
|
||||
if got.Step != 20 {
|
||||
t.Errorf("Step: got %d, want 20", got.Step)
|
||||
}
|
||||
if got.TotalSteps != 30 {
|
||||
t.Errorf("TotalSteps: got %d, want 30", got.TotalSteps)
|
||||
}
|
||||
if math.Abs(got.ItPerSec-12.5) > 1e-9 {
|
||||
t.Errorf("ItPerSec: got %v, want 12.5", got.ItPerSec)
|
||||
}
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user