chore: auto-commit (95 archivos)

- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 00:50:34 +02:00
parent ef60449e64
commit a802f59f55
189 changed files with 18964 additions and 330 deletions
@@ -0,0 +1,238 @@
+package infra
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// MlEnvCheck holds the result of a single ML environment probe.
+//
+// Version and Detail are optional and are omitted from the JSON encoding
+// when empty.
+type MlEnvCheck struct {
+	Name    string `json:"name"`              // check identifier, e.g. "nvcc", "python_venv"
+	Status  string `json:"status"`            // "ok" | "missing" | "warning" | "unknown"
+	Version string `json:"version,omitempty"` // version string if detected
+	Detail  string `json:"detail,omitempty"`  // human-readable extra info
+}
+
+// MlEnvReport is the full ML environment audit result produced by AuditMlEnv.
+type MlEnvReport struct {
+	Gpus        []GpuInfo    `json:"gpus"`         // GPUs reported by GetGpuInfo; empty when none were detected
+	Checks      []MlEnvCheck `json:"checks"`       // one entry per probe, in the order the probes ran
+	OverallOK   bool         `json:"overall_ok"`   // aggregate verdict computed by AuditMlEnv
+	GeneratedAt int64        `json:"generated_at"` // Unix timestamp (seconds) taken when the audit started
+}
+
+// AuditMlEnv probes the ML environment rooted at registryRoot.
+//
+// It records the NVIDIA GPUs reported by GetGpuInfo and then runs, in order:
+// nvidia-smi, nvcc, the Python venv under <registryRoot>/python/.venv, the
+// Python packages torch, diffusers, transformers, huggingface_hub and
+// stable_diffusion_cpp_python, the sd and llama-cli binaries, and the
+// imagegen vault directory. Every probe contributes one MlEnvCheck.
+//
+// OverallOK is true only when at least one GPU was detected and every
+// critical check (python_venv, torch, diffusers, transformers,
+// huggingface_hub) ended in "ok" or "warning". All other checks are
+// informational and never change OverallOK on their own.
+//
+// The returned error is currently always nil: probe failures are reported
+// through the Status of each check, not through the error value.
+func AuditMlEnv(registryRoot string) (MlEnvReport, error) {
+	report := MlEnvReport{
+		GeneratedAt: time.Now().Unix(),
+	}
+
+	// GPU detection is best-effort: a failing query is reported as "no GPUs".
+	// The slice is kept non-nil so it encodes as [] rather than null.
+	gpus, err := GetGpuInfo()
+	if err != nil || gpus == nil {
+		gpus = []GpuInfo{}
+	}
+	report.Gpus = gpus
+
+	// The venv interpreter is reused by every Python package probe.
+	venvPy := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
+
+	checks := []MlEnvCheck{
+		probeCommand("nvidia_smi", "nvidia-smi", []string{"--version"}, 5),
+		probeNvcc(),
+		probeVenv(registryRoot),
+	}
+	for _, pkg := range []string{"torch", "diffusers", "transformers", "huggingface_hub", "stable_diffusion_cpp_python"} {
+		checks = append(checks, probePythonPackage(venvPy, pkg))
+	}
+	checks = append(checks,
+		probeCommand("sd_cli", "sd", []string{"--version"}, 5),
+		probeCommand("llama_cpp", "llama-cli", []string{"--version"}, 5),
+		probeImagegenVault(),
+	)
+	report.Checks = checks
+
+	// Only the critical checks can veto the overall verdict; nvidia_smi, nvcc,
+	// stable_diffusion_cpp_python, sd_cli, llama_cpp and imagegen_vault are
+	// informational.
+	overallOK := len(gpus) > 0
+	for _, c := range checks {
+		switch c.Name {
+		case "python_venv", "torch", "diffusers", "transformers", "huggingface_hub":
+			if c.Status != "ok" && c.Status != "warning" {
+				overallOK = false
+			}
+		}
+	}
+	report.OverallOK = overallOK
+
+	return report, nil
+}
+
+// probeCommand checks whether a binary is available in PATH by running it with
+// the given args and recording any version output.
+func probeCommand(name, binary string, args []string, timeoutSec int) MlEnvCheck {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeoutSec)*time.Second)
+	defer cancel()
+
+	path, err := exec.LookPath(binary)
+	if err != nil {
+		return MlEnvCheck{Name: name, Status: "missing", Detail: fmt.Sprintf("%s not found in PATH", binary)}
+	}
+
+	out, err := exec.CommandContext(ctx, path, args...).CombinedOutput()
+	version := strings.TrimSpace(string(out))
+	if len(version) > 120 {
+		version = version[:120]
+	}
+	if err != nil {
+		return MlEnvCheck{Name: name, Status: "warning", Version: version, Detail: fmt.Sprintf("exit error: %v", err)}
+	}
+	return MlEnvCheck{Name: name, Status: "ok", Version: version}
+}
+
+// probeNvcc extracts the CUDA toolkit version from nvcc --version output.
+func probeNvcc() MlEnvCheck {
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	path, err := exec.LookPath("nvcc")
+	if err != nil {
+		return MlEnvCheck{Name: "nvcc", Status: "missing", Detail: "nvcc not found in PATH (CUDA toolkit not installed)"}
+	}
+
+	out, err := exec.CommandContext(ctx, path, "--version").CombinedOutput()
+	if err != nil {
+		return MlEnvCheck{Name: "nvcc", Status: "warning", Detail: fmt.Sprintf("nvcc --version failed: %v", err)}
+	}
+
+	// Extract version from line like: "Cuda compilation tools, release 12.4, V12.4.99"
+	version := ""
+	for _, line := range strings.Split(string(out), "\n") {
+		if strings.Contains(line, "release") {
+			parts := strings.Split(line, ",")
+			for _, p := range parts {
+				p = strings.TrimSpace(p)
+				if strings.HasPrefix(p, "release") {
+					version = strings.TrimSpace(strings.TrimPrefix(p, "release"))
+					break
+				}
+			}
+			break
+		}
+	}
+	if version == "" {
+		version = strings.TrimSpace(string(out))
+		if len(version) > 80 {
+			version = version[:80]
+		}
+	}
+	return MlEnvCheck{Name: "nvcc", Status: "ok", Version: version}
+}
+
+// probeVenv verifies that <registryRoot>/python/.venv/bin/python3 exists and
+// answers "--version" within 5 seconds.
+//
+// Status is "missing" when the interpreter file does not exist, "warning"
+// when running it fails, and "ok" otherwise; Version holds the trimmed
+// combined output of the command.
+func probeVenv(registryRoot string) MlEnvCheck {
+	interpreter := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
+
+	_, statErr := os.Stat(interpreter)
+	if os.IsNotExist(statErr) {
+		return MlEnvCheck{Name: "python_venv", Status: "missing", Detail: fmt.Sprintf("not found: %s", interpreter)}
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	raw, runErr := exec.CommandContext(ctx, interpreter, "--version").CombinedOutput()
+
+	result := MlEnvCheck{Name: "python_venv", Status: "ok", Version: strings.TrimSpace(string(raw))}
+	if runErr != nil {
+		result.Status = "warning"
+		result.Detail = fmt.Sprintf("python3 --version failed: %v", runErr)
+	}
+	return result
+}
+
+// probePythonPackage imports a package in the venv Python and extracts __version__.
+func probePythonPackage(venvPy, pkg string) MlEnvCheck {
+	// Map package name → import name (for packages with different import names).
+	importName := pkg
+	switch pkg {
+	case "stable_diffusion_cpp_python":
+		importName = "stable_diffusion_cpp"
+	case "huggingface_hub":
+		importName = "huggingface_hub"
+	}
+
+	// Check that the venv python binary exists first.
+	if _, err := os.Stat(venvPy); os.IsNotExist(err) {
+		return MlEnvCheck{Name: pkg, Status: "unknown", Detail: "python_venv not available"}
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	script := fmt.Sprintf("import %s; v = getattr(%s, '__version__', None); print(v or 'unknown')", importName, importName)
+	out, err := exec.CommandContext(ctx, venvPy, "-c", script).CombinedOutput()
+	output := strings.TrimSpace(string(out))
+
+	if err != nil {
+		// Module not found → missing; other errors → warning.
+		detail := output
+		if len(detail) > 200 {
+			detail = detail[:200]
+		}
+		if strings.Contains(output, "ModuleNotFoundError") || strings.Contains(output, "No module named") {
+			return MlEnvCheck{Name: pkg, Status: "missing", Detail: fmt.Sprintf("%s not installed", importName)}
+		}
+		return MlEnvCheck{Name: pkg, Status: "warning", Detail: detail}
+	}
+	return MlEnvCheck{Name: pkg, Status: "ok", Version: output}
+}
+
+// probeImagegenVault checks that ~/vaults/imagegen_models exists and lists subdirs.
+func probeImagegenVault() MlEnvCheck {
+	home, err := os.UserHomeDir()
+	if err != nil {
+		return MlEnvCheck{Name: "imagegen_vault", Status: "unknown", Detail: "cannot determine home directory"}
+	}
+	vaultPath := filepath.Join(home, "vaults", "imagegen_models")
+	entries, err := os.ReadDir(vaultPath)
+	if os.IsNotExist(err) {
+		return MlEnvCheck{Name: "imagegen_vault", Status: "missing", Detail: fmt.Sprintf("vault not found: %s", vaultPath)}
+	}
+	if err != nil {
+		return MlEnvCheck{Name: "imagegen_vault", Status: "warning", Detail: fmt.Sprintf("cannot read vault: %v", err)}
+	}
+
+	subdirs := []string{}
+	for _, e := range entries {
+		if e.IsDir() {
+			subdirs = append(subdirs, e.Name())
+		}
+	}
+	detail := fmt.Sprintf("subdirs: %s", strings.Join(subdirs, ", "))
+	if len(subdirs) == 0 {
+		detail = "vault exists but is empty"
+	}
+	return MlEnvCheck{Name: "imagegen_vault", Status: "ok", Detail: detail}
+}
@@ -0,0 +1,67 @@
+---
+name: audit_ml_env
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func AuditMlEnv(registryRoot string) (MlEnvReport, error)"
+description: "Audita el entorno ML del sistema: GPUs NVIDIA, toolkit CUDA, venv Python, paquetes clave (torch, diffusers, transformers, huggingface_hub), herramientas CLI (sd, llama-cli) y el vault de modelos. Retorna un MlEnvReport con OverallOK=true solo si hay al menos 1 GPU y los checks criticos estan en ok/warning."
+tags: [ml, cuda, gpu, nvidia, audit, doctor, infra, torch, diffusers]
+uses_functions: [get_gpu_info_go_infra]
+uses_types: [gpu_info_go_infra]
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [context, fmt, os, os/exec, path/filepath, strings, time]
+tested: true
+tests:
+  - "report no nil y tiene checks"
+  - "generated_at es positivo"
+  - "checks tiene al menos 4 entradas"
+  - "gpus puede ser vacio en CI"
+test_file_path: "functions/infra/audit_ml_env_test.go"
+file_path: "functions/infra/audit_ml_env.go"
+params:
+  - name: registryRoot
+    desc: "Ruta absoluta a la raiz del fn_registry. Se usa para localizar python/.venv/bin/python3 y probar paquetes instalados."
+output: "MlEnvReport con Gpus (puede estar vacio si no hay NVIDIA), Checks con estado por herramienta/paquete, OverallOK y GeneratedAt (unix timestamp)."
+---
+
+## Checks realizados
+
+| Check | Tipo | Critico |
+|---|---|---|
+| `nvidia_smi` | binary in PATH | no (ok si hay GPU) |
+| `nvcc` | CUDA toolkit version | no |
+| `python_venv` | exists + `python3 --version` | si |
+| `torch` | `import torch; __version__` | si |
+| `diffusers` | `import diffusers; __version__` | si |
+| `transformers` | `import transformers; __version__` | si |
+| `huggingface_hub` | `import huggingface_hub; __version__` | si |
+| `stable_diffusion_cpp_python` | `import stable_diffusion_cpp` | no (opcional) |
+| `sd_cli` | `sd --version` in PATH | no (opcional) |
+| `llama_cpp` | `llama-cli --version` in PATH | no (opcional) |
+| `imagegen_vault` | `~/vaults/imagegen_models` exists | no |
+
+## Ejemplo
+
+```go
+root := "/home/lucas/fn_registry"
+report, err := AuditMlEnv(root)
+if err != nil {
+    log.Fatal(err)
+}
+for _, c := range report.Checks {
+    fmt.Printf("%-40s %s  %s\n", c.Name, c.Status, c.Version)
+}
+fmt.Printf("OverallOK: %v\n", report.OverallOK)
+```
+
+## Notas
+
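+- Cada probe que lanza un proceso (binarios, venv, paquetes Python) tiene timeout de 5 segundos; la consulta de GPUs via `GetGpuInfo()` y la lectura del vault se ejecutan sin timeout.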
+- `stable_diffusion_cpp_python`, `sd_cli` y `llama_cpp` son opcionales: si estan missing, `OverallOK` no se ve afectado.
+- `OverallOK` requiere al menos 1 GPU NVIDIA detectada via `GetGpuInfo()`.
+- No escribe nada en disco. Read-only.
+- Se expone como `fn doctor ml` via cmd/fn/doctor.go.
@@ -0,0 +1,53 @@
+package infra
+
+import (
+	"testing"
+)
+
+// TestAuditMlEnv runs the audit against the real registry root and checks the
+// structural guarantees of the report. It makes no assumption about which
+// tools or GPUs are present on the machine.
+//
+// The audit is executed once and shared by every subtest: each call spawns a
+// series of subprocesses with multi-second timeouts, so repeating it per
+// subtest only makes the suite slower without adding coverage.
+func TestAuditMlEnv(t *testing.T) {
+	// Tests run from the package directory; the registry root is two levels up.
+	registryRoot := "../.."
+
+	report, err := AuditMlEnv(registryRoot)
+	if err != nil {
+		t.Fatalf("AuditMlEnv returned error: %v", err)
+	}
+
+	t.Run("report no nil y tiene checks", func(t *testing.T) {
+		if report.Checks == nil {
+			t.Fatal("report.Checks is nil")
+		}
+	})
+
+	t.Run("generated_at es positivo", func(t *testing.T) {
+		if report.GeneratedAt <= 0 {
+			t.Errorf("GeneratedAt should be positive unix timestamp, got %d", report.GeneratedAt)
+		}
+	})
+
+	t.Run("checks tiene al menos 4 entradas", func(t *testing.T) {
+		if len(report.Checks) < 4 {
+			t.Errorf("expected at least 4 checks, got %d", len(report.Checks))
+		}
+	})
+
+	t.Run("gpus puede ser vacio en CI", func(t *testing.T) {
+		// Gpus may be empty in CI without a GPU; only nil is a failure.
+		if report.Gpus == nil {
+			t.Error("report.Gpus should be a non-nil slice (can be empty)")
+		}
+	})
+}
@@ -0,0 +1,60 @@
+package infra
+
+import (
+	"encoding/csv"
+	"errors"
+	"fmt"
+	"os/exec"
+	"strconv"
+	"strings"
+)
+
+// GetGpuInfo queries NVIDIA GPUs via nvidia-smi and returns one GpuInfo per
+// CSV record in its output.
+//
+// If nvidia-smi is not installed, or it runs but exits with an error, the
+// function returns an empty (non-nil) slice and a nil error: absence of
+// NVIDIA hardware is not treated as a failure. A non-nil error is returned
+// only when the command cannot be run for another reason or its output is
+// not parseable as CSV.
+//
+// Records with fewer than six fields are skipped. Numeric fields that do not
+// parse as integers are left at whatever strconv.Atoi returns alongside the
+// ignored error (0 for non-numeric text).
+func GetGpuInfo() ([]GpuInfo, error) {
+	// NOTE(review): confirm that cuda_version is accepted by --query-gpu on the
+	// targeted drivers. A rejected field presumably makes nvidia-smi exit
+	// non-zero, which the branch below would report as "no GPUs".
+	out, err := exec.Command(
+		"nvidia-smi",
+		"--query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version",
+		"--format=csv,noheader,nounits",
+	).Output()
+
+	if err != nil {
+		// nvidia-smi not installed or exiting with an error — not an error here.
+		var exitErr *exec.ExitError
+		if errors.Is(err, exec.ErrNotFound) || errors.As(err, &exitErr) {
+			return []GpuInfo{}, nil
+		}
+		return nil, fmt.Errorf("gpu_info: nvidia-smi: %w", err)
+	}
+
+	r := csv.NewReader(strings.NewReader(strings.TrimSpace(string(out))))
+	r.TrimLeadingSpace = true
+	// Disable the "every record must match the first one" check. With the
+	// default (0) a single short or long line makes ReadAll fail with
+	// ErrFieldCount, so the per-record length filter below could never skip it.
+	r.FieldsPerRecord = -1
+
+	records, err := r.ReadAll()
+	if err != nil {
+		return nil, fmt.Errorf("gpu_info: parse csv: %w", err)
+	}
+
+	gpus := make([]GpuInfo, 0, len(records))
+	for _, rec := range records {
+		if len(rec) < 6 {
+			continue
+		}
+
+		// Conversion errors are deliberately ignored: a non-numeric value
+		// leaves the field at the value Atoi returns (0).
+		idx, _ := strconv.Atoi(strings.TrimSpace(rec[0]))
+		totalMb, _ := strconv.Atoi(strings.TrimSpace(rec[2]))
+		freeMb, _ := strconv.Atoi(strings.TrimSpace(rec[3]))
+
+		gpus = append(gpus, GpuInfo{
+			Index:         idx,
+			Name:          strings.TrimSpace(rec[1]),
+			VramTotalMb:   totalMb,
+			VramFreeMb:    freeMb,
+			DriverVersion: strings.TrimSpace(rec[4]),
+			CudaVersion:   strings.TrimSpace(rec[5]),
+		})
+	}
+
+	return gpus, nil
+}
@@ -0,0 +1,70 @@
+---
+name: get_gpu_info
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func GetGpuInfo() ([]GpuInfo, error)"
+description: "Consulta GPUs NVIDIA via nvidia-smi y retorna un slice de GpuInfo con index, nombre, VRAM total/libre, driver y version CUDA. Si nvidia-smi no esta instalado o no hay GPU NVIDIA, retorna slice vacio y nil (ausencia de hardware no es error)."
+tags: [gpu, nvidia, cuda, hardware, infra, probe]
+uses_functions: []
+uses_types: ["gpu_info_go_infra"]
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [encoding/csv, errors, fmt, os/exec, strconv, strings]
+params:
+  - name: (ninguno)
+    desc: "No toma parametros. Lee el estado del sistema via nvidia-smi."
+output: "Slice de GpuInfo con una entrada por GPU detectada. Slice vacio si nvidia-smi no esta instalado o termina con codigo de salida distinto de cero (por ejemplo, sin GPUs NVIDIA). Error solo si nvidia-smi no se puede ejecutar por otra causa o si su salida CSV no se puede parsear."
+tested: true
+tests:
+  - "retorna slice vacio y nil cuando no hay GPU NVIDIA"
+  - "linea GPU RTX 3080 tipica"
+  - "dos GPUs en el CSV"
+  - "CSV vacio retorna slice vacio"
+  - "linea con menos de 6 campos se ignora"
+  - "espacios extra en los valores se eliminan"
+  - "campos del struct GpuInfo correctos"
+test_file_path: "functions/infra/get_gpu_info_test.go"
+file_path: "functions/infra/get_gpu_info.go"
+---
+
+## Ejemplo
+
+```go
+gpus, err := GetGpuInfo()
+if err != nil {
+    log.Fatal(err)
+}
+if len(gpus) == 0 {
+    fmt.Println("No NVIDIA GPUs detected")
+} else {
+    for _, g := range gpus {
+        fmt.Printf("[%d] %s  VRAM: %d/%d MiB  Driver: %s  CUDA: %s\n",
+            g.Index, g.Name, g.VramFreeMb, g.VramTotalMb,
+            g.DriverVersion, g.CudaVersion)
+    }
+}
+```
+
+## Salida nvidia-smi
+
+Ejecuta:
+```
+nvidia-smi --query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version --format=csv,noheader,nounits
+```
+
+Ejemplo de salida con una GPU:
+```
+0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4
+```
+
+## Notas
+
+- Requiere `nvidia-smi` en PATH (parte del driver NVIDIA).
+- La columna `cuda_version` en nvidia-smi refleja la version maxima de CUDA soportada por el driver, no la del toolkit instalado.
+- Para comprobar el toolkit CUDA instalado, usar `cuda_toolkit_check_bash_infra`.
+- En maquinas sin GPU NVIDIA retorna `([]GpuInfo{}, nil)` — el caller puede tratar esto como "sin GPU disponible".
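+- `TestGetGpuInfoNoGpu` invoca `nvidia-smi` real y solo comprueba que no haya error ni slice nil, por lo que pasa con o sin GPU; el parsing CSV se cubre con entradas fijas en `TestParseCsvNvidiaSmi`. Los valores concretos de una GPU real deben verificarse manualmente o con mock.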
@@ -0,0 +1,165 @@
+package infra
+
+import (
+	"strconv"
+	"strings"
+	"testing"
+)
+
+// TestGetGpuInfoNoGpu checks that GetGpuInfo reports no error and never
+// returns a nil slice, whether or not nvidia-smi and an NVIDIA GPU are
+// present on the machine running the test.
+func TestGetGpuInfoNoGpu(t *testing.T) {
+	t.Run("retorna slice vacio y nil cuando no hay GPU NVIDIA", func(t *testing.T) {
+		detected, queryErr := GetGpuInfo()
+
+		if queryErr != nil {
+			t.Errorf("GetGpuInfo() error inesperado: %v", queryErr)
+		}
+
+		// Without nvidia-smi the result must be an empty slice, not nil.
+		if detected == nil {
+			t.Error("GetGpuInfo() retorno nil, se esperaba slice vacio []GpuInfo{}")
+		}
+	})
+}
+
+// parseCsvNvidiaSmi is a test-only re-implementation of the record handling
+// in GetGpuInfo, so parsing can be exercised without nvidia-smi.
+//
+// It takes text in the shape of "nvidia-smi --format=csv,noheader,nounits"
+// output, splits it on newlines and commas, skips lines with fewer than six
+// fields and converts the rest into GpuInfo values. The error result is
+// always nil.
+func parseCsvNvidiaSmi(output string) ([]GpuInfo, error) {
+	body := strings.TrimSpace(output)
+	if body == "" {
+		return []GpuInfo{}, nil
+	}
+
+	// field returns the i-th column with surrounding whitespace removed.
+	field := func(cols []string, i int) string {
+		return strings.TrimSpace(cols[i])
+	}
+	// number converts a column to int, ignoring conversion errors.
+	number := func(s string) int {
+		n, _ := strconv.Atoi(s)
+		return n
+	}
+
+	rows := strings.Split(body, "\n")
+	parsed := make([]GpuInfo, 0, len(rows))
+	for _, row := range rows {
+		cols := strings.Split(row, ",")
+		if len(cols) >= 6 {
+			parsed = append(parsed, GpuInfo{
+				Index:         number(field(cols, 0)),
+				Name:          field(cols, 1),
+				VramTotalMb:   number(field(cols, 2)),
+				VramFreeMb:    number(field(cols, 3)),
+				DriverVersion: field(cols, 4),
+				CudaVersion:   field(cols, 5),
+			})
+		}
+	}
+	return parsed, nil
+}
+
+// TestParseCsvNvidiaSmi verifica el parsing de la salida CSV de nvidia-smi
+// sin requerir GPU real ni nvidia-smi instalado.
+func TestParseCsvNvidiaSmi(t *testing.T) {
+	tests := []struct {
+		name          string
+		csvInput      string
+		wantLen       int
+		wantIndex     int
+		wantName      string
+		wantVramTotal int
+		wantVramFree  int
+		wantDriver    string
+		wantCuda      string
+	}{
+		{
+			name:          "linea GPU RTX 3080 tipica",
+			csvInput:      "0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4",
+			wantLen:       1,
+			wantIndex:     0,
+			wantName:      "NVIDIA GeForce RTX 3080",
+			wantVramTotal: 10240,
+			wantVramFree:  8192,
+			wantDriver:    "550.54.15",
+			wantCuda:      "12.4",
+		},
+		{
+			name:     "dos GPUs en el CSV",
+			csvInput: "0, GPU A, 8192, 4096, 525.0, 12.0\n1, GPU B, 24576, 20000, 525.0, 12.0",
+			wantLen:  2,
+		},
+		{
+			name:     "CSV vacio retorna slice vacio",
+			csvInput: "",
+			wantLen:  0,
+		},
+		{
+			name:     "linea con menos de 6 campos se ignora",
+			csvInput: "0, GPU, 8192",
+			wantLen:  0,
+		},
+		{
+			name:          "espacios extra en los valores se eliminan",
+			csvInput:      " 1 ,  NVIDIA RTX 4090 ,  24576 ,  20000 ,  545.0 ,  12.6 ",
+			wantLen:       1,
+			wantIndex:     1,
+			wantName:      "NVIDIA RTX 4090",
+			wantVramTotal: 24576,
+			wantVramFree:  20000,
+			wantDriver:    "545.0",
+			wantCuda:      "12.6",
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			gpus, err := parseCsvNvidiaSmi(tc.csvInput)
+			if err != nil {
+				t.Fatalf("error inesperado: %v", err)
+			}
+			if len(gpus) != tc.wantLen {
+				t.Fatalf("len(gpus) = %d, quería %d", len(gpus), tc.wantLen)
+			}
+			if tc.wantLen == 1 {
+				g := gpus[0]
+				if g.Index != tc.wantIndex {
+					t.Errorf("Index = %d, quería %d", g.Index, tc.wantIndex)
+				}
+				if g.Name != tc.wantName {
+					t.Errorf("Name = %q, quería %q", g.Name, tc.wantName)
+				}
+				if g.VramTotalMb != tc.wantVramTotal {
+					t.Errorf("VramTotalMb = %d, quería %d", g.VramTotalMb, tc.wantVramTotal)
+				}
+				if g.VramFreeMb != tc.wantVramFree {
+					t.Errorf("VramFreeMb = %d, quería %d", g.VramFreeMb, tc.wantVramFree)
+				}
+				if g.DriverVersion != tc.wantDriver {
+					t.Errorf("DriverVersion = %q, quería %q", g.DriverVersion, tc.wantDriver)
+				}
+				if g.CudaVersion != tc.wantCuda {
+					t.Errorf("CudaVersion = %q, quería %q", g.CudaVersion, tc.wantCuda)
+				}
+			}
+		})
+	}
+}
+
+// TestGpuInfoStruct builds a GpuInfo literal and reads back Index, Name,
+// VramTotalMb and VramFreeMb. DriverVersion and CudaVersion are set but not
+// asserted.
+func TestGpuInfoStruct(t *testing.T) {
+	t.Run("campos del struct GpuInfo correctos", func(t *testing.T) {
+		sample := GpuInfo{
+			Index:         0,
+			Name:          "NVIDIA GeForce GTX 1080",
+			VramTotalMb:   8192,
+			VramFreeMb:    6144,
+			DriverVersion: "470.0",
+			CudaVersion:   "11.4",
+		}
+
+		if got := sample.Index; got != 0 {
+			t.Errorf("Index = %d", got)
+		}
+		if got := sample.Name; got != "NVIDIA GeForce GTX 1080" {
+			t.Errorf("Name = %q", got)
+		}
+		if got := sample.VramTotalMb; got != 8192 {
+			t.Errorf("VramTotalMb = %d", got)
+		}
+		if got := sample.VramFreeMb; got != 6144 {
+			t.Errorf("VramFreeMb = %d", got)
+		}
+	})
+}
@@ -0,0 +1,12 @@
+package infra
+
+// GpuInfo describes one GPU detected on the system, with its VRAM figures and
+// the driver and CUDA versions reported for it.
+//
+// GetGpuInfo fills the fields from the nvidia-smi columns index, name,
+// memory.total, memory.free, driver_version and cuda_version, in that order.
+// CudaVersion is omitted from the JSON encoding when empty.
+type GpuInfo struct {
+	Index         int    `json:"index"`
+	Name          string `json:"name"`
+	VramTotalMb   int    `json:"vram_total_mb"`
+	VramFreeMb    int    `json:"vram_free_mb"`
+	DriverVersion string `json:"driver_version"`
+	CudaVersion   string `json:"cuda_version,omitempty"`
+}
@@ -0,0 +1,171 @@
+package infra
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"time"
+)
+
+// AggregateReport summarises the result of a VaultAggregateIndex run.
+//
+// VaultsProcessed counts vaults whose rows were committed to registry.db and
+// TotalFiles is the sum of the rows written for those vaults. A vault that
+// fails part-way is counted in neither field; its failure is described by an
+// entry in Errors instead.
+type AggregateReport struct {
+	VaultsProcessed int
+	VaultsSkipped   int      // vaults without a vault_index.db
+	TotalFiles      int
+	Errors          []string // non-fatal per-vault errors
+}
+
+// VaultAggregateIndex reads all vault manifests from repoRoot, opens each
+// vault_index.db and copies all file records into the central registry.db
+// vault_files table. The table and its indexes are created if they do not
+// exist (idempotent).
+//
+// For each vault the previous rows are deleted and the new ones inserted in
+// a single transaction, so re-running always produces a clean,
+// non-duplicated state. If any step for a vault fails, that vault's
+// transaction is rolled back, exactly one message is appended to
+// report.Errors and processing continues with the next vault.
+//
+// Rows that cannot be scanned are skipped and summarised in report.Errors;
+// an error while iterating the vault's rows aborts that vault, so a
+// truncated read never replaces a previously complete set of rows.
+//
+// Returns an AggregateReport with counts. Only fatal errors (registry.db
+// cannot be opened, the schema cannot be created, or the manifest cannot be
+// read) are returned as the error value.
+func VaultAggregateIndex(repoRoot string) (AggregateReport, error) {
+	var report AggregateReport
+
+	// fileRow mirrors the columns read from a vault's files table.
+	type fileRow struct {
+		RelPath   string
+		Size      int64
+		Mtime     int64
+		Sha256    string
+		Mime      string
+		Ext       string
+		Bucket    string
+		SubBucket string
+	}
+
+	// 1. Open registry.db
+	registryDB, err := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
+	if err != nil {
+		return report, fmt.Errorf("vault_aggregate_index: open registry.db: %w", err)
+	}
+	defer registryDB.Close()
+
+	// 2. Idempotent schema migration
+	for _, ddl := range []string{
+		`CREATE TABLE IF NOT EXISTS vault_files (
+    vault_id    TEXT NOT NULL,
+    vault_name  TEXT NOT NULL,
+    rel_path    TEXT NOT NULL,
+    size        INTEGER NOT NULL,
+    mtime       INTEGER NOT NULL,
+    sha256      TEXT NOT NULL,
+    mime        TEXT NOT NULL DEFAULT '',
+    ext         TEXT NOT NULL DEFAULT '',
+    bucket      TEXT NOT NULL DEFAULT '',
+    sub_bucket  TEXT NOT NULL DEFAULT '',
+    indexed_at  INTEGER NOT NULL,
+    PRIMARY KEY (vault_id, rel_path)
+);`,
+		`CREATE INDEX IF NOT EXISTS idx_vault_files_sha256 ON vault_files(sha256);`,
+		`CREATE INDEX IF NOT EXISTS idx_vault_files_vault ON vault_files(vault_id);`,
+	} {
+		if _, err := registryDB.Exec(ddl); err != nil {
+			if !isIdempotentMigrationError(err) {
+				return report, fmt.Errorf("vault_aggregate_index: schema: %w", err)
+			}
+		}
+	}
+
+	// 3. Read manifest
+	entries, err := VaultManifestRead(repoRoot)
+	if err != nil {
+		return report, fmt.Errorf("vault_aggregate_index: manifest: %w", err)
+	}
+
+	// One timestamp for the whole run, so every row written shares indexed_at.
+	now := time.Now().UTC().Unix()
+
+	for _, entry := range entries {
+		vaultID := vaultIDFromEntry(entry)
+		vaultName := entry.Name
+		vaultPath := entry.Path
+
+		// A vault without an index is not an error; it is only counted.
+		indexPath := filepath.Join(vaultPath, "vault_index.db")
+		if _, statErr := os.Stat(indexPath); statErr != nil {
+			report.VaultsSkipped++
+			continue
+		}
+
+		vaultDB, openErr := VaultIndexOpen(vaultPath)
+		if openErr != nil {
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: open index: %v", vaultName, openErr))
+			continue
+		}
+
+		rows, queryErr := vaultDB.Query(
+			`SELECT rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket FROM files`,
+		)
+		if queryErr != nil {
+			vaultDB.Close()
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: query files: %v", vaultName, queryErr))
+			continue
+		}
+
+		// Load the whole vault into memory first so the vault database can be
+		// closed before the registry transaction starts.
+		var fileRows []fileRow
+		var firstScanErr error
+		scanFailures := 0
+		for rows.Next() {
+			var r fileRow
+			if scanErr := rows.Scan(&r.RelPath, &r.Size, &r.Mtime, &r.Sha256, &r.Mime, &r.Ext, &r.Bucket, &r.SubBucket); scanErr != nil {
+				if firstScanErr == nil {
+					firstScanErr = scanErr
+				}
+				scanFailures++
+				continue
+			}
+			fileRows = append(fileRows, r)
+		}
+		iterErr := rows.Err()
+		rows.Close()
+		vaultDB.Close()
+
+		if iterErr != nil {
+			// Iteration stopped early: fileRows is incomplete, so keep the
+			// rows already stored in registry.db for this vault.
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: read files: %v", vaultName, iterErr))
+			continue
+		}
+		if scanFailures > 0 {
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: scan: skipped %d row(s), first error: %v", vaultName, scanFailures, firstScanErr))
+		}
+
+		// Atomic replace in registry.db
+		tx, txErr := registryDB.Begin()
+		if txErr != nil {
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: begin tx: %v", vaultName, txErr))
+			continue
+		}
+
+		if _, delErr := tx.Exec(`DELETE FROM vault_files WHERE vault_id = ?`, vaultID); delErr != nil {
+			tx.Rollback()
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: delete: %v", vaultName, delErr))
+			continue
+		}
+
+		insertStmt, prepErr := tx.Prepare(`
+INSERT INTO vault_files
+    (vault_id, vault_name, rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
+		if prepErr != nil {
+			tx.Rollback()
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: prepare: %v", vaultName, prepErr))
+			continue
+		}
+
+		// Stop at the first failing insert. The failure is handled after the
+		// loop so the statement is closed and the transaction rolled back
+		// exactly once, and the vault is abandoned as a whole.
+		var insErr error
+		failedPath := ""
+		for _, r := range fileRows {
+			if _, insErr = insertStmt.Exec(vaultID, vaultName, r.RelPath, r.Size, r.Mtime, r.Sha256, r.Mime, r.Ext, r.Bucket, r.SubBucket, now); insErr != nil {
+				failedPath = r.RelPath
+				break
+			}
+		}
+		insertStmt.Close()
+
+		if insErr != nil {
+			tx.Rollback()
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: insert %s: %v", vaultName, failedPath, insErr))
+			continue
+		}
+
+		if commitErr := tx.Commit(); commitErr != nil {
+			report.Errors = append(report.Errors, fmt.Sprintf("%s: commit: %v", vaultName, commitErr))
+			continue
+		}
+
+		report.VaultsProcessed++
+		report.TotalFiles += len(fileRows)
+	}
+
+	return report, nil
+}
+
+// vaultIDFromEntry builds the vault ID stored in the vault_id column of
+// vault_files: "<vault_name>_<project_id>", or just "<vault_name>" when the
+// entry has no project ID.
+func vaultIDFromEntry(e VaultManifestEntry) string {
+	id := e.Name
+	if e.ProjectID != "" {
+		id += "_" + e.ProjectID
+	}
+	return id
+}
@@ -0,0 +1,58 @@
+---
+name: vault_aggregate_index
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func VaultAggregateIndex(repoRoot string) (AggregateReport, error)"
+description: "Agrega los índices de todos los vaults del registry en la tabla vault_files de registry.db. Lee cada vault_index.db (via VaultIndexOpen) y reemplaza las filas de forma atómica. Idempotente: re-ejecutar limpia y reescribe sin duplicar."
+tags: [vault, index, aggregate, registry]
+uses_functions:
+  - "vault_manifest_read_go_infra"
+  - "vault_index_open_go_infra"
+  - "sqlite_open_go_infra"
+uses_types:
+  - "vault_file_go_infra"
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports:
+  - "database/sql"
+  - "fmt"
+  - "os"
+  - "path/filepath"
+  - "time"
+tested: true
+tests:
+  - "TestVaultAggregateIndex_NoVaults"
+  - "TestVaultAggregateIndex_VaultWithoutIndex"
+  - "TestVaultAggregateIndex_HappyPath"
+  - "TestVaultAggregateIndex_ReRunReplaces"
+test_file_path: "functions/infra/vault_aggregate_index_test.go"
+file_path: "functions/infra/vault_aggregate_index.go"
+params:
+  - name: repoRoot
+    desc: "Ruta absoluta a la raiz del fn_registry (contiene registry.db y projects/)."
+output: "AggregateReport con VaultsProcessed, VaultsSkipped (sin vault_index.db), TotalFiles y Errors (errores no fatales por vault). Error fatal si registry.db no se puede abrir, si falla la creacion del esquema vault_files o si no se puede leer el manifest de vaults."
+---
+
+## Ejemplo
+
+```go
+report, err := infra.VaultAggregateIndex("/home/lucas/fn_registry")
+if err != nil {
+    log.Fatal(err)
+}
+fmt.Printf("Processed: %d vaults, %d files\n", report.VaultsProcessed, report.TotalFiles)
+for _, e := range report.Errors {
+    fmt.Println("warning:", e)
+}
+```
+
+## Notas
+
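+- No requiere que `registry/migrations/012_vault_files.sql` haya sido aplicado previamente: la función crea la tabla `vault_files` y sus índices de forma idempotente con `CREATE TABLE IF NOT EXISTS` / `CREATE INDEX IF NOT EXISTS`. Si la migración ya fue aplicada (o el indexer la aplica al arrancar), esas sentencias no tienen efecto.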
+- Por cada vault: `DELETE WHERE vault_id = ?` + batch `INSERT` dentro de una transacción. Re-run siempre produce el mismo resultado.
+- Vaults sin `vault_index.db` se cuentan en `VaultsSkipped` y se omiten sin error.
+- El `vault_id` sigue el patrón `<vault_name>_<project_id>`, consistente con la tabla `vaults` de registry.db.
@@ -0,0 +1,175 @@
+package infra
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+// setupAggregateTestRepo builds a throwaway repository for VaultAggregateIndex
+// tests and returns its root directory. The layout is:
+//
+//	<root>/
+//	  registry.db                             (opened and closed via SQLiteOpen)
+//	  projects/<projectID>/vaults/vault.yaml  (manifest declaring one vault)
+//	<vaultPath>/                              (created if it does not exist)
+//
+// When withIndex is true, a vault_index.db is opened inside vaultPath through
+// VaultIndexOpen and seeded with one row for "data/raw/sample.csv".
+// Any setup failure aborts the calling test via t.Fatalf.
+func setupAggregateTestRepo(t *testing.T, vaultName, projectID, vaultPath string, withIndex bool) string {
+	t.Helper()
+	repoRoot := t.TempDir()
+
+	// registry.db: open it once and close it again.
+	registry, err := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
+	if err != nil {
+		t.Fatalf("create registry.db: %v", err)
+	}
+	registry.Close()
+
+	// Project manifest declaring the single vault.
+	manifestDir := filepath.Join(repoRoot, "projects", projectID, "vaults")
+	if err := os.MkdirAll(manifestDir, 0755); err != nil {
+		t.Fatalf("mkdir projects: %v", err)
+	}
+	manifest := []byte("vaults:\n  - name: " + vaultName + "\n    description: test\n    path: " + vaultPath + "\n    tags: []\n")
+	manifestFile := filepath.Join(manifestDir, "vault.yaml")
+	if err := os.WriteFile(manifestFile, manifest, 0644); err != nil {
+		t.Fatalf("write vault.yaml: %v", err)
+	}
+
+	// The vault directory itself.
+	if err := os.MkdirAll(vaultPath, 0755); err != nil {
+		t.Fatalf("mkdir vault: %v", err)
+	}
+	if !withIndex {
+		return repoRoot
+	}
+
+	// Seed the per-vault index with exactly one file row.
+	index, err := VaultIndexOpen(vaultPath)
+	if err != nil {
+		t.Fatalf("VaultIndexOpen: %v", err)
+	}
+	ts := time.Now().UTC().Unix()
+	if _, err := index.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		"data/raw/sample.csv", 1024, ts, "deadbeef", "text/csv", ".csv", "data", "raw", ts); err != nil {
+		t.Fatalf("insert test file: %v", err)
+	}
+	index.Close()
+
+	return repoRoot
+}
+
+// TestVaultAggregateIndex_NoVaults runs the aggregator against a repo that has
+// a registry.db but no vault manifests, and expects an empty, error-free report.
+func TestVaultAggregateIndex_NoVaults(t *testing.T) {
+	repoRoot := t.TempDir()
+
+	// Only registry.db is present; there is no projects/ tree at all.
+	registry, openErr := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
+	if openErr != nil {
+		t.Fatalf("create registry.db: %v", openErr)
+	}
+	registry.Close()
+
+	got, err := VaultAggregateIndex(repoRoot)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := got.VaultsProcessed; n != 0 {
+		t.Errorf("VaultsProcessed: want 0, got %d", n)
+	}
+	if errs := got.Errors; len(errs) != 0 {
+		t.Errorf("Errors: want empty, got %v", errs)
+	}
+}
+
+func TestVaultAggregateIndex_VaultWithoutIndex(t *testing.T) {
+	vaultDir := t.TempDir()
+	root := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultDir, false /* no vault_index.db */)
+
+	report, err := VaultAggregateIndex(root)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if report.VaultsSkipped != 1 {
+		t.Errorf("VaultsSkipped: want 1, got %d", report.VaultsSkipped)
+	}
+	if report.VaultsProcessed != 0 {
+		t.Errorf("VaultsProcessed: want 0, got %d", report.VaultsProcessed)
+	}
+}
+
+// TestVaultAggregateIndex_HappyPath aggregates one vault whose index holds a
+// single file and verifies both the report and the vault_files row count in
+// registry.db.
+func TestVaultAggregateIndex_HappyPath(t *testing.T) {
+	vaultPath := t.TempDir()
+	repoRoot := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultPath, true)
+
+	got, err := VaultAggregateIndex(repoRoot)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := got.VaultsProcessed; n != 1 {
+		t.Errorf("VaultsProcessed: want 1, got %d", n)
+	}
+	if n := got.TotalFiles; n != 1 {
+		t.Errorf("TotalFiles: want 1, got %d", n)
+	}
+
+	// The aggregated row must be visible through a fresh connection.
+	registry, err := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
+	if err != nil {
+		t.Fatalf("open registry.db: %v", err)
+	}
+	defer registry.Close()
+
+	var rows int
+	err = registry.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&rows)
+	if err != nil {
+		t.Fatalf("count vault_files: %v", err)
+	}
+	if rows != 1 {
+		t.Errorf("vault_files count: want 1, got %d", rows)
+	}
+}
+
+// TestVaultAggregateIndex_ReRunReplaces aggregates a vault, adds a second file
+// to its index, aggregates again, and expects exactly two rows in vault_files
+// (the second run replaces the vault's rows instead of appending to them).
+func TestVaultAggregateIndex_ReRunReplaces(t *testing.T) {
+	vaultPath := t.TempDir()
+	repoRoot := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultPath, true)
+
+	// Run #1: one file in the index.
+	if _, err := VaultAggregateIndex(repoRoot); err != nil {
+		t.Fatalf("first run: %v", err)
+	}
+
+	// Grow the vault index by one more file.
+	index, err := VaultIndexOpen(vaultPath)
+	if err != nil {
+		t.Fatalf("reopen vault index: %v", err)
+	}
+	ts := time.Now().UTC().Unix()
+	if _, err := index.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		"data/raw/extra.csv", 512, ts, "cafebabe", "text/csv", ".csv", "data", "raw", ts); err != nil {
+		t.Fatalf("insert second file: %v", err)
+	}
+	index.Close()
+
+	// Run #2: the report must reflect both files.
+	got, err := VaultAggregateIndex(repoRoot)
+	if err != nil {
+		t.Fatalf("second run: %v", err)
+	}
+	if n := got.TotalFiles; n != 2 {
+		t.Errorf("TotalFiles: want 2, got %d", n)
+	}
+
+	// No duplicates: registry.db must hold exactly two rows.
+	registry, err := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
+	if err != nil {
+		t.Fatalf("open registry.db: %v", err)
+	}
+	defer registry.Close()
+
+	var rows int
+	err = registry.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&rows)
+	if err != nil {
+		t.Fatalf("count vault_files: %v", err)
+	}
+	if rows != 2 {
+		t.Errorf("vault_files count after re-run: want 2, got %d", rows)
+	}
+}
@@ -0,0 +1,68 @@
+package infra
+
+import "sort"
+
+// VaultFileChange holds the before/after state of a file whose content changed.
+// VaultDiff emits one entry per RelPath that is present in both snapshots with
+// a different Sha256.
+type VaultFileChange struct {
+	// RelPath is the path shared by Prev and Curr.
+	RelPath string
+	// Prev is the entry taken from the previous snapshot.
+	Prev    VaultFile
+	// Curr is the entry taken from the current snapshot.
+	Curr    VaultFile
+}
+
+// VaultDiffReport is the result of comparing two VaultFile slices.
+// Files are matched by RelPath; content equality is decided by Sha256 alone.
+type VaultDiffReport struct {
+	Added     []VaultFile       // in curr but not in prev (by rel_path)
+	Removed   []VaultFile       // in prev but not in curr
+	Changed   []VaultFileChange // same rel_path, different sha256
+	Unchanged int               // files present in both with identical sha256
+}
+
+// VaultDiff compares a previous and a current vault snapshot.
+//
+// Files are identified by RelPath. Each entry of curr is classified as Added
+// (path unknown in prev), Changed (path known, Sha256 differs) or counted as
+// Unchanged; each entry of prev whose path is absent from curr is Removed.
+// Added, Removed and Changed are sorted by RelPath ascending. Either argument
+// may be nil. The function performs no I/O.
+func VaultDiff(prev, curr []VaultFile) VaultDiffReport {
+	// Previous snapshot indexed by path; a later duplicate path overwrites an
+	// earlier one.
+	before := make(map[string]VaultFile, len(prev))
+	for i := range prev {
+		before[prev[i].RelPath] = prev[i]
+	}
+	// For the current snapshot only membership is needed.
+	present := make(map[string]struct{}, len(curr))
+	for i := range curr {
+		present[curr[i].RelPath] = struct{}{}
+	}
+
+	var out VaultDiffReport
+
+	for _, file := range curr {
+		old, known := before[file.RelPath]
+		switch {
+		case !known:
+			out.Added = append(out.Added, file)
+		case old.Sha256 != file.Sha256:
+			out.Changed = append(out.Changed, VaultFileChange{
+				RelPath: file.RelPath,
+				Prev:    old,
+				Curr:    file,
+			})
+		default:
+			out.Unchanged++
+		}
+	}
+
+	for _, file := range prev {
+		if _, still := present[file.RelPath]; still {
+			continue
+		}
+		out.Removed = append(out.Removed, file)
+	}
+
+	// Deterministic output order.
+	sort.Slice(out.Added, func(a, b int) bool {
+		return out.Added[a].RelPath < out.Added[b].RelPath
+	})
+	sort.Slice(out.Removed, func(a, b int) bool {
+		return out.Removed[a].RelPath < out.Removed[b].RelPath
+	})
+	sort.Slice(out.Changed, func(a, b int) bool {
+		return out.Changed[a].RelPath < out.Changed[b].RelPath
+	})
+
+	return out
+}
@@ -0,0 +1,49 @@
+---
+name: vault_diff
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: pure
+signature: "func VaultDiff(prev, curr []VaultFile) VaultDiffReport"
+description: "Computes the diff between two vault snapshots (slices of VaultFile). Returns the Added, Removed and Changed entries plus an Unchanged count. Pure and deterministic — no I/O."
+tags: [vault, diff, comparison, pure]
+uses_functions: []
+uses_types: ["vault_file_go_infra"]
+returns: []
+returns_optional: false
+error_type: ""
+imports: ["sort"]
+tested: true
+tests:
+  - "TestVaultDiff_NoChanges"
+  - "TestVaultDiff_AllAdded"
+  - "TestVaultDiff_AllRemoved"
+  - "TestVaultDiff_ContentChanged"
+  - "TestVaultDiff_Mixed"
+test_file_path: "functions/infra/vault_diff_test.go"
+file_path: "functions/infra/vault_diff.go"
+params:
+  - name: prev
+    desc: "Snapshot anterior — slice de VaultFile del estado previo del vault (puede ser nil para diff desde cero)."
+  - name: curr
+    desc: "Snapshot actual — slice de VaultFile del estado corriente del vault (puede ser nil para diff de borrado total)."
+output: "VaultDiffReport con Added (nuevos), Removed (eliminados), Changed (mismo rel_path, sha256 distinto) y Unchanged (identicos). Todos los slices ordenados por RelPath ASC."
+---
+
+## Ejemplo
+
+```go
+prev, _ := infra.VaultInventoryScan(oldPath, "my_vault_proj", "my_vault")
+curr, _ := infra.VaultInventoryScan(newPath, "my_vault_proj", "my_vault")
+report := infra.VaultDiff(prev, curr)
+fmt.Printf("Added: %d, Removed: %d, Changed: %d, Unchanged: %d\n",
+    len(report.Added), len(report.Removed), len(report.Changed), report.Unchanged)
+```
+
+## Notas
+
+- Usa `RelPath` como clave de identidad de archivo (no nombre, no sha256).
+- Dos archivos con mismo `RelPath` pero diferente `Sha256` se consideran Changed.
+- Los slices del report se ordenan por `RelPath` ASC para salida deterministica.
+- Función pura: no toca disco ni BD.
@@ -0,0 +1,126 @@
+package infra
+
+import (
+	"testing"
+)
+
+func makeVF(relPath, sha256 string) VaultFile {
+	return VaultFile{
+		VaultID:   "test_vault",
+		VaultName: "test",
+		RelPath:   relPath,
+		Sha256:    sha256,
+	}
+}
+
+// TestVaultDiff_NoChanges diffs a snapshot against itself: nothing is added,
+// removed or changed, and every file counts as unchanged.
+func TestVaultDiff_NoChanges(t *testing.T) {
+	snapshot := []VaultFile{
+		makeVF("data/a.csv", "aaa"),
+		makeVF("data/b.csv", "bbb"),
+	}
+	got := VaultDiff(snapshot, snapshot)
+
+	if n := len(got.Added); n != 0 {
+		t.Errorf("Added: want 0, got %d", n)
+	}
+	if n := len(got.Removed); n != 0 {
+		t.Errorf("Removed: want 0, got %d", n)
+	}
+	if n := len(got.Changed); n != 0 {
+		t.Errorf("Changed: want 0, got %d", n)
+	}
+	if n := got.Unchanged; n != 2 {
+		t.Errorf("Unchanged: want 2, got %d", n)
+	}
+}
+
+// TestVaultDiff_AllAdded diffs from a nil previous snapshot: every current file
+// is reported as added, in RelPath order.
+func TestVaultDiff_AllAdded(t *testing.T) {
+	current := []VaultFile{
+		makeVF("data/a.csv", "aaa"),
+		makeVF("data/b.csv", "bbb"),
+	}
+	got := VaultDiff(nil, current)
+
+	if n := len(got.Added); n != 2 {
+		t.Errorf("Added: want 2, got %d", n)
+	}
+	if n := len(got.Removed); n != 0 {
+		t.Errorf("Removed: want 0, got %d", n)
+	}
+	if first := got.Added[0].RelPath; first != "data/a.csv" {
+		t.Errorf("Added[0]: want data/a.csv, got %s", first)
+	}
+	if second := got.Added[1].RelPath; second != "data/b.csv" {
+		t.Errorf("Added[1]: want data/b.csv, got %s", second)
+	}
+}
+
+// TestVaultDiff_AllRemoved diffs against a nil current snapshot: every previous
+// file is reported as removed, in RelPath order.
+func TestVaultDiff_AllRemoved(t *testing.T) {
+	previous := []VaultFile{
+		makeVF("data/a.csv", "aaa"),
+		makeVF("data/b.csv", "bbb"),
+	}
+	got := VaultDiff(previous, nil)
+
+	if n := len(got.Removed); n != 2 {
+		t.Errorf("Removed: want 2, got %d", n)
+	}
+	if n := len(got.Added); n != 0 {
+		t.Errorf("Added: want 0, got %d", n)
+	}
+	if first := got.Removed[0].RelPath; first != "data/a.csv" {
+		t.Errorf("Removed[0]: want data/a.csv, got %s", first)
+	}
+}
+
+// TestVaultDiff_ContentChanged keeps the path and changes only the hash: the
+// file must appear once under Changed, carrying both versions.
+func TestVaultDiff_ContentChanged(t *testing.T) {
+	previous := []VaultFile{makeVF("data/a.csv", "old_hash")}
+	current := []VaultFile{makeVF("data/a.csv", "new_hash")}
+
+	got := VaultDiff(previous, current)
+	if n := len(got.Changed); n != 1 {
+		t.Fatalf("Changed: want 1, got %d", n)
+	}
+
+	change := got.Changed[0]
+	if change.RelPath != "data/a.csv" {
+		t.Errorf("Changed[0].RelPath: want data/a.csv, got %s", change.RelPath)
+	}
+	if change.Prev.Sha256 != "old_hash" {
+		t.Errorf("Changed[0].Prev.Sha256: want old_hash, got %s", change.Prev.Sha256)
+	}
+	if change.Curr.Sha256 != "new_hash" {
+		t.Errorf("Changed[0].Curr.Sha256: want new_hash, got %s", change.Curr.Sha256)
+	}
+
+	added, removed := len(got.Added), len(got.Removed)
+	if added != 0 || removed != 0 {
+		t.Errorf("Expected no added/removed, got %d/%d", added, removed)
+	}
+	if n := got.Unchanged; n != 0 {
+		t.Errorf("Unchanged: want 0, got %d", n)
+	}
+}
+
+// TestVaultDiff_Mixed covers one file of each class in a single diff:
+// a.csv unchanged, b.csv changed, c.csv removed and d.csv added.
+func TestVaultDiff_Mixed(t *testing.T) {
+	previous := []VaultFile{
+		makeVF("data/a.csv", "aaa"),
+		makeVF("data/b.csv", "bbb"),
+		makeVF("data/c.csv", "ccc"), // removed
+	}
+	current := []VaultFile{
+		makeVF("data/a.csv", "aaa"),     // unchanged
+		makeVF("data/b.csv", "bbb_new"), // changed
+		makeVF("data/d.csv", "ddd"),     // added
+	}
+	got := VaultDiff(previous, current)
+
+	if !(len(got.Added) == 1 && got.Added[0].RelPath == "data/d.csv") {
+		t.Errorf("Added: want [data/d.csv], got %v", got.Added)
+	}
+	if !(len(got.Removed) == 1 && got.Removed[0].RelPath == "data/c.csv") {
+		t.Errorf("Removed: want [data/c.csv], got %v", got.Removed)
+	}
+	if !(len(got.Changed) == 1 && got.Changed[0].RelPath == "data/b.csv") {
+		t.Errorf("Changed: want [data/b.csv], got %v", got.Changed)
+	}
+	if n := got.Unchanged; n != 1 {
+		t.Errorf("Unchanged: want 1, got %d", n)
+	}
+}
@@ -0,0 +1,230 @@
+package infra
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// VaultDoctorEntry holds the health report for a single vault.
+// Field comments sit above the fields so the tag column stays gofmt-aligned.
+type VaultDoctorEntry struct {
+	// VaultName, VaultPath and ProjectID are copied from the manifest entry.
+	VaultName     string   `json:"vault_name"`
+	VaultPath     string   `json:"vault_path"`
+	ProjectID     string   `json:"project_id"`
+	// Issues lists human-readable issue codes; empty means healthy.
+	Issues        []string `json:"issues"`
+	// IndexedFiles is the row count of the index; 0 if there is no vault_index.db.
+	IndexedFiles  int      `json:"indexed_files"`
+	// LastIndexedAt is MAX(indexed_at) in unix seconds; 0 if not available.
+	LastIndexedAt int64    `json:"last_indexed_at"`
+	// DiskFiles is the file count obtained via WalkDir (no hashing).
+	DiskFiles     int      `json:"disk_files"`
+	// Status is one of "ok", "warning" or "error".
+	Status        string   `json:"status"`
+}
+
+// VaultDoctor audits every vault declared in projects/*/vaults/vault.yaml under
+// repoRoot and returns one VaultDoctorEntry per manifest entry, in the order
+// VaultManifestRead returned them.
+//
+// Each vault goes through auditVault (disk presence, layout, index existence,
+// staleness, drift, emptiness). The audit is meant to be non-destructive.
+// NOTE(review): existing indexes are opened through VaultIndexOpen, which
+// applies its embedded migrations, so "never writes" should be confirmed
+// against SQLiteOpen/ApplyMigrations.
+//
+// An error is returned only when VaultManifestRead fails.
+func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error) {
+	manifests, err := VaultManifestRead(repoRoot)
+	if err != nil {
+		return nil, fmt.Errorf("vault_doctor: read manifests: %w", err)
+	}
+
+	reports := make([]VaultDoctorEntry, len(manifests))
+	for i := range manifests {
+		reports[i] = auditVault(manifests[i])
+	}
+	return reports, nil
+}
+
+// auditVault runs all health checks for one manifest entry and returns the
+// resulting report.
+//
+// Checks, in the order their issues are appended:
+//  1. directory_missing   — path missing or not a directory (status "error", stops here)
+//  2. layout_missing / non_standard_layout — neither data/ nor knowledge/ at the root
+//  3. index_missing, index_open_error, index_query_error
+//  4. index_stale         — some file is newer than MAX(indexed_at)
+//  5. index_drift         — disk file count differs from indexed row count
+//  6. empty_vault         — no files on disk
+//
+// Every issue other than directory_missing yields status "warning"; no issues
+// yields "ok". Unlike earlier revisions, empty_vault is evaluated even when
+// the index is missing or cannot be opened, and checks 4-5 are skipped when
+// the index query fails (a zero count from a failed query is not evidence of
+// drift).
+func auditVault(e VaultManifestEntry) VaultDoctorEntry {
+	entry := VaultDoctorEntry{
+		VaultName: e.Name,
+		VaultPath: e.Path,
+		ProjectID: e.ProjectID,
+	}
+
+	// Resolve symlinks for disk checks; fall back to the declared path so a
+	// missing directory is still reported by CHECK 1 below.
+	realPath, err := filepath.EvalSymlinks(e.Path)
+	if err != nil || realPath == "" {
+		realPath = e.Path
+	}
+
+	// CHECK 1: directory_missing — nothing else can be inspected without it.
+	info, statErr := os.Stat(realPath)
+	if statErr != nil || !info.IsDir() {
+		entry.Issues = append(entry.Issues, "directory_missing")
+		entry.Status = "error"
+		return entry
+	}
+
+	// COUNT disk files (cheap walk — no hashing, no mime detection).
+	diskCount := countDiskFiles(realPath)
+	entry.DiskFiles = diskCount
+
+	// CHECK 2: layout_missing / non_standard_layout.
+	hasData := dirExists(filepath.Join(realPath, "data"))
+	hasKnowledge := dirExists(filepath.Join(realPath, "knowledge"))
+	if !hasData && !hasKnowledge {
+		// Distinguish an intentional custom layout from no layout at all.
+		if hasNonStandardLayout(realPath) {
+			entry.Issues = append(entry.Issues, "non_standard_layout")
+		} else {
+			entry.Issues = append(entry.Issues, "layout_missing")
+		}
+	}
+
+	// CHECKS 3-5: everything that needs vault_index.db.
+	auditVaultIndex(&entry, realPath, diskCount)
+
+	// CHECK 6: empty_vault — independent of the index state.
+	if diskCount == 0 {
+		entry.Issues = append(entry.Issues, "empty_vault")
+	}
+
+	entry.setFinalStatus()
+	return entry
+}
+
+// auditVaultIndex performs the index-related checks (existence, open, query,
+// staleness, drift) for the vault at realPath and records findings on entry.
+// It returns early as soon as a later check can no longer be evaluated.
+func auditVaultIndex(entry *VaultDoctorEntry, realPath string, diskCount int) {
+	// CHECK 3: index_missing. Stat first so VaultIndexOpen, which creates the
+	// database when absent, is never called for a vault without an index.
+	indexPath := filepath.Join(realPath, "vault_index.db")
+	if _, err := os.Stat(indexPath); err != nil {
+		entry.Issues = append(entry.Issues, "index_missing")
+		return
+	}
+
+	vdb, err := VaultIndexOpen(realPath)
+	if err != nil {
+		entry.Issues = append(entry.Issues, fmt.Sprintf("index_open_error: %v", err))
+		return
+	}
+	defer vdb.Close()
+
+	// Indexed file count and most recent indexed_at (0 for an empty table).
+	var indexedCount int
+	var maxIndexedAt int64
+	row := vdb.QueryRow(`SELECT COUNT(*), COALESCE(MAX(indexed_at), 0) FROM files`)
+	if err := row.Scan(&indexedCount, &maxIndexedAt); err != nil {
+		entry.Issues = append(entry.Issues, fmt.Sprintf("index_query_error: %v", err))
+		// Without trustworthy numbers, stale/drift findings would be noise.
+		return
+	}
+	entry.IndexedFiles = indexedCount
+	entry.LastIndexedAt = maxIndexedAt
+
+	// CHECK 4: index_stale — any file on disk newer than MAX(indexed_at).
+	if maxIndexedAt > 0 && isIndexStale(realPath, time.Unix(maxIndexedAt, 0)) {
+		entry.Issues = append(entry.Issues, "index_stale")
+	}
+
+	// CHECK 5: index_drift — disk file count != indexed count.
+	if indexedCount != diskCount {
+		entry.Issues = append(entry.Issues, fmt.Sprintf("index_drift: disk=%d indexed=%d", diskCount, indexedCount))
+	}
+}
+
+// setWarningStatus downgrades the entry to "warning" unless it is already
+// marked "error", which always takes precedence.
+func (e *VaultDoctorEntry) setWarningStatus() {
+	if e.Status == "error" {
+		return
+	}
+	e.Status = "warning"
+}
+
+// setFinalStatus settles Status from the collected Issues: an existing "error"
+// is kept, no issues means "ok", anything else is "warning".
+func (e *VaultDoctorEntry) setFinalStatus() {
+	switch {
+	case e.Status == "error":
+		// An error set by an earlier check is never overwritten.
+	case len(e.Issues) == 0:
+		e.Status = "ok"
+	default:
+		e.Status = "warning"
+	}
+}
+
+// countDiskFiles walks realPath and returns the number of non-directory
+// entries below it, excluding:
+//   - anything whose name starts with "." (hidden files; hidden directories,
+//     including .git, are skipped with their whole subtree),
+//   - the index database files vault_index.db, vault_index.db-shm and
+//     vault_index.db-wal.
+//
+// The root directory itself is never filtered: a vault rooted at a path such
+// as "." or "/data/.vault" is still walked. The count is best-effort; entries
+// that cannot be read are silently skipped, and an unreadable root yields 0.
+func countDiskFiles(realPath string) int {
+	count := 0
+	// The walk error is ignored on purpose: the callback never returns one and
+	// a partial count is preferable to no count.
+	_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
+		if err != nil {
+			// Unreadable entry (d may be nil for the root): skip it, keep walking.
+			return nil
+		}
+		// Exempt the declared root from the hidden-name filter below.
+		if d.IsDir() && path == realPath {
+			return nil
+		}
+		name := d.Name()
+		if strings.HasPrefix(name, ".") {
+			if d.IsDir() {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+		if d.IsDir() {
+			return nil
+		}
+		switch name {
+		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
+			// The index and its WAL side files are not vault content.
+			return nil
+		}
+		count++
+		return nil
+	})
+	return count
+}
+
+// isIndexStale reports whether any file under realPath was modified after
+// maxTime, which is how auditVault detects an index that lags behind the disk.
+//
+// The comparison is done in whole unix seconds because indexed_at is stored
+// with that resolution: a file written and indexed within the same second
+// must not be flagged. Hidden files and directories (name starting with ".")
+// and the vault_index.db* files are ignored; the root directory itself is
+// never filtered. Unreadable entries are skipped. The walk stops at the first
+// stale file.
+func isIndexStale(realPath string, maxTime time.Time) bool {
+	cutoff := maxTime.Unix()
+	// Sentinel returned from the callback to abort the walk early.
+	errStale := fmt.Errorf("stale file found")
+
+	walkErr := filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
+		if err != nil {
+			// Unreadable entry (d may be nil for the root): skip it, keep walking.
+			return nil
+		}
+		name := d.Name()
+		if d.IsDir() {
+			// Skip hidden subtrees, but never the declared root.
+			if path != realPath && strings.HasPrefix(name, ".") {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+		if strings.HasPrefix(name, ".") {
+			return nil
+		}
+		switch name {
+		case "vault_index.db", "vault_index.db-shm", "vault_index.db-wal":
+			// The index is rewritten by indexing itself; it is not vault content.
+			return nil
+		}
+		if fi, statErr := d.Info(); statErr == nil && fi.ModTime().Unix() > cutoff {
+			return errStale
+		}
+		return nil
+	})
+	// WalkDir hands back the callback's error unchanged.
+	return walkErr == errStale
+}
+
+// hasNonStandardLayout reports whether the vault root contains at least one
+// visible subdirectory other than data/ and knowledge/. Such a directory is
+// taken as a sign of an intentional custom layout. Hidden directories
+// (including .git) and plain files do not count. An unreadable root yields
+// false.
+func hasNonStandardLayout(realPath string) bool {
+	children, err := os.ReadDir(realPath)
+	if err != nil {
+		return false
+	}
+	for _, child := range children {
+		if !child.IsDir() {
+			continue
+		}
+		switch name := child.Name(); {
+		case name == "data", name == "knowledge", name == ".git":
+			continue
+		case strings.HasPrefix(name, "."):
+			continue
+		}
+		return true
+	}
+	return false
+}
@@ -0,0 +1,66 @@
+---
+name: vault_doctor
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error)"
+description: "Audita la salud de todos los vaults declarados en projects/*/vaults/vault.yaml. Comprueba existencia del directorio, layout estándar, presencia del índice, staleness y drift entre disco e índice. Read-only."
+tags: [vault, doctor, health, audit]
+uses_functions:
+  - "vault_manifest_read_go_infra"
+  - "vault_index_open_go_infra"
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports:
+  - "fmt"
+  - "os"
+  - "path/filepath"
+  - "strings"
+  - "time"
+tested: true
+tests:
+  - "TestVaultDoctor_OK"
+  - "TestVaultDoctor_MissingDir"
+  - "TestVaultDoctor_NoIndex"
+  - "TestVaultDoctor_LayoutDrift"
+  - "TestVaultDoctor_EmptyVault"
+test_file_path: "functions/infra/vault_doctor_test.go"
+file_path: "functions/infra/vault_doctor.go"
+params:
+  - name: repoRoot
+    desc: "Ruta absoluta a la raiz del fn_registry (donde están projects/ y registry.db)."
+output: "Slice de VaultDoctorEntry con Status (ok/warning/error), Issues, DiskFiles, IndexedFiles y LastIndexedAt por vault. Error fatal solo si los manifests no se pueden leer."
+---
+
+## Checks aplicados
+
+| Check | Condición | Severidad |
+|---|---|---|
+| `directory_missing` | `e.Path` no existe en disco o no es un directorio | error |
+| `layout_missing` | no hay `data/` ni `knowledge/` en la raíz del vault | warning |
+| `non_standard_layout` | no hay `data/`/`knowledge/` pero sí otros subdirectorios (ej. imagegen_models) | warning |
+| `index_missing` | no existe `vault_index.db` | warning |
+| `index_stale` | algún archivo en disco tiene mtime > MAX(indexed_at) | warning |
+| `index_drift` | count disco != count en tabla `files` | warning |
+| `empty_vault` | DiskFiles == 0 | warning |
+
+## Ejemplo
+
+```go
+entries, err := infra.VaultDoctor("/home/lucas/fn_registry")
+for _, e := range entries {
+    fmt.Printf("%-30s  %-8s  files=%d  issues=%v\n",
+        e.VaultName, e.Status, e.DiskFiles, e.Issues)
+}
+```
+
+## Notas
+
+- Función read-only: nunca escribe en disco ni en ninguna base de datos.
+- `countDiskFiles` usa `filepath.WalkDir` sin hash (cheap) — excluye `vault_index.db*`, `.git/` y ficheros ocultos.
+- `isIndexStale` también usa WalkDir; compara mtime de archivos con MAX(indexed_at) de la BD.
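+- `VaultIndexOpen` abre **o crea** `vault_index.db` y aplica las migraciones embebidas; por eso `auditVault` comprueba antes con `os.Stat` que el fichero existe y, si falta, reporta `index_missing` sin llegar a abrirlo.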
@@ -0,0 +1,211 @@
+package infra
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+// setupDoctorRepo creates a temporary repo root containing a single manifest,
+// projects/<projectID>/vaults/vault.yaml, that declares one vault named
+// vaultName at vaultPath. It returns the repo root.
+//
+// vaultPath is written into the manifest verbatim and is not created here, so
+// callers can point it at an existing directory or at a missing one.
+func setupDoctorRepo(t *testing.T, vaultName, projectID, vaultPath string) string {
+	t.Helper()
+	repoRoot := t.TempDir()
+
+	manifestDir := filepath.Join(repoRoot, "projects", projectID, "vaults")
+	if err := os.MkdirAll(manifestDir, 0755); err != nil {
+		t.Fatalf("mkdir projects: %v", err)
+	}
+
+	yamlBody := []byte("vaults:\n  - name: " + vaultName + "\n    description: test vault\n    path: " + vaultPath + "\n    tags: []\n")
+	manifestFile := filepath.Join(manifestDir, "vault.yaml")
+	if err := os.WriteFile(manifestFile, yamlBody, 0644); err != nil {
+		t.Fatalf("write vault.yaml: %v", err)
+	}
+	return repoRoot
+}
+
+// TestVaultDoctor_OK builds a vault with the standard layout, one data file and
+// a matching index row indexed after the file's mtime, and expects a clean
+// "ok" report.
+func TestVaultDoctor_OK(t *testing.T) {
+	vaultPath := t.TempDir()
+
+	// Standard layout: data/raw and knowledge.
+	layout := []string{
+		filepath.Join(vaultPath, "data", "raw"),
+		filepath.Join(vaultPath, "knowledge"),
+	}
+	for _, dir := range layout {
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Back-date the file so the index (stamped "now") is not stale.
+	csvPath := filepath.Join(vaultPath, "data", "raw", "sample.csv")
+	if err := os.WriteFile(csvPath, []byte("a,b\n1,2\n"), 0644); err != nil {
+		t.Fatal(err)
+	}
+	hourAgo := time.Now().Add(-1 * time.Hour)
+	if err := os.Chtimes(csvPath, hourAgo, hourAgo); err != nil {
+		t.Fatal(err)
+	}
+
+	// Index row for the file, with indexed_at later than its mtime.
+	index, err := VaultIndexOpen(vaultPath)
+	if err != nil {
+		t.Fatalf("VaultIndexOpen: %v", err)
+	}
+	indexedAt := time.Now().Unix()
+	if _, err := index.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+		"data/raw/sample.csv", 8, hourAgo.Unix(), "deadbeef", "text/csv", ".csv", "data", "raw", indexedAt); err != nil {
+		t.Fatalf("insert: %v", err)
+	}
+	index.Close()
+
+	repoRoot := setupDoctorRepo(t, "my_vault", "my_proj", vaultPath)
+	reports, err := VaultDoctor(repoRoot)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(reports); n != 1 {
+		t.Fatalf("expected 1 entry, got %d", n)
+	}
+
+	report := reports[0]
+	if report.Status != "ok" {
+		t.Errorf("Status: want ok, got %s (issues: %v)", report.Status, report.Issues)
+	}
+	if len(report.Issues) != 0 {
+		t.Errorf("Issues: want empty, got %v", report.Issues)
+	}
+	if report.DiskFiles != 1 {
+		t.Errorf("DiskFiles: want 1, got %d", report.DiskFiles)
+	}
+	if report.IndexedFiles != 1 {
+		t.Errorf("IndexedFiles: want 1, got %d", report.IndexedFiles)
+	}
+}
+
+// TestVaultDoctor_MissingDir declares a vault whose directory does not exist
+// and expects status "error" with a directory_missing issue.
+func TestVaultDoctor_MissingDir(t *testing.T) {
+	absent := filepath.Join(t.TempDir(), "does_not_exist")
+	repoRoot := setupDoctorRepo(t, "missing_vault", "my_proj", absent)
+
+	reports, err := VaultDoctor(repoRoot)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(reports); n != 1 {
+		t.Fatalf("expected 1 entry, got %d", n)
+	}
+
+	report := reports[0]
+	if report.Status != "error" {
+		t.Errorf("Status: want error, got %s", report.Status)
+	}
+
+	reported := false
+	for i := 0; i < len(report.Issues) && !reported; i++ {
+		reported = report.Issues[i] == "directory_missing"
+	}
+	if !reported {
+		t.Errorf("Expected directory_missing issue, got %v", report.Issues)
+	}
+}
+
+// TestVaultDoctor_NoIndex uses a vault with a proper layout and one file but no
+// vault_index.db, and expects status "warning" with an index_missing issue.
+func TestVaultDoctor_NoIndex(t *testing.T) {
+	vaultPath := t.TempDir()
+
+	rawDir := filepath.Join(vaultPath, "data", "raw")
+	if err := os.MkdirAll(rawDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	if err := os.WriteFile(filepath.Join(rawDir, "a.csv"), []byte("x"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	repoRoot := setupDoctorRepo(t, "no_index_vault", "my_proj", vaultPath)
+	reports, err := VaultDoctor(repoRoot)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(reports); n != 1 {
+		t.Fatalf("expected 1 entry, got %d", n)
+	}
+
+	report := reports[0]
+	if report.Status != "warning" {
+		t.Errorf("Status: want warning, got %s", report.Status)
+	}
+
+	reported := false
+	for i := 0; i < len(report.Issues) && !reported; i++ {
+		reported = report.Issues[i] == "index_missing"
+	}
+	if !reported {
+		t.Errorf("Expected index_missing issue, got %v", report.Issues)
+	}
+}
+
+// TestVaultDoctor_LayoutDrift uses a vault with neither data/ nor knowledge/
+// (only a stray file at the root) and expects a "warning" carrying either
+// layout_missing or non_standard_layout.
+func TestVaultDoctor_LayoutDrift(t *testing.T) {
+	vaultPath := t.TempDir()
+
+	stray := filepath.Join(vaultPath, "something.txt")
+	if err := os.WriteFile(stray, []byte("hi"), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	repoRoot := setupDoctorRepo(t, "layout_vault", "my_proj", vaultPath)
+	reports, err := VaultDoctor(repoRoot)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(reports); n != 1 {
+		t.Fatalf("expected 1 entry, got %d", n)
+	}
+
+	report := reports[0]
+	if report.Status != "warning" {
+		t.Errorf("Status: want warning, got %s", report.Status)
+	}
+
+	layoutFlagged := false
+	for _, code := range report.Issues {
+		switch code {
+		case "layout_missing", "non_standard_layout":
+			layoutFlagged = true
+		}
+	}
+	if !layoutFlagged {
+		t.Errorf("Expected layout_missing or non_standard_layout, got %v", report.Issues)
+	}
+}
+
+// TestVaultDoctor_EmptyVault uses a vault with empty data/ and knowledge/
+// directories and an index without rows, and expects a "warning" carrying
+// empty_vault.
+func TestVaultDoctor_EmptyVault(t *testing.T) {
+	vaultPath := t.TempDir()
+
+	// Standard layout, no files inside.
+	for _, bucket := range []string{"data", "knowledge"} {
+		if err := os.MkdirAll(filepath.Join(vaultPath, bucket), 0755); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// An index that exists but holds no rows.
+	index, err := VaultIndexOpen(vaultPath)
+	if err != nil {
+		t.Fatalf("VaultIndexOpen: %v", err)
+	}
+	index.Close()
+
+	repoRoot := setupDoctorRepo(t, "empty_vault", "my_proj", vaultPath)
+	reports, err := VaultDoctor(repoRoot)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(reports); n != 1 {
+		t.Fatalf("expected 1 entry, got %d", n)
+	}
+
+	report := reports[0]
+	if report.Status != "warning" {
+		t.Errorf("Status: want warning, got %s (issues: %v)", report.Status, report.Issues)
+	}
+
+	reported := false
+	for i := 0; i < len(report.Issues) && !reported; i++ {
+		reported = report.Issues[i] == "empty_vault"
+	}
+	if !reported {
+		t.Errorf("Expected empty_vault issue, got %v", report.Issues)
+	}
+}
@@ -0,0 +1,21 @@
+package infra
+
+// VaultFile describes a single file inside a vault directory.
+// It carries identity (vault + relative path), content metadata (size, mtime, sha256, mime)
+// and structural classification (bucket, sub-bucket).
+//
+// The JSON names of the per-file fields (rel_path … sub_bucket) match the
+// column names of the files table in the vault index schema; vault_id and
+// vault_name have no column there.
+type VaultFile struct {
+	VaultID   string `json:"vault_id"`   // e.g. "turismo_spain_app_turismo"
+	VaultName string `json:"vault_name"` // e.g. "turismo_spain"
+	RelPath   string `json:"rel_path"`   // path relative to vault root, e.g. "data/raw/foo.csv"
+	Size      int64  `json:"size"`       // bytes
+	Mtime     int64  `json:"mtime"`      // unix seconds (UTC)
+	Sha256    string `json:"sha256"`     // hex lowercase
+	Mime      string `json:"mime"`       // e.g. "text/csv"
+	Ext       string `json:"ext"`        // e.g. ".csv"
+	// Bucket is the top-level classification: "data" or "knowledge".
+	Bucket string `json:"bucket"`
+	// SubBucket is the second-level directory within the bucket.
+	// Known values: raw, processed, exports (data); decisions, domains, models,
+	// benchmarks, test_documents (knowledge). Empty string for files at bucket root.
+	SubBucket string `json:"sub_bucket"`
+}
@@ -0,0 +1,49 @@
+-- Schema of the per-vault index database (vault_index.db).
+-- Every statement uses IF NOT EXISTS, so the script can be re-applied safely.
+
+-- files: one row per indexed file, keyed by its path relative to the vault root.
+-- mtime and indexed_at are integers; the Go callers in this package write unix seconds.
+CREATE TABLE IF NOT EXISTS files (
+    rel_path    TEXT PRIMARY KEY,
+    size        INTEGER NOT NULL,
+    mtime       INTEGER NOT NULL,
+    sha256      TEXT NOT NULL,
+    mime        TEXT NOT NULL DEFAULT '',
+    ext         TEXT NOT NULL DEFAULT '',
+    bucket      TEXT NOT NULL DEFAULT '',
+    sub_bucket  TEXT NOT NULL DEFAULT '',
+    indexed_at  INTEGER NOT NULL
+);
+-- Lookup by content hash and by bucket classification.
+CREATE INDEX IF NOT EXISTS idx_files_sha256 ON files(sha256);
+CREATE INDEX IF NOT EXISTS idx_files_bucket ON files(bucket, sub_bucket);
+
+-- files_fts: full-text index over path and extracted text.
+-- NOTE(review): content='' declares a contentless FTS5 table. Per the SQLite
+-- FTS5 documentation such tables do not store column values (reading rel_path
+-- back yields NULL) and restrict DELETE/UPDATE; confirm that consumers map
+-- hits through rowid only.
+CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
+    rel_path,
+    content_text,
+    content='',
+    tokenize='unicode61 remove_diacritics 2'
+);
+
+-- csv_profiles: per-CSV profile. Rows are removed together with their files
+-- row through ON DELETE CASCADE (effective only with foreign keys enabled on
+-- the connection).
+CREATE TABLE IF NOT EXISTS csv_profiles (
+    rel_path    TEXT PRIMARY KEY,
+    cols_json   TEXT NOT NULL,
+    n_rows      INTEGER NOT NULL,
+    encoding    TEXT NOT NULL DEFAULT '',
+    date_min    TEXT,
+    date_max    TEXT,
+    profiled_at INTEGER NOT NULL,
+    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
+);
+
+-- pdf_extracts: text-extraction metadata per PDF; cascades with files.
+CREATE TABLE IF NOT EXISTS pdf_extracts (
+    rel_path     TEXT PRIMARY KEY,
+    page_count   INTEGER NOT NULL,
+    text_len     INTEGER NOT NULL,
+    extracted_to TEXT,
+    extracted_at INTEGER NOT NULL,
+    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
+);
+
+-- knowledge_docs: title, frontmatter and headings (stored as JSON text) per
+-- knowledge document; cascades with files.
+CREATE TABLE IF NOT EXISTS knowledge_docs (
+    rel_path         TEXT PRIMARY KEY,
+    title            TEXT NOT NULL DEFAULT '',
+    frontmatter_json TEXT NOT NULL DEFAULT '{}',
+    headings_json    TEXT NOT NULL DEFAULT '[]',
+    parsed_at        INTEGER NOT NULL,
+    FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
+);
@@ -0,0 +1,30 @@
+package infra
+
+import (
+	"database/sql"
+	"embed"
+	"fmt"
+	"path/filepath"
+)
+
+// vaultIndexMigrationsFS embeds the SQL scripts under vault_index_migrations/
+// that VaultIndexOpen passes to ApplyMigrations for every vault_index.db.
+//
+//go:embed vault_index_migrations/*.sql
+var vaultIndexMigrationsFS embed.FS
+
+// VaultIndexOpen opens <vaultPath>/vault_index.db through SQLiteOpen (creating
+// it if needed), then applies the embedded vault_index_migrations/*.sql
+// scripts through ApplyMigrations.
+//
+// On success the caller owns the returned *sql.DB and must close it. If the
+// migrations fail, the connection is closed before the error is returned.
+// Both failure modes are wrapped with a "vault_index_open" prefix.
+//
+// NOTE(review): WAL mode, foreign-key enforcement and the lexicographic
+// migration order are properties of SQLiteOpen/ApplyMigrations, which are
+// defined elsewhere — confirm there.
+func VaultIndexOpen(vaultPath string) (*sql.DB, error) {
+	conn, openErr := SQLiteOpen(filepath.Join(vaultPath, "vault_index.db"), "")
+	if openErr != nil {
+		return nil, fmt.Errorf("vault_index_open: %w", openErr)
+	}
+
+	migrateErr := ApplyMigrations(conn, vaultIndexMigrationsFS, "vault_index_migrations/*.sql")
+	if migrateErr == nil {
+		return conn, nil
+	}
+
+	// Do not leak the connection when the schema could not be brought up.
+	conn.Close()
+	return nil, fmt.Errorf("vault_index_open: apply migrations: %w", migrateErr)
+}
@@ -0,0 +1,54 @@
+---
+name: vault_index_open
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func VaultIndexOpen(vaultPath string) (*sql.DB, error)"
+description: "Abre (o crea) vault_index.db dentro de vaultPath con WAL + FK y aplica las migraciones embebidas idempotentemente. El caller cierra la conexion."
+tags: [vault, sqlite, index, migration, infra]
+uses_functions: ["sqlite_open_go_infra", "sqlite_apply_migrations_go_infra"]
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [database/sql, embed, fmt, path/filepath]
+params:
+  - name: vaultPath
+    desc: "ruta absoluta o relativa al directorio raiz del vault"
+output: "*sql.DB apuntando a <vaultPath>/vault_index.db con schema completo aplicado; el caller es responsable de cerrar"
+tested: true
+tests:
+  - "crea vault_index.db en tmpdir vacio"
+  - "segunda apertura no falla (idempotente)"
+  - "todas las tablas esperadas existen en sqlite_master"
+  - "fts5 INSERT y MATCH funcionan"
+test_file_path: "functions/infra/vault_index_open_test.go"
+file_path: "functions/infra/vault_index_open.go"
+---
+
+## Ejemplo
+
+```go
+db, err := VaultIndexOpen("/data/vaults/turismo_spain")
+if err != nil {
+    log.Fatal(err)
+}
+defer db.Close()
+```
+
+## Notas
+
+El archivo de base de datos se crea en `<vaultPath>/vault_index.db`. Las migraciones
+viven en `vault_index_migrations/*.sql` embebidas via `//go:embed` en el mismo paquete.
+
+Schema creado por `001_init.sql`:
+- `files` — inventario de archivos (PK: rel_path)
+- `files_fts` — tabla FTS5 virtual para busqueda de texto (content_text lo llenan profilers posteriores)
+- `csv_profiles` — perfil de columnas/filas para .csv (FK → files)
+- `pdf_extracts` — metadatos de extraccion de texto para .pdf (FK → files)
+- `knowledge_docs` — headings/frontmatter para .md del bucket knowledge (FK → files)
+
+`SQLiteOpen` abre con WAL mode + foreign keys. `ApplyMigrations` es idempotente:
+los errores de "already exists" y "duplicate column" se ignoran silenciosamente.
@@ -0,0 +1,107 @@
+package infra
+
+import (
+	"database/sql"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestVaultIndexOpen_CreatesDB checks that opening the index in an empty
+// directory leaves a vault_index.db file on disk.
+func TestVaultIndexOpen_CreatesDB(t *testing.T) {
+	t.Run("crea vault_index.db en tmpdir vacio", func(t *testing.T) {
+		dir := t.TempDir()
+		db, err := VaultIndexOpen(dir)
+		if err != nil {
+			t.Fatalf("VaultIndexOpen: %v", err)
+		}
+		defer db.Close()
+
+		// Any stat failure (not only "does not exist") means the file cannot
+		// be confirmed, so all of them fail the test.
+		dbPath := filepath.Join(dir, "vault_index.db")
+		if _, err := os.Stat(dbPath); err != nil {
+			t.Fatalf("vault_index.db no fue creado en %s: %v", dir, err)
+		}
+	})
+}
+
+// TestVaultIndexOpen_Idempotent opens the same vault directory twice in a row;
+// the second open re-applies the migrations against the existing schema and
+// must succeed as well.
+func TestVaultIndexOpen_Idempotent(t *testing.T) {
+	t.Run("segunda apertura no falla (idempotente)", func(t *testing.T) {
+		vaultDir := t.TempDir()
+
+		first, err := VaultIndexOpen(vaultDir)
+		if err != nil {
+			t.Fatalf("primera apertura: %v", err)
+		}
+		first.Close()
+
+		second, err := VaultIndexOpen(vaultDir)
+		if err != nil {
+			t.Fatalf("segunda apertura: %v", err)
+		}
+		second.Close()
+	})
+}
+
+// TestVaultIndexOpen_AppliesAllMigrations verifies that every table the schema
+// is documented to create, including the files_fts virtual table, is listed in
+// sqlite_master after opening a fresh index.
+func TestVaultIndexOpen_AppliesAllMigrations(t *testing.T) {
+	t.Run("todas las tablas esperadas existen en sqlite_master", func(t *testing.T) {
+		dir := t.TempDir()
+		db, err := VaultIndexOpen(dir)
+		if err != nil {
+			t.Fatalf("VaultIndexOpen: %v", err)
+		}
+		defer db.Close()
+
+		expectedTables := []string{
+			"files",
+			"files_fts", // virtual tables are listed in sqlite_master too
+			"csv_profiles",
+			"pdf_extracts",
+			"knowledge_docs",
+		}
+		for _, tbl := range expectedTables {
+			assertTableExists(t, db, tbl)
+		}
+	})
+}
+
+// TestVaultIndexOpen_FTS5Works inserts one row into files_fts and checks that a
+// MATCH query on a word of its content_text finds exactly that row.
+func TestVaultIndexOpen_FTS5Works(t *testing.T) {
+	t.Run("fts5 INSERT y MATCH funcionan", func(t *testing.T) {
+		idx, err := VaultIndexOpen(t.TempDir())
+		if err != nil {
+			t.Fatalf("VaultIndexOpen: %v", err)
+		}
+		defer idx.Close()
+
+		// files_fts is filled through explicit INSERTs; add one searchable row.
+		const insertSQL = `INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`
+		if _, err := idx.Exec(insertSQL, "data/raw/informe_ventas.csv", "ventas trimestrales empresa"); err != nil {
+			t.Fatalf("INSERT files_fts: %v", err)
+		}
+
+		const matchSQL = `SELECT count(*) FROM files_fts WHERE files_fts MATCH 'ventas'`
+		var hits int
+		if err := idx.QueryRow(matchSQL).Scan(&hits); err != nil {
+			t.Fatalf("FTS MATCH query: %v", err)
+		}
+		if hits != 1 {
+			t.Errorf("FTS MATCH: got %d rows, want 1", hits)
+		}
+	})
+}
+
+// assertTableExists reports a test error when sqlite_master has no entry whose
+// name equals name. The lookup matches on name only, so it covers ordinary and
+// virtual tables alike. A failing query aborts the test.
+func assertTableExists(t *testing.T, db *sql.DB, name string) {
+	t.Helper()
+
+	const lookup = `SELECT count(*) FROM sqlite_master WHERE name = ?`
+	var matches int
+	if err := db.QueryRow(lookup, name).Scan(&matches); err != nil {
+		t.Fatalf("sqlite_master query for %q: %v", name, err)
+	}
+	if matches == 0 {
+		t.Errorf("table/vtable %q not found in sqlite_master", name)
+	}
+}
@@ -0,0 +1,154 @@
+package infra
+
+import (
+	"database/sql"
+	"fmt"
+	"strings"
+	"time"
+)
+
+// WriteReport summarises the outcome of a VaultIndexWrite call. All fields are
+// plain counters filled in by that function; the zero value means nothing was
+// written.
+type WriteReport struct {
+	Inserted int // rows newly inserted into files
+	Updated  int // rows updated (upserted) in files
+	Pruned   int // rows deleted from files (only when prune=true)
+	FTS      int // rows inserted into files_fts
+}
+
+// VaultIndexWrite upserts files into the files table of an open
+// vault_index.db, refreshes the matching files_fts rows and, when prune is
+// true, deletes every files row whose rel_path is absent from the slice.
+//
+// Everything runs in one transaction. The transaction is rolled back on every
+// error path, so a failed call leaves the database unchanged and does not keep
+// a connection pinned.
+//
+// Counting: the rel_paths already stored are loaded once before the loop. A
+// file counts as Inserted when its rel_path was neither stored nor seen earlier
+// in the same slice, and as Updated otherwise.
+//
+// FTS: for each file the files_fts row is deleted and re-inserted with an empty
+// content_text; filling content_text is left to later profilers.
+//
+// Prune: stale rel_paths (stored but not in files) are deleted with bound
+// parameters in fixed-size batches. With an empty slice and prune=true every
+// row is deleted. Rows in csv_profiles, pdf_extracts and knowledge_docs follow
+// through their ON DELETE CASCADE foreign keys when foreign keys are enforced
+// on the connection.
+// NOTE(review): files_fts rows of pruned files are not removed here, which
+// matches the previous behavior — confirm whether that is intended.
+//
+// With an empty slice and prune=false the call is a no-op. On error the
+// returned WriteReport holds the counters accumulated so far; it does not
+// describe committed state.
+func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error) {
+	var report WriteReport
+	if len(files) == 0 && !prune {
+		return report, nil
+	}
+
+	tx, err := db.Begin()
+	if err != nil {
+		return report, fmt.Errorf("vault_index_write: begin tx: %w", err)
+	}
+	// Roll back unconditionally: after a successful Commit this returns
+	// sql.ErrTxDone, which is safe to ignore. Unlike a check on a local err
+	// variable, it cannot be defeated by shadowing.
+	defer tx.Rollback() //nolint:errcheck
+
+	// Load existing rel_paths into a set to distinguish insert vs update.
+	existing, err := vaultIndexExistingPaths(tx)
+	if err != nil {
+		return report, err
+	}
+
+	now := time.Now().Unix()
+
+	upsertStmt, err := tx.Prepare(`
+		INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
+		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+		ON CONFLICT(rel_path) DO UPDATE SET
+			size       = excluded.size,
+			mtime      = excluded.mtime,
+			sha256     = excluded.sha256,
+			mime       = excluded.mime,
+			ext        = excluded.ext,
+			bucket     = excluded.bucket,
+			sub_bucket = excluded.sub_bucket,
+			indexed_at = excluded.indexed_at
+	`)
+	if err != nil {
+		return report, fmt.Errorf("vault_index_write: prepare upsert: %w", err)
+	}
+	defer upsertStmt.Close()
+
+	ftsDeleteStmt, err := tx.Prepare(`DELETE FROM files_fts WHERE rel_path = ?`)
+	if err != nil {
+		return report, fmt.Errorf("vault_index_write: prepare fts delete: %w", err)
+	}
+	defer ftsDeleteStmt.Close()
+
+	ftsInsertStmt, err := tx.Prepare(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, '')`)
+	if err != nil {
+		return report, fmt.Errorf("vault_index_write: prepare fts insert: %w", err)
+	}
+	defer ftsInsertStmt.Close()
+
+	// incoming records every rel_path of this call; it drives both the
+	// duplicate-aware counting and the prune step.
+	incoming := make(map[string]struct{}, len(files))
+	for _, f := range files {
+		if _, err := upsertStmt.Exec(
+			f.RelPath, f.Size, f.Mtime, f.Sha256,
+			f.Mime, f.Ext, f.Bucket, f.SubBucket, now,
+		); err != nil {
+			return report, fmt.Errorf("vault_index_write: upsert %q: %w", f.RelPath, err)
+		}
+
+		_, stored := existing[f.RelPath]
+		_, repeated := incoming[f.RelPath]
+		if stored || repeated {
+			report.Updated++
+		} else {
+			report.Inserted++
+		}
+		incoming[f.RelPath] = struct{}{}
+
+		// Refresh FTS row.
+		if _, err := ftsDeleteStmt.Exec(f.RelPath); err != nil {
+			return report, fmt.Errorf("vault_index_write: fts delete %q: %w", f.RelPath, err)
+		}
+		if _, err := ftsInsertStmt.Exec(f.RelPath); err != nil {
+			return report, fmt.Errorf("vault_index_write: fts insert %q: %w", f.RelPath, err)
+		}
+		report.FTS++
+	}
+
+	// Prune rows not present in the incoming slice. Everything inserted above
+	// is in incoming, so the stale set is exactly "stored before this call and
+	// not provided now"; with an empty slice that is every stored row.
+	if prune {
+		var stale []string
+		for rp := range existing {
+			if _, keep := incoming[rp]; !keep {
+				stale = append(stale, rp)
+			}
+		}
+		pruned, err := vaultIndexDeletePaths(tx, stale)
+		if err != nil {
+			return report, err
+		}
+		report.Pruned = pruned
+	}
+
+	if err := tx.Commit(); err != nil {
+		return report, fmt.Errorf("vault_index_write: commit: %w", err)
+	}
+	return report, nil
+}
+
+// vaultIndexExistingPaths returns the set of rel_path values currently stored
+// in files, read through tx.
+func vaultIndexExistingPaths(tx *sql.Tx) (map[string]struct{}, error) {
+	rows, err := tx.Query(`SELECT rel_path FROM files`)
+	if err != nil {
+		return nil, fmt.Errorf("vault_index_write: query existing: %w", err)
+	}
+	defer rows.Close()
+
+	existing := make(map[string]struct{})
+	for rows.Next() {
+		var rp string
+		if err := rows.Scan(&rp); err != nil {
+			return nil, fmt.Errorf("vault_index_write: scan existing: %w", err)
+		}
+		existing[rp] = struct{}{}
+	}
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("vault_index_write: rows err: %w", err)
+	}
+	return existing, nil
+}
+
+// vaultIndexDeletePaths deletes the given rel_paths from files through tx and
+// returns the number of files rows removed. Values are passed as bound
+// parameters, in batches small enough for SQLite's default bound-parameter
+// limit (999 in older releases).
+func vaultIndexDeletePaths(tx *sql.Tx, paths []string) (int, error) {
+	const batchSize = 500
+
+	deleted := 0
+	for start := 0; start < len(paths); start += batchSize {
+		end := start + batchSize
+		if end > len(paths) {
+			end = len(paths)
+		}
+		batch := paths[start:end]
+
+		args := make([]interface{}, len(batch))
+		for i, p := range batch {
+			args[i] = p
+		}
+		query := "DELETE FROM files WHERE rel_path IN (?" +
+			strings.Repeat(",?", len(batch)-1) + ")"
+
+		res, err := tx.Exec(query, args...)
+		if err != nil {
+			return deleted, fmt.Errorf("vault_index_write: prune: %w", err)
+		}
+		n, err := res.RowsAffected()
+		if err != nil {
+			return deleted, fmt.Errorf("vault_index_write: prune rows affected: %w", err)
+		}
+		deleted += int(n)
+	}
+	return deleted, nil
+}
@@ -0,0 +1,84 @@
+---
+name: vault_index_write
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error)"
+description: "Upserta un slice de VaultFile en vault_index.db (tabla files + FTS5 files_fts) dentro de una sola transaccion. Cuenta Inserted/Updated/FTS. Con prune=true elimina filas no presentes en el slice."
+tags: [vault, sqlite, index, write, upsert, fts, infra]
+uses_functions: []
+uses_types: ["vault_file_go_infra"]
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [database/sql, fmt, strings, time]
+params:
+  - name: db
+    desc: "*sql.DB abierto sobre vault_index.db (tipicamente retornado por VaultIndexOpen)"
+  - name: files
+    desc: "slice de VaultFile a insertar/actualizar; puede ser vacio"
+  - name: prune
+    desc: "si true, elimina de 'files' todas las filas cuyo rel_path no este en el slice (sincronizacion destructiva)"
+output: "WriteReport con conteos Inserted/Updated/Pruned/FTS; error si falla la transaccion"
+tested: true
+tests:
+  - "N archivos nuevos — Inserted=N"
+  - "re-escritura con mtime distinto — Updated=N"
+  - "prune elimina filas ausentes"
+  - "sin prune, filas previas persisten"
+  - "FTS5 MATCH funciona tras escritura"
+test_file_path: "functions/infra/vault_index_write_test.go"
+file_path: "functions/infra/vault_index_write.go"
+---
+
+## Ejemplo
+
+```go
+db, _ := VaultIndexOpen("/data/vaults/turismo")
+defer db.Close()
+
+files, _ := VaultInventoryScan("/data/vaults/turismo", "turismo_v1", "turismo")
+report, err := VaultIndexWrite(db, files, true)
+if err != nil {
+    log.Fatal(err)
+}
+fmt.Printf("inserted=%d updated=%d pruned=%d fts=%d\n",
+    report.Inserted, report.Updated, report.Pruned, report.FTS)
+```
+
+## Notas
+
+### WriteReport
+Struct local al paquete infra:
+```go
+type WriteReport struct {
+    Inserted int
+    Updated  int
+    Pruned   int
+    FTS      int
+}
+```
+
+### Estrategia de conteo Inserted vs Updated
+Se carga el conjunto de rel_paths existentes en un map antes del loop. Un upsert
+se clasifica como Inserted si el rel_path no estaba en el map, Updated si estaba.
+Esto evita N+1 SELECTs y es correcto porque la transaccion serializa los cambios.
+
+### FTS5
+`files_fts` usa `content=''` (tabla FTS5 "contentless": el indice no guarda copia del texto). Para cada archivo
+se borra la fila FTS existente y se reinserta con `content_text=''`. Los profilers
+posteriores (csv_profiles, pdf_extracts, knowledge_docs) son responsables de actualizar
+`content_text` con texto indexable real.
+
+### Prune
+Con `prune=true` se construye un IN clause con los rel_paths del slice. La FK con
+`ON DELETE CASCADE` propaga el DELETE a csv_profiles, pdf_extracts y knowledge_docs
+automaticamente. Con slice vacio + prune=true se borra todo (DELETE FROM files).
+
+### Escapado SQL
+El IN clause se construye escapando las comillas simples en rel_path (duplicandolas).
+Evita inyeccion en rutas con apostrofos. Para entornos con rutas controladas
+(interior de vaults sin apostrofos) esto es suficiente; para entornos adversariales
+usar parametros binding con VALUES multiples via prepared statement.
@@ -0,0 +1,210 @@
+package infra
+
+import (
+	"testing"
+	"time"
+)
+
+// makeTestVaultFile creates a minimal VaultFile for testing.
+func makeTestVaultFile(relPath, mime, bucket, subBucket string) VaultFile {
+	return VaultFile{
+		VaultID:   "test_vault",
+		VaultName: "test",
+		RelPath:   relPath,
+		Size:      100,
+		Mtime:     time.Now().Unix(),
+		Sha256:    "abc123def456abc123def456abc123def456abc123def456abc123def456abc1",
+		Mime:      mime,
+		Ext:       ".csv",
+		Bucket:    bucket,
+		SubBucket: subBucket,
+	}
+}
+
+// openInMemoryVaultIndex opens a vault index inside a fresh per-test temporary
+// directory and returns it as a closer; the test is aborted when the index
+// cannot be opened. Despite the name the database is file-backed (it lives
+// under t.TempDir()), and the caller is responsible for closing it.
+func openInMemoryVaultIndex(t *testing.T) interface{ Close() error } {
+	t.Helper()
+	idx, err := VaultIndexOpen(t.TempDir())
+	if err != nil {
+		t.Fatalf("VaultIndexOpen: %v", err)
+	}
+	return idx
+}
+
+// TestVaultIndexWrite_FreshInsert writes three new files into an empty index
+// and expects all of them to be reported as inserted, each with an FTS row,
+// and nothing updated or pruned.
+func TestVaultIndexWrite_FreshInsert(t *testing.T) {
+	t.Run("N archivos nuevos — Inserted=N", func(t *testing.T) {
+		idx, err := VaultIndexOpen(t.TempDir())
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer idx.Close()
+
+		batch := []VaultFile{
+			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
+			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
+			makeTestVaultFile("knowledge/decisions/x.md", "text/markdown", "knowledge", "decisions"),
+		}
+
+		rep, err := VaultIndexWrite(idx, batch, false)
+		if err != nil {
+			t.Fatalf("VaultIndexWrite: %v", err)
+		}
+
+		if got := rep.Inserted; got != 3 {
+			t.Errorf("Inserted = %d, want 3", got)
+		}
+		if got := rep.Updated; got != 0 {
+			t.Errorf("Updated = %d, want 0", got)
+		}
+		if got := rep.Pruned; got != 0 {
+			t.Errorf("Pruned = %d, want 0", got)
+		}
+		if got := rep.FTS; got != 3 {
+			t.Errorf("FTS = %d, want 3", got)
+		}
+	})
+}
+
+// TestVaultIndexWrite_Upsert writes the same two files twice, changing their
+// mtime in between, and expects the second write to report two updates and no
+// inserts.
+func TestVaultIndexWrite_Upsert(t *testing.T) {
+	t.Run("re-escritura con mtime distinto — Updated=N", func(t *testing.T) {
+		idx, err := VaultIndexOpen(t.TempDir())
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer idx.Close()
+
+		batch := []VaultFile{
+			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
+			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
+		}
+
+		_, err = VaultIndexWrite(idx, batch, false)
+		if err != nil {
+			t.Fatalf("first write: %v", err)
+		}
+
+		// Push both mtimes into the future to simulate changed files.
+		batch[0].Mtime = time.Now().Unix() + 100
+		batch[1].Mtime = time.Now().Unix() + 200
+
+		rep, err := VaultIndexWrite(idx, batch, false)
+		if err != nil {
+			t.Fatalf("second write: %v", err)
+		}
+		if got := rep.Inserted; got != 0 {
+			t.Errorf("Inserted = %d, want 0", got)
+		}
+		if got := rep.Updated; got != 2 {
+			t.Errorf("Updated = %d, want 2", got)
+		}
+	})
+}
+
+// TestVaultIndexWrite_Prune stores files A and B, then writes only A with
+// prune enabled and expects B to be reported as pruned and to be gone from the
+// files table.
+func TestVaultIndexWrite_Prune(t *testing.T) {
+	t.Run("prune elimina filas ausentes", func(t *testing.T) {
+		idx, err := VaultIndexOpen(t.TempDir())
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer idx.Close()
+
+		// Seed the index with A and B.
+		both := []VaultFile{
+			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
+			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
+		}
+		_, err = VaultIndexWrite(idx, both, false)
+		if err != nil {
+			t.Fatalf("first write: %v", err)
+		}
+
+		// Second pass carries only A and asks for pruning, so B must go.
+		rep, err := VaultIndexWrite(idx, both[:1:1], true)
+		if err != nil {
+			t.Fatalf("prune write: %v", err)
+		}
+		if rep.Pruned != 1 {
+			t.Errorf("Pruned = %d, want 1", rep.Pruned)
+		}
+
+		// B must no longer be stored.
+		var remaining int
+		row := idx.QueryRow(`SELECT count(*) FROM files WHERE rel_path = 'data/raw/b.csv'`)
+		if err := row.Scan(&remaining); err != nil {
+			t.Fatalf("query: %v", err)
+		}
+		if remaining != 0 {
+			t.Errorf("b.csv still present after prune")
+		}
+	})
+}
+
+// TestVaultIndexWrite_NoPrune stores files A and B, then writes only A with
+// prune disabled and expects nothing to be pruned and both rows to remain.
+func TestVaultIndexWrite_NoPrune(t *testing.T) {
+	t.Run("sin prune, filas previas persisten", func(t *testing.T) {
+		idx, err := VaultIndexOpen(t.TempDir())
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer idx.Close()
+
+		both := []VaultFile{
+			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
+			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
+		}
+		_, err = VaultIndexWrite(idx, both, false)
+		if err != nil {
+			t.Fatalf("first write: %v", err)
+		}
+
+		// Second pass carries only A; without prune B has to survive.
+		rep, err := VaultIndexWrite(idx, both[:1:1], false)
+		if err != nil {
+			t.Fatalf("second write: %v", err)
+		}
+		if rep.Pruned != 0 {
+			t.Errorf("Pruned = %d, want 0", rep.Pruned)
+		}
+
+		var total int
+		if err := idx.QueryRow(`SELECT count(*) FROM files`).Scan(&total); err != nil {
+			t.Fatalf("query: %v", err)
+		}
+		if total != 2 {
+			t.Errorf("files count = %d, want 2", total)
+		}
+	})
+}
+
+// TestVaultIndexWrite_FTSMatch writes two files and checks that a prefix query
+// restricted to the rel_path column of files_fts finds exactly the one whose
+// path contains a token starting with "foo".
+func TestVaultIndexWrite_FTSMatch(t *testing.T) {
+	t.Run("FTS5 MATCH funciona tras escritura", func(t *testing.T) {
+		idx, err := VaultIndexOpen(t.TempDir())
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer idx.Close()
+
+		batch := []VaultFile{
+			makeTestVaultFile("data/raw/foo_report.csv", "text/csv", "data", "raw"),
+			makeTestVaultFile("data/raw/bar_data.csv", "text/csv", "data", "raw"),
+		}
+		_, err = VaultIndexWrite(idx, batch, false)
+		if err != nil {
+			t.Fatalf("write: %v", err)
+		}
+
+		// Column-filtered prefix search: rel_path tokens starting with "foo".
+		const matchSQL = `SELECT count(*) FROM files_fts WHERE files_fts MATCH 'rel_path:foo*'`
+		var hits int
+		if err := idx.QueryRow(matchSQL).Scan(&hits); err != nil {
+			t.Fatalf("FTS MATCH query: %v", err)
+		}
+		if hits != 1 {
+			t.Errorf("FTS MATCH rel_path:foo* = %d rows, want 1", hits)
+		}
+	})
+}
@@ -0,0 +1,174 @@
+package infra
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// VaultInventoryScan walks vaultPath and returns a VaultFile slice (sorted by RelPath)
+// for every regular file found, skipping:
+//   - vault_index.db, vault_index.db-shm, vault_index.db-wal
+//   - .git/ directories at any depth
+//   - hidden files/dirs (names starting with ".") at the vault root level only
+//
+// For each file it computes: relative path (forward slashes), size, mtime (unix UTC),
+// sha256 (streaming, hex lowercase), MIME type, extension, bucket and sub-bucket.
+//
+// MIME detection priority:
+//  1. Extension override: .csv → text/csv, .md → text/markdown, .parquet → application/parquet
+//  2. http.DetectContentType on first 512 bytes (magic bytes, stdlib)
+//
+// NOTE: file_validate_type_go_infra (FileValidateType) was not used here because its
+// signature requires an allowedTypes allowlist and returns (mime, bool) — it is designed
+// for upload validation, not for open-ended inventory scanning where any MIME is valid.
+// http.DetectContentType provides the same magic-byte detection without the allowlist
+// coupling and handles a broader set of formats including text/plain for CSV fallback.
+func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error) {
+	var files []VaultFile
+
+	err := filepath.WalkDir(vaultPath, func(path string, d os.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+
+		name := d.Name()
+
+		// Skip .git directories at any depth.
+		if d.IsDir() && name == ".git" {
+			return filepath.SkipDir
+		}
+
+		// Skip hidden entries (names starting with ".") at vault root only.
+		if strings.HasPrefix(name, ".") {
+			rel, relErr := filepath.Rel(vaultPath, path)
+			if relErr == nil {
+				// At root level the relative path has no separator.
+				if !strings.Contains(filepath.ToSlash(rel), "/") {
+					if d.IsDir() {
+						return filepath.SkipDir
+					}
+					return nil
+				}
+			}
+		}
+
+		if d.IsDir() {
+			return nil
+		}
+
+		// Skip vault_index.db and its WAL/SHM sidecar files.
+		if name == "vault_index.db" || name == "vault_index.db-shm" || name == "vault_index.db-wal" {
+			return nil
+		}
+
+		rel, err := filepath.Rel(vaultPath, path)
+		if err != nil {
+			return fmt.Errorf("vault_inventory_scan: rel path for %q: %w", path, err)
+		}
+		rel = filepath.ToSlash(rel)
+
+		info, err := d.Info()
+		if err != nil {
+			return fmt.Errorf("vault_inventory_scan: stat %q: %w", path, err)
+		}
+
+		// Compute sha256 by streaming — avoids loading large files into memory.
+		sha, err := fileSha256(path)
+		if err != nil {
+			return fmt.Errorf("vault_inventory_scan: sha256 %q: %w", path, err)
+		}
+
+		mime, err := detectVaultFileMime(path, name)
+		if err != nil {
+			return fmt.Errorf("vault_inventory_scan: mime %q: %w", path, err)
+		}
+
+		ext := strings.ToLower(filepath.Ext(name))
+		bucket, subBucket := vaultBucketParts(rel)
+
+		files = append(files, VaultFile{
+			VaultID:   vaultID,
+			VaultName: vaultName,
+			RelPath:   rel,
+			Size:      info.Size(),
+			Mtime:     info.ModTime().UTC().Unix(),
+			Sha256:    sha,
+			Mime:      mime,
+			Ext:       ext,
+			Bucket:    bucket,
+			SubBucket: subBucket,
+		})
+		return nil
+	})
+	if err != nil {
+		return nil, fmt.Errorf("vault_inventory_scan: walk %q: %w", vaultPath, err)
+	}
+
+	sort.Slice(files, func(i, j int) bool {
+		return files[i].RelPath < files[j].RelPath
+	})
+	return files, nil
+}
+
+// fileSha256 returns the SHA-256 digest of the file at path as a lowercase hex
+// string. The content is streamed through the hash, so the file is never held
+// in memory as a whole. Open and read errors are returned unchanged.
+func fileSha256(path string) (string, error) {
+	src, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer src.Close()
+
+	digest := sha256.New()
+	_, err = io.Copy(digest, src)
+	if err != nil {
+		return "", err
+	}
+
+	sum := digest.Sum(nil)
+	return hex.EncodeToString(sum), nil
+}
+
+// detectVaultFileMime returns the MIME type recorded for a vault file.
+//
+// The lowercased extension of name is checked first: .csv, .md and .parquet
+// map to fixed types and the file is not opened at all. For any other
+// extension the file at path is opened and up to 512 bytes from a single read
+// are handed to http.DetectContentType. An empty file is not an error (end of
+// file on the first read is tolerated). Open and read errors are returned
+// unchanged.
+func detectVaultFileMime(path, name string) (string, error) {
+	overrides := map[string]string{
+		".csv":     "text/csv",
+		".md":      "text/markdown",
+		".parquet": "application/parquet",
+	}
+	if fixed, ok := overrides[strings.ToLower(filepath.Ext(name))]; ok {
+		return fixed, nil
+	}
+
+	src, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer src.Close()
+
+	head := make([]byte, 512)
+	got, readErr := src.Read(head)
+	if readErr != nil && readErr != io.EOF {
+		return "", readErr
+	}
+	return http.DetectContentType(head[:got]), nil
+}
+
+// vaultBucketParts derives the bucket and sub-bucket of a file from its
+// forward-slash relative path. Only directory segments count; the final
+// segment is the file name and is never reported:
+//
+//	"data/raw/a.csv"    → bucket "data", subBucket "raw"
+//	"data/raw/x/a.csv"  → bucket "data", subBucket "raw"
+//	"data/a.csv"        → bucket "data", subBucket ""
+//	"README.md"         → bucket "",     subBucket ""
+//
+// The bucket is typically "data" or "knowledge", but any first-level directory
+// name is returned as is.
+func vaultBucketParts(relPath string) (bucket, subBucket string) {
+	// Split into at most: first dir, second dir, rest. The last element is
+	// always the file name (or the remainder containing it).
+	parts := strings.SplitN(relPath, "/", 3)
+	if len(parts) >= 2 {
+		bucket = parts[0]
+	}
+	if len(parts) >= 3 {
+		subBucket = parts[1]
+	}
+	return bucket, subBucket
+}
@@ -0,0 +1,74 @@
+---
+name: vault_inventory_scan
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error)"
+description: "Recorre vaultPath con filepath.WalkDir y retorna un slice de VaultFile ordenado por RelPath para cada archivo regular, computando sha256 por streaming, MIME por extension/magic y bucket/sub-bucket por posicion en el arbol."
+tags: [vault, inventory, scan, filesystem, sha256, mime, infra]
+uses_functions: []
+uses_types: ["vault_file_go_infra"]
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [crypto/sha256, encoding/hex, fmt, io, net/http, os, path/filepath, sort, strings]
+params:
+  - name: vaultPath
+    desc: "ruta absoluta o relativa al directorio raiz del vault"
+  - name: vaultID
+    desc: "identificador del vault (ej: turismo_spain_app_turismo) — se copia a cada VaultFile"
+  - name: vaultName
+    desc: "nombre legible del vault (ej: turismo_spain) — se copia a cada VaultFile"
+output: "slice de VaultFile ordenado lexicograficamente por RelPath; slice vacio (no nil) si el vault esta vacio"
+tested: true
+tests:
+  - "tmpdir vacio retorna slice vacio"
+  - "data layout — bucket y sub_bucket correctos"
+  - "knowledge layout — bucket y sub_bucket correctos"
+  - "omite vault_index.db y .git"
+  - "sha256 determinista para mismo contenido"
+  - "orden lexicografico del resultado"
+test_file_path: "functions/infra/vault_inventory_scan_test.go"
+file_path: "functions/infra/vault_inventory_scan.go"
+---
+
+## Ejemplo
+
+```go
+files, err := VaultInventoryScan("/data/vaults/turismo_spain", "turismo_spain_v1", "turismo_spain")
+if err != nil {
+    log.Fatal(err)
+}
+for _, f := range files {
+    fmt.Printf("%s  %s  %s/%s\n", f.RelPath, f.Mime, f.Bucket, f.SubBucket)
+}
+```
+
+## Notas
+
+### Archivos omitidos
+- `vault_index.db`, `vault_index.db-shm`, `vault_index.db-wal` (siempre)
+- `.git/` en cualquier profundidad (SkipDir)
+- Entradas cuyo nombre empieza por `.` solo en la raiz del vault (nivel 0)
+
+### Deteccion de MIME
+`file_validate_type_go_infra` (FileValidateType) no se usa porque su firma
+requiere una lista blanca de tipos permitidos y retorna (mime, bool) — esta
+disenada para validacion de uploads, no para escaneo inventarial donde
+cualquier MIME es valido. Se usan en su lugar:
+
+1. Override por extension (prioridad alta): `.csv` → `text/csv`, `.md` → `text/markdown`,
+   `.parquet` → `application/parquet`. Necesario porque `http.DetectContentType`
+   clasifica CSV como `text/plain` y no conoce Parquet.
+2. `http.DetectContentType` sobre primeros 512 bytes (magic bytes, stdlib) para el resto.
+
+### SHA-256
+Calculado por streaming con `io.Copy` a `sha256.New()` — no carga el archivo completo
+a memoria. Valido para archivos de cualquier tamano.
+
+### Bucket / SubBucket
+Derivados de la posicion en el arbol:
+- `bucket` = primer segmento del RelPath (tipicamente "data" o "knowledge")
+- `subBucket` = segundo segmento si existe; vacio si el archivo esta en la raiz del bucket
@@ -0,0 +1,182 @@
+package infra
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// writeTestFile creates the file rel (a forward-slash path relative to dir)
+// with the given content, creating missing parent directories first. Any
+// failure aborts the calling test.
+func writeTestFile(t *testing.T, dir, rel, content string) {
+	t.Helper()
+
+	target := filepath.Join(dir, filepath.FromSlash(rel))
+	parent := filepath.Dir(target)
+
+	err := os.MkdirAll(parent, 0o755)
+	if err != nil {
+		t.Fatalf("mkdir %s: %v", parent, err)
+	}
+
+	err = os.WriteFile(target, []byte(content), 0o644)
+	if err != nil {
+		t.Fatalf("write %s: %v", target, err)
+	}
+}
+
+// TestVaultInventoryScan_Empty scans a directory with no entries and expects
+// no error and no files.
+func TestVaultInventoryScan_Empty(t *testing.T) {
+	t.Run("tmpdir vacio retorna slice vacio", func(t *testing.T) {
+		found, err := VaultInventoryScan(t.TempDir(), "v1", "test")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if n := len(found); n != 0 {
+			t.Errorf("expected 0 files, got %d", n)
+		}
+	})
+}
+
+// TestVaultInventoryScan_DataLayout scans a vault with one file under data/raw
+// and one under data/processed and checks path, bucket, sub-bucket, MIME type,
+// extension and vault ID of the entries, which arrive sorted by RelPath.
+func TestVaultInventoryScan_DataLayout(t *testing.T) {
+	t.Run("data layout — bucket y sub_bucket correctos", func(t *testing.T) {
+		vaultDir := t.TempDir()
+		writeTestFile(t, vaultDir, "data/raw/a.csv", "col1,col2\n1,2\n")
+		writeTestFile(t, vaultDir, "data/processed/b.parquet", "PAR1fakedata")
+
+		found, err := VaultInventoryScan(vaultDir, "vid", "vname")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(found) != 2 {
+			t.Fatalf("expected 2 files, got %d", len(found))
+		}
+
+		// Sorted output: data/processed/b.parquet comes before data/raw/a.csv.
+		parquet, csv := found[0], found[1]
+
+		if parquet.RelPath != "data/processed/b.parquet" {
+			t.Errorf("files[0].RelPath = %q, want data/processed/b.parquet", parquet.RelPath)
+		}
+		if parquet.Bucket != "data" {
+			t.Errorf("files[0].Bucket = %q, want data", parquet.Bucket)
+		}
+		if parquet.SubBucket != "processed" {
+			t.Errorf("files[0].SubBucket = %q, want processed", parquet.SubBucket)
+		}
+		if parquet.Mime != "application/parquet" {
+			t.Errorf("files[0].Mime = %q, want application/parquet", parquet.Mime)
+		}
+		if parquet.Ext != ".parquet" {
+			t.Errorf("files[0].Ext = %q, want .parquet", parquet.Ext)
+		}
+		if parquet.VaultID != "vid" {
+			t.Errorf("VaultID = %q, want vid", parquet.VaultID)
+		}
+
+		if csv.RelPath != "data/raw/a.csv" {
+			t.Errorf("files[1].RelPath = %q, want data/raw/a.csv", csv.RelPath)
+		}
+		if csv.Mime != "text/csv" {
+			t.Errorf("files[1].Mime = %q, want text/csv", csv.Mime)
+		}
+		if !(csv.Bucket == "data" && csv.SubBucket == "raw") {
+			t.Errorf("files[1]: bucket=%q subBucket=%q, want data/raw", csv.Bucket, csv.SubBucket)
+		}
+	})
+}
+
+// TestVaultInventoryScan_KnowledgeLayout scans a vault with a single markdown
+// file under knowledge/decisions and checks its path, bucket, sub-bucket and
+// MIME type.
+func TestVaultInventoryScan_KnowledgeLayout(t *testing.T) {
+	t.Run("knowledge layout — bucket y sub_bucket correctos", func(t *testing.T) {
+		vaultDir := t.TempDir()
+		writeTestFile(t, vaultDir, "knowledge/decisions/x.md", "# Decision\n\ncontent")
+
+		found, err := VaultInventoryScan(vaultDir, "vid", "vname")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(found) != 1 {
+			t.Fatalf("expected 1 file, got %d", len(found))
+		}
+
+		doc := found[0]
+		if doc.RelPath != "knowledge/decisions/x.md" {
+			t.Errorf("RelPath = %q", doc.RelPath)
+		}
+		if doc.Bucket != "knowledge" {
+			t.Errorf("Bucket = %q, want knowledge", doc.Bucket)
+		}
+		if doc.SubBucket != "decisions" {
+			t.Errorf("SubBucket = %q, want decisions", doc.SubBucket)
+		}
+		if doc.Mime != "text/markdown" {
+			t.Errorf("Mime = %q, want text/markdown", doc.Mime)
+		}
+	})
+}
+
+// TestVaultInventoryScan_SkipsIndexAndGit puts an index database, its WAL
+// sidecar, a .git directory and one real data file into a vault and expects
+// the scan to return only the data file.
+func TestVaultInventoryScan_SkipsIndexAndGit(t *testing.T) {
+	t.Run("omite vault_index.db y .git", func(t *testing.T) {
+		vaultDir := t.TempDir()
+		writeTestFile(t, vaultDir, "vault_index.db", "sqlite data")
+		writeTestFile(t, vaultDir, "vault_index.db-wal", "wal data")
+		writeTestFile(t, vaultDir, ".git/HEAD", "ref: refs/heads/master")
+		writeTestFile(t, vaultDir, "data/raw/real.csv", "a,b\n1,2\n")
+
+		found, err := VaultInventoryScan(vaultDir, "vid", "vname")
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(found) != 1 {
+			t.Fatalf("expected 1 file (real.csv), got %d: %v", len(found), relPaths(found))
+		}
+		if got := found[0].RelPath; got != "data/raw/real.csv" {
+			t.Errorf("unexpected file: %q", got)
+		}
+	})
+}
+
+// TestVaultInventoryScan_Sha256Deterministic writes the same content into two
+// separate vaults and expects both scans to report the same 64-character
+// sha256 for it.
+func TestVaultInventoryScan_Sha256Deterministic(t *testing.T) {
+	t.Run("sha256 determinista para mismo contenido", func(t *testing.T) {
+		dir1 := t.TempDir()
+		dir2 := t.TempDir()
+		content := "deterministic content 123\n"
+		writeTestFile(t, dir1, "data/raw/f.csv", content)
+		writeTestFile(t, dir2, "data/raw/f.csv", content)
+
+		files1, err := VaultInventoryScan(dir1, "v1", "vault1")
+		if err != nil {
+			t.Fatal(err)
+		}
+		files2, err := VaultInventoryScan(dir2, "v2", "vault2")
+		if err != nil {
+			t.Fatal(err)
+		}
+		// Guard the indexing below so a wrong file count fails the test
+		// instead of panicking.
+		if len(files1) != 1 || len(files2) != 1 {
+			t.Fatalf("expected 1 file per vault, got %d and %d", len(files1), len(files2))
+		}
+		if files1[0].Sha256 != files2[0].Sha256 {
+			t.Errorf("sha256 mismatch: %q vs %q", files1[0].Sha256, files2[0].Sha256)
+		}
+		if len(files1[0].Sha256) != 64 {
+			t.Errorf("sha256 length = %d, want 64", len(files1[0].Sha256))
+		}
+	})
+}
+
+// TestVaultInventoryScan_Sorted writes four files in non-sorted order and
+// expects the scan to return all of them in ascending RelPath order.
+func TestVaultInventoryScan_Sorted(t *testing.T) {
+	t.Run("orden lexicografico del resultado", func(t *testing.T) {
+		dir := t.TempDir()
+		writeTestFile(t, dir, "knowledge/decisions/z.md", "z")
+		writeTestFile(t, dir, "data/raw/a.csv", "a")
+		writeTestFile(t, dir, "data/processed/m.parquet", "m")
+		writeTestFile(t, dir, "knowledge/domains/b.md", "b")
+
+		files, err := VaultInventoryScan(dir, "v", "v")
+		if err != nil {
+			t.Fatal(err)
+		}
+		// Without this check the ordering loop would pass vacuously on an
+		// empty or truncated result.
+		if len(files) != 4 {
+			t.Fatalf("expected 4 files, got %d: %v", len(files), relPaths(files))
+		}
+		for i := 1; i < len(files); i++ {
+			if files[i].RelPath < files[i-1].RelPath {
+				t.Errorf("not sorted at index %d: %q < %q", i, files[i].RelPath, files[i-1].RelPath)
+			}
+		}
+	})
+}
+
+// relPaths returns the RelPath of every entry in files, in the same order. It
+// is used to make test failure messages readable.
+func relPaths(files []VaultFile) []string {
+	paths := make([]string, 0, len(files))
+	for i := range files {
+		paths = append(paths, files[i].RelPath)
+	}
+	return paths
+}
@@ -0,0 +1,252 @@
+package infra
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// LayoutReport describes what VaultLayoutEnsure did (or would do) to a vault directory.
+//
+// Every entry in the slice fields is a path or name relative to VaultPath.
+// When DryRun is true the entries describe planned actions only. A slice
+// field stays nil when nothing is appended to it.
+type LayoutReport struct {
+	VaultPath string   `json:"vault_path"` // vault root after filepath.Abs + filepath.EvalSymlinks
+	Created   []string `json:"created"`    // dirs created (relative paths)
+	Migrated  []string `json:"migrated"`   // renames executed, format "src -> dst" (relative)
+	AlreadyOK []string `json:"already_ok"` // dirs that already existed at the target location
+	Skipped   []string `json:"skipped"`    // unrecognized root-level entries, left untouched
+	DryRun    bool     `json:"dry_run"`    // copy of the dryRun argument passed to VaultLayoutEnsure
+}
+
+// Root-level names that VaultLayoutEnsure recognizes in a legacy vault.
+var (
+	// dataBuckets lists the root-level directories whose canonical home is data/.
+	dataBuckets = []string{"raw", "processed", "exports"}
+
+	// knowledgeBuckets lists the root-level directories whose canonical home is knowledge/.
+	knowledgeBuckets = []string{"decisions", "domains", "models", "benchmarks", "test_documents"}
+
+	// knownRootFiles lists the root-level files that are relocated under knowledge/.
+	knownRootFiles = []string{"README.md", "README.txt"}
+)
+
+// VaultLayoutEnsure ensures a vault directory uses the canonical hybrid layout:
+//
+//	data/{raw,processed,exports}
+//	knowledge/{decisions,domains,models,benchmarks,test_documents}
+//
+// Legacy vaults that have these directories at the root are migrated by renaming
+// (or merging when both src and dst already exist). A root-level README.md or
+// README.txt is moved to knowledge/README.md; if that destination is already
+// taken the function returns a conflict error. The operation is idempotent:
+// a second run returns everything in AlreadyOK.
+//
+// vaultPath may be relative and may be a symlink; it is resolved with
+// filepath.Abs and filepath.EvalSymlinks. Trailing separators are ignored.
+//
+// When dryRun is true the function computes the report but does not touch the
+// disk. Destinations claimed by an earlier planned migration are treated as
+// existing, so a dry run reports the same conflicts a real run would hit.
+//
+// On error the returned report holds whatever was recorded before the failure.
+func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error) {
+	report := LayoutReport{DryRun: dryRun}
+
+	// --- resolve path ---
+	// Drop trailing separators, but never reduce a filesystem root ("/" or a
+	// bare volume such as "C:") to a string that filepath.Abs would resolve
+	// against the current working directory.
+	if trimmed := strings.TrimRight(vaultPath, "/\\"); trimmed != "" && trimmed != filepath.VolumeName(trimmed) {
+		vaultPath = trimmed
+	}
+
+	// Keep the input in its own variable so the error message can still show it.
+	absPath, err := filepath.Abs(vaultPath)
+	if err != nil {
+		return report, fmt.Errorf("vault_layout_ensure: abs(%q): %w", vaultPath, err)
+	}
+
+	// Follow symlinks for the vault root itself.
+	resolved, err := filepath.EvalSymlinks(absPath)
+	if err != nil {
+		return report, fmt.Errorf("vault_layout_ensure: eval symlinks %q: %w", absPath, err)
+	}
+	vaultPath = resolved
+	report.VaultPath = vaultPath
+
+	// --- check that vault exists and is a directory ---
+	info, err := os.Stat(vaultPath)
+	if err != nil {
+		return report, fmt.Errorf("vault_layout_ensure: stat %q: %w", vaultPath, err)
+	}
+	if !info.IsDir() {
+		return report, fmt.Errorf("vault_layout_ensure: %q is not a directory", vaultPath)
+	}
+
+	// --- ensure top-level containers ---
+	for _, container := range []string{"data", "knowledge"} {
+		dst := filepath.Join(vaultPath, container)
+		if err := ensureDir(dst, dryRun, container, &report); err != nil {
+			return report, err
+		}
+	}
+
+	// --- build migration table: root name -> relative destination ---
+	type migration struct {
+		rootName string // name in vault root (dir or file)
+		dstRel   string // relative destination path inside vault
+		isFile   bool
+	}
+
+	var migrations []migration
+	for _, b := range dataBuckets {
+		migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("data", b)})
+	}
+	for _, b := range knowledgeBuckets {
+		migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("knowledge", b)})
+	}
+	for _, rf := range knownRootFiles {
+		migrations = append(migrations, migration{rootName: rf, dstRel: filepath.Join("knowledge", "README.md"), isFile: true})
+	}
+
+	// Track which root names are "known" so we can compute Skipped.
+	// Names are compared case-insensitively.
+	knownNames := make(map[string]struct{}, len(migrations)+2)
+	for _, m := range migrations {
+		knownNames[strings.ToLower(m.rootName)] = struct{}{}
+	}
+	knownNames["data"] = struct{}{}
+	knownNames["knowledge"] = struct{}{}
+
+	// Destinations claimed by a migration planned earlier in this run. During
+	// a dry run nothing is renamed, so without this set two sources sharing a
+	// destination (README.md and README.txt) would both be reported as
+	// migratable even though a real run fails on the second one.
+	planned := make(map[string]struct{})
+
+	// --- apply migrations ---
+	for _, m := range migrations {
+		src := filepath.Join(vaultPath, m.rootName)
+		dst := filepath.Join(vaultPath, m.dstRel)
+		srcRel := m.rootName
+		dstRel := m.dstRel
+
+		srcExists := pathExists(src)
+		_, dstPlanned := planned[dst]
+		dstExists := dstPlanned || pathExists(dst)
+
+		switch {
+		case srcExists && dstExists:
+			// Both exist: merge if directory, error on file collision.
+			if m.isFile {
+				return report, fmt.Errorf("vault_layout_ensure: conflict: both %q and %q exist", srcRel, dstRel)
+			}
+			if err := mergeDirs(src, dst, srcRel, dstRel, dryRun, &report); err != nil {
+				return report, err
+			}
+
+		case srcExists && !dstExists:
+			// Only source exists: rename.
+			report.Migrated = append(report.Migrated, fmt.Sprintf("%s -> %s", srcRel, dstRel))
+			planned[dst] = struct{}{}
+			if !dryRun {
+				if err := os.Rename(src, dst); err != nil {
+					return report, fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", src, dst, err)
+				}
+			}
+
+		case !srcExists && dstExists:
+			// Already migrated.
+			report.AlreadyOK = append(report.AlreadyOK, dstRel)
+
+		default:
+			// Neither exists: create empty destination directory (skip for files).
+			if !m.isFile {
+				report.Created = append(report.Created, dstRel)
+				if !dryRun {
+					if err := os.MkdirAll(dst, 0o755); err != nil {
+						return report, fmt.Errorf("vault_layout_ensure: mkdir %q: %w", dst, err)
+					}
+				}
+			}
+		}
+	}
+
+	// --- collect skipped (unrecognized root entries) ---
+	entries, err := os.ReadDir(vaultPath)
+	if err != nil {
+		return report, fmt.Errorf("vault_layout_ensure: readdir %q: %w", vaultPath, err)
+	}
+	for _, e := range entries {
+		if _, known := knownNames[strings.ToLower(e.Name())]; !known {
+			report.Skipped = append(report.Skipped, e.Name())
+		}
+	}
+
+	return report, nil
+}
+
+// ensureDir records a top-level container ("data" or "knowledge") in the
+// report and creates it when needed.
+//
+// path is the absolute location and rel the name recorded in the report.
+// If path already exists as a directory (or a symlink to one), rel is added
+// to AlreadyOK. If it exists as anything else an error is returned, because
+// buckets cannot be placed beneath a non-directory. If it does not exist, rel
+// is added to Created and the directory is created unless dryRun is set.
+func ensureDir(path string, dryRun bool, rel string, report *LayoutReport) error {
+	if pathExists(path) {
+		// pathExists uses Lstat; Stat follows a symlink to its target.
+		info, err := os.Stat(path)
+		if err != nil {
+			return fmt.Errorf("vault_layout_ensure: stat %q: %w", path, err)
+		}
+		if !info.IsDir() {
+			return fmt.Errorf("vault_layout_ensure: %q exists but is not a directory", path)
+		}
+		report.AlreadyOK = append(report.AlreadyOK, rel)
+		return nil
+	}
+	report.Created = append(report.Created, rel)
+	if dryRun {
+		return nil
+	}
+	if err := os.MkdirAll(path, 0o755); err != nil {
+		return fmt.Errorf("vault_layout_ensure: mkdir %q: %w", path, err)
+	}
+	return nil
+}
+
+// mergeDirs moves the direct children of src into dst, then removes src if it
+// is empty afterwards.
+//
+// src and dst are absolute paths; srcRel and dstRel are their vault-relative
+// forms, used in the report and in error messages. Each move is appended to
+// report.Migrated as "src -> dst". With dryRun set, moves are recorded but
+// nothing on disk is changed.
+//
+// No overwrite policy: if any child name of src already exists in dst, an
+// error is returned. All children are checked before the first rename, so a
+// conflict never leaves a half-merged directory behind.
+func mergeDirs(src, dst, srcRel, dstRel string, dryRun bool, report *LayoutReport) error {
+	children, err := os.ReadDir(src)
+	if err != nil {
+		return fmt.Errorf("vault_layout_ensure: readdir %q: %w", src, err)
+	}
+
+	// Pass 1: detect collisions up front.
+	for _, child := range children {
+		if pathExists(filepath.Join(dst, child.Name())) {
+			return fmt.Errorf("vault_layout_ensure: merge conflict: %q already exists in %q (cannot overwrite %q)",
+				child.Name(), dstRel, filepath.Join(srcRel, child.Name()))
+		}
+	}
+
+	// Pass 2: record and perform the moves.
+	for _, child := range children {
+		name := child.Name()
+		childSrc := filepath.Join(src, name)
+		childDst := filepath.Join(dst, name)
+		report.Migrated = append(report.Migrated,
+			fmt.Sprintf("%s -> %s", filepath.Join(srcRel, name), filepath.Join(dstRel, name)))
+		if !dryRun {
+			if err := os.Rename(childSrc, childDst); err != nil {
+				return fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", childSrc, childDst, err)
+			}
+		}
+	}
+
+	if dryRun {
+		return nil
+	}
+
+	// Re-check emptiness after the renames and remove src only if it is empty.
+	remaining, err := os.ReadDir(src)
+	if err != nil {
+		return fmt.Errorf("vault_layout_ensure: readdir %q: %w", src, err)
+	}
+	if len(remaining) == 0 {
+		if err := os.Remove(src); err != nil {
+			return fmt.Errorf("vault_layout_ensure: remove empty src %q: %w", src, err)
+		}
+	}
+	return nil
+}
+
+// pathExists reports whether os.Lstat succeeds for path. Because Lstat does
+// not follow symlinks, a symlink counts as existing regardless of its target.
+// Any Lstat failure is reported as false.
+func pathExists(path string) bool {
+	if _, err := os.Lstat(path); err != nil {
+		return false
+	}
+	return true
+}
+
+// dirIsEmpty reports whether path can be read as a directory and contains no
+// entries. It returns false when the directory cannot be read.
+func dirIsEmpty(path string) bool {
+	if listing, err := os.ReadDir(path); err == nil {
+		return len(listing) == 0
+	}
+	return false
+}
+
+// Blank reference that keeps dirIsEmpty referenced from non-test code.
+// NOTE(review): presumably here to silence an unused-code linter when the
+// helper is only called from tests — confirm it is still needed.
+var _ = dirIsEmpty
+
+// vaultLayoutKnownNames builds the set of root-level names that the layout
+// migration manages: the two containers, every data and knowledge bucket, and
+// the lower-cased known root files. It is unexported and therefore usable
+// only from within this package.
+func vaultLayoutKnownNames() map[string]struct{} {
+	names := map[string]struct{}{
+		"data":      {},
+		"knowledge": {},
+	}
+	for _, group := range [][]string{dataBuckets, knowledgeBuckets} {
+		for _, bucket := range group {
+			names[bucket] = struct{}{}
+		}
+	}
+	for _, file := range knownRootFiles {
+		names[strings.ToLower(file)] = struct{}{}
+	}
+	return names
+}
+
@@ -0,0 +1,95 @@
+---
+name: vault_layout_ensure
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error)"
+description: "Normaliza el layout de un vault al esquema híbrido canónico data/{raw,processed,exports} + knowledge/{decisions,domains,models,benchmarks,test_documents}. Migra directorios legacy en la raíz del vault a su ubicación correcta; idempotente."
+tags: [vault, layout, migration, infra, filesystem, idempotent]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports:
+  - "fmt"
+  - "os"
+  - "path/filepath"
+  - "strings"
+params:
+  - name: vault_path
+    desc: "Ruta al directorio raíz del vault. Puede ser absoluta, relativa o un symlink — se resuelve con filepath.Abs + filepath.EvalSymlinks. Trailing slashes se ignoran."
+  - name: dry_run
+    desc: "Si true, calcula el reporte completo (qué se crearía, migraría, etc.) pero no modifica el disco. Útil para previsualizar antes de ejecutar."
+output: "LayoutReport con: VaultPath (ruta resuelta), Created (dirs creados), Migrated (renombres ejecutados, formato 'src -> dst'), AlreadyOK (destinos que ya existían), Skipped (entradas en raíz no reconocidas, no tocadas), DryRun (flag). Error si el path no existe, no es directorio, o hay conflicto de merge (mismo nombre de archivo en src y dst)."
+tested: true
+tests:
+  - "TestVaultLayoutEnsure_DryRun_NoChange"
+  - "TestVaultLayoutEnsure_FreshDir_CreatesLayout"
+  - "TestVaultLayoutEnsure_LegacyDataLayout_Migrates"
+  - "TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates"
+  - "TestVaultLayoutEnsure_AlreadyMigrated_Idempotent"
+  - "TestVaultLayoutEnsure_Mixed_PartialMigration"
+  - "TestVaultLayoutEnsure_MergeConflict_Errors"
+  - "TestVaultLayoutEnsure_UnknownFiles_Skipped"
+  - "TestVaultLayoutEnsure_NotADir_Errors"
+test_file_path: "functions/infra/vault_layout_ensure_test.go"
+file_path: "functions/infra/vault_layout_ensure.go"
+---
+
+## Ejemplo
+
+```go
+// Previsualizar sin tocar disco:
+report, err := VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", true)
+if err != nil {
+    log.Fatal(err)
+}
+fmt.Printf("Would migrate: %v\n", report.Migrated)
+fmt.Printf("Would create:  %v\n", report.Created)
+
+// Ejecutar la migración:
+report, err = VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", false)
+if err != nil {
+    log.Fatalf("migration failed: %v", err)
+}
+fmt.Printf("Migrated: %v\n", report.Migrated)
+fmt.Printf("Created:  %v\n", report.Created)
+fmt.Printf("Skipped:  %v\n", report.Skipped)
+```
+
+## Comportamiento detallado
+
+**Directorios gestionados:**
+
+| Raíz (legacy) | Destino canónico |
+|---|---|
+| `raw/` | `data/raw/` |
+| `processed/` | `data/processed/` |
+| `exports/` | `data/exports/` |
+| `decisions/` | `knowledge/decisions/` |
+| `domains/` | `knowledge/domains/` |
+| `models/` | `knowledge/models/` |
+| `benchmarks/` | `knowledge/benchmarks/` |
+| `test_documents/` | `knowledge/test_documents/` |
+| `README.md` / `README.txt` | `knowledge/README.md` |
+
+**Lógica de migración (por cada entrada conocida):**
+
+- Solo `src` existe → rename atómico `src` → `dst`, registrado en `Migrated`.
+- Solo `dst` existe → ya migrado, registrado en `AlreadyOK`.
+- Ambos existen (dir) → merge: mueve cada hijo de `src/` a `dst/`; error si mismo nombre. Registrado en `Migrated` por hijo.
+- Ambos existen (archivo README) → error inmediato con paths concretos.
+- Ninguno existe → para directorios, crea `dst` vacío y lo registra en `Created`; para el README no se crea nada.
+
+**Archivos/dirs no reconocidos** en la raíz (`.git`, `vault_index.db`, archivos custom) se registran en `Skipped` y no se tocan.
+
+**Idempotencia:** segunda ejecución sobre un vault ya migrado reporta todo en `AlreadyOK` y no toca disco.
+
+## Notas
+
+`LayoutReport` es un tipo local de esta función (no un tipo del registry). El struct exportado vive en `functions/infra/vault_layout_ensure.go` junto con la función.
+
+Para aplicar la migración a múltiples vaults en batch, invocar desde un pipeline que lea los paths de `vault.yaml` (ver `vault_manifest_read_go_infra`) y llame a `VaultLayoutEnsure` en cada uno.
@@ -0,0 +1,394 @@
+package infra
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// mkVaultDir creates a temporary directory tree for tests and returns its root.
+//
+// entries is a list of slash-separated paths relative to the root. Paths
+// ending in "/" are created as directories; all others are created as files
+// containing "test\n", with parent directories created as needed. An empty
+// entry fails the test.
+func mkVaultDir(t *testing.T, entries []string) string {
+	t.Helper()
+	root := t.TempDir()
+	for _, e := range entries {
+		// Reject empty entries explicitly; indexing e[len(e)-1] would panic.
+		if e == "" {
+			t.Fatalf("mkVaultDir: empty entry")
+		}
+		full := filepath.Join(root, filepath.FromSlash(e))
+		if e[len(e)-1] == '/' {
+			if err := os.MkdirAll(full, 0o755); err != nil {
+				t.Fatalf("mkVaultDir: mkdir %q: %v", full, err)
+			}
+			continue
+		}
+		parent := filepath.Dir(full)
+		if err := os.MkdirAll(parent, 0o755); err != nil {
+			t.Fatalf("mkVaultDir: mkdir parent %q: %v", parent, err)
+		}
+		if err := os.WriteFile(full, []byte("test\n"), 0o644); err != nil {
+			t.Fatalf("mkVaultDir: write %q: %v", full, err)
+		}
+	}
+	return root
+}
+
+// TestVaultLayoutEnsure_DryRun_NoChange runs a dry run over a legacy vault and
+// checks that the directory tree is unchanged while the report still lists
+// planned migrations.
+func TestVaultLayoutEnsure_DryRun_NoChange(t *testing.T) {
+	vault := mkVaultDir(t, []string{"raw/", "raw/file1.csv", "processed/"})
+
+	snapBefore := snapshotDir(t, vault)
+	plan, err := VaultLayoutEnsure(vault, true)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if plan.DryRun == false {
+		t.Error("DryRun flag not set in report")
+	}
+	snapAfter := snapshotDir(t, vault)
+	if same := mapEqual(snapBefore, snapAfter); !same {
+		t.Errorf("dry-run modified disk: before=%v after=%v", snapBefore, snapAfter)
+	}
+	// The plan must contain at least one migration for the legacy dirs.
+	if len(plan.Migrated) < 1 {
+		t.Error("expected Migrated to be non-empty in dry-run plan")
+	}
+}
+
+// TestVaultLayoutEnsure_FreshDir_CreatesLayout runs the function on an empty
+// vault and checks that both containers and every bucket are reported in
+// Created and exist on disk as directories.
+func TestVaultLayoutEnsure_FreshDir_CreatesLayout(t *testing.T) {
+	root := mkVaultDir(t, nil) // empty vault
+
+	report, err := VaultLayoutEnsure(root, false)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// Expected layout: the two containers followed by each bucket beneath them.
+	expected := []string{"data", "knowledge"}
+	for _, bucket := range []string{"raw", "processed", "exports"} {
+		expected = append(expected, filepath.Join("data", bucket))
+	}
+	for _, bucket := range []string{"decisions", "domains", "models", "benchmarks", "test_documents"} {
+		expected = append(expected, filepath.Join("knowledge", bucket))
+	}
+
+	// Every expected path must be reported as created.
+	for _, rel := range expected {
+		if !sliceContains(report.Created, rel) {
+			t.Errorf("expected Created to contain %q, got %v", rel, report.Created)
+		}
+	}
+
+	// Every expected path must exist on disk as a directory.
+	for _, rel := range expected {
+		abs := filepath.Join(root, rel)
+		info, statErr := os.Stat(abs)
+		switch {
+		case statErr != nil:
+			t.Errorf("expected %q to exist: %v", abs, statErr)
+		case !info.IsDir():
+			t.Errorf("%q should be a directory", abs)
+		}
+	}
+}
+
+// TestVaultLayoutEnsure_LegacyDataLayout_Migrates checks that root-level raw/,
+// processed/ and exports/ are renamed under data/, that their files travel
+// with them, and that the legacy directories no longer exist.
+func TestVaultLayoutEnsure_LegacyDataLayout_Migrates(t *testing.T) {
+	root := mkVaultDir(t, []string{
+		"raw/",
+		"raw/file1.parquet",
+		"raw/file2.parquet",
+		"processed/",
+		"processed/clean.csv",
+		"exports/",
+	})
+
+	report, err := VaultLayoutEnsure(root, false)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// All three legacy buckets should appear in Migrated (top-level rename),
+	// including the empty exports/ directory.
+	legacy := []string{"raw", "processed", "exports"}
+	migratedSet := toSet(report.Migrated)
+	for _, d := range legacy {
+		pair := d + " -> " + filepath.Join("data", d)
+		if _, ok := migratedSet[pair]; !ok {
+			t.Errorf("expected Migrated to contain %q, got %v", pair, report.Migrated)
+		}
+	}
+
+	// Files must have moved.
+	for _, f := range []string{
+		filepath.Join("data", "raw", "file1.parquet"),
+		filepath.Join("data", "raw", "file2.parquet"),
+		filepath.Join("data", "processed", "clean.csv"),
+	} {
+		if _, err := os.Stat(filepath.Join(root, f)); err != nil {
+			t.Errorf("expected %q to exist after migration: %v", f, err)
+		}
+	}
+	// Old dirs must be gone, and the new location must be a directory.
+	for _, d := range legacy {
+		if pathExists(filepath.Join(root, d)) {
+			t.Errorf("expected legacy dir %q to be removed", d)
+		}
+		info, err := os.Stat(filepath.Join(root, "data", d))
+		if err != nil {
+			t.Errorf("expected %q to exist after migration: %v", filepath.Join("data", d), err)
+			continue
+		}
+		if !info.IsDir() {
+			t.Errorf("%q should be a directory", filepath.Join("data", d))
+		}
+	}
+}
+
+// TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates checks that root-level
+// decisions/, models/ and README.md are moved under knowledge/ and no longer
+// exist at the vault root.
+func TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates(t *testing.T) {
+	root := mkVaultDir(t, []string{
+		"decisions/",
+		"decisions/2024-01.md",
+		"models/",
+		"models/ner_v1.pkl",
+		"README.md",
+	})
+
+	report, err := VaultLayoutEnsure(root, false)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// decisions, models and the README should appear in Migrated.
+	migratedSet := toSet(report.Migrated)
+	for _, pair := range []string{
+		"decisions -> " + filepath.Join("knowledge", "decisions"),
+		"models -> " + filepath.Join("knowledge", "models"),
+		"README.md -> " + filepath.Join("knowledge", "README.md"),
+	} {
+		if _, ok := migratedSet[pair]; !ok {
+			t.Errorf("expected Migrated to contain %q, got %v", pair, report.Migrated)
+		}
+	}
+
+	// Files must be at new location.
+	for _, f := range []string{
+		filepath.Join("knowledge", "decisions", "2024-01.md"),
+		filepath.Join("knowledge", "models", "ner_v1.pkl"),
+		filepath.Join("knowledge", "README.md"),
+	} {
+		if _, err := os.Stat(filepath.Join(root, f)); err != nil {
+			t.Errorf("expected %q to exist after migration: %v", f, err)
+		}
+	}
+
+	// Legacy root entries must be gone (mirrors the data-layout test).
+	for _, old := range []string{"decisions", "models", "README.md"} {
+		if pathExists(filepath.Join(root, old)) {
+			t.Errorf("expected legacy entry %q to be removed", old)
+		}
+	}
+}
+
+// TestVaultLayoutEnsure_AlreadyMigrated_Idempotent runs the function twice on
+// a vault that already has the canonical layout. Neither run may report a
+// migration, and the second run must leave the tree unchanged while reporting
+// entries in AlreadyOK.
+func TestVaultLayoutEnsure_AlreadyMigrated_Idempotent(t *testing.T) {
+	canonical := []string{
+		"data/",
+		"data/raw/",
+		"data/raw/file.csv",
+		"data/processed/",
+		"data/exports/",
+		"knowledge/",
+		"knowledge/decisions/",
+		"knowledge/domains/",
+		"knowledge/models/",
+		"knowledge/benchmarks/",
+		"knowledge/test_documents/",
+	}
+	vault := mkVaultDir(t, canonical)
+
+	first, err := VaultLayoutEnsure(vault, false)
+	if err != nil {
+		t.Fatalf("first run error: %v", err)
+	}
+	if n := len(first.Migrated); n != 0 {
+		t.Errorf("first run on fully-migrated vault should have no migrations, got %v", first.Migrated)
+	}
+
+	treeBefore := snapshotDir(t, vault)
+	second, err := VaultLayoutEnsure(vault, false)
+	if err != nil {
+		t.Fatalf("second run error: %v", err)
+	}
+	treeAfter := snapshotDir(t, vault)
+
+	if unchanged := mapEqual(treeBefore, treeAfter); !unchanged {
+		t.Error("second run modified disk (not idempotent)")
+	}
+	if n := len(second.Migrated); n != 0 {
+		t.Errorf("second run should produce no migrations, got %v", second.Migrated)
+	}
+	if n := len(second.AlreadyOK); n == 0 {
+		t.Error("second run should report existing dirs as AlreadyOK")
+	}
+}
+
+// TestVaultLayoutEnsure_Mixed_PartialMigration covers a vault that is only
+// partly migrated: data/raw is already in place while exports/ and decisions/
+// still sit at the root. The existing bucket must be reported as AlreadyOK,
+// the legacy ones as Migrated, and all files must end up at canonical paths.
+func TestVaultLayoutEnsure_Mixed_PartialMigration(t *testing.T) {
+	// data/raw already migrated; exports still at root; knowledge dirs in legacy positions.
+	root := mkVaultDir(t, []string{
+		"data/",
+		"data/raw/",
+		"data/raw/already_here.csv",
+		"exports/",
+		"exports/report.pdf",
+		"decisions/",
+		"decisions/2023-note.md",
+	})
+
+	report, err := VaultLayoutEnsure(root, false)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	// data/raw should be AlreadyOK.
+	if !sliceContains(report.AlreadyOK, filepath.Join("data", "raw")) {
+		t.Errorf("data/raw should be AlreadyOK, got AlreadyOK=%v", report.AlreadyOK)
+	}
+	// exports should be migrated.
+	if !sliceContains(report.Migrated, "exports -> "+filepath.Join("data", "exports")) {
+		t.Errorf("exports should be migrated, Migrated=%v", report.Migrated)
+	}
+	// decisions should be migrated.
+	if !sliceContains(report.Migrated, "decisions -> "+filepath.Join("knowledge", "decisions")) {
+		t.Errorf("decisions should be migrated, Migrated=%v", report.Migrated)
+	}
+
+	// The report must match the disk: every file sits at its canonical path.
+	for _, f := range []string{
+		filepath.Join("data", "raw", "already_here.csv"),
+		filepath.Join("data", "exports", "report.pdf"),
+		filepath.Join("knowledge", "decisions", "2023-note.md"),
+	} {
+		if _, err := os.Stat(filepath.Join(root, f)); err != nil {
+			t.Errorf("expected %q to exist after migration: %v", f, err)
+		}
+	}
+}
+
+// TestVaultLayoutEnsure_MergeConflict_Errors checks the no-overwrite policy:
+// when raw/ and data/raw/ both contain a file with the same name the call must
+// fail, and both files must still be in place afterwards.
+func TestVaultLayoutEnsure_MergeConflict_Errors(t *testing.T) {
+	// Both src (raw/) and dst (data/raw/) exist and have a file with the same name.
+	root := mkVaultDir(t, []string{
+		"raw/",
+		"raw/collision.csv",
+		"data/",
+		"data/raw/",
+		"data/raw/collision.csv", // same name -> conflict
+	})
+
+	_, err := VaultLayoutEnsure(root, false)
+	if err == nil {
+		t.Fatal("expected error for merge conflict, got nil")
+	}
+	if !contains(err.Error(), "conflict") && !contains(err.Error(), "collision.csv") {
+		t.Errorf("error should mention conflict or the file name, got: %v", err)
+	}
+
+	// Neither copy may have been moved or removed by the failed merge.
+	for _, f := range []string{
+		filepath.Join("raw", "collision.csv"),
+		filepath.Join("data", "raw", "collision.csv"),
+	} {
+		if !pathExists(filepath.Join(root, f)) {
+			t.Errorf("expected %q to be left in place after the conflict", f)
+		}
+	}
+}
+
+// TestVaultLayoutEnsure_UnknownFiles_Skipped checks that unrecognized
+// root-level entries are listed in Skipped and that a known bucket is not.
+func TestVaultLayoutEnsure_UnknownFiles_Skipped(t *testing.T) {
+	vault := mkVaultDir(t, []string{".git/", "vault_index.db", "my_custom_notes.txt", "raw/"})
+
+	report, err := VaultLayoutEnsure(vault, false)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+
+	unknown := []string{".git", "vault_index.db", "my_custom_notes.txt"}
+	for _, name := range unknown {
+		if !sliceContains(report.Skipped, name) {
+			t.Errorf("expected %q in Skipped, got %v", name, report.Skipped)
+		}
+	}
+	// raw is a known bucket and must not be listed as skipped.
+	if sliceContains(report.Skipped, "raw") {
+		t.Error("raw should not appear in Skipped — it is a known bucket")
+	}
+}
+
+// TestVaultLayoutEnsure_NotADir_Errors checks the two invalid-root cases: a
+// path that does not exist and a path that is a regular file. Both fixtures
+// live under t.TempDir so the test is portable and needs no manual cleanup.
+func TestVaultLayoutEnsure_NotADir_Errors(t *testing.T) {
+	t.Run("non-existent path", func(t *testing.T) {
+		// A child of a fresh temp dir cannot already exist.
+		missing := filepath.Join(t.TempDir(), "does_not_exist")
+		_, err := VaultLayoutEnsure(missing, false)
+		if err == nil {
+			t.Fatal("expected error for non-existent path")
+		}
+	})
+
+	t.Run("path is a file", func(t *testing.T) {
+		file := filepath.Join(t.TempDir(), "vault_layout.txt")
+		if err := os.WriteFile(file, []byte("not a vault\n"), 0o644); err != nil {
+			t.Fatal(err)
+		}
+
+		_, err := VaultLayoutEnsure(file, false)
+		if err == nil {
+			t.Fatal("expected error when vaultPath is a file, not a dir")
+		}
+		if !contains(err.Error(), "not a directory") {
+			t.Errorf("error should mention 'not a directory', got: %v", err)
+		}
+	})
+}
+
+// --- helpers ---
+
+// snapshotDir walks root and returns a set of every visited path, keyed by its
+// path relative to root (the root itself is recorded as "."). Only existence
+// is captured; file contents and modes are not. Any walk or path error fails
+// the test.
+func snapshotDir(t *testing.T, root string) map[string]bool {
+	t.Helper()
+	snap := make(map[string]bool)
+	err := filepath.WalkDir(root, func(path string, d os.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+		// Propagate the error rather than recording an empty key.
+		rel, err := filepath.Rel(root, path)
+		if err != nil {
+			return err
+		}
+		snap[rel] = true
+		return nil
+	})
+	if err != nil {
+		t.Fatalf("snapshotDir: %v", err)
+	}
+	return snap
+}
+
+// mapEqual reports whether a and b have the same number of entries and every
+// key of a maps to true in b. The values stored in a are not inspected.
+func mapEqual(a, b map[string]bool) bool {
+	equal := len(a) == len(b)
+	if equal {
+		for key := range a {
+			if present := b[key]; !present {
+				equal = false
+				break
+			}
+		}
+	}
+	return equal
+}
+
+// toSet converts a slice of strings into a set keyed by the strings;
+// duplicates collapse into a single key.
+func toSet(ss []string) map[string]struct{} {
+	set := make(map[string]struct{}, len(ss))
+	for i := range ss {
+		set[ss[i]] = struct{}{}
+	}
+	return set
+}
+
+// sliceContains reports whether target is an element of ss.
+func sliceContains(ss []string, target string) bool {
+	found := false
+	for i := 0; i < len(ss) && !found; i++ {
+		found = ss[i] == target
+	}
+	return found
+}
+
+// contains reports whether sub occurs anywhere in s. An empty sub is always
+// contained. It compares each window of len(sub) bytes in s against sub.
+func contains(s, sub string) bool {
+	width := len(sub)
+	for start := 0; start+width <= len(s); start++ {
+		if s[start:start+width] == sub {
+			return true
+		}
+	}
+	return false
+}
@@ -0,0 +1,96 @@
+package infra
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"gopkg.in/yaml.v3"
+)
+
+// VaultManifestEntry is a single vault entry parsed from a projects/<proj>/vaults/vault.yaml.
+//
+// Name, Description, Path and Tags are copied from the manifest without
+// validation or normalization.
+type VaultManifestEntry struct {
+	ProjectID    string   // path component following "projects" in the manifest path
+	Name         string   // vault name as declared in vault.yaml
+	Description  string   // human description
+	Path         string   // vault directory path, verbatim from vault.yaml (not made absolute here)
+	Tags         []string // tags declared in vault.yaml
+	ManifestFile string   // path of the vault.yaml this entry came from, as matched by the glob
+}
+
+// vaultYAML mirrors the vault.yaml schema (only the fields we care about).
+// The document is expected to hold a top-level "vaults" list; each element
+// supplies the fields copied into a VaultManifestEntry.
+type vaultYAML struct {
+	Vaults []struct {
+		Name        string   `yaml:"name"`
+		Description string   `yaml:"description"`
+		Path        string   `yaml:"path"`
+		Tags        []string `yaml:"tags"`
+	} `yaml:"vaults"`
+}
+
+// VaultManifestRead collects the vault entries declared by every
+// projects/*/vaults/vault.yaml found under repoRoot and returns them as one
+// flat slice, in the order the glob yields the manifests.
+//
+// Rules:
+//   - The first manifest that cannot be read or parsed aborts the call; the
+//     returned error names the file.
+//   - When no manifest matches, the result is an empty (nil) slice and no error.
+//   - ProjectID of each entry is inferred from the manifest path.
+func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error) {
+	pattern := filepath.Join(repoRoot, "projects", "*", "vaults", "vault.yaml")
+	manifests, globErr := filepath.Glob(pattern)
+	if globErr != nil {
+		return nil, fmt.Errorf("vault_manifest_read: glob %q: %w", pattern, globErr)
+	}
+
+	var all []VaultManifestEntry
+	for i := range manifests {
+		parsed, err := parseVaultManifest(manifests[i])
+		if err != nil {
+			return nil, err
+		}
+		all = append(all, parsed...)
+	}
+	return all, nil
+}
+
+// parseVaultManifest reads one vault.yaml and converts each element of its
+// "vaults" list into a VaultManifestEntry. Every entry carries the project ID
+// inferred from manifestPath and manifestPath itself as ManifestFile. Read and
+// parse failures are wrapped with the manifest path.
+func parseVaultManifest(manifestPath string) ([]VaultManifestEntry, error) {
+	content, readErr := os.ReadFile(manifestPath)
+	if readErr != nil {
+		return nil, fmt.Errorf("vault_manifest_read: read %q: %w", manifestPath, readErr)
+	}
+
+	var doc vaultYAML
+	parseErr := yaml.Unmarshal(content, &doc)
+	if parseErr != nil {
+		return nil, fmt.Errorf("vault_manifest_read: parse %q: %w", manifestPath, parseErr)
+	}
+
+	project := inferProjectID(manifestPath)
+
+	result := make([]VaultManifestEntry, len(doc.Vaults))
+	for i, declared := range doc.Vaults {
+		result[i] = VaultManifestEntry{
+			ProjectID:    project,
+			Name:         declared.Name,
+			Description:  declared.Description,
+			Path:         declared.Path,
+			Tags:         declared.Tags,
+			ManifestFile: manifestPath,
+		}
+	}
+	return result, nil
+}
+
+// inferProjectID extracts the project basename from a path of the form
+// .../projects/<proj>/vaults/vault.yaml.
+//
+// The path is scanned from the end so that a "projects" component earlier in
+// the path (for example a repository checked out under ~/projects/) is not
+// mistaken for the registry's projects directory. A "projects/<proj>/vaults"
+// triple is preferred. If none is found, the last "projects" component that
+// has a successor is used. It returns "" when there is no such component.
+func inferProjectID(manifestPath string) string {
+	// Normalize separators and split.
+	parts := strings.Split(filepath.ToSlash(manifestPath), "/")
+
+	// Preferred: projects/<proj>/vaults/<file>, searching backwards.
+	for i := len(parts) - 4; i >= 0; i-- {
+		if parts[i] == "projects" && parts[i+2] == "vaults" {
+			return parts[i+1]
+		}
+	}
+
+	// Fallback for paths without a "vaults" component.
+	for i := len(parts) - 2; i >= 0; i-- {
+		if parts[i] == "projects" {
+			return parts[i+1]
+		}
+	}
+	return ""
+}
@@ -0,0 +1,59 @@
+---
+name: vault_manifest_read
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error)"
+description: "Lee todos los manifests vault.yaml bajo projects/*/vaults/ del repo y devuelve una lista plana de entradas de vault con su ProjectID inferido del path."
+tags: [vault, manifest, yaml, infra, projects, storage]
+uses_functions: []
+uses_types: []
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports:
+  - "fmt"
+  - "os"
+  - "path/filepath"
+  - "strings"
+  - "gopkg.in/yaml.v3"
+params:
+  - name: repoRoot
+    desc: "Ruta absoluta a la raíz del repositorio fn_registry. Se usa como base para el glob projects/*/vaults/vault.yaml."
+output: "Slice plano de VaultManifestEntry (ProjectID, Name, Description, Path, Tags, ManifestFile). Vacío si no hay manifests. Error si un yaml no parsea, con el path concreto en el mensaje."
+tested: true
+tests:
+  - "TestVaultManifestRead_HappyPath"
+  - "TestVaultManifestRead_MalformedYAML"
+  - "TestVaultManifestRead_EmptyDir"
+test_file_path: "functions/infra/vault_manifest_read_test.go"
+file_path: "functions/infra/vault_manifest_read.go"
+---
+
+## Ejemplo
+
+```go
+entries, err := VaultManifestRead("/home/lucas/fn_registry")
+if err != nil {
+    log.Fatal(err)
+}
+for _, e := range entries {
+    fmt.Printf("%s/%s -> %s\n", e.ProjectID, e.Name, e.Path)
+}
+// app_turismo/turismo_spain -> /home/lucas/vaults/turismo_spain
+// app_finance/finance_data  -> /home/lucas/vaults/finance_data
+```
+
+## Notas
+
+`VaultManifestEntry` es un tipo local de esta función (no un tipo del registry). Contiene:
+- `ProjectID` — basename del directorio `projects/<proj>/`, inferido del path del manifest.
+- `Name`, `Description`, `Path`, `Tags` — copiados del yaml tal cual.
+- `ManifestFile` — path al vault.yaml de origen (absoluto si `repoRoot` lo es), útil para mensajes de error y trazabilidad.
+
+El parseo usa `gopkg.in/yaml.v3` (ya en go.mod). Si un manifest falla, la función devuelve
+error inmediatamente con el path del fichero problemático. Los manifests sin entradas
+`vaults:` contribuyen cero entries (no es error). Si no existe ningún `projects/*/vaults/vault.yaml`
+el resultado es un slice vacío sin error.
@@ -0,0 +1,113 @@
+package infra
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// TestVaultManifestRead_HappyPath writes two project manifests declaring three
+// vaults in total and checks that every entry is returned with the fields
+// from its manifest and the ProjectID of the project directory it came from.
+func TestVaultManifestRead_HappyPath(t *testing.T) {
+	root := t.TempDir()
+
+	writeManifest(t, root, "app_turismo", `
+vaults:
+  - name: turismo_spain
+    description: "Datos de turismo en Espana"
+    path: "/home/lucas/vaults/turismo_spain"
+    tags: [turismo, espana]
+  - name: turismo_raw
+    description: "Datos brutos sin procesar"
+    path: "/home/lucas/vaults/turismo_raw"
+    tags: [raw]
+`)
+
+	writeManifest(t, root, "app_finance", `
+vaults:
+  - name: finance_data
+    description: "Datos financieros"
+    path: "/home/lucas/vaults/finance_data"
+    tags: [finance]
+`)
+
+	entries, err := VaultManifestRead(root)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(entries) != 3 {
+		t.Fatalf("got %d entries, want 3", len(entries))
+	}
+
+	// Build index by name for order-independent assertions.
+	byName := make(map[string]VaultManifestEntry, len(entries))
+	for _, e := range entries {
+		byName[e.Name] = e
+	}
+
+	// Check turismo_spain entry.
+	e, ok := byName["turismo_spain"]
+	if !ok {
+		t.Fatal("missing entry 'turismo_spain'")
+	}
+	if e.ProjectID != "app_turismo" {
+		t.Errorf("turismo_spain.ProjectID = %q, want %q", e.ProjectID, "app_turismo")
+	}
+	if e.Path != "/home/lucas/vaults/turismo_spain" {
+		t.Errorf("turismo_spain.Path = %q, want %q", e.Path, "/home/lucas/vaults/turismo_spain")
+	}
+	// Both tags are checked, not just the first.
+	if len(e.Tags) != 2 || e.Tags[0] != "turismo" || e.Tags[1] != "espana" {
+		t.Errorf("turismo_spain.Tags = %v, want [turismo espana]", e.Tags)
+	}
+	if e.ManifestFile == "" {
+		t.Error("turismo_spain.ManifestFile is empty")
+	}
+
+	// The second vault of the same manifest must share its ProjectID.
+	er, ok := byName["turismo_raw"]
+	if !ok {
+		t.Fatal("missing entry 'turismo_raw'")
+	}
+	if er.ProjectID != "app_turismo" {
+		t.Errorf("turismo_raw.ProjectID = %q, want %q", er.ProjectID, "app_turismo")
+	}
+
+	// Check finance_data entry belongs to app_finance.
+	ef, ok := byName["finance_data"]
+	if !ok {
+		t.Fatal("missing entry 'finance_data'")
+	}
+	if ef.ProjectID != "app_finance" {
+		t.Errorf("finance_data.ProjectID = %q, want %q", ef.ProjectID, "app_finance")
+	}
+}
+
+// TestVaultManifestRead_MalformedYAML checks that a manifest with broken YAML
+// makes VaultManifestRead return an error.
+func TestVaultManifestRead_MalformedYAML(t *testing.T) {
+	repo := t.TempDir()
+
+	writeManifest(t, repo, "bad_project", `
+vaults:
+  - name: [invalid yaml
+    path: missing_bracket
+`)
+
+	if _, err := VaultManifestRead(repo); err == nil {
+		t.Fatal("expected error for malformed YAML, got nil")
+	}
+}
+
+// TestVaultManifestRead_EmptyDir checks that a repository root without any
+// projects/ directory yields zero entries and no error.
+func TestVaultManifestRead_EmptyDir(t *testing.T) {
+	// Fresh temp dir: no projects/ directory at all, so the glob matches nothing.
+	repo := t.TempDir()
+
+	found, err := VaultManifestRead(repo)
+	if err != nil {
+		t.Fatalf("unexpected error for empty dir: %v", err)
+	}
+	if n := len(found); n != 0 {
+		t.Fatalf("got %d entries, want 0", n)
+	}
+}
+
+// writeManifest writes content to <root>/projects/<proj>/vaults/vault.yaml,
+// creating the intermediate directories. Any failure aborts the test.
+func writeManifest(t *testing.T, root, proj, content string) {
+	t.Helper()
+	vaultsDir := filepath.Join(root, "projects", proj, "vaults")
+	if mkErr := os.MkdirAll(vaultsDir, 0o755); mkErr != nil {
+		t.Fatalf("mkdir %s: %v", vaultsDir, mkErr)
+	}
+	manifest := filepath.Join(vaultsDir, "vault.yaml")
+	writeErr := os.WriteFile(manifest, []byte(content), 0o644)
+	if writeErr != nil {
+		t.Fatalf("write %s: %v", manifest, writeErr)
+	}
+}
@@ -0,0 +1,265 @@
+package infra
+
+import (
+	"database/sql"
+	"fmt"
+	"path/filepath"
+	"strings"
+)
+
+// VaultSearchHit is a single result returned by VaultSearch.
+//
+// RelPath, Size, Mtime, Mime, Bucket and SubBucket are copied verbatim from the
+// matching row of the index's files table; VaultPath and VaultName describe the
+// vault that was searched and are identical for every hit of one call.
+type VaultSearchHit struct {
+	VaultPath string `json:"vault_path"` // vaultPath argument exactly as passed to VaultSearch
+	VaultName string `json:"vault_name"` // basename of VaultPath (after resolving symlinks)
+	RelPath   string `json:"rel_path"` // files.rel_path; also the deduplication key when merging results
+	Size      int64  `json:"size"` // files.size
+	Mtime     int64  `json:"mtime"` // files.mtime — presumably Unix seconds; confirm against the index writer
+	Mime      string `json:"mime"` // files.mime
+	Bucket    string `json:"bucket"` // files.bucket
+	SubBucket string `json:"sub_bucket"` // files.sub_bucket
+	Snippet   string `json:"snippet"` // reserved for an FTS5 snippet; the search code in this file always leaves it empty
+}
+
+// VaultSearch searches vault_index.db inside vaultPath for files matching query.
+//
+// Behaviour:
+//  1. Opens vault_index.db via VaultIndexOpen.
+//  2. If limit <= 0, defaults to 50.
+//  3. Runs a FTS5 MATCH query over files_fts to find content matches (when content_text
+//     is populated by profilers). Because the FTS5 table uses content='' (contentless),
+//     column values are not stored; results are correlated back to files via a LIKE
+//     match on rel_path for path tokens, or via an IN clause of matched rowids for
+//     content_text matches.
+//  4. Also searches files.rel_path with LIKE to find path matches.
+//  5. Results from both searches are merged (deduplication by rel_path).
+//  6. If both FTS5 and LIKE queries fail, returns the error.
+//  7. VaultName is derived from the basename of vaultPath (after resolving symlinks).
+func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error) {
+	if limit <= 0 {
+		limit = 50
+	}
+
+	db, err := VaultIndexOpen(vaultPath)
+	if err != nil {
+		return nil, fmt.Errorf("vault_search: open index: %w", err)
+	}
+	defer db.Close()
+
+	vaultName := resolveVaultName(vaultPath)
+
+	hits, err := vaultSearchCombined(db, vaultPath, vaultName, query, limit)
+	if err != nil {
+		return nil, fmt.Errorf("vault_search: %w", err)
+	}
+	return hits, nil
+}
+
+// vaultSearchCombined runs the search using two strategies and merges deduplicated results:
+//  1. FTS5 MATCH on files_fts (for content_text when populated by profilers).
+//     Correlation back to files uses rowid (reliable for fresh indexes) or falls back.
+//  2. LIKE on files.rel_path (always reliable for path searching).
+//
+// Results are deduplicated by rel_path, up to limit entries.
+func vaultSearchCombined(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
+	seen := make(map[string]struct{})
+	var hits []VaultSearchHit
+
+	// Strategy 1: FTS5 MATCH on content_text (populated by profilers).
+	// With contentless FTS5 (content=''), column values are NOT retrievable via SELECT.
+	// We get matching rowids from FTS5, then look up files by rowid.
+	// This is reliable for content_text matches because VaultIndexWrite inserts
+	// content_text rows independently of the path rows (profilers update them).
+	// NOTE: for rel_path token matching, strategy 2 (LIKE) is more reliable.
+	ftsQuery := safeFTSQuery(query)
+	ftsHits, ftsErr := vaultSearchFTSContent(db, vaultPath, vaultName, ftsQuery, limit)
+	if ftsErr == nil {
+		for _, h := range ftsHits {
+			if len(hits) >= limit {
+				break
+			}
+			if _, ok := seen[h.RelPath]; !ok {
+				seen[h.RelPath] = struct{}{}
+				hits = append(hits, h)
+			}
+		}
+	}
+	// If FTS5 failed with a syntax error, that's expected for bad queries — continue.
+	// If it failed with a non-syntax error, still continue to LIKE fallback.
+
+	// Strategy 2: LIKE on rel_path — reliable path search.
+	// When query contains FTS5 special chars (e.g. "foo:bar:"), extract the first
+	// word-like token so the LIKE pattern is still useful.
+	likeQuery := simplifyForLike(query)
+	if len(hits) < limit && likeQuery != "" {
+		remaining := limit - len(hits)
+		likeHits, likeErr := vaultSearchLike(db, vaultPath, vaultName, likeQuery, remaining+len(seen))
+		if likeErr != nil && ftsErr != nil {
+			// Both failed — return a combined error.
+			return nil, fmt.Errorf("fts: %v; like: %v", ftsErr, likeErr)
+		}
+		for _, h := range likeHits {
+			if len(hits) >= limit {
+				break
+			}
+			if _, ok := seen[h.RelPath]; !ok {
+				seen[h.RelPath] = struct{}{}
+				hits = append(hits, h)
+			}
+		}
+	}
+
+	if hits == nil {
+		hits = []VaultSearchHit{}
+	}
+	return hits, nil
+}
+
+// vaultSearchFTSContent runs safeQuery as an FTS5 MATCH against files_fts and
+// maps every matching rowid to the row of the files table with the same rowid.
+//
+// At most limit rowids are fetched, ordered by FTS5 rank. A rowid whose lookup
+// in files fails for any reason (no such row, scan error) is skipped rather
+// than reported, so fewer than limit hits may be returned. Snippet is left
+// empty on every hit.
+//
+// The rowid correlation assumes that a files_fts row and its files row share
+// the same rowid — NOTE(review): this holds only while both tables are written
+// in lockstep; confirm against the index writer.
+//
+// Returns (nil, nil) when nothing matches, and the driver error when the MATCH
+// query itself fails.
+func vaultSearchFTSContent(db *sql.DB, vaultPath, vaultName, safeQuery string, limit int) ([]VaultSearchHit, error) {
+	const (
+		matchSQL = `
+		SELECT rowid
+		FROM files_fts
+		WHERE files_fts MATCH ?
+		ORDER BY rank
+		LIMIT ?`
+		lookupSQL = `
+			SELECT rel_path, size, mtime, mime, bucket, sub_bucket
+			FROM files WHERE rowid = ?`
+	)
+
+	matched, err := db.Query(matchSQL, safeQuery, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer matched.Close()
+
+	// Drain the MATCH cursor completely before issuing the per-row lookups.
+	var ids []int64
+	for matched.Next() {
+		var id int64
+		if scanErr := matched.Scan(&id); scanErr != nil {
+			return nil, scanErr
+		}
+		ids = append(ids, id)
+	}
+	if iterErr := matched.Err(); iterErr != nil {
+		return nil, iterErr
+	}
+
+	var found []VaultSearchHit
+	for _, id := range ids {
+		hit := VaultSearchHit{VaultPath: vaultPath, VaultName: vaultName}
+		row := db.QueryRow(lookupSQL, id)
+		if row.Scan(&hit.RelPath, &hit.Size, &hit.Mtime, &hit.Mime, &hit.Bucket, &hit.SubBucket) != nil {
+			// No usable files row for this rowid — best effort, skip it.
+			continue
+		}
+		found = append(found, hit)
+	}
+	return found, nil
+}
+
+// vaultSearchLike returns the files whose rel_path contains query as a literal
+// substring, newest first (ORDER BY mtime DESC), capped at limit rows.
+//
+// The LIKE metacharacters '%' and '_' (and the escape character '\' itself) are
+// escaped before the query is bound, so a term such as "foo_bar" matches only
+// paths containing exactly "foo_bar" and not, say, "fooxbar". Case sensitivity
+// is whatever SQLite's LIKE applies to the column.
+//
+// Snippet is left empty on every hit. If row iteration fails part-way, the hits
+// read so far are returned together with the error.
+func vaultSearchLike(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
+	const qLike = `
+		SELECT rel_path, size, mtime, mime, bucket, sub_bucket
+		FROM files
+		WHERE rel_path LIKE '%' || ? || '%' ESCAPE '\'
+		ORDER BY mtime DESC
+		LIMIT ?`
+
+	// Replacer works in a single pass, so the backslashes it inserts are not
+	// themselves re-escaped.
+	pattern := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(query)
+
+	rows, err := db.Query(qLike, pattern, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var hits []VaultSearchHit
+	for rows.Next() {
+		h := VaultSearchHit{VaultPath: vaultPath, VaultName: vaultName}
+		if err := rows.Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket); err != nil {
+			return nil, err
+		}
+		hits = append(hits, h)
+	}
+	return hits, rows.Err()
+}
+
+// resolveVaultName derives a display name for the vault: the last path element
+// of vaultPath once symlinks have been resolved. When the path cannot be
+// resolved (for example it does not exist), the last element of vaultPath
+// itself is used instead.
+func resolveVaultName(vaultPath string) string {
+	if target, err := filepath.EvalSymlinks(vaultPath); err == nil {
+		return filepath.Base(target)
+	}
+	return filepath.Base(vaultPath)
+}
+
+// safeFTSQuery prepares a user-supplied query for an FTS5 MATCH.
+//
+// The query is trimmed. An empty result is returned as-is. If the trimmed query
+// contains a ':' (column-filter syntax) or one of the FTS5 boolean operators
+// " AND ", " OR ", " NOT ", it is assumed to be deliberate FTS5 syntax and is
+// passed through unchanged — such a query may still be rejected by FTS5 (e.g.
+// "foo:bar:"), and the caller is expected to cope with that error.
+//
+// Every other query is turned into a single quoted phrase, with embedded
+// double-quotes doubled, so that characters like '-' in "hello-world" cannot
+// trip the FTS5 parser.
+//
+// Operator detection is case-sensitive because FTS5 only treats the upper-case
+// spellings as operators; lower-case "and"/"or"/"not" are ordinary words and
+// must not disable quoting.
+func safeFTSQuery(query string) string {
+	q := strings.TrimSpace(query)
+	if q == "" {
+		return q
+	}
+	// Explicit FTS5 syntax from the user: leave it alone.
+	if strings.Contains(q, ":") ||
+		strings.Contains(q, " AND ") ||
+		strings.Contains(q, " OR ") ||
+		strings.Contains(q, " NOT ") {
+		return q
+	}
+	// Inside an FTS5 string a literal double-quote is written as two.
+	return `"` + strings.ReplaceAll(q, `"`, `""`) + `"`
+}
+
+// isFTSSyntaxError reports whether err looks like a query-parser failure, judged
+// by a case-insensitive search of its message for "syntax error" or
+// "no such column". A nil error yields false.
+func isFTSSyntaxError(err error) bool {
+	if err == nil {
+		return false
+	}
+	lowered := strings.ToLower(err.Error())
+	for _, marker := range []string{"syntax error", "no such column", "fts5: syntax error"} {
+		if strings.Contains(lowered, marker) {
+			return true
+		}
+	}
+	return false
+}
+
+// simplifyForLike extracts a clean substring from query suitable for LIKE matching.
+// When the query contains FTS5 special characters (colons, double-quotes, operators),
+// only the first word-like sequence of alphanumeric/underscore/hyphen characters is
+// used. This ensures the LIKE fallback remains useful even when the FTS5 query is
+// syntactically complex or contains column-prefix syntax like "foo:bar:".
+func simplifyForLike(query string) string {
+	q := strings.TrimSpace(query)
+	var token strings.Builder
+	for _, r := range q {
+		if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '_' || r == '-' {
+			token.WriteRune(r)
+		} else if token.Len() > 0 {
+			break
+		}
+	}
+	return token.String()
+}
@@ -0,0 +1,61 @@
+---
+name: vault_search
+kind: function
+lang: go
+domain: infra
+version: "1.0.0"
+purity: impure
+signature: "func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error)"
+description: "Busca en vault_index.db de un vault combinando FTS5 MATCH sobre files_fts con LIKE sobre rel_path; los resultados se fusionan sin duplicados por rel_path. Si el query rompe el parser FTS5, los hits provienen solo del LIKE. El campo Snippet se devuelve vacio actualmente."
+tags: [vault, search, fts5, sqlite, infra]
+uses_functions: ["vault_index_open_go_infra"]
+uses_types: ["vault_file_go_infra"]
+returns: []
+returns_optional: false
+error_type: "error_go_core"
+imports: [database/sql, fmt, path/filepath, strings]
+params:
+  - name: vaultPath
+    desc: "ruta absoluta al directorio raiz del vault (puede ser symlink)"
+  - name: query
+    desc: "termino o frase de busqueda; se escapa automaticamente para FTS5 salvo que ya incluya operadores booleanos o prefijos de columna"
+  - name: limit
+    desc: "maximo de resultados; si es <= 0 se usa 50"
+output: "slice de VaultSearchHit ordenado por rank FTS5 (o mtime DESC en fallback LIKE); slice vacio si no hay resultados"
+tested: true
+tests:
+  - "FTS match devuelve hit con snippet"
+  - "query sin resultados retorna slice vacio"
+  - "limit se respeta"
+  - "query FTS invalida activa fallback LIKE"
+  - "limit cero usa 50 por defecto"
+test_file_path: "functions/infra/vault_search_test.go"
+file_path: "functions/infra/vault_search.go"
+---
+
+## Ejemplo
+
+```go
+hits, err := infra.VaultSearch("/home/lucas/vaults/turismo_spain", "hoteles", 20)
+if err != nil {
+    log.Fatal(err)
+}
+for _, h := range hits {
+    fmt.Printf("[%s] %s  %s\n", h.VaultName, h.RelPath, h.Snippet)
+}
+```
+
+## Notas
+
+`VaultSearchHit` es un struct local definido en este archivo (no en `vault_file.go`)
+porque combina campos de `files` + metadatos de contexto de busqueda (Snippet, VaultPath, VaultName).
+
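+**FTS5 safety:** el helper `safeFTSQuery` envuelve la query en comillas dobles
+cuando no contiene operadores booleanos ni prefijos de columna. Esto evita errores
+del parser en tokens como `hello-world`. Una query que contiene `:` (p. ej. `foo:bar:`)
+se pasa sin modificar; si FTS5 la rechaza, los resultados provienen de la busqueda LIKE.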
+
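+**Busqueda LIKE:** ademas del MATCH (falle o no), mientras no se haya alcanzado `limit`
+se ejecuta `WHERE rel_path LIKE '%' || token || '%'`, donde `token` es la primera
+secuencia de letras, digitos, `_` o `-` de la query. Todos los hits tienen actualmente `Snippet=""`.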
+
+**VaultName:** se deriva del `filepath.Base(filepath.EvalSymlinks(vaultPath))`.
+Si `EvalSymlinks` falla (e.g. symlink roto), usa `filepath.Base(vaultPath)`.
@@ -0,0 +1,147 @@
+package infra
+
+import (
+	"testing"
+	"time"
+)
+
+// openTestVaultDir returns a new temp directory whose vault index has been
+// initialised by opening it once with VaultIndexOpen and closing it again.
+func openTestVaultDir(t *testing.T) string {
+	t.Helper()
+
+	vaultDir := t.TempDir()
+	index, err := VaultIndexOpen(vaultDir)
+	if err != nil {
+		t.Fatalf("VaultIndexOpen: %v", err)
+	}
+	index.Close()
+
+	return vaultDir
+}
+
+// seedVaultFile inserts a row into files + files_fts.
+func seedVaultFile(t *testing.T, dir, relPath, mime, bucket, subBucket, contentText string, size int64) {
+	t.Helper()
+	db, err := VaultIndexOpen(dir)
+	if err != nil {
+		t.Fatalf("VaultIndexOpen seed: %v", err)
+	}
+	defer db.Close()
+
+	now := time.Now().Unix()
+	_, err = db.Exec(`
+		INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
+		VALUES (?, ?, ?, 'aabbccdd', ?, '', ?, ?, ?)`,
+		relPath, size, now, mime, bucket, subBucket, now,
+	)
+	if err != nil {
+		t.Fatalf("seed files: %v", err)
+	}
+	_, err = db.Exec(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`, relPath, contentText)
+	if err != nil {
+		t.Fatalf("seed files_fts: %v", err)
+	}
+}
+
+// --- Tests ---
+
+// TestVaultSearch_FTSMatch seeds two files with different content_text and
+// checks that a term present in only one of them returns exactly that file,
+// with a non-empty VaultName.
+func TestVaultSearch_FTSMatch(t *testing.T) {
+	t.Run("FTS match devuelve hit con snippet", func(t *testing.T) {
+		vault := openTestVaultDir(t)
+		seedVaultFile(t, vault, "data/raw/informe.csv", "text/csv", "data", "raw",
+			"ventas trimestrales empresa iberica", 1024)
+		seedVaultFile(t, vault, "data/raw/other.csv", "text/csv", "data", "raw",
+			"productos inventario almacen", 512)
+
+		got, err := VaultSearch(vault, "ventas", 10)
+		if err != nil {
+			t.Fatalf("VaultSearch: %v", err)
+		}
+		if n := len(got); n != 1 {
+			t.Fatalf("got %d hits, want 1", n)
+		}
+
+		first := got[0]
+		if first.RelPath != "data/raw/informe.csv" {
+			t.Errorf("RelPath = %q, want data/raw/informe.csv", first.RelPath)
+		}
+		if first.VaultName == "" {
+			t.Errorf("VaultName should not be empty")
+		}
+	})
+}
+
+// TestVaultSearch_NoMatch checks that a term found neither in content nor in
+// any path produces zero hits and no error.
+func TestVaultSearch_NoMatch(t *testing.T) {
+	t.Run("query sin resultados retorna slice vacio", func(t *testing.T) {
+		vault := openTestVaultDir(t)
+		seedVaultFile(t, vault, "data/raw/file.csv", "text/csv", "data", "raw", "some content", 100)
+
+		got, err := VaultSearch(vault, "zzznomatch", 10)
+		if err != nil {
+			t.Fatalf("VaultSearch: %v", err)
+		}
+		if n := len(got); n != 0 {
+			t.Errorf("got %d hits, want 0", n)
+		}
+	})
+}
+
+// TestVaultSearch_LimitRespected seeds ten files (filea.csv … filej.csv) that
+// all match the query and checks that an explicit limit of 3 caps the result.
+func TestVaultSearch_LimitRespected(t *testing.T) {
+	t.Run("limit se respeta", func(t *testing.T) {
+		vault := openTestVaultDir(t)
+		for suffix := 'a'; suffix < 'a'+10; suffix++ {
+			relPath := "data/raw/file" + string(suffix) + ".csv"
+			seedVaultFile(t, vault, relPath, "text/csv", "data", "raw", "common keyword everywhere", 100)
+		}
+
+		got, err := VaultSearch(vault, "common", 3)
+		if err != nil {
+			t.Fatalf("VaultSearch: %v", err)
+		}
+		if n := len(got); n != 3 {
+			t.Errorf("got %d hits, want 3", n)
+		}
+	})
+}
+
+// TestVaultSearch_BadFTSQuery_FallbackLike checks that a query FTS5 is expected
+// to reject still produces hits through the rel_path LIKE search, and that those
+// hits carry no snippet.
+func TestVaultSearch_BadFTSQuery_FallbackLike(t *testing.T) {
+	t.Run("query FTS invalida activa fallback LIKE", func(t *testing.T) {
+		vault := openTestVaultDir(t)
+		// Empty content_text: only the path ("…foobar…") can match.
+		seedVaultFile(t, vault, "data/raw/foobar_report.csv", "text/csv", "data", "raw", "", 200)
+
+		// "foo:bar:" contains ':' so safeFTSQuery leaves it unquoted; FTS5 is
+		// expected to reject it, leaving the LIKE search on the token "foo".
+		got, err := VaultSearch(vault, "foo:bar:", 10)
+		if err != nil {
+			t.Fatalf("VaultSearch: %v", err)
+		}
+		if len(got) == 0 {
+			t.Errorf("expected fallback LIKE to find foobar_report.csv, got 0 hits")
+		}
+		for i := range got {
+			if snippet := got[i].Snippet; snippet != "" {
+				t.Errorf("fallback hits should have empty Snippet, got %q", snippet)
+			}
+		}
+	})
+}
+
+// TestVaultSearch_LimitZeroDefaults seeds 55 matching files and checks that a
+// limit of 0 falls back to the default cap of 50.
+func TestVaultSearch_LimitZeroDefaults(t *testing.T) {
+	t.Run("limit cero usa 50 por defecto", func(t *testing.T) {
+		vault := openTestVaultDir(t)
+		// 55 unique names: a letter for the tens ('a' for 0-9, 'b' for 10-19, …)
+		// followed by the units digit, i.e. doca0 … docf4.
+		for i := 0; i < 55; i++ {
+			tens := string(rune('a' + i/10))
+			units := string(rune('0' + i%10))
+			relPath := "data/raw/doc" + tens + units + ".csv"
+			seedVaultFile(t, vault, relPath, "text/csv", "data", "raw", "keyword alpha beta", 100)
+		}
+
+		got, err := VaultSearch(vault, "keyword", 0)
+		if err != nil {
+			t.Fatalf("VaultSearch: %v", err)
+		}
+		if n := len(got); n != 50 {
+			t.Errorf("got %d hits, want 50 (default limit)", n)
+		}
+	})
+}