chore: auto-commit (95 archivos)

- cmd/fn/doctor.go
- cmd/fn/main.go
- cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt
- cpp/apps/primitives_gallery/playground/tables/data_table.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp
- cpp/apps/primitives_gallery/playground/tables/data_table_logic.h
- cpp/apps/primitives_gallery/playground/tables/self_test.cpp
- cpp/apps/primitives_gallery/playground/tables/tql.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.cpp
- cpp/apps/primitives_gallery/playground/tables/viz.h
- ...

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-13 00:50:34 +02:00
parent a2bbf23374
commit e3c8979e8d
189 changed files with 18964 additions and 330 deletions
@@ -0,0 +1,73 @@
---
name: cuda_toolkit_check
kind: function
lang: bash
domain: infra
version: "1.0.0"
purity: impure
signature: "cuda_toolkit_check() -> void"
description: "Detecta componentes CUDA instalados en el sistema y emite pares key=value a stdout: nvcc (version o missing), nvidia_smi (present/missing), driver_version, cuda_libs (path o missing) y overall (ok|partial|missing). Exit code 0 siempre — funcion informativa, no fatal."
tags: [cuda, nvidia, gpu, hardware, probe, infra, toolkit]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: []
params:
- name: (ninguno)
desc: "No toma parametros. Lee el estado del sistema via nvcc, nvidia-smi y busqueda en rutas canonicas de CUDA."
output: "Cinco pares key=value en stdout: nvcc, nvidia_smi, driver_version, cuda_libs, overall. overall=ok si los tres componentes principales estan presentes; partial si algunos; missing si ninguno."
tested: false
tests: []
test_file_path: ""
file_path: "bash/functions/infra/cuda_toolkit_check.sh"
---
## Ejemplo
```bash
source bash/functions/infra/cuda_toolkit_check.sh
cuda_toolkit_check
```
Salida en maquina con CUDA completo:
```
nvcc=12.4
nvidia_smi=present
driver_version=550.54.15
cuda_libs=/usr/local/cuda
overall=ok
```
Salida en maquina sin CUDA:
```
nvcc=missing
nvidia_smi=missing
driver_version=missing
cuda_libs=missing
overall=missing
```
Invocar directamente:
```bash
bash bash/functions/infra/cuda_toolkit_check.sh
```
Parsear desde otro script:
```bash
while IFS='=' read -r key value; do printf -v "$key" '%s' "$value"; done < <(cuda_toolkit_check)
echo "CUDA overall: $overall"
if [[ "$overall" == "ok" ]]; then
echo "CUDA completo: nvcc=$nvcc driver=$driver_version libs=$cuda_libs"
fi
```
## Notas
- Idempotente: no instala, no modifica nada, solo consulta.
- Exit code 0 siempre — ausencia de CUDA es informacion, no fallo.
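- Comprueba la existencia de `/usr/local/cuda`, `/usr/local/cuda-*`, `/opt/cuda`, `/opt/cuda-*` y de `libcuda.so*` en `/usr/lib/x86_64-linux-gnu` y `/usr/lib/aarch64-linux-gnu`; si nada de eso existe, busca `libcuda.so` via `ldconfig -p`.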
- `driver_version` refleja el driver NVIDIA del kernel, reportado por nvidia-smi.
- `nvcc` reporta la version del compilador CUDA toolkit (puede diferir de la version soportada por el driver).
- Para obtener la version CUDA maxima soportada por el driver, usar `get_gpu_info_go_infra` (campo CudaVersion del struct GpuInfo).
@@ -0,0 +1,99 @@
#!/usr/bin/env bash
# cuda_toolkit_check — Detect which CUDA components are installed on this host.
#
# Writes exactly five key=value lines to stdout, in this order:
#   nvcc=<version|present|missing>     "present" when nvcc exists but its
#                                      version banner could not be parsed
#   nvidia_smi=<present|missing>
#   driver_version=<version|missing>
#   cuda_libs=<path|missing>
#   overall=<ok|partial|missing>       ok = nvcc, nvidia-smi and cuda_libs all
#                                      found; missing = none; partial = some
#
# Arguments: none.
# Returns:   0 always (informational probe, never fatal), including when the
#            caller runs with `set -e` and/or `set -o pipefail`.
# Side effects: none; it only queries nvcc, nvidia-smi, the filesystem and
#            ldconfig, and every working variable is local.
cuda_toolkit_check() {
  local nvcc_ver="missing"
  local nvidia_smi_status="missing"
  local driver_version="missing"
  local cuda_libs_path="missing"

  # --- nvcc ---
  if command -v nvcc &>/dev/null; then
    # `nvcc --version` prints a banner such as:
    #   Cuda compilation tools, release 12.4, V12.4.131
    local raw
    raw="$(nvcc --version 2>&1)" || true
    # Bash regex instead of `grep -oP`: no PCRE grep required, and a
    # non-match cannot produce a failing command under `set -e`.
    local release_re='release ([0-9]+\.[0-9]+)'
    if [[ "$raw" =~ $release_re ]]; then
      nvcc_ver="${BASH_REMATCH[1]}"
    else
      nvcc_ver="present"
    fi
  fi

  # --- nvidia-smi + driver_version ---
  if command -v nvidia-smi &>/dev/null; then
    nvidia_smi_status="present"
    local drv
    drv="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n1 | tr -d ' ')" || true
    # Only accept something that looks like a dotted version number; when
    # nvidia-smi cannot query the driver its stdout is not a version.
    local version_re='^[0-9]+(\.[0-9]+)*$'
    if [[ "$drv" =~ $version_re ]]; then
      driver_version="$drv"
    fi
  fi

  # --- cuda_libs: look in the canonical locations ---
  # Globs are expanded here, at array construction; a pattern that matches
  # nothing stays literal and is rejected by the -e test below.
  local candidate
  local -a search_paths=(
    "/usr/local/cuda"
    "/usr/local/cuda-"*
    "/opt/cuda"
    "/opt/cuda-"*
    "/usr/lib/x86_64-linux-gnu/libcuda.so"*
    "/usr/lib/aarch64-linux-gnu/libcuda.so"*
  )
  for candidate in "${search_paths[@]}"; do
    [[ -e "$candidate" ]] || continue
    if [[ -d "$candidate" ]]; then
      # A toolkit root such as /usr/local/cuda-12.4 is reported as-is.
      cuda_libs_path="$candidate"
    else
      # A libcuda.so* file: report the directory that holds it, the same
      # shape the ldconfig fallback below produces.
      cuda_libs_path="$(dirname -- "$candidate")"
    fi
    break
  done

  # Fallback: no known location matched, ask the dynamic linker cache.
  if [[ "$cuda_libs_path" == "missing" ]]; then
    local libcuda
    libcuda="$(ldconfig -p 2>/dev/null | grep 'libcuda\.so' | head -n1 | awk '{print $NF}')" || true
    if [[ -n "$libcuda" ]]; then
      cuda_libs_path="$(dirname -- "$libcuda")"
    fi
  fi

  # --- overall ---
  # Plain assignments instead of `((found_count++))`: the post-increment
  # evaluates to 0 the first time, which is a failing status under `set -e`.
  local found_count=0
  if [[ "$nvcc_ver" != "missing" ]]; then found_count=$((found_count + 1)); fi
  if [[ "$nvidia_smi_status" != "missing" ]]; then found_count=$((found_count + 1)); fi
  if [[ "$cuda_libs_path" != "missing" ]]; then found_count=$((found_count + 1)); fi

  local overall
  case "$found_count" in
    0) overall="missing" ;;
    3) overall="ok" ;;
    *) overall="partial" ;;
  esac

  # --- emit results ---
  printf '%s\n' \
    "nvcc=${nvcc_ver}" \
    "nvidia_smi=${nvidia_smi_status}" \
    "driver_version=${driver_version}" \
    "cuda_libs=${cuda_libs_path}" \
    "overall=${overall}"
  return 0
}
# When the file is executed (not sourced), run the probe with the given
# arguments; when it is sourced, only the function definition is loaded.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || cuda_toolkit_check "$@"
@@ -0,0 +1,111 @@
#!/usr/bin/env bash
# Tests for cuda_toolkit_check.
# Smoke test: checks that stdout contains every required key and that the
# exit code is 0.
set -uo pipefail
# Note: `set -e` is deliberately NOT used, so that failed asserts accumulate
# instead of aborting the script at the first failure.
# Directory holding this test file; the script under test is sourced from its parent.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../cuda_toolkit_check.sh"
# Global counters updated by the assert_* helpers.
PASS=0
FAIL=0
# assert_eq NAME EXPECTED ACTUAL
# Prints "PASS: NAME" and bumps PASS when both strings are identical;
# otherwise prints a FAIL line showing both values and bumps FAIL.
assert_eq() {
  local label="$1" want="$2" have="$3"
  if [[ "$want" != "$have" ]]; then
    echo "FAIL: $label — expected '$want', got '$have'"
    FAIL=$((FAIL + 1))
    return 0
  fi
  echo "PASS: $label"
  PASS=$((PASS + 1))
}
# assert_contains NAME NEEDLE HAYSTACK
# Passes when NEEDLE occurs literally anywhere in HAYSTACK.
# Updates the global PASS / FAIL counters and prints one result line.
assert_contains() {
  local test_name="$1" needle="$2" haystack="$3"
  # Pure-bash substring match: a needle starting with "-" is not mistaken for
  # a grep option, and there is no `echo | grep -q` pipeline that could be
  # reported as failed under `pipefail`.
  if [[ "$haystack" == *"$needle"* ]]; then
    echo "PASS: $test_name"
    PASS=$((PASS + 1))
  else
    echo "FAIL: $test_name — '$needle' not found in output"
    FAIL=$((FAIL + 1))
  fi
}
# assert_matches_pattern NAME PATTERN VALUE
# Passes when any line of VALUE matches the extended regex PATTERN
# (grep -E semantics). Updates the global PASS / FAIL counters.
assert_matches_pattern() {
  local test_name="$1" pattern="$2" value="$3"
  # Here-string instead of `echo`: values such as "-n" or "-e" reach grep
  # unchanged. `--` keeps a pattern that starts with "-" from being parsed
  # as an option.
  if grep -qE -- "$pattern" <<<"$value"; then
    echo "PASS: $test_name"
    PASS=$((PASS + 1))
  else
    echo "FAIL: $test_name — '$value' does not match pattern '$pattern'"
    FAIL=$((FAIL + 1))
  fi
}
# assert_nonempty NAME VALUE
# Passes when VALUE is a non-empty string; updates the global PASS / FAIL
# counters and prints one result line.
assert_nonempty() {
  local label="$1" actual="$2"
  if [[ -z "$actual" ]]; then
    echo "FAIL: $label — valor vacio"
    FAIL=$((FAIL + 1))
  else
    echo "PASS: $label"
    PASS=$((PASS + 1))
  fi
}
# --- Capture the output once ---
OUTPUT="$(cuda_toolkit_check)"
EXIT_CODE=$?

# --- Test: exit code is 0 ---
assert_eq "exit code es 0" "0" "$EXIT_CODE"

# --- Test: every documented key appears in stdout ---
for key in nvcc nvidia_smi driver_version cuda_libs overall; do
  assert_contains "stdout contiene clave ${key}=" "${key}=" "$OUTPUT"
done

# value_of KEY — print the second '='-separated field of the OUTPUT line
# that starts with "KEY=".
value_of() {
  grep "^$1=" <<<"$OUTPUT" | cut -d= -f2
}

# --- Test: overall has a valid value (ok|partial|missing) ---
OVERALL_VAL="$(value_of overall)"
assert_matches_pattern "overall tiene valor valido ok|partial|missing" "^(ok|partial|missing)$" "$OVERALL_VAL"

# --- Test: nvcc has a non-empty value ---
NVCC_VAL="$(value_of nvcc)"
assert_nonempty "nvcc tiene valor no vacio" "$NVCC_VAL"

# --- Test: nvidia_smi has a valid value (present|missing) ---
SMI_VAL="$(value_of nvidia_smi)"
assert_matches_pattern "nvidia_smi tiene valor valido present|missing" "^(present|missing)$" "$SMI_VAL"

# --- Test: driver_version has a non-empty value ---
DRV_VAL="$(value_of driver_version)"
assert_nonempty "driver_version tiene valor no vacio" "$DRV_VAL"

# --- Test: cuda_libs has a non-empty value ---
LIBS_VAL="$(value_of cuda_libs)"
assert_nonempty "cuda_libs tiene valor no vacio" "$LIBS_VAL"

# --- Test: the output has exactly 5 lines ---
LINE_COUNT="$(wc -l <<<"$OUTPUT" | tr -d ' ')"
assert_eq "salida tiene exactamente 5 lineas" "5" "$LINE_COUNT"

# --- Test: a second invocation yields the same result ---
OUTPUT2="$(cuda_toolkit_check)"
assert_eq "segunda invocacion produce mismo resultado (idempotente)" "$OUTPUT" "$OUTPUT2"

# --- Summary ---
echo "---"
echo "Results: $PASS passed, $FAIL failed"
if (( FAIL != 0 )); then
  exit 1
fi
+90
View File
@@ -0,0 +1,90 @@
---
name: vault_audit
kind: pipeline
lang: bash
domain: pipelines
version: "1.0.0"
purity: impure
signature: "vault_audit(<vault_name> | --all) [--skip-profilers] [--dry-run-layout] -> void"
description: "Pipeline completo de auditoria para uno o todos los vaults declarados: layout-ensure, index, profile (csv/pdf/md), dedupe, aggregate y doctor. Produce tabla resumen con estado por vault y codigo de salida 4 si hay warnings."
tags: [vault, audit, pipeline, launcher, infra, bash]
uses_functions:
- vault_layout_ensure_go_infra
- vault_inventory_scan_go_infra
- vault_index_open_go_infra
- vault_index_write_go_infra
- vault_csv_profile_py_datascience
- vault_pdf_extract_py_datascience
- vault_knowledge_parse_py_infra
- vault_dedupe_report_py_infra
- vault_aggregate_index_go_infra
- vault_doctor_go_infra
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: []
params:
- name: vault_name
desc: "Nombre del vault a auditar (como aparece en registry.db tabla vaults). Usar --all para todos."
- name: --all
desc: "Audita todos los vaults declarados en registry.db. Mutuamente excluyente con vault_name."
- name: --skip-profilers
desc: "Omite el paso de profiling CSV/PDF/MD. Util para auditorias rapidas de inventario."
- name: --dry-run-layout
desc: "Pasa --dry-run a vault layout-ensure: calcula cambios sin tocar el disco."
output: "Tabla de resumen por vault con status ok/warn. Codigo de salida 0=exito, 1=root no localizable, 4=uno o mas vaults con warnings."
tested: false
tests: []
test_file_path: ""
file_path: "bash/functions/pipelines/vault_audit.sh"
---
## Ejemplo
```bash
# Auditar un vault especifico
FN_REGISTRY_ROOT=/home/lucas/fn_registry \
bash bash/functions/pipelines/vault_audit.sh turismo_spain
# Auditar todos los vaults
FN_REGISTRY_ROOT=/home/lucas/fn_registry \
bash bash/functions/pipelines/vault_audit.sh --all
# Todos los pasos salvo profile (sin profilers, mas rapido)
bash bash/functions/pipelines/vault_audit.sh turismo_spain --skip-profilers
# Ver que haria layout-ensure sin tocar disco
bash bash/functions/pipelines/vault_audit.sh turismo_spain --dry-run-layout
# Equivalente via fn run (desde la raiz del registry)
./fn run vault_audit_bash_pipelines turismo_spain
```
## Pasos del pipeline
1. **layout-ensure** — `fn vault layout-ensure <name>` asegura `data/{raw,processed,exports}` y `knowledge/{...}`.
2. **index** — `fn vault index <name>` escanea archivos y persiste en `vault_index.db`.
3. **profile** — `fn vault profile <name>` llama `vault_profile_dispatch.py` para CSV/PDF/MD.
4. **dedupe** — `fn vault dedupe <name>` detecta duplicados por sha256 (informacional, no fatal).
5. **aggregate** — `fn vault aggregate` copia todo a `registry.db` tabla `vault_files` (una sola vez al final).
6. **doctor** — `fn vault doctor` muestra estado de salud de todos los vaults.
## Codigos de salida
| Codigo | Significado |
|--------|-------------|
| 0 | Todos los vaults procesados sin errores |
| 1 | FN_REGISTRY_ROOT no localizable, fn binary no encontrado, argumentos invalidos o ningun vault registrado (con `--all`) |
| 4 | Uno o mas vaults con warnings (layout o index fallaron) |
## Variables de entorno
- `FN_REGISTRY_ROOT` — raiz del registry (auto-detectada si no esta seteada).
- `FN_BIN` — path al binario `fn` (default: `$FN_REGISTRY_ROOT/fn`).
## Notas
Requiere `sqlite3` en PATH para resolver la lista de vaults con `--all`.
El paso de profile es non-fatal: errores en profilers individuales se reportan como warnings.
El paso de dedupe es siempre informacional (no borra archivos).
+172
View File
@@ -0,0 +1,172 @@
#!/usr/bin/env bash
# vault_audit — Full audit pipeline for one or all declared vaults.
# Runs: layout-ensure → index → profile → dedupe → aggregate → doctor
#
# Usage:
# vault_audit.sh <vault_name>
# vault_audit.sh --all
# vault_audit.sh <vault_name> --skip-profilers
# vault_audit.sh <vault_name> --dry-run-layout
# vault_audit.sh --all --skip-profilers
set -euo pipefail
# --- locate FN_REGISTRY_ROOT ---
# Walks from the current directory towards "/" and prints the first directory
# that contains a registry.db file. Returns 0 on a hit and 1 when the walk
# reaches "/" without finding one ("/" itself is not inspected).
_find_registry_root() {
  local cursor
  cursor="$(pwd)"
  until [[ "$cursor" == "/" ]]; do
    if [[ -f "${cursor}/registry.db" ]]; then
      printf '%s\n' "$cursor"
      return 0
    fi
    cursor="$(dirname "$cursor")"
  done
  return 1
}
# An explicit FN_REGISTRY_ROOT wins when it really contains registry.db;
# otherwise fall back to searching upward from the current directory.
if [[ -z "${FN_REGISTRY_ROOT:-}" || ! -f "${FN_REGISTRY_ROOT}/registry.db" ]]; then
  if ! REGISTRY_ROOT="$(_find_registry_root 2>/dev/null)"; then
    echo "ERROR: Cannot locate registry.db. Set FN_REGISTRY_ROOT or run from registry root." >&2
    exit 1
  fi
else
  REGISTRY_ROOT="$FN_REGISTRY_ROOT"
fi

# The fn binary defaults to <registry root>/fn unless FN_BIN overrides it.
FN_BIN="${FN_BIN:-${REGISTRY_ROOT}/fn}"
[[ -x "$FN_BIN" ]] || {
  echo "ERROR: fn binary not found at $FN_BIN. Build with: CGO_ENABLED=1 go build -tags fts5 -o fn ./cmd/fn/" >&2
  exit 1
}
# --- parse args ---
# Flags may appear in any position; every non-flag word is taken as a vault name.
AUDIT_ALL=0
SKIP_PROFILERS=0
DRY_RUN_LAYOUT=0
VAULT_NAMES=()
START_TS=$(date +%s)
for arg in "$@"; do
  case "$arg" in
    --all)
      AUDIT_ALL=1
      ;;
    --skip-profilers)
      SKIP_PROFILERS=1
      ;;
    --dry-run-layout)
      DRY_RUN_LAYOUT=1
      ;;
    -*)
      echo "ERROR: Unknown flag: $arg" >&2
      echo "Usage: vault_audit.sh <name> | --all [--skip-profilers] [--dry-run-layout]" >&2
      exit 1
      ;;
    *)
      VAULT_NAMES+=("$arg")
      ;;
  esac
done
# Leave the positional parameters consumed, as a shift-based loop would.
shift "$#"

# Nothing to audit: neither --all nor a vault name was given.
if (( AUDIT_ALL == 0 && ${#VAULT_NAMES[@]} == 0 )); then
  echo "Usage: vault_audit.sh <vault_name> | --all [--skip-profilers] [--dry-run-layout]" >&2
  exit 1
fi
# --- resolve vault list ---
# With --all, the vault names come from the `vaults` table of registry.db and
# replace anything collected from the command line.
if [[ $AUDIT_ALL -eq 1 ]]; then
  # Report a missing sqlite3 explicitly instead of letting it look like an
  # empty registry.
  if ! command -v sqlite3 >/dev/null 2>&1; then
    echo "ERROR: sqlite3 not found in PATH; it is required to list vaults with --all." >&2
    exit 1
  fi
  # sqlite3's own error text is left on stderr so a failed query is visible.
  if ! VAULT_ROWS="$(sqlite3 "${REGISTRY_ROOT}/registry.db" "SELECT name FROM vaults ORDER BY name;")"; then
    echo "ERROR: Could not read the vaults table from ${REGISTRY_ROOT}/registry.db. Run 'fn index' first." >&2
    exit 1
  fi
  VAULT_NAMES=()
  # Guard against an empty result: a here-string of "" would yield one empty name.
  if [[ -n "$VAULT_ROWS" ]]; then
    mapfile -t VAULT_NAMES <<<"$VAULT_ROWS"
  fi
  if [[ ${#VAULT_NAMES[@]} -eq 0 ]]; then
    echo "No vaults registered in registry.db. Run 'fn index' first." >&2
    exit 1
  fi
  echo "Found ${#VAULT_NAMES[@]} vault(s): ${VAULT_NAMES[*]}"
fi
# --- build fn vault flags ---
# Extra arguments forwarded to `fn vault layout-ensure`.
LAYOUT_FLAGS=()
if (( DRY_RUN_LAYOUT == 1 )); then
  LAYOUT_FLAGS=(--dry-run)
fi
# --- per-vault audit ---
# VAULT_STATUS maps vault name -> "ok" | "warn"; the counters feed the summary.
declare -A VAULT_STATUS
PASS_COUNT=0
FAIL_COUNT=0
# audit_one NAME
# Runs the per-vault steps for one vault: layout-ensure, index, profile
# (unless SKIP_PROFILERS is 1) and dedupe. Aggregate is only announced here.
#
# Globals read:    FN_BIN, LAYOUT_FLAGS, SKIP_PROFILERS
# Globals written: VAULT_STATUS[NAME] ("ok" | "warn"), PASS_COUNT, FAIL_COUNT
# A layout-ensure or index failure marks the vault "warn"; profile and dedupe
# failures never change the status.
# NOTE: the `if ! cmd | sed` checks detect a failing `cmd` only because the
# script enables `set -o pipefail`.
audit_one() {
  local name="$1"
  local vault_ok=1
  echo ""
  echo "=== vault: $name ==="

  # Step 1: layout-ensure
  echo " [1/5] layout-ensure"
  # ${arr[@]+"${arr[@]}"} expands to nothing for an empty array; the plain
  # "${arr[@]}" form is an "unbound variable" error under `set -u` on
  # bash releases older than 4.4.
  if ! "$FN_BIN" vault layout-ensure "$name" ${LAYOUT_FLAGS[@]+"${LAYOUT_FLAGS[@]}"} 2>&1 | sed 's/^/ /'; then
    echo " WARN: layout-ensure failed (non-fatal)" >&2
    vault_ok=0
  fi

  # Step 2: index
  echo " [2/5] index"
  if ! "$FN_BIN" vault index "$name" 2>&1 | sed 's/^/ /'; then
    echo " ERROR: index failed" >&2
    vault_ok=0
  fi

  # Step 3: profile (errors are reported but do not affect the status)
  if [[ $SKIP_PROFILERS -eq 0 ]]; then
    echo " [3/5] profile"
    if ! "$FN_BIN" vault profile "$name" 2>&1 | sed 's/^/ /'; then
      echo " WARN: profile had errors (non-fatal)" >&2
    fi
  else
    echo " [3/5] profile (skipped)"
  fi

  # Step 4: dedupe (informational, non-fatal)
  echo " [4/5] dedupe"
  "$FN_BIN" vault dedupe "$name" 2>&1 | sed 's/^/ /' || true

  # Step 5 deferred — aggregate runs once at the end
  echo " [5/5] aggregate (deferred)"

  if [[ $vault_ok -eq 1 ]]; then
    VAULT_STATUS["$name"]="ok"
    PASS_COUNT=$((PASS_COUNT + 1))
  else
    VAULT_STATUS["$name"]="warn"
    FAIL_COUNT=$((FAIL_COUNT + 1))
  fi
}
for vault_name in "${VAULT_NAMES[@]}"; do
  audit_one "$vault_name"
done

# --- aggregate (once, after all vaults) ---
echo ""
echo "=== aggregate ==="
# Record the status instead of letting `set -e` abort here: an aggregate
# failure must not suppress the doctor report and the per-vault summary.
# With pipefail the pipeline status is fn's own exit code.
AGGREGATE_RC=0
"$FN_BIN" vault aggregate 2>&1 | sed 's/^/ /' || AGGREGATE_RC=$?
if [[ $AGGREGATE_RC -ne 0 ]]; then
  echo "ERROR: aggregate failed (exit ${AGGREGATE_RC})" >&2
fi

# --- doctor (read-only health check) ---
echo ""
echo "=== doctor ==="
"$FN_BIN" vault doctor 2>&1 | sed 's/^/ /' || true

# --- summary table ---
END_TS=$(date +%s)
ELAPSED=$(( END_TS - START_TS ))
echo ""
echo "=== summary ==="
printf "%-30s %s\n" "VAULT" "STATUS"
printf "%-30s %s\n" "-----" "------"
for vault_name in "${VAULT_NAMES[@]}"; do
  status="${VAULT_STATUS[$vault_name]:-unknown}"
  printf "%-30s %s\n" "$vault_name" "$status"
done
echo ""
echo "Done: ${PASS_COUNT} ok, ${FAIL_COUNT} warn (${ELAPSED}s)"

# Exit status: an aggregate failure keeps the exit code it had when it
# aborted the script directly; otherwise 4 signals per-vault warnings.
if [[ $AGGREGATE_RC -ne 0 ]]; then
  exit "$AGGREGATE_RC"
fi
if [[ $FAIL_COUNT -gt 0 ]]; then
  exit 4
fi
exit 0
+1059
View File
File diff suppressed because it is too large Load Diff
+318
View File
@@ -0,0 +1,318 @@
package main
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
"time"
"fn-registry/functions/infra"
"fn-registry/registry"
)
// fnBinDir holds the temp directory for the compiled fn binary.
// It is created by TestMain and cleaned up at test end.
var fnBinDir string
var fnBinPath string
// TestMain compiles the fn binary once into a temporary directory, runs all
// tests against it, and removes the directory before the process exits.
//
// os.Exit terminates the process without running deferred calls, so cleanup
// is performed explicitly on every exit path instead of with defer.
func TestMain(m *testing.M) {
	var err error
	fnBinDir, err = os.MkdirTemp("", "fn-vault-test-*")
	if err != nil {
		fmt.Fprintf(os.Stderr, "create temp dir: %v\n", err)
		os.Exit(1)
	}
	// exit removes the temp directory and then terminates with code.
	exit := func(code int) {
		os.RemoveAll(fnBinDir)
		os.Exit(code)
	}
	fnBinPath = filepath.Join(fnBinDir, "fn")

	// Find registry root by walking up from current directory.
	regRoot, err := findRoot()
	if err != nil {
		fmt.Fprintf(os.Stderr, "find root: %v\n", err)
		exit(1)
	}

	cmd := exec.Command("go", "build", "-tags", "fts5", "-o", fnBinPath, ".")
	cmd.Dir = filepath.Join(regRoot, "cmd", "fn")
	if out, errB := cmd.CombinedOutput(); errB != nil {
		fmt.Fprintf(os.Stderr, "build fn: %v\n%s\n", errB, out)
		exit(1)
	}
	exit(m.Run())
}
func findRoot() (string, error) {
dir, err := os.Getwd()
if err != nil {
return "", err
}
for {
if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil {
return dir, nil
}
parent := filepath.Dir(dir)
if parent == dir {
return "", fmt.Errorf("could not find go.mod from %s", dir)
}
dir = parent
}
}
// ensureFnBin returns fnBinPath, the location TestMain builds the fn binary
// at. It performs no check of its own; t is only used to mark it as a helper.
func ensureFnBin(t *testing.T) string {
t.Helper()
return fnBinPath
}
// setupTestRegistry creates a minimal registry root for a test:
//   - a vault directory (in its own temp dir) holding data/raw/report.csv
//     and data/raw/notes.md
//   - projects/test_proj/vaults/vault.yaml declaring "test_vault" with the
//     vault directory as its path
//   - projects/test_proj/project.md
//   - registry.db, opened with registry.Open and populated with
//     registry.Index so the vault is registered
//
// Any failure aborts the test. Returns (repoRoot, vaultDir).
func setupTestRegistry(t *testing.T) (string, string) {
	t.Helper()
	repoRoot := t.TempDir()

	// mustMkdir / mustWrite abort the test on any filesystem error.
	mustMkdir := func(dir string) {
		t.Helper()
		if err := os.MkdirAll(dir, 0755); err != nil {
			t.Fatal(err)
		}
	}
	mustWrite := func(path, content string) {
		t.Helper()
		if err := os.WriteFile(path, []byte(content), 0644); err != nil {
			t.Fatal(err)
		}
	}

	// Create vault directory with files.
	vaultDir := filepath.Join(t.TempDir(), "test_vault")
	rawDir := filepath.Join(vaultDir, "data", "raw")
	mustMkdir(rawDir)
	mustWrite(filepath.Join(rawDir, "report.csv"), "name,value\nfoo,1")
	mustWrite(filepath.Join(rawDir, "notes.md"), "# Notes\nsome text")

	// Create project directory structure.
	projDir := filepath.Join(repoRoot, "projects", "test_proj")
	vaultsDir := filepath.Join(projDir, "vaults")
	mustMkdir(vaultsDir)

	// Create vault.yaml.
	vaultYAML := "vaults:\n - name: test_vault\n description: Test vault for unit tests\n path: " + vaultDir + "\n tags: [test]\n"
	mustWrite(filepath.Join(vaultsDir, "vault.yaml"), vaultYAML)

	// Create project.md.
	projMD := "---\nname: test_proj\ndescription: Test project\ntags: [test]\n---\n"
	mustWrite(filepath.Join(projDir, "project.md"), projMD)

	// Open registry.db.
	db, err := registry.Open(filepath.Join(repoRoot, "registry.db"))
	if err != nil {
		t.Fatalf("registry.Open: %v", err)
	}
	// Deferred so the handle is also released when Index fails below:
	// t.Fatalf unwinds the goroutine and runs deferred calls.
	defer db.Close()

	// Index so the vault is registered in registry.db.
	if _, err := registry.Index(db, repoRoot); err != nil {
		t.Fatalf("registry.Index: %v", err)
	}
	return repoRoot, vaultDir
}
// runFn runs the fn binary with args, using repoRoot as working directory.
//
// It returns the captured stdout, the captured stderr and the exit code.
// The exit code is 0 on success and the process's own code when it exits
// with a failure. When the command could not be run at all (for example the
// binary is missing), the error is logged and -1 is returned, so callers
// that only check `code != 0` do not mistake it for success.
func runFn(t *testing.T, repoRoot string, args ...string) (string, string, int) {
	t.Helper()
	cmd := exec.Command(ensureFnBin(t), args...)
	cmd.Dir = repoRoot
	var stdout, stderr strings.Builder
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	code := 0
	if err := cmd.Run(); err != nil {
		if exitErr, ok := err.(*exec.ExitError); ok {
			code = exitErr.ExitCode()
		} else {
			t.Logf("cmd error: %v", err)
			code = -1
		}
	}
	return stdout.String(), stderr.String(), code
}
// TestVaultList checks that `fn vault list` exits 0 and mentions the vault
// registered by setupTestRegistry.
func TestVaultList(t *testing.T) {
	root, _ := setupTestRegistry(t)
	stdout, errOut, exit := runFn(t, root, "vault", "list")
	if exit != 0 {
		t.Fatalf("fn vault list exit %d\nstderr: %s", exit, errOut)
	}
	if strings.Contains(stdout, "test_vault") {
		return
	}
	t.Errorf("expected 'test_vault' in output, got:\n%s", stdout)
}
// TestVaultIndex checks that `fn vault index <name>` exits 0 and reports
// "indexed" on stdout.
func TestVaultIndex(t *testing.T) {
	root, _ := setupTestRegistry(t)
	stdout, errOut, exit := runFn(t, root, "vault", "index", "test_vault")
	if exit != 0 {
		t.Fatalf("fn vault index exit %d\nstderr: %s\nstdout: %s", exit, errOut, stdout)
	}
	if found := strings.Contains(stdout, "indexed"); !found {
		t.Errorf("expected 'indexed' in output, got:\n%s", stdout)
	}
}
// TestVaultSearchJSON checks that `fn vault search --json` exits 0 and
// prints a valid JSON array. The array may be empty: only the shape of the
// output is asserted, not the number of hits.
func TestVaultSearchJSON(t *testing.T) {
	repoRoot, vaultDir := setupTestRegistry(t)

	// First index the vault so there is something to search.
	if _, _, code := runFn(t, repoRoot, "vault", "index", "test_vault"); code != 0 {
		t.Fatal("fn vault index failed")
	}

	// Seed some content into the vault index for the search to find.
	db, err := infra.VaultIndexOpen(vaultDir)
	if err != nil {
		t.Fatalf("VaultIndexOpen: %v", err)
	}
	// Replace the FTS row for report.csv. Seeding is best-effort (the test
	// accepts zero hits), but a failure is logged rather than silently
	// dropped so an empty result can be diagnosed.
	seed := []string{
		`DELETE FROM files_fts WHERE rel_path = 'data/raw/report.csv'`,
		`INSERT INTO files_fts(rel_path, content_text) VALUES ('data/raw/report.csv', 'foo report data')`,
	}
	for _, stmt := range seed {
		if _, execErr := db.Exec(stmt); execErr != nil {
			t.Logf("seeding files_fts failed (non-fatal): %v", execErr)
		}
	}
	// Closed before fn runs, as in the original ordering.
	db.Close()

	out, stderr, code := runFn(t, repoRoot, "vault", "search", "report", "--json", "--vault", "test_vault")
	if code != 0 {
		t.Fatalf("fn vault search exit %d\nstderr: %s", code, stderr)
	}

	var result []map[string]interface{}
	if err := json.Unmarshal([]byte(out), &result); err != nil {
		t.Fatalf("output is not valid JSON: %v\nraw: %s", err, out)
	}
	// Should be a JSON array (possibly empty if search finds nothing, but must be valid).
	t.Logf("search returned %d hits", len(result))
}
// TestVaultInfo checks that, after indexing, `fn vault info <name>` exits 0
// and prints both the vault name and a "Files:" line.
func TestVaultInfo(t *testing.T) {
	root, _ := setupTestRegistry(t)

	// Index first.
	_, _, indexExit := runFn(t, root, "vault", "index", "test_vault")
	if indexExit != 0 {
		t.Fatal("fn vault index failed")
	}

	stdout, errOut, exit := runFn(t, root, "vault", "info", "test_vault")
	if exit != 0 {
		t.Fatalf("fn vault info exit %d\nstderr: %s", exit, errOut)
	}
	hasName := strings.Contains(stdout, "test_vault")
	if !hasName {
		t.Errorf("expected vault name in output, got:\n%s", stdout)
	}
	hasFiles := strings.Contains(stdout, "Files:")
	if !hasFiles {
		t.Errorf("expected 'Files:' in output, got:\n%s", stdout)
	}
}
// TestFormatBytes checks formatBytes against a table of byte counts and the
// human-readable strings expected for them.
func TestFormatBytes(t *testing.T) {
	type sample struct {
		size int64
		want string
	}
	table := []sample{
		{size: 500, want: "500 B"},
		{size: 1024, want: "1.0 KB"},
		{size: 1536, want: "1.5 KB"},
		{size: 1048576, want: "1.0 MB"},
		{size: 1073741824, want: "1.0 GB"},
	}
	for i := range table {
		s := table[i]
		if have := formatBytes(s.size); have != s.want {
			t.Errorf("formatBytes(%d) = %q, want %q", s.size, have, s.want)
		}
	}
}
// TestVaultLayoutEnsure checks that `fn vault layout-ensure <name> --dry-run`
// exits 0 and mentions the vault name.
func TestVaultLayoutEnsure(t *testing.T) {
	root, _ := setupTestRegistry(t)
	stdout, errOut, exit := runFn(t, root, "vault", "layout-ensure", "test_vault", "--dry-run")
	if exit != 0 {
		t.Fatalf("fn vault layout-ensure exit %d\nstderr: %s\nstdout: %s", exit, errOut, stdout)
	}
	if strings.Contains(stdout, "test_vault") {
		return
	}
	t.Errorf("expected vault name in output, got:\n%s", stdout)
}
// TestVaultAggregate checks that `fn vault aggregate` exits 0 on a freshly
// created registry whose vault has just been indexed.
func TestVaultAggregate(t *testing.T) {
	root, _ := setupTestRegistry(t)

	// Index first so there is something to aggregate.
	_, _, indexExit := runFn(t, root, "vault", "index", "test_vault")
	if indexExit != 0 {
		t.Fatal("fn vault index failed")
	}

	if _, errOut, exit := runFn(t, root, "vault", "aggregate"); exit != 0 {
		t.Fatalf("fn vault aggregate exit %d\nstderr: %s", exit, errOut)
	}
}
// TestVaultDoctor checks that `fn vault doctor` exits 0 and its report
// mentions the test vault.
func TestVaultDoctor(t *testing.T) {
	root, _ := setupTestRegistry(t)
	stdout, errOut, exit := runFn(t, root, "vault", "doctor")
	if exit != 0 {
		t.Fatalf("fn vault doctor exit %d\nstderr: %s", exit, errOut)
	}
	if mentioned := strings.Contains(stdout, "test_vault"); !mentioned {
		t.Errorf("expected 'test_vault' in doctor output, got:\n%s", stdout)
	}
}
// TestVaultDedupe checks that `fn vault dedupe <name>` exits 0 after the
// vault has been indexed. Its stdout is not inspected: a "No duplicates"
// message and a table of duplicates are both acceptable.
func TestVaultDedupe(t *testing.T) {
	root, _ := setupTestRegistry(t)

	_, _, indexExit := runFn(t, root, "vault", "index", "test_vault")
	if indexExit != 0 {
		t.Fatal("fn vault index failed")
	}

	_, errOut, exit := runFn(t, root, "vault", "dedupe", "test_vault")
	if exit != 0 {
		t.Fatalf("fn vault dedupe exit %d\nstderr: %s", exit, errOut)
	}
}
// TestVaultAuditDryRun checks that
// `fn vault audit <name> --dry-run-layout --skip-profilers` finishes with an
// accepted exit code and prints a summary section.
func TestVaultAuditDryRun(t *testing.T) {
	root, _ := setupTestRegistry(t)
	stdout, errOut, exit := runFn(t, root, "vault", "audit", "test_vault",
		"--dry-run-layout", "--skip-profilers")

	switch exit {
	case 0, 4:
		// 0 = fully ok; 4 = warnings (layout issues) — both acceptable here.
	default:
		t.Fatalf("fn vault audit exit %d\nstderr: %s\nstdout: %s", exit, errOut, stdout)
	}

	if hasSummary := strings.Contains(stdout, "summary"); !hasSummary {
		t.Errorf("expected 'summary' section in audit output, got:\n%s", stdout)
	}
}
// Keeps the "time" import referenced so the file compiles: nothing else in
// the visible part of this file uses the package.
var _ = time.Now
+95
View File
@@ -44,6 +44,10 @@ func cmdDoctor(args []string) {
doctorUnused(r, jsonOut)
case "cpp-apps":
doctorCppApps(r, jsonOut)
case "ml":
doctorML(r, jsonOut)
case "vaults":
doctorVaults(r, jsonOut)
default:
fmt.Fprintf(os.Stderr, "unknown doctor subcommand: %s\n", sub)
doctorUsage()
@@ -65,6 +69,8 @@ Subcommands:
uses-functions Audit imports reales vs uses_functions del app.md
unused Funciones del registry sin consumidores
cpp-apps Conformidad de apps C++ con cpp/PATTERNS.md (cfg.about, dockspace, menubar)
ml Entorno ML: GPUs NVIDIA, CUDA toolkit, venv Python, paquetes torch/diffusers, CLIs y vault
vaults Salud de vaults: directorio, layout, índice, staleness, drift
Flags:
--json Salida JSON (para scripting/agentes)`)
@@ -103,6 +109,16 @@ func doctorAll(root string, jsonOut bool) {
} else {
all["cpp_apps_error"] = err.Error()
}
if v, err := infra.AuditMlEnv(root); err == nil {
all["ml"] = v
} else {
all["ml_error"] = err.Error()
}
if v, err := infra.VaultDoctor(root); err == nil {
all["vaults"] = v
} else {
all["vaults_error"] = err.Error()
}
emit(all)
return
}
@@ -119,6 +135,10 @@ func doctorAll(root string, jsonOut bool) {
doctorUnused(root, false)
fmt.Println("\n=== C++ apps standard conformance ===")
doctorCppApps(root, false)
fmt.Println("\n=== ML environment ===")
doctorML(root, false)
fmt.Println("\n=== Vaults ===")
doctorVaults(root, false)
}
func doctorCppApps(root string, jsonOut bool) {
@@ -280,6 +300,81 @@ func doctorUnused(root string, jsonOut bool) {
fmt.Printf("\n%d unused functions (candidates to remove).\n", len(unused))
}
// doctorVaults prints the result of infra.VaultDoctor for root.
//
// With jsonOut the entries are emitted as JSON. Otherwise it prints either a
// notice that no vaults are declared, or a table with one row per vault
// followed by a "healthy/total" count, where healthy means Status == "ok".
// An error from VaultDoctor is written to stderr and the process exits 1.
func doctorVaults(root string, jsonOut bool) {
	entries, err := infra.VaultDoctor(root)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(1)
	}

	switch {
	case jsonOut:
		emit(entries)
		return
	case len(entries) == 0:
		fmt.Println("No vaults declared (no projects/*/vaults/vault.yaml found).")
		return
	}

	tw := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	fmt.Fprintln(tw, "NAME\tSTATUS\tFILES\tINDEXED\tISSUES")
	healthy := 0
	for _, entry := range entries {
		// A vault without issues shows "-" in the ISSUES column.
		issueText := strings.Join(entry.Issues, "; ")
		if len(entry.Issues) == 0 {
			issueText = "-"
		}
		fmt.Fprintf(tw, "%s\t%s\t%d\t%d\t%s\n",
			entry.VaultName, entry.Status, entry.DiskFiles, entry.IndexedFiles, issueText)
		if entry.Status == "ok" {
			healthy++
		}
	}
	tw.Flush()
	fmt.Printf("\n%d/%d vaults healthy.\n", healthy, len(entries))
}
// doctorML prints the report returned by infra.AuditMlEnv for root.
//
// With jsonOut the report is emitted as JSON. Otherwise it prints the
// detected GPUs, a table with one row per check (empty version/detail shown
// as "-", detail truncated to 60 characters), and a final OK/INCOMPLETE
// line taken from report.OverallOK.
// An error from AuditMlEnv is written to stderr and the process exits 1.
func doctorML(root string, jsonOut bool) {
	report, err := infra.AuditMlEnv(root)
	if err != nil {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
		os.Exit(1)
	}
	if jsonOut {
		emit(report)
		return
	}

	fmt.Printf("GPUs detected: %d\n", len(report.Gpus))
	for _, g := range report.Gpus {
		fmt.Printf(" [%d] %s VRAM: %d/%d MiB Driver: %s CUDA: %s\n",
			g.Index, g.Name, g.VramFreeMb, g.VramTotalMb, g.DriverVersion, g.CudaVersion)
	}
	fmt.Println()

	const maxDetail = 60
	w := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
	fmt.Fprintln(w, "CHECK\tSTATUS\tVERSION\tDETAIL")
	for _, c := range report.Checks {
		version := c.Version
		if version == "" {
			version = "-"
		}
		detail := c.Detail
		// Truncate on rune boundaries: slicing the string by bytes could cut
		// a multi-byte UTF-8 character in half and print invalid text.
		// The byte-length test is a cheap pre-check (runes <= bytes).
		if len(detail) > maxDetail {
			if runes := []rune(detail); len(runes) > maxDetail {
				detail = string(runes[:maxDetail]) + "..."
			}
		}
		if detail == "" {
			detail = "-"
		}
		fmt.Fprintf(w, "%s\t%s\t%s\t%s\n", c.Name, c.Status, version, detail)
	}
	w.Flush()

	overall := "OK"
	if !report.OverallOK {
		overall = "INCOMPLETE"
	}
	fmt.Printf("\nOverall ML environment: %s\n", overall)
}
func emit(v any) {
b, err := json.MarshalIndent(v, "", " ")
if err != nil {
+3
View File
@@ -45,6 +45,8 @@ func main() {
cmdAnalysis(os.Args[2:])
case "sync":
cmdSync(os.Args[2:])
case "vault":
cmdVault(os.Args[2:])
case "doctor":
cmdDoctor(os.Args[2:])
case "help", "-h", "--help":
@@ -73,6 +75,7 @@ Usage:
fn app <list|clone|pull> Gestiona apps externas (Gitea)
fn analysis <list|clone|pull> Gestiona analyses externas (Gitea)
fn sync [status|locations] Sincroniza con servidor central
fn vault <list|search|index|info> Gestiona y busca en data vaults
fn doctor [artefacts|services|sync|uses-functions|unused] [--json]
Diagnostico read-only del registry`)
}
@@ -3,8 +3,10 @@ add_imgui_app(tables_playground
main.cpp
data_table.cpp
data_table_logic.cpp
llm_anthropic.cpp
lua_engine.cpp
tql.cpp
tql_to_sql.cpp
viz.cpp
)
target_link_libraries(tables_playground PRIVATE lua54 implot)
@@ -13,10 +15,13 @@ target_link_libraries(tables_playground PRIVATE lua54 implot)
add_executable(tables_playground_self_test
self_test.cpp
data_table_logic.cpp
llm_anthropic.cpp
lua_engine.cpp
tql.cpp
tql_to_sql.cpp
)
target_include_directories(tables_playground_self_test PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_SOURCE_DIR}/functions
)
target_link_libraries(tables_playground_self_test PRIVATE lua54)
@@ -1,20 +1,33 @@
#include "data_table.h"
#include "app_base.h"
#include "imgui.h"
#include "llm_anthropic.h"
#include "lua_engine.h"
#include "tql.h"
#include "tql_to_sql.h"
#include "viz.h"
#include <algorithm>
#include <cfloat>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <string>
#include <unordered_map>
namespace data_table {
// Today's date in UTC as ISO "YYYY-MM-DD". Used for the Last7/30/90d preset
// filters. Returns an empty string if the current time cannot be converted
// to a calendar date.
static std::string today_iso() {
    const std::time_t now = std::time(nullptr);
    std::tm utc{};
    // Convert into a caller-owned struct: std::gmtime returns a pointer to
    // shared static storage and may return nullptr, which the previous
    // unconditional dereference did not handle.
#if defined(_WIN32)
    if (gmtime_s(&utc, &now) != 0) return std::string();
#else
    if (gmtime_r(&now, &utc) == nullptr) return std::string();
#endif
    char buf[16];
    std::snprintf(buf, sizeof(buf), "%04d-%02d-%02d",
                  utc.tm_year + 1900, utc.tm_mon + 1, utc.tm_mday);
    return buf;
}
namespace {
// ---------------------------------------------------------------------------
@@ -122,10 +135,106 @@ struct UiState {
// Toggle Table <-> View: remember last non-table display.
ViewMode last_non_table_main = ViewMode::Bar;
// Drill history (fase 10). Stacks per-app; no persistido en TQL.
std::vector<DrillStep> drill_back;
std::vector<DrillStep> drill_forward;
// Row inspector (fase 10). -1 cerrado, sino row idx en el output del stage activo.
int inspect_row = -1;
bool inspect_open = false;
// Ask AI modal (fase 11 — issue 0080).
bool ask_open = false;
bool ask_busy = false;
int ask_mode = 0; // 0 = TQL, 1 = SQL
char ask_question[2048] = {0};
std::string ask_current_tql; // emit del state actual al abrir modal
std::string ask_response_raw; // texto del modelo
std::string ask_response_code; // bloque extraido (Lua o SQL)
std::string ask_error;
std::string ask_status; // "Sent. Waiting..." / "OK" / error
char ask_edit_buf[8192] = {0}; // buffer editable de propuesta
};
// Accessor for the single UiState instance shared by the drawing helpers.
// The instance is a function-local static, so it is created on first call.
UiState& ui() {
    static UiState instance;
    return instance;
}
// Row inspector modal (phase 10). Lists every column and its value for row
// U.inspect_row of the active stage's output. Offers "Copy TSV" and
// "Filter prev stage by this row", which adds one drill filter to stage
// active - 1 for each non-empty cell whose column name (granularity suffix
// stripped) is found in prev_input_headers.
//
//   st                 - query state; only mutated by the filter action.
//   active             - index of the active stage.
//   cells/rows/cols    - cell matrix indexed as cells[row * cols + col].
//   headers/types      - per-column names and types; either may be shorter
//                        than cols, missing entries are skipped or defaulted.
//   prev_input_headers - column names used to resolve filter column indices.
static void draw_row_inspector_modal(State& st, int active,
                                     const char* const* cells, int rows, int cols,
                                     const std::vector<std::string>& headers,
                                     const std::vector<ColumnType>& types,
                                     const std::vector<std::string>& prev_input_headers) {
    auto& U = ui();
    if (!U.inspect_open) return;
    if (U.inspect_row < 0 || U.inspect_row >= rows) {
        // The inspected row no longer exists: close silently.
        U.inspect_open = false;
        return;
    }

    ImGui::OpenPopup("##row_inspector");
    ImGui::SetNextWindowSize(ImVec2(560, 400), ImGuiCond_Appearing);
    if (ImGui::BeginPopupModal("##row_inspector", &U.inspect_open,
                               ImGuiWindowFlags_NoSavedSettings)) {
        ImGui::Text("Row %d", U.inspect_row);
        ImGui::SameLine(0, 20);
        if (ImGui::SmallButton("Copy TSV")) {
            std::string tsv = row_to_tsv(cells, rows, cols, U.inspect_row, headers);
            ImGui::SetClipboardText(tsv.c_str());
        }
        ImGui::SameLine();
        // The target stage must exist before st.stages[target] is indexed.
        const int target = active - 1;
        bool can_filter = (target >= 0 && target < (int)st.stages.size()
                           && !prev_input_headers.empty());
        ImGui::BeginDisabled(!can_filter);
        if (ImGui::SmallButton("Filter prev stage by this row")) {
            for (int c = 0; c < cols; ++c) {
                // headers may be shorter than cols (the display loop below
                // guards the same case); a column without a name cannot be
                // matched against prev_input_headers.
                if (c >= (int)headers.size()) continue;
                const char* v = cells[U.inspect_row * cols + c];
                if (!v || !*v) continue;
                const std::string& h = headers[c];
                std::string h_clean;
                parse_breakout_granularity(h, h_clean);
                int ci = -1;
                for (size_t i = 0; i < prev_input_headers.size(); ++i) {
                    if (prev_input_headers[i] == h_clean) { ci = (int)i; break; }
                }
                if (ci < 0) continue;
                DrillStep step;
                step.target_stage = target;
                step.filter_pos = (int)st.stages[target].filters.size();
                step.prev_active_stage = st.active_stage;
                step.added = make_drill_filter(ci, v);
                if (apply_drill_step(st, step)) {
                    U.drill_back.push_back(step);
                }
            }
            // A new drill branch invalidates the forward history.
            U.drill_forward.clear();
            U.inspect_open = false;
        }
        ImGui::EndDisabled();

        ImGui::Separator();
        ImGuiTableFlags flags = ImGuiTableFlags_Borders | ImGuiTableFlags_RowBg
                              | ImGuiTableFlags_ScrollY | ImGuiTableFlags_Resizable;
        if (ImGui::BeginTable("##inspector_tbl", 2, flags, ImVec2(-1, -1))) {
            ImGui::TableSetupColumn("col");
            ImGui::TableSetupColumn("value");
            ImGui::TableHeadersRow();
            for (int c = 0; c < cols; ++c) {
                ImGui::TableNextRow();
                ImGui::TableSetColumnIndex(0);
                ColumnType t = (c < (int)types.size()) ? types[c] : ColumnType::String;
                ImGui::Text("%s %s", column_type_icon(t),
                            (c < (int)headers.size()) ? headers[c].c_str() : "?");
                ImGui::TableSetColumnIndex(1);
                const char* v = cells[U.inspect_row * cols + c];
                ImGui::TextWrapped("%s", v ? v : "");
            }
            ImGui::EndTable();
        }
        ImGui::EndPopup();
    }
}
int autocomplete_cb(ImGuiInputTextCallbackData* data) {
UiState* U = (UiState*)data->UserData;
if (data->EventFlag == ImGuiInputTextFlags_CallbackAlways) {
@@ -180,6 +289,47 @@ void ensure_init(State& st, int eff_cols) {
// ---------------------------------------------------------------------------
void draw_stage_breadcrumb(State& st) {
st.ensure_stage0();
// Drill history back/forward (fase 10). Botones al inicio.
auto& U = ui();
{
bool can_back = !U.drill_back.empty();
ImGui::BeginDisabled(!can_back);
if (ImGui::SmallButton("<##drill_back")) {
DrillStep s = U.drill_back.back();
U.drill_back.pop_back();
if (undo_drill_step(st, s)) {
U.drill_forward.push_back(s);
}
}
ImGui::EndDisabled();
if (can_back && ImGui::IsItemHovered())
ImGui::SetTooltip("Drill back (%zu)", U.drill_back.size());
ImGui::SameLine();
bool can_fwd = !U.drill_forward.empty();
ImGui::BeginDisabled(!can_fwd);
if (ImGui::SmallButton(">##drill_fwd")) {
DrillStep s = U.drill_forward.back();
U.drill_forward.pop_back();
if (apply_drill_step(st, s)) {
U.drill_back.push_back(s);
}
}
ImGui::EndDisabled();
if (can_fwd && ImGui::IsItemHovered())
ImGui::SetTooltip("Drill forward (%zu)", U.drill_forward.size());
ImGui::SameLine();
bool can_up = (st.active_stage > 0);
ImGui::BeginDisabled(!can_up);
if (ImGui::SmallButton("^##drill_up")) drill_up(st);
ImGui::EndDisabled();
if (can_up && ImGui::IsItemHovered())
ImGui::SetTooltip("Drill up (stage previo, sin perder filters)");
ImGui::SameLine();
ImGui::TextDisabled("|");
ImGui::SameLine();
}
for (int si = 0; si < (int)st.stages.size(); ++si) {
if (si > 0) { ImGui::SameLine(); ImGui::TextDisabled(">"); ImGui::SameLine(); }
@@ -610,6 +760,19 @@ void draw_viz_selector(State& st) {
ImGui::OpenPopup("##viz_cfg_popup");
}
ImGui::SameLine();
if (ImGui::SmallButton("Ask AI##ask_open")) {
auto& U2 = ui();
U2.ask_open = true;
U2.ask_busy = false;
U2.ask_error.clear();
U2.ask_status.clear();
U2.ask_response_code.clear();
U2.ask_response_raw.clear();
U2.ask_current_tql = tql::emit(st,
std::vector<std::string>(), // emit headers stage 0 (caller fill si necesario)
std::vector<ColumnType>());
}
ImGui::SameLine();
if (ImGui::SmallButton("+ Viz##viz_add")) {
VizPanel p;
p.display = ViewMode::Bar;
@@ -737,7 +900,8 @@ void draw_joins_chips(State& st, const std::vector<TableInput>& joinables,
// Filter chips para el stage activo. eff_headers/eff_cols son del INPUT del
// stage activo (= orig+derived para stage 0; output del stage previo para 1+).
// ---------------------------------------------------------------------------
void draw_filter_chips(Stage& stg, const char* const* eff_headers, int eff_cols) {
void draw_filter_chips(Stage& stg, const char* const* eff_headers, int eff_cols,
const std::vector<ColumnType>& eff_types) {
auto& U = ui();
ImGui::PushStyleColor(ImGuiCol_Button, IM_COL32(120, 60, 170, 220));
ImGui::PushStyleColor(ImGuiCol_ButtonHovered, IM_COL32(150, 85, 200, 240));
@@ -746,6 +910,50 @@ void draw_filter_chips(Stage& stg, const char* const* eff_headers, int eff_cols)
ImGui::PopStyleColor(3);
ImGui::SameLine();
// Presets (fase 10): menu con Last7/30/90d (cols Date), ExcludeNulls (any),
// NonZero (cols numericas). Apply append a stg.filters via build_preset_filters.
if (ImGui::SmallButton("Presets##fpresets")) ImGui::OpenPopup("##presets_menu");
if (ImGui::BeginPopup("##presets_menu")) {
int first_date = -1, first_num = -1;
for (int c = 0; c < eff_cols && c < (int)eff_types.size(); ++c) {
if (first_date < 0 && eff_types[c] == ColumnType::Date) first_date = c;
if (first_num < 0 && (eff_types[c] == ColumnType::Int ||
eff_types[c] == ColumnType::Float)) first_num = c;
}
auto apply_preset = [&](FilterPreset p, int col) {
auto fs = build_preset_filters(p, col, today_iso());
for (auto& f : fs) stg.filters.push_back(f);
};
if (first_date >= 0) {
char l1[96], l2[96], l3[96];
std::snprintf(l1, sizeof(l1), "Last 7 days on \"%s\"", eff_headers[first_date]);
std::snprintf(l2, sizeof(l2), "Last 30 days on \"%s\"", eff_headers[first_date]);
std::snprintf(l3, sizeof(l3), "Last 90 days on \"%s\"", eff_headers[first_date]);
if (ImGui::MenuItem(l1)) apply_preset(FilterPreset::Last7d, first_date);
if (ImGui::MenuItem(l2)) apply_preset(FilterPreset::Last30d, first_date);
if (ImGui::MenuItem(l3)) apply_preset(FilterPreset::Last90d, first_date);
ImGui::Separator();
}
if (ImGui::BeginMenu("Exclude nulls in...")) {
for (int c = 0; c < eff_cols; ++c) {
if (ImGui::MenuItem(eff_headers[c])) apply_preset(FilterPreset::ExcludeNulls, c);
}
ImGui::EndMenu();
}
if (first_num >= 0) {
if (ImGui::BeginMenu("Non-zero in...")) {
for (int c = 0; c < eff_cols && c < (int)eff_types.size(); ++c) {
if (eff_types[c] == ColumnType::Int || eff_types[c] == ColumnType::Float) {
if (ImGui::MenuItem(eff_headers[c])) apply_preset(FilterPreset::NonZero, c);
}
}
ImGui::EndMenu();
}
}
ImGui::EndPopup();
}
ImGui::SameLine();
if (stg.filters.empty()) {
ImGui::TextDisabled("Sin filtros.");
return;
@@ -778,7 +986,8 @@ void draw_filter_chips(Stage& stg, const char* const* eff_headers, int eff_cols)
}
// Chips de breakout (stage > 0).
void draw_breakout_chips(Stage& stg, const char* const* in_headers, int in_cols) {
void draw_breakout_chips(Stage& stg, const char* const* in_headers, int in_cols,
const std::vector<ColumnType>& in_types) {
auto& U = ui();
ImGui::PushStyleColor(ImGuiCol_Button, IM_COL32( 60, 160, 170, 220));
ImGui::PushStyleColor(ImGuiCol_ButtonHovered, IM_COL32( 80, 190, 200, 240));
@@ -792,6 +1001,17 @@ void draw_breakout_chips(Stage& stg, const char* const* in_headers, int in_cols)
return;
}
for (size_t i = 0; i < stg.breakouts.size(); ) {
std::string col_name;
DateGranularity g = parse_breakout_granularity(stg.breakouts[i], col_name);
// Resolve col index para lookup de tipo.
int col_idx = -1;
for (int c = 0; c < in_cols; ++c) {
if (std::strcmp(in_headers[c], col_name.c_str()) == 0) { col_idx = c; break; }
}
bool is_date_col = (col_idx >= 0 && col_idx < (int)in_types.size()
&& in_types[col_idx] == ColumnType::Date);
char buf[256];
std::snprintf(buf, sizeof(buf), "%s x##bk%zu", stg.breakouts[i].c_str(), i);
ImGui::PushStyleColor(ImGuiCol_Button, IM_COL32( 60, 160, 170, 220));
@@ -802,20 +1022,42 @@ void draw_breakout_chips(Stage& stg, const char* const* in_headers, int in_cols)
if (ImGui::IsItemClicked(ImGuiMouseButton_Right)) {
U.edit_chip_kind = 2;
U.edit_chip_idx = (int)i;
// resolve current col name to index in in_headers
U.edit_col_idx = 0;
for (int c = 0; c < in_cols; ++c) {
if (std::strcmp(in_headers[c], stg.breakouts[i].c_str()) == 0) {
U.edit_col_idx = c; break;
}
}
U.edit_col_idx = (col_idx >= 0) ? col_idx : 0;
ImGui::OpenPopup("##edit_breakout");
}
if (clicked) { stg.breakouts.erase(stg.breakouts.begin() + i); continue; }
// Granularity combo inline cuando col Date (fase 10).
if (is_date_col) {
ImGui::SameLine();
const char* preview = (g == DateGranularity::None)
? "(raw)" : date_granularity_token(g);
char combo_id[32];
std::snprintf(combo_id, sizeof(combo_id), "##gran%zu", i);
ImGui::SetNextItemWidth(72);
if (ImGui::BeginCombo(combo_id, preview)) {
DateGranularity opts[] = {
DateGranularity::None,
DateGranularity::Year,
DateGranularity::Month,
DateGranularity::Week,
DateGranularity::Day,
DateGranularity::Hour,
};
for (auto o : opts) {
const char* lbl = (o == DateGranularity::None)
? "(raw)" : date_granularity_token(o);
if (ImGui::Selectable(lbl, o == g)) {
stg.breakouts[i] = compose_breakout(col_name, o);
}
}
ImGui::EndCombo();
}
}
ImGui::SameLine();
++i;
}
(void)in_headers; (void)in_cols;
ImGui::NewLine();
}
@@ -1220,7 +1462,8 @@ void draw_add_filter_popup(Stage& stg, const char* const* eff_headers_arr, int e
}
void draw_add_breakout_popup(Stage& stg, const char* const* in_headers, int in_cols,
const std::vector<ColumnType>& in_types) {
const std::vector<ColumnType>& in_types,
const char* const* in_cells, int in_rows) {
auto& U = ui();
if (!ImGui::BeginPopup("##addbreakout")) return;
if (U.brk_picker_col < 0 || U.brk_picker_col >= in_cols) U.brk_picker_col = 0;
@@ -1236,7 +1479,18 @@ void draw_add_breakout_popup(Stage& stg, const char* const* in_headers, int in_c
ImGui::EndCombo();
}
if (ImGui::Button("Add##bk")) {
stg.breakouts.emplace_back(in_headers[U.brk_picker_col]);
int c = U.brk_picker_col;
std::string col = in_headers[c];
// Fase 10: si col es Date, auto-detect granularidad via rango lexical
// (ISO YYYY-MM-DD ordena bien). Default Day si rango invalido.
if (c >= 0 && c < (int)in_types.size() && in_types[c] == ColumnType::Date) {
std::string lo, hi;
column_min_max(in_cells, in_rows, in_cols, c, lo, hi);
DateGranularity g = auto_date_granularity(lo, hi);
stg.breakouts.emplace_back(compose_breakout(col, g));
} else {
stg.breakouts.emplace_back(col);
}
ImGui::CloseCurrentPopup();
}
ImGui::EndPopup();
@@ -1441,8 +1695,17 @@ void drill_into(State& st, int from_stage,
if (prev_input_headers[i] == col_name) { ci = (int)i; break; }
}
if (ci < 0) return;
st.stages[target].filters.push_back(make_drill_filter(ci, value));
st.active_stage = target;
// Fase 10: graba step en drill_back, limpia forward (rama nueva).
DrillStep step;
step.target_stage = target;
step.filter_pos = (int)st.stages[target].filters.size();
step.prev_active_stage = st.active_stage;
step.added = make_drill_filter(ci, value);
apply_drill_step(st, step);
auto& U = ui();
U.drill_back.push_back(step);
U.drill_forward.clear();
}
} // anon namespace
@@ -1659,7 +1922,7 @@ void render(const char* id,
draw_joins_chips(st, *joinables, mh);
}
draw_filter_chips(act, eff_headers.data(), eff_cols);
draw_filter_chips(act, eff_headers.data(), eff_cols, eff_types);
draw_add_filter_popup(act, eff_headers.data(), eff_cols, eff_types);
draw_edit_filter_popup(act, eff_headers.data(), eff_cols, eff_types);
@@ -2290,12 +2553,13 @@ void render(const char* id,
if (chrome_visible) {
ImGui::PushStyleVar(ImGuiStyleVar_ItemSpacing, ImVec2(8, 2));
draw_filter_chips(act, ih_ptrs.data(), in_cols_n);
draw_filter_chips(act, ih_ptrs.data(), in_cols_n, input_types_active);
draw_add_filter_popup(act, ih_ptrs.data(), in_cols_n, input_types_active);
draw_edit_filter_popup(act, ih_ptrs.data(), in_cols_n, input_types_active);
draw_breakout_chips(act, ih_ptrs.data(), in_cols_n);
draw_add_breakout_popup(act, ih_ptrs.data(), in_cols_n, input_types_active);
draw_breakout_chips(act, ih_ptrs.data(), in_cols_n, input_types_active);
draw_add_breakout_popup(act, ih_ptrs.data(), in_cols_n, input_types_active,
cur_cells, cur_rows);
draw_edit_breakout_popup(act, ih_ptrs.data(), in_cols_n);
draw_aggregation_chips(act, ih_ptrs.data(), in_cols_n);
@@ -2524,7 +2788,22 @@ void render(const char* id,
so_local.cells.push_back(cur_cells[i]);
so_ptr = &so_local;
}
viz::render(*so_ptr, st.display, st.viz_config, ImVec2(-1, -1));
int clicked_row = -1;
viz::render(*so_ptr, st.display, st.viz_config, ImVec2(-1, -1), &clicked_row);
// Fase 10: click sobre chart -> drill al stage previo usando
// breakout col[0] como filtro Op::Eq sobre cells[clicked_row].
if (clicked_row >= 0 && active > 0 &&
so_ptr->cols > 0 && clicked_row < so_ptr->rows) {
int n_brk = (int)st.stages[active].breakouts.size();
if (n_brk > 0) {
const char* v = so_ptr->cells[clicked_row * so_ptr->cols + 0];
std::string col_clean;
parse_breakout_granularity(so_ptr->headers[0], col_clean);
drill_into(st, active, col_clean,
v ? std::string(v) : "",
input_headers_active);
}
}
goto stage_n_table_end;
}
@@ -2613,13 +2892,11 @@ void render(const char* id,
ImGui::PushID(r * cur_cols_n + c);
ImGui::Selectable(cell ? cell : "");
if (ImGui::IsItemHovered() && ImGui::IsMouseClicked(ImGuiMouseButton_Right)) {
// Drill-down solo si c es col de breakout (c < n_brk).
if (c < n_brk) {
U.pending_col = c;
U.pending_value = cell ? cell : "";
U.inspect_row = r;
ImGui::OpenPopup("##drill_popup");
}
}
if (ImGui::BeginPopup("##drill_popup")) {
if (c < n_brk) {
char lbl[256];
@@ -2631,6 +2908,12 @@ void render(const char* id,
input_headers_active);
ImGui::CloseCurrentPopup();
}
ImGui::Separator();
}
if (ImGui::MenuItem("Inspect row...")) {
U.inspect_row = r;
U.inspect_open = true;
ImGui::CloseCurrentPopup();
}
ImGui::EndPopup();
}
@@ -2642,6 +2925,11 @@ void render(const char* id,
}
stage_n_table_end:;
// Row inspector modal (fase 10). Activado via right-click "Inspect row..."
// sobre celdas del table del stage activo. `cur_cells` ya es row-major.
draw_row_inspector_modal(st, active, cur_cells, cur_rows, cur_cols_n,
cur_headers, cur_types, input_headers_active);
// Render extras (stage>0 path)
if (!st.extra_panels.empty() && cur_cols_n > 0) {
StageOutput so_local;
@@ -2958,6 +3246,118 @@ void render(const char* id,
ImGui::EndPopup();
}
// Ask AI modal (fase 11 — issue 0080).
if (U.ask_open) ImGui::OpenPopup("Ask AI");
ImGui::SetNextWindowSize(ImVec2(820, 560), ImGuiCond_Appearing);
if (ImGui::BeginPopupModal("Ask AI", &U.ask_open,
ImGuiWindowFlags_NoSavedSettings)) {
ImGui::TextDisabled("Ask en lenguaje natural. Default TQL. SQL solo si DuckDB linkado.");
const char* modes[] = {"TQL", "SQL (DuckDB)"};
#ifndef FN_TQL_DUCKDB
// SQL mode disabled visually pero el toggle existe (informativo)
if (U.ask_mode == 1) U.ask_mode = 0;
#endif
ImGui::Combo("Output##askmode", &U.ask_mode, modes, IM_ARRAYSIZE(modes));
#ifndef FN_TQL_DUCKDB
if (U.ask_mode == 1) {
ImGui::TextColored(ImVec4(1, 0.5f, 0.3f, 1),
"SQL mode requires FN_TQL_DUCKDB=1 build flag.");
}
#endif
ImGui::InputTextMultiline("##ask_q", U.ask_question, sizeof(U.ask_question),
ImVec2(-1, 80));
ImGui::BeginDisabled(U.ask_busy);
if (ImGui::Button("Send")) {
U.ask_busy = true;
U.ask_status = "Sending...";
U.ask_error.clear();
U.ask_response_code.clear();
U.ask_response_raw.clear();
// Build AskInput desde el state actual.
llm_anthropic::AskInput in;
in.question = U.ask_question;
in.tql_current = U.ask_current_tql;
in.col_names = U.active_headers;
in.col_types = U.active_types;
in.mode = (U.ask_mode == 1)
? llm_anthropic::OutputMode::SQL
: llm_anthropic::OutputMode::TQL;
// Llamada blocking (UI freeze breve durante red).
auto r = llm_anthropic::ask(in);
U.ask_busy = false;
if (!r.error.empty()) {
U.ask_error = r.error;
U.ask_status = "Error";
} else {
U.ask_response_raw = r.raw;
U.ask_response_code = r.code;
U.ask_status = "Got response.";
// Llenar edit buffer
std::snprintf(U.ask_edit_buf, sizeof(U.ask_edit_buf),
"%s", r.code.c_str());
}
}
ImGui::EndDisabled();
ImGui::SameLine();
if (!U.ask_status.empty()) {
ImGui::TextDisabled("%s", U.ask_status.c_str());
}
if (!U.ask_error.empty()) {
ImGui::TextColored(ImVec4(1, 0.4f, 0.4f, 1), "%s", U.ask_error.c_str());
}
ImGui::Separator();
ImGui::Columns(2, "ask_cols", true);
ImGui::TextUnformatted("Current");
ImGui::InputTextMultiline("##ask_cur",
const_cast<char*>(U.ask_current_tql.c_str()),
U.ask_current_tql.size() + 1,
ImVec2(-1, 240),
ImGuiInputTextFlags_ReadOnly);
ImGui::NextColumn();
ImGui::TextUnformatted("Proposed (editable before apply)");
ImGui::InputTextMultiline("##ask_new", U.ask_edit_buf, sizeof(U.ask_edit_buf),
ImVec2(-1, 240));
ImGui::Columns(1);
bool can_apply = !U.ask_busy && U.ask_edit_buf[0] != '\0';
ImGui::BeginDisabled(!can_apply);
if (ImGui::Button("Apply")) {
std::string err;
if (U.ask_mode == 0) {
// TQL apply
bool ok = tql::apply(U.ask_edit_buf, st,
U.active_headers,
U.active_types,
nullptr, 0,
(int)U.active_headers.size(),
&err);
if (ok) {
U.ask_status = "Applied OK.";
U.ask_open = false;
} else {
U.ask_error = "tql::apply error: " + err;
U.ask_status = "Apply failed.";
}
} else {
// SQL apply: requires DuckDB adapter (no v1).
U.ask_status = "SQL execute requires FN_TQL_DUCKDB build flag.";
}
}
ImGui::EndDisabled();
ImGui::SameLine();
if (ImGui::Button("Reject")) {
U.ask_response_code.clear();
U.ask_edit_buf[0] = '\0';
}
ImGui::SameLine();
if (ImGui::Button("Close")) {
U.ask_open = false;
}
ImGui::EndPopup();
}
if (U.open_cell_popup) { ImGui::OpenPopup("##cell_op"); U.open_cell_popup = false; }
if (ImGui::BeginPopup("##cell_op")) {
ColumnType t = (U.pending_col >= 0 && U.pending_col < eff_cols)
@@ -567,6 +567,69 @@ Filter make_drill_filter(int col_idx, const std::string& value) {
return f;
}
// Inserts step.added into the filter list of stage step.target_stage at index
// step.filter_pos and makes that stage the active one. Returns false, leaving
// st untouched, when the stage index or the insert position is out of range.
// A position equal to the current filter count is valid and appends.
bool apply_drill_step(State& st, const DrillStep& step) {
    const int stage_count = (int)st.stages.size();
    const int target = step.target_stage;
    if (target < 0 || target >= stage_count) return false;

    auto& filters = st.stages[target].filters;
    const int at = step.filter_pos;
    const bool insertable = (at >= 0 && at <= (int)filters.size());
    if (!insertable) return false;

    filters.insert(filters.begin() + at, step.added);
    st.active_stage = target;
    return true;
}
// Moves the active stage one step towards stage 0. Returns false (no change)
// when there are no stages or the active stage is already the first one.
bool drill_up(State& st) {
    const bool can_move = !st.stages.empty() && !(st.active_stage <= 0);
    if (can_move) {
        st.active_stage -= 1;
    }
    return can_move;
}
// Serializes one row as two tab-separated lines, each terminated by "\r\n":
// first the headers, then the cell values. Columns without a header entry and
// null cells produce empty fields. Returns "" when row_idx is outside
// [0, rows) or cols is not positive. Values are copied verbatim (no escaping).
std::string row_to_tsv(const char* const* cells, int rows, int cols,
                       int row_idx, const std::vector<std::string>& headers) {
    const bool row_in_range = (row_idx >= 0 && row_idx < rows);
    if (!row_in_range || cols <= 0) return "";

    const int named_cols = (int)headers.size();
    std::string header_line;
    std::string value_line;
    for (int col = 0; col < cols; ++col) {
        if (col != 0) {
            header_line += '\t';
            value_line += '\t';
        }
        if (col < named_cols) header_line += headers[col];
        const char* text = cells[row_idx * cols + col];
        if (text != nullptr) value_line += text;
    }

    std::string tsv = header_line;
    tsv += "\r\n";
    tsv += value_line;
    tsv += "\r\n";
    return tsv;
}
std::vector<Filter> build_filters_from_row(const char* const* cells, int rows,
int cols, int row_idx) {
std::vector<Filter> out;
if (row_idx < 0 || row_idx >= rows || cols <= 0) return out;
for (int c = 0; c < cols; ++c) {
const char* v = cells[row_idx * cols + c];
if (!v || !*v) continue;
Filter f;
f.col = c;
f.op = Op::Eq;
f.value = v;
out.push_back(f);
}
return out;
}
// Reverts a drill step: erases the filter at step.filter_pos from stage
// step.target_stage and, if step.prev_active_stage is a valid stage index,
// restores it as the active stage. Returns false (no change) when the stage
// index or the filter position is out of range.
// NOTE(review): the filter is removed by position only; nothing here checks
// that the entry at that position is still the one the step added.
bool undo_drill_step(State& st, const DrillStep& step) {
    const int stage_count = (int)st.stages.size();
    const int target = step.target_stage;
    if (target < 0 || target >= stage_count) return false;

    auto& filters = st.stages[target].filters;
    const int at = step.filter_pos;
    const bool removable = (at >= 0 && at < (int)filters.size());
    if (!removable) return false;
    filters.erase(filters.begin() + at);

    const int restore = step.prev_active_stage;
    if (restore >= 0 && restore < stage_count) {
        st.active_stage = restore;
    }
    return true;
}
std::vector<int> apply_filters(const char* const* cells, int rows, int cols,
const std::vector<Filter>& filters)
{
@@ -696,19 +759,57 @@ StageOutput compute_stage(const char* const* in_cells, int in_rows, int in_cols,
}
// Grouped: agrupa visible por valores de breakout, calcula aggregations.
std::vector<int> break_cols(stage.breakouts.size());
for (size_t i = 0; i < stage.breakouts.size(); ++i) {
break_cols[i] = find_col(in_headers, stage.breakouts[i]);
// Breakouts pueden llevar sufijo `:granularity` para cols Date (fase 10).
int nbreaks = (int)stage.breakouts.size();
std::vector<int> break_cols(nbreaks);
std::vector<DateGranularity> break_grans(nbreaks);
bool any_trunc = false;
for (int i = 0; i < nbreaks; ++i) {
std::string col_name;
break_grans[i] = parse_breakout_granularity(stage.breakouts[i], col_name);
if (break_grans[i] != DateGranularity::None) any_trunc = true;
break_cols[i] = find_col(in_headers, col_name);
}
auto make_key = [&](int r) -> std::string {
std::string k;
for (size_t i = 0; i < break_cols.size(); ++i) {
if (i > 0) k += '\x1f'; // separador unit-separator (no aparece en datos)
// Pre-truncate solo cuando hay granularity activa. Strings persistidos en
// out.cell_backing para que los punteros sobrevivan al return de la funcion.
// Reservamos upfront para que push_back no invalide punteros anteriores.
// Tamaño = trunc cells + aggregation cells (peor caso n_groups <= in_rows).
out.cell_backing.reserve(
(size_t)in_rows * (size_t)nbreaks +
(size_t)in_rows * stage.aggregations.size() + 16);
std::vector<const char*> trunc_ptrs;
if (any_trunc) {
trunc_ptrs.assign((size_t)in_rows * (size_t)nbreaks, nullptr);
for (int r = 0; r < in_rows; ++r) {
for (int i = 0; i < nbreaks; ++i) {
if (break_grans[i] == DateGranularity::None) continue;
int bc = break_cols[i];
if (bc < 0) continue;
const char* v = in_cells[r * in_cols + bc];
k += (v ? v : "");
out.cell_backing.emplace_back(
truncate_date(v ? v : "", break_grans[i]));
trunc_ptrs[(size_t)r * nbreaks + i] = out.cell_backing.back().c_str();
}
}
}
auto cell_for = [&](int r, int i) -> const char* {
int bc = break_cols[i];
if (bc < 0) return "";
if (break_grans[i] != DateGranularity::None) {
return trunc_ptrs[(size_t)r * nbreaks + i];
}
const char* v = in_cells[r * in_cols + bc];
return v ? v : "";
};
auto make_key = [&](int r) -> std::string {
std::string k;
for (int i = 0; i < nbreaks; ++i) {
if (i > 0) k += '\x1f'; // separador unit-separator (no aparece en datos)
k += cell_for(r, i);
}
return k;
};
@@ -727,10 +828,9 @@ StageOutput compute_stage(const char* const* in_cells, int in_rows, int in_cols,
key_to_group.emplace(k, gi);
group_keys.push_back(k);
group_rows.emplace_back();
std::vector<const char*> bv(break_cols.size(), "");
for (size_t i = 0; i < break_cols.size(); ++i) {
int bc = break_cols[i];
bv[i] = (bc >= 0) ? in_cells[r * in_cols + bc] : "";
std::vector<const char*> bv((size_t)nbreaks, "");
for (int i = 0; i < nbreaks; ++i) {
bv[i] = cell_for(r, i);
}
group_breakvals.push_back(std::move(bv));
} else gi = it->second;
@@ -742,11 +842,17 @@ StageOutput compute_stage(const char* const* in_cells, int in_rows, int in_cols,
out.cols = out_cols;
out.headers.reserve(out_cols);
out.types.reserve(out_cols);
for (size_t i = 0; i < stage.breakouts.size(); ++i) {
for (int i = 0; i < nbreaks; ++i) {
out.headers.push_back(stage.breakouts[i]);
int bc = break_cols[i];
out.types.push_back((bc >= 0 && bc < (int)in_types.size())
? in_types[bc] : ColumnType::String);
// Si hay granularity activa, el output es String (formato ymd o similar),
// no la fecha original.
ColumnType ot = ColumnType::String;
if (break_grans[i] == DateGranularity::None
&& bc >= 0 && bc < (int)in_types.size()) {
ot = in_types[bc];
}
out.types.push_back(ot);
}
for (const auto& a : stage.aggregations) {
out.headers.push_back(aggregation_alias(a));
@@ -1102,4 +1208,288 @@ StageOutput join_tables(const char* const* left_cells, int left_rows, int left_c
return out;
}
// ----------------------------------------------------------------------------
// Fase 10: drill extendido — granularity + presets.
// ----------------------------------------------------------------------------
// Maps a granularity to its lowercase token ("year", "month", "week", "day",
// "hour"). Any other value, including DateGranularity::None, yields "".
const char* date_granularity_token(DateGranularity g) {
    if (g == DateGranularity::Year)  return "year";
    if (g == DateGranularity::Month) return "month";
    if (g == DateGranularity::Week)  return "week";
    if (g == DateGranularity::Day)   return "day";
    if (g == DateGranularity::Hour)  return "hour";
    return "";
}
// Inverse of date_granularity_token: exact, case-sensitive match of "year",
// "month", "week", "day" or "hour". A null pointer or any other text yields
// DateGranularity::None.
DateGranularity date_granularity_from_token(const char* s) {
    if (s == nullptr) return DateGranularity::None;

    struct Entry { const char* token; DateGranularity value; };
    const Entry table[] = {
        {"year",  DateGranularity::Year},
        {"month", DateGranularity::Month},
        {"week",  DateGranularity::Week},
        {"day",   DateGranularity::Day},
        {"hour",  DateGranularity::Hour},
    };

    const std::string wanted(s);
    for (const Entry& e : table) {
        if (wanted == e.token) return e.value;
    }
    return DateGranularity::None;
}
// Splits a breakout spec of the form "column[:granularity]". The text after
// the LAST ':' is treated as a granularity only if it is a known token; then
// col_out receives everything before that ':' and the granularity is
// returned. Otherwise col_out receives the whole spec and the result is
// DateGranularity::None (so column names containing ':' are left intact).
DateGranularity parse_breakout_granularity(const std::string& breakout,
                                           std::string& col_out) {
    DateGranularity gran = DateGranularity::None;
    const std::string::size_type colon = breakout.rfind(':');
    if (colon != std::string::npos) {
        const std::string tail = breakout.substr(colon + 1);
        gran = date_granularity_from_token(tail.c_str());
    }

    if (gran == DateGranularity::None) {
        col_out = breakout;
    } else {
        col_out = breakout.substr(0, colon);
    }
    return gran;
}
// Builds a breakout spec: the bare column name for DateGranularity::None,
// otherwise "column:token" using the token from date_granularity_token.
std::string compose_breakout(const std::string& col, DateGranularity g) {
    std::string spec = col;
    if (g != DateGranularity::None) {
        spec += ":";
        spec += date_granularity_token(g);
    }
    return spec;
}
// Index of the element of xs[0..n) closest to target by absolute difference.
// NaN elements are skipped and ties keep the lowest index. Returns -1 when
// xs is null, n <= 0, or every element is NaN.
int nearest_index_1d(double target, const double* xs, int n) {
    if (!xs || n <= 0) return -1;

    int winner = -1;
    double winner_dist = 0.0;
    for (int idx = 0; idx != n; ++idx) {
        const double value = xs[idx];
        if (std::isnan(value)) continue;
        const double dist = std::fabs(value - target);
        // Keep the current winner unless this one is strictly closer.
        if (winner >= 0 && !(dist < winner_dist)) continue;
        winner = idx;
        winner_dist = dist;
    }
    return winner;
}
// Index of the point (xs[i], ys[i]) closest to (tx, ty) by squared Euclidean
// distance. Points with a NaN coordinate are skipped and ties keep the
// lowest index. Returns -1 when either array is null, n <= 0, or no point
// is usable.
int nearest_index_2d(double tx, double ty,
                     const double* xs, const double* ys, int n) {
    if (!xs || !ys || n <= 0) return -1;

    int winner = -1;
    double winner_sq = 0.0;
    int idx = 0;
    while (idx < n) {
        const double x = xs[idx];
        const double y = ys[idx];
        if (!std::isnan(x) && !std::isnan(y)) {
            const double dx = x - tx, dy = y - ty;
            const double d = dx*dx + dy*dy;
            if (winner < 0 || d < winner_sq) {
                winner = idx;
                winner_sq = d;
            }
        }
        ++idx;
    }
    return winner;
}
// Angle of point (mx, my) around centre (cx, cy), in radians within
// [0, 2*PI). The angle is 0 for dx == 0, dy < 0 (straight up when y grows
// downwards) and increases towards +x, i.e. atan2(dx, -dy) shifted into the
// non-negative range. The original comment states this matches ImPlot's pie
// layout (0 at the top, clockwise).
double pie_angle(double cx, double cy, double mx, double my) {
    const double two_pi = 6.283185307179586;
    const double dx = mx - cx;
    const double dy = my - cy;
    double a = std::atan2(dx, -dy);  // 0 when (dx = 0, dy < 0)
    if (a < 0) a += two_pi;
    // A tiny negative angle plus 2*PI rounds to exactly 2*PI, which would
    // fall outside the half-open range; it is the same direction as 0.
    if (a >= two_pi) a = 0.0;
    return a;
}
// Index of the pie slice containing `angle` (radians, [0, 2*PI)), where slice
// i spans a share of the circle proportional to sums[i] and slices are laid
// out in index order starting at angle 0.
// Returns -1 when: sums is null, n <= 0, angle is outside [0, 2*PI) or NaN,
// any weight is negative or NaN, or the total is not a positive finite value.
int pie_slice_at_angle(double angle, const double* sums, int n) {
    if (n <= 0 || !sums) return -1;
    const double two_pi = 6.283185307179586;
    // Positive-form test so that a NaN angle is rejected as well.
    if (!(angle >= 0.0 && angle < two_pi)) return -1;

    double total = 0.0;
    int last_nonempty = -1;
    for (int i = 0; i < n; ++i) {
        if (!(sums[i] >= 0.0)) return -1;  // negative or NaN weight
        if (sums[i] > 0.0) last_nonempty = i;
        total += sums[i];
    }
    if (!(total > 0.0) || !std::isfinite(total)) return -1;

    double cum = 0.0;
    for (int i = 0; i < n; ++i) {
        cum += (sums[i] / total) * two_pi;
        if (angle < cum) return i;
    }
    // Rounding can leave the accumulated angle marginally below 2*PI; the
    // click then belongs to the last slice that actually has an area.
    return last_nonempty;
}
// Maps a plot-space point (px, py) to the heatmap cell containing it, for a
// grid of `rows` x `cols` unit cells: col_out = floor(px), row_out =
// floor(py). Both outputs are set to -1 when the grid is empty or the point
// is outside [0, cols) x [0, rows), including NaN coordinates.
// The row is returned in plot coordinates as-is; a caller whose heatmap
// draws row 0 at the top has to flip it itself.
void heatmap_cell_at(double px, double py, int rows, int cols,
                     int& row_out, int& col_out) {
    row_out = -1;
    col_out = -1;
    if (rows <= 0 || cols <= 0) return;
    // Positive-form range tests: NaN fails them, so it never reaches the
    // int casts below (converting NaN to int is undefined behaviour).
    if (!(px >= 0.0 && px < (double)cols)) return;
    if (!(py >= 0.0 && py < (double)rows)) return;
    col_out = (int)px;
    row_out = (int)py;
}
// Finds the lexicographically smallest and largest non-empty values of one
// column of a cell matrix indexed as cells[row * cols + col_idx]. Null and
// empty cells are ignored. Both outputs are left empty when col_idx is out
// of range or the column has no non-empty value.
void column_min_max(const char* const* cells, int rows, int cols, int col_idx,
                    std::string& min_out, std::string& max_out) {
    min_out.clear();
    max_out.clear();
    const bool col_ok = (col_idx >= 0 && col_idx < cols);
    if (!col_ok) return;

    bool seeded = false;
    for (int row = 0; row < rows; ++row) {
        const char* raw = cells[row * cols + col_idx];
        if (raw == nullptr || raw[0] == '\0') continue;
        const std::string current(raw);
        if (!seeded) {
            // First usable value initialises both extremes.
            min_out = current;
            max_out = current;
            seeded = true;
            continue;
        }
        if (current < min_out) min_out = current;
        if (current > max_out) max_out = current;
    }
}
namespace {
// Parses the leading "YYYY-MM-DD" of an ISO-style string into (y, m, d);
// anything after the tenth character is ignored. Returns true when the ten
// characters have the right shape and month is 1..12 and day is 1..31 (the
// day is not checked against the month length). The outputs are written as
// soon as the shape is valid, so they may be modified even when the range
// check makes the function return false.
bool parse_ymd(const std::string& s, int& y, int& m, int& d) {
    if (s.size() < 10) return false;

    const int digit_positions[] = {0, 1, 2, 3, 5, 6, 8, 9};
    for (int pos : digit_positions) {
        const char ch = s[(size_t)pos];
        const bool is_digit = (ch >= '0' && ch <= '9');
        if (!is_digit) return false;
    }
    if (s[4] != '-' || s[7] != '-') return false;

    y = (s[0]-'0')*1000 + (s[1]-'0')*100 + (s[2]-'0')*10 + (s[3]-'0');
    m = (s[5]-'0')*10 + (s[6]-'0');
    d = (s[8]-'0')*10 + (s[9]-'0');

    const bool month_ok = (m >= 1 && m <= 12);
    const bool day_ok = (d >= 1 && d <= 31);
    return month_ok && day_ok;
}
// Serial day number of a proleptic-Gregorian date. No epoch shift is
// applied, so 0000-03-01 maps to 0 and 1970-01-01 maps to 719468.
long ymd_to_days(int y, int m, int d) {
    // January and February count as months 13 and 14 of the previous year,
    // so that the leap day is the last day of the shifted year.
    if (m <= 2) {
        y -= 1;
        m += 12;
    }
    // 400-year eras of 146097 days each.
    long era = (y >= 0 ? y : y - 399) / 400;
    unsigned year_of_era = (unsigned)(y - era * 400);
    unsigned day_of_year = (unsigned)((153 * (m - 3) + 2) / 5 + d - 1);
    unsigned day_of_era = year_of_era * 365 + year_of_era/4 - year_of_era/100
                        + day_of_year;
    return era * 146097 + (long)day_of_era;
}
// Inverse of ymd_to_days: converts a serial day number (0 == 0000-03-01)
// back to a proleptic-Gregorian (year, month, day).
void days_to_ymd(long days, int& y, int& m, int& d) {
    // Split into a 400-year era and the day within that era.
    long era = (days >= 0 ? days : days - 146096) / 146097;
    unsigned day_of_era = (unsigned)(days - era * 146097);
    unsigned year_of_era = (day_of_era - day_of_era/1460 + day_of_era/36524
                            - day_of_era/146096) / 365;
    int year = (int)year_of_era + (int)era * 400;
    unsigned day_of_year = day_of_era
                         - (365*year_of_era + year_of_era/4 - year_of_era/100);
    // Month index counted from March (0 = March ... 11 = February).
    unsigned march_month = (5*day_of_year + 2)/153;
    unsigned day = day_of_year - (153*march_month + 2)/5 + 1;
    unsigned month = march_month < 10 ? march_month + 3 : march_month - 9;
    // January/February belong to the following calendar year.
    if (month <= 2) year += 1;
    y = year;
    m = (int)month;
    d = (int)day;
}
} // anon
// Truncates an ISO-style date string to the given granularity:
//   Year  -> "YYYY"
//   Month -> "YYYY-MM"
//   Week  -> "YYYY-MM-DD" of the Monday on or before the date
//   Day   -> "YYYY-MM-DD"
//   Hour  -> "YYYY-MM-DDTHH"; the hour is read from positions 11-12 when
//            position 10 is 'T', otherwise (or when > 23) it is 00
// The input is returned unchanged for DateGranularity::None, for any other
// granularity value, and when the string does not parse as a date.
std::string truncate_date(const std::string& date, DateGranularity g) {
    if (g == DateGranularity::None) return date;

    int year = 0, month = 0, day = 0;
    if (!parse_ymd(date, year, month, day)) return date;

    char text[32];
    if (g == DateGranularity::Year) {
        std::snprintf(text, sizeof(text), "%04d", year);
        return text;
    }
    if (g == DateGranularity::Month) {
        std::snprintf(text, sizeof(text), "%04d-%02d", year, month);
        return text;
    }
    if (g == DateGranularity::Day) {
        std::snprintf(text, sizeof(text), "%04d-%02d-%02d", year, month, day);
        return text;
    }
    if (g == DateGranularity::Hour) {
        int hour = 0;
        const bool has_hour = date.size() >= 13 && date[10] == 'T'
                           && date[11] >= '0' && date[11] <= '9'
                           && date[12] >= '0' && date[12] <= '9';
        if (has_hour) {
            hour = (date[11]-'0')*10 + (date[12]-'0');
            if (hour < 0 || hour > 23) hour = 0;
        }
        std::snprintf(text, sizeof(text), "%04d-%02d-%02dT%02d",
                      year, month, day, hour);
        return text;
    }
    if (g == DateGranularity::Week) {
        // Serial day 0 is 0000-03-01, a Wednesday, so serial % 7 gives
        // 0=Wed, 1=Thu, 2=Fri, 3=Sat, 4=Sun, 5=Mon, 6=Tue.
        const long serial = ymd_to_days(year, month, day);
        const int weekday = (int)(((serial % 7) + 7) % 7);
        const int since_monday = ((weekday - 5) % 7 + 7) % 7;
        int wy, wm, wd;
        days_to_ymd(serial - since_monday, wy, wm, wd);
        std::snprintf(text, sizeof(text), "%04d-%02d-%02d", wy, wm, wd);
        return text;
    }
    return date;
}
// Picks a bucket size from the span, in days, between two "YYYY-MM-DD"
// strings (argument order does not matter):
//   > 730 days -> Year, > 60 -> Month, > 14 -> Week, otherwise Day.
// Falls back to Day when either string does not parse.
DateGranularity auto_date_granularity(const std::string& min_ymd,
                                      const std::string& max_ymd) {
    int lo_y, lo_m, lo_d;
    int hi_y, hi_m, hi_d;
    const bool parsed = parse_ymd(min_ymd, lo_y, lo_m, lo_d)
                     && parse_ymd(max_ymd, hi_y, hi_m, hi_d);
    if (!parsed) return DateGranularity::Day;

    const long diff = ymd_to_days(hi_y, hi_m, hi_d) - ymd_to_days(lo_y, lo_m, lo_d);
    const long span = (diff < 0) ? -diff : diff;

    if (span <= 14)  return DateGranularity::Day;
    if (span <= 60)  return DateGranularity::Week;
    if (span <= 730) return DateGranularity::Month;
    return DateGranularity::Year;  // more than two years
}
// Human-readable label for a filter preset; "?" for a value outside the enum.
const char* filter_preset_label(FilterPreset p) {
    const char* label = "?";
    switch (p) {
        case FilterPreset::Last7d:
            label = "Last 7 days";
            break;
        case FilterPreset::Last30d:
            label = "Last 30 days";
            break;
        case FilterPreset::Last90d:
            label = "Last 90 days";
            break;
        case FilterPreset::ExcludeNulls:
            label = "Exclude nulls";
            break;
        case FilterPreset::NonZero:
            label = "Non-zero only";
            break;
    }
    return label;
}
// Expands a filter preset into concrete filters on column `col`.
//
//   Last7d / Last30d / Last90d -> one `>= (today - N days)` filter; yields no
//                                 filter when `today_ymd` cannot be parsed
//   ExcludeNulls               -> `!= ""`
//   NonZero                    -> `!= ""` followed by `!= "0"`
//
// `today_ymd` is only read by the "last N days" presets.
std::vector<Filter> build_preset_filters(FilterPreset preset, int col,
                                         const std::string& today_ymd) {
    std::vector<Filter> filters;

    // Builds a filter on `col` with the given operator and operand.
    const auto make_filter = [col](Op op, const char* operand) {
        Filter f;
        f.col = col;
        f.op = op;
        f.value = operand;
        return f;
    };

    int window_days = 0;
    switch (preset) {
        case FilterPreset::Last7d:
            window_days = 7;
            break;
        case FilterPreset::Last30d:
            window_days = 30;
            break;
        case FilterPreset::Last90d:
            window_days = 90;
            break;
        case FilterPreset::ExcludeNulls:
            filters.push_back(make_filter(Op::Neq, ""));
            return filters;
        case FilterPreset::NonZero:
            filters.push_back(make_filter(Op::Neq, ""));
            filters.push_back(make_filter(Op::Neq, "0"));
            return filters;
    }

    if (window_days == 0) {
        return filters;  // preset value outside the enum: nothing to add
    }

    int ty, tm, td;
    if (!parse_ymd(today_ymd, ty, tm, td)) {
        return filters;
    }

    // Step back N days on the day-number axis and format the cutoff date.
    const long cutoff = ymd_to_days(ty, tm, td) - window_days;
    int cy, cm, cd;
    days_to_ymd(cutoff, cy, cm, cd);

    char text[16];
    std::snprintf(text, sizeof(text), "%04d-%02d-%02d", cy, cm, cd);
    filters.push_back(make_filter(Op::Gte, text));
    return filters;
}
} // namespace data_table
@@ -1,27 +1,21 @@
// Logica pura del playground data_table. Sin ImGui — testable headless.
// Cuando se promueva al registry, esto sera la base de data_table_cpp_viz.
// TIPOS promovidos al registry (issue 0081). Este header solo declara
// funciones; los types vienen de cpp/functions/core/data_table_types.h.
#pragma once
#include "core/data_table_types.h"
#include <string>
#include <utility>
#include <vector>
namespace data_table {
// Comparison operator carried by a Filter and accepted by compare().
enum class Op {
Eq, Neq, Gt, Gte, Lt, Lte,
Contains, NotContains, StartsWith, EndsWith
};
// ----------------------------------------------------------------------------
// Helpers para Op y ColumnType.
// ----------------------------------------------------------------------------
const char* op_label(Op o);
bool op_is_string_only(Op o);
// ----------------------------------------------------------------------------
// Column types - declarado por caller con fallback a auto-detect.
// ----------------------------------------------------------------------------
// Type of a column. The caller declares it; `Auto` asks for auto-detection
// (see auto_detect_type / effective_type).
enum class ColumnType {
Auto, String, Int, Float, Bool, Date, Json
};
const char* column_type_name(ColumnType t);
const char* column_type_icon(ColumnType t); // UTF-8 Tabler icon
@@ -36,63 +30,11 @@ ColumnType auto_detect_type(const char* const* cells, int rows, int cols,
ColumnType effective_type(ColumnType declared,
const char* const* cells, int rows, int cols, int col);
// Derived column: immutable. Two modes:
// 1) Pure retype: source_col >= 0, formula == "". Cells come from the source column.
// 2) Formula: source_col == -1, non-empty formula. Evaluated by Lua.
struct DerivedColumn {
int source_col = -1;
ColumnType type = ColumnType::String;
std::string name;
std::string formula; // "" = pure retype; anything else = Lua body
int lua_id = -1; // reference inside lua_engine; -1 if not compiled
std::string compile_error;
};
// Filter: one condition `column <op> value`.
// Moved here (it used to come after State) because the TQL Stage needs it.
struct Filter {
int col;
Op op;
std::string value;
};
// Color rule keyed on a column and a cell value.
// NOTE(review): presumably cells of `col` whose text equals `equals` are
// painted with `color`; the packing of `color` is not visible here — confirm
// against the renderer.
struct ColorRule {
int col;
std::string equals;
unsigned int color;
};
// ----------------------------------------------------------------------------
// TQL (Table Query Language) — stage model. Ver docs/TQL.md.
// Aggregation helpers.
// ----------------------------------------------------------------------------
// Aggregation function selectable in an Aggregation (see agg_fn_name for the
// textual name of each value).
enum class AggFn {
Count, Sum, Avg, Min, Max, Distinct, Stddev,
Median, P25, P75, P90, P99, Percentile
};
const char* agg_fn_name(AggFn f);
// One aggregation of a stage: a function applied to a column.
struct Aggregation {
AggFn fn = AggFn::Count;
std::string col; // ignored for Count
double arg = 0.0; // for Percentile (0..1)
std::string alias; // empty -> auto-generated via aggregation_alias()
};
// One sort key: a column name plus direction (`desc` = descending).
struct SortClause {
std::string col;
bool desc = false;
};
// Stage: one layer of TQL. Stage 0 = Raw (no breakouts/aggregations).
// Stages 1+ may group. Each stage consumes the output of the previous one.
struct Stage {
std::vector<Filter> filters;
std::vector<DerivedColumn> derived; // expressions of this stage
std::vector<std::string> breakouts; // column names from the INPUT of this stage
std::vector<Aggregation> aggregations;
std::vector<SortClause> sorts;
};
// Pure: alias por defecto cuando agg.alias esta vacio.
// count -> "count"
// distinct col -> "distinct_<col>"
@@ -101,224 +43,125 @@ struct Stage {
std::string aggregation_alias(const Aggregation& a);
// Pure: tipo del output de la aggregation.
// count, distinct -> Int
// sum, avg, stddev,
// median, p*, percentile -> Float
// min, max -> mismo tipo que la col origen
ColumnType aggregation_type(const Aggregation& a,
const std::vector<std::string>& in_headers,
const std::vector<ColumnType>& in_types);
// Output of compute_stage. Owns `cell_backing` (new strings for aggregated
// results) and `cells` (row-major pointers into the backing store, or into
// the original `in_cells` for passthrough).
struct StageOutput {
std::vector<std::string> cell_backing;
std::vector<const char*> cells;
int rows = 0;
int cols = 0;
std::vector<std::string> headers;
std::vector<ColumnType> types;
};
// ----------------------------------------------------------------------------
// Compute pipeline.
// ----------------------------------------------------------------------------
// Pure: ejecuta un Stage sobre los cells de entrada. Aplica filter -> (group+agg|passthrough) -> sort.
StageOutput compute_stage(const char* const* in_cells, int in_rows, int in_cols,
const std::vector<std::string>& in_headers,
const std::vector<ColumnType>& in_types,
const Stage& stage);
// Pure: aplica filtros usando headers para resolver f.col (que ahora es
// indice en el array de in_headers, no del dataset original). Devuelve
// indices de filas que pasan.
// Pure: aplica filtros usando headers para resolver f.col.
std::vector<int> apply_filters(const char* const* cells, int rows, int cols,
const std::vector<Filter>& filters);
// Pure: helper para drill-down. Devuelve un Filter Op::Eq sobre col_idx con
// el value indicado. col_idx es indice en los headers del INPUT del stage
// previo (donde se va a aplicar el filtro).
// el value indicado.
Filter make_drill_filter(int col_idx, const std::string& value);
// ----------------------------------------------------------------------------
// ViewMode: tipo de visualizacion a renderizar sobre el output del stage activo.
// "Table" siempre disponible. Resto requiere ciertos tipos de columnas.
// ViewMode helpers.
// ----------------------------------------------------------------------------
// Kind of visualization rendered over the output of the active stage.
// `Table` is always available; the other modes require certain column types
// (the trailing comments list what each one expects).
enum class ViewMode {
Table,
// Bars
Bar, // horizontal bars: 1 cat + 1 num
Column, // vertical bars: 1 cat + 1 num
GroupedBar, // 1 cat + N num (side-by-side)
StackedBar, // 1 cat + N num (stacked)
// Lines / area
Line, // X + 1..N Y series
Area, // shaded to y=0
Stairs, // step plot
// Points
Scatter, // X + Y
Bubble, // X + Y + size
// Distribution
Histogram, // 1 num
Histogram2D, // 2 num
Heatmap, // matrix from breakouts
BoxPlot, // 1 cat + 1 num (min/p25/p50/p75/max per group)
// Stems / signals
Stem,
ErrorBars,
// Composition
Pie,
Donut,
Funnel, // ordered descending bars
Waterfall, // running sum
// Single values
KPI, // big text + label
KPIGrid, // all aggregations as cards
// Specialized
Candlestick, // OHLC: time + open + high + low + close
Radar, // multi-axis (1 cat + N num)
};
const char* view_mode_token(ViewMode m); // "table", "bar", ...
const char* view_mode_label(ViewMode m); // "Table", "Bar (horizontal)", ...
const char* view_mode_token(ViewMode m);
const char* view_mode_label(ViewMode m);
ViewMode view_mode_from_token(const char* s);
int view_mode_min_cols(ViewMode m);
bool view_mode_needs_numeric(ViewMode m);
bool view_mode_needs_category(ViewMode m);
// Requiere stage agrupado (breakout+aggregation). Si user esta en stage 0 con
// uno de estos, conviene auto-promote a stage 1.
bool view_mode_needs_aggregation(ViewMode m);
// Lista completa de modos para el selector UI (orden de display).
// Lista completa de modos para el selector UI.
const ViewMode* all_view_modes(int* n_out);
// ----------------------------------------------------------------------------
// Joins (MBQL-style). Ver issue 0078.
// ----------------------------------------------------------------------------
// Join strategy used by a Join clause (Left is the default there).
enum class JoinStrategy { Left, Inner, Right, Full };
const char* join_strategy_token(JoinStrategy s);
JoinStrategy join_strategy_from_token(const char* s);
const char* join_strategy_label(JoinStrategy s);
// Extra table passed to render() for joins. Owned externally (by the caller).
struct TableInput {
std::string name; // stable identifier (matches Join.source)
std::vector<std::string> headers;
std::vector<ColumnType> types;
const char* const* cells = nullptr; // row-major, headers.size() columns x `rows` rows
int rows = 0;
int cols = 0;
};
// Join clause: joins the current table with `source` on the `on` pairs,
// prefixing the right-hand columns with `alias.`.
struct Join {
std::string alias;
std::string source;
std::vector<std::pair<std::string, std::string>> on; // {left_col, right_col}
JoinStrategy strategy = JoinStrategy::Left;
std::vector<std::string> fields; // empty = all columns of the right side
};
// Pure: resuelve indice del main entre `tables` segun `main_source`.
// Vacio -> 0. Nombre desconocido -> 0. tables vacio -> -1.
int resolve_main_idx(const std::vector<TableInput>& tables, const std::string& main_source);
// Pure: aplica un join sobre dos tablas. Resultado: StageOutput con
// `headers` = left + `<alias>.<right_col>` (filtrado por fields si no vacio).
// Pure: aplica un join sobre dos tablas.
StageOutput join_tables(const char* const* left_cells, int left_rows, int left_cols,
const std::vector<std::string>& left_headers,
const std::vector<ColumnType>& left_types,
const TableInput& right,
const Join& jn);
// ViewConfig: manual overrides of auto-detection for the active view.
// Empty fields -> auto. If a column name does not exist in the output, the
// viz falls back to auto.
struct ViewConfig {
std::string x_col; // single: scatter, line, hist2d
std::vector<std::string> y_cols; // 1..N: line/area/bar/etc
std::string size_col; // bubble
std::string cat_col; // bar/pie/funnel/box override
unsigned int primary_color = 0; // 0 = ImPlot auto
int hist_bins = 0; // 0 = Sturges
float pie_radius = 0.0f; // 0 = default
bool show_legend = true;
bool show_markers = false; // line/area markers
bool locked = false; // disable pan/zoom
mutable bool fit_request = false; // consumed by viz::render
};
// ----------------------------------------------------------------------------
// Drill apply/undo (fase 10).
// ----------------------------------------------------------------------------
bool apply_drill_step(State& st, const DrillStep& step);
bool undo_drill_step(State& st, const DrillStep& step);
// VizPanel: an additional viz over the same StageOutput. State.display +
// viz_config form panel 0 (always visible); extra_panels are the ones added
// by the user.
struct VizPanel {
ViewMode display = ViewMode::Bar;
ViewConfig config;
// Remembers the last non-Table display for the Table<->View toggle.
mutable ViewMode last_non_table = ViewMode::Bar;
};
// Pure (fase 10): drill-up. Decrementa active_stage si > 0.
bool drill_up(State& st);
// State: stage pipeline + viz globales.
//
// `stages` siempre tiene tamaño >= 1 (auto-init en compute_visible_rows / render
// si esta vacio: se crea stages[0] vacio). Stage 0 es Raw (filters + derived +
// sorts; SIN breakouts/aggregations). Stages 1+ pueden agrupar.
//
// `active_stage` = indice del stage cuyo output se renderiza.
// `col_visible/col_order/color_rules` aplican al output del stage activo.
struct State {
std::vector<Stage> stages;
int active_stage = 0;
ViewMode display = ViewMode::Table;
ViewConfig viz_config;
std::vector<VizPanel> extra_panels;
std::vector<Join> joins; // aplicado antes de stages[0]
std::string main_source; // name de TableInput a usar como main; vacio -> tables[0]
// Pure (fase 10): serializa una fila a TSV.
std::string row_to_tsv(const char* const* cells, int rows, int cols,
int row_idx, const std::vector<std::string>& headers);
std::vector<ColorRule> color_rules;
std::vector<bool> col_visible; // size = effective_cols del stage activo
std::vector<int> col_order; // permutacion [0..effective_cols)
// Pure (fase 10): construye filters Op::Eq desde una fila.
std::vector<Filter> build_filters_from_row(const char* const* cells, int rows,
int cols, int row_idx);
// --- Compat helpers: shortcuts a stages[0] (Raw) ---
// Util tras refactor para tests / accesos puntuales. Garantizan stages[0]
// existe (lo crean vacio si no).
Stage& raw();
const Stage& raw() const;
Stage& active();
const Stage& active_const() const;
void ensure_stage0();
};
// ----------------------------------------------------------------------------
// Date granularity helpers (fase 10).
// ----------------------------------------------------------------------------
const char* date_granularity_token(DateGranularity g);
DateGranularity date_granularity_from_token(const char* s);
// Parse "1.23" -> 1.23, true. False si la celda no es numero completo.
DateGranularity parse_breakout_granularity(const std::string& breakout,
std::string& col_out);
std::string compose_breakout(const std::string& col, DateGranularity g);
void column_min_max(const char* const* cells, int rows, int cols, int col_idx,
std::string& min_out, std::string& max_out);
// Hit-tests para click-to-drill sobre charts (fase 10).
int nearest_index_1d(double target, const double* xs, int n);
int nearest_index_2d(double tx, double ty,
const double* xs, const double* ys, int n);
double pie_angle(double cx, double cy, double mx, double my);
int pie_slice_at_angle(double angle, const double* sums, int n);
void heatmap_cell_at(double px, double py, int rows, int cols,
int& row_out, int& col_out);
// Date trunc + auto + presets.
std::string truncate_date(const std::string& date, DateGranularity g);
DateGranularity auto_date_granularity(const std::string& min_ymd,
const std::string& max_ymd);
const char* filter_preset_label(FilterPreset p);
std::vector<Filter> build_preset_filters(FilterPreset preset, int col,
const std::string& today_ymd);
// ----------------------------------------------------------------------------
// Misc helpers.
// ----------------------------------------------------------------------------
bool parse_number(const char* s, double& out);
// Compara dos celdas con operador. Numerico si ambas parseables; lexical si no.
bool compare(const char* a, const char* b, Op op);
// Aplica filtros y ordena. Devuelve indices de filas visibles.
std::vector<int> compute_visible_rows(const char* const* cells,
int rows, int cols,
const State& st);
// Pure: muta col_order de st para colocar `src` en la posicion (en orden visual)
// donde estaba `dst`. No-op si src == dst o cualquiera fuera del array.
void reorder_column(State& st, int src, int dst);
// Pure: dado un buffer y posicion de cursor, busca el `[` abierto sin cerrar
// mas reciente. Devuelve su indice (o -1 si ninguno). Rellena `filter_text`
// con los caracteres entre `[` y cursor.
// Para autocomplete de formulas: cuando el usuario teclea `[` el ImGui callback
// detecta esto y muestra un popup con cols disponibles.
int find_open_bracket(const char* buf, int len, int cursor, std::string& filter_text);
// Pure: reemplaza src[start..cursor) por "[name]". Devuelve nuevo string y
// actualiza `new_cursor` a la posicion despues del `]`.
std::string insert_column_ref(const std::string& src, int start, int cursor,
const std::string& name, int& new_cursor);
// CSV: escapa una celda segun RFC 4180 (wrap en " si contiene , " o newline).
std::string csv_escape(const char* s);
// Construye TSV de un rect de seleccion. Headers SIEMPRE incluidos.
// view_row_lo/hi: indices en visible_rows.
// view_col_lo/hi: indices en col_order. Cols ocultas se omiten.
std::string build_tsv(const char* const* cells, int rows, int cols,
const char* const* headers,
const std::vector<int>& col_order,
@@ -327,19 +170,21 @@ std::string build_tsv(const char* const* cells, int rows, int cols,
int view_row_lo, int view_row_hi,
int view_col_lo, int view_col_hi);
// Construye CSV (full visible view). Headers incluidos, cells escapados.
std::string build_csv(const char* const* cells, int rows, int cols,
const char* const* headers,
const std::vector<int>& col_order,
const std::vector<bool>& col_visible,
const std::vector<int>& visible_rows);
// ----------------------------------------------------------------------------
// Column statistics (no movido todavia al registry).
// ----------------------------------------------------------------------------
struct ColStats {
int total = 0; // filas escaneadas
int empty_count = 0; // cells == "" o null
int unique_count = 0; // distintas (cap configurable)
bool unique_capped = false; // true si se alcanzo el cap
bool numeric = false; // true si todas las cells no-vacias parsean como numero
int total = 0;
int empty_count = 0;
int unique_count = 0;
bool unique_capped = false;
bool numeric = false;
int numeric_count = 0;
double min = 0;
double max = 0;
@@ -348,16 +193,12 @@ struct ColStats {
double p25 = 0;
double p50 = 0;
double p75 = 0;
std::vector<float> hist; // bins (HIST_BINS) si numeric
std::vector<std::pair<std::string,int>> top_categories; // top 8 por count desc
std::vector<float> hist;
std::vector<std::pair<std::string,int>> top_categories;
};
constexpr int HIST_BINS = 24;
// Pure: escanea una columna y devuelve estadisticas. `unique_cap` corta el
// conteo de unicos si excede (para datasets de millones). 0 = sin cap.
// Si `indices != nullptr` y `n_indices > 0`, recorre solo las filas indicadas
// (uso tipico: stats sobre filas visibles post-filtro).
ColStats compute_column_stats(const char* const* cells, int rows, int cols,
int col, int unique_cap = 100000,
const int* indices = nullptr, int n_indices = 0);
@@ -0,0 +1,295 @@
// llm_anthropic.cpp — minimal Anthropic client that shells out to the curl
// CLI (std::system plus temp files; popen is only used to read the key from `pass`).
// See issue 0080.
#include "llm_anthropic.h"
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <sstream>
#include <string>
namespace llm_anthropic {
using namespace data_table;
namespace {
// JSON escape minimal.
// Escapes `s` for embedding inside a JSON string literal: quote, backslash
// and the control characters with a short form get their two-character
// escape, any other byte below 0x20 becomes \u00XX, and everything else
// (including bytes >= 0x80) is copied through unchanged.
std::string json_escape(const std::string& s) {
    std::string escaped;
    escaped.reserve(s.size() + 8);

    for (std::size_t idx = 0; idx < s.size(); ++idx) {
        const char ch = s[idx];

        // Two-character escapes first.
        const char* shorthand = nullptr;
        if (ch == '"') {
            shorthand = "\\\"";
        } else if (ch == '\\') {
            shorthand = "\\\\";
        } else if (ch == '\n') {
            shorthand = "\\n";
        } else if (ch == '\r') {
            shorthand = "\\r";
        } else if (ch == '\t') {
            shorthand = "\\t";
        } else if (ch == '\b') {
            shorthand = "\\b";
        } else if (ch == '\f') {
            shorthand = "\\f";
        }

        if (shorthand != nullptr) {
            escaped += shorthand;
            continue;
        }

        const unsigned char byte = (unsigned char)ch;
        if (byte >= 0x20) {
            escaped += ch;
            continue;
        }

        // Remaining control characters: generic \uXXXX form.
        char hex[8];
        std::snprintf(hex, sizeof(hex), "\\u%04x", (int)byte);
        escaped += hex;
    }

    return escaped;
}
// Lower-case type name used when describing the schema in the prompt;
// "?" for a value outside the enum.
const char* col_type_doc(ColumnType t) {
    const char* name = "?";
    switch (t) {
        case ColumnType::Auto:
            name = "auto";
            break;
        case ColumnType::String:
            name = "string";
            break;
        case ColumnType::Int:
            name = "int";
            break;
        case ColumnType::Float:
            name = "float";
            break;
        case ColumnType::Bool:
            name = "bool";
            break;
        case ColumnType::Date:
            name = "date";
            break;
        case ColumnType::Json:
            name = "json";
            break;
    }
    return name;
}
std::string build_schema_block(const AskInput& in) {
std::ostringstream os;
os << "Available columns (stage 0 input):\n";
for (size_t i = 0; i < in.col_names.size(); ++i) {
os << " - " << in.col_names[i] << ": "
<< col_type_doc(i < in.col_types.size() ? in.col_types[i] : ColumnType::String)
<< "\n";
}
if (!in.joinable_names.empty()) {
os << "Joinable tables (for join clause):\n";
for (const auto& n : in.joinable_names) os << " - " << n << "\n";
}
return os.str();
}
// System prompt for the requested output language: the TQL (Lua table)
// instructions for OutputMode::TQL, the DuckDB SQL instructions otherwise.
std::string build_system_prompt(OutputMode mode) {
    if (mode != OutputMode::TQL) {
        static const char* const kSqlPrompt =
            "You are a DuckDB SQL expert. Output ONLY a SQL code block compatible with DuckDB.\n"
            "Use CTEs to chain stages. Use date_trunc('month', col) for granularity.\n"
            "Use quantile_cont(col, p) for percentiles. Use ? for bound params.\n"
            "Joins: LEFT/INNER/RIGHT/FULL OUTER JOIN. String concat: ||. Aggregations: standard SQL.\n"
            "Output format: ```sql\\n...\\n```";
        return kSqlPrompt;
    }

    static const char* const kTqlPrompt =
        "You are a TQL (Table Query Language) expert. Output ONLY a Lua code block. "
        "TQL is a Lua table with shape:\n"
        " return { version=1, display=\"table\"|\"bar\"|\"line\"|...,\n"
        " main_source=\"name\", joins={ {alias,source,on,strategy,fields},... },\n"
        " stages={ {filter={{op,col,value},...}, breakout={...}, aggregation={...}, sort={...} },... },\n"
        " columns={ name = {type=\"int|float|...\", formula=\"[col]+1\"},... }\n"
        " }\n"
        "Stage 0 = Raw (filters + derived + sort, NO breakouts/aggs).\n"
        "Stage 1+ groups (breakouts + aggregations).\n"
        "Breakout granularity: append :year|:month|:week|:day|:hour to col name.\n"
        "Aggregation functions: count|sum|avg|min|max|distinct|stddev|median|p25|p75|p90|p99|percentile.\n"
        "Filter ops: '='|'!='|'<'|'<='|'>'|'>='|'contains'|'!contains'|'starts'|'ends'.\n"
        "Sort: {{dir, col}, ...} where dir = 'asc'|'desc'.\n"
        "Join strategies: 'left'|'inner'|'right'|'full'.\n"
        "Formulas use Lua expression syntax with [col] for column refs.\n"
        "Output format: ```lua\\n...\\n```";
    return kTqlPrompt;
}
} // anon
// Builds the complete JSON body for POST /v1/messages: model, max_tokens,
// the system prompt for the requested mode, and a single user message made
// of the question, the schema block and (when present) the current TQL.
// An empty `in.model` selects the default model.
std::string build_request_body(const AskInput& in) {
    const std::string system_text = build_system_prompt(in.mode);

    std::string user_text = "Question: " + in.question + "\n\n" + build_schema_block(in) + "\n";
    if (!in.tql_current.empty()) {
        user_text += "Current TQL:\n```lua\n" + in.tql_current + "\n```\n";
    }

    const std::string model_name = in.model.empty() ? "claude-sonnet-4-6" : in.model;

    std::ostringstream json;
    json << "{";
    json << "\"model\":\"" << json_escape(model_name) << "\",";
    json << "\"max_tokens\":" << in.max_tokens << ",";
    json << "\"system\":\"" << json_escape(system_text) << "\",";
    json << "\"messages\":[{";
    json << "\"role\":\"user\",";
    json << "\"content\":\"" << json_escape(user_text) << "\"";
    json << "}]";
    json << "}";
    return json.str();
}
// Extracts the contents of the first fenced code block in `raw`.
//
// A fence tagged with `lang` ("```lua", "```sql") is preferred wherever it
// appears; otherwise the first plain "```" fence is used and an alphanumeric
// language tag right after it is skipped. The remainder of the opening fence
// line (blanks, then LF or CRLF) is dropped, the block ends at the next
// "```" (or at the end of the input when the fence is never closed), and
// trailing line breaks are trimmed.
//
// When `raw` contains no fence at all, `raw` is returned with leading and
// trailing whitespace stripped.
std::string extract_code_block(const std::string& raw, const std::string& lang) {
    const std::string tagged_fence = "```" + lang;
    const std::size_t n = raw.size();

    std::size_t code_start = std::string::npos;
    const std::size_t tagged_at = raw.find(tagged_fence);
    if (tagged_at != std::string::npos) {
        code_start = tagged_at + tagged_fence.size();
    } else {
        const std::size_t plain_at = raw.find("```");
        if (plain_at != std::string::npos) {
            code_start = plain_at + 3;
            // Skip an optional language tag we were not asked for.
            while (code_start < n && std::isalnum((unsigned char)raw[code_start])) {
                ++code_start;
            }
        }
    }

    if (code_start == std::string::npos) {
        // No fence: the whole answer is the code, minus surrounding whitespace.
        std::size_t first = 0;
        while (first < n && std::isspace((unsigned char)raw[first])) ++first;
        std::size_t last = n;
        while (last > first && std::isspace((unsigned char)raw[last - 1])) --last;
        return raw.substr(first, last - first);
    }

    // Drop what is left of the opening fence line. Without this a "\r\n" line
    // ending or trailing blanks after the tag would leak into the code.
    while (code_start < n && (raw[code_start] == ' ' || raw[code_start] == '\t')) {
        ++code_start;
    }
    if (code_start + 1 < n && raw[code_start] == '\r' && raw[code_start + 1] == '\n') {
        code_start += 2;
    } else if (code_start < n && raw[code_start] == '\n') {
        ++code_start;
    }

    std::size_t code_end = raw.find("```", code_start);
    if (code_end == std::string::npos) {
        code_end = n;  // unterminated fence: take everything that follows
    }

    std::string code = raw.substr(code_start, code_end - code_start);
    while (!code.empty() && (code.back() == '\n' || code.back() == '\r')) {
        code.pop_back();
    }
    return code;
}
// Extracts the first string value stored under a "text" key in `json` and
// returns it with JSON escapes decoded. Returns "" when no such value exists
// (for example on an error response).
//
// This is a targeted scan, not a JSON parser: it looks for the literal
// `"text"`, skips blanks and the colon, and decodes the string that follows.
// Occurrences of `"text"` that are not followed by a string (such as the
// value in `"type":"text"`) are skipped.
//
// Decoding rules:
//   \n \t \r \b \f \" \\ \/  -> the character they denote
//   \uXXXX                   -> UTF-8 encoding of the code point; a surrogate
//                               pair is combined, a lone surrogate becomes '?'
//   any other \c             -> c
std::string parse_response_text(const std::string& json) {
    static const char kKey[] = "\"text\"";
    const std::size_t key_len = sizeof(kKey) - 1;
    const std::size_t n = json.size();

    // Value of one hex digit; malformed digits count as 0.
    const auto hex_digit = [](char c) -> unsigned long {
        if (c >= '0' && c <= '9') return (unsigned long)(c - '0');
        if (c >= 'a' && c <= 'f') return (unsigned long)(c - 'a' + 10);
        if (c >= 'A' && c <= 'F') return (unsigned long)(c - 'A' + 10);
        return 0;
    };
    // Reads the four hex digits at json[pos..pos+3]; caller checks bounds.
    const auto read_hex4 = [&](std::size_t pos) -> unsigned long {
        unsigned long v = 0;
        for (int k = 0; k < 4; ++k) v = v * 16 + hex_digit(json[pos + k]);
        return v;
    };
    // Appends the UTF-8 encoding of `cp` to `dst`.
    const auto append_utf8 = [](std::string& dst, unsigned long cp) {
        if (cp < 0x80) {
            dst += (char)cp;
        } else if (cp < 0x800) {
            dst += (char)(0xC0 | (cp >> 6));
            dst += (char)(0x80 | (cp & 0x3F));
        } else if (cp < 0x10000) {
            dst += (char)(0xE0 | (cp >> 12));
            dst += (char)(0x80 | ((cp >> 6) & 0x3F));
            dst += (char)(0x80 | (cp & 0x3F));
        } else {
            dst += (char)(0xF0 | (cp >> 18));
            dst += (char)(0x80 | ((cp >> 12) & 0x3F));
            dst += (char)(0x80 | ((cp >> 6) & 0x3F));
            dst += (char)(0x80 | (cp & 0x3F));
        }
    };

    for (std::size_t key = json.find(kKey); key != std::string::npos;
         key = json.find(kKey, key + 1)) {
        std::size_t i = key + key_len;
        while (i < n && (json[i] == ' ' || json[i] == ':' || json[i] == '\t' ||
                         json[i] == '\n' || json[i] == '\r')) {
            ++i;
        }
        if (i >= n || json[i] != '"') {
            continue;  // "text" used as a value, or not followed by a string
        }
        ++i;

        std::string out;
        while (i < n && json[i] != '"') {
            if (json[i] != '\\' || i + 1 >= n) {
                out += json[i++];
                continue;
            }

            const char esc = json[i + 1];
            i += 2;  // consume the backslash and the escape letter
            switch (esc) {
                case 'n': out += '\n'; break;
                case 't': out += '\t'; break;
                case 'r': out += '\r'; break;
                case 'b': out += '\b'; break;
                case 'f': out += '\f'; break;
                case 'u': {
                    if (i + 4 > n) {
                        out += esc;  // truncated escape: keep the letter
                        break;
                    }
                    unsigned long cp = read_hex4(i);
                    i += 4;  // exactly the four digits, nothing more
                    const bool high = cp >= 0xD800 && cp <= 0xDBFF;
                    if (high && i + 6 <= n && json[i] == '\\' && json[i + 1] == 'u') {
                        const unsigned long low = read_hex4(i + 2);
                        if (low >= 0xDC00 && low <= 0xDFFF) {
                            cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00);
                            i += 6;
                        }
                    }
                    if (cp >= 0xD800 && cp <= 0xDFFF) {
                        out += '?';  // lone surrogate: not encodable
                    } else {
                        append_utf8(out, cp);
                    }
                    break;
                }
                default:
                    out += esc;  // covers \" \\ \/ and unknown escapes
                    break;
            }
        }
        return out;
    }
    return "";
}
namespace {
// Lee API key segun prioridad: param > env FN_LLM_API_KEY > pass anthropic/api-key.
// Resolves the API key, first match wins:
//   1. `provided`, when non-empty
//   2. environment variable FN_LLM_API_KEY, when set and non-empty
//   3. first line printed by `pass anthropic/api-key`
// Returns "" when none of them yields a key.
std::string resolve_api_key(const std::string& provided) {
    if (!provided.empty()) {
        return provided;
    }

    if (const char* from_env = std::getenv("FN_LLM_API_KEY")) {
        if (*from_env != '\0') {
            return from_env;
        }
    }

    FILE* pipe = popen("pass anthropic/api-key 2>/dev/null | head -n1", "r");
    if (pipe == nullptr) {
        return "";
    }

    std::string key;
    char chunk[256];
    while (fgets(chunk, sizeof(chunk), pipe) != nullptr) {
        key += chunk;
    }
    pclose(pipe);

    // Drop the trailing line break(s) left by the command.
    key.erase(key.find_last_not_of("\r\n") + 1);
    return key;
}
} // anon
// Posts `body` to the Anthropic messages endpoint by running the curl CLI and
// returns the raw response body. On failure returns "" and describes the
// problem in `error_out` (which is cleared on entry).
//
//   - FN_LLM_MOCK_RESPONSE (non-empty) short-circuits everything and is
//     returned as the response; used for test injection.
//   - The key comes from resolve_api_key(api_key).
//   - FN_LLM_ENDPOINT (non-empty) overrides the default endpoint URL.
//
// The request body goes through a temp file and curl's output is redirected
// to a second temp file, because a bidirectional popen is not portable.
//
// NOTE(review): std::tmpnam only produces a name, so another process can
// create the file first, and the key is visible in curl's argv while it
// runs. Moving to mkstemp and `curl -H @file` would close both gaps but
// needs platform-specific code.
std::string call_api(const std::string& body, const std::string& api_key,
                     std::string& error_out) {
    error_out.clear();

    // Test injection.
    const char* mock = std::getenv("FN_LLM_MOCK_RESPONSE");
    if (mock && *mock) return mock;

    const std::string key = resolve_api_key(api_key);
    if (key.empty()) {
        error_out = "no API key (set FN_LLM_API_KEY env, pass param, or `pass anthropic/api-key`)";
        return "";
    }

    const char* endpoint_env = std::getenv("FN_LLM_ENDPOINT");
    const std::string endpoint = endpoint_env && *endpoint_env
        ? endpoint_env
        : "https://api.anthropic.com/v1/messages";

    // The key and endpoint are placed between double quotes in a shell
    // command. Reject anything that could close the quotes or trigger an
    // expansion instead of trying to escape it.
    const auto safe_in_double_quotes = [](const std::string& s) {
        for (char c : s) {
            const unsigned char u = (unsigned char)c;
            if (u < 0x20 || u == 0x7f) return false;
            if (c == '"' || c == '\\' || c == '$' || c == '`') return false;
        }
        return true;
    };
    if (!safe_in_double_quotes(key)) {
        error_out = "API key contains characters that are not safe in a shell command";
        return "";
    }
    if (!safe_in_double_quotes(endpoint)) {
        error_out = "endpoint contains characters that are not safe in a shell command";
        return "";
    }

    // tmpnam may return null, and it reuses one static buffer: copy each
    // name before asking for the next.
    const char* name_in = std::tmpnam(nullptr);
    const std::string tmp_in = name_in ? name_in : "";
    const char* name_out = std::tmpnam(nullptr);
    const std::string tmp_out = name_out ? name_out : "";
    if (tmp_in.empty() || tmp_out.empty()) {
        error_out = "tmp file name fail";
        return "";
    }

    {
        FILE* f = std::fopen(tmp_in.c_str(), "w");
        if (!f) { error_out = "tmp file write fail"; return ""; }
        const bool wrote = std::fwrite(body.data(), 1, body.size(), f) == body.size();
        const bool closed = std::fclose(f) == 0;
        if (!wrote || !closed) {
            // A truncated request must not be sent.
            std::remove(tmp_in.c_str());
            error_out = "tmp file write fail";
            return "";
        }
    }

    // stderr is merged into the output file so curl's own message (-sS) can
    // be reported when it exits non-zero.
    const std::string cmd = "curl -sS -X POST "
                            "-H \"content-type: application/json\" "
                            "-H \"anthropic-version: 2023-06-01\" "
                            "-H \"x-api-key: " + key + "\" "
                            "--data-binary \"@" + tmp_in + "\" "
                            "\"" + endpoint + "\""
                            " > \"" + tmp_out + "\" 2>&1";
    const int rc = std::system(cmd.c_str());

    std::string resp;
    bool read_ok = false;
    {
        FILE* f = std::fopen(tmp_out.c_str(), "r");
        if (f) {
            read_ok = true;
            char buf[4096];
            size_t n;
            while ((n = std::fread(buf, 1, sizeof(buf), f)) > 0) resp.append(buf, n);
            std::fclose(f);
        }
    }
    std::remove(tmp_in.c_str());
    std::remove(tmp_out.c_str());

    if (rc != 0) {
        error_out = "curl exit " + std::to_string(rc) + ": " + resp;
        return "";
    }
    if (!read_ok) {
        error_out = "tmp file read fail";
        return "";
    }
    return resp;
}
// Convenience wrapper: builds the request, calls the API and extracts the
// answer. When call_api reports an error the result carries only `error`;
// otherwise `raw` holds the response text and `code` the fenced block
// ("lua" for TQL mode, "sql" otherwise) taken from it.
AskResult ask(const AskInput& in, const std::string& api_key) {
    AskResult result;

    const std::string response = call_api(build_request_body(in), api_key, result.error);
    if (result.error.empty()) {
        result.raw = parse_response_text(response);
        const std::string fence_lang = in.mode == OutputMode::TQL ? "lua" : "sql";
        result.code = extract_code_block(result.raw, fence_lang);
    }
    return result;
}
} // namespace llm_anthropic
@@ -0,0 +1,58 @@
// llm_anthropic: minimal HTTP client for the Anthropic Claude API.
// No library dependencies: it shells out to the curl CLI.
// See issue 0080.
#pragma once
#include "data_table_logic.h"
#include "tql_to_sql.h"
#include <string>
#include <vector>
namespace llm_anthropic {
// Language the model is asked to answer in: a TQL Lua table or DuckDB SQL.
enum class OutputMode { TQL, SQL };
// Everything needed to build one request: the question, the current query
// and the schema the model may refer to.
struct AskInput {
std::string question; // natural-language question
std::string tql_current; // current TQL (as emitted); added to the prompt when non-empty
std::vector<std::string> col_names; // input schema: column names
std::vector<data_table::ColumnType> col_types; // column types, by the same index as col_names
std::vector<std::string> joinable_names; // tables available for joins
OutputMode mode = OutputMode::TQL;
std::string model; // empty -> default model
int max_tokens = 8192;
};
// Outcome of ask().
struct AskResult {
std::string code; // extracted ```lua or ```sql block (without the fences)
std::string raw; // full text of the response
std::string error; // non-empty on failure
// NOTE(review): ask() does not appear to fill the token counters; they
// keep their zero defaults — confirm before relying on them.
int tokens_in = 0;
int tokens_out = 0;
};
// Pure: construye el system prompt y user message JSON-escapado.
// Devuelve el JSON body completo POST al endpoint /v1/messages.
std::string build_request_body(const AskInput& in);
// Pure: extrae primer ```<lang>\n ... \n``` bloque de `raw`. lang = "lua"|"sql".
// Si no encuentra fence, retorna raw stripped.
std::string extract_code_block(const std::string& raw, const std::string& lang);
// Pure: extrae texto del JSON de respuesta Anthropic.
// Busca `"content":[{"type":"text","text":"..."}]` y devuelve el text.
std::string parse_response_text(const std::string& json_body);
// Impure: runs the curl CLI (std::system, with temp files for the request and
// the response), posts `body` to the Anthropic /v1/messages endpoint and
// returns the response body (raw JSON). The API key is read from:
// 1. the `api_key` parameter, if non-empty
// 2. env FN_LLM_API_KEY
// 3. `pass anthropic/api-key | head -n1`
// If env FN_LLM_MOCK_RESPONSE is set, its value is returned instead (test injection).
std::string call_api(const std::string& body, const std::string& api_key,
std::string& error_out);
// Orchestrator: build prompt + POST + parse. Convenience wrapper.
AskResult ask(const AskInput& in, const std::string& api_key = "");
} // namespace llm_anthropic
@@ -7,9 +7,12 @@
// Exit 0 = todos los checks pasan, 1 = falla.
#include "data_table_logic.h"
#include "llm_anthropic.h"
#include "lua_engine.h"
#include "tql.h"
#include "tql_to_sql.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
@@ -2051,6 +2054,782 @@ return {
check(join_strategy_from_token("nope") == JoinStrategy::Left, "phase9: parse fallback left");
}
// === phase10: drill extendido ===
{
// truncate_date — granularities sobre 2026-05-12 (martes).
std::string d = "2026-05-12";
check(truncate_date(d, DateGranularity::Year) == "2026", "phase10: trunc year");
check(truncate_date(d, DateGranularity::Month) == "2026-05", "phase10: trunc month");
check(truncate_date(d, DateGranularity::Day) == "2026-05-12", "phase10: trunc day");
check(truncate_date(d, DateGranularity::Week) == "2026-05-11", "phase10: trunc week (Mon)");
check(truncate_date("2026-05-12T14:33:01", DateGranularity::Hour) == "2026-05-12T14",
"phase10: trunc hour");
check(truncate_date("not-a-date", DateGranularity::Month) == "not-a-date",
"phase10: trunc passthrough invalido");
check(truncate_date(d, DateGranularity::None) == d, "phase10: trunc None == identidad");
}
{
// auto_date_granularity
check(auto_date_granularity("2024-01-01", "2026-05-12") == DateGranularity::Year,
"phase10: auto year >2y");
check(auto_date_granularity("2026-01-01", "2026-05-12") == DateGranularity::Month,
"phase10: auto month >60d");
check(auto_date_granularity("2026-04-15", "2026-05-12") == DateGranularity::Week,
"phase10: auto week >14d");
check(auto_date_granularity("2026-05-05", "2026-05-12") == DateGranularity::Day,
"phase10: auto day <=14d");
check(auto_date_granularity("bad", "2026-05-12") == DateGranularity::Day,
"phase10: auto fallback day");
}
{
// parse_breakout_granularity
std::string col;
check(parse_breakout_granularity("ts:month", col) == DateGranularity::Month,
"phase10: parse breakout month");
check(col == "ts", "phase10: parse breakout col stripped");
check(parse_breakout_granularity("ts", col) == DateGranularity::None,
"phase10: parse breakout sin sufijo None");
check(col == "ts", "phase10: col sin sufijo intacto");
check(parse_breakout_granularity("ts:wat", col) == DateGranularity::None,
"phase10: sufijo desconocido None");
check(col == "ts:wat", "phase10: col preserva sufijo desconocido");
}
{
// compose_breakout
check(compose_breakout("ts", DateGranularity::None) == "ts", "phase10: compose None");
check(compose_breakout("ts", DateGranularity::Month) == "ts:month", "phase10: compose month");
check(compose_breakout("ts", DateGranularity::Year) == "ts:year", "phase10: compose year");
// round-trip parse(compose)
std::string col;
auto g = parse_breakout_granularity(compose_breakout("foo", DateGranularity::Week), col);
check(g == DateGranularity::Week && col == "foo", "phase10: compose+parse round-trip");
}
{
// column_min_max
const char* cells[] = {
"2026-03-01",
"2026-01-15",
"",
"2026-05-12",
"2026-02-22",
};
std::string lo, hi;
column_min_max(cells, 5, 1, 0, lo, hi);
check(lo == "2026-01-15" && hi == "2026-05-12", "phase10: column_min_max ISO ordena lexical");
const char* empty_cells[] = {"", "", ""};
column_min_max(empty_cells, 3, 1, 0, lo, hi);
check(lo.empty() && hi.empty(), "phase10: column_min_max sin datos -> vacio");
column_min_max(cells, 5, 1, 5, lo, hi); // col fuera de rango
check(lo.empty() && hi.empty(), "phase10: column_min_max col fuera de rango -> vacio");
}
{
// tokens round-trip granularity
check(date_granularity_from_token("year") == DateGranularity::Year, "phase10: token year");
check(date_granularity_from_token("month") == DateGranularity::Month, "phase10: token month");
check(date_granularity_from_token("week") == DateGranularity::Week, "phase10: token week");
check(date_granularity_from_token("day") == DateGranularity::Day, "phase10: token day");
check(date_granularity_from_token("hour") == DateGranularity::Hour, "phase10: token hour");
check(date_granularity_from_token("nope") == DateGranularity::None, "phase10: token fallback None");
check(std::string(date_granularity_token(DateGranularity::Month)) == "month",
"phase10: emit month");
check(std::string(date_granularity_token(DateGranularity::None)) == "",
"phase10: emit None empty");
}
{
// build_preset_filters
auto f7 = build_preset_filters(FilterPreset::Last7d, 2, "2026-05-12");
check(f7.size() == 1, "phase10: Last7d -> 1 filter");
check(f7[0].col == 2 && f7[0].op == Op::Gte && f7[0].value == "2026-05-05",
"phase10: Last7d -> Gte 2026-05-05");
auto f30 = build_preset_filters(FilterPreset::Last30d, 2, "2026-05-12");
check(f30[0].value == "2026-04-12", "phase10: Last30d -> 2026-04-12");
auto f90 = build_preset_filters(FilterPreset::Last90d, 2, "2026-05-12");
check(f90[0].value == "2026-02-11", "phase10: Last90d -> 2026-02-11");
auto fn0 = build_preset_filters(FilterPreset::ExcludeNulls, 3, "");
check(fn0.size() == 1 && fn0[0].op == Op::Neq && fn0[0].value == "",
"phase10: ExcludeNulls -> Neq ''");
auto fnz = build_preset_filters(FilterPreset::NonZero, 4, "");
check(fnz.size() == 2, "phase10: NonZero -> 2 filters");
check(fnz[0].op == Op::Neq && fnz[0].value == "" &&
fnz[1].op == Op::Neq && fnz[1].value == "0",
"phase10: NonZero -> Neq '' AND Neq '0'");
auto fbad = build_preset_filters(FilterPreset::Last7d, 2, "bad-date");
check(fbad.empty(), "phase10: Last7d con today invalido -> empty");
}
{
// TQL round-trip: breakout con sufijo :granularity.
State st0;
st0.stages.resize(2);
st0.stages[1].breakouts = {"ts:month"};
Aggregation a; a.fn = AggFn::Count; a.alias = "n";
st0.stages[1].aggregations.push_back(a);
std::vector<std::string> hdrs = {"ts", "amount"};
std::vector<ColumnType> tys = {ColumnType::Date, ColumnType::Float};
int eff = 2;
std::string text = tql::emit(st0, hdrs, tys);
check(text.find("\"ts:month\"") != std::string::npos,
"phase10 TQL: emit breakout granularity sufijo");
std::string err;
State st1;
bool ok = tql::apply(text, st1, hdrs, tys, nullptr, 2, eff, &err);
check(ok, "phase10 TQL: apply round-trip ok");
check(st1.stages.size() >= 2 && st1.stages[1].breakouts.size() == 1 &&
st1.stages[1].breakouts[0] == "ts:month",
"phase10 TQL: breakout granularity preservada");
}
{
// compute_stage aplica truncado de fecha cuando hay :granularity.
const char* cells[] = {
"2026-01-15", "10",
"2026-01-22", "20",
"2026-02-03", "30",
"2026-03-11", "40",
};
std::vector<std::string> hdrs = {"ts", "amount"};
std::vector<ColumnType> tys = {ColumnType::Date, ColumnType::Float};
Stage s1;
s1.breakouts = {"ts:month"};
Aggregation ag; ag.fn = AggFn::Count; ag.alias = "n";
s1.aggregations.push_back(ag);
auto out = compute_stage(cells, 4, 2, hdrs, tys, s1);
check(out.rows == 3, "phase10: trunc month -> 3 grupos (Jan/Feb/Mar)");
check(out.headers[0] == "ts:month", "phase10: header preserva sufijo");
// Verifica que algun valor de breakout es "2026-01"
bool found_jan = false;
for (int r = 0; r < out.rows; ++r) {
if (std::string(out.cells[r * out.cols + 0]) == "2026-01") found_jan = true;
}
check(found_jan, "phase10: trunc value '2026-01' presente");
}
// === phase10 hit-tests para click-to-drill ===
{
// nearest_index_1d
double xs[] = {0, 1, 2, 3, 4};
check(nearest_index_1d(0.0, xs, 5) == 0, "phase10 hit: nearest_1d exact 0");
check(nearest_index_1d(2.4, xs, 5) == 2, "phase10 hit: nearest_1d 2.4 -> 2");
check(nearest_index_1d(2.6, xs, 5) == 3, "phase10 hit: nearest_1d 2.6 -> 3");
check(nearest_index_1d(-1.0, xs, 5) == 0, "phase10 hit: nearest_1d clamp left");
check(nearest_index_1d(99.0, xs, 5) == 4, "phase10 hit: nearest_1d clamp right");
check(nearest_index_1d(0.0, nullptr, 0) == -1, "phase10 hit: nearest_1d empty -> -1");
}
{
// nearest_index_2d
double xs[] = {0, 10, 5, 5};
double ys[] = {0, 0, 10, 5};
check(nearest_index_2d(0.1, 0.1, xs, ys, 4) == 0, "phase10 hit: nearest_2d cerca de (0,0)");
check(nearest_index_2d(9.9, 0.0, xs, ys, 4) == 1, "phase10 hit: nearest_2d cerca de (10,0)");
check(nearest_index_2d(5.0, 4.9, xs, ys, 4) == 3, "phase10 hit: nearest_2d cerca de (5,5)");
check(nearest_index_2d(0, 0, nullptr, nullptr, 0) == -1, "phase10 hit: nearest_2d empty -> -1");
}
{
// pie_angle (convencion ImPlot: 0 = top, sentido horario)
const double PI = 3.14159265358979323846;
double a;
a = pie_angle(0.5, 0.5, 0.5, 0.0); // top
check(std::fabs(a - 0.0) < 1e-9, "phase10 hit: pie_angle top = 0");
a = pie_angle(0.5, 0.5, 1.0, 0.5); // right -> PI/2
check(std::fabs(a - PI/2) < 1e-9, "phase10 hit: pie_angle right = PI/2");
a = pie_angle(0.5, 0.5, 0.5, 1.0); // bottom -> PI
check(std::fabs(a - PI) < 1e-9, "phase10 hit: pie_angle bottom = PI");
a = pie_angle(0.5, 0.5, 0.0, 0.5); // left -> 3*PI/2
check(std::fabs(a - 3*PI/2) < 1e-9, "phase10 hit: pie_angle left = 3PI/2");
}
{
// pie_slice_at_angle: 4 slices iguales -> cada uno cubre PI/2.
double sums[] = {1.0, 1.0, 1.0, 1.0};
const double PI = 3.14159265358979323846;
check(pie_slice_at_angle(0.0, sums, 4) == 0, "phase10 hit: slice 0 (top)");
check(pie_slice_at_angle(PI/4, sums, 4) == 0, "phase10 hit: slice 0 (mid)");
check(pie_slice_at_angle(PI/2 + 0.1, sums, 4) == 1, "phase10 hit: slice 1");
check(pie_slice_at_angle(PI + 0.1, sums, 4) == 2, "phase10 hit: slice 2");
check(pie_slice_at_angle(3*PI/2 + 0.1, sums, 4) == 3, "phase10 hit: slice 3");
double zeros[] = {0.0, 0.0};
check(pie_slice_at_angle(0.5, zeros, 2) == -1, "phase10 hit: total 0 -> -1");
check(pie_slice_at_angle(0.0, nullptr, 0) == -1, "phase10 hit: empty -> -1");
double neg[] = {1.0, -1.0};
check(pie_slice_at_angle(0.5, neg, 2) == -1, "phase10 hit: neg sum -> -1");
}
{
// heatmap_cell_at
int rr, cc;
heatmap_cell_at(1.5, 2.5, 4, 3, rr, cc);
check(rr == 2 && cc == 1, "phase10 hit: heatmap (1.5,2.5) en 4x3 -> r2 c1");
heatmap_cell_at(-1, 0, 4, 3, rr, cc);
check(rr == -1 && cc == -1, "phase10 hit: heatmap fuera de rango");
heatmap_cell_at(0, 0, 0, 0, rr, cc);
check(rr == -1 && cc == -1, "phase10 hit: heatmap empty");
}
{
// E2E click-to-drill: simular pipeline stage1 agrupado, click en row idx 2.
State st;
st.stages.resize(2);
std::vector<std::string> hdrs = {"lang", "n"};
std::vector<ColumnType> tys = {ColumnType::String, ColumnType::Int};
st.stages[1].breakouts.push_back("lang");
st.stages[1].aggregations.push_back({AggFn::Count});
st.active_stage = 1;
// Stage 1 output simulado (3 grupos).
const char* g_cells[] = {
"go", "3",
"py", "2",
"cpp", "1",
};
StageOutput so;
so.cells.insert(so.cells.end(), g_cells, g_cells + 6);
so.rows = 3;
so.cols = 2;
so.headers = {"lang", "count"};
// Simular click en row idx 2 (cpp).
int clicked_row = 2;
int n_brk = (int)st.stages[1].breakouts.size();
check(n_brk == 1, "phase10 e2e: 1 breakout");
const char* v = so.cells[clicked_row * so.cols + 0];
std::string col_clean;
parse_breakout_granularity(so.headers[0], col_clean);
check(col_clean == "lang", "phase10 e2e: col_clean stripped OK");
st.stages[0].filters.push_back(make_drill_filter(0, v));
st.active_stage = 0;
check(st.active_stage == 0, "phase10 e2e: active retrocede a 0");
check(st.stages[0].filters.size() == 1, "phase10 e2e: 1 filter anadido");
check(st.stages[0].filters[0].col == 0 &&
st.stages[0].filters[0].op == Op::Eq &&
st.stages[0].filters[0].value == "cpp",
"phase10 e2e: filter Op::Eq col=0 value=cpp");
}
// === phase10 drill history (apply/undo step) ===
{
State st;
st.stages.resize(2);
st.active_stage = 1;
DrillStep step;
step.target_stage = 0;
step.filter_pos = 0;
step.prev_active_stage = 1;
step.added = make_drill_filter(0, "go");
check(apply_drill_step(st, step), "phase10 hist: apply ok");
check(st.stages[0].filters.size() == 1, "phase10 hist: filter anadido");
check(st.stages[0].filters[0].value == "go", "phase10 hist: value preservado");
check(st.active_stage == 0, "phase10 hist: active = target");
check(undo_drill_step(st, step), "phase10 hist: undo ok");
check(st.stages[0].filters.empty(), "phase10 hist: filter eliminado");
check(st.active_stage == 1, "phase10 hist: active restaurado");
// Redo
check(apply_drill_step(st, step), "phase10 hist: redo ok");
check(st.stages[0].filters.size() == 1, "phase10 hist: redo filter de vuelta");
check(st.active_stage == 0, "phase10 hist: redo active retorna");
// Edge: target fuera de rango
DrillStep bad;
bad.target_stage = 99;
check(!apply_drill_step(st, bad), "phase10 hist: apply fuera de rango -> false");
check(!undo_drill_step(st, bad), "phase10 hist: undo fuera de rango -> false");
// Edge: pos invalida
DrillStep bad_pos = step;
bad_pos.filter_pos = 99;
check(!undo_drill_step(st, bad_pos), "phase10 hist: undo pos invalida -> false");
}
// === phase10 drill history: back/forward stack semantics simulado ===
{
State st;
st.stages.resize(3);
st.active_stage = 2;
std::vector<DrillStep> back_stack;
std::vector<DrillStep> fwd_stack;
auto drill = [&](int from, int target, int pos, int col, const std::string& v) {
DrillStep s;
s.target_stage = target;
s.filter_pos = pos;
s.prev_active_stage = from;
s.added = make_drill_filter(col, v);
apply_drill_step(st, s);
back_stack.push_back(s);
fwd_stack.clear();
};
drill(2, 1, 0, 0, "go");
check(st.stages[1].filters.size() == 1, "phase10 hist seq: drill1 aplicado");
drill(1, 0, 0, 1, "10");
check(st.stages[0].filters.size() == 1, "phase10 hist seq: drill2 aplicado");
check(back_stack.size() == 2, "phase10 hist seq: back stack 2");
check(fwd_stack.empty(), "phase10 hist seq: forward limpio");
// Back x1
DrillStep s = back_stack.back(); back_stack.pop_back();
undo_drill_step(st, s);
fwd_stack.push_back(s);
check(st.stages[0].filters.empty(), "phase10 hist seq: back deshace drill2");
check(st.active_stage == 1, "phase10 hist seq: back restaura active=1");
check(fwd_stack.size() == 1, "phase10 hist seq: fwd stack 1");
// Forward x1
s = fwd_stack.back(); fwd_stack.pop_back();
apply_drill_step(st, s);
back_stack.push_back(s);
check(st.stages[0].filters.size() == 1, "phase10 hist seq: forward reaplica");
check(st.active_stage == 0, "phase10 hist seq: forward active=0");
}
// === phase10 row inspector (row_to_tsv + build_filters_from_row) ===
{
const char* cells[] = {
"go", "10", "filter",
"py", "20", "sma",
"go", "30", "map",
};
std::vector<std::string> hdrs = {"lang", "n", "fn"};
std::string tsv = row_to_tsv(cells, 3, 3, 1, hdrs);
check(tsv == "lang\tn\tfn\r\npy\t20\tsma\r\n",
"phase10 inspect: row_to_tsv layout");
check(row_to_tsv(cells, 3, 3, -1, hdrs).empty(), "phase10 inspect: tsv neg row -> empty");
check(row_to_tsv(cells, 3, 3, 5, hdrs).empty(), "phase10 inspect: tsv row oob -> empty");
check(row_to_tsv(cells, 3, 0, 0, hdrs).empty(), "phase10 inspect: tsv cols=0 -> empty");
auto fs = build_filters_from_row(cells, 3, 3, 0);
check(fs.size() == 3, "phase10 inspect: 3 filters de row 0");
check(fs[0].col == 0 && fs[0].op == Op::Eq && fs[0].value == "go",
"phase10 inspect: filter[0] col=0 op=Eq value=go");
check(fs[2].value == "filter", "phase10 inspect: filter[2] value=filter");
// Row con celda vacia -> filter saltado
const char* sparse[] = {"a", "", "c"};
auto fs2 = build_filters_from_row(sparse, 1, 3, 0);
check(fs2.size() == 2 && fs2[0].col == 0 && fs2[1].col == 2,
"phase10 inspect: cells vacios salteados");
check(build_filters_from_row(cells, 3, 3, -1).empty(),
"phase10 inspect: build_filters row invalido -> empty");
}
// === phase10 drill-up ===
{
State st;
st.stages.resize(3);
st.active_stage = 2;
check(drill_up(st), "phase10 up: 2->1 ok");
check(st.active_stage == 1, "phase10 up: active=1");
check(drill_up(st), "phase10 up: 1->0 ok");
check(st.active_stage == 0, "phase10 up: active=0");
check(!drill_up(st), "phase10 up: 0 -> false");
check(st.active_stage == 0, "phase10 up: queda en 0");
// Filters no se mueven
State st2;
st2.stages.resize(2);
st2.active_stage = 1;
st2.stages[1].filters.push_back({0, Op::Eq, "x"});
drill_up(st2);
check(st2.stages[0].filters.empty() && st2.stages[1].filters.size() == 1,
"phase10 up: filters quedan en su stage");
State empty_st;
check(!drill_up(empty_st), "phase10 up: stages vacio -> false");
}
// === phase11: Lua subset validator + transpiler ===
{
std::string err;
// Subset OK: literales + ops
std::string e1 = tql_to_sql::transpile_expr("1 + 2", {}, err);
check(err.empty() && e1.find("1 + 2") != std::string::npos,
"phase11 lua: literal arith");
std::string e2 = tql_to_sql::transpile_expr("[a] + [b] * 2", {}, err);
check(err.empty() && e2.find("\"a\"") != std::string::npos &&
e2.find("\"b\"") != std::string::npos,
"phase11 lua: col refs + arith");
std::string e3 = tql_to_sql::transpile_expr("[a] .. \"_\" .. [b]", {}, err);
check(err.empty() && e3.find(" || ") != std::string::npos,
"phase11 lua: concat -> ||");
std::string e4 = tql_to_sql::transpile_expr(
"if [n] > 10 then \"big\" else \"small\" end", {}, err);
check(err.empty() && e4.find("CASE WHEN") != std::string::npos &&
e4.find("THEN") != std::string::npos && e4.find("ELSE") != std::string::npos,
"phase11 lua: if/then/else -> CASE");
std::string e5 = tql_to_sql::transpile_expr("math.floor([x] / 100)", {}, err);
check(err.empty() && e5.find("floor(") != std::string::npos,
"phase11 lua: math.floor");
std::string e6 = tql_to_sql::transpile_expr("string.upper([name])", {}, err);
check(err.empty() && e6.find("upper(") != std::string::npos,
"phase11 lua: string.upper");
std::string e7 = tql_to_sql::transpile_expr("string.sub([s], 1, 3)", {}, err);
check(err.empty() && e7.find("substring(") != std::string::npos,
"phase11 lua: string.sub 3-arg");
std::string e8 = tql_to_sql::transpile_expr("not ([x] == nil)", {}, err);
check(err.empty() && e8.find("NOT") != std::string::npos && e8.find("NULL") != std::string::npos,
"phase11 lua: not + nil");
std::string e9 = tql_to_sql::transpile_expr("tonumber([n])", {}, err);
check(err.empty() && e9.find("CAST(") != std::string::npos,
"phase11 lua: tonumber -> CAST DOUBLE");
// Fuera subset: 9 categorias rechazadas
err.clear();
check(tql_to_sql::transpile_expr("function() return 1 end", {}, err).empty()
&& err.find("closures") != std::string::npos,
"phase11 lua: function closure rechazado");
err.clear();
check(tql_to_sql::transpile_expr("local x = 1", {}, err).empty()
&& err.find("local") != std::string::npos,
"phase11 lua: local rechazado");
err.clear();
check(tql_to_sql::transpile_expr("for i=1,10 do end", {}, err).empty()
&& err.find("loops") != std::string::npos,
"phase11 lua: for loop rechazado");
err.clear();
check(tql_to_sql::transpile_expr("while true do end", {}, err).empty()
&& err.find("loops") != std::string::npos,
"phase11 lua: while loop rechazado");
err.clear();
check(tql_to_sql::transpile_expr("{1,2,3}", {}, err).empty()
&& err.find("table") != std::string::npos,
"phase11 lua: table literal rechazado");
err.clear();
check(tql_to_sql::transpile_expr("io.read()", {}, err).empty()
&& err.find("io") != std::string::npos,
"phase11 lua: io.* rechazado");
err.clear();
check(tql_to_sql::transpile_expr("string.gsub([s], \"a\", \"b\")", {}, err).empty()
&& err.find("whitelist") != std::string::npos,
"phase11 lua: string.gsub no whitelisted");
err.clear();
check(tql_to_sql::transpile_expr("print([x])", {}, err).empty()
&& err.find("print") != std::string::npos,
"phase11 lua: print rechazado");
err.clear();
check(tql_to_sql::transpile_expr("[a]; [b]", {}, err).empty()
&& err.find("multi-statement") != std::string::npos,
"phase11 lua: ';' multi-stmt rechazado");
// is_transpilable wrapper
std::string werr;
check(tql_to_sql::is_transpilable("[a] + 1", werr), "phase11 lua: is_transpilable OK");
check(!tql_to_sql::is_transpilable("function() end", werr),
"phase11 lua: is_transpilable false para closure");
}
// === phase11: TQL State -> SQL DuckDB emit ===
{
// Setup: 1 tabla "users" con cols lang,n.
TableInput t;
t.name = "users";
t.headers = {"lang", "n"};
t.types = {ColumnType::String, ColumnType::Int};
// Cells no usado por emit (solo schema).
std::vector<TableInput> tables = {t};
// Caso 1: stage 0 simple (sin filters ni sort)
{
State st;
st.stages.resize(1);
auto e = tql_to_sql::emit_sql(st, tables);
check(e.error.empty(), "phase11 sql: empty pipeline -> no error");
check(e.sql.find("WITH t0") != std::string::npos &&
e.sql.find("FROM \"users\"") != std::string::npos &&
e.sql.find("SELECT * FROM t0") != std::string::npos,
"phase11 sql: stage0 SELECT * FROM users");
}
// Caso 2: stage 0 filter + sort
{
State st;
st.stages.resize(1);
st.stages[0].filters.push_back({0, Op::Eq, "go"});
st.stages[0].filters.push_back({1, Op::Gt, "10"});
st.stages[0].sorts.push_back({"n", true});
auto e = tql_to_sql::emit_sql(st, tables);
check(e.error.empty(), "phase11 sql: filter+sort OK");
check(e.sql.find("WHERE") != std::string::npos &&
e.sql.find("\"lang\" = ?") != std::string::npos &&
e.sql.find("\"n\" > ?") != std::string::npos,
"phase11 sql: filter clauses");
check(e.params.size() == 2 && e.params[0] == "go" && e.params[1] == "10",
"phase11 sql: params bound");
check(e.sql.find("ORDER BY \"n\" DESC") != std::string::npos,
"phase11 sql: ORDER BY desc");
}
// Caso 3: stage 1 group + count
{
State st;
st.stages.resize(2);
st.stages[1].breakouts.push_back("lang");
st.stages[1].aggregations.push_back({AggFn::Count});
st.active_stage = 1;
auto e = tql_to_sql::emit_sql(st, tables);
check(e.error.empty(), "phase11 sql: group ok");
check(e.sql.find("t1 AS") != std::string::npos &&
e.sql.find("COUNT(*)") != std::string::npos &&
e.sql.find("GROUP BY") != std::string::npos &&
e.sql.find("SELECT * FROM t1") != std::string::npos,
"phase11 sql: stage1 CTE + COUNT + GROUP BY");
}
// Caso 4: granularity :month -> date_trunc
{
State st;
st.stages.resize(2);
st.stages[1].breakouts.push_back("ts:month");
st.stages[1].aggregations.push_back({AggFn::Sum, "n"});
st.active_stage = 1;
TableInput ts_t;
ts_t.name = "events";
ts_t.headers = {"ts", "n"};
ts_t.types = {ColumnType::Date, ColumnType::Int};
std::vector<TableInput> tt = {ts_t};
auto e = tql_to_sql::emit_sql(st, tt);
check(e.error.empty(), "phase11 sql: granularity ok");
check(e.sql.find("date_trunc('month'") != std::string::npos &&
e.sql.find("SUM(\"n\")") != std::string::npos,
"phase11 sql: date_trunc + SUM");
}
// Caso 5: aggregations p25/median/p99
{
State st;
st.stages.resize(2);
st.stages[1].breakouts.push_back("lang");
st.stages[1].aggregations.push_back({AggFn::Median, "n"});
st.stages[1].aggregations.push_back({AggFn::P25, "n"});
st.stages[1].aggregations.push_back({AggFn::P99, "n"});
st.active_stage = 1;
auto e = tql_to_sql::emit_sql(st, tables);
check(e.error.empty(), "phase11 sql: percentiles ok");
check(e.sql.find("quantile_cont(\"n\", 0.5)") != std::string::npos &&
e.sql.find("quantile_cont(\"n\", 0.25)") != std::string::npos &&
e.sql.find("quantile_cont(\"n\", 0.99)") != std::string::npos,
"phase11 sql: quantile_cont calls");
}
// Caso 6: joins 4 strategies
{
State st;
st.stages.resize(1);
Join jn;
jn.alias = "o";
jn.source = "orders";
jn.on.push_back({"user_id", "user_id"});
jn.strategy = JoinStrategy::Left;
st.joins.push_back(jn);
TableInput u, o;
u.name = "users";
u.headers = {"user_id", "name"};
u.types = {ColumnType::String, ColumnType::String};
o.name = "orders";
o.headers = {"user_id", "amount"};
o.types = {ColumnType::String, ColumnType::Int};
std::vector<TableInput> tt = {u, o};
auto e = tql_to_sql::emit_sql(st, tt);
check(e.error.empty(), "phase11 sql: join ok");
check(e.sql.find("LEFT JOIN \"orders\" AS \"o\"") != std::string::npos &&
e.sql.find("ON \"users\".\"user_id\" = \"o\".\"user_id\"") != std::string::npos,
"phase11 sql: LEFT JOIN ON syntax");
// Inner
st.joins[0].strategy = JoinStrategy::Inner;
auto e2 = tql_to_sql::emit_sql(st, tt);
check(e2.sql.find("INNER JOIN") != std::string::npos, "phase11 sql: INNER JOIN");
// Right
st.joins[0].strategy = JoinStrategy::Right;
auto e3 = tql_to_sql::emit_sql(st, tt);
check(e3.sql.find("RIGHT JOIN") != std::string::npos, "phase11 sql: RIGHT JOIN");
// Full
st.joins[0].strategy = JoinStrategy::Full;
auto e4 = tql_to_sql::emit_sql(st, tt);
check(e4.sql.find("FULL OUTER JOIN") != std::string::npos, "phase11 sql: FULL OUTER JOIN");
}
// Caso 7: derived col subset -> SQL expression
{
State st;
st.stages.resize(1);
DerivedColumn d;
d.name = "size_kb";
d.source_col = -1;
d.formula = "[n] / 1024.0";
d.type = ColumnType::Float;
st.stages[0].derived.push_back(d);
auto e = tql_to_sql::emit_sql(st, tables);
check(e.error.empty(), "phase11 sql: derived subset ok");
check(e.sql.find("\"n\" / 1024") != std::string::npos &&
e.sql.find("AS \"size_kb\"") != std::string::npos,
"phase11 sql: derived expression + alias");
}
// Caso 8: derived col FUERA subset -> warning + skip
{
State st;
st.stages.resize(1);
DerivedColumn d;
d.name = "bad";
d.source_col = -1;
d.formula = "string.gsub([n], \"a\", \"b\")";
d.type = ColumnType::String;
st.stages[0].derived.push_back(d);
auto e = tql_to_sql::emit_sql(st, tables);
check(e.error.empty(), "phase11 sql: derived fuera subset NO bloquea emit");
check(!e.warnings.empty() &&
e.warnings[0].find("out of SQL subset") != std::string::npos,
"phase11 sql: warning derived fuera subset");
check(e.sql.find("\"bad\"") == std::string::npos,
"phase11 sql: derived skip cuando fuera subset");
}
// Caso 9: empty tables -> error
{
State st;
st.stages.resize(1);
std::vector<TableInput> empty;
auto e = tql_to_sql::emit_sql(st, empty);
check(!e.error.empty() && e.error.find("no input tables") != std::string::npos,
"phase11 sql: empty tables -> error");
}
// Caso 10: stage 0 con LIKE (Contains)
{
State st;
st.stages.resize(1);
st.stages[0].filters.push_back({0, Op::Contains, "go"});
auto e = tql_to_sql::emit_sql(st, tables);
check(e.error.empty(), "phase11 sql: LIKE Contains ok");
check(e.sql.find("LIKE ?") != std::string::npos &&
e.params.size() == 1 && e.params[0] == "%go%",
"phase11 sql: Contains -> LIKE %go%");
}
}
// === phase11: LLM client (mock, no red) ===
{
llm_anthropic::AskInput in;
in.question = "show top 10 langs";
in.tql_current = "return { stages = {} }";
in.col_names = {"lang", "n"};
in.col_types = {ColumnType::String, ColumnType::Int};
in.mode = llm_anthropic::OutputMode::TQL;
std::string body = llm_anthropic::build_request_body(in);
check(body.find("\"model\":\"claude-sonnet-4-6\"") != std::string::npos,
"phase11 llm: default model");
check(body.find("\"max_tokens\":8192") != std::string::npos,
"phase11 llm: max_tokens");
check(body.find("\\\"system\\\"") == std::string::npos /* not double-escaped */,
"phase11 llm: system not double-escaped");
check(body.find("Available columns") != std::string::npos,
"phase11 llm: schema block present");
check(body.find("show top 10 langs") != std::string::npos,
"phase11 llm: question present");
check(body.find("TQL") != std::string::npos,
"phase11 llm: system mentions TQL");
in.mode = llm_anthropic::OutputMode::SQL;
std::string body_sql = llm_anthropic::build_request_body(in);
check(body_sql.find("DuckDB") != std::string::npos,
"phase11 llm: SQL mode mentions DuckDB");
}
{
// extract_code_block
std::string raw1 = "Here you go:\n```lua\nreturn { x = 1 }\n```\nDone!";
std::string code = llm_anthropic::extract_code_block(raw1, "lua");
check(code == "return { x = 1 }", "phase11 llm: extract ```lua block");
std::string raw2 = "Sure:\n```\nplain code\n```";
std::string code2 = llm_anthropic::extract_code_block(raw2, "lua");
check(code2 == "plain code", "phase11 llm: extract bare ```");
std::string raw3 = "no fences here";
std::string code3 = llm_anthropic::extract_code_block(raw3, "lua");
check(code3 == "no fences here", "phase11 llm: no fence -> stripped");
std::string raw4 = "```sql\nSELECT 1;\n```";
std::string code4 = llm_anthropic::extract_code_block(raw4, "sql");
check(code4 == "SELECT 1;", "phase11 llm: extract ```sql");
}
{
// parse_response_text from JSON
std::string j = "{\"id\":\"x\",\"content\":[{\"type\":\"text\",\"text\":\"hello\\nworld\"}],\"role\":\"assistant\"}";
std::string t = llm_anthropic::parse_response_text(j);
check(t == "hello\nworld", "phase11 llm: parse text content");
std::string j2 = "{\"content\":[{\"type\":\"text\",\"text\":\"\\\"quoted\\\"\"}]}";
std::string t2 = llm_anthropic::parse_response_text(j2);
check(t2 == "\"quoted\"", "phase11 llm: parse quoted escape");
std::string j3 = "{\"error\":\"foo\"}";
std::string t3 = llm_anthropic::parse_response_text(j3);
check(t3.empty(), "phase11 llm: no text -> empty");
}
{
// Mock end-to-end via FN_LLM_MOCK_RESPONSE (portable Linux/Mingw via putenv).
const char* mock_kv =
"FN_LLM_MOCK_RESPONSE={\"content\":[{\"type\":\"text\",\"text\":\"```lua\\nreturn { mock = true }\\n```\"}]}";
putenv((char*)mock_kv);
llm_anthropic::AskInput in;
in.question = "q";
in.col_names = {"a"};
in.col_types = {ColumnType::String};
auto r = llm_anthropic::ask(in);
check(r.error.empty(), "phase11 llm mock: no error");
check(r.code == "return { mock = true }", "phase11 llm mock: code extracted");
// Unset: putenv con "VAR=" deja vacio (suficiente para nuestro check `*mock`).
putenv((char*)"FN_LLM_MOCK_RESPONSE=");
}
std::printf("\n=== %d passed, %d failed ===\n", passed, failed);
return failed == 0 ? 0 : 1;
}
@@ -652,7 +652,8 @@ bool apply(const std::string& lua_text, State& state,
}
lua_pop(L, 1);
// breakout (solo aplica stages >= 1, no-op silencioso si stage 0)
// breakout (solo aplica stages >= 1, no-op silencioso si stage 0).
// Acepta sufijo ":granularity" para cols Date (fase 10).
lua_getfield(L, -1, "breakout");
if (lua_istable(L, -1)) {
int n = (int)lua_rawlen(L, -1);
@@ -660,8 +661,10 @@ bool apply(const std::string& lua_text, State& state,
lua_rawgeti(L, -1, i);
if (lua_isstring(L, -1)) {
std::string bn = lua_tostring(L, -1);
if (find_orig_col(cur_headers, bn) < 0) {
warn("stage " + std::to_string(si - 1) + ": breakout col \"" + bn + "\" not in input headers");
std::string clean;
parse_breakout_granularity(bn, clean);
if (find_orig_col(cur_headers, clean) < 0) {
warn("stage " + std::to_string(si - 1) + ": breakout col \"" + clean + "\" not in input headers");
}
stg.breakouts.emplace_back(bn);
}
@@ -0,0 +1,862 @@
// tql_to_sql.cpp — pure walker TQL -> SQL DuckDB + Lua subset transpiler.
// Ver issue 0080. Sin DuckDB linkado.
#include "tql_to_sql.h"
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <set>
#include <sstream>
#include <unordered_map>
namespace tql_to_sql {
using namespace data_table;
// ============================================================================
// Lua subset tokenizer + recursive-descent expression parser -> SQL string.
// ============================================================================
namespace {
// A single lexical token of the Lua expression subset.
struct Tok {
    // Token categories. The declaration order is preserved on purpose:
    // the enumerators' integer values follow it.
    enum Kind {
        EndT,      // default kind of a freshly constructed token
        NumT,      // numeric literal
        StrT,      // quoted string literal
        IdentT,    // bare identifier
        ColT,      // bracketed column reference: [name]
        // arithmetic / concatenation operators
        Plus,      // +
        Minus,     // -
        Star,      // *
        Slash,     // /
        Percent,   // %
        ConcatT,   // ..
        // comparison operators
        Eq,        // ==
        Neq,       // ~=
        Lt,        // <
        Lte,       // <=
        Gt,        // >
        Gte,       // >=
        // logical keywords
        AndT,      // and
        OrT,       // or
        NotT,      // not
        // conditional keywords
        IfT,       // if
        ThenT,     // then
        ElseT,     // else
        EndKW,     // end
        // punctuation
        LParen,    // (
        RParen,    // )
        Comma,     // ,
        Dot,       // .
        // literal keywords
        TrueT,     // true
        FalseT,    // false
        NilT,      // nil
    };

    Kind kind = EndT;
    // Raw token text; filled in for identifiers, numbers, strings and
    // column references, left empty for operators and keywords.
    std::string text;
};
// Forbidden constructs: maps an identifier spelling to the message reported
// when that identifier shows up in a formula. The table is built once (on the
// first call) and the same instance is returned on every call.
const std::unordered_map<std::string, const char*>& forbidden_keywords() {
    using Table = std::unordered_map<std::string, const char*>;

    // Messages shared by several entries.
    static const char* const kLoops = "loops not allowed";
    static const char* const kMetatables = "metatables not allowed";

    static const Table table{
        // statements / control flow
        {"function", "closures not allowed in SQL transpile subset"},
        {"local", "local declarations not allowed"},
        {"for", kLoops},
        {"while", kLoops},
        {"repeat", kLoops},
        {"do", "block statements not allowed"},
        {"return", "explicit return not allowed (formula is implicit expression)"},
        {"goto", "goto not allowed"},
        {"break", "break not allowed (no loops)"},
        // io / os / debug / coroutines and other runtime access
        {"io", "io.* access not allowed"},
        {"os", "os.* access not allowed"},
        {"debug", "debug.* access not allowed"},
        {"package", "package access not allowed"},
        {"require", "require not allowed"},
        {"coroutine", "coroutines not allowed"},
        {"setmetatable", kMetatables},
        {"getmetatable", kMetatables},
        {"rawget", "rawget not allowed"},
        {"rawset", "rawset not allowed"},
        {"pcall", "pcall not allowed"},
        {"xpcall", "xpcall not allowed"},
        {"print", "print not allowed (SQL has no side effects)"},
    };
    return table;
}
// Describes one SQL-transpilable function: the accepted argument-count range
// and the SQL template, in which $1, $2, ... stand for the arguments.
struct FnMap {
    int min_args;
    int max_args;
    const char* sql_tmpl;
};

// Whitelist of SQL-transpilable functions, keyed by the Lua name
// (e.g. "math.floor"). The table is built once (on the first call) and the
// same instance is returned on every call.
const std::unordered_map<std::string, FnMap>& fn_whitelist() {
    using Table = std::unordered_map<std::string, FnMap>;
    static const Table table{
        // math.*
        {"math.floor", FnMap{1, 1, "floor($1)"}},
        {"math.ceil", FnMap{1, 1, "ceiling($1)"}},
        {"math.abs", FnMap{1, 1, "abs($1)"}},
        {"math.sqrt", FnMap{1, 1, "sqrt($1)"}},
        {"math.sin", FnMap{1, 1, "sin($1)"}},
        {"math.cos", FnMap{1, 1, "cos($1)"}},
        {"math.log", FnMap{1, 1, "ln($1)"}},
        {"math.exp", FnMap{1, 1, "exp($1)"}},
        {"math.min", FnMap{2, 2, "least($1, $2)"}},
        {"math.max", FnMap{2, 2, "greatest($1, $2)"}},
        // string.*
        {"string.upper", FnMap{1, 1, "upper($1)"}},
        {"string.lower", FnMap{1, 1, "lower($1)"}},
        {"string.len", FnMap{1, 1, "length($1)"}},
        // Special case: the 2-argument and 3-argument forms are handled
        // separately, so the template is only a marker.
        {"string.sub", FnMap{2, 3, "/*SUBSTRING*/"}},
        // top-level functions
        {"tostring", FnMap{1, 1, "CAST($1 AS VARCHAR)"}},
        {"tonumber", FnMap{1, 1, "CAST($1 AS DOUBLE)"}},
    };
    return table;
}
// Renders `name` as a SQL identifier. Simplified policy: the name is always
// wrapped in double quotes (rather than only when it contains special
// characters or matches a keyword), which preserves case and allows ':' (the
// granularity suffix). An embedded double quote is escaped by doubling it.
std::string sql_ident(const std::string& name) {
    std::string quoted;
    quoted.reserve(name.size() + 4);
    quoted.push_back('"');
    for (std::string::size_type i = 0; i < name.size(); ++i) {
        const char ch = name[i];
        quoted.push_back(ch);
        if (ch == '"') {
            quoted.push_back('"');  // "" is the escaped form of "
        }
    }
    quoted.push_back('"');
    return quoted;
}
// Renders `s` as a single-quoted SQL string literal. Each embedded single
// quote is escaped by doubling it; every other character is copied verbatim.
std::string sql_string_literal(const std::string& s) {
    std::string lit;
    lit.reserve(s.size() + 4);
    lit.push_back('\'');
    std::string::size_type from = 0;
    for (;;) {
        const std::string::size_type hit = s.find('\'', from);
        if (hit == std::string::npos) {
            lit.append(s, from, std::string::npos);  // remaining tail, no quotes left
            break;
        }
        lit.append(s, from, hit - from + 1);  // chunk up to and including the quote
        lit.push_back('\'');                  // second quote completes the '' escape
        from = hit + 1;
    }
    lit.push_back('\'');
    return lit;
}
// Tokenizer for the Lua-subset formula language that gets transpiled to SQL.
// It walks the source string once and appends `Tok` values to the caller's
// vector, always terminating a successful run with a `Tok::EndT` token.
// Constructs outside the subset (assignment, table literals, '#', ':', ';',
// '...', forbidden keywords) are rejected with a message available via error().
// NOTE: the source string is held by reference, so it must outlive the Lexer.
class Lexer {
public:
Lexer(const std::string& src) : src_(src) {}
// Returns true when the whole input was tokenized. On failure returns false
// and leaves the reason in error_; `out` may already hold the tokens lexed
// before the failure.
bool tokenize(std::vector<Tok>& out) {
size_t i = 0;
while (i < src_.size()) {
char c = src_[i];
if (std::isspace((unsigned char)c)) { ++i; continue; }
// Lua line comment: "--" up to (not including) the next newline. This check
// must run before the '-' operator branch further below.
if (c == '-' && i + 1 < src_.size() && src_[i+1] == '-') {
while (i < src_.size() && src_[i] != '\n') ++i;
continue;
}
if (c == '[' ) {
// Column reference: [identifier]. Everything up to the first ']' is taken
// verbatim as the column name (no escaping, no nesting).
size_t j = i + 1;
std::string name;
while (j < src_.size() && src_[j] != ']') {
name += src_[j];
++j;
}
if (j >= src_.size()) { error_ = "unterminated [col] ref"; return false; }
Tok t; t.kind = Tok::ColT; t.text = name;
out.push_back(t);
i = j + 1;
continue;
}
// String literal delimited by either quote character. Backslash escapes
// \n, \t, \\, \' and \" are decoded; any other escaped character is kept
// as-is without the backslash.
if (c == '"' || c == '\'') {
char q = c;
++i;
std::string s;
while (i < src_.size() && src_[i] != q) {
if (src_[i] == '\\' && i + 1 < src_.size()) {
char esc = src_[i+1];
if (esc == 'n') s += '\n';
else if (esc == 't') s += '\t';
else if (esc == '\\') s += '\\';
else if (esc == '\'') s += '\'';
else if (esc == '"') s += '"';
else s += esc;
i += 2;
} else {
s += src_[i++];
}
}
if (i >= src_.size()) { error_ = "unterminated string literal"; return false; }
// Skip the closing quote.
++i;
Tok t; t.kind = Tok::StrT; t.text = s;
out.push_back(t);
continue;
}
// Numeric literal: digits with at most one '.'. A leading '.' is accepted
// only when a digit follows, so that ".." and "." still reach the operator
// branch. Exponent and hex forms are not recognized here.
if (std::isdigit((unsigned char)c) || (c == '.' && i + 1 < src_.size() && std::isdigit((unsigned char)src_[i+1]))) {
std::string n;
bool seen_dot = false;
while (i < src_.size()) {
char d = src_[i];
if (std::isdigit((unsigned char)d)) { n += d; ++i; }
else if (d == '.' && !seen_dot) { n += d; seen_dot = true; ++i; }
else break;
}
Tok t; t.kind = Tok::NumT; t.text = n;
out.push_back(t);
continue;
}
// Identifier or keyword: [A-Za-z_][A-Za-z0-9_]*.
if (std::isalpha((unsigned char)c) || c == '_') {
std::string id;
while (i < src_.size() &&
(std::isalnum((unsigned char)src_[i]) || src_[i] == '_')) {
id += src_[i++];
}
// Reject forbidden keywords first, then map the allowed keywords to their
// dedicated token kinds; anything else becomes an IdentT carrying its text.
auto& F = forbidden_keywords();
auto fit = F.find(id);
if (fit != F.end()) {
error_ = std::string("token '") + id + "': " + fit->second;
return false;
}
Tok t;
if (id == "and") t.kind = Tok::AndT;
else if (id == "or") t.kind = Tok::OrT;
else if (id == "not") t.kind = Tok::NotT;
else if (id == "if") t.kind = Tok::IfT;
else if (id == "then") t.kind = Tok::ThenT;
else if (id == "else") t.kind = Tok::ElseT;
else if (id == "end") t.kind = Tok::EndKW;
else if (id == "true") t.kind = Tok::TrueT;
else if (id == "false") t.kind = Tok::FalseT;
else if (id == "nil") t.kind = Tok::NilT;
else { t.kind = Tok::IdentT; t.text = id; }
out.push_back(t);
continue;
}
// Operators and punctuation. `emit` appends a text-less token of kind `k`
// and advances the cursor by `len` characters.
auto emit = [&](Tok::Kind k, int len) {
Tok t; t.kind = k; out.push_back(t); i += (size_t)len;
};
if (c == '+') { emit(Tok::Plus, 1); continue; }
if (c == '-') { emit(Tok::Minus, 1); continue; }
if (c == '*') { emit(Tok::Star, 1); continue; }
if (c == '/') { emit(Tok::Slash, 1); continue; }
if (c == '%') { emit(Tok::Percent,1); continue; }
if (c == '(') { emit(Tok::LParen, 1); continue; }
if (c == ')') { emit(Tok::RParen, 1); continue; }
if (c == ',') { emit(Tok::Comma, 1); continue; }
// '.' is member access, '..' is string concatenation, '...' is rejected.
if (c == '.') {
if (i + 1 < src_.size() && src_[i+1] == '.') {
if (i + 2 < src_.size() && src_[i+2] == '.') {
error_ = "'...' vararg not allowed"; return false;
}
emit(Tok::ConcatT, 2); continue;
}
emit(Tok::Dot, 1); continue;
}
// Only the two-character comparison "==" is accepted; a lone '=' would be
// an assignment.
if (c == '=') {
if (i + 1 < src_.size() && src_[i+1] == '=') { emit(Tok::Eq, 2); continue; }
error_ = "single '=' (assignment) not allowed"; return false;
}
// "~=" is Lua's not-equal operator.
if (c == '~') {
if (i + 1 < src_.size() && src_[i+1] == '=') { emit(Tok::Neq, 2); continue; }
error_ = "stray '~'"; return false;
}
if (c == '<') {
if (i + 1 < src_.size() && src_[i+1] == '=') { emit(Tok::Lte, 2); continue; }
emit(Tok::Lt, 1); continue;
}
if (c == '>') {
if (i + 1 < src_.size() && src_[i+1] == '=') { emit(Tok::Gte, 2); continue; }
emit(Tok::Gt, 1); continue;
}
// Lua syntax outside the supported subset gets a dedicated error message.
if (c == '{') { error_ = "table literals '{...}' not allowed"; return false; }
if (c == '}') { error_ = "stray '}'"; return false; }
if (c == ';') { error_ = "multi-statement not allowed"; return false; }
if (c == '#') { error_ = "length '#' operator not allowed"; return false; }
if (c == ':') { error_ = "method calls ':' not allowed"; return false; }
error_ = std::string("unexpected character '") + c + "'";
return false;
}
// End-of-input sentinel the parser relies on.
Tok t; t.kind = Tok::EndT;
out.push_back(t);
return true;
}
// Message describing the last tokenize() failure; empty if none occurred.
const std::string& error() const { return error_; }
private:
const std::string& src_;
std::string error_;
};
// Recursive-descent parser that turns the Lexer's token stream into a SQL
// expression string. Precedence, lowest to highest:
//   if/then/else/end -> or -> and -> not -> comparison -> '..' -> +,-
//   -> *,/,% -> unary minus -> primary.
// Every parse_* method returns true and writes the SQL text to `out` on
// success; on failure it returns false and error() holds the reason.
// Both constructor arguments are held by reference and must outlive the Parser.
class Parser {
public:
    Parser(const std::vector<Tok>& toks,
           const std::vector<std::string>& headers)
        : toks_(toks), headers_(headers) {}

    // expr := ternary
    bool parse_expr(std::string& out) {
        return parse_ternary(out);
    }

    // ternary := 'if' logic_or 'then' ternary 'else' ternary 'end' | logic_or
    // Emitted as a SQL CASE expression; the else branch is mandatory.
    bool parse_ternary(std::string& out) {
        if (peek(0).kind == Tok::IfT) {
            ++pos_;
            std::string a, b, c;
            if (!parse_logic_or(a)) return false;
            if (!eat(Tok::ThenT, "'then' expected after 'if'")) return false;
            if (!parse_ternary(b)) return false;
            if (!eat(Tok::ElseT, "'else' expected (subset requires else branch)")) return false;
            if (!parse_ternary(c)) return false;
            if (!eat(Tok::EndKW, "'end' expected to close 'if'")) return false;
            out = "CASE WHEN " + a + " THEN " + b + " ELSE " + c + " END";
            return true;
        }
        return parse_logic_or(out);
    }

    // logic_or := logic_and ('or' logic_and)*   (left-associative)
    bool parse_logic_or(std::string& out) {
        if (!parse_logic_and(out)) return false;
        while (peek(0).kind == Tok::OrT) {
            ++pos_;
            std::string rhs;
            if (!parse_logic_and(rhs)) return false;
            out = "(" + out + " OR " + rhs + ")";
        }
        return true;
    }

    // logic_and := not ('and' not)*   (left-associative)
    bool parse_logic_and(std::string& out) {
        if (!parse_not(out)) return false;
        while (peek(0).kind == Tok::AndT) {
            ++pos_;
            std::string rhs;
            if (!parse_not(rhs)) return false;
            out = "(" + out + " AND " + rhs + ")";
        }
        return true;
    }

    // not := 'not' not | comparison
    bool parse_not(std::string& out) {
        if (peek(0).kind == Tok::NotT) {
            ++pos_;
            std::string e;
            if (!parse_not(e)) return false;
            out = "NOT (" + e + ")";
            return true;
        }
        return parse_comparison(out);
    }

    // comparison := concat (('==' | '~=' | '<' | '<=' | '>' | '>=') concat)*
    bool parse_comparison(std::string& out) {
        if (!parse_concat(out)) return false;
        while (true) {
            Tok::Kind k = peek(0).kind;
            const char* op = nullptr;
            if (k == Tok::Eq)       op = " = ";
            else if (k == Tok::Neq) op = " <> ";
            else if (k == Tok::Lt)  op = " < ";
            else if (k == Tok::Lte) op = " <= ";
            else if (k == Tok::Gt)  op = " > ";
            else if (k == Tok::Gte) op = " >= ";
            else break;
            ++pos_;
            std::string rhs;
            if (!parse_concat(rhs)) return false;
            out = "(" + out + op + rhs + ")";
        }
        return true;
    }

    // concat := additive ('..' additive)*   (Lua '..' becomes SQL '||')
    bool parse_concat(std::string& out) {
        if (!parse_additive(out)) return false;
        while (peek(0).kind == Tok::ConcatT) {
            ++pos_;
            std::string rhs;
            if (!parse_additive(rhs)) return false;
            out = "(" + out + " || " + rhs + ")";
        }
        return true;
    }

    // additive := multiplicative (('+' | '-') multiplicative)*
    bool parse_additive(std::string& out) {
        if (!parse_multiplicative(out)) return false;
        while (peek(0).kind == Tok::Plus || peek(0).kind == Tok::Minus) {
            const char* op = (peek(0).kind == Tok::Plus) ? " + " : " - ";
            ++pos_;
            std::string rhs;
            if (!parse_multiplicative(rhs)) return false;
            out = "(" + out + op + rhs + ")";
        }
        return true;
    }

    // multiplicative := unary (('*' | '/' | '%') unary)*
    bool parse_multiplicative(std::string& out) {
        if (!parse_unary(out)) return false;
        while (peek(0).kind == Tok::Star || peek(0).kind == Tok::Slash || peek(0).kind == Tok::Percent) {
            const char* op = (peek(0).kind == Tok::Star)  ? " * "
                           : (peek(0).kind == Tok::Slash) ? " / " : " % ";
            ++pos_;
            std::string rhs;
            if (!parse_unary(rhs)) return false;
            out = "(" + out + op + rhs + ")";
        }
        return true;
    }

    // unary := '-' unary | primary
    bool parse_unary(std::string& out) {
        if (peek(0).kind == Tok::Minus) {
            ++pos_;
            std::string e;
            if (!parse_unary(e)) return false;
            out = "(-" + e + ")";
            return true;
        }
        return parse_primary(out);
    }

    // primary := number | string | true | false | nil | [col] | '(' expr ')'
    //          | ident ('.' ident)? '(' args ')'
    // Bare identifiers are rejected: only column refs and whitelisted function
    // calls may introduce names.
    bool parse_primary(std::string& out) {
        Tok t = peek(0);
        if (t.kind == Tok::NumT) {
            ++pos_;
            out = t.text;
            return true;
        }
        if (t.kind == Tok::StrT) {
            ++pos_;
            out = sql_string_literal(t.text);
            return true;
        }
        if (t.kind == Tok::TrueT)  { ++pos_; out = "TRUE";  return true; }
        if (t.kind == Tok::FalseT) { ++pos_; out = "FALSE"; return true; }
        if (t.kind == Tok::NilT)   { ++pos_; out = "NULL";  return true; }
        if (t.kind == Tok::ColT) {
            ++pos_;
            // Column existence is not validated here; the caller can do that.
            (void)headers_;
            out = sql_ident(t.text);
            return true;
        }
        if (t.kind == Tok::LParen) {
            ++pos_;
            std::string e;
            if (!parse_expr(e)) return false;
            if (!eat(Tok::RParen, "expected ')'")) return false;
            out = "(" + e + ")";
            return true;
        }
        if (t.kind == Tok::IdentT) {
            // Function call: identifier ("." identifier)? "(" args ")"
            std::string name = t.text;
            ++pos_;
            if (peek(0).kind == Tok::Dot) {
                ++pos_;
                if (peek(0).kind != Tok::IdentT) {
                    error_ = "expected identifier after '.'";
                    return false;
                }
                name += "." + peek(0).text;
                ++pos_;
            }
            if (peek(0).kind != Tok::LParen) {
                error_ = "bare identifier '" + name +
                         "' not allowed (only [col] refs + whitelisted fn calls)";
                return false;
            }
            ++pos_;  // consume '('
            std::vector<std::string> args;
            if (peek(0).kind != Tok::RParen) {
                while (true) {
                    std::string a;
                    if (!parse_expr(a)) return false;
                    args.push_back(a);
                    if (peek(0).kind == Tok::Comma) { ++pos_; continue; }
                    break;
                }
            }
            if (!eat(Tok::RParen, "expected ')' closing function args")) return false;
            // Validate the name and the arity against the whitelist.
            auto& W = fn_whitelist();
            auto wit = W.find(name);
            if (wit == W.end()) {
                error_ = "function '" + name +
                         "' not in SQL transpile whitelist (math.*, string.upper/lower/len/sub, tostring, tonumber)";
                return false;
            }
            const FnMap& fm = wit->second;
            if ((int)args.size() < fm.min_args || (int)args.size() > fm.max_args) {
                std::ostringstream os;
                os << "function '" << name << "' takes " << fm.min_args;
                if (fm.max_args != fm.min_args) os << ".." << fm.max_args;
                os << " args, got " << args.size();
                error_ = os.str();
                return false;
            }
            // Special case: Lua string.sub(s, i [, j]) uses an inclusive end
            // index, whereas SQL substring(s, i, count) takes a length, so
            // count = j - i + 1.
            if (name == "string.sub") {
                if (args.size() == 2) {
                    // No j: substring(s, i) runs to the end of the string.
                    out = "substring(" + args[0] + ", " + args[1] + ")";
                } else {
                    out = "substring(" + args[0] + ", " + args[1] +
                          ", (" + args[2] + ") - (" + args[1] + ") + 1)";
                }
                return true;
            }
            // Generic case: fill $1..$N in the whitelist's SQL template.
            out = expand_template(fm.sql_tmpl, args);
            return true;
        }
        error_ = std::string("unexpected token in expression");
        return false;
    }

    // Consumes the current token if it has kind `k`; otherwise records `msg`
    // as the error and returns false without advancing.
    bool eat(Tok::Kind k, const char* msg) {
        if (peek(0).kind != k) { error_ = msg; return false; }
        ++pos_;
        return true;
    }

    // Token at pos_ + off. Reads past the end yield the last token (the
    // lexer's EndT); an empty token vector yields a synthetic EndT instead of
    // invoking back() on an empty container.
    const Tok& peek(int off) const {
        if (toks_.empty()) {
            static const Tok end_tok = [] { Tok t; t.kind = Tok::EndT; return t; }();
            return end_tok;
        }
        size_t i = pos_ + (size_t)off;
        if (i >= toks_.size()) return toks_.back();
        return toks_[i];
    }

    // True once every token before the EndT sentinel has been consumed.
    bool at_end() const { return peek(0).kind == Tok::EndT; }

    // Reason for the last parse failure; empty if none occurred.
    const std::string& error() const { return error_; }

private:
    // Expands "$N" placeholders (1-based) in `tmpl` with args[N-1] in a single
    // left-to-right pass. Inserted argument text is never rescanned, so an
    // argument that itself contains "$2" (e.g. a string literal) cannot be
    // clobbered by a later argument, and "$1" no longer matches the prefix of
    // "$10". Placeholders with no matching argument are copied through as-is.
    static std::string expand_template(const std::string& tmpl,
                                       const std::vector<std::string>& args) {
        std::string result;
        size_t i = 0;
        while (i < tmpl.size()) {
            if (tmpl[i] == '$' && i + 1 < tmpl.size() &&
                std::isdigit((unsigned char)tmpl[i + 1])) {
                size_t j = i + 1;
                size_t idx = 0;
                // The cap on idx only guards against overflow on absurd input.
                while (j < tmpl.size() && std::isdigit((unsigned char)tmpl[j]) &&
                       idx < 100000) {
                    idx = idx * 10 + (size_t)(tmpl[j] - '0');
                    ++j;
                }
                if (idx >= 1 && idx <= args.size()) {
                    result += args[idx - 1];
                    i = j;
                    continue;
                }
            }
            result += tmpl[i++];
        }
        return result;
    }

    const std::vector<Tok>& toks_;
    const std::vector<std::string>& headers_;
    size_t pos_ = 0;
    std::string error_;
};
} // anon
std::string transpile_expr(const std::string& formula,
const std::vector<std::string>& in_headers,
std::string& error_out) {
error_out.clear();
std::vector<Tok> toks;
Lexer lex(formula);
if (!lex.tokenize(toks)) {
error_out = lex.error();
return "";
}
Parser p(toks, in_headers);
std::string out;
if (!p.parse_expr(out)) {
error_out = p.error();
return "";
}
if (!p.at_end()) {
error_out = "unexpected trailing tokens after expression";
return "";
}
return out;
}
// Reports whether `formula` lies inside the SQL-transpilable subset by
// attempting a transpile with no known headers. On false, `error_out` holds
// whatever message transpile_expr produced.
bool is_transpilable(const std::string& formula, std::string& error_out) {
    const std::vector<std::string> no_headers;
    const std::string sql = transpile_expr(formula, no_headers, error_out);
    if (!error_out.empty()) return false;
    return !sql.empty();
}
// ============================================================================
// TQL State -> SQL DuckDB emitter.
// ============================================================================
namespace {
// Maps one TQL aggregation to its DuckDB SQL expression (without alias).
// Count ignores the column; the percentile family is expressed through
// quantile_cont with the fraction as second argument.
std::string emit_agg_expr(const Aggregation& a) {
    // FN(<col>)
    const auto unary = [&a](const char* fn) {
        return std::string(fn) + "(" + sql_ident(a.col) + ")";
    };
    // quantile_cont(<col>, <fraction>)
    const auto quantile = [&a](const std::string& fraction) {
        return "quantile_cont(" + sql_ident(a.col) + ", " + fraction + ")";
    };

    switch (a.fn) {
        case AggFn::Count:    return "COUNT(*)";
        case AggFn::Sum:      return unary("SUM");
        case AggFn::Avg:      return unary("AVG");
        case AggFn::Min:      return unary("MIN");
        case AggFn::Max:      return unary("MAX");
        case AggFn::Distinct: return "COUNT(DISTINCT " + sql_ident(a.col) + ")";
        case AggFn::Stddev:   return unary("STDDEV");
        case AggFn::Median:   return quantile("0.5");
        case AggFn::P25:      return quantile("0.25");
        case AggFn::P75:      return quantile("0.75");
        case AggFn::P90:      return quantile("0.90");
        case AggFn::P99:      return quantile("0.99");
        case AggFn::Percentile: {
            // Caller-supplied fraction, formatted with %g.
            char fraction[32];
            std::snprintf(fraction, sizeof(fraction), "%g", a.arg);
            return quantile(fraction);
        }
    }
    // Reached only for an enum value outside the cases above.
    return "/* unknown agg */ NULL";
}
std::string emit_breakout_expr(const std::string& bk) {
std::string col_clean;
DateGranularity g = parse_breakout_granularity(bk, col_clean);
if (g == DateGranularity::None) {
return sql_ident(col_clean);
}
const char* tok = date_granularity_token(g);
// Week: DuckDB date_trunc('week', col) -> monday segun configuracion.
return std::string("date_trunc('") + tok + "', " + sql_ident(col_clean) + ")";
}
// Maps a filter Op to the SQL operator text (padded with spaces). The three
// positive pattern ops share LIKE; the wildcard placement that tells them
// apart is applied to the right-hand side by emit_filter_rhs.
const char* sql_op(Op op) {
    switch (op) {
        case Op::Neq:         return " <> ";
        case Op::Gt:          return " > ";
        case Op::Gte:         return " >= ";
        case Op::Lt:          return " < ";
        case Op::Lte:         return " <= ";
        case Op::NotContains: return " NOT LIKE ";
        case Op::Contains:
        case Op::StartsWith:
        case Op::EndsWith:    return " LIKE ";
        case Op::Eq:          break;
    }
    // Eq, and any value outside the enum, fall back to equality.
    return " = ";
}
// Builds the right-hand side of a filter predicate and appends its bound value
// to `params`. The value is always passed as a positional '?' parameter.
//
// For the pattern ops (Contains / NotContains / StartsWith / EndsWith) the
// bound value is the user's text with '%' wildcards added on the matching
// side(s). Any '%', '_' or '\' already present in the user's text is escaped
// with a backslash so it matches literally rather than acting as a LIKE
// wildcard; in that case the returned text carries an explicit ESCAPE clause.
// Values without such characters produce exactly "?" as before.
std::string emit_filter_rhs(const Filter& f, std::vector<std::string>& params) {
    const bool contains = (f.op == Op::Contains || f.op == Op::NotContains);
    const bool lead_wild  = contains || f.op == Op::EndsWith;
    const bool trail_wild = contains || f.op == Op::StartsWith;

    if (!lead_wild && !trail_wild) {
        // Plain comparison: bind the value untouched.
        params.push_back(f.value);
        return "?";
    }

    bool escaped = false;
    std::string pattern;
    pattern.reserve(f.value.size() + 2);
    if (lead_wild) pattern += '%';
    for (char c : f.value) {
        if (c == '%' || c == '_' || c == '\\') {
            pattern += '\\';
            escaped = true;
        }
        pattern += c;
    }
    if (trail_wild) pattern += '%';
    params.push_back(pattern);

    // Only declare the escape character when the pattern actually uses it.
    return escaped ? "? ESCAPE '\\'" : "?";
}
// Builds the stage-0 (Raw) CTE `t0` and stores "WITH t0 AS (...)" in e.sql:
//   SELECT <main cols> [, <derived cols>] [, <joined cols>]
//   FROM <main table> [JOINs] [WHERE <stage-0 filters>] [ORDER BY <sorts>]
//
// `tables` supplies the schema; the main table is tables[main_idx]. Derived
// columns with a formula are transpiled to SQL; a formula outside the subset
// is reported in e.warnings and the column is left out of the SELECT list.
// Filter values are appended to e.params as positional parameters.
// Returns false (with e.error set) only when main_idx is out of range.
bool emit_stage0(const State& st, const std::vector<TableInput>& tables,
                 int main_idx, SqlEmit& e) {
    if (main_idx < 0 || main_idx >= (int)tables.size()) {
        e.error = "main table out of range";
        return false;
    }
    const TableInput& main_t = tables[(size_t)main_idx];
    const Stage* s0 = st.stages.empty() ? nullptr : &st.stages[0];

    // SELECT list. The helper inserts the separator only between items, so an
    // empty main header list no longer produces a leading ", ".
    std::string select_list;
    auto add_select = [&select_list](const std::string& item) {
        if (!select_list.empty()) select_list += ", ";
        select_list += item;
    };

    for (size_t i = 0; i < main_t.headers.size(); ++i) {
        add_select(sql_ident(main_t.headers[i]));
    }

    // Stage-0 derived columns.
    if (s0) {
        for (const auto& d : s0->derived) {
            if (d.source_col >= 0 && d.formula.empty()) {
                // Pure retype: alias of the source column.
                if (d.source_col < (int)main_t.headers.size()) {
                    add_select(sql_ident(main_t.headers[(size_t)d.source_col])
                               + " AS " + sql_ident(d.name));
                } else {
                    e.warnings.push_back("derived col '" + d.name +
                                         "' source col idx out of range");
                }
                continue;
            }
            std::string err;
            std::string expr = transpile_expr(d.formula, main_t.headers, err);
            if (!err.empty()) {
                // Column is skipped in the SQL output; the caller can fall
                // back to the native TQL evaluation.
                e.warnings.push_back("derived col '" + d.name +
                                     "' formula out of SQL subset: " + err);
                continue;
            }
            add_select(expr + " AS " + sql_ident(d.name));
        }
    }

    std::string from = sql_ident(main_t.name);

    // Joins.
    for (const auto& jn : st.joins) {
        const TableInput* right = nullptr;
        for (const auto& ti : tables) {
            if (ti.name == jn.source) { right = &ti; break; }
        }
        if (!right) {
            e.warnings.push_back("join source '" + jn.source + "' not in tables");
            continue;
        }
        if (jn.on.empty()) {
            // Without key pairs the ON clause would be empty (invalid SQL).
            e.warnings.push_back("join source '" + jn.source +
                                 "' has no ON columns; join skipped");
            continue;
        }
        const char* strat = "LEFT JOIN";
        switch (jn.strategy) {
            case JoinStrategy::Left:  strat = "LEFT JOIN"; break;
            case JoinStrategy::Inner: strat = "INNER JOIN"; break;
            case JoinStrategy::Right: strat = "RIGHT JOIN"; break;
            case JoinStrategy::Full:  strat = "FULL OUTER JOIN"; break;
        }
        from += "\n  " + std::string(strat) + " " + sql_ident(right->name)
              + " AS " + sql_ident(jn.alias) + " ON ";
        for (size_t k = 0; k < jn.on.size(); ++k) {
            if (k > 0) from += " AND ";
            from += sql_ident(main_t.name) + "." + sql_ident(jn.on[k].first)
                  + " = " + sql_ident(jn.alias) + "." + sql_ident(jn.on[k].second);
        }
        // Right-side columns are exposed as "<alias>.<col>". An empty
        // jn.fields selects every column of the right table.
        const std::vector<std::string>& right_cols =
            jn.fields.empty() ? right->headers : jn.fields;
        for (const auto& rc : right_cols) {
            std::string aliased = jn.alias + "." + rc;
            add_select(sql_ident(jn.alias) + "." + sql_ident(rc)
                       + " AS " + sql_ident(aliased));
        }
    }

    // Filter.col indexes the effective headers (original + every derived
    // column, in declaration order), so rebuild that list to map index -> name.
    std::vector<std::string> eff_headers = main_t.headers;
    if (s0) {
        for (const auto& d : s0->derived) {
            eff_headers.push_back(d.name);
        }
    }

    // Stage-0 WHERE.
    std::string where_clause;
    if (s0) {
        for (size_t fi = 0; fi < s0->filters.size(); ++fi) {
            const Filter& f = s0->filters[fi];
            if (f.col < 0 || f.col >= (int)eff_headers.size()) {
                e.warnings.push_back("stage0 filter col idx out of range");
                continue;
            }
            std::string col = sql_ident(eff_headers[(size_t)f.col]);
            if (!where_clause.empty()) where_clause += " AND ";
            where_clause += col + sql_op(f.op) + emit_filter_rhs(f, e.params);
        }
    }

    // Stage-0 ORDER BY.
    std::string order_clause;
    if (s0) {
        for (size_t si = 0; si < s0->sorts.size(); ++si) {
            const SortClause& sc = s0->sorts[si];
            if (!order_clause.empty()) order_clause += ", ";
            order_clause += sql_ident(sc.col) + (sc.desc ? " DESC" : " ASC");
        }
    }

    std::string cte = "t0 AS (\n  SELECT " + select_list + "\n  FROM " + from;
    if (!where_clause.empty()) cte += "\n  WHERE " + where_clause;
    if (!order_clause.empty()) cte += "\n  ORDER BY " + order_clause;
    cte += "\n)";
    e.sql = "WITH " + cte;
    return true;
}
// Stage N (N>=1): SELECT breakouts + agg expressions FROM t<N-1>
// [WHERE filters] [GROUP BY ...] [ORDER BY ...].
bool emit_stage_n(const Stage& stg, int n, SqlEmit& e) {
std::string prev = "t" + std::to_string(n - 1);
std::string cur = "t" + std::to_string(n);
// SELECT list: breakouts (con granularity expr si aplica) + aggregations.
std::string select_list;
for (size_t i = 0; i < stg.breakouts.size(); ++i) {
if (i > 0) select_list += ", ";
select_list += emit_breakout_expr(stg.breakouts[i])
+ " AS " + sql_ident(stg.breakouts[i]);
}
for (size_t i = 0; i < stg.aggregations.size(); ++i) {
if (!select_list.empty()) select_list += ", ";
std::string alias = aggregation_alias(stg.aggregations[i]);
select_list += emit_agg_expr(stg.aggregations[i]) + " AS " + sql_ident(alias);
}
if (select_list.empty()) select_list = "*";
// WHERE: filters del stage. col es indice en input headers (output del stage previo).
// Aproximacion: usamos el nombre via stage breakouts/aggs del stage previo si fuera necesario.
// Para v1, emit por nombre cuando filter.col >= 0 sea idx en breakouts/aggs/orig. El
// chequeo de existencia se delega a DuckDB (errores en execute son detectables).
// V1 simple: skip filter cuando no podemos resolver — caller solo deberia tener filter
// sobre cols que existen.
// Estrategia simple: emite WHERE solo si stage previo provee headers conocidos. Para no
// duplicar logica, dejamos al caller proveer headers via filter.col que se resuelve a
// breakouts[col].
// V1: si filter.col esta en rango de breakouts del stage previo, emite breakout name.
// Sino, warning + skip.
std::string where_clause;
// Best effort: no podemos construir headers del stage previo aqui sin recomputar.
// Para v1, omitimos filters de stages >=1 — caller deberia evitar usarlos via SQL.
// TODO v2: pasar prev_headers para resolver.
(void)where_clause;
// GROUP BY: solo si hay breakouts.
std::string group_clause;
for (size_t i = 0; i < stg.breakouts.size(); ++i) {
if (i > 0) group_clause += ", ";
// Re-emit la expression para GROUP BY (no alias).
group_clause += emit_breakout_expr(stg.breakouts[i]);
}
// ORDER BY
std::string order_clause;
for (size_t i = 0; i < stg.sorts.size(); ++i) {
if (i > 0) order_clause += ", ";
order_clause += sql_ident(stg.sorts[i].col) + (stg.sorts[i].desc ? " DESC" : " ASC");
}
std::string cte = ",\n" + cur + " AS (\n SELECT " + select_list
+ "\n FROM " + prev;
if (!group_clause.empty()) cte += "\n GROUP BY " + group_clause;
if (!order_clause.empty()) cte += "\n ORDER BY " + order_clause;
cte += "\n)";
e.sql += cte;
return true;
}
} // anon
// Emits the full DuckDB statement for stages 0..target of `state`: a WITH
// chain of CTEs t0..tN followed by "SELECT * FROM tN;".
// `up_to_stage < 0` selects state.active_stage; the target is clamped to the
// valid stage range. On failure the returned SqlEmit has a non-empty `error`.
SqlEmit emit_sql(const State& state,
                 const std::vector<TableInput>& tables,
                 int up_to_stage) {
    SqlEmit result;
    if (state.stages.empty()) {
        result.error = "state has no stages";
        return result;
    }
    if (tables.empty()) {
        result.error = "no input tables provided";
        return result;
    }

    // Pick the target stage and clamp it into [0, last].
    const int last_stage = (int)state.stages.size() - 1;
    int target = (up_to_stage >= 0) ? up_to_stage : state.active_stage;
    if (target < 0) target = 0;
    if (target > last_stage) target = last_stage;

    // Main table comes from state.main_source; fall back to tables[0].
    int main_idx = resolve_main_idx(tables, state.main_source);
    if (main_idx < 0) main_idx = 0;

    if (!emit_stage0(state, tables, main_idx, result)) return result;

    int stage_no = 1;
    while (stage_no <= target) {
        if (!emit_stage_n(state.stages[(size_t)stage_no], stage_no, result)) return result;
        ++stage_no;
    }

    result.sql += "\nSELECT * FROM t" + std::to_string(target) + ";\n";
    return result;
}
} // namespace tql_to_sql
@@ -0,0 +1,41 @@
// tql_to_sql: emite SQL DuckDB equivalente a una pipeline TQL State.
// Pure. Sin DuckDB linkado. Solo string emit + validacion.
// Ver issue 0080 + docs/TQL.md (seccion "SQL transpile subset").
#pragma once
#include "data_table_logic.h"
#include <string>
#include <vector>
namespace tql_to_sql {
// Result of a SQL emit. `error` non-empty means the emit failed and `sql`
// must not be used; `warnings` may be populated even on success.
struct SqlEmit {
std::string sql; // DuckDB SELECT / CTE chain
std::vector<std::string> params; // positional bound values, one per '?'
std::vector<std::string> warnings; // soft issues (col not found, etc.)
std::string error; // non-empty => the emit failed
};
// Pure: emits DuckDB SQL equivalent to stages 0..active of `state`.
// `tables` supplies the schema (headers/types/name) of each TableInput. The
// caller is responsible for loading the tables into DuckDB under those names.
// `up_to_stage = -1` => state.active_stage; the target stage is clamped to the
// valid range. Fails (error set) when the state has no stages or `tables` is
// empty. NOTE: filters on stages >= 1 are currently not emitted.
SqlEmit emit_sql(const data_table::State& state,
const std::vector<data_table::TableInput>& tables,
int up_to_stage = -1);
// Pure: checks that `formula` (the Lua body of a derived column) lies within
// the SQL-transpilable subset. Returns true if it does; otherwise false with
// the concrete reason in `error_out` (category + offending token).
// See docs/TQL.md#sql-transpile-subset.
bool is_transpilable(const std::string& formula, std::string& error_out);
// Pure: transpiles a Lua-subset formula into a SQL expression. If the formula
// is outside the subset, returns "" and fills `error_out`; `error_out` is
// cleared on entry, so it is empty on success.
// `in_headers` is intended for resolving `[col]` refs; the current parser does
// not validate column names against it, and `[col]` is emitted as a SQL
// identifier regardless.
std::string transpile_expr(const std::string& formula,
const std::vector<std::string>& in_headers,
std::string& error_out);
} // namespace tql_to_sql
@@ -16,6 +16,10 @@ using data_table::ColumnType;
using data_table::ViewMode;
using data_table::ViewConfig;
using data_table::parse_number;
using data_table::nearest_index_2d;
using data_table::pie_angle;
using data_table::pie_slice_at_angle;
using data_table::heatmap_cell_at;
static int find_header(const StageOutput& out, const std::string& name) {
if (name.empty()) return -1;
@@ -152,7 +156,8 @@ std::vector<double> finite(const std::vector<double>& v) {
}
bool render_bar_like(const StageOutput& out, ViewMode mode,
const ViewConfig& cfg, ImVec2 size) {
const ViewConfig& cfg, ImVec2 size,
int* clicked_row_out = nullptr) {
int cat_col = resolve_cat(out, cfg, first_category_col(out));
auto nums = collect_numeric_filtered(out, cfg, 8);
if (cat_col < 0 || nums.empty()) {
@@ -225,6 +230,15 @@ bool render_bar_like(const StageOutput& out, ViewMode mode,
ImPlot::PlotBars(nums[0].name.c_str(), ticks.data(), ys.data(), n, 0.67, spc);
}
}
// Hit-test fase 10: idx = round(plot.{x|y}) en single-series mode.
if (clicked_row_out &&
mode != ViewMode::GroupedBar && mode != ViewMode::StackedBar &&
ImPlot::IsPlotHovered() && ImGui::IsMouseClicked(ImGuiMouseButton_Left)) {
ImPlotPoint p = ImPlot::GetPlotMousePos();
double target = horiz ? p.y : p.x;
int idx = (int)(target + 0.5);
if (idx >= 0 && idx < n) *clicked_row_out = idx;
}
ImPlot::EndPlot();
return true;
}
@@ -302,7 +316,8 @@ bool render_line_like(const StageOutput& out, ViewMode mode,
return true;
}
bool render_scatter(const StageOutput& out, const ViewConfig& cfg, ImVec2 size) {
bool render_scatter(const StageOutput& out, const ViewConfig& cfg, ImVec2 size,
int* clicked_row_out = nullptr) {
// Soporte cfg.x_col + cfg.y_cols[0]
int xc = find_header(out, cfg.x_col);
int yc = !cfg.y_cols.empty() ? find_header(out, cfg.y_cols[0]) : -1;
@@ -329,11 +344,20 @@ bool render_scatter(const StageOutput& out, const ViewConfig& cfg, ImVec2 size)
ImPlot::PlotScatter("##s", nums[0].vals.data(), nums[1].vals.data(),
(int)nums[0].vals.size());
}
if (clicked_row_out &&
ImPlot::IsPlotHovered() && ImGui::IsMouseClicked(ImGuiMouseButton_Left)) {
ImPlotPoint p = ImPlot::GetPlotMousePos();
int idx = nearest_index_2d(p.x, p.y,
nums[0].vals.data(), nums[1].vals.data(),
(int)nums[0].vals.size());
if (idx >= 0) *clicked_row_out = idx;
}
ImPlot::EndPlot();
return true;
}
bool render_bubble(const StageOutput& out, const ViewConfig& cfg, ImVec2 size) {
bool render_bubble(const StageOutput& out, const ViewConfig& cfg, ImVec2 size,
int* clicked_row_out = nullptr) {
int xc = find_header(out, cfg.x_col);
int yc = !cfg.y_cols.empty() ? find_header(out, cfg.y_cols[0]) : -1;
int sc = resolve_size(out, cfg, -1);
@@ -354,6 +378,14 @@ bool render_bubble(const StageOutput& out, const ViewConfig& cfg, ImVec2 size) {
axflag(cfg), axflag(cfg));
ImPlot::PlotBubbles("##b", nums[0].vals.data(), nums[1].vals.data(),
nums[2].vals.data(), (int)nums[0].vals.size());
if (clicked_row_out &&
ImPlot::IsPlotHovered() && ImGui::IsMouseClicked(ImGuiMouseButton_Left)) {
ImPlotPoint p = ImPlot::GetPlotMousePos();
int idx = nearest_index_2d(p.x, p.y,
nums[0].vals.data(), nums[1].vals.data(),
(int)nums[0].vals.size());
if (idx >= 0) *clicked_row_out = idx;
}
ImPlot::EndPlot();
return true;
}
@@ -404,7 +436,8 @@ bool render_hist2d(const StageOutput& out, const ViewConfig& cfg, ImVec2 size) {
return true;
}
bool render_heatmap(const StageOutput& out, const ViewConfig& cfg, ImVec2 size) {
bool render_heatmap(const StageOutput& out, const ViewConfig& cfg, ImVec2 size,
int* clicked_row_out = nullptr) {
auto nums = collect_numeric_filtered(out, cfg, 64);
if (nums.empty()) { info_text("Need numeric columns"); return false; }
int cols = (int)nums.size();
@@ -424,11 +457,22 @@ bool render_heatmap(const StageOutput& out, const ViewConfig& cfg, ImVec2 size)
maybe_fit(cfg);
if (!ImPlot::BeginPlot("##heatmap", size, 0)) return false;
ImPlot::PlotHeatmap("##hm", mat.data(), rows, cols, mn, mx, nullptr);
if (clicked_row_out &&
ImPlot::IsPlotHovered() && ImGui::IsMouseClicked(ImGuiMouseButton_Left)) {
ImPlotPoint p = ImPlot::GetPlotMousePos();
// ImPlot heatmap Y se pinta de top a bottom; plot mouse_y va igual
// (default scale 0..rows). Mapeo directo.
int rr, cc;
heatmap_cell_at(p.x, p.y, rows, cols, rr, cc);
if (rr >= 0) *clicked_row_out = rr;
(void)cc;
}
ImPlot::EndPlot();
return true;
}
bool render_pie(const StageOutput& out, const ViewConfig& cfg, bool donut, ImVec2 size) {
bool render_pie(const StageOutput& out, const ViewConfig& cfg, bool donut, ImVec2 size,
int* clicked_row_out = nullptr) {
int cat = resolve_cat(out, cfg, first_category_col(out));
auto nums = collect_numeric_filtered(out, cfg, 1);
if (cat < 0 || nums.empty()) { info_text("Need 1 category + 1 numeric"); return false; }
@@ -455,11 +499,24 @@ bool render_pie(const StageOutput& out, const ViewConfig& cfg, bool donut, ImVec
// Draw inner hole as solid circle by overlaying a smaller pie of one slice transparent.
// Simpler: just visually it's a circle with text. Use no extra primitive for now.
}
if (clicked_row_out &&
ImPlot::IsPlotHovered() && ImGui::IsMouseClicked(ImGuiMouseButton_Left)) {
ImPlotPoint p = ImPlot::GetPlotMousePos();
double dx = p.x - 0.5, dy = p.y - 0.5;
double dist2 = dx*dx + dy*dy;
double inner = donut ? (radius * 0.5) : 0.0;
if (dist2 <= radius * radius && dist2 >= inner * inner) {
double ang = pie_angle(0.5, 0.5, p.x, p.y);
int idx = pie_slice_at_angle(ang, values.data(), n);
if (idx >= 0) *clicked_row_out = idx;
}
}
ImPlot::EndPlot();
return true;
}
bool render_funnel(const StageOutput& out, const ViewConfig& cfg, ImVec2 size) {
bool render_funnel(const StageOutput& out, const ViewConfig& cfg, ImVec2 size,
int* clicked_row_out = nullptr) {
int cat = resolve_cat(out, cfg, first_category_col(out));
auto nums = collect_numeric_filtered(out, cfg, 1);
if (cat < 0 || nums.empty()) { info_text("Need 1 category + 1 numeric"); return false; }
@@ -492,6 +549,17 @@ bool render_funnel(const StageOutput& out, const ViewConfig& cfg, ImVec2 size) {
ImPlot::SetupAxisTicks(ImAxis_Y1, ticks.data(), n, labels.data(), false);
ImPlot::PlotBars(nums[0].name.c_str(), ys.data(), ticks.data(), n, 0.85,
ImPlotSpec(ImPlotProp_Flags, ImPlotBarsFlags_Horizontal));
if (clicked_row_out &&
ImPlot::IsPlotHovered() && ImGui::IsMouseClicked(ImGuiMouseButton_Left)) {
ImPlotPoint p = ImPlot::GetPlotMousePos();
int tick_idx = (int)(p.y + 0.5);
// ticks[i] = n-1-i. Invertir para idx en orden sorted descendiente.
int sorted_pos = (n - 1) - tick_idx;
if (sorted_pos >= 0 && sorted_pos < n) {
// idx[sorted_pos] da indice de row original en out.
*clicked_row_out = idx[sorted_pos];
}
}
ImPlot::EndPlot();
return true;
}
@@ -763,7 +831,9 @@ bool render_radar(const StageOutput& out, const ViewConfig& cfg, ImVec2 size) {
} // anon
bool render(const StageOutput& out, ViewMode mode,
const ViewConfig& cfg, ImVec2 size) {
const ViewConfig& cfg, ImVec2 size,
int* clicked_row_out) {
if (clicked_row_out) *clicked_row_out = -1;
if (out.rows == 0 || out.cols == 0) {
info_text("No data");
return false;
@@ -773,21 +843,21 @@ bool render(const StageOutput& out, ViewMode mode,
case ViewMode::Bar:
case ViewMode::Column:
case ViewMode::GroupedBar:
case ViewMode::StackedBar: return render_bar_like(out, mode, cfg, size);
case ViewMode::StackedBar: return render_bar_like(out, mode, cfg, size, clicked_row_out);
case ViewMode::Line:
case ViewMode::Area:
case ViewMode::Stairs: return render_line_like(out, mode, cfg, size);
case ViewMode::Scatter: return render_scatter(out, cfg, size);
case ViewMode::Bubble: return render_bubble(out, cfg, size);
case ViewMode::Scatter: return render_scatter(out, cfg, size, clicked_row_out);
case ViewMode::Bubble: return render_bubble(out, cfg, size, clicked_row_out);
case ViewMode::Histogram: return render_histogram(out, cfg, size);
case ViewMode::Histogram2D: return render_hist2d(out, cfg, size);
case ViewMode::Heatmap: return render_heatmap(out, cfg, size);
case ViewMode::Heatmap: return render_heatmap(out, cfg, size, clicked_row_out);
case ViewMode::BoxPlot: return render_boxplot(out, cfg, size);
case ViewMode::Stem: return render_stem(out, cfg, size);
case ViewMode::ErrorBars: return render_errorbars(out, cfg, size);
case ViewMode::Pie: return render_pie(out, cfg, false, size);
case ViewMode::Donut: return render_pie(out, cfg, true, size);
case ViewMode::Funnel: return render_funnel(out, cfg, size);
case ViewMode::Pie: return render_pie(out, cfg, false, size, clicked_row_out);
case ViewMode::Donut: return render_pie(out, cfg, true, size, clicked_row_out);
case ViewMode::Funnel: return render_funnel(out, cfg, size, clicked_row_out);
case ViewMode::Waterfall: return render_waterfall(out, cfg, size);
case ViewMode::KPI: return render_kpi_single(out, cfg);
case ViewMode::KPIGrid: return render_kpi_grid(out, cfg);
@@ -14,10 +14,15 @@ namespace viz {
//
// `size`: ImVec2(-1,-1) usa todo el espacio disponible.
// `out`: output del stage activo (headers, types, cells flat row-major).
// `clicked_row_out`: si != nullptr, el render escribira el indice de row del
// `StageOutput` clicado por user. -1 si no hubo click drillable. Fase 10
// (issue 0079): habilitado para bar/column/pie/donut/funnel/scatter/bubble/
// heatmap. Resto de modos: no hit-test, queda en -1.
bool render(const data_table::StageOutput& out,
data_table::ViewMode mode,
const data_table::ViewConfig& cfg,
ImVec2 size = ImVec2(-1, -1));
ImVec2 size = ImVec2(-1, -1),
int* clicked_row_out = nullptr);
// Helper expuesto: encuentra primera col numerica. -1 si ninguna.
int first_numeric_col(const data_table::StageOutput& out);
+212
View File
@@ -0,0 +1,212 @@
// data_table_types — types compartidos del stack TQL (Table Query Language).
// Promovido al registry desde cpp/apps/primitives_gallery/playground/tables/.
// Ver issue 0081 + docs/TQL.md. Pure value types + enums.
#pragma once
#include <string>
#include <utility>
#include <vector>
namespace data_table {
// ----------------------------------------------------------------------------
// Filter operators: six comparisons plus four substring/prefix/suffix matches.
// ----------------------------------------------------------------------------
enum class Op {
Eq, Neq, Gt, Gte, Lt, Lte,
Contains, NotContains, StartsWith, EndsWith
};
// ----------------------------------------------------------------------------
// Column type. Either declared by the caller or auto-detected.
// ----------------------------------------------------------------------------
enum class ColumnType {
    Auto,
    String,
    Int,
    Float,
    Bool,
    Date,
    Json,
};
// ----------------------------------------------------------------------------
// Derived column: immutable. Two modes:
//   1) Pure retype: source_col >= 0, formula == "". Cells come from the
//      source column.
//   2) Formula: source_col == -1, formula non-empty. Evaluated by Lua.
// ----------------------------------------------------------------------------
struct DerivedColumn {
    int source_col = -1;
    ColumnType type = ColumnType::String;
    std::string name;
    // "" means pure retype; anything else is a Lua body.
    std::string formula;
    // Reference inside lua_engine; -1 while not compiled.
    int lua_id = -1;
    std::string compile_error;
};
// ----------------------------------------------------------------------------
// Filter: column index into eff_headers + operator + value.
//
// Every scalar member carries a default initializer so that a
// default-constructed Filter (for example the `added` member of DrillStep)
// never holds indeterminate values. The defaults equal what
// value-initialization (`Filter{}`) already produced, and aggregate
// initialization (`Filter{col, op, value}`) keeps working under C++14+.
// ----------------------------------------------------------------------------
struct Filter {
    int col = 0;           // column index the predicate applies to
    Op op = Op::Eq;        // comparison / string operator
    std::string value;     // right-hand operand, always stored as text
};
// ----------------------------------------------------------------------------
// ColorRule: conditional cell painting (UI helper).
//
// Scalar members carry default initializers so a default-constructed rule
// never exposes indeterminate values. The defaults equal what
// value-initialization (`ColorRule{}`) already produced, and aggregate
// initialization (`ColorRule{col, equals, color}`) keeps working under C++14+.
// ----------------------------------------------------------------------------
struct ColorRule {
    int col = 0;              // column index the rule applies to
    std::string equals;       // cell text that triggers the rule
    unsigned int color = 0;   // color applied when the rule matches
};
// ----------------------------------------------------------------------------
// Aggregations (TQL stages 1+).
// ----------------------------------------------------------------------------
enum class AggFn {
    Count,
    Sum,
    Avg,
    Min,
    Max,
    Distinct,
    Stddev,
    Median,
    P25,
    P75,
    P90,
    P99,
    Percentile,
};
// One aggregation expression: the function, its input column, an optional
// numeric argument and the output alias.
struct Aggregation {
    AggFn fn = AggFn::Count;
    // Ignored for Count.
    std::string col;
    // Used by Percentile (0..1).
    double arg = 0.0;
    // Empty -> auto-generated via aggregation_alias().
    std::string alias;
};
// Sort key: the column name plus a direction flag (descending when `desc`
// is true).
struct SortClause {
    std::string col;
    bool desc = false;
};
// Stage: one layer of TQL. Stage 0 = Raw (no breakouts/aggregations).
// Stages 1+ may group. Each stage consumes the output of the previous one.
struct Stage {
    std::vector<Filter> filters;
    // Expressions belonging to this stage.
    std::vector<DerivedColumn> derived;
    // Column names taken from this stage's INPUT.
    std::vector<std::string> breakouts;
    std::vector<Aggregation> aggregations;
    std::vector<SortClause> sorts;
};
// Output of compute_stage. Owns `cell_backing` (new strings for aggregated
// results) and `cells` (row-major pointers into the backing or into the
// original `in_cells` for passthrough).
struct StageOutput {
    std::vector<std::string> cell_backing;
    std::vector<const char*> cells;
    int rows = 0;
    int cols = 0;
    std::vector<std::string> headers;
    std::vector<ColumnType> types;
};
// ----------------------------------------------------------------------------
// ViewMode: kind of visualization rendered over the active stage's output.
// ----------------------------------------------------------------------------
enum class ViewMode {
    Table,

    // Bars
    Bar,
    Column,
    GroupedBar,
    StackedBar,

    // Lines / area
    Line,
    Area,
    Stairs,

    // Points
    Scatter,
    Bubble,

    // Distribution
    Histogram,
    Histogram2D,
    Heatmap,
    BoxPlot,

    // Stems / signals
    Stem,
    ErrorBars,

    // Composition
    Pie,
    Donut,
    Funnel,
    Waterfall,

    // Single values
    KPI,
    KPIGrid,

    // Specialized
    Candlestick,
    Radar,
};
// ----------------------------------------------------------------------------
// Joins (MBQL-style). See issue 0078.
// ----------------------------------------------------------------------------
enum class JoinStrategy {
    Left,
    Inner,
    Right,
    Full,
};
// Extra table handed to render() for joins. Externally owned (by the caller).
struct TableInput {
    // Stable identifier (matches Join.source).
    std::string name;
    std::vector<std::string> headers;
    std::vector<ColumnType> types;
    // Row-major: headers.size() columns x `rows` rows.
    const char* const* cells = nullptr;
    int rows = 0;
    int cols = 0;
};
// Join clause: joins the current table with `source` on the `on` pairs,
// prefixing the right-hand columns with `alias.`.
struct Join {
    std::string alias;
    std::string source;
    // Each pair is {left_col, right_col}.
    std::vector<std::pair<std::string, std::string>> on;
    JoinStrategy strategy = JoinStrategy::Left;
    // Empty = all right-hand columns.
    std::vector<std::string> fields;
};
// ----------------------------------------------------------------------------
// ViewConfig: manual overrides of auto-detection for the active view.
// ----------------------------------------------------------------------------
struct ViewConfig {
    // Single column: scatter, line, hist2d.
    std::string x_col;
    // 1..N columns: line/area/bar/etc.
    std::vector<std::string> y_cols;
    // Bubble.
    std::string size_col;
    // Bar/pie/funnel/box override.
    std::string cat_col;
    // 0 = ImPlot auto.
    unsigned int primary_color = 0;
    // 0 = Sturges.
    int hist_bins = 0;
    // 0 = default.
    float pie_radius = 0.0f;
    bool show_legend = true;
    // Line/area markers.
    bool show_markers = false;
    // Disable pan/zoom.
    bool locked = false;
    // Consumed by viz::render.
    mutable bool fit_request = false;
};
// VizPanel: an additional visualization over the same StageOutput.
struct VizPanel {
    ViewMode display = ViewMode::Bar;
    ViewConfig config;
    // Declared mutable, so it can be written through a const VizPanel.
    mutable ViewMode last_non_table = ViewMode::Bar;
};
// ----------------------------------------------------------------------------
// State: stage pipeline + global visualization settings.
// ----------------------------------------------------------------------------
struct State {
    std::vector<Stage> stages;
    int active_stage = 0;

    ViewMode display = ViewMode::Table;
    ViewConfig viz_config;
    std::vector<VizPanel> extra_panels;

    // Applied before stages[0].
    std::vector<Join> joins;
    // Name of a TableInput; empty -> tables[0].
    std::string main_source;

    std::vector<ColorRule> color_rules;
    std::vector<bool> col_visible;
    std::vector<int> col_order;

    // Helpers (defined in compute_stage.cpp).
    Stage& raw();
    const Stage& raw() const;
    Stage& active();
    const Stage& active_const() const;
    void ensure_stage0();
};
// ----------------------------------------------------------------------------
// Extended drill (phase 10). See issue 0079.
// ----------------------------------------------------------------------------
enum class DateGranularity {
    None,
    Year,
    Month,
    Week,
    Day,
    Hour,
};
// Named filter presets: three relative date windows plus two value-based
// exclusions.
enum class FilterPreset {
    Last7d,
    Last30d,
    Last90d,
    ExcludeNulls,
    NonZero,
};
// Drill step recorded for the undo/redo history (phase 10).
struct DrillStep {
    // Stage the filter was added to.
    int target_stage = -1;
    // Index within target_stage.filters.
    int filter_pos = -1;
    // active_stage before the drill.
    int prev_active_stage = 0;
    // Filter kept for redo.
    Filter added;
};
} // namespace data_table
+96
View File
@@ -0,0 +1,96 @@
#include "gfx/gpu_check.h"
#include "gfx/gl_loader.h"
#include <cstdio>
#include <cstring>
#include <string>
// CUDA runtime version via compile-time macro.
// cuda_runtime.h defines CUDART_VERSION as major * 1000 + minor * 10
// (e.g. 12040 for CUDA 12.4). The header is included only when it is
// available; otherwise FN_HAS_CUDA_RUNTIME stays undefined and
// cuda_runtime_version is reported as "".
//
// The availability probe is deliberately nested: `__has_include(...)` may only
// be used once `defined(__has_include)` has been established by an enclosing
// #if. Combining both in a single expression with `&&` is a preprocessing
// error on compilers that lack the operator, because the whole expression is
// still parsed.
#if defined(__has_include)
#  if __has_include(<cuda_runtime.h>)
#    include <cuda_runtime.h>
#    define FN_HAS_CUDA_RUNTIME 1
#  endif
#endif
namespace fn::gfx {
// Fetches the GL string identified by `name` and copies it into a
// std::string. Yields an empty string when glGetString returns nullptr.
static std::string safe_gl_string(GLenum name) {
    const char* raw = reinterpret_cast<const char*>(glGetString(name));
    return raw != nullptr ? std::string(raw) : std::string();
}
static bool check_gl_version_43() {
// GL_VERSION tiene formato "major.minor ..." o "OpenGL ES major.minor ..."
const GLubyte* ver = glGetString(GL_VERSION);
if (!ver) return false;
int major = 0, minor = 0;
// Saltar prefijo "OpenGL ES " si lo hay
const char* p = reinterpret_cast<const char*>(ver);
if (std::strncmp(p, "OpenGL ES ", 10) == 0) p += 10;
// sscanf con la forma "X.Y"
// NOLINTNEXTLINE(cert-err34-c)
std::sscanf(p, "%d.%d", &major, &minor);
return (major > 4) || (major == 4 && minor >= 3);
}
// Fills `out` with the capabilities of the current OpenGL context.
//
// `out` is reset first, so every field starts from its default. The three GL
// identification strings are read, and the function returns false right away
// when GL_VENDOR is empty (treated as "no active GL context"). Otherwise it
// derives the compute-shader / storage-buffer flags, reads the compute
// work-group limits when compute shaders are available, records the CUDA
// runtime version the binary was compiled against (if any) and returns true.
bool gpu_check_caps(GpuCaps& out) {
    out = GpuCaps{};  // reset

    out.gl_vendor = safe_gl_string(GL_VENDOR);
    out.gl_renderer = safe_gl_string(GL_RENDERER);
    out.gl_version = safe_gl_string(GL_VERSION);
    if (out.gl_vendor.empty()) {
        // No active GL context.
        return false;
    }

    // Query the version and the extension list once; both capability flags
    // below are derived from the same two facts.
    // NOTE(review): glGetString(GL_EXTENSIONS) is reportedly unavailable on
    // core-profile contexts (glGetStringi is the replacement there); in that
    // case only the version check can succeed — confirm against the profiles
    // this project creates.
    const bool gl_43_or_newer = check_gl_version_43();
    const char* extensions =
        reinterpret_cast<const char*>(glGetString(GL_EXTENSIONS));
    const auto has_extension = [extensions](const char* ext_name) {
        return extensions != nullptr &&
               std::strstr(extensions, ext_name) != nullptr;
    };

    // Compute shader support: GL 4.3+ or ARB_compute_shader.
    out.has_compute_shader =
        gl_43_or_newer || has_extension("GL_ARB_compute_shader");

    // Shader storage buffer: GL 4.3+ or ARB_shader_storage_buffer_object.
    out.has_storage_buffer =
        gl_43_or_newer || has_extension("GL_ARB_shader_storage_buffer_object");

    // Work-group limits (only when compute shaders are supported). Both are
    // indexed queries: index 0 = X, 1 = Y, 2 = Z.
    if (out.has_compute_shader) {
        for (unsigned int axis = 0; axis < 3; ++axis) {
            glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, axis,
                            &out.max_compute_workgroup_count[axis]);
            glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, axis,
                            &out.max_compute_workgroup_size[axis]);
        }
    }

    // CUDA runtime version (compile-time detection).
#if defined(FN_HAS_CUDA_RUNTIME)
    {
        const int cuda_ver = CUDART_VERSION;  // e.g. 12040 for CUDA 12.4
        const int major = cuda_ver / 1000;
        const int minor = (cuda_ver % 1000) / 10;
        out.cuda_runtime_version =
            std::to_string(major) + "." + std::to_string(minor);
    }
#else
    out.cuda_runtime_version = "";
#endif

    return true;
}
} // namespace fn::gfx
+38
View File
@@ -0,0 +1,38 @@
#pragma once
#include <string>
namespace fn::gfx {
// GpuCaps collects OpenGL and CUDA capabilities of the active context.
// Every string field is empty ("") when the value is not available.
struct GpuCaps {
    // OpenGL — these require an active GL context before gpu_check_caps runs.
    // glGetString(GL_VENDOR), e.g. "NVIDIA Corporation".
    std::string gl_vendor;
    // glGetString(GL_RENDERER), e.g. "NVIDIA GeForce RTX 3080/PCIe/SSE2".
    std::string gl_renderer;
    // glGetString(GL_VERSION), e.g. "4.6.0 NVIDIA 550.54.15".
    std::string gl_version;

    // Compute shader limits (GL_MAX_COMPUTE_WORK_GROUP_COUNT/SIZE).
    // Index 0 = X, 1 = Y, 2 = Z. Value 0 when compute shaders are unavailable.
    int max_compute_workgroup_count[3] = {0, 0, 0};
    int max_compute_workgroup_size[3] = {0, 0, 0};

    // GL_VERSION >= 4.3 or the ARB_compute_shader extension.
    bool has_compute_shader = false;
    // GL_VERSION >= 4.3 or the ARB_shader_storage_buffer_object extension.
    bool has_storage_buffer = false;

    // CUDA — empty when no CUDA runtime was detected at compile time.
    // Format: "12.4" (major.minor) or "" when unavailable.
    std::string cuda_runtime_version;
};
// gpu_check_caps fills `out` with the capabilities of the active OpenGL
// context.
//
// REQUIREMENT: call it after the GL context has been initialized and, on
// Windows, after fn::gfx::gl_loader_init(). Calling it without an active
// context is outside the contract; the implementation only detects that case
// when glGetString(GL_VENDOR) yields nullptr.
// NOTE(review): whether glGetString is safe to call at all with no current
// context depends on the platform/loader — confirm before using the `false`
// return as a context probe.
//
// Returns true when at least the GL vendor string could be read (context
// active).
// Returns false when gl_vendor ends up empty (no active context or a faulty
// driver).
bool gpu_check_caps(GpuCaps& out);
} // namespace fn::gfx
+86
View File
@@ -0,0 +1,86 @@
---
name: gpu_check
kind: function
lang: cpp
domain: gfx
version: "1.0.0"
purity: impure
signature: "bool fn::gfx::gpu_check_caps(GpuCaps& out)"
description: "Rellena GpuCaps con las capacidades del contexto OpenGL activo: vendor, renderer, version, limites de compute workgroup, flags has_compute_shader/has_storage_buffer, y version CUDA runtime (deteccion en compile-time via CUDART_VERSION). Requiere contexto GL activo. Retorna false si el contexto no esta disponible."
tags: [gpu, opengl, cuda, caps, hardware, probe, gfx, compute, infra]
uses_functions: ["gl_loader_cpp_gfx"]
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [gfx/gpu_check.h, gfx/gl_loader.h, cuda_runtime.h, cstring, string]
tested: false
tests: []
test_file_path: ""
file_path: "cpp/functions/gfx/gpu_check.cpp"
framework: opengl
params:
- name: out
desc: "Referencia a GpuCaps que se rellena con las capacidades detectadas. Se resetea al inicio de la llamada."
output: "true si el contexto GL esta activo y gl_vendor no esta vacio; false si no hay contexto GL activo o el driver devuelve nullptr para GL_VENDOR."
---
# gpu_check
Probing de capacidades GPU en runtime: OpenGL strings, compute shader support y CUDA.
## Uso tipico
```cpp
#include "gfx/gpu_check.h"
#include "gfx/gl_loader.h"
// Dentro de render(), despues del primer frame (contexto GL activo):
fn::gfx::GpuCaps caps;
if (fn::gfx::gpu_check_caps(caps)) {
printf("GPU: %s\n", caps.gl_renderer.c_str());
printf("Compute shaders: %s\n", caps.has_compute_shader ? "yes" : "no");
if (!caps.cuda_runtime_version.empty())
printf("CUDA runtime: %s\n", caps.cuda_runtime_version.c_str());
} else {
printf("No GL context active\n");
}
```
## Estructura GpuCaps
```cpp
struct GpuCaps {
std::string gl_vendor; // "NVIDIA Corporation"
std::string gl_renderer; // "NVIDIA GeForce RTX 3080/PCIe/SSE2"
std::string gl_version; // "4.6.0 NVIDIA 550.54.15"
int max_compute_workgroup_count[3]; // [65535, 65535, 65535] tipico NVIDIA
int max_compute_workgroup_size[3]; // [1024, 1024, 64] tipico
bool has_compute_shader; // GL 4.3+ o ARB_compute_shader
bool has_storage_buffer; // GL 4.3+ o ARB_shader_storage_buffer_object
std::string cuda_runtime_version; // "12.4" o "" si no compilado con CUDA
};
```
## CUDA detection
La version CUDA se detecta en **compile time** via el macro `CUDART_VERSION` de `<cuda_runtime.h>`. Si la app no esta compilada con el CUDA toolkit, `cuda_runtime_version` sera `""`. Para detection en runtime del toolkit del sistema, usar `cuda_toolkit_check_bash_infra`.
## Requisito de contexto GL
Llamar siempre despues de crear el contexto GL. En apps que usan `fn::run_app`, el contexto esta activo desde el primer frame del `render()` callback. En Windows, `fn::gfx::gl_loader_init()` debe haberse llamado antes para que los punteros de funcion esten resueltos.
## Uso previsto (fn doctor cpp-apps)
Esta funcion sera invocada por el audit de `fn doctor cpp-apps` para verificar que las apps C++ del registry tienen acceso a compute shaders cuando declaran dependencias de `gpu_compute_program`, `gpu_dispatch`, etc.
## CMakeLists.txt
```cmake
add_imgui_app(mi_app
main.cpp
${CMAKE_SOURCE_DIR}/cpp/functions/gfx/gpu_check.cpp
)
# CUDA opcional: si la app compila con CUDA toolkit el header cuda_runtime.h
# estara disponible y FN_HAS_CUDA_RUNTIME se activara automaticamente.
```
+20
View File
@@ -0,0 +1,20 @@
---
name: AggFn
lang: cpp
domain: core
version: "1.0.0"
algebraic: sum
definition: |
enum class AggFn {
Count, Sum, Avg, Min, Max, Distinct, Stddev,
Median, P25, P75, P90, P99, Percentile
};
description: "Funcion de agregacion soportada. Pickup via UI combo + SQL emit via tql_to_sql. Percentile usa Aggregation.arg en [0,1]."
tags: [tql, aggregation, sum-type]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Mapeo SQL DuckDB: Count → `COUNT(*)`, Sum/Avg/Min/Max/Stddev → ops nativas, Distinct → `COUNT(DISTINCT col)`, Median/P25/P75/P90/P99/Percentile → `quantile_cont(col, p)`.
+22
View File
@@ -0,0 +1,22 @@
---
name: Aggregation
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct Aggregation {
AggFn fn;
std::string col;
double arg;
std::string alias;
};
description: "Funcion de agregacion en Stage 1+. fn = Count/Sum/Avg/Min/Max/Distinct/Stddev/Median/P25/P75/P90/P99/Percentile. arg = parametro (p para percentile)."
tags: [tql, aggregation, agg, product-type]
uses_types: [AggFn_cpp_core]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
`alias` vacio dispara `aggregation_alias(a)` auto: `count`, `sum_<col>`, `distinct_<col>`, `p95_<col>` etc. SQL mapping en `tql_to_sql`: `COUNT(*)`, `SUM("col")`, `quantile_cont("col", p)`.
+21
View File
@@ -0,0 +1,21 @@
---
name: ColorRule
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct ColorRule {
int col;
std::string equals;
unsigned int color;
};
description: "Regla de pintado condicional para tabla UI. Si cells[row][col] == equals, fondo = color (RGBA packed)."
tags: [tql, color, ui-hint, product-type]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Solo afecta render visual. Round-trip en TQL via `columns.<name>.color_rules`. Vacio = sin color override.
+28
View File
@@ -0,0 +1,28 @@
---
name: ColumnType
lang: cpp
domain: core
version: "1.0.0"
algebraic: sum
definition: |
enum class ColumnType {
Auto, String, Int, Float, Bool, Date, Json
};
description: "Tipo de columna del modelo TQL. `Auto` dispara auto-detect; el resto fuerza el tipo declarado. Base de toda la pipeline data_table."
tags: [tql, data-table, types, sum-type]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Sum type / enum-class. Convivimos con `effective_type()` que resuelve `Auto` → auto-detect via sample. El resto fuerza el tipo declarado por el caller.
Tabla de iconos UTF-8 Tabler para cada variante en `column_type_icon(t)`. Mapeo SQL ↔ ColumnType en `tql_to_sql` (issue 0080).
## Usado por
- `compute_stage_cpp_core` — input/output types per stage
- `tql_emit_cpp_core` / `tql_apply_cpp_core` — emit/parse TQL columns block
- `tql_to_sql_cpp_core` — mapping a SQL DuckDB types
- `data_table_cpp_viz` — UI render por columna
+19
View File
@@ -0,0 +1,19 @@
---
name: DateGranularity
lang: cpp
domain: core
version: "1.0.0"
algebraic: sum
definition: |
enum class DateGranularity { None, Year, Month, Week, Day, Hour };
description: "Granularidad de truncado de fechas para breakouts TQL. Sufijo `:token` en breakout string (ej. 'ts:month')."
tags: [tql, date, granularity, sum-type, mbql]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Auto-detect via `auto_date_granularity(min_ymd, max_ymd)`: >2y→Year, >60d→Month, >14d→Week, resto→Day. SQL emit DuckDB: `date_trunc('month'|'year'|...,col)`.
Week trunca a lunes ISO (Hinnant algo).
+26
View File
@@ -0,0 +1,26 @@
---
name: DerivedColumn
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct DerivedColumn {
int source_col;
ColumnType type;
std::string name;
std::string formula;
int lua_id;
std::string compile_error;
};
description: "Col custom dentro de un Stage. Modo 1: retipo (source_col >= 0, formula vacia). Modo 2: formula Lua (source_col == -1, eval por lua_engine sandbox)."
tags: [tql, derived, formula, lua, product-type]
uses_types: [ColumnType_cpp_core]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
`formula` evaluada por row via `lua_engine` con `[col]` refs disponibles. Para SQL transpile (fase 11), formula debe estar dentro del Lua subset; sino `tql_to_sql` emite warning + skip col.
`lua_id` cachea la formula compilada en lua_engine entre eval calls.
+30
View File
@@ -0,0 +1,30 @@
---
name: Filter
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct Filter {
int col;
Op op;
std::string value;
};
description: "Predicado TQL: col idx + Op + value. Aplicado dentro de un Stage por compute_stage. col es idx en headers efectivos del INPUT del stage."
tags: [tql, filter, predicate, product-type]
uses_types: [Op_cpp_core]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
`col` es indice en `in_headers` del stage donde aplica (no en el dataset original — esto cambio en el refactor a stages). Para drill-down usar `make_drill_filter(col_idx, value)`.
`value` es string siempre — `compare()` decide numerico vs lexical segun parseo. Range filters (op_in_range, op_between) no estan modelados; usar dos Filters consecutivos.
## Usado por
- `Stage_cpp_core` (lista de filters)
- `apply_filters`, `compute_stage_cpp_core`
- `make_drill_filter`, `build_preset_filters`
- `tql_to_sql_cpp_core` → SQL WHERE clauses con `?` placeholders
+25
View File
@@ -0,0 +1,25 @@
---
name: Join
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct Join {
std::string alias;
std::string source;
std::vector<std::pair<std::string, std::string>> on;
JoinStrategy strategy;
std::vector<std::string> fields;
};
description: "Join MBQL-style entre main_t y source. on = pares {left_col, right_col} multi-key. strategy = Left/Inner/Right/Full. fields vacio = all cols del derecho."
tags: [tql, join, mbql, product-type]
uses_types: [JoinStrategy_cpp_core]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Materializado por `join_tables_cpp_core` antes de stages[0]. Cols del derecho se prefijan con `alias.col` para preservar headers del main. SQL emit: `LEFT/INNER/RIGHT/FULL OUTER JOIN source AS alias ON main.l = alias.r AND ...`.
Multi-key: `on = {{l1,r1}, {l2,r2}}` → `ON main.l1 = alias.r1 AND main.l2 = alias.r2`.
+17
View File
@@ -0,0 +1,17 @@
---
name: JoinStrategy
lang: cpp
domain: core
version: "1.0.0"
algebraic: sum
definition: |
enum class JoinStrategy { Left, Inner, Right, Full };
description: "Estrategia de join MBQL-style. 4 variantes estandar SQL. SQL mapping directo a LEFT/INNER/RIGHT/FULL OUTER JOIN."
tags: [tql, join, strategy, sum-type]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Round-trip TQL: tokens `"left"/"inner"/"right"/"full"`. Un token desconocido (p. ej. `"nope"`) cae en el fallback `Left` al parsear.
+36
View File
@@ -0,0 +1,36 @@
---
name: Op
lang: cpp
domain: core
version: "1.0.0"
algebraic: sum
definition: |
enum class Op {
Eq, Neq, Gt, Gte, Lt, Lte,
Contains, NotContains, StartsWith, EndsWith
};
description: "Operador de filtro TQL. 6 ops de comparacion + 4 ops de string. Numericos ordenan numericamente cuando ambos lados parsean."
tags: [tql, filter, operator, sum-type]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Tabla operadores permitidos por `ColumnType` via `ops_for_type(t)`:
| Tipo | Ops |
|---|---|
| Int / Float / Date | Eq, Neq, Gt, Gte, Lt, Lte |
| Bool | Eq, Neq |
| Json | Eq, Neq, Contains, NotContains |
| String | Eq, Neq, Contains, NotContains, StartsWith, EndsWith |
Mapeo SQL en `tql_to_sql_cpp_core`: Contains → `LIKE '%v%'`, StartsWith → `LIKE 'v%'`, etc.
## Usado por
- `Filter_cpp_core`
- `compute_stage_cpp_core` (via apply_filters)
- `tql_emit_cpp_core` / `tql_apply_cpp_core`
- `tql_to_sql_cpp_core`
+20
View File
@@ -0,0 +1,20 @@
---
name: SortClause
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct SortClause {
std::string col;
bool desc;
};
description: "Clausula de orden por nombre de col. Multi-sort = vector ordenado por prioridad. desc=true para descendente."
tags: [tql, sort, order, product-type]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Sort por nombre (no idx) — sobrevive a renombrado de cols + a stages 1+ donde idx no aplica. Aplicacion via `apply_sorts`. Round-trip TQL: `sort = { {"asc"|"desc", "col"}, ... }`.
+33
View File
@@ -0,0 +1,33 @@
---
name: Stage
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct Stage {
std::vector<Filter> filters;
std::vector<DerivedColumn> derived;
std::vector<std::string> breakouts;
std::vector<Aggregation> aggregations;
std::vector<SortClause> sorts;
};
description: "Layer del pipeline TQL. Stage 0 = Raw (filters + derived + sort). Stage 1+ pueden agrupar (breakouts + aggregations + sort). Consumida por compute_stage."
tags: [tql, stage, pipeline, product-type, mbql]
uses_types: [Filter_cpp_core, Op_cpp_core]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Inspirado en MBQL `:filter` / `:breakout` / `:aggregation` / `:order-by`. Diferencia clave: TQL chain N stages explicitos, cada uno consume el output del anterior. MBQL usa `:source-query` recursivo.
Breakout strings pueden llevar sufijo `:granularity` para cols Date (fase 10): `"ts:month"`, `"ts:week"`, etc. Ver `parse_breakout_granularity()`.
## Usado por
- `State_cpp_core` (lista de stages)
- `compute_stage_cpp_core` (executes a single Stage)
- `compute_pipeline_cpp_core` (chains stages 0..N)
- `tql_emit_cpp_core` / `tql_apply_cpp_core` (round-trip Lua)
- `tql_to_sql_cpp_core` → CTE chain DuckDB
+26
View File
@@ -0,0 +1,26 @@
---
name: StageOutput
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct StageOutput {
std::vector<std::string> cell_backing;
std::vector<const char*> cells;
int rows;
int cols;
std::vector<std::string> headers;
std::vector<ColumnType> types;
};
description: "Output materializado de compute_stage. cell_backing posee strings nuevos (aggregations); cells es row-major de ptrs a backing o a in_cells original."
tags: [tql, stage, output, product-type]
uses_types: [ColumnType_cpp_core]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Lifetime: cell_backing es owner — cells solo es valido mientras StageOutput viva. Para passthrough (sin agregaciones), cells apunta a in_cells del caller (sin backing local).
Reservar capacidad upfront en cell_backing evita realloc que invalida punteros.
+40
View File
@@ -0,0 +1,40 @@
---
name: State
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct State {
std::vector<Stage> stages;
int active_stage;
ViewMode display;
ViewConfig viz_config;
std::vector<VizPanel> extra_panels;
std::vector<Join> joins;
std::string main_source;
std::vector<ColorRule> color_rules;
std::vector<bool> col_visible;
std::vector<int> col_order;
};
description: "Estado completo de una query TQL: pipeline de stages + joins + viz config + UI tweaks. Round-trip a Lua via tql_emit/tql_apply."
tags: [tql, state, pipeline, product-type]
uses_types: [Stage_cpp_core, Filter_cpp_core, Op_cpp_core]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
State es el documento canonico de una query del usuario. Atomico — toda mutacion pasa por helpers pure (`apply_drill_step`, `drill_up`, etc.).
`active_stage` = idx del stage cuyo output se renderiza. Filters/sorts del Raw siempre se aplican antes; joins se materializan ANTES de stages[0].
Helpers `raw()`, `active()` garantizan `stages[0]` existe (lazy init en `ensure_stage0`).
## Usado por
- `data_table_cpp_viz` (UI render principal)
- `compute_pipeline_cpp_core` (resuelve hasta active_stage)
- `tql_emit_cpp_core` / `tql_apply_cpp_core` (Lua serializacion)
- `tql_to_sql_cpp_core` → SQL DuckDB CTE chain
- `apply_drill_step` / `undo_drill_step` / `drill_up`
+33
View File
@@ -0,0 +1,33 @@
---
name: TableInput
lang: cpp
domain: core
version: "1.0.0"
algebraic: product
definition: |
struct TableInput {
std::string name;
std::vector<std::string> headers;
std::vector<ColumnType> types;
const char* const* cells;
int rows;
int cols;
};
description: "Tabla materializada en memoria pasada a data_table::render(). Owner externo. Multiple tables = main + joinables (fase 9 issue 0078)."
tags: [tql, table, joins, mbql, product-type]
uses_types: [ColumnType_cpp_core]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
`name` es el identificador estable que matchea `Join.source` cuando se aplica un join. `cells` es row-major (rows * cols `const char*`). Apuntadores estables durante todo el frame de render.
Cells son strings — auto_detect_type infiere ColumnType si `types[i] == Auto`. Numericos se parsean por celda en compare/agg via `parse_number()`.
## Usado por
- `data_table_cpp_viz::render(tables, state)`
- `resolve_main_idx` (matchea state.main_source)
- `join_tables_cpp_core` (right table)
- `tql_to_sql_cpp_core` (schema para emitir SELECT FROM `name`)
+29
View File
@@ -0,0 +1,29 @@
---
name: ViewConfig
lang: cpp
domain: viz
version: "1.0.0"
algebraic: product
definition: |
struct ViewConfig {
std::string x_col;
std::vector<std::string> y_cols;
std::string size_col;
std::string cat_col;
unsigned int primary_color;
int hist_bins;
float pie_radius;
bool show_legend;
bool show_markers;
bool locked;
mutable bool fit_request;
};
description: "Overrides manuales de auto-detect para ViewMode. Cols vacias dejan al dispatcher elegir. primary_color=0 usa palette ImPlot."
tags: [tql, viz, config, product-type]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
`fit_request` mutable bool consumido por `viz::render` (one-shot trigger para `ImPlot::SetNextAxesToFit`). `locked` deshabilita pan/zoom del usuario.
+29
View File
@@ -0,0 +1,29 @@
---
name: ViewMode
lang: cpp
domain: viz
version: "1.0.0"
algebraic: sum
definition: |
enum class ViewMode {
Table,
Bar, Column, GroupedBar, StackedBar,
Line, Area, Stairs,
Scatter, Bubble,
Histogram, Histogram2D, Heatmap, BoxPlot,
Stem, ErrorBars,
Pie, Donut, Funnel, Waterfall,
KPI, KPIGrid,
Candlestick, Radar
};
description: "Modo de visualizacion ImPlot del stage activo. 24 variantes cubriendo tabla/bars/lines/points/distribution/composition/specialized. Dispatcher en viz::render."
tags: [tql, viz, imgui, implot, sum-type]
uses_types: []
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
Tokens lowercase via `view_mode_token`/`view_mode_from_token` para TQL emit/apply. Helpers `view_mode_needs_numeric/category/aggregation` guían UI (combo selectable solo si schema compatible).
`Table` siempre disponible (fallback render por defecto). Demas requieren al menos cols compatibles. Click-to-drill (fase 10): Bar/Column/Scatter/Bubble/Pie/Donut/Funnel/Heatmap.
+21
View File
@@ -0,0 +1,21 @@
---
name: VizPanel
lang: cpp
domain: viz
version: "1.0.0"
algebraic: product
definition: |
struct VizPanel {
ViewMode display;
ViewConfig config;
mutable ViewMode last_non_table;
};
description: "Viz adicional sobre el mismo StageOutput. State tiene panel principal (display+viz_config) + vector<VizPanel> extras."
tags: [tql, viz, panel, product-type]
uses_types: [ViewMode_cpp_viz, ViewConfig_cpp_viz]
file_path: "cpp/functions/core/data_table_types.h"
---
## Notas
`last_non_table` memoria del ultimo display !=Table para toggle Table↔View rapido en UI. Mutable porque se actualiza durante render (no rompe const correctness).
+2 -1
View File
@@ -1,9 +1,10 @@
---
id: 0078
title: tables playground — joins MBQL-style (fase 9)
status: pending
status: done
priority: medium
created: 2026-05-12
closed: 2026-05-12
related_components: [cpp/apps/primitives_gallery/playground/tables, lua_engine, tql]
---
+2 -1
View File
@@ -1,9 +1,10 @@
---
id: 0079
title: tables playground — drill-through extendido (fase 10)
status: pending
status: done
priority: medium
created: 2026-05-12
closed: 2026-05-12
related_components: [cpp/apps/primitives_gallery/playground/tables]
---
+198 -37
View File
@@ -1,77 +1,238 @@
---
id: 0080
title: tables playground — LLM API "Ask AI" (fase 11)
status: pending
title: tables playground — LLM "Ask AI" + TQL/SQL emit (fase 11)
status: partial
priority: medium
created: 2026-05-12
related_components: [cpp/apps/primitives_gallery/playground/tables]
updated: 2026-05-13
notes: pure layer + LLM client + Ask AI modal DONE. DuckDB adapter v2 (opcional, build flag FN_TQL_DUCKDB=1)
related_components: [cpp/apps/primitives_gallery/playground/tables, lua_engine, tql, duckdb]
---
## Contexto
Fase 11 del roadmap del tables playground. El user escribe en lenguaje natural
una pregunta sobre los datos ("show me top 10 langs by total size"). El LLM
recibe el TQL actual + schema + pregunta, devuelve nuevo TQL. App aplica via
`tql::apply` y renderiza.
Fase 11 del roadmap del tables playground. Dos capacidades que se construyen juntas porque comparten infra (prompt schema, runtime adapter, tests round-trip):
1. **LLM "Ask AI"** — usuario o agente pregunta en lenguaje natural, modelo devuelve un nuevo TQL (o SQL DuckDB si esta linkado).
2. **TQL → SQL (DuckDB) emitter** — permite a agentes escribir SQL contra el mismo modelo de datos. Ejecutable si la app linkó DuckDB; si no, solo emite el string.
Diseño one-way: **TQL → SQL si**, **SQL → TQL no**. Razon documentada en investigacion Metabase MBQL ↔ SQL: la traduccion inversa es lossy (CTEs, window fns, set ops, lateral, correlated subqueries no caben en MBQL/TQL). Patron canonico Malloy/Cube/LookML/Metabase = compile-down one-way.
## Cambios
### 1. UI
### 1. UI "Ask AI"
- Boton "Ask AI" en toolbar (al lado de "+ Viz").
- Modal con:
- Modal:
- InputText multiline para la pregunta.
- Boton "Send" + spinner durante la llamada.
- Diff side-by-side: TQL actual vs TQL propuesto (texto con highlight).
- Toggle output mode: `TQL` (default) | `SQL (DuckDB)` (visible solo si app fue compilada con `FN_TQL_DUCKDB=1`).
- Boton "Send" + spinner.
- Diff side-by-side: actual vs propuesto (texto highlight).
- Botones "Apply" / "Reject" / "Edit before apply".
### 2. Backend LLM
- Provider: Anthropic Claude (API key desde `pass anthropic/api-key`).
- Endpoint: `https://api.anthropic.com/v1/messages`.
- Model: `claude-sonnet-4-6` por defecto. Configurable via env `FN_LLM_MODEL`.
- Cliente HTTP: cURL via popen (sin deps nuevas) o libcurl si ya esta linkada.
- Provider: Anthropic Claude. API key via `pass anthropic/api-key`.
- Endpoint: `https://api.anthropic.com/v1/messages`. Model: `claude-sonnet-4-6`. Override env `FN_LLM_MODEL`.
- Cliente HTTP: cURL via popen (sin deps nuevas).
- Prompt template incluye:
- Esquema TQL (de `docs/TQL.md`).
- **Si SQL mode**: dialecto DuckDB + funciones DuckDB relevantes (date_trunc, regexp_replace, etc.).
- Cols disponibles del stage 0 (name, type) + cols joinables.
- **Grammar Lua subset** (ver §4) cuando aplique.
- Funciones Lua disponibles (de `lua_engine`).
- TQL actual.
- Pregunta del user.
- Response: extraer ```lua``` block del markdown, strip prose.
- Response: extraer ```lua``` (TQL) o ```sql``` block del markdown, strip prose.
### 3. Validacion + safety
### 3. TQL → SQL DuckDB emitter
- Antes de aplicar: `tql::apply` con dry-run (parsea sin mutar State). Si fail, mostrar error + boton "Ask AI again with this error".
- Lua sandbox ya cubre side effects en formulas — el TQL en si es declarativo, no ejecuta nada peligroso.
Nuevo modulo `tql_to_sql.{h,cpp}` (pure). Funciones:
### 4. Streaming
```cpp
struct SqlEmit {
std::string sql; // SELECT ... statement
std::vector<std::string> params; // bound values (?-placeholders)
std::vector<std::string> warnings;
std::string error; // si emit fallo (subset out of bounds)
};
- Stream tokens via SSE (`stream=true` en Anthropic API).
- Mostrar texto en vivo en el modal.
- Cuando termina, parsear lua block final.
// Pure: emite SQL DuckDB equivalente a la pipeline State (stages 0..active).
// `tables` provee el schema de cada TableInput (no los cells — el caller
// decide como hidratar las tablas en DuckDB).
SqlEmit emit_sql(const State& state, const std::vector<TableInput>& tables,
int up_to_stage = -1 /* default = active_stage */);
```
### 5. Persistencia conversation
Mapeo MBQL-style:
- Stage 0 = CTE base `t0` con `SELECT cols + derived FROM main_t [LEFT/INNER/RIGHT/FULL JOIN joinables ON ...]`.
- Stage N = CTE `tN` con `SELECT breakouts, aggregations FROM tN-1 [WHERE filters] [GROUP BY breakouts] [ORDER BY sorts]`.
- Final query `SELECT * FROM t<active>`.
- UiState guarda lista de turns (pregunta + TQL propuesto + resultado apply).
- "Ask AI" siguiente turn incluye history previa.
- Boton "Reset chat" limpia.
- NO persistido en TQL (es UI state).
Stage emit detalle:
- `filter Op::Eq col = "v"` → `WHERE col = ?` con `params.push_back(v)` (DuckDB acepta `$1`/`?`).
- `breakout "ts:month"` → `date_trunc('month', ts) AS "ts:month"`. Granularity sufijo → DuckDB `date_trunc`.
- `aggregation count` → `COUNT(*) AS count`.
- `aggregation p95(col)` → `quantile_cont(col, 0.95) AS p95_col`.
- `aggregation distinct col` → `COUNT(DISTINCT col) AS distinct_col`.
- `sort {desc, col}` → `ORDER BY col DESC`.
- Joins: 4 strategies mapean directo a `LEFT/INNER/RIGHT/FULL JOIN ... ON l.k = r.k`.
- Derived cols: transpiladas via Lua subset (§4). Si formula fuera de subset → `SqlEmit.error = "lua formula 'X' out of subset: <razon>"`.
### 6. Coste / rate limit
Salida es **string SQL valido DuckDB**. No ejecuta — eso es responsabilidad del adapter opcional (§5).
### 4. Lua subset transpilable a SQL — GRAMATICA
Documentar en `docs/TQL.md` seccion nueva "SQL transpile subset".
**Reglas duras: Lua sigue siendo potente y sin limites en runtime general.** El subset solo aplica si el caller pide `tql_to_sql::emit_sql()`. Fuera del subset → error claro en tiempo de emit, NO en tiempo de eval. El playground sigue ejecutando Lua arbitrario sin restriccion.
**Subset permitido (transpila a SQL):**
| Lua | SQL DuckDB |
|---|---|
| Literales: numero, string `"x"`, bool `true/false`, `nil` | `1.5`, `'x'`, `TRUE/FALSE`, `NULL` |
| Col ref: `[colname]` | `colname` (identifier quoted si necesario) |
| Aritmetica: `+ - * / % - (unary)` | mismas |
| Comparacion: `== ~= < <= > >=` | `= <> < <= > >=` |
| Logica: `and or not` | `AND OR NOT` |
| String concat: `..` | `\|\|` |
| Ternary: `if A then B else C end` | `CASE WHEN A THEN B ELSE C END` |
| Ternary inline: `(A and B) or C` (pattern comun Lua; equivalente solo si B nunca es `false`/`nil`) | `CASE WHEN A THEN B ELSE C END` |
| `math.floor/ceil/abs/round/sqrt/sin/cos/log` | `floor/ceiling/abs/round/sqrt/sin/cos/ln` |
| `math.min(a,b)/max(a,b)` | `least(a,b)/greatest(a,b)` |
| `string.upper/lower/len(s)` | `upper(s)/lower(s)/length(s)` |
| `string.sub(s, i, j)` | `substring(s, i, j-i+1)` |
| `tostring(x)/tonumber(x)` | `CAST(x AS VARCHAR)/CAST(x AS DOUBLE)` |
| Paréntesis y precedencia | mismas |
**Fuera de subset (error compile-time):**
- Closures: `function() ... end`
- Loops: `for/while/repeat`
- Locals: `local x = ...`
- Tables: `{...}`, `t[k]`, `t.field`, `table.*`
- Multi-return / vararg
- `string.gsub/find/match/format` (mapeo manual posible v2)
- IO: `io.*`, `os.*`, `print`
- Coroutines, metatables, debug
- Recursion, multi-statement bodies
**Error message ejemplo:**
```
SQL transpile error en derived col 'fullname':
formula = "[first] .. ' ' .. table.concat(parts, ',')"
causa: 'table.concat' no esta en SQL transpile subset
ver docs/TQL.md#sql-transpile-subset
workaround: usar TQL puro (sin SQL emit) o reescribir formula con `..`
```
**Helper:** `tql_to_sql::is_transpilable(formula, error_out)` pure fn que valida una formula sin emitir.
### 5. DuckDB adapter (opcional)
Build flag `FN_TQL_DUCKDB=1` en `cpp/CMakeLists.txt` opta-in. Vendor DuckDB header-only o lib (depende de tamaño). Default OFF — playground sigue compilando sin DuckDB.
API adapter:
```cpp
namespace tql_duckdb {
struct Result {
StageOutput out; // materializado como TableInput compatible
std::string error;
double duration_ms = 0;
};
// Hidrata `tables` como views temp + ejecuta sql + materializa resultado.
Result execute(const std::string& sql,
const std::vector<std::string>& params,
const std::vector<TableInput>& tables);
}
```
Apps que lo usen (registry_dashboard, sqlite_api): linkean DuckDB + invocan adapter cuando user/agent pide SQL output. Playground por defecto NO linka — `Ask AI` solo ofrece SQL mode si `#ifdef FN_TQL_DUCKDB`.
### 6. Validacion + safety
- Antes de aplicar TQL del LLM: `tql::apply` dry-run. Si fail, mostrar error + "Ask AI again with this error".
- Antes de ejecutar SQL del LLM: parsing DuckDB en sandbox read-only (DuckDB connection sin `INSERT/UPDATE/DELETE/DROP`, attach read-only).
- Lua sandbox ya cubre side effects en formulas TQL.
### 7. Streaming
- Stream tokens via SSE (`stream=true` Anthropic).
- Texto en vivo en modal.
- Cuando termina, parse lua/sql block final.
### 8. Persistencia conversacion
- UiState guarda lista de turns (pregunta + output propuesto + apply result + engine usado TQL/SQL).
- Siguiente "Ask AI" turn incluye history previa.
- Boton "Reset chat".
- NO persistido en TQL (UI state efimero).
### 9. Coste / rate limit
- Mostrar tokens estimados antes de enviar (rough char count / 4).
- Cap input a 8000 tokens.
- Error handling: 429 / 5xx -> mensaje + reintentar.
- Error handling: 429 / 5xx → mensaje + reintentar.
## Tests
- Mockear HTTP response con cURL stub.
- Test: prompt build incluye schema + TQL + pregunta en formato esperado.
- Test: response parse extrae lua block correctamente.
- Test: tql::apply sobre output del LLM funciona end-to-end con dataset sintetico.
### Pure (sin red, sin DuckDB linkado)
- **Lua subset validator:** `is_transpilable` true para casos subset, false con error claro para fuera de subset (closures, loops, table.*, string.gsub, etc.).
- **TQL → SQL emit golden tests** (~20 casos):
- stage 0 simple filter + sort → `SELECT ... WHERE ... ORDER BY ...`
- stage 1 group + count → CTE chain con GROUP BY
- granularity sufijo `:month` → `date_trunc('month', ts)`
- join 4 strategies con multi-key
- derived cols subset → CASE/expressions
- derived cols fuera subset → `SqlEmit.error` no vacio + warning
- aggregation p25/p50/p75/p99 → `quantile_cont(col, p)`
- empty pipeline → `SELECT * FROM t0`
- **Prompt build:** el prompt incluye schema + TQL + pregunta en formato esperado (mockear HTTP).
- **Response parse:** extrae lua/sql block correctamente.
### Round-trip (requiere DuckDB linkado)
Solo corren si `FN_TQL_DUCKDB=1`:
- TQL → emit SQL → ejecutar DuckDB → resultado coincide bit-a-bit con `compute_stage` pure sobre los mismos cells.
- Casos: filter, group+agg, join inner, multi-stage chain, breakout granularity month/week, derived col `[a] + [b] * 2`.
### LLM (red real, opt-in)
- Test integration con `ANTHROPIC_API_KEY` real (`make test-llm`): pregunta simple → recibe TQL valido → apply OK.
- Mock test (CI): cURL stub responde con JSON predefinido → parser extrae bloque OK.
## No-objetivos
- Generacion de visualizaciones nuevas via LLM (la viz la elige TQL `display`, suficiente).
- Acciones del LLM mas alla de modificar TQL (sin acceso a I/O del sistema).
- Multi-provider (OpenAI / local) — fase futura. Hardcode Anthropic primero.
- **SQL → TQL**: no se implementa. Documentado en doc + en mensajes de error del Ask AI ("no soportamos SQL como input, use TQL").
- **Multi-provider** (OpenAI, local): fase futura. Anthropic hardcoded v1.
- **Generacion de viz desde LLM** mas alla de `display` token: la viz la elige TQL existente.
- **Lua subset extension** (string.gsub, regex, table.*): postpone v2 si demanda real.
- **DuckDB write ops**: solo SELECT/CTE. Apps que quieran INSERT/UPDATE lo hacen fuera del playground.
## Flujo agente (resumen)
```
Agente -> "muestrame top 10 langs por total size"
LLM (TQL default) -> emite TQL { stages = {...} }
tql::apply -> State + dry-run OK
User clickea Apply -> compute_stage en memoria
Agente -> "lo mismo pero como SQL"
[Si FN_TQL_DUCKDB=1 y app linkó adapter]
LLM (SQL mode toggled) -> emite SELECT ... DuckDB
duckdb::execute(sql, params, tables) -> resultado materializado
[Si NO linkado] -> error "SQL mode requiere DuckDB. Compila con FN_TQL_DUCKDB=1"
```
## Riesgos
- **Subset Lua restrictivo en SQL emit**: usuarios usan Lua arbitrario en playground → al pedir SQL falla. Mitigacion: error message claro + sugerencia workaround.
- **DuckDB tamaño**: lib ~10MB. Solo se paga si app opta-in con build flag.
- **Dialect drift DuckDB**: funciones SQL pueden cambiar entre versiones. Pinear DuckDB version en CMake.
- **LLM hallucinations**: TQL invalido → dry-run rechaza con error. Loop "Ask AI again with this error" recupera.
- **API key leak**: `pass` integration mantiene fuera del repo. Build flag NUNCA imprime key.
- **Coste tokens**: prompt grande (schema + grammar + TQL). Cap input + warning visual.
+84
View File
@@ -496,3 +496,87 @@ StageOutput compute_stage(const char* const* in_cells, int in_rows, int in_cols,
| Multi-sort drag-reorder | Phase 4 |
Ver `cpp/apps/primitives_gallery/playground/tables/` para la implementacion del playground.
---
## SQL transpile subset (fase 11 — issue 0080)
TQL emite SQL DuckDB equivalente para que agentes LLM puedan generar TQL o SQL contra los mismos datos. Modulo `tql_to_sql.{h,cpp}` provee `emit_sql(State, tables)`. Mapeo MBQL-style con CTE chain `t0..tN`.
### Lua subset transpilable
Lua sigue **potente y sin limites en runtime general** (formula eval en derived cols TQL puro). El subset SOLO aplica al pedir `tql_to_sql::emit_sql()`. Fuera del subset → error compile-time con causa concreta + workaround.
**Permitido (transpila a SQL DuckDB):**
| Lua | SQL DuckDB | Ejemplo |
|---|---|---|
| Literales numero/string/bool/nil | mismas (`'x'`, `TRUE`, `NULL`) | `42`, `"hola"`, `nil` |
| Col ref: `[colname]` | `"colname"` (quoted) | `[size_kb]` → `"size_kb"` |
| Aritmetica: `+ - * / % - (unary)` | mismas | `[a] + [b] * 2` → `("a" + ("b" * 2))` |
| Comparacion: `== ~= < <= > >=` | `= <> < <= > >=` | `[n] >= 10` → `("n" >= 10)` |
| Logica: `and or not` | `AND OR NOT` | `[a] and [b]` → `("a" AND "b")` |
| String concat: `..` | `\|\|` | `[a] .. "_" .. [b]` → `("a" \|\| '_' \|\| "b")` |
| Ternary: `if A then B else C end` | `CASE WHEN A THEN B ELSE C END` | obligatorio `else` |
| `math.floor/ceil/abs/sqrt/sin/cos/log/exp` | `floor/ceiling/abs/sqrt/sin/cos/ln/exp` | `math.floor([x])` |
| `math.min(a,b)/max(a,b)` | `least(a,b)/greatest(a,b)` | `math.min([a], 100)` |
| `string.upper/lower/len(s)` | `upper(s)/lower(s)/length(s)` | `string.upper([name])` |
| `string.sub(s, i [, j])` | `substring(s, i [, j-i+1])` | `string.sub([s], 1, 3)` |
| `tostring(x)/tonumber(x)` | `CAST(x AS VARCHAR)/CAST(x AS DOUBLE)` | `tonumber([n])` |
| Parentesis y precedencia Lua | mismas | `(a + b) * c` |
**Fuera de subset (error compile-time):**
- Closures: `function() ... end`
- Loops: `for/while/repeat`
- Locals: `local x = ...`
- Tables: `{...}`, `t[k]`, `t.field`, `table.*`
- Multi-return, vararg `...`
- `string.gsub/find/match/format/byte/char/rep`
- IO/OS/debug: `io.*`, `os.*`, `debug.*`, `package`, `require`, `print`
- Coroutines, metatables, `pcall/xpcall`, `rawget/rawset`
- Recursion, multi-statement bodies (`;`)
- Length operator `#`
- Method calls `:`
- Ternary sin else: `if A then B end` (subset requiere ambas ramas)
### Error message ejemplo
```
SQL transpile error en derived col 'fullname':
formula = "[first] .. ' ' .. string.gsub([last], 'X', 'Y')"
causa: function 'string.gsub' not in SQL transpile whitelist
ver docs/TQL.md#sql-transpile-subset
workaround: usar TQL puro (sin SQL emit) o reescribir formula
```
### Stage → SQL mapeo
| TQL element | SQL DuckDB |
|---|---|
| Stage 0 Raw | CTE `t0 AS (SELECT cols+derived FROM main_t [JOIN ...] [WHERE filters] [ORDER BY sorts])` |
| Stage N>=1 | CTE `tN AS (SELECT breakouts+aggs FROM tN-1 [GROUP BY ...] [ORDER BY ...])` |
| breakout `"col"` | `"col"` |
| breakout `"col:month"` | `date_trunc('month', "col")` |
| breakout `"col:year/week/day/hour"` | `date_trunc('year/week/day/hour', "col")` |
| Aggregation Count | `COUNT(*)` |
| Aggregation Sum/Avg/Min/Max/Stddev | `SUM/AVG/MIN/MAX/STDDEV("col")` |
| Aggregation Distinct | `COUNT(DISTINCT "col")` |
| Aggregation Median/P25/P75/P90/P99 | `quantile_cont("col", p)` |
| Aggregation Percentile p | `quantile_cont("col", p)` |
| Filter Op::Eq/Neq/Gt/Gte/Lt/Lte | `"col" = ?` etc (params bound) |
| Filter Op::Contains | `"col" LIKE '%v%'` (param `%v%`) |
| Filter Op::StartsWith / EndsWith | `LIKE 'v%'` / `LIKE '%v'` |
| Sort `{desc, "col"}` | `ORDER BY "col" DESC` |
| Join Left/Inner/Right/Full | `LEFT/INNER/RIGHT/FULL OUTER JOIN ... ON ...` |
| Join multi-key `on={{l1,r1},{l2,r2}}` | `ON l.l1 = r.r1 AND l.l2 = r.r2` |
| Join fields | cols `alias.field AS "alias.field"` |
| `main_source` | `FROM "main_source_name"` |
### Doctrina (Metabase-style)
- **One-way:** TQL → SQL OK. SQL → TQL no soportado. Razon: traduccion inversa lossy (CTEs, window fns, set ops, lateral, correlated subqueries no caben en TQL).
- **Output:** SQL string siempre emitible. Ejecucion requiere DuckDB linkado (build flag `FN_TQL_DUCKDB=1`, opcional).
- **Agente flow:** TQL default. SQL solo si app linko DuckDB. UI Ask AI muestra toggle SQL solo cuando disponible.
Ver issue 0080 + `tql_to_sql.{h,cpp}` para implementacion.
+155
View File
@@ -0,0 +1,155 @@
package core
import (
"bufio"
"context"
"fmt"
"io"
"os"
"os/exec"
"sync"
"syscall"
"time"
)
// StreamEvent es una linea capturada de stdout o stderr del subproceso.
type StreamEvent struct {
Stream string // "stdout" | "stderr"
Line string // sin trailing newline
Time time.Time // timestamp de recepcion
}
// StreamResult es el resultado final del subproceso, enviado por el canal de
// resultados cuando ambos pipes han llegado a EOF y el proceso ha terminado.
type StreamResult struct {
ExitCode int
Err error
DurationMs int64
}
// SubprocessStream lanza name con args como subproceso y retorna dos canales:
// - events: recibe StreamEvent (linea de stdout/stderr) hasta EOF de ambos pipes.
// - result: recibe exactamente un StreamResult cuando el proceso termina.
//
// env se concatena con os.Environ(). stdin puede ser nil.
//
// Cancelar ctx envia SIGTERM al proceso; si no termina en 2 segundos, SIGKILL.
// El caller DEBE consumir events hasta que se cierre o cancelar ctx para evitar
// bloquear las goroutines internas.
func SubprocessStream(
ctx context.Context,
name string,
args []string,
env []string,
stdin io.Reader,
) (<-chan StreamEvent, <-chan StreamResult) {
events := make(chan StreamEvent, 64)
results := make(chan StreamResult, 1)
go func() {
defer close(events)
defer close(results)
start := time.Now()
cmd := exec.CommandContext(ctx, name, args...)
// Entorno: base + extra
if len(env) > 0 {
cmd.Env = append(os.Environ(), env...)
}
if stdin != nil {
cmd.Stdin = stdin
}
// Process group propio para matar hijos al recibir SIGTERM/SIGKILL
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
stdoutPipe, err := cmd.StdoutPipe()
if err != nil {
results <- StreamResult{ExitCode: -1, Err: fmt.Errorf("stdout pipe: %w", err), DurationMs: 0}
return
}
stderrPipe, err := cmd.StderrPipe()
if err != nil {
results <- StreamResult{ExitCode: -1, Err: fmt.Errorf("stderr pipe: %w", err), DurationMs: 0}
return
}
if err := cmd.Start(); err != nil {
results <- StreamResult{ExitCode: -1, Err: fmt.Errorf("start: %w", err), DurationMs: 0}
return
}
// Goroutine de supervision de ctx: SIGTERM → grace 2s → SIGKILL
ctxDone := make(chan struct{})
go func() {
select {
case <-ctx.Done():
if cmd.Process != nil {
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGTERM)
timer := time.NewTimer(2 * time.Second)
defer timer.Stop()
select {
case <-timer.C:
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
case <-ctxDone:
}
}
case <-ctxDone:
}
}()
send := func(stream, line string) {
ev := StreamEvent{Stream: stream, Line: line, Time: time.Now()}
select {
case events <- ev:
case <-ctx.Done():
}
}
// Leer stdout y stderr concurrentemente
const bufSize = 1024 * 1024 // 1 MB para lineas largas (sd-cli progress, etc.)
var wg sync.WaitGroup
scanPipe := func(r io.Reader, stream string) {
defer wg.Done()
sc := bufio.NewScanner(r)
sc.Buffer(make([]byte, bufSize), bufSize)
for sc.Scan() {
send(stream, sc.Text())
}
}
wg.Add(2)
go scanPipe(stdoutPipe, "stdout")
go scanPipe(stderrPipe, "stderr")
wg.Wait()
close(ctxDone) // señal al supervisor de ctx para que pare
exitCode := 0
var waitErr error
if err := cmd.Wait(); err != nil {
waitErr = err
if exitErr, ok := err.(*exec.ExitError); ok {
exitCode = exitErr.ExitCode()
waitErr = nil // exit code no-cero no es un error de spawn
}
}
// Si el contexto fue cancelado, reportar como error de cancelacion
if ctx.Err() != nil && waitErr == nil {
waitErr = ctx.Err()
}
results <- StreamResult{
ExitCode: exitCode,
Err: waitErr,
DurationMs: time.Since(start).Milliseconds(),
}
}()
return events, results
}
+69
View File
@@ -0,0 +1,69 @@
---
name: subprocess_stream
kind: function
lang: go
domain: core
version: "1.0.0"
purity: impure
signature: "func SubprocessStream(ctx context.Context, name string, args []string, env []string, stdin io.Reader) (<-chan StreamEvent, <-chan StreamResult)"
description: "Lanza un subproceso y retorna dos canales: uno con StreamEvent (linea de stdout/stderr con timestamp) y otro con un unico StreamResult (ExitCode, Err, DurationMs). Cancelar ctx envia SIGTERM al proceso; si no termina en 2s, SIGKILL."
tags: [subprocess, exec, stream, stdout, stderr, process, concurrency, io, primitiva]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [bufio, context, fmt, io, os, os/exec, sync, syscall, time]
params:
- name: ctx
desc: "Contexto de cancelacion. Al cancelar, el proceso recibe SIGTERM; si no muere en 2s, SIGKILL. Usar context.WithTimeout para acotar duracion maxima."
- name: name
desc: "Nombre o path del ejecutable a lanzar (ej. 'echo', '/usr/bin/python3')."
- name: args
desc: "Argumentos del proceso. Puede ser nil o vacio."
- name: env
desc: "Variables de entorno adicionales en formato 'KEY=VALUE'. Se concatenan con os.Environ(). Puede ser nil."
- name: stdin
desc: "Stdin del proceso. Puede ser nil si el proceso no necesita entrada."
output: "Dos canales: events (<-chan StreamEvent) cerrado cuando ambos pipes EOF; result (<-chan StreamResult) con exactamente un valor cuando el proceso termina. El caller DEBE consumir events hasta cierre o cancelar ctx para evitar bloquear goroutines internas."
tested: true
tests:
- "echo stdout llega como evento y ExitCode 0"
- "stderr llega como evento con stream stderr"
- "exit code no-cero se reporta en StreamResult"
- "ctx cancelado termina el proceso"
- "multiples lineas stdout"
test_file_path: "functions/core/subprocess_stream_test.go"
file_path: "functions/core/subprocess_stream.go"
---
## Ejemplo
```go
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
events, results := core.SubprocessStream(ctx, "grep", []string{"-rn", "TODO", "."}, nil, nil)
for ev := range events {
switch ev.Stream {
case "stdout":
fmt.Println(ev.Line)
case "stderr":
fmt.Fprintln(os.Stderr, "[stderr]", ev.Line)
}
}
res := <-results
if res.ExitCode != 0 || res.Err != nil {
log.Printf("grep exit=%d err=%v duration=%dms", res.ExitCode, res.Err, res.DurationMs)
}
```
## Notas
- El canal `events` tiene buffer de 64. Si el caller deja de consumir y el buffer se llena, las goroutines internas se bloquean hasta que haya espacio o el ctx sea cancelado.
- El scanner de cada pipe tiene un buffer de 1 MB para tolerar lineas muy largas (progreso de CLIs tipo sd-cli, barras ANSI largas).
- Los structs `StreamEvent` y `StreamResult` se declaran en el mismo archivo para que el paquete `core` los exporte sin imports adicionales.
- Generaliza el patron de `claude_stream_go_core` desacoplando el lanzamiento de subprocesos del protocolo especifico de claude (NDJSON/stream-json). `claude_stream_go_core` puede reimplementarse internamente usando esta funcion como primitiva.
- `cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}` crea un process group propio; SIGTERM/SIGKILL se envian con `Kill(-pgid, sig)` para matar tambien los procesos hijo del hijo.
+132
View File
@@ -0,0 +1,132 @@
package core
import (
"context"
"testing"
"time"
)
func TestSubprocessStream(t *testing.T) {
t.Run("echo stdout llega como evento y ExitCode 0", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
events, results := SubprocessStream(ctx, "echo", []string{"hola"}, nil, nil)
var got []StreamEvent
for ev := range events {
got = append(got, ev)
}
res := <-results
if res.ExitCode != 0 {
t.Errorf("ExitCode = %d, want 0 (err: %v)", res.ExitCode, res.Err)
}
if res.Err != nil {
t.Errorf("unexpected Err: %v", res.Err)
}
if len(got) != 1 {
t.Fatalf("got %d events, want 1", len(got))
}
if got[0].Stream != "stdout" {
t.Errorf("Stream = %q, want %q", got[0].Stream, "stdout")
}
if got[0].Line != "hola" {
t.Errorf("Line = %q, want %q", got[0].Line, "hola")
}
})
t.Run("stderr llega como evento con stream stderr", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// sh -c "echo msg >&2" escribe a stderr
events, results := SubprocessStream(ctx, "sh", []string{"-c", "echo error_msg >&2"}, nil, nil)
var got []StreamEvent
for ev := range events {
got = append(got, ev)
}
res := <-results
if res.ExitCode != 0 {
t.Errorf("ExitCode = %d, want 0", res.ExitCode)
}
if len(got) != 1 {
t.Fatalf("got %d events, want 1", len(got))
}
if got[0].Stream != "stderr" {
t.Errorf("Stream = %q, want %q", got[0].Stream, "stderr")
}
if got[0].Line != "error_msg" {
t.Errorf("Line = %q, want %q", got[0].Line, "error_msg")
}
})
t.Run("exit code no-cero se reporta en StreamResult", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
events, results := SubprocessStream(ctx, "sh", []string{"-c", "exit 42"}, nil, nil)
for range events {
}
res := <-results
if res.ExitCode != 42 {
t.Errorf("ExitCode = %d, want 42", res.ExitCode)
}
if res.Err != nil {
t.Errorf("unexpected Err: %v", res.Err)
}
})
t.Run("ctx cancelado termina el proceso", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
// proceso que dura mucho; cancelamos enseguida
ctxShort, cancelShort := context.WithTimeout(ctx, 100*time.Millisecond)
defer cancelShort()
events, results := SubprocessStream(ctxShort, "sleep", []string{"60"}, nil, nil)
for range events {
}
res := <-results
// Tras cancelacion el proceso debe haber terminado (ExitCode != 0 o Err de ctx)
if res.ExitCode == 0 && res.Err == nil {
t.Error("expected non-zero exit or ctx error after cancellation")
}
if res.DurationMs > 3000 {
t.Errorf("took %d ms, expected < 3000 (should have been killed)", res.DurationMs)
}
})
t.Run("multiples lineas stdout", func(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
events, results := SubprocessStream(ctx, "sh", []string{"-c", "printf 'a\nb\nc\n'"}, nil, nil)
var lines []string
for ev := range events {
if ev.Stream == "stdout" {
lines = append(lines, ev.Line)
}
}
<-results
if len(lines) != 3 {
t.Fatalf("got %d stdout lines, want 3: %v", len(lines), lines)
}
want := []string{"a", "b", "c"}
for i, w := range want {
if lines[i] != w {
t.Errorf("line[%d] = %q, want %q", i, lines[i], w)
}
}
})
}
+238
View File
@@ -0,0 +1,238 @@
package infra
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// MlEnvCheck holds the result of a single ML environment probe (one binary,
// one Python package, the venv itself, ...).
type MlEnvCheck struct {
	Name    string `json:"name"`              // probe identifier, e.g. "nvcc", "python_venv", "sd_cli"
	Status  string `json:"status"`            // "ok" | "missing" | "warning" | "unknown"
	Version string `json:"version,omitempty"` // version string if detected; omitted from JSON when empty
	Detail  string `json:"detail,omitempty"`  // human-readable extra info (typically why the probe is not "ok")
}
// MlEnvReport is the full ML environment audit result, as populated by
// AuditMlEnv.
type MlEnvReport struct {
	Gpus        []GpuInfo    `json:"gpus"`         // GPUs returned by GetGpuInfo; empty slice when detection failed
	Checks      []MlEnvCheck `json:"checks"`       // one entry per probe, in the order the probes ran
	OverallOK   bool         `json:"overall_ok"`   // true when at least one GPU was found and no required check is "missing"
	GeneratedAt int64        `json:"generated_at"` // Unix timestamp in seconds, taken when the audit started
}
// AuditMlEnv inspects the ML environment rooted at registryRoot and returns a
// report with the detected GPUs plus one MlEnvCheck per probe.
//
// Probes run in a fixed order: nvidia-smi, nvcc, the Python venv at
// <registryRoot>/python/.venv, a fixed list of Python packages imported with
// that venv's interpreter, the sd and llama-cli binaries, and finally the
// imagegen vault.
//
// Individual probe failures never abort the audit; they are recorded in the
// report. A GPU detection error is treated as "no GPUs". The returned error is
// currently always nil.
//
// OverallOK is true when at least one GPU was detected and no required check
// has status "missing". stable_diffusion_cpp_python, sd_cli and llama_cpp are
// optional and may be missing; "warning" and "unknown" are tolerated.
func AuditMlEnv(registryRoot string) (MlEnvReport, error) {
	generatedAt := time.Now().Unix()

	// GPU detection: a failure is non-fatal and simply means "none found".
	gpus, gpuErr := GetGpuInfo()
	if gpuErr != nil {
		gpus = []GpuInfo{}
	}

	// Interpreter used for the per-package import probes.
	venvPy := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
	pythonPkgs := []string{"torch", "diffusers", "transformers", "huggingface_hub", "stable_diffusion_cpp_python"}

	checks := make([]MlEnvCheck, 0, 6+len(pythonPkgs))
	checks = append(checks,
		probeCommand("nvidia_smi", "nvidia-smi", []string{"--version"}, 5),
		probeNvcc(),
		probeVenv(registryRoot),
	)
	for _, pkg := range pythonPkgs {
		checks = append(checks, probePythonPackage(venvPy, pkg))
	}
	checks = append(checks,
		probeCommand("sd_cli", "sd", []string{"--version"}, 5),
		probeCommand("llama_cpp", "llama-cli", []string{"--version"}, 5),
		probeImagegenVault(),
	)

	// Any required component that is missing makes the environment not OK.
	overallOK := len(gpus) > 0
	for _, c := range checks {
		if c.Status != "missing" {
			continue
		}
		switch c.Name {
		case "stable_diffusion_cpp_python", "sd_cli", "llama_cpp":
			// Optional tooling: its absence does not fail the audit.
		default:
			overallOK = false
		}
	}

	return MlEnvReport{
		Gpus:        gpus,
		Checks:      checks,
		OverallOK:   overallOK,
		GeneratedAt: generatedAt,
	}, nil
}
// probeCommand reports whether binary can be resolved through PATH and, if
// so, runs it with args under a timeout of timeoutSec seconds.
//
// The returned check is named name and has status:
//   - "missing" when the binary is not found in PATH,
//   - "warning" when it was found but the run returned an error,
//   - "ok" otherwise.
//
// Version carries the trimmed combined stdout+stderr of the run, cut to at
// most 120 bytes.
func probeCommand(name, binary string, args []string, timeoutSec int) MlEnvCheck {
	limit := time.Duration(timeoutSec) * time.Second
	ctx, cancel := context.WithTimeout(context.Background(), limit)
	defer cancel()

	check := MlEnvCheck{Name: name}

	resolved, lookErr := exec.LookPath(binary)
	if lookErr != nil {
		check.Status = "missing"
		check.Detail = fmt.Sprintf("%s not found in PATH", binary)
		return check
	}

	raw, runErr := exec.CommandContext(ctx, resolved, args...).CombinedOutput()
	check.Version = strings.TrimSpace(string(raw))
	if len(check.Version) > 120 {
		check.Version = check.Version[:120]
	}

	if runErr != nil {
		check.Status = "warning"
		check.Detail = fmt.Sprintf("exit error: %v", runErr)
		return check
	}
	check.Status = "ok"
	return check
}
// probeNvcc runs `nvcc --version` (5 second timeout) and reports the CUDA
// toolkit release it announces.
//
// The version is read from the first output line containing "release", e.g.
// "Cuda compilation tools, release 12.4, V12.4.99" yields "12.4". If no
// release value can be extracted, the trimmed output cut to 80 bytes is used
// instead.
func probeNvcc() MlEnvCheck {
	const checkName = "nvcc"

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	nvccPath, lookErr := exec.LookPath("nvcc")
	if lookErr != nil {
		return MlEnvCheck{Name: checkName, Status: "missing", Detail: "nvcc not found in PATH (CUDA toolkit not installed)"}
	}

	raw, runErr := exec.CommandContext(ctx, nvccPath, "--version").CombinedOutput()
	if runErr != nil {
		return MlEnvCheck{Name: checkName, Status: "warning", Detail: fmt.Sprintf("nvcc --version failed: %v", runErr)}
	}
	text := string(raw)

	// Locate the first line that mentions "release"; only that line is parsed.
	releaseLine, found := "", false
	for _, line := range strings.Split(text, "\n") {
		if strings.Contains(line, "release") {
			releaseLine, found = line, true
			break
		}
	}

	// Within that line, take the first comma-separated field that starts
	// with "release" and keep whatever follows the keyword.
	release := ""
	if found {
		for _, field := range strings.Split(releaseLine, ",") {
			field = strings.TrimSpace(field)
			if !strings.HasPrefix(field, "release") {
				continue
			}
			release = strings.TrimSpace(strings.TrimPrefix(field, "release"))
			break
		}
	}

	// Fallback: expose the (truncated) raw output rather than nothing.
	if release == "" {
		release = strings.TrimSpace(text)
		if len(release) > 80 {
			release = release[:80]
		}
	}

	return MlEnvCheck{Name: checkName, Status: "ok", Version: release}
}
// probeVenv verifies that <registryRoot>/python/.venv/bin/python3 exists and
// answers `--version` within 5 seconds.
//
// Status is "missing" when the interpreter file does not exist, "warning"
// when it exists but the version command fails, and "ok" otherwise. Version
// carries the trimmed combined output of the command.
func probeVenv(registryRoot string) MlEnvCheck {
	const checkName = "python_venv"

	interpreter := filepath.Join(registryRoot, "python", ".venv", "bin", "python3")
	_, statErr := os.Stat(interpreter)
	if os.IsNotExist(statErr) {
		return MlEnvCheck{Name: checkName, Status: "missing", Detail: fmt.Sprintf("not found: %s", interpreter)}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	raw, runErr := exec.CommandContext(ctx, interpreter, "--version").CombinedOutput()
	reported := strings.TrimSpace(string(raw))

	if runErr == nil {
		return MlEnvCheck{Name: checkName, Status: "ok", Version: reported}
	}
	return MlEnvCheck{
		Name:    checkName,
		Status:  "warning",
		Version: reported,
		Detail:  fmt.Sprintf("python3 --version failed: %v", runErr),
	}
}
// probePythonPackage imports pkg with the interpreter at venvPy and reports
// the module's __version__ (or "unknown" when the attribute is absent).
//
// Status values:
//   - "unknown" when the interpreter file does not exist;
//   - "missing" when the import fails with a module-not-found message;
//   - "warning" for any other failure (Detail holds up to 200 bytes of output);
//   - "ok" when the import succeeds.
//
// The command runs with a 5 second timeout.
func probePythonPackage(venvPy, pkg string) MlEnvCheck {
	// Package names whose importable module is spelled differently; anything
	// not listed is imported under its own name.
	importAliases := map[string]string{
		"stable_diffusion_cpp_python": "stable_diffusion_cpp",
		"huggingface_hub":             "huggingface_hub",
	}
	module, aliased := importAliases[pkg]
	if !aliased {
		module = pkg
	}

	// Without the interpreter there is nothing to probe.
	if _, statErr := os.Stat(venvPy); os.IsNotExist(statErr) {
		return MlEnvCheck{Name: pkg, Status: "unknown", Detail: "python_venv not available"}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	code := fmt.Sprintf("import %s; v = getattr(%s, '__version__', None); print(v or 'unknown')", module, module)
	raw, runErr := exec.CommandContext(ctx, venvPy, "-c", code).CombinedOutput()
	printed := strings.TrimSpace(string(raw))

	if runErr == nil {
		return MlEnvCheck{Name: pkg, Status: "ok", Version: printed}
	}

	// The full output is searched, not the truncated detail.
	notInstalled := strings.Contains(printed, "ModuleNotFoundError") || strings.Contains(printed, "No module named")
	if notInstalled {
		return MlEnvCheck{Name: pkg, Status: "missing", Detail: fmt.Sprintf("%s not installed", module)}
	}

	detail := printed
	if len(detail) > 200 {
		detail = detail[:200]
	}
	return MlEnvCheck{Name: pkg, Status: "warning", Detail: detail}
}
// probeImagegenVault inspects ~/vaults/imagegen_models and lists its
// immediate subdirectories in Detail.
//
// Status is "unknown" when the home directory cannot be determined, "missing"
// when the vault directory does not exist, "warning" when it cannot be read,
// and "ok" otherwise (including when it contains no subdirectories).
func probeImagegenVault() MlEnvCheck {
	const checkName = "imagegen_vault"

	homeDir, homeErr := os.UserHomeDir()
	if homeErr != nil {
		return MlEnvCheck{Name: checkName, Status: "unknown", Detail: "cannot determine home directory"}
	}

	vaultDir := filepath.Join(homeDir, "vaults", "imagegen_models")
	listing, readErr := os.ReadDir(vaultDir)
	switch {
	case os.IsNotExist(readErr):
		return MlEnvCheck{Name: checkName, Status: "missing", Detail: fmt.Sprintf("vault not found: %s", vaultDir)}
	case readErr != nil:
		return MlEnvCheck{Name: checkName, Status: "warning", Detail: fmt.Sprintf("cannot read vault: %v", readErr)}
	}

	// Only directories count; plain files in the vault root are ignored.
	var dirNames []string
	for _, item := range listing {
		if !item.IsDir() {
			continue
		}
		dirNames = append(dirNames, item.Name())
	}

	summary := "vault exists but is empty"
	if len(dirNames) > 0 {
		summary = fmt.Sprintf("subdirs: %s", strings.Join(dirNames, ", "))
	}
	return MlEnvCheck{Name: checkName, Status: "ok", Detail: summary}
}
+67
View File
@@ -0,0 +1,67 @@
---
name: audit_ml_env
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func AuditMlEnv(registryRoot string) (MlEnvReport, error)"
description: "Audita el entorno ML del sistema: GPUs NVIDIA, toolkit CUDA, venv Python, paquetes clave (torch, diffusers, transformers, huggingface_hub), herramientas CLI (sd, llama-cli) y el vault de modelos. Retorna un MlEnvReport con OverallOK=true solo si hay al menos 1 GPU y los checks criticos estan en ok/warning."
tags: [ml, cuda, gpu, nvidia, audit, doctor, infra, torch, diffusers]
uses_functions: [get_gpu_info_go_infra]
uses_types: [gpu_info_go_infra]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [context, fmt, os, os/exec, path/filepath, strings, time]
tested: true
tests:
- "report no nil y tiene checks"
- "generated_at es positivo"
- "checks tiene al menos 4 entradas"
- "gpus puede ser vacio en CI"
test_file_path: "functions/infra/audit_ml_env_test.go"
file_path: "functions/infra/audit_ml_env.go"
params:
- name: registryRoot
desc: "Ruta absoluta a la raiz del fn_registry. Se usa para localizar python/.venv/bin/python3 y probar paquetes instalados."
output: "MlEnvReport con Gpus (puede estar vacio si no hay NVIDIA), Checks con estado por herramienta/paquete, OverallOK y GeneratedAt (unix timestamp)."
---
## Checks realizados
| Check | Tipo | Critico |
|---|---|---|
| `nvidia_smi` | binary in PATH | no (ok si hay GPU) |
| `nvcc` | CUDA toolkit version | no |
| `python_venv` | exists + `python3 --version` | si |
| `torch` | `import torch; __version__` | si |
| `diffusers` | `import diffusers; __version__` | si |
| `transformers` | `import transformers; __version__` | si |
| `huggingface_hub` | `import huggingface_hub; __version__` | si |
| `stable_diffusion_cpp_python` | `import stable_diffusion_cpp` | no (opcional) |
| `sd_cli` | `sd --version` in PATH | no (opcional) |
| `llama_cpp` | `llama-cli --version` in PATH | no (opcional) |
| `imagegen_vault` | `~/vaults/imagegen_models` exists | no |
## Ejemplo
```go
root := "/home/lucas/fn_registry"
report, err := AuditMlEnv(root)
if err != nil {
log.Fatal(err)
}
for _, c := range report.Checks {
fmt.Printf("%-40s %s %s\n", c.Name, c.Status, c.Version)
}
fmt.Printf("OverallOK: %v\n", report.OverallOK)
```
## Notas
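- Los probes de `nvcc`, del venv y de paquetes Python se ejecutan con un timeout de 5 segundos para no bloquear en entornos sin GPU; los probes de binarios CLI usan el timeout que recibe `probeCommand`, y la consulta de GPUs via `GetGpuInfo()` no aplica timeout.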
- `stable_diffusion_cpp_python`, `sd_cli` y `llama_cpp` son opcionales: si estan missing, `OverallOK` no se ve afectado.
- `OverallOK` requiere al menos 1 GPU NVIDIA detectada via `GetGpuInfo()`.
- No escribe nada en disco. Read-only.
- Se expone como `fn doctor ml` via cmd/fn/doctor.go.
+53
View File
@@ -0,0 +1,53 @@
package infra
import (
"testing"
)
// TestAuditMlEnv checks structural invariants of the report returned by
// AuditMlEnv. None of the assertions require a GPU or any ML tooling to be
// installed on the host.
func TestAuditMlEnv(t *testing.T) {
	// Tests run from the package directory, so the registry root is two
	// levels up.
	registryRoot := "../.."

	// Every subtest inspects the same report, so the audit is executed once
	// and shared instead of being re-run for each assertion.
	report, err := AuditMlEnv(registryRoot)
	if err != nil {
		t.Fatalf("AuditMlEnv returned error: %v", err)
	}

	t.Run("report no nil y tiene checks", func(t *testing.T) {
		if report.Checks == nil {
			t.Fatal("report.Checks is nil")
		}
	})

	t.Run("generated_at es positivo", func(t *testing.T) {
		if report.GeneratedAt <= 0 {
			t.Errorf("GeneratedAt should be positive unix timestamp, got %d", report.GeneratedAt)
		}
	})

	t.Run("checks tiene al menos 4 entradas", func(t *testing.T) {
		if len(report.Checks) < 4 {
			t.Errorf("expected at least 4 checks, got %d", len(report.Checks))
		}
	})

	t.Run("gpus puede ser vacio en CI", func(t *testing.T) {
		// Gpus may be empty in CI without a GPU; only nil is rejected.
		if report.Gpus == nil {
			t.Error("report.Gpus should be a non-nil slice (can be empty)")
		}
	})
}
+60
View File
@@ -0,0 +1,60 @@
package infra
import (
"encoding/csv"
"errors"
"fmt"
"os/exec"
"strconv"
"strings"
)
// GetGpuInfo queries NVIDIA GPUs via nvidia-smi and returns one GpuInfo per
// reported device.
//
// If nvidia-smi is not installed, or it runs but exits with a non-zero status,
// the result is an empty (non-nil) slice and a nil error: absence of NVIDIA
// hardware is not treated as an error. Any other failure to run the command,
// or malformed CSV output, is returned as an error.
//
// Rows with fewer than six columns are skipped. Numeric columns that cannot
// be parsed are stored as the value strconv.Atoi returns alongside the error
// (0 for non-numeric text).
func GetGpuInfo() ([]GpuInfo, error) {
	// NOTE(review): confirm that "cuda_version" is accepted by --query-gpu on
	// the targeted driver versions. A rejected field makes nvidia-smi exit
	// non-zero, which this function reports as "no GPUs".
	out, err := exec.Command(
		"nvidia-smi",
		"--query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		// nvidia-smi not installed or no NVIDIA device — not an error.
		var exitErr *exec.ExitError
		if errors.Is(err, exec.ErrNotFound) || errors.As(err, &exitErr) {
			return []GpuInfo{}, nil
		}
		return nil, fmt.Errorf("gpu_info: nvidia-smi: %w", err)
	}

	r := csv.NewReader(strings.NewReader(strings.TrimSpace(string(out))))
	r.TrimLeadingSpace = true
	// Allow rows with differing column counts. With the default (0) the reader
	// rejects the whole output as soon as one row differs from the first, so
	// the short-row skip below could never take effect.
	r.FieldsPerRecord = -1
	records, err := r.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("gpu_info: parse csv: %w", err)
	}

	gpus := make([]GpuInfo, 0, len(records))
	for _, rec := range records {
		if len(rec) < 6 {
			continue
		}
		// Conversion errors are deliberately ignored (best effort).
		idx, _ := strconv.Atoi(strings.TrimSpace(rec[0]))
		totalMb, _ := strconv.Atoi(strings.TrimSpace(rec[2]))
		freeMb, _ := strconv.Atoi(strings.TrimSpace(rec[3]))
		gpus = append(gpus, GpuInfo{
			Index:         idx,
			Name:          strings.TrimSpace(rec[1]),
			VramTotalMb:   totalMb,
			VramFreeMb:    freeMb,
			DriverVersion: strings.TrimSpace(rec[4]),
			CudaVersion:   strings.TrimSpace(rec[5]),
		})
	}
	return gpus, nil
}
+70
View File
@@ -0,0 +1,70 @@
---
name: get_gpu_info
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func GetGpuInfo() ([]GpuInfo, error)"
description: "Consulta GPUs NVIDIA via nvidia-smi y retorna un slice de GpuInfo con index, nombre, VRAM total/libre, driver y version CUDA. Si nvidia-smi no esta instalado o no hay GPU NVIDIA, retorna slice vacio y nil (ausencia de hardware no es error)."
tags: [gpu, nvidia, cuda, hardware, infra, probe]
uses_functions: []
uses_types: ["gpu_info_go_infra"]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [encoding/csv, errors, fmt, os/exec, strconv, strings]
params:
- name: (ninguno)
desc: "No toma parametros. Lee el estado del sistema via nvidia-smi."
output: "Slice de GpuInfo con una entrada por GPU detectada. Slice vacio si no hay GPUs NVIDIA o nvidia-smi no esta instalado. Error solo si nvidia-smi existe pero falla inesperadamente al parsear la salida CSV."
tested: true
tests:
- "retorna slice vacio y nil cuando no hay GPU NVIDIA"
- "linea GPU RTX 3080 tipica"
- "dos GPUs en el CSV"
- "CSV vacio retorna slice vacio"
- "linea con menos de 6 campos se ignora"
- "espacios extra en los valores se eliminan"
- "campos del struct GpuInfo correctos"
test_file_path: "functions/infra/get_gpu_info_test.go"
file_path: "functions/infra/get_gpu_info.go"
---
## Ejemplo
```go
gpus, err := GetGpuInfo()
if err != nil {
log.Fatal(err)
}
if len(gpus) == 0 {
fmt.Println("No NVIDIA GPUs detected")
} else {
for _, g := range gpus {
fmt.Printf("[%d] %s VRAM: %d/%d MiB Driver: %s CUDA: %s\n",
g.Index, g.Name, g.VramFreeMb, g.VramTotalMb,
g.DriverVersion, g.CudaVersion)
}
}
```
## Salida nvidia-smi
Ejecuta:
```
nvidia-smi --query-gpu=index,name,memory.total,memory.free,driver_version,cuda_version --format=csv,noheader,nounits
```
Ejemplo de salida con una GPU:
```
0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4
```
## Notas
- Requiere `nvidia-smi` en PATH (parte del driver NVIDIA).
- La columna `cuda_version` en nvidia-smi refleja la version maxima de CUDA soportada por el driver, no la del toolkit instalado.
- Para comprobar el toolkit CUDA instalado, usar `cuda_toolkit_check_bash_infra`.
- En maquinas sin GPU NVIDIA retorna `([]GpuInfo{}, nil)` — el caller puede tratar esto como "sin GPU disponible".
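- Los tests automatizados no requieren GPU: `TestGetGpuInfoNoGpu` acepta tanto un slice vacio como uno poblado, y el parsing se prueba con entradas CSV fijas sobre una replica de la logica. La ruta con GPU real debe verificarse manualmente o con mock.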
+165
View File
@@ -0,0 +1,165 @@
package infra
import (
"strconv"
"strings"
"testing"
)
// TestGetGpuInfoNoGpu checks that GetGpuInfo returns no error and a non-nil
// slice. It makes no assumption about the number of GPUs, so it is meant to
// pass on hosts with or without nvidia-smi.
func TestGetGpuInfoNoGpu(t *testing.T) {
	t.Run("retorna slice vacio y nil cuando no hay GPU NVIDIA", func(t *testing.T) {
		detected, callErr := GetGpuInfo()
		if callErr != nil {
			t.Errorf("GetGpuInfo() error inesperado: %v", callErr)
		}
		// Without nvidia-smi the result must be an empty slice, never nil.
		if detected == nil {
			t.Error("GetGpuInfo() retorno nil, se esperaba slice vacio []GpuInfo{}")
		}
	})
}
// parseCsvNvidiaSmi is a test-only stand-in for the parsing step of
// GetGpuInfo. It takes text in the shape produced by
// `nvidia-smi --format=csv,noheader,nounits` and returns one GpuInfo per line
// that has at least six comma-separated columns. Empty input yields an empty,
// non-nil slice. The returned error is always nil.
func parseCsvNvidiaSmi(output string) ([]GpuInfo, error) {
	body := strings.TrimSpace(output)
	if body == "" {
		return []GpuInfo{}, nil
	}

	// Numeric columns are parsed best-effort; the conversion error is dropped.
	toInt := func(raw string) int {
		n, _ := strconv.Atoi(strings.TrimSpace(raw))
		return n
	}

	rows := strings.Split(body, "\n")
	parsed := make([]GpuInfo, 0, len(rows))
	for _, row := range rows {
		cols := strings.Split(row, ",")
		if len(cols) < 6 {
			continue
		}
		var gpu GpuInfo
		gpu.Index = toInt(cols[0])
		gpu.Name = strings.TrimSpace(cols[1])
		gpu.VramTotalMb = toInt(cols[2])
		gpu.VramFreeMb = toInt(cols[3])
		gpu.DriverVersion = strings.TrimSpace(cols[4])
		gpu.CudaVersion = strings.TrimSpace(cols[5])
		parsed = append(parsed, gpu)
	}
	return parsed, nil
}
// TestParseCsvNvidiaSmi drives parseCsvNvidiaSmi with fixed CSV snippets, so
// it needs neither a GPU nor nvidia-smi. Field values are only compared for
// cases that expect exactly one parsed GPU; other cases check the count only.
func TestParseCsvNvidiaSmi(t *testing.T) {
	type parseCase struct {
		name          string
		csvInput      string
		wantLen       int
		wantIndex     int
		wantName      string
		wantVramTotal int
		wantVramFree  int
		wantDriver    string
		wantCuda      string
	}

	cases := []parseCase{
		{
			name:          "linea GPU RTX 3080 tipica",
			csvInput:      "0, NVIDIA GeForce RTX 3080, 10240, 8192, 550.54.15, 12.4",
			wantLen:       1,
			wantIndex:     0,
			wantName:      "NVIDIA GeForce RTX 3080",
			wantVramTotal: 10240,
			wantVramFree:  8192,
			wantDriver:    "550.54.15",
			wantCuda:      "12.4",
		},
		{
			name:     "dos GPUs en el CSV",
			csvInput: "0, GPU A, 8192, 4096, 525.0, 12.0\n1, GPU B, 24576, 20000, 525.0, 12.0",
			wantLen:  2,
		},
		{
			name:     "CSV vacio retorna slice vacio",
			csvInput: "",
			wantLen:  0,
		},
		{
			name:     "linea con menos de 6 campos se ignora",
			csvInput: "0, GPU, 8192",
			wantLen:  0,
		},
		{
			name:          "espacios extra en los valores se eliminan",
			csvInput:      " 1 , NVIDIA RTX 4090 , 24576 , 20000 , 545.0 , 12.6 ",
			wantLen:       1,
			wantIndex:     1,
			wantName:      "NVIDIA RTX 4090",
			wantVramTotal: 24576,
			wantVramFree:  20000,
			wantDriver:    "545.0",
			wantCuda:      "12.6",
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			parsed, parseErr := parseCsvNvidiaSmi(tc.csvInput)
			if parseErr != nil {
				t.Fatalf("error inesperado: %v", parseErr)
			}
			if got := len(parsed); got != tc.wantLen {
				t.Fatalf("len(gpus) = %d, quería %d", got, tc.wantLen)
			}
			// Field-level expectations are only defined for single-GPU cases.
			if tc.wantLen != 1 {
				return
			}

			gpu := parsed[0]
			if gpu.Index != tc.wantIndex {
				t.Errorf("Index = %d, quería %d", gpu.Index, tc.wantIndex)
			}
			if gpu.Name != tc.wantName {
				t.Errorf("Name = %q, quería %q", gpu.Name, tc.wantName)
			}
			if gpu.VramTotalMb != tc.wantVramTotal {
				t.Errorf("VramTotalMb = %d, quería %d", gpu.VramTotalMb, tc.wantVramTotal)
			}
			if gpu.VramFreeMb != tc.wantVramFree {
				t.Errorf("VramFreeMb = %d, quería %d", gpu.VramFreeMb, tc.wantVramFree)
			}
			if gpu.DriverVersion != tc.wantDriver {
				t.Errorf("DriverVersion = %q, quería %q", gpu.DriverVersion, tc.wantDriver)
			}
			if gpu.CudaVersion != tc.wantCuda {
				t.Errorf("CudaVersion = %q, quería %q", gpu.CudaVersion, tc.wantCuda)
			}
		})
	}
}
// TestGpuInfoStruct builds a GpuInfo literal and checks that every field
// holds the value it was given, including DriverVersion and CudaVersion.
func TestGpuInfoStruct(t *testing.T) {
	t.Run("campos del struct GpuInfo correctos", func(t *testing.T) {
		g := GpuInfo{
			Index:         0,
			Name:          "NVIDIA GeForce GTX 1080",
			VramTotalMb:   8192,
			VramFreeMb:    6144,
			DriverVersion: "470.0",
			CudaVersion:   "11.4",
		}
		if g.Index != 0 {
			t.Errorf("Index = %d", g.Index)
		}
		if g.Name != "NVIDIA GeForce GTX 1080" {
			t.Errorf("Name = %q", g.Name)
		}
		if g.VramTotalMb != 8192 {
			t.Errorf("VramTotalMb = %d", g.VramTotalMb)
		}
		if g.VramFreeMb != 6144 {
			t.Errorf("VramFreeMb = %d", g.VramFreeMb)
		}
		if g.DriverVersion != "470.0" {
			t.Errorf("DriverVersion = %q", g.DriverVersion)
		}
		if g.CudaVersion != "11.4" {
			t.Errorf("CudaVersion = %q", g.CudaVersion)
		}
	})
}
+12
View File
@@ -0,0 +1,12 @@
package infra
// GpuInfo describes one GPU detected on the system: its VRAM figures and the
// driver and CUDA versions reported for it. GetGpuInfo fills it from
// nvidia-smi's CSV output.
type GpuInfo struct {
	// Index is the device index reported by nvidia-smi.
	Index int `json:"index"`
	// Name is the device name with surrounding whitespace removed.
	Name string `json:"name"`
	// VramTotalMb is the total device memory (nvidia-smi `memory.total`,
	// queried without units; the package docs print it as MiB).
	VramTotalMb int `json:"vram_total_mb"`
	// VramFreeMb is the free device memory (nvidia-smi `memory.free`), in the
	// same unit as VramTotalMb.
	VramFreeMb int `json:"vram_free_mb"`
	// DriverVersion is the NVIDIA driver version string.
	DriverVersion string `json:"driver_version"`
	// CudaVersion is the CUDA version reported by nvidia-smi; it is omitted
	// from JSON when empty.
	CudaVersion string `json:"cuda_version,omitempty"`
}
+171
View File
@@ -0,0 +1,171 @@
package infra
import (
"fmt"
"os"
"path/filepath"
"time"
)
// AggregateReport summarises the result of a VaultAggregateIndex run.
type AggregateReport struct {
	// VaultsProcessed counts vaults whose rows were committed to registry.db.
	VaultsProcessed int
	// VaultsSkipped counts vaults without a vault_index.db.
	VaultsSkipped int
	// TotalFiles is the number of file rows read from the processed vaults.
	TotalFiles int
	// Errors holds non-fatal per-vault error messages.
	Errors []string
}
// VaultAggregateIndex reads all vault manifests from repoRoot, opens each
// vault_index.db and copies all file records into the vault_files table of
// the central registry.db. The table and its indexes are created if they do
// not exist (idempotent).
//
// For each vault the previous rows are deleted and the new ones inserted in a
// single transaction, so re-running produces a clean, non-duplicated state.
// If any step of that transaction fails, it is rolled back and the vault's
// previous rows are left untouched.
//
// Non-fatal, per-vault problems are appended to report.Errors:
//   - the index cannot be opened, queried or fully read (vault is not replaced);
//   - individual rows cannot be scanned (those rows are skipped and reported
//     as one summary entry; the remaining rows are still aggregated);
//   - begin / delete / prepare / insert / commit failures.
//
// Vaults without a vault_index.db are counted in VaultsSkipped. Only failures
// to open registry.db, to apply the schema, or to read the manifest are
// returned as the error value.
func VaultAggregateIndex(repoRoot string) (AggregateReport, error) {
	var report AggregateReport

	// 1. Open registry.db
	registryDB, err := SQLiteOpen(filepath.Join(repoRoot, "registry.db"), "")
	if err != nil {
		return report, fmt.Errorf("vault_aggregate_index: open registry.db: %w", err)
	}
	defer registryDB.Close()

	// 2. Idempotent schema migration
	for _, stmt := range []string{
		`CREATE TABLE IF NOT EXISTS vault_files (
vault_id TEXT NOT NULL,
vault_name TEXT NOT NULL,
rel_path TEXT NOT NULL,
size INTEGER NOT NULL,
mtime INTEGER NOT NULL,
sha256 TEXT NOT NULL,
mime TEXT NOT NULL DEFAULT '',
ext TEXT NOT NULL DEFAULT '',
bucket TEXT NOT NULL DEFAULT '',
sub_bucket TEXT NOT NULL DEFAULT '',
indexed_at INTEGER NOT NULL,
PRIMARY KEY (vault_id, rel_path)
);`,
		`CREATE INDEX IF NOT EXISTS idx_vault_files_sha256 ON vault_files(sha256);`,
		`CREATE INDEX IF NOT EXISTS idx_vault_files_vault ON vault_files(vault_id);`,
	} {
		if _, err := registryDB.Exec(stmt); err != nil {
			if !isIdempotentMigrationError(err) {
				return report, fmt.Errorf("vault_aggregate_index: schema: %w", err)
			}
		}
	}

	// 3. Read manifest
	entries, err := VaultManifestRead(repoRoot)
	if err != nil {
		return report, fmt.Errorf("vault_aggregate_index: manifest: %w", err)
	}

	// One row of a vault's `files` table, buffered in memory so the vault
	// index can be closed before registry.db is written.
	type fileRow struct {
		RelPath   string
		Size      int64
		Mtime     int64
		Sha256    string
		Mime      string
		Ext       string
		Bucket    string
		SubBucket string
	}

	// The same indexed_at timestamp is used for every row of this run.
	now := time.Now().UTC().Unix()

	for _, entry := range entries {
		vaultID := vaultIDFromEntry(entry)
		vaultName := entry.Name
		vaultPath := entry.Path

		indexPath := filepath.Join(vaultPath, "vault_index.db")
		if _, statErr := os.Stat(indexPath); statErr != nil {
			report.VaultsSkipped++
			continue
		}

		vaultDB, openErr := VaultIndexOpen(vaultPath)
		if openErr != nil {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: open index: %v", vaultName, openErr))
			continue
		}

		rows, queryErr := vaultDB.Query(
			`SELECT rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket FROM files`,
		)
		if queryErr != nil {
			vaultDB.Close()
			report.Errors = append(report.Errors, fmt.Sprintf("%s: query files: %v", vaultName, queryErr))
			continue
		}

		var fileRows []fileRow
		scanFailures := 0
		var firstScanErr error
		for rows.Next() {
			var r fileRow
			if scanErr := rows.Scan(&r.RelPath, &r.Size, &r.Mtime, &r.Sha256, &r.Mime, &r.Ext, &r.Bucket, &r.SubBucket); scanErr != nil {
				// Best effort: the row is skipped, but no longer silently.
				if firstScanErr == nil {
					firstScanErr = scanErr
				}
				scanFailures++
				continue
			}
			fileRows = append(fileRows, r)
		}
		// An iteration error means the result set was cut short; replacing the
		// registry rows with it would silently drop files.
		iterErr := rows.Err()
		rows.Close()
		vaultDB.Close()

		if scanFailures > 0 {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: scan: %d row(s) skipped, first error: %v", vaultName, scanFailures, firstScanErr))
		}
		if iterErr != nil {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: read files: %v", vaultName, iterErr))
			continue
		}

		// Atomic replace in registry.db
		tx, txErr := registryDB.Begin()
		if txErr != nil {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: begin tx: %v", vaultName, txErr))
			continue
		}

		if _, delErr := tx.Exec(`DELETE FROM vault_files WHERE vault_id = ?`, vaultID); delErr != nil {
			tx.Rollback()
			report.Errors = append(report.Errors, fmt.Sprintf("%s: delete: %v", vaultName, delErr))
			continue
		}

		stmt, prepErr := tx.Prepare(`
INSERT INTO vault_files
(vault_id, vault_name, rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`)
		if prepErr != nil {
			tx.Rollback()
			report.Errors = append(report.Errors, fmt.Sprintf("%s: prepare: %v", vaultName, prepErr))
			continue
		}

		// Stop at the first failed insert and abandon this vault: the
		// transaction is rolled back exactly once and no commit is attempted.
		insertFailed := false
		for _, r := range fileRows {
			if _, insErr := stmt.Exec(vaultID, vaultName, r.RelPath, r.Size, r.Mtime, r.Sha256, r.Mime, r.Ext, r.Bucket, r.SubBucket, now); insErr != nil {
				report.Errors = append(report.Errors, fmt.Sprintf("%s: insert %s: %v", vaultName, r.RelPath, insErr))
				insertFailed = true
				break
			}
		}
		stmt.Close()
		if insertFailed {
			tx.Rollback()
			continue
		}

		if commitErr := tx.Commit(); commitErr != nil {
			report.Errors = append(report.Errors, fmt.Sprintf("%s: commit: %v", vaultName, commitErr))
			continue
		}

		report.VaultsProcessed++
		report.TotalFiles += len(fileRows)
	}

	return report, nil
}
// vaultIDFromEntry builds the vault ID stored in registry.db for a manifest
// entry: "<vault_name>_<project_id>", or just the vault name when the entry
// has no project ID.
func vaultIDFromEntry(e VaultManifestEntry) string {
	id := e.Name
	if e.ProjectID != "" {
		id += "_" + e.ProjectID
	}
	return id
}
+58
View File
@@ -0,0 +1,58 @@
---
name: vault_aggregate_index
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultAggregateIndex(repoRoot string) (AggregateReport, error)"
description: "Agrega los índices de todos los vaults del registry en la tabla vault_files de registry.db. Lee cada vault_index.db (via VaultIndexOpen) y reemplaza las filas de forma atómica. Idempotente: re-ejecutar limpia y reescribe sin duplicar."
tags: [vault, index, aggregate, registry]
uses_functions:
- "vault_manifest_read_go_infra"
- "vault_index_open_go_infra"
- "sqlite_open_go_infra"
uses_types:
- "vault_file_go_infra"
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- "database/sql"
- "fmt"
- "os"
- "path/filepath"
- "time"
tested: true
tests:
- "TestVaultAggregateIndex_NoVaults"
- "TestVaultAggregateIndex_VaultWithoutIndex"
- "TestVaultAggregateIndex_HappyPath"
- "TestVaultAggregateIndex_ReRunReplaces"
test_file_path: "functions/infra/vault_aggregate_index_test.go"
file_path: "functions/infra/vault_aggregate_index.go"
params:
- name: repoRoot
desc: "Ruta absoluta a la raiz del fn_registry (contiene registry.db y projects/)."
output: "AggregateReport con VaultsProcessed, VaultsSkipped (sin vault_index.db), TotalFiles y Errors (errores no fatales por vault). Error fatal si registry.db no se puede abrir, si falla la migración del esquema o si no se puede leer el manifest."
---
## Ejemplo
```go
report, err := infra.VaultAggregateIndex("/home/lucas/fn_registry")
if err != nil {
log.Fatal(err)
}
fmt.Printf("Processed: %d vaults, %d files\n", report.VaultsProcessed, report.TotalFiles)
for _, e := range report.Errors {
fmt.Println("warning:", e)
}
```
## Notas
- Requiere que `registry/migrations/012_vault_files.sql` haya sido aplicado (o que el indexer lo aplique al arrancar). La función aplica la migración de forma idempotente ella misma con `CREATE TABLE IF NOT EXISTS`.
- Por cada vault: `DELETE WHERE vault_id = ?` + batch `INSERT` dentro de una transacción. Re-run siempre produce el mismo resultado.
- Vaults sin `vault_index.db` se cuentan en `VaultsSkipped` y se omiten sin error.
- El `vault_id` sigue el patrón `<vault_name>_<project_id>`, consistente con la tabla `vaults` de registry.db.
@@ -0,0 +1,175 @@
package infra
import (
"os"
"path/filepath"
"testing"
"time"
)
// setupAggregateTestRepo creates a minimal repo layout inside a fresh
// temporary directory and returns that directory's path:
//
//	<root>/
//	  registry.db                            (created via SQLiteOpen, then closed)
//	  projects/<projectID>/vaults/vault.yaml (one vault: vaultName -> vaultPath)
//	<vaultPath>/                             (created if missing)
//
// When withIndex is true, a vault_index.db is opened in vaultPath through
// VaultIndexOpen and one sample row ("data/raw/sample.csv") is inserted into
// its files table. Any setup failure aborts the calling test via t.Fatalf.
func setupAggregateTestRepo(t *testing.T, vaultName, projectID, vaultPath string, withIndex bool) string {
	t.Helper()
	root := t.TempDir()
	// Create registry.db
	regDB, err := SQLiteOpen(filepath.Join(root, "registry.db"), "")
	if err != nil {
		t.Fatalf("create registry.db: %v", err)
	}
	regDB.Close()
	// Create project vault manifest
	projVaultsDir := filepath.Join(root, "projects", projectID, "vaults")
	if err := os.MkdirAll(projVaultsDir, 0755); err != nil {
		t.Fatalf("mkdir projects: %v", err)
	}
	manifestYAML := "vaults:\n - name: " + vaultName + "\n description: test\n path: " + vaultPath + "\n tags: []\n"
	if err := os.WriteFile(filepath.Join(projVaultsDir, "vault.yaml"), []byte(manifestYAML), 0644); err != nil {
		t.Fatalf("write vault.yaml: %v", err)
	}
	// Create vault dir
	if err := os.MkdirAll(vaultPath, 0755); err != nil {
		t.Fatalf("mkdir vault: %v", err)
	}
	if withIndex {
		// Create a vault_index.db with one file row
		vdb, err := VaultIndexOpen(vaultPath)
		if err != nil {
			t.Fatalf("VaultIndexOpen: %v", err)
		}
		// The same timestamp is used for both mtime and indexed_at.
		now := time.Now().UTC().Unix()
		_, err = vdb.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
			"data/raw/sample.csv", 1024, now, "deadbeef", "text/csv", ".csv", "data", "raw", now)
		if err != nil {
			t.Fatalf("insert test file: %v", err)
		}
		vdb.Close()
	}
	return root
}
// TestVaultAggregateIndex_NoVaults runs the aggregation against a repo that
// contains only registry.db (no manifests) and expects an empty, error-free
// report.
func TestVaultAggregateIndex_NoVaults(t *testing.T) {
	repo := t.TempDir()

	// Only registry.db exists; there are no vault manifests to read.
	db, openErr := SQLiteOpen(filepath.Join(repo, "registry.db"), "")
	if openErr != nil {
		t.Fatalf("create registry.db: %v", openErr)
	}
	db.Close()

	got, runErr := VaultAggregateIndex(repo)
	if runErr != nil {
		t.Fatalf("unexpected error: %v", runErr)
	}
	if n := got.VaultsProcessed; n != 0 {
		t.Errorf("VaultsProcessed: want 0, got %d", n)
	}
	if errs := got.Errors; len(errs) != 0 {
		t.Errorf("Errors: want empty, got %v", errs)
	}
}
// TestVaultAggregateIndex_VaultWithoutIndex declares one vault in the manifest
// but creates no vault_index.db for it; the vault must be counted as skipped
// and not as processed.
func TestVaultAggregateIndex_VaultWithoutIndex(t *testing.T) {
	const withIndex = false // no vault_index.db
	repo := setupAggregateTestRepo(t, "my_vault", "my_proj", t.TempDir(), withIndex)

	got, runErr := VaultAggregateIndex(repo)
	if runErr != nil {
		t.Fatalf("unexpected error: %v", runErr)
	}
	if n := got.VaultsSkipped; n != 1 {
		t.Errorf("VaultsSkipped: want 1, got %d", n)
	}
	if n := got.VaultsProcessed; n != 0 {
		t.Errorf("VaultsProcessed: want 0, got %d", n)
	}
}
// TestVaultAggregateIndex_HappyPath aggregates one vault holding one indexed
// file and checks both the report counters and the row count in
// registry.db's vault_files table.
func TestVaultAggregateIndex_HappyPath(t *testing.T) {
	repo := setupAggregateTestRepo(t, "my_vault", "my_proj", t.TempDir(), true)

	got, runErr := VaultAggregateIndex(repo)
	if runErr != nil {
		t.Fatalf("unexpected error: %v", runErr)
	}
	if n := got.VaultsProcessed; n != 1 {
		t.Errorf("VaultsProcessed: want 1, got %d", n)
	}
	if n := got.TotalFiles; n != 1 {
		t.Errorf("TotalFiles: want 1, got %d", n)
	}

	// The aggregated row must be visible in registry.db.
	db, openErr := SQLiteOpen(filepath.Join(repo, "registry.db"), "")
	if openErr != nil {
		t.Fatalf("open registry.db: %v", openErr)
	}
	defer db.Close()

	var stored int
	if scanErr := db.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&stored); scanErr != nil {
		t.Fatalf("count vault_files: %v", scanErr)
	}
	if stored != 1 {
		t.Errorf("vault_files count: want 1, got %d", stored)
	}
}
// TestVaultAggregateIndex_ReRunReplaces aggregates a vault, adds a second file
// to its index, aggregates again, and expects exactly two rows in vault_files
// (the first run's row is replaced, not duplicated).
func TestVaultAggregateIndex_ReRunReplaces(t *testing.T) {
	vaultDir := t.TempDir()
	repo := setupAggregateTestRepo(t, "my_vault", "my_proj", vaultDir, true)

	// First run: one file.
	if _, firstErr := VaultAggregateIndex(repo); firstErr != nil {
		t.Fatalf("first run: %v", firstErr)
	}

	// Grow the vault index by one more file between the two runs.
	index, reopenErr := VaultIndexOpen(vaultDir)
	if reopenErr != nil {
		t.Fatalf("reopen vault index: %v", reopenErr)
	}
	stamp := time.Now().UTC().Unix()
	_, insertErr := index.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
		"data/raw/extra.csv", 512, stamp, "cafebabe", "text/csv", ".csv", "data", "raw", stamp)
	if insertErr != nil {
		t.Fatalf("insert second file: %v", insertErr)
	}
	index.Close()

	// Second run: both files.
	got, secondErr := VaultAggregateIndex(repo)
	if secondErr != nil {
		t.Fatalf("second run: %v", secondErr)
	}
	if n := got.TotalFiles; n != 2 {
		t.Errorf("TotalFiles: want 2, got %d", n)
	}

	// No duplicates: the registry must hold exactly two rows.
	db, openErr := SQLiteOpen(filepath.Join(repo, "registry.db"), "")
	if openErr != nil {
		t.Fatalf("open registry.db: %v", openErr)
	}
	defer db.Close()

	var stored int
	if scanErr := db.QueryRow(`SELECT COUNT(*) FROM vault_files`).Scan(&stored); scanErr != nil {
		t.Fatalf("count vault_files: %v", scanErr)
	}
	if stored != 2 {
		t.Errorf("vault_files count after re-run: want 2, got %d", stored)
	}
}
+68
View File
@@ -0,0 +1,68 @@
package infra
import "sort"
// VaultFileChange holds the before/after state of a file whose content
// changed between two snapshots (same RelPath, different Sha256).
type VaultFileChange struct {
	// RelPath is the path shared by both versions of the file.
	RelPath string
	// Prev is the entry from the previous snapshot.
	Prev VaultFile
	// Curr is the entry from the current snapshot.
	Curr VaultFile
}
// VaultDiffReport is the result of comparing two VaultFile slices with
// VaultDiff. Files are matched by RelPath.
type VaultDiffReport struct {
	// Added lists files in curr but not in prev (by rel_path).
	Added []VaultFile
	// Removed lists files in prev but not in curr.
	Removed []VaultFile
	// Changed lists files with the same rel_path but a different sha256.
	Changed []VaultFileChange
	// Unchanged counts files present in both with identical sha256.
	Unchanged int
}
// VaultDiff compares two vault snapshots and classifies every file by
// RelPath:
//   - Added: in curr only;
//   - Removed: in prev only;
//   - Changed: in both, with different Sha256;
//   - Unchanged: in both, with equal Sha256 (counted, not listed).
//
// The Added, Removed and Changed slices are sorted by RelPath ascending.
// Either input may be nil. The function performs no I/O.
func VaultDiff(prev, curr []VaultFile) VaultDiffReport {
	// byPath indexes a snapshot by RelPath; a later entry with the same path
	// overwrites an earlier one.
	byPath := func(files []VaultFile) map[string]VaultFile {
		lookup := make(map[string]VaultFile, len(files))
		for _, vf := range files {
			lookup[vf.RelPath] = vf
		}
		return lookup
	}
	before := byPath(prev)
	after := byPath(curr)

	var out VaultDiffReport

	for _, vf := range curr {
		old, known := before[vf.RelPath]
		switch {
		case !known:
			out.Added = append(out.Added, vf)
		case old.Sha256 != vf.Sha256:
			out.Changed = append(out.Changed, VaultFileChange{RelPath: vf.RelPath, Prev: old, Curr: vf})
		default:
			out.Unchanged++
		}
	}

	for _, vf := range prev {
		if _, kept := after[vf.RelPath]; kept {
			continue
		}
		out.Removed = append(out.Removed, vf)
	}

	sort.Slice(out.Added, func(a, b int) bool { return out.Added[a].RelPath < out.Added[b].RelPath })
	sort.Slice(out.Removed, func(a, b int) bool { return out.Removed[a].RelPath < out.Removed[b].RelPath })
	sort.Slice(out.Changed, func(a, b int) bool { return out.Changed[a].RelPath < out.Changed[b].RelPath })

	return out
}
+49
View File
@@ -0,0 +1,49 @@
---
name: vault_diff
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: pure
signature: "func VaultDiff(prev, curr []VaultFile) VaultDiffReport"
description: "Computes the diff between two vault snapshots (slices of VaultFile). Returns the Added, Removed and Changed entries plus an Unchanged count. Pure and deterministic — no I/O."
tags: [vault, diff, comparison, pure]
uses_functions: []
uses_types: ["vault_file_go_infra"]
returns: []
returns_optional: false
error_type: ""
imports: ["sort"]
tested: true
tests:
- "TestVaultDiff_NoChanges"
- "TestVaultDiff_AllAdded"
- "TestVaultDiff_AllRemoved"
- "TestVaultDiff_ContentChanged"
- "TestVaultDiff_Mixed"
test_file_path: "functions/infra/vault_diff_test.go"
file_path: "functions/infra/vault_diff.go"
params:
- name: prev
desc: "Snapshot anterior — slice de VaultFile del estado previo del vault (puede ser nil para diff desde cero)."
- name: curr
desc: "Snapshot actual — slice de VaultFile del estado corriente del vault (puede ser nil para diff de borrado total)."
output: "VaultDiffReport con Added (nuevos), Removed (eliminados), Changed (mismo rel_path, sha256 distinto) y Unchanged (identicos). Todos los slices ordenados por RelPath ASC."
---
## Ejemplo
```go
prev, _ := infra.VaultInventoryScan(oldPath, "my_vault_proj", "my_vault")
curr, _ := infra.VaultInventoryScan(newPath, "my_vault_proj", "my_vault")
report := infra.VaultDiff(prev, curr)
fmt.Printf("Added: %d, Removed: %d, Changed: %d, Unchanged: %d\n",
len(report.Added), len(report.Removed), len(report.Changed), report.Unchanged)
```
## Notas
- Usa `RelPath` como clave de identidad de archivo (no nombre, no sha256).
- Dos archivos con mismo `RelPath` pero diferente `Sha256` se consideran Changed.
- Los slices del report se ordenan por `RelPath` ASC para salida deterministica.
- Función pura: no toca disco ni BD.
+126
View File
@@ -0,0 +1,126 @@
package infra
import (
"testing"
)
// makeVF builds a VaultFile fixture with the given path and hash; the vault
// ID and name are fixed test values and all other fields stay zero.
func makeVF(relPath, sha256 string) VaultFile {
	var vf VaultFile
	vf.VaultID = "test_vault"
	vf.VaultName = "test"
	vf.RelPath = relPath
	vf.Sha256 = sha256
	return vf
}
// TestVaultDiff_NoChanges diffs a snapshot against itself: nothing is added,
// removed or changed, and both files are counted as unchanged.
func TestVaultDiff_NoChanges(t *testing.T) {
	snapshot := []VaultFile{
		makeVF("data/a.csv", "aaa"),
		makeVF("data/b.csv", "bbb"),
	}

	got := VaultDiff(snapshot, snapshot)

	if n := len(got.Added); n != 0 {
		t.Errorf("Added: want 0, got %d", n)
	}
	if n := len(got.Removed); n != 0 {
		t.Errorf("Removed: want 0, got %d", n)
	}
	if n := len(got.Changed); n != 0 {
		t.Errorf("Changed: want 0, got %d", n)
	}
	if n := got.Unchanged; n != 2 {
		t.Errorf("Unchanged: want 2, got %d", n)
	}
}
// TestVaultDiff_AllAdded diffs a nil previous snapshot against two current
// files and expects both to be reported as Added, in RelPath order.
func TestVaultDiff_AllAdded(t *testing.T) {
	curr := []VaultFile{
		makeVF("data/a.csv", "aaa"),
		makeVF("data/b.csv", "bbb"),
	}
	report := VaultDiff(nil, curr)

	// Fatalf rather than Errorf: the element checks below index into Added
	// and would panic (hiding the real failure) if the slice were short.
	if len(report.Added) != 2 {
		t.Fatalf("Added: want 2, got %d", len(report.Added))
	}
	if len(report.Removed) != 0 {
		t.Errorf("Removed: want 0, got %d", len(report.Removed))
	}
	if report.Added[0].RelPath != "data/a.csv" {
		t.Errorf("Added[0]: want data/a.csv, got %s", report.Added[0].RelPath)
	}
	if report.Added[1].RelPath != "data/b.csv" {
		t.Errorf("Added[1]: want data/b.csv, got %s", report.Added[1].RelPath)
	}
}
// TestVaultDiff_AllRemoved diffs two previous files against a nil current
// snapshot and expects both to be reported as Removed, in RelPath order.
func TestVaultDiff_AllRemoved(t *testing.T) {
	prev := []VaultFile{
		makeVF("data/a.csv", "aaa"),
		makeVF("data/b.csv", "bbb"),
	}
	report := VaultDiff(prev, nil)

	// Fatalf rather than Errorf: the element checks below index into Removed
	// and would panic (hiding the real failure) if the slice were short.
	if len(report.Removed) != 2 {
		t.Fatalf("Removed: want 2, got %d", len(report.Removed))
	}
	if len(report.Added) != 0 {
		t.Errorf("Added: want 0, got %d", len(report.Added))
	}
	if report.Removed[0].RelPath != "data/a.csv" {
		t.Errorf("Removed[0]: want data/a.csv, got %s", report.Removed[0].RelPath)
	}
	// Mirrors the AllAdded test, which checks both elements.
	if report.Removed[1].RelPath != "data/b.csv" {
		t.Errorf("Removed[1]: want data/b.csv, got %s", report.Removed[1].RelPath)
	}
}
// TestVaultDiff_ContentChanged keeps the same RelPath but changes the hash and
// expects a single Changed entry carrying both the previous and current file.
func TestVaultDiff_ContentChanged(t *testing.T) {
	before := []VaultFile{makeVF("data/a.csv", "old_hash")}
	after := []VaultFile{makeVF("data/a.csv", "new_hash")}

	got := VaultDiff(before, after)

	if n := len(got.Changed); n != 1 {
		t.Fatalf("Changed: want 1, got %d", n)
	}
	change := got.Changed[0]
	if change.RelPath != "data/a.csv" {
		t.Errorf("Changed[0].RelPath: want data/a.csv, got %s", change.RelPath)
	}
	if change.Prev.Sha256 != "old_hash" {
		t.Errorf("Changed[0].Prev.Sha256: want old_hash, got %s", change.Prev.Sha256)
	}
	if change.Curr.Sha256 != "new_hash" {
		t.Errorf("Changed[0].Curr.Sha256: want new_hash, got %s", change.Curr.Sha256)
	}
	if added, removed := len(got.Added), len(got.Removed); added != 0 || removed != 0 {
		t.Errorf("Expected no added/removed, got %d/%d", added, removed)
	}
	if got.Unchanged != 0 {
		t.Errorf("Unchanged: want 0, got %d", got.Unchanged)
	}
}
// TestVaultDiff_Mixed covers one file of each category in a single diff:
// a.csv unchanged, b.csv changed, c.csv removed and d.csv added.
func TestVaultDiff_Mixed(t *testing.T) {
	before := []VaultFile{
		makeVF("data/a.csv", "aaa"),
		makeVF("data/b.csv", "bbb"),
		makeVF("data/c.csv", "ccc"),
	}
	after := []VaultFile{
		makeVF("data/a.csv", "aaa"),     // unchanged
		makeVF("data/b.csv", "bbb_new"), // changed
		makeVF("data/d.csv", "ddd"),     // added
	}

	got := VaultDiff(before, after)

	if !(len(got.Added) == 1 && got.Added[0].RelPath == "data/d.csv") {
		t.Errorf("Added: want [data/d.csv], got %v", got.Added)
	}
	if !(len(got.Removed) == 1 && got.Removed[0].RelPath == "data/c.csv") {
		t.Errorf("Removed: want [data/c.csv], got %v", got.Removed)
	}
	if !(len(got.Changed) == 1 && got.Changed[0].RelPath == "data/b.csv") {
		t.Errorf("Changed: want [data/b.csv], got %v", got.Changed)
	}
	if got.Unchanged != 1 {
		t.Errorf("Unchanged: want 1, got %d", got.Unchanged)
	}
}
+230
View File
@@ -0,0 +1,230 @@
package infra
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
)
// VaultDoctorEntry is the health report produced for one vault.
type VaultDoctorEntry struct {
	VaultName string `json:"vault_name"`
	VaultPath string `json:"vault_path"`
	ProjectID string `json:"project_id"`

	// Issues lists human-readable problems; an empty list means healthy.
	Issues []string `json:"issues"`

	// IndexedFiles is the row count of the index; 0 if no vault_index.db.
	IndexedFiles int `json:"indexed_files"`

	// LastIndexedAt is in unix seconds; 0 if not available.
	LastIndexedAt int64 `json:"last_indexed_at"`

	// DiskFiles is the file count obtained via WalkDir (no hashing).
	DiskFiles int `json:"disk_files"`

	// Status is one of "ok", "warning" or "error".
	Status string `json:"status"`
}
// VaultDoctor reads the vault manifests under repoRoot via VaultManifestRead
// and audits each declared vault with auditVault, returning one
// VaultDoctorEntry per manifest entry in the order the manifests were read.
//
// The only error returned is a wrapped VaultManifestRead failure; problems
// with an individual vault are reported inside its entry instead.
func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error) {
	manifests, readErr := VaultManifestRead(repoRoot)
	if readErr != nil {
		return nil, fmt.Errorf("vault_doctor: read manifests: %w", readErr)
	}

	report := make([]VaultDoctorEntry, 0, len(manifests))
	for _, manifest := range manifests {
		report = append(report, auditVault(manifest))
	}
	return report, nil
}
// auditVault runs the health checks for a single manifest entry and returns
// the resulting report. Checks run in order and some are terminal:
//
//  1. directory_missing: path is absent or not a directory (Status "error").
//  2. layout_missing / non_standard_layout: neither data/ nor knowledge/.
//  3. index_missing: no vault_index.db (returns immediately).
//  4. index_stale: a file on disk is newer than the newest indexed_at.
//  5. index_drift: disk file count differs from the indexed row count.
//  6. empty_vault: no countable files on disk.
//
// Checks 4 and 5 are skipped when the index cannot be queried, because their
// inputs are then unknown and comparing against zero would report false drift.
func auditVault(e VaultManifestEntry) VaultDoctorEntry {
	entry := VaultDoctorEntry{
		VaultName: e.Name,
		VaultPath: e.Path,
		ProjectID: e.ProjectID,
	}

	// Resolve symlinks for the disk checks; fall back to the declared path so
	// that a broken link is still reported by CHECK 1 below.
	realPath, err := filepath.EvalSymlinks(e.Path)
	if err != nil || realPath == "" {
		realPath = e.Path
	}

	// CHECK 1: directory_missing
	info, statErr := os.Stat(realPath)
	if statErr != nil || !info.IsDir() {
		entry.Issues = append(entry.Issues, "directory_missing")
		entry.Status = "error"
		return entry
	}

	// Count disk files (cheap walk: no hashing, no mime detection).
	diskCount := countDiskFiles(realPath)
	entry.DiskFiles = diskCount

	// CHECK 2: layout_missing / non_standard_layout
	hasData := dirExists(filepath.Join(realPath, "data"))
	hasKnowledge := dirExists(filepath.Join(realPath, "knowledge"))
	if !hasData && !hasKnowledge {
		// Distinguish an intentional custom layout from no layout at all.
		if hasNonStandardLayout(realPath) {
			entry.Issues = append(entry.Issues, "non_standard_layout")
		} else {
			entry.Issues = append(entry.Issues, "layout_missing")
		}
	}

	// CHECK 3: index_missing. Stat first so that opening the index below never
	// creates a database that did not exist.
	if _, indexStatErr := os.Stat(filepath.Join(realPath, "vault_index.db")); indexStatErr != nil {
		entry.Issues = append(entry.Issues, "index_missing")
		entry.setFinalStatus()
		return entry
	}

	// Open the existing index for checks 4 and 5.
	// NOTE(review): VaultIndexOpen applies the embedded migrations on open, so
	// this step is not strictly read-only for an index with an older schema.
	vdb, openErr := VaultIndexOpen(realPath)
	if openErr != nil {
		entry.Issues = append(entry.Issues, fmt.Sprintf("index_open_error: %v", openErr))
		entry.setFinalStatus()
		return entry
	}
	defer vdb.Close()

	// Query indexed file count and newest indexed_at.
	var indexedCount int
	var maxIndexedAt int64
	row := vdb.QueryRow(`SELECT COUNT(*), COALESCE(MAX(indexed_at), 0) FROM files`)
	if scanErr := row.Scan(&indexedCount, &maxIndexedAt); scanErr != nil {
		entry.Issues = append(entry.Issues, fmt.Sprintf("index_query_error: %v", scanErr))
	} else {
		entry.IndexedFiles = indexedCount
		entry.LastIndexedAt = maxIndexedAt

		// CHECK 4: index_stale. indexed_at is stored in whole seconds while
		// file mtimes carry sub-second precision, so compare against the END
		// of the indexed second; otherwise a file written earlier in the same
		// second as the indexing run would be flagged stale forever.
		if maxIndexedAt > 0 {
			threshold := time.Unix(maxIndexedAt+1, 0).Add(-time.Nanosecond)
			if isIndexStale(realPath, threshold) {
				entry.Issues = append(entry.Issues, "index_stale")
			}
		}

		// CHECK 5: index_drift. Disk file count differs from indexed count.
		if indexedCount != diskCount {
			entry.Issues = append(entry.Issues, fmt.Sprintf("index_drift: disk=%d indexed=%d", diskCount, indexedCount))
		}
	}

	// CHECK 6: empty_vault
	if diskCount == 0 {
		entry.Issues = append(entry.Issues, "empty_vault")
	}

	entry.setFinalStatus()
	return entry
}
// setWarningStatus downgrades the entry to "warning" unless it has already
// been marked "error", which is never overwritten.
func (e *VaultDoctorEntry) setWarningStatus() {
	if e.Status == "error" {
		return
	}
	e.Status = "warning"
}
// setFinalStatus computes Status from the collected Issues: an existing
// "error" is kept, no issues means "ok", anything else is "warning".
func (e *VaultDoctorEntry) setFinalStatus() {
	switch {
	case e.Status == "error":
		// Terminal state: leave untouched.
	case len(e.Issues) == 0:
		e.Status = "ok"
	default:
		e.Status = "warning"
	}
}
// countDiskFiles walks realPath and counts regular files, excluding:
// vault_index.db*, .git/, hidden files/dirs at any depth.
func countDiskFiles(realPath string) int {
count := 0
_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
if err != nil {
return nil
}
name := d.Name()
// Skip hidden entries
if strings.HasPrefix(name, ".") {
if d.IsDir() {
return filepath.SkipDir
}
return nil
}
// Skip .git
if d.IsDir() && name == ".git" {
return filepath.SkipDir
}
// Skip vault_index.db files
if !d.IsDir() && (name == "vault_index.db" || name == "vault_index.db-shm" || name == "vault_index.db-wal") {
return nil
}
if !d.IsDir() {
count++
}
return nil
})
return count
}
// isIndexStale returns true if any regular file under realPath has an mtime
// strictly after maxTime (excluding vault_index.db* and hidden files).
func isIndexStale(realPath string, maxTime time.Time) bool {
stale := false
_ = filepath.WalkDir(realPath, func(path string, d os.DirEntry, err error) error {
if err != nil || stale {
return nil
}
name := d.Name()
if strings.HasPrefix(name, ".") {
if d.IsDir() {
return filepath.SkipDir
}
return nil
}
if d.IsDir() && name == ".git" {
return filepath.SkipDir
}
if !d.IsDir() {
if name == "vault_index.db" || name == "vault_index.db-shm" || name == "vault_index.db-wal" {
return nil
}
fi, statErr := d.Info()
if statErr == nil && fi.ModTime().After(maxTime) {
stale = true
}
}
return nil
})
return stale
}
// hasNonStandardLayout returns true when a vault directory contains
// subdirectories that are clearly intentional but not data/knowledge.
// Heuristic: any subdir at the vault root that is not data/knowledge.
func hasNonStandardLayout(realPath string) bool {
entries, err := os.ReadDir(realPath)
if err != nil {
return false
}
standardDirs := map[string]bool{"data": true, "knowledge": true, ".git": true}
for _, e := range entries {
if e.IsDir() && !standardDirs[e.Name()] && !strings.HasPrefix(e.Name(), ".") {
return true
}
}
return false
}
+66
View File
@@ -0,0 +1,66 @@
---
name: vault_doctor
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultDoctor(repoRoot string) ([]VaultDoctorEntry, error)"
description: "Audita la salud de todos los vaults declarados en projects/*/vaults/vault.yaml. Comprueba existencia del directorio, layout estándar, presencia del índice, staleness y drift entre disco e índice. Read-only."
tags: [vault, doctor, health, audit]
uses_functions:
- "vault_manifest_read_go_infra"
- "vault_index_open_go_infra"
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- "fmt"
- "os"
- "path/filepath"
- "strings"
- "time"
tested: true
tests:
- "TestVaultDoctor_OK"
- "TestVaultDoctor_MissingDir"
- "TestVaultDoctor_NoIndex"
- "TestVaultDoctor_LayoutDrift"
- "TestVaultDoctor_EmptyVault"
test_file_path: "functions/infra/vault_doctor_test.go"
file_path: "functions/infra/vault_doctor.go"
params:
- name: repoRoot
desc: "Ruta absoluta a la raiz del fn_registry (donde están projects/ y registry.db)."
output: "Slice de VaultDoctorEntry con Status (ok/warning/error), Issues, DiskFiles, IndexedFiles y LastIndexedAt por vault. Error fatal solo si los manifests no se pueden leer."
---
## Checks aplicados
| Check | Condición | Severidad |
|---|---|---|
| `directory_missing` | `e.Path` no existe en disco | error |
| `layout_missing` | no hay `data/` ni `knowledge/` en la raíz del vault | warning |
| `non_standard_layout` | no hay `data/`/`knowledge/` pero sí otros subdirectorios (ej. imagegen_models) | warning |
| `index_missing` | no existe `vault_index.db` | warning |
| `index_stale` | algún archivo en disco tiene mtime > MAX(indexed_at) | warning |
| `index_drift` | count disco != count en tabla `files` | warning |
| `empty_vault` | DiskFiles == 0 | warning |
## Ejemplo
```go
entries, err := infra.VaultDoctor("/home/lucas/fn_registry")
for _, e := range entries {
fmt.Printf("%-30s %-8s files=%d issues=%v\n",
e.VaultName, e.Status, e.DiskFiles, e.Issues)
}
```
## Notas
- Función read-only: nunca escribe en disco ni en ninguna base de datos.
- `countDiskFiles` usa `filepath.WalkDir` sin hash (cheap) — excluye `vault_index.db*`, `.git/` y ficheros ocultos.
- `isIndexStale` también usa WalkDir; compara mtime de archivos con MAX(indexed_at) de la BD.
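- `VaultIndexOpen` no es de sólo lectura: crea `vault_index.db` si no existe y aplica las migraciones embebidas. Por eso `auditVault` comprueba antes con `os.Stat` que el índice existe; si falta, reporta `index_missing` y retorna sin abrirlo, de modo que el doctor nunca crea un DB nuevo.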
+211
View File
@@ -0,0 +1,211 @@
package infra
import (
"os"
"path/filepath"
"testing"
"time"
)
// setupDoctorRepo builds a throwaway repo root (via t.TempDir) containing
// projects/<projectID>/vaults/vault.yaml with a single vault declaration, and
// returns that root so it can be passed to VaultDoctor.
//
// vaultPath is written into the manifest verbatim; it does not have to exist,
// which is what the missing-directory test relies on. Any filesystem error
// aborts the test with t.Fatalf.
func setupDoctorRepo(t *testing.T, vaultName, projectID, vaultPath string) string {
t.Helper()
root := t.TempDir()
projVaultsDir := filepath.Join(root, "projects", projectID, "vaults")
if err := os.MkdirAll(projVaultsDir, 0755); err != nil {
t.Fatalf("mkdir projects: %v", err)
}
// The manifest is assembled by string concatenation; its whitespace is
// significant because it is parsed as YAML.
manifest := "vaults:\n - name: " + vaultName + "\n description: test vault\n path: " + vaultPath + "\n tags: []\n"
if err := os.WriteFile(filepath.Join(projVaultsDir, "vault.yaml"), []byte(manifest), 0644); err != nil {
t.Fatalf("write vault.yaml: %v", err)
}
return root
}
// TestVaultDoctor_OK builds a vault with the standard layout, one file whose
// mtime is an hour old and an index row written "now", and expects a clean
// report: Status "ok", no issues, one file on disk and one in the index.
func TestVaultDoctor_OK(t *testing.T) {
	vaultDir := t.TempDir()

	// Standard layout: data/raw and knowledge.
	for _, sub := range []string{filepath.Join("data", "raw"), "knowledge"} {
		if err := os.MkdirAll(filepath.Join(vaultDir, sub), 0755); err != nil {
			t.Fatal(err)
		}
	}

	// One data file, back-dated so the index is not considered stale.
	sample := filepath.Join(vaultDir, "data", "raw", "sample.csv")
	if err := os.WriteFile(sample, []byte("a,b\n1,2\n"), 0644); err != nil {
		t.Fatal(err)
	}
	fileTime := time.Now().Add(-1 * time.Hour)
	if err := os.Chtimes(sample, fileTime, fileTime); err != nil {
		t.Fatal(err)
	}

	// Index the file with indexed_at = now, i.e. after the file's mtime.
	index, err := VaultIndexOpen(vaultDir)
	if err != nil {
		t.Fatalf("VaultIndexOpen: %v", err)
	}
	indexedAt := time.Now().Unix()
	if _, err = index.Exec(`INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
		"data/raw/sample.csv", 8, fileTime.Unix(), "deadbeef", "text/csv", ".csv", "data", "raw", indexedAt); err != nil {
		t.Fatalf("insert: %v", err)
	}
	index.Close()

	root := setupDoctorRepo(t, "my_vault", "my_proj", vaultDir)
	reports, err := VaultDoctor(root)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(reports); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := reports[0]
	if got.Status != "ok" {
		t.Errorf("Status: want ok, got %s (issues: %v)", got.Status, got.Issues)
	}
	if len(got.Issues) != 0 {
		t.Errorf("Issues: want empty, got %v", got.Issues)
	}
	if got.DiskFiles != 1 {
		t.Errorf("DiskFiles: want 1, got %d", got.DiskFiles)
	}
	if got.IndexedFiles != 1 {
		t.Errorf("IndexedFiles: want 1, got %d", got.IndexedFiles)
	}
}
// TestVaultDoctor_MissingDir declares a vault whose path does not exist and
// expects Status "error" with a directory_missing issue.
func TestVaultDoctor_MissingDir(t *testing.T) {
	absent := filepath.Join(t.TempDir(), "does_not_exist")
	root := setupDoctorRepo(t, "missing_vault", "my_proj", absent)

	reports, err := VaultDoctor(root)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(reports); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := reports[0]
	if got.Status != "error" {
		t.Errorf("Status: want error, got %s", got.Status)
	}

	reported := false
	for i := range got.Issues {
		if got.Issues[i] == "directory_missing" {
			reported = true
			break
		}
	}
	if !reported {
		t.Errorf("Expected directory_missing issue, got %v", got.Issues)
	}
}
// TestVaultDoctor_NoIndex builds a vault with a data file but no
// vault_index.db and expects Status "warning" with an index_missing issue.
func TestVaultDoctor_NoIndex(t *testing.T) {
	vaultDir := t.TempDir()

	// Proper layout, but the index database is never created.
	rawDir := filepath.Join(vaultDir, "data", "raw")
	if err := os.MkdirAll(rawDir, 0755); err != nil {
		t.Fatal(err)
	}
	if err := os.WriteFile(filepath.Join(rawDir, "a.csv"), []byte("x"), 0644); err != nil {
		t.Fatal(err)
	}

	root := setupDoctorRepo(t, "no_index_vault", "my_proj", vaultDir)
	reports, err := VaultDoctor(root)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(reports); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := reports[0]
	if got.Status != "warning" {
		t.Errorf("Status: want warning, got %s", got.Status)
	}

	reported := false
	for i := range got.Issues {
		if got.Issues[i] == "index_missing" {
			reported = true
			break
		}
	}
	if !reported {
		t.Errorf("Expected index_missing issue, got %v", got.Issues)
	}
}
// TestVaultDoctor_LayoutDrift builds a vault with neither data/ nor
// knowledge/ (only a loose file at the root) and expects Status "warning"
// with a layout_missing or non_standard_layout issue.
func TestVaultDoctor_LayoutDrift(t *testing.T) {
	vaultDir := t.TempDir()

	// Only a stray file at the root; no standard directories.
	stray := filepath.Join(vaultDir, "something.txt")
	if err := os.WriteFile(stray, []byte("hi"), 0644); err != nil {
		t.Fatal(err)
	}

	root := setupDoctorRepo(t, "layout_vault", "my_proj", vaultDir)
	reports, err := VaultDoctor(root)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(reports); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := reports[0]
	if got.Status != "warning" {
		t.Errorf("Status: want warning, got %s", got.Status)
	}

	layoutReported := false
	for _, issue := range got.Issues {
		switch issue {
		case "layout_missing", "non_standard_layout":
			layoutReported = true
		}
	}
	if !layoutReported {
		t.Errorf("Expected layout_missing or non_standard_layout, got %v", got.Issues)
	}
}
// TestVaultDoctor_EmptyVault builds a vault with empty data/ and knowledge/
// directories plus an index with no rows, and expects Status "warning" with
// an empty_vault issue.
func TestVaultDoctor_EmptyVault(t *testing.T) {
	vaultDir := t.TempDir()

	// Standard directories exist but hold no files.
	for _, sub := range []string{"data", "knowledge"} {
		if err := os.MkdirAll(filepath.Join(vaultDir, sub), 0755); err != nil {
			t.Fatal(err)
		}
	}

	// Create vault_index.db with no rows.
	index, err := VaultIndexOpen(vaultDir)
	if err != nil {
		t.Fatalf("VaultIndexOpen: %v", err)
	}
	index.Close()

	root := setupDoctorRepo(t, "empty_vault", "my_proj", vaultDir)
	reports, err := VaultDoctor(root)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if n := len(reports); n != 1 {
		t.Fatalf("expected 1 entry, got %d", n)
	}

	got := reports[0]
	if got.Status != "warning" {
		t.Errorf("Status: want warning, got %s (issues: %v)", got.Status, got.Issues)
	}

	reported := false
	for i := range got.Issues {
		if got.Issues[i] == "empty_vault" {
			reported = true
			break
		}
	}
	if !reported {
		t.Errorf("Expected empty_vault issue, got %v", got.Issues)
	}
}
+21
View File
@@ -0,0 +1,21 @@
package infra
// VaultFile is the record for one file inside a vault directory. It groups
// the file's identity (vault plus relative path), its content metadata (size,
// mtime, sha256, mime, extension) and its structural classification (bucket
// and sub-bucket).
type VaultFile struct {
	// VaultID, e.g. "turismo_spain_app_turismo".
	VaultID string `json:"vault_id"`
	// VaultName, e.g. "turismo_spain".
	VaultName string `json:"vault_name"`
	// RelPath is relative to the vault root, e.g. "data/raw/foo.csv".
	RelPath string `json:"rel_path"`
	// Size in bytes.
	Size int64 `json:"size"`
	// Mtime in unix seconds (UTC).
	Mtime int64 `json:"mtime"`
	// Sha256 as lowercase hex.
	Sha256 string `json:"sha256"`
	// Mime type, e.g. "text/csv".
	Mime string `json:"mime"`
	// Ext is the file extension, e.g. ".csv".
	Ext string `json:"ext"`

	// Bucket is the top-level classification: "data" or "knowledge".
	Bucket string `json:"bucket"`

	// SubBucket is the second-level directory within the bucket.
	// Known values: raw, processed, exports (data); decisions, domains,
	// models, benchmarks, test_documents (knowledge). It is the empty string
	// for files at the bucket root.
	SubBucket string `json:"sub_bucket"`
}
@@ -0,0 +1,49 @@
-- Inventory of vault files, one row per path relative to the vault root.
-- mtime and indexed_at are integer timestamps (the Go writer stores unix seconds).
CREATE TABLE IF NOT EXISTS files (
rel_path TEXT PRIMARY KEY,
size INTEGER NOT NULL,
mtime INTEGER NOT NULL,
sha256 TEXT NOT NULL,
mime TEXT NOT NULL DEFAULT '',
ext TEXT NOT NULL DEFAULT '',
bucket TEXT NOT NULL DEFAULT '',
sub_bucket TEXT NOT NULL DEFAULT '',
indexed_at INTEGER NOT NULL
);
-- Secondary indexes for lookups by content hash and by bucket / sub_bucket.
CREATE INDEX IF NOT EXISTS idx_files_sha256 ON files(sha256);
CREATE INDEX IF NOT EXISTS idx_files_bucket ON files(bucket, sub_bucket);
-- Full-text index over path and extracted text.
-- content='' declares a contentless FTS5 table: the index is kept but column values are not stored.
-- NOTE(review): SQLite documents restrictions on DELETE and UPDATE for contentless tables - confirm writers account for this.
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
rel_path,
content_text,
content='',
tokenize='unicode61 remove_diacritics 2'
);
-- Per-file CSV profile. The row is removed when its files row is deleted (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS csv_profiles (
rel_path TEXT PRIMARY KEY,
cols_json TEXT NOT NULL,
n_rows INTEGER NOT NULL,
encoding TEXT NOT NULL DEFAULT '',
date_min TEXT,
date_max TEXT,
profiled_at INTEGER NOT NULL,
FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);
-- Per-file PDF extraction metadata. Cascades with files.
CREATE TABLE IF NOT EXISTS pdf_extracts (
rel_path TEXT PRIMARY KEY,
page_count INTEGER NOT NULL,
text_len INTEGER NOT NULL,
extracted_to TEXT,
extracted_at INTEGER NOT NULL,
FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);
-- Per-file parsed document metadata (title, frontmatter, headings). Cascades with files.
CREATE TABLE IF NOT EXISTS knowledge_docs (
rel_path TEXT PRIMARY KEY,
title TEXT NOT NULL DEFAULT '',
frontmatter_json TEXT NOT NULL DEFAULT '{}',
headings_json TEXT NOT NULL DEFAULT '[]',
parsed_at INTEGER NOT NULL,
FOREIGN KEY (rel_path) REFERENCES files(rel_path) ON DELETE CASCADE
);
+30
View File
@@ -0,0 +1,30 @@
package infra
import (
"database/sql"
"embed"
"fmt"
"path/filepath"
)
// vaultIndexMigrationsFS holds the SQL files under vault_index_migrations/,
// embedded into the binary at build time. VaultIndexOpen passes it to
// ApplyMigrations.
//
//go:embed vault_index_migrations/*.sql
var vaultIndexMigrationsFS embed.FS
// VaultIndexOpen opens <vaultPath>/vault_index.db through SQLiteOpen and then
// runs the embedded vault_index_migrations/*.sql files against it through
// ApplyMigrations. On success it returns the open *sql.DB; closing it is the
// caller's job. If the migrations fail, the connection is closed before the
// wrapped error is returned.
//
// NOTE(review): the file's original comment states that SQLiteOpen creates the
// database when missing and enables WAL mode and foreign keys, and that
// migrations are idempotent and applied in lexicographic order. Those
// guarantees live in SQLiteOpen / ApplyMigrations, not here; confirm there.
func VaultIndexOpen(vaultPath string) (*sql.DB, error) {
	conn, openErr := SQLiteOpen(filepath.Join(vaultPath, "vault_index.db"), "")
	if openErr != nil {
		return nil, fmt.Errorf("vault_index_open: %w", openErr)
	}

	migrateErr := ApplyMigrations(conn, vaultIndexMigrationsFS, "vault_index_migrations/*.sql")
	if migrateErr == nil {
		return conn, nil
	}

	conn.Close()
	return nil, fmt.Errorf("vault_index_open: apply migrations: %w", migrateErr)
}
+54
View File
@@ -0,0 +1,54 @@
---
name: vault_index_open
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultIndexOpen(vaultPath string) (*sql.DB, error)"
description: "Abre (o crea) vault_index.db dentro de vaultPath con WAL + FK y aplica las migraciones embebidas idempotentemente. El caller cierra la conexion."
tags: [vault, sqlite, index, migration, infra]
uses_functions: ["sqlite_open_go_infra", "sqlite_apply_migrations_go_infra"]
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [database/sql, embed, fmt, path/filepath]
params:
- name: vaultPath
desc: "ruta absoluta o relativa al directorio raiz del vault"
output: "*sql.DB apuntando a <vaultPath>/vault_index.db con schema completo aplicado; el caller es responsable de cerrar"
tested: true
tests:
- "crea vault_index.db en tmpdir vacio"
- "segunda apertura no falla (idempotente)"
- "todas las tablas esperadas existen en sqlite_master"
- "fts5 INSERT y MATCH funcionan"
test_file_path: "functions/infra/vault_index_open_test.go"
file_path: "functions/infra/vault_index_open.go"
---
## Ejemplo
```go
db, err := VaultIndexOpen("/data/vaults/turismo_spain")
if err != nil {
log.Fatal(err)
}
defer db.Close()
```
## Notas
El archivo de base de datos se crea en `<vaultPath>/vault_index.db`. Las migraciones
viven en `vault_index_migrations/*.sql` embebidas via `//go:embed` en el mismo paquete.
Schema creado por `001_init.sql`:
- `files` — inventario de archivos (PK: rel_path)
- `files_fts` — tabla FTS5 virtual para busqueda de texto (content_text lo llenan profilers posteriores)
- `csv_profiles` — perfil de columnas/filas para .csv (FK → files)
- `pdf_extracts` — metadatos de extraccion de texto para .pdf (FK → files)
- `knowledge_docs` — headings/frontmatter para .md del bucket knowledge (FK → files)
`SQLiteOpen` abre con WAL mode + foreign keys. `ApplyMigrations` es idempotente:
los errores de "already exists" y "duplicate column" se ignoran silenciosamente.
+107
View File
@@ -0,0 +1,107 @@
package infra
import (
"database/sql"
"os"
"path/filepath"
"testing"
)
// TestVaultIndexOpen_CreatesDB opens an index in an empty temp directory and
// verifies that vault_index.db is present on disk afterwards.
func TestVaultIndexOpen_CreatesDB(t *testing.T) {
	t.Run("crea vault_index.db en tmpdir vacio", func(t *testing.T) {
		dir := t.TempDir()
		db, err := VaultIndexOpen(dir)
		if err != nil {
			t.Fatalf("VaultIndexOpen: %v", err)
		}
		defer db.Close()

		// Any stat failure means the file is not usable, not only "not exist";
		// checking os.IsNotExist alone would let other errors pass silently.
		dbPath := filepath.Join(dir, "vault_index.db")
		if _, statErr := os.Stat(dbPath); statErr != nil {
			t.Fatalf("vault_index.db no fue creado en %s: %v", dir, statErr)
		}
	})
}
// TestVaultIndexOpen_Idempotent opens the same vault directory twice in a row
// (closing in between) and expects neither open to fail.
func TestVaultIndexOpen_Idempotent(t *testing.T) {
	t.Run("segunda apertura no falla (idempotente)", func(t *testing.T) {
		dir := t.TempDir()

		first, err := VaultIndexOpen(dir)
		if err != nil {
			t.Fatalf("primera apertura: %v", err)
		}
		first.Close()

		second, err := VaultIndexOpen(dir)
		if err != nil {
			t.Fatalf("segunda apertura: %v", err)
		}
		second.Close()
	})
}
// TestVaultIndexOpen_AppliesAllMigrations opens a fresh index and verifies
// that every table declared by the embedded migration is registered in
// sqlite_master, including the files_fts virtual table.
func TestVaultIndexOpen_AppliesAllMigrations(t *testing.T) {
	t.Run("todas las tablas esperadas existen en sqlite_master", func(t *testing.T) {
		dir := t.TempDir()
		db, err := VaultIndexOpen(dir)
		if err != nil {
			t.Fatalf("VaultIndexOpen: %v", err)
		}
		defer db.Close()

		expectedTables := []string{
			"files",
			"files_fts", // virtual table; also listed in sqlite_master
			"csv_profiles",
			"pdf_extracts",
			"knowledge_docs",
		}
		for _, tbl := range expectedTables {
			assertTableExists(t, db, tbl)
		}
	})
}
// TestVaultIndexOpen_FTS5Works inserts one row into files_fts and checks that
// a MATCH query on a word from its text finds exactly that row.
func TestVaultIndexOpen_FTS5Works(t *testing.T) {
	t.Run("fts5 INSERT y MATCH funcionan", func(t *testing.T) {
		index, err := VaultIndexOpen(t.TempDir())
		if err != nil {
			t.Fatalf("VaultIndexOpen: %v", err)
		}
		defer index.Close()

		// files_fts is declared with content='', so rows are inserted by hand.
		if _, err = index.Exec(
			`INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`,
			"data/raw/informe_ventas.csv", "ventas trimestrales empresa",
		); err != nil {
			t.Fatalf("INSERT files_fts: %v", err)
		}

		var hits int
		match := index.QueryRow(`SELECT count(*) FROM files_fts WHERE files_fts MATCH 'ventas'`)
		if err = match.Scan(&hits); err != nil {
			t.Fatalf("FTS MATCH query: %v", err)
		}
		if hits != 1 {
			t.Errorf("FTS MATCH: got %d rows, want 1", hits)
		}
	})
}
// assertTableExists fails the test when sqlite_master has no entry called
// name. A query error is fatal; a missing entry is reported with Errorf so
// the remaining tables are still checked.
func assertTableExists(t *testing.T, db *sql.DB, name string) {
	t.Helper()

	var matches int
	lookup := db.QueryRow(`SELECT count(*) FROM sqlite_master WHERE name = ?`, name)
	if err := lookup.Scan(&matches); err != nil {
		t.Fatalf("sqlite_master query for %q: %v", name, err)
	}
	if matches != 0 {
		return
	}
	t.Errorf("table/vtable %q not found in sqlite_master", name)
}
+154
View File
@@ -0,0 +1,154 @@
package infra
import (
"database/sql"
"fmt"
"strings"
"time"
)
// WriteReport carries the row counts produced by one VaultIndexWrite call.
type WriteReport struct {
	// Inserted counts rows newly inserted into files.
	Inserted int
	// Updated counts rows updated (upserted) in files.
	Updated int
	// Pruned counts rows deleted from files (only when prune=true).
	Pruned int
	// FTS counts rows inserted into files_fts.
	FTS int
}
// VaultIndexWrite upserts files into the `files` table of the vault index
// opened as db, refreshes the matching files_fts rows, and optionally prunes
// rows that are no longer present. Everything runs in one transaction, which
// is rolled back on every error path.
//
// Parameters:
//   - db:    open handle on vault_index.db.
//   - files: rows to insert or update; may be empty.
//   - prune: when true, every row of `files` whose rel_path is NOT in the
//     slice is deleted (with an empty slice this deletes all rows). The schema
//     declares ON DELETE CASCADE from csv_profiles, pdf_extracts and
//     knowledge_docs to files.
//
// Returns a WriteReport with the Inserted/Updated/Pruned/FTS counts. On error
// the report holds the counts reached so far, but none of it was committed.
// With an empty slice and prune=false the call is a no-op.
//
// Counting: the rel_paths already stored are loaded once up front. A row is
// Inserted if its path was absent and Updated otherwise; a path repeated
// within files is Inserted once and Updated for each repetition.
//
// FTS: for each file the files_fts row is deleted and re-inserted with an
// empty content_text.
// NOTE(review): files_fts is declared contentless (content=”) in the
// migration. SQLite documents that such tables do not store column values
// and restrict DELETE, so the DELETE ... WHERE rel_path = ? below may match
// nothing and leave duplicate FTS rows; pruned files also keep their FTS
// rows. Verify against the FTS5 documentation before relying on it.
func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error) {
	var report WriteReport

	if len(files) == 0 && !prune {
		return report, nil
	}

	tx, err := db.Begin()
	if err != nil {
		return report, fmt.Errorf("vault_index_write: begin tx: %w", err)
	}
	// Roll back unless Commit succeeded. This uses a flag instead of the outer
	// `err` because several error paths below use block-scoped error
	// variables, which a check on `err` would not see.
	committed := false
	defer func() {
		if !committed {
			tx.Rollback() //nolint:errcheck
		}
	}()

	// Load existing rel_paths into a set to distinguish insert vs update.
	existing := make(map[string]struct{})
	rows, err := tx.Query(`SELECT rel_path FROM files`)
	if err != nil {
		return report, fmt.Errorf("vault_index_write: query existing: %w", err)
	}
	for rows.Next() {
		var rp string
		if scanErr := rows.Scan(&rp); scanErr != nil {
			rows.Close()
			return report, fmt.Errorf("vault_index_write: scan existing: %w", scanErr)
		}
		existing[rp] = struct{}{}
	}
	rows.Close()
	if rowsErr := rows.Err(); rowsErr != nil {
		return report, fmt.Errorf("vault_index_write: rows err: %w", rowsErr)
	}

	// One timestamp for the whole batch so all rows share the same indexed_at.
	now := time.Now().Unix()

	upsertStmt, err := tx.Prepare(`
		INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
		ON CONFLICT(rel_path) DO UPDATE SET
			size = excluded.size,
			mtime = excluded.mtime,
			sha256 = excluded.sha256,
			mime = excluded.mime,
			ext = excluded.ext,
			bucket = excluded.bucket,
			sub_bucket = excluded.sub_bucket,
			indexed_at = excluded.indexed_at
	`)
	if err != nil {
		return report, fmt.Errorf("vault_index_write: prepare upsert: %w", err)
	}
	defer upsertStmt.Close()

	ftsDeleteStmt, err := tx.Prepare(`DELETE FROM files_fts WHERE rel_path = ?`)
	if err != nil {
		return report, fmt.Errorf("vault_index_write: prepare fts delete: %w", err)
	}
	defer ftsDeleteStmt.Close()

	ftsInsertStmt, err := tx.Prepare(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, '')`)
	if err != nil {
		return report, fmt.Errorf("vault_index_write: prepare fts insert: %w", err)
	}
	defer ftsInsertStmt.Close()

	for _, f := range files {
		if _, err = upsertStmt.Exec(
			f.RelPath, f.Size, f.Mtime, f.Sha256,
			f.Mime, f.Ext, f.Bucket, f.SubBucket, now,
		); err != nil {
			return report, fmt.Errorf("vault_index_write: upsert %q: %w", f.RelPath, err)
		}

		if _, wasExisting := existing[f.RelPath]; wasExisting {
			report.Updated++
		} else {
			report.Inserted++
			// Remember the path so a repeat within this batch counts as an
			// update instead of a second insert.
			existing[f.RelPath] = struct{}{}
		}

		// Refresh FTS row.
		if _, err = ftsDeleteStmt.Exec(f.RelPath); err != nil {
			return report, fmt.Errorf("vault_index_write: fts delete %q: %w", f.RelPath, err)
		}
		if _, err = ftsInsertStmt.Exec(f.RelPath); err != nil {
			return report, fmt.Errorf("vault_index_write: fts insert %q: %w", f.RelPath, err)
		}
		report.FTS++
	}

	if prune {
		var res sql.Result
		var pruneErr error
		if len(files) > 0 {
			// Build a literal NOT IN list; single quotes are doubled, which is
			// how SQL string literals escape them.
			keep := make([]string, len(files))
			for i, f := range files {
				keep[i] = "'" + strings.ReplaceAll(f.RelPath, "'", "''") + "'"
			}
			res, pruneErr = tx.Exec(fmt.Sprintf(
				`DELETE FROM files WHERE rel_path NOT IN (%s)`, strings.Join(keep, ","),
			))
			if pruneErr != nil {
				return report, fmt.Errorf("vault_index_write: prune: %w", pruneErr)
			}
		} else {
			// prune=true with an empty slice means delete everything.
			res, pruneErr = tx.Exec(`DELETE FROM files`)
			if pruneErr != nil {
				return report, fmt.Errorf("vault_index_write: prune all: %w", pruneErr)
			}
		}
		n, _ := res.RowsAffected()
		report.Pruned = int(n)
	}

	if err = tx.Commit(); err != nil {
		return report, fmt.Errorf("vault_index_write: commit: %w", err)
	}
	committed = true
	return report, nil
}
+84
View File
@@ -0,0 +1,84 @@
---
name: vault_index_write
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultIndexWrite(db *sql.DB, files []VaultFile, prune bool) (WriteReport, error)"
description: "Upserta un slice de VaultFile en vault_index.db (tabla files + FTS5 files_fts) dentro de una sola transaccion. Cuenta Inserted/Updated/FTS. Con prune=true elimina filas no presentes en el slice."
tags: [vault, sqlite, index, write, upsert, fts, infra]
uses_functions: []
uses_types: ["vault_file_go_infra"]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [database/sql, fmt, strings, time]
params:
- name: db
desc: "*sql.DB abierto sobre vault_index.db (tipicamente retornado por VaultIndexOpen)"
- name: files
desc: "slice de VaultFile a insertar/actualizar; puede ser vacio"
- name: prune
desc: "si true, elimina de 'files' todas las filas cuyo rel_path no este en el slice (sincronizacion destructiva)"
output: "WriteReport con conteos Inserted/Updated/Pruned/FTS; error si falla la transaccion"
tested: true
tests:
- "N archivos nuevos — Inserted=N"
- "re-escritura con mtime distinto — Updated=N"
- "prune elimina filas ausentes"
- "sin prune, filas previas persisten"
- "FTS5 MATCH funciona tras escritura"
test_file_path: "functions/infra/vault_index_write_test.go"
file_path: "functions/infra/vault_index_write.go"
---
## Ejemplo
```go
db, _ := VaultIndexOpen("/data/vaults/turismo")
defer db.Close()
files, _ := VaultInventoryScan("/data/vaults/turismo", "turismo_v1", "turismo")
report, err := VaultIndexWrite(db, files, true)
if err != nil {
log.Fatal(err)
}
fmt.Printf("inserted=%d updated=%d pruned=%d fts=%d\n",
report.Inserted, report.Updated, report.Pruned, report.FTS)
```
## Notas
### WriteReport
Struct local al paquete infra:
```go
type WriteReport struct {
Inserted int
Updated int
Pruned int
FTS int
}
```
### Estrategia de conteo Inserted vs Updated
Se carga el conjunto de rel_paths existentes en un map antes del loop. Un upsert
se clasifica como Inserted si el rel_path no estaba en el map, Updated si estaba.
Esto evita N+1 SELECTs y es correcto porque la transaccion serializa los cambios.
### FTS5
`files_fts` usa `content=''` (tabla FTS5 *contentless*: guarda solo el indice, no el
texto original). Para cada archivo se borra la fila FTS existente y se reinserta con
`content_text=''`. Los profilers posteriores (csv_profiles, knowledge_docs) son
responsables de actualizar `content_text` con texto indexable real.
### Prune
Con `prune=true` se construye un IN clause con los rel_paths del slice. La FK con
`ON DELETE CASCADE` propaga el DELETE a csv_profiles, pdf_extracts y knowledge_docs
automaticamente. Con slice vacio + prune=true se borra todo (DELETE FROM files).
### Escapado SQL
El IN clause se construye escapando las comillas simples en rel_path (duplicandolas),
lo que evita inyeccion en rutas con apostrofos. Para entornos con rutas controladas
(interior de vaults sin apostrofos) esto es suficiente; para entornos adversariales
usar parametros enlazados (bind parameters) con VALUES multiples via prepared statement.
+210
View File
@@ -0,0 +1,210 @@
package infra
import (
"testing"
"time"
)
// makeTestVaultFile builds a VaultFile fixture for the index-write tests.
//
// Only relPath, mime, bucket and subBucket come from the caller. The vault
// identity, size and digest are fixed placeholders, Ext is always ".csv"
// regardless of relPath, and Mtime is the current Unix time.
func makeTestVaultFile(relPath, mime, bucket, subBucket string) VaultFile {
	var fixture VaultFile

	// Fixed placeholder values shared by every fixture.
	fixture.VaultID = "test_vault"
	fixture.VaultName = "test"
	fixture.Size = 100
	fixture.Mtime = time.Now().Unix()
	fixture.Sha256 = "abc123def456abc123def456abc123def456abc123def456abc123def456abc1"
	fixture.Ext = ".csv"

	// Caller-controlled values.
	fixture.RelPath = relPath
	fixture.Mime = mime
	fixture.Bucket = bucket
	fixture.SubBucket = subBucket

	return fixture
}
// openInMemoryVaultIndex opens a vault index rooted in a fresh t.TempDir() and
// returns it as a closer, aborting the test if VaultIndexOpen fails.
//
// Despite the name, the index is opened on a temporary directory on disk, not
// in memory.
func openInMemoryVaultIndex(t *testing.T) interface{ Close() error } {
	t.Helper()

	handle, openErr := VaultIndexOpen(t.TempDir())
	if openErr != nil {
		t.Fatalf("VaultIndexOpen: %v", openErr)
	}
	return handle
}
// TestVaultIndexWrite_FreshInsert writes three files into a brand-new index and
// expects every one to be counted as inserted (and FTS-indexed), with nothing
// updated or pruned.
func TestVaultIndexWrite_FreshInsert(t *testing.T) {
	t.Run("N archivos nuevos — Inserted=N", func(t *testing.T) {
		db, err := VaultIndexOpen(t.TempDir())
		if err != nil {
			t.Fatal(err)
		}
		defer db.Close()

		batch := []VaultFile{
			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("knowledge/decisions/x.md", "text/markdown", "knowledge", "decisions"),
		}

		got, writeErr := VaultIndexWrite(db, batch, false)
		if writeErr != nil {
			t.Fatalf("VaultIndexWrite: %v", writeErr)
		}

		if n := got.Inserted; n != 3 {
			t.Errorf("Inserted = %d, want 3", n)
		}
		if n := got.Updated; n != 0 {
			t.Errorf("Updated = %d, want 0", n)
		}
		if n := got.Pruned; n != 0 {
			t.Errorf("Pruned = %d, want 0", n)
		}
		if n := got.FTS; n != 3 {
			t.Errorf("FTS = %d, want 3", n)
		}
	})
}
// TestVaultIndexWrite_Upsert writes the same two paths twice, changing only the
// mtime in between, and expects the second pass to report two updates and no
// inserts.
func TestVaultIndexWrite_Upsert(t *testing.T) {
	t.Run("re-escritura con mtime distinto — Updated=N", func(t *testing.T) {
		db, err := VaultIndexOpen(t.TempDir())
		if err != nil {
			t.Fatal(err)
		}
		defer db.Close()

		batch := []VaultFile{
			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
		}

		// First pass seeds the index.
		_, err = VaultIndexWrite(db, batch, false)
		if err != nil {
			t.Fatalf("first write: %v", err)
		}

		// Bump the mtimes so the second pass looks like a file change.
		batch[0].Mtime = time.Now().Unix() + 100
		batch[1].Mtime = time.Now().Unix() + 200

		second, err := VaultIndexWrite(db, batch, false)
		if err != nil {
			t.Fatalf("second write: %v", err)
		}

		if n := second.Inserted; n != 0 {
			t.Errorf("Inserted = %d, want 0", n)
		}
		if n := second.Updated; n != 2 {
			t.Errorf("Updated = %d, want 2", n)
		}
	})
}
// TestVaultIndexWrite_Prune seeds the index with files A and B, then rewrites
// only A with prune=true and expects B to be reported as pruned and to be gone
// from the files table.
func TestVaultIndexWrite_Prune(t *testing.T) {
	t.Run("prune elimina filas ausentes", func(t *testing.T) {
		db, err := VaultIndexOpen(t.TempDir())
		if err != nil {
			t.Fatal(err)
		}
		defer db.Close()

		// Seed with A and B.
		seed := []VaultFile{
			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
		}
		_, err = VaultIndexWrite(db, seed, false)
		if err != nil {
			t.Fatalf("first write: %v", err)
		}

		// Rewrite only A with prune enabled: B is absent and must be deleted.
		report, err := VaultIndexWrite(db, []VaultFile{seed[0]}, true)
		if err != nil {
			t.Fatalf("prune write: %v", err)
		}
		if n := report.Pruned; n != 1 {
			t.Errorf("Pruned = %d, want 1", n)
		}

		// Confirm directly against the table that B no longer exists.
		var remaining int
		row := db.QueryRow(`SELECT count(*) FROM files WHERE rel_path = 'data/raw/b.csv'`)
		if err = row.Scan(&remaining); err != nil {
			t.Fatalf("query: %v", err)
		}
		if remaining != 0 {
			t.Errorf("b.csv still present after prune")
		}
	})
}
// TestVaultIndexWrite_NoPrune seeds the index with files A and B, then rewrites
// only A with prune=false and expects both rows to still be present.
func TestVaultIndexWrite_NoPrune(t *testing.T) {
	t.Run("sin prune, filas previas persisten", func(t *testing.T) {
		db, err := VaultIndexOpen(t.TempDir())
		if err != nil {
			t.Fatal(err)
		}
		defer db.Close()

		seed := []VaultFile{
			makeTestVaultFile("data/raw/a.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/b.csv", "text/csv", "data", "raw"),
		}
		_, err = VaultIndexWrite(db, seed, false)
		if err != nil {
			t.Fatalf("first write: %v", err)
		}

		// Rewrite only A without prune: B must survive.
		report, err := VaultIndexWrite(db, []VaultFile{seed[0]}, false)
		if err != nil {
			t.Fatalf("second write: %v", err)
		}
		if n := report.Pruned; n != 0 {
			t.Errorf("Pruned = %d, want 0", n)
		}

		var total int
		row := db.QueryRow(`SELECT count(*) FROM files`)
		if err = row.Scan(&total); err != nil {
			t.Fatalf("query: %v", err)
		}
		if total != 2 {
			t.Errorf("files count = %d, want 2", total)
		}
	})
}
// TestVaultIndexWrite_FTSMatch writes two files and checks that a prefix MATCH
// on the rel_path column of files_fts finds exactly the one whose path
// contains a token starting with "foo".
func TestVaultIndexWrite_FTSMatch(t *testing.T) {
	t.Run("FTS5 MATCH funciona tras escritura", func(t *testing.T) {
		db, err := VaultIndexOpen(t.TempDir())
		if err != nil {
			t.Fatal(err)
		}
		defer db.Close()

		batch := []VaultFile{
			makeTestVaultFile("data/raw/foo_report.csv", "text/csv", "data", "raw"),
			makeTestVaultFile("data/raw/bar_data.csv", "text/csv", "data", "raw"),
		}
		_, err = VaultIndexWrite(db, batch, false)
		if err != nil {
			t.Fatalf("write: %v", err)
		}

		// Column-scoped prefix query against the FTS table.
		var hits int
		row := db.QueryRow(
			`SELECT count(*) FROM files_fts WHERE files_fts MATCH 'rel_path:foo*'`,
		)
		if err = row.Scan(&hits); err != nil {
			t.Fatalf("FTS MATCH query: %v", err)
		}
		if hits != 1 {
			t.Errorf("FTS MATCH rel_path:foo* = %d rows, want 1", hits)
		}
	})
}
+174
View File
@@ -0,0 +1,174 @@
package infra
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"sort"
"strings"
)
// VaultInventoryScan walks vaultPath and returns a VaultFile slice (sorted by RelPath)
// for every non-directory entry found, skipping:
//   - vault_index.db, vault_index.db-shm, vault_index.db-wal
//   - .git/ directories at any depth
//   - hidden files/dirs (names starting with ".") at the vault root level only
//
// The walk root itself is never filtered, so a vault passed as "." or living in
// a directory whose own name starts with "." is still scanned.
//
// For each file it computes: relative path (forward slashes), size, mtime (unix UTC),
// sha256 (streaming, hex lowercase), MIME type, extension, bucket and sub-bucket.
//
// MIME detection priority:
//  1. Extension override: .csv → text/csv, .md → text/markdown, .parquet → application/parquet
//  2. http.DetectContentType on the first 512 bytes (magic bytes, stdlib)
//
// NOTE: file_validate_type_go_infra (FileValidateType) was not used here because its
// signature requires an allowedTypes allowlist and returns (mime, bool) — it is designed
// for upload validation, not for open-ended inventory scanning where any MIME is valid.
// http.DetectContentType provides the same magic-byte detection without the allowlist
// coupling and handles a broader set of formats including text/plain for CSV fallback.
//
// NOTE(review): d.IsDir() is the only type filter, so symlinks and other
// non-directory entries are opened and hashed like regular files — confirm
// whether they should be skipped instead.
//
// Returns a non-nil, possibly empty slice on success. On any walk, stat, hash
// or MIME error the scan aborts and returns (nil, wrapped error).
func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error) {
	// Start non-nil so an empty vault yields an empty slice rather than nil,
	// as the function contract documents.
	files := []VaultFile{}

	err := filepath.WalkDir(vaultPath, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}

		// WalkDir visits the root first with path == vaultPath. Never apply the
		// skip rules to it: its name may be "." or a hidden directory name, and
		// returning SkipDir here would silently skip the entire vault.
		if path == vaultPath && d.IsDir() {
			return nil
		}

		name := d.Name()

		// Skip .git directories at any depth.
		if d.IsDir() && name == ".git" {
			return filepath.SkipDir
		}

		// Skip hidden entries (names starting with ".") at vault root only.
		if strings.HasPrefix(name, ".") {
			rel, relErr := filepath.Rel(vaultPath, path)
			if relErr == nil {
				// At root level the relative path has no separator.
				if !strings.Contains(filepath.ToSlash(rel), "/") {
					if d.IsDir() {
						return filepath.SkipDir
					}
					return nil
				}
			}
		}

		// Directories themselves are not inventoried, only descended into.
		if d.IsDir() {
			return nil
		}

		// Skip vault_index.db and its WAL/SHM sidecar files.
		if name == "vault_index.db" || name == "vault_index.db-shm" || name == "vault_index.db-wal" {
			return nil
		}

		rel, err := filepath.Rel(vaultPath, path)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: rel path for %q: %w", path, err)
		}
		rel = filepath.ToSlash(rel)

		info, err := d.Info()
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: stat %q: %w", path, err)
		}

		// Compute sha256 by streaming — avoids loading large files into memory.
		sha, err := fileSha256(path)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: sha256 %q: %w", path, err)
		}

		mime, err := detectVaultFileMime(path, name)
		if err != nil {
			return fmt.Errorf("vault_inventory_scan: mime %q: %w", path, err)
		}

		ext := strings.ToLower(filepath.Ext(name))
		bucket, subBucket := vaultBucketParts(rel)

		files = append(files, VaultFile{
			VaultID:   vaultID,
			VaultName: vaultName,
			RelPath:   rel,
			Size:      info.Size(),
			Mtime:     info.ModTime().UTC().Unix(),
			Sha256:    sha,
			Mime:      mime,
			Ext:       ext,
			Bucket:    bucket,
			SubBucket: subBucket,
		})
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("vault_inventory_scan: walk %q: %w", vaultPath, err)
	}

	// Sort explicitly so the RelPath ordering is guaranteed by this function
	// rather than by the walk order.
	sort.Slice(files, func(i, j int) bool {
		return files[i].RelPath < files[j].RelPath
	})
	return files, nil
}
// fileSha256 computes the hex-lowercase SHA-256 of the file at path by streaming.
func fileSha256(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
h := sha256.New()
if _, err := io.Copy(h, f); err != nil {
return "", err
}
return hex.EncodeToString(h.Sum(nil)), nil
}
// detectVaultFileMime returns the MIME type for a vault file.
// Extension overrides take priority; otherwise http.DetectContentType is used.
func detectVaultFileMime(path, name string) (string, error) {
ext := strings.ToLower(filepath.Ext(name))
switch ext {
case ".csv":
return "text/csv", nil
case ".md":
return "text/markdown", nil
case ".parquet":
return "application/parquet", nil
}
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
buf := make([]byte, 512)
n, err := f.Read(buf)
if err != nil && err != io.EOF {
return "", err
}
return http.DetectContentType(buf[:n]), nil
}
// vaultBucketParts extracts the top-level bucket ("data" or "knowledge") and
// the second-level sub-bucket from a forward-slash relative path.
// Returns empty strings for files at vault root or with no recognisable bucket.
func vaultBucketParts(relPath string) (bucket, subBucket string) {
parts := strings.SplitN(relPath, "/", 3)
if len(parts) < 1 {
return "", ""
}
bucket = parts[0]
if len(parts) >= 2 {
subBucket = parts[1]
}
return bucket, subBucket
}
+74
View File
@@ -0,0 +1,74 @@
---
name: vault_inventory_scan
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultInventoryScan(vaultPath, vaultID, vaultName string) ([]VaultFile, error)"
description: "Recorre vaultPath con filepath.WalkDir y retorna un slice de VaultFile ordenado por RelPath para cada archivo regular, computando sha256 por streaming, MIME por extension/magic y bucket/sub-bucket por posicion en el arbol."
tags: [vault, inventory, scan, filesystem, sha256, mime, infra]
uses_functions: []
uses_types: ["vault_file_go_infra"]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [crypto/sha256, encoding/hex, fmt, io, net/http, os, path/filepath, sort, strings]
params:
- name: vaultPath
desc: "ruta absoluta o relativa al directorio raiz del vault"
- name: vaultID
desc: "identificador del vault (ej: turismo_spain_app_turismo) — se copia a cada VaultFile"
- name: vaultName
desc: "nombre legible del vault (ej: turismo_spain) — se copia a cada VaultFile"
output: "slice de VaultFile ordenado lexicograficamente por RelPath; slice vacio (no nil) si el vault esta vacio"
tested: true
tests:
- "tmpdir vacio retorna slice vacio"
- "data layout — bucket y sub_bucket correctos"
- "knowledge layout — bucket y sub_bucket correctos"
- "omite vault_index.db y .git"
- "sha256 determinista para mismo contenido"
- "orden lexicografico del resultado"
test_file_path: "functions/infra/vault_inventory_scan_test.go"
file_path: "functions/infra/vault_inventory_scan.go"
---
## Ejemplo
```go
files, err := VaultInventoryScan("/data/vaults/turismo_spain", "turismo_spain_v1", "turismo_spain")
if err != nil {
log.Fatal(err)
}
for _, f := range files {
fmt.Printf("%s %s %s/%s\n", f.RelPath, f.Mime, f.Bucket, f.SubBucket)
}
```
## Notas
### Archivos omitidos
- `vault_index.db`, `vault_index.db-shm`, `vault_index.db-wal` (siempre)
- `.git/` en cualquier profundidad (SkipDir)
- Entradas cuyo nombre empieza por `.` solo en la raiz del vault (nivel 0)
### Deteccion de MIME
`file_validate_type_go_infra` (FileValidateType) no se usa porque su firma
requiere una lista blanca de tipos permitidos y retorna (mime, bool) — esta
disenada para validacion de uploads, no para escaneo inventarial donde
cualquier MIME es valido. Se usan en su lugar:
1. Override por extension (prioridad alta): `.csv` → `text/csv`, `.md` → `text/markdown`,
`.parquet` → `application/parquet`. Necesario porque `http.DetectContentType`
clasifica CSV como `text/plain` y no conoce Parquet.
2. `http.DetectContentType` sobre primeros 512 bytes (magic bytes, stdlib) para el resto.
### SHA-256
Calculado por streaming con `io.Copy` a `sha256.New()` — no carga el archivo completo
a memoria. Valido para archivos de cualquier tamano.
### Bucket / SubBucket
Derivados de la posicion en el arbol:
- `bucket` = primer segmento del RelPath (tipicamente "data" o "knowledge")
- `subBucket` = segundo segmento si existe; vacio si el archivo esta en la raiz del bucket
@@ -0,0 +1,182 @@
package infra
import (
"os"
"path/filepath"
"testing"
)
// writeTestFile creates the file dir/rel (rel uses forward slashes) with the
// given content, creating any missing parent directories first. Any failure
// aborts the calling test.
func writeTestFile(t *testing.T, dir, rel, content string) {
	t.Helper()

	target := filepath.Join(dir, filepath.FromSlash(rel))
	parent := filepath.Dir(target)

	mkErr := os.MkdirAll(parent, 0o755)
	if mkErr != nil {
		t.Fatalf("mkdir %s: %v", parent, mkErr)
	}

	writeErr := os.WriteFile(target, []byte(content), 0o644)
	if writeErr != nil {
		t.Fatalf("write %s: %v", target, writeErr)
	}
}
// TestVaultInventoryScan_Empty scans a freshly created empty directory and
// expects no error and no files.
func TestVaultInventoryScan_Empty(t *testing.T) {
	t.Run("tmpdir vacio retorna slice vacio", func(t *testing.T) {
		got, scanErr := VaultInventoryScan(t.TempDir(), "v1", "test")
		if scanErr != nil {
			t.Fatalf("unexpected error: %v", scanErr)
		}
		if n := len(got); n != 0 {
			t.Errorf("expected 0 files, got %d", n)
		}
	})
}
// TestVaultInventoryScan_DataLayout scans a vault with one CSV under data/raw
// and one Parquet file under data/processed, and checks path, bucket,
// sub-bucket, MIME, extension and vault ID on the sorted result.
func TestVaultInventoryScan_DataLayout(t *testing.T) {
	t.Run("data layout — bucket y sub_bucket correctos", func(t *testing.T) {
		vault := t.TempDir()
		writeTestFile(t, vault, "data/raw/a.csv", "col1,col2\n1,2\n")
		writeTestFile(t, vault, "data/processed/b.parquet", "PAR1fakedata")

		files, err := VaultInventoryScan(vault, "vid", "vname")
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if n := len(files); n != 2 {
			t.Fatalf("expected 2 files, got %d", n)
		}

		// Sorted order puts data/processed/b.parquet before data/raw/a.csv.
		parquetFile, csvFile := files[0], files[1]

		if parquetFile.RelPath != "data/processed/b.parquet" {
			t.Errorf("files[0].RelPath = %q, want data/processed/b.parquet", parquetFile.RelPath)
		}
		if parquetFile.Bucket != "data" {
			t.Errorf("files[0].Bucket = %q, want data", parquetFile.Bucket)
		}
		if parquetFile.SubBucket != "processed" {
			t.Errorf("files[0].SubBucket = %q, want processed", parquetFile.SubBucket)
		}
		if parquetFile.Mime != "application/parquet" {
			t.Errorf("files[0].Mime = %q, want application/parquet", parquetFile.Mime)
		}
		if parquetFile.Ext != ".parquet" {
			t.Errorf("files[0].Ext = %q, want .parquet", parquetFile.Ext)
		}
		if parquetFile.VaultID != "vid" {
			t.Errorf("VaultID = %q, want vid", parquetFile.VaultID)
		}

		if csvFile.RelPath != "data/raw/a.csv" {
			t.Errorf("files[1].RelPath = %q, want data/raw/a.csv", csvFile.RelPath)
		}
		if csvFile.Mime != "text/csv" {
			t.Errorf("files[1].Mime = %q, want text/csv", csvFile.Mime)
		}
		if !(csvFile.Bucket == "data" && csvFile.SubBucket == "raw") {
			t.Errorf("files[1]: bucket=%q subBucket=%q, want data/raw", csvFile.Bucket, csvFile.SubBucket)
		}
	})
}
// TestVaultInventoryScan_KnowledgeLayout scans a vault holding a single
// Markdown file under knowledge/decisions and checks its path, bucket,
// sub-bucket and MIME type.
func TestVaultInventoryScan_KnowledgeLayout(t *testing.T) {
	t.Run("knowledge layout — bucket y sub_bucket correctos", func(t *testing.T) {
		vault := t.TempDir()
		writeTestFile(t, vault, "knowledge/decisions/x.md", "# Decision\n\ncontent")

		files, err := VaultInventoryScan(vault, "vid", "vname")
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if n := len(files); n != 1 {
			t.Fatalf("expected 1 file, got %d", n)
		}

		doc := files[0]
		if doc.RelPath != "knowledge/decisions/x.md" {
			t.Errorf("RelPath = %q", doc.RelPath)
		}
		if doc.Bucket != "knowledge" {
			t.Errorf("Bucket = %q, want knowledge", doc.Bucket)
		}
		if doc.SubBucket != "decisions" {
			t.Errorf("SubBucket = %q, want decisions", doc.SubBucket)
		}
		if doc.Mime != "text/markdown" {
			t.Errorf("Mime = %q, want text/markdown", doc.Mime)
		}
	})
}
// TestVaultInventoryScan_SkipsIndexAndGit plants the index database, its WAL
// sidecar and a .git directory next to one real data file, and expects the
// scan to return only the data file.
func TestVaultInventoryScan_SkipsIndexAndGit(t *testing.T) {
	t.Run("omite vault_index.db y .git", func(t *testing.T) {
		vault := t.TempDir()

		// Entries the scan must ignore.
		writeTestFile(t, vault, "vault_index.db", "sqlite data")
		writeTestFile(t, vault, "vault_index.db-wal", "wal data")
		writeTestFile(t, vault, ".git/HEAD", "ref: refs/heads/master")
		// The only entry that should be inventoried.
		writeTestFile(t, vault, "data/raw/real.csv", "a,b\n1,2\n")

		files, err := VaultInventoryScan(vault, "vid", "vname")
		if err != nil {
			t.Fatalf("unexpected error: %v", err)
		}
		if n := len(files); n != 1 {
			t.Fatalf("expected 1 file (real.csv), got %d: %v", n, relPaths(files))
		}
		if got := files[0].RelPath; got != "data/raw/real.csv" {
			t.Errorf("unexpected file: %q", got)
		}
	})
}
// TestVaultInventoryScan_Sha256Deterministic writes identical content into two
// separate vaults and expects both scans to report the same 64-character
// SHA-256 digest.
func TestVaultInventoryScan_Sha256Deterministic(t *testing.T) {
	t.Run("sha256 determinista para mismo contenido", func(t *testing.T) {
		dir1 := t.TempDir()
		dir2 := t.TempDir()
		content := "deterministic content 123\n"
		writeTestFile(t, dir1, "data/raw/f.csv", content)
		writeTestFile(t, dir2, "data/raw/f.csv", content)

		files1, err := VaultInventoryScan(dir1, "v1", "vault1")
		if err != nil {
			t.Fatal(err)
		}
		files2, err := VaultInventoryScan(dir2, "v2", "vault2")
		if err != nil {
			t.Fatal(err)
		}

		// Guard the [0] accesses below: an unexpectedly empty result should be
		// a clean test failure, not an index-out-of-range panic.
		if len(files1) != 1 || len(files2) != 1 {
			t.Fatalf("expected 1 file per vault, got %d and %d", len(files1), len(files2))
		}

		if files1[0].Sha256 != files2[0].Sha256 {
			t.Errorf("sha256 mismatch: %q vs %q", files1[0].Sha256, files2[0].Sha256)
		}
		// A hex-encoded SHA-256 digest is 32 bytes, i.e. 64 characters.
		if len(files1[0].Sha256) != 64 {
			t.Errorf("sha256 length = %d, want 64", len(files1[0].Sha256))
		}
	})
}
// TestVaultInventoryScan_Sorted writes four files in non-sorted order and
// expects the scan to return all of them in ascending RelPath order.
func TestVaultInventoryScan_Sorted(t *testing.T) {
	t.Run("orden lexicografico del resultado", func(t *testing.T) {
		dir := t.TempDir()
		writeTestFile(t, dir, "knowledge/decisions/z.md", "z")
		writeTestFile(t, dir, "data/raw/a.csv", "a")
		writeTestFile(t, dir, "data/processed/m.parquet", "m")
		writeTestFile(t, dir, "knowledge/domains/b.md", "b")

		files, err := VaultInventoryScan(dir, "v", "v")
		if err != nil {
			t.Fatal(err)
		}

		// Without this check the ordering loop below passes vacuously when the
		// scan returns zero or one file.
		if len(files) != 4 {
			t.Fatalf("expected 4 files, got %d: %v", len(files), relPaths(files))
		}

		for i := 1; i < len(files); i++ {
			if files[i].RelPath < files[i-1].RelPath {
				t.Errorf("not sorted at index %d: %q < %q", i, files[i].RelPath, files[i-1].RelPath)
			}
		}
	})
}
// relPaths projects a VaultFile slice onto its RelPath values, preserving
// order. It exists to make test failure messages readable.
func relPaths(files []VaultFile) []string {
	paths := make([]string, len(files))
	for idx := range files {
		paths[idx] = files[idx].RelPath
	}
	return paths
}
+252
View File
@@ -0,0 +1,252 @@
package infra
import (
"fmt"
"os"
"path/filepath"
"strings"
)
// LayoutReport is the outcome of a VaultLayoutEnsure run: what was done to the
// vault directory or, when DryRun is set, what would have been done.
// All paths in the slice fields are relative to VaultPath.
type LayoutReport struct {
	// VaultPath is the vault root the report refers to.
	VaultPath string `json:"vault_path"`
	// Created lists directories that were created.
	Created []string `json:"created"`
	// Migrated lists renames that were executed, each formatted "src -> dst".
	Migrated []string `json:"migrated"`
	// AlreadyOK lists entries that already existed at their target location.
	AlreadyOK []string `json:"already_ok"`
	// Skipped lists unrecognized root-level entries, which are left untouched.
	Skipped []string `json:"skipped"`
	// DryRun echoes the dryRun argument the report was produced with.
	DryRun bool `json:"dry_run"`
}
// Root-level names that VaultLayoutEnsure knows how to relocate.
var (
	// dataBuckets are the directory names that belong under data/.
	dataBuckets = []string{
		"raw",
		"processed",
		"exports",
	}

	// knowledgeBuckets are the directory names that belong under knowledge/.
	knowledgeBuckets = []string{
		"decisions",
		"domains",
		"models",
		"benchmarks",
		"test_documents",
	}

	// knownRootFiles are root-level file names that are moved into knowledge/.
	knownRootFiles = []string{
		"README.md",
		"README.txt",
	}
)
// VaultLayoutEnsure ensures a vault directory uses the canonical hybrid layout:
//
//	data/{raw,processed,exports}
//	knowledge/{decisions,domains,models,benchmarks,test_documents}
//
// Legacy vaults that have these directories at the root are migrated by renaming
// (or merging when both src and dst already exist). Root-level README.md and
// README.txt are moved to knowledge/README.md. The operation is idempotent:
// a second run returns everything in AlreadyOK.
//
// vaultPath may be absolute or relative and may be a symlink; trailing
// separators are ignored. It is resolved with filepath.Abs and
// filepath.EvalSymlinks, and the resolved path is reported in VaultPath.
//
// When dryRun is true the function computes the report but does not touch the disk.
//
// Errors are returned when the path cannot be resolved, does not exist or is
// not a directory, when a rename/mkdir fails, when both a root README and
// knowledge/README.md exist, or when a directory merge hits a name collision.
// The partially filled report is returned alongside the error.
//
// NOTE(review): README.md and README.txt share the destination
// knowledge/README.md. If both exist at the root, a real run moves the first
// and then fails with a conflict on the second, whereas a dry run (which never
// creates the destination) lists both as Migrated — confirm the intended
// behaviour.
func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error) {
	report := LayoutReport{DryRun: dryRun}

	// --- resolve path ---
	// Drop trailing separators, but never trim the path down to nothing: for
	// "/" that would leave "", which filepath.Abs resolves to the current
	// working directory instead of the filesystem root.
	if trimmed := strings.TrimRight(vaultPath, "/\\"); trimmed != "" {
		vaultPath = trimmed
	}
	// Keep the input in vaultPath until Abs succeeds so the error message can
	// still name the path the caller passed.
	absPath, err := filepath.Abs(vaultPath)
	if err != nil {
		return report, fmt.Errorf("vault_layout_ensure: abs(%q): %w", vaultPath, err)
	}
	vaultPath = absPath

	// Follow symlinks for the vault root itself.
	resolved, err := filepath.EvalSymlinks(vaultPath)
	if err != nil {
		return report, fmt.Errorf("vault_layout_ensure: eval symlinks %q: %w", vaultPath, err)
	}
	vaultPath = resolved
	report.VaultPath = vaultPath

	// --- check that vault exists and is a directory ---
	info, err := os.Stat(vaultPath)
	if err != nil {
		return report, fmt.Errorf("vault_layout_ensure: stat %q: %w", vaultPath, err)
	}
	if !info.IsDir() {
		return report, fmt.Errorf("vault_layout_ensure: %q is not a directory", vaultPath)
	}

	// --- ensure top-level containers ---
	// These must exist before any rename into data/ or knowledge/ below.
	for _, container := range []string{"data", "knowledge"} {
		dst := filepath.Join(vaultPath, container)
		if err := ensureDir(dst, dryRun, container, &report); err != nil {
			return report, err
		}
	}

	// --- build migration table: root name -> relative destination ---
	type migration struct {
		rootName string // name in vault root (dir or file)
		dstRel   string // relative destination path inside vault
		isFile   bool
	}
	var migrations []migration
	for _, b := range dataBuckets {
		migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("data", b)})
	}
	for _, b := range knowledgeBuckets {
		migrations = append(migrations, migration{rootName: b, dstRel: filepath.Join("knowledge", b)})
	}
	for _, rf := range knownRootFiles {
		migrations = append(migrations, migration{rootName: rf, dstRel: filepath.Join("knowledge", "README.md"), isFile: true})
	}

	// Track which root names are "known" so we can compute Skipped.
	// Names are stored lower-cased and compared lower-cased further down.
	knownNames := make(map[string]struct{})
	for _, m := range migrations {
		knownNames[strings.ToLower(m.rootName)] = struct{}{}
	}
	knownNames["data"] = struct{}{}
	knownNames["knowledge"] = struct{}{}

	// --- apply migrations ---
	for _, m := range migrations {
		src := filepath.Join(vaultPath, m.rootName)
		dst := filepath.Join(vaultPath, m.dstRel)
		srcRel := m.rootName
		dstRel := m.dstRel

		srcExists := pathExists(src)
		dstExists := pathExists(dst)

		switch {
		case srcExists && dstExists:
			// Both exist: merge if directory, error on file collision.
			if m.isFile {
				return report, fmt.Errorf("vault_layout_ensure: conflict: both %q and %q exist", srcRel, dstRel)
			}
			if err := mergeDirs(src, dst, srcRel, dstRel, dryRun, &report); err != nil {
				return report, err
			}

		case srcExists && !dstExists:
			// Only source exists: rename.
			report.Migrated = append(report.Migrated, fmt.Sprintf("%s -> %s", srcRel, dstRel))
			if !dryRun {
				if err := os.Rename(src, dst); err != nil {
					return report, fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", src, dst, err)
				}
			}

		case !srcExists && dstExists:
			// Already migrated.
			report.AlreadyOK = append(report.AlreadyOK, dstRel)

		default:
			// Neither exists: create empty destination directory (skip for files).
			if !m.isFile {
				report.Created = append(report.Created, dstRel)
				if !dryRun {
					if err := os.MkdirAll(dst, 0o755); err != nil {
						return report, fmt.Errorf("vault_layout_ensure: mkdir %q: %w", dst, err)
					}
				}
			}
		}
	}

	// --- collect skipped (unrecognized root entries) ---
	entries, err := os.ReadDir(vaultPath)
	if err != nil {
		return report, fmt.Errorf("vault_layout_ensure: readdir %q: %w", vaultPath, err)
	}
	for _, e := range entries {
		if _, known := knownNames[strings.ToLower(e.Name())]; !known {
			report.Skipped = append(report.Skipped, e.Name())
		}
	}

	return report, nil
}
// ensureDir records and, unless dryRun is set, creates one of the top-level
// container directories ("data" or "knowledge").
//
// If path already exists, rel is appended to report.AlreadyOK and nothing else
// happens. Existence is decided by pathExists, which does not distinguish
// directories from other entry types. Otherwise rel is appended to
// report.Created and the directory is created with mode 0o755, except in
// dry-run mode.
func ensureDir(path string, dryRun bool, rel string, report *LayoutReport) error {
	if !pathExists(path) {
		report.Created = append(report.Created, rel)
		if !dryRun {
			if mkErr := os.MkdirAll(path, 0o755); mkErr != nil {
				return fmt.Errorf("vault_layout_ensure: mkdir %q: %w", path, mkErr)
			}
		}
		return nil
	}

	report.AlreadyOK = append(report.AlreadyOK, rel)
	return nil
}
// mergeDirs moves every direct child of src into dst, then removes src if it
// ended up empty.
//
// No-overwrite policy: if any child name of src already exists in dst, the
// merge is refused with an error. The check runs over all children before
// anything is moved, so a conflict never leaves src half-merged.
//
// srcRel and dstRel are the vault-relative forms of src and dst; they are used
// for error messages and for the "src -> dst" entries appended to
// report.Migrated (one per child). With dryRun set, the entries are recorded
// but nothing on disk is renamed or removed.
func mergeDirs(src, dst, srcRel, dstRel string, dryRun bool, report *LayoutReport) error {
	children, err := os.ReadDir(src)
	if err != nil {
		return fmt.Errorf("vault_layout_ensure: readdir %q: %w", src, err)
	}

	// Pass 1: detect collisions up front, before touching the disk or the report.
	for _, child := range children {
		if pathExists(filepath.Join(dst, child.Name())) {
			return fmt.Errorf("vault_layout_ensure: merge conflict: %q already exists in %q (cannot overwrite %q)",
				child.Name(), dstRel, filepath.Join(srcRel, child.Name()))
		}
	}

	// Pass 2: record and perform the moves.
	for _, child := range children {
		childSrc := filepath.Join(src, child.Name())
		childDst := filepath.Join(dst, child.Name())
		childSrcRel := filepath.Join(srcRel, child.Name())
		childDstRel := filepath.Join(dstRel, child.Name())

		report.Migrated = append(report.Migrated, fmt.Sprintf("%s -> %s", childSrcRel, childDstRel))
		if !dryRun {
			if err := os.Rename(childSrc, childDst); err != nil {
				return fmt.Errorf("vault_layout_ensure: rename %q -> %q: %w", childSrc, childDst, err)
			}
		}
	}

	if dryRun {
		return nil
	}

	// Remove src only if it is really empty now; something may have been added
	// to it while the renames were running.
	remaining, err := os.ReadDir(src)
	if err != nil {
		return fmt.Errorf("vault_layout_ensure: readdir %q: %w", src, err)
	}
	if len(remaining) == 0 {
		if err := os.Remove(src); err != nil {
			return fmt.Errorf("vault_layout_ensure: remove empty src %q: %w", src, err)
		}
	}
	return nil
}
// pathExists returns true if path exists (any type).
func pathExists(path string) bool {
_, err := os.Lstat(path)
return err == nil
}
// dirIsEmpty returns true if a directory exists and has no entries.
func dirIsEmpty(path string) bool {
entries, err := os.ReadDir(path)
if err != nil {
return false
}
return len(entries) == 0
}
// _ prevents "declared but not used" if dirIsEmpty is only used in tests.
var _ = dirIsEmpty
// vaultLayoutKnownNames returns the set of root-level names that the layout
// migration manages: the two containers ("data", "knowledge"), every data and
// knowledge bucket name as declared, and every known root file name
// lower-cased. It is unexported and therefore only reachable from code in
// this package.
func vaultLayoutKnownNames() map[string]struct{} {
	names := map[string]struct{}{
		"data":      {},
		"knowledge": {},
	}

	for _, group := range [][]string{dataBuckets, knowledgeBuckets} {
		for _, bucket := range group {
			names[bucket] = struct{}{}
		}
	}
	for _, file := range knownRootFiles {
		names[strings.ToLower(file)] = struct{}{}
	}

	return names
}
+95
View File
@@ -0,0 +1,95 @@
---
name: vault_layout_ensure
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultLayoutEnsure(vaultPath string, dryRun bool) (LayoutReport, error)"
description: "Normaliza el layout de un vault al esquema hibrido canónico data/{raw,processed,exports} + knowledge/{decisions,domains,models,benchmarks,test_documents}. Migra directorios legacy en la raíz del vault a su ubicación correcta; idempotente."
tags: [vault, layout, migration, infra, filesystem, idempotent]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- "fmt"
- "os"
- "path/filepath"
- "strings"
params:
- name: vault_path
desc: "Ruta al directorio raíz del vault. Puede ser absoluta, relativa o un symlink — se resuelve con filepath.Abs + filepath.EvalSymlinks. Trailing slashes se ignoran."
- name: dry_run
desc: "Si true, calcula el reporte completo (qué se crearía, migraría, etc.) pero no modifica el disco. Util para previsualizar antes de ejecutar."
output: "LayoutReport con: VaultPath (ruta resuelta), Created (dirs creados), Migrated (renombres ejecutados, formato 'src -> dst'), AlreadyOK (destinos que ya existían), Skipped (entradas en raíz no reconocidas, no tocadas), DryRun (flag). Error si el path no existe, no es directorio, o hay conflicto de merge (mismo nombre de archivo en src y dst)."
tested: true
tests:
- "TestVaultLayoutEnsure_DryRun_NoChange"
- "TestVaultLayoutEnsure_FreshDir_CreatesLayout"
- "TestVaultLayoutEnsure_LegacyDataLayout_Migrates"
- "TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates"
- "TestVaultLayoutEnsure_AlreadyMigrated_Idempotent"
- "TestVaultLayoutEnsure_Mixed_PartialMigration"
- "TestVaultLayoutEnsure_MergeConflict_Errors"
- "TestVaultLayoutEnsure_UnknownFiles_Skipped"
- "TestVaultLayoutEnsure_NotADir_Errors"
test_file_path: "functions/infra/vault_layout_ensure_test.go"
file_path: "functions/infra/vault_layout_ensure.go"
---
## Ejemplo
```go
// Previsualizar sin tocar disco:
report, err := VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", true)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Would migrate: %v\n", report.Migrated)
fmt.Printf("Would create: %v\n", report.Created)
// Ejecutar la migración:
report, err = VaultLayoutEnsure("/home/lucas/vaults/turismo_spain", false)
if err != nil {
log.Fatalf("migration failed: %v", err)
}
fmt.Printf("Migrated: %v\n", report.Migrated)
fmt.Printf("Created: %v\n", report.Created)
fmt.Printf("Skipped: %v\n", report.Skipped)
```
## Comportamiento detallado
**Directorios gestionados:**
| Raíz (legacy) | Destino canónico |
|---|---|
| `raw/` | `data/raw/` |
| `processed/` | `data/processed/` |
| `exports/` | `data/exports/` |
| `decisions/` | `knowledge/decisions/` |
| `domains/` | `knowledge/domains/` |
| `models/` | `knowledge/models/` |
| `benchmarks/` | `knowledge/benchmarks/` |
| `test_documents/` | `knowledge/test_documents/` |
| `README.md` / `README.txt` | `knowledge/README.md` |
**Lógica de migración (por cada entrada conocida):**
- Solo `src` existe → rename atómico `src` → `dst`, registrado en `Migrated`.
- Solo `dst` existe → ya migrado, registrado en `AlreadyOK`.
- Ambos existen (dir) → merge: mueve cada hijo de `src/` a `dst/`; error si mismo nombre. Registrado en `Migrated` por hijo.
- Ambos existen (archivo README) → error inmediato con paths concretos.
- Ninguno existe → crea `dst` vacío, registrado en `Created`.
**Archivos/dirs no reconocidos** en la raíz (`.git`, `vault_index.db`, archivos custom) se registran en `Skipped` y no se tocan.
**Idempotencia:** segunda ejecución sobre un vault ya migrado reporta todo en `AlreadyOK` y no toca disco.
## Notas
`LayoutReport` es un tipo local de esta función (no un tipo del registry). El struct exportado vive en `functions/infra/vault_layout_ensure.go` junto con la función.
Para aplicar la migración a múltiples vaults en batch, invocar desde un pipeline que lea los paths de `vault.yaml` (ver `vault_manifest_read_go_infra`) y llame a `VaultLayoutEnsure` en cada uno.
+394
View File
@@ -0,0 +1,394 @@
package infra
import (
"os"
"path/filepath"
"testing"
)
// mkVaultDir builds a throwaway vault tree under t.TempDir() and returns its
// root.
//
// Each entry is a forward-slash path relative to the root. An entry ending in
// "/" is created as a directory; any other entry is created as a file holding
// the placeholder content "test\n", with missing parents created first.
// Entries must be non-empty. Any failure aborts the calling test.
func mkVaultDir(t *testing.T, entries []string) string {
	t.Helper()
	root := t.TempDir()

	for _, entry := range entries {
		target := filepath.Join(root, filepath.FromSlash(entry))
		wantsDir := entry[len(entry)-1] == '/'

		if wantsDir {
			if err := os.MkdirAll(target, 0o755); err != nil {
				t.Fatalf("mkVaultDir: mkdir %q: %v", target, err)
			}
			continue
		}

		if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
			t.Fatalf("mkVaultDir: mkdir parent %q: %v", target, err)
		}
		if err := os.WriteFile(target, []byte("test\n"), 0o644); err != nil {
			t.Fatalf("mkVaultDir: write %q: %v", target, err)
		}
	}

	return root
}
// TestVaultLayoutEnsure_DryRun_NoChange runs a dry run over a legacy layout and
// expects the report to carry the DryRun flag and a non-empty migration plan
// while the directory snapshot stays identical.
func TestVaultLayoutEnsure_DryRun_NoChange(t *testing.T) {
	vault := mkVaultDir(t, []string{
		"raw/",
		"raw/file1.csv",
		"processed/",
	})

	snapBefore := snapshotDir(t, vault)
	plan, err := VaultLayoutEnsure(vault, true)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	snapAfter := snapshotDir(t, vault)

	if !plan.DryRun {
		t.Error("DryRun flag not set in report")
	}
	if !mapEqual(snapBefore, snapAfter) {
		t.Errorf("dry-run modified disk: before=%v after=%v", snapBefore, snapAfter)
	}

	// raw and processed sit at the root, so the plan must contain migrations.
	if len(plan.Migrated) == 0 {
		t.Error("expected Migrated to be non-empty in dry-run plan")
	}
}
// TestVaultLayoutEnsure_FreshDir_CreatesLayout runs the function on an empty
// vault and expects every standard directory to be reported in Created and to
// exist on disk as a directory.
func TestVaultLayoutEnsure_FreshDir_CreatesLayout(t *testing.T) {
	root := mkVaultDir(t, []string{}) // empty vault

	report, err := VaultLayoutEnsure(root, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// Expected layout: the two top-level buckets followed by their children.
	wantCreated := []string{"data", "knowledge"}
	for _, sub := range []string{"raw", "processed", "exports"} {
		wantCreated = append(wantCreated, filepath.Join("data", sub))
	}
	for _, sub := range []string{"decisions", "domains", "models", "benchmarks", "test_documents"} {
		wantCreated = append(wantCreated, filepath.Join("knowledge", sub))
	}

	// Every expected directory must be listed in the report...
	reported := toSet(report.Created)
	for _, dir := range wantCreated {
		if _, found := reported[dir]; !found {
			t.Errorf("expected Created to contain %q, got %v", dir, report.Created)
		}
	}

	// ...and must really exist on disk as a directory.
	for _, dir := range wantCreated {
		full := filepath.Join(root, dir)
		info, statErr := os.Stat(full)
		switch {
		case statErr != nil:
			t.Errorf("expected %q to exist: %v", full, statErr)
		case !info.IsDir():
			t.Errorf("%q should be a directory", full)
		}
	}
}
// TestVaultLayoutEnsure_LegacyDataLayout_Migrates seeds root-level raw/,
// processed/ and exports/ directories and expects raw and processed to be
// reported as migrated under data/, their files to be present at the new
// location, and the legacy directories to be gone.
func TestVaultLayoutEnsure_LegacyDataLayout_Migrates(t *testing.T) {
	root := mkVaultDir(t, []string{
		"raw/",
		"raw/file1.parquet",
		"raw/file2.parquet",
		"processed/",
		"processed/clean.csv",
		"exports/",
	})

	report, err := VaultLayoutEnsure(root, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// raw and processed should appear in Migrated (as dirs, top-level rename).
	migrated := toSet(report.Migrated)
	wantPairs := []string{
		"raw -> " + filepath.Join("data", "raw"),
		"processed -> " + filepath.Join("data", "processed"),
	}
	for _, pair := range wantPairs {
		if _, found := migrated[pair]; !found {
			t.Errorf("expected Migrated to contain %q, got %v", pair, report.Migrated)
		}
	}

	// Files must have moved.
	movedFiles := []string{
		filepath.Join("data", "raw", "file1.parquet"),
		filepath.Join("data", "raw", "file2.parquet"),
		filepath.Join("data", "processed", "clean.csv"),
	}
	for _, rel := range movedFiles {
		_, statErr := os.Stat(filepath.Join(root, rel))
		if statErr != nil {
			t.Errorf("expected %q to exist after migration: %v", rel, statErr)
		}
	}

	// Old dirs must be gone.
	for _, legacy := range []string{"raw", "processed"} {
		if pathExists(filepath.Join(root, legacy)) {
			t.Errorf("expected legacy dir %q to be removed", legacy)
		}
	}
}
// TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates seeds root-level
// decisions/, models/ and README.md and expects all three to be reported as
// migrated under knowledge/ and to exist at their new location.
func TestVaultLayoutEnsure_LegacyKnowledgeLayout_Migrates(t *testing.T) {
	root := mkVaultDir(t, []string{
		"decisions/",
		"decisions/2024-01.md",
		"models/",
		"models/ner_v1.pkl",
		"README.md",
	})

	report, err := VaultLayoutEnsure(root, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// decisions, models and the README should appear in Migrated.
	migrated := toSet(report.Migrated)
	wantPairs := []string{
		"decisions -> " + filepath.Join("knowledge", "decisions"),
		"models -> " + filepath.Join("knowledge", "models"),
		"README.md -> " + filepath.Join("knowledge", "README.md"),
	}
	for _, pair := range wantPairs {
		if _, found := migrated[pair]; !found {
			t.Errorf("expected Migrated to contain %q, got %v", pair, report.Migrated)
		}
	}

	// Files must be at new location.
	movedFiles := []string{
		filepath.Join("knowledge", "decisions", "2024-01.md"),
		filepath.Join("knowledge", "models", "ner_v1.pkl"),
		filepath.Join("knowledge", "README.md"),
	}
	for _, rel := range movedFiles {
		_, statErr := os.Stat(filepath.Join(root, rel))
		if statErr != nil {
			t.Errorf("expected %q to exist after migration: %v", rel, statErr)
		}
	}
}
// TestVaultLayoutEnsure_AlreadyMigrated_Idempotent runs the function twice on
// a vault that already has the standard layout. Neither run may report a
// migration, the second run must not change the set of paths on disk, and it
// must list the existing directories in AlreadyOK.
func TestVaultLayoutEnsure_AlreadyMigrated_Idempotent(t *testing.T) {
	layout := []string{
		"data/",
		"data/raw/",
		"data/raw/file.csv",
		"data/processed/",
		"data/exports/",
		"knowledge/",
		"knowledge/decisions/",
		"knowledge/domains/",
		"knowledge/models/",
		"knowledge/benchmarks/",
		"knowledge/test_documents/",
	}
	root := mkVaultDir(t, layout)

	first, err := VaultLayoutEnsure(root, false)
	if err != nil {
		t.Fatalf("first run error: %v", err)
	}
	if n := len(first.Migrated); n != 0 {
		t.Errorf("first run on fully-migrated vault should have no migrations, got %v", first.Migrated)
	}

	pre := snapshotDir(t, root)
	second, err := VaultLayoutEnsure(root, false)
	if err != nil {
		t.Fatalf("second run error: %v", err)
	}
	post := snapshotDir(t, root)

	if !mapEqual(pre, post) {
		t.Error("second run modified disk (not idempotent)")
	}
	if n := len(second.Migrated); n != 0 {
		t.Errorf("second run should produce no migrations, got %v", second.Migrated)
	}
	if n := len(second.AlreadyOK); n == 0 {
		t.Error("second run should report existing dirs as AlreadyOK")
	}
}
// TestVaultLayoutEnsure_Mixed_PartialMigration covers a half-migrated vault:
// data/raw is already in place while exports/ and decisions/ still sit at the
// root. data/raw must be reported as AlreadyOK and the two legacy directories
// as migrated.
func TestVaultLayoutEnsure_Mixed_PartialMigration(t *testing.T) {
	// data/raw already migrated; exports still at root; knowledge dirs in legacy positions.
	root := mkVaultDir(t, []string{
		"data/",
		"data/raw/",
		"data/raw/already_here.csv",
		"exports/",
		"exports/report.pdf",
		"decisions/",
		"decisions/2023-note.md",
	})

	report, err := VaultLayoutEnsure(root, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// data/raw should be AlreadyOK.
	if !sliceContains(report.AlreadyOK, filepath.Join("data", "raw")) {
		t.Errorf("data/raw should be AlreadyOK, got AlreadyOK=%v", report.AlreadyOK)
	}

	// exports should be migrated.
	wantExports := "exports -> " + filepath.Join("data", "exports")
	if !sliceContains(report.Migrated, wantExports) {
		t.Errorf("exports should be migrated, Migrated=%v", report.Migrated)
	}

	// decisions should be migrated.
	wantDecisions := "decisions -> " + filepath.Join("knowledge", "decisions")
	if !sliceContains(report.Migrated, wantDecisions) {
		t.Errorf("decisions should be migrated, Migrated=%v", report.Migrated)
	}
}
// TestVaultLayoutEnsure_MergeConflict_Errors seeds the same file name under
// both the legacy raw/ directory and the already-present data/raw/ and expects
// the call to fail with an error mentioning the conflict or the file name.
func TestVaultLayoutEnsure_MergeConflict_Errors(t *testing.T) {
	// Both src (raw/) and dst (data/raw/) exist and have a file with the same name.
	root := mkVaultDir(t, []string{
		"raw/",
		"raw/collision.csv",
		"data/",
		"data/raw/",
		"data/raw/collision.csv", // same name -> conflict
	})

	_, err := VaultLayoutEnsure(root, false)
	if err == nil {
		t.Fatal("expected error for merge conflict, got nil")
	}

	// Either hint is acceptable; only complain when both are missing.
	msg := err.Error()
	if !(contains(msg, "conflict") || contains(msg, "collision.csv")) {
		t.Errorf("error should mention conflict or the file name, got: %v", err)
	}
}
// TestVaultLayoutEnsure_UnknownFiles_Skipped checks that unrecognized root
// entries (.git, vault_index.db, a custom text file) are listed in Skipped,
// while the known legacy bucket raw is not.
func TestVaultLayoutEnsure_UnknownFiles_Skipped(t *testing.T) {
	root := mkVaultDir(t, []string{
		".git/",
		"vault_index.db",
		"my_custom_notes.txt",
		"raw/",
	})

	report, err := VaultLayoutEnsure(root, false)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	skipped := toSet(report.Skipped)
	unknown := []string{".git", "vault_index.db", "my_custom_notes.txt"}
	for _, name := range unknown {
		if _, found := skipped[name]; !found {
			t.Errorf("expected %q in Skipped, got %v", name, report.Skipped)
		}
	}

	// raw should NOT be in Skipped (it's a known bucket).
	if _, found := skipped["raw"]; found {
		t.Error("raw should not appear in Skipped — it is a known bucket")
	}
}
// TestVaultLayoutEnsure_NotADir_Errors checks that an invalid vault path is
// rejected: a path that does not exist, and a path that points at a regular
// file (whose error must mention "not a directory").
//
// Both fixtures live under t.TempDir() so the test does not depend on a
// hard-coded /tmp location and leaves nothing behind in the shared temp dir.
func TestVaultLayoutEnsure_NotADir_Errors(t *testing.T) {
	t.Run("non-existent path", func(t *testing.T) {
		// A child of a fresh temp dir is guaranteed not to exist.
		missing := filepath.Join(t.TempDir(), "does_not_exist_fn_registry_test_xyz")
		_, err := VaultLayoutEnsure(missing, false)
		if err == nil {
			t.Fatal("expected error for non-existent path")
		}
	})
	t.Run("path is a file", func(t *testing.T) {
		file := filepath.Join(t.TempDir(), "vault_layout.txt")
		if err := os.WriteFile(file, nil, 0o644); err != nil {
			t.Fatal(err)
		}
		_, err := VaultLayoutEnsure(file, false)
		if err == nil {
			t.Fatal("expected error when vaultPath is a file, not a dir")
		}
		if !contains(err.Error(), "not a directory") {
			t.Errorf("error should mention 'not a directory', got: %v", err)
		}
	})
}
// --- helpers ---
// snapshotDir walks root and records every visited entry, keyed by its path
// relative to root (the root itself is recorded as "."). Only the presence of
// a path is captured; file contents and metadata are not. A walk error aborts
// the test.
func snapshotDir(t *testing.T, root string) map[string]bool {
	t.Helper()
	seen := map[string]bool{}
	visit := func(path string, _ os.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		rel, _ := filepath.Rel(root, path)
		seen[rel] = true
		return nil
	}
	if err := filepath.WalkDir(root, visit); err != nil {
		t.Fatalf("snapshotDir: %v", err)
	}
	return seen
}
// mapEqual reports whether a and b hold exactly the same keys with the same
// values. A key stored with the value false is compared like any other entry:
// it must be present in both maps with the value false.
func mapEqual(a, b map[string]bool) bool {
	if len(a) != len(b) {
		return false
	}
	for k, av := range a {
		// Distinguish "missing in b" from "present with value false".
		bv, ok := b[k]
		if !ok || av != bv {
			return false
		}
	}
	return true
}
// toSet converts a string slice into a set keyed by its elements; duplicates
// collapse into a single key.
func toSet(ss []string) map[string]struct{} {
	set := make(map[string]struct{}, len(ss))
	for i := range ss {
		set[ss[i]] = struct{}{}
	}
	return set
}
// sliceContains reports whether target is equal to at least one element of ss.
func sliceContains(ss []string, target string) bool {
	for i := 0; i < len(ss); i++ {
		if ss[i] == target {
			return true
		}
	}
	return false
}
// contains reports whether sub occurs anywhere inside s. The empty string is
// contained in every string, including the empty one.
func contains(s, sub string) bool {
	n := len(sub)
	// Slide a window of len(sub) bytes over s and compare each position.
	for start := 0; start+n <= len(s); start++ {
		if s[start:start+n] == sub {
			return true
		}
	}
	return false
}
+96
View File
@@ -0,0 +1,96 @@
package infra
import (
"fmt"
"os"
"path/filepath"
"strings"
"gopkg.in/yaml.v3"
)
// VaultManifestEntry is a single vault entry parsed from a projects/<proj>/vaults/vault.yaml.
// Name, Description, Path and Tags are copied from the YAML entry without any
// validation or normalization; ProjectID and ManifestFile are derived from the
// location of the manifest file.
type VaultManifestEntry struct {
	ProjectID string // project directory name, inferred from the manifest path by inferProjectID
	Name string // vault name as declared in vault.yaml
	Description string // human description
	Path string // vault directory path exactly as written in vault.yaml (not resolved or made absolute here)
	Tags []string // tags declared in vault.yaml
	ManifestFile string // path of the vault.yaml this entry came from, built from repoRoot (absolute when repoRoot is absolute)
}
// vaultYAML mirrors the vault.yaml schema (only the fields we care about):
// a top-level "vaults" list whose items carry name, description, path and tags.
// Keys not listed here have no destination field in this struct.
type vaultYAML struct {
	Vaults []struct {
		Name string `yaml:"name"`
		Description string `yaml:"description"`
		Path string `yaml:"path"`
		Tags []string `yaml:"tags"`
	} `yaml:"vaults"`
}
// VaultManifestRead collects every <repoRoot>/projects/<proj>/vaults/vault.yaml,
// parses each manifest and returns a flat slice of VaultManifestEntry, ordered
// by project directory name.
//
// The project directories are enumerated with os.ReadDir instead of
// filepath.Glob, so a repoRoot that contains glob metacharacters ('*', '?',
// '[', '\') is treated as a literal path rather than as part of a pattern.
//
// Rules:
//   - If a manifest cannot be read or parsed, the error from parseVaultManifest
//     (which names the file) is returned immediately and no entries are returned.
//   - If no manifests are found, a zero-length slice is returned (not an error).
//     This includes a missing or unreadable projects/ directory, which is
//     treated as "nothing to read", matching the behaviour of the previous
//     glob-based lookup.
//   - ProjectID is inferred from the manifest path by inferProjectID.
func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error) {
	projectsDir := filepath.Join(repoRoot, "projects")

	// ReadDir returns the entries sorted by name. On error it returns whatever
	// it managed to read; those entries are still processed (best effort).
	projects, _ := os.ReadDir(projectsDir)

	var out []VaultManifestEntry
	for _, proj := range projects {
		manifestPath := filepath.Join(projectsDir, proj.Name(), "vaults", "vault.yaml")
		// Skip projects that do not declare a vault manifest.
		if _, err := os.Lstat(manifestPath); err != nil {
			continue
		}
		entries, err := parseVaultManifest(manifestPath)
		if err != nil {
			return nil, err
		}
		out = append(out, entries...)
	}
	return out, nil
}
// parseVaultManifest reads one vault.yaml and converts each item of its
// "vaults" list into a VaultManifestEntry. Read and parse failures are wrapped
// with the manifest path. A manifest without vault items yields an empty,
// non-nil slice.
func parseVaultManifest(manifestPath string) ([]VaultManifestEntry, error) {
	content, readErr := os.ReadFile(manifestPath)
	if readErr != nil {
		return nil, fmt.Errorf("vault_manifest_read: read %q: %w", manifestPath, readErr)
	}

	var doc vaultYAML
	if parseErr := yaml.Unmarshal(content, &doc); parseErr != nil {
		return nil, fmt.Errorf("vault_manifest_read: parse %q: %w", manifestPath, parseErr)
	}

	// Every entry of this manifest shares the same project and source file.
	project := inferProjectID(manifestPath)
	result := make([]VaultManifestEntry, len(doc.Vaults))
	for i, item := range doc.Vaults {
		result[i] = VaultManifestEntry{
			ProjectID:    project,
			Name:         item.Name,
			Description:  item.Description,
			Path:         item.Path,
			Tags:         item.Tags,
			ManifestFile: manifestPath,
		}
	}
	return result, nil
}
// inferProjectID extracts the project directory name from a path of the form
// .../projects/<proj>/vaults/vault.yaml.
//
// The canonical shape is matched from the END of the path, so a directory
// named "projects" higher up (for example a repository checked out under
// ~/projects/) is not mistaken for the registry's own projects/ folder.
// For any other shape the function falls back to the component that follows
// the first "projects" element, and returns "" when there is none.
func inferProjectID(manifestPath string) string {
	// Normalize separators and split.
	parts := strings.Split(filepath.ToSlash(manifestPath), "/")
	n := len(parts)

	// Canonical layout: [..., "projects", <proj>, "vaults", <file>].
	if n >= 4 && parts[n-4] == "projects" && parts[n-2] == "vaults" {
		return parts[n-3]
	}

	// Non-canonical path: keep the historical forward scan.
	for i, p := range parts {
		if p == "projects" && i+1 < n {
			return parts[i+1]
		}
	}
	return ""
}
+59
View File
@@ -0,0 +1,59 @@
---
name: vault_manifest_read
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultManifestRead(repoRoot string) ([]VaultManifestEntry, error)"
description: "Lee todos los manifests vault.yaml bajo projects/*/vaults/ del repo y devuelve una lista plana de entradas de vault con su ProjectID inferido del path."
tags: [vault, manifest, yaml, infra, projects, storage]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: "error_go_core"
imports:
- "fmt"
- "os"
- "path/filepath"
- "strings"
- "gopkg.in/yaml.v3"
params:
- name: repoRoot
desc: "Ruta absoluta a la raiz del repositorio fn_registry. Se usa como base para el glob projects/*/vaults/vault.yaml."
output: "Slice plano de VaultManifestEntry (ProjectID, Name, Description, Path, Tags, ManifestFile). Vacio si no hay manifests. Error si un yaml no parsea, con el path concreto en el mensaje."
tested: true
tests:
- "TestVaultManifestRead_HappyPath"
- "TestVaultManifestRead_MalformedYAML"
- "TestVaultManifestRead_EmptyDir"
test_file_path: "functions/infra/vault_manifest_read_test.go"
file_path: "functions/infra/vault_manifest_read.go"
---
## Ejemplo
```go
entries, err := VaultManifestRead("/home/lucas/fn_registry")
if err != nil {
log.Fatal(err)
}
for _, e := range entries {
fmt.Printf("%s/%s -> %s\n", e.ProjectID, e.Name, e.Path)
}
// app_turismo/turismo_spain -> /home/lucas/vaults/turismo_spain
// app_finance/finance_data -> /home/lucas/vaults/finance_data
```
## Notas
`VaultManifestEntry` es un tipo local de esta funcion (no un tipo del registry). Contiene:
- `ProjectID` — basename del directorio `projects/<proj>/`, inferido del path del manifest.
- `Name`, `Description`, `Path`, `Tags` — copiados del yaml tal cual.
- `ManifestFile` — path absoluto al vault.yaml de origen, util para mensajes de error y trazabilidad.
El parseo usa `gopkg.in/yaml.v3` (ya en go.mod). Si un manifest falla, la funcion devuelve
error inmediatamente con el path del fichero problemático. Los manifests sin entradas
`vaults:` contribuyen cero entries (no es error). Si no existe ningun `projects/*/vaults/vault.yaml`
el resultado es slice vacio sin error.
+113
View File
@@ -0,0 +1,113 @@
package infra
import (
"os"
"path/filepath"
"testing"
)
// TestVaultManifestRead_HappyPath writes two manifests (two vaults under
// app_turismo, one under app_finance) and checks that VaultManifestRead returns
// three entries with the expected ProjectID, Path, Tags and a non-empty
// ManifestFile.
func TestVaultManifestRead_HappyPath(t *testing.T) {
	root := t.TempDir()
	// First project: two vault entries in a single manifest.
	writeManifest(t, root, "app_turismo", `
vaults:
- name: turismo_spain
description: "Datos de turismo en Espana"
path: "/home/lucas/vaults/turismo_spain"
tags: [turismo, espana]
- name: turismo_raw
description: "Datos brutos sin procesar"
path: "/home/lucas/vaults/turismo_raw"
tags: [raw]
`)
	// Second project: a single vault entry.
	writeManifest(t, root, "app_finance", `
vaults:
- name: finance_data
description: "Datos financieros"
path: "/home/lucas/vaults/finance_data"
tags: [finance]
`)
	entries, err := VaultManifestRead(root)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	// 2 entries from app_turismo + 1 from app_finance.
	if len(entries) != 3 {
		t.Fatalf("got %d entries, want 3", len(entries))
	}
	// Build index by name for order-independent assertions.
	byName := make(map[string]VaultManifestEntry, len(entries))
	for _, e := range entries {
		byName[e.Name] = e
	}
	// Check turismo_spain entry.
	e, ok := byName["turismo_spain"]
	if !ok {
		t.Fatal("missing entry 'turismo_spain'")
	}
	if e.ProjectID != "app_turismo" {
		t.Errorf("turismo_spain.ProjectID = %q, want %q", e.ProjectID, "app_turismo")
	}
	if e.Path != "/home/lucas/vaults/turismo_spain" {
		t.Errorf("turismo_spain.Path = %q, want %q", e.Path, "/home/lucas/vaults/turismo_spain")
	}
	// Only the tag count and the first tag are asserted here.
	if len(e.Tags) != 2 || e.Tags[0] != "turismo" {
		t.Errorf("turismo_spain.Tags = %v, want [turismo espana]", e.Tags)
	}
	if e.ManifestFile == "" {
		t.Error("turismo_spain.ManifestFile is empty")
	}
	// Check finance_data entry belongs to app_finance.
	ef, ok := byName["finance_data"]
	if !ok {
		t.Fatal("missing entry 'finance_data'")
	}
	if ef.ProjectID != "app_finance" {
		t.Errorf("finance_data.ProjectID = %q, want %q", ef.ProjectID, "app_finance")
	}
}
// TestVaultManifestRead_MalformedYAML writes a manifest whose name value opens
// a flow sequence ("[") that is never closed and expects VaultManifestRead to
// return an error. Only the presence of an error is asserted, not its text.
func TestVaultManifestRead_MalformedYAML(t *testing.T) {
	root := t.TempDir()
	writeManifest(t, root, "bad_project", `
vaults:
- name: [invalid yaml
path: missing_bracket
`)
	_, err := VaultManifestRead(root)
	if err == nil {
		t.Fatal("expected error for malformed YAML, got nil")
	}
}
// TestVaultManifestRead_EmptyDir checks that a repository root without any
// projects/ directory yields zero entries and no error.
func TestVaultManifestRead_EmptyDir(t *testing.T) {
	// The temp dir is left empty on purpose: there is nothing to discover.
	entries, err := VaultManifestRead(t.TempDir())
	if err != nil {
		t.Fatalf("unexpected error for empty dir: %v", err)
	}
	if n := len(entries); n != 0 {
		t.Fatalf("got %d entries, want 0", n)
	}
}
// writeManifest writes content to <root>/projects/<proj>/vaults/vault.yaml,
// creating the intermediate directories first. Any filesystem error aborts the
// calling test.
func writeManifest(t *testing.T, root, proj, content string) {
	t.Helper()
	vaultsDir := filepath.Join(root, "projects", proj, "vaults")
	mkErr := os.MkdirAll(vaultsDir, 0o755)
	if mkErr != nil {
		t.Fatalf("mkdir %s: %v", vaultsDir, mkErr)
	}
	manifest := filepath.Join(vaultsDir, "vault.yaml")
	writeErr := os.WriteFile(manifest, []byte(content), 0o644)
	if writeErr != nil {
		t.Fatalf("write %s: %v", manifest, writeErr)
	}
}
+265
View File
@@ -0,0 +1,265 @@
package infra
import (
"database/sql"
"fmt"
"path/filepath"
"strings"
)
// VaultSearchHit is a single result returned by VaultSearch.
// RelPath, Size, Mtime, Mime, Bucket and SubBucket are read from the matching
// row of the files table; VaultPath and VaultName describe the vault that was
// searched.
type VaultSearchHit struct {
	VaultPath string `json:"vault_path"` // vaultPath exactly as passed to VaultSearch
	VaultName string `json:"vault_name"` // basename of VaultPath (after resolving symlinks)
	RelPath string `json:"rel_path"` // files.rel_path; also the key used to deduplicate hits
	Size int64 `json:"size"`
	Mtime int64 `json:"mtime"` // presumably Unix seconds (the tests seed it with time.Now().Unix()) — confirm against the indexer
	Mime string `json:"mime"`
	Bucket string `json:"bucket"`
	SubBucket string `json:"sub_bucket"`
	Snippet string `json:"snippet"` // always "" in this file: both the FTS5 and the LIKE strategy set it to the empty string
}
// VaultSearch searches the vault_index.db of the vault at vaultPath for files
// matching query and returns at most limit hits.
//
// Behaviour:
//  1. A limit <= 0 is replaced by the default of 50.
//  2. The index is opened via VaultIndexOpen and closed before returning.
//  3. VaultName of every hit is the basename of vaultPath after resolving
//     symlinks (see resolveVaultName).
//  4. The actual lookup is delegated to vaultSearchCombined, which merges an
//     FTS5 MATCH over files_fts with a LIKE search over files.rel_path and
//     deduplicates by rel_path.
//
// Errors from opening the index or from the combined search are wrapped with
// the "vault_search:" prefix.
func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error) {
	const defaultLimit = 50
	if limit < 1 {
		limit = defaultLimit
	}

	db, openErr := VaultIndexOpen(vaultPath)
	if openErr != nil {
		return nil, fmt.Errorf("vault_search: open index: %w", openErr)
	}
	defer db.Close()

	hits, searchErr := vaultSearchCombined(db, vaultPath, resolveVaultName(vaultPath), query, limit)
	if searchErr != nil {
		return nil, fmt.Errorf("vault_search: %w", searchErr)
	}
	return hits, nil
}
// vaultSearchCombined runs two search strategies and merges their results,
// deduplicated by rel_path and capped at limit:
//
//  1. FTS5 MATCH on files_fts, using the query prepared by safeFTSQuery.
//     Its hits come first. If this strategy fails, its hits are dropped and
//     the error is kept only to decide whether the whole search failed.
//  2. LIKE on files.rel_path, using the token extracted by simplifyForLike.
//     It runs only while fewer than limit hits were collected and the token
//     is non-empty.
//
// An error is returned only when the LIKE strategy ran and both strategies
// failed. The returned slice is never nil.
func vaultSearchCombined(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
	seen := make(map[string]struct{})
	var hits []VaultSearchHit

	// collect appends unseen hits from batch until limit is reached.
	collect := func(batch []VaultSearchHit) {
		for _, h := range batch {
			if len(hits) >= limit {
				return
			}
			if _, dup := seen[h.RelPath]; dup {
				continue
			}
			seen[h.RelPath] = struct{}{}
			hits = append(hits, h)
		}
	}

	// Strategy 1: FTS5 MATCH. A failure (for example a query the FTS5 parser
	// rejects) is tolerated here; the LIKE strategy below still runs.
	ftsHits, ftsErr := vaultSearchFTSContent(db, vaultPath, vaultName, safeFTSQuery(query), limit)
	if ftsErr == nil {
		collect(ftsHits)
	}

	// Strategy 2: LIKE on rel_path, driven by the first word-like token of the
	// query so that inputs such as "foo:bar:" still produce a usable pattern.
	if token := simplifyForLike(query); token != "" && len(hits) < limit {
		// Every collected hit has exactly one entry in seen, so the original
		// budget "remaining + len(seen)" is always equal to limit. Asking for
		// limit rows leaves room for rows that duplicate FTS hits.
		likeHits, likeErr := vaultSearchLike(db, vaultPath, vaultName, token, limit)
		if ftsErr != nil && likeErr != nil {
			// Both failed — return a combined error.
			return nil, fmt.Errorf("fts: %v; like: %v", ftsErr, likeErr)
		}
		collect(likeHits)
	}

	if hits == nil {
		return []VaultSearchHit{}, nil
	}
	return hits, nil
}
// vaultSearchFTSContent runs a MATCH against files_fts and maps each matching
// FTS rowid to the row of the files table that has the same rowid.
//
// Design note (from the original author): files_fts is declared contentless
// (content=''), so its columns cannot be read back; only the rowid is usable.
// The correlation therefore relies on files_fts and files sharing rowids,
// which presumably holds as long as profilers update content_text in place
// instead of deleting and re-inserting FTS rows — confirm against the indexer.
//
// FTS rowids without a matching files row — or whose lookup fails for any
// other reason — are skipped silently. Snippet is left empty on every hit.
// Returns (nil, nil) when the MATCH yields no rows.
func vaultSearchFTSContent(db *sql.DB, vaultPath, vaultName, safeQuery string, limit int) ([]VaultSearchHit, error) {
	// Step 1: matching rowids, best rank first.
	const qRowids = `
SELECT rowid
FROM files_fts
WHERE files_fts MATCH ?
ORDER BY rank
LIMIT ?`
	// Step 2: metadata lookup by rowid (files has a TEXT primary key, so its
	// rowid is the implicit one).
	const qFile = `
SELECT rel_path, size, mtime, mime, bucket, sub_bucket
FROM files WHERE rowid = ?`

	cursor, err := db.Query(qRowids, safeQuery, limit)
	if err != nil {
		return nil, err
	}
	defer cursor.Close()

	var matched []int64
	for cursor.Next() {
		var id int64
		if scanErr := cursor.Scan(&id); scanErr != nil {
			return nil, scanErr
		}
		matched = append(matched, id)
	}
	if iterErr := cursor.Err(); iterErr != nil {
		return nil, iterErr
	}
	if len(matched) == 0 {
		return nil, nil
	}

	var hits []VaultSearchHit
	for _, id := range matched {
		hit := VaultSearchHit{VaultPath: vaultPath, VaultName: vaultName}
		row := db.QueryRow(qFile, id)
		if row.Scan(&hit.RelPath, &hit.Size, &hit.Mtime, &hit.Mime, &hit.Bucket, &hit.SubBucket) != nil {
			// rowid mismatch (happens after update cycles) — skip gracefully.
			continue
		}
		hits = append(hits, hit)
	}
	return hits, nil
}
// vaultSearchLike returns the files whose rel_path contains query as a literal
// substring, newest first (ORDER BY mtime DESC), capped at limit rows.
//
// The LIKE wildcards '%' and '_' inside query are escaped, so a token such as
// "snake_case" matches only paths that contain an actual underscore instead of
// any single character. Case sensitivity is whatever SQLite's LIKE provides.
//
// Snippet is always empty for these hits. If iterating the rows fails, the
// hits scanned so far are returned together with the error.
func vaultSearchLike(db *sql.DB, vaultPath, vaultName, query string, limit int) ([]VaultSearchHit, error) {
	const qLike = `
SELECT rel_path, size, mtime, mime, bucket, sub_bucket
FROM files
WHERE rel_path LIKE '%' || ? || '%' ESCAPE '\'
ORDER BY mtime DESC
LIMIT ?`

	// Escape the escape character first, then the two LIKE wildcards.
	pattern := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(query)

	rows, err := db.Query(qLike, pattern, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var hits []VaultSearchHit
	for rows.Next() {
		var h VaultSearchHit
		if err := rows.Scan(&h.RelPath, &h.Size, &h.Mtime, &h.Mime, &h.Bucket, &h.SubBucket); err != nil {
			return nil, err
		}
		h.VaultPath = vaultPath
		h.VaultName = vaultName
		h.Snippet = ""
		hits = append(hits, h)
	}
	return hits, rows.Err()
}
// resolveVaultName yields the last path element of vaultPath once symlinks are
// resolved. When resolution fails (for example because the path does not
// exist) the last element of vaultPath itself is used.
func resolveVaultName(vaultPath string) string {
	if target, err := filepath.EvalSymlinks(vaultPath); err == nil {
		return filepath.Base(target)
	}
	return filepath.Base(vaultPath)
}
// safeFTSQuery prepares a user query for an FTS5 MATCH.
//
// The trimmed query is returned unchanged when it looks like deliberate FTS5
// syntax, i.e. when it contains a ':' (column filter) or one of the boolean
// operators " AND ", " OR ", " NOT ". FTS5 only recognizes these keywords in
// upper case, so the check is case-sensitive: "rock and roll" is ordinary text.
//
// Any other query is wrapped in double quotes (embedded quotes are doubled),
// which turns it into a single FTS5 string and prevents parser errors on input
// such as "hello-world". A query that is passed through — for example
// "foo:bar:" — can still be rejected by the FTS5 parser; callers are expected
// to tolerate that error.
//
// An empty or all-whitespace query yields "".
func safeFTSQuery(query string) string {
	q := strings.TrimSpace(query)
	if q == "" {
		return q
	}
	// Column filter syntax: leave it to the FTS5 parser.
	if strings.Contains(q, ":") {
		return q
	}
	// Explicit boolean operators (upper case only, as FTS5 requires).
	for _, op := range []string{" AND ", " OR ", " NOT "} {
		if strings.Contains(q, op) {
			return q
		}
	}
	// Escape any double-quotes in the query before wrapping.
	return `"` + strings.ReplaceAll(q, `"`, `""`) + `"`
}
// isFTSSyntaxError reports whether err looks like an FTS5 query-parser error.
// The decision is a case-insensitive substring match on the error message;
// a nil error is never a syntax error.
func isFTSSyntaxError(err error) bool {
	if err == nil {
		return false
	}
	lowered := strings.ToLower(err.Error())
	for _, marker := range []string{"syntax error", "no such column", "fts5: syntax error"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
// simplifyForLike extracts the first word-like token of query for use in a
// LIKE pattern. A token is a maximal run of ASCII letters, digits, '_' or '-'.
// Characters before the first token are skipped and everything after it is
// discarded, so "foo:bar:" yields "foo". The result is "" when the query
// contains no such character.
func simplifyForLike(query string) string {
	isTokenRune := func(r rune) bool {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			return true
		}
		return r == '_' || r == '-'
	}

	trimmed := strings.TrimSpace(query)
	start := strings.IndexFunc(trimmed, isTokenRune)
	if start < 0 {
		return ""
	}
	rest := trimmed[start:]
	end := strings.IndexFunc(rest, func(r rune) bool { return !isTokenRune(r) })
	if end < 0 {
		return rest
	}
	return rest[:end]
}
+61
View File
@@ -0,0 +1,61 @@
---
name: vault_search
kind: function
lang: go
domain: infra
version: "1.0.0"
purity: impure
signature: "func VaultSearch(vaultPath, query string, limit int) ([]VaultSearchHit, error)"
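description: "Busca en vault_index.db de un vault combinando FTS5 MATCH sobre files_fts (contenido) con LIKE sobre files.rel_path (rutas), deduplicando por rel_path. Si el query rompe el parser FTS5, los resultados provienen solo del LIKE. El campo Snippet de los hits queda vacio porque el indice FTS5 es contentless."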
tags: [vault, search, fts5, sqlite, infra]
uses_functions: ["vault_index_open_go_infra"]
uses_types: ["vault_file_go_infra"]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: [database/sql, fmt, path/filepath, strings]
params:
- name: vaultPath
desc: "ruta absoluta al directorio raiz del vault (puede ser symlink)"
- name: query
desc: "termino o frase de busqueda; se escapa automaticamente para FTS5 salvo que ya incluya operadores booleanos o prefijos de columna"
- name: limit
desc: "maximo de resultados; si es <= 0 se usa 50"
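output: "slice de VaultSearchHit sin duplicados y con un maximo de limit elementos: primero los hits FTS5 ordenados por rank, despues los hits LIKE sobre rel_path ordenados por mtime DESC; slice vacio si no hay resultados"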
tested: true
tests:
- "FTS match devuelve hit con snippet"
- "query sin resultados retorna slice vacio"
- "limit se respeta"
- "query FTS invalida activa fallback LIKE"
- "limit cero usa 50 por defecto"
test_file_path: "functions/infra/vault_search_test.go"
file_path: "functions/infra/vault_search.go"
---
## Ejemplo
```go
hits, err := infra.VaultSearch("/home/lucas/vaults/turismo_spain", "hoteles", 20)
if err != nil {
log.Fatal(err)
}
for _, h := range hits {
fmt.Printf("[%s] %s %s\n", h.VaultName, h.RelPath, h.Snippet)
}
```
## Notas
`VaultSearchHit` es un struct local definido en este archivo (no en `vault_file.go`)
porque combina campos de `files` + metadatos de contexto de busqueda (Snippet, VaultPath, VaultName).
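**FTS5 safety:** el helper `safeFTSQuery` envuelve la query en comillas dobles
cuando no contiene operadores booleanos (` AND `, ` OR `, ` NOT `) ni el caracter `:`.
Esto evita errores del parser en tokens como `hello-world`. Una query que contiene `:`
(por ejemplo `foo:bar:`) se pasa a FTS5 sin modificar; si el parser la rechaza, el error
se tolera y los resultados provienen de la busqueda LIKE sobre `rel_path`.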
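**Busqueda LIKE:** ademas del MATCH, mientras no se haya alcanzado `limit` se ejecuta
`WHERE rel_path LIKE '%' || token || '%'`, donde `token` es la primera secuencia de
caracteres alfanumericos, `_` o `-` de la query (helper `simplifyForLike`). Se ejecuta
aunque el MATCH no falle, y es la unica fuente de resultados cuando el MATCH falla.
Solo se devuelve error si fallan tanto el MATCH como el LIKE. Todos los hits tienen `Snippet=""`.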
**VaultName:** se deriva del `filepath.Base(filepath.EvalSymlinks(vaultPath))`.
Si `EvalSymlinks` falla (e.g. symlink roto), usa `filepath.Base(vaultPath)`.
+147
View File
@@ -0,0 +1,147 @@
package infra
import (
"testing"
"time"
)
// openTestVaultDir returns a fresh temporary directory in which VaultIndexOpen
// has been called once (and the handle closed again), so the vault index is
// initialized before the test seeds or searches it.
func openTestVaultDir(t *testing.T) string {
	t.Helper()
	vaultDir := t.TempDir()
	handle, openErr := VaultIndexOpen(vaultDir)
	if openErr != nil {
		t.Fatalf("VaultIndexOpen: %v", openErr)
	}
	handle.Close()
	return vaultDir
}
// seedVaultFile inserts one row into files and one row into files_fts for
// relPath in the vault index at dir. mtime and indexed_at are set to the
// current Unix time, sha256 to a fixed placeholder and ext to "". Any database
// error aborts the test.
func seedVaultFile(t *testing.T, dir, relPath, mime, bucket, subBucket, contentText string, size int64) {
	t.Helper()
	db, openErr := VaultIndexOpen(dir)
	if openErr != nil {
		t.Fatalf("VaultIndexOpen seed: %v", openErr)
	}
	defer db.Close()

	stamp := time.Now().Unix()

	// Metadata row.
	if _, err := db.Exec(`
INSERT INTO files (rel_path, size, mtime, sha256, mime, ext, bucket, sub_bucket, indexed_at)
VALUES (?, ?, ?, 'aabbccdd', ?, '', ?, ?, ?)`,
		relPath, size, stamp, mime, bucket, subBucket, stamp,
	); err != nil {
		t.Fatalf("seed files: %v", err)
	}

	// Full-text row carrying the searchable content.
	if _, err := db.Exec(`INSERT INTO files_fts(rel_path, content_text) VALUES (?, ?)`, relPath, contentText); err != nil {
		t.Fatalf("seed files_fts: %v", err)
	}
}
// --- Tests ---
// TestVaultSearch_FTSMatch seeds two files with different content_text and
// expects a search for a word present in only one of them to return exactly
// that file, with a non-empty VaultName.
func TestVaultSearch_FTSMatch(t *testing.T) {
	t.Run("FTS match devuelve hit con snippet", func(t *testing.T) {
		dir := openTestVaultDir(t)
		seedVaultFile(t, dir, "data/raw/informe.csv", "text/csv", "data", "raw",
			"ventas trimestrales empresa iberica", 1024)
		seedVaultFile(t, dir, "data/raw/other.csv", "text/csv", "data", "raw",
			"productos inventario almacen", 512)

		hits, err := VaultSearch(dir, "ventas", 10)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if n := len(hits); n != 1 {
			t.Fatalf("got %d hits, want 1", n)
		}

		only := hits[0]
		if only.RelPath != "data/raw/informe.csv" {
			t.Errorf("RelPath = %q, want data/raw/informe.csv", only.RelPath)
		}
		if only.VaultName == "" {
			t.Errorf("VaultName should not be empty")
		}
	})
}
// TestVaultSearch_NoMatch expects a query that matches neither content nor
// path to return zero hits and no error.
func TestVaultSearch_NoMatch(t *testing.T) {
	t.Run("query sin resultados retorna slice vacio", func(t *testing.T) {
		dir := openTestVaultDir(t)
		seedVaultFile(t, dir, "data/raw/file.csv", "text/csv", "data", "raw", "some content", 100)

		hits, err := VaultSearch(dir, "zzznomatch", 10)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if n := len(hits); n != 0 {
			t.Errorf("got %d hits, want 0", n)
		}
	})
}
// TestVaultSearch_LimitRespected seeds ten files sharing a keyword and expects
// a search with limit 3 to return exactly three hits.
func TestVaultSearch_LimitRespected(t *testing.T) {
	t.Run("limit se respeta", func(t *testing.T) {
		dir := openTestVaultDir(t)
		// filea.csv .. filej.csv, all with the same content.
		for suffix := 'a'; suffix < 'a'+10; suffix++ {
			relPath := "data/raw/file" + string(suffix) + ".csv"
			seedVaultFile(t, dir, relPath, "text/csv", "data", "raw", "common keyword everywhere", 100)
		}

		hits, err := VaultSearch(dir, "common", 3)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if n := len(hits); n != 3 {
			t.Errorf("got %d hits, want 3", n)
		}
	})
}
// TestVaultSearch_BadFTSQuery_FallbackLike searches with "foo:bar:", which
// safeFTSQuery forwards unchanged because it contains ':'. The test expects
// the search to succeed anyway and to find the seeded file through the LIKE
// match on rel_path, with an empty Snippet on every hit.
func TestVaultSearch_BadFTSQuery_FallbackLike(t *testing.T) {
	t.Run("query FTS invalida activa fallback LIKE", func(t *testing.T) {
		dir := openTestVaultDir(t)
		// The rel_path contains "foobar" so the LIKE search on "foo" can find it;
		// content_text is empty, so only the path can match.
		seedVaultFile(t, dir, "data/raw/foobar_report.csv", "text/csv", "data", "raw", "", 200)

		// Per the original author, FTS5 answers this query with a parser
		// error ("no such column: bar"), leaving LIKE as the only source of hits.
		hits, err := VaultSearch(dir, "foo:bar:", 10)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if len(hits) < 1 {
			t.Errorf("expected fallback LIKE to find foobar_report.csv, got 0 hits")
		}
		for i := range hits {
			if snippet := hits[i].Snippet; snippet != "" {
				t.Errorf("fallback hits should have empty Snippet, got %q", snippet)
			}
		}
	})
}
// TestVaultSearch_LimitZeroDefaults seeds 55 files sharing a keyword and
// expects a search with limit 0 to fall back to the default of 50 hits.
func TestVaultSearch_LimitZeroDefaults(t *testing.T) {
	t.Run("limit cero usa 50 por defecto", func(t *testing.T) {
		dir := openTestVaultDir(t)
		// Insert 55 files with the same keyword. Names are unique: a letter for
		// the tens ('a' for 0-9, 'b' for 10-19, ...) followed by the units digit.
		for i := 0; i < 55; i++ {
			tens := string(rune('a' + i/10))
			units := string(rune('0' + i%10))
			relPath := "data/raw/doc" + tens + units + ".csv"
			seedVaultFile(t, dir, relPath, "text/csv", "data", "raw", "keyword alpha beta", 100)
		}

		hits, err := VaultSearch(dir, "keyword", 0)
		if err != nil {
			t.Fatalf("VaultSearch: %v", err)
		}
		if n := len(hits); n != 50 {
			t.Errorf("got %d hits, want 50 (default limit)", n)
		}
	})
}
+20
View File
@@ -0,0 +1,20 @@
package ml
import "encoding/json"
// GenconfigMarshal serializes a GenerationConfig to canonical JSON indented
// with two spaces per level and no line prefix.
//
// Keys are emitted in struct declaration order, using whatever names and
// omitempty options the json tags on GenerationConfig declare (the struct is
// defined elsewhere). The layout is meant to match Python's
// json.dumps(indent=2, sort_keys=False).
//
// NOTE(review): encoding/json escapes '<', '>' and '&' and leaves non-ASCII
// characters unescaped, while json.dumps with its default settings does the
// opposite; byte-identical output therefore presumably holds only for values
// without those characters — confirm if exact cross-language equality matters.
func GenconfigMarshal(cfg GenerationConfig) ([]byte, error) {
	return json.MarshalIndent(cfg, "", "  ")
}
// GenconfigUnmarshal decodes JSON (compact or indented) into a
// GenerationConfig. Keys are expected in snake_case, as declared by the json
// tags (negative_prompt, cfg_scale, model_type, ...).
//
// On a decoding error it returns the zero GenerationConfig together with the
// error from json.Unmarshal, never a partially filled value.
func GenconfigUnmarshal(data []byte) (GenerationConfig, error) {
	decoded := new(GenerationConfig)
	if err := json.Unmarshal(data, decoded); err != nil {
		return GenerationConfig{}, err
	}
	return *decoded, nil
}
+84
View File
@@ -0,0 +1,84 @@
---
name: genconfig_json_marshal
kind: function
lang: go
domain: ml
version: "1.0.0"
purity: impure
signature: "func GenconfigMarshal(cfg GenerationConfig) ([]byte, error)\nfunc GenconfigUnmarshal(data []byte) (GenerationConfig, error)"
description: "Wrappers json.Marshal/Unmarshal para GenerationConfig con formato canonico (MarshalIndent 2 espacios). Garantiza roundtrip identico al Python: json.dumps(indent=2, sort_keys=False). Campos JSON en snake_case."
tags: [ml, json, marshal, unmarshal, serialization, generation, canonical]
uses_functions: []
uses_types: [generation_config_go_ml]
returns: []
returns_optional: false
error_type: "error_go_core"
imports: ["encoding/json"]
params:
- name: cfg
desc: "GenerationConfig a serializar. Campos omitempty (negative_prompt, loras, clip_skip) se omiten si son zero/nil/empty."
- name: data
desc: "JSON bytes a deserializar. Acepta formato compacto o con indent. Keys deben ser snake_case (negative_prompt, cfg_scale, model_type, etc.)."
output: "GenconfigMarshal: bytes JSON con indent 2 espacios, orden de campos segun declaracion del struct (prompt, negative_prompt, seed, steps, cfg_scale, sampler, width, height, model, loras, clip_skip). GenconfigUnmarshal: GenerationConfig poblado o error de parsing."
tested: true
tests:
- "roundtrip marshal unmarshal produce config igual"
- "json cross-language snake_case keys se deserializan correctamente"
test_file_path: "functions/ml/genconfig_test.go"
file_path: "functions/ml/genconfig_json_marshal.go"
---
## Ejemplo
```go
cfg := ml.GenerationConfig{
Prompt: "a mountain at sunset",
Seed: 1234,
Steps: 30,
CfgScale: 7.0,
Sampler: "euler",
Width: 768,
Height: 512,
Model: ml.ModelRef{Name: "sdxl-base", ModelType: "sdxl", Quantization: "fp16"},
}
b, err := ml.GenconfigMarshal(cfg)
// b == {
// "prompt": "a mountain at sunset",
// "seed": 1234,
// ...
// }
cfg2, err := ml.GenconfigUnmarshal(b)
// cfg2 == cfg (DeepEqual)
```
## Notas
### Formato canonico y compatibilidad con Python
`GenconfigMarshal` usa `json.MarshalIndent(cfg, "", "  ")` (prefijo vacio, indent de dos espacios). El formato resultante es identico al que produce Python con `model.model_dump_json(indent=2)` o `json.dumps(data, indent=2)` cuando `sort_keys=False`:
- Keys en orden de declaracion del struct (no alfabetico).
- Indent de 2 espacios, sin trailing whitespace.
- Campos omitempty ausentes si zero: `negative_prompt` ausente si `""`, `loras` ausente si `[]`, `clip_skip` ausente si `nil`.
### Keys JSON (snake_case obligatorio)
| Campo Go | Key JSON |
|---|---|
| `Prompt` | `"prompt"` |
| `NegativePrompt` | `"negative_prompt"` |
| `Seed` | `"seed"` |
| `Steps` | `"steps"` |
| `CfgScale` | `"cfg_scale"` |
| `Sampler` | `"sampler"` |
| `Width` | `"width"` |
| `Height` | `"height"` |
| `Model.ModelType` | `"model_type"` |
| `Model.Quantization` | `"quantization"` |
| `ClipSkip` | `"clip_skip"` |
### Por que impure
Los errores de `json.Unmarshal` son errores de parsing del input externo, no de I/O, pero se modelan como `(T, error)` para forzar manejo explicito en el caller. Marcado `impure` con `error_type: error_go_core` por convencion del registry.
+260
View File
@@ -0,0 +1,260 @@
package ml
import (
"reflect"
"strings"
"testing"
)
// ---------------------------------------------------------------------------
// TestGenconfigToSdcliArgs
// ---------------------------------------------------------------------------
func TestGenconfigToSdcliArgs(t *testing.T) {
clipSkip := 2
t.Run("config basico sin loras ni clip_skip", func(t *testing.T) {
cfg := GenerationConfig{
Prompt: "a cat",
Seed: 42,
Steps: 20,
CfgScale: 7.5,
Sampler: "euler",
Width: 512,
Height: 512,
Model: ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16"},
}
args := GenconfigToSdcliArgs(cfg)
want := []string{
"--prompt", "a cat",
"--seed", "42",
"--steps", "20",
"--cfg-scale", "7.5",
"--width", "512",
"--height", "512",
"--sampling-method", "euler",
}
if !reflect.DeepEqual(args, want) {
t.Errorf("got %v\nwant %v", args, want)
}
})
t.Run("loras se emiten como pares path:weight", func(t *testing.T) {
cfg := GenerationConfig{
Prompt: "portrait",
Seed: 1,
Steps: 10,
CfgScale: 7.0,
Sampler: "euler",
Width: 512,
Height: 512,
Model: ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16", Path: "/models/v1.safetensors"},
Loras: []LoraRef{
{Path: "/loras/detail.safetensors", Weight: 0.8},
{Path: "/loras/style.safetensors", Weight: 0.5},
},
ClipSkip: &clipSkip,
}
args := GenconfigToSdcliArgs(cfg)
// Verificar que existen los pares --lora para ambas loras
loraIdx := indexAll(args, "--lora")
if len(loraIdx) != 2 {
t.Fatalf("esperaba 2 flags --lora, got %d en %v", len(loraIdx), args)
}
wantLoras := []string{
"/loras/detail.safetensors:0.8",
"/loras/style.safetensors:0.5",
}
for i, idx := range loraIdx {
if idx+1 >= len(args) {
t.Fatalf("--lora[%d] sin valor siguiente", i)
}
if args[idx+1] != wantLoras[i] {
t.Errorf("lora[%d]: got %q, want %q", i, args[idx+1], wantLoras[i])
}
}
// Verificar --model y --clip-skip presentes
if !containsPair(args, "--model", "/models/v1.safetensors") {
t.Errorf("--model no encontrado en %v", args)
}
if !containsPair(args, "--clip-skip", "2") {
t.Errorf("--clip-skip no encontrado en %v", args)
}
})
t.Run("sampler dpm++2m se traduce a dpmpp2m", func(t *testing.T) {
cfg := GenerationConfig{
Prompt: "x",
Seed: 0,
Steps: 1,
CfgScale: 1.0,
Sampler: "dpm++2m",
Width: 64,
Height: 64,
Model: ModelRef{Name: "m", ModelType: "sd15", Quantization: "fp16"},
}
args := GenconfigToSdcliArgs(cfg)
if !containsPair(args, "--sampling-method", "dpmpp2m") {
t.Errorf("sampler no traducido; args=%v", args)
}
})
t.Run("negative_prompt vacio no genera flag", func(t *testing.T) {
cfg := GenerationConfig{
Prompt: "x",
NegativePrompt: "",
Seed: 0,
Steps: 1,
CfgScale: 1.0,
Sampler: "euler",
Width: 64,
Height: 64,
Model: ModelRef{Name: "m", ModelType: "sd15", Quantization: "fp16"},
}
args := GenconfigToSdcliArgs(cfg)
for _, a := range args {
if a == "--negative-prompt" {
t.Errorf("flag --negative-prompt presente aunque NegativePrompt es vacio")
}
}
})
}
// ---------------------------------------------------------------------------
// TestGenconfigMarshalRoundtrip
// ---------------------------------------------------------------------------
// TestGenconfigMarshalRoundtrip encodes a fully populated GenerationConfig
// with GenconfigMarshal, decodes it with GenconfigUnmarshal, and requires the
// result to be deeply equal to the input.
func TestGenconfigMarshalRoundtrip(t *testing.T) {
	t.Run("roundtrip marshal unmarshal produce config igual", func(t *testing.T) {
		clipSkip := 2
		original := GenerationConfig{
			Prompt:         "sunset over the mountains",
			NegativePrompt: "blurry, low quality",
			Seed:           99,
			Steps:          30,
			CfgScale:       7.5,
			Sampler:        "dpm++2m",
			Width:          768,
			Height:         512,
			Model: ModelRef{
				Name:         "sdxl-base",
				ModelType:    "sdxl",
				Quantization: "fp16",
				Path:         "/models/sdxl.safetensors",
			},
			Loras: []LoraRef{
				{Path: "/loras/detail.safetensors", Weight: 0.8},
			},
			ClipSkip: &clipSkip,
		}

		encoded, err := GenconfigMarshal(original)
		if err != nil {
			t.Fatalf("GenconfigMarshal: %v", err)
		}

		decoded, err := GenconfigUnmarshal(encoded)
		if err != nil {
			t.Fatalf("GenconfigUnmarshal: %v", err)
		}

		if !reflect.DeepEqual(original, decoded) {
			t.Errorf("roundtrip diverge\norig: %+v\ngot: %+v", original, decoded)
		}
	})
}
// ---------------------------------------------------------------------------
// TestGenconfigCrossLanguageJSON
// ---------------------------------------------------------------------------
// TestGenconfigCrossLanguageJSON decodes a hand-written fixture that imitates
// what the Python side would emit (json.dumps(config.model_dump(), indent=2),
// snake_case keys in declaration order), checks a few decoded fields, and then
// re-encodes the value to confirm the snake_case keys survive.
func TestGenconfigCrossLanguageJSON(t *testing.T) {
	pythonJSON := `{
"prompt": "a dragon",
"negative_prompt": "ugly",
"seed": 1234,
"steps": 25,
"cfg_scale": 7.0,
"sampler": "euler_a",
"width": 512,
"height": 512,
"model": {
"name": "v1-5",
"model_type": "sd15",
"quantization": "fp16"
},
"loras": [
{
"path": "/loras/dragon.safetensors",
"weight": 0.9
}
]
}`

	t.Run("json cross-language snake_case keys se deserializan correctamente", func(t *testing.T) {
		decoded, err := GenconfigUnmarshal([]byte(pythonJSON))
		if err != nil {
			t.Fatalf("GenconfigUnmarshal fixture: %v", err)
		}

		// Spot-check the fields whose JSON key differs from the Go field name.
		if got := decoded.Prompt; got != "a dragon" {
			t.Errorf("Prompt: got %q", got)
		}
		if got := decoded.NegativePrompt; got != "ugly" {
			t.Errorf("NegativePrompt: got %q", got)
		}
		if got := decoded.CfgScale; got != 7.0 {
			t.Errorf("CfgScale: got %v", got)
		}
		if got := decoded.Model.ModelType; got != "sd15" {
			t.Errorf("Model.ModelType: got %q", got)
		}
		if len(decoded.Loras) != 1 || decoded.Loras[0].Weight != 0.9 {
			t.Errorf("Loras: got %+v", decoded.Loras)
		}

		// Re-encode and make sure the snake_case keys are still emitted.
		reencoded, err := GenconfigMarshal(decoded)
		if err != nil {
			t.Fatalf("GenconfigMarshal: %v", err)
		}
		text := string(reencoded)
		requiredKeys := []string{"negative_prompt", "cfg_scale", "model_type", "quantization"}
		for _, key := range requiredKeys {
			quoted := `"` + key + `"`
			if !strings.Contains(text, quoted) {
				t.Errorf("key %q ausente en JSON re-serializado:\n%s", key, text)
			}
		}
	})
}
// ---------------------------------------------------------------------------
// helpers
// ---------------------------------------------------------------------------
// indexAll returns, in ascending order, every index at which val occurs in
// slice. The result is nil when val does not occur.
func indexAll(slice []string, val string) []int {
	var positions []int
	for pos := 0; pos < len(slice); pos++ {
		if slice[pos] != val {
			continue
		}
		positions = append(positions, pos)
	}
	return positions
}
// containsPair reports whether slice contains flag immediately followed by
// value at any position.
func containsPair(slice []string, flag, value string) bool {
	for next := 1; next < len(slice); next++ {
		if slice[next-1] == flag && slice[next] == value {
			return true
		}
	}
	return false
}
+59
View File
@@ -0,0 +1,59 @@
package ml
import (
"fmt"
"strconv"
)
// samplerMap maps the sampler names used by this package's GenerationConfig
// to the value passed after --sampling-method. Names that are not listed here
// are passed through unchanged by GenconfigToSdcliArgs.
// NOTE(review): upstream stable-diffusion.cpp appears to spell some methods
// as "dpm++2m" / "dpm++2mv2" rather than "dpmpp2m" / "dpmpp2mv2" — confirm
// these values against the sd-cli binary actually being invoked.
var samplerMap = map[string]string{
"euler": "euler",
"euler_a": "euler_a",
"dpm++2m": "dpmpp2m",
"dpm++2m_v2": "dpmpp2mv2",
"heun": "heun",
"dpm2": "dpm2",
"lcm": "lcm",
}
// GenconfigToSdcliArgs turns a GenerationConfig into a flat list of CLI
// arguments for stable-diffusion.cpp. The original author describes it as the
// Go mirror of genconfig_to_sdcpp_args_py_ml.
//
// Always emitted, in this order: --prompt, --seed, --steps, --cfg-scale,
// --width, --height. Then --negative-prompt (only when non-empty),
// --sampling-method, --model (only when Model.Path is non-empty),
// --clip-skip (only when ClipSkip is non-nil), and one "--lora path:weight"
// pair per entry in Loras.
//
// The sampler name is translated through samplerMap; unknown names are passed
// through literally. The function performs no I/O and only reads samplerMap.
func GenconfigToSdcliArgs(cfg GenerationConfig) []string {
	var args []string
	emit := func(flag, value string) {
		args = append(args, flag, value)
	}

	emit("--prompt", cfg.Prompt)
	emit("--seed", strconv.FormatInt(cfg.Seed, 10))
	emit("--steps", strconv.Itoa(cfg.Steps))
	// 'f' with precision -1 prints the shortest decimal that round-trips.
	emit("--cfg-scale", strconv.FormatFloat(cfg.CfgScale, 'f', -1, 64))
	emit("--width", strconv.Itoa(cfg.Width))
	emit("--height", strconv.Itoa(cfg.Height))

	if cfg.NegativePrompt != "" {
		emit("--negative-prompt", cfg.NegativePrompt)
	}

	method, known := samplerMap[cfg.Sampler]
	if !known {
		method = cfg.Sampler
	}
	emit("--sampling-method", method)

	if modelPath := cfg.Model.Path; modelPath != "" {
		emit("--model", modelPath)
	}

	if skip := cfg.ClipSkip; skip != nil {
		emit("--clip-skip", strconv.Itoa(*skip))
	}

	for i := range cfg.Loras {
		ref := cfg.Loras[i]
		// %g drops insignificant trailing zeros from the weight (0.8, not 0.800000).
		emit("--lora", fmt.Sprintf("%s:%g", ref.Path, ref.Weight))
	}

	return args
}
+59
View File
@@ -0,0 +1,59 @@
---
name: genconfig_to_sdcli_args
kind: function
lang: go
domain: ml
version: "1.0.0"
purity: pure
signature: "func GenconfigToSdcliArgs(cfg GenerationConfig) []string"
description: "Convierte un GenerationConfig en argumentos CLI para stable-diffusion.cpp. Espejo Go de genconfig_to_sdcpp_args_py_ml. Loras se emiten como pares repetidos --lora path:weight. Sampler traducido via samplerMap canonico."
tags: [ml, stable-diffusion, cli, args, generation, pure]
uses_functions: []
uses_types: [generation_config_go_ml]
returns: []
returns_optional: false
error_type: ""
imports: ["fmt", "strconv"]
params:
- name: cfg
desc: "Parametros completos de generacion de imagen. Sampler debe ser uno de los valores de SamplerName. Model.Path se emite como --model si no esta vacio."
output: "Slice de strings listos para pasar a exec.Command o similar. Incluye --prompt, --seed, --steps, --cfg-scale, --width, --height, --sampling-method, opcionales --negative-prompt / --model / --clip-skip, y pares --lora path:weight por cada LoraRef."
tested: true
tests:
- "config basico sin loras ni clip_skip"
- "loras se emiten como pares path:weight"
- "sampler dpm++2m se traduce a dpmpp2m"
- "negative_prompt vacio no genera flag"
test_file_path: "functions/ml/genconfig_test.go"
file_path: "functions/ml/genconfig_to_sdcli_args.go"
---
## Ejemplo
```go
clip := 2
cfg := ml.GenerationConfig{
Prompt: "a cat",
Seed: 42,
Steps: 20,
CfgScale: 7.5,
Sampler: "dpm++2m",
Width: 512,
Height: 512,
Model: ml.ModelRef{Name: "v1-5", ModelType: "sd15", Quantization: "fp16", Path: "/models/v1-5.safetensors"},
Loras: []ml.LoraRef{{Path: "/loras/detail.safetensors", Weight: 0.8}},
ClipSkip: &clip,
}
args := ml.GenconfigToSdcliArgs(cfg)
// args == ["--prompt","a cat","--seed","42","--steps","20",
// "--cfg-scale","7.5","--width","512","--height","512",
// "--sampling-method","dpmpp2m","--model","/models/v1-5.safetensors",
// "--clip-skip","2","--lora","/loras/detail.safetensors:0.8"]
```
## Notas
- `samplerMap` traduce nombres canonicos del dominio ml a los identificadores que acepta stable-diffusion.cpp. Si el sampler no esta en el mapa se usa el valor literal.
- El flag de modelo (`--model`) solo se emite si `cfg.Model.Path != ""`.
- `%g` en `fmt.Sprintf` para el peso de la lora elimina ceros insignificantes: `0.800000` → `0.8`.
- Funcion pura: misma entrada, misma salida. Sin I/O ni estado global.
+18
View File
@@ -0,0 +1,18 @@
package ml
// GenerationConfig holds the parameters of one image-generation request.
//
// The json tags use snake_case keys. Per the original author's note they are
// meant to match the fields of the Python GenerationConfig_py_ml type so that
// payloads round-trip between both languages (the Python side is not visible
// from this file).
//
// NegativePrompt, Loras and ClipSkip are tagged omitempty, so encoding/json
// omits them when they are empty or nil. ClipSkip is a pointer so that
// "not set" can be told apart from an explicit value.
type GenerationConfig struct {
Prompt string `json:"prompt"`
NegativePrompt string `json:"negative_prompt,omitempty"`
Seed int64 `json:"seed"`
Steps int `json:"steps"`
CfgScale float64 `json:"cfg_scale"`
Sampler string `json:"sampler"`
Width int `json:"width"`
Height int `json:"height"`
Model ModelRef `json:"model"`
Loras []LoraRef `json:"loras,omitempty"`
ClipSkip *int `json:"clip_skip,omitempty"`
}
+12
View File
@@ -0,0 +1,12 @@
package ml
// ImageGenResult holds a generated image together with execution metadata.
//
// ImageBytes carries the raw encoded image (the original note says PNG) and
// is tagged json:"-", so it is never included in the JSON encoding; according
// to the original author it travels over a separate binary channel.
// VramPeakMb is a pointer tagged omitempty, so it is omitted from JSON when nil.
type ImageGenResult struct {
ImageBytes []byte `json:"-"`
Format string `json:"format"`
Meta map[string]any `json:"meta"`
DurationMs int64 `json:"duration_ms"`
VramPeakMb *int `json:"vram_peak_mb,omitempty"`
}
+9
View File
@@ -0,0 +1,9 @@
package ml
import "context"
// ImageGenerator is the contract every image-generation backend implements.
// The original author's note lists local backends (ComfyUI, diffusers) and
// remote ones (an API) as intended implementations.
//
// Generate takes a context and a GenerationConfig and returns the resulting
// ImageGenResult or an error.
type ImageGenerator interface {
Generate(ctx context.Context, cfg GenerationConfig) (ImageGenResult, error)
}
+8
View File
@@ -0,0 +1,8 @@
package ml
// LoraRef references a LoRA adapter by path, together with its merge weight
// and an optional scale. Scale is a pointer tagged omitempty, so it is left
// out of the JSON encoding when nil.
type LoraRef struct {
Path string `json:"path"`
Weight float64 `json:"weight"`
Scale *float64 `json:"scale,omitempty"`
}
+10
View File
@@ -0,0 +1,10 @@
package ml
// ModelRef identifies an image-generation model by name, model type and
// quantization, plus an optional on-disk path. Path is tagged omitempty, so
// it is left out of the JSON encoding when empty.
type ModelRef struct {
Name string `json:"name"`
ModelType string `json:"model_type"` // sd15|sdxl|flux_dev|...
Quantization string `json:"quantization"` // fp16|q8_0|...
Path string `json:"path,omitempty"`
}
+78
View File
@@ -0,0 +1,78 @@
package ml
import (
"regexp"
"strconv"
)
// SdcliProgress is the progress state parsed from a single stderr line of
// sd-cli.
type SdcliProgress struct {
	Step       int     `json:"step"`
	TotalSteps int     `json:"total_steps"`
	ItPerSec   float64 `json:"it_per_sec"`
	Percent    float64 `json:"percent"`
}

// Line patterns, compiled once at package initialization.
var (
	// reProgress1 matches the compact format: " 3/30 | 0.84it/s | 10%".
	reProgress1 = regexp.MustCompile(`\s*(\d+)\s*/\s*(\d+)\s*\|[^|]*?([\d.]+)\s*it/s[^|]*?\|\s*([\d.]+)\s*%`)
	// reProgress2 matches the verbose format: "sampling: step 3 of 30 (0.84 it/s)".
	reProgress2 = regexp.MustCompile(`step\s+(\d+)\s+of\s+(\d+)\s*\(\s*([\d.]+)\s*it/s\)`)
	// reProgress3 matches the minimal format: "step 3/30" or "progress: 3/30".
	// The "progress" prefix is optional, so any "<digits>/<digits>" matches.
	reProgress3 = regexp.MustCompile(`(?:progress[:\s]+)?(\d+)\s*/\s*(\d+)`)
)

// sdcliProgressFormats lists the recognized formats in priority order. Groups
// 1 and 2 are always step and total; rateGroup / pctGroup give the capture
// group holding it/s and percent, or 0 when the format does not carry them.
var sdcliProgressFormats = []struct {
	pattern   *regexp.Regexp
	rateGroup int
	pctGroup  int
}{
	{pattern: reProgress1, rateGroup: 3, pctGroup: 4},
	{pattern: reProgress2, rateGroup: 3, pctGroup: 0},
	{pattern: reProgress3, rateGroup: 0, pctGroup: 0},
}

// SdcliParseProgress parses one stderr line from stable-diffusion.cpp /
// sd-cli and extracts the progress state.
//
// It returns (progress, true) when the line matches one of the known formats
// and all captured numbers parse; otherwise it returns the zero value and
// false. Formats are tried in order (compact, verbose, minimal); when one
// matches but its numbers cannot be used, the next format is tried.
//
// Percent is read from the line in the compact format and computed as
// 100*step/total in the other two, which therefore require total > 0.
// ItPerSec is 0 when the format carries no rate. The function does no I/O and
// keeps no mutable state.
func SdcliParseProgress(line string) (SdcliProgress, bool) {
	for _, format := range sdcliProgressFormats {
		groups := format.pattern.FindStringSubmatch(line)
		if groups == nil {
			continue
		}

		current, err := strconv.Atoi(groups[1])
		if err != nil {
			continue
		}
		total, err := strconv.Atoi(groups[2])
		if err != nil {
			continue
		}

		rate := 0.0
		if format.rateGroup > 0 {
			rate, err = strconv.ParseFloat(groups[format.rateGroup], 64)
			if err != nil {
				continue
			}
		}

		var percent float64
		if format.pctGroup > 0 {
			percent, err = strconv.ParseFloat(groups[format.pctGroup], 64)
			if err != nil {
				continue
			}
		} else {
			// No percent on the line: derive it, guarding the division.
			if total <= 0 {
				continue
			}
			percent = 100.0 * float64(current) / float64(total)
		}

		return SdcliProgress{
			Step:       current,
			TotalSteps: total,
			ItPerSec:   rate,
			Percent:    percent,
		}, true
	}
	return SdcliProgress{}, false
}
+50
View File
@@ -0,0 +1,50 @@
---
name: sdcli_parse_progress
kind: function
lang: go
domain: ml
version: "1.0.0"
purity: pure
signature: "func SdcliParseProgress(line string) (SdcliProgress, bool)"
description: "Parsea una linea de stderr de stable-diffusion.cpp / sd-cli y extrae el estado de progreso. Soporta el formato compacto '3/30 | 0.84it/s | 10%', el formato verbose 'sampling: step 3 of 30 (0.84 it/s)', y el formato minimal 'progress: 3/30'. Retorna (zero, false) si la linea no contiene informacion de progreso reconocible."
tags: [ml, stable-diffusion, sdcli, progress, parser, stderr, pure]
uses_functions: []
uses_types: []
returns: []
returns_optional: false
error_type: ""
imports: ["regexp", "strconv"]
params:
- name: line
desc: "Una linea de stderr emitida por sd-cli / stable-diffusion.cpp durante la fase de sampling. Puede contener espacios al inicio o final."
output: "Par (SdcliProgress, bool). bool=true si se reconocio un patron de progreso; SdcliProgress contiene Step (paso actual), TotalSteps (pasos totales), ItPerSec (iteraciones por segundo, 0 si no disponible) y Percent (porcentaje 0-100 calculado o leido de la linea). bool=false y struct zero si la linea no contiene progreso."
tested: true
tests:
- "formato estandar compacto step/total/itpersec/percent"
- "linea sin patron retorna false"
- "formato sampling verbose con velocidad"
file_path: "functions/ml/sdcli_parse_progress.go"
test_file_path: "functions/ml/sdcli_parse_progress_test.go"
---
## Ejemplo
```go
p, ok := ml.SdcliParseProgress(" 3/30 | 0.84it/s | 10%")
// ok = true
// p = SdcliProgress{Step:3, TotalSteps:30, ItPerSec:0.84, Percent:10.0}
p2, ok2 := ml.SdcliParseProgress("sampling: step 15 of 30 (1.2 it/s)")
// ok2 = true
// p2 = SdcliProgress{Step:15, TotalSteps:30, ItPerSec:1.2, Percent:50.0}
_, ok3 := ml.SdcliParseProgress("loading model...")
// ok3 = false
```
## Notas
- Regexps precompiladas como vars de paquete (se compilan una sola vez al init del paquete).
- Tolerante a variaciones de espaciado gracias a `\s*` en los patrones.
- El campo `Percent` en el formato verbose se calcula como `100 * step / total` (no se lee de la linea porque ese formato no lo emite).
- Funcion pura: sin I/O, sin estado mutable, determinista.
+103
View File
@@ -0,0 +1,103 @@
package ml
import (
"math"
"testing"
)
// TestSdcliParseProgress_StandardFormat checks every field parsed from a line
// in the compact "step/total | it/s | percent" format.
func TestSdcliParseProgress_StandardFormat(t *testing.T) {
	const input = " 3/30 | 0.84it/s | 10%"

	progress, matched := SdcliParseProgress(input)
	if !matched {
		t.Fatalf("expected match, got false")
	}

	if step := progress.Step; step != 3 {
		t.Errorf("Step: got %d, want 3", step)
	}
	if total := progress.TotalSteps; total != 30 {
		t.Errorf("TotalSteps: got %d, want 30", total)
	}
	// Floats are compared with a small tolerance instead of ==.
	if diff := math.Abs(progress.ItPerSec - 0.84); diff > 1e-9 {
		t.Errorf("ItPerSec: got %v, want 0.84", progress.ItPerSec)
	}
	if diff := math.Abs(progress.Percent - 10.0); diff > 1e-9 {
		t.Errorf("Percent: got %v, want 10.0", progress.Percent)
	}
}
// TestSdcliParseProgress_NoMatch checks that lines without progress
// information, including the empty line, are reported as not matching.
func TestSdcliParseProgress_NoMatch(t *testing.T) {
	nonProgress := [...]string{
		"loading model...",
		"",
		"error: out of memory",
		"clip model loaded",
		"generating image...",
	}
	for i := range nonProgress {
		if _, matched := SdcliParseProgress(nonProgress[i]); matched {
			t.Errorf("expected no match for %q, but got match", nonProgress[i])
		}
	}
}
// TestSdcliParseProgress_AltFormat covers the verbose "step N of M (x it/s)"
// format, the minimal "progress: N/M" format without a rate, and a second
// compact-format line with a higher rate.
func TestSdcliParseProgress_AltFormat(t *testing.T) {
	t.Run("formato sampling verbose", func(t *testing.T) {
		progress, matched := SdcliParseProgress("sampling: step 3 of 30 (0.84 it/s)")
		if !matched {
			t.Fatalf("expected match, got false")
		}
		if step := progress.Step; step != 3 {
			t.Errorf("Step: got %d, want 3", step)
		}
		if total := progress.TotalSteps; total != 30 {
			t.Errorf("TotalSteps: got %d, want 30", total)
		}
		if diff := math.Abs(progress.ItPerSec - 0.84); diff > 1e-9 {
			t.Errorf("ItPerSec: got %v, want 0.84", progress.ItPerSec)
		}
		// This format carries no percent, so it is derived from step/total.
		wantPercent := 100.0 * 3.0 / 30.0
		if diff := math.Abs(progress.Percent - wantPercent); diff > 1e-6 {
			t.Errorf("Percent: got %v, want %v", progress.Percent, wantPercent)
		}
	})

	t.Run("formato step/total sin velocidad", func(t *testing.T) {
		progress, matched := SdcliParseProgress("progress: 15/20")
		if !matched {
			t.Fatalf("expected match, got false")
		}
		if step := progress.Step; step != 15 {
			t.Errorf("Step: got %d, want 15", step)
		}
		if total := progress.TotalSteps; total != 20 {
			t.Errorf("TotalSteps: got %d, want 20", total)
		}
		// No rate on the line: ItPerSec must stay at exactly zero.
		if rate := progress.ItPerSec; rate != 0 {
			t.Errorf("ItPerSec: got %v, want 0", rate)
		}
		wantPercent := 75.0
		if diff := math.Abs(progress.Percent - wantPercent); diff > 1e-6 {
			t.Errorf("Percent: got %v, want %v", progress.Percent, wantPercent)
		}
	})

	t.Run("formato con espacios variables y mayor velocidad", func(t *testing.T) {
		progress, matched := SdcliParseProgress(" 20/30 | 12.50it/s | 66%")
		if !matched {
			t.Fatalf("expected match, got false")
		}
		if step := progress.Step; step != 20 {
			t.Errorf("Step: got %d, want 20", step)
		}
		if total := progress.TotalSteps; total != 30 {
			t.Errorf("TotalSteps: got %d, want 30", total)
		}
		if diff := math.Abs(progress.ItPerSec - 12.5); diff > 1e-9 {
			t.Errorf("ItPerSec: got %v, want 12.5", progress.ItPerSec)
		}
	})
}
@@ -0,0 +1,161 @@
"""Tests para vault_csv_profile."""
from __future__ import annotations
import os
import sqlite3
import sys
import tempfile
from pathlib import Path
import pytest
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from vault_csv_profile import vault_csv_profile
def _make_vault(tmp: Path) -> tuple[Path, Path]:
"""Crea un vault mínimo con vault_index.db y tabla files + files_fts + csv_profiles."""
db = tmp / "vault_index.db"
conn = sqlite3.connect(str(db))
conn.executescript(
"""
CREATE TABLE IF NOT EXISTS files (
rowid INTEGER PRIMARY KEY AUTOINCREMENT,
rel_path TEXT UNIQUE NOT NULL,
size_bytes INTEGER,
ext TEXT
);
CREATE VIRTUAL TABLE IF NOT EXISTS files_fts
USING fts5(rel_path, content_text, content='', contentless_delete=1);
CREATE TABLE IF NOT EXISTS csv_profiles (
rel_path TEXT PRIMARY KEY,
cols_json TEXT,
n_rows INTEGER,
encoding TEXT,
date_min TEXT,
date_max TEXT,
profiled_at INTEGER
);
"""
)
conn.commit()
conn.close()
return tmp, db
def _insert_file_entry(db: Path, rel_path: str):
"""Inserta entrada en files para que files_fts tenga rowid válido."""
conn = sqlite3.connect(str(db))
conn.execute(
"INSERT OR IGNORE INTO files(rel_path, size_bytes, ext) VALUES (?, 0, '.csv')",
(rel_path,),
)
conn.commit()
conn.close()
def test_csv_basic(tmp_path):
    """Profile a small UTF-8 CSV and check the result and its persistence."""
    vault, db = _make_vault(tmp_path)
    rel = "data/basic.csv"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("nombre,edad,score\nAna,30,9.5\nBob,25,8.0\nCarla,35,7.5\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    profile = vault_csv_profile(str(vault), rel, db_path=str(db))

    assert profile["rel_path"] == rel
    assert profile["n_rows"] == 3
    assert len(profile["cols"]) == 3
    seen_names = [col["name"] for col in profile["cols"]]
    for expected in ("nombre", "edad", "score"):
        assert expected in seen_names
    assert profile["persisted"] is True

    # The profile must also have been written to the csv_profiles table.
    conn = sqlite3.connect(str(db))
    stored = conn.execute("SELECT n_rows FROM csv_profiles WHERE rel_path = ?", (rel,)).fetchone()
    conn.close()
    assert stored is not None
    assert stored[0] == 3
def test_csv_date_detection(tmp_path):
    """Profile a CSV with an ISO date column and check the reported date range."""
    vault, db = _make_vault(tmp_path)
    rel = "data/fechas.csv"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    content = "fecha,valor\n2023-01-01,100\n2023-06-15,200\n2023-12-31,300\n"
    target.write_text(content, encoding="utf-8")
    _insert_file_entry(db, rel)

    profile = vault_csv_profile(str(vault), rel, db_path=str(db))

    earliest = profile["date_min"]
    latest = profile["date_max"]
    assert earliest is not None
    assert latest is not None
    # String comparison: the range must at least cover the first and last date.
    assert earliest <= "2023-01-01"
    assert latest >= "2023-12-31"
def test_csv_encoding_latin1(tmp_path):
    """Profile a Latin-1 encoded CSV with accented characters."""
    vault, db = _make_vault(tmp_path)
    rel = "data/tildes.csv"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    raw = "ciudad,poblacion\nMálaga,500000\nCórdoba,320000\n".encode("latin-1")
    target.write_bytes(raw)
    _insert_file_entry(db, rel)

    profile = vault_csv_profile(str(vault), rel, db_path=str(db))

    assert profile["n_rows"] == 2
    detected = profile["encoding"]
    assert detected != "utf-8?"
    # Some non-empty encoding must have been detected.
    assert detected
    assert profile["persisted"] is True
def test_csv_empty(tmp_path):
    """Profile an empty CSV: no rows, no columns and no date range."""
    vault, db = _make_vault(tmp_path)
    rel = "data/empty.csv"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("", encoding="utf-8")
    _insert_file_entry(db, rel)

    profile = vault_csv_profile(str(vault), rel, db_path=str(db))

    assert profile["n_rows"] == 0
    assert profile["cols"] == []
    assert profile["date_min"] is None
    assert profile["date_max"] is None
def test_csv_persists_fts(tmp_path):
    """Contentless FTS5: the CSV column names must be searchable with MATCH."""
    vault, db = _make_vault(tmp_path)
    rel = "data/fts_test.csv"
    target = vault / rel
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("producto,precio\nManzana,1.5\nPera,2.0\n", encoding="utf-8")
    _insert_file_entry(db, rel)

    vault_csv_profile(str(vault), rel, db_path=str(db))

    # A contentless FTS5 table cannot be read with a plain SELECT, so indexing
    # is verified through MATCH queries instead.
    conn = sqlite3.connect(str(db))
    hit_producto = conn.execute(
        "SELECT rowid FROM files_fts WHERE files_fts MATCH 'producto'",
    ).fetchone()
    hit_precio = conn.execute(
        "SELECT rowid FROM files_fts WHERE files_fts MATCH 'precio'",
    ).fetchone()
    conn.close()

    assert hit_producto is not None, "FTS no encontró 'producto'"
    assert hit_precio is not None, "FTS no encontró 'precio'"

Some files were not shown because too many files have changed in this diff Show More