e3c8979e8d
- cmd/fn/doctor.go - cmd/fn/main.go - cpp/apps/primitives_gallery/playground/tables/CMakeLists.txt - cpp/apps/primitives_gallery/playground/tables/data_table.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.cpp - cpp/apps/primitives_gallery/playground/tables/data_table_logic.h - cpp/apps/primitives_gallery/playground/tables/self_test.cpp - cpp/apps/primitives_gallery/playground/tables/tql.cpp - cpp/apps/primitives_gallery/playground/tables/viz.cpp - cpp/apps/primitives_gallery/playground/tables/viz.h - ... Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
68 lines
2.6 KiB
Markdown
68 lines
2.6 KiB
Markdown
---
|
|
name: audit_ml_env
|
|
kind: function
|
|
lang: go
|
|
domain: infra
|
|
version: "1.0.0"
|
|
purity: impure
|
|
signature: "func AuditMlEnv(registryRoot string) (MlEnvReport, error)"
|
|
description: "Audita el entorno ML del sistema: GPUs NVIDIA, toolkit CUDA, venv Python, paquetes clave (torch, diffusers, transformers, huggingface_hub), herramientas CLI (sd, llama-cli) y el vault de modelos. Retorna un MlEnvReport con OverallOK=true solo si hay al menos 1 GPU y los checks criticos estan en ok/warning."
|
|
tags: [ml, cuda, gpu, nvidia, audit, doctor, infra, torch, diffusers]
|
|
uses_functions: [get_gpu_info_go_infra]
|
|
uses_types: [gpu_info_go_infra]
|
|
returns: []
|
|
returns_optional: false
|
|
error_type: "error_go_core"
|
|
imports: [context, fmt, os, os/exec, path/filepath, strings, time]
|
|
tested: true
|
|
tests:
|
|
- "report no nil y tiene checks"
|
|
- "generated_at es positivo"
|
|
- "checks tiene al menos 4 entradas"
|
|
- "gpus puede ser vacio en CI"
|
|
test_file_path: "functions/infra/audit_ml_env_test.go"
|
|
file_path: "functions/infra/audit_ml_env.go"
|
|
params:
|
|
- name: registryRoot
|
|
desc: "Ruta absoluta a la raiz del fn_registry. Se usa para localizar python/.venv/bin/python3 y probar paquetes instalados."
|
|
output: "MlEnvReport con Gpus (puede estar vacio si no hay NVIDIA), Checks con estado por herramienta/paquete, OverallOK y GeneratedAt (unix timestamp)."
|
|
---
|
|
|
|
## Checks realizados
|
|
|
|
| Check | Tipo | Critico |
|
|
|---|---|---|
|
|
| `nvidia_smi` | binary in PATH | no (ok si hay GPU) |
|
|
| `nvcc` | CUDA toolkit version | no |
|
|
| `python_venv` | exists + `python3 --version` | si |
|
|
| `torch` | `import torch; __version__` | si |
|
|
| `diffusers` | `import diffusers; __version__` | si |
|
|
| `transformers` | `import transformers; __version__` | si |
|
|
| `huggingface_hub` | `import huggingface_hub; __version__` | si |
|
|
| `stable_diffusion_cpp_python` | `import stable_diffusion_cpp` | no (opcional) |
|
|
| `sd_cli` | `sd --version` in PATH | no (opcional) |
|
|
| `llama_cpp` | `llama-cli --version` in PATH | no (opcional) |
|
|
| `imagegen_vault` | `~/vaults/imagegen_models` exists | no |
|
|
|
|
## Ejemplo
|
|
|
|
```go
|
|
root := "/home/lucas/fn_registry"
|
|
report, err := AuditMlEnv(root)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
for _, c := range report.Checks {
|
|
fmt.Printf("%-40s %s %s\n", c.Name, c.Status, c.Version)
|
|
}
|
|
fmt.Printf("OverallOK: %v\n", report.OverallOK)
|
|
```
|
|
|
|
## Notas
|
|
|
|
- Cada check tiene un timeout de 5 segundos para no bloquear la auditoria en entornos sin GPU.
|
|
- `stable_diffusion_cpp_python`, `sd_cli` y `llama_cpp` son opcionales: si estan missing, `OverallOK` no se ve afectado.
|
|
- `OverallOK` requiere al menos 1 GPU NVIDIA detectada via `GetGpuInfo()` y que todos los checks criticos esten en ok/warning.
|
|
- No escribe nada en disco. Read-only.
|
|
- Se expone como `fn doctor ml` via cmd/fn/doctor.go.
|