commit ec0423f4818d5cb5371333c8bed3d107fe9aa336 Author: Egutierrez Date: Sun Jun 7 12:45:40 2026 +0200 feat: metrics_agent inicial (collect+format+push host metrics a VictoriaMetrics) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..93a08c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +# Binario compilado +/metrics_agent +/metrics_agent_* +*.arm64 +*.amd64 + +# Config real con secretos (solo se versiona agent.example.json) +/agent.json diff --git a/agent.example.json b/agent.example.json new file mode 100644 index 0000000..20886ca --- /dev/null +++ b/agent.example.json @@ -0,0 +1,7 @@ +{ + "node": "CAMBIAME", + "hub_url": "https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus", + "user": "fleet", + "pass": "PON_AQUI_pass_show_fleet_ingest-pass", + "interval_sec": 15 +} diff --git a/app.md b/app.md new file mode 100644 index 0000000..343089c --- /dev/null +++ b/app.md @@ -0,0 +1,95 @@ +--- +name: metrics_agent +lang: go +domain: infra +version: 0.1.0 +description: "Agente de monitorización por nodo: recolecta métricas de host (CPU/RAM/swap/disco/red/temp/procesos) y las empuja a VictoriaMetrics en formato Prometheus con basic auth." +tags: [fleet-metrics, monitoring, daemon] +uses_functions: + - collect_host_metrics_go_infra + - format_prom_exposition_go_infra + - push_prom_remote_go_infra +uses_types: + - PromSample_go_infra +framework: "" +entry_point: "main.go" +dir_path: "projects/fleet_monitoring/apps/metrics_agent" +repo_url: "https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/metrics_agent" +--- + +# metrics_agent + +Agente ligero que corre en cada equipo de la flota. En un bucle de intervalo fijo recolecta +métricas de sistema y las empuja al hub central (VictoriaMetrics en magnus). Es el componente +por nodo del project `fleet_monitoring`. + +## Qué hace + +Compone tres funciones del registry (grupo `fleet-metrics`), no reimplementa nada: + +1. 
`collect_host_metrics_go_infra` — lee CPU (global + por core), memoria, swap, disco (uso + + I/O), red (por interfaz), temperaturas (best-effort) y top procesos, devolviendo `[]PromSample`. +2. `format_prom_exposition_go_infra` — serializa los samples en texto formato Prometheus exposition. +3. `push_prom_remote_go_infra` — hace POST del texto al endpoint de ingesta con basic auth, + añadiendo la label `instance=` a todas las series vía `extra_label`. + +## Por qué no lleva el tag `service` + +Es un daemon, pero no encaja en el modelo `service:`/`services_monitor` (un endpoint HTTP con +health check monitorizado por SSH). El agente se replica en N nodos y su liveness la vigila el +propio sistema de monitorización: si un nodo deja de empujar, su serie `up` se vuelve stale y +salta la alerta. Por eso se etiqueta `daemon` y no `service`. + +## Configuración + +Config por archivo JSON (`-config`) con override por variables de entorno. Campos: + +| campo / env | descripción | default | +|---|---|---| +| `node` / `FLEET_NODE` | valor de la label `instance` | hostname | +| `hub_url` / `FLEET_HUB_URL` | URL completa de ingesta (`…/api/v1/import/prometheus`) | — (obligatorio) | +| `user` / `FLEET_USER` | usuario basic-auth | "" | +| `pass` / `FLEET_PASS` | password basic-auth | "" | +| `interval_sec` / `FLEET_INTERVAL` | periodo de push en segundos | 15 | + +## Ejemplo + +```bash +# Build +cd projects/fleet_monitoring/apps/metrics_agent +go build -o metrics_agent . 
// defaultIntervalSec is the push period, in seconds, used when neither the
// config file nor FLEET_INTERVAL provides a positive value.
const defaultIntervalSec = 15

// Config holds the agent runtime configuration. It is read from an optional
// JSON file and can be overridden by environment variables, which is handy for
// systemd drop-ins and for deploying the same binary to many nodes.
type Config struct {
	Node        string `json:"node"`         // value of the "instance" label attached to every series
	HubURL      string `json:"hub_url"`      // full ingest URL, e.g. https://metrics-…/api/v1/import/prometheus
	User        string `json:"user"`         // basic-auth user (empty disables auth)
	Pass        string `json:"pass"`         // basic-auth password
	IntervalSec int    `json:"interval_sec"` // push period in seconds (default 15)
}

// defaultConfig returns the baseline configuration: the machine hostname as the
// node name and the default push interval.
func defaultConfig() Config {
	// A failed hostname lookup is tolerated on purpose: Node is left empty and
	// loadConfig retries the lookup once all overrides have been applied.
	host, _ := os.Hostname()
	return Config{Node: host, IntervalSec: defaultIntervalSec}
}

// loadConfig builds the effective configuration in three layers: built-in
// defaults, then the JSON file at path (skipped when path is empty), then
// environment overrides. Recognised env vars: FLEET_NODE, FLEET_HUB_URL,
// FLEET_USER, FLEET_PASS, FLEET_INTERVAL. Empty variables are treated as
// unset, and a FLEET_INTERVAL that is not a positive integer is ignored.
//
// It returns an error when the file cannot be read or is not valid JSON. In
// that case the returned Config is the zero value, never a half-decoded one,
// so callers cannot accidentally run with partially applied settings.
func loadConfig(path string) (Config, error) {
	cfg := defaultConfig()
	if path != "" {
		raw, err := os.ReadFile(path)
		if err != nil {
			return Config{}, err
		}
		// json.Unmarshal may have written some fields before failing, so the
		// partially filled cfg is discarded on error.
		if err := json.Unmarshal(raw, &cfg); err != nil {
			return Config{}, err
		}
	}

	// String settings: a non-empty environment variable wins over the file.
	overrides := []struct {
		env string
		dst *string
	}{
		{"FLEET_NODE", &cfg.Node},
		{"FLEET_HUB_URL", &cfg.HubURL},
		{"FLEET_USER", &cfg.User},
		{"FLEET_PASS", &cfg.Pass},
	}
	for _, o := range overrides {
		if v := os.Getenv(o.env); v != "" {
			*o.dst = v
		}
	}

	if v := os.Getenv("FLEET_INTERVAL"); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			cfg.IntervalSec = n
		}
	}

	// The file may carry a zero or negative interval, or an empty node name;
	// fall back to the defaults in both cases.
	if cfg.IntervalSec <= 0 {
		cfg.IntervalSec = defaultIntervalSec
	}
	if cfg.Node == "" {
		// Best effort: if the hostname is unavailable Node stays empty.
		cfg.Node, _ = os.Hostname()
	}
	return cfg, nil
}
+ github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect + github.com/shirou/gopsutil/v4 v4.26.5 // indirect + github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e // indirect + github.com/tklauser/go-sysconf v0.3.16 // indirect + github.com/tklauser/numcpus v0.11.0 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect + golang.org/x/crypto v0.51.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.44.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + nhooyr.io/websocket v1.8.17 // indirect +) + +replace fn-registry => ../../../../ diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..43b0997 --- /dev/null +++ b/go.sum @@ -0,0 +1,50 @@ +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod 
h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8= +github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/shirou/gopsutil/v4 v4.26.5 h1:RPcBXkpz7kOj9PqGFQOlBPZHsyaPvPVQc098y9RmCNM= +github.com/shirou/gopsutil/v4 v4.26.5/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= +github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e h1:MRM5ITcdelLK2j1vwZ3Je0FKVCfqOLp5zO6trqMLYs0= +github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e/go.mod h1:XV66xRDqSt+GTGFMVlhk3ULuV0y9ZmzeVGR4mloJI3M= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= 
+golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= +golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +nhooyr.io/websocket v1.8.17 h1:KEVeLJkUywCKVsnLIDlD/5gtayKp8VoCkksHCGGfT9Y= +nhooyr.io/websocket v1.8.17/go.mod h1:rN9OFWIUwuxg4fR5tELlYC04bXYowCP9GX47ivo2l+c= diff --git a/main.go b/main.go new file mode 100644 index 0000000..37685bb --- /dev/null +++ b/main.go @@ -0,0 +1,85 @@ +// Command metrics_agent collects host metrics and pushes them to a +// VictoriaMetrics / Prometheus-compatible ingest endpoint on a fixed interval. +// +// It is the per-node component of the fleet_monitoring project and is meant to +// run as a systemd service on every machine of the fleet. 
It does no work of +// its own beyond orchestration: the actual capability comes from three registry +// functions in fn-registry/functions/infra: +// +// - CollectHostMetrics -> []infra.PromSample (CPU/mem/swap/disk/net/temp/procs) +// - FormatPromExposition -> Prometheus exposition text +// - PushPromRemote -> POST the text with optional basic auth + extra labels +// +// The "instance" label is attached at push time so a single binary, configured +// only with its node name and the hub endpoint, identifies itself in Grafana. +package main + +import ( + "flag" + "log" + "os" + "os/signal" + "syscall" + "time" + + "fn-registry/functions/infra" +) + +func main() { + configPath := flag.String("config", "", "path to JSON config file") + once := flag.Bool("once", false, "collect and push a single time, then exit (useful for testing)") + flag.Parse() + + cfg, err := loadConfig(*configPath) + if err != nil { + log.Fatalf("config: %v", err) + } + if cfg.HubURL == "" { + log.Fatal("config: hub_url is required (set it in the config file or via FLEET_HUB_URL)") + } + log.Printf("metrics_agent starting: node=%q hub=%q interval=%ds", cfg.Node, cfg.HubURL, cfg.IntervalSec) + + if *once { + if err := pushOnce(cfg); err != nil { + log.Fatalf("push: %v", err) + } + return + } + + ticker := time.NewTicker(time.Duration(cfg.IntervalSec) * time.Second) + defer ticker.Stop() + + stop := make(chan os.Signal, 1) + signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM) + + // Push once right away so a freshly started node shows up immediately, + // then keep pushing on every tick. + if err := pushOnce(cfg); err != nil { + log.Printf("push error: %v", err) + } + for { + select { + case <-ticker.C: + if err := pushOnce(cfg); err != nil { + log.Printf("push error: %v", err) + } + case <-stop: + log.Print("shutting down") + return + } + } +} + +// pushOnce runs a single collect -> format -> push cycle. 
+func pushOnce(cfg Config) error { + samples, err := infra.CollectHostMetrics() + if err != nil { + return err + } + body := infra.FormatPromExposition(samples, time.Now().UnixMilli()) + if err := infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, body, map[string]string{"instance": cfg.Node}); err != nil { + return err + } + log.Printf("pushed %d samples", len(samples)) + return nil +} diff --git a/registry.db b/registry.db new file mode 100644 index 0000000..e69de29 diff --git a/systemd/fleet-agent.service b/systemd/fleet-agent.service new file mode 100644 index 0000000..b6176c6 --- /dev/null +++ b/systemd/fleet-agent.service @@ -0,0 +1,15 @@ +[Unit] +Description=Fleet metrics agent (fleet_monitoring) +Documentation=https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/metrics_agent +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +ExecStart=/opt/fleet-agent/metrics_agent -config /etc/fleet-agent/agent.json +Restart=always +RestartSec=10 +NoNewPrivileges=true + +[Install] +WantedBy=multi-user.target