From ec0423f4818d5cb5371333c8bed3d107fe9aa336 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 7 Jun 2026 12:45:40 +0200 Subject: [PATCH] feat: metrics_agent inicial (collect+format+push host metrics a VictoriaMetrics) --- .gitignore | 8 ++++ agent.example.json | 7 +++ app.md | 95 +++++++++++++++++++++++++++++++++++++ config.go | 65 +++++++++++++++++++++++++ go.mod | 28 +++++++++++ go.sum | 50 +++++++++++++++++++ main.go | 85 +++++++++++++++++++++++++++++++++ registry.db | 0 systemd/fleet-agent.service | 15 ++++++ 9 files changed, 353 insertions(+) create mode 100644 .gitignore create mode 100644 agent.example.json create mode 100644 app.md create mode 100644 config.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 main.go create mode 100644 registry.db create mode 100644 systemd/fleet-agent.service diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..93a08c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +# Binario compilado +/metrics_agent +/metrics_agent_* +*.arm64 +*.amd64 + +# Config real con secretos (solo se versiona agent.example.json) +/agent.json diff --git a/agent.example.json b/agent.example.json new file mode 100644 index 0000000..20886ca --- /dev/null +++ b/agent.example.json @@ -0,0 +1,7 @@ +{ + "node": "CAMBIAME", + "hub_url": "https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus", + "user": "fleet", + "pass": "PON_AQUI_pass_show_fleet_ingest-pass", + "interval_sec": 15 +} diff --git a/app.md b/app.md new file mode 100644 index 0000000..343089c --- /dev/null +++ b/app.md @@ -0,0 +1,95 @@ +--- +name: metrics_agent +lang: go +domain: infra +version: 0.1.0 +description: "Agente de monitorización por nodo: recolecta métricas de host (CPU/RAM/swap/disco/red/temp/procesos) y las empuja a VictoriaMetrics en formato Prometheus con basic auth." 
+tags: [fleet-metrics, monitoring, daemon] +uses_functions: + - collect_host_metrics_go_infra + - format_prom_exposition_go_infra + - push_prom_remote_go_infra +uses_types: + - PromSample_go_infra +framework: "" +entry_point: "main.go" +dir_path: "projects/fleet_monitoring/apps/metrics_agent" +repo_url: "https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/metrics_agent" +--- + +# metrics_agent + +Agente ligero que corre en cada equipo de la flota. En un bucle de intervalo fijo recolecta +métricas de sistema y las empuja al hub central (VictoriaMetrics en magnus). Es el componente +por nodo del project `fleet_monitoring`. + +## Qué hace + +Compone tres funciones del registry (grupo `fleet-metrics`), no reimplementa nada: + +1. `collect_host_metrics_go_infra` — lee CPU (global + por core), memoria, swap, disco (uso + + I/O), red (por interfaz), temperaturas (best-effort) y top procesos, devolviendo `[]PromSample`. +2. `format_prom_exposition_go_infra` — serializa los samples en texto formato Prometheus exposition. +3. `push_prom_remote_go_infra` — hace POST del texto al endpoint de ingesta con basic auth, + añadiendo la label `instance=` a todas las series vía `extra_label`. + +## Por qué no lleva el tag `service` + +Es un daemon, pero no encaja en el modelo `service:`/`services_monitor` (un endpoint HTTP con +health check monitorizado por SSH). El agente se replica en N nodos y su liveness la vigila el +propio sistema de monitorización: si un nodo deja de empujar, su serie `up` se vuelve stale y +salta la alerta. Por eso se etiqueta `daemon` y no `service`. + +## Configuración + +Config por archivo JSON (`-config`) con override por variables de entorno. 
Campos: + +| campo / env | descripción | default | +|---|---|---| +| `node` / `FLEET_NODE` | valor de la label `instance` | hostname | +| `hub_url` / `FLEET_HUB_URL` | URL completa de ingesta (`…/api/v1/import/prometheus`) | — (obligatorio) | +| `user` / `FLEET_USER` | usuario basic-auth | "" | +| `pass` / `FLEET_PASS` | password basic-auth | "" | +| `interval_sec` / `FLEET_INTERVAL` | periodo de push en segundos | 15 | + +## Ejemplo + +```bash +# Build +cd projects/fleet_monitoring/apps/metrics_agent +go build -o metrics_agent . + +# Push único de prueba (lee config + empuja una vez y sale) +FLEET_NODE=lucas \ +FLEET_HUB_URL="https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus" \ +FLEET_USER=fleet \ +FLEET_PASS="$(pass show fleet/ingest-pass | head -1)" \ +./metrics_agent -once + +# Bucle continuo con archivo de config +./metrics_agent -config /etc/fleet-agent/agent.json +``` + +## Cuándo usarla + +Despliégalo en cualquier máquina nueva que quieras ver en Grafana: copia el binario, escribe +`/etc/fleet-agent/agent.json` con su `node` y los secretos, instala el unit systemd y arranca. +No hay que tocar el hub central. + +## Cross-compilación (para layla / Termux arm64) + +gopsutil es Go puro, así que cross-compila sin CGO: + +```bash +CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -o metrics_agent_arm64 . +``` + +## Gotchas + +- `hub_url` debe ser la URL **completa** incluyendo `/api/v1/import/prometheus`. +- El push lleva la label `instance` vía `extra_label`; no la pongas tú en las métricas. +- Las temperaturas son best-effort: en VPS y en Android/Termux puede no haber sensores y el + grupo `node_temp_celsius` simplemente se omite. +- El binario importa el paquete `fn-registry/functions/infra` completo (vía `replace` al + registry), por lo que arrastra las dependencias de ese paquete. El linker elimina el código + no usado, pero el árbol de compilación es grande. 
// defaultIntervalSec is the push period, in seconds, used whenever no valid
// positive interval is configured.
const defaultIntervalSec = 15

// Config holds the agent runtime configuration. It is read from an optional
// JSON file and can be overridden by environment variables, which is handy for
// systemd drop-ins and for deploying the same binary to many nodes.
type Config struct {
	Node        string `json:"node"`         // value of the "instance" label attached to every series
	HubURL      string `json:"hub_url"`      // full ingest URL, e.g. https://metrics-…/api/v1/import/prometheus
	User        string `json:"user"`         // basic-auth user (empty disables auth)
	Pass        string `json:"pass"`         // basic-auth password
	IntervalSec int    `json:"interval_sec"` // push period in seconds (default 15)
}

// defaultConfig returns the baseline configuration: the machine hostname as the
// node name and the default push interval.
func defaultConfig() Config {
	// The hostname error is ignored on purpose: an empty Node can still be
	// filled in by the config file or by FLEET_NODE in loadConfig.
	host, _ := os.Hostname()
	return Config{Node: host, IntervalSec: defaultIntervalSec}
}

// loadConfig builds the effective configuration in three layers: the defaults
// from defaultConfig, then the JSON file at path (skipped when path is empty),
// then the environment overrides FLEET_NODE, FLEET_HUB_URL, FLEET_USER,
// FLEET_PASS and FLEET_INTERVAL. Empty environment variables are treated as
// unset.
//
// It returns an error when the file cannot be read or is not valid JSON. In
// that case the returned Config is the zero value, so callers never act on a
// half-populated configuration.
func loadConfig(path string) (Config, error) {
	cfg := defaultConfig()
	if path != "" {
		raw, err := os.ReadFile(path)
		if err != nil {
			return Config{}, err
		}
		if err := json.Unmarshal(raw, &cfg); err != nil {
			return Config{}, err
		}
	}

	// String-valued overrides share the same "non-empty wins" rule.
	overrides := []struct {
		env string
		dst *string
	}{
		{"FLEET_NODE", &cfg.Node},
		{"FLEET_HUB_URL", &cfg.HubURL},
		{"FLEET_USER", &cfg.User},
		{"FLEET_PASS", &cfg.Pass},
	}
	for _, o := range overrides {
		if v := os.Getenv(o.env); v != "" {
			*o.dst = v
		}
	}

	// FLEET_INTERVAL is best-effort: a value that is not a positive integer is
	// ignored and the interval from the file (or the default) is kept.
	if v := os.Getenv("FLEET_INTERVAL"); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			cfg.IntervalSec = n
		}
	}

	// The JSON file may have overwritten the defaults with unusable values
	// (interval <= 0, empty node); restore the defaults in that case.
	if cfg.IntervalSec <= 0 {
		cfg.IntervalSec = defaultIntervalSec
	}
	if cfg.Node == "" {
		// Best-effort, same as defaultConfig: Node stays empty if the lookup fails.
		cfg.Node, _ = os.Hostname()
	}
	return cfg, nil
}
--- /dev/null +++ b/go.sum @@ -0,0 +1,50 @@ +github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s= +github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8= +github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 
h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= +github.com/shirou/gopsutil/v4 v4.26.5 h1:RPcBXkpz7kOj9PqGFQOlBPZHsyaPvPVQc098y9RmCNM= +github.com/shirou/gopsutil/v4 v4.26.5/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= +github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e h1:MRM5ITcdelLK2j1vwZ3Je0FKVCfqOLp5zO6trqMLYs0= +github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e/go.mod h1:XV66xRDqSt+GTGFMVlhk3ULuV0y9ZmzeVGR4mloJI3M= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= +golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= 
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +nhooyr.io/websocket v1.8.17 h1:KEVeLJkUywCKVsnLIDlD/5gtayKp8VoCkksHCGGfT9Y= +nhooyr.io/websocket v1.8.17/go.mod h1:rN9OFWIUwuxg4fR5tELlYC04bXYowCP9GX47ivo2l+c= diff --git a/main.go b/main.go new file mode 100644 index 0000000..37685bb --- /dev/null +++ b/main.go @@ -0,0 +1,85 @@ +// Command metrics_agent collects host metrics and pushes them to a +// VictoriaMetrics / Prometheus-compatible ingest endpoint on a fixed interval. +// +// It is the per-node component of the fleet_monitoring project and is meant to +// run as a systemd service on every machine of the fleet. It does no work of +// its own beyond orchestration: the actual capability comes from three registry +// functions in fn-registry/functions/infra: +// +// - CollectHostMetrics -> []infra.PromSample (CPU/mem/swap/disk/net/temp/procs) +// - FormatPromExposition -> Prometheus exposition text +// - PushPromRemote -> POST the text with optional basic auth + extra labels +// +// The "instance" label is attached at push time so a single binary, configured +// only with its node name and the hub endpoint, identifies itself in Grafana. 
+package main + +import ( + "flag" + "log" + "os" + "os/signal" + "syscall" + "time" + + "fn-registry/functions/infra" +) + +func main() { + configPath := flag.String("config", "", "path to JSON config file") + once := flag.Bool("once", false, "collect and push a single time, then exit (useful for testing)") + flag.Parse() + + cfg, err := loadConfig(*configPath) + if err != nil { + log.Fatalf("config: %v", err) + } + if cfg.HubURL == "" { + log.Fatal("config: hub_url is required (set it in the config file or via FLEET_HUB_URL)") + } + log.Printf("metrics_agent starting: node=%q hub=%q interval=%ds", cfg.Node, cfg.HubURL, cfg.IntervalSec) + + if *once { + if err := pushOnce(cfg); err != nil { + log.Fatalf("push: %v", err) + } + return + } + + ticker := time.NewTicker(time.Duration(cfg.IntervalSec) * time.Second) + defer ticker.Stop() + + stop := make(chan os.Signal, 1) + signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM) + + // Push once right away so a freshly started node shows up immediately, + // then keep pushing on every tick. + if err := pushOnce(cfg); err != nil { + log.Printf("push error: %v", err) + } + for { + select { + case <-ticker.C: + if err := pushOnce(cfg); err != nil { + log.Printf("push error: %v", err) + } + case <-stop: + log.Print("shutting down") + return + } + } +} + +// pushOnce runs a single collect -> format -> push cycle. 
+func pushOnce(cfg Config) error { + samples, err := infra.CollectHostMetrics() + if err != nil { + return err + } + body := infra.FormatPromExposition(samples, time.Now().UnixMilli()) + if err := infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, body, map[string]string{"instance": cfg.Node}); err != nil { + return err + } + log.Printf("pushed %d samples", len(samples)) + return nil +} diff --git a/registry.db b/registry.db new file mode 100644 index 0000000..e69de29 diff --git a/systemd/fleet-agent.service b/systemd/fleet-agent.service new file mode 100644 index 0000000..b6176c6 --- /dev/null +++ b/systemd/fleet-agent.service @@ -0,0 +1,15 @@ +[Unit] +Description=Fleet metrics agent (fleet_monitoring) +Documentation=https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/metrics_agent +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +ExecStart=/opt/fleet-agent/metrics_agent -config /etc/fleet-agent/agent.json +Restart=always +RestartSec=10 +NoNewPrivileges=true + +[Install] +WantedBy=multi-user.target