feat: metrics_agent inicial (collect+format+push host metrics a VictoriaMetrics)

This commit is contained in:
Egutierrez
2026-06-07 12:45:40 +02:00
commit ec0423f481
9 changed files with 353 additions and 0 deletions
+8
View File
@@ -0,0 +1,8 @@
# Binario compilado
/metrics_agent
/metrics_agent_*
*.arm64
*.amd64
# Config real con secretos (solo se versiona agent.example.json)
/agent.json
+7
View File
@@ -0,0 +1,7 @@
{
"node": "CAMBIAME",
"hub_url": "https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus",
"user": "fleet",
"pass": "PON_AQUI_pass_show_fleet_ingest-pass",
"interval_sec": 15
}
+95
View File
@@ -0,0 +1,95 @@
---
name: metrics_agent
lang: go
domain: infra
version: 0.1.0
description: "Agente de monitorización por nodo: recolecta métricas de host (CPU/RAM/swap/disco/red/temp/procesos) y las empuja a VictoriaMetrics en formato Prometheus con basic auth."
tags: [fleet-metrics, monitoring, daemon]
uses_functions:
- collect_host_metrics_go_infra
- format_prom_exposition_go_infra
- push_prom_remote_go_infra
uses_types:
- PromSample_go_infra
framework: ""
entry_point: "main.go"
dir_path: "projects/fleet_monitoring/apps/metrics_agent"
repo_url: "https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/metrics_agent"
---
# metrics_agent
Agente ligero que corre en cada equipo de la flota. En un bucle de intervalo fijo recolecta
métricas de sistema y las empuja al hub central (VictoriaMetrics en magnus). Es el componente
por nodo del proyecto `fleet_monitoring`.
## Qué hace
Compone tres funciones del registry (grupo `fleet-metrics`), no reimplementa nada:
1. `collect_host_metrics_go_infra` — lee CPU (global + por core), memoria, swap, disco (uso +
I/O), red (por interfaz), temperaturas (best-effort) y top procesos, devolviendo `[]PromSample`.
2. `format_prom_exposition_go_infra` — serializa los samples en texto formato Prometheus exposition.
3. `push_prom_remote_go_infra` — hace POST del texto al endpoint de ingesta con basic auth,
añadiendo la label `instance=<node>` a todas las series vía `extra_label`.
## Por qué no lleva el tag `service`
Es un daemon, pero no encaja en el modelo `service:`/`services_monitor` (un endpoint HTTP con
health check monitorizado por SSH). El agente se replica en N nodos y su liveness la vigila el
propio sistema de monitorización: si un nodo deja de empujar, su serie `up` se vuelve stale y
salta la alerta. Por eso se etiqueta `daemon` y no `service`.
## Configuración
Config por archivo JSON (`-config`) con override por variables de entorno. Campos:
| campo / env | descripción | default |
|---|---|---|
| `node` / `FLEET_NODE` | valor de la label `instance` | hostname |
| `hub_url` / `FLEET_HUB_URL` | URL completa de ingesta (`…/api/v1/import/prometheus`) | — (obligatorio) |
| `user` / `FLEET_USER` | usuario basic-auth | "" |
| `pass` / `FLEET_PASS` | password basic-auth | "" |
| `interval_sec` / `FLEET_INTERVAL` | periodo de push en segundos | 15 |
## Ejemplo
```bash
# Build
cd projects/fleet_monitoring/apps/metrics_agent
go build -o metrics_agent .
# Push único de prueba (lee config + empuja una vez y sale)
FLEET_NODE=lucas \
FLEET_HUB_URL="https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus" \
FLEET_USER=fleet \
FLEET_PASS="$(pass show fleet/ingest-pass | head -1)" \
./metrics_agent -once
# Bucle continuo con archivo de config
./metrics_agent -config /etc/fleet-agent/agent.json
```
## Cuándo usarlo
Despliégalo en cualquier máquina nueva que quieras ver en Grafana: copia el binario, escribe
`/etc/fleet-agent/agent.json` con su `node` y los secretos, instala el unit systemd y arranca.
No hay que tocar el hub central.
## Cross-compilación (para layla / Termux arm64)
gopsutil es Go puro, así que cross-compila sin CGO:
```bash
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -o metrics_agent_arm64 .
```
## Gotchas
- `hub_url` debe ser la URL **completa** incluyendo `/api/v1/import/prometheus`.
- El push lleva la label `instance` vía `extra_label`; no la pongas tú en las métricas.
- Las temperaturas son best-effort: en VPS y en Android/Termux puede no haber sensores y el
grupo `node_temp_celsius` simplemente se omite.
- El binario importa el paquete `fn-registry/functions/infra` completo (vía `replace` al
registry), por lo que arrastra las dependencias de ese paquete. El linker elimina el código
no usado, pero el árbol de compilación es grande.
+65
View File
@@ -0,0 +1,65 @@
package main
import (
	"encoding/json"
	"fmt"
	"os"
	"strconv"
)
// Config is the agent's runtime configuration. Values come from an optional
// JSON file and may be overridden through environment variables, so the same
// binary can be deployed unchanged to many nodes (for example via systemd
// drop-ins).
type Config struct {
	// Node is the value of the "instance" label attached to every series.
	Node string `json:"node"`
	// HubURL is the full ingest URL, e.g. https://metrics-…/api/v1/import/prometheus.
	HubURL string `json:"hub_url"`
	// User is the basic-auth user; an empty value disables auth.
	User string `json:"user"`
	// Pass is the basic-auth password.
	Pass string `json:"pass"`
	// IntervalSec is the push period in seconds (default 15).
	IntervalSec int `json:"interval_sec"`
}
// defaultConfig builds the baseline configuration: the node name is the
// machine hostname and the push interval is 15 seconds.
func defaultConfig() Config {
	var cfg Config
	// The hostname lookup error is deliberately discarded; Node keeps
	// whatever name os.Hostname returned.
	cfg.Node, _ = os.Hostname()
	cfg.IntervalSec = 15
	return cfg
}
// loadConfig builds the effective configuration in three layers: the built-in
// defaults, then the JSON file at path (skipped when path is empty), then
// environment overrides. Recognised env vars: FLEET_NODE, FLEET_HUB_URL,
// FLEET_USER, FLEET_PASS, FLEET_INTERVAL.
//
// An empty env var counts as unset. FLEET_INTERVAL is applied only when it
// parses as a positive integer; any other value is ignored. After all layers
// are applied, a non-positive interval falls back to 15 seconds and an empty
// node name falls back to the hostname.
//
// On failure it returns the zero Config and an error describing which step on
// the config file failed; the underlying error is wrapped with %w, so it
// remains reachable through errors.Is / errors.As.
func loadConfig(path string) (Config, error) {
	cfg := defaultConfig()

	if path != "" {
		raw, err := os.ReadFile(path)
		if err != nil {
			// The error from os.ReadFile already carries the path.
			return Config{}, fmt.Errorf("reading config file: %w", err)
		}
		if err := json.Unmarshal(raw, &cfg); err != nil {
			// JSON errors do not mention the file, so name it here.
			return Config{}, fmt.Errorf("parsing config file %q: %w", path, err)
		}
	}

	if v := os.Getenv("FLEET_NODE"); v != "" {
		cfg.Node = v
	}
	if v := os.Getenv("FLEET_HUB_URL"); v != "" {
		cfg.HubURL = v
	}
	if v := os.Getenv("FLEET_USER"); v != "" {
		cfg.User = v
	}
	if v := os.Getenv("FLEET_PASS"); v != "" {
		cfg.Pass = v
	}
	if v := os.Getenv("FLEET_INTERVAL"); v != "" {
		// Lenient on purpose: a malformed or non-positive value keeps the
		// interval from the file or the default instead of failing startup.
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			cfg.IntervalSec = n
		}
	}

	// The file may have overwritten the defaults with zero values.
	if cfg.IntervalSec <= 0 {
		cfg.IntervalSec = 15
	}
	if cfg.Node == "" {
		// Best effort: the lookup error is deliberately discarded.
		cfg.Node, _ = os.Hostname()
	}
	return cfg, nil
}
+28
View File
@@ -0,0 +1,28 @@
module metrics_agent
go 1.25.0
require fn-registry v0.0.0
require (
github.com/creack/pty v1.1.24 // indirect
github.com/ebitengine/purego v0.10.0 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
github.com/mattn/go-sqlite3 v1.14.44 // indirect
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
github.com/shirou/gopsutil/v4 v4.26.5 // indirect
github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e // indirect
github.com/tklauser/go-sysconf v0.3.16 // indirect
github.com/tklauser/numcpus v0.11.0 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
golang.org/x/crypto v0.51.0 // indirect
golang.org/x/sync v0.20.0 // indirect
golang.org/x/sys v0.44.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
nhooyr.io/websocket v1.8.17 // indirect
)
replace fn-registry => ../../../../
+50
View File
@@ -0,0 +1,50 @@
github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s=
github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU=
github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8=
github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
github.com/shirou/gopsutil/v4 v4.26.5 h1:RPcBXkpz7kOj9PqGFQOlBPZHsyaPvPVQc098y9RmCNM=
github.com/shirou/gopsutil/v4 v4.26.5/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ=
github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e h1:MRM5ITcdelLK2j1vwZ3Je0FKVCfqOLp5zO6trqMLYs0=
github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e/go.mod h1:XV66xRDqSt+GTGFMVlhk3ULuV0y9ZmzeVGR4mloJI3M=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
nhooyr.io/websocket v1.8.17 h1:KEVeLJkUywCKVsnLIDlD/5gtayKp8VoCkksHCGGfT9Y=
nhooyr.io/websocket v1.8.17/go.mod h1:rN9OFWIUwuxg4fR5tELlYC04bXYowCP9GX47ivo2l+c=
+85
View File
@@ -0,0 +1,85 @@
// Command metrics_agent collects host metrics and pushes them to a
// VictoriaMetrics / Prometheus-compatible ingest endpoint on a fixed interval.
//
// It is the per-node component of the fleet_monitoring project and is meant to
// run as a systemd service on every machine of the fleet. It does no work of
// its own beyond orchestration: the actual capability comes from three registry
// functions in fn-registry/functions/infra:
//
// - CollectHostMetrics -> []infra.PromSample (CPU/mem/swap/disk/net/temp/procs)
// - FormatPromExposition -> Prometheus exposition text
// - PushPromRemote -> POST the text with optional basic auth + extra labels
//
// The "instance" label is attached at push time so a single binary, configured
// only with its node name and the hub endpoint, identifies itself in Grafana.
package main
import (
"flag"
"log"
"os"
"os/signal"
"syscall"
"time"
"fn-registry/functions/infra"
)
// main parses the command-line flags, loads the configuration and then either
// performs a single push (-once) or pushes immediately and again on every
// interval tick until SIGINT or SIGTERM arrives.
func main() {
	var (
		cfgFile = flag.String("config", "", "path to JSON config file")
		single  = flag.Bool("once", false, "collect and push a single time, then exit (useful for testing)")
	)
	flag.Parse()

	cfg, err := loadConfig(*cfgFile)
	switch {
	case err != nil:
		log.Fatalf("config: %v", err)
	case cfg.HubURL == "":
		log.Fatal("config: hub_url is required (set it in the config file or via FLEET_HUB_URL)")
	}

	log.Printf("metrics_agent starting: node=%q hub=%q interval=%ds", cfg.Node, cfg.HubURL, cfg.IntervalSec)

	// Test mode: one cycle, and a failure terminates the process.
	if *single {
		if err := pushOnce(cfg); err != nil {
			log.Fatalf("push: %v", err)
		}
		return
	}

	period := time.Duration(cfg.IntervalSec) * time.Second
	ticker := time.NewTicker(period)
	defer ticker.Stop()

	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)

	// In loop mode a failed cycle is only logged; the next tick tries again.
	cycle := func() {
		if err := pushOnce(cfg); err != nil {
			log.Printf("push error: %v", err)
		}
	}

	// First push happens right away so a freshly started node shows up
	// without waiting for a full interval.
	cycle()
	for {
		select {
		case <-sigs:
			log.Print("shutting down")
			return
		case <-ticker.C:
			cycle()
		}
	}
}
// pushOnce performs one collect -> format -> push cycle and logs the number
// of samples on success. An error from collection or from the push is
// returned unchanged.
func pushOnce(cfg Config) error {
	collected, err := infra.CollectHostMetrics()
	if err != nil {
		return err
	}

	// The formatter receives the current Unix time in milliseconds.
	nowMs := time.Now().UnixMilli()
	payload := infra.FormatPromExposition(collected, nowMs)

	// The node name is handed to the push call as an extra "instance" label
	// rather than being set on the individual samples.
	extra := map[string]string{"instance": cfg.Node}
	err = infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, payload, extra)
	if err != nil {
		return err
	}

	log.Printf("pushed %d samples", len(collected))
	return nil
}
View File
+15
View File
@@ -0,0 +1,15 @@
[Unit]
Description=Fleet metrics agent (fleet_monitoring)
Documentation=https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/metrics_agent
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
ExecStart=/opt/fleet-agent/metrics_agent -config /etc/fleet-agent/agent.json
Restart=always
RestartSec=10
NoNewPrivileges=true
[Install]
WantedBy=multi-user.target