feat: metrics_agent inicial (collect+format+push host metrics a VictoriaMetrics)
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
# Binario compilado
|
||||
/metrics_agent
|
||||
/metrics_agent_*
|
||||
*.arm64
|
||||
*.amd64
|
||||
|
||||
# Config real con secretos (solo se versiona agent.example.json)
|
||||
/agent.json
|
||||
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"node": "CAMBIAME",
|
||||
"hub_url": "https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus",
|
||||
"user": "fleet",
|
||||
"pass": "PON_AQUI_pass_show_fleet_ingest-pass",
|
||||
"interval_sec": 15
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
---
|
||||
name: metrics_agent
|
||||
lang: go
|
||||
domain: infra
|
||||
version: 0.1.0
|
||||
description: "Agente de monitorización por nodo: recolecta métricas de host (CPU/RAM/swap/disco/red/temp/procesos) y las empuja a VictoriaMetrics en formato Prometheus con basic auth."
|
||||
tags: [fleet-metrics, monitoring, daemon]
|
||||
uses_functions:
|
||||
- collect_host_metrics_go_infra
|
||||
- format_prom_exposition_go_infra
|
||||
- push_prom_remote_go_infra
|
||||
uses_types:
|
||||
- PromSample_go_infra
|
||||
framework: ""
|
||||
entry_point: "main.go"
|
||||
dir_path: "projects/fleet_monitoring/apps/metrics_agent"
|
||||
repo_url: "https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/metrics_agent"
|
||||
---
|
||||
|
||||
# metrics_agent
|
||||
|
||||
Agente ligero que corre en cada equipo de la flota. En un bucle de intervalo fijo recolecta
|
||||
métricas de sistema y las empuja al hub central (VictoriaMetrics en magnus). Es el componente
|
||||
por nodo del project `fleet_monitoring`.
|
||||
|
||||
## Qué hace
|
||||
|
||||
Compone tres funciones del registry (grupo `fleet-metrics`), no reimplementa nada:
|
||||
|
||||
1. `collect_host_metrics_go_infra` — lee CPU (global + por core), memoria, swap, disco (uso +
|
||||
I/O), red (por interfaz), temperaturas (best-effort) y top procesos, devolviendo `[]PromSample`.
|
||||
2. `format_prom_exposition_go_infra` — serializa los samples en texto formato Prometheus exposition.
|
||||
3. `push_prom_remote_go_infra` — hace POST del texto al endpoint de ingesta con basic auth,
|
||||
añadiendo la label `instance=<node>` a todas las series vía `extra_label`.
|
||||
|
||||
## Por qué no lleva el tag `service`
|
||||
|
||||
Es un daemon, pero no encaja en el modelo `service:`/`services_monitor` (un endpoint HTTP con
|
||||
health check monitorizado por SSH). El agente se replica en N nodos y su liveness la vigila el
|
||||
propio sistema de monitorización: si un nodo deja de empujar, su serie `up` se vuelve stale y
|
||||
salta la alerta. Por eso se etiqueta `daemon` y no `service`.
|
||||
|
||||
## Configuración
|
||||
|
||||
Config por archivo JSON (`-config`) con override por variables de entorno. Campos:
|
||||
|
||||
| campo / env | descripción | default |
|
||||
|---|---|---|
|
||||
| `node` / `FLEET_NODE` | valor de la label `instance` | hostname |
|
||||
| `hub_url` / `FLEET_HUB_URL` | URL completa de ingesta (`…/api/v1/import/prometheus`) | — (obligatorio) |
|
||||
| `user` / `FLEET_USER` | usuario basic-auth | "" |
|
||||
| `pass` / `FLEET_PASS` | password basic-auth | "" |
|
||||
| `interval_sec` / `FLEET_INTERVAL` | periodo de push en segundos | 15 |
|
||||
|
||||
## Ejemplo
|
||||
|
||||
```bash
|
||||
# Build
|
||||
cd projects/fleet_monitoring/apps/metrics_agent
|
||||
go build -o metrics_agent .
|
||||
|
||||
# Push único de prueba (lee config + empuja una vez y sale)
|
||||
FLEET_NODE=lucas \
|
||||
FLEET_HUB_URL="https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus" \
|
||||
FLEET_USER=fleet \
|
||||
FLEET_PASS="$(pass show fleet/ingest-pass | head -1)" \
|
||||
./metrics_agent -once
|
||||
|
||||
# Bucle continuo con archivo de config
|
||||
./metrics_agent -config /etc/fleet-agent/agent.json
|
||||
```
|
||||
|
||||
## Cuándo usarlo
|
||||
|
||||
Despliégalo en cualquier máquina nueva que quieras ver en Grafana: copia el binario, escribe
|
||||
`/etc/fleet-agent/agent.json` con su `node` y los secretos, instala el unit systemd y arranca.
|
||||
No hay que tocar el hub central.
|
||||
|
||||
## Cross-compilación (para layla / Termux arm64)
|
||||
|
||||
gopsutil es Go puro, así que cross-compila sin CGO:
|
||||
|
||||
```bash
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -o metrics_agent_arm64 .
|
||||
```
|
||||
|
||||
## Gotchas
|
||||
|
||||
- `hub_url` debe ser la URL **completa** incluyendo `/api/v1/import/prometheus`.
|
||||
- El push lleva la label `instance` vía `extra_label`; no la pongas tú en las métricas.
|
||||
- Las temperaturas son best-effort: en VPS y en Android/Termux puede no haber sensores y el
|
||||
grupo `node_temp_celsius` simplemente se omite.
|
||||
- El binario importa el paquete `fn-registry/functions/infra` completo (vía `replace` al
|
||||
registry), por lo que arrastra las dependencias de ese paquete. El linker elimina el código
|
||||
no usado, pero el árbol de compilación es grande.
|
||||
@@ -0,0 +1,65 @@
|
||||
package main
|
||||
|
||||
import (
	"encoding/json"
	"fmt"
	"os"
	"strconv"
)
|
||||
|
||||
// Config is the runtime configuration of the agent. Values come from an
// optional JSON file and may be overridden through environment variables,
// which suits systemd drop-ins and shipping one binary to many nodes.
type Config struct {
	// Node is the value of the "instance" label attached to every series.
	Node string `json:"node"`

	// HubURL is the full ingest URL, e.g. https://metrics-…/api/v1/import/prometheus.
	HubURL string `json:"hub_url"`

	// User is the basic-auth user (empty disables auth).
	User string `json:"user"`

	// Pass is the basic-auth password.
	Pass string `json:"pass"`

	// IntervalSec is the push period in seconds (default 15).
	IntervalSec int `json:"interval_sec"`
}
|
||||
|
||||
// defaultConfig returns the baseline configuration: the machine hostname as the
|
||||
// node name and a 15-second push interval.
|
||||
func defaultConfig() Config {
|
||||
host, _ := os.Hostname()
|
||||
return Config{Node: host, IntervalSec: 15}
|
||||
}
|
||||
|
||||
// loadConfig reads the JSON file at path (when non-empty) and then applies
|
||||
// environment overrides. Recognised env vars: FLEET_NODE, FLEET_HUB_URL,
|
||||
// FLEET_USER, FLEET_PASS, FLEET_INTERVAL.
|
||||
func loadConfig(path string) (Config, error) {
|
||||
cfg := defaultConfig()
|
||||
if path != "" {
|
||||
b, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return cfg, err
|
||||
}
|
||||
if err := json.Unmarshal(b, &cfg); err != nil {
|
||||
return cfg, err
|
||||
}
|
||||
}
|
||||
if v := os.Getenv("FLEET_NODE"); v != "" {
|
||||
cfg.Node = v
|
||||
}
|
||||
if v := os.Getenv("FLEET_HUB_URL"); v != "" {
|
||||
cfg.HubURL = v
|
||||
}
|
||||
if v := os.Getenv("FLEET_USER"); v != "" {
|
||||
cfg.User = v
|
||||
}
|
||||
if v := os.Getenv("FLEET_PASS"); v != "" {
|
||||
cfg.Pass = v
|
||||
}
|
||||
if v := os.Getenv("FLEET_INTERVAL"); v != "" {
|
||||
if n, err := strconv.Atoi(v); err == nil && n > 0 {
|
||||
cfg.IntervalSec = n
|
||||
}
|
||||
}
|
||||
if cfg.IntervalSec <= 0 {
|
||||
cfg.IntervalSec = 15
|
||||
}
|
||||
if cfg.Node == "" {
|
||||
cfg.Node, _ = os.Hostname()
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
module metrics_agent
|
||||
|
||||
go 1.25.0
|
||||
|
||||
require fn-registry v0.0.0
|
||||
|
||||
require (
|
||||
github.com/creack/pty v1.1.24 // indirect
|
||||
github.com/ebitengine/purego v0.10.0 // indirect
|
||||
github.com/fsnotify/fsnotify v1.7.0 // indirect
|
||||
github.com/go-ole/go-ole v1.2.6 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
|
||||
github.com/mattn/go-sqlite3 v1.14.44 // indirect
|
||||
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
|
||||
github.com/shirou/gopsutil/v4 v4.26.5 // indirect
|
||||
github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e // indirect
|
||||
github.com/tklauser/go-sysconf v0.3.16 // indirect
|
||||
github.com/tklauser/numcpus v0.11.0 // indirect
|
||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
golang.org/x/crypto v0.51.0 // indirect
|
||||
golang.org/x/sync v0.20.0 // indirect
|
||||
golang.org/x/sys v0.44.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
nhooyr.io/websocket v1.8.17 // indirect
|
||||
)
|
||||
|
||||
replace fn-registry => ../../../../
|
||||
@@ -0,0 +1,50 @@
|
||||
github.com/creack/pty v1.1.24 h1:bJrF4RRfyJnbTJqzRLHzcGaZK1NeM5kTC9jGgovnR1s=
|
||||
github.com/creack/pty v1.1.24/go.mod h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU=
|
||||
github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
|
||||
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
|
||||
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
|
||||
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
|
||||
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
|
||||
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
|
||||
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4=
|
||||
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
|
||||
github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8=
|
||||
github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU=
|
||||
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
|
||||
github.com/shirou/gopsutil/v4 v4.26.5 h1:RPcBXkpz7kOj9PqGFQOlBPZHsyaPvPVQc098y9RmCNM=
|
||||
github.com/shirou/gopsutil/v4 v4.26.5/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ=
|
||||
github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e h1:MRM5ITcdelLK2j1vwZ3Je0FKVCfqOLp5zO6trqMLYs0=
|
||||
github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e/go.mod h1:XV66xRDqSt+GTGFMVlhk3ULuV0y9ZmzeVGR4mloJI3M=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA=
|
||||
github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI=
|
||||
github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw=
|
||||
github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ=
|
||||
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
|
||||
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
|
||||
golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI=
|
||||
golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
|
||||
golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
nhooyr.io/websocket v1.8.17 h1:KEVeLJkUywCKVsnLIDlD/5gtayKp8VoCkksHCGGfT9Y=
|
||||
nhooyr.io/websocket v1.8.17/go.mod h1:rN9OFWIUwuxg4fR5tELlYC04bXYowCP9GX47ivo2l+c=
|
||||
@@ -0,0 +1,85 @@
|
||||
// Command metrics_agent collects host metrics and pushes them to a
|
||||
// VictoriaMetrics / Prometheus-compatible ingest endpoint on a fixed interval.
|
||||
//
|
||||
// It is the per-node component of the fleet_monitoring project and is meant to
|
||||
// run as a systemd service on every machine of the fleet. It does no work of
|
||||
// its own beyond orchestration: the actual capability comes from three registry
|
||||
// functions in fn-registry/functions/infra:
|
||||
//
|
||||
// - CollectHostMetrics -> []infra.PromSample (CPU/mem/swap/disk/net/temp/procs)
|
||||
// - FormatPromExposition -> Prometheus exposition text
|
||||
// - PushPromRemote -> POST the text with optional basic auth + extra labels
|
||||
//
|
||||
// The "instance" label is attached at push time so a single binary, configured
|
||||
// only with its node name and the hub endpoint, identifies itself in Grafana.
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"fn-registry/functions/infra"
|
||||
)
|
||||
|
||||
func main() {
|
||||
configPath := flag.String("config", "", "path to JSON config file")
|
||||
once := flag.Bool("once", false, "collect and push a single time, then exit (useful for testing)")
|
||||
flag.Parse()
|
||||
|
||||
cfg, err := loadConfig(*configPath)
|
||||
if err != nil {
|
||||
log.Fatalf("config: %v", err)
|
||||
}
|
||||
if cfg.HubURL == "" {
|
||||
log.Fatal("config: hub_url is required (set it in the config file or via FLEET_HUB_URL)")
|
||||
}
|
||||
log.Printf("metrics_agent starting: node=%q hub=%q interval=%ds", cfg.Node, cfg.HubURL, cfg.IntervalSec)
|
||||
|
||||
if *once {
|
||||
if err := pushOnce(cfg); err != nil {
|
||||
log.Fatalf("push: %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
ticker := time.NewTicker(time.Duration(cfg.IntervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
|
||||
stop := make(chan os.Signal, 1)
|
||||
signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
|
||||
|
||||
// Push once right away so a freshly started node shows up immediately,
|
||||
// then keep pushing on every tick.
|
||||
if err := pushOnce(cfg); err != nil {
|
||||
log.Printf("push error: %v", err)
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
if err := pushOnce(cfg); err != nil {
|
||||
log.Printf("push error: %v", err)
|
||||
}
|
||||
case <-stop:
|
||||
log.Print("shutting down")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// pushOnce runs a single collect -> format -> push cycle.
|
||||
func pushOnce(cfg Config) error {
|
||||
samples, err := infra.CollectHostMetrics()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
body := infra.FormatPromExposition(samples, time.Now().UnixMilli())
|
||||
if err := infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, body, map[string]string{"instance": cfg.Node}); err != nil {
|
||||
return err
|
||||
}
|
||||
log.Printf("pushed %d samples", len(samples))
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
[Unit]
|
||||
Description=Fleet metrics agent (fleet_monitoring)
|
||||
Documentation=https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/metrics_agent
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=/opt/fleet-agent/metrics_agent -config /etc/fleet-agent/agent.json
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
NoNewPrivileges=true
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
Reference in New Issue
Block a user