commit 92da0c0b0a45e68aaf3e46794cc1d7c00ea2cddb Author: Egutierrez Date: Sun Jun 7 20:23:21 2026 +0200 feat: unibus_exporter — daemon que sondea /healthz del cluster unibus y empuja estado+posture a VictoriaMetrics Compone parse_unibus_health + format_prom_exposition + push_prom_remote del registry (grupo fleet-metrics). Un solo exporter scrapea los 3 nodos por IP pública con la CA del cluster; labels node/instance por serie. Config JSON con secretos fuera de argv. Incluye systemd unit y unibus.example.json. Co-Authored-By: Claude Opus 4.8 (1M context) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..48ec162 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +# Binarios compilados +/unibus_exporter +/unibus_exporter_* +/dist/ + +# Config real con secretos y la CA del cluster (solo se versiona unibus.example.json) +/unibus.json +/ca.crt +*.crt +!*.example.* +registry.db diff --git a/app.md b/app.md new file mode 100644 index 0000000..349078a --- /dev/null +++ b/app.md @@ -0,0 +1,120 @@ +--- +name: unibus_exporter +lang: go +domain: infra +version: 0.1.0 +description: "Exporter del cluster de mensajería unibus: sondea el /healthz de cada nodo por TLS (CA del cluster) en un bucle y empuja a VictoriaMetrics métricas de estado del cluster y posture (up/down, enforce/acl/tls/cluster, store-kv, cluster_size) sin instrumentar el bus." +tags: [fleet-metrics, unibus, monitoring, daemon] +uses_functions: + - parse_unibus_health_go_infra + - format_prom_exposition_go_infra + - push_prom_remote_go_infra +uses_types: + - PromSample_go_infra +framework: "" +entry_point: "main.go" +dir_path: "projects/fleet_monitoring/apps/unibus_exporter" +repo_url: "https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/unibus_exporter" +--- + +# unibus_exporter + +Daemon que monitoriza el cluster de mensajería **unibus** (NATS + JetStream, desplegado como +3 nodos: magnus, homer, datardos) y lo hace visible en Grafana junto al resto de la flota. 
Es +parte del project `fleet_monitoring`. + +No instrumenta el bus: solo **lee** el endpoint público de salud de cada nodo +(`GET https://{ip}:8470/healthz`, verificado con la CA del cluster) y traduce su respuesta a +métricas Prometheus que empuja a VictoriaMetrics. Así un único exporter (corriendo en magnus, +que alcanza los 3 nodos por IP pública) cubre todo el cluster sin tocar la configuración de los +nodos. + +## Qué hace + +Compone tres funciones del registry (grupo `fleet-metrics`), no reimplementa nada: + +1. `parse_unibus_health_go_infra` — convierte el JSON de `/healthz` de un nodo en `[]PromSample` + con labels `node`/`instance`. +2. `format_prom_exposition_go_infra` — serializa los samples a texto formato Prometheus exposition. +3. `push_prom_remote_go_infra` — hace POST del texto a VictoriaMetrics, añadiendo la label común + `job=unibus_exporter` vía `extra_label`. + +Como un solo exporter scrapea varios nodos, las labels `node` e `instance` se fijan **por serie** +(en el parser) y no vía `extra_label`, que aplicaría un único valor a todo el lote. 
+ +## Métricas que produce + +| Serie | Labels | Significado | +|---|---|---| +| `unibus_up` | node, instance | 1 si el nodo respondió `/healthz`, 0 si falló el GET/parseo | +| `unibus_status_ok` | node, instance | 1 si `status=="ok"` | +| `unibus_posture_enforce` | node, instance | posture: enforcement de auth (1/0) | +| `unibus_posture_acl` | node, instance | posture: ACL de subjects (1/0) | +| `unibus_posture_tls` | node, instance | posture: TLS del transporte (1/0) | +| `unibus_posture_cluster` | node, instance | posture: modo cluster activo (1/0) | +| `unibus_store_kv` | node, instance | 1 si el backend de store es `kv` (JetStream KV) | +| `unibus_scrape_error` | node, instance | 1 si el scrape de ese nodo falló | +| `unibus_scrape_duration_seconds` | node, instance | latencia del GET `/healthz` | +| `unibus_cluster_size` | (global) | número de nodos configurados (los vivos = `sum(unibus_up)`) | + +## Por qué no lleva el tag `service` + +Es un daemon, pero igual que `metrics_agent` su liveness no se vigila con un health check propio +por SSH (modelo `service:`/`services_monitor`): si el exporter cae, las series `unibus_*` se +vuelven stale y eso es la señal. Por eso se etiqueta `daemon`. + +## Configuración + +Config por archivo JSON (`-config`). 
Campos en `unibus.example.json`: + +| campo | descripción | default | +|---|---|---| +| `nodes[]` | lista de `{name, url}` por nodo (url = `/healthz` completo) | — (obligatorio) | +| `ca_cert_path` | PEM de la CA del cluster unibus, para verificar el TLS de cada nodo | — (obligatorio) | +| `hub_url` | endpoint de ingesta de VictoriaMetrics (`…/api/v1/import/prometheus`) | — (obligatorio) | +| `user` / `pass` | basic-auth del hub (vacío si el hub es local sin auth) | "" | +| `interval_sec` | periodo de scrape+push en segundos | 15 | +| `timeout_sec` | timeout del GET `/healthz` por nodo | 8 | +| `labels` | labels comunes añadidas vía `extra_label` | `{"job":"unibus_exporter"}` | + +Overrides por entorno: `UNIBUS_HUB_URL`, `UNIBUS_USER`, `UNIBUS_PASS`, `UNIBUS_CA_CERT`, +`UNIBUS_INTERVAL`. Los secretos viven solo en el archivo de config (chmod 600), nunca en argv. + +## Ejemplo + +```bash +cd projects/fleet_monitoring/apps/unibus_exporter +go build -o unibus_exporter . + +# Scrape+push único de prueba (lee config y sale) +./unibus_exporter -config /etc/unibus-exporter/unibus.json -once + +# Bucle continuo (lo que hace el servicio systemd) +./unibus_exporter -config /etc/unibus-exporter/unibus.json +``` + +## Cuando usarla + +Despliégalo en una máquina que alcance los 3 nodos del cluster (hoy magnus, por IP pública). No +hay que tocar los nodos de unibus: el exporter solo lee su `/healthz`. El dashboard +`hub/dashboards/unibus-cluster.json` visualiza estas series. + +## Deploy + +Desde la raíz del project: `./hub/deploy_unibus_exporter.sh magnus om`. Compila el binario, sube +binario + CA del cluster a `/opt/unibus-exporter` + `/etc/unibus-exporter`, escribe la config con +el endpoint local de VM (`http://127.0.0.1:8428/...`, sin auth porque corre en el hub) e instala +el servicio systemd. + +## Gotchas + +- La CA del cluster es **secreta** (gitignored): no se versiona en el repo. El deploy la sube al + nodo a `/etc/unibus-exporter/ca.crt` (chmod 600). 
// Node is a single unibus cluster member to probe. Name is the logical node
// label ("magnus", "homer", "datardos"); URL is its full /healthz endpoint
// (e.g. https://135.125.201.30:8470/healthz).
type Node struct {
	Name string `json:"name"`
	URL  string `json:"url"`
}

// Config is the exporter runtime configuration, read from a JSON file. It holds
// the list of unibus nodes to probe, the cluster CA used to verify their TLS,
// the VictoriaMetrics ingest endpoint and its basic-auth credentials.
//
// Secrets (pass) live only in this file (chmod 600) or come from the environment
// — never in argv. The example config carries a placeholder, not the real one.
type Config struct {
	// Nodes lists the unibus members to probe.
	Nodes []Node `json:"nodes"`
	// CACertPath is the PEM CA of the unibus cluster, used to verify each node's TLS.
	CACertPath string `json:"ca_cert_path"`
	// HubURL is the VictoriaMetrics import endpoint (…/api/v1/import/prometheus).
	HubURL string `json:"hub_url"`
	// User is the basic-auth user for the hub (empty disables auth).
	User string `json:"user"`
	// Pass is the basic-auth password for the hub.
	Pass string `json:"pass"`
	// IntervalSec is the scrape+push period in seconds (default 15).
	IntervalSec int `json:"interval_sec"`
	// TimeoutSec is the per-node healthz GET timeout in seconds (default 8).
	TimeoutSec int `json:"timeout_sec"`
	// Labels are extra labels added to every series via extra_label
	// (e.g. {"job":"unibus_exporter"}).
	Labels map[string]string `json:"labels"`
}

// Defaults applied when the config file or environment leaves a value unset
// or sets it to a non-positive number.
const (
	defaultIntervalSec = 15
	defaultTimeoutSec  = 8
	defaultJobLabel    = "unibus_exporter"
)

// loadConfig reads and validates the JSON config at path. An empty path skips
// the file and relies on defaults plus the environment.
//
// Environment overrides (applied only when non-empty): UNIBUS_HUB_URL,
// UNIBUS_USER, UNIBUS_PASS, UNIBUS_CA_CERT, UNIBUS_INTERVAL. An invalid
// UNIBUS_INTERVAL is reported on stderr and ignored, so the value from the file
// (or the default) stays in effect instead of being replaced by a half-parsed
// number.
//
// On any error the returned Config is the zero value; callers must not use it.
func loadConfig(path string) (Config, error) {
	cfg := Config{IntervalSec: defaultIntervalSec, TimeoutSec: defaultTimeoutSec}
	if path != "" {
		raw, err := os.ReadFile(path)
		if err != nil {
			return Config{}, fmt.Errorf("read config: %w", err)
		}
		if err := json.Unmarshal(raw, &cfg); err != nil {
			return Config{}, fmt.Errorf("parse config %q: %w", path, err)
		}
	}

	stringOverrides := []struct {
		env string
		dst *string
	}{
		{"UNIBUS_HUB_URL", &cfg.HubURL},
		{"UNIBUS_USER", &cfg.User},
		{"UNIBUS_PASS", &cfg.Pass},
		{"UNIBUS_CA_CERT", &cfg.CACertPath},
	}
	for _, o := range stringOverrides {
		if v := os.Getenv(o.env); v != "" {
			*o.dst = v
		}
	}

	if v := os.Getenv("UNIBUS_INTERVAL"); v != "" {
		n, err := parsePositiveInt(v)
		if err != nil {
			// Keep running with the configured interval, but make the rejected
			// override visible instead of dropping it silently.
			fmt.Fprintf(os.Stderr, "unibus_exporter: ignoring UNIBUS_INTERVAL=%q: %v\n", v, err)
		} else {
			cfg.IntervalSec = n
		}
	}

	// Non-positive values from the file fall back to the defaults.
	if cfg.IntervalSec <= 0 {
		cfg.IntervalSec = defaultIntervalSec
	}
	if cfg.TimeoutSec <= 0 {
		cfg.TimeoutSec = defaultTimeoutSec
	}

	// Every pushed series must carry a job label; an explicit one wins.
	if cfg.Labels == nil {
		cfg.Labels = map[string]string{}
	}
	if _, ok := cfg.Labels["job"]; !ok {
		cfg.Labels["job"] = defaultJobLabel
	}

	if err := validate(cfg); err != nil {
		return Config{}, err
	}
	return cfg, nil
}

// validate checks that the mandatory fields are present: hub_url, at least one
// node, ca_cert_path, and a name and url for every node. Node names must also
// be unique, because the name becomes the node/instance label of every series
// for that node and duplicates would collide.
func validate(cfg Config) error {
	if cfg.HubURL == "" {
		return fmt.Errorf("hub_url is required")
	}
	if len(cfg.Nodes) == 0 {
		return fmt.Errorf("at least one node is required")
	}
	if cfg.CACertPath == "" {
		return fmt.Errorf("ca_cert_path is required (PEM CA of the unibus cluster)")
	}
	seen := make(map[string]struct{}, len(cfg.Nodes))
	for i, n := range cfg.Nodes {
		if n.Name == "" || n.URL == "" {
			return fmt.Errorf("node[%d]: name and url are required", i)
		}
		if _, dup := seen[n.Name]; dup {
			return fmt.Errorf("node[%d]: duplicate name %q", i, n.Name)
		}
		seen[n.Name] = struct{}{}
	}
	return nil
}

// parsePositiveInt parses s as a decimal integer greater than zero.
// Surrounding whitespace is tolerated, but any other trailing input is
// rejected, so values such as "15s", "15m" or "1.5" are errors rather than
// being truncated to their leading digits.
func parsePositiveInt(s string) (int, error) {
	var (
		n    int
		rest string
	)
	// The extra %s operand only succeeds when something follows the integer.
	// For a clean integer Sscanf stops after one item with an EOF-style error,
	// which is the expected outcome here.
	scanned, err := fmt.Sscanf(s, "%d%s", &n, &rest)
	if scanned == 0 {
		return 0, fmt.Errorf("parse %q: %w", s, err)
	}
	if scanned == 2 {
		return 0, fmt.Errorf("unexpected trailing input %q", rest)
	}
	if n <= 0 {
		return 0, fmt.Errorf("not positive")
	}
	return n, nil
}
h1:08sCNb52WyoAwi2QDyzUCTgcvVFhUzewun7wtTfvcwE= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= +github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= +github.com/mattn/go-sqlite3 v1.14.44 h1:3VSe+xafpbzsLbdr2AWlAZk9yRHiBhTBakioXaCKTF8= +github.com/mattn/go-sqlite3 v1.14.44/go.mod h1:pjEuOr8IwzLJP2MfGeTb0A35jauH+C2kbHKBr7yXKVQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= +github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= 
+github.com/shirou/gopsutil/v4 v4.26.5 h1:RPcBXkpz7kOj9PqGFQOlBPZHsyaPvPVQc098y9RmCNM= +github.com/shirou/gopsutil/v4 v4.26.5/go.mod h1:LZ6ewCSkBqUpvSOf+LsTGnRinC6iaNUNMGBtDkJBaLQ= +github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e h1:MRM5ITcdelLK2j1vwZ3Je0FKVCfqOLp5zO6trqMLYs0= +github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e/go.mod h1:XV66xRDqSt+GTGFMVlhk3ULuV0y9ZmzeVGR4mloJI3M= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= +golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 
h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +nhooyr.io/websocket v1.8.17 h1:KEVeLJkUywCKVsnLIDlD/5gtayKp8VoCkksHCGGfT9Y= +nhooyr.io/websocket v1.8.17/go.mod h1:rN9OFWIUwuxg4fR5tELlYC04bXYowCP9GX47ivo2l+c= diff --git a/main.go b/main.go new file mode 100644 index 0000000..d015adf --- /dev/null +++ b/main.go @@ -0,0 +1,180 @@ +// Command unibus_exporter probes the /healthz endpoint of every node of the +// unibus messaging cluster (NATS+JetStream) on a fixed interval and pushes the +// resulting cluster/posture metrics to a VictoriaMetrics / Prometheus-compatible +// ingest endpoint. +// +// It does NOT instrument the bus: it only reads each node's public /healthz over +// TLS (verified with the cluster CA) and turns the JSON posture into metrics. The +// heavy lifting is three registry functions in fn-registry/functions/infra: +// +// - ParseUnibusHealth -> []infra.PromSample (per-node up/posture/store) +// - FormatPromExposition -> Prometheus exposition text +// - PushPromRemote -> POST the text with basic auth + an extra "job" label +// +// A single exporter scrapes all nodes, so the "node" and "instance" labels are +// attached per series (by ParseUnibusHealth) rather than via the push's +// extra_label, which would apply one value to the whole batch. 
+package main + +import ( + "context" + "crypto/tls" + "crypto/x509" + "flag" + "fmt" + "io" + "log" + "net/http" + "os" + "os/signal" + "syscall" + "time" + + "fn-registry/functions/infra" +) + +func main() { + configPath := flag.String("config", "", "path to JSON config file") + once := flag.Bool("once", false, "scrape and push a single time, then exit (useful for testing)") + flag.Parse() + + cfg, err := loadConfig(*configPath) + if err != nil { + log.Fatalf("config: %v", err) + } + client, err := newClient(cfg) + if err != nil { + log.Fatalf("tls client: %v", err) + } + log.Printf("unibus_exporter starting: nodes=%d hub=%q interval=%ds", len(cfg.Nodes), cfg.HubURL, cfg.IntervalSec) + + if *once { + if err := scrapeAndPush(cfg, client); err != nil { + log.Fatalf("push: %v", err) + } + return + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + stop := make(chan os.Signal, 1) + signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-stop + log.Print("shutting down") + cancel() + }() + + ticker := time.NewTicker(time.Duration(cfg.IntervalSec) * time.Second) + defer ticker.Stop() + + // Push once right away so the cluster shows up immediately, then on each tick. + if err := scrapeAndPush(cfg, client); err != nil { + log.Printf("push error: %v", err) + } + for { + select { + case <-ticker.C: + if err := scrapeAndPush(cfg, client); err != nil { + log.Printf("push error: %v", err) + } + case <-ctx.Done(): + return + } + } +} + +// newClient builds an HTTP client that verifies the unibus nodes' TLS against the +// cluster CA loaded from cfg.CACertPath. We never disable verification: a wrong +// or missing CA must fail loudly, not silently trust the endpoint. 
+func newClient(cfg Config) (*http.Client, error) { + pem, err := os.ReadFile(cfg.CACertPath) + if err != nil { + return nil, fmt.Errorf("read CA %q: %w", cfg.CACertPath, err) + } + pool := x509.NewCertPool() + if !pool.AppendCertsFromPEM(pem) { + return nil, fmt.Errorf("no certificates parsed from CA %q", cfg.CACertPath) + } + return &http.Client{ + Timeout: time.Duration(cfg.TimeoutSec) * time.Second, + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{RootCAs: pool, MinVersion: tls.VersionTLS12}, + }, + }, nil +} + +// scrapeAndPush runs one full cycle: probe every node, build the metric samples +// and push them to VictoriaMetrics in a single request. +func scrapeAndPush(cfg Config, client *http.Client) error { + var samples []infra.PromSample + + // Cluster-wide gauge: the configured cluster size. Live nodes = sum(unibus_up). + samples = append(samples, infra.PromSample{ + Name: "unibus_cluster_size", + Value: float64(len(cfg.Nodes)), + }) + + for _, node := range cfg.Nodes { + samples = append(samples, probeNode(node, client)...) + } + + body := infra.FormatPromExposition(samples, time.Now().UnixMilli()) + if err := infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, body, cfg.Labels); err != nil { + return err + } + log.Printf("pushed %d samples for %d nodes", len(samples), len(cfg.Nodes)) + return nil +} + +// probeNode does a single GET /healthz and turns the result into samples. +// On any failure it emits unibus_up=0 + unibus_scrape_error=1 for the node so a +// down node is visible in Grafana rather than just absent. On success it delegates +// the body parsing to the registry function and adds scrape_error=0. +// Either way it emits unibus_scrape_duration_seconds for the node. 
+func probeNode(node Node, client *http.Client) []infra.PromSample { + labels := map[string]string{"node": node.Name, "instance": node.Name} + start := time.Now() + body, err := getHealth(client, node.URL) + elapsed := time.Since(start).Seconds() + + dur := infra.PromSample{Name: "unibus_scrape_duration_seconds", Labels: labels, Value: elapsed} + + if err != nil { + log.Printf("node %s: scrape error: %v", node.Name, err) + return []infra.PromSample{ + {Name: "unibus_up", Labels: labels, Value: 0}, + {Name: "unibus_scrape_error", Labels: labels, Value: 1}, + dur, + } + } + + samples, perr := infra.ParseUnibusHealth(node.Name, body) + if perr != nil { + log.Printf("node %s: parse error: %v", node.Name, perr) + return []infra.PromSample{ + {Name: "unibus_up", Labels: labels, Value: 0}, + {Name: "unibus_scrape_error", Labels: labels, Value: 1}, + dur, + } + } + samples = append(samples, infra.PromSample{Name: "unibus_scrape_error", Labels: labels, Value: 0}, dur) + return samples +} + +// getHealth performs the HTTP GET and returns the body when the status is 2xx. 
// getHealth performs an HTTP GET on url and returns the response body when the
// status is 2xx.
//
// Errors:
//   - the transport error from client.Get, unchanged;
//   - "status N: ..." for a non-2xx response, carrying at most the first 256
//     bytes of the body, quoted so that newlines or control bytes sent by the
//     remote end cannot break up or forge log lines;
//   - an error when the body cannot be read or is larger than 64 KiB (an
//     oversized body is rejected rather than silently truncated, which would
//     otherwise surface later as a confusing parse error).
func getHealth(client *http.Client, url string) ([]byte, error) {
	const (
		maxBody    = 64 * 1024 // upper bound accepted for a /healthz payload
		maxSnippet = 256       // bytes of an error response quoted in the error
	)

	resp, err := client.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Best effort: the snippet only decorates the error, so a failed read
		// is not worth reporting over the status code itself.
		snippet, _ := io.ReadAll(io.LimitReader(resp.Body, maxSnippet))
		return nil, fmt.Errorf("status %d: %q", resp.StatusCode, snippet)
	}

	// Read one byte past the limit to tell "exactly at the limit" from "too big".
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBody+1))
	if err != nil {
		return nil, fmt.Errorf("read body: %w", err)
	}
	if len(body) > maxBody {
		return nil, fmt.Errorf("body exceeds %d bytes", maxBody)
	}
	return body, nil
}