From dfd55dc10a2065699aced9e14c0d207d68d06b57 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 7 Jun 2026 21:19:42 +0200 Subject: [PATCH] feat(hub): dashboard + deploy del scraper NATS server-level del cluster unibus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the unibus-nats Grafana dashboard and the per-node deploy script for the unibus_exporter NATS-local mode, which surfaces the server-level NATS/JetStream metrics that /healthz cannot (msgs/s, connections, KV bucket msgs, RAFT leader per stream, memory, restarts). - hub/dashboards/unibus-nats.json (uid unibus-nats, datasource victoriametrics): 12 panels — NATS up, connections, msgs/s in, slow consumers, JetStream msgs, nats-server restarts (1h); msgs/s per node (in/out), connections per node, KV bucket msgs per bucket, nats-server memory per node; RAFT leader per stream table, JetStream per node table. Picked up automatically by the existing "fleet" dashboard provider (scans the dashboards path). - hub/deploy_unibus_nats_exporter.sh: deploys unibus_exporter in NATS-local mode on a node. magnus pushes to the local VictoriaMetrics (no auth); homer/datardos push to the public ingest endpoint with basic auth (password read from pass fleet/ingest-pass and kept out of the exporter's own argv, which only receives -config; note that the deploy script interpolates it into the ssh remote command string, so it is visible in the ssh and remote-shell argv while the deploy runs). Installs unibus-exporter-nats.service and probes 8222 to warn if the monitoring endpoint is not yet open. Does NOT restart membershipd or touch the cluster — that is the consolidated 0.11.0 rollout. The scraper code (unibus_exporter NATS-local mode) and the registry parser (parse_nats_monitor_go_infra) ship in their own repos; see report unibus-nats-metrics-clean for the integration order. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- hub/dashboards/unibus-nats.json | 148 +++++++++++++++++++++++++++++ hub/deploy_unibus_nats_exporter.sh | 94 ++++++++++++++++++ 2 files changed, 242 insertions(+) create mode 100644 hub/dashboards/unibus-nats.json create mode 100755 hub/deploy_unibus_nats_exporter.sh diff --git a/hub/dashboards/unibus-nats.json b/hub/dashboards/unibus-nats.json new file mode 100644 index 0000000..17d5147 --- /dev/null +++ b/hub/dashboards/unibus-nats.json @@ -0,0 +1,148 @@ +{ + "uid": "unibus-nats", + "title": "unibus — NATS server", + "tags": ["fleet", "unibus", "nats", "jetstream"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { "from": "now-6h", "to": "now" }, + "templating": { "list": [] }, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Nodos NATS up", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_up[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "green" }, "noValue": "n/d (8222 cerrado)" }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", "textMode": "value" } + }, + { + "id": 2, + "type": "stat", + "title": "Conexiones (cluster)", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_connections[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, 
"colorMode": "value", "graphMode": "area", "textMode": "value" } + }, + { + "id": 3, + "type": "stat", + "title": "Msgs/s entrada (cluster)", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "sum(rate(nats_msgs_in_total[5m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "cps", "color": { "mode": "fixed", "fixedColor": "purple" } }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area", "textMode": "value" } + }, + { + "id": 4, + "type": "stat", + "title": "Slow consumers", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_slow_consumers[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "id": 5, + "type": "stat", + "title": "JetStream msgs (cluster)", + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_jetstream_messages[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", 
"textMode": "value" } + }, + { + "id": 6, + "type": "stat", + "title": "Reinicios nats-server (1h)", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "sum(changes(nats_server_start_seconds[1h]))", "legendFormat": "reinicios", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 } ] }, "noValue": "0" }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "id": 7, + "type": "timeseries", + "title": "Mensajes/s por nodo (in / out)", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ + { "refId": "A", "expr": "rate(nats_msgs_in_total[5m])", "legendFormat": "{{node}} · in", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "B", "expr": "rate(nats_msgs_out_total[5m])", "legendFormat": "{{node}} · out", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } + ], + "fieldConfig": { "defaults": { "unit": "cps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] }, + "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } } + }, + { + "id": 8, + "type": "timeseries", + "title": "Conexiones por nodo", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "nats_connections", "legendFormat": "{{node}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } 
} ], + "fieldConfig": { "defaults": { "unit": "none", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 20, "showPoints": "never", "lineWidth": 2, "lineInterpolation": "stepAfter" } }, "overrides": [] }, + "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } } + }, + { + "id": 9, + "type": "timeseries", + "title": "KV bucket msgs (por bucket)", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "max by (bucket) (kv_bucket_msgs)", "legendFormat": "{{bucket}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2, "lineInterpolation": "stepAfter" } }, "overrides": [] }, + "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull"] }, "tooltip": { "mode": "multi", "sort": "desc" } } + }, + { + "id": 10, + "type": "timeseries", + "title": "Memoria nats-server por nodo", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "nats_mem_bytes", "legendFormat": "{{node}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "bytes", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] }, + "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } } + }, + { + "id": 11, + "type": "table", + "title": "Leader RAFT por stream", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + 
"targets": [ { "refId": "A", "expr": "nats_jetstream_raft_leader == 1", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "transformations": [ + { "id": "organize", "options": { "excludeByName": { "Time": true, "instance": true, "__name__": true, "job": true, "Value": true }, "renameByName": { "stream": "Stream", "node": "Leader" }, "indexByName": { "stream": 0, "node": 1 } } } + ], + "fieldConfig": { "defaults": { "custom": { "align": "left" }, "noValue": "n/d (sin métricas NATS o sin quorum)" }, "overrides": [] }, + "options": { "showHeader": true } + }, + { + "id": 12, + "type": "table", + "title": "JetStream por nodo", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ + { "refId": "streams", "expr": "last_over_time(nats_jetstream_streams[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "messages", "expr": "last_over_time(nats_jetstream_messages[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "bytes", "expr": "last_over_time(nats_jetstream_bytes[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "subs", "expr": "last_over_time(nats_subscriptions[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } } + ], + "transformations": [ + { "id": "joinByField", "options": { "byField": "node", "mode": "outer" } }, + { "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "instance": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "__name__": true, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "job": true, "job 1": 
true, "job 2": true, "job 3": true, "job 4": true }, "renameByName": { "node": "Nodo", "Value #streams": "streams", "Value #messages": "messages", "Value #bytes": "bytes", "Value #subs": "subs" } } } + ], + "fieldConfig": { "defaults": { "custom": { "align": "center" } }, "overrides": [ { "matcher": { "id": "byName", "options": "bytes" }, "properties": [ { "id": "unit", "value": "bytes" } ] } ] }, + "options": { "showHeader": true } + } + ] +} diff --git a/hub/deploy_unibus_nats_exporter.sh b/hub/deploy_unibus_nats_exporter.sh new file mode 100755 index 0000000..020d40c --- /dev/null +++ b/hub/deploy_unibus_nats_exporter.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# Despliega unibus_exporter en MODO NATS LOCAL en un nodo del cluster unibus. +# +# A diferencia del modo healthz (una sola instancia en magnus que sondea los 3 +# nodos por IP pública), el endpoint de monitoring de NATS (8222) es loopback-only +# y sin auth, así que este modo corre EN CADA nodo y lee su propio 127.0.0.1:8222. +# Por eso este script se ejecuta una vez por nodo. +# +# Uso: ./deploy_unibus_nats_exporter.sh +# node_name : etiqueta lógica del nodo (magnus | homer | datardos) +# ssh_host : alias SSH del nodo (~/.ssh/config). magnus suele ser `om`. +# +# Requisitos: +# - Go instalado localmente (compila el binario amd64). +# - `pass fleet/ingest-pass` (password de ingesta, solo para nodos != magnus). +# - sudo -n (sin password) en el host remoto. +# +# PRECONDICIÓN CRÍTICA: el nodo debe correr unibus 0.11.0+ arrancado con +# UNIBUS_NATS_MONITOR=1 (drop-in membershipd-cluster.service.d/nats-monitor.conf), +# o /varz no responderá y el exporter emitirá nats_up=0. Este script NO reinicia +# membershipd ni toca el cluster — eso es el rollout consolidado del binario 0.11.0. +set -euo pipefail + +NODE="${1:?uso: deploy_unibus_nats_exporter.sh }" +HOST="${2:?uso: deploy_unibus_nats_exporter.sh }" + +ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" # projects/fleet_monitoring +APP="$ROOT/apps/unibus_exporter" + +# magnus ES el hub: empuja a la VM local sin auth. El resto empuja al endpoint +# público de ingesta con basic auth (mismo que metrics_agent). +if [ "$NODE" = "magnus" ]; then + HUB="http://127.0.0.1:8428/api/v1/import/prometheus" + USER="" + PW="" +else + HUB="https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus" + USER="fleet" + PW="$(pass show fleet/ingest-pass | head -1)" +fi + +echo ">> compilando unibus_exporter (linux/amd64)" +mkdir -p "$APP/dist" +( cd "$APP" && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o dist/unibus_exporter_amd64 . ) +BIN="$APP/dist/unibus_exporter_amd64" + +echo ">> subiendo binario a $HOST" +scp -q -o BatchMode=yes "$BIN" "$HOST:/tmp/unibus_exporter" + +echo ">> instalando servicio NATS-local en $NODE ($HOST)" +ssh -o BatchMode=yes "$HOST" "NODE='$NODE' HUB='$HUB' VMUSER='$USER' PW='$PW' bash -s" <<'OUTER' +set -e +sudo -n mkdir -p /opt/unibus-exporter /etc/unibus-exporter +sudo -n mv /tmp/unibus_exporter /opt/unibus-exporter/unibus_exporter +sudo -n chmod 755 /opt/unibus-exporter/unibus_exporter +sudo -n tee /etc/unibus-exporter/unibus.nats.json >/dev/null </dev/null <<'UNIT' +[Unit] +Description=unibus NATS local exporter (fleet_monitoring) +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +ExecStart=/opt/unibus-exporter/unibus_exporter -config /etc/unibus-exporter/unibus.nats.json +Restart=always +RestartSec=10 +NoNewPrivileges=true + +[Install] +WantedBy=multi-user.target +UNIT +sudo -n systemctl daemon-reload +sudo -n systemctl enable unibus-exporter-nats >/dev/null 2>&1 +sudo -n systemctl restart unibus-exporter-nats +sleep 3 +echo -n "status: "; systemctl is-active unibus-exporter-nats || true +echo "== nats_up local ==" +curl -fsS "http://127.0.0.1:8222/varz" >/dev/null 2>&1 && echo "8222 OK (monitoring abierto)" || echo "8222 NO responde — falta UNIBUS_NATS_MONITOR=1 
en membershipd (nats_up=0 hasta el rollout)" +OUTER + +echo ">> $NODE: unibus_exporter (modo NATS local) desplegado" +echo ">> recuerda: nats_up=0 hasta que el nodo corra unibus 0.11.0+ con UNIBUS_NATS_MONITOR=1"