feat(hub): dashboard + deploy script for the server-level NATS scraper of the unibus cluster

Add the unibus-nats Grafana dashboard and the per-node deploy script for the
unibus_exporter NATS-local mode, which surfaces the server-level NATS/JetStream
metrics that /healthz cannot expose (msgs/s, connections, KV bucket msgs, RAFT
leader per stream, memory, restarts).

- hub/dashboards/unibus-nats.json (uid unibus-nats, datasource victoriametrics):
  12 panels — NATS up, connections, msgs/s in, slow consumers, JetStream msgs,
  nats-server restarts (1h); msgs/s per node (in/out), connections per node,
  KV bucket msgs per bucket, nats-server memory per node; RAFT leader per stream
  table, JetStream per node table. Picked up automatically by the existing
  "fleet" dashboard provider, which scans the dashboards path.
- hub/deploy_unibus_nats_exporter.sh: deploys unibus_exporter in NATS-local mode
  on a node. magnus pushes to the local VictoriaMetrics (no auth); homer/datardos
  push to the public ingest endpoint with basic auth (password from
  pass fleet/ingest-pass, never passed in argv). Installs
  unibus-exporter-nats.service and probes port 8222 to warn if the monitoring
  endpoint is not yet open. Does NOT restart membershipd or touch the cluster —
  that is left to the consolidated 0.11.0 rollout.

The scraper code (unibus_exporter NATS-local mode) and the registry parser
(parse_nats_monitor_go_infra) ship in their own repos; see report
unibus-nats-metrics-clean for the integration order.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Egutierrez
2026-06-07 21:19:42 +02:00
parent 1803833b50
commit dfd55dc10a
2 changed files with 242 additions and 0 deletions
+148
View File
@@ -0,0 +1,148 @@
{
"uid": "unibus-nats",
"title": "unibus — NATS server",
"tags": ["fleet", "unibus", "nats", "jetstream"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"refresh": "30s",
"time": { "from": "now-6h", "to": "now" },
"templating": { "list": [] },
"panels": [
{
"id": 1,
"type": "stat",
"title": "Nodos NATS up",
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_up[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "green" }, "noValue": "n/d (8222 cerrado)" }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", "textMode": "value" }
},
{
"id": 2,
"type": "stat",
"title": "Conexiones (cluster)",
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_connections[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area", "textMode": "value" }
},
{
"id": 3,
"type": "stat",
"title": "Msgs/s entrada (cluster)",
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "sum(rate(nats_msgs_in_total[5m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "cps", "color": { "mode": "fixed", "fixedColor": "purple" } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area", "textMode": "value" }
},
{
"id": 4,
"type": "stat",
"title": "Slow consumers",
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_slow_consumers[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "none", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" }
},
{
"id": 5,
"type": "stat",
"title": "JetStream msgs (cluster)",
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_jetstream_messages[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", "textMode": "value" }
},
{
"id": 6,
"type": "stat",
"title": "Reinicios nats-server (1h)",
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "sum(changes(nats_server_start_seconds[1h]))", "legendFormat": "reinicios", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "none", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 } ] }, "noValue": "0" }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" }
},
{
"id": 7,
"type": "timeseries",
"title": "Mensajes/s por nodo (in / out)",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "rate(nats_msgs_in_total[5m])", "legendFormat": "{{node}} · in", "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "B", "expr": "rate(nats_msgs_out_total[5m])", "legendFormat": "{{node}} · out", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "cps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 8,
"type": "timeseries",
"title": "Conexiones por nodo",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "nats_connections", "legendFormat": "{{node}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "none", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 20, "showPoints": "never", "lineWidth": 2, "lineInterpolation": "stepAfter" } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 9,
"type": "timeseries",
"title": "KV bucket msgs (por bucket)",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "max by (bucket) (kv_bucket_msgs)", "legendFormat": "{{bucket}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "none", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2, "lineInterpolation": "stepAfter" } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 10,
"type": "timeseries",
"title": "Memoria nats-server por nodo",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "nats_mem_bytes", "legendFormat": "{{node}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "bytes", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 11,
"type": "table",
"title": "Leader RAFT por stream",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "nats_jetstream_raft_leader == 1", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"transformations": [
{ "id": "organize", "options": { "excludeByName": { "Time": true, "instance": true, "__name__": true, "job": true, "Value": true }, "renameByName": { "stream": "Stream", "node": "Leader" }, "indexByName": { "stream": 0, "node": 1 } } }
],
"fieldConfig": { "defaults": { "custom": { "align": "left" }, "noValue": "n/d (sin métricas NATS o sin quorum)" }, "overrides": [] },
"options": { "showHeader": true }
},
{
"id": 12,
"type": "table",
"title": "JetStream por nodo",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "streams", "expr": "last_over_time(nats_jetstream_streams[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "messages", "expr": "last_over_time(nats_jetstream_messages[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "bytes", "expr": "last_over_time(nats_jetstream_bytes[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "subs", "expr": "last_over_time(nats_subscriptions[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"transformations": [
{ "id": "joinByField", "options": { "byField": "node", "mode": "outer" } },
{ "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "instance": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "__name__": true, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "job": true, "job 1": true, "job 2": true, "job 3": true, "job 4": true }, "renameByName": { "node": "Nodo", "Value #streams": "streams", "Value #messages": "messages", "Value #bytes": "bytes", "Value #subs": "subs" } } }
],
"fieldConfig": { "defaults": { "custom": { "align": "center" } }, "overrides": [ { "matcher": { "id": "byName", "options": "bytes" }, "properties": [ { "id": "unit", "value": "bytes" } ] } ] },
"options": { "showHeader": true }
}
]
}