Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| dfd55dc10a |
@@ -0,0 +1,148 @@
|
|||||||
|
{
|
||||||
|
"uid": "unibus-nats",
|
||||||
|
"title": "unibus — NATS server",
|
||||||
|
"tags": ["fleet", "unibus", "nats", "jetstream"],
|
||||||
|
"timezone": "browser",
|
||||||
|
"schemaVersion": 39,
|
||||||
|
"version": 1,
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": { "from": "now-6h", "to": "now" },
|
||||||
|
"templating": { "list": [] },
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Nodos NATS up",
|
||||||
|
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_up[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "green" }, "noValue": "n/d (8222 cerrado)" }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", "textMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Conexiones (cluster)",
|
||||||
|
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_connections[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area", "textMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Msgs/s entrada (cluster)",
|
||||||
|
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "sum(rate(nats_msgs_in_total[5m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "cps", "color": { "mode": "fixed", "fixedColor": "purple" } }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area", "textMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Slow consumers",
|
||||||
|
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_slow_consumers[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "JetStream msgs (cluster)",
|
||||||
|
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "sum(last_over_time(nats_jetstream_messages[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", "textMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Reinicios nats-server (1h)",
|
||||||
|
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "sum(changes(nats_server_start_seconds[1h]))", "legendFormat": "reinicios", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 1 } ] }, "noValue": "0" }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 7,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Mensajes/s por nodo (in / out)",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "rate(nats_msgs_in_total[5m])", "legendFormat": "{{node}} · in", "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
|
||||||
|
{ "refId": "B", "expr": "rate(nats_msgs_out_total[5m])", "legendFormat": "{{node}} · out", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
|
||||||
|
],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "cps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
|
||||||
|
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 8,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Conexiones por nodo",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "nats_connections", "legendFormat": "{{node}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 20, "showPoints": "never", "lineWidth": 2, "lineInterpolation": "stepAfter" } }, "overrides": [] },
|
||||||
|
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 9,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "KV bucket msgs (por bucket)",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "max by (bucket) (kv_bucket_msgs)", "legendFormat": "{{bucket}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2, "lineInterpolation": "stepAfter" } }, "overrides": [] },
|
||||||
|
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 10,
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memoria nats-server por nodo",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "nats_mem_bytes", "legendFormat": "{{node}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"fieldConfig": { "defaults": { "unit": "bytes", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
|
||||||
|
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 11,
|
||||||
|
"type": "table",
|
||||||
|
"title": "Leader RAFT por stream",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [ { "refId": "A", "expr": "nats_jetstream_raft_leader == 1", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "organize", "options": { "excludeByName": { "Time": true, "instance": true, "__name__": true, "job": true, "Value": true }, "renameByName": { "stream": "Stream", "node": "Leader" }, "indexByName": { "stream": 0, "node": 1 } } }
|
||||||
|
],
|
||||||
|
"fieldConfig": { "defaults": { "custom": { "align": "left" }, "noValue": "n/d (sin métricas NATS o sin quorum)" }, "overrides": [] },
|
||||||
|
"options": { "showHeader": true }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 12,
|
||||||
|
"type": "table",
|
||||||
|
"title": "JetStream por nodo",
|
||||||
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
|
||||||
|
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "streams", "expr": "last_over_time(nats_jetstream_streams[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
|
||||||
|
{ "refId": "messages", "expr": "last_over_time(nats_jetstream_messages[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
|
||||||
|
{ "refId": "bytes", "expr": "last_over_time(nats_jetstream_bytes[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
|
||||||
|
{ "refId": "subs", "expr": "last_over_time(nats_subscriptions[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
|
||||||
|
],
|
||||||
|
"transformations": [
|
||||||
|
{ "id": "joinByField", "options": { "byField": "node", "mode": "outer" } },
|
||||||
|
{ "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "instance": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "__name__": true, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "job": true, "job 1": true, "job 2": true, "job 3": true, "job 4": true }, "renameByName": { "node": "Nodo", "Value #streams": "streams", "Value #messages": "messages", "Value #bytes": "bytes", "Value #subs": "subs" } } }
|
||||||
|
],
|
||||||
|
"fieldConfig": { "defaults": { "custom": { "align": "center" } }, "overrides": [ { "matcher": { "id": "byName", "options": "bytes" }, "properties": [ { "id": "unit", "value": "bytes" } ] } ] },
|
||||||
|
"options": { "showHeader": true }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
Executable
+94
@@ -0,0 +1,94 @@
|
|||||||
|
#!/usr/bin/env bash
# Deploys unibus_exporter in LOCAL NATS MODE on one node of the unibus cluster.
#
# Unlike healthz mode (a single instance on magnus that probes the 3 nodes by
# public IP), the NATS monitoring endpoint (8222) is loopback-only and
# unauthenticated, so this mode runs ON EACH node and reads its own
# 127.0.0.1:8222. That is why this script is run once per node.
#
# Usage: ./deploy_unibus_nats_exporter.sh <node_name> <ssh_host>
#   node_name : logical label of the node (magnus | homer | datardos)
#   ssh_host  : SSH alias of the node (~/.ssh/config). magnus is usually `om`.
#
# Requirements:
#   - Go installed locally (builds the amd64 binary).
#   - `pass fleet/ingest-pass` (ingest password, only for nodes != magnus).
#   - sudo -n (passwordless) on the remote host.
#   - bash on the remote host (the install step is piped into `bash -s`).
#
# CRITICAL PRECONDITION: the node must run unibus 0.11.0+ started with
# UNIBUS_NATS_MONITOR=1 (drop-in membershipd-cluster.service.d/nats-monitor.conf),
# otherwise /varz will not respond and the exporter will emit nats_up=0. This
# script does NOT restart membershipd or touch the cluster — that belongs to the
# consolidated rollout of the 0.11.0 binary.
set -euo pipefail

NODE="${1:?uso: deploy_unibus_nats_exporter.sh <node_name> <ssh_host>}"
HOST="${2:?uso: deploy_unibus_nats_exporter.sh <node_name> <ssh_host>}"

ROOT="$(cd "$(dirname "$0")/.." && pwd)"   # projects/fleet_monitoring
APP="$ROOT/apps/unibus_exporter"

# magnus IS the hub: it pushes to the local VM without auth. Every other node
# pushes to the public ingest endpoint with basic auth (same as metrics_agent).
#
# The ingest user lives in VMUSER, not USER: USER is the login-name environment
# variable and is normally exported, so overwriting it would leak a bogus value
# into every child process (pass, go, scp, ssh).
if [ "$NODE" = "magnus" ]; then
  HUB="http://127.0.0.1:8428/api/v1/import/prometheus"
  VMUSER=""
  PW=""
else
  HUB="https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus"
  VMUSER="fleet"
  # Keep only the first line of the entry. Done with parameter expansion rather
  # than `| head -1`, which under `pipefail` can abort the script with SIGPIPE
  # when the entry has more lines than head consumes.
  PW="$(pass show fleet/ingest-pass)"
  PW="${PW%%$'\n'*}"
  if [ -z "$PW" ]; then
    echo "!! pass fleet/ingest-pass devolvió una contraseña vacía" >&2
    exit 1
  fi
fi

echo ">> compilando unibus_exporter (linux/amd64)"
mkdir -p "$APP/dist"
( cd "$APP" && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o dist/unibus_exporter_amd64 . )
BIN="$APP/dist/unibus_exporter_amd64"

echo ">> subiendo binario a $HOST"
scp -q -o BatchMode=yes "$BIN" "$HOST:/tmp/unibus_exporter"

echo ">> instalando servicio NATS-local en $NODE ($HOST)"
# The remote script is fed to `bash -s` over stdin, preceded by shell-quoted
# (%q) assignments for the values it needs. Passing them on the ssh command
# line instead would expose the ingest password in the process list on both
# ends and would break (or inject commands) if a value contained a quote.
{
  printf 'NODE=%q\nHUB=%q\nVMUSER=%q\nPW=%q\n' "$NODE" "$HUB" "$VMUSER" "$PW"
  cat <<'OUTER'
set -e

# Escapes backslashes and double quotes so the value can be embedded inside a
# JSON string literal without producing invalid JSON.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}
NODE_JSON=$(json_escape "$NODE")
HUB_JSON=$(json_escape "$HUB")
VMUSER_JSON=$(json_escape "$VMUSER")
PW_JSON=$(json_escape "$PW")

sudo -n mkdir -p /opt/unibus-exporter /etc/unibus-exporter
sudo -n mv /tmp/unibus_exporter /opt/unibus-exporter/unibus_exporter
sudo -n chmod 755 /opt/unibus-exporter/unibus_exporter

# Create the config with mode 600 BEFORE the secret is written, so it is never
# readable by other users, not even for the instant between tee and chmod.
sudo -n install -m 600 /dev/null /etc/unibus-exporter/unibus.nats.json
sudo -n tee /etc/unibus-exporter/unibus.nats.json >/dev/null <<JSON
{
  "nats_monitor": { "enabled": true, "node": "${NODE_JSON}", "base_url": "http://127.0.0.1:8222" },
  "hub_url": "${HUB_JSON}",
  "user": "${VMUSER_JSON}",
  "pass": "${PW_JSON}",
  "interval_sec": 15,
  "timeout_sec": 8,
  "labels": { "job": "unibus_exporter" }
}
JSON
sudo -n chmod 600 /etc/unibus-exporter/unibus.nats.json

sudo -n tee /etc/systemd/system/unibus-exporter-nats.service >/dev/null <<'UNIT'
[Unit]
Description=unibus NATS local exporter (fleet_monitoring)
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
ExecStart=/opt/unibus-exporter/unibus_exporter -config /etc/unibus-exporter/unibus.nats.json
Restart=always
RestartSec=10
NoNewPrivileges=true

[Install]
WantedBy=multi-user.target
UNIT

sudo -n systemctl daemon-reload
sudo -n systemctl enable unibus-exporter-nats >/dev/null 2>&1
sudo -n systemctl restart unibus-exporter-nats
sleep 3

# Informational only: report the service state and whether the local NATS
# monitoring port answers; neither check fails the deployment.
echo -n "status: "; systemctl is-active unibus-exporter-nats || true
echo "== nats_up local =="
curl -fsS "http://127.0.0.1:8222/varz" >/dev/null 2>&1 && echo "8222 OK (monitoring abierto)" || echo "8222 NO responde — falta UNIBUS_NATS_MONITOR=1 en membershipd (nats_up=0 hasta el rollout)"
OUTER
} | ssh -o BatchMode=yes "$HOST" bash -s

echo ">> $NODE: unibus_exporter (modo NATS local) desplegado"
echo ">> recuerda: nats_up=0 hasta que el nodo corra unibus 0.11.0+ con UNIBUS_NATS_MONITOR=1"
|
||||||
Reference in New Issue
Block a user