From 1803833b5097f40e09b29ba0a3ea602effe9cab9 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 7 Jun 2026 20:26:04 +0200 Subject: [PATCH] =?UTF-8?q?feat(hub):=20monitorizaci=C3=B3n=20del=20cluste?= =?UTF-8?q?r=20unibus=20=E2=80=94=20dashboard=20+=20deploy=20del=20exporte?= =?UTF-8?q?r?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - hub/dashboards/unibus-cluster.json: dashboard 'unibus — Cluster' (carpeta Fleet, datasource VictoriaMetrics): nodos up, cluster_size, nodos caídos, posture homogénea segura, matriz de posture por nodo (enforce/acl/tls/cluster/store-kv), latencia de scrape y tabla de estado por nodo. Panel meta-leader preparado (n/d sin métricas NATS). - hub/deploy_unibus_exporter.sh: compila el exporter, sube binario + CA del cluster a magnus e instala el servicio systemd apuntando a la VictoriaMetrics local. El exporter (apps/unibus_exporter, sub-repo Gitea propio) compone parse_unibus_health + format_prom_exposition + push_prom_remote del registry. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- hub/dashboards/unibus-cluster.json | 119 +++++++++++++++++++++++++++++ hub/deploy_unibus_exporter.sh | 84 ++++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 hub/dashboards/unibus-cluster.json create mode 100755 hub/deploy_unibus_exporter.sh diff --git a/hub/dashboards/unibus-cluster.json b/hub/dashboards/unibus-cluster.json new file mode 100644 index 0000000..ee01944 --- /dev/null +++ b/hub/dashboards/unibus-cluster.json @@ -0,0 +1,119 @@ +{ + "uid": "unibus-cluster", + "title": "unibus — Cluster", + "tags": ["fleet", "unibus"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { "from": "now-6h", "to": "now" }, + "templating": { "list": [] }, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Nodos up", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "sum(last_over_time(unibus_up[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "green" } }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", "textMode": "value" } + }, + { + "id": 2, + "type": "stat", + "title": "Cluster size (esperado)", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "max(last_over_time(unibus_cluster_size[5m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", 
"textMode": "value" } + }, + { + "id": 3, + "type": "stat", + "title": "Nodos caídos", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "max(last_over_time(unibus_cluster_size[5m])) - sum(last_over_time(unibus_up[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "id": 4, + "type": "stat", + "title": "Posture homogénea segura", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "min(last_over_time(unibus_posture_enforce[2m])) * min(last_over_time(unibus_posture_acl[2m])) * min(last_over_time(unibus_posture_tls[2m])) * min(last_over_time(unibus_posture_cluster[2m])) * min(last_over_time(unibus_store_kv[2m]))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "mappings": [ { "type": "value", "options": { "0": { "text": "DEGRADADA", "color": "red" }, "1": { "text": "OK (enforce+acl+tls+cluster+kv)", "color": "green" } } } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" } + }, + { + "id": 5, + "type": "stat", + "title": "Meta-leader", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + 
"targets": [ { "refId": "A", "expr": "unibus_meta_leader == 1", "legendFormat": "{{node}}", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "purple" }, "noValue": "n/d (requiere métricas NATS)" }, "overrides": [] }, + "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "none", "graphMode": "none", "textMode": "name" } + }, + { + "id": 6, + "type": "timeseries", + "title": "up / down por nodo", + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "unibus_up", "legendFormat": "{{node}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "none", "min": 0, "max": 1, "custom": { "drawStyle": "line", "fillOpacity": 25, "showPoints": "never", "lineWidth": 2, "lineInterpolation": "stepAfter", "spanNulls": false } }, "overrides": [] }, + "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull"] }, "tooltip": { "mode": "multi", "sort": "desc" } } + }, + { + "id": 7, + "type": "state-timeline", + "title": "Matriz de posture por nodo (enforce / acl / tls / cluster / store-kv)", + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 4 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ + { "refId": "A", "expr": "unibus_posture_enforce", "legendFormat": "{{node}} · enforce", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "B", "expr": "unibus_posture_acl", "legendFormat": "{{node}} · acl", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "C", "expr": "unibus_posture_tls", "legendFormat": "{{node}} · tls", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "D", "expr": 
"unibus_posture_cluster", "legendFormat": "{{node}} · cluster", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "E", "expr": "unibus_store_kv", "legendFormat": "{{node}} · store-kv", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } + ], + "fieldConfig": { "defaults": { "custom": { "fillOpacity": 80, "lineWidth": 0 }, "mappings": [ { "type": "value", "options": { "0": { "text": "off", "color": "red" }, "1": { "text": "on", "color": "green" } } }, { "type": "special", "options": { "match": "null", "result": { "text": "no reporta", "color": "dark-red" } } } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } }, "overrides": [] }, + "options": { "showValue": "auto", "alignValue": "center", "mergeValues": true, "rowHeight": 0.9, "legend": { "displayMode": "list", "placement": "bottom" } } + }, + { + "id": 8, + "type": "timeseries", + "title": "Latencia de scrape healthz por nodo", + "gridPos": { "h": 7, "w": 12, "x": 0, "y": 11 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ { "refId": "A", "expr": "unibus_scrape_duration_seconds", "legendFormat": "{{node}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ], + "fieldConfig": { "defaults": { "unit": "s", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] }, + "options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } } + }, + { + "id": 9, + "type": "table", + "title": "Estado actual por nodo", + "gridPos": { "h": 7, "w": 12, "x": 12, "y": 13 }, + "datasource": { "type": "prometheus", "uid": "victoriametrics" }, + "targets": [ + { "refId": "up", "expr": "last_over_time(unibus_up[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } 
}, + { "refId": "enforce", "expr": "last_over_time(unibus_posture_enforce[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "acl", "expr": "last_over_time(unibus_posture_acl[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "tls", "expr": "last_over_time(unibus_posture_tls[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } }, + { "refId": "kv", "expr": "last_over_time(unibus_store_kv[2m])", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } } + ], + "transformations": [ + { "id": "joinByField", "options": { "byField": "node", "mode": "outer" } }, + { "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "instance": true, "instance 1": true, "instance 2": true, "instance 3": true, "instance 4": true, "instance 5": true, "__name__": true, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 4": true, "__name__ 5": true, "job": true, "job 1": true, "job 2": true, "job 3": true, "job 4": true, "job 5": true }, "renameByName": { "node": "Nodo", "Value #up": "up", "Value #enforce": "enforce", "Value #acl": "acl", "Value #tls": "tls", "Value #kv": "store-kv" } } } + ], + "fieldConfig": { "defaults": { "custom": { "align": "center" }, "mappings": [ { "type": "value", "options": { "0": { "text": "✗", "color": "red" }, "1": { "text": "✓", "color": "green" } } } ] }, "overrides": [] }, + "options": { "showHeader": true } + } + ] +} diff --git a/hub/deploy_unibus_exporter.sh b/hub/deploy_unibus_exporter.sh new file mode 100755 index 0000000..40b67a9 --- /dev/null +++ b/hub/deploy_unibus_exporter.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# Despliega unibus_exporter en un nodo (por defecto magnus, el hub) como servicio +# 
systemd. El exporter sondea el /healthz de los 3 nodos del cluster unibus por +# IP pública y empuja métricas de estado/posture a la VictoriaMetrics local. +# +# Usage: ./deploy_unibus_exporter.sh [node_name] [ssh_host] +# node_name : logical name of the host where the exporter runs (default magnus) +# ssh_host : SSH alias of that host (default om) +# +# Requirements: +# - Go installed locally (builds the amd64 binary). +# - The unibus cluster CA at projects/message_bus/apps/unibus/deploy/tls/ca.crt (or the copy at cluster/out//ca.crt — said to be identical; NOTE(review): a path component after out/ seems to be missing here, confirm). +# - Passwordless sudo (sudo -n) on the remote host. +# NOTE(review): the remote install section below looks truncated in this copy — the `tee` for unibus.json is followed directly by the systemd unit heredoc, so the config body and the unit-file destination are not visible; confirm against the original script before running. +set -euo pipefail + +NODE="${1:-magnus}" +HOST="${2:-om}" + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" # projects/fleet_monitoring +APP="$ROOT/apps/unibus_exporter" +CA="$ROOT/../message_bus/apps/unibus/deploy/tls/ca.crt" +HUB="http://127.0.0.1:8428/api/v1/import/prometheus" # local VictoriaMetrics on the hub (no auth) + +[ -f "$CA" ] || { echo "ERROR: falta la CA del cluster en $CA"; exit 1; } + +echo ">> compilando unibus_exporter (linux/amd64)" +mkdir -p "$APP/dist" +( cd "$APP" && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags="-s -w" -o dist/unibus_exporter_amd64 . 
) +BIN="$APP/dist/unibus_exporter_amd64" + +echo ">> subiendo binario + CA a $HOST" +scp -q -o BatchMode=yes "$BIN" "$HOST:/tmp/unibus_exporter" +scp -q -o BatchMode=yes "$CA" "$HOST:/tmp/unibus_ca.crt" + +echo ">> instalando servicio en $NODE ($HOST)" +ssh -o BatchMode=yes "$HOST" "NODE='$NODE' HUB='$HUB' bash -s" <<'OUTER' +set -e +sudo -n mkdir -p /opt/unibus-exporter /etc/unibus-exporter +sudo -n mv /tmp/unibus_exporter /opt/unibus-exporter/unibus_exporter +sudo -n chmod 755 /opt/unibus-exporter/unibus_exporter +sudo -n mv /tmp/unibus_ca.crt /etc/unibus-exporter/ca.crt +sudo -n chmod 644 /etc/unibus-exporter/ca.crt +sudo -n tee /etc/unibus-exporter/unibus.json >/dev/null </dev/null <<'UNIT' +[Unit] +Description=unibus cluster exporter (fleet_monitoring) +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +ExecStart=/opt/unibus-exporter/unibus_exporter -config /etc/unibus-exporter/unibus.json +Restart=always +RestartSec=10 +NoNewPrivileges=true + +[Install] +WantedBy=multi-user.target +UNIT +sudo -n systemctl daemon-reload +sudo -n systemctl enable unibus-exporter >/dev/null 2>&1 +sudo -n systemctl restart unibus-exporter +sleep 3 +echo -n "status: "; systemctl is-active unibus-exporter +OUTER + +echo ">> $NODE: unibus_exporter desplegado"