feat(hub): dashboards Grafana as-code (overview + node detail) + provisioning yaml + deploy_agent.sh

This commit is contained in:
Egutierrez
2026-06-07 12:54:39 +02:00
parent 527742b7c5
commit e87069d366
5 changed files with 390 additions and 0 deletions
+165
View File
@@ -0,0 +1,165 @@
{
"uid": "fleet-node",
"title": "Fleet — Node detail",
"tags": ["fleet"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"refresh": "15s",
"time": { "from": "now-3h", "to": "now" },
"templating": {
"list": [
{
"name": "node",
"label": "Nodo",
"type": "query",
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"query": "label_values(node_uptime_seconds, instance)",
"refresh": 2,
"includeAll": false,
"multi": false,
"sort": 1,
"current": {}
}
]
},
"panels": [
{
"id": 1,
"type": "gauge",
"title": "CPU %",
"gridPos": { "h": 5, "w": 6, "x": 0, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "node_cpu_percent{instance=\"$node\"}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } }
},
{
"id": 2,
"type": "gauge",
"title": "RAM %",
"gridPos": { "h": 5, "w": 6, "x": 6, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "node_mem_used_percent{instance=\"$node\"}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false } }
},
{
"id": 3,
"type": "stat",
"title": "Load 1m",
"gridPos": { "h": 5, "w": 6, "x": 12, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "node_load1{instance=\"$node\"}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area" }
},
{
"id": 4,
"type": "stat",
"title": "Uptime",
"gridPos": { "h": 5, "w": 6, "x": 18, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "node_uptime_seconds{instance=\"$node\"}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "s", "color": { "mode": "fixed", "fixedColor": "purple" } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none" }
},
{
"id": 5,
"type": "timeseries",
"title": "CPU por núcleo",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 5 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "node_cpu_core_percent{instance=\"$node\"}", "legendFormat": "core {{core}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "fillOpacity": 8, "showPoints": "never", "lineWidth": 1 } }, "overrides": [] },
"options": { "legend": { "displayMode": "list", "placement": "bottom" }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 6,
"type": "timeseries",
"title": "Memoria y swap",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 5 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "node_mem_total_bytes{instance=\"$node\"}", "legendFormat": "RAM total", "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "B", "expr": "node_mem_used_bytes{instance=\"$node\"}", "legendFormat": "RAM usada", "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "C", "expr": "node_mem_available_bytes{instance=\"$node\"}", "legendFormat": "RAM disponible", "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "D", "expr": "node_swap_used_bytes{instance=\"$node\"}", "legendFormat": "swap usado", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "bytes", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 8, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull"] }, "tooltip": { "mode": "multi" } }
},
{
"id": 7,
"type": "timeseries",
"title": "Disco usado % por punto de montaje",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 14 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "node_disk_used_percent{instance=\"$node\"}", "legendFormat": "{{mount}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "fillOpacity": 8, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 8,
"type": "timeseries",
"title": "Disco I/O por dispositivo",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 14 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "rate(node_disk_read_bytes{instance=\"$node\"}[2m])", "legendFormat": "{{device}} read", "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "B", "expr": "rate(node_disk_write_bytes{instance=\"$node\"}[2m])", "legendFormat": "{{device}} write", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "Bps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 8, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 9,
"type": "timeseries",
"title": "Red por interfaz",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 23 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "rate(node_net_recv_bytes{instance=\"$node\"}[2m])", "legendFormat": "{{iface}} recv", "datasource": { "type": "prometheus", "uid": "victoriametrics" } },
{ "refId": "B", "expr": "rate(node_net_sent_bytes{instance=\"$node\"}[2m])", "legendFormat": "{{iface}} sent", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "Bps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 8, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 10,
"type": "timeseries",
"title": "Temperaturas (si hay sensores)",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 23 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "node_temp_celsius{instance=\"$node\"}", "legendFormat": "{{sensor}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "celsius", "custom": { "drawStyle": "line", "fillOpacity": 8, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 11,
"type": "table",
"title": "Top procesos por CPU",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 32 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "topk(10, node_proc_cpu_percent{instance=\"$node\"})", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"transformations": [
{ "id": "organize", "options": { "excludeByName": { "Time": true, "instance": true, "__name__": true, "job": true }, "renameByName": { "name": "Proceso", "pid": "PID", "Value": "CPU %" } } }
],
"fieldConfig": { "defaults": {}, "overrides": [ { "matcher": { "id": "byName", "options": "CPU %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "custom.cellOptions", "value": { "type": "gauge" } }, { "id": "max", "value": 100 } ] } ] },
"options": { "sortBy": [ { "displayName": "CPU %", "desc": true } ] }
},
{
"id": 12,
"type": "table",
"title": "Top procesos por RAM",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 32 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "topk(10, node_proc_mem_percent{instance=\"$node\"})", "format": "table", "instant": true, "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"transformations": [
{ "id": "organize", "options": { "excludeByName": { "Time": true, "instance": true, "__name__": true, "job": true }, "renameByName": { "name": "Proceso", "pid": "PID", "Value": "RAM %" } } }
],
"fieldConfig": { "defaults": {}, "overrides": [ { "matcher": { "id": "byName", "options": "RAM %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "custom.cellOptions", "value": { "type": "gauge" } }, { "id": "max", "value": 100 } ] } ] },
"options": { "sortBy": [ { "displayName": "RAM %", "desc": true } ] }
}
]
}
+133
View File
@@ -0,0 +1,133 @@
{
"uid": "fleet-overview",
"title": "Fleet — Overview",
"tags": ["fleet"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"refresh": "15s",
"time": { "from": "now-3h", "to": "now" },
"templating": { "list": [] },
"panels": [
{
  "id": 1,
  "type": "stat",
  "title": "Nodos reportando (<2m)",
  "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
  "datasource": { "type": "prometheus", "uid": "victoriametrics" },
  "targets": [
    { "refId": "A", "expr": "count(group by(instance) (last_over_time(node_uptime_seconds[2m]))) or vector(0)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
  ],
  "fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "green" } }, "overrides": [] },
  "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", "textMode": "value" }
},
{
"id": 2,
"type": "stat",
"title": "CPU máx flota",
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "max(node_cpu_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area" }
},
{
"id": 3,
"type": "stat",
"title": "RAM máx flota",
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "max(node_mem_used_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area" }
},
{
"id": 4,
"type": "stat",
"title": "Disco máx flota",
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "max(node_disk_used_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 90 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area" }
},
{
"id": 5,
"type": "timeseries",
"title": "CPU % por nodo",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 4 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "node_cpu_percent", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 6,
"type": "timeseries",
"title": "RAM % por nodo",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 4 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "node_mem_used_percent", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 7,
"type": "timeseries",
"title": "Load 1m por nodo",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 13 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "node_load1", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "short", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 8,
"type": "timeseries",
"title": "Disco usado % (máx por nodo)",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 13 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "max by(instance) (node_disk_used_percent)", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 9,
"type": "timeseries",
"title": "Red recibida (sum por nodo)",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 22 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "sum by(instance) (rate(node_net_recv_bytes[2m]))", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "Bps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
{
"id": 10,
"type": "timeseries",
"title": "Red enviada (sum por nodo)",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 22 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "sum by(instance) (rate(node_net_sent_bytes[2m]))", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"fieldConfig": { "defaults": { "unit": "Bps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
}
]
}
+68
View File
@@ -0,0 +1,68 @@
#!/usr/bin/env bash
# Deploys metrics_agent to a remote node as a systemd service.
#
# Usage: ./deploy_agent.sh <node_name> <ssh_host> [arch]
#   node_name : value of the "instance" label in Grafana (e.g. homer)
#   ssh_host  : SSH alias of the node (must exist in ~/.ssh/config)
#   arch      : amd64 (default) | arm64
#
# Requirements:
#   - Binary built at apps/metrics_agent/dist/metrics_agent_<arch>
#     (build with: cd apps/metrics_agent && CGO_ENABLED=0 GOOS=linux GOARCH=<arch> \
#      go build -ldflags="-s -w" -o dist/metrics_agent_<arch> .)
#   - `pass fleet/ingest-pass` holding the ingest password.
#   - sudo -n (passwordless) available on the remote node.
#
# Secret handling:
#   - The ingest password travels to the remote shell on ssh's stdin, as the
#     first lines of the script, and never on a command line, so it is not
#     exposed through process listings on either machine.
#   - Values are JSON-escaped before being written into agent.json, and the
#     file is restricted to mode 600 before the secret is written into it.
set -euo pipefail

NODE="${1:?uso: deploy_agent.sh <node> <ssh_host> [arch]}"
HOST="${2:?uso: deploy_agent.sh <node> <ssh_host> [arch]}"
ARCH="${3:-amd64}"
HUB="https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus"

# Escapes backslashes and double quotes so that $1 can be placed between the
# double quotes of a JSON string. Control characters are not handled; the
# values used here are single-line strings.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  printf '%s' "$s"
}

# Check the binary first so a missing build fails before the password store is touched.
BIN="$(cd "$(dirname "$0")/.." && pwd)/apps/metrics_agent/dist/metrics_agent_${ARCH}"
[ -f "$BIN" ] || { echo "ERROR: falta el binario $BIN (compílalo primero)" >&2; exit 1; }

# Only the first line of the pass entry is the password.
PW="$(pass show fleet/ingest-pass | head -1)"
[ -n "$PW" ] || { echo "ERROR: pass fleet/ingest-pass está vacío" >&2; exit 1; }

echo ">> copiando binario a $HOST"
scp -q -o BatchMode=yes "$BIN" "$HOST:/tmp/metrics_agent"

echo ">> instalando servicio en $NODE ($HOST)"
{
# Preamble: shell-quoted (%q) assignments of the JSON-escaped values. printf is
# a builtin, so the password never appears in any process argument list.
printf 'NODE=%q\nPW=%q\nHUB=%q\n' \
  "$(json_escape "$NODE")" "$(json_escape "$PW")" "$(json_escape "$HUB")"
# Remote script body. The quoted delimiter keeps it literal on the local side;
# ${NODE}, ${HUB} and ${PW} are expanded by the remote bash.
cat <<'OUTER'
set -e
sudo -n mkdir -p /opt/fleet-agent /etc/fleet-agent
sudo -n mv /tmp/metrics_agent /opt/fleet-agent/metrics_agent
sudo -n chmod 755 /opt/fleet-agent/metrics_agent
# Create the config file and lock it down BEFORE the secret is written, so it
# is never readable by other users, not even briefly.
sudo -n touch /etc/fleet-agent/agent.json
sudo -n chmod 600 /etc/fleet-agent/agent.json
sudo -n tee /etc/fleet-agent/agent.json >/dev/null <<JSON
{
"node": "${NODE}",
"hub_url": "${HUB}",
"user": "fleet",
"pass": "${PW}",
"interval_sec": 15
}
JSON
sudo -n tee /etc/systemd/system/fleet-agent.service >/dev/null <<'UNIT'
[Unit]
Description=Fleet metrics agent (fleet_monitoring)
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
ExecStart=/opt/fleet-agent/metrics_agent -config /etc/fleet-agent/agent.json
Restart=always
RestartSec=10
NoNewPrivileges=true
[Install]
WantedBy=multi-user.target
UNIT
sudo -n systemctl daemon-reload
sudo -n systemctl enable --now fleet-agent
# Give the service a moment to start (or crash) before reporting its state.
sleep 3
echo -n "status: "; systemctl is-active fleet-agent
OUTER
} | ssh -o BatchMode=yes "$HOST" 'bash -s'

echo ">> $NODE desplegado"
@@ -0,0 +1,13 @@
# Grafana dashboard provisioning: loads dashboard JSON files from disk.
apiVersion: 1
providers:
  # All keys below belong to this single provider entry and must stay nested
  # under the list item; at column 0 they would parse as top-level keys.
  - name: fleet
    orgId: 1
    # Grafana folder in which the provisioned dashboards are placed.
    folder: Fleet
    type: file
    disableDeletion: false
    # Interval, in seconds, at which the path below is rescanned for changes.
    updateIntervalSeconds: 30
    allowUiUpdates: true
    options:
      # Directory containing the dashboard JSON files.
      path: /var/lib/grafana/dashboards
      foldersFromFilesStructure: false
@@ -0,0 +1,11 @@
# Grafana data source provisioning: VictoriaMetrics queried through the
# Prometheus data source type.
apiVersion: 1
datasources:
  # All keys below belong to this single data source entry and must stay nested
  # under the list item; at column 0 they would parse as top-level keys.
  - name: VictoriaMetrics
    # The fleet dashboards reference this data source by this exact uid.
    uid: victoriametrics
    type: prometheus
    access: proxy
    url: http://127.0.0.1:8428
    isDefault: true
    jsonData:
      httpMethod: POST