feat(hub): Loki (config+datasource), panel de logs en node detail, stat nodos caídos + overview compacto, loki_url en deploy_agent

This commit is contained in:
Egutierrez
2026-06-07 13:22:00 +02:00
parent e87069d366
commit a099488a9d
5 changed files with 111 additions and 47 deletions
+9
View File
@@ -160,6 +160,15 @@
],
"fieldConfig": { "defaults": {}, "overrides": [ { "matcher": { "id": "byName", "options": "RAM %" }, "properties": [ { "id": "unit", "value": "percent" }, { "id": "custom.cellOptions", "value": { "type": "gauge" } }, { "id": "max", "value": 100 } ] } ] },
"options": { "sortBy": [ { "displayName": "RAM %", "desc": true } ] }
},
{
"id": 13,
"type": "logs",
"title": "Logs (journald)",
"gridPos": { "h": 11, "w": 24, "x": 0, "y": 41 },
"datasource": { "type": "loki", "uid": "loki" },
"targets": [ { "refId": "A", "expr": "{instance=\"$node\", job=\"journald\"}", "datasource": { "type": "loki", "uid": "loki" } } ],
"options": { "showTime": true, "wrapLogMessage": true, "prettifyLogMessage": false, "enableLogDetails": true, "dedupStrategy": "none", "sortOrder": "Descending" }
}
]
}
+45 -45
View File
@@ -4,7 +4,7 @@
"tags": ["fleet"],
"timezone": "browser",
"schemaVersion": 39,
"version": 1,
"version": 2,
"refresh": "15s",
"time": { "from": "now-3h", "to": "now" },
"templating": { "list": [] },
@@ -12,60 +12,70 @@
{
"id": 1,
"type": "stat",
"title": "Nodos reportando (<2m)",
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"title": "Nodos reportando",
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "count(group by(instance) (last_over_time(node_uptime_seconds[2m])))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "count(group by(instance) (last_over_time(node_uptime_seconds[2m])))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "none", "color": { "mode": "fixed", "fixedColor": "green" } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "none", "textMode": "value" }
},
{
  "id": 11,
  "type": "stat",
  "title": "Nodos caídos",
  "description": "Máximo de nodos vistos en las últimas 24h menos los que reportan ahora (<2m). Si ningún nodo reporta, muestra el total en lugar de 'No data'.",
  "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 },
  "datasource": { "type": "prometheus", "uid": "victoriametrics" },
  "targets": [ { "refId": "A", "expr": "(max_over_time((count(group by(instance) (last_over_time(node_uptime_seconds[2m]))))[24h:1m])) - (count(group by(instance) (last_over_time(node_uptime_seconds[2m]))) or vector(0))", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
  "fieldConfig": { "defaults": { "unit": "none", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] },
  "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "background", "graphMode": "none", "textMode": "value" }
},
{
"id": 2,
"type": "stat",
"title": "CPU máx flota",
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"title": "CPU máx",
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "max(node_cpu_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "max(node_cpu_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 70 }, { "color": "red", "value": 90 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area" }
},
{
"id": 3,
"type": "stat",
"title": "RAM máx flota",
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"title": "RAM máx",
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "max(node_mem_used_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "max(node_mem_used_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 75 }, { "color": "red", "value": 90 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area" }
},
{
"id": 4,
"type": "stat",
"title": "Disco máx flota",
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"title": "Disco máx",
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "max(node_disk_used_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "max(node_disk_used_percent)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 90 } ] } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area" }
},
{
"id": 12,
"type": "stat",
"title": "Load máx",
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 0 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [ { "refId": "A", "expr": "max(node_load1)", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "short", "color": { "mode": "fixed", "fixedColor": "blue" } }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "colorMode": "value", "graphMode": "area" }
},
{
"id": 5,
"type": "timeseries",
"title": "CPU % por nodo",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 4 },
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 4 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "node_cpu_percent", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "node_cpu_percent", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
@@ -73,11 +83,9 @@
"id": 6,
"type": "timeseries",
"title": "RAM % por nodo",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 4 },
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 4 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "node_mem_used_percent", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "node_mem_used_percent", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
@@ -85,11 +93,9 @@
"id": 7,
"type": "timeseries",
"title": "Load 1m por nodo",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 13 },
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 10 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "node_load1", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "node_load1", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "short", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
@@ -97,11 +103,9 @@
"id": 8,
"type": "timeseries",
"title": "Disco usado % (máx por nodo)",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 13 },
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 10 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "max by(instance) (node_disk_used_percent)", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "max by(instance) (node_disk_used_percent)", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
@@ -109,11 +113,9 @@
"id": 9,
"type": "timeseries",
"title": "Red recibida (sum por nodo)",
"gridPos": { "h": 9, "w": 12, "x": 0, "y": 22 },
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 16 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "sum by(instance) (rate(node_net_recv_bytes[2m]))", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "sum by(instance) (rate(node_net_recv_bytes[2m]))", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "Bps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
},
@@ -121,11 +123,9 @@
"id": 10,
"type": "timeseries",
"title": "Red enviada (sum por nodo)",
"gridPos": { "h": 9, "w": 12, "x": 12, "y": 22 },
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 16 },
"datasource": { "type": "prometheus", "uid": "victoriametrics" },
"targets": [
{ "refId": "A", "expr": "sum by(instance) (rate(node_net_sent_bytes[2m]))", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } }
],
"targets": [ { "refId": "A", "expr": "sum by(instance) (rate(node_net_sent_bytes[2m]))", "legendFormat": "{{instance}}", "datasource": { "type": "prometheus", "uid": "victoriametrics" } } ],
"fieldConfig": { "defaults": { "unit": "Bps", "min": 0, "custom": { "drawStyle": "line", "fillOpacity": 10, "showPoints": "never", "lineWidth": 2 } }, "overrides": [] },
"options": { "legend": { "displayMode": "table", "placement": "right", "calcs": ["lastNotNull", "max"] }, "tooltip": { "mode": "multi", "sort": "desc" } }
}
+5 -2
View File
@@ -19,6 +19,7 @@ HOST="${2:?uso: deploy_agent.sh <node> <ssh_host> [arch]}"
ARCH="${3:-amd64}"
HUB="https://metrics-dxaqj3ina6eqd5pjt85wkrrj.organic-machine.com/api/v1/import/prometheus"
LOKI="https://logs-wmaxecsjcfnocz81d5luca92.organic-machine.com/loki/api/v1/push"
PW="$(pass show fleet/ingest-pass | head -1)"
BIN="$(cd "$(dirname "$0")/.." && pwd)/apps/metrics_agent/dist/metrics_agent_${ARCH}"
@@ -28,7 +29,7 @@ echo ">> copiando binario a $HOST"
scp -q -o BatchMode=yes "$BIN" "$HOST:/tmp/metrics_agent"
echo ">> instalando servicio en $NODE ($HOST)"
ssh -o BatchMode=yes "$HOST" "NODE='$NODE' PW='$PW' HUB='$HUB' bash -s" <<'OUTER'
ssh -o BatchMode=yes "$HOST" "NODE='$NODE' PW='$PW' HUB='$HUB' LOKI='$LOKI' bash -s" <<'OUTER'
set -e
sudo -n mkdir -p /opt/fleet-agent /etc/fleet-agent
sudo -n mv /tmp/metrics_agent /opt/fleet-agent/metrics_agent
@@ -37,6 +38,7 @@ sudo -n tee /etc/fleet-agent/agent.json >/dev/null <<JSON
{
"node": "${NODE}",
"hub_url": "${HUB}",
"loki_url": "${LOKI}",
"user": "fleet",
"pass": "${PW}",
"interval_sec": 15
@@ -60,7 +62,8 @@ NoNewPrivileges=true
WantedBy=multi-user.target
UNIT
sudo -n systemctl daemon-reload
sudo -n systemctl enable --now fleet-agent
sudo -n systemctl enable fleet-agent >/dev/null 2>&1
sudo -n systemctl restart fleet-agent
sleep 3
echo -n "status: "; systemctl is-active fleet-agent
OUTER
@@ -0,0 +1,10 @@
---
# Grafana datasource provisioning: registers the local Loki instance.
apiVersion: 1

datasources:
  # The uid is what dashboard panels use to reference this datasource.
  - name: Loki
    uid: loki
    type: loki
    # proxy: requests are made by the Grafana backend, not by the browser.
    access: proxy
    url: 'http://127.0.0.1:3100'
    jsonData:
      # Upper bound on log lines fetched per query.
      maxLines: 1000
+42
View File
@@ -0,0 +1,42 @@
# Loki single-binary config for the fleet_monitoring hub (magnus).
# Filesystem storage, 31-day retention, listens only on localhost
# (Caddy exposes the logs- vhost with auth).
auth_enabled: false

server:
  http_listen_address: 127.0.0.1
  http_listen_port: 3100
  # Bind gRPC to loopback as well: when unset, the gRPC listener binds to all
  # interfaces, which would expose port 9095 without going through Caddy.
  grpc_listen_address: 127.0.0.1
  grpc_listen_port: 9095
  log_level: warn

common:
  instance_addr: 127.0.0.1
  path_prefix: /var/lib/loki
  storage:
    filesystem:
      chunks_directory: /var/lib/loki/chunks
      rules_directory: /var/lib/loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    # Quoted so generic YAML tooling keeps the date as a plain string.
    - from: "2020-10-24"
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  # 744h = 31 days; applied by the compactor (retention_enabled below).
  retention_period: 744h
  reject_old_samples: false
  allow_structured_metadata: true
  volume_enabled: true

compactor:
  working_directory: /var/lib/loki/compactor
  retention_enabled: true
  delete_request_store: filesystem