feat(infra): grupo fleet-metrics — collect_host_metrics, format_prom_exposition, push_prom_remote, push_loki_stream, collect_battery_metrics + tipo PromSample (gopsutil; Android-safe: sin exec/pidfd, procesos via /proc)
This commit is contained in:
@@ -0,0 +1,246 @@
|
||||
package infra
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/shirou/gopsutil/v4/cpu"
|
||||
"github.com/shirou/gopsutil/v4/disk"
|
||||
"github.com/shirou/gopsutil/v4/host"
|
||||
"github.com/shirou/gopsutil/v4/load"
|
||||
"github.com/shirou/gopsutil/v4/mem"
|
||||
"github.com/shirou/gopsutil/v4/net"
|
||||
"github.com/shirou/gopsutil/v4/process"
|
||||
"github.com/shirou/gopsutil/v4/sensors"
|
||||
)
|
||||
|
||||
// isAndroidHost reports whether the host is Android (Termux included). It is
// used to steer clear of gopsutil code paths that call os.FindProcess ->
// pidfd_open, a syscall blocked by Android's seccomp filter, which kills the
// process with SIGSYS.
//
// The host counts as Android when either the ANDROID_ROOT or the ANDROID_DATA
// environment variable is non-empty, or when /system/build.prop can be
// stat'ed.
func isAndroidHost() bool {
	for _, key := range []string{"ANDROID_ROOT", "ANDROID_DATA"} {
		if os.Getenv(key) != "" {
			return true
		}
	}

	// Fall back to the filesystem marker when the environment gives no hint.
	_, statErr := os.Stat("/system/build.prop")
	return statErr == nil
}
|
||||
|
||||
// pseudoFstypes is the set of virtual filesystem types that do not represent
// real storage; partitions of these types are skipped when partition metrics
// are collected. Lookups of types that are not listed yield false.
var pseudoFstypes = func() map[string]bool {
	names := []string{
		"tmpfs", "devtmpfs", "overlay", "squashfs",
		"proc", "sysfs", "cgroup", "cgroup2",
		"devpts", "mqueue", "debugfs", "tracefs",
		"fusectl", "configfs", "pstore", "bpf",
		"securityfs",
	}
	set := make(map[string]bool, len(names))
	for _, name := range names {
		set[name] = true
	}
	return set
}()
|
||||
|
||||
// CollectHostMetrics recolecta metricas del host actual (CPU, memoria, swap,
|
||||
// disco, red, temperaturas y procesos) y las devuelve como un slice de
|
||||
// PromSample con nombres estilo node_exporter simplificados.
|
||||
//
|
||||
// Es robusta: cada grupo de colector se ejecuta en su propio bloque con manejo
|
||||
// de error local. Si un colector secundario falla (red, temperaturas, etc.) se
|
||||
// omite ese grupo sin abortar. Solo retorna error si falla la informacion
|
||||
// basica de host (uptime), que se considera el minimo imprescindible.
|
||||
//
|
||||
// Funciona en Linux amd64 y Android/Termux (linux arm64): las temperaturas son
|
||||
// best-effort y se omiten si no hay sensores disponibles (tipico en Android).
|
||||
func CollectHostMetrics() ([]PromSample, error) {
|
||||
var samples []PromSample
|
||||
|
||||
// --- Host basico: uptime (imprescindible, error si falla) ---
|
||||
uptime, err := host.Uptime()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("collect host uptime: %w", err)
|
||||
}
|
||||
samples = append(samples, PromSample{
|
||||
Name: "node_uptime_seconds",
|
||||
Value: float64(uptime),
|
||||
})
|
||||
|
||||
// --- Load average (linux/darwin; best-effort) ---
|
||||
if avg, err := load.Avg(); err == nil && avg != nil {
|
||||
samples = append(samples,
|
||||
PromSample{Name: "node_load1", Value: avg.Load1},
|
||||
PromSample{Name: "node_load5", Value: avg.Load5},
|
||||
PromSample{Name: "node_load15", Value: avg.Load15},
|
||||
)
|
||||
}
|
||||
|
||||
// --- CPU global (intervalo corto de muestreo) ---
|
||||
if pcts, err := cpu.Percent(200*time.Millisecond, false); err == nil && len(pcts) > 0 {
|
||||
samples = append(samples, PromSample{
|
||||
Name: "node_cpu_percent",
|
||||
Value: pcts[0],
|
||||
})
|
||||
}
|
||||
|
||||
// --- CPU por nucleo ---
|
||||
if pcts, err := cpu.Percent(200*time.Millisecond, true); err == nil {
|
||||
for i, p := range pcts {
|
||||
samples = append(samples, PromSample{
|
||||
Name: "node_cpu_core_percent",
|
||||
Labels: map[string]string{"core": strconv.Itoa(i)},
|
||||
Value: p,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// --- Memoria virtual ---
|
||||
if vm, err := mem.VirtualMemory(); err == nil && vm != nil {
|
||||
samples = append(samples,
|
||||
PromSample{Name: "node_mem_total_bytes", Value: float64(vm.Total)},
|
||||
PromSample{Name: "node_mem_used_bytes", Value: float64(vm.Used)},
|
||||
PromSample{Name: "node_mem_available_bytes", Value: float64(vm.Available)},
|
||||
PromSample{Name: "node_mem_used_percent", Value: vm.UsedPercent},
|
||||
)
|
||||
}
|
||||
|
||||
// --- Swap ---
|
||||
if sw, err := mem.SwapMemory(); err == nil && sw != nil {
|
||||
samples = append(samples,
|
||||
PromSample{Name: "node_swap_total_bytes", Value: float64(sw.Total)},
|
||||
PromSample{Name: "node_swap_used_bytes", Value: float64(sw.Used)},
|
||||
)
|
||||
}
|
||||
|
||||
// --- Particiones fisicas (ignora fstypes pseudo) ---
|
||||
if parts, err := disk.Partitions(false); err == nil {
|
||||
for _, p := range parts {
|
||||
if pseudoFstypes[p.Fstype] {
|
||||
continue
|
||||
}
|
||||
u, err := disk.Usage(p.Mountpoint)
|
||||
if err != nil || u == nil {
|
||||
continue
|
||||
}
|
||||
lbl := map[string]string{"mount": p.Mountpoint}
|
||||
samples = append(samples,
|
||||
PromSample{Name: "node_disk_total_bytes", Labels: lbl, Value: float64(u.Total)},
|
||||
PromSample{Name: "node_disk_used_bytes", Labels: lbl, Value: float64(u.Used)},
|
||||
PromSample{Name: "node_disk_used_percent", Labels: lbl, Value: u.UsedPercent},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Contadores I/O por dispositivo ---
|
||||
if io, err := disk.IOCounters(); err == nil {
|
||||
for dev, c := range io {
|
||||
lbl := map[string]string{"device": dev}
|
||||
samples = append(samples,
|
||||
PromSample{Name: "node_disk_read_bytes", Labels: lbl, Value: float64(c.ReadBytes)},
|
||||
PromSample{Name: "node_disk_write_bytes", Labels: lbl, Value: float64(c.WriteBytes)},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Red por interfaz (excluye loopback "lo") ---
|
||||
if nics, err := net.IOCounters(true); err == nil {
|
||||
for _, n := range nics {
|
||||
if n.Name == "lo" {
|
||||
continue
|
||||
}
|
||||
lbl := map[string]string{"iface": n.Name}
|
||||
samples = append(samples,
|
||||
PromSample{Name: "node_net_recv_bytes", Labels: lbl, Value: float64(n.BytesRecv)},
|
||||
PromSample{Name: "node_net_sent_bytes", Labels: lbl, Value: float64(n.BytesSent)},
|
||||
PromSample{Name: "node_net_recv_errs", Labels: lbl, Value: float64(n.Errin)},
|
||||
PromSample{Name: "node_net_sent_errs", Labels: lbl, Value: float64(n.Errout)},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Temperaturas (best-effort; omite el grupo si falla o no hay sensores) ---
|
||||
if temps, err := sensors.SensorsTemperatures(); err == nil {
|
||||
for _, t := range temps {
|
||||
if t.SensorKey == "" {
|
||||
continue
|
||||
}
|
||||
samples = append(samples, PromSample{
|
||||
Name: "node_temp_celsius",
|
||||
Labels: map[string]string{"sensor": t.SensorKey},
|
||||
Value: t.Temperature,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// --- Procesos: total + top 5 por CPU ---
|
||||
// En Android (Termux) gopsutil process.Processes() llama internamente a
|
||||
// os.FindProcess, que usa el syscall pidfd_open bloqueado por el seccomp de
|
||||
// Android (mata el proceso con SIGSYS, no recuperable). Alli contamos los
|
||||
// PIDs con process.Pids() (que solo lee /proc, sin FindProcess) y omitimos
|
||||
// el top por CPU.
|
||||
if isAndroidHost() {
|
||||
if pids, err := process.Pids(); err == nil {
|
||||
samples = append(samples, PromSample{
|
||||
Name: "node_procs_total",
|
||||
Value: float64(len(pids)),
|
||||
})
|
||||
}
|
||||
} else if procs, err := process.Processes(); err == nil {
|
||||
samples = append(samples, PromSample{
|
||||
Name: "node_procs_total",
|
||||
Value: float64(len(procs)),
|
||||
})
|
||||
|
||||
type procStat struct {
|
||||
pid int32
|
||||
name string
|
||||
cpu float64
|
||||
mem float32
|
||||
}
|
||||
stats := make([]procStat, 0, len(procs))
|
||||
for _, p := range procs {
|
||||
cpuPct, err := p.CPUPercent()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
name, err := p.Name()
|
||||
if err != nil {
|
||||
name = ""
|
||||
}
|
||||
memPct, err := p.MemoryPercent()
|
||||
if err != nil {
|
||||
memPct = 0
|
||||
}
|
||||
stats = append(stats, procStat{pid: p.Pid, name: name, cpu: cpuPct, mem: memPct})
|
||||
}
|
||||
sort.Slice(stats, func(i, j int) bool {
|
||||
return stats[i].cpu > stats[j].cpu
|
||||
})
|
||||
top := stats
|
||||
if len(top) > 5 {
|
||||
top = top[:5]
|
||||
}
|
||||
for _, s := range top {
|
||||
lbl := map[string]string{
|
||||
"pid": strconv.Itoa(int(s.pid)),
|
||||
"name": s.name,
|
||||
}
|
||||
samples = append(samples,
|
||||
PromSample{Name: "node_proc_cpu_percent", Labels: lbl, Value: s.cpu},
|
||||
PromSample{Name: "node_proc_mem_percent", Labels: lbl, Value: float64(s.mem)},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return samples, nil
|
||||
}
|
||||
Reference in New Issue
Block a user