Files
fn_registry/functions/infra/collect_host_metrics.go
T

247 lines
7.3 KiB
Go

package infra
import (
"fmt"
"os"
"sort"
"strconv"
"time"
"github.com/shirou/gopsutil/v4/cpu"
"github.com/shirou/gopsutil/v4/disk"
"github.com/shirou/gopsutil/v4/host"
"github.com/shirou/gopsutil/v4/load"
"github.com/shirou/gopsutil/v4/mem"
"github.com/shirou/gopsutil/v4/net"
"github.com/shirou/gopsutil/v4/process"
"github.com/shirou/gopsutil/v4/sensors"
)
// isAndroidHost reports whether the current host is Android (Termux included).
// It is used to steer clear of gopsutil code paths that go through
// os.FindProcess -> pidfd_open, a syscall blocked by Android's seccomp filter,
// which kills the process with SIGSYS.
func isAndroidHost() bool {
	// Either environment variable being set is treated as sufficient evidence.
	for _, key := range []string{"ANDROID_ROOT", "ANDROID_DATA"} {
		if os.Getenv(key) != "" {
			return true
		}
	}
	// Otherwise fall back to probing for the system build properties file.
	_, statErr := os.Stat("/system/build.prop")
	return statErr == nil
}
// pseudoFstypes is the set of virtual filesystem types that do not represent
// real storage; partitions of these types are skipped when collecting
// partition metrics.
var pseudoFstypes = map[string]bool{
	// Memory-backed filesystems.
	"tmpfs":    true,
	"devtmpfs": true,

	// Layered and image filesystems.
	"overlay":  true,
	"squashfs": true,

	// Kernel and control interfaces exposed as filesystems.
	"proc":       true,
	"sysfs":      true,
	"cgroup":     true,
	"cgroup2":    true,
	"devpts":     true,
	"mqueue":     true,
	"debugfs":    true,
	"tracefs":    true,
	"fusectl":    true,
	"configfs":   true,
	"pstore":     true,
	"bpf":        true,
	"securityfs": true,
}
// CollectHostMetrics gathers metrics for the current host (uptime, load, CPU,
// memory, swap, disk, network, temperatures and processes) and returns them as
// a slice of PromSample using simplified node_exporter-style names.
//
// Every collector group runs in its own block with local error handling: when
// a secondary collector fails (network, temperatures, ...) that group is
// skipped and collection continues. The only error returned is a failure to
// read the host uptime, which is treated as the bare minimum; in that case the
// returned slice is nil.
//
// Where the underlying data is unordered the samples are emitted in a stable
// order: disk I/O devices are sorted by name and ties between top processes
// are broken by PID.
//
// Intended for Linux amd64 and Android/Termux (linux arm64): temperatures are
// best-effort and simply absent when no sensor can be read (typical on
// Android), and on Android only the process count is reported.
func CollectHostMetrics() ([]PromSample, error) {
	const (
		// cpuSampleInterval is the window passed to each cpu.Percent call.
		cpuSampleInterval = 200 * time.Millisecond
		// topProcs is how many processes are reported, ranked by CPU usage.
		topProcs = 5
	)

	var samples []PromSample

	// --- Basic host info: uptime (mandatory, the only hard failure) ---
	uptime, err := host.Uptime()
	if err != nil {
		return nil, fmt.Errorf("collect host uptime: %w", err)
	}
	samples = append(samples, PromSample{
		Name:  "node_uptime_seconds",
		Value: float64(uptime),
	})

	// --- Load average (best-effort) ---
	if avg, err := load.Avg(); err == nil && avg != nil {
		samples = append(samples,
			PromSample{Name: "node_load1", Value: avg.Load1},
			PromSample{Name: "node_load5", Value: avg.Load5},
			PromSample{Name: "node_load15", Value: avg.Load15},
		)
	}

	// --- Global CPU usage, sampled over cpuSampleInterval ---
	if pcts, err := cpu.Percent(cpuSampleInterval, false); err == nil && len(pcts) > 0 {
		samples = append(samples, PromSample{
			Name:  "node_cpu_percent",
			Value: pcts[0],
		})
	}

	// --- Per-core CPU usage (a second, separate sampling window) ---
	if pcts, err := cpu.Percent(cpuSampleInterval, true); err == nil {
		for i, p := range pcts {
			samples = append(samples, PromSample{
				Name:   "node_cpu_core_percent",
				Labels: map[string]string{"core": strconv.Itoa(i)},
				Value:  p,
			})
		}
	}

	// --- Virtual memory ---
	if vm, err := mem.VirtualMemory(); err == nil && vm != nil {
		samples = append(samples,
			PromSample{Name: "node_mem_total_bytes", Value: float64(vm.Total)},
			PromSample{Name: "node_mem_used_bytes", Value: float64(vm.Used)},
			PromSample{Name: "node_mem_available_bytes", Value: float64(vm.Available)},
			PromSample{Name: "node_mem_used_percent", Value: vm.UsedPercent},
		)
	}

	// --- Swap ---
	if sw, err := mem.SwapMemory(); err == nil && sw != nil {
		samples = append(samples,
			PromSample{Name: "node_swap_total_bytes", Value: float64(sw.Total)},
			PromSample{Name: "node_swap_used_bytes", Value: float64(sw.Used)},
		)
	}

	// --- Physical partitions (pseudo filesystem types are skipped) ---
	if parts, err := disk.Partitions(false); err == nil {
		// Usage is looked up by mountpoint path, so a mountpoint listed more
		// than once would only yield duplicate samples with identical labels.
		seen := make(map[string]bool, len(parts))
		for _, p := range parts {
			if pseudoFstypes[p.Fstype] || seen[p.Mountpoint] {
				continue
			}
			seen[p.Mountpoint] = true
			u, err := disk.Usage(p.Mountpoint)
			if err != nil || u == nil {
				continue
			}
			lbl := map[string]string{"mount": p.Mountpoint}
			samples = append(samples,
				PromSample{Name: "node_disk_total_bytes", Labels: lbl, Value: float64(u.Total)},
				PromSample{Name: "node_disk_used_bytes", Labels: lbl, Value: float64(u.Used)},
				PromSample{Name: "node_disk_used_percent", Labels: lbl, Value: u.UsedPercent},
			)
		}
	}

	// --- Per-device I/O counters ---
	if counters, err := disk.IOCounters(); err == nil {
		// Map iteration order is random; sort device names so consecutive
		// collections emit samples in the same order.
		devs := make([]string, 0, len(counters))
		for dev := range counters {
			devs = append(devs, dev)
		}
		sort.Strings(devs)
		for _, dev := range devs {
			c := counters[dev]
			lbl := map[string]string{"device": dev}
			samples = append(samples,
				PromSample{Name: "node_disk_read_bytes", Labels: lbl, Value: float64(c.ReadBytes)},
				PromSample{Name: "node_disk_write_bytes", Labels: lbl, Value: float64(c.WriteBytes)},
			)
		}
	}

	// --- Network per interface (the loopback interface "lo" is excluded) ---
	if nics, err := net.IOCounters(true); err == nil {
		for _, n := range nics {
			if n.Name == "lo" {
				continue
			}
			lbl := map[string]string{"iface": n.Name}
			samples = append(samples,
				PromSample{Name: "node_net_recv_bytes", Labels: lbl, Value: float64(n.BytesRecv)},
				PromSample{Name: "node_net_sent_bytes", Labels: lbl, Value: float64(n.BytesSent)},
				PromSample{Name: "node_net_recv_errs", Labels: lbl, Value: float64(n.Errin)},
				PromSample{Name: "node_net_sent_errs", Labels: lbl, Value: float64(n.Errout)},
			)
		}
	}

	// --- Temperatures (best-effort) ---
	// gopsutil can return usable readings together with a non-nil error that
	// only aggregates per-sensor warnings (for example one unreadable sensor
	// file). The error is deliberately ignored so that a single bad sensor
	// does not drop the whole group; when nothing could be read the slice is
	// empty and the loop is a no-op.
	temps, _ := sensors.SensorsTemperatures()
	for _, t := range temps {
		if t.SensorKey == "" {
			continue
		}
		samples = append(samples, PromSample{
			Name:   "node_temp_celsius",
			Labels: map[string]string{"sensor": t.SensorKey},
			Value:  t.Temperature,
		})
	}

	// --- Processes: total count + top processes by CPU ---
	// On Android (Termux) gopsutil's process.Processes() internally calls
	// os.FindProcess, which uses the pidfd_open syscall blocked by Android's
	// seccomp filter (the process is killed with SIGSYS, not recoverable).
	// There we only count PIDs via process.Pids() (which just reads /proc,
	// without FindProcess) and skip the per-process ranking.
	if isAndroidHost() {
		if pids, err := process.Pids(); err == nil {
			samples = append(samples, PromSample{
				Name:  "node_procs_total",
				Value: float64(len(pids)),
			})
		}
	} else if procs, err := process.Processes(); err == nil {
		samples = append(samples, PromSample{
			Name:  "node_procs_total",
			Value: float64(len(procs)),
		})

		type procStat struct {
			pid  int32
			name string
			cpu  float64
			mem  float32
		}
		stats := make([]procStat, 0, len(procs))
		for _, p := range procs {
			// A process whose CPU usage cannot be read (e.g. it exited in the
			// meantime) cannot be ranked, so it is left out entirely.
			// NOTE(review): gopsutil's CPUPercent appears to be averaged over
			// the process lifetime rather than a recent window — confirm this
			// is the intended ranking metric.
			cpuPct, err := p.CPUPercent()
			if err != nil {
				continue
			}
			// Name and memory are secondary: fall back to zero values.
			name, err := p.Name()
			if err != nil {
				name = ""
			}
			memPct, err := p.MemoryPercent()
			if err != nil {
				memPct = 0
			}
			stats = append(stats, procStat{pid: p.Pid, name: name, cpu: cpuPct, mem: memPct})
		}

		// Highest CPU first. sort.Slice is not stable, so ties are broken by
		// PID to keep the selection deterministic.
		sort.Slice(stats, func(i, j int) bool {
			if stats[i].cpu != stats[j].cpu {
				return stats[i].cpu > stats[j].cpu
			}
			return stats[i].pid < stats[j].pid
		})
		top := stats
		if len(top) > topProcs {
			top = top[:topProcs]
		}
		for _, s := range top {
			lbl := map[string]string{
				"pid":  strconv.Itoa(int(s.pid)),
				"name": s.name,
			}
			samples = append(samples,
				PromSample{Name: "node_proc_cpu_percent", Labels: lbl, Value: s.cpu},
				PromSample{Name: "node_proc_mem_percent", Labels: lbl, Value: float64(s.mem)},
			)
		}
	}

	return samples, nil
}