247 lines
7.3 KiB
Go
247 lines
7.3 KiB
Go
package infra
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"sort"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/shirou/gopsutil/v4/cpu"
|
|
"github.com/shirou/gopsutil/v4/disk"
|
|
"github.com/shirou/gopsutil/v4/host"
|
|
"github.com/shirou/gopsutil/v4/load"
|
|
"github.com/shirou/gopsutil/v4/mem"
|
|
"github.com/shirou/gopsutil/v4/net"
|
|
"github.com/shirou/gopsutil/v4/process"
|
|
"github.com/shirou/gopsutil/v4/sensors"
|
|
)
|
|
|
|
// isAndroidHost reports whether the current host is Android (Termux included).
// It is used to steer clear of gopsutil code paths that go through
// os.FindProcess -> pidfd_open, a syscall blocked by Android's seccomp filter,
// which kills the process with SIGSYS.
//
// Detection is heuristic: a non-empty ANDROID_ROOT or ANDROID_DATA environment
// variable, or a stat-able /system/build.prop file, counts as Android.
func isAndroidHost() bool {
	for _, key := range []string{"ANDROID_ROOT", "ANDROID_DATA"} {
		if os.Getenv(key) != "" {
			return true
		}
	}
	_, statErr := os.Stat("/system/build.prop")
	return statErr == nil
}
|
|
|
|
// pseudoFstypes is the set of virtual filesystem types that do not represent
// real storage. Partitions whose type appears here are skipped when partition
// metrics are collected. Entries are kept in alphabetical order.
var pseudoFstypes = map[string]bool{
	"bpf":        true,
	"cgroup":     true,
	"cgroup2":    true,
	"configfs":   true,
	"debugfs":    true,
	"devpts":     true,
	"devtmpfs":   true,
	"fusectl":    true,
	"mqueue":     true,
	"overlay":    true,
	"proc":       true,
	"pstore":     true,
	"securityfs": true,
	"squashfs":   true,
	"sysfs":      true,
	"tmpfs":      true,
	"tracefs":    true,
}
|
|
|
|
// CollectHostMetrics recolecta metricas del host actual (CPU, memoria, swap,
|
|
// disco, red, temperaturas y procesos) y las devuelve como un slice de
|
|
// PromSample con nombres estilo node_exporter simplificados.
|
|
//
|
|
// Es robusta: cada grupo de colector se ejecuta en su propio bloque con manejo
|
|
// de error local. Si un colector secundario falla (red, temperaturas, etc.) se
|
|
// omite ese grupo sin abortar. Solo retorna error si falla la informacion
|
|
// basica de host (uptime), que se considera el minimo imprescindible.
|
|
//
|
|
// Funciona en Linux amd64 y Android/Termux (linux arm64): las temperaturas son
|
|
// best-effort y se omiten si no hay sensores disponibles (tipico en Android).
|
|
func CollectHostMetrics() ([]PromSample, error) {
|
|
var samples []PromSample
|
|
|
|
// --- Host basico: uptime (imprescindible, error si falla) ---
|
|
uptime, err := host.Uptime()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("collect host uptime: %w", err)
|
|
}
|
|
samples = append(samples, PromSample{
|
|
Name: "node_uptime_seconds",
|
|
Value: float64(uptime),
|
|
})
|
|
|
|
// --- Load average (linux/darwin; best-effort) ---
|
|
if avg, err := load.Avg(); err == nil && avg != nil {
|
|
samples = append(samples,
|
|
PromSample{Name: "node_load1", Value: avg.Load1},
|
|
PromSample{Name: "node_load5", Value: avg.Load5},
|
|
PromSample{Name: "node_load15", Value: avg.Load15},
|
|
)
|
|
}
|
|
|
|
// --- CPU global (intervalo corto de muestreo) ---
|
|
if pcts, err := cpu.Percent(200*time.Millisecond, false); err == nil && len(pcts) > 0 {
|
|
samples = append(samples, PromSample{
|
|
Name: "node_cpu_percent",
|
|
Value: pcts[0],
|
|
})
|
|
}
|
|
|
|
// --- CPU por nucleo ---
|
|
if pcts, err := cpu.Percent(200*time.Millisecond, true); err == nil {
|
|
for i, p := range pcts {
|
|
samples = append(samples, PromSample{
|
|
Name: "node_cpu_core_percent",
|
|
Labels: map[string]string{"core": strconv.Itoa(i)},
|
|
Value: p,
|
|
})
|
|
}
|
|
}
|
|
|
|
// --- Memoria virtual ---
|
|
if vm, err := mem.VirtualMemory(); err == nil && vm != nil {
|
|
samples = append(samples,
|
|
PromSample{Name: "node_mem_total_bytes", Value: float64(vm.Total)},
|
|
PromSample{Name: "node_mem_used_bytes", Value: float64(vm.Used)},
|
|
PromSample{Name: "node_mem_available_bytes", Value: float64(vm.Available)},
|
|
PromSample{Name: "node_mem_used_percent", Value: vm.UsedPercent},
|
|
)
|
|
}
|
|
|
|
// --- Swap ---
|
|
if sw, err := mem.SwapMemory(); err == nil && sw != nil {
|
|
samples = append(samples,
|
|
PromSample{Name: "node_swap_total_bytes", Value: float64(sw.Total)},
|
|
PromSample{Name: "node_swap_used_bytes", Value: float64(sw.Used)},
|
|
)
|
|
}
|
|
|
|
// --- Particiones fisicas (ignora fstypes pseudo) ---
|
|
if parts, err := disk.Partitions(false); err == nil {
|
|
for _, p := range parts {
|
|
if pseudoFstypes[p.Fstype] {
|
|
continue
|
|
}
|
|
u, err := disk.Usage(p.Mountpoint)
|
|
if err != nil || u == nil {
|
|
continue
|
|
}
|
|
lbl := map[string]string{"mount": p.Mountpoint}
|
|
samples = append(samples,
|
|
PromSample{Name: "node_disk_total_bytes", Labels: lbl, Value: float64(u.Total)},
|
|
PromSample{Name: "node_disk_used_bytes", Labels: lbl, Value: float64(u.Used)},
|
|
PromSample{Name: "node_disk_used_percent", Labels: lbl, Value: u.UsedPercent},
|
|
)
|
|
}
|
|
}
|
|
|
|
// --- Contadores I/O por dispositivo ---
|
|
if io, err := disk.IOCounters(); err == nil {
|
|
for dev, c := range io {
|
|
lbl := map[string]string{"device": dev}
|
|
samples = append(samples,
|
|
PromSample{Name: "node_disk_read_bytes", Labels: lbl, Value: float64(c.ReadBytes)},
|
|
PromSample{Name: "node_disk_write_bytes", Labels: lbl, Value: float64(c.WriteBytes)},
|
|
)
|
|
}
|
|
}
|
|
|
|
// --- Red por interfaz (excluye loopback "lo") ---
|
|
if nics, err := net.IOCounters(true); err == nil {
|
|
for _, n := range nics {
|
|
if n.Name == "lo" {
|
|
continue
|
|
}
|
|
lbl := map[string]string{"iface": n.Name}
|
|
samples = append(samples,
|
|
PromSample{Name: "node_net_recv_bytes", Labels: lbl, Value: float64(n.BytesRecv)},
|
|
PromSample{Name: "node_net_sent_bytes", Labels: lbl, Value: float64(n.BytesSent)},
|
|
PromSample{Name: "node_net_recv_errs", Labels: lbl, Value: float64(n.Errin)},
|
|
PromSample{Name: "node_net_sent_errs", Labels: lbl, Value: float64(n.Errout)},
|
|
)
|
|
}
|
|
}
|
|
|
|
// --- Temperaturas (best-effort; omite el grupo si falla o no hay sensores) ---
|
|
if temps, err := sensors.SensorsTemperatures(); err == nil {
|
|
for _, t := range temps {
|
|
if t.SensorKey == "" {
|
|
continue
|
|
}
|
|
samples = append(samples, PromSample{
|
|
Name: "node_temp_celsius",
|
|
Labels: map[string]string{"sensor": t.SensorKey},
|
|
Value: t.Temperature,
|
|
})
|
|
}
|
|
}
|
|
|
|
// --- Procesos: total + top 5 por CPU ---
|
|
// En Android (Termux) gopsutil process.Processes() llama internamente a
|
|
// os.FindProcess, que usa el syscall pidfd_open bloqueado por el seccomp de
|
|
// Android (mata el proceso con SIGSYS, no recuperable). Alli contamos los
|
|
// PIDs con process.Pids() (que solo lee /proc, sin FindProcess) y omitimos
|
|
// el top por CPU.
|
|
if isAndroidHost() {
|
|
if pids, err := process.Pids(); err == nil {
|
|
samples = append(samples, PromSample{
|
|
Name: "node_procs_total",
|
|
Value: float64(len(pids)),
|
|
})
|
|
}
|
|
} else if procs, err := process.Processes(); err == nil {
|
|
samples = append(samples, PromSample{
|
|
Name: "node_procs_total",
|
|
Value: float64(len(procs)),
|
|
})
|
|
|
|
type procStat struct {
|
|
pid int32
|
|
name string
|
|
cpu float64
|
|
mem float32
|
|
}
|
|
stats := make([]procStat, 0, len(procs))
|
|
for _, p := range procs {
|
|
cpuPct, err := p.CPUPercent()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
name, err := p.Name()
|
|
if err != nil {
|
|
name = ""
|
|
}
|
|
memPct, err := p.MemoryPercent()
|
|
if err != nil {
|
|
memPct = 0
|
|
}
|
|
stats = append(stats, procStat{pid: p.Pid, name: name, cpu: cpuPct, mem: memPct})
|
|
}
|
|
sort.Slice(stats, func(i, j int) bool {
|
|
return stats[i].cpu > stats[j].cpu
|
|
})
|
|
top := stats
|
|
if len(top) > 5 {
|
|
top = top[:5]
|
|
}
|
|
for _, s := range top {
|
|
lbl := map[string]string{
|
|
"pid": strconv.Itoa(int(s.pid)),
|
|
"name": s.name,
|
|
}
|
|
samples = append(samples,
|
|
PromSample{Name: "node_proc_cpu_percent", Labels: lbl, Value: s.cpu},
|
|
PromSample{Name: "node_proc_mem_percent", Labels: lbl, Value: float64(s.mem)},
|
|
)
|
|
}
|
|
}
|
|
|
|
return samples, nil
|
|
}
|