// Command metrics_agent collects host metrics and pushes them to a
// VictoriaMetrics / Prometheus-compatible ingest endpoint on a fixed interval.
//
// It is the per-node component of the fleet_monitoring project and is meant to
// run as a systemd service on every machine of the fleet. It does no work of
// its own beyond orchestration: the actual capability comes from three registry
// functions in fn-registry/functions/infra:
//
//   - CollectHostMetrics -> []infra.PromSample (CPU/mem/swap/disk/net/temp/procs)
//   - FormatPromExposition -> Prometheus exposition text
//   - PushPromRemote -> POST the text with optional basic auth + extra labels
//
// The "instance" label is attached at push time so a single binary, configured
// only with its node name and the hub endpoint, identifies itself in Grafana.
package main
|
|
|
|
import (
|
|
"context"
|
|
"flag"
|
|
"log"
|
|
"net"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
"fn-registry/functions/infra"
|
|
)
|
|
|
|
// androidWorkarounds fixes two things that break a standard cross-compiled Go
// binary on Android/Termux. It returns immediately, changing nothing, when
// neither ANDROID_ROOT nor ANDROID_DATA is set in the environment:
//   - DNS: the pure-Go resolver reads /etc/resolv.conf, which on Android does
//     not reflect the system DNS, so lookups hit ::1:53 and fail. We point the
//     default resolver at a public DNS server explicitly.
//   - TLS: Go's default CA bundle paths don't exist on Android, so HTTPS fails
//     with "certificate signed by unknown authority". We point SSL_CERT_FILE at
//     the Termux ca-certificates bundle (read lazily by crypto/x509 on first use).
//
// SSL_CERT_FILE is only set when it is currently empty and the bundle file
// exists under $PREFIX (default /data/data/com.termux/files/usr).
func androidWorkarounds() {
	if os.Getenv("ANDROID_ROOT") == "" && os.Getenv("ANDROID_DATA") == "" {
		return
	}
	net.DefaultResolver = &net.Resolver{
		PreferGo: true,
		// The server address chosen by the resolver is deliberately ignored.
		// The requested network is honoured, though: the Go resolver retries
		// truncated UDP answers over "tcp", and forcing "udp" here would make
		// that retry run over UDP again.
		Dial: func(ctx context.Context, network, _ string) (net.Conn, error) {
			d := net.Dialer{Timeout: 5 * time.Second}
			return d.DialContext(ctx, network, "1.1.1.1:53")
		},
	}
	if os.Getenv("SSL_CERT_FILE") == "" {
		prefix := os.Getenv("PREFIX")
		if prefix == "" {
			prefix = "/data/data/com.termux/files/usr"
		}
		cert := prefix + "/etc/tls/cert.pem"
		if _, err := os.Stat(cert); err == nil {
			// Best-effort: a failure here must not stop the agent, but it
			// should be visible because HTTPS pushes will then fail.
			if err := os.Setenv("SSL_CERT_FILE", cert); err != nil {
				log.Printf("android: setting SSL_CERT_FILE: %v", err)
			}
		}
	}
	log.Print("android: using 1.1.1.1 resolver + Termux CA bundle")
}
|
|
|
|
func main() {
|
|
configPath := flag.String("config", "", "path to JSON config file")
|
|
once := flag.Bool("once", false, "collect and push a single time, then exit (useful for testing)")
|
|
flag.Parse()
|
|
|
|
androidWorkarounds()
|
|
|
|
cfg, err := loadConfig(*configPath)
|
|
if err != nil {
|
|
log.Fatalf("config: %v", err)
|
|
}
|
|
if cfg.HubURL == "" {
|
|
log.Fatal("config: hub_url is required (set it in the config file or via FLEET_HUB_URL)")
|
|
}
|
|
log.Printf("metrics_agent starting: node=%q hub=%q interval=%ds", cfg.Node, cfg.HubURL, cfg.IntervalSec)
|
|
|
|
if *once {
|
|
if err := pushOnce(cfg); err != nil {
|
|
log.Fatalf("push: %v", err)
|
|
}
|
|
return
|
|
}
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
defer cancel()
|
|
|
|
stop := make(chan os.Signal, 1)
|
|
signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
|
|
go func() {
|
|
<-stop
|
|
log.Print("shutting down")
|
|
cancel()
|
|
}()
|
|
|
|
// Optional: ship logs to Loki in the background (journald on Linux,
|
|
// logcat on Android/Termux).
|
|
if cfg.LokiURL != "" {
|
|
go shipLogs(ctx, cfg)
|
|
}
|
|
|
|
ticker := time.NewTicker(time.Duration(cfg.IntervalSec) * time.Second)
|
|
defer ticker.Stop()
|
|
|
|
// Push metrics once right away so a freshly started node shows up
|
|
// immediately, then keep pushing on every tick.
|
|
if err := pushOnce(cfg); err != nil {
|
|
log.Printf("push error: %v", err)
|
|
}
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
if err := pushOnce(cfg); err != nil {
|
|
log.Printf("push error: %v", err)
|
|
}
|
|
case <-ctx.Done():
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// pushOnce runs a single collect -> format -> push cycle.
|
|
func pushOnce(cfg Config) error {
|
|
samples, err := infra.CollectHostMetrics()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Battery metrics are best-effort. On Android/Termux the agent cannot exec
|
|
// termux-battery-status (seccomp), so a shell helper writes its JSON to
|
|
// cfg.BatteryFile and we parse that here; elsewhere we collect directly.
|
|
samples = append(samples, batterySamples(cfg)...)
|
|
body := infra.FormatPromExposition(samples, time.Now().UnixMilli())
|
|
if err := infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, body, cfg.extraLabels(nil)); err != nil {
|
|
return err
|
|
}
|
|
log.Printf("pushed %d samples", len(samples))
|
|
return nil
|
|
}
|
|
|
|
// batterySamples returns battery metrics, reading them from a JSON file when
|
|
// cfg.BatteryFile is set (Android path) or collecting them directly otherwise.
|
|
// Always best-effort: any error yields no samples rather than failing the push.
|
|
func batterySamples(cfg Config) []infra.PromSample {
|
|
if cfg.BatteryFile != "" {
|
|
data, err := os.ReadFile(cfg.BatteryFile)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
s, err := infra.BatterySamplesFromJSON(data)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return s
|
|
}
|
|
s, err := infra.CollectBatteryMetrics()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return s
|
|
}
|