// Command metrics_agent collects host metrics and pushes them to a // VictoriaMetrics / Prometheus-compatible ingest endpoint on a fixed interval. // // It is the per-node component of the fleet_monitoring project and is meant to // run as a systemd service on every machine of the fleet. It does no work of // its own beyond orchestration: the actual capability comes from three registry // functions in fn-registry/functions/infra: // // - CollectHostMetrics -> []infra.PromSample (CPU/mem/swap/disk/net/temp/procs) // - FormatPromExposition -> Prometheus exposition text // - PushPromRemote -> POST the text with optional basic auth + extra labels // // The "instance" label is attached at push time so a single binary, configured // only with its node name and the hub endpoint, identifies itself in Grafana. package main import ( "context" "flag" "log" "net" "os" "os/signal" "syscall" "time" "fn-registry/functions/infra" ) // androidWorkarounds fixes two things that break a standard cross-compiled Go // binary on Android/Termux (no-op on Linux servers): // - DNS: the pure-Go resolver reads /etc/resolv.conf, which on Android does // not reflect the system DNS, so lookups hit ::1:53 and fail. We point the // default resolver at a public DNS server explicitly. // - TLS: Go's default CA bundle paths don't exist on Android, so HTTPS fails // with "certificate signed by unknown authority". We point SSL_CERT_FILE at // the Termux ca-certificates bundle (read lazily by crypto/x509 on first use). func androidWorkarounds() { if os.Getenv("ANDROID_ROOT") == "" && os.Getenv("ANDROID_DATA") == "" { return } net.DefaultResolver = &net.Resolver{ PreferGo: true, Dial: func(ctx context.Context, network, address string) (net.Conn, error) { d := net.Dialer{Timeout: 5 * time.Second} return d.DialContext(ctx, "udp", "1.1.1.1:53") }, } if os.Getenv("SSL_CERT_FILE") == "" { prefix := os.Getenv("PREFIX") if prefix == "" { prefix = "/data/data/com.termux/files/usr" } cert := prefix + "/etc/tls/cert.pem" if _, err := os.Stat(cert); err == nil { os.Setenv("SSL_CERT_FILE", cert) } } log.Print("android: using 1.1.1.1 resolver + Termux CA bundle") } func main() { configPath := flag.String("config", "", "path to JSON config file") once := flag.Bool("once", false, "collect and push a single time, then exit (useful for testing)") flag.Parse() androidWorkarounds() cfg, err := loadConfig(*configPath) if err != nil { log.Fatalf("config: %v", err) } if cfg.HubURL == "" { log.Fatal("config: hub_url is required (set it in the config file or via FLEET_HUB_URL)") } log.Printf("metrics_agent starting: node=%q hub=%q interval=%ds", cfg.Node, cfg.HubURL, cfg.IntervalSec) if *once { if err := pushOnce(cfg); err != nil { log.Fatalf("push: %v", err) } return } ctx, cancel := context.WithCancel(context.Background()) defer cancel() stop := make(chan os.Signal, 1) signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM) go func() { <-stop log.Print("shutting down") cancel() }() // Optional: ship logs to Loki in the background (journald on Linux, // logcat on Android/Termux). if cfg.LokiURL != "" { go shipLogs(ctx, cfg) } ticker := time.NewTicker(time.Duration(cfg.IntervalSec) * time.Second) defer ticker.Stop() // Push metrics once right away so a freshly started node shows up // immediately, then keep pushing on every tick. if err := pushOnce(cfg); err != nil { log.Printf("push error: %v", err) } for { select { case <-ticker.C: if err := pushOnce(cfg); err != nil { log.Printf("push error: %v", err) } case <-ctx.Done(): return } } } // pushOnce runs a single collect -> format -> push cycle. func pushOnce(cfg Config) error { samples, err := infra.CollectHostMetrics() if err != nil { return err } // Battery metrics are best-effort. On Android/Termux the agent cannot exec // termux-battery-status (seccomp), so a shell helper writes its JSON to // cfg.BatteryFile and we parse that here; elsewhere we collect directly. samples = append(samples, batterySamples(cfg)...) body := infra.FormatPromExposition(samples, time.Now().UnixMilli()) if err := infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, body, map[string]string{"instance": cfg.Node}); err != nil { return err } log.Printf("pushed %d samples", len(samples)) return nil } // batterySamples returns battery metrics, reading them from a JSON file when // cfg.BatteryFile is set (Android path) or collecting them directly otherwise. // Always best-effort: any error yields no samples rather than failing the push. func batterySamples(cfg Config) []infra.PromSample { if cfg.BatteryFile != "" { data, err := os.ReadFile(cfg.BatteryFile) if err != nil { return nil } s, err := infra.BatterySamplesFromJSON(data) if err != nil { return nil } return s } s, err := infra.CollectBatteryMetrics() if err != nil { return nil } return s }