// Command metrics_agent collects host metrics and pushes them to a // VictoriaMetrics / Prometheus-compatible ingest endpoint on a fixed interval. // // It is the per-node component of the fleet_monitoring project and is meant to // run as a systemd service on every machine of the fleet. It does no work of // its own beyond orchestration: the actual capability comes from three registry // functions in fn-registry/functions/infra: // // - CollectHostMetrics -> []infra.PromSample (CPU/mem/swap/disk/net/temp/procs) // - FormatPromExposition -> Prometheus exposition text // - PushPromRemote -> POST the text with optional basic auth + extra labels // // The "instance" label is attached at push time so a single binary, configured // only with its node name and the hub endpoint, identifies itself in Grafana. package main import ( "context" "flag" "log" "os" "os/signal" "syscall" "time" "fn-registry/functions/infra" ) func main() { configPath := flag.String("config", "", "path to JSON config file") once := flag.Bool("once", false, "collect and push a single time, then exit (useful for testing)") flag.Parse() cfg, err := loadConfig(*configPath) if err != nil { log.Fatalf("config: %v", err) } if cfg.HubURL == "" { log.Fatal("config: hub_url is required (set it in the config file or via FLEET_HUB_URL)") } log.Printf("metrics_agent starting: node=%q hub=%q interval=%ds", cfg.Node, cfg.HubURL, cfg.IntervalSec) if *once { if err := pushOnce(cfg); err != nil { log.Fatalf("push: %v", err) } return } ctx, cancel := context.WithCancel(context.Background()) defer cancel() stop := make(chan os.Signal, 1) signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM) go func() { <-stop log.Print("shutting down") cancel() }() // Optional: ship systemd journal logs to Loki in the background. if cfg.LokiURL != "" { go shipJournald(ctx, cfg) } ticker := time.NewTicker(time.Duration(cfg.IntervalSec) * time.Second) defer ticker.Stop() // Push metrics once right away so a freshly started node shows up // immediately, then keep pushing on every tick. if err := pushOnce(cfg); err != nil { log.Printf("push error: %v", err) } for { select { case <-ticker.C: if err := pushOnce(cfg); err != nil { log.Printf("push error: %v", err) } case <-ctx.Done(): return } } } // pushOnce runs a single collect -> format -> push cycle. func pushOnce(cfg Config) error { samples, err := infra.CollectHostMetrics() if err != nil { return err } body := infra.FormatPromExposition(samples, time.Now().UnixMilli()) if err := infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, body, map[string]string{"instance": cfg.Node}); err != nil { return err } log.Printf("pushed %d samples", len(samples)) return nil }