// metrics_agent/main.go

// Command metrics_agent collects host metrics and pushes them to a
// VictoriaMetrics / Prometheus-compatible ingest endpoint on a fixed interval.
//
// It is the per-node component of the fleet_monitoring project and is meant to
// run as a systemd service on every machine of the fleet. It does no work of
// its own beyond orchestration: the actual capability comes from three registry
// functions in fn-registry/functions/infra:
//
//   - CollectHostMetrics   -> []infra.PromSample  (CPU/mem/swap/disk/net/temp/procs)
//   - FormatPromExposition -> Prometheus exposition text
//   - PushPromRemote       -> POST the text with optional basic auth + extra labels
//
// The "instance" label is attached at push time so a single binary, configured
// only with its node name and the hub endpoint, identifies itself in Grafana.
package main

import (
	"context"
	"flag"
	"log"
	"os"
	"os/signal"
	"syscall"
	"time"

	"fn-registry/functions/infra"
)

// main parses the command-line flags, loads the configuration and then runs
// in one of two modes:
//
//   - with -once: a single collect/push cycle; a failure is fatal.
//   - otherwise: one push immediately, then one push per interval tick until
//     SIGINT or SIGTERM is received. Push failures in this mode are logged
//     and the loop keeps running.
//
// When cfg.LokiURL is set, shipJournald is also started in the background with
// a context that is cancelled on shutdown.
func main() {
	configPath := flag.String("config", "", "path to JSON config file")
	once := flag.Bool("once", false, "collect and push a single time, then exit (useful for testing)")
	flag.Parse()

	cfg, err := loadConfig(*configPath)
	if err != nil {
		log.Fatalf("config: %v", err)
	}
	if cfg.HubURL == "" {
		log.Fatal("config: hub_url is required (set it in the config file or via FLEET_HUB_URL)")
	}
	log.Printf("metrics_agent starting: node=%q hub=%q interval=%ds", cfg.Node, cfg.HubURL, cfg.IntervalSec)

	if *once {
		if err := pushOnce(cfg); err != nil {
			log.Fatalf("push: %v", err)
		}
		return
	}

	// time.NewTicker panics on a non-positive duration, so reject a bad
	// interval with a readable config error instead of a stack trace. The
	// check sits after the -once branch because a single push does not need
	// an interval at all.
	if cfg.IntervalSec <= 0 {
		log.Fatalf("config: push interval must be a positive number of seconds, got %d", cfg.IntervalSec)
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Translate SIGINT/SIGTERM into a context cancellation. The channel is
	// buffered so a signal delivered before the goroutine is scheduled is not
	// dropped.
	stop := make(chan os.Signal, 1)
	signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		<-stop
		log.Print("shutting down")
		cancel()
	}()

	// Optional: ship systemd journal logs to Loki in the background.
	if cfg.LokiURL != "" {
		go shipJournald(ctx, cfg)
	}

	ticker := time.NewTicker(time.Duration(cfg.IntervalSec) * time.Second)
	defer ticker.Stop()

	// push runs one cycle and only logs a failure: a transient error must not
	// take the long-running agent down.
	push := func() {
		if err := pushOnce(cfg); err != nil {
			log.Printf("push error: %v", err)
		}
	}

	// Push metrics once right away so a freshly started node shows up
	// immediately, then keep pushing on every tick.
	push()
	for {
		select {
		case <-ticker.C:
			push()
		case <-ctx.Done():
			return
		}
	}
}

// pushOnce performs one full cycle: collect the host metrics, render them as
// exposition text stamped with the current Unix time in milliseconds, and push
// that text to cfg.HubURL with cfg.User / cfg.Pass and an "instance" label set
// to cfg.Node.
//
// The first error from the collect or push step is returned unchanged. On
// success the number of pushed samples is logged and nil is returned.
func pushOnce(cfg Config) error {
	collected, err := infra.CollectHostMetrics()
	if err != nil {
		return err
	}

	// The timestamp is taken after collection has finished.
	nowMs := time.Now().UnixMilli()
	payload := infra.FormatPromExposition(collected, nowMs)

	// The node identifies itself through the "instance" label at push time.
	extraLabels := map[string]string{"instance": cfg.Node}

	err = infra.PushPromRemote(cfg.HubURL, cfg.User, cfg.Pass, payload, extraLabels)
	if err != nil {
		return err
	}

	log.Printf("pushed %d samples", len(collected))
	return nil
}