feat(embeddednats): UNIBUS_NATS_MONITOR flag decoupled from debug log
Add a dedicated UNIBUS_NATS_MONITOR=1 toggle that opens the embedded nats-server monitoring HTTP endpoint (127.0.0.1:8222, loopback only) so a local metrics scraper can read /varz, /connz and /jsz for server-level metrics (msgs/s, connections, KV bucket msgs, RAFT leader per stream, restarts). Previously the monitoring endpoint was only reachable via UNIBUS_NATS_DEBUG=1, which is coupled to the verbose nats-server debug log: enabling the endpoint also wrote routes/RAFT/room subjects to journald in clear, which regresses the hardened posture (issue 0007). The two concerns are now decoupled. The toggle computation is extracted to a pure function natsLogOpts(debugEnv, monitorEnv) (noLog, debug, trace, monitor): MONITOR=1 opens the endpoint while keeping the log quiet (NoLog true / Debug false). The inverse coupling is preserved for backward compatibility (DEBUG still implies MONITOR). The 127.0.0.1 bind stays hardcoded — the monitoring endpoint has no auth and must never be reachable from the network. Deploy wiring versioned: additive systemd drop-in membershipd-cluster.service.d/nats-monitor.conf (Environment=UNIBUS_NATS_MONITOR=1) plus a "NATS server metrics" section in the cluster README with the rolling activation runbook (magnus -> homer -> datardos) gated on R3 reconvergence (followers 2/2) between nodes. Tests: pure decoupling table (monitor on => log NOT debug; debug => monitor; default closed) + a real embedded server with MONITOR=1 asserting /varz answers 200 on loopback:8222, and a server without the flag with the endpoint closed. 100% additive: behavior is identical without the flag. Bump app.md 0.10.0 -> 0.11.0. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -103,17 +103,38 @@ func StartHostAuth(storeDir, host string, port int, auth server.Authentication)
|
||||
return StartServer(ServerConfig{StoreDir: storeDir, Host: host, Port: port, Auth: auth})
|
||||
}
|
||||
|
||||
// natsLogOpts maps the two independent environment toggles to the embedded
// nats-server logging and monitoring flags. It is a pure function (no I/O) so
// the decoupling between the two toggles can be unit-tested directly.
//
//   - debugEnv is the value of UNIBUS_NATS_DEBUG: "1" enables the nats-server
//     logger (route/RAFT/JetStream errors); "2" additionally enables protocol
//     tracing. Any other value (including empty) leaves the server silent
//     (noLog), so production behavior is unchanged.
//   - monitorEnv is the value of UNIBUS_NATS_MONITOR: "1" opens the monitoring
//     HTTP endpoint (loopback only) for a local metrics scraper to read /varz,
//     /connz and /jsz. Any other value is ignored.
//
// The two are DECOUPLED on purpose: enabling the monitoring endpoint must NOT
// turn on the verbose debug log, which would write room subjects and routing
// metadata to journald in clear and regress the hardened posture (issue 0007).
// The reverse coupling is kept for backward compatibility: debug mode still
// exposes the monitoring endpoint as well (debug implies monitor), so existing
// debugging workflows are unchanged.
//
// It returns, in order: noLog (logger silenced), debug (logger enabled), trace
// (protocol tracing enabled) and monitor (monitoring endpoint enabled). noLog is
// always the negation of debug.
func natsLogOpts(debugEnv, monitorEnv string) (noLog, debug, trace, monitor bool) {
	// Trace ("2") is a superset of debug ("1").
	trace = debugEnv == "2"
	debug = trace || debugEnv == "1"
	// One-way coupling: debug opens the endpoint, the endpoint never opens debug.
	monitor = debug || monitorEnv == "1"
	noLog = !debug
	return noLog, debug, trace, monitor
}
|
||||
|
||||
// StartServer launches an embedded nats-server with JetStream from cfg. It
|
||||
// blocks until the server is ready to accept connections (up to 5s) and returns
|
||||
// the running server; the caller must Shutdown it.
|
||||
func StartServer(cfg ServerConfig) (*server.Server, error) {
|
||||
// Diagnostic toggle: UNIBUS_NATS_DEBUG=1 enables the embedded nats-server's own
|
||||
// logger (route/RAFT/JetStream errors), which is otherwise silenced. Off by
|
||||
// default so production behavior is unchanged; only set it when debugging the
|
||||
// cluster route layer.
|
||||
debugLevel := os.Getenv("UNIBUS_NATS_DEBUG")
|
||||
debugNATS := debugLevel == "1" || debugLevel == "2"
|
||||
traceNATS := debugLevel == "2"
|
||||
// Map the two independent env toggles to the nats-server logging + monitoring
|
||||
// flags. See natsLogOpts for the decoupling rationale (issue 0007).
|
||||
noLog, debugNATS, traceNATS, monitorNATS := natsLogOpts(
|
||||
os.Getenv("UNIBUS_NATS_DEBUG"), os.Getenv("UNIBUS_NATS_MONITOR"))
|
||||
opts := &server.Options{
|
||||
JetStream: true,
|
||||
StoreDir: cfg.StoreDir,
|
||||
@@ -122,15 +143,17 @@ func StartServer(cfg ServerConfig) (*server.Server, error) {
|
||||
ServerName: cfg.ServerName,
|
||||
DontListen: false,
|
||||
// Keep the embedded server quiet by default; the host app logs the URLs.
|
||||
NoLog: !debugNATS,
|
||||
NoLog: noLog,
|
||||
Debug: debugNATS,
|
||||
Trace: traceNATS,
|
||||
Logtime: true,
|
||||
NoSigs: true,
|
||||
}
|
||||
if debugNATS {
|
||||
// Expose the nats-server monitoring endpoint (loopback) so the operator can
|
||||
// inspect /jsz, /routez, /varz while debugging the cluster meta-group.
|
||||
if monitorNATS {
|
||||
// Expose the nats-server monitoring endpoint on LOOPBACK ONLY (never public):
|
||||
// the operator (or a local metrics scraper) inspects /varz, /connz, /jsz,
|
||||
// /routez. The 127.0.0.1 bind is mandatory because this endpoint has no auth;
|
||||
// it must stay unreachable from the network.
|
||||
opts.HTTPHost = "127.0.0.1"
|
||||
opts.HTTPPort = 8222
|
||||
}
|
||||
|
||||
@@ -0,0 +1,134 @@
|
||||
package embeddednats
|
||||
|
||||
import (
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestNatsLogOptsDecoupled is the core regression guard for issue 0007: turning
|
||||
// on the monitoring endpoint must NEVER turn on the verbose nats-server debug log
|
||||
// (which would leak room subjects/routing metadata to journald). It also checks
|
||||
// the backward-compatible coupling (debug still implies monitoring) and the quiet
|
||||
// default.
|
||||
func TestNatsLogOptsDecoupled(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
debugEnv, monitorEnv string
|
||||
noLog, debug, trace, monitor bool
|
||||
}{
|
||||
{"default off — quiet, no monitor", "", "", true, false, false, false},
|
||||
{"monitor only — endpoint on, log stays quiet", "", "1", true, false, false, true},
|
||||
{"debug implies monitor", "1", "", false, true, false, true},
|
||||
{"trace implies debug+monitor", "2", "", false, true, true, true},
|
||||
{"both set", "1", "1", false, true, false, true},
|
||||
{"monitor garbage value ignored", "", "yes", true, false, false, false},
|
||||
{"debug garbage value ignored", "true", "", true, false, false, false},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
noLog, debug, trace, monitor := natsLogOpts(c.debugEnv, c.monitorEnv)
|
||||
if noLog != c.noLog || debug != c.debug || trace != c.trace || monitor != c.monitor {
|
||||
t.Fatalf("natsLogOpts(%q,%q) = (noLog=%v debug=%v trace=%v monitor=%v), want (noLog=%v debug=%v trace=%v monitor=%v)",
|
||||
c.debugEnv, c.monitorEnv, noLog, debug, trace, monitor,
|
||||
c.noLog, c.debug, c.trace, c.monitor)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Explicit golden assertion of the security property: monitor on, log off.
|
||||
noLog, debug, _, monitor := natsLogOpts("", "1")
|
||||
if !monitor {
|
||||
t.Fatal("UNIBUS_NATS_MONITOR=1 must open the monitoring endpoint")
|
||||
}
|
||||
if debug || !noLog {
|
||||
t.Fatalf("UNIBUS_NATS_MONITOR=1 must NOT enable the debug log (got debug=%v noLog=%v)", debug, noLog)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMonitorEndpointLoopback boots a real embedded server with
|
||||
// UNIBUS_NATS_MONITOR=1 (and DEBUG explicitly off) and proves the monitoring HTTP
|
||||
// endpoint answers on loopback only — the exact contract the metrics scraper
|
||||
// relies on. The pure decoupling check above already guarantees the log stays out
|
||||
// of debug mode for this same env combination.
|
||||
func TestMonitorEndpointLoopback(t *testing.T) {
|
||||
t.Setenv("UNIBUS_NATS_DEBUG", "")
|
||||
t.Setenv("UNIBUS_NATS_MONITOR", "1")
|
||||
|
||||
ns, err := StartServer(ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: freeLoopbackPort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start server with monitoring: %v", err)
|
||||
}
|
||||
defer func() { ns.Shutdown(); ns.WaitForShutdown() }()
|
||||
|
||||
addr := ns.MonitorAddr()
|
||||
if addr == nil {
|
||||
t.Fatal("monitoring endpoint not open with UNIBUS_NATS_MONITOR=1 (MonitorAddr is nil)")
|
||||
}
|
||||
if !addr.IP.IsLoopback() {
|
||||
t.Fatalf("monitoring endpoint bound to %s, must be loopback only", addr.IP)
|
||||
}
|
||||
if addr.Port != 8222 {
|
||||
t.Fatalf("monitoring endpoint on port %d, want the fixed loopback port 8222", addr.Port)
|
||||
}
|
||||
|
||||
// /varz must answer 200 with a non-empty body on loopback.
|
||||
url := "http://" + addr.String() + "/varz"
|
||||
var resp *http.Response
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
resp, err = http.Get(url) //nolint:gosec // loopback monitoring endpoint, no auth by design
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("GET %s: %v", url, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("GET %s -> %d, want 200", url, resp.StatusCode)
|
||||
}
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if len(body) == 0 {
|
||||
t.Fatalf("GET %s returned an empty body", url)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMonitorDisabledByDefault proves a server started without either toggle does
|
||||
// NOT open the monitoring endpoint, so production stays closed unless opted in.
|
||||
func TestMonitorDisabledByDefault(t *testing.T) {
|
||||
t.Setenv("UNIBUS_NATS_DEBUG", "")
|
||||
t.Setenv("UNIBUS_NATS_MONITOR", "")
|
||||
|
||||
ns, err := StartServer(ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: freeLoopbackPort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start server: %v", err)
|
||||
}
|
||||
defer func() { ns.Shutdown(); ns.WaitForShutdown() }()
|
||||
|
||||
if addr := ns.MonitorAddr(); addr != nil {
|
||||
t.Fatalf("monitoring endpoint open (%s) without UNIBUS_NATS_MONITOR — must stay closed by default", addr)
|
||||
}
|
||||
}
|
||||
|
||||
// freeLoopbackPort asks the kernel for an unused TCP port on 127.0.0.1 by
// listening on port 0, then closes the listener and returns the port number.
// Any failure aborts the calling test via t.Fatalf.
//
// The listener is closed before the function returns, so the port is only
// "probably free" by the time the caller binds it; that window is accepted for
// test use.
func freeLoopbackPort(t *testing.T) int {
	t.Helper()

	l, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	defer l.Close()

	// Checked assertion: fail the test with a message instead of panicking if
	// the listener ever reports a non-TCP address.
	tcpAddr, ok := l.Addr().(*net.TCPAddr)
	if !ok {
		t.Fatalf("free port: unexpected listener address type %T", l.Addr())
	}
	return tcpAddr.Port
}
|
||||
Reference in New Issue
Block a user