Files
unibus/pkg/embeddednats/monitor_test.go
T
Egutierrez 1c9325104c feat(embeddednats): UNIBUS_NATS_MONITOR flag decoupled from debug log
Add a dedicated UNIBUS_NATS_MONITOR=1 toggle that opens the embedded
nats-server monitoring HTTP endpoint (127.0.0.1:8222, loopback only) so a
local metrics scraper can read /varz, /connz and /jsz for server-level
metrics (msgs/s, connections, KV bucket msgs, RAFT leader per stream,
restarts).

Previously the monitoring endpoint was only reachable via UNIBUS_NATS_DEBUG=1,
which is coupled to the verbose nats-server debug log: enabling the endpoint
also wrote routes/RAFT/room subjects to journald in clear, which regresses the
hardened posture (issue 0007). The two concerns are now decoupled.

The toggle computation is extracted to a pure function
natsLogOpts(debugEnv, monitorEnv) (noLog, debug, trace, monitor): MONITOR=1
opens the endpoint while keeping the log quiet (NoLog true / Debug false). The
inverse coupling is preserved for backward compatibility (DEBUG still implies
MONITOR). The 127.0.0.1 bind stays hardcoded — the monitoring endpoint has no
auth and must never be reachable from the network.

Deploy wiring versioned: additive systemd drop-in
membershipd-cluster.service.d/nats-monitor.conf (Environment=UNIBUS_NATS_MONITOR=1)
plus a "NATS server metrics" section in the cluster README with the rolling
activation runbook (magnus -> homer -> datardos) gated on R3 reconvergence
(followers 2/2) between nodes.

Tests: pure decoupling table (monitor on => log NOT debug; debug => monitor;
default closed) + a real embedded server with MONITOR=1 asserting /varz answers
200 on loopback:8222, and a server without the flag with the endpoint closed.
100% additive: behavior is identical without the flag. Bump app.md 0.10.0 ->
0.11.0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 20:57:46 +02:00

135 lines
4.5 KiB
Go

package embeddednats
import (
"io"
"net"
"net/http"
"testing"
"time"
)
// TestNatsLogOptsDecoupled guards the issue 0007 regression: enabling the
// monitoring endpoint must not switch the nats-server log into debug mode
// (which would put room subjects/routing metadata into journald). The table
// also pins the backward-compatible direction (debug and trace still imply
// monitoring), the quiet default, and that unrecognised values are ignored.
func TestNatsLogOptsDecoupled(t *testing.T) {
	// opts bundles the four values returned by natsLogOpts so that a table row
	// can be compared against the result in a single expression.
	type opts struct{ noLog, debug, trace, monitor bool }

	table := []struct {
		name       string
		debugEnv   string
		monitorEnv string
		want       opts
	}{
		{"default off — quiet, no monitor", "", "", opts{true, false, false, false}},
		{"monitor only — endpoint on, log stays quiet", "", "1", opts{true, false, false, true}},
		{"debug implies monitor", "1", "", opts{false, true, false, true}},
		{"trace implies debug+monitor", "2", "", opts{false, true, true, true}},
		{"both set", "1", "1", opts{false, true, false, true}},
		{"monitor garbage value ignored", "", "yes", opts{true, false, false, false}},
		{"debug garbage value ignored", "true", "", opts{true, false, false, false}},
	}

	for _, row := range table {
		t.Run(row.name, func(t *testing.T) {
			var got opts
			got.noLog, got.debug, got.trace, got.monitor = natsLogOpts(row.debugEnv, row.monitorEnv)
			if got == row.want {
				return
			}
			t.Fatalf("natsLogOpts(%q,%q) = (noLog=%v debug=%v trace=%v monitor=%v), want (noLog=%v debug=%v trace=%v monitor=%v)",
				row.debugEnv, row.monitorEnv,
				got.noLog, got.debug, got.trace, got.monitor,
				row.want.noLog, row.want.debug, row.want.trace, row.want.monitor)
		})
	}

	// The security property, spelled out once more outside the table so it
	// cannot be lost by editing a row: monitor on, log off.
	quiet, verbose, _, open := natsLogOpts("", "1")
	if !open {
		t.Fatal("UNIBUS_NATS_MONITOR=1 must open the monitoring endpoint")
	}
	if !quiet || verbose {
		t.Fatalf("UNIBUS_NATS_MONITOR=1 must NOT enable the debug log (got debug=%v noLog=%v)", verbose, quiet)
	}
}
// TestMonitorEndpointLoopback boots a real embedded server with
// UNIBUS_NATS_MONITOR=1 (and DEBUG explicitly empty) and checks the contract the
// metrics scraper relies on: the monitoring HTTP endpoint is open, bound to a
// loopback address on port 8222, and /varz answers 200 with a non-empty body.
//
// The log side of the same env combination (monitor on, debug off) is covered
// by the pure natsLogOpts table test, not here.
func TestMonitorEndpointLoopback(t *testing.T) {
	t.Setenv("UNIBUS_NATS_DEBUG", "")
	t.Setenv("UNIBUS_NATS_MONITOR", "1")

	ns, err := StartServer(ServerConfig{
		StoreDir: t.TempDir(),
		Host:     "127.0.0.1",
		Port:     freeLoopbackPort(t),
	})
	if err != nil {
		t.Fatalf("start server with monitoring: %v", err)
	}
	defer func() { ns.Shutdown(); ns.WaitForShutdown() }()

	addr := ns.MonitorAddr()
	if addr == nil {
		t.Fatal("monitoring endpoint not open with UNIBUS_NATS_MONITOR=1 (MonitorAddr is nil)")
	}
	if !addr.IP.IsLoopback() {
		t.Fatalf("monitoring endpoint bound to %s, must be loopback only", addr.IP)
	}
	if addr.Port != 8222 {
		t.Fatalf("monitoring endpoint on port %d, want the fixed loopback port 8222", addr.Port)
	}

	// /varz must answer 200 with a non-empty body on loopback.
	url := "http://" + addr.String() + "/varz"

	// A per-request timeout keeps a stalled endpoint from hanging the whole
	// test run; the default client used by http.Get has no timeout at all.
	client := &http.Client{Timeout: time.Second}

	// Retry for up to ~3s in case the listener is not accepting yet. The loop
	// is post-checked so at least one request is always issued, which
	// guarantees resp is non-nil whenever err is nil below.
	var resp *http.Response
	deadline := time.Now().Add(3 * time.Second)
	for {
		resp, err = client.Get(url) //nolint:gosec // loopback monitoring endpoint, no auth by design
		if err == nil || !time.Now().Before(deadline) {
			break
		}
		time.Sleep(50 * time.Millisecond)
	}
	if err != nil {
		t.Fatalf("GET %s: %v", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		t.Fatalf("GET %s -> %d, want 200", url, resp.StatusCode)
	}
	// Surface a read failure as such instead of misreporting it as an empty body.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		t.Fatalf("GET %s: reading body: %v", url, err)
	}
	if len(body) == 0 {
		t.Fatalf("GET %s returned an empty body", url)
	}
}
// TestMonitorDisabledByDefault proves a server started without either toggle does
// NOT open the monitoring endpoint, so production stays closed unless opted in.
func TestMonitorDisabledByDefault(t *testing.T) {
t.Setenv("UNIBUS_NATS_DEBUG", "")
t.Setenv("UNIBUS_NATS_MONITOR", "")
ns, err := StartServer(ServerConfig{
StoreDir: t.TempDir(),
Host: "127.0.0.1",
Port: freeLoopbackPort(t),
})
if err != nil {
t.Fatalf("start server: %v", err)
}
defer func() { ns.Shutdown(); ns.WaitForShutdown() }()
if addr := ns.MonitorAddr(); addr != nil {
t.Fatalf("monitoring endpoint open (%s) without UNIBUS_NATS_MONITOR — must stay closed by default", addr)
}
}
// freeLoopbackPort returns a TCP port number that was free on 127.0.0.1 at the
// time of the call. It binds to port 0 so the OS picks the port, reads the
// chosen port back from the listener, and closes the listener on return. The
// test is failed via t.Fatalf if the listener cannot be opened.
func freeLoopbackPort(t *testing.T) int {
	t.Helper()

	probe, listenErr := net.Listen("tcp", "127.0.0.1:0")
	if listenErr != nil {
		t.Fatalf("free port: %v", listenErr)
	}
	// Release the port before handing its number back to the caller.
	defer probe.Close()

	bound := probe.Addr().(*net.TCPAddr)
	return bound.Port
}