Merge quick/nats-monitor-flag: UNIBUS_NATS_MONITOR loopback monitoring decoupled from debug log (bump 0.11.0)

2026-06-07 21:18:59 +02:00
parent b4f3118e85 1c9325104c
commit f31580deec
5 changed files with 274 additions and 12 deletions
@@ -2,7 +2,7 @@
 name: unibus
 lang: go
 domain: infra
-version: 0.10.0
+version: 0.11.0
 description: "Bus de mensajería unificado sobre NATS+JetStream con cifrado E2E por room (megolm/olm reducido): service de membresía/claves, librería cliente y peers demo."
 tags: [service, messaging, nats, e2e]
 uses_functions:
@@ -169,6 +169,26 @@ agent.<nombre>.{in,out}   inbox/outbox de agente LLM (agent.scout.in)

 ## Capability growth log

+- v0.11.0 (2026-06-07) — flag dedicado `UNIBUS_NATS_MONITOR` que abre el endpoint
+  de monitoring HTTP del nats-server embebido (`127.0.0.1:8222`, loopback only) de
+  forma DESACOPLADA del debug-log. Antes el monitoring solo se abría con
+  `UNIBUS_NATS_DEBUG=1`, que además encendía el log verboso del nats-server
+  (rutas/RAFT/subjects a journald en claro) — incompatible con el endurecimiento
+  del issue 0007. El cómputo de los toggles se extrae a una función pura
+  `natsLogOpts(debugEnv, monitorEnv) (noLog, debug, trace, monitor)`: `MONITOR=1`
+  abre el endpoint dejando el log en silencio (`NoLog` true / `Debug` false), y se
+  mantiene el acoplamiento inverso por compatibilidad (`DEBUG` sigue implicando
+  `MONITOR`). El bind loopback `127.0.0.1` queda hardcoded — el monitoring NUNCA es
+  público y no lleva auth; lo lee un scraper local que empuja a VictoriaMetrics
+  (dashboard `unibus-nats` en `fleet_monitoring`). Se versiona el cableado de
+  deploy: drop-in systemd aditivo `membershipd-cluster.service.d/nats-monitor.conf`
+  (`Environment=UNIBUS_NATS_MONITOR=1`) + sección "NATS server metrics" en el
+  README del cluster con el runbook de activación rolling (magnus→homer→datardos)
+  y gate de reconvergencia R3 (`followers 2/2`) entre nodos. Tests nuevos: tabla
+  pura del desacoplamiento (monitor on ⇒ log NO debug; debug ⇒ monitor; default
+  cerrado) + server real con `MONITOR=1` que confirma `/varz` 200 en loopback:8222
+  y server sin flag con el endpoint cerrado. Cambios 100% aditivos: sin el flag el
+  comportamiento es idéntico; build/test verdes.
 - v0.10.0 (2026-06-07) — API HTTP admin-only de gestión de usuarios, cerrando la
  última asimetría del control plane: las rooms tenían superficie HTTP firmada
  (`POST /rooms`, etc.) pero los users solo se gestionaban por CLI local o acceso
@@ -283,3 +283,61 @@ ssh dd 'sudo systemctl start membershipd-cluster'   # rejoins, catches up
 the unit and start it without `--store kv`/`--cluster-name`; the KV buckets remain
 for a later retry. To rotate the cluster CA, re-run `generate-cluster-certs.sh
 --force` and re-stage (every node must get the new `cluster-ca.crt` together).
+
+## NATS server metrics (loopback monitoring — optional)
+
+The embedded NATS server can expose its own monitoring HTTP endpoint so a local
+scraper reads server-level metrics that `/healthz` does not surface: msgs/s,
+connections, slow consumers, memory, KV bucket message counts, the RAFT leader per
+stream and per-stream restarts. This feeds the `unibus-nats` dashboard in
+`fleet_monitoring` (the scraper hits `127.0.0.1:8222/varz|/connz|/jsz` over
+loopback and pushes to VictoriaMetrics).
+
+The endpoint is opened by the **dedicated** environment toggle `UNIBUS_NATS_MONITOR=1`
+(0.11.0+ binary). It is **decoupled** from `UNIBUS_NATS_DEBUG`: it opens the
+monitoring endpoint WITHOUT enabling the verbose nats-server debug log, so no room
+subjects or routing metadata leak to journald (keeps the hardened posture, issue
+0007). The endpoint binds `127.0.0.1:8222` **only** — the binary hardcodes the
+loopback bind, so it is never reachable from the network and needs no auth. Never
+use `UNIBUS_NATS_DEBUG` in production just to get the endpoint.
+
+### Enable it (HUMAN — requires the 0.11.0+ binary on the node)
+
+The clean way is the additive systemd drop-in in this directory:
+
+```bash
+# On each node, AFTER the 0.11.0+ binary is in /opt/unibus/membershipd:
+ssh <node> 'sudo mkdir -p /etc/systemd/system/membershipd-cluster.service.d'
+scp membershipd-cluster.service.d/nats-monitor.conf <node>:/tmp/nats-monitor.conf
+ssh <node> 'sudo cp /tmp/nats-monitor.conf /etc/systemd/system/membershipd-cluster.service.d/ \
+  && sudo systemctl daemon-reload && sudo systemctl restart membershipd-cluster'
+```
+
+(Equivalently, add `UNIBUS_NATS_MONITOR=1` to `/opt/unibus/cluster.env`, which the
+unit already sources via `EnvironmentFile`; the drop-in is preferred because it is
+self-documenting and does not edit the generated env file.)
+
+### Rolling restart with the R3 reconvergence gate (CRITICAL)
+
+`systemctl restart membershipd-cluster` restarts that node's JetStream RAFT member.
+**Never restart two nodes at once** — that would drop the cluster below quorum
+(2/3) and fail the control plane closed. Roll **one node at a time**, in the order
+`magnus → homer → datardos`, and between each node wait until the cluster has
+reconverged to R3 (every control-plane bucket back to `followers_current=2/2`):
+
+```bash
+# After restarting ONE node, gate on R3 reconvergence before touching the next:
+ssh root@magnus 'for s in KV_UNIBUS_users KV_UNIBUS_rooms KV_UNIBUS_members \
+  KV_UNIBUS_room_keys KV_UNIBUS_rooms_by_member KV_UNIBUS_nonces; do
+    nats --server nats://127.0.0.1:4250 stream info "$s" -j \
+      | jq -r --arg s "$s" "\"\\(\$s): leader=\\(.cluster.leader) followers_current=\\([.cluster.replicas[]? | select(.current)] | length)/\\(.cluster.replicas | length)\""
+  done'
+# Proceed to the next node ONLY when all six show a leader and
+# followers_current=2/2 (R3 = 1 leader + 2 current followers). Also confirm healthz
+# is green on the just-restarted node first:
+ssh <node> 'curl -fsS https://127.0.0.1:8470/healthz --cacert /opt/unibus/tls/ca.crt'
+```
+
+This restart is normally **not** done as a standalone step: the 0.11.0 binary that
+carries the flag is rolled to the three nodes in the consolidated rollout, and the
+drop-in is installed during that same rolling restart.
@@ -0,0 +1,27 @@
+# Drop-in: enable the embedded NATS server monitoring HTTP endpoint so a local
+# metrics scraper can read /varz, /connz and /jsz for server-level metrics
+# (msgs/s, connections, KV bucket msgs, RAFT leader per stream, restarts).
+#
+# ADDITIVE and minimal: it only sets one environment variable; the base unit
+# (membershipd-cluster.service) is otherwise unchanged.
+#
+# UNIBUS_NATS_MONITOR is DECOUPLED from UNIBUS_NATS_DEBUG: it opens the monitoring
+# endpoint WITHOUT enabling the verbose nats-server debug log, so no room subjects
+# or routing metadata are written to journald (keeps the hardened posture, issue
+# 0007). Do NOT use UNIBUS_NATS_DEBUG in production just to get the endpoint.
+#
+# The endpoint binds 127.0.0.1:8222 ONLY — the binary hardcodes the loopback bind,
+# so it is never reachable from the network and needs no auth. The scraper runs on
+# the same host and reads it over loopback.
+#
+# Requires the 0.11.0+ membershipd binary (the one that honors UNIBUS_NATS_MONITOR).
+# Install on a node:
+#   sudo mkdir -p /etc/systemd/system/membershipd-cluster.service.d
+#   sudo cp nats-monitor.conf /etc/systemd/system/membershipd-cluster.service.d/
+#   sudo systemctl daemon-reload && sudo systemctl restart membershipd-cluster
+#
+# Restarting a node restarts its JetStream RAFT member, so roll ONE node at a time
+# and wait for R3 reconvergence (followers 2/2) before touching the next. See the
+# "NATS server metrics" section of this directory's README for the full runbook.
+[Service]
+Environment=UNIBUS_NATS_MONITOR=1
@@ -103,17 +103,38 @@ func StartHostAuth(storeDir, host string, port int, auth server.Authentication)
 	return StartServer(ServerConfig{StoreDir: storeDir, Host: host, Port: port, Auth: auth})
 }

+// natsLogOpts maps the two independent environment toggles to the embedded
+// nats-server logging and monitoring flags. It is a pure function (no I/O) so the
+// decoupling between the two toggles can be unit-tested directly.
+//
+//   - UNIBUS_NATS_DEBUG="1" enables the nats-server logger (route/RAFT/JetStream
+//     errors); "2" additionally enables protocol tracing. Any other value (or
+//     unset) keeps the server silent (noLog), so production behavior is unchanged.
+//   - UNIBUS_NATS_MONITOR="1" opens the monitoring HTTP endpoint (loopback only)
+//     for a local metrics scraper to read /varz, /connz and /jsz.
+//
+// The two are DECOUPLED on purpose: enabling the monitoring endpoint must NOT turn
+// on the verbose debug log, which would write room subjects and routing metadata
+// to journald in cleartext and regress the hardened posture (issue 0007). The
+// reverse coupling is kept for backward compatibility: debug mode still exposes
+// the monitoring endpoint as well (debug implies monitor), so existing debugging
+// workflows are unchanged. noLog is always the negation of debug.
+func natsLogOpts(debugEnv, monitorEnv string) (noLog, debug, trace, monitor bool) {
+	debug = debugEnv == "1" || debugEnv == "2" // exact match only; any other value leaves the logger off
+	trace = debugEnv == "2"                    // level "2" is debug plus tracing
+	monitor = monitorEnv == "1" || debug       // dedicated toggle, or implied by debug (backward compat)
+	noLog = !debug                             // monitor alone never un-silences the logger
+	return noLog, debug, trace, monitor
+}
+
 // StartServer launches an embedded nats-server with JetStream from cfg. It
 // blocks until the server is ready to accept connections (up to 5s) and returns
 // the running server; the caller must Shutdown it.
 func StartServer(cfg ServerConfig) (*server.Server, error) {
-	// Diagnostic toggle: UNIBUS_NATS_DEBUG=1 enables the embedded nats-server's own
-	// logger (route/RAFT/JetStream errors), which is otherwise silenced. Off by
-	// default so production behavior is unchanged; only set it when debugging the
-	// cluster route layer.
-	debugLevel := os.Getenv("UNIBUS_NATS_DEBUG")
-	debugNATS := debugLevel == "1" || debugLevel == "2"
-	traceNATS := debugLevel == "2"
+	// Map the two independent env toggles to the nats-server logging + monitoring
+	// flags. See natsLogOpts for the decoupling rationale (issue 0007).
+	noLog, debugNATS, traceNATS, monitorNATS := natsLogOpts(
+		os.Getenv("UNIBUS_NATS_DEBUG"), os.Getenv("UNIBUS_NATS_MONITOR"))
 	opts := &server.Options{
 		JetStream:  true,
 		StoreDir:   cfg.StoreDir,
@@ -122,15 +143,17 @@ func StartServer(cfg ServerConfig) (*server.Server, error) {
 		ServerName: cfg.ServerName,
 		DontListen: false,
 		// Keep the embedded server quiet by default; the host app logs the URLs.
-		NoLog:   !debugNATS,
+		NoLog:   noLog,
 		Debug:   debugNATS,
 		Trace:   traceNATS,
 		Logtime: true,
 		NoSigs:  true,
 	}
-	if debugNATS {
-		// Expose the nats-server monitoring endpoint (loopback) so the operator can
-		// inspect /jsz, /routez, /varz while debugging the cluster meta-group.
+	if monitorNATS {
+		// Expose the nats-server monitoring endpoint on LOOPBACK ONLY (never public):
+		// the operator (or a local metrics scraper) inspects /varz, /connz, /jsz,
+		// /routez. The 127.0.0.1 bind is mandatory because this endpoint has no auth;
+		// it must stay unreachable from the network.
 		opts.HTTPHost = "127.0.0.1"
 		opts.HTTPPort = 8222
 	}
@@ -0,0 +1,134 @@
+package embeddednats
+
+import (
+	"io"
+	"net"
+	"net/http"
+	"testing"
+	"time"
+)
+
+// TestNatsLogOptsDecoupled is the core regression guard for issue 0007: turning
+// on the monitoring endpoint must NEVER turn on the verbose nats-server debug log
+// (which would leak room subjects/routing metadata to journald). It also checks
+// the backward-compatible coupling (debug still implies monitoring) and the quiet
+// default.
+func TestNatsLogOptsDecoupled(t *testing.T) {
+	cases := []struct {
+		name                         string
+		debugEnv, monitorEnv         string // inputs: raw values of the two env toggles
+		noLog, debug, trace, monitor bool   // expected results, in natsLogOpts return order
+	}{
+		{"default off — quiet, no monitor", "", "", true, false, false, false},
+		{"monitor only — endpoint on, log stays quiet", "", "1", true, false, false, true},
+		{"debug implies monitor", "1", "", false, true, false, true},
+		{"trace implies debug+monitor", "2", "", false, true, true, true},
+		{"both set", "1", "1", false, true, false, true},
+		{"monitor garbage value ignored", "", "yes", true, false, false, false},
+		{"debug garbage value ignored", "true", "", true, false, false, false},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			noLog, debug, trace, monitor := natsLogOpts(c.debugEnv, c.monitorEnv)
+			if noLog != c.noLog || debug != c.debug || trace != c.trace || monitor != c.monitor {
+				t.Fatalf("natsLogOpts(%q,%q) = (noLog=%v debug=%v trace=%v monitor=%v), want (noLog=%v debug=%v trace=%v monitor=%v)",
+					c.debugEnv, c.monitorEnv, noLog, debug, trace, monitor,
+					c.noLog, c.debug, c.trace, c.monitor)
+			}
+		})
+	}
+
+	// Explicit golden assertion of the security property: monitor on, log off.
+	noLog, debug, _, monitor := natsLogOpts("", "1")
+	if !monitor {
+		t.Fatal("UNIBUS_NATS_MONITOR=1 must open the monitoring endpoint")
+	}
+	if debug || !noLog {
+		t.Fatalf("UNIBUS_NATS_MONITOR=1 must NOT enable the debug log (got debug=%v noLog=%v)", debug, noLog)
+	}
+}
+
+// TestMonitorEndpointLoopback boots a real embedded server with
+// UNIBUS_NATS_MONITOR=1 (and DEBUG explicitly off) and proves the monitoring HTTP
+// endpoint answers on loopback only — the exact contract the metrics scraper
+// relies on. The pure decoupling check above already guarantees the log stays out
+// of debug mode for this same env combination.
+func TestMonitorEndpointLoopback(t *testing.T) {
+	t.Setenv("UNIBUS_NATS_DEBUG", "") // empty value: debug stays off whatever the caller's environment holds
+	t.Setenv("UNIBUS_NATS_MONITOR", "1")
+
+	ns, err := StartServer(ServerConfig{
+		StoreDir: t.TempDir(),
+		Host:     "127.0.0.1",
+		Port:     freeLoopbackPort(t),
+	})
+	if err != nil {
+		t.Fatalf("start server with monitoring: %v", err)
+	}
+	defer func() { ns.Shutdown(); ns.WaitForShutdown() }()
+
+	addr := ns.MonitorAddr()
+	if addr == nil {
+		t.Fatal("monitoring endpoint not open with UNIBUS_NATS_MONITOR=1 (MonitorAddr is nil)")
+	}
+	if !addr.IP.IsLoopback() {
+		t.Fatalf("monitoring endpoint bound to %s, must be loopback only", addr.IP)
+	}
+	if addr.Port != 8222 {
+		t.Fatalf("monitoring endpoint on port %d, want the fixed loopback port 8222", addr.Port)
+	}
+
+	// /varz must answer 200 with a non-empty body on loopback.
+	url := "http://" + addr.String() + "/varz"
+	var resp *http.Response
+	deadline := time.Now().Add(3 * time.Second) // retry budget: poll every 50ms until the first GET succeeds
+	for time.Now().Before(deadline) {
+		resp, err = http.Get(url) //nolint:gosec // loopback monitoring endpoint, no auth by design
+		if err == nil {
+			break
+		}
+		time.Sleep(50 * time.Millisecond)
+	}
+	if err != nil {
+		t.Fatalf("GET %s: %v", url, err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("GET %s -> %d, want 200", url, resp.StatusCode)
+	}
+	body, _ := io.ReadAll(resp.Body) // read error ignored; only the body length is checked below
+	if len(body) == 0 {
+		t.Fatalf("GET %s returned an empty body", url)
+	}
+}
+
+// TestMonitorDisabledByDefault proves a server started without either toggle does
+// NOT open the monitoring endpoint, so production stays closed unless opted in.
+func TestMonitorDisabledByDefault(t *testing.T) {
+	t.Setenv("UNIBUS_NATS_DEBUG", "")   // override the caller's environment: debug off
+	t.Setenv("UNIBUS_NATS_MONITOR", "") // override the caller's environment: monitor off
+
+	ns, err := StartServer(ServerConfig{
+		StoreDir: t.TempDir(),
+		Host:     "127.0.0.1",
+		Port:     freeLoopbackPort(t),
+	})
+	if err != nil {
+		t.Fatalf("start server: %v", err)
+	}
+	defer func() { ns.Shutdown(); ns.WaitForShutdown() }()
+
+	if addr := ns.MonitorAddr(); addr != nil {
+		t.Fatalf("monitoring endpoint open (%s) without UNIBUS_NATS_MONITOR — must stay closed by default", addr)
+	}
+}
+
+func freeLoopbackPort(t *testing.T) int {
+	t.Helper() // report failures at the caller's line, not inside this helper
+	l, err := net.Listen("tcp", "127.0.0.1:0") // port 0: let the OS pick an unused loopback port
+	if err != nil {
+		t.Fatalf("free port: %v", err)
+	}
+	defer l.Close() // listener is released on return, so the port is free for the caller but not reserved
+	return l.Addr().(*net.TCPAddr).Port
+}