From 1c9325104cfaf88cc389ad494f6e261fa8d5dc25 Mon Sep 17 00:00:00 2001 From: Egutierrez Date: Sun, 7 Jun 2026 20:57:46 +0200 Subject: [PATCH] feat(embeddednats): UNIBUS_NATS_MONITOR flag decoupled from debug log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a dedicated UNIBUS_NATS_MONITOR=1 toggle that opens the embedded nats-server monitoring HTTP endpoint (127.0.0.1:8222, loopback only) so a local metrics scraper can read /varz, /connz and /jsz for server-level metrics (msgs/s, connections, KV bucket msgs, RAFT leader per stream, restarts). Previously the monitoring endpoint was only reachable via UNIBUS_NATS_DEBUG=1, which is coupled to the verbose nats-server debug log: enabling the endpoint also wrote routes/RAFT/room subjects to journald in clear, which regresses the hardened posture (issue 0007). The two concerns are now decoupled. The toggle computation is extracted to a pure function natsLogOpts(debugEnv, monitorEnv) (noLog, debug, trace, monitor): MONITOR=1 opens the endpoint while keeping the log quiet (NoLog true / Debug false). The inverse coupling is preserved for backward compatibility (DEBUG still implies MONITOR). The 127.0.0.1 bind stays hardcoded — the monitoring endpoint has no auth and must never be reachable from the network. Deploy wiring versioned: additive systemd drop-in membershipd-cluster.service.d/nats-monitor.conf (Environment=UNIBUS_NATS_MONITOR=1) plus a "NATS server metrics" section in the cluster README with the rolling activation runbook (magnus -> homer -> datardos) gated on R3 reconvergence (followers 2/2) between nodes. Tests: pure decoupling table (monitor on => log NOT debug; debug => monitor; default closed) + a real embedded server with MONITOR=1 asserting /varz answers 200 on loopback:8222, and a server without the flag with the endpoint closed. 100% additive: behavior is identical without the flag. Bump app.md 0.10.0 -> 0.11.0. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- app.md | 22 ++- deploy/cluster/README.md | 58 ++++++++ .../nats-monitor.conf | 27 ++++ pkg/embeddednats/embeddednats.go | 45 ++++-- pkg/embeddednats/monitor_test.go | 134 ++++++++++++++++++ 5 files changed, 274 insertions(+), 12 deletions(-) create mode 100644 deploy/cluster/membershipd-cluster.service.d/nats-monitor.conf create mode 100644 pkg/embeddednats/monitor_test.go diff --git a/app.md b/app.md index 561c261..f131a98 100644 --- a/app.md +++ b/app.md @@ -2,7 +2,7 @@ name: unibus lang: go domain: infra -version: 0.10.0 +version: 0.11.0 description: "Bus de mensajería unificado sobre NATS+JetStream con cifrado E2E por room (megolm/olm reducido): service de membresía/claves, librería cliente y peers demo." tags: [service, messaging, nats, e2e] uses_functions: @@ -169,6 +169,26 @@ agent..{in,out} inbox/outbox de agente LLM (agent.scout.in) ## Capability growth log +- v0.11.0 (2026-06-07) — flag dedicado `UNIBUS_NATS_MONITOR` que abre el endpoint + de monitoring HTTP del nats-server embebido (`127.0.0.1:8222`, loopback only) de + forma DESACOPLADA del debug-log. Antes el monitoring solo se abría con + `UNIBUS_NATS_DEBUG=1`, que además encendía el log verboso del nats-server + (rutas/RAFT/subjects a journald en claro) — incompatible con el endurecimiento + del issue 0007. El cómputo de los toggles se extrae a una función pura + `natsLogOpts(debugEnv, monitorEnv) (noLog, debug, trace, monitor)`: `MONITOR=1` + abre el endpoint dejando el log en silencio (`NoLog` true / `Debug` false), y se + mantiene el acoplamiento inverso por compatibilidad (`DEBUG` sigue implicando + `MONITOR`). El bind loopback `127.0.0.1` queda hardcoded — el monitoring NUNCA es + público y no lleva auth; lo lee un scraper local que empuja a VictoriaMetrics + (dashboard `unibus-nats` en `fleet_monitoring`). 
Se versiona el cableado de + deploy: drop-in systemd aditivo `membershipd-cluster.service.d/nats-monitor.conf` + (`Environment=UNIBUS_NATS_MONITOR=1`) + sección "NATS server metrics" en el + README del cluster con el runbook de activación rolling (magnus→homer→datardos) + y gate de reconvergencia R3 (`followers 2/2`) entre nodos. Tests nuevos: tabla + pura del desacoplamiento (monitor on ⇒ log NO debug; debug ⇒ monitor; default + cerrado) + server real con `MONITOR=1` que confirma `/varz` 200 en loopback:8222 + y server sin flag con el endpoint cerrado. Cambios 100% aditivos: sin el flag el + comportamiento es idéntico; build/test verdes. - v0.10.0 (2026-06-07) — API HTTP admin-only de gestión de usuarios, cerrando la última asimetría del control plane: las rooms tenían superficie HTTP firmada (`POST /rooms`, etc.) pero los users solo se gestionaban por CLI local o acceso diff --git a/deploy/cluster/README.md b/deploy/cluster/README.md index a1777ec..bb1dd6b 100644 --- a/deploy/cluster/README.md +++ b/deploy/cluster/README.md @@ -283,3 +283,61 @@ ssh dd 'sudo systemctl start membershipd-cluster' # rejoins, catches up the unit and start it without `--store kv`/`--cluster-name`; the KV buckets remain for a later retry. To rotate the cluster CA, re-run `generate-cluster-certs.sh --force` and re-stage (every node must get the new `cluster-ca.crt` together). + +## NATS server metrics (loopback monitoring — optional) + +The embedded NATS server can expose its own monitoring HTTP endpoint so a local +scraper reads server-level metrics that `/healthz` does not surface: msgs/s, +connections, slow consumers, memory, KV bucket message counts, the RAFT leader per +stream and per-stream restarts. This feeds the `unibus-nats` dashboard in +`fleet_monitoring` (the scraper hits `127.0.0.1:8222/varz|/connz|/jsz` over +loopback and pushes to VictoriaMetrics). + +The endpoint is opened by the **dedicated** environment toggle `UNIBUS_NATS_MONITOR=1` +(0.11.0+ binary). 
It is **decoupled** from `UNIBUS_NATS_DEBUG`: it opens the +monitoring endpoint WITHOUT enabling the verbose nats-server debug log, so no room +subjects or routing metadata leak to journald (keeps the hardened posture, issue +0007). The endpoint binds `127.0.0.1:8222` **only** — the binary hardcodes the +loopback bind, so it is never reachable from the network and needs no auth. Never +use `UNIBUS_NATS_DEBUG` in production just to get the endpoint. + +### Enable it (HUMAN — requires the 0.11.0+ binary on the node) + +The clean way is the additive systemd drop-in in this directory: + +```bash +# On each node, AFTER the 0.11.0+ binary is in /opt/unibus/membershipd: +ssh <node> 'sudo mkdir -p /etc/systemd/system/membershipd-cluster.service.d' +scp membershipd-cluster.service.d/nats-monitor.conf <node>:/tmp/nats-monitor.conf +ssh <node> 'sudo cp /tmp/nats-monitor.conf /etc/systemd/system/membershipd-cluster.service.d/ \ + && sudo systemctl daemon-reload && sudo systemctl restart membershipd-cluster' +``` + +(Equivalently, add `UNIBUS_NATS_MONITOR=1` to `/opt/unibus/cluster.env`, which the +unit already sources via `EnvironmentFile`; the drop-in is preferred because it is +self-documenting and does not edit the generated env file.) + +### Rolling restart with the R3 reconvergence gate (CRITICAL) + +`systemctl restart membershipd-cluster` restarts that node's JetStream RAFT member. +**Never restart two nodes at once** — that would drop the cluster below quorum +(2/3) and fail the control plane closed. 
Roll **one node at a time**, in the order +`magnus → homer → datardos`, and between each node wait until the cluster has +reconverged to R3 (every control-plane bucket back to `followers_current=2/2`): + +```bash +# After restarting ONE node, gate on R3 reconvergence before touching the next: +ssh root@magnus 'for s in KV_UNIBUS_users KV_UNIBUS_rooms KV_UNIBUS_members \ + KV_UNIBUS_room_keys KV_UNIBUS_rooms_by_member KV_UNIBUS_nonces; do + nats --server nats://127.0.0.1:4250 stream info "$s" -j \ + | jq -r --arg s "$s" "[\$s, .cluster.leader, ([.cluster.replicas[]? | select(.current)] | length)] | @tsv" + done' +# Proceed to the next node ONLY when all six print a leader and 2 current followers +# (`.cluster.replicas` lists followers only, so R3 = leader + 2/2 current). Also +# confirm healthz is green on the just-restarted node first: +ssh <node> 'curl -fsS https://127.0.0.1:8470/healthz --cacert /opt/unibus/tls/ca.crt' +``` + +This restart is normally **not** done as a standalone step: the 0.11.0 binary that +carries the flag is rolled to the three nodes in the consolidated rollout, and the +drop-in is installed during that same rolling restart. diff --git a/deploy/cluster/membershipd-cluster.service.d/nats-monitor.conf b/deploy/cluster/membershipd-cluster.service.d/nats-monitor.conf new file mode 100644 index 0000000..d9c39cf --- /dev/null +++ b/deploy/cluster/membershipd-cluster.service.d/nats-monitor.conf @@ -0,0 +1,27 @@ +# Drop-in: enable the embedded NATS server monitoring HTTP endpoint so a local +# metrics scraper can read /varz, /connz and /jsz for server-level metrics +# (msgs/s, connections, KV bucket msgs, RAFT leader per stream, restarts). +# +# ADDITIVE and minimal: it only sets one environment variable; the base unit +# (membershipd-cluster.service) is otherwise unchanged. 
+# +# UNIBUS_NATS_MONITOR is DECOUPLED from UNIBUS_NATS_DEBUG: it opens the monitoring +# endpoint WITHOUT enabling the verbose nats-server debug log, so no room subjects +# or routing metadata are written to journald (keeps the hardened posture, issue +# 0007). Do NOT use UNIBUS_NATS_DEBUG in production just to get the endpoint. +# +# The endpoint binds 127.0.0.1:8222 ONLY — the binary hardcodes the loopback bind, +# so it is never reachable from the network and needs no auth. The scraper runs on +# the same host and reads it over loopback. +# +# Requires the 0.11.0+ membershipd binary (the one that honors UNIBUS_NATS_MONITOR). +# Install on a node: +# sudo mkdir -p /etc/systemd/system/membershipd-cluster.service.d +# sudo cp nats-monitor.conf /etc/systemd/system/membershipd-cluster.service.d/ +# sudo systemctl daemon-reload && sudo systemctl restart membershipd-cluster +# +# Restarting a node restarts its JetStream RAFT member, so roll ONE node at a time +# and wait for R3 reconvergence (followers 2/2) before touching the next. See the +# "NATS server metrics" section of this directory's README for the full runbook. +[Service] +Environment=UNIBUS_NATS_MONITOR=1 diff --git a/pkg/embeddednats/embeddednats.go b/pkg/embeddednats/embeddednats.go index 26cdd10..9cd02c6 100644 --- a/pkg/embeddednats/embeddednats.go +++ b/pkg/embeddednats/embeddednats.go @@ -103,17 +103,38 @@ func StartHostAuth(storeDir, host string, port int, auth server.Authentication) return StartServer(ServerConfig{StoreDir: storeDir, Host: host, Port: port, Auth: auth}) } +// natsLogOpts maps the two independent environment toggles to the embedded +// nats-server logging and monitoring flags. It is a pure function (no I/O) so the +// decoupling between the two toggles can be unit-tested directly. +// +// - UNIBUS_NATS_DEBUG="1" enables the nats-server logger (route/RAFT/JetStream +// errors); "2" additionally enables protocol tracing. 
Off by default so the +// server stays silent (NoLog) and production behavior is unchanged. +// - UNIBUS_NATS_MONITOR="1" opens the monitoring HTTP endpoint (loopback only) +// for a local metrics scraper to read /varz, /connz and /jsz. +// +// The two are DECOUPLED on purpose: enabling the monitoring endpoint must NOT turn +// on the verbose debug log, which would write room subjects and routing metadata +// to journald in clear and regress the hardened posture (issue 0007). The reverse +// coupling is kept for backward compatibility: debug mode still exposes the +// monitoring endpoint as well (debug implies monitor), so existing debugging +// workflows are unchanged. +func natsLogOpts(debugEnv, monitorEnv string) (noLog, debug, trace, monitor bool) { + debug = debugEnv == "1" || debugEnv == "2" + trace = debugEnv == "2" + monitor = monitorEnv == "1" || debug + noLog = !debug + return noLog, debug, trace, monitor +} + // StartServer launches an embedded nats-server with JetStream from cfg. It // blocks until the server is ready to accept connections (up to 5s) and returns // the running server; the caller must Shutdown it. func StartServer(cfg ServerConfig) (*server.Server, error) { - // Diagnostic toggle: UNIBUS_NATS_DEBUG=1 enables the embedded nats-server's own - // logger (route/RAFT/JetStream errors), which is otherwise silenced. Off by - // default so production behavior is unchanged; only set it when debugging the - // cluster route layer. - debugLevel := os.Getenv("UNIBUS_NATS_DEBUG") - debugNATS := debugLevel == "1" || debugLevel == "2" - traceNATS := debugLevel == "2" + // Map the two independent env toggles to the nats-server logging + monitoring + // flags. See natsLogOpts for the decoupling rationale (issue 0007). 
+ noLog, debugNATS, traceNATS, monitorNATS := natsLogOpts( + os.Getenv("UNIBUS_NATS_DEBUG"), os.Getenv("UNIBUS_NATS_MONITOR")) opts := &server.Options{ JetStream: true, StoreDir: cfg.StoreDir, @@ -122,15 +143,17 @@ func StartServer(cfg ServerConfig) (*server.Server, error) { ServerName: cfg.ServerName, DontListen: false, // Keep the embedded server quiet by default; the host app logs the URLs. - NoLog: !debugNATS, + NoLog: noLog, Debug: debugNATS, Trace: traceNATS, Logtime: true, NoSigs: true, } - if debugNATS { - // Expose the nats-server monitoring endpoint (loopback) so the operator can - // inspect /jsz, /routez, /varz while debugging the cluster meta-group. + if monitorNATS { + // Expose the nats-server monitoring endpoint on LOOPBACK ONLY (never public): + // the operator (or a local metrics scraper) inspects /varz, /connz, /jsz, + // /routez. The 127.0.0.1 bind is mandatory because this endpoint has no auth; + // it must stay unreachable from the network. opts.HTTPHost = "127.0.0.1" opts.HTTPPort = 8222 } diff --git a/pkg/embeddednats/monitor_test.go b/pkg/embeddednats/monitor_test.go new file mode 100644 index 0000000..bb24666 --- /dev/null +++ b/pkg/embeddednats/monitor_test.go @@ -0,0 +1,134 @@ +package embeddednats + +import ( + "io" + "net" + "net/http" + "testing" + "time" +) + +// TestNatsLogOptsDecoupled is the core regression guard for issue 0007: turning +// on the monitoring endpoint must NEVER turn on the verbose nats-server debug log +// (which would leak room subjects/routing metadata to journald). It also checks +// the backward-compatible coupling (debug still implies monitoring) and the quiet +// default. 
+func TestNatsLogOptsDecoupled(t *testing.T) { + cases := []struct { + name string + debugEnv, monitorEnv string + noLog, debug, trace, monitor bool + }{ + {"default off — quiet, no monitor", "", "", true, false, false, false}, + {"monitor only — endpoint on, log stays quiet", "", "1", true, false, false, true}, + {"debug implies monitor", "1", "", false, true, false, true}, + {"trace implies debug+monitor", "2", "", false, true, true, true}, + {"both set", "1", "1", false, true, false, true}, + {"monitor garbage value ignored", "", "yes", true, false, false, false}, + {"debug garbage value ignored", "true", "", true, false, false, false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + noLog, debug, trace, monitor := natsLogOpts(c.debugEnv, c.monitorEnv) + if noLog != c.noLog || debug != c.debug || trace != c.trace || monitor != c.monitor { + t.Fatalf("natsLogOpts(%q,%q) = (noLog=%v debug=%v trace=%v monitor=%v), want (noLog=%v debug=%v trace=%v monitor=%v)", + c.debugEnv, c.monitorEnv, noLog, debug, trace, monitor, + c.noLog, c.debug, c.trace, c.monitor) + } + }) + } + + // Explicit golden assertion of the security property: monitor on, log off. + noLog, debug, _, monitor := natsLogOpts("", "1") + if !monitor { + t.Fatal("UNIBUS_NATS_MONITOR=1 must open the monitoring endpoint") + } + if debug || !noLog { + t.Fatalf("UNIBUS_NATS_MONITOR=1 must NOT enable the debug log (got debug=%v noLog=%v)", debug, noLog) + } +} + +// TestMonitorEndpointLoopback boots a real embedded server with +// UNIBUS_NATS_MONITOR=1 (and DEBUG explicitly off) and proves the monitoring HTTP +// endpoint answers on loopback only — the exact contract the metrics scraper +// relies on. The pure decoupling check above already guarantees the log stays out +// of debug mode for this same env combination. 
+func TestMonitorEndpointLoopback(t *testing.T) { + t.Setenv("UNIBUS_NATS_DEBUG", "") + t.Setenv("UNIBUS_NATS_MONITOR", "1") + + ns, err := StartServer(ServerConfig{ + StoreDir: t.TempDir(), + Host: "127.0.0.1", + Port: freeLoopbackPort(t), + }) + if err != nil { + t.Fatalf("start server with monitoring: %v", err) + } + defer func() { ns.Shutdown(); ns.WaitForShutdown() }() + + addr := ns.MonitorAddr() + if addr == nil { + t.Fatal("monitoring endpoint not open with UNIBUS_NATS_MONITOR=1 (MonitorAddr is nil)") + } + if !addr.IP.IsLoopback() { + t.Fatalf("monitoring endpoint bound to %s, must be loopback only", addr.IP) + } + if addr.Port != 8222 { + t.Fatalf("monitoring endpoint on port %d, want the fixed loopback port 8222", addr.Port) + } + + // /varz must answer 200 with a non-empty body on loopback. + url := "http://" + addr.String() + "/varz" + var resp *http.Response + deadline := time.Now().Add(3 * time.Second) + for time.Now().Before(deadline) { + resp, err = http.Get(url) //nolint:gosec // loopback monitoring endpoint, no auth by design + if err == nil { + break + } + time.Sleep(50 * time.Millisecond) + } + if err != nil { + t.Fatalf("GET %s: %v", url, err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("GET %s -> %d, want 200", url, resp.StatusCode) + } + body, _ := io.ReadAll(resp.Body) + if len(body) == 0 { + t.Fatalf("GET %s returned an empty body", url) + } +} + +// TestMonitorDisabledByDefault proves a server started without either toggle does +// NOT open the monitoring endpoint, so production stays closed unless opted in. 
+func TestMonitorDisabledByDefault(t *testing.T) { + t.Setenv("UNIBUS_NATS_DEBUG", "") + t.Setenv("UNIBUS_NATS_MONITOR", "") + + ns, err := StartServer(ServerConfig{ + StoreDir: t.TempDir(), + Host: "127.0.0.1", + Port: freeLoopbackPort(t), + }) + if err != nil { + t.Fatalf("start server: %v", err) + } + defer func() { ns.Shutdown(); ns.WaitForShutdown() }() + + if addr := ns.MonitorAddr(); addr != nil { + t.Fatalf("monitoring endpoint open (%s) without UNIBUS_NATS_MONITOR — must stay closed by default", addr) + } +} + +func freeLoopbackPort(t *testing.T) int { + t.Helper() + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("free port: %v", err) + } + defer l.Close() + return l.Addr().(*net.TCPAddr).Port +}