fix(0006a): wire replicated nonce store on clustered nodes (audit 0008 N3)

membershipd never called Server.UseReplicatedNonces, so every node kept a
per-process anti-replay cache and a signed request accepted on node A could be
replayed to node B (200+200). This wires the shared JetStream KV nonce bucket on
any clustered node, closing the cross-node replay hole.

Bootstrap: under enforce the service needs JetStream on its own embedded server,
but the data plane only accepts allowlisted clients. Resolved with an ephemeral
internal service identity the authenticator recognizes and grants full
permissions (NewNkeyAuthenticatorACLInternal), connected over the in-process
transport (no TLS/CA needed for the self-connection).

Hard rule: --cluster-name != "" means the replicated nonce bucket is mandatory;
if it cannot be created the node refuses to start (wireReplicatedNonces returns a
fatal error) rather than run insecurely. Standalone nodes keep the in-memory
cache unchanged (branch-by-abstraction: no JetStream dependency added).

Changes:
- busauth: NewNkeyAuthenticatorACLInternal + fullPermissions for the internal id.
- cmd/membershipd: connectInternalJS (in-process, privileged) / connectExternalJS;
  wireReplicatedNonces helper; main wires it when clustered; --kv-replicas flag.

Tests (regression of audit 0008 N3):
- TestAttack0008_N3: 2 clustered nodes share the bucket, cross-node replay -> 401.
- TestAttack0008_N3_StandaloneKeepsLocalCache: standalone needs no JetStream,
  same-node replay still 401.
- TestAttack0008_N3_ClusteredRequiresJetStream: clustered + no JetStream -> fatal.
- TestInternalConnPrivilegedUnderEnforce / ...OutsiderRejected: the privileged
  self-connection works under enforce and no other identity can claim it.

CGO_ENABLED=0 go build/vet/test green; govulncheck 0 reachable.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-07 17:02:19 +02:00
parent 5df99fa4c4
commit 8b6a01d280
6 changed files with 570 additions and 1 deletions
+66 -1
View File
@@ -7,6 +7,7 @@ package main
import (
"context"
"crypto/tls"
"encoding/hex"
"flag"
"log"
"net/http"
@@ -15,6 +16,10 @@ import (
"syscall"
"time"
cs "fn-registry/functions/cybersecurity"
"github.com/nats-io/nats.go"
"github.com/nats-io/nats.go/jetstream"
server "github.com/nats-io/nats-server/v2/server"
"github.com/enmanuel/unibus/pkg/blobstore"
@@ -62,6 +67,12 @@ func main() {
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
// Replicated control plane (issue 0006a/c): the JetStream replication factor
// for the shared nonce bucket (and, with --store kv, the control-plane KV).
// 1 for a 1-2 node rollout, 3 for real HA quorum (raise in place with
// `nats stream update --replicas 3` when the third node joins).
kvReplicas = flag.Int("kv-replicas", 1, "JetStream replication factor for the shared nonce/KV buckets (1..3)")
caFile = flag.String("ca", "", "bus CA cert; only used to pin TLS on the internal JetStream connection to an EXTERNAL --nats-url (the embedded server uses an in-process connection that needs no CA)")
)
flag.Parse()
@@ -85,6 +96,31 @@ func main() {
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
log.SetPrefix("[membershipd] ")
// A clustered node shares its control plane with peers, so it needs a JetStream
// client to manage the replicated nonce bucket (issue 0006a). needJS will also
// be true under --store kv (issue 0006c), where the control-plane state lives in
// JetStream KV. A standalone single-node deployment needs none of this and keeps
// the in-process, in-memory behavior unchanged.
clustered := *clusterName != ""
needJS := clustered
enforce := authMode == membership.AuthEnforce
// Internal service identity (issue 0006a): when the embedded data plane enforces
// auth, membershipd must still connect to its OWN server to manage JetStream.
// It does so with this ephemeral identity, which the authenticator is built to
// recognize and grant full permissions (it never enters the user allowlist). It
// is only generated when actually needed (JetStream required AND enforce on AND
// the server is embedded), so a standalone or non-enforce node is unchanged.
var internalID cs.Identity
var internalPubHex string
if needJS && enforce && *natsURL == "" {
internalID, err = cs.GenerateIdentity()
if err != nil {
log.Fatalf("generate internal identity: %v", err)
}
internalPubHex = hex.EncodeToString(internalID.SignPub)
}
// Control plane store first: the NATS authenticator consults IsAuthorized, so
// the store must exist before the embedded server starts.
store, err := membership.Open(*dbPath)
@@ -145,9 +181,10 @@ func main() {
// Subscribe(">") and harvest every room's subject and JetStream activity.
// NATS freezes permissions at connect time, so a peer that joins a room
// after connecting must client.RefreshSession to gain that room's subject.
cfg.Auth = busauth.NewNkeyAuthenticatorACL(
cfg.Auth = busauth.NewNkeyAuthenticatorACLInternal(
store.IsAuthorized,
busauth.PermissionsFromSubjects(membership.SubjectACLFor(store)),
internalPubHex,
)
log.Printf("NATS nkey authentication: ON (enforce, per-subject ACL)")
}
@@ -181,6 +218,34 @@ func main() {
srv.RequireEncryptedRooms = true
log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)")
}
// Replicated anti-replay (issue 0006a, audit 0008 N3): a clustered node MUST
// share its nonce store across the cluster via JetStream KV, or a request
// accepted on one node can be replayed to another. Open a privileged JetStream
// client (in-process for the embedded server, a plain client for an external
// NATS) and wire the shared nonce bucket. This is a HARD requirement: if the
// bucket cannot be created the node refuses to start rather than run with a
// per-process cache that leaves the replay hole open.
if needJS {
var (
internalNC *nats.Conn
js jetstream.JetStream
)
if *natsURL == "" {
internalNC, js, err = connectInternalJS(ns, internalID, enforce)
} else {
internalNC, js, err = connectExternalJS(natsClientURL, *caFile)
}
if err != nil {
log.Fatalf("internal JetStream connection (required by --cluster-name): %v", err)
}
defer internalNC.Close()
if err := wireReplicatedNonces(srv, js, clustered, *kvReplicas); err != nil {
log.Fatalf("%v", err)
}
log.Printf("anti-replay: replicated nonce bucket %q (replicas=%d) — cluster-safe", "KV_UNIBUS_nonces", *kvReplicas)
}
log.Printf("control-plane auth: %s", authMode)
addr := *bind + ":" + *httpPort
httpSrv := &http.Server{