feat(0003a): NATS cluster routes with shared-secret auth + mutual route TLS

Add high-availability cluster support to the embedded NATS server
(issue 0003a, first phase of decentralization).

pkg/embeddednats:
- ServerConfig gains ServerName (unique per node, required by JetStream
  RAFT) and an optional *ClusterConfig (cluster name, route host/port,
  peer route URLs, shared-secret Username/Password, and a mutual-TLS
  *tls.Config). applyClusterOpts maps it onto server.Options.Cluster +
  Routes. Nil Cluster keeps the legacy standalone server.

pkg/busauth:
- RouteTLSConfig builds the route layer's mutual-TLS config: the node
  presents its CA-signed certificate AND verifies the peer's certificate
  against the bus CA (RequireAndVerifyClientCert), reusing the issue-0001
  CA. Routes authenticate NODES, never the client nkey authenticator.

cmd/membershipd:
- Cluster flags (--cluster-name/--server-name/--cluster-port/--routes/
  --cluster-user/--cluster-pass/--route-tls-cert/-key/-ca) wire a node
  into the cluster. validateClusterConfig refuses a public cluster
  without a route secret and complete mutual route TLS, and rejects
  partial route-TLS flags (all-or-nothing). splitRoutes parses the CSV.

Tests (DoD: golden + 2 edge + error path):
- TestClusterForwardsAcrossNodes: 2-node cluster forwards a client
  subject from one node to a subscriber on the other.
- TestClusterThreeNodesForward: 3-node (HA shape) cross-node forwarding.
- TestClusterMutualTLSForwards: forwarding over mutual-TLS routes.
- TestClusterRejectsBadRouteAuth: wrong cluster password -> no route.
- TestClusterRejectsUnsignedNode: cert not signed by the bus CA -> no route.
- TestClusterConfigPolicy / TestSplitRoutes: boot-guard + CSV parsing.

Master stays green: standalone (no --cluster-name) is unchanged.
This commit is contained in:
agent
2026-06-07 14:54:53 +02:00
parent 618f6b61da
commit c90f145a05
6 changed files with 623 additions and 5 deletions
+40 -3
View File
@@ -45,6 +45,16 @@ func main() {
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
)
flag.Parse()
@@ -59,6 +69,11 @@ func main() {
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
log.Fatalf("%v", err)
}
// Cluster route guard (issue 0003a): a public cluster needs a route secret
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA); err != nil {
log.Fatalf("%v", err)
}
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
log.SetPrefix("[membershipd] ")
@@ -89,9 +104,31 @@ func main() {
// Bind the embedded NATS to the same interface as the HTTP API so a
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
StoreDir: *natsStore,
Host: *bind,
Port: *natsPort,
StoreDir: *natsStore,
Host: *bind,
Port: *natsPort,
ServerName: *serverName,
}
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
if *clusterName != "" {
cc := &embeddednats.ClusterConfig{
Name: *clusterName,
Host: *bind,
Port: *clusterPort,
Routes: splitRoutes(*routesCSV),
Username: *clusterUser,
Password: *clusterPass,
}
if *routeTLSCert != "" {
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
if err != nil {
log.Fatalf("load route TLS: %v", err)
}
cc.TLS = rtls
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
}
cfg.Cluster = cc
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
}
if authMode == membership.AuthEnforce {
cfg.Auth = busauth.NewNkeyAuthenticator(store.IsAuthorized)