feat(0003a): NATS cluster routes with shared-secret auth + mutual route TLS
Add high-availability cluster support to the embedded NATS server (issue 0003a, first phase of decentralization). pkg/embeddednats: - ServerConfig gains ServerName (unique per node, required by JetStream RAFT) and an optional *ClusterConfig (cluster name, route host/port, peer route URLs, shared-secret Username/Password, and a mutual-TLS *tls.Config). applyClusterOpts maps it onto server.Options.Cluster + Routes. Nil Cluster keeps the legacy standalone server. pkg/busauth: - RouteTLSConfig builds the route layer's mutual-TLS config: the node presents its CA-signed certificate AND verifies the peer's certificate against the bus CA (RequireAndVerifyClientCert), reusing the issue-0001 CA. Routes authenticate NODES, never the client nkey authenticator. cmd/membershipd: - Cluster flags (--cluster-name/--server-name/--cluster-port/--routes/ --cluster-user/--cluster-pass/--route-tls-cert/-key/-ca) wire a node into the cluster. validateClusterConfig refuses a public cluster without a route secret and complete mutual route TLS, and rejects partial route-TLS flags (all-or-nothing). splitRoutes parses the CSV. Tests (DoD: golden + 2 edge + error path): - TestClusterForwardsAcrossNodes: 2-node cluster forwards a client subject from one node to a subscriber on the other. - TestClusterThreeNodesForward: 3-node (HA shape) cross-node forwarding. - TestClusterMutualTLSForwards: forwarding over mutual-TLS routes. - TestClusterRejectsBadRouteAuth: wrong cluster password -> no route. - TestClusterRejectsUnsignedNode: cert not signed by the bus CA -> no route. - TestClusterConfigPolicy / TestSplitRoutes: boot-guard + CSV parsing. Master stays green: standalone (no --cluster-name) is unchanged.
This commit is contained in:
@@ -3,10 +3,24 @@ package main
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// splitRoutes turns the raw --routes flag value, a comma-separated list, into
// the route URLs it names. Every entry is stripped of surrounding whitespace and
// blank entries are skipped, so a doubled or trailing comma, or a list written
// with spaces after the commas, never produces an empty route. When no usable
// entry remains the result is a nil slice.
func splitRoutes(csv string) []string {
	isComma := func(r rune) bool { return r == ',' }

	var routes []string
	// FieldsFunc already discards the zero-length pieces between adjacent
	// commas; whitespace-only pieces survive it and are dropped after trimming.
	for _, field := range strings.FieldsFunc(csv, isComma) {
		route := strings.TrimSpace(field)
		if route == "" {
			continue
		}
		routes = append(routes, route)
	}
	return routes
}
|
||||
|
||||
// isLoopbackBind reports whether the --bind value keeps the service reachable
|
||||
// only from this host. An empty bind means "all interfaces" (public), and a
|
||||
// hostname we cannot resolve to a loopback literal is treated as public — the
|
||||
@@ -48,3 +62,42 @@ func validateBootConfig(bind string, mode membership.AuthMode, tlsCert, tlsKey s
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// validateClusterConfig guards the cluster route layer (issue 0003a). The route
|
||||
// layer is a server-to-server trust boundary distinct from the client data
|
||||
// plane: leaving it open lets anyone who reaches the route port join the cluster
|
||||
// or inject messages into the whole bus (audit 0004, "auth of the cluster
|
||||
// routes"). So on a public (non-loopback) bind, a cluster MUST carry both a
|
||||
// shared route secret AND mutual route TLS. It is a pure function of the parsed
|
||||
// flags. An empty clusterName means "no cluster" (standalone) and is always
|
||||
// allowed.
|
||||
//
|
||||
// The three route-TLS paths are all-or-nothing (mutual TLS needs the node cert,
|
||||
// its key, and the CA together), independent of the bind, so a partial TLS
|
||||
// config never silently degrades to plaintext routes.
|
||||
func validateClusterConfig(clusterName, bind, user, pass, rtCert, rtKey, rtCA string) error {
|
||||
rtAny := rtCert != "" || rtKey != "" || rtCA != ""
|
||||
rtAll := rtCert != "" && rtKey != "" && rtCA != ""
|
||||
if rtAny && !rtAll {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: --route-tls-cert/--route-tls-key/--route-tls-ca must be set together (mutual route TLS needs all three)")
|
||||
}
|
||||
if clusterName == "" {
|
||||
return nil // standalone: no route layer to secure
|
||||
}
|
||||
if isLoopbackBind(bind) {
|
||||
return nil // loopback cluster is dev-only and unreachable from outside
|
||||
}
|
||||
// Public cluster: demand a route secret and mutual route TLS.
|
||||
if user == "" || pass == "" {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: cluster %q on public bind %q requires --cluster-user and --cluster-pass; an unauthenticated route port lets anyone join the cluster",
|
||||
clusterName, bind)
|
||||
}
|
||||
if !rtAll {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: cluster %q on public bind %q requires mutual route TLS (--route-tls-cert/--route-tls-key/--route-tls-ca); plaintext routes expose server-to-server traffic and admit unsigned nodes",
|
||||
clusterName, bind)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -70,3 +70,63 @@ func TestBootConfigPolicy(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestClusterConfigPolicy is the cluster route guard (issue 0003a): a standalone
|
||||
// server is always fine; a loopback cluster is dev-only and unguarded; a public
|
||||
// cluster demands both a route secret and complete mutual route TLS; and the
|
||||
// route-TLS flags are all-or-nothing regardless of bind.
|
||||
func TestClusterConfigPolicy(t *testing.T) {
|
||||
const c, k, ca = "node.crt", "node.key", "ca.crt"
|
||||
cases := []struct {
|
||||
name string
|
||||
clusterName, bind string
|
||||
user, pass string
|
||||
rtCert, rtKey, rtCA string
|
||||
wantErr bool
|
||||
}{
|
||||
// Standalone (no cluster name) is always allowed, even on a public bind.
|
||||
{"standalone-public", "", "0.0.0.0", "", "", "", "", "", false},
|
||||
// Loopback dev cluster: unguarded (unreachable from outside).
|
||||
{"loopback-cluster-bare", "unibus", "127.0.0.1", "", "", "", "", "", false},
|
||||
// Golden: full public HA config.
|
||||
{"public-full", "unibus", "0.0.0.0", "u", "p", c, k, ca, false},
|
||||
// Error: public cluster without a route secret.
|
||||
{"public-no-secret", "unibus", "0.0.0.0", "", "", c, k, ca, true},
|
||||
{"public-half-secret", "unibus", "0.0.0.0", "u", "", c, k, ca, true},
|
||||
// Error: public cluster without mutual route TLS.
|
||||
{"public-no-tls", "unibus", "10.0.0.1", "u", "p", "", "", "", true},
|
||||
// Error: partial route-TLS flags trip regardless of bind.
|
||||
{"loopback-partial-tls", "unibus", "127.0.0.1", "", "", c, "", "", true},
|
||||
{"standalone-partial-tls", "", "127.0.0.1", "", "", c, k, "", true},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
err := validateClusterConfig(tc.clusterName, tc.bind, tc.user, tc.pass, tc.rtCert, tc.rtKey, tc.rtCA)
|
||||
if tc.wantErr && err == nil {
|
||||
t.Fatalf("cluster config %+v should be refused", tc)
|
||||
}
|
||||
if !tc.wantErr && err != nil {
|
||||
t.Fatalf("cluster config %+v should be allowed, got: %v", tc, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitRoutes(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want int
|
||||
}{
|
||||
{"", 0},
|
||||
{"nats://a:1", 1},
|
||||
{"nats://a:1,nats://b:2", 2},
|
||||
{" nats://a:1 , nats://b:2 ", 2}, // spaces trimmed
|
||||
{"nats://a:1,,", 1}, // empty entries dropped
|
||||
{",", 0},
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := splitRoutes(c.in); len(got) != c.want {
|
||||
t.Fatalf("splitRoutes(%q) = %v (len %d), want len %d", c.in, got, len(got), c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+40
-3
@@ -45,6 +45,16 @@ func main() {
|
||||
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
|
||||
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
|
||||
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
|
||||
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
|
||||
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
|
||||
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
|
||||
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
|
||||
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
|
||||
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
|
||||
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
|
||||
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
|
||||
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
|
||||
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
|
||||
)
|
||||
flag.Parse()
|
||||
|
||||
@@ -59,6 +69,11 @@ func main() {
|
||||
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
// Cluster route guard (issue 0003a): a public cluster needs a route secret
|
||||
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
|
||||
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
|
||||
log.SetPrefix("[membershipd] ")
|
||||
@@ -89,9 +104,31 @@ func main() {
|
||||
// Bind the embedded NATS to the same interface as the HTTP API so a
|
||||
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
|
||||
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
|
||||
StoreDir: *natsStore,
|
||||
Host: *bind,
|
||||
Port: *natsPort,
|
||||
StoreDir: *natsStore,
|
||||
Host: *bind,
|
||||
Port: *natsPort,
|
||||
ServerName: *serverName,
|
||||
}
|
||||
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
|
||||
if *clusterName != "" {
|
||||
cc := &embeddednats.ClusterConfig{
|
||||
Name: *clusterName,
|
||||
Host: *bind,
|
||||
Port: *clusterPort,
|
||||
Routes: splitRoutes(*routesCSV),
|
||||
Username: *clusterUser,
|
||||
Password: *clusterPass,
|
||||
}
|
||||
if *routeTLSCert != "" {
|
||||
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
|
||||
if err != nil {
|
||||
log.Fatalf("load route TLS: %v", err)
|
||||
}
|
||||
cc.TLS = rtls
|
||||
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
|
||||
}
|
||||
cfg.Cluster = cc
|
||||
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
|
||||
}
|
||||
if authMode == membership.AuthEnforce {
|
||||
cfg.Auth = busauth.NewNkeyAuthenticator(store.IsAuthorized)
|
||||
|
||||
Reference in New Issue
Block a user