Files
unibus/cmd/membershipd/config.go
T
agent c90f145a05 feat(0003a): NATS cluster routes with shared-secret auth + mutual route TLS
Add high-availability cluster support to the embedded NATS server
(issue 0003a, first phase of decentralization).

pkg/embeddednats:
- ServerConfig gains ServerName (unique per node, required by JetStream
  RAFT) and an optional *ClusterConfig (cluster name, route host/port,
  peer route URLs, shared-secret Username/Password, and a mutual-TLS
  *tls.Config). applyClusterOpts maps it onto server.Options.Cluster +
  Routes. Nil Cluster keeps the legacy standalone server.

pkg/busauth:
- RouteTLSConfig builds the route layer's mutual-TLS config: the node
  presents its CA-signed certificate AND verifies the peer's certificate
  against the bus CA (RequireAndVerifyClientCert), reusing the issue-0001
  CA. Routes authenticate NODES, never the client nkey authenticator.

cmd/membershipd:
- Cluster flags (--cluster-name/--server-name/--cluster-port/--routes/
  --cluster-user/--cluster-pass/--route-tls-cert/-key/-ca) wire a node
  into the cluster. validateClusterConfig refuses a public cluster
  without a route secret and complete mutual route TLS, and rejects
  partial route-TLS flags (all-or-nothing). splitRoutes parses the CSV.

Tests (DoD: golden + 2 edge + error path):
- TestClusterForwardsAcrossNodes: 2-node cluster forwards a client
  subject from one node to a subscriber on the other.
- TestClusterThreeNodesForward: 3-node (HA shape) cross-node forwarding.
- TestClusterMutualTLSForwards: forwarding over mutual-TLS routes.
- TestClusterRejectsBadRouteAuth: wrong cluster password -> no route.
- TestClusterRejectsUnsignedNode: cert not signed by the bus CA -> no route.
- TestClusterConfigPolicy / TestSplitRoutes: boot-guard + CSV parsing.

Master stays green: standalone (no --cluster-name) is unchanged.
2026-06-07 14:54:53 +02:00

104 lines
4.2 KiB
Go

package main
import (
"fmt"
"net"
"strings"
"github.com/enmanuel/unibus/pkg/membership"
)
// splitRoutes turns the comma-separated --routes flag value into a slice of
// route URLs. Every entry is trimmed of surrounding whitespace and entries that
// end up empty are discarded, so a trailing comma, doubled commas, or a spaced
// list ("a, b") never produces a bogus empty route. The result is nil when csv
// holds no usable entry.
func splitRoutes(csv string) []string {
	isComma := func(r rune) bool { return r == ',' }

	var routes []string
	// FieldsFunc already skips the empty pieces between adjacent commas; the
	// trim-and-check below takes care of pieces that hold only whitespace.
	for _, field := range strings.FieldsFunc(csv, isComma) {
		route := strings.TrimSpace(field)
		if route == "" {
			continue
		}
		routes = append(routes, route)
	}
	return routes
}
// isLoopbackBind reports whether the --bind value keeps the service reachable
// only from this host.
//
// Classification:
//   - "" is public: an empty bind means "all interfaces".
//   - "localhost" is loopback, matched without regard to ASCII letter case
//     because host names are case-insensitive ("Localhost" and "LOCALHOST"
//     name the same host).
//   - an IP literal is loopback exactly when net.IP.IsLoopback says so.
//   - anything else (any other host name, or text that is not an IP literal,
//     including values with stray whitespace) is treated as public — the
//     conservative choice, so an unusual bind never silently slips past the
//     guard.
func isLoopbackBind(bind string) bool {
	if bind == "" {
		return false // empty binds every interface
	}
	// The byte-length check keeps the comparison ASCII-only: EqualFold applies
	// Unicode case folding, under which look-alikes such as "localhoſt" (long
	// s) would otherwise match. Those are longer than 9 bytes, so they fail
	// here and fall through to the IP parse, which rejects them.
	if len(bind) == len("localhost") && strings.EqualFold(bind, "localhost") {
		return true
	}
	ip := net.ParseIP(bind)
	if ip == nil {
		return false // a host name we can't classify: assume public
	}
	return ip.IsLoopback()
}
// validateBootConfig is the guard against fail-open startup (audit H2). Unless
// --bus-auth is enforce, it rejects the two configurations that would expose
// the bus without enforced authentication:
//
//   - a --bind that isLoopbackBind does not classify as loopback (the data
//     plane and control plane would both accept anyone), and
//   - --tls-cert and/or --tls-key being set (TLS encrypts the channel but
//     authenticates no one — encrypted access for everybody is still open).
//
// With --bus-auth enforce every combination is accepted and nil is returned.
// The bind check runs first, so a configuration that violates both rules
// reports the bind error.
//
// It is a pure function of the parsed flags, so the command can fail fast at
// startup and tests can assert the policy without booting a server.
func validateBootConfig(bind string, mode membership.AuthMode, tlsCert, tlsKey string) error {
	// Both rules only bite when authentication is not enforced.
	if mode == membership.AuthEnforce {
		return nil
	}

	if !isLoopbackBind(bind) {
		return fmt.Errorf(
			"refusing to start: --bind %q is not loopback but --bus-auth is %q; a public bind requires --bus-auth enforce (or bind 127.0.0.1 for local dev)",
			bind, mode)
	}

	tlsRequested := tlsCert != "" || tlsKey != ""
	if tlsRequested {
		return fmt.Errorf(
			"refusing to start: --tls-cert/--tls-key set but --bus-auth is %q; TLS without enforced auth is fail-open (encrypted channel, no authentication) — set --bus-auth enforce",
			mode)
	}
	return nil
}
// validateClusterConfig guards the cluster route layer (issue 0003a). Routes
// are a server-to-server trust boundary separate from the client data plane:
// an open route port lets anyone who reaches it join the cluster or inject
// messages into the whole bus (audit 0004, "auth of the cluster routes").
//
// The checks run in this order:
//
//  1. The three route-TLS paths (rtCert, rtKey, rtCA) are all-or-nothing.
//     Setting only some of them is rejected regardless of cluster name or
//     bind, so a partial TLS config never silently degrades to plaintext
//     routes.
//  2. An empty clusterName means "no cluster" (standalone) and is accepted.
//  3. A cluster on a loopback bind (per isLoopbackBind) is accepted as a
//     dev-only setup.
//  4. A cluster on any other bind must carry both a shared route secret
//     (user and pass non-empty) and the full mutual route TLS triple.
//
// It is a pure function of the parsed flags and returns nil when the
// configuration is acceptable.
func validateClusterConfig(clusterName, bind, user, pass, rtCert, rtKey, rtCA string) error {
	// Count how many route-TLS paths were supplied: 0 means TLS is off, 3
	// means full mutual TLS, anything in between is a partial config.
	tlsPaths := []string{rtCert, rtKey, rtCA}
	supplied := 0
	for _, p := range tlsPaths {
		if p != "" {
			supplied++
		}
	}
	mutualTLS := supplied == len(tlsPaths)
	if supplied > 0 && !mutualTLS {
		return fmt.Errorf(
			"refusing to start: --route-tls-cert/--route-tls-key/--route-tls-ca must be set together (mutual route TLS needs all three)")
	}

	// Standalone has no route layer to secure; a loopback cluster is dev-only
	// and unreachable from outside.
	if clusterName == "" || isLoopbackBind(bind) {
		return nil
	}

	// Public cluster: demand a route secret first, then mutual route TLS.
	switch {
	case user == "" || pass == "":
		return fmt.Errorf(
			"refusing to start: cluster %q on public bind %q requires --cluster-user and --cluster-pass; an unauthenticated route port lets anyone join the cluster",
			clusterName, bind)
	case !mutualTLS:
		return fmt.Errorf(
			"refusing to start: cluster %q on public bind %q requires mutual route TLS (--route-tls-cert/--route-tls-key/--route-tls-ca); plaintext routes expose server-to-server traffic and admit unsigned nodes",
			clusterName, bind)
	}
	return nil
}