Files
unibus/cmd/membershipd/config.go
T
egutierrez b8201a82cd fix(0006f): cluster secret out of argv, migrate-to-kv TLS guard, R1/CA docs (audit 0008 lows)
Low-severity cluster hardening from audit 0008:

- Route secret out of argv (N1-low): --cluster-pass and a nats://user:pass@host in
  --routes are visible in ps/journald. New --cluster-pass-file and the
  UNIBUS_CLUSTER_PASS env var (precedence file > env > flag); the resolved secret
  guards the route layer and is injected into bare --routes entries
  (injectRouteCreds), so peers can be listed as nats://host:6250 with no secret in
  argv. The legacy --cluster-pass stays for dev/compat.
- migrate-to-kv confidentiality (N6): refuse a remote --nats-url without --ca (the
  allowlist would travel cleartext); loopback targets are exempt (isLoopbackURL).
- Docs (N1 route CA, N3 DoS): deploy/README gains a Clustering section — use a
  SEPARATE cluster CA for routes (not the client CA), keep the secret out of argv,
  run migrate-to-kv loopback/TLS only, and R1 is a SPOF of auth (not HA); R3
  quorum is real HA. The generated cert material lives in deploy/cluster/ (0006g).

Tests:
- TestResolveClusterPass (file > env > flag precedence; missing file errors),
- TestInjectRouteCreds (injects only into userinfo-less routes; preserves overrides),
- TestIsLoopbackURL (loopback vs remote vs malformed).

CGO_ENABLED=0 go build/vet/test green; govulncheck 0 reachable.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 17:24:46 +02:00

199 lines
8.4 KiB
Go

package main
import (
"fmt"
"net"
"net/url"
"os"
"strings"
"github.com/enmanuel/unibus/pkg/membership"
)
// splitRoutes turns the raw --routes flag value (a comma-separated list) into
// the individual route URLs. Every entry is trimmed of surrounding whitespace
// and blank entries are discarded, so inputs such as "a, b," or "a,,b" never
// produce an empty route. The result is nil when no usable entry remains.
func splitRoutes(csv string) []string {
	var routes []string
	for _, field := range strings.Split(csv, ",") {
		trimmed := strings.TrimSpace(field)
		if trimmed == "" {
			continue // trailing comma or whitespace-only entry
		}
		routes = append(routes, trimmed)
	}
	return routes
}
// resolveClusterPass picks the cluster route secret from the least argv-visible
// source available (audit 0008 N1-low: --cluster-pass in argv shows up in
// ps/journald).
//
// Precedence, first match wins:
//
//  1. passFile (--cluster-pass-file): the file is read and surrounding
//     whitespace (including the trailing newline) is trimmed,
//  2. env (the UNIBUS_CLUSTER_PASS value, passed in by the caller so this
//     function does not touch the process environment and stays testable),
//  3. passFlag (legacy --cluster-pass, argv-visible, kept for dev/compat).
//
// It returns the secret plus a short source label ("file", "env", "flag" or
// "none") that is safe to log; the secret itself must never be logged.
//
// An error is returned when passFile is set but cannot be read, or when it
// holds nothing but whitespace: an explicitly configured secret file that
// yields no secret is a configuration bug, and silently continuing with an
// empty secret labelled "file" would hide it. No fallback to env/flag happens
// in either case, so the operator's explicit choice is never overridden.
func resolveClusterPass(passFlag, passFile, env string) (secret, source string, err error) {
	if passFile != "" {
		b, rerr := os.ReadFile(passFile)
		if rerr != nil {
			return "", "", fmt.Errorf("read --cluster-pass-file %q: %w", passFile, rerr)
		}
		secret = strings.TrimSpace(string(b))
		if secret == "" {
			return "", "", fmt.Errorf("--cluster-pass-file %q is empty", passFile)
		}
		return secret, "file", nil
	}
	switch {
	case env != "":
		return env, "env", nil
	case passFlag != "":
		return passFlag, "flag", nil
	}
	return "", "none", nil
}
// injectRouteCreds embeds user:pass into every route URL that carries no
// userinfo, so the cluster secret is supplied once (via file/env) instead of
// being repeated in each --routes argv entry where ps/journald would expose it.
//
//   - A route that already has userinfo is kept as the operator wrote it
//     (explicit override).
//   - With an empty user the call is a no-op and routes is returned unchanged
//     and unvalidated.
//   - A route that fails to parse, or that parses without a host, is an error
//     (configuration bug) rather than a silently dropped or unauthenticated
//     peer.
//
// The host check matters because url.Parse accepts a scheme-less "peer:6250"
// as scheme "peer" with opaque data "6250", and URL.String omits userinfo for
// opaque URLs — the credentials would be dropped without any error.
//
// The returned slice is newly allocated; routes itself is not modified.
func injectRouteCreds(routes []string, user, pass string) ([]string, error) {
	if user == "" {
		return routes, nil
	}
	out := make([]string, 0, len(routes))
	for _, r := range routes {
		u, err := url.Parse(r)
		if err != nil {
			return nil, fmt.Errorf("parse route %q: %w", r, err)
		}
		if u.Host == "" {
			return nil, fmt.Errorf("parse route %q: missing host (expected scheme://host:port)", r)
		}
		if u.User == nil {
			u.User = url.UserPassword(user, pass)
		}
		out = append(out, u.String())
	}
	return out, nil
}
// isLoopbackURL tells whether natsURL points at this machine only. It backs the
// migrate-to-kv guard (audit 0008 N6): shipping the allowlist to a REMOTE NATS
// without TLS would put handles/roles/sign-pubs on the wire in cleartext, so a
// remote target has to be TLS-pinned with --ca.
//
// Only the literal host "localhost" and loopback IP literals count as loopback.
// Anything that cannot be classified — a parse failure, a missing host, or any
// other hostname — is reported as NON-loopback, the conservative answer, since
// it then requires --ca.
func isLoopbackURL(natsURL string) bool {
	parsed, err := url.Parse(natsURL)
	if err != nil {
		return false
	}
	host := parsed.Hostname()
	if host == "" {
		return false
	}
	if host == "localhost" {
		return true
	}
	if ip := net.ParseIP(host); ip != nil {
		return ip.IsLoopback()
	}
	return false
}
// isLoopbackBind tells whether a --bind value keeps the service reachable from
// this host only. The literal "localhost" and loopback IP literals qualify.
// An empty bind means "every interface" and is therefore public, and any value
// that is not an IP literal (an arbitrary hostname, a host:port pair, ...) is
// also treated as public — the conservative choice, so an unusual bind never
// slips past the guard unnoticed.
func isLoopbackBind(bind string) bool {
	if bind == "" {
		return false // empty binds every interface
	}
	if bind == "localhost" {
		return true
	}
	ip := net.ParseIP(bind)
	// A nil ip is a value we cannot classify: assume public.
	return ip != nil && ip.IsLoopback()
}
// validateBootConfig is the startup guard against fail-open configurations
// (audit H2): it rejects any flag combination that would expose the bus without
// enforced authentication. The checks run in this order and the first failure
// is returned:
//
//  1. a non-loopback --bind while --bus-auth is not enforce (data plane and
//     control plane would both accept anyone);
//  2. --tls-cert and/or --tls-key given while --bus-auth is not enforce (TLS
//     encrypts the channel but authenticates nobody — encrypted access for
//     everyone is still open access);
//  3. a non-loopback --bind without BOTH --tls-cert and --tls-key (the control
//     plane would serve its metadata over public plaintext HTTP — audit H5 /
//     the N4 gap: TLS was available but not mandatory).
//
// It depends only on the parsed flag values, so the command can fail fast at
// startup and tests can assert the policy without booting a server. A nil
// return means the configuration is acceptable.
func validateBootConfig(bind string, mode membership.AuthMode, tlsCert, tlsKey string) error {
	publicBind := !isLoopbackBind(bind)
	enforced := mode == membership.AuthEnforce
	tlsRequested := tlsCert != "" || tlsKey != ""
	tlsComplete := tlsCert != "" && tlsKey != ""

	switch {
	case publicBind && !enforced:
		return fmt.Errorf(
			"refusing to start: --bind %q is not loopback but --bus-auth is %q; a public bind requires --bus-auth enforce (or bind 127.0.0.1 for local dev)",
			bind, mode)
	case tlsRequested && !enforced:
		return fmt.Errorf(
			"refusing to start: --tls-cert/--tls-key set but --bus-auth is %q; TLS without enforced auth is fail-open (encrypted channel, no authentication) — set --bus-auth enforce",
			mode)
	case publicBind && !tlsComplete:
		return fmt.Errorf(
			"refusing to start: --bind %q is not loopback but --tls-cert/--tls-key are not both set; a public control plane must serve HTTPS or its metadata (subjects, pubkeys, sealed keys, the social graph) travels in cleartext to a network MITM (audit H5/N4) — provide a CA-signed --tls-cert/--tls-key, or bind 127.0.0.1 for local dev",
			bind)
	}
	return nil
}
// validateClusterConfig guards the cluster route layer (issue 0003a). Routes are
// a server-to-server trust boundary separate from the client data plane: an
// open route port lets anyone who can reach it join the cluster or inject
// messages into the whole bus (audit 0004, "auth of the cluster routes"). It
// depends only on the parsed flag values; nil means the configuration is
// acceptable.
//
// Checks, in order (the first failure is returned):
//
//  1. Route TLS is all-or-nothing. --route-tls-cert, --route-tls-key and
//     --route-tls-ca must be given together, whatever the bind and even when
//     standalone, so a partial TLS config never quietly degrades to plaintext
//     routes.
//  2. An empty clusterName means standalone: there is no route layer to secure
//     and the configuration is accepted.
//  3. Homogeneous posture (issue 0006d, audit 0008 N1). A cluster is only as
//     secure as its weakest node: the data plane forwards every subject between
//     nodes, so one node without enforced auth lets an unauthenticated peer
//     Subscribe(">") on it and harvest traffic forwarded from the ACL'd nodes.
//     A clustered node must therefore run --bus-auth enforce regardless of bind;
//     there is no safe "dev cluster without auth". (A peer running a tampered
//     binary is outside this node's control; /healthz exposes each node's
//     posture so a monitor can spot one that is not enforce+ACL — see
//     Server.Posture.)
//  4. A loopback bind is accepted at this point: such a cluster is dev-only and
//     unreachable from outside.
//  5. On a public bind the cluster must carry both a shared route secret (user
//     and pass non-empty) and the full mutual route TLS triple.
func validateClusterConfig(clusterName, bind, user, pass, rtCert, rtKey, rtCA string, mode membership.AuthMode) error {
	haveCert, haveKey, haveCA := rtCert != "", rtKey != "", rtCA != ""
	mutualTLS := haveCert && haveKey && haveCA
	if (haveCert || haveKey || haveCA) && !mutualTLS {
		return fmt.Errorf(
			"refusing to start: --route-tls-cert/--route-tls-key/--route-tls-ca must be set together (mutual route TLS needs all three)")
	}

	switch {
	case clusterName == "":
		// Standalone node: no route layer to secure.
		return nil
	case mode != membership.AuthEnforce:
		// Evaluated before the loopback shortcut so that even a loopback
		// cluster cannot form without enforced auth.
		return fmt.Errorf(
			"refusing to start: cluster %q requires --bus-auth enforce; a cluster node without enforced auth+ACL lets an unauthenticated peer harvest the traffic forwarded from the other nodes (audit 0008 N1) — every node must run the same enforce+ACL+TLS posture",
			clusterName)
	case isLoopbackBind(bind):
		// Loopback cluster: dev-only and unreachable from outside.
		return nil
	case user == "" || pass == "":
		// Public cluster without a route secret.
		return fmt.Errorf(
			"refusing to start: cluster %q on public bind %q requires --cluster-user and --cluster-pass; an unauthenticated route port lets anyone join the cluster",
			clusterName, bind)
	case !mutualTLS:
		// Public cluster without mutual route TLS.
		return fmt.Errorf(
			"refusing to start: cluster %q on public bind %q requires mutual route TLS (--route-tls-cert/--route-tls-key/--route-tls-ca); plaintext routes expose server-to-server traffic and admit unsigned nodes",
			clusterName, bind)
	}
	return nil
}