diff --git a/cmd/membershipd/config.go b/cmd/membershipd/config.go index 1ed6753..490e544 100644 --- a/cmd/membershipd/config.go +++ b/cmd/membershipd/config.go @@ -3,6 +3,8 @@ package main import ( "fmt" "net" + "net/url" + "os" "strings" "github.com/enmanuel/unibus/pkg/membership" @@ -21,6 +23,74 @@ func splitRoutes(csv string) []string { return out } +// resolveClusterPass resolves the cluster route secret WITHOUT leaking it through +// argv (audit 0008 N1-low: --cluster-pass in argv is visible in ps/journald). +// Precedence: --cluster-pass-file (read + trim the file), then the env var +// UNIBUS_CLUSTER_PASS, then the legacy --cluster-pass flag (argv-visible, kept for +// dev/compat). env is injected (os.Getenv result) so the function stays testable. +// It returns the secret and a short source label for logging (never the secret). +func resolveClusterPass(passFlag, passFile, env string) (secret, source string, err error) { + if passFile != "" { + b, rerr := os.ReadFile(passFile) + if rerr != nil { + return "", "", fmt.Errorf("read --cluster-pass-file %q: %w", passFile, rerr) + } + return strings.TrimSpace(string(b)), "file", nil + } + if env != "" { + return env, "env", nil + } + if passFlag != "" { + return passFlag, "flag", nil + } + return "", "none", nil +} + +// injectRouteCreds rewrites each route URL that carries NO userinfo to embed +// user:pass, so the cluster secret is supplied once (via file/env) instead of +// repeated in every --routes argv entry where ps/journald would expose it. A route +// that already carries userinfo is left untouched (operator override). With an +// empty user it is a no-op. A malformed route URL is an error (configuration bug) +// rather than a silently dropped peer. +func injectRouteCreds(routes []string, user, pass string) ([]string, error) { + if user == "" { + return routes, nil + } + out := make([]string, 0, len(routes)) + for _, r := range routes { + u, err := url.Parse(r) + if err != nil { + return nil, fmt.Errorf("parse route %q: %w", r, err) + } + if u.User == nil { + u.User = url.UserPassword(user, pass) + } + out = append(out, u.String()) + } + return out, nil +} + +// isLoopbackURL reports whether a NATS url targets this host only (loopback). Used +// to guard migrate-to-kv (audit 0008 N6): pushing the allowlist to a REMOTE NATS +// without TLS would send handles/roles/sign-pubs in cleartext, so a remote target +// must be TLS-pinned (--ca). A url we cannot classify is treated as NON-loopback +// (conservative: it then requires --ca). +func isLoopbackURL(natsURL string) bool { + u, err := url.Parse(natsURL) + if err != nil { + return false + } + host := u.Hostname() + switch host { + case "localhost": + return true + case "": + return false + } + ip := net.ParseIP(host) + return ip != nil && ip.IsLoopback() +} + // isLoopbackBind reports whether the --bind value keeps the service reachable // only from this host. An empty bind means "all interfaces" (public), and a // hostname we cannot resolve to a loopback literal is treated as public — the diff --git a/cmd/membershipd/lows_test.go b/cmd/membershipd/lows_test.go new file mode 100644 index 0000000..390025e --- /dev/null +++ b/cmd/membershipd/lows_test.go @@ -0,0 +1,75 @@ +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +// TestResolveClusterPass verifies the secret resolution precedence +// (file > env > flag) that keeps the cluster password out of argv (issue 0006f). +func TestResolveClusterPass(t *testing.T) { + // file wins over env and flag, and is trimmed. + f := filepath.Join(t.TempDir(), "pass") + if err := os.WriteFile(f, []byte("filesecret\n"), 0o600); err != nil { + t.Fatalf("write: %v", err) + } + if got, src, err := resolveClusterPass("flagsecret", f, "envsecret"); err != nil || got != "filesecret" || src != "file" { + t.Fatalf("file precedence: got %q src %q err %v", got, src, err) + } + // env wins over flag when no file. + if got, src, err := resolveClusterPass("flagsecret", "", "envsecret"); err != nil || got != "envsecret" || src != "env" { + t.Fatalf("env precedence: got %q src %q err %v", got, src, err) + } + // flag is the last resort. + if got, src, err := resolveClusterPass("flagsecret", "", ""); err != nil || got != "flagsecret" || src != "flag" { + t.Fatalf("flag fallback: got %q src %q err %v", got, src, err) + } + // none set. + if got, src, err := resolveClusterPass("", "", ""); err != nil || got != "" || src != "none" { + t.Fatalf("none: got %q src %q err %v", got, src, err) + } + // missing file is an error. + if _, _, err := resolveClusterPass("", filepath.Join(t.TempDir(), "nope"), ""); err == nil { + t.Fatalf("missing file must error") + } +} + +// TestInjectRouteCreds verifies the secret is injected only into routes that omit +// userinfo, so --routes argv need not carry the password (issue 0006f). +func TestInjectRouteCreds(t *testing.T) { + in := []string{"nats://10.0.0.2:6250", "nats://override:pw@10.0.0.3:6250"} + out, err := injectRouteCreds(in, "user", "secret") + if err != nil { + t.Fatalf("inject: %v", err) + } + if !strings.Contains(out[0], "user:secret@10.0.0.2:6250") { + t.Fatalf("creds not injected into bare route: %q", out[0]) + } + if !strings.Contains(out[1], "override:pw@10.0.0.3:6250") { + t.Fatalf("existing userinfo must be preserved: %q", out[1]) + } + // empty user is a no-op. + noop, err := injectRouteCreds(in, "", "") + if err != nil || noop[0] != in[0] { + t.Fatalf("empty user must be a no-op: %v %q", err, noop[0]) + } +} + +// TestIsLoopbackURL guards migrate-to-kv against pushing the allowlist cleartext +// to a remote NATS (issue 0006f, audit 0008 N6). +func TestIsLoopbackURL(t *testing.T) { + loop := []string{"nats://127.0.0.1:4250", "nats://localhost:4250", "nats://[::1]:4250"} + for _, u := range loop { + if !isLoopbackURL(u) { + t.Fatalf("%q should be loopback", u) + } + } + remote := []string{"nats://10.0.0.2:4250", "nats://bus.example.com:4250", "::not-a-url"} + for _, u := range remote { + if isLoopbackURL(u) { + t.Fatalf("%q should NOT be loopback", u) + } + } +} diff --git a/cmd/membershipd/main.go b/cmd/membershipd/main.go index ae8b41f..a789c01 100644 --- a/cmd/membershipd/main.go +++ b/cmd/membershipd/main.go @@ -63,7 +63,12 @@ func main() { clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic") routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250") clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)") - clusterPass = flag.String("cluster-pass", "", "shared route secret password") + clusterPass = flag.String("cluster-pass", "", "shared route secret password (argv-visible — prefer --cluster-pass-file or UNIBUS_CLUSTER_PASS)") + // Secret out of argv (issue 0006f, audit 0008 N1-low): a password in + // --cluster-pass / --routes is visible in ps/journald. Prefer a file or the + // UNIBUS_CLUSTER_PASS env var; routes may then omit userinfo and the secret + // is injected from here. + clusterPassFile = flag.String("cluster-pass-file", "", "path to a file holding the cluster route password (preferred over --cluster-pass; keeps the secret out of argv)") routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca") routeTLSKey = flag.String("route-tls-key", "", "this node's route private key") routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)") @@ -89,6 +94,14 @@ func main() { log.Fatalf("--store must be \"sqlite\" or \"kv\", got %q", *storeBackend) } + // Resolve the cluster route secret out of argv (file/env preferred). The + // resolved value (not *clusterPass) is what guards the route layer and is + // injected into peer route URLs below. + clusterPassResolved, passSource, err := resolveClusterPass(*clusterPass, *clusterPassFile, os.Getenv("UNIBUS_CLUSTER_PASS")) + if err != nil { + log.Fatalf("%v", err) + } + // Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands // --bus-auth enforce. This makes an insecure public startup impossible rather // than silently exposing the bus with the appearance of security. @@ -97,7 +110,7 @@ func main() { } // Cluster route guard (issue 0003a): a public cluster needs a route secret // and mutual route TLS, and the route-TLS flags are all-or-nothing. - if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil { + if err := validateClusterConfig(*clusterName, *bind, *clusterUser, clusterPassResolved, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil { log.Fatalf("%v", err) } @@ -178,14 +191,21 @@ func main() { } // Cluster (issue 0003a): with a cluster name, join the route layer for HA. if *clusterName != "" { + // Inject the resolved secret into peer route URLs that omit userinfo, so + // the password need not appear in --routes argv (issue 0006f). + routes, rerr := injectRouteCreds(splitRoutes(*routesCSV), *clusterUser, clusterPassResolved) + if rerr != nil { + log.Fatalf("%v", rerr) + } cc := &embeddednats.ClusterConfig{ Name: *clusterName, Host: *bind, Port: *clusterPort, - Routes: splitRoutes(*routesCSV), + Routes: routes, Username: *clusterUser, - Password: *clusterPass, + Password: clusterPassResolved, } + log.Printf("cluster route secret source: %s", passSource) if *routeTLSCert != "" { rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA) if err != nil { diff --git a/cmd/membershipd/migrate_cli.go b/cmd/membershipd/migrate_cli.go index 997b0ad..faabdca 100644 --- a/cmd/membershipd/migrate_cli.go +++ b/cmd/membershipd/migrate_cli.go @@ -33,6 +33,14 @@ func runMigrateCLI(args []string) { fmt.Fprintln(os.Stderr, "membershipd migrate-to-kv: --nats-url is required (the cluster to write the KV buckets into)") os.Exit(2) } + // Confidentiality guard (issue 0006f, audit 0008 N6): the migration writes the + // allowlist (handles, roles, signing pubkeys) into the KV. Against a REMOTE NATS + // without TLS that metadata would travel in cleartext, so a remote target MUST + // be TLS-pinned with --ca. A loopback target is local-only and exempt. + if !isLoopbackURL(*natsURL) && *ca == "" { + fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: refusing to migrate to remote %q without --ca; the allowlist (handles/roles/sign pubs) would travel in cleartext — pin TLS with --ca, or run against a loopback nats-url\n", *natsURL) + os.Exit(2) + } // Back up the SQLite database first so a botched migration can be undone. var backupPath string diff --git a/deploy/README.md b/deploy/README.md index a94ad60..4d20536 100644 --- a/deploy/README.md +++ b/deploy/README.md @@ -65,3 +65,34 @@ curl -fsS http://:8470/healthz - To run against an external NATS instead of the embedded one, append `--nats-url nats://:4222` to `ExecStart` and re-run `daemon-reload` + `restart`. + +## Clustering (HA) — see `deploy/cluster/` + +The single-node service above is secure on its own. Running unibus as a +multi-node **cluster** has extra hardening rules (issues 0006a–0006f); the full +runbook and the generated material live in `deploy/cluster/`. Key points an +operator must know: + +- **Homogeneous posture (0006d).** Every node MUST run `--bus-auth enforce` (the + binary refuses to join a cluster otherwise) and present mutual route TLS on a + public bind. `/healthz` publishes each node's `posture` so a monitor can flag a + node that is not `enforce`+`acl`+`tls`. +- **Separate route CA (0006f).** The cluster route layer authenticates *nodes*, + not bus users — sign the route certs with a **dedicated cluster CA** + (`--route-tls-ca`), NOT the client data-plane CA (`--tls-cert`'s CA). Keeping + the two trust roots separate means a client cert can never be presented to the + route port. `deploy/cluster/generate-cluster-certs.sh` builds this CA. +- **Secret out of argv (0006f).** Pass the route password via + `--cluster-pass-file` or the `UNIBUS_CLUSTER_PASS` env var, NOT `--cluster-pass` + or a `nats://user:pass@host` in `--routes` (both are visible in `ps`/journald). + When the secret comes from a file/env, list peers as bare `--routes + nats://:6250` and the binary injects the credentials. +- **`migrate-to-kv` confidentiality (0006f).** The migration writes the allowlist + (handles/roles/sign pubs) into KV. Run it only against a **loopback** nats-url, + or pin TLS with `--ca` for a remote target — otherwise that metadata travels in + cleartext. The binary refuses a remote target without `--ca`. +- **R1 is NOT HA (0006a/N3-DoS).** With `--kv-replicas 1` the control plane + (including the nonce bucket) is a single point of failure: if the node owning + the stream dies, every authenticated request fails closed (auth DoS). Real HA + needs **R3** (quorum 2/3): raise replicas in place with `nats stream update + --replicas 3` once the third node has joined. Do not advertise R1 as HA.