Merge issue/0006f-lows: cluster secret out of argv + migrate guard + docs (audit 0008 lows)

This commit is contained in:
2026-06-07 17:24:46 +02:00
5 changed files with 208 additions and 4 deletions
+70
View File
@@ -3,6 +3,8 @@ package main
import (
"fmt"
"net"
"net/url"
"os"
"strings"
"github.com/enmanuel/unibus/pkg/membership"
@@ -21,6 +23,74 @@ func splitRoutes(csv string) []string {
return out
}
// resolveClusterPass resolves the cluster route secret WITHOUT leaking it through
// argv (audit 0008 N1-low: --cluster-pass in argv is visible in ps/journald).
// Precedence: --cluster-pass-file (read + trim the file), then the env var
// UNIBUS_CLUSTER_PASS, then the legacy --cluster-pass flag (argv-visible, kept for
// dev/compat). env is injected (os.Getenv result) so the function stays testable.
// It returns the secret and a short source label for logging (never the secret).
func resolveClusterPass(passFlag, passFile, env string) (secret, source string, err error) {
if passFile != "" {
b, rerr := os.ReadFile(passFile)
if rerr != nil {
return "", "", fmt.Errorf("read --cluster-pass-file %q: %w", passFile, rerr)
}
return strings.TrimSpace(string(b)), "file", nil
}
if env != "" {
return env, "env", nil
}
if passFlag != "" {
return passFlag, "flag", nil
}
return "", "none", nil
}
// injectRouteCreds rewrites each route URL that carries NO userinfo to embed
// user:pass, so the cluster secret is supplied once (via file/env) instead of
// repeated in every --routes argv entry where ps/journald would expose it. A route
// that already carries userinfo is left untouched (operator override). With an
// empty user it is a no-op. A malformed route URL is an error (configuration bug)
// rather than a silently dropped peer.
func injectRouteCreds(routes []string, user, pass string) ([]string, error) {
if user == "" {
return routes, nil
}
out := make([]string, 0, len(routes))
for _, r := range routes {
u, err := url.Parse(r)
if err != nil {
return nil, fmt.Errorf("parse route %q: %w", r, err)
}
if u.User == nil {
u.User = url.UserPassword(user, pass)
}
out = append(out, u.String())
}
return out, nil
}
// isLoopbackURL reports whether a NATS url targets this host only (loopback). Used
// to guard migrate-to-kv (audit 0008 N6): pushing the allowlist to a REMOTE NATS
// without TLS would send handles/roles/sign-pubs in cleartext, so a remote target
// must be TLS-pinned (--ca). A url we cannot classify is treated as NON-loopback
// (conservative: it then requires --ca).
func isLoopbackURL(natsURL string) bool {
u, err := url.Parse(natsURL)
if err != nil {
return false
}
host := u.Hostname()
switch host {
case "localhost":
return true
case "":
return false
}
ip := net.ParseIP(host)
return ip != nil && ip.IsLoopback()
}
// isLoopbackBind reports whether the --bind value keeps the service reachable
// only from this host. An empty bind means "all interfaces" (public), and a
// hostname we cannot resolve to a loopback literal is treated as public — the
+75
View File
@@ -0,0 +1,75 @@
package main
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestResolveClusterPass verifies the secret resolution precedence
// (file > env > flag) that keeps the cluster password out of argv (issue 0006f).
func TestResolveClusterPass(t *testing.T) {
// file wins over env and flag, and is trimmed.
f := filepath.Join(t.TempDir(), "pass")
if err := os.WriteFile(f, []byte("filesecret\n"), 0o600); err != nil {
t.Fatalf("write: %v", err)
}
if got, src, err := resolveClusterPass("flagsecret", f, "envsecret"); err != nil || got != "filesecret" || src != "file" {
t.Fatalf("file precedence: got %q src %q err %v", got, src, err)
}
// env wins over flag when no file.
if got, src, err := resolveClusterPass("flagsecret", "", "envsecret"); err != nil || got != "envsecret" || src != "env" {
t.Fatalf("env precedence: got %q src %q err %v", got, src, err)
}
// flag is the last resort.
if got, src, err := resolveClusterPass("flagsecret", "", ""); err != nil || got != "flagsecret" || src != "flag" {
t.Fatalf("flag fallback: got %q src %q err %v", got, src, err)
}
// none set.
if got, src, err := resolveClusterPass("", "", ""); err != nil || got != "" || src != "none" {
t.Fatalf("none: got %q src %q err %v", got, src, err)
}
// missing file is an error.
if _, _, err := resolveClusterPass("", filepath.Join(t.TempDir(), "nope"), ""); err == nil {
t.Fatalf("missing file must error")
}
}
// TestInjectRouteCreds verifies the secret is injected only into routes that omit
// userinfo, so --routes argv need not carry the password (issue 0006f).
func TestInjectRouteCreds(t *testing.T) {
in := []string{"nats://10.0.0.2:6250", "nats://override:pw@10.0.0.3:6250"}
out, err := injectRouteCreds(in, "user", "secret")
if err != nil {
t.Fatalf("inject: %v", err)
}
if !strings.Contains(out[0], "user:secret@10.0.0.2:6250") {
t.Fatalf("creds not injected into bare route: %q", out[0])
}
if !strings.Contains(out[1], "override:pw@10.0.0.3:6250") {
t.Fatalf("existing userinfo must be preserved: %q", out[1])
}
// empty user is a no-op.
noop, err := injectRouteCreds(in, "", "")
if err != nil || noop[0] != in[0] {
t.Fatalf("empty user must be a no-op: %v %q", err, noop[0])
}
}
// TestIsLoopbackURL guards migrate-to-kv against pushing the allowlist cleartext
// to a remote NATS (issue 0006f, audit 0008 N6).
func TestIsLoopbackURL(t *testing.T) {
loop := []string{"nats://127.0.0.1:4250", "nats://localhost:4250", "nats://[::1]:4250"}
for _, u := range loop {
if !isLoopbackURL(u) {
t.Fatalf("%q should be loopback", u)
}
}
remote := []string{"nats://10.0.0.2:4250", "nats://bus.example.com:4250", "::not-a-url"}
for _, u := range remote {
if isLoopbackURL(u) {
t.Fatalf("%q should NOT be loopback", u)
}
}
}
+24 -4
View File
@@ -63,7 +63,12 @@ func main() {
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
clusterPass = flag.String("cluster-pass", "", "shared route secret password (argv-visible — prefer --cluster-pass-file or UNIBUS_CLUSTER_PASS)")
// Secret out of argv (issue 0006f, audit 0008 N1-low): a password in
// --cluster-pass / --routes is visible in ps/journald. Prefer a file or the
// UNIBUS_CLUSTER_PASS env var; routes may then omit userinfo and the secret
// is injected from here.
clusterPassFile = flag.String("cluster-pass-file", "", "path to a file holding the cluster route password (preferred over --cluster-pass; keeps the secret out of argv)")
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
@@ -89,6 +94,14 @@ func main() {
log.Fatalf("--store must be \"sqlite\" or \"kv\", got %q", *storeBackend)
}
// Resolve the cluster route secret out of argv (file/env preferred). The
// resolved value (not *clusterPass) is what guards the route layer and is
// injected into peer route URLs below.
clusterPassResolved, passSource, err := resolveClusterPass(*clusterPass, *clusterPassFile, os.Getenv("UNIBUS_CLUSTER_PASS"))
if err != nil {
log.Fatalf("%v", err)
}
// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
// --bus-auth enforce. This makes an insecure public startup impossible rather
// than silently exposing the bus with the appearance of security.
@@ -97,7 +110,7 @@ func main() {
}
// Cluster route guard (issue 0003a): a public cluster needs a route secret
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil {
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, clusterPassResolved, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil {
log.Fatalf("%v", err)
}
@@ -178,14 +191,21 @@ func main() {
}
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
if *clusterName != "" {
// Inject the resolved secret into peer route URLs that omit userinfo, so
// the password need not appear in --routes argv (issue 0006f).
routes, rerr := injectRouteCreds(splitRoutes(*routesCSV), *clusterUser, clusterPassResolved)
if rerr != nil {
log.Fatalf("%v", rerr)
}
cc := &embeddednats.ClusterConfig{
Name: *clusterName,
Host: *bind,
Port: *clusterPort,
Routes: splitRoutes(*routesCSV),
Routes: routes,
Username: *clusterUser,
Password: *clusterPass,
Password: clusterPassResolved,
}
log.Printf("cluster route secret source: %s", passSource)
if *routeTLSCert != "" {
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
if err != nil {
+8
View File
@@ -33,6 +33,14 @@ func runMigrateCLI(args []string) {
fmt.Fprintln(os.Stderr, "membershipd migrate-to-kv: --nats-url is required (the cluster to write the KV buckets into)")
os.Exit(2)
}
// Confidentiality guard (issue 0006f, audit 0008 N6): the migration writes the
// allowlist (handles, roles, signing pubkeys) into the KV. Against a REMOTE NATS
// without TLS that metadata would travel in cleartext, so a remote target MUST
// be TLS-pinned with --ca. A loopback target is local-only and exempt.
if !isLoopbackURL(*natsURL) && *ca == "" {
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: refusing to migrate to remote %q without --ca; the allowlist (handles/roles/sign pubs) would travel in cleartext — pin TLS with --ca, or run against a loopback nats-url\n", *natsURL)
os.Exit(2)
}
// Back up the SQLite database first so a botched migration can be undone.
var backupPath string
+31
View File
@@ -65,3 +65,34 @@ curl -fsS http://<host-lan-ip>:8470/healthz
- To run against an external NATS instead of the embedded one, append
`--nats-url nats://<host>:4222` to `ExecStart` and re-run `daemon-reload` +
`restart`.
## Clustering (HA) — see `deploy/cluster/`
The single-node service above is secure on its own. Running unibus as a
multi-node **cluster** has extra hardening rules (issues 0006a0006f); the full
runbook and the generated material live in `deploy/cluster/`. Key points an
operator must know:
- **Homogeneous posture (0006d).** Every node MUST run `--bus-auth enforce` (the
binary refuses to join a cluster otherwise) and present mutual route TLS on a
public bind. `/healthz` publishes each node's `posture` so a monitor can flag a
node that is not `enforce`+`acl`+`tls`.
- **Separate route CA (0006f).** The cluster route layer authenticates *nodes*,
not bus users — sign the route certs with a **dedicated cluster CA**
(`--route-tls-ca`), NOT the client data-plane CA (`--tls-cert`'s CA). Keeping
the two trust roots separate means a client cert can never be presented to the
route port. `deploy/cluster/generate-cluster-certs.sh` builds this CA.
- **Secret out of argv (0006f).** Pass the route password via
`--cluster-pass-file` or the `UNIBUS_CLUSTER_PASS` env var, NOT `--cluster-pass`
or a `nats://user:pass@host` in `--routes` (both are visible in `ps`/journald).
When the secret comes from a file/env, list peers as bare `--routes
nats://<host>:6250` and the binary injects the credentials.
- **`migrate-to-kv` confidentiality (0006f).** The migration writes the allowlist
(handles/roles/sign pubs) into KV. Run it only against a **loopback** nats-url,
or pin TLS with `--ca` for a remote target — otherwise that metadata travels in
cleartext. The binary refuses a remote target without `--ca`.
- **R1 is NOT HA (0006a/N3-DoS).** With `--kv-replicas 1` the control plane
(including the nonce bucket) is a single point of failure: if the node owning
the stream dies, every authenticated request fails closed (auth DoS). Real HA
needs **R3** (quorum 2/3): raise replicas in place with `nats stream update
--replicas 3` once the third node has joined. Do not advertise R1 as HA.