Merge issue/0006f-lows: cluster secret out of argv + migrate guard + docs (audit 0008 lows)
This commit is contained in:
@@ -3,6 +3,8 @@ package main
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
@@ -21,6 +23,74 @@ func splitRoutes(csv string) []string {
|
||||
return out
|
||||
}
|
||||
|
||||
// resolveClusterPass resolves the cluster route secret WITHOUT leaking it through
|
||||
// argv (audit 0008 N1-low: --cluster-pass in argv is visible in ps/journald).
|
||||
// Precedence: --cluster-pass-file (read + trim the file), then the env var
|
||||
// UNIBUS_CLUSTER_PASS, then the legacy --cluster-pass flag (argv-visible, kept for
|
||||
// dev/compat). env is injected (os.Getenv result) so the function stays testable.
|
||||
// It returns the secret and a short source label for logging (never the secret).
|
||||
func resolveClusterPass(passFlag, passFile, env string) (secret, source string, err error) {
|
||||
if passFile != "" {
|
||||
b, rerr := os.ReadFile(passFile)
|
||||
if rerr != nil {
|
||||
return "", "", fmt.Errorf("read --cluster-pass-file %q: %w", passFile, rerr)
|
||||
}
|
||||
return strings.TrimSpace(string(b)), "file", nil
|
||||
}
|
||||
if env != "" {
|
||||
return env, "env", nil
|
||||
}
|
||||
if passFlag != "" {
|
||||
return passFlag, "flag", nil
|
||||
}
|
||||
return "", "none", nil
|
||||
}
|
||||
|
||||
// injectRouteCreds rewrites each route URL that carries NO userinfo to embed
|
||||
// user:pass, so the cluster secret is supplied once (via file/env) instead of
|
||||
// repeated in every --routes argv entry where ps/journald would expose it. A route
|
||||
// that already carries userinfo is left untouched (operator override). With an
|
||||
// empty user it is a no-op. A malformed route URL is an error (configuration bug)
|
||||
// rather than a silently dropped peer.
|
||||
func injectRouteCreds(routes []string, user, pass string) ([]string, error) {
|
||||
if user == "" {
|
||||
return routes, nil
|
||||
}
|
||||
out := make([]string, 0, len(routes))
|
||||
for _, r := range routes {
|
||||
u, err := url.Parse(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse route %q: %w", r, err)
|
||||
}
|
||||
if u.User == nil {
|
||||
u.User = url.UserPassword(user, pass)
|
||||
}
|
||||
out = append(out, u.String())
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// isLoopbackURL reports whether a NATS url targets this host only (loopback). Used
|
||||
// to guard migrate-to-kv (audit 0008 N6): pushing the allowlist to a REMOTE NATS
|
||||
// without TLS would send handles/roles/sign-pubs in cleartext, so a remote target
|
||||
// must be TLS-pinned (--ca). A url we cannot classify is treated as NON-loopback
|
||||
// (conservative: it then requires --ca).
|
||||
func isLoopbackURL(natsURL string) bool {
|
||||
u, err := url.Parse(natsURL)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
host := u.Hostname()
|
||||
switch host {
|
||||
case "localhost":
|
||||
return true
|
||||
case "":
|
||||
return false
|
||||
}
|
||||
ip := net.ParseIP(host)
|
||||
return ip != nil && ip.IsLoopback()
|
||||
}
|
||||
|
||||
// isLoopbackBind reports whether the --bind value keeps the service reachable
|
||||
// only from this host. An empty bind means "all interfaces" (public), and a
|
||||
// hostname we cannot resolve to a loopback literal is treated as public — the
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestResolveClusterPass verifies the secret resolution precedence
|
||||
// (file > env > flag) that keeps the cluster password out of argv (issue 0006f).
|
||||
func TestResolveClusterPass(t *testing.T) {
|
||||
// file wins over env and flag, and is trimmed.
|
||||
f := filepath.Join(t.TempDir(), "pass")
|
||||
if err := os.WriteFile(f, []byte("filesecret\n"), 0o600); err != nil {
|
||||
t.Fatalf("write: %v", err)
|
||||
}
|
||||
if got, src, err := resolveClusterPass("flagsecret", f, "envsecret"); err != nil || got != "filesecret" || src != "file" {
|
||||
t.Fatalf("file precedence: got %q src %q err %v", got, src, err)
|
||||
}
|
||||
// env wins over flag when no file.
|
||||
if got, src, err := resolveClusterPass("flagsecret", "", "envsecret"); err != nil || got != "envsecret" || src != "env" {
|
||||
t.Fatalf("env precedence: got %q src %q err %v", got, src, err)
|
||||
}
|
||||
// flag is the last resort.
|
||||
if got, src, err := resolveClusterPass("flagsecret", "", ""); err != nil || got != "flagsecret" || src != "flag" {
|
||||
t.Fatalf("flag fallback: got %q src %q err %v", got, src, err)
|
||||
}
|
||||
// none set.
|
||||
if got, src, err := resolveClusterPass("", "", ""); err != nil || got != "" || src != "none" {
|
||||
t.Fatalf("none: got %q src %q err %v", got, src, err)
|
||||
}
|
||||
// missing file is an error.
|
||||
if _, _, err := resolveClusterPass("", filepath.Join(t.TempDir(), "nope"), ""); err == nil {
|
||||
t.Fatalf("missing file must error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestInjectRouteCreds verifies the secret is injected only into routes that omit
|
||||
// userinfo, so --routes argv need not carry the password (issue 0006f).
|
||||
func TestInjectRouteCreds(t *testing.T) {
|
||||
in := []string{"nats://10.0.0.2:6250", "nats://override:pw@10.0.0.3:6250"}
|
||||
out, err := injectRouteCreds(in, "user", "secret")
|
||||
if err != nil {
|
||||
t.Fatalf("inject: %v", err)
|
||||
}
|
||||
if !strings.Contains(out[0], "user:secret@10.0.0.2:6250") {
|
||||
t.Fatalf("creds not injected into bare route: %q", out[0])
|
||||
}
|
||||
if !strings.Contains(out[1], "override:pw@10.0.0.3:6250") {
|
||||
t.Fatalf("existing userinfo must be preserved: %q", out[1])
|
||||
}
|
||||
// empty user is a no-op.
|
||||
noop, err := injectRouteCreds(in, "", "")
|
||||
if err != nil || noop[0] != in[0] {
|
||||
t.Fatalf("empty user must be a no-op: %v %q", err, noop[0])
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsLoopbackURL guards migrate-to-kv against pushing the allowlist cleartext
|
||||
// to a remote NATS (issue 0006f, audit 0008 N6).
|
||||
func TestIsLoopbackURL(t *testing.T) {
|
||||
loop := []string{"nats://127.0.0.1:4250", "nats://localhost:4250", "nats://[::1]:4250"}
|
||||
for _, u := range loop {
|
||||
if !isLoopbackURL(u) {
|
||||
t.Fatalf("%q should be loopback", u)
|
||||
}
|
||||
}
|
||||
remote := []string{"nats://10.0.0.2:4250", "nats://bus.example.com:4250", "::not-a-url"}
|
||||
for _, u := range remote {
|
||||
if isLoopbackURL(u) {
|
||||
t.Fatalf("%q should NOT be loopback", u)
|
||||
}
|
||||
}
|
||||
}
|
||||
+24
-4
@@ -63,7 +63,12 @@ func main() {
|
||||
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
|
||||
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
|
||||
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
|
||||
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
|
||||
clusterPass = flag.String("cluster-pass", "", "shared route secret password (argv-visible — prefer --cluster-pass-file or UNIBUS_CLUSTER_PASS)")
|
||||
// Secret out of argv (issue 0006f, audit 0008 N1-low): a password in
|
||||
// --cluster-pass / --routes is visible in ps/journald. Prefer a file or the
|
||||
// UNIBUS_CLUSTER_PASS env var; routes may then omit userinfo and the secret
|
||||
// is injected from here.
|
||||
clusterPassFile = flag.String("cluster-pass-file", "", "path to a file holding the cluster route password (preferred over --cluster-pass; keeps the secret out of argv)")
|
||||
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
|
||||
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
|
||||
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
|
||||
@@ -89,6 +94,14 @@ func main() {
|
||||
log.Fatalf("--store must be \"sqlite\" or \"kv\", got %q", *storeBackend)
|
||||
}
|
||||
|
||||
// Resolve the cluster route secret out of argv (file/env preferred). The
|
||||
// resolved value (not *clusterPass) is what guards the route layer and is
|
||||
// injected into peer route URLs below.
|
||||
clusterPassResolved, passSource, err := resolveClusterPass(*clusterPass, *clusterPassFile, os.Getenv("UNIBUS_CLUSTER_PASS"))
|
||||
if err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
|
||||
// --bus-auth enforce. This makes an insecure public startup impossible rather
|
||||
// than silently exposing the bus with the appearance of security.
|
||||
@@ -97,7 +110,7 @@ func main() {
|
||||
}
|
||||
// Cluster route guard (issue 0003a): a public cluster needs a route secret
|
||||
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
|
||||
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil {
|
||||
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, clusterPassResolved, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
@@ -178,14 +191,21 @@ func main() {
|
||||
}
|
||||
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
|
||||
if *clusterName != "" {
|
||||
// Inject the resolved secret into peer route URLs that omit userinfo, so
|
||||
// the password need not appear in --routes argv (issue 0006f).
|
||||
routes, rerr := injectRouteCreds(splitRoutes(*routesCSV), *clusterUser, clusterPassResolved)
|
||||
if rerr != nil {
|
||||
log.Fatalf("%v", rerr)
|
||||
}
|
||||
cc := &embeddednats.ClusterConfig{
|
||||
Name: *clusterName,
|
||||
Host: *bind,
|
||||
Port: *clusterPort,
|
||||
Routes: splitRoutes(*routesCSV),
|
||||
Routes: routes,
|
||||
Username: *clusterUser,
|
||||
Password: *clusterPass,
|
||||
Password: clusterPassResolved,
|
||||
}
|
||||
log.Printf("cluster route secret source: %s", passSource)
|
||||
if *routeTLSCert != "" {
|
||||
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
|
||||
if err != nil {
|
||||
|
||||
@@ -33,6 +33,14 @@ func runMigrateCLI(args []string) {
|
||||
fmt.Fprintln(os.Stderr, "membershipd migrate-to-kv: --nats-url is required (the cluster to write the KV buckets into)")
|
||||
os.Exit(2)
|
||||
}
|
||||
// Confidentiality guard (issue 0006f, audit 0008 N6): the migration writes the
|
||||
// allowlist (handles, roles, signing pubkeys) into the KV. Against a REMOTE NATS
|
||||
// without TLS that metadata would travel in cleartext, so a remote target MUST
|
||||
// be TLS-pinned with --ca. A loopback target is local-only and exempt.
|
||||
if !isLoopbackURL(*natsURL) && *ca == "" {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: refusing to migrate to remote %q without --ca; the allowlist (handles/roles/sign pubs) would travel in cleartext — pin TLS with --ca, or run against a loopback nats-url\n", *natsURL)
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
// Back up the SQLite database first so a botched migration can be undone.
|
||||
var backupPath string
|
||||
|
||||
@@ -65,3 +65,34 @@ curl -fsS http://<host-lan-ip>:8470/healthz
|
||||
- To run against an external NATS instead of the embedded one, append
|
||||
`--nats-url nats://<host>:4222` to `ExecStart` and re-run `daemon-reload` +
|
||||
`restart`.
|
||||
|
||||
## Clustering (HA) — see `deploy/cluster/`
|
||||
|
||||
The single-node service above is secure on its own. Running unibus as a
|
||||
multi-node **cluster** has extra hardening rules (issues 0006a–0006f); the full
|
||||
runbook and the generated material live in `deploy/cluster/`. Key points an
|
||||
operator must know:
|
||||
|
||||
- **Homogeneous posture (0006d).** Every node MUST run `--bus-auth enforce` (the
|
||||
binary refuses to join a cluster otherwise) and present mutual route TLS on a
|
||||
public bind. `/healthz` publishes each node's `posture` so a monitor can flag a
|
||||
node that is not `enforce`+`acl`+`tls`.
|
||||
- **Separate route CA (0006f).** The cluster route layer authenticates *nodes*,
|
||||
not bus users — sign the route certs with a **dedicated cluster CA**
|
||||
(`--route-tls-ca`), NOT the client data-plane CA (`--tls-cert`'s CA). Keeping
|
||||
the two trust roots separate means a client cert can never be presented to the
|
||||
route port. `deploy/cluster/generate-cluster-certs.sh` builds this CA.
|
||||
- **Secret out of argv (0006f).** Pass the route password via
|
||||
`--cluster-pass-file` or the `UNIBUS_CLUSTER_PASS` env var, NOT `--cluster-pass`
|
||||
or a `nats://user:pass@host` in `--routes` (both are visible in `ps`/journald).
|
||||
When the secret comes from a file/env, list peers as bare `--routes
|
||||
nats://<host>:6250` and the binary injects the credentials.
|
||||
- **`migrate-to-kv` confidentiality (0006f).** The migration writes the allowlist
|
||||
(handles/roles/sign pubs) into KV. Run it only against a **loopback** nats-url,
|
||||
or pin TLS with `--ca` for a remote target — otherwise that metadata travels in
|
||||
cleartext. The binary refuses a remote target without `--ca`.
|
||||
- **R1 is NOT HA (0006a/N3-DoS).** With `--kv-replicas 1` the control plane
|
||||
(including the nonce bucket) is a single point of failure: if the node owning
|
||||
the stream dies, every authenticated request fails closed (auth DoS). Real HA
|
||||
needs **R3** (quorum 2/3): raise replicas in place with `nats stream update
|
||||
--replicas 3` once the third node has joined. Do not advertise R1 as HA.
|
||||
|
||||
Reference in New Issue
Block a user