Files
unibus/cmd/membershipd/main.go
T
egutierrez b8201a82cd fix(0006f): cluster secret out of argv, migrate-to-kv TLS guard, R1/CA docs (audit 0008 lows)
Low-severity cluster hardening from audit 0008:

- Route secret out of argv (N1-low): --cluster-pass and a nats://user:pass@host in
  --routes are visible in ps/journald. New --cluster-pass-file and the
  UNIBUS_CLUSTER_PASS env var (precedence file > env > flag); the resolved secret
  guards the route layer and is injected into bare --routes entries
  (injectRouteCreds), so peers can be listed as nats://host:6250 with no secret in
  argv. The legacy --cluster-pass stays for dev/compat.
- migrate-to-kv confidentiality (N6): refuse a remote --nats-url without --ca (the
  allowlist would travel cleartext); loopback targets are exempt (isLoopbackURL).
- Docs (N1 route CA, N3 DoS): deploy/README gains a Clustering section — use a
  SEPARATE cluster CA for routes (not the client CA), keep the secret out of argv,
  run migrate-to-kv loopback/TLS only, and R1 is a SPOF of auth (not HA); R3
  quorum is real HA. The generated cert material lives in deploy/cluster/ (0006g).

Tests:
- TestResolveClusterPass (file > env > flag precedence; missing file errors),
- TestInjectRouteCreds (injects only into userinfo-less routes; preserves overrides),
- TestIsLoopbackURL (loopback vs remote vs malformed).

CGO_ENABLED=0 go build/vet/test green; govulncheck 0 reachable.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 17:24:46 +02:00

372 lines
16 KiB
Go

// Command membershipd is the unibus control-plane service: room metadata,
// member directory, sealed key distribution, and the media blob store. The data
// plane is NATS — if --nats-url is empty it starts an embedded nats-server with
// JetStream so the whole stack runs with `go run` and nothing to install.
package main
import (
	"context"
	"crypto/tls"
	"encoding/hex"
	"flag"
	"log"
	"net"
	"net/http"
	"os"
	"os/signal"
	"syscall"
	"time"

	cs "fn-registry/functions/cybersecurity"

	"github.com/enmanuel/unibus/pkg/blobstore"
	"github.com/enmanuel/unibus/pkg/busauth"
	"github.com/enmanuel/unibus/pkg/embeddednats"
	"github.com/enmanuel/unibus/pkg/membership"
	server "github.com/nats-io/nats-server/v2/server"
	"github.com/nats-io/nats.go"
	"github.com/nats-io/nats.go/jetstream"
)
// main is the membershipd entry point. It dispatches the host-local
// subcommands (`user`, `migrate-to-kv`) and otherwise boots the service in this
// order: parse and validate flags, open the SQLite store (unless --store kv),
// open the blob store, start the embedded NATS server (or point at an external
// one), open the internal JetStream client and the KV store when required, wire
// the replicated nonce bucket, serve the HTTP(S) control-plane API, and finally
// block until SIGINT/SIGTERM to shut down. Any configuration or startup failure
// is fatal (log.Fatalf).
func main() {
	// Subcommand dispatch: `membershipd user ...` is the local administration CLI
	// (seed/list/revoke bus users) and must be handled before the server flag set
	// parses os.Args. Running the CLI on the bus host is trusted by design (whoever
	// has a shell there already controls the service), which is how the first admin
	// is seeded without a chicken-egg auth problem.
	if len(os.Args) > 1 && os.Args[1] == "user" {
		runUserCLI(os.Args[2:])
		return
	}
	// `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV
	// data move for decentralization (issue 0003c). Like the user CLI it runs on
	// the host and is dispatched before the server flag set parses os.Args.
	if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" {
		runMigrateCLI(os.Args[2:])
		return
	}
	var (
		bind      = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers")
		natsURL   = flag.String("nats-url", "", "external NATS url; empty starts an embedded server")
		httpPort  = flag.String("http-port", "8470", "HTTP port for the control-plane API")
		dbPath    = flag.String("db", "./local_files/unibus.db", "SQLite database path")
		storeDir  = flag.String("store-dir", "./local_files/blobs", "blob store directory")
		natsPort  = flag.Int("nats-port", 4250, "embedded NATS listen port (when --nats-url empty)")
		natsStore = flag.String("nats-store", "./local_files/jetstream", "embedded JetStream store dir")
		busAuth   = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
		tlsCert   = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
		tlsKey    = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
		// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
		clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
		serverName  = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
		clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
		routesCSV   = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
		clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
		clusterPass = flag.String("cluster-pass", "", "shared route secret password (argv-visible — prefer --cluster-pass-file or UNIBUS_CLUSTER_PASS)")
		// Secret out of argv (issue 0006f, audit 0008 N1-low): a password in
		// --cluster-pass / --routes is visible in ps/journald. Prefer a file or the
		// UNIBUS_CLUSTER_PASS env var; routes may then omit userinfo and the secret
		// is injected from here.
		clusterPassFile = flag.String("cluster-pass-file", "", "path to a file holding the cluster route password (preferred over --cluster-pass; keeps the secret out of argv)")
		routeTLSCert    = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
		routeTLSKey     = flag.String("route-tls-key", "", "this node's route private key")
		routeTLSCA      = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
		// Replicated control plane (issue 0006a/c): the JetStream replication factor
		// for the shared nonce bucket (and, with --store kv, the control-plane KV).
		// 1 for a 1-2 node rollout, 3 for real HA quorum (raise in place with
		// `nats stream update --replicas 3` when the third node joins).
		kvReplicas = flag.Int("kv-replicas", 1, "JetStream replication factor for the shared nonce/KV buckets (1..3)")
		caFile     = flag.String("ca", "", "bus CA cert; only used to pin TLS on the internal JetStream connection to an EXTERNAL --nats-url (the embedded server uses an in-process connection that needs no CA)")
		// Control-plane store backend (issue 0006c, feature flag decentralized):
		// "sqlite" (default) keeps the local single-node SQLite control plane;
		// "kv" puts rooms/members/keys/users in replicated JetStream KV so any node
		// in the cluster serves the same state.
		storeBackend = flag.String("store", "sqlite", "control-plane store backend: sqlite (default, single-node) | kv (replicated JetStream, decentralized)")
	)
	flag.Parse()

	authMode, err := membership.ParseAuthMode(*busAuth)
	if err != nil {
		log.Fatalf("%v", err)
	}
	if *storeBackend != "sqlite" && *storeBackend != "kv" {
		log.Fatalf("--store must be \"sqlite\" or \"kv\", got %q", *storeBackend)
	}
	// Resolve the cluster route secret out of argv (file/env preferred). The
	// resolved value (not *clusterPass) is what guards the route layer and is
	// injected into peer route URLs below.
	clusterPassResolved, passSource, err := resolveClusterPass(*clusterPass, *clusterPassFile, os.Getenv("UNIBUS_CLUSTER_PASS"))
	if err != nil {
		log.Fatalf("%v", err)
	}
	// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
	// --bus-auth enforce. This makes an insecure public startup impossible rather
	// than silently exposing the bus with the appearance of security.
	if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
		log.Fatalf("%v", err)
	}
	// Cluster route guard (issue 0003a): a public cluster needs a route secret
	// and mutual route TLS, and the route-TLS flags are all-or-nothing.
	if err := validateClusterConfig(*clusterName, *bind, *clusterUser, clusterPassResolved, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil {
		log.Fatalf("%v", err)
	}
	log.SetFlags(log.LstdFlags | log.Lmsgprefix)
	log.SetPrefix("[membershipd] ")

	// A clustered node shares its control plane with peers, so it needs a JetStream
	// client to manage the replicated nonce bucket (issue 0006a). --store kv (issue
	// 0006c) also needs JetStream, for the control-plane KV itself. A standalone
	// single-node SQLite deployment needs none of this and keeps the in-process,
	// in-memory behavior unchanged.
	clustered := *clusterName != ""
	decentralized := *storeBackend == "kv"
	needJS := clustered || decentralized
	enforce := authMode == membership.AuthEnforce

	// Internal service identity (issue 0006a): when the embedded data plane enforces
	// auth, membershipd must still connect to its OWN server to manage JetStream.
	// It does so with this ephemeral identity, which the authenticator is built to
	// recognize and grant full permissions (it never enters the user allowlist). It
	// is only generated when actually needed (JetStream required AND enforce on AND
	// the server is embedded), so a standalone or non-enforce node is unchanged.
	var internalID cs.Identity
	var internalPubHex string
	if needJS && enforce && *natsURL == "" {
		internalID, err = cs.GenerateIdentity()
		if err != nil {
			log.Fatalf("generate internal identity: %v", err)
		}
		internalPubHex = hex.EncodeToString(internalID.SignPub)
	}

	// The authenticator consults the store through a holder so it can be built
	// before the store exists: with --store kv the JetStream KV store opens only
	// after NATS is up (the bootstrap cycle). In the default SQLite path the store
	// is opened and set into the holder right here, before the server starts, so
	// behavior is identical to the pre-0006c baseline. `store` is the final store
	// used by the HTTP server (set below for the KV path).
	holder := &storeHolder{}
	var store membership.Store
	if !decentralized {
		store, err = membership.Open(*dbPath)
		if err != nil {
			log.Fatalf("open membership store: %v", err)
		}
		holder.set(store)
		log.Printf("membership store: sqlite %s", *dbPath)
	}
	// Close whichever store ends up final (SQLite closes its file; the JetStream KV
	// store's Close is a no-op — its NATS connection is closed separately). The
	// closure reads `store` at return time, so it also covers the KV store that is
	// assigned further down.
	defer func() {
		if store != nil {
			store.Close()
		}
	}()

	blobs, err := blobstore.New(*storeDir)
	if err != nil {
		log.Fatalf("open blob store: %v", err)
	}
	log.Printf("blob store: %s", *storeDir)

	// Data plane: embedded or external NATS. For the embedded server, enforce
	// turns on the nkey authenticator (only allowlisted identities may connect)
	// and --tls-cert/--tls-key turn on TLS. An external NATS manages its own
	// auth/TLS, so those flags do not apply to it.
	var ns *server.Server
	natsClientURL := *natsURL
	if natsClientURL == "" {
		cfg := embeddednats.ServerConfig{
			// Bind the embedded NATS to the same interface as the HTTP API so a
			// single --bind flag governs reachability: 127.0.0.1 keeps the whole
			// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
			StoreDir:   *natsStore,
			Host:       *bind,
			Port:       *natsPort,
			ServerName: *serverName,
		}
		// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
		if *clusterName != "" {
			// Inject the resolved secret into peer route URLs that omit userinfo, so
			// the password need not appear in --routes argv (issue 0006f).
			routes, rerr := injectRouteCreds(splitRoutes(*routesCSV), *clusterUser, clusterPassResolved)
			if rerr != nil {
				log.Fatalf("%v", rerr)
			}
			cc := &embeddednats.ClusterConfig{
				Name:     *clusterName,
				Host:     *bind,
				Port:     *clusterPort,
				Routes:   routes,
				Username: *clusterUser,
				Password: clusterPassResolved,
			}
			// Only the source of the secret is logged, never the secret itself.
			log.Printf("cluster route secret source: %s", passSource)
			if *routeTLSCert != "" {
				rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
				if err != nil {
					log.Fatalf("load route TLS: %v", err)
				}
				cc.TLS = rtls
				log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
			}
			cfg.Cluster = cc
			log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
		}
		if authMode == membership.AuthEnforce {
			// Per-subject data-plane ACL (audit H4 / N4 residual): the authenticator
			// authorizes by the bus allowlist AND confines each connection to the
			// subjects of the rooms it belongs to (plus client-infra subjects). This
			// closes the wildcard metadata leak where a registered non-member could
			// Subscribe(">") and harvest every room's subject and JetStream activity.
			// NATS freezes permissions at connect time, so a peer that joins a room
			// after connecting must client.RefreshSession to gain that room's subject.
			cfg.Auth = busauth.NewNkeyAuthenticatorACLInternal(
				holder.IsAuthorized,
				busauth.PermissionsFromSubjects(holder.subjectACL),
				internalPubHex,
			)
			log.Printf("NATS nkey authentication: ON (enforce, per-subject ACL)")
		}
		if *tlsCert != "" || *tlsKey != "" {
			if *tlsCert == "" || *tlsKey == "" {
				log.Fatalf("--tls-cert and --tls-key must be set together")
			}
			tlsCfg, err := busauth.ServerTLSConfig(*tlsCert, *tlsKey)
			if err != nil {
				log.Fatalf("load NATS TLS: %v", err)
			}
			cfg.TLS = tlsCfg
			log.Printf("NATS TLS: ON (%s)", *tlsCert)
		}
		ns, err = embeddednats.StartServer(cfg)
		if err != nil {
			log.Fatalf("start embedded nats: %v", err)
		}
		natsClientURL = embeddednats.ClientURL(ns)
		log.Printf("embedded NATS (JetStream) ready: %s", natsClientURL)
	} else {
		log.Printf("using external NATS: %s", natsClientURL)
	}

	// JetStream client + decentralized store (issue 0006a/c). needJS is set for a
	// clustered node (shared nonce bucket) and for --store kv (the KV control
	// plane). Open the privileged JetStream client first (in-process for the
	// embedded server, a plain client for external NATS), then — for --store kv —
	// open the replicated KV store and publish it into the holder so the
	// authenticator and HTTP server serve from it. The privileged connection is the
	// only client that can connect in this window (the holder still denies everyone
	// else; the internal identity bypasses the store).
	var js jetstream.JetStream
	if needJS {
		var internalNC *nats.Conn
		if *natsURL == "" {
			internalNC, js, err = connectInternalJS(ns, internalID, enforce)
		} else {
			internalNC, js, err = connectExternalJS(natsClientURL, *caFile)
		}
		if err != nil {
			log.Fatalf("internal JetStream connection (required by --cluster-name/--store kv): %v", err)
		}
		// Deferred to the end of main, i.e. it stays open for the service lifetime.
		defer internalNC.Close()
		if decentralized {
			kvStore, err := membership.OpenJetStream(js, membership.JetStreamConfig{Replicas: *kvReplicas})
			if err != nil {
				log.Fatalf("open decentralized control-plane KV store: %v", err)
			}
			store = kvStore
			holder.set(store)
			log.Printf("membership store: jetstream KV (replicas=%d)", *kvReplicas)
		}
	}

	srv := membership.NewServer(store, blobs, authMode)
	// On a public (non-loopback) bind, disable cleartext rooms so message content
	// stays end-to-end encrypted regardless of who can subscribe (audit H4 minimum
	// defense; see dev/0004d-dataplane-acl.md).
	// NOTE(review): this guard was previously justified by "the embedded NATS has
	// no per-subject ACL", which no longer matches the enforce path above (it
	// installs a per-subject ACL). An external --nats-url still gets no ACL from
	// this binary, so the guard is kept as-is — confirm it is still intended.
	if !isLoopbackBind(*bind) {
		srv.RequireEncryptedRooms = true
		log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)")
	}
	// Publish this node's posture on /healthz so a monitor (or a peer) can detect a
	// cluster member not running the homogeneous enforce+ACL+TLS posture (audit
	// 0008 N1). enforce implies the per-subject ACL in this binary (they are wired
	// together above).
	srv.Posture = membership.Posture{
		Enforce: enforce,
		ACL:     enforce,
		TLS:     *tlsCert != "",
		Cluster: clustered,
		Store:   *storeBackend,
	}
	// Replicated anti-replay (issue 0006a, audit 0008 N3): a clustered node MUST
	// share its nonce store across the cluster, or a request accepted on one node
	// can be replayed to another. HARD requirement: if the bucket cannot be created
	// the node refuses to start rather than run with a per-process cache that leaves
	// the replay hole open.
	if needJS {
		if err := wireReplicatedNonces(srv, js, clustered, *kvReplicas); err != nil {
			log.Fatalf("%v", err)
		}
		if clustered {
			log.Printf("anti-replay: replicated nonce bucket \"KV_UNIBUS_nonces\" (replicas=%d) — cluster-safe", *kvReplicas)
		}
	}
	log.Printf("control-plane auth: %s", authMode)

	// Build the listen address with net.JoinHostPort so an IPv6 literal bind
	// ("::1") becomes "[::1]:8470" instead of the unparseable "::1:8470". A bind
	// that the operator already bracketed keeps the plain concatenation, which is
	// what it relied on before.
	addr := *bind + ":" + *httpPort
	if len(*bind) == 0 || (*bind)[0] != '[' {
		addr = net.JoinHostPort(*bind, *httpPort)
	}
	httpSrv := &http.Server{
		Addr:    addr,
		Handler: srv,
		// Bound request header size so a peer cannot exhaust memory with huge
		// headers before any body limit applies (the body ceilings live in the
		// membership middleware).
		MaxHeaderBytes:    membership.MaxHeaderBytes,
		ReadHeaderTimeout: 10 * time.Second,
	}
	go func() {
		var serveErr error
		if *tlsCert != "" {
			// Serve the control plane over TLS with the same CA-signed cert as the
			// data plane (audit H5): metadata (subjects, pubkeys, sealed keys, the
			// social graph) is no longer readable by a network MITM. The fail-open
			// guard already requires --bus-auth enforce alongside these flags.
			httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
			log.Printf("HTTPS control-plane API: https://%s", addr)
			log.Printf(" health: https://%s/healthz", addr)
			log.Printf("control-plane TLS: ON (%s)", *tlsCert)
			serveErr = httpSrv.ListenAndServeTLS(*tlsCert, *tlsKey)
		} else {
			log.Printf("HTTP control-plane API: http://%s", addr)
			log.Printf(" health: http://%s/healthz", addr)
			serveErr = httpSrv.ListenAndServe()
		}
		// http.ErrServerClosed is the normal result of Shutdown below. Anything
		// else (e.g. the port is taken) is fatal: log.Fatalf exits the process
		// immediately, without running the shutdown sequence or deferred cleanups.
		if serveErr != nil && serveErr != http.ErrServerClosed {
			log.Fatalf("http server: %v", serveErr)
		}
	}()

	// Graceful shutdown on SIGINT/SIGTERM.
	stop := make(chan os.Signal, 1)
	signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
	<-stop
	log.Printf("shutting down...")
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	// Shutdown is best-effort (the process is exiting either way), but a timeout
	// or close error is worth surfacing instead of being silently dropped.
	if err := httpSrv.Shutdown(ctx); err != nil {
		log.Printf("http shutdown: %v", err)
	}
	if ns != nil {
		ns.Shutdown()
		ns.WaitForShutdown()
	}
	log.Printf("bye")
}