b8201a82cd
Low-severity cluster hardening from audit 0008: - Route secret out of argv (N1-low): --cluster-pass and a nats://user:pass@host in --routes are visible in ps/journald. New --cluster-pass-file and the UNIBUS_CLUSTER_PASS env var (precedence file > env > flag); the resolved secret guards the route layer and is injected into bare --routes entries (injectRouteCreds), so peers can be listed as nats://host:6250 with no secret in argv. The legacy --cluster-pass stays for dev/compat. - migrate-to-kv confidentiality (N6): refuse a remote --nats-url without --ca (the allowlist would travel cleartext); loopback targets are exempt (isLoopbackURL). - Docs (N1 route CA, N3 DoS): deploy/README gains a Clustering section — use a SEPARATE cluster CA for routes (not the client CA), keep the secret out of argv, run migrate-to-kv loopback/TLS only, and R1 is a SPOF of auth (not HA); R3 quorum is real HA. The generated cert material lives in deploy/cluster/ (0006g). Tests: - TestResolveClusterPass (file > env > flag precedence; missing file errors), - TestInjectRouteCreds (injects only into userinfo-less routes; preserves overrides), - TestIsLoopbackURL (loopback vs remote vs malformed). CGO_ENABLED=0 go build/vet/test green; govulncheck 0 reachable. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
372 lines
16 KiB
Go
372 lines
16 KiB
Go
// Command membershipd is the unibus control-plane service: room metadata,
|
|
// member directory, sealed key distribution, and the media blob store. The data
|
|
// plane is NATS — if --nats-url is empty it starts an embedded nats-server with
|
|
// JetStream so the whole stack runs with `go run` and nothing to install.
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"encoding/hex"
|
|
"flag"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
cs "fn-registry/functions/cybersecurity"
|
|
|
|
"github.com/nats-io/nats.go"
|
|
"github.com/nats-io/nats.go/jetstream"
|
|
server "github.com/nats-io/nats-server/v2/server"
|
|
|
|
"github.com/enmanuel/unibus/pkg/blobstore"
|
|
"github.com/enmanuel/unibus/pkg/busauth"
|
|
"github.com/enmanuel/unibus/pkg/embeddednats"
|
|
"github.com/enmanuel/unibus/pkg/membership"
|
|
)
|
|
|
|
func main() {
|
|
// Subcommand dispatch: `membershipd user ...` is the local administration CLI
|
|
// (seed/list/revoke bus users) and must be handled before the server flag set
|
|
// parses os.Args. Running the CLI on the bus host is trusted by design (whoever
|
|
// has a shell there already controls the service), which is how the first admin
|
|
// is seeded without a chicken-egg auth problem.
|
|
if len(os.Args) > 1 && os.Args[1] == "user" {
|
|
runUserCLI(os.Args[2:])
|
|
return
|
|
}
|
|
// `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV
|
|
// data move for decentralization (issue 0003c). Like the user CLI it runs on
|
|
// the host and is dispatched before the server flag set parses os.Args.
|
|
if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" {
|
|
runMigrateCLI(os.Args[2:])
|
|
return
|
|
}
|
|
|
|
var (
|
|
bind = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers")
|
|
natsURL = flag.String("nats-url", "", "external NATS url; empty starts an embedded server")
|
|
httpPort = flag.String("http-port", "8470", "HTTP port for the control-plane API")
|
|
dbPath = flag.String("db", "./local_files/unibus.db", "SQLite database path")
|
|
storeDir = flag.String("store-dir", "./local_files/blobs", "blob store directory")
|
|
natsPort = flag.Int("nats-port", 4250, "embedded NATS listen port (when --nats-url empty)")
|
|
natsStore = flag.String("nats-store", "./local_files/jetstream", "embedded JetStream store dir")
|
|
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
|
|
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
|
|
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
|
|
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
|
|
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
|
|
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
|
|
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
|
|
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
|
|
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
|
|
clusterPass = flag.String("cluster-pass", "", "shared route secret password (argv-visible — prefer --cluster-pass-file or UNIBUS_CLUSTER_PASS)")
|
|
// Secret out of argv (issue 0006f, audit 0008 N1-low): a password in
|
|
// --cluster-pass / --routes is visible in ps/journald. Prefer a file or the
|
|
// UNIBUS_CLUSTER_PASS env var; routes may then omit userinfo and the secret
|
|
// is injected from here.
|
|
clusterPassFile = flag.String("cluster-pass-file", "", "path to a file holding the cluster route password (preferred over --cluster-pass; keeps the secret out of argv)")
|
|
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
|
|
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
|
|
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
|
|
// Replicated control plane (issue 0006a/c): the JetStream replication factor
|
|
// for the shared nonce bucket (and, with --store kv, the control-plane KV).
|
|
// 1 for a 1-2 node rollout, 3 for real HA quorum (raise in place with
|
|
// `nats stream update --replicas 3` when the third node joins).
|
|
kvReplicas = flag.Int("kv-replicas", 1, "JetStream replication factor for the shared nonce/KV buckets (1..3)")
|
|
caFile = flag.String("ca", "", "bus CA cert; only used to pin TLS on the internal JetStream connection to an EXTERNAL --nats-url (the embedded server uses an in-process connection that needs no CA)")
|
|
// Control-plane store backend (issue 0006c, feature flag decentralized):
|
|
// "sqlite" (default) keeps the local single-node SQLite control plane;
|
|
// "kv" puts rooms/members/keys/users in replicated JetStream KV so any node
|
|
// in the cluster serves the same state.
|
|
storeBackend = flag.String("store", "sqlite", "control-plane store backend: sqlite (default, single-node) | kv (replicated JetStream, decentralized)")
|
|
)
|
|
flag.Parse()
|
|
|
|
authMode, err := membership.ParseAuthMode(*busAuth)
|
|
if err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
if *storeBackend != "sqlite" && *storeBackend != "kv" {
|
|
log.Fatalf("--store must be \"sqlite\" or \"kv\", got %q", *storeBackend)
|
|
}
|
|
|
|
// Resolve the cluster route secret out of argv (file/env preferred). The
|
|
// resolved value (not *clusterPass) is what guards the route layer and is
|
|
// injected into peer route URLs below.
|
|
clusterPassResolved, passSource, err := resolveClusterPass(*clusterPass, *clusterPassFile, os.Getenv("UNIBUS_CLUSTER_PASS"))
|
|
if err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
|
|
// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
|
|
// --bus-auth enforce. This makes an insecure public startup impossible rather
|
|
// than silently exposing the bus with the appearance of security.
|
|
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
// Cluster route guard (issue 0003a): a public cluster needs a route secret
|
|
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
|
|
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, clusterPassResolved, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
|
|
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
|
|
log.SetPrefix("[membershipd] ")
|
|
|
|
// A clustered node shares its control plane with peers, so it needs a JetStream
|
|
// client to manage the replicated nonce bucket (issue 0006a). --store kv (issue
|
|
// 0006c) also needs JetStream, for the control-plane KV itself. A standalone
|
|
// single-node SQLite deployment needs none of this and keeps the in-process,
|
|
// in-memory behavior unchanged.
|
|
clustered := *clusterName != ""
|
|
decentralized := *storeBackend == "kv"
|
|
needJS := clustered || decentralized
|
|
enforce := authMode == membership.AuthEnforce
|
|
|
|
// Internal service identity (issue 0006a): when the embedded data plane enforces
|
|
// auth, membershipd must still connect to its OWN server to manage JetStream.
|
|
// It does so with this ephemeral identity, which the authenticator is built to
|
|
// recognize and grant full permissions (it never enters the user allowlist). It
|
|
// is only generated when actually needed (JetStream required AND enforce on AND
|
|
// the server is embedded), so a standalone or non-enforce node is unchanged.
|
|
var internalID cs.Identity
|
|
var internalPubHex string
|
|
if needJS && enforce && *natsURL == "" {
|
|
internalID, err = cs.GenerateIdentity()
|
|
if err != nil {
|
|
log.Fatalf("generate internal identity: %v", err)
|
|
}
|
|
internalPubHex = hex.EncodeToString(internalID.SignPub)
|
|
}
|
|
|
|
// The authenticator consults the store through a holder so it can be built
|
|
// before the store exists: with --store kv the JetStream KV store opens only
|
|
// after NATS is up (the bootstrap cycle). In the default SQLite path the store
|
|
// is opened and set into the holder right here, before the server starts, so
|
|
// behavior is identical to the pre-0006c baseline. `store` is the final store
|
|
// used by the HTTP server (set below for the KV path).
|
|
holder := &storeHolder{}
|
|
var store membership.Store
|
|
if !decentralized {
|
|
store, err = membership.Open(*dbPath)
|
|
if err != nil {
|
|
log.Fatalf("open membership store: %v", err)
|
|
}
|
|
holder.set(store)
|
|
log.Printf("membership store: sqlite %s", *dbPath)
|
|
}
|
|
// Close whichever store ends up final (SQLite closes its file; the JetStream KV
|
|
// store's Close is a no-op — its NATS connection is closed separately).
|
|
defer func() {
|
|
if store != nil {
|
|
store.Close()
|
|
}
|
|
}()
|
|
|
|
blobs, err := blobstore.New(*storeDir)
|
|
if err != nil {
|
|
log.Fatalf("open blob store: %v", err)
|
|
}
|
|
log.Printf("blob store: %s", *storeDir)
|
|
|
|
// Data plane: embedded or external NATS. For the embedded server, enforce
|
|
// turns on the nkey authenticator (only allowlisted identities may connect)
|
|
// and --tls-cert/--tls-key turn on TLS. An external NATS manages its own
|
|
// auth/TLS, so those flags do not apply to it.
|
|
var ns *server.Server
|
|
natsClientURL := *natsURL
|
|
if natsClientURL == "" {
|
|
cfg := embeddednats.ServerConfig{
|
|
// Bind the embedded NATS to the same interface as the HTTP API so a
|
|
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
|
|
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
|
|
StoreDir: *natsStore,
|
|
Host: *bind,
|
|
Port: *natsPort,
|
|
ServerName: *serverName,
|
|
}
|
|
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
|
|
if *clusterName != "" {
|
|
// Inject the resolved secret into peer route URLs that omit userinfo, so
|
|
// the password need not appear in --routes argv (issue 0006f).
|
|
routes, rerr := injectRouteCreds(splitRoutes(*routesCSV), *clusterUser, clusterPassResolved)
|
|
if rerr != nil {
|
|
log.Fatalf("%v", rerr)
|
|
}
|
|
cc := &embeddednats.ClusterConfig{
|
|
Name: *clusterName,
|
|
Host: *bind,
|
|
Port: *clusterPort,
|
|
Routes: routes,
|
|
Username: *clusterUser,
|
|
Password: clusterPassResolved,
|
|
}
|
|
log.Printf("cluster route secret source: %s", passSource)
|
|
if *routeTLSCert != "" {
|
|
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
|
|
if err != nil {
|
|
log.Fatalf("load route TLS: %v", err)
|
|
}
|
|
cc.TLS = rtls
|
|
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
|
|
}
|
|
cfg.Cluster = cc
|
|
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
|
|
}
|
|
if authMode == membership.AuthEnforce {
|
|
// Per-subject data-plane ACL (audit H4 / N4 residual): the authenticator
|
|
// authorizes by the bus allowlist AND confines each connection to the
|
|
// subjects of the rooms it belongs to (plus client-infra subjects). This
|
|
// closes the wildcard metadata leak where a registered non-member could
|
|
// Subscribe(">") and harvest every room's subject and JetStream activity.
|
|
// NATS freezes permissions at connect time, so a peer that joins a room
|
|
// after connecting must client.RefreshSession to gain that room's subject.
|
|
cfg.Auth = busauth.NewNkeyAuthenticatorACLInternal(
|
|
holder.IsAuthorized,
|
|
busauth.PermissionsFromSubjects(holder.subjectACL),
|
|
internalPubHex,
|
|
)
|
|
log.Printf("NATS nkey authentication: ON (enforce, per-subject ACL)")
|
|
}
|
|
if *tlsCert != "" || *tlsKey != "" {
|
|
if *tlsCert == "" || *tlsKey == "" {
|
|
log.Fatalf("--tls-cert and --tls-key must be set together")
|
|
}
|
|
tlsCfg, err := busauth.ServerTLSConfig(*tlsCert, *tlsKey)
|
|
if err != nil {
|
|
log.Fatalf("load NATS TLS: %v", err)
|
|
}
|
|
cfg.TLS = tlsCfg
|
|
log.Printf("NATS TLS: ON (%s)", *tlsCert)
|
|
}
|
|
ns, err = embeddednats.StartServer(cfg)
|
|
if err != nil {
|
|
log.Fatalf("start embedded nats: %v", err)
|
|
}
|
|
natsClientURL = embeddednats.ClientURL(ns)
|
|
log.Printf("embedded NATS (JetStream) ready: %s", natsClientURL)
|
|
} else {
|
|
log.Printf("using external NATS: %s", natsClientURL)
|
|
}
|
|
|
|
// JetStream client + decentralized store (issue 0006a/c). needJS is set for a
|
|
// clustered node (shared nonce bucket) and for --store kv (the KV control
|
|
// plane). Open the privileged JetStream client first (in-process for the
|
|
// embedded server, a plain client for external NATS), then — for --store kv —
|
|
// open the replicated KV store and publish it into the holder so the
|
|
// authenticator and HTTP server serve from it. The privileged connection is the
|
|
// only client that can connect in this window (the holder still denies everyone
|
|
// else; the internal identity bypasses the store).
|
|
var js jetstream.JetStream
|
|
if needJS {
|
|
var internalNC *nats.Conn
|
|
if *natsURL == "" {
|
|
internalNC, js, err = connectInternalJS(ns, internalID, enforce)
|
|
} else {
|
|
internalNC, js, err = connectExternalJS(natsClientURL, *caFile)
|
|
}
|
|
if err != nil {
|
|
log.Fatalf("internal JetStream connection (required by --cluster-name/--store kv): %v", err)
|
|
}
|
|
defer internalNC.Close()
|
|
|
|
if decentralized {
|
|
kvStore, err := membership.OpenJetStream(js, membership.JetStreamConfig{Replicas: *kvReplicas})
|
|
if err != nil {
|
|
log.Fatalf("open decentralized control-plane KV store: %v", err)
|
|
}
|
|
store = kvStore
|
|
holder.set(store)
|
|
log.Printf("membership store: jetstream KV (replicas=%d)", *kvReplicas)
|
|
}
|
|
}
|
|
|
|
srv := membership.NewServer(store, blobs, authMode)
|
|
// On a public (non-loopback) bind, disable cleartext rooms: the embedded NATS
|
|
// has no per-subject ACL, so cleartext content would be readable by any
|
|
// registered peer. Forcing E2E keeps message content confidential regardless
|
|
// (audit H4 minimum defense; see dev/0004d-dataplane-acl.md).
|
|
if !isLoopbackBind(*bind) {
|
|
srv.RequireEncryptedRooms = true
|
|
log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)")
|
|
}
|
|
// Publish this node's posture on /healthz so a monitor (or a peer) can detect a
|
|
// cluster member not running the homogeneous enforce+ACL+TLS posture (audit
|
|
// 0008 N1). enforce implies the per-subject ACL in this binary (they are wired
|
|
// together above).
|
|
srv.Posture = membership.Posture{
|
|
Enforce: enforce,
|
|
ACL: enforce,
|
|
TLS: *tlsCert != "",
|
|
Cluster: clustered,
|
|
Store: *storeBackend,
|
|
}
|
|
|
|
// Replicated anti-replay (issue 0006a, audit 0008 N3): a clustered node MUST
|
|
// share its nonce store across the cluster, or a request accepted on one node
|
|
// can be replayed to another. HARD requirement: if the bucket cannot be created
|
|
// the node refuses to start rather than run with a per-process cache that leaves
|
|
// the replay hole open.
|
|
if needJS {
|
|
if err := wireReplicatedNonces(srv, js, clustered, *kvReplicas); err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
if clustered {
|
|
log.Printf("anti-replay: replicated nonce bucket \"KV_UNIBUS_nonces\" (replicas=%d) — cluster-safe", *kvReplicas)
|
|
}
|
|
}
|
|
|
|
log.Printf("control-plane auth: %s", authMode)
|
|
addr := *bind + ":" + *httpPort
|
|
httpSrv := &http.Server{
|
|
Addr: addr,
|
|
Handler: srv,
|
|
// Bound request header size so a peer cannot exhaust memory with huge
|
|
// headers before any body limit applies (the body ceilings live in the
|
|
// membership middleware).
|
|
MaxHeaderBytes: membership.MaxHeaderBytes,
|
|
ReadHeaderTimeout: 10 * time.Second,
|
|
}
|
|
|
|
go func() {
|
|
var serveErr error
|
|
if *tlsCert != "" {
|
|
// Serve the control plane over TLS with the same CA-signed cert as the
|
|
// data plane (audit H5): metadata (subjects, pubkeys, sealed keys, the
|
|
// social graph) is no longer readable by a network MITM. The fail-open
|
|
// guard already requires --bus-auth enforce alongside these flags.
|
|
httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
|
|
log.Printf("HTTPS control-plane API: https://%s", addr)
|
|
log.Printf(" health: https://%s/healthz", addr)
|
|
log.Printf("control-plane TLS: ON (%s)", *tlsCert)
|
|
serveErr = httpSrv.ListenAndServeTLS(*tlsCert, *tlsKey)
|
|
} else {
|
|
log.Printf("HTTP control-plane API: http://%s", addr)
|
|
log.Printf(" health: http://%s/healthz", addr)
|
|
serveErr = httpSrv.ListenAndServe()
|
|
}
|
|
if serveErr != nil && serveErr != http.ErrServerClosed {
|
|
log.Fatalf("http server: %v", serveErr)
|
|
}
|
|
}()
|
|
|
|
// Graceful shutdown on SIGINT/SIGTERM.
|
|
stop := make(chan os.Signal, 1)
|
|
signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
|
|
<-stop
|
|
log.Printf("shutting down...")
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer cancel()
|
|
_ = httpSrv.Shutdown(ctx)
|
|
if ns != nil {
|
|
ns.Shutdown()
|
|
ns.WaitForShutdown()
|
|
}
|
|
log.Printf("bye")
|
|
}
|