// Command membershipd is the unibus control-plane service: room metadata, // member directory, sealed key distribution, and the media blob store. The data // plane is NATS — if --nats-url is empty it starts an embedded nats-server with // JetStream so the whole stack runs with `go run` and nothing to install. package main import ( "context" "crypto/tls" "encoding/hex" "flag" "log" "net/http" "os" "os/signal" "syscall" "time" cs "fn-registry/functions/cybersecurity" "github.com/nats-io/nats.go" "github.com/nats-io/nats.go/jetstream" server "github.com/nats-io/nats-server/v2/server" "github.com/enmanuel/unibus/pkg/blobstore" "github.com/enmanuel/unibus/pkg/busauth" "github.com/enmanuel/unibus/pkg/embeddednats" "github.com/enmanuel/unibus/pkg/membership" ) func main() { // Subcommand dispatch: `membershipd user ...` is the local administration CLI // (seed/list/revoke bus users) and must be handled before the server flag set // parses os.Args. Running the CLI on the bus host is trusted by design (whoever // has a shell there already controls the service), which is how the first admin // is seeded without a chicken-egg auth problem. if len(os.Args) > 1 && os.Args[1] == "user" { runUserCLI(os.Args[2:]) return } // `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV // data move for decentralization (issue 0003c). Like the user CLI it runs on // the host and is dispatched before the server flag set parses os.Args. if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" { runMigrateCLI(os.Args[2:]) return } var ( bind = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers") natsURL = flag.String("nats-url", "", "external NATS url; empty starts an embedded server") httpPort = flag.String("http-port", "8470", "HTTP port for the control-plane API") dbPath = flag.String("db", "./local_files/unibus.db", "SQLite database path") storeDir = flag.String("store-dir", "./local_files/blobs", "blob store directory") natsPort = flag.Int("nats-port", 4250, "embedded NATS listen port (when --nats-url empty)") natsStore = flag.String("nats-store", "./local_files/jetstream", "embedded JetStream store dir") busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)") tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane") tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert") // Cluster (issue 0003a): empty --cluster-name keeps the server standalone. clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA") serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)") clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic") routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250") clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)") clusterPass = flag.String("cluster-pass", "", "shared route secret password (argv-visible — prefer --cluster-pass-file or UNIBUS_CLUSTER_PASS)") // Secret out of argv (issue 0006f, audit 0008 N1-low): a password in // --cluster-pass / --routes is visible in ps/journald. Prefer a file or the // UNIBUS_CLUSTER_PASS env var; routes may then omit userinfo and the secret // is injected from here. clusterPassFile = flag.String("cluster-pass-file", "", "path to a file holding the cluster route password (preferred over --cluster-pass; keeps the secret out of argv)") routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca") routeTLSKey = flag.String("route-tls-key", "", "this node's route private key") routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)") // Replicated control plane (issue 0006a/c): the JetStream replication factor // for the shared nonce bucket (and, with --store kv, the control-plane KV). // 1 for a 1-2 node rollout, 3 for real HA quorum (raise in place with // `nats stream update --replicas 3` when the third node joins). kvReplicas = flag.Int("kv-replicas", 1, "JetStream replication factor for the shared nonce/KV buckets (1..3)") caFile = flag.String("ca", "", "bus CA cert; only used to pin TLS on the internal JetStream connection to an EXTERNAL --nats-url (the embedded server uses an in-process connection that needs no CA)") // Control-plane store backend (issue 0006c, feature flag decentralized): // "sqlite" (default) keeps the local single-node SQLite control plane; // "kv" puts rooms/members/keys/users in replicated JetStream KV so any node // in the cluster serves the same state. storeBackend = flag.String("store", "sqlite", "control-plane store backend: sqlite (default, single-node) | kv (replicated JetStream, decentralized)") ) flag.Parse() authMode, err := membership.ParseAuthMode(*busAuth) if err != nil { log.Fatalf("%v", err) } if *storeBackend != "sqlite" && *storeBackend != "kv" { log.Fatalf("--store must be \"sqlite\" or \"kv\", got %q", *storeBackend) } // Resolve the cluster route secret out of argv (file/env preferred). The // resolved value (not *clusterPass) is what guards the route layer and is // injected into peer route URLs below. clusterPassResolved, passSource, err := resolveClusterPass(*clusterPass, *clusterPassFile, os.Getenv("UNIBUS_CLUSTER_PASS")) if err != nil { log.Fatalf("%v", err) } // Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands // --bus-auth enforce. This makes an insecure public startup impossible rather // than silently exposing the bus with the appearance of security. if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil { log.Fatalf("%v", err) } // Cluster route guard (issue 0003a): a public cluster needs a route secret // and mutual route TLS, and the route-TLS flags are all-or-nothing. if err := validateClusterConfig(*clusterName, *bind, *clusterUser, clusterPassResolved, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil { log.Fatalf("%v", err) } log.SetFlags(log.LstdFlags | log.Lmsgprefix) log.SetPrefix("[membershipd] ") // A clustered node shares its control plane with peers, so it needs a JetStream // client to manage the replicated nonce bucket (issue 0006a). --store kv (issue // 0006c) also needs JetStream, for the control-plane KV itself. A standalone // single-node SQLite deployment needs none of this and keeps the in-process, // in-memory behavior unchanged. clustered := *clusterName != "" decentralized := *storeBackend == "kv" needJS := clustered || decentralized enforce := authMode == membership.AuthEnforce // Internal service identity (issue 0006a): when the embedded data plane enforces // auth, membershipd must still connect to its OWN server to manage JetStream. // It does so with this ephemeral identity, which the authenticator is built to // recognize and grant full permissions (it never enters the user allowlist). It // is only generated when actually needed (JetStream required AND enforce on AND // the server is embedded), so a standalone or non-enforce node is unchanged. var internalID cs.Identity var internalPubHex string if needJS && enforce && *natsURL == "" { internalID, err = cs.GenerateIdentity() if err != nil { log.Fatalf("generate internal identity: %v", err) } internalPubHex = hex.EncodeToString(internalID.SignPub) } // The authenticator consults the store through a holder so it can be built // before the store exists: with --store kv the JetStream KV store opens only // after NATS is up (the bootstrap cycle). In the default SQLite path the store // is opened and set into the holder right here, before the server starts, so // behavior is identical to the pre-0006c baseline. `store` is the final store // used by the HTTP server (set below for the KV path). holder := &storeHolder{} var store membership.Store if !decentralized { store, err = membership.Open(*dbPath) if err != nil { log.Fatalf("open membership store: %v", err) } holder.set(store) log.Printf("membership store: sqlite %s", *dbPath) } // Close whichever store ends up final (SQLite closes its file; the JetStream KV // store's Close is a no-op — its NATS connection is closed separately). defer func() { if store != nil { store.Close() } }() blobs, err := blobstore.New(*storeDir) if err != nil { log.Fatalf("open blob store: %v", err) } log.Printf("blob store: %s", *storeDir) // Data plane: embedded or external NATS. For the embedded server, enforce // turns on the nkey authenticator (only allowlisted identities may connect) // and --tls-cert/--tls-key turn on TLS. An external NATS manages its own // auth/TLS, so those flags do not apply to it. var ns *server.Server natsClientURL := *natsURL if natsClientURL == "" { cfg := embeddednats.ServerConfig{ // Bind the embedded NATS to the same interface as the HTTP API so a // single --bind flag governs reachability: 127.0.0.1 keeps the whole // stack loopback-only; 0.0.0.0 exposes both planes to the LAN. StoreDir: *natsStore, Host: *bind, Port: *natsPort, ServerName: *serverName, } // Cluster (issue 0003a): with a cluster name, join the route layer for HA. if *clusterName != "" { // Inject the resolved secret into peer route URLs that omit userinfo, so // the password need not appear in --routes argv (issue 0006f). routes, rerr := injectRouteCreds(splitRoutes(*routesCSV), *clusterUser, clusterPassResolved) if rerr != nil { log.Fatalf("%v", rerr) } cc := &embeddednats.ClusterConfig{ Name: *clusterName, Host: *bind, Port: *clusterPort, Routes: routes, Username: *clusterUser, Password: clusterPassResolved, } log.Printf("cluster route secret source: %s", passSource) if *routeTLSCert != "" { rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA) if err != nil { log.Fatalf("load route TLS: %v", err) } cc.TLS = rtls log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA) } cfg.Cluster = cc log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes)) } if authMode == membership.AuthEnforce { // Per-subject data-plane ACL (audit H4 / N4 residual): the authenticator // authorizes by the bus allowlist AND confines each connection to the // subjects of the rooms it belongs to (plus client-infra subjects). This // closes the wildcard metadata leak where a registered non-member could // Subscribe(">") and harvest every room's subject and JetStream activity. // NATS freezes permissions at connect time, so a peer that joins a room // after connecting must client.RefreshSession to gain that room's subject. cfg.Auth = busauth.NewNkeyAuthenticatorACLInternal( holder.IsAuthorized, busauth.PermissionsFromSubjects(holder.subjectACL), internalPubHex, ) log.Printf("NATS nkey authentication: ON (enforce, per-subject ACL)") } if *tlsCert != "" || *tlsKey != "" { if *tlsCert == "" || *tlsKey == "" { log.Fatalf("--tls-cert and --tls-key must be set together") } tlsCfg, err := busauth.ServerTLSConfig(*tlsCert, *tlsKey) if err != nil { log.Fatalf("load NATS TLS: %v", err) } cfg.TLS = tlsCfg log.Printf("NATS TLS: ON (%s)", *tlsCert) } ns, err = embeddednats.StartServer(cfg) if err != nil { log.Fatalf("start embedded nats: %v", err) } natsClientURL = embeddednats.ClientURL(ns) log.Printf("embedded NATS (JetStream) ready: %s", natsClientURL) } else { log.Printf("using external NATS: %s", natsClientURL) } // JetStream client + decentralized store (issue 0006a/c). needJS is set for a // clustered node (shared nonce bucket) and for --store kv (the KV control // plane). Open the privileged JetStream client first (in-process for the // embedded server, a plain client for external NATS), then — for --store kv — // open the replicated KV store and publish it into the holder so the // authenticator and HTTP server serve from it. The privileged connection is the // only client that can connect in this window (the holder still denies everyone // else; the internal identity bypasses the store). var js jetstream.JetStream if needJS { var internalNC *nats.Conn if *natsURL == "" { internalNC, js, err = connectInternalJS(ns, internalID, enforce) } else { internalNC, js, err = connectExternalJS(natsClientURL, *caFile) } if err != nil { log.Fatalf("internal JetStream connection (required by --cluster-name/--store kv): %v", err) } defer internalNC.Close() if decentralized { kvStore, err := membership.OpenJetStream(js, membership.JetStreamConfig{Replicas: *kvReplicas}) if err != nil { log.Fatalf("open decentralized control-plane KV store: %v", err) } store = kvStore holder.set(store) log.Printf("membership store: jetstream KV (replicas=%d)", *kvReplicas) } } srv := membership.NewServer(store, blobs, authMode) // On a public (non-loopback) bind, disable cleartext rooms: the embedded NATS // has no per-subject ACL, so cleartext content would be readable by any // registered peer. Forcing E2E keeps message content confidential regardless // (audit H4 minimum defense; see dev/0004d-dataplane-acl.md). if !isLoopbackBind(*bind) { srv.RequireEncryptedRooms = true log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)") } // Publish this node's posture on /healthz so a monitor (or a peer) can detect a // cluster member not running the homogeneous enforce+ACL+TLS posture (audit // 0008 N1). enforce implies the per-subject ACL in this binary (they are wired // together above). srv.Posture = membership.Posture{ Enforce: enforce, ACL: enforce, TLS: *tlsCert != "", Cluster: clustered, Store: *storeBackend, } // Replicated anti-replay (issue 0006a, audit 0008 N3): a clustered node MUST // share its nonce store across the cluster, or a request accepted on one node // can be replayed to another. HARD requirement: if the bucket cannot be created // the node refuses to start rather than run with a per-process cache that leaves // the replay hole open. if needJS { if err := wireReplicatedNonces(srv, js, clustered, *kvReplicas); err != nil { log.Fatalf("%v", err) } if clustered { log.Printf("anti-replay: replicated nonce bucket \"KV_UNIBUS_nonces\" (replicas=%d) — cluster-safe", *kvReplicas) } } log.Printf("control-plane auth: %s", authMode) addr := *bind + ":" + *httpPort httpSrv := &http.Server{ Addr: addr, Handler: srv, // Bound request header size so a peer cannot exhaust memory with huge // headers before any body limit applies (the body ceilings live in the // membership middleware). MaxHeaderBytes: membership.MaxHeaderBytes, ReadHeaderTimeout: 10 * time.Second, } go func() { var serveErr error if *tlsCert != "" { // Serve the control plane over TLS with the same CA-signed cert as the // data plane (audit H5): metadata (subjects, pubkeys, sealed keys, the // social graph) is no longer readable by a network MITM. The fail-open // guard already requires --bus-auth enforce alongside these flags. httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12} log.Printf("HTTPS control-plane API: https://%s", addr) log.Printf(" health: https://%s/healthz", addr) log.Printf("control-plane TLS: ON (%s)", *tlsCert) serveErr = httpSrv.ListenAndServeTLS(*tlsCert, *tlsKey) } else { log.Printf("HTTP control-plane API: http://%s", addr) log.Printf(" health: http://%s/healthz", addr) serveErr = httpSrv.ListenAndServe() } if serveErr != nil && serveErr != http.ErrServerClosed { log.Fatalf("http server: %v", serveErr) } }() // Graceful shutdown on SIGINT/SIGTERM. stop := make(chan os.Signal, 1) signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM) <-stop log.Printf("shutting down...") ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() _ = httpSrv.Shutdown(ctx) if ns != nil { ns.Shutdown() ns.WaitForShutdown() } log.Printf("bye") }