// Package embeddednats starts an in-process NATS server with JetStream enabled. // // This lets the whole unibus stack run with `go run` without installing or // managing a separate NATS deployment. In production, point clients at an // external NATS via the --nats-url flag instead of using this. package embeddednats import ( "crypto/tls" "fmt" "net/url" "os" "time" server "github.com/nats-io/nats-server/v2/server" ) // ClusterConfig configures the route layer that links several embedded NATS // servers into a single cluster (issue 0003a). It is the data-plane side of // high availability: with a cluster, a client subject published on one node is // forwarded to subscribers connected to any other node, and (with JetStream // replicas > 1) streams/KV are RAFT-replicated across nodes so the loss of one // node does not lose the bus. // // The route layer is a SEPARATE trust boundary from the client data plane: it // carries server-to-server traffic, so it authenticates NODES, not bus users. // Never reuse the nkey client authenticator here. Routes are secured with their // own shared secret (Username/Password -> NATS Cluster.Authorization) and their // own mutual TLS (TLS, built from the bus CA with busauth.RouteTLSConfig): a // node without the cluster secret and a CA-signed node certificate cannot join // the cluster nor inject messages into it. type ClusterConfig struct { // Name is the cluster name; it MUST be identical on every node or the // servers refuse to gossip routes to each other. Name string // Host and Port are the route listener (server-to-server), distinct from the // client Host/Port. Use a free, non-client port (e.g. 6250). Host string Port int // Routes are the nats-route URLs of the OTHER nodes, e.g. // "nats://user:pass@10.0.0.2:6250". When the route layer is password // protected each URL must carry the same userinfo as the local Username / // Password so this node authenticates outbound to its peers. Routes []string // Username and Password gate the route listener (NATS Cluster.Authorization). // A peer (or impostor) that connects to this node's route port without these // credentials is rejected, so it never becomes a route. Empty disables route // auth (dev / trusted-network only). Username string Password string // TLS, when non-nil, secures the route connections with mutual TLS. Build it // with busauth.RouteTLSConfig(cert, key, ca): the server presents its node // certificate AND requires+verifies the connecting node's certificate against // the bus CA, so an unsigned impostor cannot establish a route even with the // right password. Nil keeps routes plaintext (dev / WireGuard-only). TLS *tls.Config } // ServerConfig is the full set of knobs for the embedded NATS server. The zero // value (empty StoreDir aside) yields a dev-friendly server: JetStream on, bound // to all interfaces, no client auth, no TLS, standalone (no cluster). Secured // deployments set Auth and TLS; HA deployments set ServerName + Cluster; tests // set Host to loopback and a free Port. type ServerConfig struct { StoreDir string // JetStream store directory Host string // bind interface; "" = nats-server default ("0.0.0.0") Port int // listen port // ServerName is this node's unique name within the cluster. JetStream's RAFT // layer requires a stable, unique name per node to form its meta-group; leave // it empty for a standalone server (nats-server then auto-generates one). ServerName string // Auth, when non-nil, is installed as CustomClientAuthentication so the data // plane only accepts approved clients (nkey signature + bus allowlist). Auth server.Authentication // TLS, when non-nil, makes the server present a certificate and require TLS // on the data plane. Clients must trust the issuing CA (see busauth). TLS *tls.Config // Cluster, when non-nil, joins this server to a route cluster for high // availability (issue 0003a). Nil keeps the server standalone (the legacy // single-node behavior). Cluster *ClusterConfig } // Start is a thin backward-compatible wrapper: embedded JetStream server on the // default interface, no auth, no TLS. func Start(storeDir string, port int) (*server.Server, error) { return StartServer(ServerConfig{StoreDir: storeDir, Port: port}) } // StartHost is Start with explicit control over the bind interface. host selects // which network interface the data plane listens on: pass "127.0.0.1" to keep // NATS loopback-only (the safe default for a single-host dev stack) or "0.0.0.0" // to expose it to the LAN so remote peers (phones, other PCs) can connect. An // empty host falls back to the nats-server default ("0.0.0.0", all interfaces). func StartHost(storeDir, host string, port int) (*server.Server, error) { return StartServer(ServerConfig{StoreDir: storeDir, Host: host, Port: port}) } // StartHostAuth is StartHost with an optional custom client authenticator. When // auth is non-nil only clients the authenticator approves may connect; when nil // the server accepts any client (legacy, network-trusted behavior). func StartHostAuth(storeDir, host string, port int, auth server.Authentication) (*server.Server, error) { return StartServer(ServerConfig{StoreDir: storeDir, Host: host, Port: port, Auth: auth}) } // StartServer launches an embedded nats-server with JetStream from cfg. It // blocks until the server is ready to accept connections (up to 5s) and returns // the running server; the caller must Shutdown it. func StartServer(cfg ServerConfig) (*server.Server, error) { // Diagnostic toggle: UNIBUS_NATS_DEBUG=1 enables the embedded nats-server's own // logger (route/RAFT/JetStream errors), which is otherwise silenced. Off by // default so production behavior is unchanged; only set it when debugging the // cluster route layer. debugLevel := os.Getenv("UNIBUS_NATS_DEBUG") debugNATS := debugLevel == "1" || debugLevel == "2" traceNATS := debugLevel == "2" opts := &server.Options{ JetStream: true, StoreDir: cfg.StoreDir, Host: cfg.Host, Port: cfg.Port, ServerName: cfg.ServerName, DontListen: false, // Keep the embedded server quiet by default; the host app logs the URLs. NoLog: !debugNATS, Debug: debugNATS, Trace: traceNATS, Logtime: true, NoSigs: true, } if debugNATS { // Expose the nats-server monitoring endpoint (loopback) so the operator can // inspect /jsz, /routez, /varz while debugging the cluster meta-group. opts.HTTPHost = "127.0.0.1" opts.HTTPPort = 8222 } if cfg.Auth != nil { opts.CustomClientAuthentication = cfg.Auth // A CustomClientAuthentication alone does not make the server advertise a // nonce in its INFO line, and nats.go refuses to connect with an nkey to a // server that does not ("nkeys not supported by the server"). Forcing the // nonce makes nkey clients sign the challenge our authenticator verifies. opts.AlwaysEnableNonce = true } if cfg.TLS != nil { opts.TLSConfig = cfg.TLS opts.TLS = true } if cfg.Cluster != nil { if err := applyClusterOpts(opts, cfg.Cluster); err != nil { return nil, err } } ns, err := server.NewServer(opts) if err != nil { return nil, fmt.Errorf("embeddednats: new server: %w", err) } if debugNATS { ns.ConfigureLogger() } go ns.Start() if !ns.ReadyForConnections(5 * time.Second) { ns.Shutdown() return nil, fmt.Errorf("embeddednats: server not ready for connections within 5s") } return ns, nil } // applyClusterOpts translates a ClusterConfig into the nats-server route options // on opts: the cluster listener (name + host/port + shared-secret auth + mutual // TLS) and the outbound routes to the other nodes. A malformed route URL is a // configuration error and aborts startup rather than silently dropping a peer. func applyClusterOpts(opts *server.Options, c *ClusterConfig) error { opts.Cluster = server.ClusterOpts{ Name: c.Name, Host: c.Host, Port: c.Port, Username: c.Username, Password: c.Password, // Disable route connection pooling (nats-server 2.10+ defaults to a pool of // 3 connections per peer). On a small cluster the pool churns with // "duplicate route"/"client closed" reconnects that interrupt the meta-group // RAFT heartbeats, causing perpetual leader re-elections so the JetStream // meta never becomes current and stream/KV creation hangs (issue 0006g). // PoolSize=-1 forces the classic single route per peer, which is stable for // the 3-node unibus cluster. PoolSize: -1, // NoAdvertise stops the server from gossiping its locally-discovered IPs to // peers. The cluster nodes are Docker hosts, so without this NATS advertises // the docker bridge addresses (172.x / 10.0.x) as reachable routes; peers // then try to dial those private, mutually-unreachable IPs, churning the // route layer and destabilizing the JetStream meta-group. With NoAdvertise // the nodes use ONLY the explicit public-IP routes we configure (issue 0006g). NoAdvertise: true, } if c.TLS != nil { opts.Cluster.TLSConfig = c.TLS // A generous handshake budget: route TLS does a mutual handshake and the // peer may still be booting. The default 2s can flap on a cold cluster. opts.Cluster.TLSTimeout = 5.0 } for _, r := range c.Routes { u, err := url.Parse(r) if err != nil { return fmt.Errorf("embeddednats: parse route %q: %w", r, err) } opts.Routes = append(opts.Routes, u) } return nil } // ClientURL returns a NATS connection URL for the running embedded server. func ClientURL(ns *server.Server) string { return ns.ClientURL() }