9013ea5e33
The one-time data move decentralization needs (issue 0003c): copy the entire control-plane state from the local SQLite database into the replicated JetStream KV buckets, with a backup taken first. pkg/membership: - Snapshot / SealedKeyRecord: a backend-agnostic dump of the whole control plane (rooms with their real epoch, members, every sealed-key row across epochs, users with status). - (*sqliteStore).ExportSnapshot and (*jetstreamStore).ExportSnapshot read a full Snapshot from each backend; (*jetstreamStore).importSnapshot writes one with raw Puts (preserving epoch/status, not resetting to defaults) so the migration is faithful and idempotent (every write is an overwrite, so re-running converges). - MigrateSQLiteToKV orchestrates export -> import; BackupSQLite makes a consistent copy via SQLite's VACUUM INTO before any migration. cmd/membershipd: - `membershipd migrate-to-kv --db <path> --nats-url <url> [--replicas N] [--ca <cert>] [--no-backup]` backs up the SQLite file, connects to the cluster's NATS, and migrates. Dispatched on the host like `user`. Tests (DoD: golden + edge + parity): - TestMigrateSQLiteToKVParity: seed a representative SQLite (two rooms, one rekeyed to epoch 2, members, a revoked user); after migration the KV ExportSnapshot equals the SQLite ExportSnapshot. - TestMigrateSQLiteToKVIdempotent: running the migration twice yields the same KV state. - TestBackupSQLiteCreatesConsistentCopy: the backup reopens with identical data. Plus a binary smoke (seed user -> run server -> migrate-to-kv -> re-run): backup written, 1 user migrated, second run identical.
223 lines
9.2 KiB
Go
223 lines
9.2 KiB
Go
// Command membershipd is the unibus control-plane service: room metadata,
// member directory, sealed key distribution, and the media blob store. The data
// plane is NATS — if --nats-url is empty it starts an embedded nats-server with
// JetStream so the whole stack runs with `go run` and nothing to install.
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"flag"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
server "github.com/nats-io/nats-server/v2/server"
|
|
|
|
"github.com/enmanuel/unibus/pkg/blobstore"
|
|
"github.com/enmanuel/unibus/pkg/busauth"
|
|
"github.com/enmanuel/unibus/pkg/embeddednats"
|
|
"github.com/enmanuel/unibus/pkg/membership"
|
|
)
|
|
|
|
func main() {
|
|
// Subcommand dispatch: `membershipd user ...` is the local administration CLI
|
|
// (seed/list/revoke bus users) and must be handled before the server flag set
|
|
// parses os.Args. Running the CLI on the bus host is trusted by design (whoever
|
|
// has a shell there already controls the service), which is how the first admin
|
|
// is seeded without a chicken-egg auth problem.
|
|
if len(os.Args) > 1 && os.Args[1] == "user" {
|
|
runUserCLI(os.Args[2:])
|
|
return
|
|
}
|
|
// `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV
|
|
// data move for decentralization (issue 0003c). Like the user CLI it runs on
|
|
// the host and is dispatched before the server flag set parses os.Args.
|
|
if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" {
|
|
runMigrateCLI(os.Args[2:])
|
|
return
|
|
}
|
|
|
|
var (
|
|
bind = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers")
|
|
natsURL = flag.String("nats-url", "", "external NATS url; empty starts an embedded server")
|
|
httpPort = flag.String("http-port", "8470", "HTTP port for the control-plane API")
|
|
dbPath = flag.String("db", "./local_files/unibus.db", "SQLite database path")
|
|
storeDir = flag.String("store-dir", "./local_files/blobs", "blob store directory")
|
|
natsPort = flag.Int("nats-port", 4250, "embedded NATS listen port (when --nats-url empty)")
|
|
natsStore = flag.String("nats-store", "./local_files/jetstream", "embedded JetStream store dir")
|
|
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
|
|
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
|
|
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
|
|
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
|
|
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
|
|
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
|
|
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
|
|
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
|
|
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
|
|
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
|
|
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
|
|
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
|
|
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
|
|
)
|
|
flag.Parse()
|
|
|
|
authMode, err := membership.ParseAuthMode(*busAuth)
|
|
if err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
|
|
// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
|
|
// --bus-auth enforce. This makes an insecure public startup impossible rather
|
|
// than silently exposing the bus with the appearance of security.
|
|
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
// Cluster route guard (issue 0003a): a public cluster needs a route secret
|
|
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
|
|
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA); err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
|
|
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
|
|
log.SetPrefix("[membershipd] ")
|
|
|
|
// Control plane store first: the NATS authenticator consults IsAuthorized, so
|
|
// the store must exist before the embedded server starts.
|
|
store, err := membership.Open(*dbPath)
|
|
if err != nil {
|
|
log.Fatalf("open membership store: %v", err)
|
|
}
|
|
defer store.Close()
|
|
log.Printf("membership store: %s", *dbPath)
|
|
|
|
blobs, err := blobstore.New(*storeDir)
|
|
if err != nil {
|
|
log.Fatalf("open blob store: %v", err)
|
|
}
|
|
log.Printf("blob store: %s", *storeDir)
|
|
|
|
// Data plane: embedded or external NATS. For the embedded server, enforce
|
|
// turns on the nkey authenticator (only allowlisted identities may connect)
|
|
// and --tls-cert/--tls-key turn on TLS. An external NATS manages its own
|
|
// auth/TLS, so those flags do not apply to it.
|
|
var ns *server.Server
|
|
natsClientURL := *natsURL
|
|
if natsClientURL == "" {
|
|
cfg := embeddednats.ServerConfig{
|
|
// Bind the embedded NATS to the same interface as the HTTP API so a
|
|
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
|
|
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
|
|
StoreDir: *natsStore,
|
|
Host: *bind,
|
|
Port: *natsPort,
|
|
ServerName: *serverName,
|
|
}
|
|
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
|
|
if *clusterName != "" {
|
|
cc := &embeddednats.ClusterConfig{
|
|
Name: *clusterName,
|
|
Host: *bind,
|
|
Port: *clusterPort,
|
|
Routes: splitRoutes(*routesCSV),
|
|
Username: *clusterUser,
|
|
Password: *clusterPass,
|
|
}
|
|
if *routeTLSCert != "" {
|
|
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
|
|
if err != nil {
|
|
log.Fatalf("load route TLS: %v", err)
|
|
}
|
|
cc.TLS = rtls
|
|
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
|
|
}
|
|
cfg.Cluster = cc
|
|
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
|
|
}
|
|
if authMode == membership.AuthEnforce {
|
|
cfg.Auth = busauth.NewNkeyAuthenticator(store.IsAuthorized)
|
|
log.Printf("NATS nkey authentication: ON (enforce)")
|
|
}
|
|
if *tlsCert != "" || *tlsKey != "" {
|
|
if *tlsCert == "" || *tlsKey == "" {
|
|
log.Fatalf("--tls-cert and --tls-key must be set together")
|
|
}
|
|
tlsCfg, err := busauth.ServerTLSConfig(*tlsCert, *tlsKey)
|
|
if err != nil {
|
|
log.Fatalf("load NATS TLS: %v", err)
|
|
}
|
|
cfg.TLS = tlsCfg
|
|
log.Printf("NATS TLS: ON (%s)", *tlsCert)
|
|
}
|
|
ns, err = embeddednats.StartServer(cfg)
|
|
if err != nil {
|
|
log.Fatalf("start embedded nats: %v", err)
|
|
}
|
|
natsClientURL = embeddednats.ClientURL(ns)
|
|
log.Printf("embedded NATS (JetStream) ready: %s", natsClientURL)
|
|
} else {
|
|
log.Printf("using external NATS: %s", natsClientURL)
|
|
}
|
|
|
|
srv := membership.NewServer(store, blobs, authMode)
|
|
// On a public (non-loopback) bind, disable cleartext rooms: the embedded NATS
|
|
// has no per-subject ACL, so cleartext content would be readable by any
|
|
// registered peer. Forcing E2E keeps message content confidential regardless
|
|
// (audit H4 minimum defense; see dev/0004d-dataplane-acl.md).
|
|
if !isLoopbackBind(*bind) {
|
|
srv.RequireEncryptedRooms = true
|
|
log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)")
|
|
}
|
|
log.Printf("control-plane auth: %s", authMode)
|
|
addr := *bind + ":" + *httpPort
|
|
httpSrv := &http.Server{
|
|
Addr: addr,
|
|
Handler: srv,
|
|
// Bound request header size so a peer cannot exhaust memory with huge
|
|
// headers before any body limit applies (the body ceilings live in the
|
|
// membership middleware).
|
|
MaxHeaderBytes: membership.MaxHeaderBytes,
|
|
ReadHeaderTimeout: 10 * time.Second,
|
|
}
|
|
|
|
go func() {
|
|
var serveErr error
|
|
if *tlsCert != "" {
|
|
// Serve the control plane over TLS with the same CA-signed cert as the
|
|
// data plane (audit H5): metadata (subjects, pubkeys, sealed keys, the
|
|
// social graph) is no longer readable by a network MITM. The fail-open
|
|
// guard already requires --bus-auth enforce alongside these flags.
|
|
httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
|
|
log.Printf("HTTPS control-plane API: https://%s", addr)
|
|
log.Printf(" health: https://%s/healthz", addr)
|
|
log.Printf("control-plane TLS: ON (%s)", *tlsCert)
|
|
serveErr = httpSrv.ListenAndServeTLS(*tlsCert, *tlsKey)
|
|
} else {
|
|
log.Printf("HTTP control-plane API: http://%s", addr)
|
|
log.Printf(" health: http://%s/healthz", addr)
|
|
serveErr = httpSrv.ListenAndServe()
|
|
}
|
|
if serveErr != nil && serveErr != http.ErrServerClosed {
|
|
log.Fatalf("http server: %v", serveErr)
|
|
}
|
|
}()
|
|
|
|
// Graceful shutdown on SIGINT/SIGTERM.
|
|
stop := make(chan os.Signal, 1)
|
|
signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
|
|
<-stop
|
|
log.Printf("shutting down...")
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer cancel()
|
|
_ = httpSrv.Shutdown(ctx)
|
|
if ns != nil {
|
|
ns.Shutdown()
|
|
ns.WaitForShutdown()
|
|
}
|
|
log.Printf("bye")
|
|
}
|