Files
unibus/cmd/membershipd/main.go
T
agent 9013ea5e33 feat(0003c): membershipd migrate-to-kv (idempotent SQLite -> JetStream KV)
The one-time data move decentralization needs (issue 0003c): copy the
entire control-plane state from the local SQLite database into the
replicated JetStream KV buckets, with a backup taken first.

pkg/membership:
- Snapshot / SealedKeyRecord: a backend-agnostic dump of the whole
  control plane (rooms with their real epoch, members, every sealed-key
  row across epochs, users with status).
- (*sqliteStore).ExportSnapshot and (*jetstreamStore).ExportSnapshot read
  a full Snapshot from each backend; (*jetstreamStore).importSnapshot
  writes one with raw Puts (preserving epoch/status, not resetting to
  defaults) so the migration is faithful and idempotent (every write is
  an overwrite, so re-running converges).
- MigrateSQLiteToKV orchestrates export -> import; BackupSQLite makes a
  consistent copy via SQLite's VACUUM INTO before any migration.

cmd/membershipd:
- `membershipd migrate-to-kv --db <path> --nats-url <url> [--replicas N]
  [--ca <cert>] [--no-backup]` backs up the SQLite file, connects to the
  cluster's NATS, and migrates. Dispatched on the host like `user`.

Tests (DoD: golden + edge + parity):
- TestMigrateSQLiteToKVParity: seed a representative SQLite (two rooms,
  one rekeyed to epoch 2, members, a revoked user); after migration the
  KV ExportSnapshot equals the SQLite ExportSnapshot.
- TestMigrateSQLiteToKVIdempotent: running the migration twice yields the
  same KV state.
- TestBackupSQLiteCreatesConsistentCopy: the backup reopens with
  identical data.
Plus a binary smoke (seed user -> run server -> migrate-to-kv -> re-run):
backup written, 1 user migrated, second run identical.
2026-06-07 15:09:56 +02:00

223 lines
9.2 KiB
Go

// Command membershipd is the unibus control-plane service: room metadata,
// member directory, sealed key distribution, and the media blob store. The data
// plane is NATS — if --nats-url is empty it starts an embedded nats-server with
// JetStream so the whole stack runs with `go run` and nothing to install.
package main
import (
"context"
"crypto/tls"
"flag"
"log"
"net/http"
"os"
"os/signal"
"syscall"
"time"
server "github.com/nats-io/nats-server/v2/server"
"github.com/enmanuel/unibus/pkg/blobstore"
"github.com/enmanuel/unibus/pkg/busauth"
"github.com/enmanuel/unibus/pkg/embeddednats"
"github.com/enmanuel/unibus/pkg/membership"
)
func main() {
// Subcommand dispatch: `membershipd user ...` is the local administration CLI
// (seed/list/revoke bus users) and must be handled before the server flag set
// parses os.Args. Running the CLI on the bus host is trusted by design (whoever
// has a shell there already controls the service), which is how the first admin
// is seeded without a chicken-egg auth problem.
if len(os.Args) > 1 && os.Args[1] == "user" {
runUserCLI(os.Args[2:])
return
}
// `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV
// data move for decentralization (issue 0003c). Like the user CLI it runs on
// the host and is dispatched before the server flag set parses os.Args.
if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" {
runMigrateCLI(os.Args[2:])
return
}
var (
bind = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers")
natsURL = flag.String("nats-url", "", "external NATS url; empty starts an embedded server")
httpPort = flag.String("http-port", "8470", "HTTP port for the control-plane API")
dbPath = flag.String("db", "./local_files/unibus.db", "SQLite database path")
storeDir = flag.String("store-dir", "./local_files/blobs", "blob store directory")
natsPort = flag.Int("nats-port", 4250, "embedded NATS listen port (when --nats-url empty)")
natsStore = flag.String("nats-store", "./local_files/jetstream", "embedded JetStream store dir")
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
)
flag.Parse()
authMode, err := membership.ParseAuthMode(*busAuth)
if err != nil {
log.Fatalf("%v", err)
}
// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
// --bus-auth enforce. This makes an insecure public startup impossible rather
// than silently exposing the bus with the appearance of security.
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
log.Fatalf("%v", err)
}
// Cluster route guard (issue 0003a): a public cluster needs a route secret
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA); err != nil {
log.Fatalf("%v", err)
}
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
log.SetPrefix("[membershipd] ")
// Control plane store first: the NATS authenticator consults IsAuthorized, so
// the store must exist before the embedded server starts.
store, err := membership.Open(*dbPath)
if err != nil {
log.Fatalf("open membership store: %v", err)
}
defer store.Close()
log.Printf("membership store: %s", *dbPath)
blobs, err := blobstore.New(*storeDir)
if err != nil {
log.Fatalf("open blob store: %v", err)
}
log.Printf("blob store: %s", *storeDir)
// Data plane: embedded or external NATS. For the embedded server, enforce
// turns on the nkey authenticator (only allowlisted identities may connect)
// and --tls-cert/--tls-key turn on TLS. An external NATS manages its own
// auth/TLS, so those flags do not apply to it.
var ns *server.Server
natsClientURL := *natsURL
if natsClientURL == "" {
cfg := embeddednats.ServerConfig{
// Bind the embedded NATS to the same interface as the HTTP API so a
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
StoreDir: *natsStore,
Host: *bind,
Port: *natsPort,
ServerName: *serverName,
}
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
if *clusterName != "" {
cc := &embeddednats.ClusterConfig{
Name: *clusterName,
Host: *bind,
Port: *clusterPort,
Routes: splitRoutes(*routesCSV),
Username: *clusterUser,
Password: *clusterPass,
}
if *routeTLSCert != "" {
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
if err != nil {
log.Fatalf("load route TLS: %v", err)
}
cc.TLS = rtls
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
}
cfg.Cluster = cc
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
}
if authMode == membership.AuthEnforce {
cfg.Auth = busauth.NewNkeyAuthenticator(store.IsAuthorized)
log.Printf("NATS nkey authentication: ON (enforce)")
}
if *tlsCert != "" || *tlsKey != "" {
if *tlsCert == "" || *tlsKey == "" {
log.Fatalf("--tls-cert and --tls-key must be set together")
}
tlsCfg, err := busauth.ServerTLSConfig(*tlsCert, *tlsKey)
if err != nil {
log.Fatalf("load NATS TLS: %v", err)
}
cfg.TLS = tlsCfg
log.Printf("NATS TLS: ON (%s)", *tlsCert)
}
ns, err = embeddednats.StartServer(cfg)
if err != nil {
log.Fatalf("start embedded nats: %v", err)
}
natsClientURL = embeddednats.ClientURL(ns)
log.Printf("embedded NATS (JetStream) ready: %s", natsClientURL)
} else {
log.Printf("using external NATS: %s", natsClientURL)
}
srv := membership.NewServer(store, blobs, authMode)
// On a public (non-loopback) bind, disable cleartext rooms: the embedded NATS
// has no per-subject ACL, so cleartext content would be readable by any
// registered peer. Forcing E2E keeps message content confidential regardless
// (audit H4 minimum defense; see dev/0004d-dataplane-acl.md).
if !isLoopbackBind(*bind) {
srv.RequireEncryptedRooms = true
log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)")
}
log.Printf("control-plane auth: %s", authMode)
addr := *bind + ":" + *httpPort
httpSrv := &http.Server{
Addr: addr,
Handler: srv,
// Bound request header size so a peer cannot exhaust memory with huge
// headers before any body limit applies (the body ceilings live in the
// membership middleware).
MaxHeaderBytes: membership.MaxHeaderBytes,
ReadHeaderTimeout: 10 * time.Second,
}
go func() {
var serveErr error
if *tlsCert != "" {
// Serve the control plane over TLS with the same CA-signed cert as the
// data plane (audit H5): metadata (subjects, pubkeys, sealed keys, the
// social graph) is no longer readable by a network MITM. The fail-open
// guard already requires --bus-auth enforce alongside these flags.
httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
log.Printf("HTTPS control-plane API: https://%s", addr)
log.Printf(" health: https://%s/healthz", addr)
log.Printf("control-plane TLS: ON (%s)", *tlsCert)
serveErr = httpSrv.ListenAndServeTLS(*tlsCert, *tlsKey)
} else {
log.Printf("HTTP control-plane API: http://%s", addr)
log.Printf(" health: http://%s/healthz", addr)
serveErr = httpSrv.ListenAndServe()
}
if serveErr != nil && serveErr != http.ErrServerClosed {
log.Fatalf("http server: %v", serveErr)
}
}()
// Graceful shutdown on SIGINT/SIGTERM.
stop := make(chan os.Signal, 1)
signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
<-stop
log.Printf("shutting down...")
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = httpSrv.Shutdown(ctx)
if ns != nil {
ns.Shutdown()
ns.WaitForShutdown()
}
log.Printf("bye")
}