Files
unibus/cmd/membershipd/main.go
T
egutierrez 87ef52cc80 fix(0005e): wire per-subject ACL into membershipd (close H4 wildcard metadata leak)
The per-subject data-plane ACL existed since 0003e (membership.SubjectACLFor +
busauth.NewNkeyAuthenticatorACL, unit-tested in TestSubjectACLIsolation) but the
binary never used it: cmd/membershipd installed the plain NewNkeyAuthenticator, so
in production a registered NON-member could open a raw NATS connection,
Subscribe(">"), and harvest every room's subject plus JetStream stream/advisory
activity (payload stayed E2E ciphertext, metadata leaked) — the re-audit's H4
vector (report 0006).

Fix:
- New busauth.PermissionsFromSubjects adapts a subject-deriving function into the
  PermissionsFunc the ACL authenticator expects (subjects granted as both the
  publish and subscribe allow set; a derivation error fails closed). It lives in
  busauth so membership stays free of the nats-server dependency.
- cmd/membershipd, under enforce, now installs
  NewNkeyAuthenticatorACL(store.IsAuthorized,
    PermissionsFromSubjects(membership.SubjectACLFor(store)))
  so every connection is confined to the subjects of the rooms it belongs to plus
  the client-infra subjects.
- pkg/membership/acl_test.go's helper now delegates to the production wiring
  (PermissionsFromSubjects) instead of a test-only reimplementation, so the tests
  exercise the real path.

Verification (pkg/membership/acl_test.go):
- TestReaudit_H4_WildcardMetadataLeak: a non-member's Subscribe(">") and any
  foreign-subject subscribe raise permission violations; the member still pub/subs
  her own room and the non-member captures nothing. With the plain authenticator
  (the pre-0005e wiring) the test fails ("wildcard metadata leak still open"),
  confirming the wiring is what closes it.
- TestSubjectACLIsolation / TestRefreshSessionGainsNewRoom still green.
- CGO_ENABLED=0 go build ./... && go vet ./... && go test -count=1 ./...  green.

Residual (documented): the client-infra grant includes "$JS.API.>", shared by all
peers so per-connection JetStream works; a peer that subscribes specifically to
"$JS.API.>" can still observe stream-management requests whose subjects embed the
room-derived stream name. Fully closing that needs NATS accounts/permissions per
identity (deferred to the 0003 decentralization line). Operational note: NATS
freezes permissions at connect time, so clients must client.RefreshSession after a
membership change to gain a new room's subject; cmd/chat and cmd/worker do not yet
call it, a functional gap to close before an enforce+ACL deployment.

Refs: report 0006 H4, issue 0005e.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 16:15:52 +02:00

233 lines
9.8 KiB
Go

// Command membershipd is the unibus control-plane service: room metadata,
// member directory, sealed key distribution, and the media blob store. The data
// plane is NATS — if --nats-url is empty it starts an embedded nats-server with
// JetStream so the whole stack runs with `go run` and nothing to install.
package main
import (
"context"
"crypto/tls"
"flag"
"log"
"net/http"
"os"
"os/signal"
"syscall"
"time"
server "github.com/nats-io/nats-server/v2/server"
"github.com/enmanuel/unibus/pkg/blobstore"
"github.com/enmanuel/unibus/pkg/busauth"
"github.com/enmanuel/unibus/pkg/embeddednats"
"github.com/enmanuel/unibus/pkg/membership"
)
func main() {
// Subcommand dispatch: `membershipd user ...` is the local administration CLI
// (seed/list/revoke bus users) and must be handled before the server flag set
// parses os.Args. Running the CLI on the bus host is trusted by design (whoever
// has a shell there already controls the service), which is how the first admin
// is seeded without a chicken-egg auth problem.
if len(os.Args) > 1 && os.Args[1] == "user" {
runUserCLI(os.Args[2:])
return
}
// `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV
// data move for decentralization (issue 0003c). Like the user CLI it runs on
// the host and is dispatched before the server flag set parses os.Args.
if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" {
runMigrateCLI(os.Args[2:])
return
}
var (
bind = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers")
natsURL = flag.String("nats-url", "", "external NATS url; empty starts an embedded server")
httpPort = flag.String("http-port", "8470", "HTTP port for the control-plane API")
dbPath = flag.String("db", "./local_files/unibus.db", "SQLite database path")
storeDir = flag.String("store-dir", "./local_files/blobs", "blob store directory")
natsPort = flag.Int("nats-port", 4250, "embedded NATS listen port (when --nats-url empty)")
natsStore = flag.String("nats-store", "./local_files/jetstream", "embedded JetStream store dir")
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
)
flag.Parse()
authMode, err := membership.ParseAuthMode(*busAuth)
if err != nil {
log.Fatalf("%v", err)
}
// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
// --bus-auth enforce. This makes an insecure public startup impossible rather
// than silently exposing the bus with the appearance of security.
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
log.Fatalf("%v", err)
}
// Cluster route guard (issue 0003a): a public cluster needs a route secret
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA); err != nil {
log.Fatalf("%v", err)
}
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
log.SetPrefix("[membershipd] ")
// Control plane store first: the NATS authenticator consults IsAuthorized, so
// the store must exist before the embedded server starts.
store, err := membership.Open(*dbPath)
if err != nil {
log.Fatalf("open membership store: %v", err)
}
defer store.Close()
log.Printf("membership store: %s", *dbPath)
blobs, err := blobstore.New(*storeDir)
if err != nil {
log.Fatalf("open blob store: %v", err)
}
log.Printf("blob store: %s", *storeDir)
// Data plane: embedded or external NATS. For the embedded server, enforce
// turns on the nkey authenticator (only allowlisted identities may connect)
// and --tls-cert/--tls-key turn on TLS. An external NATS manages its own
// auth/TLS, so those flags do not apply to it.
var ns *server.Server
natsClientURL := *natsURL
if natsClientURL == "" {
cfg := embeddednats.ServerConfig{
// Bind the embedded NATS to the same interface as the HTTP API so a
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
StoreDir: *natsStore,
Host: *bind,
Port: *natsPort,
ServerName: *serverName,
}
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
if *clusterName != "" {
cc := &embeddednats.ClusterConfig{
Name: *clusterName,
Host: *bind,
Port: *clusterPort,
Routes: splitRoutes(*routesCSV),
Username: *clusterUser,
Password: *clusterPass,
}
if *routeTLSCert != "" {
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
if err != nil {
log.Fatalf("load route TLS: %v", err)
}
cc.TLS = rtls
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
}
cfg.Cluster = cc
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
}
if authMode == membership.AuthEnforce {
// Per-subject data-plane ACL (audit H4 / N4 residual): the authenticator
// authorizes by the bus allowlist AND confines each connection to the
// subjects of the rooms it belongs to (plus client-infra subjects). This
// closes the wildcard metadata leak where a registered non-member could
// Subscribe(">") and harvest every room's subject and JetStream activity.
// NATS freezes permissions at connect time, so a peer that joins a room
// after connecting must client.RefreshSession to gain that room's subject.
cfg.Auth = busauth.NewNkeyAuthenticatorACL(
store.IsAuthorized,
busauth.PermissionsFromSubjects(membership.SubjectACLFor(store)),
)
log.Printf("NATS nkey authentication: ON (enforce, per-subject ACL)")
}
if *tlsCert != "" || *tlsKey != "" {
if *tlsCert == "" || *tlsKey == "" {
log.Fatalf("--tls-cert and --tls-key must be set together")
}
tlsCfg, err := busauth.ServerTLSConfig(*tlsCert, *tlsKey)
if err != nil {
log.Fatalf("load NATS TLS: %v", err)
}
cfg.TLS = tlsCfg
log.Printf("NATS TLS: ON (%s)", *tlsCert)
}
ns, err = embeddednats.StartServer(cfg)
if err != nil {
log.Fatalf("start embedded nats: %v", err)
}
natsClientURL = embeddednats.ClientURL(ns)
log.Printf("embedded NATS (JetStream) ready: %s", natsClientURL)
} else {
log.Printf("using external NATS: %s", natsClientURL)
}
srv := membership.NewServer(store, blobs, authMode)
// On a public (non-loopback) bind, disable cleartext rooms: the embedded NATS
// has no per-subject ACL, so cleartext content would be readable by any
// registered peer. Forcing E2E keeps message content confidential regardless
// (audit H4 minimum defense; see dev/0004d-dataplane-acl.md).
if !isLoopbackBind(*bind) {
srv.RequireEncryptedRooms = true
log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)")
}
log.Printf("control-plane auth: %s", authMode)
addr := *bind + ":" + *httpPort
httpSrv := &http.Server{
Addr: addr,
Handler: srv,
// Bound request header size so a peer cannot exhaust memory with huge
// headers before any body limit applies (the body ceilings live in the
// membership middleware).
MaxHeaderBytes: membership.MaxHeaderBytes,
ReadHeaderTimeout: 10 * time.Second,
}
go func() {
var serveErr error
if *tlsCert != "" {
// Serve the control plane over TLS with the same CA-signed cert as the
// data plane (audit H5): metadata (subjects, pubkeys, sealed keys, the
// social graph) is no longer readable by a network MITM. The fail-open
// guard already requires --bus-auth enforce alongside these flags.
httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
log.Printf("HTTPS control-plane API: https://%s", addr)
log.Printf(" health: https://%s/healthz", addr)
log.Printf("control-plane TLS: ON (%s)", *tlsCert)
serveErr = httpSrv.ListenAndServeTLS(*tlsCert, *tlsKey)
} else {
log.Printf("HTTP control-plane API: http://%s", addr)
log.Printf(" health: http://%s/healthz", addr)
serveErr = httpSrv.ListenAndServe()
}
if serveErr != nil && serveErr != http.ErrServerClosed {
log.Fatalf("http server: %v", serveErr)
}
}()
// Graceful shutdown on SIGINT/SIGTERM.
stop := make(chan os.Signal, 1)
signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
<-stop
log.Printf("shutting down...")
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
_ = httpSrv.Shutdown(ctx)
if ns != nil {
ns.Shutdown()
ns.WaitForShutdown()
}
log.Printf("bye")
}