87ef52cc80
The per-subject data-plane ACL existed since 0003e (membership.SubjectACLFor +
busauth.NewNkeyAuthenticatorACL, unit-tested in TestSubjectACLIsolation) but the
binary never used it: cmd/membershipd installed the plain NewNkeyAuthenticator, so
in production a registered NON-member could open a raw NATS connection,
Subscribe(">"), and harvest every room's subject plus JetStream stream/advisory
activity (payload stayed E2E ciphertext, metadata leaked) — the re-audit's H4
vector (report 0006).
Fix:
- New busauth.PermissionsFromSubjects adapts a subject-deriving function into the
PermissionsFunc the ACL authenticator expects (subjects granted as both the
publish and subscribe allow set; a derivation error fails closed). It lives in
busauth so membership stays free of the nats-server dependency.
- cmd/membershipd, under enforce, now installs
NewNkeyAuthenticatorACL(store.IsAuthorized,
PermissionsFromSubjects(membership.SubjectACLFor(store)))
so every connection is confined to the subjects of the rooms it belongs to plus
the client-infra subjects.
- pkg/membership/acl_test.go's helper now delegates to the production wiring
(PermissionsFromSubjects) instead of a test-only reimplementation, so the tests
exercise the real path.
Verification (pkg/membership/acl_test.go):
- TestReaudit_H4_WildcardMetadataLeak: a non-member's Subscribe(">") and any
foreign-subject subscribe raise permission violations; the member still pub/subs
her own room and the non-member captures nothing. With the plain authenticator
(the pre-0005e wiring) the test fails ("wildcard metadata leak still open"),
confirming the wiring is what closes it.
- TestSubjectACLIsolation / TestRefreshSessionGainsNewRoom still green.
- CGO_ENABLED=0 go build ./... && go vet ./... && go test -count=1 ./... green.
Residual (documented): the client-infra grant includes "$JS.API.>", shared by all
peers so per-connection JetStream works; a peer that subscribes specifically to
"$JS.API.>" can still observe stream-management requests whose subjects embed the
room-derived stream name. Fully closing that needs NATS accounts/permissions per
identity (deferred to the 0003 decentralization line). Operational note: NATS
freezes permissions at connect time, so clients must client.RefreshSession after a
membership change to gain a new room's subject; cmd/chat and cmd/worker do not yet
call it, a functional gap to close before an enforce+ACL deployment.
Refs: report 0006 H4, issue 0005e.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
233 lines
9.8 KiB
Go
233 lines
9.8 KiB
Go
// Command membershipd is the unibus control-plane service: room metadata,
|
|
// member directory, sealed key distribution, and the media blob store. The data
|
|
// plane is NATS — if --nats-url is empty it starts an embedded nats-server with
|
|
// JetStream so the whole stack runs with `go run` and nothing to install.
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"flag"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
server "github.com/nats-io/nats-server/v2/server"
|
|
|
|
"github.com/enmanuel/unibus/pkg/blobstore"
|
|
"github.com/enmanuel/unibus/pkg/busauth"
|
|
"github.com/enmanuel/unibus/pkg/embeddednats"
|
|
"github.com/enmanuel/unibus/pkg/membership"
|
|
)
|
|
|
|
func main() {
|
|
// Subcommand dispatch: `membershipd user ...` is the local administration CLI
|
|
// (seed/list/revoke bus users) and must be handled before the server flag set
|
|
// parses os.Args. Running the CLI on the bus host is trusted by design (whoever
|
|
// has a shell there already controls the service), which is how the first admin
|
|
// is seeded without a chicken-egg auth problem.
|
|
if len(os.Args) > 1 && os.Args[1] == "user" {
|
|
runUserCLI(os.Args[2:])
|
|
return
|
|
}
|
|
// `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV
|
|
// data move for decentralization (issue 0003c). Like the user CLI it runs on
|
|
// the host and is dispatched before the server flag set parses os.Args.
|
|
if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" {
|
|
runMigrateCLI(os.Args[2:])
|
|
return
|
|
}
|
|
|
|
var (
|
|
bind = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers")
|
|
natsURL = flag.String("nats-url", "", "external NATS url; empty starts an embedded server")
|
|
httpPort = flag.String("http-port", "8470", "HTTP port for the control-plane API")
|
|
dbPath = flag.String("db", "./local_files/unibus.db", "SQLite database path")
|
|
storeDir = flag.String("store-dir", "./local_files/blobs", "blob store directory")
|
|
natsPort = flag.Int("nats-port", 4250, "embedded NATS listen port (when --nats-url empty)")
|
|
natsStore = flag.String("nats-store", "./local_files/jetstream", "embedded JetStream store dir")
|
|
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
|
|
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
|
|
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
|
|
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
|
|
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
|
|
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
|
|
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
|
|
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
|
|
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
|
|
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
|
|
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
|
|
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
|
|
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
|
|
)
|
|
flag.Parse()
|
|
|
|
authMode, err := membership.ParseAuthMode(*busAuth)
|
|
if err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
|
|
// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
|
|
// --bus-auth enforce. This makes an insecure public startup impossible rather
|
|
// than silently exposing the bus with the appearance of security.
|
|
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
// Cluster route guard (issue 0003a): a public cluster needs a route secret
|
|
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
|
|
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA); err != nil {
|
|
log.Fatalf("%v", err)
|
|
}
|
|
|
|
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
|
|
log.SetPrefix("[membershipd] ")
|
|
|
|
// Control plane store first: the NATS authenticator consults IsAuthorized, so
|
|
// the store must exist before the embedded server starts.
|
|
store, err := membership.Open(*dbPath)
|
|
if err != nil {
|
|
log.Fatalf("open membership store: %v", err)
|
|
}
|
|
defer store.Close()
|
|
log.Printf("membership store: %s", *dbPath)
|
|
|
|
blobs, err := blobstore.New(*storeDir)
|
|
if err != nil {
|
|
log.Fatalf("open blob store: %v", err)
|
|
}
|
|
log.Printf("blob store: %s", *storeDir)
|
|
|
|
// Data plane: embedded or external NATS. For the embedded server, enforce
|
|
// turns on the nkey authenticator (only allowlisted identities may connect)
|
|
// and --tls-cert/--tls-key turn on TLS. An external NATS manages its own
|
|
// auth/TLS, so those flags do not apply to it.
|
|
var ns *server.Server
|
|
natsClientURL := *natsURL
|
|
if natsClientURL == "" {
|
|
cfg := embeddednats.ServerConfig{
|
|
// Bind the embedded NATS to the same interface as the HTTP API so a
|
|
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
|
|
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
|
|
StoreDir: *natsStore,
|
|
Host: *bind,
|
|
Port: *natsPort,
|
|
ServerName: *serverName,
|
|
}
|
|
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
|
|
if *clusterName != "" {
|
|
cc := &embeddednats.ClusterConfig{
|
|
Name: *clusterName,
|
|
Host: *bind,
|
|
Port: *clusterPort,
|
|
Routes: splitRoutes(*routesCSV),
|
|
Username: *clusterUser,
|
|
Password: *clusterPass,
|
|
}
|
|
if *routeTLSCert != "" {
|
|
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
|
|
if err != nil {
|
|
log.Fatalf("load route TLS: %v", err)
|
|
}
|
|
cc.TLS = rtls
|
|
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
|
|
}
|
|
cfg.Cluster = cc
|
|
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
|
|
}
|
|
if authMode == membership.AuthEnforce {
|
|
// Per-subject data-plane ACL (audit H4 / N4 residual): the authenticator
|
|
// authorizes by the bus allowlist AND confines each connection to the
|
|
// subjects of the rooms it belongs to (plus client-infra subjects). This
|
|
// closes the wildcard metadata leak where a registered non-member could
|
|
// Subscribe(">") and harvest every room's subject and JetStream activity.
|
|
// NATS freezes permissions at connect time, so a peer that joins a room
|
|
// after connecting must client.RefreshSession to gain that room's subject.
|
|
cfg.Auth = busauth.NewNkeyAuthenticatorACL(
|
|
store.IsAuthorized,
|
|
busauth.PermissionsFromSubjects(membership.SubjectACLFor(store)),
|
|
)
|
|
log.Printf("NATS nkey authentication: ON (enforce, per-subject ACL)")
|
|
}
|
|
if *tlsCert != "" || *tlsKey != "" {
|
|
if *tlsCert == "" || *tlsKey == "" {
|
|
log.Fatalf("--tls-cert and --tls-key must be set together")
|
|
}
|
|
tlsCfg, err := busauth.ServerTLSConfig(*tlsCert, *tlsKey)
|
|
if err != nil {
|
|
log.Fatalf("load NATS TLS: %v", err)
|
|
}
|
|
cfg.TLS = tlsCfg
|
|
log.Printf("NATS TLS: ON (%s)", *tlsCert)
|
|
}
|
|
ns, err = embeddednats.StartServer(cfg)
|
|
if err != nil {
|
|
log.Fatalf("start embedded nats: %v", err)
|
|
}
|
|
natsClientURL = embeddednats.ClientURL(ns)
|
|
log.Printf("embedded NATS (JetStream) ready: %s", natsClientURL)
|
|
} else {
|
|
log.Printf("using external NATS: %s", natsClientURL)
|
|
}
|
|
|
|
srv := membership.NewServer(store, blobs, authMode)
|
|
// On a public (non-loopback) bind, disable cleartext rooms: the embedded NATS
|
|
// has no per-subject ACL, so cleartext content would be readable by any
|
|
// registered peer. Forcing E2E keeps message content confidential regardless
|
|
// (audit H4 minimum defense; see dev/0004d-dataplane-acl.md).
|
|
if !isLoopbackBind(*bind) {
|
|
srv.RequireEncryptedRooms = true
|
|
log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)")
|
|
}
|
|
log.Printf("control-plane auth: %s", authMode)
|
|
addr := *bind + ":" + *httpPort
|
|
httpSrv := &http.Server{
|
|
Addr: addr,
|
|
Handler: srv,
|
|
// Bound request header size so a peer cannot exhaust memory with huge
|
|
// headers before any body limit applies (the body ceilings live in the
|
|
// membership middleware).
|
|
MaxHeaderBytes: membership.MaxHeaderBytes,
|
|
ReadHeaderTimeout: 10 * time.Second,
|
|
}
|
|
|
|
go func() {
|
|
var serveErr error
|
|
if *tlsCert != "" {
|
|
// Serve the control plane over TLS with the same CA-signed cert as the
|
|
// data plane (audit H5): metadata (subjects, pubkeys, sealed keys, the
|
|
// social graph) is no longer readable by a network MITM. The fail-open
|
|
// guard already requires --bus-auth enforce alongside these flags.
|
|
httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
|
|
log.Printf("HTTPS control-plane API: https://%s", addr)
|
|
log.Printf(" health: https://%s/healthz", addr)
|
|
log.Printf("control-plane TLS: ON (%s)", *tlsCert)
|
|
serveErr = httpSrv.ListenAndServeTLS(*tlsCert, *tlsKey)
|
|
} else {
|
|
log.Printf("HTTP control-plane API: http://%s", addr)
|
|
log.Printf(" health: http://%s/healthz", addr)
|
|
serveErr = httpSrv.ListenAndServe()
|
|
}
|
|
if serveErr != nil && serveErr != http.ErrServerClosed {
|
|
log.Fatalf("http server: %v", serveErr)
|
|
}
|
|
}()
|
|
|
|
// Graceful shutdown on SIGINT/SIGTERM.
|
|
stop := make(chan os.Signal, 1)
|
|
signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
|
|
<-stop
|
|
log.Printf("shutting down...")
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer cancel()
|
|
_ = httpSrv.Shutdown(ctx)
|
|
if ns != nil {
|
|
ns.Shutdown()
|
|
ns.WaitForShutdown()
|
|
}
|
|
log.Printf("bye")
|
|
}
|