Compare commits
13 Commits
618f6b61da
...
d821bc1794
| Author | SHA1 | Date | |
|---|---|---|---|
| d821bc1794 | |||
| da420513b6 | |||
| 96abb75a2e | |||
| 37c778ca9a | |||
| c6ad63059f | |||
| 649dc9e244 | |||
| d6e668b984 | |||
| 94e7ced1ef | |||
| 9013ea5e33 | |||
| b8c9b2b652 | |||
| 6b3ace1d39 | |||
| 3230b31ade | |||
| c90f145a05 |
@@ -2,7 +2,7 @@
|
||||
name: unibus
|
||||
lang: go
|
||||
domain: infra
|
||||
version: 0.5.0
|
||||
version: 0.6.0
|
||||
description: "Bus de mensajería unificado sobre NATS+JetStream con cifrado E2E por room (megolm/olm reducido): service de membresía/claves, librería cliente y peers demo."
|
||||
tags: [service, messaging, nats, e2e]
|
||||
uses_functions:
|
||||
@@ -154,6 +154,24 @@ agent.<nombre>.{in,out} inbox/outbox de agente LLM (agent.scout.in)
|
||||
|
||||
## Capability growth log
|
||||
|
||||
- v0.6.0 (2026-06-07) — descentralización / alta disponibilidad (issue 0003,
|
||||
fases 0003a–0003e), report 0006. El servidor NATS embebido gana soporte de
|
||||
cluster con routes autenticadas (secreto de cluster) y TLS mutuo de nodo
|
||||
(`pkg/embeddednats.ClusterConfig` + `busauth.RouteTLSConfig`, reusando la CA
|
||||
del 0001). El control plane (`pkg/membership.Store`) pasa a interfaz por
|
||||
branch-by-abstraction: `sqliteStore` (default) + `jetstreamStore` nuevo sobre
|
||||
JetStream KV replicado (réplicas configurables R1→R3), con `IsAuthorized`
|
||||
fail-closed ante pérdida de quorum. `membershipd migrate-to-kv` mueve el
|
||||
estado SQLite→KV de forma idempotente con backup previo. Los blobs
|
||||
(`pkg/blobstore.Store`, ahora interfaz) ganan un backend NATS Object Store
|
||||
replicado además del disco. El cliente acepta listas de seeds NATS y de
|
||||
control planes con failover/reconnect nativo, el anti-replay pasa a un store
|
||||
de nonces compartido en KV con TTL (cierra el agujero de replay multi-nodo), y
|
||||
se implementa la ACL por subject derivada de pertenencia (audit H4 residual:
|
||||
`busauth.NewNkeyAuthenticatorACL` + `membership.SubjectACLFor` +
|
||||
`client.RefreshSession`). Todo viaja detrás del flag `decentralized` (off):
|
||||
el comportamiento de un solo nodo (SQLite + disco) no cambia y master sigue
|
||||
verde. El despliegue multi-nodo real (0003f) lo ejecuta el humano.
|
||||
- v0.5.0 (2026-06-07) — hardening de seguridad (issue 0004) que cierra los
|
||||
hallazgos de la auditoría red-team (report 0004) y lleva el veredicto de
|
||||
exposición pública de "NO" a "sí-con-condiciones". Anti-DoS pre-auth
|
||||
|
||||
@@ -3,10 +3,24 @@ package main
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// splitRoutes turns the raw --routes flag value into a list of route URLs.
// The value is split on commas, each piece is stripped of surrounding
// whitespace, and pieces that end up empty are discarded, so inputs such as
// "a, b" or "a,," never produce an empty route. The result is nil when no
// usable entry remains.
func splitRoutes(csv string) []string {
	var routes []string
	for _, field := range strings.Split(csv, ",") {
		trimmed := strings.TrimSpace(field)
		if trimmed == "" {
			continue
		}
		routes = append(routes, trimmed)
	}
	return routes
}
|
||||
|
||||
// isLoopbackBind reports whether the --bind value keeps the service reachable
|
||||
// only from this host. An empty bind means "all interfaces" (public), and a
|
||||
// hostname we cannot resolve to a loopback literal is treated as public — the
|
||||
@@ -48,3 +62,42 @@ func validateBootConfig(bind string, mode membership.AuthMode, tlsCert, tlsKey s
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// validateClusterConfig guards the cluster route layer (issue 0003a). The route
|
||||
// layer is a server-to-server trust boundary distinct from the client data
|
||||
// plane: leaving it open lets anyone who reaches the route port join the cluster
|
||||
// or inject messages into the whole bus (audit 0004, "auth of the cluster
|
||||
// routes"). So on a public (non-loopback) bind, a cluster MUST carry both a
|
||||
// shared route secret AND mutual route TLS. It is a pure function of the parsed
|
||||
// flags. An empty clusterName means "no cluster" (standalone) and is always
|
||||
// allowed.
|
||||
//
|
||||
// The three route-TLS paths are all-or-nothing (mutual TLS needs the node cert,
|
||||
// its key, and the CA together), independent of the bind, so a partial TLS
|
||||
// config never silently degrades to plaintext routes.
|
||||
func validateClusterConfig(clusterName, bind, user, pass, rtCert, rtKey, rtCA string) error {
|
||||
rtAny := rtCert != "" || rtKey != "" || rtCA != ""
|
||||
rtAll := rtCert != "" && rtKey != "" && rtCA != ""
|
||||
if rtAny && !rtAll {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: --route-tls-cert/--route-tls-key/--route-tls-ca must be set together (mutual route TLS needs all three)")
|
||||
}
|
||||
if clusterName == "" {
|
||||
return nil // standalone: no route layer to secure
|
||||
}
|
||||
if isLoopbackBind(bind) {
|
||||
return nil // loopback cluster is dev-only and unreachable from outside
|
||||
}
|
||||
// Public cluster: demand a route secret and mutual route TLS.
|
||||
if user == "" || pass == "" {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: cluster %q on public bind %q requires --cluster-user and --cluster-pass; an unauthenticated route port lets anyone join the cluster",
|
||||
clusterName, bind)
|
||||
}
|
||||
if !rtAll {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: cluster %q on public bind %q requires mutual route TLS (--route-tls-cert/--route-tls-key/--route-tls-ca); plaintext routes expose server-to-server traffic and admit unsigned nodes",
|
||||
clusterName, bind)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -70,3 +70,63 @@ func TestBootConfigPolicy(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestClusterConfigPolicy is the cluster route guard (issue 0003a): a standalone
|
||||
// server is always fine; a loopback cluster is dev-only and unguarded; a public
|
||||
// cluster demands both a route secret and complete mutual route TLS; and the
|
||||
// route-TLS flags are all-or-nothing regardless of bind.
|
||||
func TestClusterConfigPolicy(t *testing.T) {
|
||||
const c, k, ca = "node.crt", "node.key", "ca.crt"
|
||||
cases := []struct {
|
||||
name string
|
||||
clusterName, bind string
|
||||
user, pass string
|
||||
rtCert, rtKey, rtCA string
|
||||
wantErr bool
|
||||
}{
|
||||
// Standalone (no cluster name) is always allowed, even on a public bind.
|
||||
{"standalone-public", "", "0.0.0.0", "", "", "", "", "", false},
|
||||
// Loopback dev cluster: unguarded (unreachable from outside).
|
||||
{"loopback-cluster-bare", "unibus", "127.0.0.1", "", "", "", "", "", false},
|
||||
// Golden: full public HA config.
|
||||
{"public-full", "unibus", "0.0.0.0", "u", "p", c, k, ca, false},
|
||||
// Error: public cluster without a route secret.
|
||||
{"public-no-secret", "unibus", "0.0.0.0", "", "", c, k, ca, true},
|
||||
{"public-half-secret", "unibus", "0.0.0.0", "u", "", c, k, ca, true},
|
||||
// Error: public cluster without mutual route TLS.
|
||||
{"public-no-tls", "unibus", "10.0.0.1", "u", "p", "", "", "", true},
|
||||
// Error: partial route-TLS flags trip regardless of bind.
|
||||
{"loopback-partial-tls", "unibus", "127.0.0.1", "", "", c, "", "", true},
|
||||
{"standalone-partial-tls", "", "127.0.0.1", "", "", c, k, "", true},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
err := validateClusterConfig(tc.clusterName, tc.bind, tc.user, tc.pass, tc.rtCert, tc.rtKey, tc.rtCA)
|
||||
if tc.wantErr && err == nil {
|
||||
t.Fatalf("cluster config %+v should be refused", tc)
|
||||
}
|
||||
if !tc.wantErr && err != nil {
|
||||
t.Fatalf("cluster config %+v should be allowed, got: %v", tc, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitRoutes(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want int
|
||||
}{
|
||||
{"", 0},
|
||||
{"nats://a:1", 1},
|
||||
{"nats://a:1,nats://b:2", 2},
|
||||
{" nats://a:1 , nats://b:2 ", 2}, // spaces trimmed
|
||||
{"nats://a:1,,", 1}, // empty entries dropped
|
||||
{",", 0},
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := splitRoutes(c.in); len(got) != c.want {
|
||||
t.Fatalf("splitRoutes(%q) = %v (len %d), want len %d", c.in, got, len(got), c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+47
-3
@@ -33,6 +33,13 @@ func main() {
|
||||
runUserCLI(os.Args[2:])
|
||||
return
|
||||
}
|
||||
// `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV
|
||||
// data move for decentralization (issue 0003c). Like the user CLI it runs on
|
||||
// the host and is dispatched before the server flag set parses os.Args.
|
||||
if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" {
|
||||
runMigrateCLI(os.Args[2:])
|
||||
return
|
||||
}
|
||||
|
||||
var (
|
||||
bind = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers")
|
||||
@@ -45,6 +52,16 @@ func main() {
|
||||
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
|
||||
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
|
||||
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
|
||||
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
|
||||
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
|
||||
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
|
||||
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
|
||||
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
|
||||
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
|
||||
clusterPass = flag.String("cluster-pass", "", "shared route secret password")
|
||||
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
|
||||
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
|
||||
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
|
||||
)
|
||||
flag.Parse()
|
||||
|
||||
@@ -59,6 +76,11 @@ func main() {
|
||||
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
// Cluster route guard (issue 0003a): a public cluster needs a route secret
|
||||
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
|
||||
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, *clusterPass, *routeTLSCert, *routeTLSKey, *routeTLSCA); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
|
||||
log.SetPrefix("[membershipd] ")
|
||||
@@ -89,9 +111,31 @@ func main() {
|
||||
// Bind the embedded NATS to the same interface as the HTTP API so a
|
||||
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
|
||||
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
|
||||
StoreDir: *natsStore,
|
||||
Host: *bind,
|
||||
Port: *natsPort,
|
||||
StoreDir: *natsStore,
|
||||
Host: *bind,
|
||||
Port: *natsPort,
|
||||
ServerName: *serverName,
|
||||
}
|
||||
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
|
||||
if *clusterName != "" {
|
||||
cc := &embeddednats.ClusterConfig{
|
||||
Name: *clusterName,
|
||||
Host: *bind,
|
||||
Port: *clusterPort,
|
||||
Routes: splitRoutes(*routesCSV),
|
||||
Username: *clusterUser,
|
||||
Password: *clusterPass,
|
||||
}
|
||||
if *routeTLSCert != "" {
|
||||
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
|
||||
if err != nil {
|
||||
log.Fatalf("load route TLS: %v", err)
|
||||
}
|
||||
cc.TLS = rtls
|
||||
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
|
||||
}
|
||||
cfg.Cluster = cc
|
||||
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
|
||||
}
|
||||
if authMode == membership.AuthEnforce {
|
||||
cfg.Auth = busauth.NewNkeyAuthenticator(store.IsAuthorized)
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// runMigrateCLI implements `membershipd migrate-to-kv`, the idempotent move of
|
||||
// the control-plane state from the local SQLite database into replicated
|
||||
// JetStream KV (issue 0003c). It backs up the SQLite file first (VACUUM INTO),
|
||||
// then connects to the target NATS and copies every room/member/key/user into
|
||||
// the KV buckets. Re-running it converges to the same state.
|
||||
//
|
||||
// It runs on the bus host (no auth on the control-plane side), connecting to the
|
||||
// cluster's NATS; --ca pins TLS when the data plane is secured.
|
||||
func runMigrateCLI(args []string) {
|
||||
fs := flag.NewFlagSet("migrate-to-kv", flag.ExitOnError)
|
||||
dbPath := fs.String("db", defaultDBPath, "SQLite database path to migrate FROM")
|
||||
natsURL := fs.String("nats-url", "", "NATS url of the cluster to migrate INTO (required)")
|
||||
ca := fs.String("ca", "", "CA cert to pin TLS on the NATS connection (optional)")
|
||||
replicas := fs.Int("replicas", 1, "KV replication factor (1 for a 1-2 node rollout, 3 for HA quorum)")
|
||||
noBackup := fs.Bool("no-backup", false, "skip the SQLite backup before migrating (NOT recommended)")
|
||||
_ = fs.Parse(args)
|
||||
|
||||
if *natsURL == "" {
|
||||
fmt.Fprintln(os.Stderr, "membershipd migrate-to-kv: --nats-url is required (the cluster to write the KV buckets into)")
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
// Back up the SQLite database first so a botched migration can be undone.
|
||||
var backupPath string
|
||||
if !*noBackup {
|
||||
bak, err := membership.BackupSQLite(*dbPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: backup failed: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
backupPath = bak
|
||||
fmt.Printf("backed up %s -> %s\n", *dbPath, backupPath)
|
||||
}
|
||||
|
||||
// Connect to the target NATS (optionally TLS-pinned to the bus CA).
|
||||
natsOpts := []nats.Option{nats.Name("unibus-migrate")}
|
||||
if *ca != "" {
|
||||
tlsCfg, err := busauth.LoadCATLSConfig(*ca)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: load CA: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
natsOpts = append(natsOpts, nats.Secure(tlsCfg))
|
||||
}
|
||||
nc, err := nats.Connect(*natsURL, natsOpts...)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: connect %q: %v\n", *natsURL, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: jetstream: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
report, err := membership.MigrateSQLiteToKV(*dbPath, js, membership.JetStreamConfig{
|
||||
Replicas: *replicas,
|
||||
OpTimeout: 30 * time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
report.BackupPath = backupPath
|
||||
|
||||
fmt.Printf("migrated to KV (replicas=%d): %d rooms, %d members, %d keys, %d users\n",
|
||||
*replicas, report.Rooms, report.Members, report.Keys, report.Users)
|
||||
if backupPath != "" {
|
||||
fmt.Printf("rollback: restore %s if needed\n", backupPath)
|
||||
}
|
||||
}
|
||||
@@ -65,7 +65,7 @@ const defaultDBPath = "./local_files/unibus.db"
|
||||
// openStore opens the membership store at path, exiting on failure. Migrations
|
||||
// (including 002_users.sql) are applied by membership.Open, so a fresh database
|
||||
// gets the users table on first use of the CLI.
|
||||
func openStore(path string) *membership.Store {
|
||||
func openStore(path string) membership.Store {
|
||||
store, err := membership.Open(path)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd user: open store %q: %v\n", path, err)
|
||||
|
||||
@@ -14,6 +14,13 @@
|
||||
"description": "TLS on the NATS data plane using the project's self-signed CA (deploy/tls/). Server opts in via membershipd --tls-cert/--tls-key; clients pin ca.crt via client.Connect(caPath).",
|
||||
"added": "2026-06-07",
|
||||
"enabled_at": "2026-06-07"
|
||||
},
|
||||
"decentralized": {
|
||||
"enabled": false,
|
||||
"issue": "0003",
|
||||
"description": "Control-plane state on replicated JetStream KV instead of local SQLite (branch-by-abstraction membership.Store: sqliteStore default OFF, jetstreamStore ON). The route cluster (0003a) and the KV store (0003b) ship behind this flag; the membershipd boot wiring that selects the KV store completes with the session/reconnect redesign (0003e) and is activated on the multi-node deploy (0003f). OFF keeps the single-node SQLite control plane unchanged.",
|
||||
"added": "2026-06-07",
|
||||
"enabled_at": null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+27
-10
@@ -1,9 +1,15 @@
|
||||
// Package blobstore is a content-addressed object store on local disk.
|
||||
// Package blobstore is a content-addressed object store for media ciphertext.
|
||||
//
|
||||
// The bus transports messages, not blobs. Media (images, files, large payloads)
|
||||
// is encrypted by the client BEFORE being stored here, so the store only ever
|
||||
// sees ciphertext. Objects are addressed by the sha256 hex of their (encrypted)
|
||||
// bytes, which makes Put idempotent and deduplicating.
|
||||
//
|
||||
// Store is an interface (branch-by-abstraction, issue 0003d) with two backends:
|
||||
// diskStore (the default, local filesystem) and objectStore (NATS Object Store
|
||||
// on JetStream, replicated across the cluster so blobs survive a node loss and
|
||||
// are reachable from any node). The wire contract (sha256-hex addressing) is
|
||||
// identical, so a client cannot tell which backend a membershipd uses.
|
||||
package blobstore
|
||||
|
||||
import (
|
||||
@@ -14,27 +20,38 @@ import (
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// Store is a directory-backed content-addressed blob store.
|
||||
type Store struct {
|
||||
// Store is a content-addressed blob store: Put returns the sha256-hex address of
|
||||
// the stored bytes, Get fetches by that address, Has reports presence.
|
||||
type Store interface {
|
||||
Put(data []byte) (string, error)
|
||||
Get(hash string) ([]byte, error)
|
||||
Has(hash string) bool
|
||||
}
|
||||
|
||||
// diskStore is a directory-backed content-addressed blob store (the default,
|
||||
// single-node backend).
|
||||
type diskStore struct {
|
||||
dir string
|
||||
}
|
||||
|
||||
// New creates a Store rooted at dir, creating the directory if needed.
|
||||
func New(dir string) (*Store, error) {
|
||||
// New creates a disk-backed Store rooted at dir, creating the directory if
|
||||
// needed. It remains the default backend; the replicated NATS Object Store is
|
||||
// constructed separately (NewObjectStore) when decentralization is enabled.
|
||||
func New(dir string) (Store, error) {
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("blobstore: mkdir %q: %w", dir, err)
|
||||
}
|
||||
return &Store{dir: dir}, nil
|
||||
return &diskStore{dir: dir}, nil
|
||||
}
|
||||
|
||||
// path returns the on-disk path for a given content hash.
|
||||
func (s *Store) path(hash string) string {
|
||||
func (s *diskStore) path(hash string) string {
|
||||
return filepath.Join(s.dir, hash)
|
||||
}
|
||||
|
||||
// Put writes data to the store and returns its sha256 hex hash. If an object
|
||||
// with the same content already exists, Put is a no-op and returns the hash.
|
||||
func (s *Store) Put(data []byte) (string, error) {
|
||||
func (s *diskStore) Put(data []byte) (string, error) {
|
||||
sum := sha256.Sum256(data)
|
||||
hash := hex.EncodeToString(sum[:])
|
||||
p := s.path(hash)
|
||||
@@ -66,7 +83,7 @@ func (s *Store) Put(data []byte) (string, error) {
|
||||
}
|
||||
|
||||
// Get reads the object with the given hash.
|
||||
func (s *Store) Get(hash string) ([]byte, error) {
|
||||
func (s *diskStore) Get(hash string) ([]byte, error) {
|
||||
data, err := os.ReadFile(s.path(hash))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("blobstore: get %q: %w", hash, err)
|
||||
@@ -75,7 +92,7 @@ func (s *Store) Get(hash string) ([]byte, error) {
|
||||
}
|
||||
|
||||
// Has reports whether an object with the given hash exists.
|
||||
func (s *Store) Has(hash string) bool {
|
||||
func (s *diskStore) Has(hash string) bool {
|
||||
_, err := os.Stat(s.path(hash))
|
||||
return err == nil
|
||||
}
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
package blobstore
|
||||
|
||||
// objectStore is the NATS Object Store implementation of Store (issue 0003d):
|
||||
// media ciphertext lives in a JetStream Object Store bucket replicated across
|
||||
// the cluster, so a blob uploaded to one node is durable against the loss of a
|
||||
// node and readable from any node. It is selected when decentralization is on;
|
||||
// diskStore stays the single-node default. The content-addressing (sha256-hex)
|
||||
// is identical to the disk backend, so the wire contract does not change.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// Defaults applied by NewObjectStore when the matching ObjectStoreConfig field
// is left at its zero value.
const (
	// defaultObjectBucket is the bucket name used when none is configured.
	defaultObjectBucket = "UNIBUS_blobs"
	// defaultObjOpTime bounds a single object operation when no timeout is
	// configured.
	defaultObjOpTime = 10 * time.Second
)
|
||||
|
||||
// ObjectStoreConfig holds the settings for the NATS Object Store backend
// created by NewObjectStore. Zero values select the defaults noted per field.
type ObjectStoreConfig struct {
	// Bucket names the object store bucket. Empty selects UNIBUS_blobs.
	Bucket string
	// Replicas is the bucket replication factor (R1..R5), intended to follow
	// the KV store's R1->R3 rollout. Values <= 0 are treated as 1.
	Replicas int
	// OpTimeout is the deadline applied to each Put/Get/Has operation. Values
	// <= 0 select defaultObjOpTime.
	OpTimeout time.Duration
}
|
||||
|
||||
type objectStore struct {
|
||||
os jetstream.ObjectStore
|
||||
opTimeout time.Duration
|
||||
}
|
||||
|
||||
// NewObjectStore creates (or opens) the replicated Object Store bucket on js and
|
||||
// returns it as a Store. The JetStream context belongs to the caller.
|
||||
func NewObjectStore(js jetstream.JetStream, cfg ObjectStoreConfig) (Store, error) {
|
||||
if cfg.Bucket == "" {
|
||||
cfg.Bucket = defaultObjectBucket
|
||||
}
|
||||
if cfg.Replicas <= 0 {
|
||||
cfg.Replicas = 1
|
||||
}
|
||||
opTimeout := cfg.OpTimeout
|
||||
if opTimeout <= 0 {
|
||||
opTimeout = defaultObjOpTime
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
obj, err := js.CreateOrUpdateObjectStore(ctx, jetstream.ObjectStoreConfig{
|
||||
Bucket: cfg.Bucket,
|
||||
Replicas: cfg.Replicas,
|
||||
Storage: jetstream.FileStorage,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("blobstore: open object store %q (replicas=%d): %w", cfg.Bucket, cfg.Replicas, err)
|
||||
}
|
||||
return &objectStore{os: obj, opTimeout: opTimeout}, nil
|
||||
}
|
||||
|
||||
func (s *objectStore) ctx() (context.Context, context.CancelFunc) {
|
||||
return context.WithTimeout(context.Background(), s.opTimeout)
|
||||
}
|
||||
|
||||
// Put stores data under its sha256-hex address. Re-putting identical bytes is a
|
||||
// harmless overwrite (same address, same content), preserving the idempotent,
|
||||
// deduplicating semantics of the disk backend.
|
||||
func (s *objectStore) Put(data []byte) (string, error) {
|
||||
sum := sha256.Sum256(data)
|
||||
hash := hex.EncodeToString(sum[:])
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if _, err := s.os.PutBytes(ctx, hash, data); err != nil {
|
||||
return "", fmt.Errorf("blobstore: put object %q: %w", hash, err)
|
||||
}
|
||||
return hash, nil
|
||||
}
|
||||
|
||||
// Get fetches the object by its hash address.
|
||||
func (s *objectStore) Get(hash string) ([]byte, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
data, err := s.os.GetBytes(ctx, hash)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("blobstore: get object %q: %w", hash, err)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Has reports whether an object with the given hash exists.
|
||||
func (s *objectStore) Has(hash string) bool {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
_, err := s.os.GetInfo(ctx, hash)
|
||||
return err == nil
|
||||
}
|
||||
@@ -0,0 +1,132 @@
|
||||
package blobstore_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// objFreePort asks the OS for an unused loopback TCP port by listening on
// port 0, records the port that was assigned, and releases the listener before
// returning. The test fails immediately if the listener cannot be opened.
func objFreePort(t *testing.T) int {
	t.Helper()
	listener, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	defer listener.Close()

	tcpAddr := listener.Addr().(*net.TCPAddr)
	return tcpAddr.Port
}
|
||||
|
||||
// newObjectStore boots a single-node embedded NATS with JetStream and returns a
|
||||
// replicated (R1) Object Store backend over it.
|
||||
func newObjectStore(t *testing.T) blobstore.Store {
|
||||
t.Helper()
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: objFreePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("embedded nats: %v", err)
|
||||
}
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
t.Fatalf("nats connect: %v", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
st, err := blobstore.NewObjectStore(js, blobstore.ObjectStoreConfig{Replicas: 1, OpTimeout: 5 * time.Second})
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("new object store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { nc.Close(); ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return st
|
||||
}
|
||||
|
||||
// TestObjectStoreRoundTrip is the golden path: put ciphertext, get it back by
|
||||
// its hash, Has reports presence, and re-putting identical bytes returns the
|
||||
// same address (content-addressed dedup).
|
||||
func TestObjectStoreRoundTrip(t *testing.T) {
|
||||
s := newObjectStore(t)
|
||||
data := []byte("encrypted-media-ciphertext-payload")
|
||||
|
||||
hash, err := s.Put(data)
|
||||
if err != nil {
|
||||
t.Fatalf("Put: %v", err)
|
||||
}
|
||||
want := hex.EncodeToString(sha256Sum(data))
|
||||
if hash != want {
|
||||
t.Fatalf("hash = %q, want sha256 hex %q", hash, want)
|
||||
}
|
||||
got, err := s.Get(hash)
|
||||
if err != nil {
|
||||
t.Fatalf("Get: %v", err)
|
||||
}
|
||||
if !bytes.Equal(got, data) {
|
||||
t.Fatalf("Get returned %q, want %q", got, data)
|
||||
}
|
||||
if !s.Has(hash) {
|
||||
t.Fatalf("Has should be true for a stored blob")
|
||||
}
|
||||
// Re-put identical bytes: same address, no error.
|
||||
hash2, err := s.Put(data)
|
||||
if err != nil || hash2 != hash {
|
||||
t.Fatalf("re-Put: hash2=%q err=%v (want %q)", hash2, err, hash)
|
||||
}
|
||||
}
|
||||
|
||||
// TestObjectStoreMissing is the edge/error path: a hash that was never stored
|
||||
// is absent and unreadable.
|
||||
func TestObjectStoreMissing(t *testing.T) {
|
||||
s := newObjectStore(t)
|
||||
missing := hex.EncodeToString(sha256Sum([]byte("never stored")))
|
||||
if s.Has(missing) {
|
||||
t.Fatalf("Has should be false for an unknown hash")
|
||||
}
|
||||
if _, err := s.Get(missing); err == nil {
|
||||
t.Fatalf("Get of an unknown hash should error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestObjectStoreAddressMatchesDisk is the contract test: the Object Store and
|
||||
// the disk backend address identical bytes to the IDENTICAL hash, so a client
|
||||
// cannot tell which backend a node uses and a blob ref is portable across them.
|
||||
func TestObjectStoreAddressMatchesDisk(t *testing.T) {
|
||||
obj := newObjectStore(t)
|
||||
disk, err := blobstore.New(t.TempDir())
|
||||
if err != nil {
|
||||
t.Fatalf("disk store: %v", err)
|
||||
}
|
||||
for _, payload := range [][]byte{[]byte("a"), []byte("longer ciphertext blob \x00\x01\x02"), {}} {
|
||||
oh, err := obj.Put(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("object Put: %v", err)
|
||||
}
|
||||
dh, err := disk.Put(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("disk Put: %v", err)
|
||||
}
|
||||
if oh != dh {
|
||||
t.Fatalf("address mismatch for %q: object=%q disk=%q", payload, oh, dh)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sha256Sum returns the 32-byte SHA-256 digest of b as a slice, which is more
// convenient for the tests than the fixed-size array sha256.Sum256 returns.
func sha256Sum(b []byte) []byte {
	digest := sha256.Sum256(b)
	out := digest[:]
	return out
}
|
||||
@@ -27,31 +27,88 @@ func NewNkeyAuthenticator(isAuthorized func(signPubHex string) bool) server.Auth
|
||||
|
||||
// Check verifies the client's nkey signature against the nonce the server
|
||||
// presented, then maps the nkey to its allowlist key and checks authorization.
|
||||
// Any malformed input or failed verification yields false (fail closed). The
|
||||
// signature decoding mirrors nats-server's own (raw-url base64, then std base64
|
||||
// fallback) so genuine clients using nats.Nkey are accepted unchanged.
|
||||
// Any malformed input or failed verification yields false (fail closed).
|
||||
func (a *nkeyAuthenticator) Check(c server.ClientAuthentication) bool {
|
||||
signPubHex, ok := verifyNkey(c)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return a.isAuthorized(signPubHex)
|
||||
}
|
||||
|
||||
// verifyNkey performs the shared nkey verification: it checks the client's
|
||||
// signature against the server-presented nonce and returns the lowercase-hex
|
||||
// Ed25519 public key behind the nkey. ok is false on any malformed input or
|
||||
// failed verification (fail closed). The signature decoding mirrors
|
||||
// nats-server's own (raw-url base64, then std base64 fallback) so genuine
|
||||
// clients using nats.Nkey are accepted unchanged.
|
||||
func verifyNkey(c server.ClientAuthentication) (signPubHex string, ok bool) {
|
||||
opts := c.GetOpts()
|
||||
if opts.Nkey == "" {
|
||||
return false
|
||||
return "", false
|
||||
}
|
||||
sig, err := base64.RawURLEncoding.DecodeString(opts.Sig)
|
||||
if err != nil {
|
||||
sig, err = base64.StdEncoding.DecodeString(opts.Sig)
|
||||
if err != nil {
|
||||
return false
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
pub, err := nkeys.FromPublicKey(opts.Nkey)
|
||||
if err != nil {
|
||||
return false
|
||||
return "", false
|
||||
}
|
||||
if err := pub.Verify(c.GetNonce(), sig); err != nil {
|
||||
return false
|
||||
return "", false
|
||||
}
|
||||
signPubHex, err := SignPubHexFromNkey(opts.Nkey)
|
||||
signPubHex, err = SignPubHexFromNkey(opts.Nkey)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
return signPubHex, true
|
||||
}
|
||||
|
||||
// PermissionsFunc derives the NATS permissions to grant a connecting identity.
// signPubHex is the identity's signing key as returned by verifyNkey. A non-nil
// error makes the ACL authenticator reject the connection (fail closed). This is
// the hook through which the data plane enforces per-subject access from room
// membership (issue 0003e, audit H4 residual).
type PermissionsFunc func(signPubHex string) (*server.Permissions, error)
|
||||
|
||||
// nkeyAuthenticatorACL is the nkey authenticator that ALSO scopes the connection
// to per-subject permissions derived from room membership. Its Check method
// registers the permissions while authenticating the connection, so a peer that
// joins a room after connecting must reconnect (client.RefreshSession) to gain
// that room's subject: the dynamic-membership reconnection model the audit
// deferred to this issue.
type nkeyAuthenticatorACL struct {
	// isAuthorized reports whether a verified signing key is on the bus
	// allowlist.
	isAuthorized func(signPubHex string) bool

	// perms derives the permissions registered for an authorized connection.
	perms PermissionsFunc
}
|
||||
|
||||
// NewNkeyAuthenticatorACL builds an authenticator that authorizes by the bus
|
||||
// allowlist AND registers per-subject permissions from perms. A registered but
|
||||
// permission-less peer can no longer subscribe to or publish on arbitrary
|
||||
// subjects: it is confined to the subjects of the rooms it belongs to (plus the
|
||||
// client infrastructure subjects perms includes). This is the per-subject ACL
|
||||
// the 0004 hardening left as a residual.
|
||||
func NewNkeyAuthenticatorACL(isAuthorized func(signPubHex string) bool, perms PermissionsFunc) server.Authentication {
|
||||
return &nkeyAuthenticatorACL{isAuthorized: isAuthorized, perms: perms}
|
||||
}
|
||||
|
||||
// Check verifies the nkey, authorizes against the allowlist, then derives and
|
||||
// registers the connection's subject permissions. A permissions-derivation
|
||||
// error denies the connection (fail closed) rather than granting open access.
|
||||
func (a *nkeyAuthenticatorACL) Check(c server.ClientAuthentication) bool {
|
||||
signPubHex, ok := verifyNkey(c)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return a.isAuthorized(signPubHex)
|
||||
if !a.isAuthorized(signPubHex) {
|
||||
return false
|
||||
}
|
||||
perms, err := a.perms(signPubHex)
|
||||
if err != nil {
|
||||
return false // fail closed: never grant open access on a derivation error
|
||||
}
|
||||
c.RegisterUser(&server.User{Permissions: perms})
|
||||
return true
|
||||
}
|
||||
|
||||
@@ -35,3 +35,41 @@ func ServerTLSConfig(certPEMPath, keyPEMPath string) (*tls.Config, error) {
|
||||
}
|
||||
return &tls.Config{Certificates: []tls.Certificate{cert}, MinVersion: tls.VersionTLS12}, nil
|
||||
}
|
||||
|
||||
// RouteTLSConfig returns the mutual-TLS configuration for the NATS CLUSTER
// route layer (issue 0003a). Routes are server-to-server links, so one config
// has to serve both ends of a handshake:
//
//   - Certificates holds this node's key pair, loaded from certPEMPath and
//     keyPEMPath, and is what the node presents to its peers.
//   - RootCAs is the pool built from caPEMPath, used to verify a node this one
//     dials out to.
//   - ClientCAs is the same pool, and ClientAuth is
//     RequireAndVerifyClientCert, so a node dialing in must present a
//     certificate that chains to that CA.
//
// The effect is that a node without a certificate signed by the bus CA cannot
// establish a route in either direction, even if it knows the cluster password.
// The CA is meant to be the same one used by the client data plane
// (deploy/tls), with a per-node certificate whose SAN covers that node's route
// address.
//
// An error is returned when the key pair cannot be loaded, the CA file cannot
// be read, or the CA file contains no usable PEM certificate.
func RouteTLSConfig(certPEMPath, keyPEMPath, caPEMPath string) (*tls.Config, error) {
	nodeCert, err := tls.LoadX509KeyPair(certPEMPath, keyPEMPath)
	if err != nil {
		return nil, fmt.Errorf("busauth: load route keypair: %w", err)
	}

	caBytes, err := os.ReadFile(caPEMPath)
	if err != nil {
		return nil, fmt.Errorf("busauth: read route CA %q: %w", caPEMPath, err)
	}
	busCA := x509.NewCertPool()
	if added := busCA.AppendCertsFromPEM(caBytes); !added {
		return nil, fmt.Errorf("busauth: route CA %q contains no valid PEM certificate", caPEMPath)
	}

	cfg := &tls.Config{MinVersion: tls.VersionTLS12}
	cfg.Certificates = []tls.Certificate{nodeCert}
	cfg.RootCAs = busCA
	cfg.ClientCAs = busCA
	cfg.ClientAuth = tls.RequireAndVerifyClientCert
	return cfg, nil
}
|
||||
|
||||
+185
-76
@@ -51,9 +51,14 @@ type Client struct {
|
||||
endpoint string
|
||||
nc *nats.Conn
|
||||
js jetstream.JetStream // durable plane for rooms with Policy.Persist
|
||||
ctrlURL string
|
||||
ctrlURLs []string // control-plane HTTP endpoints, tried in order (failover)
|
||||
http *http.Client
|
||||
|
||||
// natsServers + natsOpts are retained so RefreshSession can rebuild the
|
||||
// data-plane connection (re-triggering the server's subject-ACL evaluation).
|
||||
natsServers []string
|
||||
natsOpts []nats.Option
|
||||
|
||||
mu sync.RWMutex
|
||||
keyCache map[string]map[int][]byte // roomID -> epoch -> K
|
||||
signCache map[string][]byte // sender endpoint -> sign pub (for verification)
|
||||
@@ -77,6 +82,33 @@ type Options struct {
|
||||
// secured independently (a test may TLS one and not the other); production
|
||||
// sets both to the same CA via Connect. Nil keeps the control plane plaintext.
|
||||
CtrlTLS *tls.Config
|
||||
// NatsServers are ADDITIONAL NATS seed URLs for cluster failover (issue
|
||||
// 0003e), beyond the primary natsURL passed to the constructor. With more
|
||||
// than one server nats.go reconnects to a surviving node automatically when
|
||||
// the one a client is attached to dies, so a node loss is transparent.
|
||||
NatsServers []string
|
||||
// CtrlURLs are ADDITIONAL control-plane HTTP endpoints (one per node) beyond
|
||||
// the primary ctrlURL. Each request is tried against them in order until one
|
||||
// answers, so the control plane survives a node loss too. With the
|
||||
// decentralized KV store every node serves the same state, so any of them
|
||||
// can answer any request.
|
||||
CtrlURLs []string
|
||||
}
|
||||
|
||||
// dedupNonEmpty filters in down to its distinct, non-empty strings, keeping the
// first occurrence of each and the original relative order. It is used to turn
// a primary URL plus optional extras into a seed list without blank or repeated
// entries. The result is nil when nothing survives the filter.
func dedupNonEmpty(in []string) []string {
	var unique []string
	visited := make(map[string]bool)
	for i := range in {
		candidate := in[i]
		switch {
		case candidate == "":
			// Blank entries carry no endpoint.
		case visited[candidate]:
			// Already emitted earlier in the list.
		default:
			visited[candidate] = true
			unique = append(unique, candidate)
		}
	}
	return unique
}
|
||||
|
||||
// New connects to NATS and records the control-plane URL with default Options
|
||||
@@ -116,7 +148,20 @@ func Connect(natsURL, ctrlURL string, id cs.Identity, caPath string) (*Client, e
|
||||
// so every peer (worker, chat, mobile, gateway) gets identical behavior by
|
||||
// passing the same Options.
|
||||
func NewWithOptions(natsURL, ctrlURL string, id cs.Identity, opts Options) (*Client, error) {
|
||||
natsOpts := []nats.Option{nats.Name("unibus-client")}
|
||||
// Seed list = primary + extras. With more than one seed, nats.go fails over
|
||||
// to a surviving node on disconnect; MaxReconnects(-1) keeps it retrying
|
||||
// indefinitely so a node coming back is rejoined rather than given up on.
|
||||
natsServers := dedupNonEmpty(append([]string{natsURL}, opts.NatsServers...))
|
||||
natsOpts := []nats.Option{
|
||||
nats.Name("unibus-client"),
|
||||
nats.MaxReconnects(-1),
|
||||
nats.ReconnectWait(250 * time.Millisecond),
|
||||
}
|
||||
if len(natsServers) > 1 {
|
||||
// Try every seed on the initial connect too, so startup tolerates one
|
||||
// seed being down.
|
||||
natsOpts = append(natsOpts, nats.RetryOnFailedConnect(true))
|
||||
}
|
||||
if opts.UseNkey {
|
||||
nkeyPub, nkeySign, err := busauth.ClientNkey(id.SignPriv)
|
||||
if err != nil {
|
||||
@@ -127,9 +172,9 @@ func NewWithOptions(natsURL, ctrlURL string, id cs.Identity, opts Options) (*Cli
|
||||
if opts.TLS != nil {
|
||||
natsOpts = append(natsOpts, nats.Secure(opts.TLS))
|
||||
}
|
||||
nc, err := nats.Connect(natsURL, natsOpts...)
|
||||
nc, err := nats.Connect(strings.Join(natsServers, ","), natsOpts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client: connect nats %q: %w", natsURL, err)
|
||||
return nil, fmt.Errorf("client: connect nats %v: %w", natsServers, err)
|
||||
}
|
||||
// JetStream context for the durable plane. Obtaining it does not require any
|
||||
// stream to exist yet and has no effect on cleartext/ephemeral rooms — those
|
||||
@@ -147,17 +192,50 @@ func NewWithOptions(natsURL, ctrlURL string, id cs.Identity, opts Options) (*Cli
|
||||
httpClient.Transport = &http.Transport{TLSClientConfig: opts.CtrlTLS.Clone()}
|
||||
}
|
||||
return &Client{
|
||||
id: id,
|
||||
endpoint: frame.EndpointID(id.SignPub),
|
||||
nc: nc,
|
||||
js: js,
|
||||
ctrlURL: ctrlURL,
|
||||
http: httpClient,
|
||||
keyCache: map[string]map[int][]byte{},
|
||||
signCache: map[string][]byte{},
|
||||
id: id,
|
||||
endpoint: frame.EndpointID(id.SignPub),
|
||||
nc: nc,
|
||||
js: js,
|
||||
ctrlURLs: dedupNonEmpty(append([]string{ctrlURL}, opts.CtrlURLs...)),
|
||||
http: httpClient,
|
||||
natsServers: natsServers,
|
||||
natsOpts: natsOpts,
|
||||
keyCache: map[string]map[int][]byte{},
|
||||
signCache: map[string][]byte{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// RefreshSession rebuilds the data-plane NATS connection so the server's
|
||||
// subject-ACL authenticator re-evaluates this peer's room membership (issue
|
||||
// 0003e, audit H4 residual). Call it after a membership change — a room you
|
||||
// created, were invited to, or joined — when the bus enforces per-subject
|
||||
// permissions, so the new room's subject becomes publishable and subscribable
|
||||
// (NATS freezes permissions at connect time, so the prior connection cannot see
|
||||
// the new room).
|
||||
//
|
||||
// It opens a fresh connection with the same seeds/options and swaps it in.
|
||||
// IMPORTANT: active subscriptions from the previous connection are dropped —
|
||||
// re-subscribe (client.Subscribe) to your rooms after calling this. The key and
|
||||
// signer caches are preserved. On a non-ACL bus this is a no-op-safe reconnect.
|
||||
func (c *Client) RefreshSession() error {
|
||||
nc, err := nats.Connect(strings.Join(c.natsServers, ","), c.natsOpts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("client: refresh session: reconnect nats: %w", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return fmt.Errorf("client: refresh session: init jetstream: %w", err)
|
||||
}
|
||||
old := c.nc
|
||||
c.mu.Lock()
|
||||
c.nc = nc
|
||||
c.js = js
|
||||
c.mu.Unlock()
|
||||
old.Close()
|
||||
return nil
|
||||
}
|
||||
|
||||
// Endpoint returns this client's public identity.
|
||||
func (c *Client) Endpoint() Endpoint {
|
||||
return Endpoint{ID: c.endpoint, SignPub: c.id.SignPub, KexPub: c.id.KexPub}
|
||||
@@ -169,6 +247,15 @@ func (c *Client) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ConnectedServer returns the URL of the NATS node this client is currently
|
||||
// attached to (empty when disconnected). It is observability for cluster
|
||||
// failover: after a node dies, this reports the surviving node nats.go
|
||||
// reconnected to. IsConnected reports whether the data-plane link is up.
|
||||
func (c *Client) ConnectedServer() string { return c.nc.ConnectedUrl() }
|
||||
|
||||
// IsConnected reports whether the NATS data-plane connection is currently up.
|
||||
func (c *Client) IsConnected() bool { return c.nc.IsConnected() }
|
||||
|
||||
// ---- key cache ------------------------------------------------------------
|
||||
|
||||
func (c *Client) cacheKey(roomID string, epoch int, k []byte) {
|
||||
@@ -203,36 +290,45 @@ func (c *Client) doJSON(method, path string, body, out any) error {
|
||||
}
|
||||
bodyBytes = b
|
||||
}
|
||||
req, err := c.newSignedRequest(method, path, bodyBytes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("client: do %s %s: %w", method, path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
// Surface the server's structured {"error": "..."} message when present,
|
||||
// instead of leaking the raw HTTP envelope (method, path, status, JSON body).
|
||||
var er struct {
|
||||
Error string `json:"error"`
|
||||
// Try each control-plane endpoint in order. A transport error (a dead node)
|
||||
// falls over to the next; an HTTP response (any status) is authoritative and
|
||||
// returned, since every node serves the same state. Each attempt is freshly
|
||||
// signed (new nonce), so a failed-over retry is never seen as a replay.
|
||||
var lastErr error
|
||||
for _, base := range c.ctrlURLs {
|
||||
req, err := c.newSignedRequestTo(base, method, path, bodyBytes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if json.Unmarshal(respBody, &er) == nil && er.Error != "" {
|
||||
return fmt.Errorf("%s (HTTP %d)", er.Error, resp.StatusCode)
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
return fmt.Errorf("client: %s %s -> %d: %s", method, path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
if out != nil {
|
||||
if err := json.Unmarshal(respBody, out); err != nil {
|
||||
return fmt.Errorf("client: decode response: %w", err)
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue // dead node: try the next control plane
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
// Surface the server's structured {"error": "..."} message when present,
|
||||
// instead of leaking the raw HTTP envelope (method, path, status, body).
|
||||
var er struct {
|
||||
Error string `json:"error"`
|
||||
}
|
||||
if json.Unmarshal(respBody, &er) == nil && er.Error != "" {
|
||||
return fmt.Errorf("%s (HTTP %d)", er.Error, resp.StatusCode)
|
||||
}
|
||||
return fmt.Errorf("client: %s %s -> %d: %s", method, path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
if out != nil {
|
||||
if err := json.Unmarshal(respBody, out); err != nil {
|
||||
return fmt.Errorf("client: decode response: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
return fmt.Errorf("client: %s %s: all control planes failed: %w", method, path, lastErr)
|
||||
}
|
||||
|
||||
// signRequest signs the canonical bytes of req (req must already have its Sig
|
||||
@@ -246,22 +342,25 @@ func (c *Client) signRequest(req any) []byte {
|
||||
return cs.SignEd25519(c.id.SignPriv, b)
|
||||
}
|
||||
|
||||
// newSignedRequest builds an *http.Request to the control plane and attaches the
|
||||
// transport authentication headers (X-Unibus-Pub/Ts/Nonce/Sig) signing the
|
||||
// canonical request bytes with this peer's Ed25519 key. path is the request URI
|
||||
// (path plus any query); body is the raw request body (nil for GET). The server
|
||||
// (membership.authenticate) verifies these headers under the bus-auth flag.
|
||||
// newSignedRequestTo builds an *http.Request to the control-plane endpoint
|
||||
// `base` and attaches the transport authentication headers
|
||||
// (X-Unibus-Pub/Ts/Nonce/Sig) signing the canonical request bytes with this
|
||||
// peer's Ed25519 key. path is the request URI (path plus any query); body is the
|
||||
// raw request body (nil for GET). The server (membership.authenticate) verifies
|
||||
// these headers under the bus-auth flag. The signature covers method+path+ts+
|
||||
// nonce+sha256(body), NOT the host, so the same request can be addressed to any
|
||||
// node — and each failover attempt mints a fresh nonce so it is never a replay.
|
||||
//
|
||||
// Signing happens on every request — including GETs — so that under enforce the
|
||||
// server can authenticate the caller and reject unregistered or revoked
|
||||
// identities uniformly. The canonical construction is the single source of truth
|
||||
// in membership.CanonicalRequest, shared by both sides.
|
||||
func (c *Client) newSignedRequest(method, path string, body []byte) (*http.Request, error) {
|
||||
func (c *Client) newSignedRequestTo(base, method, path string, body []byte) (*http.Request, error) {
|
||||
var rdr io.Reader
|
||||
if body != nil {
|
||||
rdr = bytes.NewReader(body)
|
||||
}
|
||||
req, err := http.NewRequest(method, c.ctrlURL+path, rdr)
|
||||
req, err := http.NewRequest(method, base+path, rdr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client: new request: %w", err)
|
||||
}
|
||||
@@ -887,40 +986,50 @@ func (c *Client) FetchMedia(roomID string, f frame.Frame) ([]byte, error) {
|
||||
}
|
||||
|
||||
func (c *Client) putBlob(ciphertext []byte) (string, error) {
|
||||
req, err := c.newSignedRequest("POST", "/blobs", ciphertext)
|
||||
if err != nil {
|
||||
return "", err
|
||||
var lastErr error
|
||||
for _, base := range c.ctrlURLs {
|
||||
req, err := c.newSignedRequestTo(base, "POST", "/blobs", ciphertext)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/octet-stream")
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue // dead node: try the next control plane
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
return "", fmt.Errorf("client: put blob -> %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
var r blobResp
|
||||
if err := json.Unmarshal(body, &r); err != nil {
|
||||
return "", fmt.Errorf("client: decode blob resp: %w", err)
|
||||
}
|
||||
return r.Hash, nil
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/octet-stream")
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("client: put blob: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
return "", fmt.Errorf("client: put blob -> %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
var r blobResp
|
||||
if err := json.Unmarshal(body, &r); err != nil {
|
||||
return "", fmt.Errorf("client: decode blob resp: %w", err)
|
||||
}
|
||||
return r.Hash, nil
|
||||
return "", fmt.Errorf("client: put blob: all control planes failed: %w", lastErr)
|
||||
}
|
||||
|
||||
func (c *Client) getBlob(hash string) ([]byte, error) {
|
||||
req, err := c.newSignedRequest("GET", "/blobs/"+hash, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
var lastErr error
|
||||
for _, base := range c.ctrlURLs {
|
||||
req, err := c.newSignedRequestTo(base, "GET", "/blobs/"+hash, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue // dead node: try the next control plane
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("client: get blob -> %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
return io.ReadAll(resp.Body)
|
||||
}
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client: get blob: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("client: get blob -> %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
return io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("client: get blob: all control planes failed: %w", lastErr)
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ type testHarness struct {
|
||||
ctrlURL string
|
||||
ns *server.Server
|
||||
httpts *httptest.Server
|
||||
store *membership.Store
|
||||
store membership.Store
|
||||
srv *membership.Server
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,185 @@
|
||||
package client_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// startClusterNode boots a clustered embedded NATS node (auth off, no route TLS:
|
||||
// this test exercises client failover, not route security — that is covered in
|
||||
// pkg/embeddednats).
|
||||
func startClusterNode(t *testing.T, name string, clientPort, routePort int, peerRoutePorts []int) *server.Server {
|
||||
t.Helper()
|
||||
routes := make([]string, 0, len(peerRoutePorts))
|
||||
for _, p := range peerRoutePorts {
|
||||
routes = append(routes, fmt.Sprintf("nats://127.0.0.1:%d", p))
|
||||
}
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: clientPort,
|
||||
ServerName: name,
|
||||
Cluster: &embeddednats.ClusterConfig{Name: "unibus-failover", Host: "127.0.0.1", Port: routePort, Routes: routes},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start node %s: %v", name, err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return ns
|
||||
}
|
||||
|
||||
func waitClusterRoutes(t *testing.T, ns *server.Server) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(8 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if ns.NumRoutes() >= 1 {
|
||||
return
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("node %q never formed a route", ns.Name())
|
||||
}
|
||||
|
||||
// portOf extracts the port of a nats URL for matching ConnectedServer() (which
// may report a different host spelling than ClientURL()). It returns the text
// after the last ':' when that text is a non-empty run of decimal digits, and ""
// otherwise. So an empty URL, or one without an explicit port such as
// "nats://host", yields "" rather than a fragment of the URL.
func portOf(natsURL string) string {
	i := strings.LastIndex(natsURL, ":")
	if i < 0 {
		return ""
	}
	port := natsURL[i+1:]
	if port == "" {
		return ""
	}
	// Without this check the scheme separator would be mistaken for the port
	// separator and "//host" would be returned as a port.
	for _, r := range port {
		if r < '0' || r > '9' {
			return ""
		}
	}
	return port
}
|
||||
|
||||
// TestClientFailoverAcrossNodes covers the issue's edge case: a client attached
// to one node of a two-node cluster keeps its session when that node is killed.
// The test starts two clustered nodes and one control plane, connects clients A
// and B, has A subscribe to a room, checks delivery before the kill, shuts down
// whichever node A is attached to, waits for A to show up on the other node,
// and finally checks that a message published by B still reaches A's original
// subscription.
func TestClientFailoverAcrossNodes(t *testing.T) {
	// Route ports first, then client ports; each node is told the other's
	// route port.
	rp0, rp1 := freePort(t), freePort(t)
	p0, p1 := freePort(t), freePort(t)
	n0 := startClusterNode(t, "n0", p0, rp0, []int{rp1})
	n1 := startClusterNode(t, "n1", p1, rp1, []int{rp0})
	waitClusterRoutes(t, n0)
	waitClusterRoutes(t, n1)
	// Nodes indexed by client port (as a string) so the node A is attached to
	// can be looked up from ConnectedServer().
	nodes := map[string]*server.Server{strconv.Itoa(p0): n0, strconv.Itoa(p1): n1}

	// Control plane: one in-process membershipd (metadata only; the data plane is
	// the NATS cluster). Auth off keeps the test focused on data-plane failover.
	dir := t.TempDir()
	store, err := membership.Open(filepath.Join(dir, "unibus.db"))
	if err != nil {
		t.Fatalf("store: %v", err)
	}
	t.Cleanup(func() { store.Close() })
	blobs, err := blobstore.New(filepath.Join(dir, "blobs"))
	if err != nil {
		t.Fatalf("blobs: %v", err)
	}
	ctrl := httptest.NewServer(membership.NewServer(store, blobs, membership.AuthOff))
	t.Cleanup(ctrl.Close)

	url0 := n0.ClientURL()
	url1 := n1.ClientURL()

	// Both clients seed BOTH nodes: A lists n0 as primary with n1 as the extra
	// seed, B lists n1 as primary with n0 as the extra seed.
	a, err := client.NewWithOptions(url0, ctrl.URL, mustIdentity(t), client.Options{NatsServers: []string{url1}})
	if err != nil {
		t.Fatalf("connect A: %v", err)
	}
	defer a.Close()
	b, err := client.NewWithOptions(url1, ctrl.URL, mustIdentity(t), client.Options{NatsServers: []string{url0}})
	if err != nil {
		t.Fatalf("connect B: %v", err)
	}
	defer b.Close()

	roomID, err := a.CreateRoom("room.failover", room.ModeNATS)
	if err != nil {
		t.Fatalf("A create room: %v", err)
	}

	// got collects every plaintext A receives; mu guards it because the
	// subscription callback appends while the test reads.
	var mu sync.Mutex
	var got []string
	sub, err := a.Subscribe(roomID, func(_ frame.Frame, plaintext []byte) {
		mu.Lock()
		got = append(got, string(plaintext))
		mu.Unlock()
	})
	if err != nil {
		t.Fatalf("A subscribe: %v", err)
	}
	defer sub.Unsubscribe()
	// Presumably gives the subscription time to propagate across the cluster
	// before the first publish. TODO confirm.
	time.Sleep(200 * time.Millisecond)

	// Pre-kill sanity: B publishes, A receives across the cluster.
	if err := b.Publish(roomID, []byte("before-kill")); err != nil {
		t.Fatalf("B publish 1: %v", err)
	}
	if !waitFor(&mu, &got, func(rs []string) bool { return contains(rs, "before-kill") }, 3*time.Second) {
		t.Fatalf("A did not receive the pre-kill message; got %v", snapshot(&mu, &got))
	}

	// Identify and KILL the node A is attached to, forcing a reconnect.
	attached := a.ConnectedServer()
	killPort := portOf(attached)
	victim, ok := nodes[killPort]
	if !ok {
		t.Fatalf("A is attached to an unknown node %q (port %q)", attached, killPort)
	}
	// The survivor is whichever node is not being killed.
	survivorURL := url1
	if killPort == strconv.Itoa(p1) {
		survivorURL = url0
	}
	victim.Shutdown()
	victim.WaitForShutdown()

	// A must reconnect to the surviving node. Poll for up to 8s, then assert.
	deadline := time.Now().Add(8 * time.Second)
	for time.Now().Before(deadline) {
		if a.IsConnected() && portOf(a.ConnectedServer()) == portOf(survivorURL) {
			break
		}
		time.Sleep(100 * time.Millisecond)
	}
	if !a.IsConnected() || portOf(a.ConnectedServer()) != portOf(survivorURL) {
		t.Fatalf("A did not fail over to the surviving node (now on %q, want port %s)", a.ConnectedServer(), portOf(survivorURL))
	}

	// Make B publish from the surviving node and confirm A still receives —
	// the session (its subscription) survived the failover.
	if survivorURL == url0 {
		// B's primary was n1 (killed); give B up to 8s to land on the survivor
		// too. This wait is best effort: nothing is asserted if it times out.
		deadline := time.Now().Add(8 * time.Second)
		for time.Now().Before(deadline) && portOf(b.ConnectedServer()) != portOf(survivorURL) {
			time.Sleep(100 * time.Millisecond)
		}
	}
	if err := b.Publish(roomID, []byte("after-kill")); err != nil {
		t.Fatalf("B publish 2: %v", err)
	}
	if !waitFor(&mu, &got, func(rs []string) bool { return contains(rs, "after-kill") }, 6*time.Second) {
		t.Fatalf("A did not receive a message after failover; got %v", snapshot(&mu, &got))
	}
}
|
||||
|
||||
// contains reports whether want is equal to at least one element of rs.
func contains(rs []string, want string) bool {
	for i := 0; i < len(rs); i++ {
		if rs[i] == want {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,344 @@
|
||||
package embeddednats_test
|
||||
|
||||
import (
|
||||
"crypto/ecdsa"
|
||||
"crypto/elliptic"
|
||||
"crypto/rand"
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/pem"
|
||||
"fmt"
|
||||
"math/big"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// freePort asks the OS for an unused TCP port on loopback by listening on
// 127.0.0.1:0, records the port that was assigned, and closes the listener
// again before returning it. The test fails if the listener cannot be opened.
func freePort(t *testing.T) int {
	t.Helper()
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	port := ln.Addr().(*net.TCPAddr).Port
	ln.Close()
	return port
}
|
||||
|
||||
// startNode boots a clustered embedded NATS node. peerRoutePorts are the route
|
||||
// ports of the OTHER nodes; user/pass gate the route layer (empty disables it);
|
||||
// routeTLS, when non-nil, secures the routes with mutual TLS.
|
||||
func startNode(t *testing.T, name string, clientPort, routePort int, peerRoutePorts []int, user, pass string, routeTLS *clusterTLS) *server.Server {
|
||||
t.Helper()
|
||||
routes := make([]string, 0, len(peerRoutePorts))
|
||||
for _, p := range peerRoutePorts {
|
||||
// Carry the cluster credentials in the route URL so this node
|
||||
// authenticates outbound to its peers' route listeners.
|
||||
if user != "" {
|
||||
routes = append(routes, fmt.Sprintf("nats://%s:%s@127.0.0.1:%d", user, pass, p))
|
||||
} else {
|
||||
routes = append(routes, fmt.Sprintf("nats://127.0.0.1:%d", p))
|
||||
}
|
||||
}
|
||||
cc := &embeddednats.ClusterConfig{
|
||||
Name: "unibus-test",
|
||||
Host: "127.0.0.1",
|
||||
Port: routePort,
|
||||
Routes: routes,
|
||||
Username: user,
|
||||
Password: pass,
|
||||
}
|
||||
if routeTLS != nil {
|
||||
cfg, err := busauth.RouteTLSConfig(routeTLS.cert, routeTLS.key, routeTLS.ca)
|
||||
if err != nil {
|
||||
t.Fatalf("route TLS for %s: %v", name, err)
|
||||
}
|
||||
cc.TLS = cfg
|
||||
}
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: clientPort,
|
||||
ServerName: name,
|
||||
Cluster: cc,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start node %s: %v", name, err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return ns
|
||||
}
|
||||
|
||||
// waitRoutes waits until ns has at least want established routes, or fails.
|
||||
func waitRoutes(t *testing.T, ns *server.Server, want int) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(8 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if ns.NumRoutes() >= want {
|
||||
return
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("node %q never reached %d routes (have %d)", ns.Name(), want, ns.NumRoutes())
|
||||
}
|
||||
|
||||
// stableRouteCount waits for ns's route count to stop changing (the NATS route
|
||||
// pool opens several connections per peer asynchronously) and returns it, so a
|
||||
// test can use it as a baseline that an impostor must not increase.
|
||||
func stableRouteCount(t *testing.T, ns *server.Server) int {
|
||||
t.Helper()
|
||||
prev := -1
|
||||
stableSince := time.Now()
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
n := ns.NumRoutes()
|
||||
if n != prev {
|
||||
prev = n
|
||||
stableSince = time.Now()
|
||||
} else if time.Since(stableSince) >= 750*time.Millisecond {
|
||||
return n
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
return prev
|
||||
}
|
||||
|
||||
// pubSubAcrossNodes connects a subscriber to subURL and a publisher to pubURL,
|
||||
// publishes one message on subject, and reports whether it arrived within 3s.
|
||||
// This proves the cluster forwards client subjects between nodes.
|
||||
func pubSubAcrossNodes(t *testing.T, subURL, pubURL, subject, payload string) bool {
|
||||
t.Helper()
|
||||
subConn, err := nats.Connect(subURL)
|
||||
if err != nil {
|
||||
t.Fatalf("subscriber connect %s: %v", subURL, err)
|
||||
}
|
||||
defer subConn.Close()
|
||||
got := make(chan string, 1)
|
||||
if _, err := subConn.Subscribe(subject, func(m *nats.Msg) {
|
||||
select {
|
||||
case got <- string(m.Data):
|
||||
default:
|
||||
}
|
||||
}); err != nil {
|
||||
t.Fatalf("subscribe: %v", err)
|
||||
}
|
||||
if err := subConn.Flush(); err != nil {
|
||||
t.Fatalf("flush sub: %v", err)
|
||||
}
|
||||
|
||||
pubConn, err := nats.Connect(pubURL)
|
||||
if err != nil {
|
||||
t.Fatalf("publisher connect %s: %v", pubURL, err)
|
||||
}
|
||||
defer pubConn.Close()
|
||||
// Retry the publish for a moment: route interest propagation across the
|
||||
// cluster is asynchronous, so the very first publish can race the gossip.
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if err := pubConn.Publish(subject, []byte(payload)); err != nil {
|
||||
t.Fatalf("publish: %v", err)
|
||||
}
|
||||
_ = pubConn.Flush()
|
||||
select {
|
||||
case v := <-got:
|
||||
return v == payload
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// --- golden: two-node cluster forwards client subjects across nodes ----------
|
||||
|
||||
func TestClusterForwardsAcrossNodes(t *testing.T) {
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
n0 := startNode(t, "n0", freePort(t), rp0, []int{rp1}, "clusteruser", "clusterpass", nil)
|
||||
n1 := startNode(t, "n1", freePort(t), rp1, []int{rp0}, "clusteruser", "clusterpass", nil)
|
||||
|
||||
waitRoutes(t, n0, 1)
|
||||
waitRoutes(t, n1, 1)
|
||||
|
||||
if !pubSubAcrossNodes(t, n0.ClientURL(), n1.ClientURL(), "test.cross", "hello-cluster") {
|
||||
t.Fatalf("subject published on n1 did not reach subscriber on n0")
|
||||
}
|
||||
}
|
||||
|
||||
// --- edge: three-node cluster (HA shape) forwards between non-adjacent nodes --
|
||||
|
||||
func TestClusterThreeNodesForward(t *testing.T) {
|
||||
rp0, rp1, rp2 := freePort(t), freePort(t), freePort(t)
|
||||
n0 := startNode(t, "n0", freePort(t), rp0, []int{rp1, rp2}, "u", "p", nil)
|
||||
n1 := startNode(t, "n1", freePort(t), rp1, []int{rp0, rp2}, "u", "p", nil)
|
||||
n2 := startNode(t, "n2", freePort(t), rp2, []int{rp0, rp1}, "u", "p", nil)
|
||||
|
||||
waitRoutes(t, n0, 2)
|
||||
waitRoutes(t, n1, 2)
|
||||
waitRoutes(t, n2, 2)
|
||||
|
||||
// Publish on n2, subscribe on n0: a message must traverse the cluster.
|
||||
if !pubSubAcrossNodes(t, n0.ClientURL(), n2.ClientURL(), "test.ha", "three-node") {
|
||||
t.Fatalf("subject published on n2 did not reach subscriber on n0")
|
||||
}
|
||||
}
|
||||
|
||||
// --- error: a node with the wrong cluster password is rejected as a route -----
|
||||
|
||||
func TestClusterRejectsBadRouteAuth(t *testing.T) {
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
good := startNode(t, "good", freePort(t), rp0, []int{rp1}, "secret", "right", nil)
|
||||
_ = startNode(t, "peer", freePort(t), rp1, []int{rp0}, "secret", "right", nil)
|
||||
waitRoutes(t, good, 1)
|
||||
// Let the route pool settle so the baseline count is stable (NATS opens a
|
||||
// pool of route connections per peer, so NumRoutes counts connections, not
|
||||
// distinct peers).
|
||||
base := stableRouteCount(t, good)
|
||||
|
||||
// Impostor knows the addresses but not the cluster password. It tries to
|
||||
// route to `good`; the route handshake must be rejected, so the impostor
|
||||
// never establishes a route.
|
||||
impostor := startNode(t, "impostor", freePort(t), freePort(t), []int{rp0}, "secret", "WRONG", nil)
|
||||
|
||||
// Give the route layer ample time to (fail to) connect, then assert it never
|
||||
// formed: the impostor has zero routes, and `good`'s route count is unchanged
|
||||
// (it did not accept a route from the impostor).
|
||||
time.Sleep(2 * time.Second)
|
||||
if n := impostor.NumRoutes(); n != 0 {
|
||||
t.Fatalf("impostor with wrong cluster password formed %d routes, want 0", n)
|
||||
}
|
||||
if n := good.NumRoutes(); n != base {
|
||||
t.Fatalf("legit node route count changed from %d to %d after impostor attempt (it accepted the impostor)", base, n)
|
||||
}
|
||||
}
|
||||
|
||||
// --- golden (TLS): mutual-TLS routes forward across nodes ---------------------
|
||||
|
||||
func TestClusterMutualTLSForwards(t *testing.T) {
|
||||
ca, caKey := genCA(t)
|
||||
dir := t.TempDir()
|
||||
tlsA := writeNodeCert(t, dir, "a", ca, caKey)
|
||||
tlsB := writeNodeCert(t, dir, "b", ca, caKey)
|
||||
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
n0 := startNode(t, "n0", freePort(t), rp0, []int{rp1}, "u", "p", tlsA)
|
||||
n1 := startNode(t, "n1", freePort(t), rp1, []int{rp0}, "u", "p", tlsB)
|
||||
|
||||
waitRoutes(t, n0, 1)
|
||||
waitRoutes(t, n1, 1)
|
||||
|
||||
if !pubSubAcrossNodes(t, n0.ClientURL(), n1.ClientURL(), "test.tls", "mtls-ok") {
|
||||
t.Fatalf("subject did not cross the mutual-TLS cluster")
|
||||
}
|
||||
}
|
||||
|
||||
// --- error (TLS): a node whose cert is not signed by the bus CA cannot join ---
|
||||
|
||||
func TestClusterRejectsUnsignedNode(t *testing.T) {
|
||||
ca, caKey := genCA(t)
|
||||
dir := t.TempDir()
|
||||
tlsGood := writeNodeCert(t, dir, "good", ca, caKey)
|
||||
tlsPeer := writeNodeCert(t, dir, "peer", ca, caKey)
|
||||
|
||||
// The impostor signs its node cert with a DIFFERENT CA, and pins only that
|
||||
// CA. The legit nodes' RequireAndVerifyClientCert against the bus CA rejects
|
||||
// it; the impostor likewise rejects the legit node's cert. No route forms.
|
||||
otherCA, otherKey := genCA(t)
|
||||
tlsImpostor := writeNodeCert(t, dir, "impostor", otherCA, otherKey)
|
||||
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
good := startNode(t, "good", freePort(t), rp0, []int{rp1}, "u", "p", tlsGood)
|
||||
_ = startNode(t, "peer", freePort(t), rp1, []int{rp0}, "u", "p", tlsPeer)
|
||||
waitRoutes(t, good, 1)
|
||||
base := stableRouteCount(t, good)
|
||||
|
||||
impostor := startNode(t, "impostor", freePort(t), freePort(t), []int{rp0}, "u", "p", tlsImpostor)
|
||||
time.Sleep(2 * time.Second)
|
||||
if n := impostor.NumRoutes(); n != 0 {
|
||||
t.Fatalf("impostor with unsigned cert formed %d routes, want 0", n)
|
||||
}
|
||||
if n := good.NumRoutes(); n != base {
|
||||
t.Fatalf("legit node route count changed from %d to %d after unsigned impostor attempt (it accepted the impostor)", base, n)
|
||||
}
|
||||
}
|
||||
|
||||
// --- cert helpers ------------------------------------------------------------
|
||||
|
||||
// clusterTLS holds the paths of the PEM files written for one test node.
type clusterTLS struct {
	cert string // path of the node certificate
	key  string // path of the node private key
	ca   string // path of the issuing CA certificate
}
|
||||
|
||||
// genCA returns a freshly generated, self-signed ECDSA P-256 CA certificate
// together with its private key. The certificate is valid from one hour ago
// until 24 hours from now. Any failure aborts the test.
func genCA(t *testing.T) (*x509.Certificate, *ecdsa.PrivateKey) {
	t.Helper()

	signer, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
	if err != nil {
		t.Fatalf("gen CA key: %v", err)
	}

	template := x509.Certificate{
		SerialNumber:          big.NewInt(1),
		Subject:               pkix.Name{CommonName: "unibus-test-CA"},
		NotBefore:             time.Now().Add(-time.Hour),
		NotAfter:              time.Now().Add(24 * time.Hour),
		KeyUsage:              x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature,
		BasicConstraintsValid: true,
		IsCA:                  true,
	}

	// Self-signed: the template is both the subject and the issuer.
	raw, err := x509.CreateCertificate(rand.Reader, &template, &template, &signer.PublicKey, signer)
	if err != nil {
		t.Fatalf("create CA cert: %v", err)
	}

	// Re-parse so the returned certificate carries Raw and the other parsed
	// fields that signing a child certificate relies on.
	parsed, err := x509.ParseCertificate(raw)
	if err != nil {
		t.Fatalf("parse CA cert: %v", err)
	}
	return parsed, signer
}
|
||||
|
||||
// writeNodeCert issues a node certificate signed by ca (SAN 127.0.0.1/::1,
|
||||
// usable as both server and client) and writes cert/key/ca PEM files, returning
|
||||
// their paths for RouteTLSConfig.
|
||||
func writeNodeCert(t *testing.T, dir, name string, ca *x509.Certificate, caKey *ecdsa.PrivateKey) *clusterTLS {
|
||||
t.Helper()
|
||||
key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
|
||||
if err != nil {
|
||||
t.Fatalf("gen node key: %v", err)
|
||||
}
|
||||
tmpl := &x509.Certificate{
|
||||
SerialNumber: big.NewInt(time.Now().UnixNano()),
|
||||
Subject: pkix.Name{CommonName: name},
|
||||
NotBefore: time.Now().Add(-time.Hour),
|
||||
NotAfter: time.Now().Add(24 * time.Hour),
|
||||
KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment,
|
||||
ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth},
|
||||
IPAddresses: []net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")},
|
||||
DNSNames: []string{"localhost"},
|
||||
}
|
||||
der, err := x509.CreateCertificate(rand.Reader, tmpl, ca, &key.PublicKey, caKey)
|
||||
if err != nil {
|
||||
t.Fatalf("create node cert: %v", err)
|
||||
}
|
||||
certPath := filepath.Join(dir, name+".crt")
|
||||
keyPath := filepath.Join(dir, name+".key")
|
||||
caPath := filepath.Join(dir, name+"-ca.crt")
|
||||
|
||||
writePEM(t, certPath, "CERTIFICATE", der)
|
||||
keyDER, err := x509.MarshalECPrivateKey(key)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal node key: %v", err)
|
||||
}
|
||||
writePEM(t, keyPath, "EC PRIVATE KEY", keyDER)
|
||||
writePEM(t, caPath, "CERTIFICATE", ca.Raw)
|
||||
return &clusterTLS{cert: certPath, key: keyPath, ca: caPath}
|
||||
}
|
||||
|
||||
// writePEM PEM-encodes der under the given block type and writes the result to
// path with mode 0600. A write failure aborts the test.
func writePEM(t *testing.T, path, blockType string, der []byte) {
	t.Helper()

	block := pem.Block{Type: blockType, Bytes: der}
	encoded := pem.EncodeToMemory(&block)

	err := os.WriteFile(path, encoded, 0o600)
	if err != nil {
		t.Fatalf("write %s: %v", path, err)
	}
}
|
||||
@@ -8,25 +8,76 @@ package embeddednats
|
||||
import (
|
||||
"crypto/tls"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"time"
|
||||
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// ClusterConfig describes the route layer that joins several embedded NATS
// servers into one cluster (issue 0003a). In a cluster, a subject published on
// one node is forwarded to subscribers on every other node; with JetStream
// replicas > 1, streams and KV buckets are additionally RAFT-replicated so that
// losing a single node does not lose the bus.
//
// Routes carry server-to-server traffic and therefore form a trust boundary of
// their own: they authenticate NODES, not bus users, and must never share the
// nkey client authenticator. They are protected by a dedicated shared secret
// (Username/Password, mapped to NATS Cluster.Authorization) and by dedicated
// mutual TLS (TLS, built from the bus CA via busauth.RouteTLSConfig). A node
// lacking the cluster secret and a CA-signed node certificate can neither join
// the cluster nor inject messages into it.
type ClusterConfig struct {
	// Name is the cluster name. Every node MUST use the same value; servers
	// with differing names refuse to gossip routes to one another.
	Name string

	// Host and Port are where this node listens for route (server-to-server)
	// connections. They are separate from the client Host/Port; pick a free
	// port that clients do not use (e.g. 6250).
	Host string
	Port int

	// Routes lists the nats-route URLs of the OTHER nodes, for example
	// "nats://user:pass@10.0.0.2:6250". If the route layer is password
	// protected, each URL must embed the same userinfo as Username/Password
	// below so that this node can authenticate when dialing its peers.
	Routes []string

	// Username and Password protect the route listener (NATS
	// Cluster.Authorization). A connection to the route port that does not
	// present them is rejected and never becomes a route. Leaving them empty
	// turns route auth off (dev / trusted-network only).
	Username string
	Password string

	// TLS enables mutual TLS on route connections when non-nil. Build it with
	// busauth.RouteTLSConfig(cert, key, ca): the server presents its node
	// certificate and also requires and verifies the peer's certificate
	// against the bus CA, so a node without a CA-signed certificate cannot
	// form a route even if it knows the password. Nil leaves routes in
	// plaintext (dev / WireGuard-only).
	TLS *tls.Config
}
|
||||
|
||||
// ServerConfig is the full set of knobs for the embedded NATS server. The zero
|
||||
// value (empty StoreDir aside) yields a dev-friendly server: JetStream on, bound
|
||||
// to all interfaces, no client auth, no TLS. Secured deployments set Auth and
|
||||
// TLS; tests set Host to loopback and a free Port.
|
||||
// to all interfaces, no client auth, no TLS, standalone (no cluster). Secured
|
||||
// deployments set Auth and TLS; HA deployments set ServerName + Cluster; tests
|
||||
// set Host to loopback and a free Port.
|
||||
type ServerConfig struct {
|
||||
StoreDir string // JetStream store directory
|
||||
Host string // bind interface; "" = nats-server default ("0.0.0.0")
|
||||
Port int // listen port
|
||||
// ServerName is this node's unique name within the cluster. JetStream's RAFT
|
||||
// layer requires a stable, unique name per node to form its meta-group; leave
|
||||
// it empty for a standalone server (nats-server then auto-generates one).
|
||||
ServerName string
|
||||
// Auth, when non-nil, is installed as CustomClientAuthentication so the data
|
||||
// plane only accepts approved clients (nkey signature + bus allowlist).
|
||||
Auth server.Authentication
|
||||
// TLS, when non-nil, makes the server present a certificate and require TLS
|
||||
// on the data plane. Clients must trust the issuing CA (see busauth).
|
||||
TLS *tls.Config
|
||||
// Cluster, when non-nil, joins this server to a route cluster for high
|
||||
// availability (issue 0003a). Nil keeps the server standalone (the legacy
|
||||
// single-node behavior).
|
||||
Cluster *ClusterConfig
|
||||
}
|
||||
|
||||
// Start is a thin backward-compatible wrapper: embedded JetStream server on the
|
||||
@@ -60,6 +111,7 @@ func StartServer(cfg ServerConfig) (*server.Server, error) {
|
||||
StoreDir: cfg.StoreDir,
|
||||
Host: cfg.Host,
|
||||
Port: cfg.Port,
|
||||
ServerName: cfg.ServerName,
|
||||
DontListen: false,
|
||||
// Keep the embedded server quiet by default; the host app logs the URLs.
|
||||
NoLog: true,
|
||||
@@ -78,6 +130,12 @@ func StartServer(cfg ServerConfig) (*server.Server, error) {
|
||||
opts.TLS = true
|
||||
}
|
||||
|
||||
if cfg.Cluster != nil {
|
||||
if err := applyClusterOpts(opts, cfg.Cluster); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
ns, err := server.NewServer(opts)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("embeddednats: new server: %w", err)
|
||||
@@ -93,6 +151,34 @@ func StartServer(cfg ServerConfig) (*server.Server, error) {
|
||||
return ns, nil
|
||||
}
|
||||
|
||||
// applyClusterOpts translates a ClusterConfig into the nats-server route options
|
||||
// on opts: the cluster listener (name + host/port + shared-secret auth + mutual
|
||||
// TLS) and the outbound routes to the other nodes. A malformed route URL is a
|
||||
// configuration error and aborts startup rather than silently dropping a peer.
|
||||
func applyClusterOpts(opts *server.Options, c *ClusterConfig) error {
|
||||
opts.Cluster = server.ClusterOpts{
|
||||
Name: c.Name,
|
||||
Host: c.Host,
|
||||
Port: c.Port,
|
||||
Username: c.Username,
|
||||
Password: c.Password,
|
||||
}
|
||||
if c.TLS != nil {
|
||||
opts.Cluster.TLSConfig = c.TLS
|
||||
// A generous handshake budget: route TLS does a mutual handshake and the
|
||||
// peer may still be booting. The default 2s can flap on a cold cluster.
|
||||
opts.Cluster.TLSTimeout = 5.0
|
||||
}
|
||||
for _, r := range c.Routes {
|
||||
u, err := url.Parse(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("embeddednats: parse route %q: %w", r, err)
|
||||
}
|
||||
opts.Routes = append(opts.Routes, u)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ClientURL returns a NATS connection URL for the running embedded server.
|
||||
func ClientURL(ns *server.Server) string {
|
||||
return ns.ClientURL()
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
package membership
|
||||
|
||||
// Per-subject data-plane access control derived from room membership (issue
|
||||
// 0003e, audit H4 residual). The control plane already authorizes metadata by
|
||||
// membership; this is the matching restriction on the NATS data plane so a
|
||||
// registered peer can only publish/subscribe on the subjects of the rooms it
|
||||
// actually belongs to — not on every subject on the bus.
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
)
|
||||
|
||||
// clientInfraSubjects lists the subjects granted to every authorized peer
// independently of room membership, so that request/reply and persisted-room
// history keep working once the subject ACL is enforced.
var clientInfraSubjects = []string{
	"_INBOX.>",  // request/reply inbox space
	"$JS.API.>", // JetStream API (durable plane of persisted rooms)
}
|
||||
|
||||
// SubjectACLFor returns a function that maps a signing public key (lowercase
|
||||
// hex) to the data-plane subjects that identity may publish and subscribe to:
|
||||
// the subject of every room it belongs to, plus the client infrastructure
|
||||
// subjects. It reads the live membership store, so the permissions reflect the
|
||||
// identity's rooms at the moment it connects. A decode error or a store failure
|
||||
// is returned as an error so the caller can fail closed (deny the connection)
|
||||
// rather than grant open access.
|
||||
//
|
||||
// Because NATS freezes permissions at connect time, a peer invited to a new room
|
||||
// after connecting must reconnect (client.RefreshSession) to pick up the new
|
||||
// room's subject. The bus is the authoritative directory of subjects, so an
|
||||
// unlisted subject is simply absent from the allow set.
|
||||
func SubjectACLFor(store Store) func(signPubHex string) ([]string, error) {
|
||||
return func(signPubHex string) ([]string, error) {
|
||||
pub, err := hex.DecodeString(signPubHex)
|
||||
if err != nil || len(pub) != 32 {
|
||||
return nil, fmt.Errorf("acl: malformed sign pub %q", signPubHex)
|
||||
}
|
||||
endpoint := frame.EndpointID(pub)
|
||||
rooms, err := store.ListRoomsForEndpoint(endpoint)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("acl: list rooms for %s: %w", endpoint, err)
|
||||
}
|
||||
subjects := make([]string, 0, len(rooms)+len(clientInfraSubjects))
|
||||
subjects = append(subjects, clientInfraSubjects...)
|
||||
for _, r := range rooms {
|
||||
subjects = append(subjects, r.Subject)
|
||||
}
|
||||
return subjects, nil
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,290 @@
|
||||
package membership_test
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"net"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// aclFreePort asks the OS for an unused loopback TCP port by listening on
// 127.0.0.1:0, then closes the listener and returns the port number. The port
// is free at the moment of return but is not reserved.
func aclFreePort(t *testing.T) int {
	t.Helper()

	listener, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	defer listener.Close()

	addr := listener.Addr().(*net.TCPAddr)
	return addr.Port
}
|
||||
|
||||
func mustID(t *testing.T) cs.Identity {
|
||||
t.Helper()
|
||||
id, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
return id
|
||||
}
|
||||
|
||||
// aclPermsFunc adapts membership.SubjectACLFor into the busauth.PermissionsFunc
|
||||
// the ACL authenticator expects (same Allow set for publish and subscribe).
|
||||
func aclPermsFunc(store membership.Store) busauth.PermissionsFunc {
|
||||
derive := membership.SubjectACLFor(store)
|
||||
return func(signPubHex string) (*server.Permissions, error) {
|
||||
subs, err := derive(signPubHex)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
sp := &server.SubjectPermission{Allow: subs}
|
||||
return &server.Permissions{Publish: sp, Subscribe: sp}, nil
|
||||
}
|
||||
}
|
||||
|
||||
// startACLNats boots an embedded NATS whose authenticator confines each peer to
|
||||
// the subjects of the rooms it belongs to (audit H4 residual).
|
||||
func startACLNats(t *testing.T, store membership.Store) *server.Server {
|
||||
t.Helper()
|
||||
auth := busauth.NewNkeyAuthenticatorACL(store.IsAuthorized, aclPermsFunc(store))
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: aclFreePort(t), Auth: auth,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("acl nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return ns
|
||||
}
|
||||
|
||||
func nkeyConn(t *testing.T, natsURL string, id cs.Identity, errCh chan error) *nats.Conn {
|
||||
t.Helper()
|
||||
pub, sign, err := busauth.ClientNkey(id.SignPriv)
|
||||
if err != nil {
|
||||
t.Fatalf("nkey: %v", err)
|
||||
}
|
||||
nc, err := nats.Connect(natsURL,
|
||||
nats.Nkey(pub, sign),
|
||||
nats.ErrorHandler(func(_ *nats.Conn, _ *nats.Subscription, e error) {
|
||||
select {
|
||||
case errCh <- e:
|
||||
default:
|
||||
}
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("connect nkey: %v", err)
|
||||
}
|
||||
t.Cleanup(nc.Close)
|
||||
return nc
|
||||
}
|
||||
|
||||
func mustAddUser(t *testing.T, store membership.Store, id cs.Identity, handle string) {
|
||||
t.Helper()
|
||||
if err := store.AddUser(hex.EncodeToString(id.SignPub), handle, membership.RoleMember); err != nil {
|
||||
t.Fatalf("add user %s: %v", handle, err)
|
||||
}
|
||||
}
|
||||
|
||||
func mustCreateRoom(t *testing.T, store membership.Store, roomID, subject, ownerEP string, owner cs.Identity) {
|
||||
t.Helper()
|
||||
info := membership.RoomInfo{RoomID: roomID, Subject: subject, OwnerEndpoint: ownerEP}
|
||||
if err := store.CreateRoom(info, owner.SignPub, owner.KexPub, nil); err != nil {
|
||||
t.Fatalf("create room %s: %v", roomID, err)
|
||||
}
|
||||
}
|
||||
|
||||
func newCtrl(t *testing.T, store membership.Store, blobs blobstore.Store) string {
|
||||
t.Helper()
|
||||
ts := httptest.NewServer(membership.NewServer(store, blobs, membership.AuthOff))
|
||||
t.Cleanup(ts.Close)
|
||||
return ts.URL
|
||||
}
|
||||
|
||||
// waitErr returns the next error received from ch, or nil if nothing arrives
// within d.
func waitErr(ch chan error, d time.Duration) error {
	timeout := time.NewTimer(d)
	defer timeout.Stop()

	select {
	case err := <-ch:
		return err
	case <-timeout.C:
	}
	return nil
}
|
||||
|
||||
// drain discards every value that can be received from ch without blocking and
// returns as soon as a receive would block.
func drain(ch chan error) {
	for {
		select {
		case <-ch:
			continue
		default:
		}
		return
	}
}
|
||||
|
||||
// TestSubjectACLIsolation closes the audit H4 residual: a registered peer is
|
||||
// confined to the subjects of the rooms it belongs to. alice (member of room.A)
|
||||
// may sub/pub room.A but is DENIED sub/pub on room.B, and never reads what bob
|
||||
// (member of room.B) publishes there.
|
||||
func TestSubjectACLIsolation(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
|
||||
alice, bob := mustID(t), mustID(t)
|
||||
aliceEP, bobEP := frame.EndpointID(alice.SignPub), frame.EndpointID(bob.SignPub)
|
||||
mustAddUser(t, store, alice, "alice")
|
||||
mustAddUser(t, store, bob, "bob")
|
||||
const subjA, subjB = "room.acl.a", "room.acl.b"
|
||||
mustCreateRoom(t, store, "ROOMA", subjA, aliceEP, alice)
|
||||
mustCreateRoom(t, store, "ROOMB", subjB, bobEP, bob)
|
||||
|
||||
srv := startACLNats(t, store)
|
||||
url := srv.ClientURL()
|
||||
aliceErr := make(chan error, 4)
|
||||
bobErr := make(chan error, 4)
|
||||
aliceNC := nkeyConn(t, url, alice, aliceErr)
|
||||
bobNC := nkeyConn(t, url, bob, bobErr)
|
||||
|
||||
// alice may subscribe to her own room (no error).
|
||||
aliceGot := make(chan string, 4)
|
||||
if _, err := aliceNC.Subscribe(subjA, func(m *nats.Msg) { aliceGot <- string(m.Data) }); err != nil {
|
||||
t.Fatalf("alice sub A: %v", err)
|
||||
}
|
||||
_ = aliceNC.Flush()
|
||||
if e := waitErr(aliceErr, 300*time.Millisecond); e != nil {
|
||||
t.Fatalf("alice sub to her OWN room raised an error: %v", e)
|
||||
}
|
||||
|
||||
// alice subscribing to bob's room is a permissions violation.
|
||||
if _, err := aliceNC.Subscribe(subjB, func(m *nats.Msg) { aliceGot <- "LEAK:" + string(m.Data) }); err != nil {
|
||||
t.Fatalf("alice sub B (queue): %v", err)
|
||||
}
|
||||
_ = aliceNC.Flush()
|
||||
if e := waitErr(aliceErr, 1*time.Second); e == nil {
|
||||
t.Fatalf("alice subscribing to bob's room should raise a permissions violation")
|
||||
}
|
||||
|
||||
// bob publishes in his room; alice (denied) must not receive it.
|
||||
bobGot := make(chan string, 4)
|
||||
if _, err := bobNC.Subscribe(subjB, func(m *nats.Msg) { bobGot <- string(m.Data) }); err != nil {
|
||||
t.Fatalf("bob sub B: %v", err)
|
||||
}
|
||||
_ = bobNC.Flush()
|
||||
if err := bobNC.Publish(subjB, []byte("internal-bob")); err != nil {
|
||||
t.Fatalf("bob pub B: %v", err)
|
||||
}
|
||||
_ = bobNC.Flush()
|
||||
select {
|
||||
case got := <-bobGot:
|
||||
if got != "internal-bob" {
|
||||
t.Fatalf("bob got %q", got)
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatalf("bob did not receive his own message")
|
||||
}
|
||||
select {
|
||||
case leak := <-aliceGot:
|
||||
t.Fatalf("alice received bob's room traffic despite the ACL: %q", leak)
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
// good: alice never got it
|
||||
}
|
||||
|
||||
// alice publishing into bob's room is denied; bob must not receive it.
|
||||
drain(aliceErr)
|
||||
if err := aliceNC.Publish(subjB, []byte("intruder")); err != nil {
|
||||
t.Fatalf("alice pub B (queue): %v", err)
|
||||
}
|
||||
_ = aliceNC.Flush()
|
||||
if e := waitErr(aliceErr, 1*time.Second); e == nil {
|
||||
t.Fatalf("alice publishing into bob's room should raise a permissions violation")
|
||||
}
|
||||
select {
|
||||
case got := <-bobGot:
|
||||
t.Fatalf("bob received alice's cross-room publish despite the ACL: %q", got)
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
// good
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshSessionGainsNewRoom is the "permissions refreshed on join" path:
|
||||
// alice is not in room B, so her connection has no permission for its subject;
|
||||
// after she is added to room B and calls RefreshSession, the reconnect
|
||||
// re-derives her permissions and she gains the room's subject.
|
||||
func TestRefreshSessionGainsNewRoom(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
|
||||
alice, bob := mustID(t), mustID(t)
|
||||
aliceEP, bobEP := frame.EndpointID(alice.SignPub), frame.EndpointID(bob.SignPub)
|
||||
mustAddUser(t, store, alice, "alice")
|
||||
mustAddUser(t, store, bob, "bob")
|
||||
const subjB = "room.refresh.b"
|
||||
mustCreateRoom(t, store, "ROOMB", subjB, bobEP, bob)
|
||||
|
||||
srv := startACLNats(t, store)
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
ctrl := newCtrl(t, store, blobs)
|
||||
|
||||
aliceC, err := client.NewWithOptions(srv.ClientURL(), ctrl, alice, client.Options{UseNkey: true})
|
||||
if err != nil {
|
||||
t.Fatalf("connect alice: %v", err)
|
||||
}
|
||||
defer aliceC.Close()
|
||||
|
||||
// Add alice to room B (as if invited), then RefreshSession so the
|
||||
// authenticator re-derives her permissions on reconnect.
|
||||
if _, err := store.GetMember("ROOMB", aliceEP); err == nil {
|
||||
t.Fatalf("alice should not be a member yet")
|
||||
}
|
||||
if err := store.AddMember("ROOMB", membership.Member{Endpoint: aliceEP, Role: "member", SignPub: alice.SignPub, KexPub: alice.KexPub}, 1, nil); err != nil {
|
||||
t.Fatalf("add alice to room B: %v", err)
|
||||
}
|
||||
if err := aliceC.RefreshSession(); err != nil {
|
||||
t.Fatalf("refresh session: %v", err)
|
||||
}
|
||||
|
||||
bobErr := make(chan error, 2)
|
||||
bobNC := nkeyConn(t, srv.ClientURL(), bob, bobErr)
|
||||
|
||||
got := make(chan string, 2)
|
||||
sub, err := aliceC.Subscribe("ROOMB", func(_ frame.Frame, plaintext []byte) { got <- string(plaintext) })
|
||||
if err != nil {
|
||||
t.Fatalf("alice subscribe room B after refresh: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
|
||||
// bob publishes a minimal cleartext frame on subjB.
|
||||
f := frame.Frame{Type: frame.PUB, Subject: subjB, Sender: bobEP, MsgID: "m1", Payload: []byte("hello-after-join")}
|
||||
b, _ := f.Marshal()
|
||||
if err := bobNC.Publish(subjB, b); err != nil {
|
||||
t.Fatalf("bob publish: %v", err)
|
||||
}
|
||||
_ = bobNC.Flush()
|
||||
|
||||
select {
|
||||
case msg := <-got:
|
||||
if msg != "hello-after-join" {
|
||||
t.Fatalf("alice got %q", msg)
|
||||
}
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatalf("alice did not receive room B traffic after RefreshSession (permissions not refreshed)")
|
||||
}
|
||||
}
|
||||
+19
-8
@@ -95,16 +95,27 @@ func CanonicalRequest(method, path, ts, nonce string, body []byte) []byte {
|
||||
return []byte(method + "\n" + path + "\n" + ts + "\n" + nonce + "\n" + hex.EncodeToString(sum[:]))
|
||||
}
|
||||
|
||||
// nonceCache remembers recently-seen nonces to reject replays. It is an
|
||||
// in-memory store guarded by a mutex — sufficient for a single membershipd
|
||||
// process (the spec's chosen tradeoff over a server-issued nonce round-trip). A
|
||||
// distributed deployment would need a shared store (tracked for issue 0003).
|
||||
// nonceStore is the anti-replay backend. It is an interface (issue 0003e) so
// that the single-node in-memory cache can be replaced by a replicated KV
// store. A per-process cache is BROKEN under multi-node failover: a captured
// request replayed to a DIFFERENT node, whose cache never saw the nonce, would
// be accepted. A cluster therefore MUST share nonce state. Implementations are
// required to fail CLOSED: when the backend cannot be reached, the nonce is
// rejected rather than admitted.
type nonceStore interface {
	// rememberOrReject records nonce and reports true when it had not been
	// seen before (accept), or false when it was already seen (reject the
	// replay).
	rememberOrReject(nonce string, now time.Time) bool
}
|
||||
|
||||
// memNonceCache remembers recently-seen nonces to reject replays. It is an
|
||||
// in-memory store guarded by a mutex — sufficient for a SINGLE membershipd
|
||||
// process. A clustered deployment uses kvNonceStore instead (issue 0003e).
|
||||
//
|
||||
// Pruning is O(expired), not O(n): because the TTL is constant, insertion order
|
||||
// equals expiry order, so the oldest entries (front of `order`) are exactly the
|
||||
// ones that expire first (audit H7 — the previous full-map scan under the mutex
|
||||
// was a CPU-amplification vector). A size cap bounds memory.
|
||||
type nonceCache struct {
|
||||
type memNonceCache struct {
|
||||
mu sync.Mutex
|
||||
seen map[string]time.Time // nonce -> expiry
|
||||
order []string // nonces in insertion order == expiry order
|
||||
@@ -112,13 +123,13 @@ type nonceCache struct {
|
||||
cap int
|
||||
}
|
||||
|
||||
func newNonceCache(ttl time.Duration, capacity int) *nonceCache {
|
||||
return &nonceCache{seen: make(map[string]time.Time), ttl: ttl, cap: capacity}
|
||||
func newMemNonceCache(ttl time.Duration, capacity int) *memNonceCache {
|
||||
return &memNonceCache{seen: make(map[string]time.Time), ttl: ttl, cap: capacity}
|
||||
}
|
||||
|
||||
// rememberOrReject records nonce and returns true if it was unseen, or false if
|
||||
// it is a replay (still live in the cache).
|
||||
func (n *nonceCache) rememberOrReject(nonce string, now time.Time) bool {
|
||||
func (n *memNonceCache) rememberOrReject(nonce string, now time.Time) bool {
|
||||
n.mu.Lock()
|
||||
defer n.mu.Unlock()
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ import (
|
||||
// with a fresh store + blob store, and seeds one active admin ("alice").
|
||||
type authHarness struct {
|
||||
ts *httptest.Server
|
||||
store *Store
|
||||
store Store
|
||||
alice cs.Identity
|
||||
alicePub string // hex
|
||||
}
|
||||
|
||||
@@ -0,0 +1,633 @@
|
||||
package membership
|
||||
|
||||
// jetstreamStore is the JetStream KV implementation of Store (issue 0003b): the
|
||||
// control-plane state (rooms, members, sealed room keys, the user allowlist)
|
||||
// lives in replicated JetStream Key/Value buckets instead of a process-local
|
||||
// SQLite file. Any node in the cluster reads and writes the same buckets, and
|
||||
// JetStream's RAFT layer keeps them consistent across replicas, so the HTTP
|
||||
// control plane becomes effectively stateless: any membershipd can serve any
|
||||
// request. It is selected only when the `decentralized` flag is on; sqliteStore
|
||||
// stays the default.
|
||||
//
|
||||
// Key layout (every path segment is a single KV token — ULIDs, RawURL endpoint
|
||||
// ids and lowercase-hex keys never contain a '.', so '.' is a safe separator and
|
||||
// a "<prefix>.*" watch enumerates exactly one trailing token):
|
||||
//
|
||||
// rooms roomID -> RoomInfo (JSON)
|
||||
// members roomID.endpoint -> Member (JSON, carries Role)
|
||||
// rooms_by_member endpoint.roomID -> role (reverse index for ListRoomsForEndpoint)
|
||||
// room_keys roomID.endpoint.epoch -> sealed_key bytes
|
||||
// users signPubHex -> User (JSON)
|
||||
//
|
||||
// Consistency caveat: KV has no multi-key transaction, so a multi-write op
|
||||
// (CreateRoom, AddMember) is a short sequence of single-key writes. The order is
|
||||
// chosen so a partial failure leaves a recoverable state (the room/member row
|
||||
// before its reverse index or sealed key), and writes are idempotent (Put
|
||||
// overwrites), which is also what makes the SQLite->KV migration (0003c) safe to
|
||||
// re-run.
|
||||
//
|
||||
// Fail-closed: every read uses a bounded context, and IsAuthorized/HasAdmin
|
||||
// return false on ANY backend error (a KV quorum loss or timeout denies access
|
||||
// rather than admitting it), mirroring the SQLite store's behavior.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// Bucket names (alphanumeric/dash/underscore only — no dots, per KV rules),
// plus the fallback per-operation timeout. What each bucket stores follows the
// key layout described in the file header.
|
||||
const (
|
||||
// bucketRooms: roomID -> RoomInfo (JSON).
bucketRooms = "UNIBUS_rooms"
|
||||
// bucketMembers: roomID.endpoint -> Member (JSON).
bucketMembers = "UNIBUS_members"
|
||||
// bucketByMember: endpoint.roomID -> role (reverse index).
bucketByMember = "UNIBUS_rooms_by_member"
|
||||
// bucketRoomKeys: roomID.endpoint.epoch -> sealed key bytes.
bucketRoomKeys = "UNIBUS_room_keys"
|
||||
// bucketUsers: signPubHex -> User (JSON).
bucketUsers = "UNIBUS_users"
|
||||
// defaultKVOpTime is the per-operation timeout OpenJetStream applies when
// JetStreamConfig.OpTimeout is zero or negative.
defaultKVOpTime = 5 * time.Second
|
||||
)
|
||||
|
||||
// JetStreamConfig configures the KV-backed store opened by OpenJetStream.
|
||||
type JetStreamConfig struct {
|
||||
// Replicas is the per-bucket replication factor (R1..R5). Use 1 for a single
|
||||
// node or a 1-2 node rollout, 3 for real HA (quorum 2/3). Scaling R1->R3 in
|
||||
// place is an operational step (nats kv update) done when the third node
|
||||
// joins; it does not require reopening the store.
// OpenJetStream treats a zero or negative value as 1.
|
||||
Replicas int
|
||||
// OpTimeout bounds every KV operation so a stalled backend fails closed
|
||||
// instead of hanging a request. Zero uses defaultKVOpTime.
// OpenJetStream applies the same default to negative values.
|
||||
OpTimeout time.Duration
|
||||
}
|
||||
|
||||
// jetstreamStore is the Store implementation returned by OpenJetStream. It
// holds one KV handle per bucket plus the timeout used to bound each operation.
type jetstreamStore struct {
|
||||
// rooms is the bucketRooms handle (roomID -> RoomInfo JSON).
rooms jetstream.KeyValue
|
||||
// members is the bucketMembers handle (roomID.endpoint -> Member JSON).
members jetstream.KeyValue
|
||||
// byMember is the bucketByMember handle (endpoint.roomID -> role).
byMember jetstream.KeyValue
|
||||
// keys is the bucketRoomKeys handle (roomID.endpoint.epoch -> sealed key).
keys jetstream.KeyValue
|
||||
// users is the bucketUsers handle (signPubHex -> User JSON).
users jetstream.KeyValue
|
||||
// opTimeout is the deadline handed to every context built by ctx().
opTimeout time.Duration
|
||||
}
|
||||
|
||||
// OpenJetStream creates (or opens) the five KV buckets on js with the configured
|
||||
// replication factor and returns a Store backed by them. The JetStream context
|
||||
// belongs to the caller (it owns the NATS connection); Close is a no-op.
|
||||
func OpenJetStream(js jetstream.JetStream, cfg JetStreamConfig) (Store, error) {
|
||||
if cfg.Replicas <= 0 {
|
||||
cfg.Replicas = 1
|
||||
}
|
||||
opTimeout := cfg.OpTimeout
|
||||
if opTimeout <= 0 {
|
||||
opTimeout = defaultKVOpTime
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
|
||||
s := &jetstreamStore{opTimeout: opTimeout}
|
||||
for _, b := range []struct {
|
||||
name string
|
||||
dst *jetstream.KeyValue
|
||||
}{
|
||||
{bucketRooms, &s.rooms},
|
||||
{bucketMembers, &s.members},
|
||||
{bucketByMember, &s.byMember},
|
||||
{bucketRoomKeys, &s.keys},
|
||||
{bucketUsers, &s.users},
|
||||
} {
|
||||
kv, err := js.CreateOrUpdateKeyValue(ctx, jetstream.KeyValueConfig{
|
||||
Bucket: b.name,
|
||||
Replicas: cfg.Replicas,
|
||||
History: 1,
|
||||
Storage: jetstream.FileStorage,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: open KV bucket %q (replicas=%d): %w", b.name, cfg.Replicas, err)
|
||||
}
|
||||
*b.dst = kv
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Close releases nothing: the JetStream context and NATS connection are owned by
|
||||
// the caller, which closes them on shutdown.
|
||||
func (s *jetstreamStore) Close() error { return nil }
|
||||
|
||||
func (s *jetstreamStore) ctx() (context.Context, context.CancelFunc) {
|
||||
return context.WithTimeout(context.Background(), s.opTimeout)
|
||||
}
|
||||
|
||||
// ---- key helpers ----------------------------------------------------------
|
||||
|
||||
// memberKey builds the members-bucket key "<roomID>.<endpoint>".
func memberKey(roomID, endpoint string) string {
	return roomID + "." + endpoint
}

// byMemberKey builds the reverse-index key "<endpoint>.<roomID>".
func byMemberKey(endpoint, roomID string) string {
	return endpoint + "." + roomID
}

// sealedKey builds the room-keys key "<roomID>.<endpoint>.<epoch>", with the
// epoch e rendered in base 10.
func sealedKey(roomID, endpoint string, e int) string {
	epoch := strconv.Itoa(e)
	return memberKey(roomID, endpoint) + "." + epoch
}
|
||||
|
||||
// watchEntries collects every current entry whose key matches pattern (a KV
|
||||
// watch with a "<prefix>.*" wildcard), draining the watcher until the nil marker
|
||||
// that signals "all initial values delivered". Tombstones are skipped.
|
||||
func (s *jetstreamStore) watchEntries(kv jetstream.KeyValue, pattern string) ([]jetstream.KeyValueEntry, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
w, err := kv.Watch(ctx, pattern, jetstream.IgnoreDeletes())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer w.Stop()
|
||||
var out []jetstream.KeyValueEntry
|
||||
for {
|
||||
select {
|
||||
case e := <-w.Updates():
|
||||
if e == nil {
|
||||
return out, nil // initial snapshot complete
|
||||
}
|
||||
out = append(out, e)
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---- rooms / members / keys ----------------------------------------------
|
||||
|
||||
func (s *jetstreamStore) CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealedKey []byte) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
|
||||
info.Epoch = 1
|
||||
roomJSON, err := json.Marshal(info)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal room: %w", err)
|
||||
}
|
||||
// Create (not Put) so a duplicate room id is rejected, matching SQLite's
|
||||
// PRIMARY KEY behavior.
|
||||
if _, err := s.rooms.Create(ctx, info.RoomID, roomJSON); err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyExists) {
|
||||
return fmt.Errorf("membership: room %q already exists", info.RoomID)
|
||||
}
|
||||
return fmt.Errorf("membership: create room: %w", err)
|
||||
}
|
||||
|
||||
owner := Member{Endpoint: info.OwnerEndpoint, Role: "owner", SignPub: ownerSignPub, KexPub: ownerKexPub}
|
||||
if err := s.putMember(ctx, info.RoomID, owner); err != nil {
|
||||
return err
|
||||
}
|
||||
if info.Encrypt {
|
||||
if _, err := s.keys.Put(ctx, sealedKey(info.RoomID, info.OwnerEndpoint, 1), ownerSealedKey); err != nil {
|
||||
return fmt.Errorf("membership: put owner key: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// putMember writes the member row and its reverse index together.
|
||||
func (s *jetstreamStore) putMember(ctx context.Context, roomID string, m Member) error {
|
||||
mb, err := json.Marshal(m)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal member: %w", err)
|
||||
}
|
||||
if _, err := s.members.Put(ctx, memberKey(roomID, m.Endpoint), mb); err != nil {
|
||||
return fmt.Errorf("membership: put member: %w", err)
|
||||
}
|
||||
if _, err := s.byMember.Put(ctx, byMemberKey(m.Endpoint, roomID), []byte(m.Role)); err != nil {
|
||||
return fmt.Errorf("membership: put member index: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) GetRoom(roomID string) (RoomInfo, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.rooms.Get(ctx, roomID)
|
||||
if err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return RoomInfo{}, fmt.Errorf("membership: get room %q: %w", roomID, ErrNotFound)
|
||||
}
|
||||
return RoomInfo{}, fmt.Errorf("membership: get room %q: %w", roomID, err)
|
||||
}
|
||||
var info RoomInfo
|
||||
if err := json.Unmarshal(e.Value(), &info); err != nil {
|
||||
return RoomInfo{}, fmt.Errorf("membership: unmarshal room %q: %w", roomID, err)
|
||||
}
|
||||
return info, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) AddMember(roomID string, m Member, epoch int, sealedKeyBytes []byte) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if err := s.putMember(ctx, roomID, m); err != nil {
|
||||
return err
|
||||
}
|
||||
if len(sealedKeyBytes) > 0 {
|
||||
if _, err := s.keys.Put(ctx, sealedKey(roomID, m.Endpoint, epoch), sealedKeyBytes); err != nil {
|
||||
return fmt.Errorf("membership: put member key: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) GetMember(roomID, endpoint string) (Member, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.members.Get(ctx, memberKey(roomID, endpoint))
|
||||
if err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return Member{}, fmt.Errorf("membership: get member %q/%q: %w", roomID, endpoint, ErrNotFound)
|
||||
}
|
||||
return Member{}, fmt.Errorf("membership: get member %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
var m Member
|
||||
if err := json.Unmarshal(e.Value(), &m); err != nil {
|
||||
return Member{}, fmt.Errorf("membership: unmarshal member: %w", err)
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) ListMembers(roomID string) ([]Member, error) {
|
||||
entries, err := s.watchEntries(s.members, roomID+".*")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: list members %q: %w", roomID, err)
|
||||
}
|
||||
out := make([]Member, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
var m Member
|
||||
if err := json.Unmarshal(e.Value(), &m); err != nil {
|
||||
return nil, fmt.Errorf("membership: unmarshal member: %w", err)
|
||||
}
|
||||
out = append(out, m)
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool { return out[i].Endpoint < out[j].Endpoint })
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) ListRoomsForEndpoint(endpoint string) ([]RoomMembership, error) {
|
||||
entries, err := s.watchEntries(s.byMember, endpoint+".*")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: list rooms for endpoint %q: %w", endpoint, err)
|
||||
}
|
||||
out := make([]RoomMembership, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
// Key is "<endpoint>.<roomID>"; the roomID is everything after the dot.
|
||||
roomID := e.Key()[len(endpoint)+1:]
|
||||
info, err := s.GetRoom(roomID)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNotFound) {
|
||||
continue // index points at a removed room: skip, stay consistent
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, RoomMembership{RoomInfo: info, Role: string(e.Value())})
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool { return out[i].RoomID < out[j].RoomID })
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, error) {
|
||||
if epoch > 0 {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.keys.Get(ctx, sealedKey(roomID, endpoint, epoch))
|
||||
if err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return 0, nil, fmt.Errorf("membership: get sealed key %q/%q@%d: %w", roomID, endpoint, epoch, ErrNotFound)
|
||||
}
|
||||
return 0, nil, fmt.Errorf("membership: get sealed key %q/%q@%d: %w", roomID, endpoint, epoch, err)
|
||||
}
|
||||
return epoch, e.Value(), nil
|
||||
}
|
||||
// epoch <= 0: latest. Enumerate "<roomID>.<endpoint>.*" and take the max.
|
||||
entries, err := s.watchEntries(s.keys, roomID+"."+endpoint+".*")
|
||||
if err != nil {
|
||||
return 0, nil, fmt.Errorf("membership: get latest sealed key %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
bestEpoch, bestVal := -1, []byte(nil)
|
||||
for _, e := range entries {
|
||||
k := e.Key()
|
||||
ep, perr := strconv.Atoi(k[len(roomID)+1+len(endpoint)+1:])
|
||||
if perr != nil {
|
||||
continue
|
||||
}
|
||||
if ep > bestEpoch {
|
||||
bestEpoch, bestVal = ep, e.Value()
|
||||
}
|
||||
}
|
||||
if bestEpoch < 0 {
|
||||
return 0, nil, fmt.Errorf("membership: get latest sealed key %q/%q: %w", roomID, endpoint, ErrNotFound)
|
||||
}
|
||||
return bestEpoch, bestVal, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) PutSealedKeys(roomID string, epoch int, keys map[string][]byte) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
for endpoint, sealed := range keys {
|
||||
if _, err := s.keys.Put(ctx, sealedKey(roomID, endpoint, epoch), sealed); err != nil {
|
||||
return fmt.Errorf("membership: put sealed key for %q: %w", endpoint, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) BumpEpoch(roomID string, newEpoch int) error {
|
||||
// Read-modify-write the room's epoch. The control plane serializes rekeys per
|
||||
// room (owner-signed), so the lost-update window is not exercised in practice.
|
||||
info, err := s.GetRoom(roomID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: bump epoch %q->%d: %w", roomID, newEpoch, err)
|
||||
}
|
||||
info.Epoch = newEpoch
|
||||
b, err := json.Marshal(info)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal room: %w", err)
|
||||
}
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if _, err := s.rooms.Put(ctx, roomID, b); err != nil {
|
||||
return fmt.Errorf("membership: bump epoch %q->%d: %w", roomID, newEpoch, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) RemoveMember(roomID, endpoint string) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
// Drop the member row and its reverse index. Past-epoch sealed keys are left
|
||||
// intact (they only decrypt data the member could already read), matching the
|
||||
// SQLite store.
|
||||
if err := s.members.Delete(ctx, memberKey(roomID, endpoint)); err != nil && !errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return fmt.Errorf("membership: remove member %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
if err := s.byMember.Delete(ctx, byMemberKey(endpoint, roomID)); err != nil && !errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return fmt.Errorf("membership: remove member index %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- users (the bus allowlist) -------------------------------------------
|
||||
|
||||
func (s *jetstreamStore) AddUser(signPub, handle, role string) error {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
if signPub == "" || handle == "" {
|
||||
return fmt.Errorf("membership: AddUser: sign_pub and handle required")
|
||||
}
|
||||
if role == "" {
|
||||
role = RoleMember
|
||||
}
|
||||
if role != RoleAdmin && role != RoleMember {
|
||||
return fmt.Errorf("membership: AddUser: invalid role %q (want %q or %q)", role, RoleAdmin, RoleMember)
|
||||
}
|
||||
u := User{SignPub: signPub, Handle: handle, Role: role, Status: StatusActive, CreatedAt: nowRFC3339()}
|
||||
b, err := json.Marshal(u)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal user: %w", err)
|
||||
}
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if _, err := s.users.Create(ctx, signPub, b); err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyExists) {
|
||||
return ErrUserExists
|
||||
}
|
||||
return fmt.Errorf("membership: insert user: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) GetUser(signPub string) (User, error) {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.users.Get(ctx, signPub)
|
||||
if err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return User{}, fmt.Errorf("membership: get user %q: %w", signPub, ErrNotFound)
|
||||
}
|
||||
return User{}, fmt.Errorf("membership: get user %q: %w", signPub, err)
|
||||
}
|
||||
var u User
|
||||
if err := json.Unmarshal(e.Value(), &u); err != nil {
|
||||
return User{}, fmt.Errorf("membership: unmarshal user: %w", err)
|
||||
}
|
||||
return u, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) ListUsers() ([]User, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
w, err := s.users.WatchAll(ctx, jetstream.IgnoreDeletes())
|
||||
if err != nil {
|
||||
cancel()
|
||||
return nil, fmt.Errorf("membership: list users: %w", err)
|
||||
}
|
||||
defer cancel()
|
||||
defer w.Stop()
|
||||
var out []User
|
||||
for {
|
||||
select {
|
||||
case e := <-w.Updates():
|
||||
if e == nil {
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
if out[i].Handle != out[j].Handle {
|
||||
return out[i].Handle < out[j].Handle
|
||||
}
|
||||
return out[i].SignPub < out[j].SignPub
|
||||
})
|
||||
return out, nil
|
||||
}
|
||||
var u User
|
||||
if err := json.Unmarshal(e.Value(), &u); err != nil {
|
||||
return nil, fmt.Errorf("membership: unmarshal user: %w", err)
|
||||
}
|
||||
out = append(out, u)
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) RevokeUser(signPub string) error {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
u, err := s.GetUser(signPub)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNotFound) {
|
||||
return fmt.Errorf("membership: revoke user %q: no active user with that key", signPub)
|
||||
}
|
||||
return fmt.Errorf("membership: revoke user %q: %w", signPub, err)
|
||||
}
|
||||
if u.Status != StatusActive {
|
||||
return fmt.Errorf("membership: revoke user %q: no active user with that key", signPub)
|
||||
}
|
||||
u.Status = StatusRevoked
|
||||
u.RevokedAt = nowRFC3339()
|
||||
b, err := json.Marshal(u)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal user: %w", err)
|
||||
}
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if _, err := s.users.Put(ctx, signPub, b); err != nil {
|
||||
return fmt.Errorf("membership: revoke user %q: %w", signPub, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsAuthorized reports whether signPub is an active bus user. Any backend error
|
||||
// (including a KV quorum loss or timeout) yields false: fail closed.
|
||||
func (s *jetstreamStore) IsAuthorized(signPub string) bool {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
if signPub == "" {
|
||||
return false
|
||||
}
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.users.Get(ctx, signPub)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
var u User
|
||||
if err := json.Unmarshal(e.Value(), &u); err != nil {
|
||||
return false
|
||||
}
|
||||
return u.Status == StatusActive
|
||||
}
|
||||
|
||||
// HasAdmin reports whether at least one active admin exists. On any backend
|
||||
// error it returns false, keeping the admin-gated endpoints closed (conservative).
|
||||
func (s *jetstreamStore) HasAdmin() bool {
|
||||
users, err := s.ListUsers()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
for _, u := range users {
|
||||
if u.Role == RoleAdmin && u.Status == StatusActive {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ---- snapshot import / export (issue 0003c migration) ---------------------
|
||||
|
||||
// importSnapshot writes a full Snapshot into the KV buckets, preserving each
|
||||
// room's epoch and each user's status (Put, not CreateRoom/AddUser, so the exact
|
||||
// state is reproduced rather than reset to defaults). Idempotent: every write is
|
||||
// an overwrite, so re-running the migration converges.
|
||||
func (s *jetstreamStore) importSnapshot(snap *Snapshot) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
for _, r := range snap.Rooms {
|
||||
b, err := json.Marshal(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("import: marshal room %q: %w", r.RoomID, err)
|
||||
}
|
||||
if _, err := s.rooms.Put(ctx, r.RoomID, b); err != nil {
|
||||
return fmt.Errorf("import: put room %q: %w", r.RoomID, err)
|
||||
}
|
||||
}
|
||||
for roomID, members := range snap.Members {
|
||||
for _, m := range members {
|
||||
if err := s.putMember(ctx, roomID, m); err != nil {
|
||||
return fmt.Errorf("import: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, rec := range snap.Keys {
|
||||
if _, err := s.keys.Put(ctx, sealedKey(rec.RoomID, rec.Endpoint, rec.Epoch), rec.Sealed); err != nil {
|
||||
return fmt.Errorf("import: put key %q/%q@%d: %w", rec.RoomID, rec.Endpoint, rec.Epoch, err)
|
||||
}
|
||||
}
|
||||
for _, u := range snap.Users {
|
||||
b, err := json.Marshal(u)
|
||||
if err != nil {
|
||||
return fmt.Errorf("import: marshal user %q: %w", u.SignPub, err)
|
||||
}
|
||||
if _, err := s.users.Put(ctx, normalizeSignPub(u.SignPub), b); err != nil {
|
||||
return fmt.Errorf("import: put user %q: %w", u.SignPub, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ExportSnapshot reads the entire KV control-plane state back into a Snapshot,
|
||||
// so the migration's parity test can compare it against the SQLite source.
|
||||
func (s *jetstreamStore) ExportSnapshot() (*Snapshot, error) {
|
||||
snap := &Snapshot{Members: map[string][]Member{}}
|
||||
|
||||
roomEntries, err := s.watchAll(s.rooms)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export kv: rooms: %w", err)
|
||||
}
|
||||
for _, e := range roomEntries {
|
||||
var r RoomInfo
|
||||
if err := json.Unmarshal(e.Value(), &r); err != nil {
|
||||
return nil, fmt.Errorf("export kv: unmarshal room: %w", err)
|
||||
}
|
||||
snap.Rooms = append(snap.Rooms, r)
|
||||
}
|
||||
|
||||
memberEntries, err := s.watchAll(s.members)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export kv: members: %w", err)
|
||||
}
|
||||
for _, e := range memberEntries {
|
||||
// Key is "<roomID>.<endpoint>"; neither segment contains a dot.
|
||||
roomID := strings.SplitN(e.Key(), ".", 2)[0]
|
||||
var m Member
|
||||
if err := json.Unmarshal(e.Value(), &m); err != nil {
|
||||
return nil, fmt.Errorf("export kv: unmarshal member: %w", err)
|
||||
}
|
||||
snap.Members[roomID] = append(snap.Members[roomID], m)
|
||||
}
|
||||
|
||||
keyEntries, err := s.watchAll(s.keys)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export kv: keys: %w", err)
|
||||
}
|
||||
for _, e := range keyEntries {
|
||||
// Key is "<roomID>.<endpoint>.<epoch>".
|
||||
parts := strings.Split(e.Key(), ".")
|
||||
if len(parts) != 3 {
|
||||
continue
|
||||
}
|
||||
epoch, err := strconv.Atoi(parts[2])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
snap.Keys = append(snap.Keys, SealedKeyRecord{RoomID: parts[0], Endpoint: parts[1], Epoch: epoch, Sealed: e.Value()})
|
||||
}
|
||||
|
||||
users, err := s.ListUsers()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export kv: users: %w", err)
|
||||
}
|
||||
snap.Users = users
|
||||
return snap, nil
|
||||
}
|
||||
|
||||
// watchAll collects every current entry of a bucket (no key filter), draining
|
||||
// the watcher to its initial-snapshot nil marker.
|
||||
func (s *jetstreamStore) watchAll(kv jetstream.KeyValue) ([]jetstream.KeyValueEntry, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
w, err := kv.WatchAll(ctx, jetstream.IgnoreDeletes())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer w.Stop()
|
||||
var out []jetstream.KeyValueEntry
|
||||
for {
|
||||
select {
|
||||
case e := <-w.Updates():
|
||||
if e == nil {
|
||||
return out, nil
|
||||
}
|
||||
out = append(out, e)
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,275 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// kvFreePort asks the kernel for an unused loopback TCP port by listening on
// 127.0.0.1:0, reads the assigned port and releases the listener before
// returning. The test is failed immediately if the listen call errors.
func kvFreePort(t *testing.T) int {
	t.Helper()

	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	defer func() { _ = ln.Close() }()

	tcpAddr := ln.Addr().(*net.TCPAddr)
	return tcpAddr.Port
}
|
||||
|
||||
// newKVStore boots a single-node embedded NATS with JetStream and opens a
|
||||
// jetstreamStore (R1) over it, returning the store plus the server and
|
||||
// connection so a test can shut the backend down to exercise fail-closed paths.
|
||||
func newKVStore(t *testing.T) (*jetstreamStore, *server.Server, *nats.Conn) {
|
||||
t.Helper()
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: kvFreePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("embedded nats: %v", err)
|
||||
}
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
t.Fatalf("nats connect: %v", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
st, err := OpenJetStream(js, JetStreamConfig{Replicas: 1, OpTimeout: 2 * time.Second})
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("open jetstream store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
ns.WaitForShutdown()
|
||||
})
|
||||
return st.(*jetstreamStore), ns, nc
|
||||
}
|
||||
|
||||
// TestJetStreamStoreRoomsCRUD is the golden path: an encrypted room with an owner
|
||||
// and an invited member round-trips through every room/member/key method.
|
||||
func TestJetStreamStoreRoomsCRUD(t *testing.T) {
|
||||
s, _, _ := newKVStore(t)
|
||||
|
||||
roomID := newULID()
|
||||
owner := "owner-ep-1"
|
||||
info := RoomInfo{RoomID: roomID, Subject: "room.kv", Encrypt: true, Persist: true, SignMsgs: true, OwnerEndpoint: owner}
|
||||
ownerSealed := []byte("sealed-owner-epoch1")
|
||||
if err := s.CreateRoom(info, []byte("owner-sign"), []byte("owner-kex"), ownerSealed); err != nil {
|
||||
t.Fatalf("CreateRoom: %v", err)
|
||||
}
|
||||
|
||||
// GetRoom returns epoch 1 and the policy.
|
||||
got, err := s.GetRoom(roomID)
|
||||
if err != nil {
|
||||
t.Fatalf("GetRoom: %v", err)
|
||||
}
|
||||
if got.Epoch != 1 || got.Subject != "room.kv" || !got.Encrypt || got.OwnerEndpoint != owner {
|
||||
t.Fatalf("GetRoom mismatch: %+v", got)
|
||||
}
|
||||
|
||||
// Owner is a member with role "owner".
|
||||
om, err := s.GetMember(roomID, owner)
|
||||
if err != nil {
|
||||
t.Fatalf("GetMember owner: %v", err)
|
||||
}
|
||||
if om.Role != "owner" || !bytes.Equal(om.SignPub, []byte("owner-sign")) {
|
||||
t.Fatalf("owner member mismatch: %+v", om)
|
||||
}
|
||||
|
||||
// Owner's sealed key at epoch 1.
|
||||
ep, sealed, err := s.GetSealedKey(roomID, owner, 1)
|
||||
if err != nil || ep != 1 || !bytes.Equal(sealed, ownerSealed) {
|
||||
t.Fatalf("GetSealedKey owner: ep=%d sealed=%q err=%v", ep, sealed, err)
|
||||
}
|
||||
|
||||
// Invite a member with a sealed key at epoch 1.
|
||||
bob := "member-ep-bob"
|
||||
bobSealed := []byte("sealed-bob-epoch1")
|
||||
if err := s.AddMember(roomID, Member{Endpoint: bob, Role: "member", SignPub: []byte("bob-sign"), KexPub: []byte("bob-kex")}, 1, bobSealed); err != nil {
|
||||
t.Fatalf("AddMember: %v", err)
|
||||
}
|
||||
|
||||
// ListMembers returns both, sorted by endpoint.
|
||||
members, err := s.ListMembers(roomID)
|
||||
if err != nil {
|
||||
t.Fatalf("ListMembers: %v", err)
|
||||
}
|
||||
if len(members) != 2 {
|
||||
t.Fatalf("ListMembers want 2, got %d (%+v)", len(members), members)
|
||||
}
|
||||
|
||||
// Bob can find the room via the reverse index.
|
||||
rooms, err := s.ListRoomsForEndpoint(bob)
|
||||
if err != nil {
|
||||
t.Fatalf("ListRoomsForEndpoint: %v", err)
|
||||
}
|
||||
if len(rooms) != 1 || rooms[0].RoomID != roomID || rooms[0].Role != "member" {
|
||||
t.Fatalf("ListRoomsForEndpoint mismatch: %+v", rooms)
|
||||
}
|
||||
|
||||
// Latest sealed key (epoch <= 0) resolves to epoch 1 for bob.
|
||||
lep, lsealed, err := s.GetSealedKey(roomID, bob, 0)
|
||||
if err != nil || lep != 1 || !bytes.Equal(lsealed, bobSealed) {
|
||||
t.Fatalf("GetSealedKey latest bob: ep=%d err=%v", lep, err)
|
||||
}
|
||||
|
||||
// Rekey to epoch 2 (bump + new sealed keys), then latest resolves to 2.
|
||||
if err := s.BumpEpoch(roomID, 2); err != nil {
|
||||
t.Fatalf("BumpEpoch: %v", err)
|
||||
}
|
||||
if err := s.PutSealedKeys(roomID, 2, map[string][]byte{owner: []byte("owner-epoch2")}); err != nil {
|
||||
t.Fatalf("PutSealedKeys: %v", err)
|
||||
}
|
||||
got2, _ := s.GetRoom(roomID)
|
||||
if got2.Epoch != 2 {
|
||||
t.Fatalf("after BumpEpoch want epoch 2, got %d", got2.Epoch)
|
||||
}
|
||||
lep2, _, err := s.GetSealedKey(roomID, owner, 0)
|
||||
if err != nil || lep2 != 2 {
|
||||
t.Fatalf("latest owner key after rekey: ep=%d err=%v", lep2, err)
|
||||
}
|
||||
|
||||
// Remove bob; he disappears from members and his reverse index.
|
||||
if err := s.RemoveMember(roomID, bob); err != nil {
|
||||
t.Fatalf("RemoveMember: %v", err)
|
||||
}
|
||||
if _, err := s.GetMember(roomID, bob); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetMember after remove want ErrNotFound, got %v", err)
|
||||
}
|
||||
rooms2, _ := s.ListRoomsForEndpoint(bob)
|
||||
if len(rooms2) != 0 {
|
||||
t.Fatalf("ListRoomsForEndpoint after remove want 0, got %d", len(rooms2))
|
||||
}
|
||||
}
|
||||
|
||||
// TestJetStreamStoreUsers exercises the allowlist: add, lookup, authorize,
|
||||
// revoke (which flips IsAuthorized), and the admin gate.
|
||||
func TestJetStreamStoreUsers(t *testing.T) {
|
||||
s, _, _ := newKVStore(t)
|
||||
|
||||
const aliceHex = "aa11"
|
||||
if s.HasAdmin() {
|
||||
t.Fatalf("fresh store should have no admin")
|
||||
}
|
||||
if err := s.AddUser(aliceHex, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("AddUser: %v", err)
|
||||
}
|
||||
if !s.HasAdmin() {
|
||||
t.Fatalf("HasAdmin should be true after adding an admin")
|
||||
}
|
||||
if !s.IsAuthorized(aliceHex) {
|
||||
t.Fatalf("alice should be authorized")
|
||||
}
|
||||
// Case-insensitive lookup (keys are normalized lowercase).
|
||||
if !s.IsAuthorized("AA11") {
|
||||
t.Fatalf("uppercase hex should normalize and authorize")
|
||||
}
|
||||
u, err := s.GetUser(aliceHex)
|
||||
if err != nil || u.Handle != "alice" || u.Role != RoleAdmin || u.Status != StatusActive {
|
||||
t.Fatalf("GetUser mismatch: %+v err=%v", u, err)
|
||||
}
|
||||
|
||||
// Duplicate add is rejected with ErrUserExists.
|
||||
if err := s.AddUser(aliceHex, "alice2", RoleMember); !errors.Is(err, ErrUserExists) {
|
||||
t.Fatalf("duplicate AddUser want ErrUserExists, got %v", err)
|
||||
}
|
||||
|
||||
if err := s.AddUser("bb22", "bob", RoleMember); err != nil {
|
||||
t.Fatalf("AddUser bob: %v", err)
|
||||
}
|
||||
users, err := s.ListUsers()
|
||||
if err != nil || len(users) != 2 {
|
||||
t.Fatalf("ListUsers want 2, got %d err=%v", len(users), err)
|
||||
}
|
||||
|
||||
// Revoke alice: authorization flips off immediately.
|
||||
if err := s.RevokeUser(aliceHex); err != nil {
|
||||
t.Fatalf("RevokeUser: %v", err)
|
||||
}
|
||||
if s.IsAuthorized(aliceHex) {
|
||||
t.Fatalf("revoked user must not be authorized")
|
||||
}
|
||||
if s.HasAdmin() {
|
||||
t.Fatalf("after revoking the only admin, HasAdmin must be false")
|
||||
}
|
||||
// Revoking again is an error (no active user).
|
||||
if err := s.RevokeUser(aliceHex); err == nil {
|
||||
t.Fatalf("re-revoke should error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestJetStreamStoreNotFound checks the ErrNotFound mapping for misses.
|
||||
func TestJetStreamStoreNotFound(t *testing.T) {
|
||||
s, _, _ := newKVStore(t)
|
||||
if _, err := s.GetRoom("nope"); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetRoom miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
if _, err := s.GetMember("nope", "x"); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetMember miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
if _, _, err := s.GetSealedKey("nope", "x", 1); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetSealedKey miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
if _, _, err := s.GetSealedKey("nope", "x", 0); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetSealedKey latest miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
if _, err := s.GetUser("ffff"); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetUser miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestJetStreamStoreIsAuthorizedFailClosed is the error path mandated by the
|
||||
// issue: when the KV backend is unavailable (here the NATS server is shut down),
|
||||
// IsAuthorized must DENY, never admit. A previously-authorized identity flips to
|
||||
// unauthorized once the backend cannot be reached.
|
||||
func TestJetStreamStoreIsAuthorizedFailClosed(t *testing.T) {
|
||||
s, ns, nc := newKVStore(t)
|
||||
|
||||
const aliceHex = "abcd"
|
||||
if err := s.AddUser(aliceHex, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("AddUser: %v", err)
|
||||
}
|
||||
if !s.IsAuthorized(aliceHex) {
|
||||
t.Fatalf("alice should be authorized while the backend is up")
|
||||
}
|
||||
|
||||
// Take the KV backend away: close the client and stop the server. Every
|
||||
// subsequent KV Get fails, and the store must fail closed.
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
ns.WaitForShutdown()
|
||||
|
||||
// Bound the assertion: IsAuthorized internally caps each op at OpTimeout, so
|
||||
// this returns well before the test deadline.
|
||||
done := make(chan bool, 1)
|
||||
go func() { done <- s.IsAuthorized(aliceHex) }()
|
||||
select {
|
||||
case authorized := <-done:
|
||||
if authorized {
|
||||
t.Fatalf("KV backend down but IsAuthorized returned true: NOT fail-closed")
|
||||
}
|
||||
case <-time.After(10 * time.Second):
|
||||
t.Fatalf("IsAuthorized hung when the backend was down (no bounded timeout)")
|
||||
}
|
||||
|
||||
// HasAdmin is likewise conservative: backend down -> false (gates stay closed).
|
||||
if s.HasAdmin() {
|
||||
t.Fatalf("KV backend down but HasAdmin returned true: NOT fail-closed")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,176 @@
|
||||
package membership
|
||||
|
||||
// Migration from the local SQLite control plane to replicated JetStream KV
|
||||
// (issue 0003c). It is the one-time, idempotent data move that decentralization
|
||||
// needs: read the entire SQLite state, write it into the KV buckets. Re-running
|
||||
// it is safe (every KV write is an overwrite), so a partial/interrupted run is
|
||||
// recovered by running again, and a parity test can assert the two stores hold
|
||||
// the same state before and after.
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// SealedKeyRecord mirrors a single room_keys row: the sealed room key held by
// one endpoint for one epoch of one room. Snapshots carry these records so an
// import can rebuild a room's full epoch history, which replaying
// CreateRoom/AddMember alone could not do for a multi-epoch room.
type SealedKeyRecord struct {
	RoomID   string // room the key belongs to
	Endpoint string // endpoint the key was sealed for
	Epoch    int    // key epoch this record applies to
	Sealed   []byte // the sealed key bytes as stored
}
|
||||
|
||||
// Snapshot is the complete control-plane state, backend-agnostic. It is what
|
||||
// ExportSnapshot produces and importSnapshot consumes, so the SQLite->KV
|
||||
// migration and the parity test both work in terms of it.
|
||||
type Snapshot struct {
|
||||
Rooms []RoomInfo
|
||||
Members map[string][]Member // roomID -> members
|
||||
Keys []SealedKeyRecord
|
||||
Users []User
|
||||
}
|
||||
|
||||
// MigrateReport is the operator-facing summary of one migration run: how many
// records of each kind were moved, plus the path of the backup taken
// beforehand.
type MigrateReport struct {
	BackupPath string // path of the pre-migration backup
	Rooms      int    // number of rooms moved
	Members    int    // number of members moved, summed over all rooms
	Keys       int    // number of sealed-key records moved
	Users      int    // number of allowlist users moved
}
|
||||
|
||||
// MigrateSQLiteToKV reads the SQLite store at sqlitePath and writes its entire
|
||||
// state into the JetStream KV buckets on js (created with cfg.Replicas). It is
|
||||
// idempotent: re-running converges to the same state. The caller is responsible
|
||||
// for backing up the SQLite file first (BackupSQLite) — this function only
|
||||
// reads it.
|
||||
func MigrateSQLiteToKV(sqlitePath string, js jetstream.JetStream, cfg JetStreamConfig) (*MigrateReport, error) {
|
||||
src, err := openSQLite(sqlitePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("migrate: open sqlite %q: %w", sqlitePath, err)
|
||||
}
|
||||
defer src.Close()
|
||||
|
||||
snap, err := src.ExportSnapshot()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("migrate: export sqlite: %w", err)
|
||||
}
|
||||
|
||||
dst, err := OpenJetStream(js, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("migrate: open kv: %w", err)
|
||||
}
|
||||
kv := dst.(*jetstreamStore)
|
||||
if err := kv.importSnapshot(snap); err != nil {
|
||||
return nil, fmt.Errorf("migrate: import to kv: %w", err)
|
||||
}
|
||||
|
||||
members := 0
|
||||
for _, ms := range snap.Members {
|
||||
members += len(ms)
|
||||
}
|
||||
return &MigrateReport{
|
||||
Rooms: len(snap.Rooms),
|
||||
Members: members,
|
||||
Keys: len(snap.Keys),
|
||||
Users: len(snap.Users),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// BackupSQLite writes a copy of the SQLite database at path into a sibling
// file named "<path>.bak.<unixnano>" and returns that file's path.
//
// The copy is produced by SQLite itself through VACUUM INTO rather than by
// copying bytes. The connection is opened with a 5000 ms busy_timeout pragma
// and pinged first so an unusable database is reported before any copy is
// attempted. Run this before MigrateSQLiteToKV so a failed migration can be
// rolled back.
//
// On failure it returns an empty path and an error wrapped with the step that
// failed (open, ping, or VACUUM INTO).
func BackupSQLite(path string) (string, error) {
	dst := fmt.Sprintf("%s.bak.%d", path, time.Now().UnixNano())

	dsn := "file:" + path + "?_pragma=busy_timeout(5000)"
	db, err := sql.Open("sqlite", dsn)
	if err != nil {
		return "", fmt.Errorf("backup: open %q: %w", path, err)
	}
	defer db.Close()

	if pingErr := db.Ping(); pingErr != nil {
		return "", fmt.Errorf("backup: ping %q: %w", path, pingErr)
	}

	// VACUUM INTO takes the target as a SQL string literal, so embedded single
	// quotes are doubled. The path is operator-supplied, never network input.
	target := "'" + strings.ReplaceAll(dst, "'", "''") + "'"
	if _, execErr := db.Exec("VACUUM INTO " + target); execErr != nil {
		return "", fmt.Errorf("backup: VACUUM INTO %q: %w", dst, execErr)
	}
	return dst, nil
}
|
||||
|
||||
// ---- SQLite export --------------------------------------------------------
|
||||
|
||||
// ExportSnapshot reads the entire SQLite control-plane state into a Snapshot.
|
||||
func (s *sqliteStore) ExportSnapshot() (*Snapshot, error) {
|
||||
snap := &Snapshot{Members: map[string][]Member{}}
|
||||
|
||||
rows, err := s.db.Query(`SELECT room_id, subject, key_epoch, encrypt, persist, sign_msgs, owner_endpoint FROM rooms ORDER BY room_id`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export: query rooms: %w", err)
|
||||
}
|
||||
for rows.Next() {
|
||||
var r RoomInfo
|
||||
var enc, per, sgn int
|
||||
if err := rows.Scan(&r.RoomID, &r.Subject, &r.Epoch, &enc, &per, &sgn, &r.OwnerEndpoint); err != nil {
|
||||
rows.Close()
|
||||
return nil, fmt.Errorf("export: scan room: %w", err)
|
||||
}
|
||||
r.Encrypt, r.Persist, r.SignMsgs = enc != 0, per != 0, sgn != 0
|
||||
snap.Rooms = append(snap.Rooms, r)
|
||||
}
|
||||
rows.Close()
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mrows, err := s.db.Query(`SELECT room_id, endpoint, role, sign_pub, kex_pub FROM members ORDER BY room_id, endpoint`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export: query members: %w", err)
|
||||
}
|
||||
for mrows.Next() {
|
||||
var roomID string
|
||||
var m Member
|
||||
if err := mrows.Scan(&roomID, &m.Endpoint, &m.Role, &m.SignPub, &m.KexPub); err != nil {
|
||||
mrows.Close()
|
||||
return nil, fmt.Errorf("export: scan member: %w", err)
|
||||
}
|
||||
snap.Members[roomID] = append(snap.Members[roomID], m)
|
||||
}
|
||||
mrows.Close()
|
||||
if err := mrows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
krows, err := s.db.Query(`SELECT room_id, epoch, endpoint, sealed_key FROM room_keys ORDER BY room_id, endpoint, epoch`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export: query room_keys: %w", err)
|
||||
}
|
||||
for krows.Next() {
|
||||
var rec SealedKeyRecord
|
||||
if err := krows.Scan(&rec.RoomID, &rec.Epoch, &rec.Endpoint, &rec.Sealed); err != nil {
|
||||
krows.Close()
|
||||
return nil, fmt.Errorf("export: scan room_key: %w", err)
|
||||
}
|
||||
snap.Keys = append(snap.Keys, rec)
|
||||
}
|
||||
krows.Close()
|
||||
if err := krows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
users, err := s.ListUsers()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export: list users: %w", err)
|
||||
}
|
||||
snap.Users = users
|
||||
return snap, nil
|
||||
}
|
||||
@@ -0,0 +1,195 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// seedSQLite populates a SQLite store with a representative control plane: two
|
||||
// rooms (one rekeyed to epoch 2 with a removed member's keys left behind), a few
|
||||
// members and sealed keys, and a user allowlist with one revoked entry. It
|
||||
// returns the populated *sqliteStore and its file path.
|
||||
func seedSQLite(t *testing.T) (*sqliteStore, string) {
|
||||
t.Helper()
|
||||
path := filepath.Join(t.TempDir(), "seed.db")
|
||||
s, err := openSQLite(path)
|
||||
if err != nil {
|
||||
t.Fatalf("openSQLite: %v", err)
|
||||
}
|
||||
|
||||
r1 := RoomInfo{RoomID: newULID(), Subject: "room.alpha", Encrypt: true, Persist: true, SignMsgs: true, OwnerEndpoint: "ep-owner1"}
|
||||
if err := s.CreateRoom(r1, []byte("o1-sign"), []byte("o1-kex"), []byte("o1-sealed-e1")); err != nil {
|
||||
t.Fatalf("create r1: %v", err)
|
||||
}
|
||||
if err := s.AddMember(r1.RoomID, Member{Endpoint: "ep-bob", Role: "member", SignPub: []byte("bob-sign"), KexPub: []byte("bob-kex")}, 1, []byte("bob-sealed-e1")); err != nil {
|
||||
t.Fatalf("add bob: %v", err)
|
||||
}
|
||||
// Rekey r1 to epoch 2 (owner keeps a key at the new epoch).
|
||||
if err := s.BumpEpoch(r1.RoomID, 2); err != nil {
|
||||
t.Fatalf("bump: %v", err)
|
||||
}
|
||||
if err := s.PutSealedKeys(r1.RoomID, 2, map[string][]byte{"ep-owner1": []byte("o1-sealed-e2")}); err != nil {
|
||||
t.Fatalf("put keys e2: %v", err)
|
||||
}
|
||||
|
||||
r2 := RoomInfo{RoomID: newULID(), Subject: "room.beta", Encrypt: false, Persist: false, SignMsgs: false, OwnerEndpoint: "ep-owner2"}
|
||||
if err := s.CreateRoom(r2, []byte("o2-sign"), []byte("o2-kex"), nil); err != nil {
|
||||
t.Fatalf("create r2: %v", err)
|
||||
}
|
||||
|
||||
if err := s.AddUser("aa11", "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("add alice: %v", err)
|
||||
}
|
||||
if err := s.AddUser("bb22", "bob", RoleMember); err != nil {
|
||||
t.Fatalf("add bob user: %v", err)
|
||||
}
|
||||
if err := s.AddUser("cc33", "carol", RoleMember); err != nil {
|
||||
t.Fatalf("add carol: %v", err)
|
||||
}
|
||||
if err := s.RevokeUser("cc33"); err != nil {
|
||||
t.Fatalf("revoke carol: %v", err)
|
||||
}
|
||||
return s, path
|
||||
}
|
||||
|
||||
// normalizeSnapshot sorts every slice in a Snapshot so two snapshots from
|
||||
// different backends can be compared regardless of enumeration order.
|
||||
func normalizeSnapshot(snap *Snapshot) {
|
||||
sort.Slice(snap.Rooms, func(i, j int) bool { return snap.Rooms[i].RoomID < snap.Rooms[j].RoomID })
|
||||
for _, ms := range snap.Members {
|
||||
sort.Slice(ms, func(i, j int) bool { return ms[i].Endpoint < ms[j].Endpoint })
|
||||
}
|
||||
sort.Slice(snap.Keys, func(i, j int) bool {
|
||||
a, b := snap.Keys[i], snap.Keys[j]
|
||||
if a.RoomID != b.RoomID {
|
||||
return a.RoomID < b.RoomID
|
||||
}
|
||||
if a.Endpoint != b.Endpoint {
|
||||
return a.Endpoint < b.Endpoint
|
||||
}
|
||||
return a.Epoch < b.Epoch
|
||||
})
|
||||
sort.Slice(snap.Users, func(i, j int) bool { return snap.Users[i].SignPub < snap.Users[j].SignPub })
|
||||
}
|
||||
|
||||
func newJS(t *testing.T) jetstream.JetStream {
|
||||
t.Helper()
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: kvFreePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("embedded nats: %v", err)
|
||||
}
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
t.Fatalf("nats connect: %v", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { nc.Close(); ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return js
|
||||
}
|
||||
|
||||
// TestMigrateSQLiteToKVParity is the parity test the issue mandates: after the
|
||||
// migration, the KV store holds exactly the SQLite source's state.
|
||||
func TestMigrateSQLiteToKVParity(t *testing.T) {
|
||||
src, path := seedSQLite(t)
|
||||
srcSnap, err := src.ExportSnapshot()
|
||||
if err != nil {
|
||||
t.Fatalf("export sqlite: %v", err)
|
||||
}
|
||||
src.Close() // release the file before the migration reopens it
|
||||
|
||||
js := newJS(t)
|
||||
report, err := MigrateSQLiteToKV(path, js, JetStreamConfig{Replicas: 1, OpTimeout: 5 * time.Second})
|
||||
if err != nil {
|
||||
t.Fatalf("migrate: %v", err)
|
||||
}
|
||||
if report.Rooms != 2 || report.Users != 3 {
|
||||
t.Fatalf("report mismatch: %+v", report)
|
||||
}
|
||||
|
||||
kv, err := OpenJetStream(js, JetStreamConfig{Replicas: 1, OpTimeout: 5 * time.Second})
|
||||
if err != nil {
|
||||
t.Fatalf("open kv: %v", err)
|
||||
}
|
||||
kvSnap, err := kv.(*jetstreamStore).ExportSnapshot()
|
||||
if err != nil {
|
||||
t.Fatalf("export kv: %v", err)
|
||||
}
|
||||
|
||||
normalizeSnapshot(srcSnap)
|
||||
normalizeSnapshot(kvSnap)
|
||||
if !reflect.DeepEqual(srcSnap, kvSnap) {
|
||||
t.Fatalf("parity mismatch after migration:\n sqlite=%+v\n kv= %+v", srcSnap, kvSnap)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMigrateSQLiteToKVIdempotent: running the migration twice converges to the
|
||||
// same KV state (every write is an overwrite). A second run must not duplicate
|
||||
// or corrupt anything.
|
||||
func TestMigrateSQLiteToKVIdempotent(t *testing.T) {
|
||||
src, path := seedSQLite(t)
|
||||
srcSnap, _ := src.ExportSnapshot()
|
||||
src.Close()
|
||||
|
||||
js := newJS(t)
|
||||
if _, err := MigrateSQLiteToKV(path, js, JetStreamConfig{Replicas: 1}); err != nil {
|
||||
t.Fatalf("migrate run 1: %v", err)
|
||||
}
|
||||
if _, err := MigrateSQLiteToKV(path, js, JetStreamConfig{Replicas: 1}); err != nil {
|
||||
t.Fatalf("migrate run 2: %v", err)
|
||||
}
|
||||
|
||||
kv, _ := OpenJetStream(js, JetStreamConfig{Replicas: 1})
|
||||
kvSnap, err := kv.(*jetstreamStore).ExportSnapshot()
|
||||
if err != nil {
|
||||
t.Fatalf("export kv: %v", err)
|
||||
}
|
||||
normalizeSnapshot(srcSnap)
|
||||
normalizeSnapshot(kvSnap)
|
||||
if !reflect.DeepEqual(srcSnap, kvSnap) {
|
||||
t.Fatalf("idempotency broken: a second migration changed the KV state\n sqlite=%+v\n kv= %+v", srcSnap, kvSnap)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBackupSQLiteCreatesConsistentCopy verifies the pre-migration backup is a
|
||||
// real, openable copy holding the same data.
|
||||
func TestBackupSQLiteCreatesConsistentCopy(t *testing.T) {
|
||||
src, path := seedSQLite(t)
|
||||
srcSnap, _ := src.ExportSnapshot()
|
||||
src.Close()
|
||||
|
||||
bak, err := BackupSQLite(path)
|
||||
if err != nil {
|
||||
t.Fatalf("backup: %v", err)
|
||||
}
|
||||
restored, err := openSQLite(bak)
|
||||
if err != nil {
|
||||
t.Fatalf("open backup: %v", err)
|
||||
}
|
||||
defer restored.Close()
|
||||
bakSnap, err := restored.ExportSnapshot()
|
||||
if err != nil {
|
||||
t.Fatalf("export backup: %v", err)
|
||||
}
|
||||
normalizeSnapshot(srcSnap)
|
||||
normalizeSnapshot(bakSnap)
|
||||
if !reflect.DeepEqual(srcSnap, bakSnap) {
|
||||
t.Fatalf("backup is not a faithful copy")
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
// (error), and after the TTL the same nonce is accepted again because its entry
|
||||
// was pruned (edge).
|
||||
func TestNonceCacheRememberPrune(t *testing.T) {
|
||||
nc := newNonceCache(50*time.Millisecond, 1000)
|
||||
nc := newMemNonceCache(50*time.Millisecond, 1000)
|
||||
base := time.Now()
|
||||
|
||||
if !nc.rememberOrReject("a", base) {
|
||||
@@ -31,7 +31,7 @@ func TestNonceCacheRememberPrune(t *testing.T) {
|
||||
// from the map.
|
||||
func TestNonceCacheCapBounded(t *testing.T) {
|
||||
const capacity = 100
|
||||
nc := newNonceCache(time.Hour, capacity)
|
||||
nc := newMemNonceCache(time.Hour, capacity)
|
||||
base := time.Now()
|
||||
for i := 0; i < 500; i++ {
|
||||
nc.rememberOrReject("n"+strconv.Itoa(i), base)
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
package membership
|
||||
|
||||
// kvNonceStore is the replicated anti-replay backend (issue 0003e): seen nonces
|
||||
// live in a JetStream KV bucket shared by every node, with a per-key TTL so they
|
||||
// expire on their own. This closes the multi-node replay hole the auditor
|
||||
// flagged: the per-process memNonceCache let an attacker replay a captured
|
||||
// request to a DIFFERENT node, whose local cache never saw the nonce. With the
|
||||
// shared bucket the first node to see a nonce wins the atomic Create, and every
|
||||
// other node rejects the replay.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// bucketNonces is the name of the JetStream KV bucket that holds seen nonces.
const bucketNonces = "UNIBUS_nonces"
|
||||
|
||||
type kvNonceStore struct {
|
||||
kv jetstream.KeyValue
|
||||
opTimeout time.Duration
|
||||
}
|
||||
|
||||
// newKVNonceStore creates (or opens) the replicated nonce bucket. ttl is the
|
||||
// per-key expiry — it must be >= the request acceptance window (2*clockSkew) so
|
||||
// a replay can never outlive its memory, exactly like the in-memory cache's TTL.
|
||||
func newKVNonceStore(js jetstream.JetStream, ttl time.Duration, replicas int, opTimeout time.Duration) (*kvNonceStore, error) {
|
||||
if replicas <= 0 {
|
||||
replicas = 1
|
||||
}
|
||||
if opTimeout <= 0 {
|
||||
opTimeout = defaultKVOpTime
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
kv, err := js.CreateOrUpdateKeyValue(ctx, jetstream.KeyValueConfig{
|
||||
Bucket: bucketNonces,
|
||||
TTL: ttl,
|
||||
Replicas: replicas,
|
||||
History: 1,
|
||||
Storage: jetstream.FileStorage,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: open nonce KV bucket (replicas=%d): %w", replicas, err)
|
||||
}
|
||||
return &kvNonceStore{kv: kv, opTimeout: opTimeout}, nil
|
||||
}
|
||||
|
||||
// nonceKVKey turns a raw nonce into a string that is safe to use as a KV key.
// Nonces arrive as standard base64, whose '+', '/' and '=' characters KV keys
// forbid, so the key is the lowercase hex encoding of the nonce's SHA-256
// digest. The mapping is deterministic: equal nonces always yield equal keys.
func nonceKVKey(nonce string) string {
	digest := sha256.New()
	digest.Write([]byte(nonce)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(digest.Sum(nil))
}
|
||||
|
||||
// rememberOrReject atomically claims the nonce: Create succeeds only if the key
|
||||
// is absent, so the first sight returns true (accept) and any later sight (a
|
||||
// replay, on this or any other node sharing the bucket) returns false. A backend
|
||||
// error fails CLOSED — reject — so a KV outage never silently disables
|
||||
// anti-replay. The TTL on the bucket expires the key, reopening the window.
|
||||
func (s *kvNonceStore) rememberOrReject(nonce string, _ time.Time) bool {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), s.opTimeout)
|
||||
defer cancel()
|
||||
if _, err := s.kv.Create(ctx, nonceKVKey(nonce), nil); err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyExists) {
|
||||
return false // replay: already claimed
|
||||
}
|
||||
return false // backend unreachable: fail closed
|
||||
}
|
||||
return true // first sight: accept
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// TestReplicatedNonceRejectsCrossNodeReplay is the issue's mandated error path:
|
||||
// with the shared KV nonce store, a request accepted on node A is rejected as a
|
||||
// replay when the SAME signed bytes are sent to node B. This closes the
|
||||
// multi-node replay hole that the per-process cache left open.
|
||||
func TestReplicatedNonceRejectsCrossNodeReplay(t *testing.T) {
|
||||
// One NATS+JetStream backing the shared nonce bucket.
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: kvFreePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
t.Fatalf("connect: %v", err)
|
||||
}
|
||||
t.Cleanup(nc.Close)
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
|
||||
// One shared SQLite store (simulating the replicated control-plane state) and
|
||||
// two membershipd servers (two nodes) that BOTH use the shared KV nonce store.
|
||||
dir := t.TempDir()
|
||||
store, err := Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
alice, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
alicePub := hex.EncodeToString(alice.SignPub)
|
||||
if err := store.AddUser(alicePub, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("add alice: %v", err)
|
||||
}
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
|
||||
mkNode := func() *httptest.Server {
|
||||
srv := NewServer(store, blobs, AuthEnforce)
|
||||
if err := srv.UseReplicatedNonces(js, 1); err != nil {
|
||||
t.Fatalf("UseReplicatedNonces: %v", err)
|
||||
}
|
||||
return httptest.NewServer(srv)
|
||||
}
|
||||
nodeA := mkNode()
|
||||
t.Cleanup(nodeA.Close)
|
||||
nodeB := mkNode()
|
||||
t.Cleanup(nodeB.Close)
|
||||
|
||||
// Build ONE signed request (fixed ts+nonce) and send the identical bytes to
|
||||
// both nodes. Authenticated path: alice listing her own rooms (200, empty).
|
||||
ts := time.Now().Unix()
|
||||
nonceRaw := make([]byte, 16)
|
||||
if _, err := rand.Read(nonceRaw); err != nil {
|
||||
t.Fatalf("nonce: %v", err)
|
||||
}
|
||||
nonce := base64.StdEncoding.EncodeToString(nonceRaw)
|
||||
path := "/members/" + frame.EndpointID(alice.SignPub) + "/rooms"
|
||||
|
||||
reqA := signedReq(t, nodeA.URL, "GET", path, nil, alice, ts, nonce)
|
||||
respA, err := http.DefaultClient.Do(reqA)
|
||||
if err != nil {
|
||||
t.Fatalf("do A: %v", err)
|
||||
}
|
||||
respA.Body.Close()
|
||||
if respA.StatusCode != http.StatusOK {
|
||||
t.Fatalf("node A first use: status %d, want 200 (auth should pass, nonce fresh)", respA.StatusCode)
|
||||
}
|
||||
|
||||
// Replay the SAME ts+nonce to node B: the shared bucket already holds the
|
||||
// nonce, so node B must reject it.
|
||||
reqB := signedReq(t, nodeB.URL, "GET", path, nil, alice, ts, nonce)
|
||||
respB, err := http.DefaultClient.Do(reqB)
|
||||
if err != nil {
|
||||
t.Fatalf("do B: %v", err)
|
||||
}
|
||||
respB.Body.Close()
|
||||
if respB.StatusCode != http.StatusUnauthorized {
|
||||
t.Fatalf("cross-node replay to node B: status %d, want 401 (replayed nonce)", respB.StatusCode)
|
||||
}
|
||||
|
||||
// And replaying to node A again is likewise rejected (same bucket).
|
||||
reqA2 := signedReq(t, nodeA.URL, "GET", path, nil, alice, ts, nonce)
|
||||
respA2, err := http.DefaultClient.Do(reqA2)
|
||||
if err != nil {
|
||||
t.Fatalf("do A2: %v", err)
|
||||
}
|
||||
respA2.Body.Close()
|
||||
if respA2.StatusCode != http.StatusUnauthorized {
|
||||
t.Fatalf("replay to node A: status %d, want 401", respA2.StatusCode)
|
||||
}
|
||||
}
|
||||
@@ -3,7 +3,6 @@ package membership
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
@@ -20,6 +19,7 @@ import (
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// Body-size ceilings for the control plane. They bound how much an unauthenticated
|
||||
@@ -56,11 +56,11 @@ const (
|
||||
// rate limiting, and read endpoints (GET) are unauthenticated. Hardening
|
||||
// (mTLS, capabilities, rate limits) is a later phase.
|
||||
type Server struct {
|
||||
store *Store
|
||||
blobs *blobstore.Store
|
||||
store Store
|
||||
blobs blobstore.Store
|
||||
mux *http.ServeMux
|
||||
authMode AuthMode
|
||||
nonces *nonceCache
|
||||
nonces nonceStore
|
||||
limiter *ipRateLimiter
|
||||
|
||||
// RequireEncryptedRooms, when true, refuses to create cleartext (ModeNATS)
|
||||
@@ -79,19 +79,35 @@ type Server struct {
|
||||
// tests that have not migrated to signed requests yet). It installs a per-IP
|
||||
// rate limiter with the package defaults; loopback dev behavior is unchanged
|
||||
// because the burst comfortably exceeds any single client's request rate.
|
||||
func NewServer(store *Store, blobs *blobstore.Store, authMode AuthMode) *Server {
|
||||
func NewServer(store Store, blobs blobstore.Store, authMode AuthMode) *Server {
|
||||
s := &Server{
|
||||
store: store,
|
||||
blobs: blobs,
|
||||
mux: http.NewServeMux(),
|
||||
authMode: authMode,
|
||||
nonces: newNonceCache(nonceTTL, maxNonceCacheEntries),
|
||||
nonces: newMemNonceCache(nonceTTL, maxNonceCacheEntries),
|
||||
limiter: newIPRateLimiter(defaultRatePerSec, defaultRateBurst, rateBucketTTL),
|
||||
}
|
||||
s.routes()
|
||||
return s
|
||||
}
|
||||
|
||||
// UseReplicatedNonces switches the server's anti-replay store from the
|
||||
// per-process in-memory cache to a JetStream KV bucket shared across the cluster
|
||||
// (issue 0003e). It MUST be called on every node of a multi-node deployment:
|
||||
// otherwise a request captured on one node can be replayed to another whose
|
||||
// local cache never saw the nonce. replicas is the bucket's replication factor
|
||||
// (R1..R3). The TTL matches the in-memory cache (nonceTTL = 2*clockSkew), so a
|
||||
// replay can never outlive its memory.
|
||||
func (s *Server) UseReplicatedNonces(js jetstream.JetStream, replicas int) error {
|
||||
ns, err := newKVNonceStore(js, nonceTTL, replicas, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.nonces = ns
|
||||
return nil
|
||||
}
|
||||
|
||||
// ServeHTTP satisfies http.Handler. It runs the control-plane auth middleware
|
||||
// (signature verification + anti-replay + allowlist) ahead of the router
|
||||
// according to authMode, then dispatches to the matched handler.
|
||||
@@ -456,7 +472,7 @@ func (s *Server) handleGetKey(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
ep, sealed, err := s.store.GetSealedKey(roomID, endpoint, epoch)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
if errors.Is(err, ErrNotFound) {
|
||||
writeErr(w, http.StatusForbidden,
|
||||
"not invited to this encrypted room: no key has been sealed for your identity. Ask the room owner to invite you before joining.")
|
||||
return
|
||||
|
||||
+77
-18
@@ -13,6 +13,7 @@ package membership
|
||||
import (
|
||||
"database/sql"
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"sort"
|
||||
@@ -26,6 +27,14 @@ import (
|
||||
//go:embed migrations/*.sql
|
||||
var migrationsFS embed.FS
|
||||
|
||||
// ErrNotFound is the backend-independent "no such record" sentinel. Both
// store implementations return it, wrapped, when a lookup misses: the SQLite
// store maps sql.ErrNoRows to it and the KV store maps a missing key to it.
// Callers test for it with errors.Is to tell "not invited / no key yet" apart
// from a genuine backend failure without depending on any driver's own error
// values, which keeps the control plane storage-agnostic (issue 0003b).
var ErrNotFound = errors.New("membership: not found")
|
||||
|
||||
// Member is a participant of a room with their published public keys.
|
||||
type Member struct {
|
||||
Endpoint string `json:"endpoint"`
|
||||
@@ -45,14 +54,58 @@ type RoomInfo struct {
|
||||
OwnerEndpoint string
|
||||
}
|
||||
|
||||
// Store is the SQLite-backed membership/key store.
|
||||
type Store struct {
|
||||
// Store is the membership/key control-plane store: the authoritative source of
|
||||
// room metadata, the member directory, per-epoch sealed room keys, and the bus
|
||||
// user allowlist. It is an interface (branch-by-abstraction, issue 0003b) with
|
||||
// two implementations: sqliteStore (the default, single-node, local SQLite) and
|
||||
// jetstreamStore (rooms/members/keys/users on replicated JetStream KV, selected
|
||||
// when the `decentralized` flag is on). Every lookup miss returns ErrNotFound
|
||||
// (wrapped); every implementation MUST fail closed (IsAuthorized returns false
|
||||
// on any backend error), so a KV quorum loss denies rather than admits.
|
||||
type Store interface {
|
||||
// Rooms / members / keys.
// The per-method notes in this interface are taken from the sqliteStore
// method docs; presumably the JetStream KV backend is expected to match them
// — confirm against that implementation.
|
||||
// CreateRoom creates a room at epoch 1, registers the owner as a member with
// role "owner", and stores the owner's sealed key for epoch 1. In the SQLite
// backend a duplicate room_id is an error, not an idempotent no-op.
CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealedKey []byte) error
|
||||
// GetRoom returns the room's metadata, including its current key epoch.
GetRoom(roomID string) (RoomInfo, error)
|
||||
// AddMember inserts m into the room at m's role and stores sealedKey as that
// member's sealed room key for the supplied epoch.
AddMember(roomID string, m Member, epoch int, sealedKey []byte) error
|
||||
// GetMember returns a single member of a room.
GetMember(roomID, endpoint string) (Member, error)
|
||||
// ListMembers returns all members of a room (the SQLite backend orders them
// by endpoint).
ListMembers(roomID string) ([]Member, error)
|
||||
// ListRoomsForEndpoint returns every room the endpoint is a member of, with
// the room's current metadata and the endpoint's role. Per the SQLite
// backend's doc the result is ordered by room id, and an endpoint in no rooms
// yields an empty slice rather than an error.
ListRoomsForEndpoint(endpoint string) ([]RoomMembership, error)
|
||||
// GetSealedKey returns (epoch, sealed key) for an endpoint. When epoch <= 0
// the latest epoch stored for that endpoint is returned. The SQLite backend
// maps a missing row to a wrapped ErrNotFound so callers can tell "not
// invited / no key yet" apart from a backend failure.
GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, error)
|
||||
// PutSealedKeys stores a batch of sealed keys (endpoint -> sealed bytes) for
// the given epoch; the SQLite backend upserts so a rekey can overwrite stale
// entries.
PutSealedKeys(roomID string, epoch int, keys map[string][]byte) error
|
||||
// BumpEpoch sets the room's current key epoch to newEpoch.
BumpEpoch(roomID string, newEpoch int) error
|
||||
// RemoveMember deletes a member from a room. Per the SQLite backend's doc,
// the member's sealed keys for past epochs are left intact.
RemoveMember(roomID, endpoint string) error
|
||||
|
||||
// Users (the bus allowlist).
|
||||
// AddUser registers a new bus user. Per the SQLite backend's doc: role
// defaults to RoleMember when empty, signPub and handle must be non-empty,
// and an already-registered sign_pub yields ErrUserExists.
AddUser(signPub, handle, role string) error
|
||||
// GetUser returns the user with the given signing public key.
// NOTE(review): the sqliteStore.GetUser doc comment still says a miss returns
// a wrapped sql.ErrNoRows, which conflicts with the "every lookup miss
// returns ErrNotFound" contract stated for this interface — confirm which
// one the code actually does and align the other.
GetUser(signPub string) (User, error)
|
||||
// ListUsers returns every user (the SQLite backend orders by handle, then
// sign_pub, for stable output).
ListUsers() ([]User, error)
|
||||
// RevokeUser revokes a user by flipping its status rather than deleting it.
// Per the SQLite backend's doc, revoking an unknown user is an error and
// revoking an already-revoked user is a no-op.
RevokeUser(signPub string) error
|
||||
// IsAuthorized reports whether signPub may use the bus. It must fail closed:
// an unknown key, a revoked key, or any backend error all yield false.
IsAuthorized(signPub string) bool
|
||||
// HasAdmin reports whether at least one active admin exists.
HasAdmin() bool
|
||||
|
||||
// Lifecycle.
|
||||
// Close releases the backend (the SQLite backend closes its database handle).
Close() error
|
||||
}
|
||||
|
||||
// sqliteStore is the SQLite-backed implementation of Store (the default,
|
||||
// single-node backend). It stays the production default while the
|
||||
// `decentralized` flag is off.
|
||||
type sqliteStore struct {
|
||||
// db is the database/sql handle that the sqliteStore methods run their
// queries and transactions on, and that Close closes.
db *sql.DB
|
||||
}
|
||||
|
||||
// Open opens (creating if needed) the SQLite database at path and applies all
|
||||
// embedded migrations idempotently.
|
||||
func Open(path string) (*Store, error) {
|
||||
// Open opens (creating if needed) the SQLite database at path, applies all
|
||||
// embedded migrations idempotently, and returns it as a Store. It remains the
|
||||
// default control-plane backend; the JetStream KV store is opened separately
|
||||
// (OpenJetStream) when decentralization is enabled.
|
||||
func Open(path string) (Store, error) {
|
||||
return openSQLite(path)
|
||||
}
|
||||
|
||||
// openSQLite is the concrete constructor, returning *sqliteStore so internal
|
||||
// callers (e.g. the SQLite->KV migration) can use SQLite-specific helpers that
|
||||
// are not part of the storage-agnostic Store interface.
|
||||
func openSQLite(path string) (*sqliteStore, error) {
|
||||
// _pragma busy_timeout avoids spurious "database is locked" under concurrent
|
||||
// HTTP handlers; foreign_keys kept off — we manage referential integrity in code.
|
||||
dsn := fmt.Sprintf("file:%s?_pragma=busy_timeout(5000)&_pragma=journal_mode(WAL)", path)
|
||||
@@ -64,7 +117,7 @@ func Open(path string) (*Store, error) {
|
||||
db.Close()
|
||||
return nil, fmt.Errorf("membership: ping db: %w", err)
|
||||
}
|
||||
s := &Store{db: db}
|
||||
s := &sqliteStore{db: db}
|
||||
if err := s.applyMigrations(); err != nil {
|
||||
db.Close()
|
||||
return nil, err
|
||||
@@ -73,11 +126,11 @@ func Open(path string) (*Store, error) {
|
||||
}
|
||||
|
||||
// Close closes the underlying database.
|
||||
func (s *Store) Close() error { return s.db.Close() }
|
||||
func (s *sqliteStore) Close() error {
	// Closing the *sql.DB is the only teardown; its error is returned as-is.
	closeErr := s.db.Close()
	return closeErr
}
|
||||
|
||||
// applyMigrations runs every embedded migration in lexical order, tolerating
|
||||
// the "already applied" errors that SQLite's non-idempotent DDL produces.
|
||||
func (s *Store) applyMigrations() error {
|
||||
func (s *sqliteStore) applyMigrations() error {
|
||||
files, err := fs.Glob(migrationsFS, "migrations/*.sql")
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: glob migrations: %w", err)
|
||||
@@ -103,7 +156,7 @@ func nowRFC3339() string { return time.Now().UTC().Format(time.RFC3339Nano) }
|
||||
// CreateRoom inserts a room at epoch 1, registers the owner as a member with
|
||||
// role "owner", and stores the owner's sealed key for epoch 1. Idempotent
|
||||
// inserts are not used: a duplicate room_id returns an error.
|
||||
func (s *Store) CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealedKey []byte) error {
|
||||
func (s *sqliteStore) CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealedKey []byte) error {
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: begin: %w", err)
|
||||
@@ -142,7 +195,7 @@ func (s *Store) CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealed
|
||||
}
|
||||
|
||||
// GetRoom returns room metadata (including current epoch).
|
||||
func (s *Store) GetRoom(roomID string) (RoomInfo, error) {
|
||||
func (s *sqliteStore) GetRoom(roomID string) (RoomInfo, error) {
|
||||
var info RoomInfo
|
||||
var enc, per, sgn int
|
||||
err := s.db.QueryRow(
|
||||
@@ -158,7 +211,7 @@ func (s *Store) GetRoom(roomID string) (RoomInfo, error) {
|
||||
|
||||
// AddMember inserts a member at the given role and stores their sealed key for
|
||||
// the supplied epoch.
|
||||
func (s *Store) AddMember(roomID string, m Member, epoch int, sealedKey []byte) error {
|
||||
func (s *sqliteStore) AddMember(roomID string, m Member, epoch int, sealedKey []byte) error {
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: begin: %w", err)
|
||||
@@ -185,7 +238,7 @@ func (s *Store) AddMember(roomID string, m Member, epoch int, sealedKey []byte)
|
||||
}
|
||||
|
||||
// GetMember returns a single member of a room.
|
||||
func (s *Store) GetMember(roomID, endpoint string) (Member, error) {
|
||||
func (s *sqliteStore) GetMember(roomID, endpoint string) (Member, error) {
|
||||
var m Member
|
||||
err := s.db.QueryRow(
|
||||
`SELECT endpoint, role, sign_pub, kex_pub FROM members WHERE room_id = ? AND endpoint = ?`,
|
||||
@@ -198,7 +251,7 @@ func (s *Store) GetMember(roomID, endpoint string) (Member, error) {
|
||||
}
|
||||
|
||||
// ListMembers returns all members of a room ordered by endpoint.
|
||||
func (s *Store) ListMembers(roomID string) ([]Member, error) {
|
||||
func (s *sqliteStore) ListMembers(roomID string) ([]Member, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT endpoint, role, sign_pub, kex_pub FROM members WHERE room_id = ? ORDER BY endpoint`,
|
||||
roomID,
|
||||
@@ -230,7 +283,7 @@ type RoomMembership struct {
|
||||
// ListRoomsForEndpoint returns every room the given endpoint is a member of,
|
||||
// with the room's current metadata and the endpoint's role, ordered by room id.
|
||||
// An endpoint that is in no rooms yields an empty slice (not an error).
|
||||
func (s *Store) ListRoomsForEndpoint(endpoint string) ([]RoomMembership, error) {
|
||||
func (s *sqliteStore) ListRoomsForEndpoint(endpoint string) ([]RoomMembership, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT r.room_id, r.subject, r.key_epoch, r.encrypt, r.persist, r.sign_msgs, r.owner_endpoint, m.role
|
||||
FROM members m JOIN rooms r ON r.room_id = m.room_id
|
||||
@@ -257,7 +310,7 @@ func (s *Store) ListRoomsForEndpoint(endpoint string) ([]RoomMembership, error)
|
||||
|
||||
// GetSealedKey returns the sealed room key for an endpoint at a given epoch.
|
||||
// If epoch <= 0, the latest epoch for that endpoint is returned.
|
||||
func (s *Store) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, error) {
|
||||
func (s *sqliteStore) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, error) {
|
||||
var ep int
|
||||
var sealed []byte
|
||||
var err error
|
||||
@@ -275,6 +328,12 @@ func (s *Store) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, e
|
||||
).Scan(&ep, &sealed)
|
||||
}
|
||||
if err != nil {
|
||||
// Map "no such row" to the store-agnostic sentinel so the control plane
|
||||
// can tell "not invited / no key yet" (-> 403 with a helpful message) from
|
||||
// a genuine backend failure, the same way the KV store will.
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return 0, nil, fmt.Errorf("membership: get sealed key %q/%q@%d: %w", roomID, endpoint, epoch, ErrNotFound)
|
||||
}
|
||||
return 0, nil, fmt.Errorf("membership: get sealed key %q/%q@%d: %w", roomID, endpoint, epoch, err)
|
||||
}
|
||||
return ep, sealed, nil
|
||||
@@ -282,7 +341,7 @@ func (s *Store) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, e
|
||||
|
||||
// PutSealedKeys stores a batch of sealed keys for the given epoch (endpoint ->
|
||||
// sealed bytes), upserting on conflict so a rekey can overwrite stale entries.
|
||||
func (s *Store) PutSealedKeys(roomID string, epoch int, keys map[string][]byte) error {
|
||||
func (s *sqliteStore) PutSealedKeys(roomID string, epoch int, keys map[string][]byte) error {
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: begin: %w", err)
|
||||
@@ -301,7 +360,7 @@ func (s *Store) PutSealedKeys(roomID string, epoch int, keys map[string][]byte)
|
||||
}
|
||||
|
||||
// BumpEpoch sets the room's current key_epoch to newEpoch.
|
||||
func (s *Store) BumpEpoch(roomID string, newEpoch int) error {
|
||||
func (s *sqliteStore) BumpEpoch(roomID string, newEpoch int) error {
|
||||
if _, err := s.db.Exec(`UPDATE rooms SET key_epoch = ? WHERE room_id = ?`, newEpoch, roomID); err != nil {
|
||||
return fmt.Errorf("membership: bump epoch %q->%d: %w", roomID, newEpoch, err)
|
||||
}
|
||||
@@ -310,7 +369,7 @@ func (s *Store) BumpEpoch(roomID string, newEpoch int) error {
|
||||
|
||||
// RemoveMember deletes a member from a room. Their sealed keys for past epochs
|
||||
// are left intact (they encrypt only data that member could already read).
|
||||
func (s *Store) RemoveMember(roomID, endpoint string) error {
|
||||
func (s *sqliteStore) RemoveMember(roomID, endpoint string) error {
|
||||
if _, err := s.db.Exec(`DELETE FROM members WHERE room_id = ? AND endpoint = ?`, roomID, endpoint); err != nil {
|
||||
return fmt.Errorf("membership: remove member %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
|
||||
@@ -6,10 +6,10 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func openTestStore(t *testing.T) *Store {
|
||||
func openTestStore(t *testing.T) *sqliteStore {
|
||||
t.Helper()
|
||||
path := filepath.Join(t.TempDir(), "test.db")
|
||||
s, err := Open(path)
|
||||
s, err := openSQLite(path)
|
||||
if err != nil {
|
||||
t.Fatalf("Open: %v", err)
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ func normalizeSignPub(signPub string) string {
|
||||
// AddUser inserts a new bus user. role defaults to RoleMember when empty. It
|
||||
// returns ErrUserExists if the sign_pub is already registered (the caller may
|
||||
// choose to revoke+re-add or ignore). handle and signPub must be non-empty.
|
||||
func (s *Store) AddUser(signPub, handle, role string) error {
|
||||
func (s *sqliteStore) AddUser(signPub, handle, role string) error {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
if signPub == "" || handle == "" {
|
||||
return fmt.Errorf("membership: AddUser: sign_pub and handle required")
|
||||
@@ -74,7 +74,7 @@ func (s *Store) AddUser(signPub, handle, role string) error {
|
||||
|
||||
// GetUser returns the user with the given signing public key. It returns
|
||||
// sql.ErrNoRows (wrapped) when there is no such user.
|
||||
func (s *Store) GetUser(signPub string) (User, error) {
|
||||
func (s *sqliteStore) GetUser(signPub string) (User, error) {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
var u User
|
||||
var revoked sql.NullString
|
||||
@@ -90,7 +90,7 @@ func (s *Store) GetUser(signPub string) (User, error) {
|
||||
}
|
||||
|
||||
// ListUsers returns every user ordered by handle then sign_pub (stable output).
|
||||
func (s *Store) ListUsers() ([]User, error) {
|
||||
func (s *sqliteStore) ListUsers() ([]User, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT sign_pub, handle, role, status, created_at, revoked_at FROM users ORDER BY handle, sign_pub`,
|
||||
)
|
||||
@@ -116,7 +116,7 @@ func (s *Store) ListUsers() ([]User, error) {
|
||||
// status flip (not a delete) so the identity stays auditable and IsAuthorized
|
||||
// immediately denies it on both planes. Revoking an unknown or already-revoked
|
||||
// user returns an error / is a no-op respectively.
|
||||
func (s *Store) RevokeUser(signPub string) error {
|
||||
func (s *sqliteStore) RevokeUser(signPub string) error {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
res, err := s.db.Exec(
|
||||
`UPDATE users SET status = ?, revoked_at = ? WHERE sign_pub = ? AND status = ?`,
|
||||
@@ -140,7 +140,7 @@ func (s *Store) RevokeUser(signPub string) error {
|
||||
// plane (HTTP request middleware) and the data plane (NATS nkey authenticator),
|
||||
// so revoking a user denies access on both without restarting anything. An
|
||||
// unknown key, a revoked key, or any query error all yield false (fail closed).
|
||||
func (s *Store) IsAuthorized(signPub string) bool {
|
||||
func (s *sqliteStore) IsAuthorized(signPub string) bool {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
if signPub == "" {
|
||||
return false
|
||||
@@ -155,7 +155,7 @@ func (s *Store) IsAuthorized(signPub string) bool {
|
||||
// HasAdmin reports whether at least one active admin exists. The control plane
|
||||
// uses it to gate user-management endpoints: until the host operator seeds the
|
||||
// first admin via the local CLI, those endpoints stay closed (chicken-egg).
|
||||
func (s *Store) HasAdmin() bool {
|
||||
func (s *sqliteStore) HasAdmin() bool {
|
||||
var one int
|
||||
err := s.db.QueryRow(
|
||||
`SELECT 1 FROM users WHERE role = ? AND status = ? LIMIT 1`, RoleAdmin, StatusActive,
|
||||
|
||||
Reference in New Issue
Block a user