37c778ca9a
The per-process nonce cache breaks anti-replay under multi-node failover (audit 0004): a request captured on one node can be replayed to a DIFFERENT node whose local cache never saw the nonce, and is accepted. This makes the nonce state shared so a replay is rejected cluster-wide. pkg/membership: - nonceStore is now an interface. The in-memory cache is renamed memNonceCache (still the default, single-node behavior). - kvNonceStore (new) claims each nonce with an atomic KV Create on a shared bucket: first sight wins (accept), any later sight on any node rejects (replay). A backend error fails CLOSED (reject), so a KV outage never silently disables anti-replay. The bucket carries a TTL = nonceTTL (2*clockSkew) so a key expires exactly when its replay window closes; raw base64 nonces are mapped to KV-safe keys via sha256-hex. - Server.UseReplicatedNonces(js, replicas) swaps the store on a node; every node in a cluster calls it. NewServer still defaults to the in-memory cache (master behavior unchanged). Test (DoD error path — the issue's cross-node replay case): - TestReplicatedNonceRejectsCrossNodeReplay: two membershipd nodes share one KV bucket; a request accepted (200) on node A, replayed with the same ts+nonce to node B, is rejected (401) — and replaying to A again is rejected too.
118 lines
3.7 KiB
Go
118 lines
3.7 KiB
Go
package membership
|
|
|
|
import (
|
|
"crypto/rand"
|
|
"encoding/base64"
|
|
"encoding/hex"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"path/filepath"
|
|
"testing"
|
|
"time"
|
|
|
|
cs "fn-registry/functions/cybersecurity"
|
|
|
|
"github.com/enmanuel/unibus/pkg/blobstore"
|
|
"github.com/enmanuel/unibus/pkg/embeddednats"
|
|
"github.com/enmanuel/unibus/pkg/frame"
|
|
"github.com/nats-io/nats.go"
|
|
"github.com/nats-io/nats.go/jetstream"
|
|
)
|
|
|
|
// TestReplicatedNonceRejectsCrossNodeReplay is the issue's mandated error path:
|
|
// with the shared KV nonce store, a request accepted on node A is rejected as a
|
|
// replay when the SAME signed bytes are sent to node B. This closes the
|
|
// multi-node replay hole that the per-process cache left open.
|
|
func TestReplicatedNonceRejectsCrossNodeReplay(t *testing.T) {
|
|
// One NATS+JetStream backing the shared nonce bucket.
|
|
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
|
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: kvFreePort(t),
|
|
})
|
|
if err != nil {
|
|
t.Fatalf("nats: %v", err)
|
|
}
|
|
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
|
nc, err := nats.Connect(ns.ClientURL())
|
|
if err != nil {
|
|
t.Fatalf("connect: %v", err)
|
|
}
|
|
t.Cleanup(nc.Close)
|
|
js, err := jetstream.New(nc)
|
|
if err != nil {
|
|
t.Fatalf("jetstream: %v", err)
|
|
}
|
|
|
|
// One shared SQLite store (simulating the replicated control-plane state) and
|
|
// two membershipd servers (two nodes) that BOTH use the shared KV nonce store.
|
|
dir := t.TempDir()
|
|
store, err := Open(filepath.Join(dir, "unibus.db"))
|
|
if err != nil {
|
|
t.Fatalf("store: %v", err)
|
|
}
|
|
t.Cleanup(func() { store.Close() })
|
|
alice, err := cs.GenerateIdentity()
|
|
if err != nil {
|
|
t.Fatalf("identity: %v", err)
|
|
}
|
|
alicePub := hex.EncodeToString(alice.SignPub)
|
|
if err := store.AddUser(alicePub, "alice", RoleAdmin); err != nil {
|
|
t.Fatalf("add alice: %v", err)
|
|
}
|
|
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
|
|
|
mkNode := func() *httptest.Server {
|
|
srv := NewServer(store, blobs, AuthEnforce)
|
|
if err := srv.UseReplicatedNonces(js, 1); err != nil {
|
|
t.Fatalf("UseReplicatedNonces: %v", err)
|
|
}
|
|
return httptest.NewServer(srv)
|
|
}
|
|
nodeA := mkNode()
|
|
t.Cleanup(nodeA.Close)
|
|
nodeB := mkNode()
|
|
t.Cleanup(nodeB.Close)
|
|
|
|
// Build ONE signed request (fixed ts+nonce) and send the identical bytes to
|
|
// both nodes. Authenticated path: alice listing her own rooms (200, empty).
|
|
ts := time.Now().Unix()
|
|
nonceRaw := make([]byte, 16)
|
|
if _, err := rand.Read(nonceRaw); err != nil {
|
|
t.Fatalf("nonce: %v", err)
|
|
}
|
|
nonce := base64.StdEncoding.EncodeToString(nonceRaw)
|
|
path := "/members/" + frame.EndpointID(alice.SignPub) + "/rooms"
|
|
|
|
reqA := signedReq(t, nodeA.URL, "GET", path, nil, alice, ts, nonce)
|
|
respA, err := http.DefaultClient.Do(reqA)
|
|
if err != nil {
|
|
t.Fatalf("do A: %v", err)
|
|
}
|
|
respA.Body.Close()
|
|
if respA.StatusCode != http.StatusOK {
|
|
t.Fatalf("node A first use: status %d, want 200 (auth should pass, nonce fresh)", respA.StatusCode)
|
|
}
|
|
|
|
// Replay the SAME ts+nonce to node B: the shared bucket already holds the
|
|
// nonce, so node B must reject it.
|
|
reqB := signedReq(t, nodeB.URL, "GET", path, nil, alice, ts, nonce)
|
|
respB, err := http.DefaultClient.Do(reqB)
|
|
if err != nil {
|
|
t.Fatalf("do B: %v", err)
|
|
}
|
|
respB.Body.Close()
|
|
if respB.StatusCode != http.StatusUnauthorized {
|
|
t.Fatalf("cross-node replay to node B: status %d, want 401 (replayed nonce)", respB.StatusCode)
|
|
}
|
|
|
|
// And replaying to node A again is likewise rejected (same bucket).
|
|
reqA2 := signedReq(t, nodeA.URL, "GET", path, nil, alice, ts, nonce)
|
|
respA2, err := http.DefaultClient.Do(reqA2)
|
|
if err != nil {
|
|
t.Fatalf("do A2: %v", err)
|
|
}
|
|
respA2.Body.Close()
|
|
if respA2.StatusCode != http.StatusUnauthorized {
|
|
t.Fatalf("replay to node A: status %d, want 401", respA2.StatusCode)
|
|
}
|
|
}
|