feat(0003e/2): replicated anti-replay nonce store on JetStream KV
The per-process nonce cache breaks anti-replay under multi-node failover (audit 0004): a request captured on one node can be replayed to a DIFFERENT node whose local cache never saw the nonce, and is accepted. This makes the nonce state shared so a replay is rejected cluster-wide. pkg/membership: - nonceStore is now an interface. The in-memory cache is renamed memNonceCache (still the default, single-node behavior). - kvNonceStore (new) claims each nonce with an atomic KV Create on a shared bucket: first sight wins (accept), any later sight on any node rejects (replay). A backend error fails CLOSED (reject), so a KV outage never silently disables anti-replay. The bucket carries a TTL = nonceTTL (2*clockSkew) so a key expires exactly when its replay window closes; raw base64 nonces are mapped to KV-safe keys via sha256-hex. - Server.UseReplicatedNonces(js, replicas) swaps the store on a node; every node in a cluster calls it. NewServer still defaults to the in-memory cache (master behavior unchanged). Test (DoD error path — the issue's cross-node replay case): - TestReplicatedNonceRejectsCrossNodeReplay: two membershipd nodes share one KV bucket; a request accepted (200) on node A, replayed with the same ts+nonce to node B, is rejected (401) — and replaying to A again is rejected too.
This commit is contained in:
@@ -0,0 +1,77 @@
|
||||
package membership
|
||||
|
||||
// kvNonceStore is the replicated anti-replay backend (issue 0003e): seen nonces
|
||||
// live in a JetStream KV bucket shared by every node, with a per-key TTL so they
|
||||
// expire on their own. This closes the multi-node replay hole the auditor
|
||||
// flagged: the per-process memNonceCache let an attacker replay a captured
|
||||
// request to a DIFFERENT node, whose local cache never saw the nonce. With the
|
||||
// shared bucket the first node to see a nonce wins the atomic Create, and every
|
||||
// other node rejects the replay.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// bucketNonces is the name of the JetStream KV bucket that holds claimed
// nonces; newKVNonceStore passes it as the Bucket of the KeyValueConfig.
const bucketNonces = "UNIBUS_nonces"
|
||||
|
||||
type kvNonceStore struct {
|
||||
kv jetstream.KeyValue
|
||||
opTimeout time.Duration
|
||||
}
|
||||
|
||||
// newKVNonceStore creates (or opens) the replicated nonce bucket. ttl is the
|
||||
// per-key expiry — it must be >= the request acceptance window (2*clockSkew) so
|
||||
// a replay can never outlive its memory, exactly like the in-memory cache's TTL.
|
||||
func newKVNonceStore(js jetstream.JetStream, ttl time.Duration, replicas int, opTimeout time.Duration) (*kvNonceStore, error) {
|
||||
if replicas <= 0 {
|
||||
replicas = 1
|
||||
}
|
||||
if opTimeout <= 0 {
|
||||
opTimeout = defaultKVOpTime
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
kv, err := js.CreateOrUpdateKeyValue(ctx, jetstream.KeyValueConfig{
|
||||
Bucket: bucketNonces,
|
||||
TTL: ttl,
|
||||
Replicas: replicas,
|
||||
History: 1,
|
||||
Storage: jetstream.FileStorage,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: open nonce KV bucket (replicas=%d): %w", replicas, err)
|
||||
}
|
||||
return &kvNonceStore{kv: kv, opTimeout: opTimeout}, nil
|
||||
}
|
||||
|
||||
// nonceKVKey turns a raw nonce into the key under which it is stored in the KV
// bucket: the lowercase hex encoding of the nonce's SHA-256 digest.
//
// Raw nonces are std-base64 and may contain characters (such as '+') that are
// not accepted in KV keys; the hex form uses only [0-9a-f]. The mapping is
// deterministic, so the same nonce always yields the same key, and is
// collision-free in practice.
func nonceKVKey(nonce string) string {
	digest := sha256.Sum256([]byte(nonce))
	key := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(key, digest[:])
	return string(key)
}
|
||||
|
||||
// rememberOrReject atomically claims the nonce: Create succeeds only if the key
|
||||
// is absent, so the first sight returns true (accept) and any later sight (a
|
||||
// replay, on this or any other node sharing the bucket) returns false. A backend
|
||||
// error fails CLOSED — reject — so a KV outage never silently disables
|
||||
// anti-replay. The TTL on the bucket expires the key, reopening the window.
|
||||
func (s *kvNonceStore) rememberOrReject(nonce string, _ time.Time) bool {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), s.opTimeout)
|
||||
defer cancel()
|
||||
if _, err := s.kv.Create(ctx, nonceKVKey(nonce), nil); err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyExists) {
|
||||
return false // replay: already claimed
|
||||
}
|
||||
return false // backend unreachable: fail closed
|
||||
}
|
||||
return true // first sight: accept
|
||||
}
|
||||
Reference in New Issue
Block a user