Files
unibus/pkg/membership/auth.go
T
agent 37c778ca9a feat(0003e/2): replicated anti-replay nonce store on JetStream KV
The per-process nonce cache breaks anti-replay under multi-node failover
(audit 0004): a request captured on one node can be replayed to a
DIFFERENT node whose local cache never saw the nonce, and is accepted.
This makes the nonce state shared so a replay is rejected cluster-wide.

pkg/membership:
- nonceStore is now an interface. The in-memory cache is renamed
  memNonceCache (still the default, single-node behavior).
- kvNonceStore (new) claims each nonce with an atomic KV Create on a
  shared bucket: first sight wins (accept), any later sight on any node
  rejects (replay). A backend error fails CLOSED (reject), so a KV outage
  never silently disables anti-replay. The bucket carries a TTL =
  nonceTTL (2*clockSkew) so a key expires exactly when its replay window
  closes; raw base64 nonces are mapped to KV-safe keys via sha256-hex.
- Server.UseReplicatedNonces(js, replicas) swaps the store on a node;
  every node in a cluster calls it. NewServer still defaults to the
  in-memory cache (master behavior unchanged).

Test (DoD error path — the issue's cross-node replay case):
- TestReplicatedNonceRejectsCrossNodeReplay: two membershipd nodes share
  one KV bucket; a request accepted (200) on node A, replayed with the
  same ts+nonce to node B, is rejected (401) — and replaying to A again
  is rejected too.
2026-06-07 15:21:45 +02:00

242 lines
8.5 KiB
Go

package membership
import (
"crypto/sha256"
"encoding/base64"
"encoding/hex"
"fmt"
"net/http"
"strconv"
"sync"
"time"
cs "fn-registry/functions/cybersecurity"
"github.com/enmanuel/unibus/pkg/frame"
)
// AuthMode is the control-plane authentication rollout state (feature flag
// bus-auth). It governs how the HTTP middleware treats a request whose signature
// is missing, invalid, replayed, skewed, or from an unregistered identity.
//
//   - AuthOff: do not verify anything (legacy behavior; default).
//   - AuthSoft: verify and LOG rejections, but let the request through. Lets
//     clients migrate to signing without an outage.
//   - AuthEnforce: reject unauthenticated requests with 401.
//
// AuthOff is the iota zero value, so a zero-initialized AuthMode is AuthOff.
type AuthMode int
// Rollout states, declared from least strict (AuthOff) to most strict
// (AuthEnforce).
const (
AuthOff AuthMode = iota
AuthSoft
AuthEnforce
)
// String returns the bus-auth flag spelling of m ("off", "soft" or "enforce").
// Any value outside the declared modes is rendered as "unknown".
func (m AuthMode) String() string {
	// Indexed by mode value; the declared modes are the contiguous range 0..2.
	names := [...]string{AuthOff: "off", AuthSoft: "soft", AuthEnforce: "enforce"}
	if m < 0 || int(m) >= len(names) {
		return "unknown"
	}
	return names[m]
}
// ParseAuthMode converts the bus-auth flag value s into an AuthMode. The empty
// string is treated like "off"; matching is exact (case-sensitive). Any other
// value yields AuthOff together with a non-nil error naming the accepted
// spellings.
func ParseAuthMode(s string) (AuthMode, error) {
	if s == "" || s == "off" {
		return AuthOff, nil
	}
	if s == "soft" {
		return AuthSoft, nil
	}
	if s == "enforce" {
		return AuthEnforce, nil
	}
	return AuthOff, fmt.Errorf("membership: invalid bus-auth mode %q (want off|soft|enforce)", s)
}
// Control-plane signature headers. The client signs the canonical bytes of the
// request and presents these; the server reconstructs the canonical bytes and
// verifies. See CanonicalRequest for the exact byte layout. authenticate
// requires all four headers to be present and non-empty.
const (
hdrPub = "X-Unibus-Pub" // signer Ed25519 public key, lowercase hex (must decode to 32 bytes)
hdrTs = "X-Unibus-Ts" // unix seconds (string, parsed as a base-10 int64)
hdrNonce = "X-Unibus-Nonce" // 16 random bytes, std base64
hdrSig = "X-Unibus-Sig" // Ed25519 signature over canonical, std base64
)
// Anti-replay parameters.
//
// A request is accepted only while its timestamp lies within clockSkew of the
// server clock, in either direction. Each accepted nonce is remembered for
// nonceTTL so that a captured request cannot be replayed while its timestamp is
// still acceptable. nonceTTL must never be shorter than the full acceptance
// window (2*clockSkew), otherwise a replay could outlive the memory of its
// nonce; it is therefore derived from clockSkew rather than spelled out.
const (
	clockSkew = 30 * time.Second
	nonceTTL  = 2 * clockSkew

	// maxNonceCacheEntries bounds the replay cache so it cannot grow without
	// limit (audit H7). Because IsAuthorized gates insertion, only authorized
	// traffic is cached, so the ceiling is only approached under a legitimate
	// burst; at the cap the oldest nonce is evicted (its TTL is nearly up
	// anyway).
	maxNonceCacheEntries = 100_000
)
// CanonicalRequest builds the byte string that a control-plane request's
// signature covers, on both the signing and the verifying side:
//
//	method "\n" path "\n" ts "\n" nonce "\n" hex(sha256(body))
//
// path is the request URI (path plus raw query), so query parameters such as
// endpoint and epoch are covered by the signature. body is hashed rather than
// embedded; a nil or empty body contributes the SHA-256 of the empty input.
// The function is exported so the client library and tests sign with the
// identical construction; this is the one place the format lives.
func CanonicalRequest(method, path, ts, nonce string, body []byte) []byte {
	digest := sha256.Sum256(body)
	hexDigest := hex.EncodeToString(digest[:])

	// Four newline-terminated fields followed by the hex digest.
	out := make([]byte, 0, len(method)+len(path)+len(ts)+len(nonce)+len(hexDigest)+4)
	for _, field := range [...]string{method, path, ts, nonce} {
		out = append(out, field...)
		out = append(out, '\n')
	}
	return append(out, hexDigest...)
}
// nonceStore is the anti-replay backend: rememberOrReject records a nonce and
// reports whether it was unseen (true -> accept) or already seen (false ->
// reject the replay). It is an interface (issue 0003e) so the single-node
// in-memory cache can be swapped for a replicated KV store: a per-process cache
// is BROKEN under multi-node failover (a request captured and replayed to a
// DIFFERENT node whose cache never saw the nonce would be accepted), so a
// cluster MUST share the nonce state. Every implementation fails CLOSED — a
// backend it cannot reach rejects rather than admits.
type nonceStore interface {
// rememberOrReject records nonce as seen at time now. It returns true when
// the nonce was unseen (the request may proceed) and false when the request
// must be rejected as a replay.
rememberOrReject(nonce string, now time.Time) bool
}
// memNonceCache remembers recently-seen nonces to reject replays. It is an
// in-memory store guarded by a mutex — sufficient for a SINGLE membershipd
// process. A clustered deployment uses kvNonceStore instead (issue 0003e).
//
// Pruning is O(expired), not O(n): because the TTL is constant, insertion order
// equals expiry order, so the oldest entries (front of `order`) are exactly the
// ones that expire first (audit H7 — the previous full-map scan under the mutex
// was a CPU-amplification vector). Consumed entries are dropped by re-slicing
// `order`, never by shifting the live tail, so the cost of a call does not grow
// with the number of live nonces. A size cap bounds memory.
type memNonceCache struct {
	mu    sync.Mutex
	seen  map[string]time.Time // nonce -> expiry
	order []string             // nonces in insertion order == expiry order
	ttl   time.Duration        // lifetime given to every recorded nonce
	cap   int                  // maximum number of live nonces kept in seen
}

// newMemNonceCache returns an empty cache that remembers each nonce for ttl and
// holds at most capacity live nonces.
func newMemNonceCache(ttl time.Duration, capacity int) *memNonceCache {
	return &memNonceCache{seen: make(map[string]time.Time), ttl: ttl, cap: capacity}
}

// rememberOrReject records nonce and returns true if it was unseen, or false if
// it is a replay (still live in the cache). An entry is live up to and
// including its expiry instant. now is the caller's clock reading; entries
// whose expiry is strictly before now are pruned first.
//
// When the cache is at capacity the oldest entries are evicted to make room,
// so under overload an evicted nonce is forgotten before its TTL elapses.
func (n *memNonceCache) rememberOrReject(nonce string, now time.Time) bool {
	n.mu.Lock()
	defer n.mu.Unlock()

	// Prune expired entries from the front (oldest first). The first live entry
	// ends the scan — everything behind it was inserted later and is newer.
	cut := 0
	for cut < len(n.order) {
		key := n.order[cut]
		if exp, ok := n.seen[key]; ok {
			if !exp.Before(now) {
				break
			}
			delete(n.seen, key)
		}
		// !ok: a stale duplicate whose map entry is already gone; just skip it.
		n.order[cut] = "" // drop the reference so the nonce string can be collected
		cut++
	}
	// Re-slice instead of copying the live tail to the front: a copy would move
	// every live entry under the mutex on each call that prunes anything. The
	// abandoned prefix is released the next time append reallocates.
	n.order = n.order[cut:]

	if exp, ok := n.seen[nonce]; ok && !exp.Before(now) {
		return false // a live replay
	}

	// Bound memory: at capacity, evict the oldest entry (its TTL is nearly up).
	for len(n.seen) >= n.cap && len(n.order) > 0 {
		delete(n.seen, n.order[0])
		n.order[0] = ""
		n.order = n.order[1:]
	}

	n.seen[nonce] = now.Add(n.ttl)
	n.order = append(n.order, nonce)
	return true
}
// authResult is what a successful authentication yields: the verified signing
// key (hex), the endpoint id derived from it, and the authorized user record.
// Handlers use endpoint for membership authorization (only a member of a room
// may read its metadata/keys); user is available for role checks.
type authResult struct {
// pubHex is the signer's public key exactly as presented in the hdrPub header.
pubHex string
// endpoint is frame.EndpointID of the decoded public key bytes.
endpoint string
// user is the record the store returned for pubHex.
user User
}
// authenticate verifies the signature headers on r against body and the user
// allowlist. On success it returns the authenticated identity; otherwise it
// returns a zero authResult and an error describing the first failing check.
// The middleware decides whether that error blocks (enforce) or only logs
// (soft).
//
// now is the server's clock reading for this request. It is used both for the
// timestamp skew check and as the instant at which the nonce is recorded.
//
// Order matters:
//  1. cheap, non-cryptographic checks first: header presence, key shape,
//     timestamp parse and clock skew;
//  2. signature decoding and Ed25519 verification over CanonicalRequest;
//  3. the allowlist lookup (IsAuthorized, then GetUser);
//  4. the anti-replay nonce claim last, so that only verified requests from
//     authorized identities ever write to the nonce store.
func (s *Server) authenticate(r *http.Request, body []byte, now time.Time) (authResult, error) {
	pubHex := r.Header.Get(hdrPub)
	ts := r.Header.Get(hdrTs)
	nonce := r.Header.Get(hdrNonce)
	sigB64 := r.Header.Get(hdrSig)
	if pubHex == "" || ts == "" || nonce == "" || sigB64 == "" {
		return authResult{}, fmt.Errorf("missing auth headers")
	}

	pub, err := hex.DecodeString(pubHex)
	if err != nil || len(pub) != 32 {
		return authResult{}, fmt.Errorf("malformed %s (want 32-byte Ed25519 hex)", hdrPub)
	}

	tsInt, err := strconv.ParseInt(ts, 10, 64)
	if err != nil {
		return authResult{}, fmt.Errorf("malformed %s", hdrTs)
	}
	// Compare at full clock resolution. Truncating now to whole seconds would
	// accept a timestamp for up to one extra second past clockSkew, stretching
	// the acceptance window beyond the 2*clockSkew that nonceTTL covers and
	// leaving a sub-second gap in which a request whose nonce has just expired
	// still passes the timestamp check and can be replayed. Time.Sub saturates
	// on overflow, so an absurd timestamp is simply out of range.
	if skew := now.Sub(time.Unix(tsInt, 0)); skew > clockSkew || skew < -clockSkew {
		return authResult{}, fmt.Errorf("timestamp out of range (skew %ds)", now.Unix()-tsInt)
	}

	sig, err := base64.StdEncoding.DecodeString(sigB64)
	if err != nil {
		return authResult{}, fmt.Errorf("malformed %s", hdrSig)
	}
	// The signature covers method, request URI (path plus raw query), the ts and
	// nonce header values exactly as received, and the body hash.
	canonical := CanonicalRequest(r.Method, r.URL.RequestURI(), ts, nonce, body)
	if !cs.VerifyEd25519(pub, canonical, sig) {
		return authResult{}, fmt.Errorf("invalid signature")
	}

	// Authorize BEFORE touching the replay cache (audit H7): an unregistered
	// identity can mint valid signatures for free, so caching its nonces would let
	// it poison/grow the cache pre-auth. Only authorized identities are remembered.
	if !s.store.IsAuthorized(pubHex) {
		return authResult{}, fmt.Errorf("identity not authorized")
	}
	user, err := s.store.GetUser(pubHex)
	if err != nil {
		// IsAuthorized passed but the row vanished (race with revoke): fail closed.
		return authResult{}, fmt.Errorf("identity not authorized")
	}

	// Anti-replay last: a replayed request from an authorized identity is still
	// rejected here (the nonce is already live in the cache from its first use).
	if !s.nonces.rememberOrReject(nonce, now) {
		return authResult{}, fmt.Errorf("replayed nonce")
	}

	return authResult{pubHex: pubHex, endpoint: frame.EndpointID(pub), user: user}, nil
}