37c778ca9a
The per-process nonce cache breaks anti-replay under multi-node failover (audit 0004): a request captured on one node can be replayed to a DIFFERENT node whose local cache never saw the nonce, and is accepted. This makes the nonce state shared so a replay is rejected cluster-wide. pkg/membership: - nonceStore is now an interface. The in-memory cache is renamed memNonceCache (still the default, single-node behavior). - kvNonceStore (new) claims each nonce with an atomic KV Create on a shared bucket: first sight wins (accept), any later sight on any node rejects (replay). A backend error fails CLOSED (reject), so a KV outage never silently disables anti-replay. The bucket carries a TTL = nonceTTL (2*clockSkew) so a key expires exactly when its replay window closes; raw base64 nonces are mapped to KV-safe keys via sha256-hex. - Server.UseReplicatedNonces(js, replicas) swaps the store on a node; every node in a cluster calls it. NewServer still defaults to the in-memory cache (master behavior unchanged). Test (DoD error path — the issue's cross-node replay case): - TestReplicatedNonceRejectsCrossNodeReplay: two membershipd nodes share one KV bucket; a request accepted (200) on node A, replayed with the same ts+nonce to node B, is rejected (401) — and replaying to A again is rejected too.
242 lines
8.5 KiB
Go
242 lines
8.5 KiB
Go
package membership
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/base64"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"net/http"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
|
|
cs "fn-registry/functions/cybersecurity"
|
|
|
|
"github.com/enmanuel/unibus/pkg/frame"
|
|
)
|
|
|
|
// AuthMode is the rollout state of control-plane authentication (feature flag
// bus-auth). It decides what the HTTP middleware does with a request whose
// signature is missing, invalid, replayed, skewed, or from an unregistered
// identity.
type AuthMode int

// The rollout states, from least to most strict.
const (
	// AuthOff verifies nothing (legacy behavior; the default).
	AuthOff AuthMode = iota
	// AuthSoft verifies and LOGS rejections but still lets the request through,
	// so clients can migrate to signing without an outage.
	AuthSoft
	// AuthEnforce rejects unauthenticated requests with 401.
	AuthEnforce
)

// String returns the bus-auth flag spelling of m ("off", "soft" or "enforce"),
// or "unknown" for any value outside the declared modes.
func (m AuthMode) String() string {
	names := [...]string{AuthOff: "off", AuthSoft: "soft", AuthEnforce: "enforce"}
	if m < 0 || int(m) >= len(names) {
		return "unknown"
	}
	return names[m]
}
|
|
|
|
// ParseAuthMode maps the bus-auth flag string to an AuthMode.
|
|
func ParseAuthMode(s string) (AuthMode, error) {
|
|
switch s {
|
|
case "off", "":
|
|
return AuthOff, nil
|
|
case "soft":
|
|
return AuthSoft, nil
|
|
case "enforce":
|
|
return AuthEnforce, nil
|
|
default:
|
|
return AuthOff, fmt.Errorf("membership: invalid bus-auth mode %q (want off|soft|enforce)", s)
|
|
}
|
|
}
|
|
|
|
// Names of the control-plane signature headers. A client signs the canonical
// bytes of its request and sends these four headers; the server rebuilds the
// same canonical bytes and verifies the signature against them. The exact byte
// layout is defined by CanonicalRequest.
const (
	// hdrPub carries the signer's Ed25519 public key as lowercase hex.
	hdrPub = "X-Unibus-Pub"
	// hdrTs carries the signing time as unix seconds, in string form.
	hdrTs = "X-Unibus-Ts"
	// hdrNonce carries 16 random bytes in standard base64.
	hdrNonce = "X-Unibus-Nonce"
	// hdrSig carries the Ed25519 signature over the canonical bytes, in
	// standard base64.
	hdrSig = "X-Unibus-Sig"
)
|
|
|
|
// Anti-replay parameters. A request is accepted only if its timestamp is within
// clockSkew of now; nonces are remembered for nonceTTL so a captured request
// cannot be replayed inside its acceptance window. nonceTTL must cover the FULL
// acceptance window so a replay can never outlive its memory.
const (
	// clockSkew is the largest tolerated difference, in either direction,
	// between the request timestamp and the server clock.
	clockSkew = 30 * time.Second

	// nonceTTL is how long a nonce is remembered. authenticate compares
	// whole-second values (now.Unix() minus the header timestamp), so a request
	// stamped clockSkew in the future and first seen at T.000 is still
	// admissible at T+2*clockSkew+0.999s. A TTL of exactly 2*clockSkew would
	// forget the nonce during that final sub-second and admit a replay, hence
	// the extra second. It is derived from clockSkew so the two cannot drift.
	nonceTTL = 2*clockSkew + time.Second

	// maxNonceCacheEntries bounds the replay cache so it cannot grow without
	// limit (audit H7). With IsAuthorized now gating insertion, only authorized
	// traffic is cached, so this ceiling is only approached under a legitimate
	// burst; at the cap the oldest nonce is evicted (its TTL is nearly up
	// anyway).
	maxNonceCacheEntries = 100_000
)
|
|
|
|
// CanonicalRequest builds the exact byte string that is signed by the client
// and re-derived by the server for a control-plane request:
//
//	method "\n" path "\n" ts "\n" nonce "\n" hex(sha256(body))
//
// path must be the request URI (path plus raw query) so that query parameters
// (endpoint, epoch) are covered by the signature. The function is exported so
// the client library and the tests sign with the identical construction; this
// is the single place where the format is defined.
func CanonicalRequest(method, path, ts, nonce string, body []byte) []byte {
	digest := sha256.Sum256(body)

	// Four separators plus the hex digest follow the four textual fields.
	size := len(method) + len(path) + len(ts) + len(nonce) + 4 + hex.EncodedLen(len(digest))
	out := make([]byte, 0, size)

	for _, field := range [...]string{method, path, ts, nonce} {
		out = append(out, field...)
		out = append(out, '\n')
	}
	return append(out, hex.EncodeToString(digest[:])...)
}
|
|
|
|
// nonceStore is the anti-replay backend used by authenticate.
//
// It is an interface (issue 0003e) so the single-node in-memory cache can be
// swapped for a replicated KV store. A per-process cache is BROKEN under
// multi-node failover: a request captured on one node and replayed to a
// DIFFERENT node, whose cache never saw the nonce, would be accepted. A cluster
// therefore MUST share its nonce state.
//
// Implementations must fail CLOSED: a backend that cannot be reached rejects
// the request rather than admitting it.
type nonceStore interface {
	// rememberOrReject records nonce as seen at time now. It returns true when
	// the nonce was previously unseen (accept the request) and false when it
	// was already seen (reject the replay).
	rememberOrReject(nonce string, now time.Time) bool
}
|
|
|
|
// memNonceCache remembers recently-seen nonces to reject replays. It is an
// in-memory store guarded by a mutex, which is sufficient for a SINGLE
// membershipd process. A clustered deployment uses kvNonceStore instead
// (issue 0003e).
//
// Pruning is O(expired), not O(n): because the TTL is constant, insertion order
// equals expiry order, so the oldest entries (front of order) are exactly the
// ones that expire first (audit H7: the previous full-map scan under the mutex
// was a CPU-amplification vector). Dropped entries are removed by re-slicing
// the front of order, never by copying the live tail; the abandoned prefix of
// the backing array is released the next time append has to grow the slice.
// A size cap bounds memory.
type memNonceCache struct {
	mu    sync.Mutex
	seen  map[string]time.Time // nonce -> expiry
	order []string             // nonces in insertion order == expiry order
	ttl   time.Duration        // lifetime given to every remembered nonce
	cap   int                  // maximum number of live entries kept in seen
}

// newMemNonceCache returns an empty cache whose entries live for ttl and whose
// size is bounded by capacity.
func newMemNonceCache(ttl time.Duration, capacity int) *memNonceCache {
	return &memNonceCache{seen: make(map[string]time.Time), ttl: ttl, cap: capacity}
}

// rememberOrReject records nonce and returns true if it was unseen, or false if
// it is a replay (still live in the cache). A nonce counts as live up to and
// including its expiry instant.
func (n *memNonceCache) rememberOrReject(nonce string, now time.Time) bool {
	n.mu.Lock()
	defer n.mu.Unlock()

	// Prune expired entries from the front (oldest first). The first live entry
	// ends the scan: everything behind it was inserted later and is newer.
	cut := 0
	for cut < len(n.order) {
		key := n.order[cut]
		if exp, ok := n.seen[key]; ok {
			if !exp.Before(now) {
				break
			}
			delete(n.seen, key)
		}
		// Either expired just now or no longer present in seen; in both cases
		// the slot is dead. Blank it so the string is not pinned by the
		// backing array until the next reallocation.
		n.order[cut] = ""
		cut++
	}
	// Re-slice instead of copying the live tail forward: copying would make
	// every call O(live entries) while the mutex is held.
	n.order = n.order[cut:]

	if exp, ok := n.seen[nonce]; ok && !exp.Before(now) {
		return false // a live replay
	}

	// Bound memory: at capacity, evict the oldest entry (its TTL is nearly up).
	for len(n.seen) >= n.cap && len(n.order) > 0 {
		delete(n.seen, n.order[0])
		n.order[0] = ""
		n.order = n.order[1:]
	}

	n.seen[nonce] = now.Add(n.ttl)
	n.order = append(n.order, nonce)
	return true
}
|
|
|
|
// authResult is what a successful authentication yields: the verified signing
|
|
// key (hex), the endpoint id derived from it, and the authorized user record.
|
|
// Handlers use endpoint for membership authorization (only a member of a room
|
|
// may read its metadata/keys); user is available for role checks.
|
|
type authResult struct {
|
|
pubHex string
|
|
endpoint string
|
|
user User
|
|
}
|
|
|
|
// authenticate verifies the signature headers on r against body and the user
|
|
// allowlist. It returns an error describing the first failing check; the
|
|
// middleware decides whether that error blocks (enforce) or only logs (soft).
|
|
//
|
|
// Order matters: cheap, non-cryptographic checks (header presence, key shape,
|
|
// clock skew) run first; the Ed25519 verification runs before the replay cache
|
|
// is touched so an attacker cannot poison the cache with unsigned nonces; the
|
|
// allowlist lookup runs last.
|
|
func (s *Server) authenticate(r *http.Request, body []byte, now time.Time) (authResult, error) {
|
|
pubHex := r.Header.Get(hdrPub)
|
|
ts := r.Header.Get(hdrTs)
|
|
nonce := r.Header.Get(hdrNonce)
|
|
sigB64 := r.Header.Get(hdrSig)
|
|
if pubHex == "" || ts == "" || nonce == "" || sigB64 == "" {
|
|
return authResult{}, fmt.Errorf("missing auth headers")
|
|
}
|
|
|
|
pub, err := hex.DecodeString(pubHex)
|
|
if err != nil || len(pub) != 32 {
|
|
return authResult{}, fmt.Errorf("malformed %s (want 32-byte Ed25519 hex)", hdrPub)
|
|
}
|
|
|
|
tsInt, err := strconv.ParseInt(ts, 10, 64)
|
|
if err != nil {
|
|
return authResult{}, fmt.Errorf("malformed %s", hdrTs)
|
|
}
|
|
if d := now.Unix() - tsInt; d > int64(clockSkew/time.Second) || d < -int64(clockSkew/time.Second) {
|
|
return authResult{}, fmt.Errorf("timestamp out of range (skew %ds)", d)
|
|
}
|
|
|
|
sig, err := base64.StdEncoding.DecodeString(sigB64)
|
|
if err != nil {
|
|
return authResult{}, fmt.Errorf("malformed %s", hdrSig)
|
|
}
|
|
|
|
canonical := CanonicalRequest(r.Method, r.URL.RequestURI(), ts, nonce, body)
|
|
if !cs.VerifyEd25519(pub, canonical, sig) {
|
|
return authResult{}, fmt.Errorf("invalid signature")
|
|
}
|
|
|
|
// Authorize BEFORE touching the replay cache (audit H7): an unregistered
|
|
// identity can mint valid signatures for free, so caching its nonces would let
|
|
// it poison/grow the cache pre-auth. Only authorized identities are remembered.
|
|
if !s.store.IsAuthorized(pubHex) {
|
|
return authResult{}, fmt.Errorf("identity not authorized")
|
|
}
|
|
|
|
user, err := s.store.GetUser(pubHex)
|
|
if err != nil {
|
|
// IsAuthorized passed but the row vanished (race with revoke): fail closed.
|
|
return authResult{}, fmt.Errorf("identity not authorized")
|
|
}
|
|
|
|
// Anti-replay last: a replayed request from an authorized identity is still
|
|
// rejected here (the nonce is already live in the cache from its first use).
|
|
if !s.nonces.rememberOrReject(nonce, now) {
|
|
return authResult{}, fmt.Errorf("replayed nonce")
|
|
}
|
|
|
|
return authResult{pubHex: pubHex, endpoint: frame.EndpointID(pub), user: user}, nil
|
|
}
|