3aa5a2c9a9
The 0011 chaos test validated only the control plane (healthz + leader failover + KV readable with 2/3); it never connected an authenticated bus client to the data plane. cmd/clientcheck is a reusable verification tool: it connects with a real identity (nkey + TLS on both planes, multi-node seed lists), creates an ephemeral E2E room (encrypted + signed, no durable stream), and either publishes N messages and asserts all come back decrypted (golden) or publishes a counter for a duration while logging the attached node (loop), so stopping a node mid-run shows the client fail over to a survivor and keep receiving with quorum 2/3. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
261 lines
9.0 KiB
Go
261 lines
9.0 KiB
Go
// Command clientcheck is an end-to-end verification client for a live unibus
|
|
// cluster (issue 0011 GAP B). The 0011 chaos test validated only the control
|
|
// plane (healthz + meta/stream-leader failover + KV readable with 2/3); it never
|
|
// connected an authenticated bus client (nkey + TLS) to create a room and
|
|
// publish/subscribe through it, least of all across a node loss. clientcheck does
|
|
// exactly that with a real identity (the operator), so the data-plane end-to-end
|
|
// path — connect, create an E2E room, publish, receive decrypted — is exercised
|
|
// against the running cluster, including while a node is stopped.
|
|
//
|
|
// It is a reusable tool, not a throwaway script: point it at the cluster's CA,
|
|
// an identity file, and the NATS + control-plane seed lists.
|
|
//
|
|
// # golden: connect, create an E2E room, publish N, confirm N decrypted back
|
|
// clientcheck --ca ca.crt --identity-file operator.id \
|
|
// --nats-seeds nats://A:4250,nats://B:4250,nats://C:4250 \
|
|
// --ctrl-seeds https://A:8470,https://B:8470,https://C:8470 --messages 5
|
|
//
|
|
// # loop: publish a counter every interval for the duration, logging the node
|
|
// # it is attached to — stop a node mid-run (systemctl stop membershipd-cluster)
|
|
// # and watch it fail over to a survivor and keep receiving (quorum 2/3).
|
|
// clientcheck ... --mode loop --duration 45s --interval 1s
|
|
package main
|
|
|
|
import (
|
|
"crypto/rand"
|
|
"encoding/hex"
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/enmanuel/unibus/pkg/busauth"
|
|
"github.com/enmanuel/unibus/pkg/client"
|
|
"github.com/enmanuel/unibus/pkg/frame"
|
|
"github.com/enmanuel/unibus/pkg/room"
|
|
)
|
|
|
|
func main() {
|
|
var (
|
|
caPath = flag.String("ca", "", "bus CA cert pinning TLS on both planes (required for a secured cluster)")
|
|
idFile = flag.String("identity-file", "", "path to the client identity JSON (e.g. `pass show unibus/operator-identity` written 0600) (required)")
|
|
natsSeeds = flag.String("nats-seeds", "", "comma-separated NATS urls of the cluster nodes (required)")
|
|
ctrlSeeds = flag.String("ctrl-seeds", "", "comma-separated control-plane https urls of the cluster nodes (required)")
|
|
subject = flag.String("subject", "test.gapcheck", "test room subject PREFIX; a random token is appended so runs never collide with real rooms")
|
|
messages = flag.Int("messages", 5, "golden mode: number of messages to publish and expect back")
|
|
mode = flag.String("mode", "golden", "golden (publish N, verify N decrypted) | loop (publish a counter for --duration, for failover testing)")
|
|
duration = flag.Duration("duration", 30*time.Second, "loop mode: how long to keep publishing")
|
|
interval = flag.Duration("interval", 1*time.Second, "loop mode: delay between published messages")
|
|
)
|
|
flag.Parse()
|
|
|
|
if *idFile == "" || *natsSeeds == "" || *ctrlSeeds == "" {
|
|
log.Fatalf("clientcheck: --identity-file, --nats-seeds and --ctrl-seeds are required")
|
|
}
|
|
|
|
id, err := client.LoadIdentity(*idFile)
|
|
if err != nil {
|
|
log.Fatalf("clientcheck: load identity: %v", err)
|
|
}
|
|
natsList := splitCSV(*natsSeeds)
|
|
ctrlList := splitCSV(*ctrlSeeds)
|
|
if len(natsList) == 0 || len(ctrlList) == 0 {
|
|
log.Fatalf("clientcheck: empty --nats-seeds or --ctrl-seeds")
|
|
}
|
|
|
|
// Build the secure client options: nkey on the data plane, TLS pinned to the
|
|
// bus CA on both planes, and the FULL seed lists so nats.go fails over to a
|
|
// surviving node when the attached one dies (the failover this tool verifies).
|
|
opts := client.Options{
|
|
NatsServers: natsList[1:],
|
|
CtrlURLs: ctrlList[1:],
|
|
}
|
|
if *caPath != "" {
|
|
tlsCfg, err := busauth.LoadCATLSConfig(*caPath)
|
|
if err != nil {
|
|
log.Fatalf("clientcheck: load CA: %v", err)
|
|
}
|
|
opts.UseNkey = true
|
|
opts.TLS = tlsCfg
|
|
opts.CtrlTLS = tlsCfg
|
|
for _, u := range ctrlList {
|
|
if !strings.HasPrefix(u, "https://") {
|
|
log.Fatalf("clientcheck: control URL %q must be https:// when --ca is set", u)
|
|
}
|
|
}
|
|
}
|
|
|
|
c, err := client.NewWithOptions(natsList[0], ctrlList[0], id, opts)
|
|
if err != nil {
|
|
log.Fatalf("clientcheck: connect: %v", err)
|
|
}
|
|
defer c.Close()
|
|
log.Printf("connected: endpoint=%s nats=%s", c.Endpoint().ID, c.ConnectedServer())
|
|
|
|
// Create an EPHEMERAL E2E room (encrypted + signed, NOT persisted): the test
|
|
// stays end-to-end encrypted (the cluster requires encryption on a public
|
|
// bind) while leaving no durable JetStream stream behind. The random subject
|
|
// token guarantees the room is unique and never a real room.
|
|
rnd := make([]byte, 8)
|
|
if _, err := rand.Read(rnd); err != nil {
|
|
log.Fatalf("clientcheck: random: %v", err)
|
|
}
|
|
subj := fmt.Sprintf("%s.%s", *subject, hex.EncodeToString(rnd))
|
|
policy := room.Policy{Encrypt: true, Persist: false, SignMsgs: true}
|
|
roomID, err := c.CreateRoom(subj, policy)
|
|
if err != nil {
|
|
log.Fatalf("clientcheck: create room: %v", err)
|
|
}
|
|
log.Printf("created E2E room: id=%s subject=%s (encrypt=%v sign=%v persist=%v)", roomID, subj, policy.Encrypt, policy.SignMsgs, policy.Persist)
|
|
|
|
// Under the per-subject ACL, NATS freezes permissions at connect time, so the
|
|
// just-created room's subject is not yet publishable/subscribable on the live
|
|
// connection. RefreshSession reconnects so the authenticator re-derives the
|
|
// ACL (now including this room) — the post-0006 contract every client follows
|
|
// after a membership change.
|
|
if err := c.RefreshSession(); err != nil {
|
|
log.Fatalf("clientcheck: refresh session: %v", err)
|
|
}
|
|
|
|
switch *mode {
|
|
case "golden":
|
|
runGolden(c, roomID, *messages)
|
|
case "loop":
|
|
runLoop(c, roomID, *duration, *interval)
|
|
default:
|
|
log.Fatalf("clientcheck: --mode must be golden or loop, got %q", *mode)
|
|
}
|
|
}
|
|
|
|
// runGolden subscribes, publishes n messages, and asserts all n come back
|
|
// decrypted. Exits non-zero if any are missing.
|
|
func runGolden(c *client.Client, roomID string, n int) {
|
|
var mu sync.Mutex
|
|
got := map[string]bool{}
|
|
sub, err := c.Subscribe(roomID, func(_ frame.Frame, plaintext []byte) {
|
|
mu.Lock()
|
|
got[string(plaintext)] = true
|
|
mu.Unlock()
|
|
})
|
|
if err != nil {
|
|
log.Fatalf("clientcheck: subscribe: %v", err)
|
|
}
|
|
defer sub.Unsubscribe()
|
|
time.Sleep(300 * time.Millisecond) // let the subscription settle
|
|
|
|
want := make([]string, n)
|
|
for i := 0; i < n; i++ {
|
|
msg := fmt.Sprintf("gapcheck-e2e-%d", i)
|
|
want[i] = msg
|
|
if err := c.Publish(roomID, []byte(msg)); err != nil {
|
|
log.Fatalf("clientcheck: publish %d: %v", i, err)
|
|
}
|
|
}
|
|
log.Printf("published %d messages to %s; waiting for decrypted echoes...", n, roomID)
|
|
|
|
deadline := time.Now().Add(15 * time.Second)
|
|
for time.Now().Before(deadline) {
|
|
mu.Lock()
|
|
have := len(got)
|
|
mu.Unlock()
|
|
if have >= n {
|
|
break
|
|
}
|
|
time.Sleep(100 * time.Millisecond)
|
|
}
|
|
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
missing := 0
|
|
for _, w := range want {
|
|
if !got[w] {
|
|
missing++
|
|
log.Printf(" MISSING: %q", w)
|
|
}
|
|
}
|
|
log.Printf("connected node at finish: %s", c.ConnectedServer())
|
|
if missing > 0 {
|
|
log.Fatalf("GOLDEN FAIL: %d/%d messages not received decrypted", missing, n)
|
|
}
|
|
log.Printf("GOLDEN OK: all %d messages received and decrypted end-to-end", n)
|
|
}
|
|
|
|
// runLoop publishes a numbered message every interval for the duration and logs
|
|
// the count received plus the node currently attached, so an operator stopping a
|
|
// cluster node mid-run sees the client fail over to a survivor and keep receiving
|
|
// (quorum 2/3). It is the live failover-with-a-connected-client test the 0011
|
|
// chaos run never performed.
|
|
func runLoop(c *client.Client, roomID string, duration, interval time.Duration) {
|
|
var mu sync.Mutex
|
|
received := 0
|
|
servers := map[string]int{} // node -> #ticks observed attached
|
|
sub, err := c.Subscribe(roomID, func(_ frame.Frame, _ []byte) {
|
|
mu.Lock()
|
|
received++
|
|
mu.Unlock()
|
|
})
|
|
if err != nil {
|
|
log.Fatalf("clientcheck: subscribe: %v", err)
|
|
}
|
|
defer sub.Unsubscribe()
|
|
time.Sleep(300 * time.Millisecond)
|
|
|
|
log.Printf("loop: publishing every %s for %s — stop a node now to test failover", interval, duration)
|
|
end := time.Now().Add(duration)
|
|
sent := 0
|
|
for time.Now().Before(end) {
|
|
msg := fmt.Sprintf("gapcheck-loop-%d", sent)
|
|
err := c.Publish(roomID, []byte(msg))
|
|
sent++
|
|
mu.Lock()
|
|
recv := received
|
|
mu.Unlock()
|
|
node := c.ConnectedServer()
|
|
up := c.IsConnected()
|
|
if node != "" {
|
|
mu.Lock()
|
|
servers[node]++
|
|
mu.Unlock()
|
|
}
|
|
pubStatus := "ok"
|
|
if err != nil {
|
|
pubStatus = "ERR:" + err.Error()
|
|
}
|
|
log.Printf(" t=%2ds sent=%d recv=%d up=%v node=%s publish=%s",
|
|
sent, sent, recv, up, node, pubStatus)
|
|
time.Sleep(interval)
|
|
}
|
|
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
log.Printf("loop done: sent=%d received=%d", sent, received)
|
|
nodes := make([]string, 0, len(servers))
|
|
for n := range servers {
|
|
nodes = append(nodes, n)
|
|
}
|
|
sort.Strings(nodes)
|
|
for _, n := range nodes {
|
|
log.Printf(" attached to %s for %d ticks", n, servers[n])
|
|
}
|
|
if len(servers) > 1 {
|
|
log.Printf("FAILOVER OBSERVED: client was attached to %d distinct nodes across the run", len(servers))
|
|
}
|
|
if received == 0 {
|
|
log.Fatalf("LOOP FAIL: received 0 messages")
|
|
}
|
|
log.Printf("LOOP OK: client kept receiving across the run (received=%d)", received)
|
|
}
|
|
|
|
// splitCSV breaks a comma-separated string into its entries, trimming the
// whitespace around each one and dropping entries that are empty after
// trimming. It returns a nil slice when nothing remains.
func splitCSV(s string) []string {
	var fields []string
	for _, raw := range strings.Split(s, ",") {
		item := strings.TrimSpace(raw)
		if item == "" {
			continue
		}
		fields = append(fields, item)
	}
	return fields
}
|