Compare commits
110 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e1a7402ff1 | |||
| ce72131ddf | |||
| 3aa5a2c9a9 | |||
| 02c2004ebd | |||
| ff580ac031 | |||
| 9fbff79df4 | |||
| 33746d9962 | |||
| caf005f04b | |||
| 9787c218ac | |||
| 926b8e96af | |||
| ae39e35fb4 | |||
| 48a3d6be33 | |||
| 24ff45ca7e | |||
| b8201a82cd | |||
| 3a33656cac | |||
| 2f5b372a80 | |||
| 32bec75665 | |||
| 9b96537aa6 | |||
| 18ee7c469b | |||
| e9ad719424 | |||
| d1e1a478f8 | |||
| cacf608fde | |||
| a9c245d468 | |||
| 8b6a01d280 | |||
| 5df99fa4c4 | |||
| df3b62a601 | |||
| 6976537842 | |||
| a4bbe8209b | |||
| 87ef52cc80 | |||
| a2ec78c81d | |||
| d01da9d396 | |||
| db8618ddc3 | |||
| e7d59fd01d | |||
| 0f79708338 | |||
| ef3af6dfd1 | |||
| 88b47912bd | |||
| a3ac58fb70 | |||
| fb0291ad8a | |||
| d821bc1794 | |||
| da420513b6 | |||
| 96abb75a2e | |||
| 37c778ca9a | |||
| c6ad63059f | |||
| 649dc9e244 | |||
| d6e668b984 | |||
| 94e7ced1ef | |||
| 9013ea5e33 | |||
| b8c9b2b652 | |||
| 6b3ace1d39 | |||
| 3230b31ade | |||
| c90f145a05 | |||
| 618f6b61da | |||
| d483c90356 | |||
| 1bcca987a4 | |||
| 0aa2caae43 | |||
| 957b728160 | |||
| 07f4af817e | |||
| 0d56c3c81d | |||
| fb6c796059 | |||
| e502b16675 | |||
| 47ff74d837 | |||
| b81e5f26f1 | |||
| d742f91881 | |||
| 30577145ce | |||
| 01e2ee1aa0 | |||
| e7bdcc978c | |||
| 60d6a86655 | |||
| bcd02716d5 | |||
| 484a07d6fd | |||
| 04e27518af | |||
| 6b0916f1fa | |||
| 87dbc421cd | |||
| b647779521 | |||
| 74c8d4f941 | |||
| 2ccd11b68c | |||
| 75939a192c | |||
| 1b56f14c20 | |||
| 2786ae2dde | |||
| 6d3d6d2562 | |||
| 217daae472 | |||
| 00058ea0af | |||
| 1630f6f163 | |||
| b09bafe242 | |||
| 413dd61041 | |||
| 89e0d0e64a | |||
| 2130eaa44d | |||
| 567e604fc7 | |||
| 0f8a38d62b | |||
| e0ef3a27cc | |||
| 3e39e23fe0 | |||
| e9711bf74b | |||
| 822982b71b | |||
| ddc6cabc24 | |||
| 0d7ab22d4a | |||
| c5387028e0 | |||
| 7de05c8591 | |||
| 9a915839c8 | |||
| 0dde60a05e | |||
| 7fab473bc3 | |||
| 92d4e4cb97 | |||
| ab4b099ab1 | |||
| a11d67cf70 | |||
| d33ca6278a | |||
| 915f926136 | |||
| 12fc77f25a | |||
| 69079d17d5 | |||
| 22092834bd | |||
| b2e6712dd2 | |||
| f6b53620e9 | |||
| 01f8988cc3 |
@@ -2,7 +2,7 @@
|
||||
name: unibus
|
||||
lang: go
|
||||
domain: infra
|
||||
version: 0.2.0
|
||||
version: 0.9.0
|
||||
description: "Bus de mensajería unificado sobre NATS+JetStream con cifrado E2E por room (megolm/olm reducido): service de membresía/claves, librería cliente y peers demo."
|
||||
tags: [service, messaging, nats, e2e]
|
||||
uses_functions:
|
||||
@@ -22,13 +22,13 @@ service:
|
||||
port: 8470
|
||||
health_endpoint: /healthz
|
||||
health_timeout_s: 3
|
||||
systemd_unit: null
|
||||
systemd_scope: null
|
||||
restart_policy: none
|
||||
runtime: manual
|
||||
systemd_unit: unibus-membershipd.service
|
||||
systemd_scope: user
|
||||
restart_policy: always
|
||||
runtime: systemd-user
|
||||
pc_targets:
|
||||
- lucas-linux
|
||||
is_local_only: true
|
||||
is_local_only: false
|
||||
e2e_checks:
|
||||
- id: build
|
||||
cmd: "CGO_ENABLED=0 go build ./..."
|
||||
@@ -154,6 +154,125 @@ agent.<nombre>.{in,out} inbox/outbox de agente LLM (agent.scout.in)
|
||||
|
||||
## Capability growth log
|
||||
|
||||
- v0.9.0 (2026-06-07) — cierre de los gaps que el despliegue del cluster (report
|
||||
0011) dejó abiertos (report 0012). (GAP A) Nueva capability `membershipd user
|
||||
add|list|revoke --store kv`: alta/baja de usuarios contra el KV replicado del
|
||||
cluster EN MARCHA, sin el procedimiento de parar-sembrar-rearrancar. Usa la
|
||||
conexión interna privilegiada — el daemon persiste su identidad de servicio con
|
||||
`--internal-id-file` (cada nodo genera/carga la suya, 0600 junto a las claves TLS)
|
||||
y la CLI, ejecutada por loopback en un nodo, presenta esa nkey que el
|
||||
autenticador reconoce con permisos plenos de JetStream; ninguna identidad de
|
||||
usuario normal puede tocar los buckets `KV_UNIBUS_*` bajo la ACL por-subject. El
|
||||
alta es idempotente (re-alta de la misma clave = `ErrUserExists` explícito, sin
|
||||
sobrescribir ni elevar rol), commitea con quórum 2/3 (HA, imprime
|
||||
`followers_current`) y rechaza un destino remoto sin `--ca` (igual que
|
||||
`migrate-to-kv`). (GAP B) Nuevo `cmd/clientcheck`: verificación end-to-end real
|
||||
con un cliente autenticado (identidad operator, nkey+TLS+https) que crea una room
|
||||
E2E, publica y recibe descifrado contra el cluster vivo, incluido un nodo parado a
|
||||
media transmisión donde el cliente hace failover a un superviviente y sigue
|
||||
recibiendo con cero pérdida (quórum 2/3) — el plano de datos que el chaos test del
|
||||
0011 nunca probó. (GAP C) Runbook `deploy/cluster/README.md` corregido: el orden
|
||||
de arranque "magnus solo y verifica healthz" deadlockeaba (un nodo solo no tiene
|
||||
quórum del meta-group y nunca sirve healthz); se documenta el arranque por quórum,
|
||||
que R1 es un SPOF inservible (ir directo a R3) y la nueva vía de alta con el
|
||||
cluster vivo. La plantilla de deploy (unit + `deploy-cluster.sh`) emite ya
|
||||
`INTERNAL_ID_FILE` y el flag. Verificado contra los 3 VPS reales (magnus + homer +
|
||||
datardos); posture enforce+ACL+TLS+R3 intacta.
|
||||
- v0.8.0 (2026-06-07) — completar y endurecer el cluster (issue 0006, fases
|
||||
0006a–0006g) que cierra los bloqueantes de la auditoría dedicada del cluster
|
||||
(report 0008) y cablea el control plane descentralizado que 0003 dejó a medias.
|
||||
(0006a) Se cablea el nonce replicado en el binario: un nodo con `--cluster-name`
|
||||
usa el bucket JetStream KV compartido obligatoriamente (fail-fast si no se crea),
|
||||
cerrando el replay cross-node (N3); el "ciclo bootstrap" se resuelve con una
|
||||
identidad interna efímera que el authenticator reconoce (full perms) y una
|
||||
conexión in-process privilegiada. (0006b) Se cierra la fuga del control plane
|
||||
por `$JS.API.>` (N2): la ACL pasa a un allow-set cerrado por-room (JS API solo de
|
||||
los streams `UNIBUS_<room>` del peer), dejando `KV_UNIBUS_*`/`OBJ_*` fuera del
|
||||
set y, por tanto, denegados. (0006c) Se cablea el store KV descentralizado
|
||||
(`--store kv|sqlite`, default sqlite = baseline idéntico) con un `storeHolder`
|
||||
fail-closed que rompe el ciclo bootstrap del authenticator. (0006d) Posture
|
||||
homogénea: un nodo rechaza unirse al cluster sin `enforce`, y `/healthz` publica
|
||||
la posture (N1). (0006e) Todos los clientes llaman `RefreshSession` tras cambios
|
||||
de membresía (N4), de modo que la ACL es usable bajo enforce sin desactivarla.
|
||||
(0006f) Bajos: secreto de cluster fuera de argv (`--cluster-pass-file`/env +
|
||||
inyección en routes), `migrate-to-kv` rechaza target remoto sin `--ca`, y docs
|
||||
de CA separada para routes + R1 SPOF vs R3 HA. (0006g) Material de deploy del
|
||||
cluster de 3 nodos (magnus+homer+datardos) en `deploy/cluster/` (certs, unit,
|
||||
script de despliegue dry-run, runbook) — sin tocar ningún VPS. Toda la
|
||||
regresión de auditorías previas + los ataques 0008 siguen verdes; govulncheck 0
|
||||
alcanzables. Branch-by-abstraction: con `--store sqlite` el single-node sigue
|
||||
idéntico y desplegable en todo momento.
|
||||
- v0.7.0 (2026-06-07) — hardening de seguridad 2 (issue 0005, fases 0005a–0005e)
|
||||
que cierra los hallazgos nuevos de la re-auditoría red-team (report 0006) y
|
||||
lleva el veredicto de exposición pública a "sí-con-condiciones". (0005a) Bump de
|
||||
`github.com/nats-io/nats-server/v2` v2.10.22→v2.11.15 y de la toolchain a
|
||||
go1.26.4: `govulncheck ./...` pasa de 16 vulnerabilidades alcanzables (14 del
|
||||
servidor NATS embebido + 2 de la stdlib) a 0. (0005b) `client.processFrame`
|
||||
ahora descarta cualquier frame sin firma en una room `SignMsgs` (antes verificaba
|
||||
solo si la firma venía presente, lo que permitía suplantar `Sender` con
|
||||
`Sig==nil`). (0005c) Nuevo limiter global de bytes en vuelo
|
||||
(`pkg/membership.inflightLimiter`) que acota la memoria agregada que el control
|
||||
plane bufferiza bajo concurrencia (el límite por-request y el rate-limit por-IP
|
||||
no acotaban el total): un flood concurrente multi-IP se descarta con 503 en vez
|
||||
de crecer sin techo (el RSS deja de escalar con N). (0005d) El guard de arranque
|
||||
`validateBootConfig` ahora exige `--tls-cert/--tls-key` en bind no-loopback (un
|
||||
control plane público sin TLS servía metadata en claro). (0005e) Se cablea por
|
||||
fin en `membershipd` la ACL por subject que ya existía huérfana desde 0003e
|
||||
(`busauth.NewNkeyAuthenticatorACL` + nuevo adaptador `busauth.PermissionsFromSubjects`
|
||||
sobre `membership.SubjectACLFor`): un registrado no-miembro ya no puede
|
||||
`Subscribe(">")` y captar los subjects/advisories de rooms ajenas. Residuales
|
||||
documentados: `$JS.API.>` sigue compartido (cierre completo = NATS accounts por
|
||||
identidad, diferido) y los clientes deben `RefreshSession` tras cambios de
|
||||
membresía (chat/worker aún no lo hacen). El comportamiento de un solo nodo no
|
||||
cambia y master sigue verde.
|
||||
- v0.6.0 (2026-06-07) — descentralización / alta disponibilidad (issue 0003,
|
||||
fases 0003a–0003e), report 0006. El servidor NATS embebido gana soporte de
|
||||
cluster con routes autenticadas (secreto de cluster) y TLS mutuo de nodo
|
||||
(`pkg/embeddednats.ClusterConfig` + `busauth.RouteTLSConfig`, reusando la CA
|
||||
del 0001). El control plane (`pkg/membership.Store`) pasa a interfaz por
|
||||
branch-by-abstraction: `sqliteStore` (default) + `jetstreamStore` nuevo sobre
|
||||
JetStream KV replicado (réplicas configurables R1→R3), con `IsAuthorized`
|
||||
fail-closed ante pérdida de quorum. `membershipd migrate-to-kv` mueve el
|
||||
estado SQLite→KV de forma idempotente con backup previo. Los blobs
|
||||
(`pkg/blobstore.Store`, ahora interfaz) ganan un backend NATS Object Store
|
||||
replicado además del disco. El cliente acepta listas de seeds NATS y de
|
||||
control planes con failover/reconnect nativo, el anti-replay pasa a un store
|
||||
de nonces compartido en KV con TTL (cierra el agujero de replay multi-nodo), y
|
||||
se implementa la ACL por subject derivada de pertenencia (audit H4 residual:
|
||||
`busauth.NewNkeyAuthenticatorACL` + `membership.SubjectACLFor` +
|
||||
`client.RefreshSession`). Todo viaja detrás del flag `decentralized` (off):
|
||||
el comportamiento de un solo nodo (SQLite + disco) no cambia y master sigue
|
||||
verde. El despliegue multi-nodo real (0003f) lo ejecuta el humano.
|
||||
- v0.5.0 (2026-06-07) — hardening de seguridad (issue 0004) que cierra los
|
||||
hallazgos de la auditoría red-team (report 0004) y lleva el veredicto de
|
||||
exposición pública de "NO" a "sí-con-condiciones". Anti-DoS pre-auth
|
||||
(`http.MaxBytesReader` por ruta + rechazo por `Content-Length` + rate-limit
|
||||
por IP + `MaxHeaderBytes`); guard de fail-open que prohíbe arrancar con bind
|
||||
público o TLS sin `--bus-auth enforce`; autorización por pertenencia en los GET
|
||||
de room (metadata y clave sellada solo para miembros / el propio endpoint);
|
||||
rooms cleartext deshabilitadas en bind público (contenido siempre E2E, mínimo
|
||||
defensivo del data plane mientras la ACL por subject llega con 0003); TLS en el
|
||||
control plane HTTP con la CA propia y cliente que exige `https` cuando hay CA;
|
||||
y los medios H6/H7/H12 (owner ligado al firmante, `IsAuthorized` antes del
|
||||
nonce-cache con poda O(expired) + cap, errores genéricos al cliente). Cada
|
||||
hallazgo lleva su test adversarial `TestAudit_*` portado como regresión.
|
||||
- v0.4.0 (2026-06-07) — descubrimiento de rooms: `GET /members/{endpoint}/rooms`
|
||||
lista las rooms de un endpoint con su metadata y rol, y `client.ListMyRooms()`
|
||||
lo consume. El control plane es pull (no hay push de invitaciones), así que un
|
||||
peer recién invitado a una room cifrada la descubre por polling y luego hace
|
||||
`Join` + `Subscribe`. Pieza base para que los bots de `agents_and_robots`
|
||||
hablen por el bus en vez de Matrix (modelo "todo son rooms", E2E).
|
||||
- v0.3.0 (2026-06-06) — `membershipd` se convierte en service de verdad: flag
|
||||
`--bind` (default 127.0.0.1) que gobierna a la vez el HTTP de control y el NATS
|
||||
embebido (`embeddednats.StartHost`), de modo que con `--bind 0.0.0.0` un
|
||||
teléfono o PC de la LAN conecta a ambos planos. Se añade un systemd-user unit
|
||||
(`deploy/unibus-membershipd.service`, `Restart=always`) + `deploy/install.sh`
|
||||
idempotente, y el bloque `service:` queda completo (systemd-user, restart
|
||||
always, health `/healthz`). El `Frame` (pkg/frame) gana threading aditivo
|
||||
(`ThreadID`, `ReplyTo`) y un tipo `REACT`, con `PublishReply`/`React` en el
|
||||
cliente — la base para que bots de chat hablen por el bus (fase 2). Cambios
|
||||
100% aditivos: el wire de los frames no-threaded es idéntico y los tests
|
||||
existentes siguen verdes.
|
||||
- v0.2.0 (2026-06-03) — el playground gana un benchmark de rendimiento
|
||||
(`GET /api/bench`, SSE): un publisher inunda una room con miles de mensajes a
|
||||
N subscribers y una gráfica en vivo anima el throughput. Expone las dos
|
||||
|
||||
+28
-12
@@ -27,11 +27,12 @@ import (
|
||||
|
||||
func main() {
|
||||
var (
|
||||
natsURL = flag.String("nats-url", "nats://127.0.0.1:4250", "NATS url")
|
||||
ctrlURL = flag.String("ctrl-url", "http://127.0.0.1:8470", "membershipd control-plane url")
|
||||
roomSub = flag.String("room", "proc.test.ticks", "room subject to subscribe to")
|
||||
idFile = flag.String("id-file", "./local_files/chat.id", "identity file path")
|
||||
demoEnc = flag.Bool("demo-encrypted", false, "run the encrypted forward-secrecy demo")
|
||||
natsURL = flag.String("nats-url", "nats://127.0.0.1:4250", "NATS url")
|
||||
ctrlURL = flag.String("ctrl-url", "http://127.0.0.1:8470", "membershipd control-plane url")
|
||||
roomSub = flag.String("room", "proc.test.ticks", "room subject to subscribe to")
|
||||
idFile = flag.String("id-file", "./local_files/chat.id", "identity file path")
|
||||
demoEnc = flag.Bool("demo-encrypted", false, "run the encrypted forward-secrecy demo")
|
||||
caFile = flag.String("ca", "", "path to the bus CA cert (ca.crt); set to connect with TLS + nkey to a secured bus")
|
||||
)
|
||||
flag.Parse()
|
||||
|
||||
@@ -39,19 +40,19 @@ func main() {
|
||||
log.SetPrefix("[chat] ")
|
||||
|
||||
if *demoEnc {
|
||||
runEncryptedDemo(*natsURL, *ctrlURL)
|
||||
runEncryptedDemo(*natsURL, *ctrlURL, *caFile)
|
||||
return
|
||||
}
|
||||
runSimple(*natsURL, *ctrlURL, *roomSub, *idFile)
|
||||
runSimple(*natsURL, *ctrlURL, *roomSub, *idFile, *caFile)
|
||||
}
|
||||
|
||||
// runSimple subscribes to a cleartext subject and prints messages live.
|
||||
func runSimple(natsURL, ctrlURL, roomSub, idFile string) {
|
||||
func runSimple(natsURL, ctrlURL, roomSub, idFile, caFile string) {
|
||||
id, err := client.LoadOrCreateIdentity(idFile)
|
||||
if err != nil {
|
||||
log.Fatalf("identity: %v", err)
|
||||
}
|
||||
c, err := client.New(natsURL, ctrlURL, id)
|
||||
c, err := client.Connect(natsURL, ctrlURL, id, caFile)
|
||||
if err != nil {
|
||||
log.Fatalf("connect: %v", err)
|
||||
}
|
||||
@@ -68,6 +69,12 @@ func runSimple(natsURL, ctrlURL, roomSub, idFile string) {
|
||||
if err := c.Join(roomID); err != nil {
|
||||
log.Fatalf("join: %v", err)
|
||||
}
|
||||
// Membership-change contract (issue 0006e): refresh so the just-created room's
|
||||
// subject is subscribable under enforce+ACL (permissions are frozen at connect
|
||||
// time). Must run BEFORE Subscribe — RefreshSession drops active subscriptions.
|
||||
if err := c.RefreshSession(); err != nil {
|
||||
log.Fatalf("refresh session after create room: %v", err)
|
||||
}
|
||||
sub, err := c.Subscribe(roomID, func(f frame.Frame, plaintext []byte) {
|
||||
fmt.Printf("[%s] %s: %s\n", f.Subject, shortID(f.Sender), string(plaintext))
|
||||
})
|
||||
@@ -91,7 +98,7 @@ func shortID(id string) string {
|
||||
}
|
||||
|
||||
// runEncryptedDemo proves E2E encryption + forward secrecy end-to-end.
|
||||
func runEncryptedDemo(natsURL, ctrlURL string) {
|
||||
func runEncryptedDemo(natsURL, ctrlURL, caFile string) {
|
||||
log.Printf("=== encrypted forward-secrecy demo ===")
|
||||
pass := true
|
||||
check := func(name string, ok bool) {
|
||||
@@ -109,10 +116,10 @@ func runEncryptedDemo(natsURL, ctrlURL string) {
|
||||
idB, err := newEphemeralIdentity()
|
||||
must(err, "generate B identity")
|
||||
|
||||
a, err := client.New(natsURL, ctrlURL, idA)
|
||||
a, err := client.Connect(natsURL, ctrlURL, idA, caFile)
|
||||
must(err, "connect A")
|
||||
defer a.Close()
|
||||
b, err := client.New(natsURL, ctrlURL, idB)
|
||||
b, err := client.Connect(natsURL, ctrlURL, idB, caFile)
|
||||
must(err, "connect B")
|
||||
defer b.Close()
|
||||
|
||||
@@ -121,12 +128,21 @@ func runEncryptedDemo(natsURL, ctrlURL string) {
|
||||
must(err, "A create room")
|
||||
fmt.Printf(" room.test -> %s (E2E, persisted, signed)\n", roomID)
|
||||
|
||||
// Membership-change contract (issue 0006e): A only became a member of this room
|
||||
// after connecting, so refresh to gain its subject + per-room JetStream API
|
||||
// under enforce+ACL before publishing.
|
||||
must(a.RefreshSession(), "A refresh after create room")
|
||||
|
||||
// A invites B (seals K to B's X25519 key).
|
||||
must(a.Invite(roomID, b.Endpoint()), "A invite B")
|
||||
|
||||
// B joins (fetches + decrypts K).
|
||||
must(b.Join(roomID), "B join")
|
||||
|
||||
// B became a member via the invite above; refresh so B can subscribe to the
|
||||
// room's subject under enforce+ACL (before subscribing — refresh drops subs).
|
||||
must(b.RefreshSession(), "B refresh after join")
|
||||
|
||||
// B subscribes; capture received plaintexts.
|
||||
recv := make(chan string, 4)
|
||||
subB, err := b.Subscribe(roomID, func(f frame.Frame, plaintext []byte) {
|
||||
|
||||
@@ -0,0 +1,260 @@
|
||||
// Command clientcheck is an end-to-end verification client for a live unibus
|
||||
// cluster (issue 0011 GAP B). The 0011 chaos test validated only the control
|
||||
// plane (healthz + meta/stream-leader failover + KV readable with 2/3); it never
|
||||
// connected an authenticated bus client (nkey + TLS) to create a room and
|
||||
// publish/subscribe through it, least of all across a node loss. clientcheck does
|
||||
// exactly that with a real identity (the operator), so the data-plane end-to-end
|
||||
// path — connect, create an E2E room, publish, receive decrypted — is exercised
|
||||
// against the running cluster, including while a node is stopped.
|
||||
//
|
||||
// It is a reusable tool, not a throwaway script: point it at the cluster's CA,
|
||||
// an identity file, and the NATS + control-plane seed lists.
|
||||
//
|
||||
// # golden: connect, create an E2E room, publish N, confirm N decrypted back
|
||||
// clientcheck --ca ca.crt --identity-file operator.id \
|
||||
// --nats-seeds nats://A:4250,nats://B:4250,nats://C:4250 \
|
||||
// --ctrl-seeds https://A:8470,https://B:8470,https://C:8470 --messages 5
|
||||
//
|
||||
// # loop: publish a counter every interval for the duration, logging the node
|
||||
// # it is attached to — stop a node mid-run (systemctl stop membershipd-cluster)
|
||||
// # and watch it fail over to a survivor and keep receiving (quorum 2/3).
|
||||
// clientcheck ... --mode loop --duration 45s --interval 1s
|
||||
package main
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var (
|
||||
caPath = flag.String("ca", "", "bus CA cert pinning TLS on both planes (required for a secured cluster)")
|
||||
idFile = flag.String("identity-file", "", "path to the client identity JSON (e.g. `pass show unibus/operator-identity` written 0600) (required)")
|
||||
natsSeeds = flag.String("nats-seeds", "", "comma-separated NATS urls of the cluster nodes (required)")
|
||||
ctrlSeeds = flag.String("ctrl-seeds", "", "comma-separated control-plane https urls of the cluster nodes (required)")
|
||||
subject = flag.String("subject", "test.gapcheck", "test room subject PREFIX; a random token is appended so runs never collide with real rooms")
|
||||
messages = flag.Int("messages", 5, "golden mode: number of messages to publish and expect back")
|
||||
mode = flag.String("mode", "golden", "golden (publish N, verify N decrypted) | loop (publish a counter for --duration, for failover testing)")
|
||||
duration = flag.Duration("duration", 30*time.Second, "loop mode: how long to keep publishing")
|
||||
interval = flag.Duration("interval", 1*time.Second, "loop mode: delay between published messages")
|
||||
)
|
||||
flag.Parse()
|
||||
|
||||
if *idFile == "" || *natsSeeds == "" || *ctrlSeeds == "" {
|
||||
log.Fatalf("clientcheck: --identity-file, --nats-seeds and --ctrl-seeds are required")
|
||||
}
|
||||
|
||||
id, err := client.LoadIdentity(*idFile)
|
||||
if err != nil {
|
||||
log.Fatalf("clientcheck: load identity: %v", err)
|
||||
}
|
||||
natsList := splitCSV(*natsSeeds)
|
||||
ctrlList := splitCSV(*ctrlSeeds)
|
||||
if len(natsList) == 0 || len(ctrlList) == 0 {
|
||||
log.Fatalf("clientcheck: empty --nats-seeds or --ctrl-seeds")
|
||||
}
|
||||
|
||||
// Build the secure client options: nkey on the data plane, TLS pinned to the
|
||||
// bus CA on both planes, and the FULL seed lists so nats.go fails over to a
|
||||
// surviving node when the attached one dies (the failover this tool verifies).
|
||||
opts := client.Options{
|
||||
NatsServers: natsList[1:],
|
||||
CtrlURLs: ctrlList[1:],
|
||||
}
|
||||
if *caPath != "" {
|
||||
tlsCfg, err := busauth.LoadCATLSConfig(*caPath)
|
||||
if err != nil {
|
||||
log.Fatalf("clientcheck: load CA: %v", err)
|
||||
}
|
||||
opts.UseNkey = true
|
||||
opts.TLS = tlsCfg
|
||||
opts.CtrlTLS = tlsCfg
|
||||
for _, u := range ctrlList {
|
||||
if !strings.HasPrefix(u, "https://") {
|
||||
log.Fatalf("clientcheck: control URL %q must be https:// when --ca is set", u)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c, err := client.NewWithOptions(natsList[0], ctrlList[0], id, opts)
|
||||
if err != nil {
|
||||
log.Fatalf("clientcheck: connect: %v", err)
|
||||
}
|
||||
defer c.Close()
|
||||
log.Printf("connected: endpoint=%s nats=%s", c.Endpoint().ID, c.ConnectedServer())
|
||||
|
||||
// Create an EPHEMERAL E2E room (encrypted + signed, NOT persisted): the test
|
||||
// stays end-to-end encrypted (the cluster requires encryption on a public
|
||||
// bind) while leaving no durable JetStream stream behind. The random subject
|
||||
// token guarantees the room is unique and never a real room.
|
||||
rnd := make([]byte, 8)
|
||||
if _, err := rand.Read(rnd); err != nil {
|
||||
log.Fatalf("clientcheck: random: %v", err)
|
||||
}
|
||||
subj := fmt.Sprintf("%s.%s", *subject, hex.EncodeToString(rnd))
|
||||
policy := room.Policy{Encrypt: true, Persist: false, SignMsgs: true}
|
||||
roomID, err := c.CreateRoom(subj, policy)
|
||||
if err != nil {
|
||||
log.Fatalf("clientcheck: create room: %v", err)
|
||||
}
|
||||
log.Printf("created E2E room: id=%s subject=%s (encrypt=%v sign=%v persist=%v)", roomID, subj, policy.Encrypt, policy.SignMsgs, policy.Persist)
|
||||
|
||||
// Under the per-subject ACL, NATS freezes permissions at connect time, so the
|
||||
// just-created room's subject is not yet publishable/subscribable on the live
|
||||
// connection. RefreshSession reconnects so the authenticator re-derives the
|
||||
// ACL (now including this room) — the post-0006 contract every client follows
|
||||
// after a membership change.
|
||||
if err := c.RefreshSession(); err != nil {
|
||||
log.Fatalf("clientcheck: refresh session: %v", err)
|
||||
}
|
||||
|
||||
switch *mode {
|
||||
case "golden":
|
||||
runGolden(c, roomID, *messages)
|
||||
case "loop":
|
||||
runLoop(c, roomID, *duration, *interval)
|
||||
default:
|
||||
log.Fatalf("clientcheck: --mode must be golden or loop, got %q", *mode)
|
||||
}
|
||||
}
|
||||
|
||||
// runGolden subscribes, publishes n messages, and asserts all n come back
|
||||
// decrypted. Exits non-zero if any are missing.
|
||||
func runGolden(c *client.Client, roomID string, n int) {
|
||||
var mu sync.Mutex
|
||||
got := map[string]bool{}
|
||||
sub, err := c.Subscribe(roomID, func(_ frame.Frame, plaintext []byte) {
|
||||
mu.Lock()
|
||||
got[string(plaintext)] = true
|
||||
mu.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("clientcheck: subscribe: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(300 * time.Millisecond) // let the subscription settle
|
||||
|
||||
want := make([]string, n)
|
||||
for i := 0; i < n; i++ {
|
||||
msg := fmt.Sprintf("gapcheck-e2e-%d", i)
|
||||
want[i] = msg
|
||||
if err := c.Publish(roomID, []byte(msg)); err != nil {
|
||||
log.Fatalf("clientcheck: publish %d: %v", i, err)
|
||||
}
|
||||
}
|
||||
log.Printf("published %d messages to %s; waiting for decrypted echoes...", n, roomID)
|
||||
|
||||
deadline := time.Now().Add(15 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
have := len(got)
|
||||
mu.Unlock()
|
||||
if have >= n {
|
||||
break
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
missing := 0
|
||||
for _, w := range want {
|
||||
if !got[w] {
|
||||
missing++
|
||||
log.Printf(" MISSING: %q", w)
|
||||
}
|
||||
}
|
||||
log.Printf("connected node at finish: %s", c.ConnectedServer())
|
||||
if missing > 0 {
|
||||
log.Fatalf("GOLDEN FAIL: %d/%d messages not received decrypted", missing, n)
|
||||
}
|
||||
log.Printf("GOLDEN OK: all %d messages received and decrypted end-to-end", n)
|
||||
}
|
||||
|
||||
// runLoop publishes a numbered message every interval for the duration and logs
|
||||
// the count received plus the node currently attached, so an operator stopping a
|
||||
// cluster node mid-run sees the client fail over to a survivor and keep receiving
|
||||
// (quorum 2/3). It is the live failover-with-a-connected-client test the 0011
|
||||
// chaos run never performed.
|
||||
func runLoop(c *client.Client, roomID string, duration, interval time.Duration) {
|
||||
var mu sync.Mutex
|
||||
received := 0
|
||||
servers := map[string]int{} // node -> #ticks observed attached
|
||||
sub, err := c.Subscribe(roomID, func(_ frame.Frame, _ []byte) {
|
||||
mu.Lock()
|
||||
received++
|
||||
mu.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
log.Fatalf("clientcheck: subscribe: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(300 * time.Millisecond)
|
||||
|
||||
log.Printf("loop: publishing every %s for %s — stop a node now to test failover", interval, duration)
|
||||
end := time.Now().Add(duration)
|
||||
sent := 0
|
||||
for time.Now().Before(end) {
|
||||
msg := fmt.Sprintf("gapcheck-loop-%d", sent)
|
||||
err := c.Publish(roomID, []byte(msg))
|
||||
sent++
|
||||
mu.Lock()
|
||||
recv := received
|
||||
mu.Unlock()
|
||||
node := c.ConnectedServer()
|
||||
up := c.IsConnected()
|
||||
if node != "" {
|
||||
mu.Lock()
|
||||
servers[node]++
|
||||
mu.Unlock()
|
||||
}
|
||||
pubStatus := "ok"
|
||||
if err != nil {
|
||||
pubStatus = "ERR:" + err.Error()
|
||||
}
|
||||
log.Printf(" t=%2ds sent=%d recv=%d up=%v node=%s publish=%s",
|
||||
sent, sent, recv, up, node, pubStatus)
|
||||
time.Sleep(interval)
|
||||
}
|
||||
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
log.Printf("loop done: sent=%d received=%d", sent, received)
|
||||
nodes := make([]string, 0, len(servers))
|
||||
for n := range servers {
|
||||
nodes = append(nodes, n)
|
||||
}
|
||||
sort.Strings(nodes)
|
||||
for _, n := range nodes {
|
||||
log.Printf(" attached to %s for %d ticks", n, servers[n])
|
||||
}
|
||||
if len(servers) > 1 {
|
||||
log.Printf("FAILOVER OBSERVED: client was attached to %d distinct nodes across the run", len(servers))
|
||||
}
|
||||
if received == 0 {
|
||||
log.Fatalf("LOOP FAIL: received 0 messages")
|
||||
}
|
||||
log.Printf("LOOP OK: client kept receiving across the run (received=%d)", received)
|
||||
}
|
||||
|
||||
// splitCSV breaks s on commas, trims surrounding whitespace from each piece and
// drops pieces that are empty after trimming. It returns nil when nothing
// remains.
func splitCSV(s string) []string {
	var fields []string
	rest := s
	for {
		head, tail, more := strings.Cut(rest, ",")
		if trimmed := strings.TrimSpace(head); trimmed != "" {
			fields = append(fields, trimmed)
		}
		if !more {
			return fields
		}
		rest = tail
	}
}
|
||||
@@ -0,0 +1,221 @@
|
||||
package main
|
||||
|
||||
// Regression for audit report 0008, vector N3: the binary must wire the
|
||||
// replicated nonce store on a clustered node so a signed request accepted on one
|
||||
// node cannot be replayed to another. The auditor's ephemeral attack showed the
|
||||
// OLD binary never called UseReplicatedNonces (each node kept a per-process
|
||||
// cache), so a captured request replayed to a second node succeeded (200+200). These
|
||||
// tests drive the SAME helper the binary uses (wireReplicatedNonces) so they
|
||||
// prove the WIRING, not just the underlying API.
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"io"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// freePort asks the kernel for an unused TCP port on 127.0.0.1 by listening on
// port 0, and returns the port number that was assigned. The listener is closed
// before returning; the test fails if the listen call errors.
func freePort(t *testing.T) int {
	t.Helper()
	ln, listenErr := net.Listen("tcp", "127.0.0.1:0")
	if listenErr != nil {
		t.Fatalf("free port: %v", listenErr)
	}
	defer ln.Close()
	tcpAddr := ln.Addr().(*net.TCPAddr)
	return tcpAddr.Port
}
|
||||
|
||||
// signed008 builds a transport-signed control-plane request with a caller-chosen
|
||||
// ts+nonce, so a test can reuse the exact same signed bytes against two nodes to
|
||||
// exercise replay.
|
||||
func signed008(t *testing.T, baseURL, method, path string, body []byte, id cs.Identity, ts int64, nonce string) *http.Request {
|
||||
t.Helper()
|
||||
canonical := membership.CanonicalRequest(method, path, strconv.FormatInt(ts, 10), nonce, body)
|
||||
sig := cs.SignEd25519(id.SignPriv, canonical)
|
||||
var rdr io.Reader
|
||||
if body != nil {
|
||||
rdr = bytes.NewReader(body)
|
||||
}
|
||||
req, err := http.NewRequest(method, baseURL+path, rdr)
|
||||
if err != nil {
|
||||
t.Fatalf("new request: %v", err)
|
||||
}
|
||||
req.Header.Set("X-Unibus-Pub", hex.EncodeToString(id.SignPub))
|
||||
req.Header.Set("X-Unibus-Ts", strconv.FormatInt(ts, 10))
|
||||
req.Header.Set("X-Unibus-Nonce", nonce)
|
||||
req.Header.Set("X-Unibus-Sig", base64.StdEncoding.EncodeToString(sig))
|
||||
return req
|
||||
}
|
||||
|
||||
// randNonce returns 16 bytes from crypto/rand encoded as standard base64. The
// test fails if the random source reports an error.
func randNonce(t *testing.T) string {
	t.Helper()
	var seed [16]byte
	if _, readErr := rand.Read(seed[:]); readErr != nil {
		t.Fatalf("nonce: %v", readErr)
	}
	return base64.StdEncoding.EncodeToString(seed[:])
}
|
||||
|
||||
// TestAttack0008_N3 is the blocker regression: two clustered membershipd nodes
|
||||
// wired through wireReplicatedNonces share a JetStream KV nonce bucket, so a
|
||||
// request accepted on node A is rejected (401) when replayed to node B. Before
|
||||
// the fix the binary never wired this and the replay returned 200.
|
||||
func TestAttack0008_N3(t *testing.T) {
|
||||
// One NATS+JetStream backing the shared nonce bucket (no client auth needed:
|
||||
// the test drives the membership.Server's nonce store directly via HTTP).
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: freePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
t.Fatalf("connect: %v", err)
|
||||
}
|
||||
t.Cleanup(nc.Close)
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
|
||||
// Shared control-plane state (stand-in for the replicated store) + two nodes.
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
alice, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
if err := store.AddUser(hex.EncodeToString(alice.SignPub), "alice", membership.RoleAdmin); err != nil {
|
||||
t.Fatalf("add alice: %v", err)
|
||||
}
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
|
||||
// Each node is wired EXACTLY as the binary wires a clustered node.
|
||||
mkNode := func() *httptest.Server {
|
||||
srv := membership.NewServer(store, blobs, membership.AuthEnforce)
|
||||
if err := wireReplicatedNonces(srv, js, true /*clustered*/, 1); err != nil {
|
||||
t.Fatalf("wireReplicatedNonces: %v", err)
|
||||
}
|
||||
return httptest.NewServer(srv)
|
||||
}
|
||||
nodeA := mkNode()
|
||||
t.Cleanup(nodeA.Close)
|
||||
nodeB := mkNode()
|
||||
t.Cleanup(nodeB.Close)
|
||||
|
||||
ts := time.Now().Unix()
|
||||
nonce := randNonce(t)
|
||||
path := "/members/" + frame.EndpointID(alice.SignPub) + "/rooms"
|
||||
|
||||
// Golden: alice's signed request is accepted on node A.
|
||||
respA, err := http.DefaultClient.Do(signed008(t, nodeA.URL, "GET", path, nil, alice, ts, nonce))
|
||||
if err != nil {
|
||||
t.Fatalf("do A: %v", err)
|
||||
}
|
||||
respA.Body.Close()
|
||||
if respA.StatusCode != http.StatusOK {
|
||||
t.Fatalf("node A first use: status %d, want 200", respA.StatusCode)
|
||||
}
|
||||
|
||||
// Error path (the attack): replay the SAME signed bytes to node B → 401.
|
||||
respB, err := http.DefaultClient.Do(signed008(t, nodeB.URL, "GET", path, nil, alice, ts, nonce))
|
||||
if err != nil {
|
||||
t.Fatalf("do B: %v", err)
|
||||
}
|
||||
respB.Body.Close()
|
||||
if respB.StatusCode != http.StatusUnauthorized {
|
||||
t.Fatalf("cross-node replay to node B: status %d, want 401 (replayed nonce must be rejected)", respB.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAttack0008_N3_StandaloneKeepsLocalCache is the edge: a NON-clustered node
|
||||
// must NOT require JetStream — wireReplicatedNonces is a no-op and the node keeps
|
||||
// its in-memory cache, which still rejects a same-node replay (the single-node
|
||||
// guarantee is unchanged). This proves the fix does not add a JetStream
|
||||
// dependency to standalone deployments.
|
||||
func TestAttack0008_N3_StandaloneKeepsLocalCache(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
alice, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
if err := store.AddUser(hex.EncodeToString(alice.SignPub), "alice", membership.RoleAdmin); err != nil {
|
||||
t.Fatalf("add alice: %v", err)
|
||||
}
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
|
||||
srv := membership.NewServer(store, blobs, membership.AuthEnforce)
|
||||
// Standalone: clustered=false, js=nil. Must succeed (no JetStream needed).
|
||||
if err := wireReplicatedNonces(srv, nil, false /*clustered*/, 1); err != nil {
|
||||
t.Fatalf("standalone wireReplicatedNonces must be a no-op, got: %v", err)
|
||||
}
|
||||
node := httptest.NewServer(srv)
|
||||
t.Cleanup(node.Close)
|
||||
|
||||
ts := time.Now().Unix()
|
||||
nonce := randNonce(t)
|
||||
path := "/members/" + frame.EndpointID(alice.SignPub) + "/rooms"
|
||||
|
||||
resp1, err := http.DefaultClient.Do(signed008(t, node.URL, "GET", path, nil, alice, ts, nonce))
|
||||
if err != nil {
|
||||
t.Fatalf("do 1: %v", err)
|
||||
}
|
||||
resp1.Body.Close()
|
||||
if resp1.StatusCode != http.StatusOK {
|
||||
t.Fatalf("first use: status %d, want 200", resp1.StatusCode)
|
||||
}
|
||||
// Same-node replay is still rejected by the in-memory cache.
|
||||
resp2, err := http.DefaultClient.Do(signed008(t, node.URL, "GET", path, nil, alice, ts, nonce))
|
||||
if err != nil {
|
||||
t.Fatalf("do 2: %v", err)
|
||||
}
|
||||
resp2.Body.Close()
|
||||
if resp2.StatusCode != http.StatusUnauthorized {
|
||||
t.Fatalf("same-node replay: status %d, want 401", resp2.StatusCode)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAttack0008_N3_ClusteredRequiresJetStream proves the hard rule: a clustered
|
||||
// node with NO JetStream available refuses (error), so the binary fails fast
|
||||
// instead of silently running with a per-process cache.
|
||||
func TestAttack0008_N3_ClusteredRequiresJetStream(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
srv := membership.NewServer(store, blobs, membership.AuthEnforce)
|
||||
if err := wireReplicatedNonces(srv, nil, true /*clustered*/, 1); err == nil {
|
||||
t.Fatalf("clustered node with no JetStream must fail, got nil")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,198 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// splitRoutes turns the comma-separated --routes flag value into a slice of
// route URLs. Each entry is stripped of surrounding whitespace and entries that
// end up empty are skipped, so a trailing comma or a spaced list never produces
// an empty route. When no entry survives the result is a nil slice.
func splitRoutes(csv string) []string {
	var routes []string
	remaining := csv
	for {
		field, tail, more := strings.Cut(remaining, ",")
		if route := strings.TrimSpace(field); route != "" {
			routes = append(routes, route)
		}
		if !more {
			return routes
		}
		remaining = tail
	}
}
|
||||
|
||||
// resolveClusterPass resolves the cluster route secret WITHOUT leaking it through
// argv (audit 0008 N1-low: --cluster-pass in argv is visible in ps/journald).
//
// Precedence:
//  1. passFile (--cluster-pass-file): the file is read and surrounding
//     whitespace is trimmed. A file that cannot be read, or that is empty after
//     trimming, is an error: an operator who points at a secret file expects a
//     secret, and silently continuing with an empty one would hide the mistake.
//  2. env: the value of UNIBUS_CLUSTER_PASS, injected by the caller (the
//     os.Getenv result) so the function stays testable. Used verbatim.
//  3. passFlag: the legacy --cluster-pass flag (argv-visible, kept for
//     dev/compat). Used verbatim.
//
// It returns the secret and a short source label for logging ("file", "env",
// "flag", or "none" when nothing was supplied) — never log the secret itself.
func resolveClusterPass(passFlag, passFile, env string) (secret, source string, err error) {
	if passFile != "" {
		raw, rerr := os.ReadFile(passFile)
		if rerr != nil {
			return "", "", fmt.Errorf("read --cluster-pass-file %q: %w", passFile, rerr)
		}
		fromFile := strings.TrimSpace(string(raw))
		if fromFile == "" {
			// An explicitly configured but blank secret file is a configuration
			// bug; do not fall through to an empty secret labelled "file".
			return "", "", fmt.Errorf("--cluster-pass-file %q is empty", passFile)
		}
		return fromFile, "file", nil
	}
	switch {
	case env != "":
		return env, "env", nil
	case passFlag != "":
		return passFlag, "flag", nil
	}
	return "", "none", nil
}
|
||||
|
||||
// injectRouteCreds rewrites each route URL that carries NO userinfo to embed
// user:pass, so the cluster secret is supplied once (via file/env) instead of
// repeated in every --routes argv entry where ps/journald would expose it.
//
// Rules:
//   - With an empty user it is a no-op and routes is returned as given.
//   - A route that already carries userinfo is left with its own credentials
//     (operator override); it is still re-serialized through net/url.
//   - A route that does not parse, or that parses without a host, is an error
//     (configuration bug) rather than a silently dropped or unauthenticated
//     peer. The host check matters because net/url accepts scheme-less input
//     such as "localhost:4222" as an opaque URL with scheme "localhost"; userinfo
//     set on such a URL is discarded when it is turned back into a string, so
//     the credentials would vanish without any error.
//
// The password is escaped by url.UserPassword, so it may contain characters
// that are special in URLs.
func injectRouteCreds(routes []string, user, pass string) ([]string, error) {
	if user == "" {
		return routes, nil
	}
	out := make([]string, 0, len(routes))
	for _, r := range routes {
		u, err := url.Parse(r)
		if err != nil {
			return nil, fmt.Errorf("parse route %q: %w", r, err)
		}
		if u.Host == "" {
			return nil, fmt.Errorf("parse route %q: missing host (want scheme://host:port)", r)
		}
		if u.User == nil {
			u.User = url.UserPassword(user, pass)
		}
		out = append(out, u.String())
	}
	return out, nil
}
|
||||
|
||||
// isLoopbackURL reports whether a NATS url targets this host only (loopback).
// It guards migrate-to-kv (audit 0008 N6): pushing the allowlist to a REMOTE
// NATS without TLS would send handles/roles/sign-pubs in cleartext, so a remote
// target must be TLS-pinned (--ca).
//
// The answer is true only when the URL parses and its hostname is the exact
// string "localhost" or an IP literal for which net.IP.IsLoopback is true.
// Everything else — a parse error, an empty hostname, any other name — is
// treated as NON-loopback (conservative: it then requires --ca).
func isLoopbackURL(natsURL string) bool {
	parsed, parseErr := url.Parse(natsURL)
	if parseErr != nil {
		return false
	}
	hostname := parsed.Hostname()
	if hostname == "localhost" {
		return true
	}
	// An empty hostname is not an IP literal either, so it falls out as false.
	if addr := net.ParseIP(hostname); addr != nil {
		return addr.IsLoopback()
	}
	return false
}
|
||||
|
||||
// isLoopbackBind reports whether the --bind value keeps the service reachable
// only from this host. It is true for the exact string "localhost" and for an
// IP literal for which net.IP.IsLoopback is true. An empty bind means "all
// interfaces" (public), and any value that is not an IP literal is treated as
// public too — the conservative choice, so an unusual bind never silently slips
// past the guard.
func isLoopbackBind(bind string) bool {
	if bind == "localhost" {
		return true
	}
	// "" (every interface) and hostnames both fail to parse as an IP literal
	// and are therefore classified as public.
	addr := net.ParseIP(bind)
	return addr != nil && addr.IsLoopback()
}
|
||||
|
||||
// validateBootConfig is the fail-open guard (audit H2). It refuses any startup
|
||||
// configuration that would expose the bus without enforced authentication:
|
||||
//
|
||||
// - a non-loopback --bind without --bus-auth enforce (the data plane and
|
||||
// control plane would both accept anyone),
|
||||
// - --tls-cert/--tls-key without --bus-auth enforce (TLS encrypts the channel
|
||||
// but authenticates no one — encrypted access for everybody is still open), and
|
||||
// - a non-loopback --bind WITHOUT --tls-cert/--tls-key (the control plane would
|
||||
// serve metadata over plaintext HTTP publicly — audit H5 reappearing, the N4
|
||||
// gap the re-audit found: TLS was available but not mandatory).
|
||||
//
|
||||
// It is a pure function of the parsed flags so the command can fail fast at
|
||||
// startup and tests can assert the policy without booting a server.
|
||||
func validateBootConfig(bind string, mode membership.AuthMode, tlsCert, tlsKey string) error {
|
||||
if !isLoopbackBind(bind) && mode != membership.AuthEnforce {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: --bind %q is not loopback but --bus-auth is %q; a public bind requires --bus-auth enforce (or bind 127.0.0.1 for local dev)",
|
||||
bind, mode)
|
||||
}
|
||||
if (tlsCert != "" || tlsKey != "") && mode != membership.AuthEnforce {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: --tls-cert/--tls-key set but --bus-auth is %q; TLS without enforced auth is fail-open (encrypted channel, no authentication) — set --bus-auth enforce",
|
||||
mode)
|
||||
}
|
||||
if !isLoopbackBind(bind) && (tlsCert == "" || tlsKey == "") {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: --bind %q is not loopback but --tls-cert/--tls-key are not both set; a public control plane must serve HTTPS or its metadata (subjects, pubkeys, sealed keys, the social graph) travels in cleartext to a network MITM (audit H5/N4) — provide a CA-signed --tls-cert/--tls-key, or bind 127.0.0.1 for local dev",
|
||||
bind)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// validateClusterConfig guards the cluster route layer (issue 0003a). The route
|
||||
// layer is a server-to-server trust boundary distinct from the client data
|
||||
// plane: leaving it open lets anyone who reaches the route port join the cluster
|
||||
// or inject messages into the whole bus (audit 0004, "auth of the cluster
|
||||
// routes"). So on a public (non-loopback) bind, a cluster MUST carry both a
|
||||
// shared route secret AND mutual route TLS. It is a pure function of the parsed
|
||||
// flags. An empty clusterName means "no cluster" (standalone) and is always
|
||||
// allowed.
|
||||
//
|
||||
// The three route-TLS paths are all-or-nothing (mutual TLS needs the node cert,
|
||||
// its key, and the CA together), independent of the bind, so a partial TLS
|
||||
// config never silently degrades to plaintext routes.
|
||||
//
|
||||
// Homogeneous posture (issue 0006d, audit 0008 N1): a cluster is only as secure
|
||||
// as its weakest node — the data plane forwards every subject between nodes, so a
|
||||
// single node running without enforced auth lets an unauthenticated peer
|
||||
// Subscribe(">") on it and harvest the traffic forwarded from the ACL'd nodes.
|
||||
// This node therefore REFUSES to join a cluster unless it runs --bus-auth enforce,
|
||||
// regardless of bind: a clustered node is a production node, and there is no safe
|
||||
// "dev cluster without auth". (A peer running a tampered binary is out of this
|
||||
// node's control; /healthz exposes each node's posture so a monitor can detect
|
||||
// one that is not enforce+ACL — see Server.Posture.)
|
||||
func validateClusterConfig(clusterName, bind, user, pass, rtCert, rtKey, rtCA string, mode membership.AuthMode) error {
|
||||
rtAny := rtCert != "" || rtKey != "" || rtCA != ""
|
||||
rtAll := rtCert != "" && rtKey != "" && rtCA != ""
|
||||
if rtAny && !rtAll {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: --route-tls-cert/--route-tls-key/--route-tls-ca must be set together (mutual route TLS needs all three)")
|
||||
}
|
||||
if clusterName == "" {
|
||||
return nil // standalone: no route layer to secure
|
||||
}
|
||||
// A clustered node MUST enforce auth (homogeneous posture). Checked before the
|
||||
// loopback shortcut so even a loopback cluster cannot form without enforce.
|
||||
if mode != membership.AuthEnforce {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: cluster %q requires --bus-auth enforce; a cluster node without enforced auth+ACL lets an unauthenticated peer harvest the traffic forwarded from the other nodes (audit 0008 N1) — every node must run the same enforce+ACL+TLS posture",
|
||||
clusterName)
|
||||
}
|
||||
if isLoopbackBind(bind) {
|
||||
return nil // loopback cluster is dev-only and unreachable from outside
|
||||
}
|
||||
// Public cluster: demand a route secret and mutual route TLS.
|
||||
if user == "" || pass == "" {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: cluster %q on public bind %q requires --cluster-user and --cluster-pass; an unauthenticated route port lets anyone join the cluster",
|
||||
clusterName, bind)
|
||||
}
|
||||
if !rtAll {
|
||||
return fmt.Errorf(
|
||||
"refusing to start: cluster %q on public bind %q requires mutual route TLS (--route-tls-cert/--route-tls-key/--route-tls-ca); plaintext routes expose server-to-server traffic and admit unsigned nodes",
|
||||
clusterName, bind)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,188 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// TestAudit_FailOpenTLSWithoutAuth ports the auditor's H2 vector. Before the
|
||||
// guard, booting with TLS on but the authenticator off ("--bind 0.0.0.0
|
||||
// --tls-cert … " without enforce) produced an encrypted data plane that an
|
||||
// unregistered, nkey-less client could still connect to — a fail-open config
|
||||
// wearing the appearance of security. validateBootConfig now refuses it, so the
|
||||
// insecure server never starts (the client therefore has nothing to connect to).
|
||||
func TestAudit_FailOpenTLSWithoutAuth(t *testing.T) {
|
||||
// The exact auditor configuration: public bind, TLS provided, auth off.
|
||||
err := validateBootConfig("0.0.0.0", membership.AuthOff, "server.crt", "server.key")
|
||||
if err == nil {
|
||||
t.Fatalf("TLS without enforce on a public bind must be refused at startup")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "enforce") {
|
||||
t.Fatalf("error should point the operator at --bus-auth enforce, got: %v", err)
|
||||
}
|
||||
|
||||
// And TLS without enforce is rejected even on loopback: TLS implies a
|
||||
// security posture, so authenticating no one is always a misconfiguration.
|
||||
if err := validateBootConfig("127.0.0.1", membership.AuthOff, "server.crt", "server.key"); err == nil {
|
||||
t.Fatalf("TLS flags without enforce must be refused regardless of bind")
|
||||
}
|
||||
}
|
||||
|
||||
// TestGap_PublicEnforceNoTLS ports the re-auditor's N4 gap: the H2 guard refused
|
||||
// "public without enforce" and "TLS without enforce", but ALLOWED a public bind
|
||||
// with enforce and NO --tls-cert, so the control plane served metadata over
|
||||
// plaintext HTTP publicly (H5 reappearing). The guard now refuses it.
|
||||
func TestGap_PublicEnforceNoTLS(t *testing.T) {
|
||||
// The exact auditor configuration: public bind, enforce on, no TLS cert/key.
|
||||
err := validateBootConfig("0.0.0.0", membership.AuthEnforce, "", "")
|
||||
if err == nil {
|
||||
t.Fatalf("public bind + enforce + NO --tls-cert must be refused: the control plane would serve plaintext HTTP publicly (audit N4)")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "tls-cert") {
|
||||
t.Fatalf("error should point the operator at --tls-cert/--tls-key, got: %v", err)
|
||||
}
|
||||
|
||||
// Golden: the same public+enforce config WITH a cert/key is allowed.
|
||||
if err := validateBootConfig("0.0.0.0", membership.AuthEnforce, "server.crt", "server.key"); err != nil {
|
||||
t.Fatalf("public + enforce + TLS is the intended production config, got: %v", err)
|
||||
}
|
||||
|
||||
// Edge: loopback without TLS stays allowed (local dev is not a public exposure).
|
||||
if err := validateBootConfig("127.0.0.1", membership.AuthOff, "", ""); err != nil {
|
||||
t.Fatalf("loopback dev without TLS must remain allowed, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBootConfigPolicy is the full table: the golden secure-public config is
|
||||
// allowed, dev loopback is allowed, and every fail-open shape is refused.
|
||||
func TestBootConfigPolicy(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
bind string
|
||||
mode membership.AuthMode
|
||||
cert string
|
||||
key string
|
||||
wantErr bool
|
||||
}{
|
||||
// Golden: the intended public production config — enforce AND TLS.
|
||||
{"public+enforce+tls", "0.0.0.0", membership.AuthEnforce, "s.crt", "s.key", false},
|
||||
// Edge: local dev on loopback may stay open (no auth, no TLS).
|
||||
{"loopback+off", "127.0.0.1", membership.AuthOff, "", "", false},
|
||||
{"loopback-ipv6+off", "::1", membership.AuthOff, "", "", false},
|
||||
{"localhost+off", "localhost", membership.AuthOff, "", "", false},
|
||||
{"loopback+soft", "127.0.0.1", membership.AuthSoft, "", "", false},
|
||||
// Edge: loopback with full enforce+TLS is also fine.
|
||||
{"loopback+enforce+tls", "127.0.0.1", membership.AuthEnforce, "s.crt", "s.key", false},
|
||||
// Error: public bind without enforce.
|
||||
{"public+off", "0.0.0.0", membership.AuthOff, "", "", true},
|
||||
{"public+soft", "0.0.0.0", membership.AuthSoft, "", "", true},
|
||||
{"lan-ip+off", "192.168.1.10", membership.AuthOff, "", "", true},
|
||||
{"empty-bind+off", "", membership.AuthOff, "", "", true},
|
||||
// Error (N4): public bind + enforce but NO TLS -> plaintext control plane.
|
||||
{"public+enforce+notls", "0.0.0.0", membership.AuthEnforce, "", "", true},
|
||||
{"public+enforce+certonly", "0.0.0.0", membership.AuthEnforce, "s.crt", "", true},
|
||||
{"public+enforce+keyonly", "0.0.0.0", membership.AuthEnforce, "", "s.key", true},
|
||||
{"lan-ip+enforce+notls", "192.168.1.10", membership.AuthEnforce, "", "", true},
|
||||
// Error: TLS flags without enforce (cert or key alone is enough to trip it).
|
||||
{"loopback+tlscert+off", "127.0.0.1", membership.AuthOff, "s.crt", "", true},
|
||||
{"loopback+tlskey+soft", "127.0.0.1", membership.AuthSoft, "", "s.key", true},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
err := validateBootConfig(c.bind, c.mode, c.cert, c.key)
|
||||
if c.wantErr && err == nil {
|
||||
t.Fatalf("config %+v should be refused", c)
|
||||
}
|
||||
if !c.wantErr && err != nil {
|
||||
t.Fatalf("config %+v should be allowed, got: %v", c, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestClusterConfigPolicy is the cluster route guard (issue 0003a): a standalone
|
||||
// server is always fine; a loopback cluster is dev-only and unguarded; a public
|
||||
// cluster demands both a route secret and complete mutual route TLS; and the
|
||||
// route-TLS flags are all-or-nothing regardless of bind.
|
||||
func TestClusterConfigPolicy(t *testing.T) {
|
||||
const c, k, ca = "node.crt", "node.key", "ca.crt"
|
||||
en := membership.AuthEnforce
|
||||
off := membership.AuthOff
|
||||
soft := membership.AuthSoft
|
||||
cases := []struct {
|
||||
name string
|
||||
clusterName, bind string
|
||||
user, pass string
|
||||
rtCert, rtKey, rtCA string
|
||||
mode membership.AuthMode
|
||||
wantErr bool
|
||||
}{
|
||||
// Standalone (no cluster name) is always allowed, even on a public bind and
|
||||
// without enforce — the cluster posture rule does not apply to a single node.
|
||||
{"standalone-public-off", "", "0.0.0.0", "", "", "", "", "", off, false},
|
||||
// Loopback dev cluster WITH enforce: allowed (unreachable from outside).
|
||||
{"loopback-cluster-enforce", "unibus", "127.0.0.1", "", "", "", "", "", en, false},
|
||||
// Golden: full public HA config under enforce.
|
||||
{"public-full-enforce", "unibus", "0.0.0.0", "u", "p", c, k, ca, en, false},
|
||||
// N1 (audit 0008): a clustered node WITHOUT enforce is refused — even on
|
||||
// loopback — so no weak node can join the cluster.
|
||||
{"cluster-off-refused", "unibus", "127.0.0.1", "", "", "", "", "", off, true},
|
||||
{"cluster-soft-refused", "unibus", "0.0.0.0", "u", "p", c, k, ca, soft, true},
|
||||
// Error: public cluster without a route secret (enforce on, fails on secret).
|
||||
{"public-no-secret", "unibus", "0.0.0.0", "", "", c, k, ca, en, true},
|
||||
{"public-half-secret", "unibus", "0.0.0.0", "u", "", c, k, ca, en, true},
|
||||
// Error: public cluster without mutual route TLS.
|
||||
{"public-no-tls", "unibus", "10.0.0.1", "u", "p", "", "", "", en, true},
|
||||
// Error: partial route-TLS flags trip regardless of bind/mode.
|
||||
{"loopback-partial-tls", "unibus", "127.0.0.1", "", "", c, "", "", en, true},
|
||||
{"standalone-partial-tls", "", "127.0.0.1", "", "", c, k, "", off, true},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
err := validateClusterConfig(tc.clusterName, tc.bind, tc.user, tc.pass, tc.rtCert, tc.rtKey, tc.rtCA, tc.mode)
|
||||
if tc.wantErr && err == nil {
|
||||
t.Fatalf("cluster config %+v should be refused", tc)
|
||||
}
|
||||
if !tc.wantErr && err != nil {
|
||||
t.Fatalf("cluster config %+v should be allowed, got: %v", tc, err)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestAttack0008_N1 is the regression for audit 0008 N1 scenario 2: a node
|
||||
// configured to join a cluster while NOT enforcing auth (the weak node that lets
|
||||
// an unauthenticated peer harvest the cluster's forwarded traffic) must be refused
|
||||
// at startup. The homogeneous-posture rule makes this binary unable to BE that
|
||||
// weak node.
|
||||
func TestAttack0008_N1(t *testing.T) {
|
||||
// Weak node: clustered but --bus-auth off -> refused.
|
||||
if err := validateClusterConfig("unibus", "0.0.0.0", "u", "p", "n.crt", "n.key", "ca.crt", membership.AuthOff); err == nil {
|
||||
t.Fatalf("a clustered node without enforce must be refused (audit 0008 N1)")
|
||||
}
|
||||
// Same node WITH enforce + full route security -> allowed.
|
||||
if err := validateClusterConfig("unibus", "0.0.0.0", "u", "p", "n.crt", "n.key", "ca.crt", membership.AuthEnforce); err != nil {
|
||||
t.Fatalf("a clustered enforce node with full route security must be allowed, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSplitRoutes(t *testing.T) {
|
||||
cases := []struct {
|
||||
in string
|
||||
want int
|
||||
}{
|
||||
{"", 0},
|
||||
{"nats://a:1", 1},
|
||||
{"nats://a:1,nats://b:2", 2},
|
||||
{" nats://a:1 , nats://b:2 ", 2}, // spaces trimmed
|
||||
{"nats://a:1,,", 1}, // empty entries dropped
|
||||
{",", 0},
|
||||
}
|
||||
for _, c := range cases {
|
||||
if got := splitRoutes(c.in); len(got) != c.want {
|
||||
t.Fatalf("splitRoutes(%q) = %v (len %d), want len %d", c.in, got, len(got), c.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// connectInternalJS opens a privileged JetStream client from membershipd to its
|
||||
// OWN embedded NATS server. This is the resolution of the "bootstrap cycle"
|
||||
// (issue 0006a/c): the service needs JetStream to create the replicated nonce
|
||||
// bucket and the control-plane KV, but under enforce the data plane only accepts
|
||||
// allowlisted clients confined to their rooms. The connection therefore
|
||||
// authenticates with the process's ephemeral internal identity — the identity the
|
||||
// authenticator was built to recognize (NewNkeyAuthenticatorACLInternal) and
|
||||
// grant full permissions — without ever appearing in the user allowlist.
|
||||
//
|
||||
// It uses the in-process transport (nats.InProcessServer), a Go pipe inside the
|
||||
// process, so it bypasses TLS entirely: no CA wiring is needed for this
|
||||
// self-connection even when the public data plane is TLS-only. useNkey mirrors
|
||||
// whether the embedded server enforces auth: under enforce the internal identity
|
||||
// presents its nkey; without enforce the server accepts an unauthenticated
|
||||
// in-process client and the nkey is omitted.
|
||||
//
|
||||
// The caller owns the returned connection and must Close it on shutdown (after
|
||||
// the JetStream context is no longer used).
|
||||
func connectInternalJS(ns *server.Server, internalID cs.Identity, useNkey bool) (*nats.Conn, jetstream.JetStream, error) {
|
||||
opts := []nats.Option{
|
||||
nats.Name("membershipd-internal"),
|
||||
nats.InProcessServer(ns),
|
||||
}
|
||||
if useNkey {
|
||||
pub, sign, err := busauth.ClientNkey(internalID.SignPriv)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("internal nkey: %w", err)
|
||||
}
|
||||
opts = append(opts, nats.Nkey(pub, sign))
|
||||
}
|
||||
// The URL is ignored for an in-process connection; the InProcessServer option
|
||||
// supplies the transport.
|
||||
nc, err := nats.Connect("", opts...)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("connect internal nats: %w", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return nil, nil, fmt.Errorf("internal jetstream: %w", err)
|
||||
}
|
||||
return nc, js, nil
|
||||
}
|
||||
|
||||
// connectExternalJS opens a JetStream client to an EXTERNAL NATS the operator
|
||||
// runs (membershipd started with --nats-url). Unlike the embedded path there is
|
||||
// no in-process transport and no internal identity: the external server enforces
|
||||
// its own auth, so membershipd connects as a plain client (optionally TLS-pinned
|
||||
// to the bus CA). It is best-effort and intended for an operator-managed cluster;
|
||||
// the standard unibus deploy uses the embedded server (connectInternalJS).
|
||||
func connectExternalJS(natsURL, caPath string) (*nats.Conn, jetstream.JetStream, error) {
|
||||
opts := []nats.Option{nats.Name("membershipd-internal")}
|
||||
if caPath != "" {
|
||||
tlsCfg, err := busauth.LoadCATLSConfig(caPath)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("load CA %q: %w", caPath, err)
|
||||
}
|
||||
opts = append(opts, nats.Secure(tlsCfg))
|
||||
}
|
||||
nc, err := nats.Connect(natsURL, opts...)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("connect external nats %q: %w", natsURL, err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return nil, nil, fmt.Errorf("external jetstream: %w", err)
|
||||
}
|
||||
return nc, js, nil
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
package main
|
||||
|
||||
// Bootstrap test for issue 0006a/c: under enforce, membershipd must still reach
|
||||
// JetStream on its OWN embedded server to create the nonce/KV buckets. It does so
|
||||
// with an ephemeral internal identity the authenticator grants full permissions
|
||||
// (NewNkeyAuthenticatorACLInternal). These tests prove that privileged
|
||||
// self-connection works AND that no other identity can claim it.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/hex"
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// icFreePort binds a TCP listener to 127.0.0.1 on port 0, reads back the port
// the listener was given, closes the listener and returns that port. Nothing
// keeps the port reserved after the listener is closed. A listen failure aborts
// the test.
func icFreePort(t *testing.T) int {
	t.Helper()
	ln, listenErr := net.Listen("tcp", "127.0.0.1:0")
	if listenErr != nil {
		t.Fatalf("free port: %v", listenErr)
	}
	// Capture the port first, then release the listener so the caller can bind it.
	port := ln.Addr().(*net.TCPAddr).Port
	ln.Close()
	return port
}
|
||||
|
||||
// TestInternalConnPrivilegedUnderEnforce: with an enforce authenticator that
|
||||
// authorizes NO bus user, the internal identity still connects in-process and has
|
||||
// full permissions — it creates a KV bucket and round-trips a value. This is the
|
||||
// resolution of the bootstrap cycle the audit flagged as the reason the KV store
|
||||
// was never wired.
|
||||
func TestInternalConnPrivilegedUnderEnforce(t *testing.T) {
|
||||
internalID, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("internal identity: %v", err)
|
||||
}
|
||||
internalPubHex := hex.EncodeToString(internalID.SignPub)
|
||||
|
||||
// Authenticator: no bus user is authorized; only the internal identity passes.
|
||||
auth := busauth.NewNkeyAuthenticatorACLInternal(
|
||||
func(string) bool { return false },
|
||||
busauth.PermissionsFromSubjects(func(string) ([]string, error) { return []string{"_INBOX.>"}, nil }),
|
||||
internalPubHex,
|
||||
)
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: icFreePort(t), Auth: auth,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
|
||||
nc, js, err := connectInternalJS(ns, internalID, true /*useNkey*/)
|
||||
if err != nil {
|
||||
t.Fatalf("connectInternalJS: %v", err)
|
||||
}
|
||||
t.Cleanup(nc.Close)
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
kv, err := js.CreateKeyValue(ctx, jetstream.KeyValueConfig{Bucket: "KV_UNIBUS_test", Replicas: 1})
|
||||
if err != nil {
|
||||
t.Fatalf("internal conn could not create KV bucket (full perms expected): %v", err)
|
||||
}
|
||||
if _, err := kv.Put(ctx, "k", []byte("v")); err != nil {
|
||||
t.Fatalf("kv put: %v", err)
|
||||
}
|
||||
e, err := kv.Get(ctx, "k")
|
||||
if err != nil || string(e.Value()) != "v" {
|
||||
t.Fatalf("kv get: val=%q err=%v", e, err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestInternalConnOutsiderRejected: an identity that is neither the internal one
|
||||
// nor an allowlisted bus user cannot connect — proving the internal bypass is
|
||||
// scoped to the exact internal key, not a blanket hole.
|
||||
func TestInternalConnOutsiderRejected(t *testing.T) {
|
||||
internalID, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("internal identity: %v", err)
|
||||
}
|
||||
auth := busauth.NewNkeyAuthenticatorACLInternal(
|
||||
func(string) bool { return false },
|
||||
busauth.PermissionsFromSubjects(func(string) ([]string, error) { return []string{"_INBOX.>"}, nil }),
|
||||
hex.EncodeToString(internalID.SignPub),
|
||||
)
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: icFreePort(t), Auth: auth,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
|
||||
outsider, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("outsider identity: %v", err)
|
||||
}
|
||||
pub, sign, err := busauth.ClientNkey(outsider.SignPriv)
|
||||
if err != nil {
|
||||
t.Fatalf("outsider nkey: %v", err)
|
||||
}
|
||||
conn, err := nats.Connect(ns.ClientURL(),
|
||||
nats.Nkey(pub, sign),
|
||||
nats.MaxReconnects(0),
|
||||
nats.Timeout(2*time.Second),
|
||||
)
|
||||
if err == nil {
|
||||
conn.Close()
|
||||
t.Fatalf("outsider (unauthorized, non-internal) must be rejected, but connected")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,154 @@
|
||||
package main
|
||||
|
||||
// Wiring tests for issue 0006c: --store kv selects the replicated JetStream KV
|
||||
// control plane, the authenticator serves from it through the storeHolder, and a
|
||||
// new node sees state created by another (the divergence that per-node SQLite
|
||||
// caused — audit 0008 N5 — is gone). Branch-by-abstraction is verified elsewhere
|
||||
// (the SQLite default path is the unchanged baseline covered by the existing
|
||||
// suite).
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// TestKVStoreBootstrapUnderEnforce drives the exact decentralized boot the binary
|
||||
// performs: build the authenticator over an empty holder, start NATS, open the
|
||||
// privileged internal connection, open the KV store, publish it into the holder,
|
||||
// then a real bus user (seeded into the KV store) authenticates over nkey. This
|
||||
// proves the bootstrap cycle is broken correctly — the KV-backed control plane
|
||||
// authorizes live clients under enforce.
|
||||
func TestKVStoreBootstrapUnderEnforce(t *testing.T) {
|
||||
internalID, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("internal identity: %v", err)
|
||||
}
|
||||
holder := &storeHolder{}
|
||||
auth := busauth.NewNkeyAuthenticatorACLInternal(
|
||||
holder.IsAuthorized,
|
||||
busauth.PermissionsFromSubjects(holder.subjectACL),
|
||||
hex.EncodeToString(internalID.SignPub),
|
||||
)
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: freePort(t), Auth: auth,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
|
||||
// Privileged internal connection opens the KV store while the holder still
|
||||
// denies every normal client.
|
||||
intNC, js, err := connectInternalJS(ns, internalID, true)
|
||||
if err != nil {
|
||||
t.Fatalf("connectInternalJS: %v", err)
|
||||
}
|
||||
t.Cleanup(intNC.Close)
|
||||
kvStore, err := membership.OpenJetStream(js, membership.JetStreamConfig{Replicas: 1, OpTimeout: 3 * time.Second})
|
||||
if err != nil {
|
||||
t.Fatalf("open kv store: %v", err)
|
||||
}
|
||||
holder.set(kvStore)
|
||||
|
||||
// Seed a bus user into the KV control plane.
|
||||
alice, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("alice: %v", err)
|
||||
}
|
||||
if err := kvStore.AddUser(hex.EncodeToString(alice.SignPub), "alice", membership.RoleMember); err != nil {
|
||||
t.Fatalf("seed alice: %v", err)
|
||||
}
|
||||
|
||||
// alice authenticates over nkey — authorized via the KV store through the holder.
|
||||
pub, sign, err := busauth.ClientNkey(alice.SignPriv)
|
||||
if err != nil {
|
||||
t.Fatalf("alice nkey: %v", err)
|
||||
}
|
||||
aliceNC, err := nats.Connect(ns.ClientURL(), nats.Nkey(pub, sign), nats.MaxReconnects(0), nats.Timeout(2*time.Second))
|
||||
if err != nil {
|
||||
t.Fatalf("alice (KV-authorized) must connect under enforce: %v", err)
|
||||
}
|
||||
aliceNC.Close()
|
||||
|
||||
// An outsider not in the KV store is denied (fail closed).
|
||||
outsider, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("outsider: %v", err)
|
||||
}
|
||||
opub, osign, err := busauth.ClientNkey(outsider.SignPriv)
|
||||
if err != nil {
|
||||
t.Fatalf("outsider nkey: %v", err)
|
||||
}
|
||||
if oc, err := nats.Connect(ns.ClientURL(), nats.Nkey(opub, osign), nats.MaxReconnects(0), nats.Timeout(2*time.Second)); err == nil {
|
||||
oc.Close()
|
||||
t.Fatalf("an outsider absent from the KV store must be rejected")
|
||||
}
|
||||
}
|
||||
|
||||
// TestKVStoreDecentralizedConsistency: a room/user created via one node's KV store
|
||||
// is immediately visible to another node's KV store over the same JetStream — the
|
||||
// shared, replicated control plane that ends the per-node SQLite divergence.
|
||||
func TestKVStoreDecentralizedConsistency(t *testing.T) {
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: freePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
|
||||
open := func() membership.Store {
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
t.Fatalf("connect: %v", err)
|
||||
}
|
||||
t.Cleanup(nc.Close)
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
st, err := membership.OpenJetStream(js, membership.JetStreamConfig{Replicas: 1, OpTimeout: 3 * time.Second})
|
||||
if err != nil {
|
||||
t.Fatalf("open kv: %v", err)
|
||||
}
|
||||
return st
|
||||
}
|
||||
nodeA := open()
|
||||
nodeB := open()
|
||||
|
||||
owner, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("owner: %v", err)
|
||||
}
|
||||
ownerPub := hex.EncodeToString(owner.SignPub)
|
||||
if err := nodeA.AddUser(ownerPub, "owner", membership.RoleAdmin); err != nil {
|
||||
t.Fatalf("nodeA add user: %v", err)
|
||||
}
|
||||
if err := nodeA.CreateRoom(
|
||||
membership.RoomInfo{RoomID: "ROOMX", Subject: "room.shared.x", OwnerEndpoint: "owner-ep"},
|
||||
owner.SignPub, owner.KexPub, nil,
|
||||
); err != nil {
|
||||
t.Fatalf("nodeA create room: %v", err)
|
||||
}
|
||||
|
||||
// nodeB (a different connection, same buckets) sees both immediately.
|
||||
if !nodeB.IsAuthorized(ownerPub) {
|
||||
t.Fatalf("nodeB must see the user created on nodeA (decentralized state divergence)")
|
||||
}
|
||||
got, err := nodeB.GetRoom("ROOMX")
|
||||
if err != nil {
|
||||
t.Fatalf("nodeB must see the room created on nodeA: %v", err)
|
||||
}
|
||||
if got.Subject != "room.shared.x" {
|
||||
t.Fatalf("nodeB read wrong room subject: %q", got.Subject)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,152 @@
|
||||
package main
|
||||
|
||||
// Integration tests for issue 0011 GAP A: `membershipd user add --store kv`
|
||||
// adds users to a RUNNING cluster's replicated allowlist via the privileged
|
||||
// internal connection, instead of the stop-seed-restart procedure the 0011
|
||||
// deploy required. These exercise the real connectKVStore path (load the
|
||||
// persisted internal identity from a file, present its nkey, open the KV store,
|
||||
// write the user) against an embedded enforce node, plus the idempotency and
|
||||
// error semantics the DoD calls for. Multi-node replication and node-down quorum
|
||||
// are validated against the live cluster (report 0012).
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// startEnforceKVNode boots a single embedded enforce node whose authenticator
|
||||
// recognizes internalPubHex as the privileged internal identity, bootstraps the
|
||||
// KV control-plane store over the in-process internal connection, and publishes
|
||||
// it into the holder — the exact sequence main.go performs for --store kv. It
|
||||
// returns the client URL the CLI connects to.
|
||||
func startEnforceKVNode(t *testing.T, internalID cs.Identity) string {
|
||||
t.Helper()
|
||||
holder := &storeHolder{}
|
||||
auth := busauth.NewNkeyAuthenticatorACLInternal(
|
||||
holder.IsAuthorized,
|
||||
busauth.PermissionsFromSubjects(holder.subjectACL),
|
||||
hex.EncodeToString(internalID.SignPub),
|
||||
)
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: freePort(t), Auth: auth,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start enforce node: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
|
||||
intNC, js, err := connectInternalJS(ns, internalID, true)
|
||||
if err != nil {
|
||||
t.Fatalf("bootstrap internal connection: %v", err)
|
||||
}
|
||||
t.Cleanup(intNC.Close)
|
||||
kvStore, err := membership.OpenJetStream(js, membership.JetStreamConfig{Replicas: 1, OpTimeout: 3 * time.Second})
|
||||
if err != nil {
|
||||
t.Fatalf("bootstrap KV store: %v", err)
|
||||
}
|
||||
holder.set(kvStore)
|
||||
return ns.ClientURL()
|
||||
}
|
||||
|
||||
// TestUserAddStoreKV_GoldenAndIdempotent is the GAP A golden + edge-1: the CLI
|
||||
// connection (real connectKVStore, loading the internal identity from a file and
|
||||
// presenting its nkey) writes a user into the live KV allowlist, the user is
|
||||
// authorized afterward, and re-adding the same key is an explicit ErrUserExists
|
||||
// with no corruption (the unchanged row is still authorized).
|
||||
func TestUserAddStoreKV_GoldenAndIdempotent(t *testing.T) {
|
||||
idFile := filepath.Join(t.TempDir(), "internal.id")
|
||||
internalID, err := client.LoadOrCreateIdentity(idFile) // persists 0600
|
||||
if err != nil {
|
||||
t.Fatalf("persist internal identity: %v", err)
|
||||
}
|
||||
url := startEnforceKVNode(t, internalID)
|
||||
|
||||
// Golden: connect as the privileged internal identity (loopback, no TLS) and
|
||||
// add a new user, exactly as `user add --store kv` does.
|
||||
kv, err := connectKVStore(url, idFile, "", 1)
|
||||
if err != nil {
|
||||
t.Fatalf("connectKVStore (privileged): %v", err)
|
||||
}
|
||||
defer kv.Close()
|
||||
|
||||
newUser, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("new user identity: %v", err)
|
||||
}
|
||||
pub := hex.EncodeToString(newUser.SignPub)
|
||||
if err := kv.store.AddUser(pub, "gapcheck_user", membership.RoleMember); err != nil {
|
||||
t.Fatalf("add user to live KV: %v", err)
|
||||
}
|
||||
if !kv.store.IsAuthorized(pub) {
|
||||
t.Fatalf("user added to KV must be authorized")
|
||||
}
|
||||
|
||||
// Edge 1: re-adding the same key is a clean, non-destructive ErrUserExists.
|
||||
err = kv.store.AddUser(pub, "gapcheck_user", membership.RoleMember)
|
||||
if !errors.Is(err, membership.ErrUserExists) {
|
||||
t.Fatalf("re-add must return ErrUserExists (idempotent), got %v", err)
|
||||
}
|
||||
// A different handle/role with the SAME key is also rejected — the row is not
|
||||
// silently overwritten (no role flip).
|
||||
if err := kv.store.AddUser(pub, "impostor", membership.RoleAdmin); !errors.Is(err, membership.ErrUserExists) {
|
||||
t.Fatalf("re-add with a different role must NOT overwrite; want ErrUserExists, got %v", err)
|
||||
}
|
||||
u, err := kv.store.GetUser(pub)
|
||||
if err != nil {
|
||||
t.Fatalf("get user: %v", err)
|
||||
}
|
||||
if u.Handle != "gapcheck_user" || u.Role != membership.RoleMember || u.Status != membership.StatusActive {
|
||||
t.Fatalf("idempotent re-add corrupted the row: %+v", u)
|
||||
}
|
||||
}
|
||||
|
||||
// TestUserAddStoreKV_RequiresInternalIdentity: --store kv without a usable
|
||||
// internal identity file fails loudly (missing file, empty path) rather than
|
||||
// silently connecting unprivileged.
|
||||
func TestUserAddStoreKV_RequiresInternalIdentity(t *testing.T) {
|
||||
if _, err := connectKVStore("nats://127.0.0.1:4250", "", "", 1); err == nil {
|
||||
t.Fatalf("empty --internal-id-file must be an error")
|
||||
}
|
||||
missing := filepath.Join(t.TempDir(), "nope.id")
|
||||
if _, err := connectKVStore("nats://127.0.0.1:4250", missing, "", 1); err == nil {
|
||||
t.Fatalf("missing internal identity file must be an error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestUserAddStoreKV_UnreachableKV is the GAP A error case: pointing --store kv
|
||||
// at a dead endpoint yields a clear, handled error (no crash, no silent success).
|
||||
func TestUserAddStoreKV_UnreachableKV(t *testing.T) {
|
||||
idFile := filepath.Join(t.TempDir(), "internal.id")
|
||||
if _, err := client.LoadOrCreateIdentity(idFile); err != nil {
|
||||
t.Fatalf("persist internal identity: %v", err)
|
||||
}
|
||||
// A loopback port with nothing listening: connect must fail fast and wrapped.
|
||||
_, err := connectKVStore("nats://127.0.0.1:1/", idFile, "", 1)
|
||||
if err == nil {
|
||||
t.Fatalf("connecting to a dead endpoint must error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestUserAddStoreKV_RemoteWithoutCARefused: a non-loopback target without --ca
|
||||
// is refused so the allowlist write never travels in cleartext (audit 0008 N6,
|
||||
// same guard as migrate-to-kv).
|
||||
func TestUserAddStoreKV_RemoteWithoutCARefused(t *testing.T) {
|
||||
idFile := filepath.Join(t.TempDir(), "internal.id")
|
||||
if _, err := client.LoadOrCreateIdentity(idFile); err != nil {
|
||||
t.Fatalf("persist internal identity: %v", err)
|
||||
}
|
||||
_, err := connectKVStore("nats://203.0.113.1:4250", idFile, "", 1)
|
||||
if err == nil {
|
||||
t.Fatalf("remote target without --ca must be refused")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestResolveClusterPass verifies the secret resolution precedence
|
||||
// (file > env > flag) that keeps the cluster password out of argv (issue 0006f).
|
||||
func TestResolveClusterPass(t *testing.T) {
|
||||
// file wins over env and flag, and is trimmed.
|
||||
f := filepath.Join(t.TempDir(), "pass")
|
||||
if err := os.WriteFile(f, []byte("filesecret\n"), 0o600); err != nil {
|
||||
t.Fatalf("write: %v", err)
|
||||
}
|
||||
if got, src, err := resolveClusterPass("flagsecret", f, "envsecret"); err != nil || got != "filesecret" || src != "file" {
|
||||
t.Fatalf("file precedence: got %q src %q err %v", got, src, err)
|
||||
}
|
||||
// env wins over flag when no file.
|
||||
if got, src, err := resolveClusterPass("flagsecret", "", "envsecret"); err != nil || got != "envsecret" || src != "env" {
|
||||
t.Fatalf("env precedence: got %q src %q err %v", got, src, err)
|
||||
}
|
||||
// flag is the last resort.
|
||||
if got, src, err := resolveClusterPass("flagsecret", "", ""); err != nil || got != "flagsecret" || src != "flag" {
|
||||
t.Fatalf("flag fallback: got %q src %q err %v", got, src, err)
|
||||
}
|
||||
// none set.
|
||||
if got, src, err := resolveClusterPass("", "", ""); err != nil || got != "" || src != "none" {
|
||||
t.Fatalf("none: got %q src %q err %v", got, src, err)
|
||||
}
|
||||
// missing file is an error.
|
||||
if _, _, err := resolveClusterPass("", filepath.Join(t.TempDir(), "nope"), ""); err == nil {
|
||||
t.Fatalf("missing file must error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestInjectRouteCreds verifies the secret is injected only into routes that omit
|
||||
// userinfo, so --routes argv need not carry the password (issue 0006f).
|
||||
func TestInjectRouteCreds(t *testing.T) {
|
||||
in := []string{"nats://10.0.0.2:6250", "nats://override:pw@10.0.0.3:6250"}
|
||||
out, err := injectRouteCreds(in, "user", "secret")
|
||||
if err != nil {
|
||||
t.Fatalf("inject: %v", err)
|
||||
}
|
||||
if !strings.Contains(out[0], "user:secret@10.0.0.2:6250") {
|
||||
t.Fatalf("creds not injected into bare route: %q", out[0])
|
||||
}
|
||||
if !strings.Contains(out[1], "override:pw@10.0.0.3:6250") {
|
||||
t.Fatalf("existing userinfo must be preserved: %q", out[1])
|
||||
}
|
||||
// empty user is a no-op.
|
||||
noop, err := injectRouteCreds(in, "", "")
|
||||
if err != nil || noop[0] != in[0] {
|
||||
t.Fatalf("empty user must be a no-op: %v %q", err, noop[0])
|
||||
}
|
||||
}
|
||||
|
||||
// TestIsLoopbackURL guards migrate-to-kv against pushing the allowlist cleartext
|
||||
// to a remote NATS (issue 0006f, audit 0008 N6).
|
||||
func TestIsLoopbackURL(t *testing.T) {
|
||||
loop := []string{"nats://127.0.0.1:4250", "nats://localhost:4250", "nats://[::1]:4250"}
|
||||
for _, u := range loop {
|
||||
if !isLoopbackURL(u) {
|
||||
t.Fatalf("%q should be loopback", u)
|
||||
}
|
||||
}
|
||||
remote := []string{"nats://10.0.0.2:4250", "nats://bus.example.com:4250", "::not-a-url"}
|
||||
for _, u := range remote {
|
||||
if isLoopbackURL(u) {
|
||||
t.Fatalf("%q should NOT be loopback", u)
|
||||
}
|
||||
}
|
||||
}
|
||||
+323
-21
@@ -6,6 +6,8 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"encoding/hex"
|
||||
"flag"
|
||||
"log"
|
||||
"net/http"
|
||||
@@ -14,33 +16,258 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Subcommand dispatch: `membershipd user ...` is the local administration CLI
|
||||
// (seed/list/revoke bus users) and must be handled before the server flag set
|
||||
// parses os.Args. Running the CLI on the bus host is trusted by design (whoever
|
||||
// has a shell there already controls the service), which is how the first admin
|
||||
// is seeded without a chicken-egg auth problem.
|
||||
if len(os.Args) > 1 && os.Args[1] == "user" {
|
||||
runUserCLI(os.Args[2:])
|
||||
return
|
||||
}
|
||||
// `membershipd migrate-to-kv` is the one-time, idempotent SQLite->JetStream KV
|
||||
// data move for decentralization (issue 0003c). Like the user CLI it runs on
|
||||
// the host and is dispatched before the server flag set parses os.Args.
|
||||
if len(os.Args) > 1 && os.Args[1] == "migrate-to-kv" {
|
||||
runMigrateCLI(os.Args[2:])
|
||||
return
|
||||
}
|
||||
|
||||
var (
|
||||
bind = flag.String("bind", "127.0.0.1", "network interface to bind the HTTP API and the embedded NATS to; use 0.0.0.0 to accept LAN/remote peers")
|
||||
natsURL = flag.String("nats-url", "", "external NATS url; empty starts an embedded server")
|
||||
httpPort = flag.String("http-port", "8470", "HTTP port for the control-plane API")
|
||||
dbPath = flag.String("db", "./local_files/unibus.db", "SQLite database path")
|
||||
storeDir = flag.String("store-dir", "./local_files/blobs", "blob store directory")
|
||||
natsPort = flag.Int("nats-port", 4250, "embedded NATS listen port (when --nats-url empty)")
|
||||
natsStore = flag.String("nats-store", "./local_files/jetstream", "embedded JetStream store dir")
|
||||
busAuth = flag.String("bus-auth", "off", "control-plane auth rollout: off|soft|enforce (feature flag bus-auth)")
|
||||
tlsCert = flag.String("tls-cert", "", "PATH to the NATS server certificate (deploy/tls/server.crt); enables TLS on the embedded data plane")
|
||||
tlsKey = flag.String("tls-key", "", "path to the NATS server private key (deploy/tls/server.key); required with --tls-cert")
|
||||
// Cluster (issue 0003a): empty --cluster-name keeps the server standalone.
|
||||
clusterName = flag.String("cluster-name", "", "NATS cluster name (identical on every node); empty = standalone, no HA")
|
||||
serverName = flag.String("server-name", "", "unique node name within the cluster (required by JetStream RAFT when clustered)")
|
||||
clusterPort = flag.Int("cluster-port", 6250, "route listener port for server-to-server cluster traffic")
|
||||
routesCSV = flag.String("routes", "", "comma-separated nats-route URLs of the OTHER nodes, e.g. nats://user:pass@10.0.0.2:6250")
|
||||
clusterUser = flag.String("cluster-user", "", "shared route secret username (gates the route listener)")
|
||||
clusterPass = flag.String("cluster-pass", "", "shared route secret password (argv-visible — prefer --cluster-pass-file or UNIBUS_CLUSTER_PASS)")
|
||||
// Secret out of argv (issue 0006f, audit 0008 N1-low): a password in
|
||||
// --cluster-pass / --routes is visible in ps/journald. Prefer a file or the
|
||||
// UNIBUS_CLUSTER_PASS env var; routes may then omit userinfo and the secret
|
||||
// is injected from here.
|
||||
clusterPassFile = flag.String("cluster-pass-file", "", "path to a file holding the cluster route password (preferred over --cluster-pass; keeps the secret out of argv)")
|
||||
routeTLSCert = flag.String("route-tls-cert", "", "this node's route certificate (CA-signed); enables mutual route TLS with --route-tls-key/--route-tls-ca")
|
||||
routeTLSKey = flag.String("route-tls-key", "", "this node's route private key")
|
||||
routeTLSCA = flag.String("route-tls-ca", "", "bus CA that signs every node's route certificate (deploy/tls/ca.crt)")
|
||||
// Replicated control plane (issue 0006a/c): the JetStream replication factor
|
||||
// for the shared nonce bucket (and, with --store kv, the control-plane KV).
|
||||
// 1 for a 1-2 node rollout, 3 for real HA quorum (raise in place with
|
||||
// `nats stream update --replicas 3` when the third node joins).
|
||||
kvReplicas = flag.Int("kv-replicas", 1, "JetStream replication factor for the shared nonce/KV buckets (1..3)")
|
||||
caFile = flag.String("ca", "", "bus CA cert; only used to pin TLS on the internal JetStream connection to an EXTERNAL --nats-url (the embedded server uses an in-process connection that needs no CA)")
|
||||
// Control-plane store backend (issue 0006c, feature flag decentralized):
|
||||
// "sqlite" (default) keeps the local single-node SQLite control plane;
|
||||
// "kv" puts rooms/members/keys/users in replicated JetStream KV so any node
|
||||
// in the cluster serves the same state.
|
||||
storeBackend = flag.String("store", "sqlite", "control-plane store backend: sqlite (default, single-node) | kv (replicated JetStream, decentralized)")
|
||||
// Persisted internal service identity (issue 0011 gaps, GAP A): when set, the
|
||||
// privileged internal identity used to manage JetStream is LOADED from this
|
||||
// file (generated and persisted on first start) instead of being a fresh
|
||||
// ephemeral key each boot. Persisting it is what lets `membershipd user add
|
||||
// --store kv` write the replicated allowlist of a LIVE cluster: that CLI,
|
||||
// run over loopback on a node, loads the SAME identity and presents the nkey
|
||||
// this node's authenticator already grants full permissions. Empty keeps the
|
||||
// ephemeral-per-process behavior (single-node/dev default, unchanged). The
|
||||
// file holds a private key: it is written 0600 and belongs next to the node's
|
||||
// TLS keys (deploy keeps it under secrets/, gitignored).
|
||||
internalIDFile = flag.String("internal-id-file", "", "path to a persisted internal service identity (JSON); enables `membershipd user add --store kv` against the live cluster. Empty = ephemeral per-process identity (dev default)")
|
||||
)
|
||||
flag.Parse()
|
||||
|
||||
authMode, err := membership.ParseAuthMode(*busAuth)
|
||||
if err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
if *storeBackend != "sqlite" && *storeBackend != "kv" {
|
||||
log.Fatalf("--store must be \"sqlite\" or \"kv\", got %q", *storeBackend)
|
||||
}
|
||||
|
||||
// Resolve the cluster route secret out of argv (file/env preferred). The
|
||||
// resolved value (not *clusterPass) is what guards the route layer and is
|
||||
// injected into peer route URLs below.
|
||||
clusterPassResolved, passSource, err := resolveClusterPass(*clusterPass, *clusterPassFile, os.Getenv("UNIBUS_CLUSTER_PASS"))
|
||||
if err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
// Fail-open guard (audit H2): a non-loopback bind, or any TLS flag, demands
|
||||
// --bus-auth enforce. This makes an insecure public startup impossible rather
|
||||
// than silently exposing the bus with the appearance of security.
|
||||
if err := validateBootConfig(*bind, authMode, *tlsCert, *tlsKey); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
// Cluster route guard (issue 0003a): a public cluster needs a route secret
|
||||
// and mutual route TLS, and the route-TLS flags are all-or-nothing.
|
||||
if err := validateClusterConfig(*clusterName, *bind, *clusterUser, clusterPassResolved, *routeTLSCert, *routeTLSKey, *routeTLSCA, authMode); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
|
||||
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
|
||||
log.SetPrefix("[membershipd] ")
|
||||
|
||||
// Data plane: embedded or external NATS.
|
||||
// A clustered node shares its control plane with peers, so it needs a JetStream
|
||||
// client to manage the replicated nonce bucket (issue 0006a). --store kv (issue
|
||||
// 0006c) also needs JetStream, for the control-plane KV itself. A standalone
|
||||
// single-node SQLite deployment needs none of this and keeps the in-process,
|
||||
// in-memory behavior unchanged.
|
||||
clustered := *clusterName != ""
|
||||
decentralized := *storeBackend == "kv"
|
||||
needJS := clustered || decentralized
|
||||
enforce := authMode == membership.AuthEnforce
|
||||
|
||||
// Internal service identity (issue 0006a): when the embedded data plane enforces
|
||||
// auth, membershipd must still connect to its OWN server to manage JetStream.
|
||||
// It does so with this ephemeral identity, which the authenticator is built to
|
||||
// recognize and grant full permissions (it never enters the user allowlist). It
|
||||
// is only generated when actually needed (JetStream required AND enforce on AND
|
||||
// the server is embedded), so a standalone or non-enforce node is unchanged.
|
||||
var internalID cs.Identity
|
||||
var internalPubHex string
|
||||
if needJS && enforce && *natsURL == "" {
|
||||
if *internalIDFile != "" {
|
||||
// Persisted identity: load it, generating + writing it (0600) on first
|
||||
// start. A stable internal key is what `user add --store kv` presents to
|
||||
// add users to a live cluster (GAP A); rotate it by deleting the file and
|
||||
// restarting.
|
||||
internalID, err = client.LoadOrCreateIdentity(*internalIDFile)
|
||||
if err != nil {
|
||||
log.Fatalf("load internal service identity %q: %v", *internalIDFile, err)
|
||||
}
|
||||
log.Printf("internal service identity: persisted (%s)", *internalIDFile)
|
||||
} else {
|
||||
internalID, err = cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
log.Fatalf("generate internal identity: %v", err)
|
||||
}
|
||||
}
|
||||
internalPubHex = hex.EncodeToString(internalID.SignPub)
|
||||
}
|
||||
|
||||
// The authenticator consults the store through a holder so it can be built
|
||||
// before the store exists: with --store kv the JetStream KV store opens only
|
||||
// after NATS is up (the bootstrap cycle). In the default SQLite path the store
|
||||
// is opened and set into the holder right here, before the server starts, so
|
||||
// behavior is identical to the pre-0006c baseline. `store` is the final store
|
||||
// used by the HTTP server (set below for the KV path).
|
||||
holder := &storeHolder{}
|
||||
var store membership.Store
|
||||
if !decentralized {
|
||||
store, err = membership.Open(*dbPath)
|
||||
if err != nil {
|
||||
log.Fatalf("open membership store: %v", err)
|
||||
}
|
||||
holder.set(store)
|
||||
log.Printf("membership store: sqlite %s", *dbPath)
|
||||
}
|
||||
// Close whichever store ends up final (SQLite closes its file; the JetStream KV
|
||||
// store's Close is a no-op — its NATS connection is closed separately).
|
||||
defer func() {
|
||||
if store != nil {
|
||||
store.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
blobs, err := blobstore.New(*storeDir)
|
||||
if err != nil {
|
||||
log.Fatalf("open blob store: %v", err)
|
||||
}
|
||||
log.Printf("blob store: %s", *storeDir)
|
||||
|
||||
// Data plane: embedded or external NATS. For the embedded server, enforce
|
||||
// turns on the nkey authenticator (only allowlisted identities may connect)
|
||||
// and --tls-cert/--tls-key turn on TLS. An external NATS manages its own
|
||||
// auth/TLS, so those flags do not apply to it.
|
||||
var ns *server.Server
|
||||
natsClientURL := *natsURL
|
||||
if natsClientURL == "" {
|
||||
var err error
|
||||
ns, err = embeddednats.Start(*natsStore, *natsPort)
|
||||
cfg := embeddednats.ServerConfig{
|
||||
// Bind the embedded NATS to the same interface as the HTTP API so a
|
||||
// single --bind flag governs reachability: 127.0.0.1 keeps the whole
|
||||
// stack loopback-only; 0.0.0.0 exposes both planes to the LAN.
|
||||
StoreDir: *natsStore,
|
||||
Host: *bind,
|
||||
Port: *natsPort,
|
||||
ServerName: *serverName,
|
||||
}
|
||||
// Cluster (issue 0003a): with a cluster name, join the route layer for HA.
|
||||
if *clusterName != "" {
|
||||
// Inject the resolved secret into peer route URLs that omit userinfo, so
|
||||
// the password need not appear in --routes argv (issue 0006f).
|
||||
routes, rerr := injectRouteCreds(splitRoutes(*routesCSV), *clusterUser, clusterPassResolved)
|
||||
if rerr != nil {
|
||||
log.Fatalf("%v", rerr)
|
||||
}
|
||||
cc := &embeddednats.ClusterConfig{
|
||||
Name: *clusterName,
|
||||
Host: *bind,
|
||||
Port: *clusterPort,
|
||||
Routes: routes,
|
||||
Username: *clusterUser,
|
||||
Password: clusterPassResolved,
|
||||
}
|
||||
log.Printf("cluster route secret source: %s", passSource)
|
||||
if *routeTLSCert != "" {
|
||||
rtls, err := busauth.RouteTLSConfig(*routeTLSCert, *routeTLSKey, *routeTLSCA)
|
||||
if err != nil {
|
||||
log.Fatalf("load route TLS: %v", err)
|
||||
}
|
||||
cc.TLS = rtls
|
||||
log.Printf("cluster route TLS: ON (mutual, CA %s)", *routeTLSCA)
|
||||
}
|
||||
cfg.Cluster = cc
|
||||
log.Printf("cluster: %q node %q, route port %d, %d peer route(s)", *clusterName, *serverName, *clusterPort, len(cc.Routes))
|
||||
}
|
||||
if authMode == membership.AuthEnforce {
|
||||
// Per-subject data-plane ACL (audit H4 / N4 residual): the authenticator
|
||||
// authorizes by the bus allowlist AND confines each connection to the
|
||||
// subjects of the rooms it belongs to (plus client-infra subjects). This
|
||||
// closes the wildcard metadata leak where a registered non-member could
|
||||
// Subscribe(">") and harvest every room's subject and JetStream activity.
|
||||
// NATS freezes permissions at connect time, so a peer that joins a room
|
||||
// after connecting must client.RefreshSession to gain that room's subject.
|
||||
cfg.Auth = busauth.NewNkeyAuthenticatorACLInternal(
|
||||
holder.IsAuthorized,
|
||||
busauth.PermissionsFromSubjects(holder.subjectACL),
|
||||
internalPubHex,
|
||||
)
|
||||
log.Printf("NATS nkey authentication: ON (enforce, per-subject ACL)")
|
||||
}
|
||||
if *tlsCert != "" || *tlsKey != "" {
|
||||
if *tlsCert == "" || *tlsKey == "" {
|
||||
log.Fatalf("--tls-cert and --tls-key must be set together")
|
||||
}
|
||||
tlsCfg, err := busauth.ServerTLSConfig(*tlsCert, *tlsKey)
|
||||
if err != nil {
|
||||
log.Fatalf("load NATS TLS: %v", err)
|
||||
}
|
||||
cfg.TLS = tlsCfg
|
||||
log.Printf("NATS TLS: ON (%s)", *tlsCert)
|
||||
}
|
||||
ns, err = embeddednats.StartServer(cfg)
|
||||
if err != nil {
|
||||
log.Fatalf("start embedded nats: %v", err)
|
||||
}
|
||||
@@ -50,29 +277,104 @@ func main() {
|
||||
log.Printf("using external NATS: %s", natsClientURL)
|
||||
}
|
||||
|
||||
// Control plane: SQLite store + blob store + HTTP API.
|
||||
store, err := membership.Open(*dbPath)
|
||||
if err != nil {
|
||||
log.Fatalf("open membership store: %v", err)
|
||||
}
|
||||
defer store.Close()
|
||||
log.Printf("membership store: %s", *dbPath)
|
||||
// JetStream client + decentralized store (issue 0006a/c). needJS is set for a
|
||||
// clustered node (shared nonce bucket) and for --store kv (the KV control
|
||||
// plane). Open the privileged JetStream client first (in-process for the
|
||||
// embedded server, a plain client for external NATS), then — for --store kv —
|
||||
// open the replicated KV store and publish it into the holder so the
|
||||
// authenticator and HTTP server serve from it. The privileged connection is the
|
||||
// only client that can connect in this window (the holder still denies everyone
|
||||
// else; the internal identity bypasses the store).
|
||||
var js jetstream.JetStream
|
||||
if needJS {
|
||||
var internalNC *nats.Conn
|
||||
if *natsURL == "" {
|
||||
internalNC, js, err = connectInternalJS(ns, internalID, enforce)
|
||||
} else {
|
||||
internalNC, js, err = connectExternalJS(natsClientURL, *caFile)
|
||||
}
|
||||
if err != nil {
|
||||
log.Fatalf("internal JetStream connection (required by --cluster-name/--store kv): %v", err)
|
||||
}
|
||||
defer internalNC.Close()
|
||||
|
||||
blobs, err := blobstore.New(*storeDir)
|
||||
if err != nil {
|
||||
log.Fatalf("open blob store: %v", err)
|
||||
if decentralized {
|
||||
kvStore, err := membership.OpenJetStream(js, membership.JetStreamConfig{Replicas: *kvReplicas})
|
||||
if err != nil {
|
||||
log.Fatalf("open decentralized control-plane KV store: %v", err)
|
||||
}
|
||||
store = kvStore
|
||||
holder.set(store)
|
||||
log.Printf("membership store: jetstream KV (replicas=%d)", *kvReplicas)
|
||||
}
|
||||
}
|
||||
log.Printf("blob store: %s", *storeDir)
|
||||
|
||||
srv := membership.NewServer(store, blobs)
|
||||
addr := "127.0.0.1:" + *httpPort
|
||||
httpSrv := &http.Server{Addr: addr, Handler: srv}
|
||||
srv := membership.NewServer(store, blobs, authMode)
|
||||
// On a public (non-loopback) bind, disable cleartext rooms: the embedded NATS
|
||||
// has no per-subject ACL, so cleartext content would be readable by any
|
||||
// registered peer. Forcing E2E keeps message content confidential regardless
|
||||
// (audit H4 minimum defense; see dev/0004d-dataplane-acl.md).
|
||||
if !isLoopbackBind(*bind) {
|
||||
srv.RequireEncryptedRooms = true
|
||||
log.Printf("cleartext rooms: DISABLED (public bind requires end-to-end encryption)")
|
||||
}
|
||||
// Publish this node's posture on /healthz so a monitor (or a peer) can detect a
|
||||
// cluster member not running the homogeneous enforce+ACL+TLS posture (audit
|
||||
// 0008 N1). enforce implies the per-subject ACL in this binary (they are wired
|
||||
// together above).
|
||||
srv.Posture = membership.Posture{
|
||||
Enforce: enforce,
|
||||
ACL: enforce,
|
||||
TLS: *tlsCert != "",
|
||||
Cluster: clustered,
|
||||
Store: *storeBackend,
|
||||
}
|
||||
|
||||
// Replicated anti-replay (issue 0006a, audit 0008 N3): a clustered node MUST
|
||||
// share its nonce store across the cluster, or a request accepted on one node
|
||||
// can be replayed to another. HARD requirement: if the bucket cannot be created
|
||||
// the node refuses to start rather than run with a per-process cache that leaves
|
||||
// the replay hole open.
|
||||
if needJS {
|
||||
if err := wireReplicatedNonces(srv, js, clustered, *kvReplicas); err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
if clustered {
|
||||
log.Printf("anti-replay: replicated nonce bucket \"KV_UNIBUS_nonces\" (replicas=%d) — cluster-safe", *kvReplicas)
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("control-plane auth: %s", authMode)
|
||||
addr := *bind + ":" + *httpPort
|
||||
httpSrv := &http.Server{
|
||||
Addr: addr,
|
||||
Handler: srv,
|
||||
// Bound request header size so a peer cannot exhaust memory with huge
|
||||
// headers before any body limit applies (the body ceilings live in the
|
||||
// membership middleware).
|
||||
MaxHeaderBytes: membership.MaxHeaderBytes,
|
||||
ReadHeaderTimeout: 10 * time.Second,
|
||||
}
|
||||
|
||||
go func() {
|
||||
log.Printf("HTTP control-plane API: http://%s", addr)
|
||||
log.Printf(" health: http://%s/healthz", addr)
|
||||
if err := httpSrv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
|
||||
log.Fatalf("http server: %v", err)
|
||||
var serveErr error
|
||||
if *tlsCert != "" {
|
||||
// Serve the control plane over TLS with the same CA-signed cert as the
|
||||
// data plane (audit H5): metadata (subjects, pubkeys, sealed keys, the
|
||||
// social graph) is no longer readable by a network MITM. The fail-open
|
||||
// guard already requires --bus-auth enforce alongside these flags.
|
||||
httpSrv.TLSConfig = &tls.Config{MinVersion: tls.VersionTLS12}
|
||||
log.Printf("HTTPS control-plane API: https://%s", addr)
|
||||
log.Printf(" health: https://%s/healthz", addr)
|
||||
log.Printf("control-plane TLS: ON (%s)", *tlsCert)
|
||||
serveErr = httpSrv.ListenAndServeTLS(*tlsCert, *tlsKey)
|
||||
} else {
|
||||
log.Printf("HTTP control-plane API: http://%s", addr)
|
||||
log.Printf(" health: http://%s/healthz", addr)
|
||||
serveErr = httpSrv.ListenAndServe()
|
||||
}
|
||||
if serveErr != nil && serveErr != http.ErrServerClosed {
|
||||
log.Fatalf("http server: %v", serveErr)
|
||||
}
|
||||
}()
|
||||
|
||||
|
||||
@@ -0,0 +1,95 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// runMigrateCLI implements `membershipd migrate-to-kv`, the idempotent move of
|
||||
// the control-plane state from the local SQLite database into replicated
|
||||
// JetStream KV (issue 0003c). It backs up the SQLite file first (VACUUM INTO),
|
||||
// then connects to the target NATS and copies every room/member/key/user into
|
||||
// the KV buckets. Re-running it converges to the same state.
|
||||
//
|
||||
// It runs on the bus host (no auth on the control-plane side), connecting to the
|
||||
// cluster's NATS; --ca pins TLS when the data plane is secured.
|
||||
func runMigrateCLI(args []string) {
|
||||
fs := flag.NewFlagSet("migrate-to-kv", flag.ExitOnError)
|
||||
dbPath := fs.String("db", defaultDBPath, "SQLite database path to migrate FROM")
|
||||
natsURL := fs.String("nats-url", "", "NATS url of the cluster to migrate INTO (required)")
|
||||
ca := fs.String("ca", "", "CA cert to pin TLS on the NATS connection (optional)")
|
||||
replicas := fs.Int("replicas", 1, "KV replication factor (1 for a 1-2 node rollout, 3 for HA quorum)")
|
||||
noBackup := fs.Bool("no-backup", false, "skip the SQLite backup before migrating (NOT recommended)")
|
||||
_ = fs.Parse(args)
|
||||
|
||||
if *natsURL == "" {
|
||||
fmt.Fprintln(os.Stderr, "membershipd migrate-to-kv: --nats-url is required (the cluster to write the KV buckets into)")
|
||||
os.Exit(2)
|
||||
}
|
||||
// Confidentiality guard (issue 0006f, audit 0008 N6): the migration writes the
|
||||
// allowlist (handles, roles, signing pubkeys) into the KV. Against a REMOTE NATS
|
||||
// without TLS that metadata would travel in cleartext, so a remote target MUST
|
||||
// be TLS-pinned with --ca. A loopback target is local-only and exempt.
|
||||
if !isLoopbackURL(*natsURL) && *ca == "" {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: refusing to migrate to remote %q without --ca; the allowlist (handles/roles/sign pubs) would travel in cleartext — pin TLS with --ca, or run against a loopback nats-url\n", *natsURL)
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
// Back up the SQLite database first so a botched migration can be undone.
|
||||
var backupPath string
|
||||
if !*noBackup {
|
||||
bak, err := membership.BackupSQLite(*dbPath)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: backup failed: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
backupPath = bak
|
||||
fmt.Printf("backed up %s -> %s\n", *dbPath, backupPath)
|
||||
}
|
||||
|
||||
// Connect to the target NATS (optionally TLS-pinned to the bus CA).
|
||||
natsOpts := []nats.Option{nats.Name("unibus-migrate")}
|
||||
if *ca != "" {
|
||||
tlsCfg, err := busauth.LoadCATLSConfig(*ca)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: load CA: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
natsOpts = append(natsOpts, nats.Secure(tlsCfg))
|
||||
}
|
||||
nc, err := nats.Connect(*natsURL, natsOpts...)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: connect %q: %v\n", *natsURL, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer nc.Close()
|
||||
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: jetstream: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
report, err := membership.MigrateSQLiteToKV(*dbPath, js, membership.JetStreamConfig{
|
||||
Replicas: *replicas,
|
||||
OpTimeout: 30 * time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd migrate-to-kv: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
report.BackupPath = backupPath
|
||||
|
||||
fmt.Printf("migrated to KV (replicas=%d): %d rooms, %d members, %d keys, %d users\n",
|
||||
*replicas, report.Rooms, report.Members, report.Keys, report.Users)
|
||||
if backupPath != "" {
|
||||
fmt.Printf("rollback: restore %s if needed\n", backupPath)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// storeHolder is a concurrency-safe slot for the control-plane store, used to
|
||||
// break the decentralized bootstrap cycle (issue 0006c): the NATS authenticator
|
||||
// must be built BEFORE the embedded server starts, but the JetStream KV store can
|
||||
// only be opened AFTER NATS is up (it needs a JetStream client). The authenticator
|
||||
// therefore consults the holder instead of a concrete store.
|
||||
//
|
||||
// Fail-closed by construction: until the store is set, IsAuthorized denies and
|
||||
// SubjectACL errors, so any client connecting in the startup window is rejected.
|
||||
// The only connection expected in that window is membershipd's own internal
|
||||
// service identity, which the authenticator recognizes by key and lets through
|
||||
// without consulting the store at all. In the SQLite (default) path the store is
|
||||
// set before StartServer, so the window does not exist and behavior is identical
|
||||
// to the pre-0006c baseline.
|
||||
type storeHolder struct {
|
||||
mu sync.RWMutex
|
||||
s membership.Store
|
||||
}
|
||||
|
||||
func (h *storeHolder) set(s membership.Store) {
|
||||
h.mu.Lock()
|
||||
h.s = s
|
||||
h.mu.Unlock()
|
||||
}
|
||||
|
||||
func (h *storeHolder) get() membership.Store {
|
||||
h.mu.RLock()
|
||||
defer h.mu.RUnlock()
|
||||
return h.s
|
||||
}
|
||||
|
||||
// IsAuthorized reports whether signPubHex is an active bus user, denying while the
|
||||
// store is not yet set (fail closed). It is the predicate the nkey authenticator
|
||||
// uses for every connecting client.
|
||||
func (h *storeHolder) IsAuthorized(signPubHex string) bool {
|
||||
s := h.get()
|
||||
if s == nil {
|
||||
return false
|
||||
}
|
||||
return s.IsAuthorized(signPubHex)
|
||||
}
|
||||
|
||||
// subjectACL derives the per-subject permissions for signPubHex via the live
|
||||
// store, erroring (so the caller fails closed and denies the connection) while the
|
||||
// store is not yet set.
|
||||
func (h *storeHolder) subjectACL(signPubHex string) ([]string, error) {
|
||||
s := h.get()
|
||||
if s == nil {
|
||||
return nil, fmt.Errorf("control-plane store not ready")
|
||||
}
|
||||
return membership.SubjectACLFor(s)(signPubHex)
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// TestStoreHolderFailClosed: an empty holder denies everything (the bootstrap
|
||||
// window before the store is set), and starts serving once a store is published.
|
||||
func TestStoreHolderFailClosed(t *testing.T) {
|
||||
h := &storeHolder{}
|
||||
|
||||
// Empty: deny + error (fail closed).
|
||||
if h.IsAuthorized("anything") {
|
||||
t.Fatalf("empty holder must deny IsAuthorized")
|
||||
}
|
||||
if _, err := h.subjectACL("anything"); err == nil {
|
||||
t.Fatalf("empty holder must error from subjectACL (fail closed)")
|
||||
}
|
||||
|
||||
// After set: serves from the real store.
|
||||
store, err := membership.Open(filepath.Join(t.TempDir(), "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
id, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
pub := hex.EncodeToString(id.SignPub)
|
||||
if err := store.AddUser(pub, "alice", membership.RoleMember); err != nil {
|
||||
t.Fatalf("add user: %v", err)
|
||||
}
|
||||
h.set(store)
|
||||
|
||||
if !h.IsAuthorized(pub) {
|
||||
t.Fatalf("after set, an active user must be authorized")
|
||||
}
|
||||
if _, err := h.subjectACL(pub); err != nil {
|
||||
t.Fatalf("after set, subjectACL must succeed: %v", err)
|
||||
}
|
||||
if h.IsAuthorized("deadbeef") {
|
||||
t.Fatalf("a non-user must not be authorized")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,252 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"text/tabwriter"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// runUserCLI implements `membershipd user <add|list|revoke> ...`, the local
|
||||
// administration surface for the bus user allowlist. It opens the SQLite store
|
||||
// directly (no network, no auth): it is meant to run on the bus host, where
|
||||
// shell access already implies full control. This is the seam that seeds the
|
||||
// first admin, breaking the chicken-egg of "you need an admin to add an admin".
|
||||
//
|
||||
// The function never returns: it exits the process with a non-zero status on
|
||||
// error so it composes cleanly in shell scripts and systemd ExecStartPre hooks.
|
||||
func runUserCLI(args []string) {
|
||||
if len(args) == 0 {
|
||||
userUsage()
|
||||
os.Exit(2)
|
||||
}
|
||||
sub, rest := args[0], args[1:]
|
||||
switch sub {
|
||||
case "add":
|
||||
userAdd(rest)
|
||||
case "list":
|
||||
userList(rest)
|
||||
case "revoke":
|
||||
userRevoke(rest)
|
||||
case "-h", "--help", "help":
|
||||
userUsage()
|
||||
os.Exit(0)
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "membershipd user: unknown subcommand %q\n\n", sub)
|
||||
userUsage()
|
||||
os.Exit(2)
|
||||
}
|
||||
}
|
||||
|
||||
// userUsage writes the help text for `membershipd user` to stderr: the
// available subcommands, the --store backends, example invocations, and the
// flags for each backend. It only prints; callers decide the exit status.
func userUsage() {
|
||||
fmt.Fprint(os.Stderr, `usage: membershipd user <command> [flags]
|
||||
|
||||
commands:
|
||||
add Register a bus user from their Ed25519 signing public key
|
||||
list List all registered users
|
||||
revoke Revoke a user (denies access on both planes immediately)
|
||||
|
||||
store backends (--store):
|
||||
sqlite local SQLite database (default; seeds the first admin offline)
|
||||
kv the RUNNING cluster's replicated JetStream KV allowlist, via the
|
||||
privileged internal connection — add users with the cluster live,
|
||||
no stop-seed-restart needed (run over loopback/SSH on a node)
|
||||
|
||||
examples:
|
||||
membershipd user add --handle alice --sign-pub <64-hex> --role admin
|
||||
membershipd user add --store kv --handle bob --sign-pub <64-hex> --role member
|
||||
membershipd user list --store kv
|
||||
membershipd user revoke <64-hex>
|
||||
|
||||
common flags:
|
||||
--db <path> SQLite database path (--store sqlite; default ./local_files/unibus.db)
|
||||
|
||||
--store kv flags (defaults assume an on-node invocation):
|
||||
--nats-url <url> cluster NATS (default nats://127.0.0.1:4250)
|
||||
--internal-id-file <path> persisted internal service identity (default /opt/unibus/secrets/internal.id)
|
||||
--ca <path> CA cert pinning the data-plane TLS (default /opt/unibus/tls/ca.crt)
|
||||
--kv-replicas <n> KV replication factor, match the cluster (default 3)
|
||||
`)
|
||||
}
|
||||
|
||||
// defaultDBPath is the SQLite database location used as the default value of
// the --db flag by the user and migrate-to-kv subcommands.
const defaultDBPath = "./local_files/unibus.db"
|
||||
|
||||
// openStore opens the membership store at path, exiting on failure. Migrations
|
||||
// (including 002_users.sql) are applied by membership.Open, so a fresh database
|
||||
// gets the users table on first use of the CLI.
|
||||
func openStore(path string) membership.Store {
|
||||
store, err := membership.Open(path)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd user: open store %q: %v\n", path, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
return store
|
||||
}
|
||||
|
||||
// validateSignPubHex checks that signPub is the hex encoding of exactly 32
// bytes, the size of an Ed25519 public key (64 hex characters). It returns nil
// when the key is well-formed, an error wrapping the hex decoding failure when
// the input is not valid hex, and a length error when it decodes to any other
// size. Validating at seed time turns a silent "authorized nobody" into an
// explicit error.
func validateSignPubHex(signPub string) error {
	const wantBytes = 32 // raw length of an Ed25519 public key

	raw, decodeErr := hex.DecodeString(signPub)
	switch {
	case decodeErr != nil:
		return fmt.Errorf("sign-pub is not valid hex: %w", decodeErr)
	case len(raw) != wantBytes:
		return fmt.Errorf("sign-pub must be a 32-byte Ed25519 public key (64 hex chars), got %d bytes", len(raw))
	default:
		return nil
	}
}
|
||||
|
||||
// kvFlags groups the backend-selection and connection flags shared by the user
// subcommands. Each field points at the value of a flag registered by
// registerKVFlags, so it must only be dereferenced after the owning FlagSet has
// been parsed.
type kvFlags struct {
	store      *string // --store: backend name, "sqlite" or "kv"
	natsURL    *string // --nats-url: cluster NATS url for the kv backend
	internalID *string // --internal-id-file: persisted internal service identity
	ca         *string // --ca: CA certificate pinning TLS on the kv connection
	replicas   *int    // --kv-replicas: KV replication factor
}
|
||||
|
||||
func registerKVFlags(fs *flag.FlagSet) kvFlags {
|
||||
return kvFlags{
|
||||
store: fs.String("store", "sqlite", "user store backend: sqlite (local DB) | kv (the live cluster's replicated allowlist)"),
|
||||
natsURL: fs.String("nats-url", defaultClusterNatsURL, "cluster NATS url for --store kv"),
|
||||
internalID: fs.String("internal-id-file", defaultInternalIDFile, "persisted internal service identity for --store kv"),
|
||||
ca: fs.String("ca", defaultClusterCAFile, "CA cert pinning TLS on the --store kv NATS connection"),
|
||||
replicas: fs.Int("kv-replicas", 3, "KV replication factor for --store kv (match the cluster)"),
|
||||
}
|
||||
}
|
||||
|
||||
// resolveStore returns the membership store for the chosen backend plus a cleanup
|
||||
// func. For --store kv it opens the privileged connection to the live cluster; for
|
||||
// sqlite it opens the local file. It exits the process with a clear message on any
|
||||
// failure (a dead NATS, a missing identity file), so a broken --store kv add fails
|
||||
// loudly instead of silently — Error case of the GAP A DoD. The returned *kvConn
|
||||
// is non-nil only for the kv backend (so the caller can report replication).
|
||||
func resolveStore(cmd string, kf kvFlags, dbPath string) (membership.Store, *kvConn, func()) {
|
||||
switch *kf.store {
|
||||
case "sqlite":
|
||||
store := openStore(dbPath)
|
||||
return store, nil, func() { store.Close() }
|
||||
case "kv":
|
||||
kv, err := connectKVStore(*kf.natsURL, *kf.internalID, *kf.ca, *kf.replicas)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd %s: --store kv: %v\n", cmd, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
return kv.store, kv, kv.Close
|
||||
default:
|
||||
fmt.Fprintf(os.Stderr, "membershipd %s: --store must be \"sqlite\" or \"kv\", got %q\n", cmd, *kf.store)
|
||||
os.Exit(2)
|
||||
return nil, nil, func() {}
|
||||
}
|
||||
}
|
||||
|
||||
func userAdd(args []string) {
|
||||
fs := flag.NewFlagSet("user add", flag.ExitOnError)
|
||||
handle := fs.String("handle", "", "human-readable user name (required)")
|
||||
signPub := fs.String("sign-pub", "", "Ed25519 signing public key in hex (required)")
|
||||
role := fs.String("role", membership.RoleMember, "role: admin or member")
|
||||
dbPath := fs.String("db", defaultDBPath, "SQLite database path")
|
||||
kf := registerKVFlags(fs)
|
||||
_ = fs.Parse(args)
|
||||
|
||||
if *handle == "" || *signPub == "" {
|
||||
fmt.Fprintln(os.Stderr, "membershipd user add: --handle and --sign-pub are required")
|
||||
os.Exit(2)
|
||||
}
|
||||
if err := validateSignPubHex(*signPub); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd user add: %v\n", err)
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
store, kv, closeStore := resolveStore("user add", kf, *dbPath)
|
||||
defer closeStore()
|
||||
|
||||
if err := store.AddUser(*signPub, *handle, *role); err != nil {
|
||||
if errors.Is(err, membership.ErrUserExists) {
|
||||
// Idempotency contract (GAP A): re-adding the same key is an EXPLICIT,
|
||||
// non-destructive error — the existing row is left untouched (no silent
|
||||
// upsert that could flip a role or clobber status, which would corrupt the
|
||||
// allowlist). To replace a user, `user revoke <key>` then add again.
|
||||
fmt.Fprintf(os.Stderr, "membershipd user add: user %s already registered (unchanged); revoke it first to replace\n", *signPub)
|
||||
os.Exit(1)
|
||||
}
|
||||
fmt.Fprintf(os.Stderr, "membershipd user add: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
fmt.Printf("added user %q (%s) role=%s\n", *handle, *signPub, *role)
|
||||
if kv != nil {
|
||||
reportKVReplication(kv.js)
|
||||
}
|
||||
}
|
||||
|
||||
func userList(args []string) {
|
||||
fs := flag.NewFlagSet("user list", flag.ExitOnError)
|
||||
dbPath := fs.String("db", defaultDBPath, "SQLite database path")
|
||||
kf := registerKVFlags(fs)
|
||||
_ = fs.Parse(args)
|
||||
|
||||
store, _, closeStore := resolveStore("user list", kf, *dbPath)
|
||||
defer closeStore()
|
||||
|
||||
users, err := store.ListUsers()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd user list: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
if len(users) == 0 {
|
||||
fmt.Println("(no users)")
|
||||
return
|
||||
}
|
||||
w := tabwriter.NewWriter(os.Stdout, 0, 2, 2, ' ', 0)
|
||||
fmt.Fprintln(w, "HANDLE\tROLE\tSTATUS\tSIGN_PUB\tCREATED")
|
||||
for _, u := range users {
|
||||
fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", u.Handle, u.Role, u.Status, u.SignPub, u.CreatedAt)
|
||||
}
|
||||
_ = w.Flush()
|
||||
}
|
||||
|
||||
func userRevoke(args []string) {
|
||||
fs := flag.NewFlagSet("user revoke", flag.ExitOnError)
|
||||
dbPath := fs.String("db", defaultDBPath, "SQLite database path")
|
||||
kf := registerKVFlags(fs)
|
||||
|
||||
// Go's flag package stops at the first non-flag argument, so `revoke <key>
|
||||
// --db path` would otherwise leave --db unparsed. Pull a leading positional
|
||||
// (the sign-pub) off the front before parsing so both `revoke <key> --db p`
|
||||
// and `revoke --db p <key>` work for the operator.
|
||||
var signPub string
|
||||
if len(args) > 0 && !strings.HasPrefix(args[0], "-") {
|
||||
signPub, args = args[0], args[1:]
|
||||
}
|
||||
_ = fs.Parse(args)
|
||||
if signPub == "" {
|
||||
if rest := fs.Args(); len(rest) == 1 {
|
||||
signPub = rest[0]
|
||||
}
|
||||
}
|
||||
if signPub == "" {
|
||||
fmt.Fprintln(os.Stderr, "membershipd user revoke: exactly one <sign-pub> argument required")
|
||||
os.Exit(2)
|
||||
}
|
||||
if err := validateSignPubHex(signPub); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd user revoke: %v\n", err)
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
store, _, closeStore := resolveStore("user revoke", kf, *dbPath)
|
||||
defer closeStore()
|
||||
|
||||
if err := store.RevokeUser(signPub); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "membershipd user revoke: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
fmt.Printf("revoked user %s\n", signPub)
|
||||
}
|
||||
@@ -0,0 +1,151 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// users_kv.go is the `--store kv` half of the user administration CLI (issue 0011
|
||||
// gaps, GAP A): adding and listing bus users directly against the RUNNING
|
||||
// cluster's replicated JetStream KV allowlist, with no need to stop the cluster,
|
||||
// seed a standalone node, and restart (the procedure the 0011 deploy required).
|
||||
//
|
||||
// The mechanism is the cluster's own privileged internal connection. Under
|
||||
// enforce every bus user is confined by the per-subject ACL to the JetStream API
|
||||
// of its own rooms, so no ordinary identity may touch the control-plane buckets
|
||||
// (KV_UNIBUS_*). The ONLY identity the authenticator grants full JetStream
|
||||
// permissions is membershipd's internal service identity. By persisting that
|
||||
// identity to a file (membershipd --internal-id-file) the same key becomes
|
||||
// available to this CLI, which presents it as its NATS nkey and is therefore
|
||||
// recognized as the privileged internal client and allowed to read/write the KV.
|
||||
//
|
||||
// Intended invocation is over loopback on a cluster node (SSH): the data-plane
|
||||
// TLS certificate's SAN covers 127.0.0.1/localhost and the internal identity file
|
||||
// lives 0600 next to the node's TLS keys. Using the file requires root on the
|
||||
// node, which already implies full control of that node — so co-locating it adds
|
||||
// no practical exposure beyond what the TLS server key and cluster password
|
||||
// already represent.
|
||||
|
||||
// Deploy defaults for the --store kv path of the user CLI. They are chosen so
// that an invocation on a cluster node (over SSH) needs only
// --handle/--sign-pub/--role; override them for other layouts.
const (
	// defaultClusterNatsURL is the node-local NATS listener: the CLI is meant
	// to talk to the embedded server of the node it runs on.
	defaultClusterNatsURL = "nats://127.0.0.1:4250"

	// defaultInternalIDFile is where the privileged internal service identity
	// is expected to be persisted.
	defaultInternalIDFile = "/opt/unibus/secrets/internal.id"

	// defaultClusterCAFile is the CA certificate used to pin the data-plane TLS.
	defaultClusterCAFile = "/opt/unibus/tls/ca.crt"
)
|
||||
|
||||
// kvConn bundles the privileged NATS connection to a live cluster and the
|
||||
// KV-backed control-plane store opened over it. Close releases both.
|
||||
type kvConn struct {
|
||||
nc *nats.Conn
|
||||
js jetstream.JetStream
|
||||
store membership.Store
|
||||
}
|
||||
|
||||
func (k *kvConn) Close() {
|
||||
if k == nil {
|
||||
return
|
||||
}
|
||||
if k.store != nil {
|
||||
_ = k.store.Close()
|
||||
}
|
||||
if k.nc != nil {
|
||||
k.nc.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// connectKVStore opens the privileged internal connection to the cluster's NATS
|
||||
// and the JetStream KV control-plane store on top of it. internalIDFile is the
|
||||
// membershipd-persisted internal service identity whose nkey the authenticator
|
||||
// grants full permissions; caPath pins the data-plane TLS (empty only for a
|
||||
// non-TLS dev cluster). A non-loopback target without --ca is refused, mirroring
|
||||
// migrate-to-kv (audit 0008 N6): the allowlist write must not travel in cleartext.
|
||||
func connectKVStore(natsURL, internalIDFile, caPath string, replicas int) (*kvConn, error) {
|
||||
if internalIDFile == "" {
|
||||
return nil, fmt.Errorf("--internal-id-file is required for --store kv (the privileged identity membershipd persists with --internal-id-file)")
|
||||
}
|
||||
// Confidentiality guard: a remote NATS without TLS would expose the allowlist
|
||||
// (handles/roles/sign-pubs) and the privileged nkey handshake in cleartext.
|
||||
if !isLoopbackURL(natsURL) && caPath == "" {
|
||||
return nil, fmt.Errorf("refusing to connect to remote %q without --ca: the allowlist write would travel in cleartext — pin TLS with --ca, or run over a loopback --nats-url on a node", natsURL)
|
||||
}
|
||||
|
||||
id, err := client.LoadIdentity(internalIDFile)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("load internal identity: %w", err)
|
||||
}
|
||||
nkeyPub, nkeySign, err := busauth.ClientNkey(id.SignPriv)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("derive nkey from internal identity: %w", err)
|
||||
}
|
||||
opts := []nats.Option{
|
||||
nats.Name("membershipd-user-cli"),
|
||||
nats.Nkey(nkeyPub, nkeySign),
|
||||
}
|
||||
if caPath != "" {
|
||||
tlsCfg, err := busauth.LoadCATLSConfig(caPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("load CA %q: %w", caPath, err)
|
||||
}
|
||||
opts = append(opts, nats.Secure(tlsCfg))
|
||||
}
|
||||
nc, err := nats.Connect(natsURL, opts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("connect cluster NATS %q: %w", natsURL, err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return nil, fmt.Errorf("jetstream: %w", err)
|
||||
}
|
||||
store, err := membership.OpenJetStream(js, membership.JetStreamConfig{Replicas: replicas})
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return nil, fmt.Errorf("open KV control-plane store: %w", err)
|
||||
}
|
||||
return &kvConn{nc: nc, js: js, store: store}, nil
|
||||
}
|
||||
|
||||
// reportKVReplication prints the replication status of the allowlist bucket
|
||||
// stream (KV_UNIBUS_users) right after a write, so the operator sees the add
|
||||
// landed on a quorum and replicated to the followers — executable evidence that
|
||||
// the live-cluster add is HA, not single-node. Best-effort: a read failure is a
|
||||
// note, not an error (the write itself already succeeded).
|
||||
func reportKVReplication(js jetstream.JetStream) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
st, err := js.Stream(ctx, "KV_UNIBUS_users")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "note: could not read KV_UNIBUS_users stream info: %v\n", err)
|
||||
return
|
||||
}
|
||||
info, err := st.Info(ctx)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "note: could not read KV_UNIBUS_users stream info: %v\n", err)
|
||||
return
|
||||
}
|
||||
if info.Cluster == nil {
|
||||
fmt.Printf("KV_UNIBUS_users: standalone (R1, no cluster replication); msgs=%d\n", info.State.Msgs)
|
||||
return
|
||||
}
|
||||
current := 0
|
||||
for _, r := range info.Cluster.Replicas {
|
||||
if r.Current {
|
||||
current++
|
||||
}
|
||||
}
|
||||
fmt.Printf("KV_UNIBUS_users: leader=%s followers_current=%d/%d msgs=%d\n",
|
||||
info.Cluster.Leader, current, len(info.Cluster.Replicas), info.State.Msgs)
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// wireReplicatedNonces applies the cluster anti-replay policy to srv. It is the
|
||||
// single piece of wiring the binary uses to decide whether a node must share its
|
||||
// nonce store, extracted so a regression test exercises the EXACT decision the
|
||||
// running binary makes (issue 0006a, audit 0008 N3).
|
||||
//
|
||||
// Policy:
|
||||
// - A clustered node (clustered == true) MUST use the shared JetStream KV nonce
|
||||
// bucket. Every node sees the same bucket, so a request accepted on one node
|
||||
// cannot be replayed to another whose per-process cache never saw the nonce.
|
||||
// A missing JetStream context, or a failure to create the bucket, is a FATAL
|
||||
// configuration error returned to the caller — a clustered node running with a
|
||||
// per-process nonce cache is precisely the replay hole the audit flagged, so
|
||||
// it must refuse to start rather than serve insecurely.
|
||||
// - A standalone node (clustered == false) keeps the in-memory cache that
|
||||
// NewServer installed: there is no second node to replay to, so the shared
|
||||
// bucket would only add a JetStream dependency for no security gain.
|
||||
//
|
||||
// replicas is the nonce bucket's replication factor (R1..R3). Returns nil when no
|
||||
// action is required (standalone).
|
||||
func wireReplicatedNonces(srv *membership.Server, js jetstream.JetStream, clustered bool, replicas int) error {
|
||||
if !clustered {
|
||||
return nil // standalone: the in-memory nonce cache is sufficient and safe
|
||||
}
|
||||
if js == nil {
|
||||
return fmt.Errorf("clustered node requires JetStream for the shared nonce bucket, but none is available")
|
||||
}
|
||||
if err := srv.UseReplicatedNonces(js, replicas); err != nil {
|
||||
return fmt.Errorf("replicated nonces: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
+9
-1
@@ -23,6 +23,7 @@ func main() {
|
||||
ctrlURL = flag.String("ctrl-url", "http://127.0.0.1:8470", "membershipd control-plane url")
|
||||
roomSub = flag.String("room", "proc.test.ticks", "room subject to publish to")
|
||||
idFile = flag.String("id-file", "./local_files/worker.id", "identity file path")
|
||||
caFile = flag.String("ca", "", "path to the bus CA cert (ca.crt); set to connect with TLS + nkey to a secured bus")
|
||||
)
|
||||
flag.Parse()
|
||||
|
||||
@@ -33,7 +34,7 @@ func main() {
|
||||
if err != nil {
|
||||
log.Fatalf("identity: %v", err)
|
||||
}
|
||||
c, err := client.New(*natsURL, *ctrlURL, id)
|
||||
c, err := client.Connect(*natsURL, *ctrlURL, id, *caFile)
|
||||
if err != nil {
|
||||
log.Fatalf("connect: %v", err)
|
||||
}
|
||||
@@ -46,6 +47,13 @@ func main() {
|
||||
if err != nil {
|
||||
log.Fatalf("create room: %v", err)
|
||||
}
|
||||
// Membership-change contract (issue 0006e): the bus freezes per-subject
|
||||
// permissions at connect time, and this room did not exist then. Refresh the
|
||||
// session so the new room's subject becomes publishable under enforce+ACL. On
|
||||
// an unsecured/dev bus this is a harmless reconnect.
|
||||
if err := c.RefreshSession(); err != nil {
|
||||
log.Fatalf("refresh session after create room: %v", err)
|
||||
}
|
||||
log.Printf("room %q -> %s (subject %s, cleartext)", *roomSub, roomID, *roomSub)
|
||||
|
||||
stop := make(chan os.Signal, 1)
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
# Running membershipd as a systemd user service
|
||||
|
||||
`membershipd` is the unibus control plane (rooms, members, sealed keys, blob
|
||||
store) and, unless you point it at an external NATS with `--nats-url`, it also
|
||||
runs the embedded NATS + JetStream data plane. Running it as a **systemd user
|
||||
service** keeps it alive across logout/reboot and restarts it if it crashes.
|
||||
|
||||
The unit (`unibus-membershipd.service`) binds both planes to `0.0.0.0`:
|
||||
|
||||
| Plane | Port | Reachable from |
|
||||
|--------------|-------|----------------|
|
||||
| HTTP control | 8470 | LAN (`http://<host-ip>:8470/healthz`) |
|
||||
| NATS data | 4250 | LAN (`nats://<host-ip>:4250`) |
|
||||
|
||||
## Install (idempotent)
|
||||
|
||||
```bash
|
||||
cd ~/fn_registry/projects/message_bus/apps/unibus
|
||||
./deploy/install.sh
|
||||
```
|
||||
|
||||
This builds the binary, symlinks the unit into `~/.config/systemd/user/`,
|
||||
reloads systemd, and enables + starts the service.
|
||||
|
||||
## Manual steps (what install.sh does)
|
||||
|
||||
```bash
|
||||
cd ~/fn_registry/projects/message_bus/apps/unibus
|
||||
|
||||
# 1. Build the pure-Go binary (no CGO).
|
||||
CGO_ENABLED=0 go build -o membershipd ./cmd/membershipd
|
||||
|
||||
# 2. Link the unit into the systemd user directory.
|
||||
mkdir -p ~/.config/systemd/user
|
||||
ln -sf "$PWD/deploy/unibus-membershipd.service" ~/.config/systemd/user/unibus-membershipd.service
|
||||
|
||||
# 3. Reload, enable (start on login) and start now.
|
||||
systemctl --user daemon-reload
|
||||
systemctl --user enable --now unibus-membershipd.service
|
||||
|
||||
# (optional) survive logout without an active session:
|
||||
# sudo loginctl enable-linger "$USER"
|
||||
```
|
||||
|
||||
## Operate
|
||||
|
||||
```bash
|
||||
systemctl --user status unibus-membershipd.service # is it active?
|
||||
systemctl --user restart unibus-membershipd.service # after a rebuild
|
||||
systemctl --user stop unibus-membershipd.service
|
||||
systemctl --user disable unibus-membershipd.service # stop starting on login
|
||||
journalctl --user -u unibus-membershipd.service -f # follow logs
|
||||
|
||||
# Health (local and from another LAN host):
|
||||
curl -fsS http://127.0.0.1:8470/healthz
|
||||
curl -fsS http://<host-lan-ip>:8470/healthz
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- Writable state (SQLite DB, JetStream store, blobs) lives under `local_files/`
|
||||
relative to `WorkingDirectory`, which the unit sets to the app directory.
|
||||
- After editing the app code, rebuild (`CGO_ENABLED=0 go build -o membershipd
|
||||
./cmd/membershipd`) and `systemctl --user restart unibus-membershipd.service`.
|
||||
- To run against an external NATS instead of the embedded one, append
|
||||
`--nats-url nats://<host>:4222` to `ExecStart` and re-run `daemon-reload` +
|
||||
`restart`.
|
||||
|
||||
## Clustering (HA) — see `deploy/cluster/`
|
||||
|
||||
The single-node service above is secure on its own. Running unibus as a
|
||||
multi-node **cluster** has extra hardening rules (issues 0006a–0006f); the full
|
||||
runbook and the generated material live in `deploy/cluster/`. Key points an
|
||||
operator must know:
|
||||
|
||||
- **Homogeneous posture (0006d).** Every node MUST run `--bus-auth enforce` (the
|
||||
binary refuses to join a cluster otherwise) and present mutual route TLS on a
|
||||
public bind. `/healthz` publishes each node's `posture` so a monitor can flag a
|
||||
node that is not `enforce`+`acl`+`tls`.
|
||||
- **Separate route CA (0006f).** The cluster route layer authenticates *nodes*,
|
||||
not bus users — sign the route certs with a **dedicated cluster CA**
|
||||
(`--route-tls-ca`), NOT the client data-plane CA (`--tls-cert`'s CA). Keeping
|
||||
the two trust roots separate means a client cert can never be presented to the
|
||||
route port. `deploy/cluster/generate-cluster-certs.sh` builds this CA.
|
||||
- **Secret out of argv (0006f).** Pass the route password via
|
||||
`--cluster-pass-file` or the `UNIBUS_CLUSTER_PASS` env var, NOT `--cluster-pass`
|
||||
or a `nats://user:pass@host` in `--routes` (both are visible in `ps`/journald).
|
||||
When the secret comes from a file/env, list peers as bare `--routes
|
||||
nats://<host>:6250` and the binary injects the credentials.
|
||||
- **`migrate-to-kv` confidentiality (0006f).** The migration writes the allowlist
|
||||
(handles/roles/sign pubs) into KV. Run it only against a **loopback** nats-url,
|
||||
or pin TLS with `--ca` for a remote target — otherwise that metadata travels in
|
||||
cleartext. The binary refuses a remote target without `--ca`.
|
||||
- **R1 is NOT HA (0006a/N3-DoS).** With `--kv-replicas 1` the control plane
|
||||
(including the nonce bucket) is a single point of failure: if the node owning
|
||||
the stream dies, every authenticated request fails closed (auth DoS). Real HA
|
||||
needs **R3** (quorum 2/3): raise replicas in place with `nats stream update
|
||||
--replicas 3` once the third node has joined. Do not advertise R1 as HA.
|
||||
@@ -0,0 +1,7 @@
|
||||
# Generated TLS material and secrets — NEVER commit (audit 0008: keys/secret).
|
||||
out/
|
||||
build/
|
||||
secrets/
|
||||
*.key
|
||||
*.srl
|
||||
cluster-ca.crt
|
||||
@@ -0,0 +1,285 @@
|
||||
# unibus cluster — 3-node deploy runbook (issue 0006g)
|
||||
|
||||
This directory holds the material to bring up unibus as a **3-node cluster**
|
||||
(`magnus` + `homer` + `datardos`) for real HA: with **R3** replication the control
|
||||
plane (rooms/members/keys/users on JetStream KV + the anti-replay nonce bucket)
|
||||
survives the loss of any one node (quorum 2/3).
|
||||
|
||||
> **Status: this cluster is DEPLOYED in production** (magnus + homer + datardos,
|
||||
> R3, enforce+ACL+TLS) — see report 0011. The runbook below was authored before any
|
||||
> VPS existed and has since been **corrected against the real deploy** (report 0012):
|
||||
> the start ordering, the R1→R3 reality, and the live user-add path were all wrong
|
||||
> or missing. Steps that change a remote host are marked **HUMAN**; `deploy-cluster.sh`
|
||||
> still defaults to a dry run.
|
||||
|
||||
## Files
|
||||
|
||||
| File | What it is |
|
||||
|---|---|
|
||||
| `nodes.env` | Topology: cluster name, ports, and the per-node rows (name, ssh host, public IP, WG IP). **HUMAN fills the placeholders.** |
|
||||
| `generate-cluster-certs.sh` | Mints a **separate cluster route CA** + a route cert per node, and a data-plane server cert per node signed by the **client CA** (`../tls/ca.*`). |
|
||||
| `membershipd-cluster.service` | One systemd unit, parameterized per node by `/opt/unibus/cluster.env`. enforce + per-subject ACL + TLS + `--store kv`, `Restart=always`. |
|
||||
| `deploy-cluster.sh` | Cross-builds the linux binary, generates each node's `cluster.env`, and (with `--yes`) rsyncs everything + installs the unit. Staggered start is manual. |
|
||||
|
||||
Generated keys/secrets (`out/`, `build/`, `secrets/`) are **gitignored** — they are
|
||||
secret and never leave the operator's trusted machine except over the secure
|
||||
rsync channel.
|
||||
|
||||
## Topology (as deployed, report 0011)
|
||||
|
||||
| Node | SSH | Public IP | Role |
|
||||
|---|---|---|---|
|
||||
| magnus | `magnus` (root) | `135.125.201.30` | node — **= organic-machine.com = `om`**, the critical host (caddy + gitea + registry-api + monitoring); the bus runs alongside, untouched |
|
||||
| homer | `homer` (ubuntu+sudo) | `141.94.69.66` | node |
|
||||
| datardos | `dd` (ubuntu+sudo) | `51.91.100.142` | node |
|
||||
|
||||
`ROUTE_NETWORK=public`, **not `wg`**: there is no WireGuard mesh between the three
|
||||
nodes (homer and datardos do not even have the `wg` binary; om's only WG peers are
|
||||
the operator's PCs). The server-to-server routes therefore travel over the public
|
||||
IPs, protected by the **separate cluster route CA** (mutual route TLS) — a client
|
||||
data-plane cert can never be presented to the route port. The client data plane and
|
||||
the HTTP control plane are also reached over the public IPs. There is no fixed
|
||||
"seed" node: with R3 the three are peers (see "Bring up" for why a lone node cannot
|
||||
self-serve).
|
||||
|
||||
## Prerequisites (HUMAN, once)
|
||||
|
||||
1. **Fill `nodes.env`** — replace every `<PLACEHOLDER>` (magnus public IP, all WG
|
||||
IPs). The scripts refuse to run while any remain.
|
||||
2. **Client CA exists** — `../tls/ca.crt` + `../tls/ca.key`. If not, run
|
||||
`../tls/generate-certs.sh` on the CA host (om) first. The cluster reuses this CA
|
||||
for the data plane so existing clients keep trusting the bus.
|
||||
3. **Mint cluster TLS**:
|
||||
```bash
|
||||
./generate-cluster-certs.sh # writes out/<name>/ ; --force to rotate the cluster CA
|
||||
```
|
||||
4. **Create the route secret** (out of argv, shared by all nodes):
|
||||
```bash
|
||||
mkdir -p secrets && openssl rand -hex 32 > secrets/cluster.pass
|
||||
```
|
||||
5. **SSH** to each node's SSH host as `root` works (`ssh magnus true`, `ssh dd true`, ...).
|
||||
|
||||
## Stage the nodes
|
||||
|
||||
```bash
|
||||
./deploy-cluster.sh # DRY RUN — prints the full plan, touches nothing
|
||||
./deploy-cluster.sh --yes # HUMAN: actually rsync + install the unit on all 3 nodes
|
||||
```
|
||||
|
||||
This cross-builds `membershipd` (linux/amd64, `CGO_ENABLED=0`), writes each node's
|
||||
`cluster.env` (its `NODE_NAME` and the `--routes` to the OTHER two nodes), and
|
||||
ships the binary, the node's TLS material, the secret, the env file and the unit.
|
||||
It does **not** start anything.
|
||||
|
||||
## Seed the first admin into the KV (HUMAN — loopback bootstrap)
|
||||
|
||||
The empty KV control plane has no users, and under `enforce` no external tool can
|
||||
write the FIRST admin over NATS (it would need to be an admin already — a
|
||||
chicken-and-egg). The `user` CLI also writes only to a local SQLite file, not the
|
||||
KV. So the first admin is seeded on one node (magnus below) through a **loopback, no-auth
|
||||
bootstrap** that populates the same JetStream store the cluster unit then reuses:
|
||||
|
||||
```bash
|
||||
ssh root@magnus 'bash -s' <<'SEED'
|
||||
set -euo pipefail
|
||||
cd /opt/unibus
|
||||
# a) Put the first admin into a local SQLite seed file.
|
||||
./membershipd user add --db ./seed.db --handle root --sign-pub <ADMIN_SIGN_PUB_HEX> --role admin
|
||||
# b) Bring up a TEMPORARY loopback, no-auth, single-node KV server on the cluster's
|
||||
# own JetStream store dir (not exposed; bus-auth off is allowed on 127.0.0.1).
|
||||
./membershipd --store kv --bus-auth off --bind 127.0.0.1 \
|
||||
--nats-store ./local_files/jetstream --db ./seed.db >/tmp/seed-boot.log 2>&1 &
|
||||
BOOT=$!; sleep 2
|
||||
# c) Migrate the admin from SQLite into the replicated KV (loopback — no --ca needed).
|
||||
./membershipd migrate-to-kv --db ./seed.db --nats-url nats://127.0.0.1:4250 --replicas 1
|
||||
# d) Stop the bootstrap server. The KV buckets persist in ./local_files/jetstream.
|
||||
kill "$BOOT"; wait "$BOOT" 2>/dev/null || true
|
||||
rm -f ./seed.db
|
||||
SEED
|
||||
```
|
||||
|
||||
> The KV written here lives in `./local_files/jetstream`, which the cluster unit
|
||||
> reuses (`--nats-store` default), so the admin is present when the enforce cluster
|
||||
> starts. This loopback bootstrap is needed ONLY for the very first admin (the
|
||||
> chicken-and-egg). **Every user after that is added with the cluster live** — no
|
||||
> stop-seed-restart — via `user add --store kv` (see "Add users to the live
|
||||
> cluster" below, report 0012).
|
||||
|
||||
## Bring up (HUMAN)
|
||||
|
||||
> **CORRECTION (report 0012).** The original instruction — "start magnus alone and
|
||||
> verify healthz, then add the others" — is **WRONG and will look like a hung
|
||||
> deploy.** A 3-node JetStream cluster forms a RAFT meta-group that needs a quorum
|
||||
> (2 of 3) to elect a leader. A single started node has no quorum, so its JetStream
|
||||
> meta never becomes current: `--store kv` blocks creating the KV buckets and
|
||||
> **`/healthz` never returns ok** until a second node joins. Waiting for magnus to
|
||||
> "go green" before starting the others therefore deadlocks the rollout.
|
||||
|
||||
Start the nodes so a quorum forms. On a **clean cluster** the simplest correct
|
||||
procedure is to start all three close together and let the meta-group converge:
|
||||
|
||||
```bash
|
||||
# Start all three (order does not matter); each blocks on the others until a
|
||||
# 2/3 quorum elects a JetStream meta leader, then the KV buckets are created.
|
||||
for h in magnus homer datardos; do ssh "$h" 'sudo systemctl enable --now membershipd-cluster'; done
|
||||
|
||||
# Only NOW does healthz return ok — once the meta-group has a leader (give it
|
||||
# ~10-30s on a cold start). Poll, do not assume the first node is broken.
|
||||
for h in magnus homer datardos; do
|
||||
echo "== $h =="; ssh "$h" 'curl -fsS https://127.0.0.1:8470/healthz --cacert /opt/unibus/tls/ca.crt || echo "(not ready yet — needs quorum)"'
|
||||
done
|
||||
```
|
||||
|
||||
A **staggered** start also works, but only because `membershipd`'s KV open RETRIES
|
||||
the bucket creation for a 120s bootstrap budget (issue 0006g, fix #3): the first
|
||||
node sits in that retry loop — NOT serving healthz — until the second node makes a
|
||||
quorum, then both converge and the third catches up. Either way, a lone node never
|
||||
self-serves; do not gate the next node's start on the previous one's healthz.
|
||||
|
||||
> A cold multi-node start only converges because of **three cold-start fixes**
|
||||
> (report 0011): route pooling off (`PoolSize=-1`), `NoAdvertise=true` (Docker
|
||||
> bridge IPs not gossiped), and the KV-open retry loop above. Without them the
|
||||
> meta-group re-elects leaders forever and bucket creation hangs. If a fresh
|
||||
> cluster will not form, confirm the running binary contains these fixes before
|
||||
> touching config.
|
||||
|
||||
## Promote an existing single-node (SQLite) deployment (HUMAN, optional)
|
||||
|
||||
Instead of seeding fresh, you can migrate an existing single-node `unibus.db` into
|
||||
the KV — **loopback only** (the allowlist would otherwise travel cleartext; the
|
||||
command refuses a remote target without `--ca`). Use the same loopback-bootstrap
|
||||
shape as the seed step (temporary `--bus-auth off` server on 127.0.0.1, then
|
||||
`migrate-to-kv --db /opt/unibus/local_files/unibus.db`).
|
||||
|
||||
## Verify
|
||||
|
||||
```bash
|
||||
# Posture on every node — all must be enforce+acl+tls+cluster, store=kv.
|
||||
for h in magnus homer datardos; do
|
||||
echo "== $h =="
|
||||
ssh root@$h 'curl -fsS https://127.0.0.1:8470/healthz --cacert /opt/unibus/tls/ca.crt'
|
||||
done
|
||||
|
||||
# Cluster + JetStream meta-group health (needs the `nats` CLI on a node):
|
||||
ssh root@magnus 'nats --server nats://127.0.0.1:4250 server report jetstream'
|
||||
ssh root@magnus 'nats --server nats://127.0.0.1:4250 server list' # 3 servers, routes up
|
||||
```
|
||||
|
||||
A healthy cluster shows 3 routed servers and a JetStream meta-group with a leader.
|
||||
|
||||
## Add users to the live cluster (HUMAN — `user add --store kv`)
|
||||
|
||||
With the cluster up, add (and revoke) bus users **without stopping anything**,
|
||||
directly against the replicated KV allowlist. This replaces the stop-seed-restart
|
||||
procedure the original runbook implied for every user beyond the first admin.
|
||||
|
||||
The mechanism is the cluster's own **privileged internal connection**: under
|
||||
`enforce` every bus user is confined by the per-subject ACL to its own rooms, so no
|
||||
ordinary identity may write the control-plane buckets. The only identity the
|
||||
authenticator grants full JetStream permissions is `membershipd`'s internal service
|
||||
identity. The unit persists that identity to `${INTERNAL_ID_FILE}`
|
||||
(`/opt/unibus/secrets/internal.id`, 0600) via `--internal-id-file`, so the same key
|
||||
is available to the CLI. Run the CLI **on a node, over loopback** (the data-plane
|
||||
TLS cert SAN covers `127.0.0.1`); reading the identity file requires root on that
|
||||
node, which already implies full control of it, so this adds no practical exposure.
|
||||
|
||||
```bash
|
||||
# Add a member to the live cluster's replicated allowlist (run on any node).
|
||||
ssh root@magnus 'sudo /opt/unibus/membershipd user add --store kv \
|
||||
--handle alice --role member --sign-pub <64-hex-ed25519-pub>'
|
||||
# -> added user "alice" (...) role=member
|
||||
# -> KV_UNIBUS_users: leader=<node> followers_current=2/2 msgs=N (replicated, HA)
|
||||
|
||||
# List / revoke against the same live KV:
|
||||
ssh root@magnus 'sudo /opt/unibus/membershipd user list --store kv'
|
||||
ssh root@magnus 'sudo /opt/unibus/membershipd user revoke --store kv <64-hex-ed25519-pub>'
|
||||
```
|
||||
|
||||
Defaults assume an on-node invocation (`--nats-url nats://127.0.0.1:4250`,
|
||||
`--internal-id-file /opt/unibus/secrets/internal.id`, `--ca /opt/unibus/tls/ca.crt`,
|
||||
`--kv-replicas 3`). Semantics:
|
||||
|
||||
- **Idempotent / non-destructive**: re-adding the same key is an explicit
|
||||
`already registered` error (exit 1), never a silent overwrite — a re-add cannot
|
||||
flip a member to admin. To replace a user, `revoke` then add.
|
||||
- **HA**: the write commits through the JetStream quorum, so it succeeds even with
|
||||
one node down (2/3); the printed `followers_current` shows replication.
|
||||
- **No hard delete**: `revoke` flips status to `revoked` (denied on both planes,
|
||||
auditable); the KV has no row deletion, matching the SQLite store.
|
||||
|
||||
> **Rollout note (report 0012):** the live verification deployed this binary +
|
||||
> `--internal-id-file` to **datardos only** (the non-critical node). magnus and
|
||||
> homer still run the 0011 binary. To make the capability (and the updated unit)
|
||||
> available on all three — recommended but not urgent, as the posture is identical — roll
|
||||
> the new binary with backups, one node at a time, verifying healthz between each:
|
||||
> ```bash
|
||||
> for h in homer magnus; do
|
||||
> ssh "$h" 'sudo cp -a /opt/unibus/membershipd /opt/unibus/membershipd.bak' # backup
|
||||
> scp build/membershipd "$h:/tmp/m" && ssh "$h" 'sudo install -o ubuntu -g ubuntu -m0775 /tmp/m /opt/unibus/membershipd'
|
||||
> # add INTERNAL_ID_FILE=/opt/unibus/secrets/internal.id to /opt/unibus/cluster.env
|
||||
> # add `--internal-id-file ${INTERNAL_ID_FILE} \` to the unit before `--store kv`
|
||||
> ssh "$h" 'sudo systemctl daemon-reload && sudo systemctl restart membershipd-cluster'
|
||||
> ssh "$h" 'curl -fsS https://127.0.0.1:8470/healthz --cacert /opt/unibus/tls/ca.crt' # green before next
|
||||
> done
|
||||
> ```
|
||||
> (`deploy-cluster.sh` + the unit template already emit `INTERNAL_ID_FILE` and the
|
||||
> flag, so a fresh `./deploy-cluster.sh --yes` is correct for all three.)
|
||||
|
||||
## Replication: go straight to R3 (HUMAN — real HA)
|
||||
|
||||
> **CORRECTION (report 0012).** The original "start at R1, then scale to R3" plan
|
||||
> assumed R1 is a usable interim state. **It is not, in this cluster.** At R1 all six
|
||||
> control-plane buckets (`KV_UNIBUS_users/rooms/members/room_keys/rooms_by_member`
|
||||
> + `KV_UNIBUS_nonces`) live on a SINGLE node — a hard **SPOF for authentication**:
|
||||
> if that node dies, the nonce/KV control plane is unreachable and EVERY
|
||||
> authenticated request fails closed (auth DoS). Worse, the cold multi-node start
|
||||
> only converges at all because of the three cold-start fixes (see "Bring up"); the
|
||||
> real deploy never ran a healthy R1 and **jumped straight to R3 once the cluster
|
||||
> formed.** Treat R1 as a transient artifact of bucket creation, not a milestone.
|
||||
|
||||
The deployed config already sets `KV_REPLICAS=3` in `nodes.env`. If buckets were
|
||||
created at R1 (e.g. only one node was up when `--store kv` first opened them), raise
|
||||
every control-plane stream to R3 IN PLACE (no data loss) once all three nodes are
|
||||
routed:
|
||||
|
||||
```bash
|
||||
for s in KV_UNIBUS_users KV_UNIBUS_rooms KV_UNIBUS_members KV_UNIBUS_room_keys \
|
||||
KV_UNIBUS_rooms_by_member KV_UNIBUS_nonces; do
|
||||
ssh root@magnus "nats --server nats://127.0.0.1:4250 stream update $s --replicas 3 -f"
|
||||
done
|
||||
# (also OBJ_UNIBUS_blobs if the object store is in use)
|
||||
```
|
||||
|
||||
After this each bucket shows `followers_current=2/2` (quorum 2/3). The
|
||||
`user add --store kv` command prints that figure for `KV_UNIBUS_users` on every add,
|
||||
which is a cheap live HA check.
|
||||
|
||||
## Chaos test (HUMAN — requires the 3 live VPS)
|
||||
|
||||
Validate quorum tolerance after R3:
|
||||
|
||||
```bash
|
||||
# Kill one node; the cluster keeps serving (quorum 2/3). On ubuntu nodes use sudo.
|
||||
ssh dd 'sudo systemctl stop membershipd-cluster'
|
||||
# -> clients fail over (multiple seed URLs); reads/writes still succeed.
|
||||
ssh dd 'sudo systemctl start membershipd-cluster' # rejoins, catches up
|
||||
|
||||
# Kill two nodes; quorum is LOST — the control plane should fail CLOSED (deny),
|
||||
# never fail open. Verify a request is rejected, not silently served.
|
||||
```
|
||||
|
||||
> **Validated (report 0012).** The 0011 chaos run checked only the control plane
|
||||
> (healthz + meta/stream-leader failover + KV readable with 2/3). Report 0012 added
|
||||
> the missing data-plane proofs against the live cluster: a real authenticated
|
||||
> client (`cmd/clientcheck`, operator identity, nkey+TLS) creating an E2E room and
|
||||
> publishing/subscribing — including a node stopped mid-stream, where the client
|
||||
> failed over to a survivor and kept receiving with zero loss (quorum 2/3) — and
|
||||
> `user add --store kv` committing with one node (the KV leader) down. The kill-2/3
|
||||
> fail-closed case remains a documented manual step.
|
||||
|
||||
## Rollback
|
||||
|
||||
`membershipd` does not delete data. To revert a node to standalone SQLite, stop
|
||||
the unit and start it without `--store kv`/`--cluster-name`; the KV buckets remain
|
||||
for a later retry. To rotate the cluster CA, re-run `generate-cluster-certs.sh
|
||||
--force` and re-stage (every node must get the new `cluster-ca.crt` together).
|
||||
Executable
+130
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# deploy-cluster.sh — cross-build membershipd and stage it onto the three cluster
|
||||
# nodes (issue 0006g). DEFAULT IS DRY-RUN: it prints the plan and touches nothing.
|
||||
# Pass --yes to actually rsync + run remote commands. Steps that a HUMAN must run
|
||||
# (or confirm) are marked "HUMAN:".
|
||||
#
|
||||
# Prerequisites (HUMAN, once):
|
||||
# 1. Fill nodes.env (no <PLACEHOLDER> left).
|
||||
# 2. ./generate-cluster-certs.sh (mints out/<name>/ TLS material)
|
||||
# 3. Create the route secret locally: mkdir -p secrets && openssl rand -hex 32 > secrets/cluster.pass
|
||||
# (secrets/ is gitignored; it is rsynced to each node as cluster.pass)
|
||||
# 4. SSH access to every node's SSH_HOST with sudo-less root (SSH_USER=root).
|
||||
#
|
||||
# What it does per node (with --yes):
|
||||
# - rsync the membershipd binary, the node's TLS material, the unit, the
|
||||
# generated cluster.env and the route secret into REMOTE_DIR.
|
||||
# - install + daemon-reload the systemd unit.
|
||||
# Starting the units is left to the human (see README "Bring up"): seed the first
|
||||
# admin via the loopback bootstrap, then start all three nodes so a quorum forms.
|
||||
set -euo pipefail
|
||||
|
||||
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
cd "$DIR"
|
||||
|
||||
# shellcheck source=/dev/null
|
||||
source ./nodes.env
|
||||
|
||||
APPLY=0
|
||||
[[ "${1:-}" == "--yes" ]] && APPLY=1
|
||||
|
||||
if grep -q '<[A-Z_]\+>' nodes.env; then
|
||||
echo "ERROR: nodes.env still has <PLACEHOLDER> values — fill them in first." >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
SECRET_FILE="secrets/cluster.pass"
|
||||
if [[ ! -f "$SECRET_FILE" ]]; then
|
||||
echo "ERROR: $SECRET_FILE missing. HUMAN: mkdir -p secrets && openssl rand -hex 32 > $SECRET_FILE" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
run() {
  # Always show the command line; execute it only when --yes set APPLY=1.
  # In dry-run mode the function succeeds without running anything.
  printf ' + %s\n' "$*"
  [[ $APPLY -eq 1 ]] || return 0
  "$@"
}
|
||||
|
||||
echo "==> [1/3] cross-build membershipd (linux/amd64, CGO disabled)"
|
||||
run env CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o build/membershipd ../../cmd/membershipd
|
||||
|
||||
# Build the comma-separated route list for a node = the OTHER nodes' addresses on
|
||||
# the chosen network, with NO userinfo (the secret is injected by membershipd from
|
||||
# the file). Prints only the route list on stdout (no trailing comma).
|
||||
routes_for() {
  # $1 = name of the node the list is for; that node's own row is skipped.
  # Reads CLUSTER_NODES rows ("name ssh public-ip wg-ip"), ROUTE_NETWORK and
  # NATS_ROUTE_PORT from the sourced nodes.env.
  local self="$1"
  local joined="" sep=""
  local row name _ssh pub wg addr
  for row in "${CLUSTER_NODES[@]}"; do
    read -r name _ssh pub wg <<<"$row"
    if [[ "$name" == "$self" ]]; then
      continue
    fi
    # "public" selects the public IP column; any other value selects the WG IP.
    case "$ROUTE_NETWORK" in
      public) addr="$pub" ;;
      *) addr="$wg" ;;
    esac
    # Prefix a separator from the second entry on, so no trailing comma is built.
    joined+="${sep}nats://${addr}:${NATS_ROUTE_PORT}"
    sep=","
  done
  printf '%s\n' "$joined"
}
|
||||
|
||||
echo "==> [2/3] stage each node (REMOTE_DIR=$REMOTE_DIR)"
|
||||
for row in "${CLUSTER_NODES[@]}"; do
|
||||
read -r name ssh _pub _wg <<<"$row"
|
||||
target="${SSH_USER}@${ssh}"
|
||||
nodedir="out/${name}"
|
||||
if [[ ! -d "$nodedir" ]]; then
|
||||
echo "ERROR: $nodedir missing — run ./generate-cluster-certs.sh first." >&2
|
||||
exit 2
|
||||
fi
|
||||
routes="$(routes_for "$name")"
|
||||
|
||||
echo "-- node ${name} (ssh ${ssh}) routes=${routes}"
|
||||
|
||||
# Generate this node's cluster.env locally, then ship it.
|
||||
envfile="build/cluster-${name}.env"
|
||||
mkdir -p build
|
||||
cat > "$envfile" <<EOF
|
||||
NODE_NAME=${name}
|
||||
CLUSTER_NAME=${CLUSTER_NAME}
|
||||
CLUSTER_USER=${CLUSTER_USER}
|
||||
KV_REPLICAS=${KV_REPLICAS}
|
||||
HTTP_PORT=${HTTP_PORT}
|
||||
NATS_CLIENT_PORT=${NATS_CLIENT_PORT}
|
||||
NATS_ROUTE_PORT=${NATS_ROUTE_PORT}
|
||||
ROUTES=${routes}
|
||||
CLUSTER_PASS_FILE=${REMOTE_DIR}/secrets/cluster.pass
|
||||
TLS_CERT=${REMOTE_DIR}/tls/server-${name}.crt
|
||||
TLS_KEY=${REMOTE_DIR}/tls/server-${name}.key
|
||||
ROUTE_TLS_CERT=${REMOTE_DIR}/tls/route-${name}.crt
|
||||
ROUTE_TLS_KEY=${REMOTE_DIR}/tls/route-${name}.key
|
||||
ROUTE_TLS_CA=${REMOTE_DIR}/tls/cluster-ca.crt
|
||||
INTERNAL_ID_FILE=${REMOTE_DIR}/secrets/internal.id
|
||||
EOF
|
||||
|
||||
run ssh "$target" "mkdir -p ${REMOTE_DIR}/tls ${REMOTE_DIR}/secrets"
|
||||
run rsync -az build/membershipd "${target}:${REMOTE_DIR}/membershipd"
|
||||
run rsync -az "${nodedir}/" "${target}:${REMOTE_DIR}/tls/"
|
||||
run rsync -az "$SECRET_FILE" "${target}:${REMOTE_DIR}/secrets/cluster.pass"
|
||||
run rsync -az "$envfile" "${target}:${REMOTE_DIR}/cluster.env"
|
||||
run rsync -az membershipd-cluster.service "${target}:/etc/systemd/system/membershipd-cluster.service"
|
||||
run ssh "$target" "chmod 600 ${REMOTE_DIR}/secrets/cluster.pass ${REMOTE_DIR}/tls/*.key && systemctl daemon-reload"
|
||||
done
|
||||
|
||||
echo "==> [3/3] staged."
|
||||
if [[ $APPLY -eq 0 ]]; then
|
||||
echo " DRY-RUN: nothing was sent. Re-run with --yes to apply."
|
||||
fi
|
||||
cat <<'NEXT'
|
||||
|
||||
HUMAN — bring up (see README "Bring up" — a LONE node has no quorum and never
|
||||
serves healthz, so do NOT gate the next node on the previous one going green):
|
||||
1. Seed the FIRST admin into the KV via the loopback bootstrap (README
|
||||
"Seed the first admin"); this is needed only for the chicken-and-egg admin.
|
||||
2. Start all three so a 2/3 quorum forms (order does not matter); healthz
|
||||
turns ok only once the meta-group elects a leader (~10-30s cold):
|
||||
for h in magnus homer datardos; do ssh "$h" 'sudo systemctl enable --now membershipd-cluster'; done
|
||||
3. Verify posture + quorum (README "Verify").
|
||||
4. Ensure R3 on every control-plane stream (README "Replication: go straight to
|
||||
R3"); R1 is a SPOF, not a milestone.
|
||||
5. Add further users with the cluster LIVE — no restart — via
|
||||
`membershipd user add --store kv` (README "Add users to the live cluster").
|
||||
NEXT
|
||||
Executable
+120
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# generate-cluster-certs.sh — mint the TLS material for a unibus 3-node cluster
|
||||
# (issue 0006g). Run ONCE on a trusted machine (e.g. om, which custodies the bus
|
||||
# CA); distribute the per-node output to each node over a secure channel. This
|
||||
# script touches NO remote host.
|
||||
#
|
||||
# It produces two trust roots, kept SEPARATE on purpose (audit 0008 N1-low):
|
||||
#
|
||||
# 1. The CLUSTER route CA (cluster-ca.crt/key, generated here): signs each
|
||||
# node's ROUTE certificate. The route layer authenticates NODES, not bus
|
||||
# users, so it must NOT share the client data-plane CA — a client cert can
|
||||
# then never be presented to the route port.
|
||||
# 2. The CLIENT data-plane CA (../tls/ca.crt/key, the one clients pin): signs
|
||||
# each node's DATA-PLANE server certificate. Reused, not regenerated, so
|
||||
# existing clients keep trusting the bus.
|
||||
#
|
||||
# Per node it emits, under out/<name>/:
|
||||
# route-<name>.crt/key route cert (cluster CA), EKU server+clientAuth
|
||||
# (each node is BOTH server and dialer to its peers)
|
||||
# server-<name>.crt/key data-plane cert (client CA), EKU serverAuth
|
||||
# cluster-ca.crt the route CA cert (for --route-tls-ca)
|
||||
# ca.crt the client CA cert (for clients / control-plane TLS)
|
||||
#
|
||||
# SANs per node = its public IP + its WireGuard IP + its hostname + localhost.
|
||||
#
|
||||
# Key material: EC P-256 (Go crypto/tls + nats-server friendly), matching
|
||||
# ../tls/generate-certs.sh.
|
||||
set -euo pipefail

# Work from the script's own directory so every relative path below
# (nodes.env, ../tls, cluster-ca.*, out/) resolves the same way regardless of
# where the operator invoked the script from.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$DIR"

# Refuse to run while any placeholder remains (HUMAN must fill nodes.env first).
# The check runs BEFORE the file is sourced so an unfilled file is never
# executed, and uses -E because the BRE `\+` operator is a GNU extension.
# A missing nodes.env makes grep fail (condition false); the `source` below
# then aborts the script under `set -e`.
if grep -Eq '<[A-Z_]+>' nodes.env; then
  echo "ERROR: nodes.env still has <PLACEHOLDER> values — fill them in first." >&2
  grep -En '<[A-Z_]+>' nodes.env >&2
  exit 2
fi

# shellcheck source=/dev/null
source ./nodes.env

# The client data-plane CA is REUSED, never generated here: it must already
# exist next to ../tls/generate-certs.sh.
CLIENT_CA_CRT="../tls/ca.crt"
CLIENT_CA_KEY="../tls/ca.key"
if [[ ! -f "$CLIENT_CA_CRT" || ! -f "$CLIENT_CA_KEY" ]]; then
  echo "ERROR: client data-plane CA not found at ../tls/ca.{crt,key}." >&2
  echo "       Run ../tls/generate-certs.sh first (it mints the client CA)." >&2
  exit 2
fi

# Validity periods, in days, for the cluster CA and for the leaf certificates.
DAYS_CA=3650
DAYS_CRT=825

# --force (first argument only) regenerates the cluster route CA even if one
# already exists.
force=0
if [[ "${1:-}" == "--force" ]]; then
  force=1
fi

# --- cluster route CA (separate trust root) ---
if [[ ! -f cluster-ca.crt || ! -f cluster-ca.key || $force -eq 1 ]]; then
  echo "==> generating cluster route CA (separate from the client CA)"
  openssl ecparam -name prime256v1 -genkey -noout -out cluster-ca.key
  chmod 600 cluster-ca.key
  openssl req -x509 -new -key cluster-ca.key -sha256 -days "$DAYS_CA" \
    -subj "/CN=unibus-cluster-ca" -out cluster-ca.crt
else
  echo "==> reusing existing cluster route CA (pass --force to regenerate)"
fi
|
||||
|
||||
# mint_cert <out_key> <out_crt> <subject_cn> <san> <eku> <ca_crt> <ca_key>
#
# Generates a fresh EC P-256 private key at <out_key> (mode 600), builds a CSR
# with CN=<subject_cn>, and signs it with the given CA for DAYS_CRT days,
# writing the certificate to <out_crt>.
#   <san>  value for the subjectAltName extension (e.g. "IP:1.2.3.4,DNS:host")
#   <eku>  value for the extendedKeyUsage extension (e.g. "serverAuth")
# Returns the exit status of the first failing step (0 on success). The
# temporary CSR/extension files are removed on BOTH paths: under `set -e` a
# bare failing command would exit the script before any trailing `rm`, so the
# steps are chained with && and the status is captured instead.
mint_cert() {
  local out_key="$1" out_crt="$2" cn="$3" san="$4" eku="$5" ca_crt="$6" ca_key="$7"
  local csr ext rc=0

  csr="$(mktemp)"
  # If the second mktemp fails, do not leave the first temp file behind.
  ext="$(mktemp)" || { rm -f "$csr"; return 1; }

  {
    openssl ecparam -name prime256v1 -genkey -noout -out "$out_key" &&
    chmod 600 "$out_key" &&
    openssl req -new -key "$out_key" -subj "/CN=${cn}" -out "$csr" &&
    printf '%s\n' \
      "subjectAltName=${san}" \
      "extendedKeyUsage=${eku}" \
      "keyUsage=digitalSignature,keyEncipherment" > "$ext" &&
    openssl x509 -req -in "$csr" -CA "$ca_crt" -CAkey "$ca_key" -CAcreateserial \
      -sha256 -days "$DAYS_CRT" -extfile "$ext" -out "$out_crt"
  } || rc=$?

  rm -f "$csr" "$ext"
  return "$rc"
}
|
||||
|
||||
# Per-node pass. Each CLUSTER_NODES entry is "NAME SSH_HOST PUBLIC_IP WG_IP";
# the SSH alias is not needed here because this script touches no remote host.
for entry in "${CLUSTER_NODES[@]}"; do
  read -r node _alias public_ip wg_ip <<<"$entry"
  printf '%s\n' "==> node ${node}: SAN IP:${public_ip}, IP:${wg_ip}, DNS:${node}, localhost, 127.0.0.1"

  # Both certificates of a node carry the same subjectAltName list.
  alt_names="IP:${public_ip},IP:${wg_ip},DNS:${node},DNS:localhost,IP:127.0.0.1"
  dest="out/${node}"
  mkdir -p "$dest"

  # Route cert: signed by the cluster CA; server+client auth (mutual routes).
  mint_cert \
    "${dest}/route-${node}.key" \
    "${dest}/route-${node}.crt" \
    "unibus-route-${node}" \
    "$alt_names" \
    "serverAuth,clientAuth" \
    cluster-ca.crt \
    cluster-ca.key

  # Data-plane server cert: signed by the client CA; serverAuth only.
  mint_cert \
    "${dest}/server-${node}.key" \
    "${dest}/server-${node}.crt" \
    "unibus-${node}" \
    "$alt_names" \
    "serverAuth" \
    "$CLIENT_CA_CRT" \
    "$CLIENT_CA_KEY"

  # Ship the two public CA certificates alongside the node's own material.
  cp cluster-ca.crt "${dest}/cluster-ca.crt"
  cp "$CLIENT_CA_CRT" "${dest}/ca.crt"
done

# Drop the serial files left by -CAcreateserial; best effort, never fatal.
rm -f cluster-ca.srl ../tls/ca.srl 2>/dev/null || true

# Summary: one line per node, then a hint for checking the SANs.
printf '%s\n' "" "==> done. Per-node material under out/<name>/ (KEYS ARE SECRET — never git):"
for entry in "${CLUSTER_NODES[@]}"; do
  read -r node _ <<<"$entry"
  printf '%s\n' "  out/${node}/  (route-${node}.*, server-${node}.*, cluster-ca.crt, ca.crt)"
done
printf '%s\n' \
  "" \
  "verify a SAN with:" \
  " openssl x509 -in out/<name>/server-<name>.crt -noout -text | grep -A1 'Subject Alternative Name'"
|
||||
@@ -0,0 +1,46 @@
|
||||
[Unit]
|
||||
# unibus membershipd — cluster node (issue 0006g).
|
||||
#
|
||||
# One unit, parameterized per node by /opt/unibus/cluster.env (generated by
|
||||
# deploy-cluster.sh): NODE_NAME, ROUTES and the cert paths differ per node, the
|
||||
# rest of the posture (enforce + per-subject ACL + TLS + --store kv) is identical
|
||||
# on every node, which is the homogeneous posture a secure cluster requires
|
||||
# (audit 0008 N1).
|
||||
Description=unibus membershipd (cluster node)
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
WorkingDirectory=/opt/unibus
|
||||
EnvironmentFile=/opt/unibus/cluster.env
|
||||
# The route password comes from a FILE referenced by ${CLUSTER_PASS_FILE}, never
|
||||
# from argv (audit 0008 N1-low). The peer --routes carry no userinfo; membershipd
|
||||
# injects the credentials from the file/user.
|
||||
ExecStart=/opt/unibus/membershipd \
|
||||
--bind 0.0.0.0 \
|
||||
--bus-auth enforce \
|
||||
--http-port ${HTTP_PORT} \
|
||||
--nats-port ${NATS_CLIENT_PORT} \
|
||||
--tls-cert ${TLS_CERT} \
|
||||
--tls-key ${TLS_KEY} \
|
||||
--cluster-name ${CLUSTER_NAME} \
|
||||
--server-name ${NODE_NAME} \
|
||||
--cluster-port ${NATS_ROUTE_PORT} \
|
||||
--routes ${ROUTES} \
|
||||
--cluster-user ${CLUSTER_USER} \
|
||||
--cluster-pass-file ${CLUSTER_PASS_FILE} \
|
||||
--route-tls-cert ${ROUTE_TLS_CERT} \
|
||||
--route-tls-key ${ROUTE_TLS_KEY} \
|
||||
--route-tls-ca ${ROUTE_TLS_CA} \
|
||||
--internal-id-file ${INTERNAL_ID_FILE} \
|
||||
--store kv \
|
||||
--kv-replicas ${KV_REPLICAS}
|
||||
# Restart=always (NOT on-failure): a clean SIGTERM exits success, and on-failure
|
||||
# would then NOT restart, leaving the node silently dead (see function_tags.md).
|
||||
Restart=always
|
||||
RestartSec=2
|
||||
LimitNOFILE=65536
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -0,0 +1,57 @@
|
||||
# Cluster topology for the unibus 3-node deployment (issue 0006g).
|
||||
#
|
||||
# This file is SOURCED by generate-cluster-certs.sh and deploy-cluster.sh.
|
||||
#
|
||||
# HUMAN: review every value below before running the scripts. All addresses are
|
||||
# filled in as of the 2026-06-07 deploy decision; the WireGuard column currently
|
||||
# mirrors the public IPs (see the NOTE above CLUSTER_NODES). The scripts refuse
|
||||
# to run while any unfilled placeholder (an upper-case token wrapped in angle
|
||||
# brackets) remains anywhere in this file, comments included.
|
||||
|
||||
# Cluster identity (must be identical on every node).
|
||||
CLUSTER_NAME="unibus"
|
||||
# Route-secret username; the password is NOT here — it lives in a file (see
|
||||
# CLUSTER_PASS_FILE in deploy-cluster.sh) so it never lands in argv or git.
|
||||
CLUSTER_USER="unibus-cluster"
|
||||
|
||||
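# KV/nonce replication factor. Set to 3 for the three-node cluster, matching the
|
||||
# bring-up guidance printed by deploy-cluster.sh: start all three nodes together
|
||||
# and go straight to R3 (R1 is a single point of failure, not a milestone).
|
||||
# Lower this to 1 only for a staged 1->3 rollout, and raise it back to 3 IN PLACE
|
||||
# (see README "Scale to R3") once all three nodes have joined.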
|
||||
KV_REPLICAS=3
|
||||
|
||||
# Ports (same on every node; the route port is server-to-server only).
|
||||
NATS_CLIENT_PORT=4250
|
||||
NATS_ROUTE_PORT=6250
|
||||
HTTP_PORT=8470
|
||||
|
||||
# Remote install layout and SSH login user.
|
||||
REMOTE_DIR="/opt/unibus"
|
||||
SSH_USER="root"
|
||||
|
||||
# Which address family the inter-node routes use. "wg" builds --routes from the
|
||||
# WireGuard mesh IPs (private server-to-server links, preferred); "public" uses
|
||||
# the public IPs. The route layer is always mutual-TLS regardless.
|
||||
#
|
||||
# DEPLOY DECISION (2026-06-07): set to "public". No WireGuard mesh exists between
|
||||
# the three cluster nodes — homer and datardos do not even have the `wg` binary
|
||||
# installed, and om's only WG peers are the operator's personal PCs, not the VPS.
|
||||
# Rather than stand up a fresh mesh blindly, the routes go over the public IPs,
|
||||
# still protected by the separate cluster route CA (mutual-TLS). On magnus (the
|
||||
# only node with ufw active) the route port 6250 is restricted to the homer and
|
||||
# datardos public IPs; homer/datardos run ufw inactive (Docker hosts) and rely on
|
||||
# the route mutual-TLS for 6250.
|
||||
ROUTE_NETWORK="public"
|
||||
|
||||
# One row per node: NAME SSH_HOST PUBLIC_IP WG_IP
|
||||
# NAME -> --server-name and the per-node cert filenames (unique).
|
||||
# SSH_HOST -> the `ssh ALIAS` alias (see ~/.ssh/config).
|
||||
# PUBLIC_IP -> public address; goes in the cert SANs (client-facing data plane).
|
||||
# WG_IP -> WireGuard mesh address; cert SAN + route target when ROUTE_NETWORK=wg.
|
||||
# NOTE: with ROUTE_NETWORK=public and no WireGuard mesh, the WG_IP column is set to
|
||||
# each node's public IP so the cert SAN covers the address actually used by the
|
||||
# public routes and no unfilled placeholder remains (scripts refuse to run otherwise).
|
||||
# magnus == organic-machine.com == om (135.125.201.30); SSH alias `magnus` enters as root.
|
||||
CLUSTER_NODES=(
|
||||
"magnus magnus 135.125.201.30 135.125.201.30"
|
||||
"homer homer 141.94.69.66 141.94.69.66"
|
||||
"datardos dd 51.91.100.142 51.91.100.142"
|
||||
)
|
||||
Executable
+31
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
|
||||
# Build membershipd and install/enable/start it as a systemd user service.
|
||||
# Idempotent: safe to re-run after a code change to rebuild and restart.
|
||||
set -euo pipefail

# APP_DIR is the parent of the directory holding this script; the unit file is
# expected at $APP_DIR/deploy/$UNIT and is linked into the per-user systemd dir.
APP_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
UNIT="unibus-membershipd.service"
USER_UNIT_DIR="${XDG_CONFIG_HOME:-$HOME/.config}/systemd/user"

cd "$APP_DIR"

echo "==> building membershipd (CGO_ENABLED=0)"
CGO_ENABLED=0 go build -o membershipd ./cmd/membershipd

echo "==> linking unit into $USER_UNIT_DIR"
mkdir -p "$USER_UNIT_DIR"
ln -sf "$APP_DIR/deploy/$UNIT" "$USER_UNIT_DIR/$UNIT"

echo "==> reloading systemd and (re)starting the service"
systemctl --user daemon-reload
# Enable WITHOUT --now: the restart below starts the unit when it is stopped and
# restarts it when it is already running, so the freshly built binary is picked
# up either way. Starting it here first would only launch the service to kill it
# again a moment later on a first install.
systemctl --user enable "$UNIT"
systemctl --user restart "$UNIT"

echo "==> status"
# `status` exits non-zero for a unit that is not active; do not let that abort
# the script before the health-check hint is printed.
systemctl --user --no-pager status "$UNIT" || true

echo
echo "Health check:"
echo " curl -fsS http://127.0.0.1:8470/healthz"
|
||||
@@ -0,0 +1,6 @@
|
||||
# Private keys and the deploy-specific server certificate never go to git.
|
||||
# Only the public CA certificate (ca.crt) is versioned, because clients embed it.
|
||||
*.key
|
||||
*.csr
|
||||
*.srl
|
||||
server.crt
|
||||
@@ -0,0 +1,56 @@
|
||||
# Bus TLS — self-signed CA and server certificate
|
||||
|
||||
The unibus data plane (NATS) is encrypted with TLS using the project's own
|
||||
self-signed CA. The bus is exposed publicly, protected by auth + TLS, so the CA
|
||||
is private (not Let's Encrypt) and every client we control embeds the public
|
||||
`ca.crt`; the server presents `server.crt`/`server.key`.
|
||||
|
||||
## Files
|
||||
|
||||
| File | Secret? | Goes where |
|
||||
|---|---|---|
|
||||
| `ca.crt` | no (public) | versioned in git; embedded/distributed to every client |
|
||||
| `ca.key` | **yes** | stays on the machine that mints certs; gitignored |
|
||||
| `server.crt` | no | deployed to the bus host; gitignored (deploy-specific SANs) |
|
||||
| `server.key` | **yes** | deployed to the bus host over a secure channel; gitignored |
|
||||
|
||||
Only `ca.crt` is committed. `ca.key`, `server.key`, `server.crt`, and any
|
||||
`*.csr`/`*.srl` are gitignored — see `.gitignore`.
|
||||
|
||||
## Generate
|
||||
|
||||
```bash
|
||||
cd deploy/tls
|
||||
./generate-certs.sh # CA (if missing) + server cert with default SANs
|
||||
./generate-certs.sh --force # also regenerate the CA (invalidates pinned clients)
|
||||
```
|
||||
|
||||
The server certificate's SANs cover the public IP, the WireGuard IP, the om
|
||||
hostname, plus `localhost`/`127.0.0.1` for on-host smoke tests. Override the
|
||||
defaults via environment variables:
|
||||
|
||||
```bash
|
||||
UNIBUS_PUBLIC_IP=135.125.201.30 UNIBUS_WG_IP=10.42.0.1 UNIBUS_HOSTNAME=om ./generate-certs.sh
|
||||
```
|
||||
|
||||
Verify the SANs:
|
||||
|
||||
```bash
|
||||
openssl x509 -in server.crt -noout -text | grep -A1 'Subject Alternative Name'
|
||||
```
|
||||
|
||||
## Use
|
||||
|
||||
- **Server** (`membershipd`, phase 0001e): point it at `server.crt`/`server.key`
|
||||
so the embedded NATS presents the certificate and requires TLS. Built with
|
||||
`busauth.ServerTLSConfig(certPath, keyPath)`.
|
||||
- **Clients** (Go peers, mobile binding, gateway): pin `ca.crt` with
|
||||
`busauth.LoadCATLSConfig(caPath)` and pass the result as `client.Options.TLS`.
|
||||
|
||||
## Rotation
|
||||
|
||||
The CA is long-lived (10 years). Rotate the server certificate (825 days) by
|
||||
re-running `generate-certs.sh` (without `--force`) and redeploying
|
||||
`server.crt`/`server.key`; clients are unaffected because they pin the CA, not
|
||||
the server cert. Rotating the CA (`--force`) requires redistributing `ca.crt` to
|
||||
every client.
|
||||
@@ -0,0 +1,11 @@
|
||||
-----BEGIN CERTIFICATE-----
|
||||
MIIBfTCCASOgAwIBAgIUW2HZJDDlixxw/DgNP/IDIrJ7MeMwCgYIKoZIzj0EAwIw
|
||||
FDESMBAGA1UEAwwJdW5pYnVzLWNhMB4XDTI2MDYwNzEwNDIyNloXDTM2MDYwNDEw
|
||||
NDIyNlowFDESMBAGA1UEAwwJdW5pYnVzLWNhMFkwEwYHKoZIzj0CAQYIKoZIzj0D
|
||||
AQcDQgAEe2by5l9dcEbqKB11yJtPIH9S/01XNhuFnBB/IpDevO2fWLLV+muqoB8C
|
||||
ADH1wKleq8jF5D0sSlK2DCuYrjAjPqNTMFEwHQYDVR0OBBYEFABX+UI7bXICRF4l
|
||||
WmmDR/rUtxnrMB8GA1UdIwQYMBaAFABX+UI7bXICRF4lWmmDR/rUtxnrMA8GA1Ud
|
||||
EwEB/wQFMAMBAf8wCgYIKoZIzj0EAwIDSAAwRQIgCAeOYTKvA6SBB8xMdMdqNrp1
|
||||
20OPyi2BwFovW6vTCLMCIQC1qRi8SGRHTui8BVqIvp/DFJaZ/U8ocAg/qedLdy+R
|
||||
/w==
|
||||
-----END CERTIFICATE-----
|
||||
Executable
+64
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# generate-certs.sh — mint the unibus bus's self-signed CA and the NATS server
|
||||
# certificate. Run once on a trusted machine; distribute ca.crt to clients and
|
||||
# server.crt/server.key to the bus host (server.key by a secure channel, never
|
||||
# git). Re-running regenerates the server cert; pass --force to also regenerate
|
||||
# the CA (which invalidates every client that pinned the old ca.crt).
|
||||
#
|
||||
# SANs cover the public IP, the WireGuard IP, the om hostname, plus localhost so
|
||||
# the operator can smoke-test the TLS handshake on the box. Override via env:
|
||||
# UNIBUS_PUBLIC_IP (default 135.125.201.30)
|
||||
# UNIBUS_WG_IP (default 10.42.0.1)
|
||||
# UNIBUS_HOSTNAME (default om)
|
||||
#
|
||||
# Key material: EC P-256 (widely supported by Go's crypto/tls and nats-server).
|
||||
set -euo pipefail

# Work from the script's own directory: every file below is a relative path.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$DIR"

# Intermediate files (CSR, extension file, CA serial) are removed on EVERY exit
# path. Under `set -e` a failing openssl call would otherwise leave them in the
# repository directory.
cleanup() {
  rm -f server.csr server.ext ca.srl
}
trap cleanup EXIT

# SAN inputs, overridable through the environment (defaults documented in the
# header comment of this script).
PUBLIC_IP="${UNIBUS_PUBLIC_IP:-135.125.201.30}"
WG_IP="${UNIBUS_WG_IP:-10.42.0.1}"
HOSTNAME_OM="${UNIBUS_HOSTNAME:-om}"

# Validity periods, in days, for the CA and for the server certificate.
DAYS_CA=3650
DAYS_SRV=825

# --force (first argument only) regenerates the CA even if one already exists.
force=0
if [[ "${1:-}" == "--force" ]]; then
  force=1
fi

# --- CA (long-lived; only the cert is public) ---
if [[ ! -f ca.crt || ! -f ca.key || $force -eq 1 ]]; then
  echo "==> generating CA"
  openssl ecparam -name prime256v1 -genkey -noout -out ca.key
  chmod 600 ca.key
  openssl req -x509 -new -key ca.key -sha256 -days "$DAYS_CA" \
    -subj "/CN=unibus-ca" -out ca.crt
else
  echo "==> reusing existing CA (pass --force to regenerate)"
fi

# --- server certificate, signed by the CA, with the bus SANs ---
# A new server key is generated on every run, so re-running rotates the pair.
echo "==> generating server certificate (SAN: $PUBLIC_IP, $WG_IP, $HOSTNAME_OM, localhost, 127.0.0.1)"
openssl ecparam -name prime256v1 -genkey -noout -out server.key
chmod 600 server.key
openssl req -new -key server.key -subj "/CN=unibus-bus" -out server.csr

# X.509 v3 extensions applied when the CA signs the CSR.
cat > server.ext <<EOF
subjectAltName=IP:${PUBLIC_IP},IP:${WG_IP},DNS:${HOSTNAME_OM},DNS:localhost,IP:127.0.0.1
extendedKeyUsage=serverAuth
keyUsage=digitalSignature,keyEncipherment
EOF

openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial \
  -sha256 -days "$DAYS_SRV" -extfile server.ext -out server.crt

echo "==> done:"
echo " ca.crt -> embed/distribute to every client (public)"
echo " server.crt -> deploy to the bus host"
echo " server.key -> deploy to the bus host over a secure channel (NEVER git)"
echo
echo "verify SANs with:"
echo " openssl x509 -in server.crt -noout -text | grep -A1 'Subject Alternative Name'"
|
||||
@@ -0,0 +1,22 @@
|
||||
[Unit]
|
||||
Description=unibus membershipd — control plane (rooms, keys, blobs) + embedded NATS/JetStream
|
||||
Documentation=https://gitea-dgg044oo04woo4ggcsws4gk0.organic-machine.com/dataforge/unibus
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
WorkingDirectory=%h/fn_registry/projects/message_bus/apps/unibus
|
||||
# --bind 0.0.0.0 exposes BOTH the HTTP control plane (:8470) and the embedded
|
||||
# NATS data plane (:4250) to the LAN so phones / other PCs can connect.
|
||||
ExecStart=%h/fn_registry/projects/message_bus/apps/unibus/membershipd --bind 0.0.0.0
|
||||
# Restart=always (NOT on-failure): a clean SIGTERM shutdown exits 0, and
|
||||
# on-failure would then NOT restart, leaving the service silently dead. always
|
||||
# brings it back regardless of exit code. See .claude/rules/function_tags.md.
|
||||
Restart=always
|
||||
RestartSec=2
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
@@ -0,0 +1,55 @@
|
||||
# Issue 0001e — remaining client migrations (notes, NOT implemented)
|
||||
|
||||
Phase 0001e migrated the first-class Go clients and the mobile binding to the
|
||||
secure connection path (`client.Connect(caPath)` → TLS + nkey; control-plane
|
||||
requests are always signed). Two consumers are intentionally **left as notes**
|
||||
because they live outside this sub-repo or need their own coordination:
|
||||
|
||||
## 1. Web gateway (`playground/server.go`)
|
||||
|
||||
The playground is a local dev gateway that embeds its own membershipd
|
||||
(`membership.NewServer(..., AuthOff)`) and an open embedded NATS, and connects
|
||||
browser sessions through an in-process client. To run it against a **secured**
|
||||
bus it would need:
|
||||
|
||||
- Connect its internal client via `client.Connect(natsURL, ctrlURL, id, caPath)`
|
||||
with the bundled `ca.crt` (it currently builds the client without options).
|
||||
- If it should itself enforce auth on the browser-facing side, start its
|
||||
embedded membershipd with an auth mode and its embedded NATS with
|
||||
`embeddednats.StartServer(ServerConfig{Auth: ..., TLS: ...})` — but a local
|
||||
dev gateway typically stays open and only the *upstream* bus is secured.
|
||||
- The gateway's own bus identity must be registered in the upstream allowlist
|
||||
(`membershipd user add`).
|
||||
|
||||
Decision: left at `AuthOff` + plaintext for now (local dev tool). Migrate when
|
||||
the gateway is pointed at the public bus.
|
||||
|
||||
## 2. unibots (`shell/transportunibus`, in the agents repo — NOT this sub-repo)
|
||||
|
||||
The bot transport lives in the `agents_and_robots` / message_bus consumer, not
|
||||
in `dataforge/unibus`. To talk to the secured bus it must, after recompiling
|
||||
against this `pkg/client`:
|
||||
|
||||
- Switch its connect call to `client.Connect(natsURL, ctrlURL, id, caPath)`,
|
||||
passing the path to the bundled `ca.crt`.
|
||||
- Ship `ca.crt` alongside the bot binary (read-only) and point `caPath` at it.
|
||||
- Register each bot's identity (`hex(SignPub)`) in the bus allowlist via
|
||||
`membershipd user add --handle <bot> --sign-pub <hex>` on the bus host.
|
||||
- Run as `systemd --user` with `caPath` set, per the deploy plan (0001f).
|
||||
|
||||
No code change is possible from this sub-repo; this is the contract the bot
|
||||
transport consumes.
|
||||
|
||||
## Server enablement (operator, phase 0001f)
|
||||
|
||||
`membershipd` now accepts:
|
||||
|
||||
- `--bus-auth enforce` — verify signed control-plane requests AND turn on the
|
||||
NATS nkey authenticator (only allowlisted identities connect).
|
||||
- `--tls-cert deploy/tls/server.crt --tls-key deploy/tls/server.key` — present
|
||||
the server certificate and require TLS on the embedded NATS.
|
||||
|
||||
`dev/feature_flags.json` now declares both `bus-auth: enforce` and
|
||||
`bus-tls: enabled` as the project's target state. The flags are declarative;
|
||||
the operator activates them at deploy time with the flags above. The CLI
|
||||
defaults remain off so local dev and the test suite are unaffected.
|
||||
@@ -0,0 +1,80 @@
|
||||
# 0004d — Data-plane access control on NATS (audit H4)
|
||||
|
||||
## The finding
|
||||
|
||||
The NATS authenticator (`pkg/busauth`) decides one thing per connection:
|
||||
*is this identity registered on the bus?* It does **not** scope what a connected
|
||||
client may subscribe to or publish. There is a single NATS account with no
|
||||
`Permissions`, so any registered peer can subscribe to, or publish on, **any**
|
||||
subject. Concretely:
|
||||
|
||||
- A cleartext room (`ModeNATS`) carries its payload in the clear on its subject.
|
||||
A registered peer that knows or guesses the subject subscribes and reads the
|
||||
content directly (the auditor's `TestAudit_NoSubjectACL`: eve, never invited,
|
||||
receives `"internal: salary numbers"`).
|
||||
- An encrypted room (`ModeMatrix`) keeps its **content** confidential (the
|
||||
payload is AEAD ciphertext), but the **metadata of traffic** — that a subject
|
||||
is active, message sizes and timing, who is publishing — is still observable by
|
||||
any registered peer that subscribes to the subject.
|
||||
|
||||
## Why the "complete" fix does not fit here
|
||||
|
||||
The preferred fix is per-subject permissions derived from room membership: when a
|
||||
client connects, the authenticator looks up the rooms it belongs to and grants
|
||||
`Sub`/`Pub` only on those subjects. NATS supports this — `CustomClientAuthentication`
|
||||
can register a `*server.User` carrying `Permissions`.
|
||||
|
||||
The blocker is that **NATS evaluates permissions once, at connect time, and never
|
||||
re-evaluates them on a live connection.** unibus clients routinely *connect → create
|
||||
or get invited to a room → publish/subscribe* within the **same** connection
|
||||
(`TestSecureBusEndToEnd` does exactly this: A connects, then creates `room.secure`,
|
||||
then publishes to it). Permissions frozen at connect time would not include a room
|
||||
created or joined afterwards, so the legitimate owner could not publish to the room
|
||||
it just made. Making per-subject ACLs work would therefore require the client to
|
||||
**reconnect on every membership change**, an invasive change to the client library
|
||||
and to every peer (worker, chat, mobile) — and the prompt for this issue scopes the
|
||||
client changes to the minimum.
|
||||
|
||||
That dynamic-membership reconnection model is precisely the redesign that issue
|
||||
**0003** (decentralization) already has to do: it moves the control-plane state to a
|
||||
replicated JetStream KV and reworks how nodes and clients (re)establish sessions. Per
|
||||
the issue's own guidance ("if a complete strategy does not fit, implement the minimum
|
||||
defense and document the rest"), the full subject ACL is deferred to 0003, where the
|
||||
session/permission model is being rebuilt anyway.
|
||||
|
||||
## The strategy implemented here: forbid cleartext rooms in public
|
||||
|
||||
`Server.RequireEncryptedRooms` (set by `membershipd` on any non-loopback bind)
|
||||
refuses to create a cleartext (`ModeNATS`) room. Every room on a public deployment
|
||||
is therefore end-to-end encrypted, so **message content stays confidential even
|
||||
though the transport offers no subject isolation**: a peer that sniffs another
|
||||
room's subject receives only AEAD ciphertext it has no key for.
|
||||
|
||||
This composes with the 0004c control-plane authorization: a non-member cannot even
|
||||
learn a room's subject through the control plane (`GET /rooms/{id}` → 403), so to
|
||||
sniff it an attacker must already know or guess the subject out of band.
|
||||
|
||||
## What this does NOT close (residual exposure, by design)
|
||||
|
||||
- **Traffic metadata.** A registered peer that already knows a subject can still
|
||||
subscribe and observe that the subject is active, the ciphertext sizes, and the
|
||||
timing/cadence of messages. It cannot read content.
|
||||
- **Cross-room publish.** A registered peer can still *publish* arbitrary bytes on
|
||||
any subject. In an encrypted room those bytes fail AEAD open and the signature
|
||||
check (`SignMsgs`), so receivers drop them — it is a nuisance/spam vector, not a
|
||||
confidentiality or integrity break.
|
||||
- **WireGuard-only deployments** may still use cleartext rooms (the guard only trips
|
||||
on a public bind), because the network already restricts who can reach the bus.
|
||||
|
||||
Closing the residual metadata exposure requires the per-subject ACL described above,
|
||||
tracked for issue 0003.
|
||||
|
||||
## Regression evidence
|
||||
|
||||
- `pkg/membership` — `TestRequireEncryptedRoomsRejectsCleartext`: with
|
||||
`RequireEncryptedRooms` on, `POST /rooms` for a cleartext policy returns 403 while
|
||||
an encrypted-room create returns 201.
|
||||
- `pkg/client` — `TestAudit_NoSubjectACL`: under the public posture, creating a
|
||||
`ModeNATS` room fails; alice creates an encrypted room and publishes; eve (a
|
||||
registered non-member) raw-subscribes to the subject and receives only ciphertext —
|
||||
she never recovers the plaintext.
|
||||
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"flags": {
|
||||
"bus-auth": {
|
||||
"enabled": true,
|
||||
"state": "enforce",
|
||||
"issue": "0001",
|
||||
"description": "Signed control-plane auth + NATS nkey auth. Rollout: off -> soft (verify+log, allow) -> enforce (reject). 'enabled' mirrors state!=off. Server opts in via membershipd --bus-auth; clients via client.Connect(caPath).",
|
||||
"added": "2026-06-07",
|
||||
"enabled_at": "2026-06-07"
|
||||
},
|
||||
"bus-tls": {
|
||||
"enabled": true,
|
||||
"issue": "0001",
|
||||
"description": "TLS on the NATS data plane using the project's self-signed CA (deploy/tls/). Server opts in via membershipd --tls-cert/--tls-key; clients pin ca.crt via client.Connect(caPath).",
|
||||
"added": "2026-06-07",
|
||||
"enabled_at": "2026-06-07"
|
||||
},
|
||||
"decentralized": {
|
||||
"enabled": false,
|
||||
"issue": "0003",
|
||||
"description": "Control-plane state on replicated JetStream KV instead of local SQLite (branch-by-abstraction membership.Store: sqliteStore default, jetstreamStore opt-in). The route cluster (0003a) and the KV store (0003b) shipped behind this flag; the membershipd boot wiring that selects the store is COMPLETE since issue 0006c and is realized at runtime with the server flag --store kv|sqlite (default sqlite). The internal-identity bootstrap (0006a) lets membershipd open the KV store on its own embedded NATS under enforce. Per-deploy opt-in: a node joins the decentralized control plane by starting with --store kv (and --cluster-name for HA). OFF (--store sqlite) keeps the single-node SQLite control plane unchanged.",
|
||||
"added": "2026-06-07",
|
||||
"enabled_at": null
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,214 @@
|
||||
---
|
||||
issue: 0001
|
||||
title: Seguridad del bus — sistema de usuarios, auth firmada del control plane, NATS nkey + TLS
|
||||
status: spec
|
||||
created: 2026-06-07
|
||||
domain: security
|
||||
scope: unibus (membershipd, pkg/membership, pkg/embeddednats, pkg/client) + clientes (mobile, web gateway, unibots)
|
||||
---
|
||||
|
||||
# Objetivo
|
||||
|
||||
Hoy el bus unibus solo está protegido por la red (WireGuard) y por el cifrado E2E
|
||||
por room (megolm). El **control plane** (HTTP `:8470`) y el **data plane** (NATS
|
||||
`:4250`) **no tienen autenticación ni TLS**: cualquiera que alcance esos puertos
|
||||
puede crear rooms, leer metadata, publicar, y hacer DoS. El contenido de las rooms
|
||||
`ModeMatrix` está cifrado E2E, pero las rooms `ModeNATS` (cleartext), la metadata
|
||||
de subjects y todo el control plane viajan en claro y sin control de acceso.
|
||||
|
||||
Este issue añade tres capas de seguridad al propio bus, de modo que **WireGuard
|
||||
pase a ser opcional** (defensa en profundidad) y el bus pueda exponerse de forma
|
||||
segura incluso a un cliente móvil en una red ajena:
|
||||
|
||||
1. **Sistema de usuarios** — un registro a nivel bus de las identidades autorizadas
|
||||
(allowlist de claves públicas Ed25519), con roles y revocación.
|
||||
2. **Auth del control plane** — cada request HTTP va firmado con la identidad del
|
||||
peer; el server verifica la firma y que la identidad esté autorizada.
|
||||
3. **NATS endurecido** — autenticación por nkey (Ed25519) contra el registro de
|
||||
usuarios + TLS para cifrar todo el transporte del data plane.
|
||||
|
||||
# Modelo de amenazas y capas
|
||||
|
||||
| Capa | Qué protege | Estado hoy | Tras este issue |
|
||||
|---|---|---|---|
|
||||
| WireGuard | Acceso de red; oculta el bus de internet | activo (opcional) | sigue disponible, ya no imprescindible |
|
||||
| TLS NATS | Confidencialidad/integridad del **canal** (cleartext rooms, metadata, nonces de auth) | ausente | CA propia self-signed |
|
||||
| Auth (firma Ed25519 / nkey) | **Autenticación**: solo identidades registradas conectan/operan | ausente | control plane + data plane |
|
||||
| E2E por room (megolm) | Confidencialidad del **contenido** de rooms cifradas | activo | sin cambios |
|
||||
|
||||
Principio: cada capa es independiente. TLS cifra el canal, la auth decide quién
|
||||
entra, el E2E protege el contenido aunque el bus fuera comprometido.
|
||||
|
||||
# Diseño
|
||||
|
||||
## Pieza 1 — Sistema de usuarios
|
||||
|
||||
Registro a nivel bus (no por room) de las identidades autorizadas. Migración
|
||||
**aditiva** `migrations/002_users.sql` (y su gemela embebida en
|
||||
`pkg/membership/migrations/`):
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS users (
|
||||
sign_pub TEXT PRIMARY KEY, -- clave pública Ed25519 en hex (identidad del peer)
|
||||
handle TEXT NOT NULL, -- nombre legible (único recomendado, no PK)
|
||||
role TEXT NOT NULL DEFAULT 'member', -- 'admin' | 'member'
|
||||
status TEXT NOT NULL DEFAULT 'active', -- 'active' | 'revoked'
|
||||
created_at TEXT NOT NULL,
|
||||
revoked_at TEXT
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_users_status ON users(status);
|
||||
```
|
||||
|
||||
- `sign_pub` es la misma clave de la que ya se deriva el `endpoint` (`frame.EndpointID(SignPub)`).
|
||||
- CRUD en `pkg/membership/store.go`: `AddUser`, `GetUser`, `ListUsers`,
|
||||
`RevokeUser`, `IsAuthorized(signPubHex) bool`.
|
||||
- CLI de administración en `cmd/membershipd`: `membershipd user add --handle h
|
||||
--sign-pub <hex> [--role admin]`, `user list`, `user revoke <sign-pub>`.
|
||||
- **Bootstrap (chicken-egg):** el primer `admin` se siembra ejecutando el CLI
|
||||
localmente en el host del bus (`user add --role admin --sign-pub <tu_pub>`). El
|
||||
CLI local se considera de confianza (quien tiene shell en el host ya manda). Sin
|
||||
al menos un admin, los endpoints de gestión de usuarios devuelven 403.
|
||||
|
||||
## Pieza 2 — Auth del control plane (HTTP :8470)
|
||||
|
||||
Generaliza la firma que ya existe (`pkg/client.signRequest` ↔
|
||||
`pkg/membership.verifyOwnerSig`) de "solo owner" a "todo request".
|
||||
|
||||
**Cliente** (`pkg/client`): cada request añade cabeceras:
|
||||
|
||||
```
|
||||
X-Unibus-Pub: <sign_pub hex>
|
||||
X-Unibus-Ts: <unix seconds>
|
||||
X-Unibus-Nonce: <16 bytes aleatorios, base64>
|
||||
X-Unibus-Sig: Ed25519( canonical ) ; canonical = method "\n" path "\n" ts "\n" nonce "\n" sha256(body)
|
||||
```
|
||||
|
||||
**Server** (middleware en `membershipd`):
|
||||
1. Parsear cabeceras; reconstruir `canonical`; verificar firma con `X-Unibus-Pub`.
|
||||
2. Comprobar `IsAuthorized(pub)` (status active). Si no → `401`.
|
||||
3. **Anti-replay:** rechazar si `|now - ts| > 30s`; cachear `nonce` con TTL 60s y
|
||||
rechazar repetidos (LRU en memoria, suficiente para un único membershipd).
|
||||
4. Autorización fina: operaciones de gestión de usuarios exigen `role=admin`;
|
||||
operaciones de room siguen exigiendo ownership donde ya aplica.
|
||||
|
||||
Feature flag `bus-auth` en `dev/feature_flags.json` con tres estados de rollout:
|
||||
`off` (sin verificar) → `soft` (verifica y **loguea** rechazos pero deja pasar) →
|
||||
`enforce` (rechaza). Permite migrar clientes sin cortar el servicio.
|
||||
|
||||
## Pieza 3 — NATS: nkey auth + TLS
|
||||
|
||||
### Auth (nkey sobre la identidad Ed25519)
|
||||
|
||||
Los nkeys de NATS **son** claves Ed25519, así que reutilizamos la identidad del
|
||||
peer sin material nuevo.
|
||||
|
||||
- **Server** (`pkg/embeddednats`): `server.Options.CustomClientAuthentication` con
|
||||
un autenticador que, dado el nonce que NATS presenta al cliente y la firma que el
|
||||
cliente devuelve, verifica la firma con la pubkey declarada y consulta
|
||||
`store.IsAuthorized(pub)`. Validar dinámicamente contra la BD permite **revocar
|
||||
sin reiniciar** el server (ventaja sobre precargar `Options.Nkeys`).
|
||||
- **Cliente** (`pkg/client`): conectar con `nats.Nkey(pubKeyEncoded, sigCB)` (clave pública en formato nkey, nunca la seed), donde
|
||||
`sigCB` firma el nonce con la Ed25519 del peer. Convertir `cs.Identity` →
|
||||
formato nkey con `github.com/nats-io/nkeys` (`nkeys.FromRawSeed(PrefixByteUser,
|
||||
seed)`).
|
||||
|
||||
### TLS (CA self-signed propia)
|
||||
|
||||
**Exposición DECIDIDA: pública.** El bus se expone a internet protegido por
|
||||
auth+TLS (WireGuard pasa a ser una vía de acceso más, no la barrera). En
|
||||
consecuencia: `ufw` en om abre `8470/tcp` y `4250/tcp`, y el server cert incluye en
|
||||
su SAN la **IP pública de om `135.125.201.30`**, la **IP WG `10.42.0.1`** (los peers
|
||||
internos siguen funcionando) y el hostname de om. Los clientes son todos
|
||||
controlados por nosotros (`pkg/client`, binding móvil, gateway web, unibots), así
|
||||
que **embeben el `ca.crt`** propio — no hace falta Let's Encrypt ni un dominio
|
||||
público apuntando al NATS.
|
||||
|
||||
- Generar una **CA propia** una vez (`deploy/tls/ca.{key,crt}`), y un **server
|
||||
cert** para el bus con SAN = `135.125.201.30`, `10.42.0.1`, hostname de om.
|
||||
- `pkg/embeddednats`: `server.Options.TLSConfig` con el server cert. NATS pasa a
|
||||
`tls://`.
|
||||
- Cliente: `nats.Secure(&tls.Config{RootCAs: caPool})` cargando la CA propia.
|
||||
- Las claves privadas (CA key, server key) **nunca** se commitean: van gitignored y
|
||||
se distribuyen por `pass`/scp. Solo el `ca.crt` (público) viaja con los clientes.
|
||||
|
||||
# Decisiones técnicas
|
||||
|
||||
| Decisión | Elegido | Alternativa descartada | Razón |
|
||||
|---|---|---|---|
|
||||
| Auth NATS | `CustomClientAuthentication` contra tabla `users` | `Options.Nkeys` estático | revocación dinámica sin reinicio |
|
||||
| TLS | CA self-signed propia | Let's Encrypt | infra privada, sin dependencia de dominio público apuntando al NATS |
|
||||
| Anti-replay control plane | timestamp ±30s + cache de nonce | nonce emitido por server (round-trip extra) | menos latencia, suficiente con un solo membershipd |
|
||||
| Material de identidad | reutilizar la Ed25519 del peer (firma + nkey) | claves separadas por capa | una identidad, menos gestión |
|
||||
| Rollout | feature flag `bus-auth` off→soft→enforce | corte directo | no romper clientes en vuelo |
|
||||
|
||||
# Fases (TBD, ramas `issue/0001x-*`, feature flags)
|
||||
|
||||
1. **0001a — users store + CLI** — migración `002_users.sql`, CRUD en store,
|
||||
comandos `membershipd user *`, seed admin. Flag `bus-auth: off`. Tests de store.
|
||||
2. **0001b — control-plane auth** — firma generalizada en `pkg/client`, middleware
|
||||
de verificación + anti-replay en `membershipd`. Flag `bus-auth: soft`. Tests:
|
||||
request firmado OK, no-autorizado 401, replay rechazado, reloj desfasado 401.
|
||||
3. **0001c — NATS nkey auth** — `CustomClientAuthentication` + cliente con
|
||||
`nats.Nkey`. Tests: peer no registrado rechazado al conectar; revocado pierde
|
||||
acceso sin reiniciar.
|
||||
4. **0001d — TLS NATS** — generación de CA/cert (`deploy/tls/` + script), server
|
||||
`TLSConfig`, cliente `RootCAs`. Flag `bus-tls`. Test: handshake TLS, cliente sin
|
||||
CA rechazado.
|
||||
5. **0001e — migrar clientes** — `mobile/` (binding), gateway web (`playground/`),
|
||||
`unibots` (`shell/transportunibus`): todos firman requests y conectan con
|
||||
nkey+TLS. Pasar `bus-auth` a `enforce`.
|
||||
6. **0001f — deploy** — unibus en om (bind público `0.0.0.0` con auth+TLS, según la exposición decidida),
|
||||
unibots como systemd-user en el PC local. Verificación E2E.
|
||||
|
||||
# Migración de clientes
|
||||
|
||||
Todo el cambio se concentra en `pkg/client` (firma de requests HTTP + conexión
|
||||
NATS nkey+TLS). `mobile/`, el gateway web y `unibots` lo heredan al recompilar; solo
|
||||
necesitan **pasar la ruta de la CA** y su identidad (que ya tienen). El binding
|
||||
gomobile expone un parámetro nuevo `caPath` en `NewSession`.
|
||||
|
||||
# Plan de despliegue (fase 0001f)
|
||||
|
||||
1. Cross-build `CGO_ENABLED=0 GOOS=linux GOARCH=amd64` del `membershipd`.
|
||||
2. `scp` binario + `ca.crt` + server cert/key a om (`/opt/unibus/`), dir de datos
|
||||
persistente para JetStream/db/blobs.
|
||||
3. systemd-system unit, `--bind 0.0.0.0` (exposición pública), `Restart=always`.
|
||||
4. `ufw allow 8470/tcp` y `ufw allow 4250/tcp` en om.
|
||||
5. Seed del admin (tu identidad) por CLI local en om.
|
||||
6. Verificar **desde fuera de la VPN** (red pública) y desde la WG: handshake TLS,
|
||||
`curl` firmado a `/healthz` OK, `curl` sin firma → 401, conexión NATS de un peer
|
||||
no registrado → rechazada.
|
||||
7. unibots local: systemd-user con `caPath` + identidad registrada.
|
||||
|
||||
> **Nota:** la fase de despliegue (0001f: abrir firewall público, scp a om, systemd
|
||||
> en el VPS) la ejecuta el humano en coordinación, no el agente autónomo — es una
|
||||
> acción outward sobre infraestructura pública. El agente entrega 0001a–0001e
|
||||
> (código + tests + CA/cert generados) en master de unibus, listos para desplegar.
|
||||
|
||||
# Tests (DoD: golden + edge + error path, evidencia ejecutable)
|
||||
|
||||
- **Golden:** peer autorizado crea room, publica y recibe por el bus con auth+TLS
|
||||
activos.
|
||||
- **Edge:** revocar un usuario activo → su próxima conexión NATS y su próximo
|
||||
request HTTP son rechazados sin reiniciar el server.
|
||||
- **Error path:** request con firma válida pero identidad no registrada → 401;
|
||||
conexión NATS con nkey no autorizado → rechazada; cliente sin la CA → fallo de
|
||||
handshake TLS; replay de un request firmado → rechazado.
|
||||
- Suite completa `CGO_ENABLED=0 go test ./...` verde.
|
||||
|
||||
# Riesgos y mitigaciones
|
||||
|
||||
| Riesgo | Mitigación |
|
||||
|---|---|
|
||||
| Chicken-egg del primer admin | seed por CLI local en el host (confianza de shell) |
|
||||
| Romper clientes en vuelo al activar auth | flag `bus-auth` off→soft→enforce; migrar clientes en soft |
|
||||
| Rotación/caducidad de certs | CA propia de larga vida; documentar regeneración del server cert en `deploy/tls/README.md` |
|
||||
| Coste de verificar firma por request | Ed25519 verify ≈ µs; despreciable frente a la latencia de red |
|
||||
| Conversión Ed25519 → nkey mal hecha | test dedicado de ida y vuelta firma/verify nkey antes de tocar el server |
|
||||
| Claves privadas filtradas en git | CA key / server key gitignored; distribución por `pass`/scp; solo `ca.crt` versionado |
|
||||
|
||||
# Fuera de alcance (futuro)
|
||||
|
||||
- Rotación automática de credenciales de usuario.
|
||||
- Cuentas/multi-tenant de NATS (un solo account basta hoy).
|
||||
- Federación entre buses.
|
||||
@@ -0,0 +1,146 @@
|
||||
---
|
||||
issue: 0002
|
||||
title: Media v2 — archivos grandes (chunking), metadata, GC del object store, exponer en clientes
|
||||
status: spec
|
||||
created: 2026-06-07
|
||||
domain: media
|
||||
scope: unibus (pkg/blobstore, pkg/frame, pkg/client, pkg/membership) + clientes (mobile binding, gateway web, unibots)
|
||||
depends_on: 0001 (la auth firmada del control plane debe cubrir /blobs antes de exponer media)
|
||||
---
|
||||
|
||||
# Objetivo
|
||||
|
||||
El envío de archivos (imágenes, audio, vídeo) ya funciona en v1, pero con límites
|
||||
que lo hacen inviable para vídeo grande y poco usable para los clientes. Este issue
|
||||
lleva la media a un estado de producción: archivos grandes por chunks, metadata de
|
||||
tipo/nombre, recolección de basura del object store, y exposición en los frontends.
|
||||
|
||||
# Contexto — cómo funciona media v1 (hoy)
|
||||
|
||||
`PublishMedia(roomID, data []byte)` cifra el archivo **entero** con la clave de la
|
||||
room (`SealAEAD`), lo sube **entero** al object store (`pkg/blobstore`,
|
||||
content-addressed por hash) vía el control plane (`POST /blobs`), y publica por el
|
||||
bus solo una referencia `frame.BlobRef{Hash, Nonce, Size}`. `FetchMedia` baja el
|
||||
ciphertext por hash (`GET /blobs/{hash}`) y lo descifra. El binario nunca viaja por
|
||||
NATS; el bus solo lleva la referencia. El object store guarda solo ciphertext (E2E
|
||||
real). Es correcto y simple, pero:
|
||||
|
||||
| Limitación v1 | Consecuencia |
|
||||
|---|---|
|
||||
| Todo el archivo en RAM (cifra y sube de una vez) | imágenes/audio OK; vídeo grande (cientos MB/GB) revienta memoria |
|
||||
| `BlobRef` solo lleva hash+nonce+size | el receptor no sabe mimetype/filename; no puede renderizar bien |
|
||||
| Sin subida reanudable | si falla la subida de un archivo grande, hay que reempezar de cero |
|
||||
| Object store sin GC | blobs content-addressed crecen indefinidamente, sin refcount ni TTL |
|
||||
| `mobile/` solo expone `Publish` (texto) | no se puede enviar una foto desde el móvil |
|
||||
| Gateway web sin endpoints de media | la SPA no sube/baja archivos |
|
||||
|
||||
Fuera de alcance de este issue (sería otro): **streaming en vivo** (videollamada,
|
||||
audio en tiempo real) — eso no es modelo blob, requiere WebRTC señalizado por el bus.
|
||||
|
||||
# Diseño
|
||||
|
||||
## Pieza 1 — Chunking de archivos grandes
|
||||
|
||||
Partir el archivo en chunks de tamaño fijo (propuesta: 4 MB), cifrar **cada chunk**
|
||||
de forma independiente con la clave de la room (nonce por chunk), y subir cada chunk
|
||||
como un blob propio (content-addressed). La referencia pasa de un solo blob a un
|
||||
manifiesto de chunks.
|
||||
|
||||
- `frame.BlobRef` evoluciona (de forma compatible) a soportar lista de chunks:
|
||||
```
|
||||
BlobRef{
|
||||
Hash string // hash del manifiesto (o del blob único si no hay chunks)
|
||||
Nonce []byte // nonce del manifiesto / del blob único
|
||||
Size int64 // tamaño total en claro
|
||||
Chunks []ChunkRef // vacío en archivos pequeños (camino v1 intacto)
|
||||
}
|
||||
ChunkRef{ Hash string; Nonce []byte; Size int64 } // por chunk cifrado
|
||||
```
|
||||
- `PublishMediaStream(roomID string, r io.Reader, meta MediaMeta) (BlobRef, error)`:
|
||||
lee del `io.Reader` en chunks (no carga el archivo entero en RAM), cifra y sube
|
||||
cada chunk, y construye el manifiesto. El `PublishMedia([]byte)` v1 se mantiene
|
||||
como atajo para archivos pequeños (sin chunks).
|
||||
- `FetchMediaStream(roomID, BlobRef) (io.ReadCloser, error)`: baja y descifra chunks
|
||||
bajo demanda, exponiendo un `io.Reader` (descarga progresiva, no todo en RAM).
|
||||
- Subida/descarga de chunks en paralelo acotado (p. ej. 4 a la vez) para throughput.
|
||||
|
||||
## Pieza 2 — Metadata (mimetype + filename)
|
||||
|
||||
Añadir a `BlobRef` (o a un sidecar cifrado) los campos `Mime string` y `Name
|
||||
string`, de modo que el receptor sepa renderizar (imagen inline, reproductor de
|
||||
audio/vídeo, icono de descarga). Como `Name`/`Mime` pueden ser sensibles, viajan
|
||||
**dentro del campo cifrado** del frame, no en claro. Detección de mimetype por
|
||||
sniffing del primer chunk + extensión.
|
||||
|
||||
## Pieza 3 — Garbage collection del object store
|
||||
|
||||
Hoy los blobs no se borran nunca. Introducir refcount o barrido:
|
||||
|
||||
- **Refcount por referencia**: una tabla `blob_refs(hash, room_id, msg_id)` en el
|
||||
control plane; al expirar un mensaje de una room efímera o al purgar historial de
|
||||
una room persistente, decrementar y borrar el blob cuando llega a cero.
|
||||
- **Alternativa TTL**: blobs de rooms efímeras con TTL; blobs de rooms persistentes
|
||||
viven mientras viva el mensaje en JetStream.
|
||||
- Comando `membershipd blobs gc [--dry-run]` para barrido manual + métrica de
|
||||
espacio. Debe ser idempotente y seguro (nunca borrar un blob aún referenciado).
|
||||
|
||||
## Pieza 4 — Exponer media en los clientes
|
||||
|
||||
- **Binding móvil** (`mobile/unibus.go`): `SendFile(roomID, path, mime)` y
|
||||
`FetchFile(roomID, frameJSON) -> path` (escribe a un archivo local del sandbox de
|
||||
la app y devuelve la ruta; no pasa []byte grandes por el puente gomobile).
|
||||
- **Gateway web** (`playground/server.go`): `POST /api/media` (multipart, streaming
|
||||
al store) y `GET /api/media/{room}/{hash}` (descarga descifrada con los headers
|
||||
`Content-Type`/`Content-Disposition` derivados de la metadata).
|
||||
- **unibots**: una tool `send_file` para que un bot pueda adjuntar archivos.
|
||||
|
||||
# Decisiones técnicas
|
||||
|
||||
| Decisión | Elegido | Alternativa | Razón |
|
||||
|---|---|---|---|
|
||||
| Tamaño de chunk | 4 MB | 1 MB / 16 MB | equilibrio RAM vs overhead de manifiesto |
|
||||
| Cifrado por chunk | nonce independiente por chunk, misma clave de room | re-cifrar todo | permite descarga/borrado parcial y paralelismo |
|
||||
| Metadata sensible | dentro del frame cifrado | en claro en BlobRef | filename/mime pueden filtrar info |
|
||||
| GC | refcount en control plane | solo TTL | preciso, no borra lo aún referenciado |
|
||||
| Compatibilidad v1 | `Chunks` vacío = camino v1 | romper formato | no romper media ya enviada |
|
||||
|
||||
# Fases (TBD, ramas `issue/0002x-*`)
|
||||
|
||||
1. **0002a — BlobRef con chunks (compatible)** — extender el tipo + tests de
|
||||
marshalling con `Chunks` vacío (v1) y con chunks (v2). Sin cambiar clientes aún.
|
||||
2. **0002b — PublishMediaStream / FetchMediaStream** — API de streaming en
|
||||
`pkg/client` sobre `io.Reader`/`io.ReadCloser`, cifrado por chunk, subida/descarga
|
||||
paralela acotada. Tests con un archivo > tamaño de chunk.
|
||||
3. **0002c — metadata mime+name** (en el campo cifrado) + sniffing.
|
||||
4. **0002d — GC del object store** — refcount + `membershipd blobs gc` + tests de
|
||||
"no borrar referenciado / borrar huérfano".
|
||||
5. **0002e — exponer en clientes** — binding móvil (`SendFile`/`FetchFile`), gateway
|
||||
web (`/api/media`), tool `send_file` en unibots.
|
||||
|
||||
# Definition of Done (evidencia ejecutable)
|
||||
|
||||
- **Golden:** enviar y recibir una imagen pequeña (camino v1, sin chunks) sigue
|
||||
funcionando; enviar y recibir un archivo de 50 MB por chunks sin cargar 50 MB en
|
||||
RAM (medir RSS durante la operación).
|
||||
- **Edge:** archivo cuyo tamaño es múltiplo exacto del chunk; archivo de 1 byte;
|
||||
archivo justo por debajo y por encima del umbral de chunking.
|
||||
- **Error path:** chunk corrupto/no descifrable → error claro, no panic; `blobs gc`
|
||||
con un blob aún referenciado → NO lo borra (assert).
|
||||
- `CGO_ENABLED=0 go test ./...` verde.
|
||||
|
||||
# Riesgos y mitigaciones
|
||||
|
||||
| Riesgo | Mitigación |
|
||||
|---|---|
|
||||
| Romper media v1 ya enviada | `Chunks` vacío preserva el camino v1; tests de compatibilidad |
|
||||
| GC borra un blob aún referenciado | refcount + barrido conservador + `--dry-run` por defecto en CI |
|
||||
| Puente gomobile con []byte grandes | el binding trabaja con rutas de archivo, no buffers en memoria |
|
||||
| Paralelismo de chunks satura el control plane | límite de concurrencia (4) + el endurecimiento de auth del issue 0001 |
|
||||
|
||||
# Relación con otros issues
|
||||
|
||||
- **0001 (seguridad)** — prerrequisito: la auth firmada del control plane debe cubrir
|
||||
`POST/GET /blobs` antes de exponer media públicamente; si no, cualquiera llena el
|
||||
store o descarga ciphertext ajeno.
|
||||
- **Streaming en vivo** (futuro, no este issue) — videollamada/audio en tiempo real =
|
||||
WebRTC con el bus como canal de señalización; modelo distinto al blob.
|
||||
@@ -0,0 +1,195 @@
|
||||
---
|
||||
issue: 0003
|
||||
title: Descentralización / alta disponibilidad — cluster NATS + JetStream replicado + control plane sin SPOF
|
||||
status: spec
|
||||
created: 2026-06-07
|
||||
domain: infra
|
||||
scope: unibus (pkg/embeddednats, pkg/membership, pkg/blobstore, pkg/client, cmd/membershipd) + despliegue multi-nodo
|
||||
depends_on: 0001 (la auth de cluster y de clientes va junto con el endurecimiento)
|
||||
---
|
||||
|
||||
# Objetivo
|
||||
|
||||
Que la caída de un servidor **no deje el bus sin servicio**. Hoy unibus es un único
|
||||
`membershipd` (con NATS embebido + SQLite local): si ese host muere, no hay bus.
|
||||
Este issue lleva unibus a un modelo **descentralizado / alta disponibilidad** usando
|
||||
las capacidades nativas de NATS: cluster multi-nodo, JetStream replicado (RAFT), y
|
||||
el estado del control plane fuera de la SQLite local. **No es federación**
|
||||
(multi-operador con dominios distintos); es eliminar el punto único de fallo dentro
|
||||
de un único dominio administrativo controlado por nosotros.
|
||||
|
||||
# Requisito clave de quorum (decisión de infraestructura)
|
||||
|
||||
JetStream replica con RAFT, que necesita **mayoría (quorum)** para confirmar
|
||||
escrituras. Las consecuencias son duras y hay que asumirlas desde el diseño:
|
||||
|
||||
| Nodos | Réplica | Tolera caída de | Nota |
|
||||
|---|---|---|---|
|
||||
| 1 | R1 | 0 | situación actual (SPOF) |
|
||||
| 2 | R2 | **0** | si cae uno se pierde quorum: las escrituras se bloquean. NO sirve para HA |
|
||||
| **3** | **R3** | **1** | mínimo real para "si un server cae, seguimos" |
|
||||
| 5 | R5 | 2 | mayor tolerancia |
|
||||
|
||||
**Por tanto el objetivo del usuario ("si mi server falla, no nos quedamos sin
|
||||
servicio") exige 3 nodos JetStream.** Servers disponibles hoy: **magnus** y
|
||||
**homer** (ambos VPS OVH). El tercero está pendiente de conseguir.
|
||||
|
||||
| Nodo | IP pública | Estado | Notas |
|
||||
|---|---|---|---|
|
||||
| magnus | (en pass: `MAGNUS_ovh_ssh_ROOT`) | disponible, **cargado** | corre coolify, minio, postgres, authentik, portainer, dagu — revisar recursos antes |
|
||||
| homer | `141.94.69.66` | disponible, vivo | creds en pass (`vps_ovhcloud_SSH_SERVER_HOMER_-_root`, `vps_SSH_SERVER_HOMER_dataherrero`); tenía coolify |
|
||||
| nodo 3 | — | **pendiente** | conseguir un tercer VPS siempre-on, o reusar om/datardos si se liberan |
|
||||
|
||||
Preparación previa al deploy de cada nodo: alta del alias SSH + clave, integración en
|
||||
la WireGuard, y revisar/aligerar la carga existente (coolify, etc.).
|
||||
|
||||
## Rollout R1 → R3: funcionar con 2 nodos hoy, HA con 3 mañana
|
||||
|
||||
No se "desactiva el quorum"; se controla el **número de réplicas** de cada stream/KV:
|
||||
|
||||
| Réplicas | Quorum | Tolera | Sirve con |
|
||||
|---|---|---|---|
|
||||
| R1 | ninguno (1 copia) | 0 caídas | 1-2 nodos, sin bloqueo |
|
||||
| R3 | 2 de 3 | 1 caída | 3 nodos |
|
||||
|
||||
- **Fase actual (magnus + homer):** desplegar con streams/KV en **R1** (flag
|
||||
`decentralized: off`). El bus funciona al 100% para operar, sin tolerancia a fallo
|
||||
todavía. Opción: streams en **R2** para duplicar los datos en ambos nodos
|
||||
(durabilidad/backup vivo), asumiendo que la escritura necesita los dos hasta el 3er
|
||||
nodo.
|
||||
- **Cuando entre el nodo 3:** escalar en caliente `nats stream update --replicas 3`
|
||||
(ídem KV/Object Store) + añadir el nodo al cluster + flag `decentralized: on`. **HA
|
||||
real, sin downtime, sin reescritura, sin migrar datos.**
|
||||
- **Aviso de 2 nodos:** NO montar el meta-group de JetStream con 2 nodos como si
|
||||
fuera HA — su quorum es 2, y la caída de uno bloquea la gestión de streams. Con 2
|
||||
servers, modelo recomendado: **magnus principal (R1) + homer 2º nodo/réplica**, y
|
||||
escalar a R3 al tener el tercero.
|
||||
|
||||
Mientras solo haya 2 nodos: el **data plane efímero** (core-NATS, rooms `ModeNATS`)
|
||||
sí tolera la caída de uno (los clientes reconectan al otro), pero las **rooms
|
||||
persistentes y el control plane** (que necesitan quorum) no. El issue se despliega
|
||||
de verdad cuando haya 3 nodos.
|
||||
|
||||
# Contexto — por qué hoy es un SPOF
|
||||
|
||||
- `pkg/embeddednats` arranca un NATS **standalone** (sin cluster).
|
||||
- `pkg/membership` guarda rooms/members/room_keys/users en una **SQLite local** al
|
||||
proceso.
|
||||
- `pkg/blobstore` guarda los blobs en el **disco local** del proceso.
|
||||
- El cliente (`pkg/client`) conecta a **una** URL de NATS y **una** de control plane.
|
||||
|
||||
Todo vive en un host. Ese host es el punto único de fallo.
|
||||
|
||||
# Diseño
|
||||
|
||||
## Pieza 1 — Cluster NATS (data plane replicado)
|
||||
|
||||
`pkg/embeddednats` gana opciones de cluster: `server.Options.Cluster` (nombre +
|
||||
host/puerto de routes) y `Routes` (los otros nodos). Cada `membershipd` arranca su
|
||||
NATS embebido en cluster con los demás. JetStream se habilita con `Replicas: 3` en
|
||||
streams y KV. Auth entre nodos (routes) con credenciales propias (no las de
|
||||
clientes), y TLS también en las routes (reusa la CA del issue 0001).
|
||||
|
||||
## Pieza 2 — Control plane sin estado local (SQLite → JetStream KV)
|
||||
|
||||
Es el corazón del issue. Hoy `pkg/membership.Store` es SQLite. Se introduce, por
|
||||
**branch-by-abstraction**, una interfaz `Store` con dos implementaciones:
|
||||
|
||||
- `sqliteStore` — la actual (sigue siendo el default mientras el flag está off; útil
|
||||
para un solo nodo / desarrollo).
|
||||
- `jetstreamStore` — nueva: rooms, members, room_keys y users (la tabla del issue
|
||||
0001) viven en **JetStream KV** (buckets replicados R3). Cualquier nodo lee/escribe
|
||||
el mismo estado; RAFT garantiza consistencia. El HTTP control plane pasa a ser
|
||||
efectivamente **stateless**: cualquier `membershipd` sirve cualquier request
|
||||
porque el estado está en el KV replicado.
|
||||
|
||||
Flag `decentralized` (off → on). Migración inicial de datos SQLite → KV con un
|
||||
comando `membershipd migrate-to-kv` (idempotente). Las claves de room siguen
|
||||
selladas igual; solo cambia **dónde se guardan**, no el cifrado.
|
||||
|
||||
## Pieza 3 — Blobs replicados (object store → NATS Object Store)
|
||||
|
||||
`pkg/blobstore` gana una implementación sobre **NATS Object Store** (encima de
|
||||
JetStream, replicado R3) además de la de disco local. Los blobs (ya ciphertext, E2E)
|
||||
quedan disponibles desde cualquier nodo. Encaja con el GC del issue 0002.
|
||||
|
||||
## Pieza 4 — Cliente con failover
|
||||
|
||||
`pkg/client`: aceptar **lista** de seeds de NATS y **lista** de URLs de control
|
||||
plane. `nats.go` ya hace reconnect/failover entre servidores del cluster nativamente
|
||||
(`nats.Options.Servers` o una lista de URLs separada por comas en `nats.Connect`, `nats.MaxReconnects(-1)`). El control plane HTTP se prueba en
|
||||
orden con reintento. Así, si un nodo cae, el cliente reconecta a otro de forma
|
||||
transparente.
|
||||
|
||||
## Pieza 5 — Despliegue multi-nodo
|
||||
|
||||
3 nodos `membershipd`, cada uno con su NATS embebido en cluster, JetStream R3, mismo
|
||||
`ca.crt`/credenciales de routes. systemd en cada VPS. Los clientes reciben la lista
|
||||
de los 3 endpoints. Health/observabilidad por nodo (`/healthz` + métricas de
|
||||
JetStream: líder RAFT, lag de réplica).
|
||||
|
||||
# Decisiones técnicas
|
||||
|
||||
| Decisión | Elegido | Alternativa | Razón |
|
||||
|---|---|---|---|
|
||||
| Nº de nodos de quorum | 3 (R3) | 2 (R2) | 2 no tolera caída de uno; 3 es el mínimo real de HA |
|
||||
| Estado del control plane | JetStream KV replicado | SQLite replicada a mano / Postgres externo | KV ya viene con NATS, mismo RAFT que JetStream, cero infra extra |
|
||||
| Migración del store | branch-by-abstraction (interfaz `Store`, dos impls, flag) | reescritura directa | master nunca se rompe; sqlite sigue para 1 nodo/dev |
|
||||
| Blobs | NATS Object Store | disco compartido / S3 | replicado nativamente, sin dependencia externa |
|
||||
| Failover de cliente | lista de seeds + reconnect nativo nats.go | balanceador externo | menos infra, nats.go ya lo hace |
|
||||
| Federación multi-operador | **fuera de alcance** | — | no es el objetivo; es otra liga (trust entre dominios) |
|
||||
|
||||
# Fases (TBD, ramas `issue/0003x-*`)
|
||||
|
||||
1. **0003a — cluster NATS** — opciones de cluster/routes + TLS de routes en
|
||||
`pkg/embeddednats`; arrancar 2-3 nodos locales en tests e2e y verificar que un
|
||||
subject publicado en uno llega a un suscriptor en otro.
|
||||
2. **0003b — interfaz Store + jetstreamStore (KV)** — abstraer `pkg/membership.Store`;
|
||||
implementar rooms/members/room_keys/users sobre JetStream KV R3; tests de
|
||||
consistencia. Flag `decentralized: off`.
|
||||
3. **0003c — migrate-to-kv** — comando idempotente SQLite → KV + test de paridad
|
||||
(mismo estado antes/después).
|
||||
4. **0003d — blobs en Object Store** — impl `pkg/blobstore` sobre NATS Object Store
|
||||
replicado.
|
||||
5. **0003e — cliente failover** — lista de seeds + lista de ctrl-urls + reconnect;
|
||||
test que mata el nodo al que está conectado y verifica que sigue operando.
|
||||
6. **0003f — despliegue 3 nodos** (humano) — 3 VPS en cluster, JetStream R3, flag
|
||||
`decentralized: on`. Chaos test real: matar un nodo en producción y comprobar que
|
||||
el servicio sigue.
|
||||
|
||||
# Definition of Done (evidencia ejecutable)
|
||||
|
||||
- **Golden:** 3 nodos en cluster; un cliente publica en un nodo y otro cliente
|
||||
suscrito a otro nodo lo recibe; crear room + invitar funciona desde cualquier nodo.
|
||||
- **Edge:** un cliente conectado al nodo A; se **mata el nodo A**; el cliente
|
||||
reconecta a B automáticamente y sigue publicando/recibiendo sin perder la sesión.
|
||||
- **Error path (chaos):** matar 1 de 3 nodos → el control plane sigue aceptando
|
||||
escrituras (quorum 2/3); matar 2 de 3 → las escrituras se bloquean (quorum perdido,
|
||||
comportamiento esperado y documentado, no corrupción).
|
||||
- `CGO_ENABLED=0 go test ./...` verde, incluido un test e2e multi-nodo en proceso.
|
||||
|
||||
# Riesgos y mitigaciones
|
||||
|
||||
| Riesgo | Mitigación |
|
||||
|---|---|
|
||||
| Solo 2 nodos disponibles → sin quorum real | prerrequisito explícito de 3 nodos antes de 0003f; hasta entonces, el despliegue queda en standalone |
|
||||
| Latencia inter-VPS afecta RAFT | nodos en la misma región o con buena red; medir; R3 tolera latencias moderadas |
|
||||
| Migración SQLite→KV pierde datos | comando idempotente + test de paridad + backup de la SQLite antes |
|
||||
| Partición de red (split-brain) | RAFT lo previene: el lado sin quorum se bloquea para escritura, no diverge |
|
||||
| Complejidad operativa de 3 nodos | observabilidad de JetStream (líder, lag) + `/healthz` por nodo + runbook en deploy/ |
|
||||
|
||||
# Orden recomendado respecto a otros issues
|
||||
|
||||
1. **0001 (seguridad)** primero: la auth de clientes (nkey) y la CA/TLS se reutilizan
|
||||
para las routes del cluster. Desplegar descentralizado sin auth sería abrir varios
|
||||
puntos públicos sin protección.
|
||||
2. **0003 (este)** después: una vez el bus es seguro, replicarlo en 3 nodos.
|
||||
3. **0002 (media v2)** es ortogonal; su object store encaja con la pieza 3 (blobs
|
||||
replicados) cuando ambos estén.
|
||||
|
||||
# Fuera de alcance
|
||||
|
||||
- Federación entre operadores/dominios distintos (otra liga; requiere protocolo de
|
||||
trust entre dominios).
|
||||
- Multi-tenant / accounts de NATS por organización.
|
||||
- Auto-escalado dinámico de nodos.
|
||||
@@ -0,0 +1,146 @@
|
||||
---
|
||||
issue: 0004
|
||||
title: Hardening de seguridad — autorización, anti-DoS y confidencialidad antes de exponer público
|
||||
status: done
|
||||
created: 2026-06-07
|
||||
completed: 2026-06-07
|
||||
report: projects/message_bus/reports/0005-2026-06-07-unibus-security-hardening.md
|
||||
domain: security
|
||||
scope: unibus (pkg/membership/server.go, auth.go, pkg/embeddednats, pkg/client, cmd/membershipd, deploy/tls)
|
||||
depends_on: 0001 (cierra los gaps que la auditoría 0004 encontró sobre lo entregado en 0001)
|
||||
blocks: 0001f (deploy público) y 0003f (deploy descentralizado)
|
||||
source: projects/message_bus/reports/0004-2026-06-07-unibus-security-audit.md
|
||||
---
|
||||
|
||||
# Objetivo
|
||||
|
||||
La auditoría red-team (report 0004) concluyó: la **autenticación** del bus es sólida,
|
||||
pero faltan **autorización, disponibilidad y confidencialidad de metadata** — justo lo
|
||||
que un bus *público* necesita. Veredicto: **NO exponer público hoy**. Este issue cierra
|
||||
los hallazgos bloqueantes (1 crítico + 4 altos) y los medios relevantes, de modo que el
|
||||
deploy 0001f (público) y luego 0003 (descentralizado) sean seguros.
|
||||
|
||||
Cada fase corresponde a un hallazgo del report 0004. La **DoD de cada fase es portar el
|
||||
test adversarial del auditor** (`TestAudit_*`) y verificar que ahora arroja el resultado
|
||||
SEGURO (donde antes el ataque pasaba, ahora se rechaza).
|
||||
|
||||
# Fases (TBD, ramas `issue/0004x-*`, una por hallazgo)
|
||||
|
||||
## 0004a — H1 (Crítico): límite de cuerpo + anti-DoS pre-auth
|
||||
|
||||
**Problema:** `Server.ServeHTTP` hace `io.ReadAll(r.Body)` **sin límite y antes** de
|
||||
`authenticate()`; `handlePutBlob` repite el `io.ReadAll` sin límite. 400 MB sin
|
||||
credenciales → 898 MB RSS → OOM con pocas conexiones.
|
||||
|
||||
**Fix:**
|
||||
- `http.MaxBytesReader` en el middleware **antes** del `io.ReadAll` (límite control plane,
|
||||
p.ej. 1 MB).
|
||||
- Límite separado y mayor para `/blobs`, con rechazo temprano por `Content-Length` antes
|
||||
de bufferizar; idealmente stream a disco en vez de RAM.
|
||||
- `Server.MaxHeaderBytes` ajustado.
|
||||
- Rate-limit por IP (y por identidad tras auth). Reusar/crear una función del registry si
|
||||
aplica (delegar a `fn-constructor` si es genérica).
|
||||
|
||||
**DoD:** test que envía un cuerpo > límite sin firma → `413`/`401` **sin** que el RSS se
|
||||
dispare (medir `/proc/self/status` antes/después, delta acotado). Golden (cuerpo normal
|
||||
pasa) + edge (justo en el límite) + error (excede → rechazo barato).
|
||||
|
||||
## 0004b — H2 (Alto): cerrar el fail-open de configuración
|
||||
|
||||
**Problema:** default `--bus-auth off`; el nkey de NATS solo se activa en `enforce`; TLS
|
||||
es flag independiente. `--bind 0.0.0.0 --tls-cert …` **sin** `--bus-auth enforce` deja el
|
||||
bus abierto con apariencia de seguro.
|
||||
|
||||
**Fix:**
|
||||
- Si `--bind` no es loopback ⇒ exigir `--bus-auth enforce` (si no, `log.Fatal` con mensaje
|
||||
claro).
|
||||
- `--tls-cert`/`--tls-key` sin `--bus-auth enforce` ⇒ error de arranque.
|
||||
- Arranque inseguro imposible o, como mínimo, ruidoso y rechazado.
|
||||
|
||||
**DoD:** portar `TestAudit_FailOpenTLSWithoutAuth` → ahora el arranque público-sin-enforce
|
||||
falla; cliente no registrado NO conecta. Golden (bind loopback dev sigue permitido) + error
|
||||
(bind público sin enforce aborta).
|
||||
|
||||
## 0004c — H3 (Alto): autorización por pertenencia en el control plane
|
||||
|
||||
**Problema:** "autorizado" = "registrado", no "miembro". Los GET de room no comprueban
|
||||
pertenencia: `/rooms/{id}`, `/rooms/{id}/members` (expone `sign_pub`+`kex_pub` de todos),
|
||||
`/members/{endpoint}/rooms`, y `/rooms/{id}/key?endpoint=X` (devuelve la `sealed_key` ajena).
|
||||
|
||||
**Fix:**
|
||||
- Cada handler de room consulta `members` y exige que el firmante (`X-Unibus-Pub` →
|
||||
endpoint) sea miembro.
|
||||
- `/rooms/{id}/key` solo sirve la clave sellada **para el propio firmante** (`endpoint ==
|
||||
signer`), nunca la de un tercero.
|
||||
- `/members/{endpoint}/rooms` solo si `endpoint == signer`.
|
||||
- No exponer la member-list completa a no-miembros.
|
||||
|
||||
**DoD:** portar `TestAudit_HorizontalMetadataLeak` → bob (no miembro) ahora recibe `403`
|
||||
en todos. Golden (miembro legítimo accede) + edge (owner accede) + error (no-miembro 403).
|
||||
|
||||
## 0004d — H4 (Alto): control de acceso en el data plane NATS
|
||||
|
||||
**Problema:** el authenticator nkey solo decide "registrado sí/no"; no hay permisos por
|
||||
subject. Cualquier registrado se suscribe/publica en cualquier subject; las rooms
|
||||
`ModeNATS` (cleartext) quedan expuestas entre usuarios.
|
||||
|
||||
**Fix (elegir y documentar la estrategia):**
|
||||
- Preferente: NATS `Permissions` por identidad (subjects que el usuario puede sub/pub),
|
||||
derivadas de su pertenencia a rooms; o
|
||||
- Subjects impredecibles (no derivables del nombre) + verificación de pertenencia
|
||||
server-side; o
|
||||
- Prohibir `ModeNATS` en despliegue público (forzar siempre E2E) como mínimo defensivo.
|
||||
|
||||
**DoD:** portar `TestAudit_NoSubjectACL` → eve (no invitada) ya NO recibe el mensaje de la
|
||||
room ajena. Documentar la estrategia elegida y su límite.
|
||||
|
||||
## 0004e — H5 (Alto, público): TLS en el control plane
|
||||
|
||||
**Problema:** HTTP `:8470` firmado pero **sin TLS** → metadata (subjects, endpoints,
|
||||
pubkeys, sealed keys, hashes de blobs, grafo social) legible por un MITM en la red pública.
|
||||
|
||||
**Fix:**
|
||||
- Servir el control plane sobre TLS con la misma CA propia (o documentar un reverse-proxy
|
||||
TLS delante).
|
||||
- El cliente exige `https` cuando se le pasa una CA (`client.Connect(caPath)` ⇒ control
|
||||
plane también TLS).
|
||||
|
||||
**DoD:** cliente contra control plane `https` con la CA → OK; contra `http` con CA esperada
|
||||
→ rechaza; un observador no ve la metadata (argumentado + test de esquema).
|
||||
|
||||
## 0004f — medios: owner binding, nonce-cache, error leak
|
||||
|
||||
- **H6** `handleCreateRoom`: exigir `Owner.Endpoint == frame.EndpointID(X-Unibus-Pub)` y
|
||||
`Owner.SignPub == pub`. (Portar `TestAudit_OwnerSpoof` → ahora 403.)
|
||||
- **H7** mover `IsAuthorized` **antes** de tocar el `nonceCache` (no cachear nonces de
|
||||
no-autorizados); poda por expiry-bucket/heap en vez de O(n) bajo mutex global; cap de
|
||||
tamaño. (Portar `TestAudit_NonceCachePoisonPreAuth`.) **Nota:** este fix es prerequisito
|
||||
del cambio a nonce-cache replicado del issue 0003.
|
||||
- **H12** mensajes de error genéricos al cliente; detalle solo al log (no filtrar rutas/SQL).
|
||||
|
||||
# Fuera de alcance de este issue (encolado en otros)
|
||||
|
||||
- **H9** (cuota/GC de blobs) → issue 0002 (media v2) ya lo cubre.
|
||||
- **H10** (AEAD nonce 12B → XChaCha o rekey por volumen) → bajo, futuro; abrir issue propio
|
||||
si se necesitan rooms de muy alto volumen.
|
||||
- **H11** (firma de owner sin nonce/ts) → cubierto en la práctica por el envelope `enforce`;
|
||||
documentar la dependencia. Reforzar si se relaja `enforce`.
|
||||
- **H8** (custodia de la CA: generar en om, `ca.key` fuera del PC) → tarea operacional del
|
||||
deploy 0001f/0003f, no de código.
|
||||
- **govulncheck** sobre nats-server/nats.go/modernc → paso de CI aparte.
|
||||
|
||||
# Definition of Done global
|
||||
|
||||
- Las cuatro pruebas adversariales bloqueantes del report 0004 (DoS acotado, fail-open
|
||||
cerrado, fuga horizontal 403, ACL data plane) portadas como tests de regresión y en verde.
|
||||
- `CGO_ENABLED=0 go build ./...` + `go vet ./...` + `go test ./...` verdes.
|
||||
- Re-evaluación: tras el hardening, el veredicto de exposición pública pasa de "NO" a
|
||||
"sí-con-condiciones operacionales" (CA custodiada, Restart=always). Anotar en un report
|
||||
nuevo o como addendum al 0004.
|
||||
|
||||
# Orden respecto a otros issues
|
||||
|
||||
1. **0004 (este)** — primero: hace el bus seguro para exponer.
|
||||
2. **0003 (descentralización)** — después: absorbe el nonce-cache→KV replicado (apoyado en
|
||||
0004f-H7), la auth de routes del cluster y el guard de fail-open ×N nodos.
|
||||
3. **0002 (media v2)** — ortogonal; incluye la cuota/GC de blobs (H9).
|
||||
@@ -0,0 +1,132 @@
|
||||
---
|
||||
issue: 0005
|
||||
title: Hardening 2 — CVEs, spoof por firma omitida, DoS por concurrencia, TLS forzado (re-auditoría)
|
||||
status: done
|
||||
created: 2026-06-07
|
||||
completed: 2026-06-07
|
||||
domain: security
|
||||
scope: unibus (go.mod, pkg/client, pkg/membership/server.go, cmd/membershipd/config.go, pkg/embeddednats, pkg/blobstore)
|
||||
depends_on: 0001, 0004 (cierra los hallazgos NUEVOS de la re-auditoría sobre lo entregado)
|
||||
blocks: 0001f (deploy público) y 0003f (deploy descentralizado)
|
||||
source: projects/message_bus/reports/0006-2026-06-07-unibus-security-reaudit.md
|
||||
---
|
||||
|
||||
# Objetivo
|
||||
|
||||
La re-auditoría red-team (report 0006) confirmó que el hardening 0004 cerró H1–H7/H12,
|
||||
pero encontró **hallazgos nuevos** que mantienen el veredicto en **"NO exponer público
|
||||
aún"**. Este issue los cierra. La re-auditoría se hizo sobre el commit `618f6b6`
|
||||
(pre-0003); algunos hallazgos pueden haber cambiado con 0003 — **cada fase debe primero
|
||||
verificar si el hallazgo sigue vivo en el master actual** (post-0003, v0.6.0) antes de
|
||||
arreglarlo.
|
||||
|
||||
Estado verificado al crear este issue (master post-0003):
|
||||
- **N1 vivo**: `go.mod` sigue en `nats-server v2.10.22` y `go 1.25.0`.
|
||||
- **N3 vivo**: `pkg/client/client.go:802` tiene `if info.Policy.SignMsgs && f.Sig != nil` (el patrón vulnerable exacto).
|
||||
- **H4**: 0003 añadió `pkg/membership/acl.go` — hay que evaluar si cierra el wildcard `Subscribe(">")` o si falta la capa de NATS Permissions.
|
||||
- N2, N4: presumiblemente vivos (0003 no los tocó); verificar.
|
||||
|
||||
# Fases (TBD, ramas `issue/0005x-*`)
|
||||
|
||||
## 0005a — N1 (Alto): CVEs en dependencias
|
||||
|
||||
**Hallazgo:** `govulncheck ./...` → 16 vulnerabilidades alcanzables: 14 en
|
||||
`github.com/nats-io/nats-server/v2@v2.10.22` (servidor embebido, expuesto público en el
|
||||
deploy decidido) + 2 en la stdlib de Go (`net/textproto` GO-2026-5039, `crypto/x509`
|
||||
GO-2026-5037).
|
||||
|
||||
**Fix:**
|
||||
- `go get github.com/nats-io/nats-server/v2@v2.11.15` (o superior que cubra las 14).
|
||||
- Subir la toolchain a `go1.26.4` (cubre las 2 de stdlib); actualizar la directiva `go`
|
||||
en `go.mod` si procede.
|
||||
- Re-correr `govulncheck ./...` hasta **0 affected**.
|
||||
- **Nota:** este es un cambio de `go.mod`/`go.sum` justificado por CVE; documentarlo en el
|
||||
commit. Verificar que el bump de nats-server no rompe el cluster/JetStream de 0003
|
||||
(correr toda la suite, incluido el e2e multi-nodo).
|
||||
|
||||
**DoD:** `govulncheck ./...` → "No vulnerabilities found" (o solo no-alcanzables); suite
|
||||
completa verde tras el bump.
|
||||
|
||||
## 0005b — N3 (Alto): spoof por firma omitida en rooms firmadas
|
||||
|
||||
**Hallazgo:** `pkg/client/client.go::processFrame` verifica la firma **solo si el frame la
|
||||
trae**: `if info.Policy.SignMsgs && f.Sig != nil { verify }`. Un atacante con acceso al
|
||||
data plane publica un frame con `Sig==nil` y `Sender` forjado → el receptor lo acepta como
|
||||
auténtico en una room que EXIGE firma.
|
||||
|
||||
**Fix:** en una room `SignMsgs`, un frame sin firma debe **descartarse**:
|
||||
```go
|
||||
if info.Policy.SignMsgs {
|
||||
if f.Sig == nil { return } // exige firma; sin ella, descarta
|
||||
if !verify(...) { return }
|
||||
}
|
||||
```
|
||||
|
||||
**DoD:** portar `TestReaudit_SigNilSpoof` → ahora el frame `Sig==nil` con `Sender` forjado
|
||||
en una room `SignMsgs` se **descarta** (no se entrega al handler). Golden (frame firmado
|
||||
válido se entrega) + edge (room sin SignMsgs no se ve afectada) + error (Sig==nil en
|
||||
SignMsgs → drop).
|
||||
|
||||
## 0005c — N2 (Medio-Alto): DoS por concurrencia
|
||||
|
||||
**Hallazgo:** el límite por-request (16 MiB) + rate-limit per-IP NO acotan la memoria
|
||||
agregada. 40 subidas de 16 MiB simultáneas (= el burst per-IP) → 1.42 GB RSS. Multi-IP
|
||||
escala sin techo.
|
||||
|
||||
**Fix (elegir y documentar):**
|
||||
- Límite global de conexiones concurrentes y/o de bytes-en-vuelo (semáforo con cota de
|
||||
memoria total), y/o
|
||||
- Stream del blob a disco en vez de `io.ReadAll` en RAM (encaja con la cuota/GC del issue
|
||||
0002), y/o
|
||||
- Bajar `maxBlobBytes` y separar mejor el límite de control (1 MiB) del de blobs.
|
||||
|
||||
**DoD:** test que lanza N subidas concurrentes al techo y verifica que el RSS agregado
|
||||
queda **acotado** (mide `/proc/self/status`, cota declarada) en vez de crecer linealmente
|
||||
con N. Golden (concurrencia normal pasa) + edge (en la cota) + error (exceso → 429/503 sin
|
||||
OOM).
|
||||
|
||||
## 0005d — N4 (Medio): forzar TLS del control plane en bind público
|
||||
|
||||
**Hallazgo:** el guard `validateBootConfig` cierra "público sin enforce" y "TLS sin
|
||||
enforce", pero **permite** público + enforce **sin** `--tls-cert` → el control plane sirve
|
||||
HTTP plano públicamente (reaparece H5: metadata en claro).
|
||||
|
||||
**Fix:** el guard debe exigir `--tls-cert`/`--tls-key` cuando el bind no es loopback.
|
||||
`public + enforce + sin TLS` → `log.Fatal`.
|
||||
|
||||
**DoD:** portar `TestGap_PublicEnforceNoTLS` → ahora `validateBootConfig("0.0.0.0",
|
||||
enforce, "", "")` **rechaza**. Golden (público+enforce+TLS OK) + edge (loopback sin TLS
|
||||
sigue OK para dev) + error (público sin TLS aborta).
|
||||
|
||||
## 0005e — H4 (Medio, residual): evaluar y completar la ACL por subject
|
||||
|
||||
**Contexto:** 0003 añadió `pkg/membership/acl.go`. Primero **evaluar** con el ataque del
|
||||
report 0006 (`TestReaudit_H4_WildcardMetadataLeak`: un registrado no-miembro con
|
||||
`Subscribe(">")` raw capta subjects + advisories de JetStream de rooms ajenas) si ese
|
||||
acl.go ya lo cierra.
|
||||
- Si lo cierra → portar el test como regresión y documentar.
|
||||
- Si NO (probable: la ACL real necesita NATS `Permissions` por identidad a nivel del
|
||||
authenticator/cuenta, no solo lógica de membership en el control plane) → implementar las
|
||||
Permissions por identidad derivadas de pertenencia, o documentar el límite y el plan.
|
||||
|
||||
**DoD:** `TestReaudit_H4_WildcardMetadataLeak` → el no-miembro ya NO capta los subjects de
|
||||
rooms ajenas (o, si queda residual, está documentado con su límite exacto).
|
||||
|
||||
# Fuera de alcance (otros issues)
|
||||
|
||||
- **H9** (cuota/GC de blobs) → issue 0002; se solapa con 0005c (streaming a disco).
|
||||
- **H10** (AEAD nonce) / **H11** (nonce/ts en firma de owner) → bajo, futuro.
|
||||
- **H8** (custodia de la CA: generar en om) → operacional del deploy.
|
||||
- **Auditoría de la superficie nueva de 0003** (cluster routes auth, jetstreamStore KV
|
||||
fail-closed, nonce-cache replicado, failover) → el report 0006 NO la cubrió (auditó
|
||||
pre-0003). Pendiente una re-auditoría dedicada de 0003 (prompt ya preparado).
|
||||
|
||||
# Definition of Done global
|
||||
|
||||
- `govulncheck ./...` → 0 alcanzables.
|
||||
- Los tests adversariales de la re-auditoría (`TestReaudit_SigNilSpoof`,
|
||||
`TestGap_PublicEnforceNoTLS`, `TestReaudit_H4_WildcardMetadataLeak`, DoS-concurrencia)
|
||||
portados como regresión y en verde (o el residual documentado).
|
||||
- `CGO_ENABLED=0 go build ./... && go vet ./... && go test ./...` verdes (incluido el e2e
|
||||
multi-nodo de 0003, para confirmar que el bump de nats-server no lo rompió).
|
||||
- Re-evaluación: el veredicto de exposición pública pasa de "NO-aún" a "sí-con-condiciones".
|
||||
@@ -0,0 +1,160 @@
|
||||
---
|
||||
issue: 0006
|
||||
title: Completar y endurecer el cluster — wiring del control plane KV + N1-N6 de la auditoría 0008
|
||||
status: done
|
||||
created: 2026-06-07
|
||||
closed: 2026-06-07
|
||||
closed_by: fases 0006a–0006g (ver report 0009); unibus v0.8.0
|
||||
domain: security
|
||||
scope: unibus (cmd/membershipd, pkg/membership, pkg/embeddednats, pkg/busauth, pkg/client)
|
||||
depends_on: 0003 (completa su wiring), 0005 (hereda el bus single-node ya seguro)
|
||||
blocks: 0003f (deploy del cluster descentralizado)
|
||||
source: projects/message_bus/reports/0008-2026-06-07-unibus-decentralization-audit.md
|
||||
---
|
||||
|
||||
# Objetivo
|
||||
|
||||
La auditoría dedicada de la superficie de 0003 (report 0008) concluyó: **el bus en
|
||||
cluster NO es seguro para público** por dos bloqueantes, y además **0003 dejó el
|
||||
control plane descentralizado SIN cablear** (el binario sigue usando SQLite single-store;
|
||||
el flag `decentralized` existe pero ningún código Go lo lee). Como nodo único standalone
|
||||
unibus YA es seguro (report 0008 lo confirma); como cluster, no.
|
||||
|
||||
Este issue cierra los bloqueantes de seguridad del cluster Y completa el wiring que 0003
|
||||
dejó a medias, de modo que el deploy descentralizado (0003f) sea seguro. Cada fase
|
||||
reproduce el ataque del report 0008 (`TestAttack0008_*`) y verifica que ahora se rechaza.
|
||||
|
||||
# Fases (TBD, ramas `issue/0006x-*`)
|
||||
|
||||
## 0006a — N3 (BLOQUEANTE): cablear el nonce replicado en el binario
|
||||
|
||||
**Hallazgo (ALTA):** `membershipd` **nunca llama** `Server.UseReplicatedNonces`; cada nodo
|
||||
usa `memNonceCache` por-proceso. Un request firmado aceptado en el nodo A se **replaya con
|
||||
éxito en el nodo B** (200+200). La API (`kvNonceStore`) y el test
|
||||
(`TestReplicatedNonceRejectsCrossNodeReplay`) existen, pero el binario no los invoca.
|
||||
|
||||
**Fix:** en `cmd/membershipd/main.go`, cuando se arranca con `--cluster-name` (o siempre que
|
||||
haya JetStream disponible), llamar `srv.UseReplicatedNonces(js, replicas)` y **fail-fast** si
|
||||
el bucket `KV_UNIBUS_nonces` no se crea. Regla dura: `--cluster-name != ""` ⇒ nonce replicado
|
||||
**obligatorio** (no arrancar un nodo de cluster con nonce-cache local).
|
||||
|
||||
**DoD:** reproducir `TestAttack0008_N3` (2 nodos con el wiring exacto del binario) → el replay
|
||||
del nonce al nodo B ahora da **401**. Golden (request normal OK en cualquier nodo) + edge
|
||||
(single-node sin cluster sigue usando cache local OK) + error (replay cross-node → 401).
|
||||
|
||||
## 0006b — N2 (BLOQUEANTE): cerrar `$JS.API.>` / aislar el control plane KV
|
||||
|
||||
**Hallazgo (ALTA):** el grant ACL `clientInfraSubjects = {"_INBOX.>", "$JS.API.>"}`
|
||||
(`acl.go:20`) deja a cualquier peer registrado leer los buckets KV del control plane
|
||||
(`KV_UNIBUS_users/rooms/members/room_keys`) directo por NATS, saltándose `requireMember` y
|
||||
los chequeos del HTTP. Fuga del allowlist (handles+roles+claves), grafo de rooms y metadata
|
||||
de sealed-keys. (La ESCRITURA al KV ya está denegada — verificado; la fuga es de lectura.)
|
||||
|
||||
**Fix (elegir y documentar):**
|
||||
- Sustituir el grant amplio `$JS.API.>` por permisos JetStream **mínimos por-room** (solo la
|
||||
API del stream/consumer de las rooms del peer), y **denegar explícitamente** los streams
|
||||
`KV_UNIBUS_*` y `OBJ_*`; o
|
||||
- (Más robusto) aislar el control plane KV en una NATS **account separada**, inaccesible
|
||||
desde la account de clientes.
|
||||
|
||||
**DoD:** reproducir `TestAttack0008_N2` → eve (registrada, no miembro) ya **NO** puede leer
|
||||
los buckets KV (`Permissions Violation` o equivalente). La JetStream API legítima de las
|
||||
rooms del peer sigue funcionando.
|
||||
|
||||
## 0006c — wiring del control plane KV (completar 0003)
|
||||
|
||||
**Hallazgo (MEDIA / raíz):** el binario no activa el store descentralizado. `membership.Open`
|
||||
(SQLite) está hardcodeado en `main.go:90`; `OpenJetStream` solo lo usa `migrate-to-kv`.
|
||||
|
||||
**Fix:** leer el flag `decentralized` (o un `--store kv|sqlite`) y **seleccionar el store** en
|
||||
el arranque: SQLite (default, single-node/dev) o `jetstreamStore` (cluster). Resolver el
|
||||
"ciclo bootstrap" del authenticator interno (el authenticator necesita el store para
|
||||
`IsAuthorized`, y el store KV necesita el NATS arrancado). Mantener branch-by-abstraction:
|
||||
con el flag off, comportamiento idéntico al actual. `IsAuthorized`/lecturas sobre KV
|
||||
**fail-closed** ante pérdida de quorum/timeout (ya implementado en `jetstreamStore` —
|
||||
verificar que el wiring lo preserva).
|
||||
|
||||
**DoD:** con `decentralized: on` + cluster, el control plane sirve desde el KV replicado y un
|
||||
nodo nuevo ve las rooms creadas en otro (cierra la divergencia de estado que señala N5).
|
||||
Fail-closed: simular KV no disponible → deniega. Con flag off, suite idéntica al baseline.
|
||||
|
||||
## 0006d — N1 (ALTA): posture homogénea del cluster
|
||||
|
||||
**Hallazgo:** el cluster es tan seguro como su nodo más débil; un nodo sin authenticator o
|
||||
con `--bus-auth off` deja a un peer no autenticado hacer `Subscribe(">")` y cosechar el tráfico
|
||||
reenviado de los nodos con ACL.
|
||||
|
||||
**Fix:** garantizar (en arranque/health) que todos los nodos corren `enforce`+ACL+TLS;
|
||||
rechazar formar cluster con un peer en posture inferior, o como mínimo documentar y exponer
|
||||
un health que lo detecte. Nunca exponer el puerto de cliente de un nodo sin enforce.
|
||||
|
||||
**DoD:** reproducir `TestAttack0008_N1` escenario 2 (cluster con un nodo `withACL=false`) →
|
||||
el arranque/health lo rechaza o lo señala; documentar la garantía.
|
||||
|
||||
## 0006e — N4 (MEDIA): RefreshSession en los clientes
|
||||
|
||||
**Hallazgo:** la ACL congela permisos al conectar; un peer que crea/se une a una room debe
|
||||
llamar `client.RefreshSession()` para pub/sub en su subject. **Ningún cliente lo llama**
|
||||
(`cmd/chat`, `cmd/worker`, `mobile`, `gateway`). Es fail-closed (deniega), pero rompe la
|
||||
usabilidad bajo `enforce`+ACL → empuja al operador a desactivar la ACL (regresión de
|
||||
seguridad a discreción del operador).
|
||||
|
||||
**Fix:** llamar `RefreshSession` tras cambios de membresía en `cmd/chat`/`cmd/worker` (y
|
||||
documentar el contrato para `mobile`/`gateway`), o implementar refresh transparente (rehacer
|
||||
suscripciones automáticamente al unirse a una room).
|
||||
|
||||
**DoD:** test que crea/une room bajo enforce+ACL y publica/recibe SIN intervención manual
|
||||
(el cliente refresca solo o el demo llama RefreshSession). Documentar el requisito.
|
||||
|
||||
## 0006f — bajos: CA de routes, secreto de cluster, migrate-to-kv, R1≠HA
|
||||
|
||||
- **N1 (BAJA):** CA **separada** para las routes del cluster (no reusar la CA del data plane
|
||||
de clientes); pasar el secreto de cluster por **archivo/env**, no por `--routes
|
||||
nats://user:pass@host` en argv (hoy visible en `ps`/`journald`).
|
||||
- **N6 (BAJA):** `migrate-to-kv` solo en loopback o con TLS (hoy el allowlist viaja plaintext
|
||||
si `--nats-url` remoto sin `--ca`).
|
||||
- **N3-DoS (MEDIA, doc):** documentar que el nonce/control plane en **R1 es SPOF de auth** (su
|
||||
caída rechaza todos los requests autenticados); R3 (quorum 2/3) es la condición de HA real.
|
||||
No vender R1 como HA.
|
||||
|
||||
## 0006g — preparar el material de deploy del cluster (3 nodos)
|
||||
|
||||
Los tres nodos del cluster están decididos: **magnus + homer + datardos** (3 VPS OVH →
|
||||
quorum R3 real, tolera la caída de uno). Datos: homer `141.94.69.66`; datardos `ssh dd`
|
||||
`51.91.100.142` (WG `datardos-wg` 10.21.0.x); magnus en `pass` (`MAGNUS_ovh_ssh_ROOT`).
|
||||
|
||||
**Preparar (NO ejecutar en los VPS — eso es 0003f, lo hace el humano):** dejar en
|
||||
`deploy/cluster/` el material parametrizado por nodo:
|
||||
- `generate-cluster-certs.sh` — CA propia del cluster (separada de la de clientes, ver 0006f)
|
||||
+ un server cert por nodo con SAN = su IP pública + su IP WG + hostname.
|
||||
- una plantilla de systemd unit por nodo (`membershipd@.service` o tres units) con
|
||||
`--bind 0.0.0.0 --bus-auth enforce --tls-cert … --cluster-name unibus --routes
|
||||
nats://…@<otros-2-nodos> --store kv` y `Restart=always`, secreto de cluster por archivo/env.
|
||||
- `deploy-cluster.sh` (cross-build linux + rsync por nodo + plan de arranque escalonado).
|
||||
- un `README.md` con el runbook: orden de arranque, seed del admin, `migrate-to-kv` (loopback/TLS),
|
||||
escalado de réplicas a R3 (`nats stream update --replicas 3`), verificación de quorum y chaos
|
||||
test (matar un nodo). Marcar claramente qué pasos toca el humano.
|
||||
|
||||
**DoD:** el material existe y es coherente (los certs cubren los 3 nodos; las units referencian
|
||||
los routes correctos); un `bash -n` de los scripts pasa; el README describe el deploy end-to-end.
|
||||
NO se toca ningún VPS desde el agente.
|
||||
|
||||
# Fuera de alcance (otros issues / operacional)
|
||||
|
||||
- **H8** (CA generada/custodiada en om) → operacional del deploy 0003f.
|
||||
- **H9/H10/H11** → issue 0002 / futuro.
|
||||
- **Object Store (blobs) vía `$JS.API.>`**: el report 0008 lo marca como "probable misma
|
||||
clase que N2, no verificado" (impacto menor: blobs son ciphertext E2E). El fix de 0006b
|
||||
(denegar `OBJ_*`) lo cubre; verificar.
|
||||
- **Chaos test de red real** (matar 1/3, 2/3, split-brain) → 0003f (requiere 3 VPS).
|
||||
|
||||
# Definition of Done global
|
||||
|
||||
- `TestAttack0008_N3` → replay cross-node **401**; `TestAttack0008_N2` → eve no lee buckets KV;
|
||||
`TestAttack0008_N1` → nodo débil rechazado/señalado. Portados como regresión.
|
||||
- Con `decentralized: on`: control plane sobre KV replicado, fail-closed verificado, estado
|
||||
consistente entre nodos. Con flag off: baseline idéntico.
|
||||
- Clientes operan bajo `enforce`+ACL sin intervención manual (RefreshSession resuelto).
|
||||
- `CGO_ENABLED=0 go build ./... && go vet ./... && go test ./...` verdes + `govulncheck` 0.
|
||||
- Veredicto re-evaluado: el bus DESCENTRALIZADO pasa de "NO" a "sí-con-condiciones" (3 nodos
|
||||
R3 para HA real, posture homogénea, CA en om).
|
||||
@@ -1,25 +1,28 @@
|
||||
module github.com/enmanuel/unibus
|
||||
|
||||
go 1.25.0
|
||||
go 1.26.4
|
||||
|
||||
replace fn-registry => ../../../../
|
||||
|
||||
require (
|
||||
fn-registry v0.0.0-00010101000000-000000000000
|
||||
github.com/nats-io/nats-server/v2 v2.10.22
|
||||
github.com/nats-io/nats.go v1.37.0
|
||||
github.com/nats-io/nats-server/v2 v2.11.15
|
||||
github.com/nats-io/nats.go v1.49.0
|
||||
github.com/nats-io/nkeys v0.4.15
|
||||
github.com/oklog/ulid/v2 v2.1.0
|
||||
golang.org/x/time v0.15.0
|
||||
modernc.org/sqlite v1.47.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/google/go-tpm v0.9.8 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/klauspost/compress v1.18.3 // indirect
|
||||
github.com/klauspost/compress v1.18.4 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/minio/highwayhash v1.0.3 // indirect
|
||||
github.com/nats-io/jwt/v2 v2.5.8 // indirect
|
||||
github.com/nats-io/nkeys v0.4.7 // indirect
|
||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 // indirect
|
||||
github.com/nats-io/jwt/v2 v2.8.1 // indirect
|
||||
github.com/nats-io/nuid v1.0.1 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
@@ -28,8 +31,6 @@ require (
|
||||
golang.org/x/mod v0.36.0 // indirect
|
||||
golang.org/x/sync v0.20.0 // indirect
|
||||
golang.org/x/sys v0.44.0 // indirect
|
||||
golang.org/x/text v0.37.0 // indirect
|
||||
golang.org/x/time v0.7.0 // indirect
|
||||
golang.org/x/tools v0.45.0 // indirect
|
||||
modernc.org/libc v1.70.0 // indirect
|
||||
modernc.org/mathutil v1.7.1 // indirect
|
||||
|
||||
@@ -1,25 +1,31 @@
|
||||
github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op h1:kpBdlEPbRvff0mDD1gk7o9BhI16b9p5yYAXRlidpqJE=
|
||||
github.com/antithesishq/antithesis-sdk-go v0.6.0-default-no-op/go.mod h1:IUpT2DPAKh6i/YhSbt6Gl3v2yvUZjmKncl7U91fup7E=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-tpm v0.9.8 h1:slArAR9Ft+1ybZu0lBwpSmpwhRXaa85hWtMinMyRAWo=
|
||||
github.com/google/go-tpm v0.9.8/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
|
||||
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/klauspost/compress v1.18.4 h1:RPhnKRAQ4Fh8zU2FY/6ZFDwTVTxgJ/EMydqSTzE9a2c=
|
||||
github.com/klauspost/compress v1.18.4/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/minio/highwayhash v1.0.3 h1:kbnuUMoHYyVl7szWjSxJnxw11k2U709jqFPPmIUyD6Q=
|
||||
github.com/minio/highwayhash v1.0.3/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
|
||||
github.com/nats-io/jwt/v2 v2.5.8 h1:uvdSzwWiEGWGXf+0Q+70qv6AQdvcvxrv9hPM0RiPamE=
|
||||
github.com/nats-io/jwt/v2 v2.5.8/go.mod h1:ZdWS1nZa6WMZfFwwgpEaqBV8EPGVgOTDHN/wTbz0Y5A=
|
||||
github.com/nats-io/nats-server/v2 v2.10.22 h1:Yt63BGu2c3DdMoBZNcR6pjGQwk/asrKU7VX846ibxDA=
|
||||
github.com/nats-io/nats-server/v2 v2.10.22/go.mod h1:X/m1ye9NYansUXYFrbcDwUi/blHkrgHh2rgCJaakonk=
|
||||
github.com/nats-io/nats.go v1.37.0 h1:07rauXbVnnJvv1gfIyghFEo6lUcYRY0WXc3x7x0vUxE=
|
||||
github.com/nats-io/nats.go v1.37.0/go.mod h1:Ubdu4Nh9exXdSz0RVWRFBbRfrbSxOYd26oF0wkWclB8=
|
||||
github.com/nats-io/nkeys v0.4.7 h1:RwNJbbIdYCoClSDNY7QVKZlyb/wfT6ugvFCiKy6vDvI=
|
||||
github.com/nats-io/nkeys v0.4.7/go.mod h1:kqXRgRDPlGy7nGaEDMuYzmiJCIAAWDK0IMBtDmGD0nc=
|
||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76 h1:KGuD/pM2JpL9FAYvBrnBBeENKZNh6eNtjqytV6TYjnk=
|
||||
github.com/minio/highwayhash v1.0.4-0.20251030100505-070ab1a87a76/go.mod h1:GGYsuwP/fPD6Y9hMiXuapVvlIUEhFhMTh0rxU3ik1LQ=
|
||||
github.com/nats-io/jwt/v2 v2.8.1 h1:V0xpGuD/N8Mi+fQNDynXohVvp7ZztevW5io8CUWlPmU=
|
||||
github.com/nats-io/jwt/v2 v2.8.1/go.mod h1:nWnOEEiVMiKHQpnAy4eXlizVEtSfzacZ1Q43LIRavZg=
|
||||
github.com/nats-io/nats-server/v2 v2.11.15 h1:StSf9TINInaZtr4oww2+kXmfwa9SkN//g/LwS19/UJ0=
|
||||
github.com/nats-io/nats-server/v2 v2.11.15/go.mod h1:zwhv8Y0PE3KHyKgznJc/9Xoai638SaJd83zzJ5GJn74=
|
||||
github.com/nats-io/nats.go v1.49.0 h1:yh/WvY59gXqYpgl33ZI+XoVPKyut/IcEaqtsiuTJpoE=
|
||||
github.com/nats-io/nats.go v1.49.0/go.mod h1:fDCn3mN5cY8HooHwE2ukiLb4p4G4ImmzvXyJt+tGwdw=
|
||||
github.com/nats-io/nkeys v0.4.15 h1:JACV5jRVO9V856KOapQ7x+EY8Jo3qw1vJt/9Jpwzkk4=
|
||||
github.com/nats-io/nkeys v0.4.15/go.mod h1:CpMchTXC9fxA5zrMo4KpySxNjiDVvr8ANOSZdiNfUrs=
|
||||
github.com/nats-io/nuid v1.0.1 h1:5iA8DT8V7q8WK2EScv2padNa/rTESc1KdnPw4TC2paw=
|
||||
github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
@@ -41,12 +47,14 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ=
|
||||
golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc=
|
||||
golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38=
|
||||
golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ=
|
||||
golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
|
||||
golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
|
||||
golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
|
||||
golang.org/x/tools v0.45.0 h1:18qN3FAooORvApf5XjCXgsuayZOEtXf6JK18I3+ONa8=
|
||||
golang.org/x/tools v0.45.0/go.mod h1:LuUGqqaXcXMEFEruIVJVm5mgDD8vww/z/SR1gQ4uE/0=
|
||||
golang.org/x/tools/go/expect v0.1.1-deprecated h1:jpBZDwmgPhXsKZC6WhL20P4b/wmnpsEAGHaNy0n/rJM=
|
||||
golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY=
|
||||
golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated h1:1h2MnaIAIXISqTFKdENegdpAgUXz6NrPEsbIeWaBRvM=
|
||||
golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated/go.mod h1:RVAQXBGNv1ib0J382/DPCRS/BPnsGebyM1Gj5VSDpG8=
|
||||
modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
|
||||
modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
|
||||
modernc.org/ccgo/v4 v4.32.0 h1:hjG66bI/kqIPX1b2yT6fr/jt+QedtP2fqojG2VrFuVw=
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
-- 002_users.sql — bus-level user directory (issue 0001a).
|
||||
--
|
||||
-- The authoritative allowlist of identities permitted to use the bus, independent
|
||||
-- of room membership. A user is identified by its Ed25519 signing public key (the
|
||||
-- same key that derives the endpoint via frame.EndpointID); roles gate admin-only
|
||||
-- control-plane operations; status enables revocation without deleting history.
|
||||
--
|
||||
-- Additive and idempotent: safe to apply repeatedly. Never modify this file;
|
||||
-- further schema changes go in new numbered migrations (see
|
||||
-- .claude/rules/db_migrations.md). The embedded copy under
|
||||
-- pkg/membership/migrations/002_users.sql mirrors this file byte-for-byte.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS users (
|
||||
sign_pub TEXT PRIMARY KEY, -- Ed25519 public key in lowercase hex (peer identity)
|
||||
handle TEXT NOT NULL, -- human-readable name (unique recommended, not enforced as PK)
|
||||
role TEXT NOT NULL DEFAULT 'member', -- 'admin' | 'member'
|
||||
status TEXT NOT NULL DEFAULT 'active', -- 'active' | 'revoked'
|
||||
created_at TEXT NOT NULL,
|
||||
revoked_at TEXT
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_users_status ON users(status);
|
||||
@@ -1,108 +0,0 @@
|
||||
// Package mobile exposes a flat, gomobile-friendly API over the unibus client
|
||||
// so an Android app can join rooms, publish, and receive messages with the same
|
||||
// end-to-end encryption as any native Go peer.
|
||||
//
|
||||
// gomobile only supports a limited set of types across the binding boundary
|
||||
// (string, []byte, int, bool, error, named structs, and interfaces). This layer
|
||||
// translates the richer client API into those primitives and delivers incoming
|
||||
// frames through a Java/Kotlin-implemented FrameListener callback. No protocol
|
||||
// or cryptography is reimplemented here: every call delegates to pkg/client,
|
||||
// which is the single source of truth shared with every other peer on the bus.
|
||||
package mobile
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
)
|
||||
|
||||
// FrameListener is the callback through which decrypted room messages reach the
// host application; the Android side supplies the implementation. OnFrame is
// called from a NATS delivery goroutine, so an implementation has to switch to
// the UI thread (for example with a coroutine on the main dispatcher) before it
// touches any Android view.
type FrameListener interface {
	// OnFrame delivers one message: the room it was subscribed on, the frame's
	// sender, the frame's message id, and the plaintext converted to a string.
	OnFrame(roomID, sender, msgID, text string)
}
|
||||
|
||||
// Session is a connected unibus peer. Create it with NewSession and close it
|
||||
// with Close when the app stops.
|
||||
type Session struct {
|
||||
c *client.Client
|
||||
}
|
||||
|
||||
// GenerateIdentity creates (or loads) the long-term keypair stored at path.
|
||||
// Call it once on first launch. The resulting file holds the peer's private
|
||||
// Ed25519 and X25519 keys and must be kept private to the app sandbox.
|
||||
func GenerateIdentity(path string) error {
|
||||
_, err := client.LoadOrCreateIdentity(path)
|
||||
return err
|
||||
}
|
||||
|
||||
// NewSession loads the identity at idPath and connects to the bus. natsURL is
|
||||
// the data plane (for example nats://host:4250) and ctrlURL is the control
|
||||
// plane HTTP endpoint (for example http://host:8470).
|
||||
func NewSession(idPath, natsURL, ctrlURL string) (*Session, error) {
|
||||
id, err := client.LoadOrCreateIdentity(idPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
c, err := client.New(natsURL, ctrlURL, id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &Session{c: c}, nil
|
||||
}
|
||||
|
||||
// EndpointID returns this peer's stable endpoint identifier, derived from its
|
||||
// signing public key. It is the value that appears as the sender of frames.
|
||||
func (s *Session) EndpointID() string {
|
||||
return s.c.Endpoint().ID
|
||||
}
|
||||
|
||||
// CreateRoom opens a room on the given subject. mode is "matrix" for the
|
||||
// encrypted, persisted and signed policy, or "nats" for plain cleartext. It
|
||||
// returns the room id used by Join, Publish and Subscribe.
|
||||
func (s *Session) CreateRoom(subject, mode string) (string, error) {
|
||||
p := room.ModeNATS
|
||||
if mode == "matrix" {
|
||||
p = room.ModeMatrix
|
||||
}
|
||||
return s.c.CreateRoom(subject, p)
|
||||
}
|
||||
|
||||
// Join fetches the room key when the room is encrypted and prepares the session
|
||||
// to publish to and receive from the room.
|
||||
func (s *Session) Join(roomID string) error {
|
||||
return s.c.Join(roomID)
|
||||
}
|
||||
|
||||
// Publish sends a UTF-8 text message to the room.
|
||||
func (s *Session) Publish(roomID, text string) error {
|
||||
return s.c.Publish(roomID, []byte(text))
|
||||
}
|
||||
|
||||
// Subscribe streams decrypted messages of the room to the listener until the
|
||||
// session is closed.
|
||||
func (s *Session) Subscribe(roomID string, l FrameListener) error {
|
||||
_, err := s.c.Subscribe(roomID, func(f frame.Frame, plaintext []byte) {
|
||||
l.OnFrame(roomID, f.Sender, f.MsgID, string(plaintext))
|
||||
})
|
||||
return err
|
||||
}
|
||||
|
||||
// Request performs an RPC request/reply against subject and returns the reply
|
||||
// payload as text. timeoutMs bounds the wait in milliseconds.
|
||||
func (s *Session) Request(subject, text string, timeoutMs int) (string, error) {
|
||||
out, err := s.c.Request(subject, []byte(text), time.Duration(timeoutMs)*time.Millisecond)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// Close disconnects the peer from the bus.
|
||||
func (s *Session) Close() error {
|
||||
return s.c.Close()
|
||||
}
|
||||
+27
-10
@@ -1,9 +1,15 @@
|
||||
// Package blobstore is a content-addressed object store on local disk.
|
||||
// Package blobstore is a content-addressed object store for media ciphertext.
|
||||
//
|
||||
// The bus transports messages, not blobs. Media (images, files, large payloads)
|
||||
// is encrypted by the client BEFORE being stored here, so the store only ever
|
||||
// sees ciphertext. Objects are addressed by the sha256 hex of their (encrypted)
|
||||
// bytes, which makes Put idempotent and deduplicating.
|
||||
//
|
||||
// Store is an interface (branch-by-abstraction, issue 0003d) with two backends:
|
||||
// diskStore (the default, local filesystem) and objectStore (NATS Object Store
|
||||
// on JetStream, replicated across the cluster so blobs survive a node loss and
|
||||
// are reachable from any node). The wire contract (sha256-hex addressing) is
|
||||
// identical, so a client cannot tell which backend a membershipd uses.
|
||||
package blobstore
|
||||
|
||||
import (
|
||||
@@ -14,27 +20,38 @@ import (
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// Store is a directory-backed content-addressed blob store.
|
||||
type Store struct {
|
||||
// Store is a content-addressed blob store: Put returns the sha256-hex address of
|
||||
// the stored bytes, Get fetches by that address, Has reports presence.
|
||||
type Store interface {
|
||||
Put(data []byte) (string, error)
|
||||
Get(hash string) ([]byte, error)
|
||||
Has(hash string) bool
|
||||
}
|
||||
|
||||
// diskStore is a directory-backed content-addressed blob store (the default,
|
||||
// single-node backend).
|
||||
type diskStore struct {
|
||||
dir string
|
||||
}
|
||||
|
||||
// New creates a Store rooted at dir, creating the directory if needed.
|
||||
func New(dir string) (*Store, error) {
|
||||
// New creates a disk-backed Store rooted at dir, creating the directory if
|
||||
// needed. It remains the default backend; the replicated NATS Object Store is
|
||||
// constructed separately (NewObjectStore) when decentralization is enabled.
|
||||
func New(dir string) (Store, error) {
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("blobstore: mkdir %q: %w", dir, err)
|
||||
}
|
||||
return &Store{dir: dir}, nil
|
||||
return &diskStore{dir: dir}, nil
|
||||
}
|
||||
|
||||
// path returns the on-disk path for a given content hash.
|
||||
func (s *Store) path(hash string) string {
|
||||
func (s *diskStore) path(hash string) string {
|
||||
return filepath.Join(s.dir, hash)
|
||||
}
|
||||
|
||||
// Put writes data to the store and returns its sha256 hex hash. If an object
|
||||
// with the same content already exists, Put is a no-op and returns the hash.
|
||||
func (s *Store) Put(data []byte) (string, error) {
|
||||
func (s *diskStore) Put(data []byte) (string, error) {
|
||||
sum := sha256.Sum256(data)
|
||||
hash := hex.EncodeToString(sum[:])
|
||||
p := s.path(hash)
|
||||
@@ -66,7 +83,7 @@ func (s *Store) Put(data []byte) (string, error) {
|
||||
}
|
||||
|
||||
// Get reads the object with the given hash.
|
||||
func (s *Store) Get(hash string) ([]byte, error) {
|
||||
func (s *diskStore) Get(hash string) ([]byte, error) {
|
||||
data, err := os.ReadFile(s.path(hash))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("blobstore: get %q: %w", hash, err)
|
||||
@@ -75,7 +92,7 @@ func (s *Store) Get(hash string) ([]byte, error) {
|
||||
}
|
||||
|
||||
// Has reports whether an object with the given hash exists.
|
||||
func (s *Store) Has(hash string) bool {
|
||||
func (s *diskStore) Has(hash string) bool {
|
||||
_, err := os.Stat(s.path(hash))
|
||||
return err == nil
|
||||
}
|
||||
|
||||
@@ -0,0 +1,102 @@
|
||||
package blobstore
|
||||
|
||||
// objectStore is the NATS Object Store implementation of Store (issue 0003d):
|
||||
// media ciphertext lives in a JetStream Object Store bucket replicated across
|
||||
// the cluster, so a blob uploaded to one node is durable against the loss of a
|
||||
// node and readable from any node. It is selected when decentralization is on;
|
||||
// diskStore stays the single-node default. The content-addressing (sha256-hex)
|
||||
// is identical to the disk backend, so the wire contract does not change.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// defaultObjectBucket is the bucket name NewObjectStore uses when
// ObjectStoreConfig.Bucket is empty.
const defaultObjectBucket = "UNIBUS_blobs"

// defaultObjOpTime bounds a single object operation when
// ObjectStoreConfig.OpTimeout is zero or negative.
const defaultObjOpTime = 10 * time.Second
|
||||
|
||||
// ObjectStoreConfig carries the settings for the replicated Object Store
// backend built by NewObjectStore.
type ObjectStoreConfig struct {
	// Bucket names the object store bucket. Empty selects UNIBUS_blobs.
	Bucket string

	// Replicas is the replication factor (R1..R5), in line with the KV store's
	// R1->R3 rollout. Zero or negative is treated as 1.
	Replicas int

	// OpTimeout caps each individual object operation. Zero or negative
	// selects defaultObjOpTime.
	OpTimeout time.Duration
}
|
||||
|
||||
type objectStore struct {
|
||||
os jetstream.ObjectStore
|
||||
opTimeout time.Duration
|
||||
}
|
||||
|
||||
// NewObjectStore creates (or opens) the replicated Object Store bucket on js and
|
||||
// returns it as a Store. The JetStream context belongs to the caller.
|
||||
func NewObjectStore(js jetstream.JetStream, cfg ObjectStoreConfig) (Store, error) {
|
||||
if cfg.Bucket == "" {
|
||||
cfg.Bucket = defaultObjectBucket
|
||||
}
|
||||
if cfg.Replicas <= 0 {
|
||||
cfg.Replicas = 1
|
||||
}
|
||||
opTimeout := cfg.OpTimeout
|
||||
if opTimeout <= 0 {
|
||||
opTimeout = defaultObjOpTime
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
obj, err := js.CreateOrUpdateObjectStore(ctx, jetstream.ObjectStoreConfig{
|
||||
Bucket: cfg.Bucket,
|
||||
Replicas: cfg.Replicas,
|
||||
Storage: jetstream.FileStorage,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("blobstore: open object store %q (replicas=%d): %w", cfg.Bucket, cfg.Replicas, err)
|
||||
}
|
||||
return &objectStore{os: obj, opTimeout: opTimeout}, nil
|
||||
}
|
||||
|
||||
func (s *objectStore) ctx() (context.Context, context.CancelFunc) {
|
||||
return context.WithTimeout(context.Background(), s.opTimeout)
|
||||
}
|
||||
|
||||
// Put stores data under its sha256-hex address. Re-putting identical bytes is a
|
||||
// harmless overwrite (same address, same content), preserving the idempotent,
|
||||
// deduplicating semantics of the disk backend.
|
||||
func (s *objectStore) Put(data []byte) (string, error) {
|
||||
sum := sha256.Sum256(data)
|
||||
hash := hex.EncodeToString(sum[:])
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if _, err := s.os.PutBytes(ctx, hash, data); err != nil {
|
||||
return "", fmt.Errorf("blobstore: put object %q: %w", hash, err)
|
||||
}
|
||||
return hash, nil
|
||||
}
|
||||
|
||||
// Get fetches the object by its hash address.
|
||||
func (s *objectStore) Get(hash string) ([]byte, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
data, err := s.os.GetBytes(ctx, hash)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("blobstore: get object %q: %w", hash, err)
|
||||
}
|
||||
return data, nil
|
||||
}
|
||||
|
||||
// Has reports whether an object with the given hash exists.
|
||||
func (s *objectStore) Has(hash string) bool {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
_, err := s.os.GetInfo(ctx, hash)
|
||||
return err == nil
|
||||
}
|
||||
@@ -0,0 +1,132 @@
|
||||
package blobstore_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// objFreePort asks the kernel for an unused TCP port on 127.0.0.1 by listening
// on port 0, reads the assigned port, and closes the listener before
// returning. The port is only probably free afterwards, since another process
// may claim it in the meantime. The test fails fatally if the listen fails.
func objFreePort(t *testing.T) int {
	t.Helper()

	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	defer ln.Close()

	addr := ln.Addr().(*net.TCPAddr)
	return addr.Port
}
|
||||
|
||||
// newObjectStore boots a single-node embedded NATS with JetStream and returns a
|
||||
// replicated (R1) Object Store backend over it.
|
||||
func newObjectStore(t *testing.T) blobstore.Store {
|
||||
t.Helper()
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: objFreePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("embedded nats: %v", err)
|
||||
}
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
t.Fatalf("nats connect: %v", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
st, err := blobstore.NewObjectStore(js, blobstore.ObjectStoreConfig{Replicas: 1, OpTimeout: 5 * time.Second})
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("new object store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { nc.Close(); ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return st
|
||||
}
|
||||
|
||||
// TestObjectStoreRoundTrip is the golden path: put ciphertext, get it back by
|
||||
// its hash, Has reports presence, and re-putting identical bytes returns the
|
||||
// same address (content-addressed dedup).
|
||||
func TestObjectStoreRoundTrip(t *testing.T) {
|
||||
s := newObjectStore(t)
|
||||
data := []byte("encrypted-media-ciphertext-payload")
|
||||
|
||||
hash, err := s.Put(data)
|
||||
if err != nil {
|
||||
t.Fatalf("Put: %v", err)
|
||||
}
|
||||
want := hex.EncodeToString(sha256Sum(data))
|
||||
if hash != want {
|
||||
t.Fatalf("hash = %q, want sha256 hex %q", hash, want)
|
||||
}
|
||||
got, err := s.Get(hash)
|
||||
if err != nil {
|
||||
t.Fatalf("Get: %v", err)
|
||||
}
|
||||
if !bytes.Equal(got, data) {
|
||||
t.Fatalf("Get returned %q, want %q", got, data)
|
||||
}
|
||||
if !s.Has(hash) {
|
||||
t.Fatalf("Has should be true for a stored blob")
|
||||
}
|
||||
// Re-put identical bytes: same address, no error.
|
||||
hash2, err := s.Put(data)
|
||||
if err != nil || hash2 != hash {
|
||||
t.Fatalf("re-Put: hash2=%q err=%v (want %q)", hash2, err, hash)
|
||||
}
|
||||
}
|
||||
|
||||
// TestObjectStoreMissing is the edge/error path: a hash that was never stored
|
||||
// is absent and unreadable.
|
||||
func TestObjectStoreMissing(t *testing.T) {
|
||||
s := newObjectStore(t)
|
||||
missing := hex.EncodeToString(sha256Sum([]byte("never stored")))
|
||||
if s.Has(missing) {
|
||||
t.Fatalf("Has should be false for an unknown hash")
|
||||
}
|
||||
if _, err := s.Get(missing); err == nil {
|
||||
t.Fatalf("Get of an unknown hash should error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestObjectStoreAddressMatchesDisk is the contract test: the Object Store and
|
||||
// the disk backend address identical bytes to the IDENTICAL hash, so a client
|
||||
// cannot tell which backend a node uses and a blob ref is portable across them.
|
||||
func TestObjectStoreAddressMatchesDisk(t *testing.T) {
|
||||
obj := newObjectStore(t)
|
||||
disk, err := blobstore.New(t.TempDir())
|
||||
if err != nil {
|
||||
t.Fatalf("disk store: %v", err)
|
||||
}
|
||||
for _, payload := range [][]byte{[]byte("a"), []byte("longer ciphertext blob \x00\x01\x02"), {}} {
|
||||
oh, err := obj.Put(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("object Put: %v", err)
|
||||
}
|
||||
dh, err := disk.Put(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("disk Put: %v", err)
|
||||
}
|
||||
if oh != dh {
|
||||
t.Fatalf("address mismatch for %q: object=%q disk=%q", payload, oh, dh)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sha256Sum returns the 32-byte SHA-256 digest of b as a slice, which is
// convenient for passing straight to hex.EncodeToString.
func sha256Sum(b []byte) []byte {
	hasher := sha256.New()
	hasher.Write(b) // hash.Hash.Write never returns an error
	return hasher.Sum(nil)
}
|
||||
@@ -0,0 +1,154 @@
|
||||
package busauth
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
"github.com/nats-io/nkeys"
|
||||
)
|
||||
|
||||
// nkeyAuthenticator implements NATS server.Authentication. A client is
// admitted when its nkey signature over the server-presented nonce verifies
// and the bus user allowlist accepts the key. The allowlist is consulted
// through an injected predicate on every new connection rather than through a
// static Options.Nkeys map, so a revoked user is refused on its next
// connection without a server restart.
type nkeyAuthenticator struct {
	// isAuthorized answers whether the lowercase-hex Ed25519 public key behind
	// an nkey belongs to an active bus user. It is injected
	// (membership.Store.IsAuthorized) to keep this package independent of the
	// store.
	isAuthorized func(signPubHex string) bool
}
|
||||
|
||||
// NewNkeyAuthenticator builds a NATS custom authenticator backed by isAuthorized.
|
||||
// Pass it to embeddednats so the data plane only accepts registered identities.
|
||||
func NewNkeyAuthenticator(isAuthorized func(signPubHex string) bool) server.Authentication {
|
||||
return &nkeyAuthenticator{isAuthorized: isAuthorized}
|
||||
}
|
||||
|
||||
// Check verifies the client's nkey signature against the nonce the server
|
||||
// presented, then maps the nkey to its allowlist key and checks authorization.
|
||||
// Any malformed input or failed verification yields false (fail closed).
|
||||
func (a *nkeyAuthenticator) Check(c server.ClientAuthentication) bool {
|
||||
signPubHex, ok := verifyNkey(c)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
return a.isAuthorized(signPubHex)
|
||||
}
|
||||
|
||||
// verifyNkey performs the shared nkey verification: it checks the client's
|
||||
// signature against the server-presented nonce and returns the lowercase-hex
|
||||
// Ed25519 public key behind the nkey. ok is false on any malformed input or
|
||||
// failed verification (fail closed). The signature decoding mirrors
|
||||
// nats-server's own (raw-url base64, then std base64 fallback) so genuine
|
||||
// clients using nats.Nkey are accepted unchanged.
|
||||
func verifyNkey(c server.ClientAuthentication) (signPubHex string, ok bool) {
|
||||
opts := c.GetOpts()
|
||||
if opts.Nkey == "" {
|
||||
return "", false
|
||||
}
|
||||
sig, err := base64.RawURLEncoding.DecodeString(opts.Sig)
|
||||
if err != nil {
|
||||
sig, err = base64.StdEncoding.DecodeString(opts.Sig)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
}
|
||||
pub, err := nkeys.FromPublicKey(opts.Nkey)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
if err := pub.Verify(c.GetNonce(), sig); err != nil {
|
||||
return "", false
|
||||
}
|
||||
signPubHex, err = SignPubHexFromNkey(opts.Nkey)
|
||||
if err != nil {
|
||||
return "", false
|
||||
}
|
||||
return signPubHex, true
|
||||
}
|
||||
|
||||
// PermissionsFunc maps a connecting identity (lowercase-hex Ed25519 signing key)
|
||||
// to the NATS permissions it should be granted for this connection. Returning an
|
||||
// error denies the connection (fail closed). It is how the data plane enforces
|
||||
// per-subject access from room membership (issue 0003e, audit H4 residual).
|
||||
type PermissionsFunc func(signPubHex string) (*server.Permissions, error)
|
||||
|
||||
// nkeyAuthenticatorACL is the nkey authenticator that ALSO scopes the connection
|
||||
// to per-subject permissions derived from room membership. NATS evaluates
|
||||
// permissions once, at connect time, so a peer that joins a room after
|
||||
// connecting must reconnect (client.RefreshSession) to gain that room's subject
|
||||
// — the dynamic-membership reconnection model the audit deferred to this issue.
|
||||
type nkeyAuthenticatorACL struct {
|
||||
isAuthorized func(signPubHex string) bool
|
||||
perms PermissionsFunc
|
||||
// internalPubHex is the lowercase-hex Ed25519 public key of membershipd's own
|
||||
// ephemeral internal service identity. A connection that proves that key is
|
||||
// granted full permissions WITHOUT consulting the allowlist, so the service can
|
||||
// bootstrap and manage JetStream (the replicated nonce bucket and, when
|
||||
// decentralized, the control-plane KV buckets) against its own embedded server
|
||||
// even while the data plane confines every client to its rooms. Empty disables
|
||||
// the internal-identity path entirely (behavior identical to a plain ACL
|
||||
// authenticator).
|
||||
internalPubHex string
|
||||
}
|
||||
|
||||
// NewNkeyAuthenticatorACL builds an authenticator that authorizes by the bus
|
||||
// allowlist AND registers per-subject permissions from perms. A registered but
|
||||
// permission-less peer can no longer subscribe to or publish on arbitrary
|
||||
// subjects: it is confined to the subjects of the rooms it belongs to (plus the
|
||||
// client infrastructure subjects perms includes). This is the per-subject ACL
|
||||
// the 0004 hardening left as a residual.
|
||||
func NewNkeyAuthenticatorACL(isAuthorized func(signPubHex string) bool, perms PermissionsFunc) server.Authentication {
|
||||
return &nkeyAuthenticatorACL{isAuthorized: isAuthorized, perms: perms}
|
||||
}
|
||||
|
||||
// NewNkeyAuthenticatorACLInternal is NewNkeyAuthenticatorACL that also recognizes
|
||||
// membershipd's internal service identity (internalPubHex, the lowercase hex of
|
||||
// its ephemeral Ed25519 public key): a connection proving that key is granted
|
||||
// full permissions without an allowlist lookup, so the service can create and
|
||||
// manage JetStream against its own embedded server under enforce (issue 0006a/c —
|
||||
// the replicated nonce bucket and the control-plane KV). Every other identity
|
||||
// goes through the allowlist + per-subject ACL unchanged. An empty internalPubHex
|
||||
// is identical to NewNkeyAuthenticatorACL, so this is a superset and safe to use
|
||||
// everywhere the plain constructor was used.
|
||||
func NewNkeyAuthenticatorACLInternal(isAuthorized func(signPubHex string) bool, perms PermissionsFunc, internalPubHex string) server.Authentication {
|
||||
return &nkeyAuthenticatorACL{isAuthorized: isAuthorized, perms: perms, internalPubHex: internalPubHex}
|
||||
}
|
||||
|
||||
// fullPermissions grants publish and subscribe on every subject (">"). It is the
|
||||
// permission set for membershipd's own internal service connection, which must
|
||||
// manage the JetStream control plane (nonce bucket + KV buckets) over NATS. It is
|
||||
// NEVER granted to a bus user — only to the process's own ephemeral internal
|
||||
// identity, recognized by exact public-key match in Check.
|
||||
func fullPermissions() *server.Permissions {
|
||||
sp := &server.SubjectPermission{Allow: []string{">"}}
|
||||
return &server.Permissions{Publish: sp, Subscribe: sp}
|
||||
}
|
||||
|
||||
// Check verifies the nkey, authorizes against the allowlist, then derives and
|
||||
// registers the connection's subject permissions. A permissions-derivation
|
||||
// error denies the connection (fail closed) rather than granting open access.
|
||||
func (a *nkeyAuthenticatorACL) Check(c server.ClientAuthentication) bool {
|
||||
signPubHex, ok := verifyNkey(c)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
// membershipd's own internal service identity bypasses the allowlist and is
|
||||
// granted full permissions so the service can bootstrap JetStream under
|
||||
// enforce. The key is matched exactly against the cryptographically verified
|
||||
// connecting key, so no other identity can claim these permissions.
|
||||
if a.internalPubHex != "" && signPubHex == a.internalPubHex {
|
||||
c.RegisterUser(&server.User{Permissions: fullPermissions()})
|
||||
return true
|
||||
}
|
||||
if !a.isAuthorized(signPubHex) {
|
||||
return false
|
||||
}
|
||||
perms, err := a.perms(signPubHex)
|
||||
if err != nil {
|
||||
return false // fail closed: never grant open access on a derivation error
|
||||
}
|
||||
c.RegisterUser(&server.User{Permissions: perms})
|
||||
return true
|
||||
}
|
||||
@@ -0,0 +1,76 @@
|
||||
// Package busauth bridges a unibus peer's Ed25519 identity to NATS nkey
|
||||
// authentication. A NATS nkey IS an Ed25519 keypair, so the bus reuses the
|
||||
// peer's existing signing identity for the data plane instead of minting new
|
||||
// key material — one identity authenticates both planes (HTTP request signatures
|
||||
// and NATS connections), keyed in the user allowlist by the same Ed25519 public
|
||||
// key.
|
||||
//
|
||||
// This is transport glue specific to NATS + unibus, not a general-purpose
|
||||
// registry primitive: it deliberately lives in the app to avoid pulling
|
||||
// github.com/nats-io/nkeys into the multi-domain registry module. The Ed25519
|
||||
// signing/verification it relies on comes from the registry cybersecurity
|
||||
// package; this package never reimplements a primitive.
|
||||
package busauth
|
||||
|
||||
import (
|
||||
"crypto/ed25519"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
|
||||
"github.com/nats-io/nkeys"
|
||||
)
|
||||
|
||||
// ClientNkey derives, from a peer's Ed25519 private key, the NATS user nkey
|
||||
// public string ("U...") and a signature callback suitable for
|
||||
// nats.Nkey(pub, sign). The callback signs the server-presented nonce with the
|
||||
// same Ed25519 key, so the server can verify it and map it back to the bus user.
|
||||
//
|
||||
// signPriv must be a 64-byte Ed25519 private key (as produced by the registry's
|
||||
// GenerateIdentity). Its first 32 bytes are the seed nkeys needs.
|
||||
func ClientNkey(signPriv []byte) (pub string, sign func([]byte) ([]byte, error), err error) {
|
||||
if len(signPriv) != ed25519.PrivateKeySize {
|
||||
return "", nil, fmt.Errorf("busauth: signPriv must be %d bytes, got %d", ed25519.PrivateKeySize, len(signPriv))
|
||||
}
|
||||
seed := ed25519.PrivateKey(signPriv).Seed() // 32-byte Ed25519 seed
|
||||
kp, err := nkeys.FromRawSeed(nkeys.PrefixByteUser, seed)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("busauth: derive nkey from seed: %w", err)
|
||||
}
|
||||
pub, err = kp.PublicKey()
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("busauth: nkey public key: %w", err)
|
||||
}
|
||||
sign = func(nonce []byte) ([]byte, error) {
|
||||
return kp.Sign(nonce)
|
||||
}
|
||||
return pub, sign, nil
|
||||
}
|
||||
|
||||
// NkeyPublicFromSignPub derives the NATS user nkey public string from a 32-byte
|
||||
// Ed25519 public key. It is the inverse view of the identity used by callers
|
||||
// that have only the public key (e.g. to display or pre-register an nkey).
|
||||
func NkeyPublicFromSignPub(signPub []byte) (string, error) {
|
||||
if len(signPub) != ed25519.PublicKeySize {
|
||||
return "", fmt.Errorf("busauth: signPub must be %d bytes, got %d", ed25519.PublicKeySize, len(signPub))
|
||||
}
|
||||
pub, err := nkeys.Encode(nkeys.PrefixByteUser, signPub)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("busauth: encode nkey public: %w", err)
|
||||
}
|
||||
return string(pub), nil
|
||||
}
|
||||
|
||||
// SignPubHexFromNkey decodes a NATS user nkey public string ("U...") back to the
|
||||
// lowercase hex of its 32-byte Ed25519 public key — the identity key used to
|
||||
// look a peer up in the bus user allowlist. The server calls this to map the
|
||||
// nkey a client presented to the users table.
|
||||
func SignPubHexFromNkey(nkeyPub string) (string, error) {
|
||||
raw, err := nkeys.Decode(nkeys.PrefixByteUser, []byte(nkeyPub))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("busauth: decode nkey %q: %w", nkeyPub, err)
|
||||
}
|
||||
if len(raw) != ed25519.PublicKeySize {
|
||||
return "", fmt.Errorf("busauth: decoded nkey is %d bytes, want %d", len(raw), ed25519.PublicKeySize)
|
||||
}
|
||||
return hex.EncodeToString(raw), nil
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
package busauth
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/ed25519"
|
||||
"encoding/hex"
|
||||
"testing"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/nats-io/nkeys"
|
||||
)
|
||||
|
||||
// TestNkeyRoundTrip is the dedicated sign/verify round-trip the spec requires
|
||||
// BEFORE the NATS server depends on this conversion. It proves three things end
|
||||
// to end: (1) ClientNkey produces a signature callback whose output verifies
|
||||
// under the derived nkey public key; (2) that signature is exactly the Ed25519
|
||||
// signature of the same identity (the nkey is the same key, not a new one);
|
||||
// (3) the nkey public string maps back to the identity's Ed25519 hex, which is
|
||||
// the key the allowlist is indexed by.
|
||||
func TestNkeyRoundTrip(t *testing.T) {
|
||||
id, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
|
||||
pub, sign, err := ClientNkey(id.SignPriv)
|
||||
if err != nil {
|
||||
t.Fatalf("ClientNkey: %v", err)
|
||||
}
|
||||
|
||||
// (1) The callback's signature over a server-style nonce verifies under the
|
||||
// public nkey, exactly as the NATS server will verify it.
|
||||
nonce := []byte("server-presented-nonce-1234567890")
|
||||
sig, err := sign(nonce)
|
||||
if err != nil {
|
||||
t.Fatalf("sign: %v", err)
|
||||
}
|
||||
kpPub, err := nkeys.FromPublicKey(pub)
|
||||
if err != nil {
|
||||
t.Fatalf("FromPublicKey: %v", err)
|
||||
}
|
||||
if err := kpPub.Verify(nonce, sig); err != nil {
|
||||
t.Fatalf("nkey verify failed: %v", err)
|
||||
}
|
||||
|
||||
// (2) The signature is the very same bytes as a raw Ed25519 sign with the
|
||||
// identity's private key — confirming no separate key material was minted.
|
||||
want := ed25519.Sign(ed25519.PrivateKey(id.SignPriv), nonce)
|
||||
if !bytes.Equal(sig, want) {
|
||||
t.Fatalf("nkey signature differs from Ed25519 signature of the same identity")
|
||||
}
|
||||
|
||||
// (3) The nkey public maps back to the identity's Ed25519 hex (allowlist key).
|
||||
gotHex, err := SignPubHexFromNkey(pub)
|
||||
if err != nil {
|
||||
t.Fatalf("SignPubHexFromNkey: %v", err)
|
||||
}
|
||||
if gotHex != hex.EncodeToString(id.SignPub) {
|
||||
t.Fatalf("nkey->hex mismatch: got %s want %s", gotHex, hex.EncodeToString(id.SignPub))
|
||||
}
|
||||
|
||||
// And NkeyPublicFromSignPub is consistent with ClientNkey's public.
|
||||
pub2, err := NkeyPublicFromSignPub(id.SignPub)
|
||||
if err != nil {
|
||||
t.Fatalf("NkeyPublicFromSignPub: %v", err)
|
||||
}
|
||||
if pub2 != pub {
|
||||
t.Fatalf("public nkey mismatch between derivations: %s vs %s", pub2, pub)
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: a wrong-length private key is rejected, not silently misused.
|
||||
func TestClientNkeyBadKey(t *testing.T) {
|
||||
if _, _, err := ClientNkey([]byte("too-short")); err == nil {
|
||||
t.Fatalf("expected error for short private key")
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: a non-nkey string does not decode to an allowlist key.
|
||||
func TestSignPubHexFromNkeyBad(t *testing.T) {
|
||||
if _, err := SignPubHexFromNkey("not-a-real-nkey"); err == nil {
|
||||
t.Fatalf("expected error decoding a bogus nkey")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
package busauth
|
||||
|
||||
import server "github.com/nats-io/nats-server/v2/server"
|
||||
|
||||
// PermissionsFromSubjects adapts a subject-deriving function (e.g.
|
||||
// membership.SubjectACLFor, which maps an identity to the subjects of the rooms
|
||||
// it belongs to plus the client infrastructure subjects) into the PermissionsFunc
|
||||
// the ACL authenticator expects. The derived subjects are granted as BOTH the
|
||||
// publish and subscribe allow set, so a connection can only pub/sub on the
|
||||
// subjects it is entitled to. A derivation error is propagated so the caller
|
||||
// fails closed (denies the connection) rather than granting open access.
|
||||
//
|
||||
// This is the production wiring for the per-subject data-plane ACL (issue 0003e,
|
||||
// audit H4): membershipd passes PermissionsFromSubjects(membership.SubjectACLFor(
|
||||
// store)) to NewNkeyAuthenticatorACL. It lives in busauth (not membership) so the
|
||||
// membership package stays free of the nats-server dependency.
|
||||
func PermissionsFromSubjects(derive func(signPubHex string) ([]string, error)) PermissionsFunc {
|
||||
return func(signPubHex string) (*server.Permissions, error) {
|
||||
subjects, err := derive(signPubHex)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
sp := &server.SubjectPermission{Allow: subjects}
|
||||
return &server.Permissions{Publish: sp, Subscribe: sp}, nil
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
package busauth
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"fmt"
|
||||
"os"
|
||||
)
|
||||
|
||||
// LoadCATLSConfig reads the PEM file at caPEMPath and returns a *tls.Config
// whose RootCAs pool contains only the certificates found in that file, with a
// minimum protocol version of TLS 1.2. It is the helper bus clients use to pin
// the project's private, self-signed CA: because the bus does not use a public
// CA, the system roots would not validate the server certificate.
//
// It returns an error if the file cannot be read or contains no certificate
// that parses as PEM.
func LoadCATLSConfig(caPEMPath string) (*tls.Config, error) {
	caBytes, readErr := os.ReadFile(caPEMPath)
	if readErr != nil {
		return nil, fmt.Errorf("busauth: read CA %q: %w", caPEMPath, readErr)
	}

	roots := x509.NewCertPool()
	if added := roots.AppendCertsFromPEM(caBytes); !added {
		return nil, fmt.Errorf("busauth: CA %q contains no valid PEM certificate", caPEMPath)
	}

	cfg := &tls.Config{MinVersion: tls.VersionTLS12}
	cfg.RootCAs = roots
	return cfg, nil
}
|
||||
|
||||
// ServerTLSConfig loads the bus NATS server's certificate and private key from
// the given PEM files and returns a *tls.Config that presents that single
// certificate, with a minimum protocol version of TLS 1.2. The private key
// stays on the host; only the CA certificate is distributed to clients.
//
// It returns an error if the key pair cannot be loaded.
func ServerTLSConfig(certPEMPath, keyPEMPath string) (*tls.Config, error) {
	keyPair, loadErr := tls.LoadX509KeyPair(certPEMPath, keyPEMPath)
	if loadErr != nil {
		return nil, fmt.Errorf("busauth: load server keypair: %w", loadErr)
	}

	cfg := &tls.Config{MinVersion: tls.VersionTLS12}
	cfg.Certificates = []tls.Certificate{keyPair}
	return cfg, nil
}
|
||||
|
||||
// RouteTLSConfig returns the mutual-TLS configuration for the NATS CLUSTER
// route layer (issue 0003a). Routes are server-to-server, so unlike the client
// data plane every node both presents its own certificate and verifies the
// peer's certificate against the bus CA. The returned config therefore sets:
//
//   - Certificates: this node's certificate and key (certPEMPath/keyPEMPath),
//     presented whether the node dials or accepts the route;
//   - RootCAs: the CA at caPEMPath, used to verify a node we dial out to;
//   - ClientCAs with ClientAuth=RequireAndVerifyClientCert: the same CA, used
//     to verify a node dialing in;
//   - MinVersion: TLS 1.2.
//
// A node without a certificate signed by the bus CA thus cannot establish a
// route in either direction, even if it knows the cluster password. The CA is
// the same one used for the client data plane (deploy/tls); the node
// certificate is per node, with a SAN covering that node's route address.
//
// Errors are returned, in this order, for an unloadable key pair, an unreadable
// CA file, or a CA file with no valid PEM certificate.
func RouteTLSConfig(certPEMPath, keyPEMPath, caPEMPath string) (*tls.Config, error) {
	nodeCert, err := tls.LoadX509KeyPair(certPEMPath, keyPEMPath)
	if err != nil {
		return nil, fmt.Errorf("busauth: load route keypair: %w", err)
	}

	caBytes, err := os.ReadFile(caPEMPath)
	if err != nil {
		return nil, fmt.Errorf("busauth: read route CA %q: %w", caPEMPath, err)
	}

	busCA := x509.NewCertPool()
	if added := busCA.AppendCertsFromPEM(caBytes); !added {
		return nil, fmt.Errorf("busauth: route CA %q contains no valid PEM certificate", caPEMPath)
	}

	cfg := &tls.Config{
		MinVersion: tls.VersionTLS12,
		ClientAuth: tls.RequireAndVerifyClientCert,
	}
	cfg.Certificates = []tls.Certificate{nodeCert}
	// The same pool verifies both directions of a route handshake.
	cfg.RootCAs, cfg.ClientCAs = busCA, busCA
	return cfg, nil
}
|
||||
@@ -0,0 +1,95 @@
|
||||
package busauth
|
||||
|
||||
import (
|
||||
"crypto/ecdsa"
|
||||
"crypto/elliptic"
|
||||
"crypto/rand"
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/pem"
|
||||
"math/big"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// writeSelfSigned generates a fresh ECDSA P-256 key and a self-signed CA
// certificate valid from one hour ago until one hour from now, writes them as
// PEM to dir ("cert.pem" with mode 0644, "key.pem" with mode 0600) and returns
// both paths. Any failure aborts the calling test via t.Fatalf.
//
// One pair is enough to exercise both LoadCATLSConfig (which reads the cert as
// a CA) and ServerTLSConfig (which reads cert+key as a server key pair).
func writeSelfSigned(t *testing.T, dir string) (certPath, keyPath string) {
	t.Helper()

	priv, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
	if err != nil {
		t.Fatalf("key: %v", err)
	}

	now := time.Now()
	tmpl := x509.Certificate{
		SerialNumber:          big.NewInt(1),
		Subject:               pkix.Name{CommonName: "unibus-tls-test"},
		NotBefore:             now.Add(-time.Hour),
		NotAfter:              now.Add(time.Hour),
		IsCA:                  true,
		KeyUsage:              x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature,
		BasicConstraintsValid: true,
	}
	// Template doubles as parent: the certificate signs itself.
	certDER, err := x509.CreateCertificate(rand.Reader, &tmpl, &tmpl, &priv.PublicKey, priv)
	if err != nil {
		t.Fatalf("cert: %v", err)
	}

	certPath, keyPath = filepath.Join(dir, "cert.pem"), filepath.Join(dir, "key.pem")

	certBlock := pem.Block{Type: "CERTIFICATE", Bytes: certDER}
	if writeErr := os.WriteFile(certPath, pem.EncodeToMemory(&certBlock), 0o644); writeErr != nil {
		t.Fatalf("write cert: %v", writeErr)
	}

	keyDER, err := x509.MarshalECPrivateKey(priv)
	if err != nil {
		t.Fatalf("marshal key: %v", err)
	}
	keyBlock := pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER}
	if writeErr := os.WriteFile(keyPath, pem.EncodeToMemory(&keyBlock), 0o600); writeErr != nil {
		t.Fatalf("write key: %v", writeErr)
	}
	return certPath, keyPath
}
|
||||
|
||||
// Golden: a valid CA PEM loads into a config with a non-empty RootCAs pool, and
|
||||
// a valid keypair loads into a config presenting one certificate.
|
||||
func TestLoadTLSConfigsGolden(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
certPath, keyPath := writeSelfSigned(t, dir)
|
||||
|
||||
caCfg, err := LoadCATLSConfig(certPath)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadCATLSConfig: %v", err)
|
||||
}
|
||||
if caCfg.RootCAs == nil {
|
||||
t.Fatalf("expected a populated RootCAs pool")
|
||||
}
|
||||
|
||||
srvCfg, err := ServerTLSConfig(certPath, keyPath)
|
||||
if err != nil {
|
||||
t.Fatalf("ServerTLSConfig: %v", err)
|
||||
}
|
||||
if len(srvCfg.Certificates) != 1 {
|
||||
t.Fatalf("expected exactly one server certificate, got %d", len(srvCfg.Certificates))
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: missing file, and a file that is not valid PEM.
|
||||
func TestLoadTLSConfigsErrors(t *testing.T) {
|
||||
if _, err := LoadCATLSConfig("/no/such/ca.crt"); err == nil {
|
||||
t.Fatalf("expected error for missing CA file")
|
||||
}
|
||||
dir := t.TempDir()
|
||||
junk := filepath.Join(dir, "junk.crt")
|
||||
if err := os.WriteFile(junk, []byte("not a pem"), 0o644); err != nil {
|
||||
t.Fatalf("write junk: %v", err)
|
||||
}
|
||||
if _, err := LoadCATLSConfig(junk); err == nil {
|
||||
t.Fatalf("expected error for non-PEM CA file")
|
||||
}
|
||||
if _, err := ServerTLSConfig("/no/such/server.crt", "/no/such/server.key"); err == nil {
|
||||
t.Fatalf("expected error for missing server keypair")
|
||||
}
|
||||
}
|
||||
+396
-79
@@ -16,16 +16,23 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"crypto/tls"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
@@ -44,20 +51,130 @@ type Client struct {
|
||||
endpoint string
|
||||
nc *nats.Conn
|
||||
js jetstream.JetStream // durable plane for rooms with Policy.Persist
|
||||
ctrlURL string
|
||||
ctrlURLs []string // control-plane HTTP endpoints, tried in order (failover)
|
||||
http *http.Client
|
||||
|
||||
// natsServers + natsOpts are retained so RefreshSession can rebuild the
|
||||
// data-plane connection (re-triggering the server's subject-ACL evaluation).
|
||||
natsServers []string
|
||||
natsOpts []nats.Option
|
||||
|
||||
mu sync.RWMutex
|
||||
keyCache map[string]map[int][]byte // roomID -> epoch -> K
|
||||
signCache map[string][]byte // sender endpoint -> sign pub (for verification)
|
||||
}
|
||||
|
||||
// Options configures how a client connects to the bus. The zero value is the
// legacy behavior: a plain NATS connection with no nkey and no TLS — what dev
// stacks and a not-yet-secured server expect. Secured deployments set these.
type Options struct {
	// UseNkey authenticates the NATS connection with the peer's Ed25519 identity
	// reused as a NATS nkey. It MUST match the server: nats.go refuses to connect
	// with an nkey to a server that does not advertise nkey auth ("nkeys not
	// supported by the server"), so this is opt-in rather than always-on.
	UseNkey bool

	// TLS, when non-nil, secures the NATS (data plane) connection and pins the
	// server to this config's RootCAs (the bus's self-signed CA). Build it with
	// busauth.LoadCATLSConfig(caPath). Nil keeps the data plane plaintext.
	TLS *tls.Config

	// CtrlTLS, when non-nil, secures the HTTP control-plane connection and pins
	// it to this config's RootCAs. It is separate from TLS so the two planes can
	// be secured independently (a test may TLS one and not the other);
	// production sets both to the same CA via Connect. Nil keeps the control
	// plane plaintext.
	CtrlTLS *tls.Config

	// NatsServers are ADDITIONAL NATS seed URLs for cluster failover (issue
	// 0003e), beyond the primary natsURL passed to the constructor. With more
	// than one server nats.go reconnects to a surviving node automatically when
	// the one a client is attached to dies, so a node loss is transparent.
	NatsServers []string

	// CtrlURLs are ADDITIONAL control-plane HTTP endpoints (one per node) beyond
	// the primary ctrlURL. Each request is tried against them in order until one
	// answers, so the control plane survives a node loss too. With the
	// decentralized KV store every node serves the same state, so any of them
	// can answer any request.
	CtrlURLs []string
}
|
||||
|
||||
// dedupNonEmpty filters in, keeping the first occurrence of every non-empty
// string in its original position and dropping empty strings and later
// repeats. It returns nil when nothing survives the filter.
//
// It builds the NATS seed list and the control-plane list from a primary URL
// plus optional extras, so a URL repeated in the extras is not listed twice.
func dedupNonEmpty(in []string) []string {
	var kept []string
	known := make(map[string]struct{}, len(in))
	for i := range in {
		candidate := in[i]
		if candidate == "" {
			continue
		}
		if _, dup := known[candidate]; dup {
			continue
		}
		known[candidate] = struct{}{}
		kept = append(kept, candidate)
	}
	return kept
}
|
||||
|
||||
// New connects to NATS and records the control-plane URL with default Options
|
||||
// (no nkey, no TLS). The identity holds the peer's long-term keypairs.
|
||||
func New(natsURL, ctrlURL string, id cs.Identity) (*Client, error) {
|
||||
nc, err := nats.Connect(natsURL, nats.Name("unibus-client"))
|
||||
return NewWithOptions(natsURL, ctrlURL, id, Options{})
|
||||
}
|
||||
|
||||
// Connect is the single migration seam every peer (worker, chat, mobile,
|
||||
// gateway) uses to pick its security posture from one input: the CA path. With
|
||||
// a non-empty caPath it connects securely — TLS pinned to that CA plus nkey
|
||||
// authentication on the data plane — matching a bus running with bus-auth
|
||||
// enforce + bus-tls. With an empty caPath it falls back to the legacy plaintext,
|
||||
// no-nkey connection for local dev against an unsecured bus. The control-plane
|
||||
// HTTP requests are signed in both cases (that signing is unconditional).
|
||||
func Connect(natsURL, ctrlURL string, id cs.Identity, caPath string) (*Client, error) {
|
||||
if caPath == "" {
|
||||
return New(natsURL, ctrlURL, id)
|
||||
}
|
||||
// A CA implies the bus is TLS on BOTH planes. Refuse a plaintext control-plane
|
||||
// URL: signing gives integrity, not confidentiality, so sending metadata over
|
||||
// http:// when the operator provisioned a CA would silently leak it to a MITM
|
||||
// (audit H5). Force https rather than silently downgrade.
|
||||
if !strings.HasPrefix(ctrlURL, "https://") {
|
||||
return nil, fmt.Errorf("client: control-plane URL %q must be https:// when a CA is provided", ctrlURL)
|
||||
}
|
||||
tlsCfg, err := busauth.LoadCATLSConfig(caPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client: connect nats %q: %w", natsURL, err)
|
||||
return nil, fmt.Errorf("client: load CA %q: %w", caPath, err)
|
||||
}
|
||||
// Pin the same CA on both planes: nkey+TLS on NATS, TLS on the HTTP control plane.
|
||||
return NewWithOptions(natsURL, ctrlURL, id, Options{UseNkey: true, TLS: tlsCfg, CtrlTLS: tlsCfg})
|
||||
}
|
||||
|
||||
// NewWithOptions is New with explicit connection options (nkey auth, and, from
|
||||
// phase 0001d, TLS). It is the single place the data-plane connection is built,
|
||||
// so every peer (worker, chat, mobile, gateway) gets identical behavior by
|
||||
// passing the same Options.
|
||||
func NewWithOptions(natsURL, ctrlURL string, id cs.Identity, opts Options) (*Client, error) {
|
||||
// Seed list = primary + extras. With more than one seed, nats.go fails over
|
||||
// to a surviving node on disconnect; MaxReconnects(-1) keeps it retrying
|
||||
// indefinitely so a node coming back is rejoined rather than given up on.
|
||||
natsServers := dedupNonEmpty(append([]string{natsURL}, opts.NatsServers...))
|
||||
natsOpts := []nats.Option{
|
||||
nats.Name("unibus-client"),
|
||||
nats.MaxReconnects(-1),
|
||||
nats.ReconnectWait(250 * time.Millisecond),
|
||||
}
|
||||
if len(natsServers) > 1 {
|
||||
// Try every seed on the initial connect too, so startup tolerates one
|
||||
// seed being down.
|
||||
natsOpts = append(natsOpts, nats.RetryOnFailedConnect(true))
|
||||
}
|
||||
if opts.UseNkey {
|
||||
nkeyPub, nkeySign, err := busauth.ClientNkey(id.SignPriv)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client: derive nkey: %w", err)
|
||||
}
|
||||
natsOpts = append(natsOpts, nats.Nkey(nkeyPub, nkeySign))
|
||||
}
|
||||
if opts.TLS != nil {
|
||||
natsOpts = append(natsOpts, nats.Secure(opts.TLS))
|
||||
}
|
||||
nc, err := nats.Connect(strings.Join(natsServers, ","), natsOpts...)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client: connect nats %v: %w", natsServers, err)
|
||||
}
|
||||
// JetStream context for the durable plane. Obtaining it does not require any
|
||||
// stream to exist yet and has no effect on cleartext/ephemeral rooms — those
|
||||
@@ -67,18 +184,58 @@ func New(natsURL, ctrlURL string, id cs.Identity) (*Client, error) {
|
||||
nc.Close()
|
||||
return nil, fmt.Errorf("client: init jetstream: %w", err)
|
||||
}
|
||||
// The control-plane HTTP client pins the bus CA when CtrlTLS is set, so an
|
||||
// https:// control plane is verified against the bus's own CA rather than the
|
||||
// system roots (audit H5). Without it the client stays plaintext for dev.
|
||||
httpClient := &http.Client{Timeout: 10 * time.Second}
|
||||
if opts.CtrlTLS != nil {
|
||||
httpClient.Transport = &http.Transport{TLSClientConfig: opts.CtrlTLS.Clone()}
|
||||
}
|
||||
return &Client{
|
||||
id: id,
|
||||
endpoint: frame.EndpointID(id.SignPub),
|
||||
nc: nc,
|
||||
js: js,
|
||||
ctrlURL: ctrlURL,
|
||||
http: &http.Client{Timeout: 10 * time.Second},
|
||||
keyCache: map[string]map[int][]byte{},
|
||||
signCache: map[string][]byte{},
|
||||
id: id,
|
||||
endpoint: frame.EndpointID(id.SignPub),
|
||||
nc: nc,
|
||||
js: js,
|
||||
ctrlURLs: dedupNonEmpty(append([]string{ctrlURL}, opts.CtrlURLs...)),
|
||||
http: httpClient,
|
||||
natsServers: natsServers,
|
||||
natsOpts: natsOpts,
|
||||
keyCache: map[string]map[int][]byte{},
|
||||
signCache: map[string][]byte{},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// RefreshSession rebuilds the data-plane NATS connection so the server's
|
||||
// subject-ACL authenticator re-evaluates this peer's room membership (issue
|
||||
// 0003e, audit H4 residual). Call it after a membership change — a room you
|
||||
// created, were invited to, or joined — when the bus enforces per-subject
|
||||
// permissions, so the new room's subject becomes publishable and subscribable
|
||||
// (NATS freezes permissions at connect time, so the prior connection cannot see
|
||||
// the new room).
|
||||
//
|
||||
// It opens a fresh connection with the same seeds/options and swaps it in.
|
||||
// IMPORTANT: active subscriptions from the previous connection are dropped —
|
||||
// re-subscribe (client.Subscribe) to your rooms after calling this. The key and
|
||||
// signer caches are preserved. On a non-ACL bus this is a no-op-safe reconnect.
|
||||
func (c *Client) RefreshSession() error {
|
||||
nc, err := nats.Connect(strings.Join(c.natsServers, ","), c.natsOpts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("client: refresh session: reconnect nats: %w", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
return fmt.Errorf("client: refresh session: init jetstream: %w", err)
|
||||
}
|
||||
old := c.nc
|
||||
c.mu.Lock()
|
||||
c.nc = nc
|
||||
c.js = js
|
||||
c.mu.Unlock()
|
||||
old.Close()
|
||||
return nil
|
||||
}
|
||||
|
||||
// Endpoint returns this client's public identity.
|
||||
func (c *Client) Endpoint() Endpoint {
|
||||
return Endpoint{ID: c.endpoint, SignPub: c.id.SignPub, KexPub: c.id.KexPub}
|
||||
@@ -90,6 +247,15 @@ func (c *Client) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ConnectedServer returns the URL of the NATS node this client is currently
|
||||
// attached to (empty when disconnected). It is observability for cluster
|
||||
// failover: after a node dies, this reports the surviving node nats.go
|
||||
// reconnected to. IsConnected reports whether the data-plane link is up.
|
||||
func (c *Client) ConnectedServer() string { return c.nc.ConnectedUrl() }
|
||||
|
||||
// IsConnected reports whether the NATS data-plane connection is currently up.
|
||||
func (c *Client) IsConnected() bool { return c.nc.IsConnected() }
|
||||
|
||||
// ---- key cache ------------------------------------------------------------
|
||||
|
||||
func (c *Client) cacheKey(roomID string, epoch int, k []byte) {
|
||||
@@ -116,54 +282,105 @@ func (c *Client) getCachedKey(roomID string, epoch int) ([]byte, bool) {
|
||||
// ---- control-plane HTTP helpers ------------------------------------------
|
||||
|
||||
func (c *Client) doJSON(method, path string, body, out any) error {
|
||||
var rdr io.Reader
|
||||
var bodyBytes []byte
|
||||
if body != nil {
|
||||
b, err := json.Marshal(body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("client: marshal request: %w", err)
|
||||
}
|
||||
rdr = bytes.NewReader(b)
|
||||
bodyBytes = b
|
||||
}
|
||||
req, err := http.NewRequest(method, c.ctrlURL+path, rdr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("client: new request: %w", err)
|
||||
}
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("client: do %s %s: %w", method, path, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
// Surface the server's structured {"error": "..."} message when present,
|
||||
// instead of leaking the raw HTTP envelope (method, path, status, JSON body).
|
||||
var er struct {
|
||||
Error string `json:"error"`
|
||||
// Try each control-plane endpoint in order. A transport error (a dead node)
|
||||
// falls over to the next; an HTTP response (any status) is authoritative and
|
||||
// returned, since every node serves the same state. Each attempt is freshly
|
||||
// signed (new nonce), so a failed-over retry is never seen as a replay.
|
||||
var lastErr error
|
||||
for _, base := range c.ctrlURLs {
|
||||
req, err := c.newSignedRequestTo(base, method, path, bodyBytes)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if json.Unmarshal(respBody, &er) == nil && er.Error != "" {
|
||||
return fmt.Errorf("%s (HTTP %d)", er.Error, resp.StatusCode)
|
||||
if body != nil {
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
}
|
||||
return fmt.Errorf("client: %s %s -> %d: %s", method, path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
if out != nil {
|
||||
if err := json.Unmarshal(respBody, out); err != nil {
|
||||
return fmt.Errorf("client: decode response: %w", err)
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue // dead node: try the next control plane
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
// Surface the server's structured {"error": "..."} message when present,
|
||||
// instead of leaking the raw HTTP envelope (method, path, status, body).
|
||||
var er struct {
|
||||
Error string `json:"error"`
|
||||
}
|
||||
if json.Unmarshal(respBody, &er) == nil && er.Error != "" {
|
||||
return fmt.Errorf("%s (HTTP %d)", er.Error, resp.StatusCode)
|
||||
}
|
||||
return fmt.Errorf("client: %s %s -> %d: %s", method, path, resp.StatusCode, string(respBody))
|
||||
}
|
||||
if out != nil {
|
||||
if err := json.Unmarshal(respBody, out); err != nil {
|
||||
return fmt.Errorf("client: decode response: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
return fmt.Errorf("client: %s %s: all control planes failed: %w", method, path, lastErr)
|
||||
}
|
||||
|
||||
// signRequest signs the canonical bytes of req (req must already have its Sig
|
||||
// field cleared) with the client's Ed25519 key. It is symmetric with the
|
||||
// server's verifyOwnerSig.
|
||||
// server's verifyOwnerSig. This is the PAYLOAD-level owner signature that
|
||||
// authorizes room operations (invite/rekey) by ownership — distinct from the
|
||||
// transport-level request signature applied by newSignedRequest below, which
|
||||
// authenticates the caller's identity on every request.
|
||||
func (c *Client) signRequest(req any) []byte {
|
||||
b, _ := json.Marshal(req)
|
||||
return cs.SignEd25519(c.id.SignPriv, b)
|
||||
}
|
||||
|
||||
// newSignedRequestTo builds an *http.Request to the control-plane endpoint
|
||||
// `base` and attaches the transport authentication headers
|
||||
// (X-Unibus-Pub/Ts/Nonce/Sig) signing the canonical request bytes with this
|
||||
// peer's Ed25519 key. path is the request URI (path plus any query); body is the
|
||||
// raw request body (nil for GET). The server (membership.authenticate) verifies
|
||||
// these headers under the bus-auth flag. The signature covers method+path+ts+
|
||||
// nonce+sha256(body), NOT the host, so the same request can be addressed to any
|
||||
// node — and each failover attempt mints a fresh nonce so it is never a replay.
|
||||
//
|
||||
// Signing happens on every request — including GETs — so that under enforce the
|
||||
// server can authenticate the caller and reject unregistered or revoked
|
||||
// identities uniformly. The canonical construction is the single source of truth
|
||||
// in membership.CanonicalRequest, shared by both sides.
|
||||
func (c *Client) newSignedRequestTo(base, method, path string, body []byte) (*http.Request, error) {
|
||||
var rdr io.Reader
|
||||
if body != nil {
|
||||
rdr = bytes.NewReader(body)
|
||||
}
|
||||
req, err := http.NewRequest(method, base+path, rdr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client: new request: %w", err)
|
||||
}
|
||||
|
||||
ts := strconv.FormatInt(time.Now().Unix(), 10)
|
||||
nonceRaw := make([]byte, 16)
|
||||
if _, err := rand.Read(nonceRaw); err != nil {
|
||||
return nil, fmt.Errorf("client: generate nonce: %w", err)
|
||||
}
|
||||
nonce := base64.StdEncoding.EncodeToString(nonceRaw)
|
||||
canonical := membership.CanonicalRequest(method, path, ts, nonce, body)
|
||||
sig := cs.SignEd25519(c.id.SignPriv, canonical)
|
||||
|
||||
req.Header.Set("X-Unibus-Pub", hex.EncodeToString(c.id.SignPub))
|
||||
req.Header.Set("X-Unibus-Ts", ts)
|
||||
req.Header.Set("X-Unibus-Nonce", nonce)
|
||||
req.Header.Set("X-Unibus-Sig", base64.StdEncoding.EncodeToString(sig))
|
||||
return req, nil
|
||||
}
|
||||
|
||||
// ---- mirror of server wire types (control plane) -------------------------
|
||||
|
||||
type policyJSON struct {
|
||||
@@ -231,8 +448,48 @@ type blobResp struct {
|
||||
Hash string `json:"hash"`
|
||||
}
|
||||
|
||||
type memberRoomJSON struct {
|
||||
RoomID string `json:"room_id"`
|
||||
Subject string `json:"subject"`
|
||||
Epoch int `json:"epoch"`
|
||||
Policy policyJSON `json:"policy"`
|
||||
Role string `json:"role"`
|
||||
}
|
||||
|
||||
// ---- room operations ------------------------------------------------------
|
||||
|
||||
// RoomRef is a room this peer belongs to, returned by ListMyRooms. It is the
|
||||
// unit of room discovery: a peer that was invited to a new room finds it here
|
||||
// and can then Join (fetch the sealed key) and Subscribe.
|
||||
type RoomRef struct {
|
||||
RoomID string
|
||||
Subject string
|
||||
Epoch int
|
||||
Policy room.Policy
|
||||
Role string
|
||||
}
|
||||
|
||||
// ListMyRooms returns every room this peer is currently a member of. A peer
|
||||
// polls this to discover rooms it has been invited to (the control plane is
|
||||
// pull-based: there is no server push of invitations).
|
||||
func (c *Client) ListMyRooms() ([]RoomRef, error) {
|
||||
var resp []memberRoomJSON
|
||||
if err := c.doJSON("GET", "/members/"+c.endpoint+"/rooms", nil, &resp); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := make([]RoomRef, 0, len(resp))
|
||||
for _, r := range resp {
|
||||
out = append(out, RoomRef{
|
||||
RoomID: r.RoomID,
|
||||
Subject: r.Subject,
|
||||
Epoch: r.Epoch,
|
||||
Policy: room.Policy{Encrypt: r.Policy.Encrypt, Persist: r.Policy.Persist, SignMsgs: r.Policy.SignMsgs},
|
||||
Role: r.Role,
|
||||
})
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// newRoomKey returns 32 random bytes for a symmetric room key.
|
||||
func newRoomKey() ([]byte, error) {
|
||||
k := make([]byte, 32)
|
||||
@@ -392,20 +649,31 @@ func (c *Client) signerPub(roomID, sender string) ([]byte, error) {
|
||||
|
||||
// ---- data plane: publish/subscribe ---------------------------------------
|
||||
|
||||
// threadMeta carries the optional threading/reaction routing of a published
// frame. The zero value yields a plain top-level message whose wire bytes are
// identical to a pre-threading frame (the fields are omitempty).
type threadMeta struct {
	threadID string // thread root message id
	replyTo  string // message id being replied to / reacted to
}
|
||||
|
||||
// publishFrame is the single publish path shared by Publish, PublishReply and
|
||||
// React. It builds the envelope, seals+signs per the room policy, and routes
|
||||
// through JetStream (persisted rooms) or core NATS (ephemeral rooms). The only
|
||||
// thing the callers vary is the frame type and the threading metadata.
|
||||
func (c *Client) publishFrame(roomID string, ftype frame.FrameType, plaintext []byte, tm threadMeta) error {
|
||||
info, err := c.fetchRoom(roomID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
f := frame.Frame{
|
||||
Type: frame.PUB,
|
||||
Subject: info.Subject,
|
||||
Sender: c.endpoint,
|
||||
MsgID: newULID(),
|
||||
Epoch: info.Epoch,
|
||||
Type: ftype,
|
||||
Subject: info.Subject,
|
||||
Sender: c.endpoint,
|
||||
MsgID: newULID(),
|
||||
Epoch: info.Epoch,
|
||||
ThreadID: tm.threadID,
|
||||
ReplyTo: tm.replyTo,
|
||||
}
|
||||
if info.Policy.Encrypt {
|
||||
k, ep, err := c.fetchKey(roomID, info.Epoch)
|
||||
@@ -435,6 +703,31 @@ func (c *Client) Publish(roomID string, plaintext []byte) error {
|
||||
return c.nc.Publish(info.Subject, b)
|
||||
}
|
||||
|
||||
// Publish sends plaintext to a room. For encrypted rooms it seals the payload
|
||||
// with the current K using the subject as AEAD additional-authenticated-data;
|
||||
// for signed rooms it attaches an Ed25519 signature.
|
||||
func (c *Client) Publish(roomID string, plaintext []byte) error {
|
||||
return c.publishFrame(roomID, frame.PUB, plaintext, threadMeta{})
|
||||
}
|
||||
|
||||
// PublishReply sends plaintext as a reply inside a thread. replyTo is the id of
|
||||
// the message being replied to; threadID is the thread root — pass replyTo when
|
||||
// you are starting a new thread off a top-level message, or the existing
|
||||
// ThreadID to keep replying within one. Encryption and signing are identical to
|
||||
// Publish; the threading metadata rides the cleartext envelope. Receivers read
|
||||
// Frame.ReplyTo / Frame.ThreadID to render the conversation tree.
|
||||
func (c *Client) PublishReply(roomID string, plaintext []byte, replyTo, threadID string) error {
|
||||
return c.publishFrame(roomID, frame.PUB, plaintext, threadMeta{threadID: threadID, replyTo: replyTo})
|
||||
}
|
||||
|
||||
// React publishes a reaction (emoji/shortcode) to a target message. The reaction
|
||||
// content travels in the payload, so it is sealed exactly like a normal message
|
||||
// and stays confidential in E2E rooms. Receivers dispatch on Frame.Type ==
|
||||
// frame.REACT and read Frame.ReplyTo for the message being reacted to.
|
||||
func (c *Client) React(roomID, targetMsgID, emoji string) error {
|
||||
return c.publishFrame(roomID, frame.REACT, []byte(emoji), threadMeta{replyTo: targetMsgID})
|
||||
}
|
||||
|
||||
// Sub is a transport-agnostic handle to an active room subscription. It wraps
|
||||
// either a core NATS subscription (ephemeral rooms) or a JetStream durable
|
||||
// consumer (persisted rooms) behind a single Unsubscribe() method, so callers
|
||||
@@ -506,7 +799,17 @@ func (c *Client) processFrame(roomID string, info roomView, data []byte, handler
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if info.Policy.SignMsgs && f.Sig != nil {
|
||||
// A room with SignMsgs REQUIRES a signature, so an unsigned frame is
|
||||
// unauthenticated and must be dropped — not silently accepted. The previous
|
||||
// `&& f.Sig != nil` guard verified the signature only when one was present, so
|
||||
// an attacker with data-plane access could publish a frame with Sig==nil and a
|
||||
// forged Sender and have the receiver accept it as authentic in a room that
|
||||
// demands signatures (audit N3, report 0006). Requiring the signature first
|
||||
// closes that spoof.
|
||||
if info.Policy.SignMsgs {
|
||||
if f.Sig == nil {
|
||||
return // signature required by room policy but absent: drop
|
||||
}
|
||||
pub, err := c.signerPub(roomID, f.Sender)
|
||||
if err != nil || !cs.VerifyEd25519(pub, f.SigningBytes(), f.Sig) {
|
||||
return // unauthenticated frame: drop
|
||||
@@ -693,36 +996,50 @@ func (c *Client) FetchMedia(roomID string, f frame.Frame) ([]byte, error) {
|
||||
}
|
||||
|
||||
func (c *Client) putBlob(ciphertext []byte) (string, error) {
|
||||
req, err := http.NewRequest("POST", c.ctrlURL+"/blobs", bytes.NewReader(ciphertext))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("client: new blob request: %w", err)
|
||||
var lastErr error
|
||||
for _, base := range c.ctrlURLs {
|
||||
req, err := c.newSignedRequestTo(base, "POST", "/blobs", ciphertext)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/octet-stream")
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue // dead node: try the next control plane
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
return "", fmt.Errorf("client: put blob -> %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
var r blobResp
|
||||
if err := json.Unmarshal(body, &r); err != nil {
|
||||
return "", fmt.Errorf("client: decode blob resp: %w", err)
|
||||
}
|
||||
return r.Hash, nil
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/octet-stream")
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("client: put blob: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
if resp.StatusCode >= 300 {
|
||||
return "", fmt.Errorf("client: put blob -> %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
var r blobResp
|
||||
if err := json.Unmarshal(body, &r); err != nil {
|
||||
return "", fmt.Errorf("client: decode blob resp: %w", err)
|
||||
}
|
||||
return r.Hash, nil
|
||||
return "", fmt.Errorf("client: put blob: all control planes failed: %w", lastErr)
|
||||
}
|
||||
|
||||
func (c *Client) getBlob(hash string) ([]byte, error) {
|
||||
resp, err := c.http.Get(c.ctrlURL + "/blobs/" + hash)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client: get blob: %w", err)
|
||||
var lastErr error
|
||||
for _, base := range c.ctrlURLs {
|
||||
req, err := c.newSignedRequestTo(base, "GET", "/blobs/"+hash, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue // dead node: try the next control plane
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("client: get blob -> %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
return io.ReadAll(resp.Body)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("client: get blob -> %d: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
return io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("client: get blob: all control planes failed: %w", lastErr)
|
||||
}
|
||||
|
||||
+298
-9
@@ -1,10 +1,13 @@
|
||||
package client_test
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"encoding/hex"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -12,6 +15,7 @@ import (
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
@@ -27,6 +31,8 @@ type testHarness struct {
|
||||
ctrlURL string
|
||||
ns *server.Server
|
||||
httpts *httptest.Server
|
||||
store membership.Store
|
||||
srv *membership.Server
|
||||
}
|
||||
|
||||
func freePort(t *testing.T) int {
|
||||
@@ -39,29 +45,61 @@ func freePort(t *testing.T) int {
|
||||
return l.Addr().(*net.TCPAddr).Port
|
||||
}
|
||||
|
||||
func newHarness(t *testing.T) *testHarness {
|
||||
func newHarness(t *testing.T) *testHarness { return newHarnessFull(t, membership.AuthOff, false) }
|
||||
|
||||
// newHarnessMode is newHarness with an explicit control-plane auth mode and the
|
||||
// NATS data plane left open (no nkey auth), so HTTP-auth tests can use a plain
|
||||
// client.New that does not present an nkey.
|
||||
func newHarnessMode(t *testing.T, mode membership.AuthMode) *testHarness {
|
||||
return newHarnessFull(t, mode, false)
|
||||
}
|
||||
|
||||
// newHarnessFull boots the embedded NATS (optionally with the nkey authenticator
|
||||
// backed by the user allowlist) and the membershipd HTTP server in ctrlMode.
|
||||
// natsAuth and ctrlMode are independent on purpose: an HTTP-enforce test can
|
||||
// keep NATS open, and an nkey test can keep HTTP off, mirroring how the rollout
|
||||
// flags compose. The store is created before NATS so the authenticator can
|
||||
// consult IsAuthorized for live revocation.
|
||||
func newHarnessFull(t *testing.T, ctrlMode membership.AuthMode, natsAuth bool) *testHarness {
|
||||
return bootHarness(t, ctrlMode, natsAuth, nil)
|
||||
}
|
||||
|
||||
// bootHarness is the shared body: a store, an embedded NATS (optionally with the
|
||||
// nkey authenticator and/or TLS), and the membershipd HTTP server in ctrlMode.
|
||||
func bootHarness(t *testing.T, ctrlMode membership.AuthMode, natsAuth bool, natsTLS *tls.Config) *testHarness {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
|
||||
ns, err := embeddednats.Start(filepath.Join(dir, "js"), freePort(t))
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("membership store: %v", err)
|
||||
}
|
||||
|
||||
cfg := embeddednats.ServerConfig{
|
||||
StoreDir: filepath.Join(dir, "js"),
|
||||
Host: "127.0.0.1",
|
||||
Port: freePort(t),
|
||||
TLS: natsTLS,
|
||||
}
|
||||
if natsAuth {
|
||||
cfg.Auth = busauth.NewNkeyAuthenticator(store.IsAuthorized)
|
||||
}
|
||||
ns, err := embeddednats.StartServer(cfg)
|
||||
if err != nil {
|
||||
store.Close()
|
||||
t.Fatalf("embedded nats: %v", err)
|
||||
}
|
||||
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
t.Fatalf("membership store: %v", err)
|
||||
}
|
||||
blobs, err := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
store.Close()
|
||||
t.Fatalf("blob store: %v", err)
|
||||
}
|
||||
srv := membership.NewServer(store, blobs)
|
||||
srv := membership.NewServer(store, blobs, ctrlMode)
|
||||
httpts := httptest.NewServer(srv)
|
||||
|
||||
h := &testHarness{natsURL: embeddednats.ClientURL(ns), ctrlURL: httpts.URL, ns: ns, httpts: httpts}
|
||||
h := &testHarness{natsURL: embeddednats.ClientURL(ns), ctrlURL: httpts.URL, ns: ns, httpts: httpts, store: store, srv: srv}
|
||||
t.Cleanup(func() {
|
||||
httpts.Close()
|
||||
store.Close()
|
||||
@@ -71,6 +109,15 @@ func newHarness(t *testing.T) *testHarness {
|
||||
return h
|
||||
}
|
||||
|
||||
// registerClient adds a peer's signing identity to the bus allowlist so its
|
||||
// signed control-plane requests pass under enforce.
|
||||
func registerClient(t *testing.T, h *testHarness, c *client.Client, handle, role string) {
|
||||
t.Helper()
|
||||
if err := h.store.AddUser(hex.EncodeToString(c.Endpoint().SignPub), handle, role); err != nil {
|
||||
t.Fatalf("register %s: %v", handle, err)
|
||||
}
|
||||
}
|
||||
|
||||
func waitHealth(t *testing.T, ctrlURL string) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
@@ -302,6 +349,248 @@ func TestMediaBlobRoundTrip(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestThreadedReplyAndReaction exercises the additive threading API end to end
|
||||
// in an encrypted, persisted, signed room (ModeMatrix): A publishes a root
|
||||
// message, replies to it within a thread, and reacts to it with an emoji. The
|
||||
// loopback subscriber must observe the reply carrying ReplyTo/ThreadID and the
|
||||
// reaction as a frame.REACT whose (decrypted) payload is the emoji — proving the
|
||||
// reaction stays sealed like any message in an E2E room.
|
||||
func TestThreadedReplyAndReaction(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
waitHealth(t, h.ctrlURL)
|
||||
|
||||
a, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect A: %v", err)
|
||||
}
|
||||
defer a.Close()
|
||||
|
||||
roomID, err := a.CreateRoom("room.thread", room.ModeMatrix)
|
||||
if err != nil {
|
||||
t.Fatalf("create room: %v", err)
|
||||
}
|
||||
|
||||
type rec struct {
|
||||
f frame.Frame
|
||||
pt string
|
||||
}
|
||||
var mu sync.Mutex
|
||||
var got []rec
|
||||
sub, err := a.Subscribe(roomID, func(f frame.Frame, pt []byte) {
|
||||
mu.Lock()
|
||||
got = append(got, rec{f: f, pt: string(pt)})
|
||||
mu.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("subscribe: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
|
||||
find := func(pred func(rec) bool) (rec, bool) {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
for _, r := range got {
|
||||
if pred(r) {
|
||||
return r, true
|
||||
}
|
||||
}
|
||||
return rec{}, false
|
||||
}
|
||||
waitRec := func(pred func(rec) bool) (rec, bool) {
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if r, ok := find(pred); ok {
|
||||
return r, true
|
||||
}
|
||||
time.Sleep(25 * time.Millisecond)
|
||||
}
|
||||
return rec{}, false
|
||||
}
|
||||
|
||||
// 1. Root message.
|
||||
if err := a.Publish(roomID, []byte("root")); err != nil {
|
||||
t.Fatalf("publish root: %v", err)
|
||||
}
|
||||
rootRec, ok := waitRec(func(r rec) bool { return r.pt == "root" })
|
||||
if !ok {
|
||||
t.Fatalf("never observed the root message")
|
||||
}
|
||||
rootID := rootRec.f.MsgID
|
||||
if rootID == "" {
|
||||
t.Fatalf("root frame has empty MsgID")
|
||||
}
|
||||
|
||||
// 2. Threaded reply to the root.
|
||||
if err := a.PublishReply(roomID, []byte("child"), rootID, rootID); err != nil {
|
||||
t.Fatalf("publish reply: %v", err)
|
||||
}
|
||||
reply, ok := waitRec(func(r rec) bool { return r.pt == "child" })
|
||||
if !ok {
|
||||
t.Fatalf("never observed the threaded reply")
|
||||
}
|
||||
if reply.f.ReplyTo != rootID || reply.f.ThreadID != rootID {
|
||||
t.Fatalf("reply threading lost: ReplyTo=%q ThreadID=%q want %q", reply.f.ReplyTo, reply.f.ThreadID, rootID)
|
||||
}
|
||||
|
||||
// 3. Reaction to the root (emoji rides the encrypted payload).
|
||||
if err := a.React(roomID, rootID, "👍"); err != nil {
|
||||
t.Fatalf("react: %v", err)
|
||||
}
|
||||
reaction, ok := waitRec(func(r rec) bool { return r.f.Type == frame.REACT })
|
||||
if !ok {
|
||||
t.Fatalf("never observed the reaction frame")
|
||||
}
|
||||
if reaction.f.ReplyTo != rootID {
|
||||
t.Fatalf("reaction target lost: ReplyTo=%q want %q", reaction.f.ReplyTo, rootID)
|
||||
}
|
||||
if reaction.pt != "👍" {
|
||||
t.Fatalf("reaction payload mismatch: got %q want 👍 (decryption in E2E room)", reaction.pt)
|
||||
}
|
||||
}
|
||||
|
||||
// TestListMyRoomsDiscovery verifies room discovery: an invited peer finds the
|
||||
// room via ListMyRooms (without being told its id), and a peer in no rooms gets
|
||||
// an empty list. This is what lets a bot discover rooms it was invited to.
|
||||
func TestListMyRoomsDiscovery(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
waitHealth(t, h.ctrlURL)
|
||||
|
||||
a, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect A: %v", err)
|
||||
}
|
||||
defer a.Close()
|
||||
b, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect B: %v", err)
|
||||
}
|
||||
defer b.Close()
|
||||
|
||||
// B is in no rooms yet.
|
||||
if rooms, err := b.ListMyRooms(); err != nil || len(rooms) != 0 {
|
||||
t.Fatalf("B should start in no rooms, got %v err=%v", rooms, err)
|
||||
}
|
||||
|
||||
roomID, err := a.CreateRoom("room.discovery", room.ModeMatrix)
|
||||
if err != nil {
|
||||
t.Fatalf("A create room: %v", err)
|
||||
}
|
||||
if err := a.Invite(roomID, b.Endpoint()); err != nil {
|
||||
t.Fatalf("A invite B: %v", err)
|
||||
}
|
||||
|
||||
// B discovers the room it was invited to, with its policy, without prior knowledge of the id.
|
||||
rooms, err := b.ListMyRooms()
|
||||
if err != nil {
|
||||
t.Fatalf("B ListMyRooms: %v", err)
|
||||
}
|
||||
if len(rooms) != 1 || rooms[0].RoomID != roomID {
|
||||
t.Fatalf("B should discover exactly room %s, got %+v", roomID, rooms)
|
||||
}
|
||||
if rooms[0].Subject != "room.discovery" || !rooms[0].Policy.Encrypt || rooms[0].Role != "member" {
|
||||
t.Fatalf("discovered room metadata wrong: %+v", rooms[0])
|
||||
}
|
||||
|
||||
// A sees the same room as its owner.
|
||||
aRooms, err := a.ListMyRooms()
|
||||
if err != nil {
|
||||
t.Fatalf("A ListMyRooms: %v", err)
|
||||
}
|
||||
if len(aRooms) != 1 || aRooms[0].Role != "owner" {
|
||||
t.Fatalf("A should own exactly one room, got %+v", aRooms)
|
||||
}
|
||||
}
|
||||
|
||||
// TestControlPlaneAuthEnforceE2E closes the loop end to end with the production
|
||||
// client against a server in enforce mode: a registered peer's signed requests
|
||||
// are accepted (golden), and an unregistered peer is rejected with 401 on its
|
||||
// first control-plane call (error path). This proves the client's real
|
||||
// signature construction matches the server's verification.
|
||||
func TestControlPlaneAuthEnforceE2E(t *testing.T) {
|
||||
h := newHarnessMode(t, membership.AuthEnforce)
|
||||
waitHealth(t, h.ctrlURL)
|
||||
|
||||
a, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect A: %v", err)
|
||||
}
|
||||
defer a.Close()
|
||||
registerClient(t, h, a, "alice", membership.RoleAdmin)
|
||||
|
||||
// Golden: registered peer's signed request is accepted.
|
||||
if _, err := a.CreateRoom("room.enforced", room.ModeNATS); err != nil {
|
||||
t.Fatalf("registered peer should create a room under enforce: %v", err)
|
||||
}
|
||||
|
||||
// Error path: an unregistered peer is rejected on its first control-plane call.
|
||||
b, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect B: %v", err)
|
||||
}
|
||||
defer b.Close()
|
||||
_, err = b.CreateRoom("room.denied", room.ModeNATS)
|
||||
if err == nil {
|
||||
t.Fatalf("unregistered peer must be rejected under enforce")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "401") && !strings.Contains(strings.ToLower(err.Error()), "unauthorized") {
|
||||
t.Fatalf("expected a 401/unauthorized error, got %v", err)
|
||||
}
|
||||
|
||||
// Revocation takes effect without restart: revoke A, its next request fails.
|
||||
if err := h.store.RevokeUser(hex.EncodeToString(a.Endpoint().SignPub)); err != nil {
|
||||
t.Fatalf("revoke A: %v", err)
|
||||
}
|
||||
if _, err := a.CreateRoom("room.after-revoke", room.ModeNATS); err == nil {
|
||||
t.Fatalf("revoked peer must be rejected without a server restart")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNatsNkeyAuth exercises the data-plane authenticator: with NATS nkey auth
|
||||
// on, a registered peer connecting with its nkey is accepted and can publish
|
||||
// (golden); an unregistered peer is refused at connect time (error path); and a
|
||||
// peer revoked while the server runs is refused on its NEXT connection, proving
|
||||
// revocation without a restart (edge).
|
||||
func TestNatsNkeyAuth(t *testing.T) {
|
||||
h := newHarnessFull(t, membership.AuthOff, true) // NATS auth on; HTTP off to isolate the data plane
|
||||
waitHealth(t, h.ctrlURL)
|
||||
|
||||
idA := mustIdentity(t)
|
||||
if err := h.store.AddUser(hex.EncodeToString(idA.SignPub), "alice", membership.RoleMember); err != nil {
|
||||
t.Fatalf("register A: %v", err)
|
||||
}
|
||||
|
||||
// Golden: registered peer connects with its nkey and uses the bus.
|
||||
a, err := client.NewWithOptions(h.natsURL, h.ctrlURL, idA, client.Options{UseNkey: true})
|
||||
if err != nil {
|
||||
t.Fatalf("registered peer should connect with nkey: %v", err)
|
||||
}
|
||||
defer a.Close()
|
||||
if _, err := a.CreateRoom("room.nkey", room.ModeNATS); err != nil {
|
||||
t.Fatalf("registered peer should operate: %v", err)
|
||||
}
|
||||
|
||||
// Error path: an unregistered identity is refused at connect time.
|
||||
idB := mustIdentity(t)
|
||||
if _, err := client.NewWithOptions(h.natsURL, h.ctrlURL, idB, client.Options{UseNkey: true}); err == nil {
|
||||
t.Fatalf("unregistered peer must be refused by the NATS authenticator")
|
||||
}
|
||||
|
||||
// Error path: presenting no nkey to an auth-required server is refused.
|
||||
if _, err := client.NewWithOptions(h.natsURL, h.ctrlURL, idB, client.Options{UseNkey: false}); err == nil {
|
||||
t.Fatalf("a client without an nkey must be refused when the server requires auth")
|
||||
}
|
||||
|
||||
// Edge: revoke A while the server runs; A's NEXT connection is refused even
|
||||
// though an already-open connection (a) is unaffected. No server restart.
|
||||
if err := h.store.RevokeUser(hex.EncodeToString(idA.SignPub)); err != nil {
|
||||
t.Fatalf("revoke A: %v", err)
|
||||
}
|
||||
if _, err := client.NewWithOptions(h.natsURL, h.ctrlURL, idA, client.Options{UseNkey: true}); err == nil {
|
||||
t.Fatalf("revoked peer must be refused on a new connection without a restart")
|
||||
}
|
||||
}
|
||||
|
||||
// ---- test helpers ---------------------------------------------------------
|
||||
|
||||
type collector struct {
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
package client_test
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
)
|
||||
|
||||
// TestConnectRequiresHTTPSWithCA covers audit H5's client contract: when a CA is
|
||||
// provided the control-plane URL must be https://. A signed request gives
|
||||
// integrity but not confidentiality, so silently talking http:// to a bus the
|
||||
// operator secured with a CA would leak all metadata to a MITM. Connect refuses
|
||||
// the plaintext URL outright (error path; the scheme is checked before any
|
||||
// network use, so a bogus CA path is irrelevant).
|
||||
func TestConnectRequiresHTTPSWithCA(t *testing.T) {
|
||||
_, err := client.Connect("nats://127.0.0.1:4222", "http://127.0.0.1:8470", mustIdentity(t), "/nonexistent/ca.crt")
|
||||
if err == nil {
|
||||
t.Fatalf("Connect with a CA and an http:// control plane must be refused")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "https") {
|
||||
t.Fatalf("error should point the caller at https, got: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestControlPlaneOverTLS proves the control plane works over TLS pinned to the
|
||||
// bus CA (golden) and that a client lacking the CA cannot complete the handshake
|
||||
// (error path) — so a network observer can neither read nor inject control-plane
|
||||
// traffic. The data plane is left plaintext here to isolate the HTTP-TLS wiring.
|
||||
func TestControlPlaneOverTLS(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
blobs, err := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
if err != nil {
|
||||
t.Fatalf("blobs: %v", err)
|
||||
}
|
||||
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: filepath.Join(dir, "js"), Host: "127.0.0.1", Port: freePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
natsURL := embeddednats.ClientURL(ns)
|
||||
|
||||
// An https control plane wrapping the real membership server.
|
||||
ts := httptest.NewTLSServer(membership.NewServer(store, blobs, membership.AuthOff))
|
||||
t.Cleanup(ts.Close)
|
||||
|
||||
pool := x509.NewCertPool()
|
||||
pool.AddCert(ts.Certificate())
|
||||
|
||||
// Golden: trusting the control-plane CA, an https control-plane request works.
|
||||
good, err := client.NewWithOptions(natsURL, ts.URL, mustIdentity(t),
|
||||
client.Options{CtrlTLS: &tls.Config{RootCAs: pool, MinVersion: tls.VersionTLS12}})
|
||||
if err != nil {
|
||||
t.Fatalf("connect with the pinned CA: %v", err)
|
||||
}
|
||||
defer good.Close()
|
||||
if _, err := good.CreateRoom("room.tls.ctrl", room.ModeNATS); err != nil {
|
||||
t.Fatalf("control plane over TLS should succeed with the pinned CA: %v", err)
|
||||
}
|
||||
|
||||
// Error path: without the CA the https handshake fails, so the request errors.
|
||||
bad, err := client.NewWithOptions(natsURL, ts.URL, mustIdentity(t),
|
||||
client.Options{CtrlTLS: &tls.Config{RootCAs: x509.NewCertPool(), MinVersion: tls.VersionTLS12}})
|
||||
if err != nil {
|
||||
t.Fatalf("nats connect (bad CA case): %v", err)
|
||||
}
|
||||
defer bad.Close()
|
||||
if _, err := bad.CreateRoom("room.tls.fail", room.ModeNATS); err == nil {
|
||||
t.Fatalf("a control-plane request without the CA must fail the TLS handshake")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
package client_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
"github.com/nats-io/nats.go"
|
||||
)
|
||||
|
||||
// TestAudit_NoSubjectACL ports the auditor's H4 (Alto) finding under the minimum
|
||||
// defense chosen for this issue (forbid cleartext rooms in public; see
|
||||
// dev/0004d-dataplane-acl.md). The NATS data plane still has no per-subject ACL,
|
||||
// so the guarantee we make is CONTENT confidentiality, proven three ways:
|
||||
//
|
||||
// error : a cleartext (ModeNATS) room cannot be created under the public posture;
|
||||
// golden: a legitimate member (bob) decrypts the secret;
|
||||
// edge : eve, sniffing the raw subject off the data plane, receives only
|
||||
// ciphertext — she never recovers the plaintext the auditor's eve did.
|
||||
func TestAudit_NoSubjectACL(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
h.srv.RequireEncryptedRooms = true // the public posture
|
||||
waitHealth(t, h.ctrlURL)
|
||||
|
||||
alice, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect alice: %v", err)
|
||||
}
|
||||
defer alice.Close()
|
||||
|
||||
// Error path: a cleartext room is refused, so no payload ever rides a subject
|
||||
// in the clear for a sniffer to read (the exact vector the auditor exploited).
|
||||
if _, err := alice.CreateRoom("secret.subject.payroll", room.ModeNATS); err == nil {
|
||||
t.Fatalf("cleartext room must be refused on a public deployment")
|
||||
}
|
||||
|
||||
// alice creates an encrypted room and invites bob (the legitimate reader).
|
||||
const subject = "secret.subject.payroll.e2e"
|
||||
const secret = "internal: salary numbers"
|
||||
roomID, err := alice.CreateRoom(subject, room.ModeMatrix)
|
||||
if err != nil {
|
||||
t.Fatalf("alice create encrypted room: %v", err)
|
||||
}
|
||||
bob, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect bob: %v", err)
|
||||
}
|
||||
defer bob.Close()
|
||||
if err := alice.Invite(roomID, bob.Endpoint()); err != nil {
|
||||
t.Fatalf("alice invite bob: %v", err)
|
||||
}
|
||||
if err := bob.Join(roomID); err != nil {
|
||||
t.Fatalf("bob join: %v", err)
|
||||
}
|
||||
|
||||
// Golden: bob (a member) subscribes and decrypts the secret.
|
||||
var bmu sync.Mutex
|
||||
var bobGot []string
|
||||
bobSub, err := bob.Subscribe(roomID, func(_ frame.Frame, plaintext []byte) {
|
||||
bmu.Lock()
|
||||
bobGot = append(bobGot, string(plaintext))
|
||||
bmu.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("bob subscribe: %v", err)
|
||||
}
|
||||
defer bobSub.Unsubscribe()
|
||||
|
||||
// Edge: eve sniffs the raw subject directly off NATS (no membership, no key).
|
||||
rawEve, err := nats.Connect(h.natsURL)
|
||||
if err != nil {
|
||||
t.Fatalf("eve raw connect: %v", err)
|
||||
}
|
||||
defer rawEve.Close()
|
||||
eveGot := make(chan []byte, 8)
|
||||
if _, err := rawEve.Subscribe(subject, func(m *nats.Msg) { eveGot <- m.Data }); err != nil {
|
||||
t.Fatalf("eve raw subscribe: %v", err)
|
||||
}
|
||||
if err := rawEve.Flush(); err != nil {
|
||||
t.Fatalf("eve flush: %v", err)
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond) // let both subscriptions settle
|
||||
|
||||
if err := alice.Publish(roomID, []byte(secret)); err != nil {
|
||||
t.Fatalf("alice publish: %v", err)
|
||||
}
|
||||
|
||||
// bob must decrypt the secret.
|
||||
if !waitFor(&bmu, &bobGot, func(rs []string) bool {
|
||||
for _, r := range rs {
|
||||
if r == secret {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}, 2*time.Second) {
|
||||
t.Fatalf("bob (member) should decrypt the secret; got %v", snapshot(&bmu, &bobGot))
|
||||
}
|
||||
|
||||
// eve must receive only ciphertext — never the plaintext.
|
||||
select {
|
||||
case data := <-eveGot:
|
||||
if bytes.Contains(data, []byte(secret)) {
|
||||
t.Fatalf("eve sniffed the plaintext off the data plane: %q", data)
|
||||
}
|
||||
f, err := frame.Unmarshal(data)
|
||||
if err != nil {
|
||||
t.Fatalf("eve received an undecodable frame: %v", err)
|
||||
}
|
||||
if string(f.Payload) == secret {
|
||||
t.Fatalf("eve read the secret from the frame payload")
|
||||
}
|
||||
if len(f.Nonce) == 0 {
|
||||
t.Fatalf("expected an AEAD-encrypted payload (non-empty nonce), got cleartext frame")
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
// eve receiving nothing is also a safe outcome; the assertion is only that
|
||||
// she never gets the plaintext, which holds vacuously here.
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,185 @@
|
||||
package client_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// startClusterNode boots a clustered embedded NATS node (auth off, no route TLS:
|
||||
// this test exercises client failover, not route security — that is covered in
|
||||
// pkg/embeddednats).
|
||||
func startClusterNode(t *testing.T, name string, clientPort, routePort int, peerRoutePorts []int) *server.Server {
|
||||
t.Helper()
|
||||
routes := make([]string, 0, len(peerRoutePorts))
|
||||
for _, p := range peerRoutePorts {
|
||||
routes = append(routes, fmt.Sprintf("nats://127.0.0.1:%d", p))
|
||||
}
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: clientPort,
|
||||
ServerName: name,
|
||||
Cluster: &embeddednats.ClusterConfig{Name: "unibus-failover", Host: "127.0.0.1", Port: routePort, Routes: routes},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start node %s: %v", name, err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return ns
|
||||
}
|
||||
|
||||
func waitClusterRoutes(t *testing.T, ns *server.Server) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(8 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if ns.NumRoutes() >= 1 {
|
||||
return
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("node %q never formed a route", ns.Name())
|
||||
}
|
||||
|
||||
// portOf returns everything after the last ':' in natsURL, i.e. the port of a
// URL such as "nats://127.0.0.1:4222". It returns "" when natsURL contains no
// ':' at all. Tests compare servers by port because ConnectedServer() may spell
// the host differently than ClientURL().
func portOf(natsURL string) string {
	if colon := strings.LastIndex(natsURL, ":"); colon >= 0 {
		return natsURL[colon+1:]
	}
	return ""
}
|
||||
|
||||
// TestClientFailoverAcrossNodes is the issue's edge case: a client connected to
|
||||
// node A keeps its session when A is killed — nats.go reconnects it to node B
|
||||
// and it keeps receiving messages published on the surviving node.
|
||||
func TestClientFailoverAcrossNodes(t *testing.T) {
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
p0, p1 := freePort(t), freePort(t)
|
||||
n0 := startClusterNode(t, "n0", p0, rp0, []int{rp1})
|
||||
n1 := startClusterNode(t, "n1", p1, rp1, []int{rp0})
|
||||
waitClusterRoutes(t, n0)
|
||||
waitClusterRoutes(t, n1)
|
||||
nodes := map[string]*server.Server{strconv.Itoa(p0): n0, strconv.Itoa(p1): n1}
|
||||
|
||||
// Control plane: one in-process membershipd (metadata only; the data plane is
|
||||
// the NATS cluster). Auth off keeps the test focused on data-plane failover.
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
blobs, err := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
if err != nil {
|
||||
t.Fatalf("blobs: %v", err)
|
||||
}
|
||||
ctrl := httptest.NewServer(membership.NewServer(store, blobs, membership.AuthOff))
|
||||
t.Cleanup(ctrl.Close)
|
||||
|
||||
url0 := n0.ClientURL()
|
||||
url1 := n1.ClientURL()
|
||||
|
||||
// A seeds BOTH nodes (failover list); B connects directly to n1.
|
||||
a, err := client.NewWithOptions(url0, ctrl.URL, mustIdentity(t), client.Options{NatsServers: []string{url1}})
|
||||
if err != nil {
|
||||
t.Fatalf("connect A: %v", err)
|
||||
}
|
||||
defer a.Close()
|
||||
b, err := client.NewWithOptions(url1, ctrl.URL, mustIdentity(t), client.Options{NatsServers: []string{url0}})
|
||||
if err != nil {
|
||||
t.Fatalf("connect B: %v", err)
|
||||
}
|
||||
defer b.Close()
|
||||
|
||||
roomID, err := a.CreateRoom("room.failover", room.ModeNATS)
|
||||
if err != nil {
|
||||
t.Fatalf("A create room: %v", err)
|
||||
}
|
||||
|
||||
var mu sync.Mutex
|
||||
var got []string
|
||||
sub, err := a.Subscribe(roomID, func(_ frame.Frame, plaintext []byte) {
|
||||
mu.Lock()
|
||||
got = append(got, string(plaintext))
|
||||
mu.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("A subscribe: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
|
||||
// Pre-kill sanity: B publishes, A receives across the cluster.
|
||||
if err := b.Publish(roomID, []byte("before-kill")); err != nil {
|
||||
t.Fatalf("B publish 1: %v", err)
|
||||
}
|
||||
if !waitFor(&mu, &got, func(rs []string) bool { return contains(rs, "before-kill") }, 3*time.Second) {
|
||||
t.Fatalf("A did not receive the pre-kill message; got %v", snapshot(&mu, &got))
|
||||
}
|
||||
|
||||
// Identify and KILL the node A is attached to, forcing a reconnect.
|
||||
attached := a.ConnectedServer()
|
||||
killPort := portOf(attached)
|
||||
victim, ok := nodes[killPort]
|
||||
if !ok {
|
||||
t.Fatalf("A is attached to an unknown node %q (port %q)", attached, killPort)
|
||||
}
|
||||
survivorURL := url1
|
||||
if killPort == strconv.Itoa(p1) {
|
||||
survivorURL = url0
|
||||
}
|
||||
victim.Shutdown()
|
||||
victim.WaitForShutdown()
|
||||
|
||||
// A must reconnect to the surviving node.
|
||||
deadline := time.Now().Add(8 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if a.IsConnected() && portOf(a.ConnectedServer()) == portOf(survivorURL) {
|
||||
break
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
if !a.IsConnected() || portOf(a.ConnectedServer()) != portOf(survivorURL) {
|
||||
t.Fatalf("A did not fail over to the surviving node (now on %q, want port %s)", a.ConnectedServer(), portOf(survivorURL))
|
||||
}
|
||||
|
||||
// Make B publish from the surviving node and confirm A still receives —
|
||||
// the session (its subscription) survived the failover.
|
||||
if survivorURL == url0 {
|
||||
// B's primary was n1 (killed); ensure B is on the survivor too.
|
||||
deadline := time.Now().Add(8 * time.Second)
|
||||
for time.Now().Before(deadline) && portOf(b.ConnectedServer()) != portOf(survivorURL) {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
if err := b.Publish(roomID, []byte("after-kill")); err != nil {
|
||||
t.Fatalf("B publish 2: %v", err)
|
||||
}
|
||||
if !waitFor(&mu, &got, func(rs []string) bool { return contains(rs, "after-kill") }, 6*time.Second) {
|
||||
t.Fatalf("A did not receive a message after failover; got %v", snapshot(&mu, &got))
|
||||
}
|
||||
}
|
||||
|
||||
// contains reports whether want is equal to at least one element of rs.
// A nil or empty rs yields false.
func contains(rs []string, want string) bool {
	for i := 0; i < len(rs); i++ {
		if rs[i] == want {
			return true
		}
	}
	return false
}
|
||||
+27
-11
@@ -33,20 +33,36 @@ type identityFile struct {
|
||||
KexPriv string `json:"kex_priv"`
|
||||
}
|
||||
|
||||
// LoadIdentity loads an existing identity from path. Unlike LoadOrCreateIdentity
|
||||
// it NEVER creates one: a missing or unreadable file is an error. It is for
|
||||
// callers that must consume a specific, pre-provisioned identity rather than mint
|
||||
// a fresh one — for example membershipd's persisted internal service identity,
|
||||
// which `membershipd user add --store kv` reads to present the privileged nkey
|
||||
// the cluster authenticator recognizes.
|
||||
func LoadIdentity(path string) (cs.Identity, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return cs.Identity{}, fmt.Errorf("client: read identity %q: %w", path, err)
|
||||
}
|
||||
var f identityFile
|
||||
if err := json.Unmarshal(data, &f); err != nil {
|
||||
return cs.Identity{}, fmt.Errorf("client: parse identity %q: %w", path, err)
|
||||
}
|
||||
id, err := f.toIdentity()
|
||||
if err != nil {
|
||||
return cs.Identity{}, fmt.Errorf("client: decode identity %q: %w", path, err)
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
|
||||
// LoadOrCreateIdentity loads the identity at path, or generates and persists a
|
||||
// new one if the file does not exist. The file is written with 0600
|
||||
// permissions because it holds private keys.
|
||||
// permissions because it holds private keys. A file that exists but is
|
||||
// unreadable or corrupt is an error (NOT silently regenerated), so a damaged
|
||||
// identity surfaces instead of minting a new key that cannot decrypt old data.
|
||||
func LoadOrCreateIdentity(path string) (cs.Identity, error) {
|
||||
if data, err := os.ReadFile(path); err == nil {
|
||||
var f identityFile
|
||||
if err := json.Unmarshal(data, &f); err != nil {
|
||||
return cs.Identity{}, fmt.Errorf("client: parse identity %q: %w", path, err)
|
||||
}
|
||||
id, err := f.toIdentity()
|
||||
if err != nil {
|
||||
return cs.Identity{}, fmt.Errorf("client: decode identity %q: %w", path, err)
|
||||
}
|
||||
return id, nil
|
||||
if _, statErr := os.Stat(path); statErr == nil {
|
||||
return LoadIdentity(path)
|
||||
}
|
||||
|
||||
id, err := cs.GenerateIdentity()
|
||||
|
||||
@@ -0,0 +1,154 @@
|
||||
package client_test
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
"github.com/nats-io/nats.go"
|
||||
)
|
||||
|
||||
// TestReaudit_SigNilSpoof ports the re-auditor's N3 (Alto) finding: in a room
|
||||
// that REQUIRES per-message signatures, an attacker with data-plane access
|
||||
// publishes a raw frame with Sig==nil and a forged Sender. Before the fix
|
||||
// processFrame verified the signature only when one was present
|
||||
// (`SignMsgs && f.Sig != nil`), so the receiver accepted the unsigned, forged
|
||||
// frame as authentic. The fix drops any unsigned frame in a SignMsgs room.
|
||||
//
|
||||
// Coverage:
|
||||
// - golden: a properly signed frame from a real member IS delivered;
|
||||
// - error : an unsigned frame with a forged Sender in a SignMsgs room is DROPPED;
|
||||
// - edge : a room WITHOUT SignMsgs still delivers an unsigned frame (the drop
|
||||
// is specific to signed rooms, not a blanket reject of unsigned frames).
|
||||
func TestReaudit_SigNilSpoof(t *testing.T) {
|
||||
h := newHarness(t)
|
||||
waitHealth(t, h.ctrlURL)
|
||||
|
||||
alice, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect alice: %v", err)
|
||||
}
|
||||
defer alice.Close()
|
||||
bob, err := client.New(h.natsURL, h.ctrlURL, mustIdentity(t))
|
||||
if err != nil {
|
||||
t.Fatalf("connect bob: %v", err)
|
||||
}
|
||||
defer bob.Close()
|
||||
|
||||
// A signed-but-NOT-encrypted room: SignMsgs enforces authorship, and the lack
|
||||
// of encryption is exactly the case the auditor flagged as Alto (any peer with
|
||||
// the subject can forge a sender if signatures are not strictly required).
|
||||
const subject = "room.signed.spoof"
|
||||
signedPolicy := room.Policy{Encrypt: false, Persist: false, SignMsgs: true}
|
||||
roomID, err := alice.CreateRoom(subject, signedPolicy)
|
||||
if err != nil {
|
||||
t.Fatalf("alice create signed room: %v", err)
|
||||
}
|
||||
if err := alice.Invite(roomID, bob.Endpoint()); err != nil {
|
||||
t.Fatalf("alice invite bob: %v", err)
|
||||
}
|
||||
if err := bob.Join(roomID); err != nil {
|
||||
t.Fatalf("bob join: %v", err)
|
||||
}
|
||||
|
||||
var mu sync.Mutex
|
||||
var got []string
|
||||
sub, err := bob.Subscribe(roomID, func(_ frame.Frame, plaintext []byte) {
|
||||
mu.Lock()
|
||||
got = append(got, string(plaintext))
|
||||
mu.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("bob subscribe: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
|
||||
// Attacker: a raw NATS connection (the dev harness leaves the data plane open),
|
||||
// no identity, forged Sender, NO signature.
|
||||
const spoofMsg = "I am totally the victim"
|
||||
rawAtk, err := nats.Connect(h.natsURL)
|
||||
if err != nil {
|
||||
t.Fatalf("attacker raw connect: %v", err)
|
||||
}
|
||||
defer rawAtk.Close()
|
||||
spoof := frame.Frame{
|
||||
Type: frame.PUB,
|
||||
Subject: subject,
|
||||
Sender: "victim-forged-endpoint",
|
||||
MsgID: "spoof-1",
|
||||
Epoch: 1,
|
||||
Payload: []byte(spoofMsg),
|
||||
// Sig intentionally nil — this is the attack.
|
||||
}
|
||||
sb, err := spoof.Marshal()
|
||||
if err != nil {
|
||||
t.Fatalf("marshal spoof: %v", err)
|
||||
}
|
||||
if err := rawAtk.Publish(subject, sb); err != nil {
|
||||
t.Fatalf("attacker publish: %v", err)
|
||||
}
|
||||
_ = rawAtk.Flush()
|
||||
|
||||
// Golden: alice's properly signed frame must be delivered.
|
||||
const goodMsg = "authentic from alice"
|
||||
if err := alice.Publish(roomID, []byte(goodMsg)); err != nil {
|
||||
t.Fatalf("alice publish: %v", err)
|
||||
}
|
||||
if !waitFor(&mu, &got, func(rs []string) bool {
|
||||
for _, r := range rs {
|
||||
if r == goodMsg {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}, 2*time.Second) {
|
||||
t.Fatalf("a properly signed frame should be delivered; got %v", snapshot(&mu, &got))
|
||||
}
|
||||
|
||||
// Error path: the unsigned, forged frame must NEVER reach the handler.
|
||||
for _, r := range snapshot(&mu, &got) {
|
||||
if r == spoofMsg {
|
||||
t.Fatalf("SIG-NIL SPOOF: receiver accepted an unsigned frame with a forged Sender in a SignMsgs room")
|
||||
}
|
||||
}
|
||||
|
||||
// Edge: a room WITHOUT SignMsgs still delivers an unsigned raw frame, proving
|
||||
// the drop is scoped to signed rooms and did not break the plain-NATS path.
|
||||
const subjectOpen = "room.open.nosig"
|
||||
openRoom, err := alice.CreateRoom(subjectOpen, room.ModeNATS)
|
||||
if err != nil {
|
||||
t.Fatalf("alice create open room: %v", err)
|
||||
}
|
||||
openCol := subscribeCollect(t, alice, openRoom)
|
||||
defer openCol.sub.Unsubscribe()
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
|
||||
const openMsg = "unsigned but allowed here"
|
||||
openFrame := frame.Frame{
|
||||
Type: frame.PUB,
|
||||
Subject: subjectOpen,
|
||||
Sender: "anyone",
|
||||
MsgID: "open-1",
|
||||
Payload: []byte(openMsg),
|
||||
// no Sig — fine in a non-signed room
|
||||
}
|
||||
ob, _ := openFrame.Marshal()
|
||||
if err := rawAtk.Publish(subjectOpen, ob); err != nil {
|
||||
t.Fatalf("publish open frame: %v", err)
|
||||
}
|
||||
_ = rawAtk.Flush()
|
||||
if !waitFor(&openCol.mu, &openCol.msgs, func(rs []string) bool {
|
||||
for _, r := range rs {
|
||||
if r == openMsg {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}, 2*time.Second) {
|
||||
t.Fatalf("an unsigned frame in a non-signed room should be delivered; got %v", snapshot(&openCol.mu, &openCol.msgs))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,185 @@
|
||||
package client_test
|
||||
|
||||
import (
|
||||
"crypto/ecdsa"
|
||||
"crypto/elliptic"
|
||||
"crypto/rand"
|
||||
"crypto/tls"
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/hex"
|
||||
"encoding/pem"
|
||||
"math/big"
|
||||
"net"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
)
|
||||
|
||||
// genTestCA builds, entirely in memory, a throwaway self-signed ECDSA P-256 CA
// and a server certificate signed by it (SANs: DNS "localhost" and IP
// 127.0.0.1), mirroring deploy/tls/generate-certs.sh without shelling out to
// openssl. Both certificates are valid from one hour ago until 24 hours from
// now.
//
// It returns the *tls.Config a server should present (the signed key pair,
// minimum TLS 1.2) and a pool holding only the CA certificate, which a client
// must trust for the handshake to succeed. Any failure aborts the test.
func genTestCA(t *testing.T) (server *tls.Config, caPool *x509.CertPool) {
	t.Helper()

	// newKey mints a P-256 key or fails the test with "<label>: <error>".
	newKey := func(label string) *ecdsa.PrivateKey {
		k, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
		if err != nil {
			t.Fatalf("%s: %v", label, err)
		}
		return k
	}

	// --- self-signed root ---
	rootKey := newKey("ca key")
	rootTmpl := &x509.Certificate{
		SerialNumber:          big.NewInt(1),
		Subject:               pkix.Name{CommonName: "unibus-test-ca"},
		NotBefore:             time.Now().Add(-time.Hour),
		NotAfter:              time.Now().Add(24 * time.Hour),
		KeyUsage:              x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature,
		BasicConstraintsValid: true,
		IsCA:                  true,
	}
	rootDER, err := x509.CreateCertificate(rand.Reader, rootTmpl, rootTmpl, &rootKey.PublicKey, rootKey)
	if err != nil {
		t.Fatalf("ca cert: %v", err)
	}
	root, err := x509.ParseCertificate(rootDER)
	if err != nil {
		t.Fatalf("parse ca: %v", err)
	}

	// --- leaf (server) certificate issued by the root ---
	leafKey := newKey("server key")
	leafTmpl := &x509.Certificate{
		SerialNumber: big.NewInt(2),
		Subject:      pkix.Name{CommonName: "unibus-test-server"},
		NotBefore:    time.Now().Add(-time.Hour),
		NotAfter:     time.Now().Add(24 * time.Hour),
		KeyUsage:     x509.KeyUsageDigitalSignature,
		ExtKeyUsage:  []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		DNSNames:     []string{"localhost"},
		IPAddresses:  []net.IP{net.IPv4(127, 0, 0, 1)},
	}
	leafDER, err := x509.CreateCertificate(rand.Reader, leafTmpl, root, &leafKey.PublicKey, rootKey)
	if err != nil {
		t.Fatalf("server cert: %v", err)
	}

	// PEM-encode the certificate and key so tls.X509KeyPair can pair them.
	certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: leafDER})
	keyDER, err := x509.MarshalECPrivateKey(leafKey)
	if err != nil {
		t.Fatalf("marshal server key: %v", err)
	}
	keyPEM := pem.EncodeToMemory(&pem.Block{Type: "EC PRIVATE KEY", Bytes: keyDER})
	pair, err := tls.X509KeyPair(certPEM, keyPEM)
	if err != nil {
		t.Fatalf("server keypair: %v", err)
	}

	caPool = x509.NewCertPool()
	caPool.AddCert(root)
	server = &tls.Config{
		Certificates: []tls.Certificate{pair},
		MinVersion:   tls.VersionTLS12,
	}
	return server, caPool
}
|
||||
|
||||
// TestNatsTLS validates the TLS data plane: a client trusting the bus CA
|
||||
// completes the handshake and uses the bus (golden); a client that does NOT
|
||||
// trust the CA fails the handshake (error path).
|
||||
func TestNatsTLS(t *testing.T) {
|
||||
serverTLS, caPool := genTestCA(t)
|
||||
h := bootHarness(t, membership.AuthOff, false, serverTLS)
|
||||
waitHealth(t, h.ctrlURL)
|
||||
|
||||
// Golden: client pinning the CA connects over TLS and operates.
|
||||
clientTLS := &tls.Config{RootCAs: caPool, MinVersion: tls.VersionTLS12}
|
||||
a, err := client.NewWithOptions(h.natsURL, h.ctrlURL, mustIdentity(t), client.Options{TLS: clientTLS})
|
||||
if err != nil {
|
||||
t.Fatalf("client trusting the CA should complete the TLS handshake: %v", err)
|
||||
}
|
||||
defer a.Close()
|
||||
if _, err := a.CreateRoom("room.tls", room.ModeNATS); err != nil {
|
||||
t.Fatalf("TLS client should operate on the bus: %v", err)
|
||||
}
|
||||
|
||||
// Error path: a client that does not trust the CA fails the handshake. Use an
|
||||
// empty pool (system roots would also reject this private CA, but an empty
|
||||
// pool makes the intent explicit and avoids depending on the host's roots).
|
||||
badTLS := &tls.Config{RootCAs: x509.NewCertPool(), MinVersion: tls.VersionTLS12}
|
||||
if _, err := client.NewWithOptions(h.natsURL, h.ctrlURL, mustIdentity(t), client.Options{TLS: badTLS}); err == nil {
|
||||
t.Fatalf("client without the CA must fail the TLS handshake")
|
||||
}
|
||||
}
|
||||
|
||||
// TestSecureBusEndToEnd is the headline golden of issue 0001: with ALL three
|
||||
// layers active at once — control-plane request signing (enforce), NATS nkey
|
||||
// auth, and TLS — two registered peers run an encrypted room end to end. A
|
||||
// creates a Matrix-policy room, invites B, A publishes and B decrypts. This
|
||||
// proves the layers compose: signed HTTP control plane + authenticated,
|
||||
// encrypted data plane + E2E room content.
|
||||
func TestSecureBusEndToEnd(t *testing.T) {
|
||||
serverTLS, caPool := genTestCA(t)
|
||||
h := bootHarness(t, membership.AuthEnforce, true, serverTLS)
|
||||
waitHealth(t, h.ctrlURL)
|
||||
|
||||
clientTLS := &tls.Config{RootCAs: caPool, MinVersion: tls.VersionTLS12}
|
||||
secure := func(t *testing.T, handle string) (*client.Client, membership.AuthMode) {
|
||||
id := mustIdentity(t)
|
||||
if err := h.store.AddUser(hex.EncodeToString(id.SignPub), handle, membership.RoleMember); err != nil {
|
||||
t.Fatalf("register %s: %v", handle, err)
|
||||
}
|
||||
c, err := client.NewWithOptions(h.natsURL, h.ctrlURL, id, client.Options{UseNkey: true, TLS: clientTLS})
|
||||
if err != nil {
|
||||
t.Fatalf("connect %s securely: %v", handle, err)
|
||||
}
|
||||
return c, 0
|
||||
}
|
||||
|
||||
a, _ := secure(t, "alice")
|
||||
defer a.Close()
|
||||
b, _ := secure(t, "bob")
|
||||
defer b.Close()
|
||||
|
||||
roomID, err := a.CreateRoom("room.secure", room.ModeMatrix)
|
||||
if err != nil {
|
||||
t.Fatalf("A create encrypted room over secure bus: %v", err)
|
||||
}
|
||||
if err := a.Invite(roomID, b.Endpoint()); err != nil {
|
||||
t.Fatalf("A invite B: %v", err)
|
||||
}
|
||||
if err := b.Join(roomID); err != nil {
|
||||
t.Fatalf("B join: %v", err)
|
||||
}
|
||||
|
||||
var mu sync.Mutex
|
||||
var got []string
|
||||
sub, err := b.Subscribe(roomID, func(_ frame.Frame, plaintext []byte) {
|
||||
mu.Lock()
|
||||
got = append(got, string(plaintext))
|
||||
mu.Unlock()
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("B subscribe: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(150 * time.Millisecond)
|
||||
|
||||
const msg = "mensaje sobre bus seguro (auth+TLS+E2E)"
|
||||
if err := a.Publish(roomID, []byte(msg)); err != nil {
|
||||
t.Fatalf("A publish: %v", err)
|
||||
}
|
||||
if !waitFor(&mu, &got, func(rs []string) bool {
|
||||
for _, r := range rs {
|
||||
if r == msg {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}, 2*time.Second) {
|
||||
t.Fatalf("B did not receive/decrypt the message over the secured bus; got %v", snapshot(&mu, &got))
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,344 @@
|
||||
package embeddednats_test
|
||||
|
||||
import (
|
||||
"crypto/ecdsa"
|
||||
"crypto/elliptic"
|
||||
"crypto/rand"
|
||||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/pem"
|
||||
"fmt"
|
||||
"math/big"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// freePort asks the OS for an unused TCP port by listening on 127.0.0.1:0,
// reads the port the listener was given, and closes the listener before
// returning. The test is failed if the listener cannot be opened.
func freePort(t *testing.T) int {
	t.Helper()
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	defer ln.Close()
	bound := ln.Addr().(*net.TCPAddr)
	return bound.Port
}
|
||||
|
||||
// startNode boots a clustered embedded NATS node. peerRoutePorts are the route
|
||||
// ports of the OTHER nodes; user/pass gate the route layer (empty disables it);
|
||||
// routeTLS, when non-nil, secures the routes with mutual TLS.
|
||||
func startNode(t *testing.T, name string, clientPort, routePort int, peerRoutePorts []int, user, pass string, routeTLS *clusterTLS) *server.Server {
|
||||
t.Helper()
|
||||
routes := make([]string, 0, len(peerRoutePorts))
|
||||
for _, p := range peerRoutePorts {
|
||||
// Carry the cluster credentials in the route URL so this node
|
||||
// authenticates outbound to its peers' route listeners.
|
||||
if user != "" {
|
||||
routes = append(routes, fmt.Sprintf("nats://%s:%s@127.0.0.1:%d", user, pass, p))
|
||||
} else {
|
||||
routes = append(routes, fmt.Sprintf("nats://127.0.0.1:%d", p))
|
||||
}
|
||||
}
|
||||
cc := &embeddednats.ClusterConfig{
|
||||
Name: "unibus-test",
|
||||
Host: "127.0.0.1",
|
||||
Port: routePort,
|
||||
Routes: routes,
|
||||
Username: user,
|
||||
Password: pass,
|
||||
}
|
||||
if routeTLS != nil {
|
||||
cfg, err := busauth.RouteTLSConfig(routeTLS.cert, routeTLS.key, routeTLS.ca)
|
||||
if err != nil {
|
||||
t.Fatalf("route TLS for %s: %v", name, err)
|
||||
}
|
||||
cc.TLS = cfg
|
||||
}
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: clientPort,
|
||||
ServerName: name,
|
||||
Cluster: cc,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("start node %s: %v", name, err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return ns
|
||||
}
|
||||
|
||||
// waitRoutes waits until ns has at least want established routes, or fails.
|
||||
func waitRoutes(t *testing.T, ns *server.Server, want int) {
|
||||
t.Helper()
|
||||
deadline := time.Now().Add(8 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if ns.NumRoutes() >= want {
|
||||
return
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
t.Fatalf("node %q never reached %d routes (have %d)", ns.Name(), want, ns.NumRoutes())
|
||||
}
|
||||
|
||||
// stableRouteCount waits for ns's route count to stop changing (the NATS route
|
||||
// pool opens several connections per peer asynchronously) and returns it, so a
|
||||
// test can use it as a baseline that an impostor must not increase.
|
||||
func stableRouteCount(t *testing.T, ns *server.Server) int {
|
||||
t.Helper()
|
||||
prev := -1
|
||||
stableSince := time.Now()
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
n := ns.NumRoutes()
|
||||
if n != prev {
|
||||
prev = n
|
||||
stableSince = time.Now()
|
||||
} else if time.Since(stableSince) >= 750*time.Millisecond {
|
||||
return n
|
||||
}
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
}
|
||||
return prev
|
||||
}
|
||||
|
||||
// pubSubAcrossNodes connects a subscriber to subURL and a publisher to pubURL,
|
||||
// publishes one message on subject, and reports whether it arrived within 3s.
|
||||
// This proves the cluster forwards client subjects between nodes.
|
||||
func pubSubAcrossNodes(t *testing.T, subURL, pubURL, subject, payload string) bool {
|
||||
t.Helper()
|
||||
subConn, err := nats.Connect(subURL)
|
||||
if err != nil {
|
||||
t.Fatalf("subscriber connect %s: %v", subURL, err)
|
||||
}
|
||||
defer subConn.Close()
|
||||
got := make(chan string, 1)
|
||||
if _, err := subConn.Subscribe(subject, func(m *nats.Msg) {
|
||||
select {
|
||||
case got <- string(m.Data):
|
||||
default:
|
||||
}
|
||||
}); err != nil {
|
||||
t.Fatalf("subscribe: %v", err)
|
||||
}
|
||||
if err := subConn.Flush(); err != nil {
|
||||
t.Fatalf("flush sub: %v", err)
|
||||
}
|
||||
|
||||
pubConn, err := nats.Connect(pubURL)
|
||||
if err != nil {
|
||||
t.Fatalf("publisher connect %s: %v", pubURL, err)
|
||||
}
|
||||
defer pubConn.Close()
|
||||
// Retry the publish for a moment: route interest propagation across the
|
||||
// cluster is asynchronous, so the very first publish can race the gossip.
|
||||
deadline := time.Now().Add(3 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if err := pubConn.Publish(subject, []byte(payload)); err != nil {
|
||||
t.Fatalf("publish: %v", err)
|
||||
}
|
||||
_ = pubConn.Flush()
|
||||
select {
|
||||
case v := <-got:
|
||||
return v == payload
|
||||
case <-time.After(100 * time.Millisecond):
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// --- golden: two-node cluster forwards client subjects across nodes ----------
|
||||
|
||||
func TestClusterForwardsAcrossNodes(t *testing.T) {
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
n0 := startNode(t, "n0", freePort(t), rp0, []int{rp1}, "clusteruser", "clusterpass", nil)
|
||||
n1 := startNode(t, "n1", freePort(t), rp1, []int{rp0}, "clusteruser", "clusterpass", nil)
|
||||
|
||||
waitRoutes(t, n0, 1)
|
||||
waitRoutes(t, n1, 1)
|
||||
|
||||
if !pubSubAcrossNodes(t, n0.ClientURL(), n1.ClientURL(), "test.cross", "hello-cluster") {
|
||||
t.Fatalf("subject published on n1 did not reach subscriber on n0")
|
||||
}
|
||||
}
|
||||
|
||||
// --- edge: three-node cluster (HA shape) forwards between non-adjacent nodes --
|
||||
|
||||
func TestClusterThreeNodesForward(t *testing.T) {
|
||||
rp0, rp1, rp2 := freePort(t), freePort(t), freePort(t)
|
||||
n0 := startNode(t, "n0", freePort(t), rp0, []int{rp1, rp2}, "u", "p", nil)
|
||||
n1 := startNode(t, "n1", freePort(t), rp1, []int{rp0, rp2}, "u", "p", nil)
|
||||
n2 := startNode(t, "n2", freePort(t), rp2, []int{rp0, rp1}, "u", "p", nil)
|
||||
|
||||
waitRoutes(t, n0, 2)
|
||||
waitRoutes(t, n1, 2)
|
||||
waitRoutes(t, n2, 2)
|
||||
|
||||
// Publish on n2, subscribe on n0: a message must traverse the cluster.
|
||||
if !pubSubAcrossNodes(t, n0.ClientURL(), n2.ClientURL(), "test.ha", "three-node") {
|
||||
t.Fatalf("subject published on n2 did not reach subscriber on n0")
|
||||
}
|
||||
}
|
||||
|
||||
// --- error: a node with the wrong cluster password is rejected as a route -----
|
||||
|
||||
func TestClusterRejectsBadRouteAuth(t *testing.T) {
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
good := startNode(t, "good", freePort(t), rp0, []int{rp1}, "secret", "right", nil)
|
||||
_ = startNode(t, "peer", freePort(t), rp1, []int{rp0}, "secret", "right", nil)
|
||||
waitRoutes(t, good, 1)
|
||||
// Let the route pool settle so the baseline count is stable (NATS opens a
|
||||
// pool of route connections per peer, so NumRoutes counts connections, not
|
||||
// distinct peers).
|
||||
base := stableRouteCount(t, good)
|
||||
|
||||
// Impostor knows the addresses but not the cluster password. It tries to
|
||||
// route to `good`; the route handshake must be rejected, so the impostor
|
||||
// never establishes a route.
|
||||
impostor := startNode(t, "impostor", freePort(t), freePort(t), []int{rp0}, "secret", "WRONG", nil)
|
||||
|
||||
// Give the route layer ample time to (fail to) connect, then assert it never
|
||||
// formed: the impostor has zero routes, and `good`'s route count is unchanged
|
||||
// (it did not accept a route from the impostor).
|
||||
time.Sleep(2 * time.Second)
|
||||
if n := impostor.NumRoutes(); n != 0 {
|
||||
t.Fatalf("impostor with wrong cluster password formed %d routes, want 0", n)
|
||||
}
|
||||
if n := good.NumRoutes(); n != base {
|
||||
t.Fatalf("legit node route count changed from %d to %d after impostor attempt (it accepted the impostor)", base, n)
|
||||
}
|
||||
}
|
||||
|
||||
// --- golden (TLS): mutual-TLS routes forward across nodes ---------------------
|
||||
|
||||
func TestClusterMutualTLSForwards(t *testing.T) {
|
||||
ca, caKey := genCA(t)
|
||||
dir := t.TempDir()
|
||||
tlsA := writeNodeCert(t, dir, "a", ca, caKey)
|
||||
tlsB := writeNodeCert(t, dir, "b", ca, caKey)
|
||||
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
n0 := startNode(t, "n0", freePort(t), rp0, []int{rp1}, "u", "p", tlsA)
|
||||
n1 := startNode(t, "n1", freePort(t), rp1, []int{rp0}, "u", "p", tlsB)
|
||||
|
||||
waitRoutes(t, n0, 1)
|
||||
waitRoutes(t, n1, 1)
|
||||
|
||||
if !pubSubAcrossNodes(t, n0.ClientURL(), n1.ClientURL(), "test.tls", "mtls-ok") {
|
||||
t.Fatalf("subject did not cross the mutual-TLS cluster")
|
||||
}
|
||||
}
|
||||
|
||||
// --- error (TLS): a node whose cert is not signed by the bus CA cannot join ---
|
||||
|
||||
func TestClusterRejectsUnsignedNode(t *testing.T) {
|
||||
ca, caKey := genCA(t)
|
||||
dir := t.TempDir()
|
||||
tlsGood := writeNodeCert(t, dir, "good", ca, caKey)
|
||||
tlsPeer := writeNodeCert(t, dir, "peer", ca, caKey)
|
||||
|
||||
// The impostor signs its node cert with a DIFFERENT CA, and pins only that
|
||||
// CA. The legit nodes' RequireAndVerifyClientCert against the bus CA rejects
|
||||
// it; the impostor likewise rejects the legit node's cert. No route forms.
|
||||
otherCA, otherKey := genCA(t)
|
||||
tlsImpostor := writeNodeCert(t, dir, "impostor", otherCA, otherKey)
|
||||
|
||||
rp0, rp1 := freePort(t), freePort(t)
|
||||
good := startNode(t, "good", freePort(t), rp0, []int{rp1}, "u", "p", tlsGood)
|
||||
_ = startNode(t, "peer", freePort(t), rp1, []int{rp0}, "u", "p", tlsPeer)
|
||||
waitRoutes(t, good, 1)
|
||||
base := stableRouteCount(t, good)
|
||||
|
||||
impostor := startNode(t, "impostor", freePort(t), freePort(t), []int{rp0}, "u", "p", tlsImpostor)
|
||||
time.Sleep(2 * time.Second)
|
||||
if n := impostor.NumRoutes(); n != 0 {
|
||||
t.Fatalf("impostor with unsigned cert formed %d routes, want 0", n)
|
||||
}
|
||||
if n := good.NumRoutes(); n != base {
|
||||
t.Fatalf("legit node route count changed from %d to %d after unsigned impostor attempt (it accepted the impostor)", base, n)
|
||||
}
|
||||
}
|
||||
|
||||
// --- cert helpers ------------------------------------------------------------
|
||||
|
||||
// clusterTLS bundles the PEM file paths a node needs to secure its cluster
// routes; startNode passes them to busauth.RouteTLSConfig in this order.
type clusterTLS struct {
	cert string // path to the node certificate PEM file
	key  string // path to the node private key PEM file
	ca   string // path to the CA certificate PEM file
}
|
||||
|
||||
// genCA mints a fresh ECDSA P-256 key and a self-signed CA certificate for it
// (CommonName "unibus-test-CA", serial 1, valid from one hour ago until 24 hours
// from now, usable for certificate signing and digital signatures). It returns
// the parsed certificate together with its private key; any failure aborts the
// test.
func genCA(t *testing.T) (*x509.Certificate, *ecdsa.PrivateKey) {
	t.Helper()

	signer, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
	if err != nil {
		t.Fatalf("gen CA key: %v", err)
	}

	root := &x509.Certificate{
		SerialNumber:          big.NewInt(1),
		Subject:               pkix.Name{CommonName: "unibus-test-CA"},
		NotBefore:             time.Now().Add(-time.Hour),
		NotAfter:              time.Now().Add(24 * time.Hour),
		IsCA:                  true,
		BasicConstraintsValid: true,
		KeyUsage:              x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature,
	}

	// Template and parent are the same certificate: the CA signs itself.
	encoded, err := x509.CreateCertificate(rand.Reader, root, root, &signer.PublicKey, signer)
	if err != nil {
		t.Fatalf("create CA cert: %v", err)
	}

	parsed, err := x509.ParseCertificate(encoded)
	if err != nil {
		t.Fatalf("parse CA cert: %v", err)
	}
	return parsed, signer
}
|
||||
|
||||
// writeNodeCert issues a node certificate signed by ca (SAN 127.0.0.1/::1,
|
||||
// usable as both server and client) and writes cert/key/ca PEM files, returning
|
||||
// their paths for RouteTLSConfig.
|
||||
func writeNodeCert(t *testing.T, dir, name string, ca *x509.Certificate, caKey *ecdsa.PrivateKey) *clusterTLS {
|
||||
t.Helper()
|
||||
key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader)
|
||||
if err != nil {
|
||||
t.Fatalf("gen node key: %v", err)
|
||||
}
|
||||
tmpl := &x509.Certificate{
|
||||
SerialNumber: big.NewInt(time.Now().UnixNano()),
|
||||
Subject: pkix.Name{CommonName: name},
|
||||
NotBefore: time.Now().Add(-time.Hour),
|
||||
NotAfter: time.Now().Add(24 * time.Hour),
|
||||
KeyUsage: x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment,
|
||||
ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth},
|
||||
IPAddresses: []net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")},
|
||||
DNSNames: []string{"localhost"},
|
||||
}
|
||||
der, err := x509.CreateCertificate(rand.Reader, tmpl, ca, &key.PublicKey, caKey)
|
||||
if err != nil {
|
||||
t.Fatalf("create node cert: %v", err)
|
||||
}
|
||||
certPath := filepath.Join(dir, name+".crt")
|
||||
keyPath := filepath.Join(dir, name+".key")
|
||||
caPath := filepath.Join(dir, name+"-ca.crt")
|
||||
|
||||
writePEM(t, certPath, "CERTIFICATE", der)
|
||||
keyDER, err := x509.MarshalECPrivateKey(key)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal node key: %v", err)
|
||||
}
|
||||
writePEM(t, keyPath, "EC PRIVATE KEY", keyDER)
|
||||
writePEM(t, caPath, "CERTIFICATE", ca.Raw)
|
||||
return &clusterTLS{cert: certPath, key: keyPath, ca: caPath}
|
||||
}
|
||||
|
||||
// writePEM PEM-encodes der under the given block type and writes the result to
// path with mode 0600. A write failure aborts the test via t.Fatalf.
func writePEM(t *testing.T, path, blockType string, der []byte) {
	t.Helper()

	block := pem.Block{Type: blockType, Bytes: der}
	encoded := pem.EncodeToMemory(&block)
	err := os.WriteFile(path, encoded, 0o600)
	if err == nil {
		return
	}
	t.Fatalf("write %s: %v", path, err)
}
|
||||
@@ -6,25 +6,151 @@
|
||||
package embeddednats
|
||||
|
||||
import (
|
||||
"crypto/tls"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// Start launches an embedded nats-server with JetStream enabled, listening on
|
||||
// the given port and persisting JetStream state under storeDir. It blocks until
|
||||
// the server is ready to accept connections (up to 5s) and returns the running
|
||||
// server. The caller is responsible for calling Shutdown on it.
|
||||
// ClusterConfig describes the route layer that joins several embedded NATS
// servers into one cluster (issue 0003a). This is the data-plane half of high
// availability: in a cluster, a message published on one node reaches
// subscribers attached to any other node, and with JetStream replicas > 1 the
// streams/KV are RAFT-replicated so losing a single node does not lose the bus.
//
// Routes form their own trust boundary, separate from the client data plane.
// They carry server-to-server traffic and therefore authenticate NODES rather
// than bus users, so the nkey client authenticator must never be reused here.
// Routes are protected by a dedicated shared secret (Username/Password, mapped
// to NATS Cluster.Authorization) and dedicated mutual TLS (TLS, built from the
// bus CA via busauth.RouteTLSConfig). A node lacking the cluster secret and a
// CA-signed node certificate can neither join the cluster nor inject messages.
type ClusterConfig struct {
	// Name is the cluster name. Every node MUST use the same value, otherwise
	// the servers refuse to gossip routes to one another.
	Name string

	// Host and Port define the route (server-to-server) listener, which is
	// separate from the client Host/Port. Pick a free port that is not the
	// client port (e.g. 6250).
	Host string
	Port int

	// Routes lists the nats-route URLs of the OTHER nodes, for example
	// "nats://user:pass@10.0.0.2:6250". When the route layer is password
	// protected, each URL must embed the same userinfo as the local
	// Username/Password so this node can authenticate outbound to its peers.
	Routes []string

	// Username and Password guard the route listener (NATS
	// Cluster.Authorization). A peer or impostor connecting to this node's
	// route port without them is rejected and never becomes a route. Leaving
	// them empty disables route auth (dev / trusted-network only).
	Username string
	Password string

	// TLS, when set, protects route connections with mutual TLS. Build it with
	// busauth.RouteTLSConfig(cert, key, ca): the server presents its node
	// certificate AND requires and verifies the connecting node's certificate
	// against the bus CA, so an unsigned impostor cannot establish a route even
	// with the correct password. Nil leaves routes in plaintext (dev /
	// WireGuard-only).
	TLS *tls.Config
}
|
||||
|
||||
// ServerConfig is the full set of knobs for the embedded NATS server. The zero
|
||||
// value (empty StoreDir aside) yields a dev-friendly server: JetStream on, bound
|
||||
// to all interfaces, no client auth, no TLS, standalone (no cluster). Secured
|
||||
// deployments set Auth and TLS; HA deployments set ServerName + Cluster; tests
|
||||
// set Host to loopback and a free Port.
|
||||
type ServerConfig struct {
|
||||
StoreDir string // JetStream store directory
|
||||
Host string // bind interface; "" = nats-server default ("0.0.0.0")
|
||||
Port int // listen port
|
||||
// ServerName is this node's unique name within the cluster. JetStream's RAFT
|
||||
// layer requires a stable, unique name per node to form its meta-group; leave
|
||||
// it empty for a standalone server (nats-server then auto-generates one).
|
||||
ServerName string
|
||||
// Auth, when non-nil, is installed as CustomClientAuthentication so the data
|
||||
// plane only accepts approved clients (nkey signature + bus allowlist).
|
||||
Auth server.Authentication
|
||||
// TLS, when non-nil, makes the server present a certificate and require TLS
|
||||
// on the data plane. Clients must trust the issuing CA (see busauth).
|
||||
TLS *tls.Config
|
||||
// Cluster, when non-nil, joins this server to a route cluster for high
|
||||
// availability (issue 0003a). Nil keeps the server standalone (the legacy
|
||||
// single-node behavior).
|
||||
Cluster *ClusterConfig
|
||||
}
|
||||
|
||||
// Start is a thin backward-compatible wrapper: embedded JetStream server on the
|
||||
// default interface, no auth, no TLS.
|
||||
func Start(storeDir string, port int) (*server.Server, error) {
|
||||
return StartServer(ServerConfig{StoreDir: storeDir, Port: port})
|
||||
}
|
||||
|
||||
// StartHost is Start with explicit control over the bind interface. host selects
|
||||
// which network interface the data plane listens on: pass "127.0.0.1" to keep
|
||||
// NATS loopback-only (the safe default for a single-host dev stack) or "0.0.0.0"
|
||||
// to expose it to the LAN so remote peers (phones, other PCs) can connect. An
|
||||
// empty host falls back to the nats-server default ("0.0.0.0", all interfaces).
|
||||
func StartHost(storeDir, host string, port int) (*server.Server, error) {
|
||||
return StartServer(ServerConfig{StoreDir: storeDir, Host: host, Port: port})
|
||||
}
|
||||
|
||||
// StartHostAuth is StartHost with an optional custom client authenticator. When
|
||||
// auth is non-nil only clients the authenticator approves may connect; when nil
|
||||
// the server accepts any client (legacy, network-trusted behavior).
|
||||
func StartHostAuth(storeDir, host string, port int, auth server.Authentication) (*server.Server, error) {
|
||||
return StartServer(ServerConfig{StoreDir: storeDir, Host: host, Port: port, Auth: auth})
|
||||
}
|
||||
|
||||
// StartServer launches an embedded nats-server with JetStream from cfg. It
|
||||
// blocks until the server is ready to accept connections (up to 5s) and returns
|
||||
// the running server; the caller must Shutdown it.
|
||||
func StartServer(cfg ServerConfig) (*server.Server, error) {
|
||||
// Diagnostic toggle: UNIBUS_NATS_DEBUG=1 enables the embedded nats-server's own
|
||||
// logger (route/RAFT/JetStream errors), which is otherwise silenced. Off by
|
||||
// default so production behavior is unchanged; only set it when debugging the
|
||||
// cluster route layer.
|
||||
debugLevel := os.Getenv("UNIBUS_NATS_DEBUG")
|
||||
debugNATS := debugLevel == "1" || debugLevel == "2"
|
||||
traceNATS := debugLevel == "2"
|
||||
opts := &server.Options{
|
||||
JetStream: true,
|
||||
StoreDir: storeDir,
|
||||
Port: port,
|
||||
StoreDir: cfg.StoreDir,
|
||||
Host: cfg.Host,
|
||||
Port: cfg.Port,
|
||||
ServerName: cfg.ServerName,
|
||||
DontListen: false,
|
||||
// Keep the embedded server quiet by default; the host app logs the URLs.
|
||||
NoLog: true,
|
||||
NoSigs: true,
|
||||
NoLog: !debugNATS,
|
||||
Debug: debugNATS,
|
||||
Trace: traceNATS,
|
||||
Logtime: true,
|
||||
NoSigs: true,
|
||||
}
|
||||
if debugNATS {
|
||||
// Expose the nats-server monitoring endpoint (loopback) so the operator can
|
||||
// inspect /jsz, /routez, /varz while debugging the cluster meta-group.
|
||||
opts.HTTPHost = "127.0.0.1"
|
||||
opts.HTTPPort = 8222
|
||||
}
|
||||
if cfg.Auth != nil {
|
||||
opts.CustomClientAuthentication = cfg.Auth
|
||||
// A CustomClientAuthentication alone does not make the server advertise a
|
||||
// nonce in its INFO line, and nats.go refuses to connect with an nkey to a
|
||||
// server that does not ("nkeys not supported by the server"). Forcing the
|
||||
// nonce makes nkey clients sign the challenge our authenticator verifies.
|
||||
opts.AlwaysEnableNonce = true
|
||||
}
|
||||
if cfg.TLS != nil {
|
||||
opts.TLSConfig = cfg.TLS
|
||||
opts.TLS = true
|
||||
}
|
||||
|
||||
if cfg.Cluster != nil {
|
||||
if err := applyClusterOpts(opts, cfg.Cluster); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
ns, err := server.NewServer(opts)
|
||||
@@ -32,6 +158,10 @@ func Start(storeDir string, port int) (*server.Server, error) {
|
||||
return nil, fmt.Errorf("embeddednats: new server: %w", err)
|
||||
}
|
||||
|
||||
if debugNATS {
|
||||
ns.ConfigureLogger()
|
||||
}
|
||||
|
||||
go ns.Start()
|
||||
|
||||
if !ns.ReadyForConnections(5 * time.Second) {
|
||||
@@ -42,6 +172,49 @@ func Start(storeDir string, port int) (*server.Server, error) {
|
||||
return ns, nil
|
||||
}
|
||||
|
||||
// applyClusterOpts translates a ClusterConfig into the nats-server route options
|
||||
// on opts: the cluster listener (name + host/port + shared-secret auth + mutual
|
||||
// TLS) and the outbound routes to the other nodes. A malformed route URL is a
|
||||
// configuration error and aborts startup rather than silently dropping a peer.
|
||||
func applyClusterOpts(opts *server.Options, c *ClusterConfig) error {
|
||||
opts.Cluster = server.ClusterOpts{
|
||||
Name: c.Name,
|
||||
Host: c.Host,
|
||||
Port: c.Port,
|
||||
Username: c.Username,
|
||||
Password: c.Password,
|
||||
// Disable route connection pooling (nats-server 2.10+ defaults to a pool of
|
||||
// 3 connections per peer). On a small cluster the pool churns with
|
||||
// "duplicate route"/"client closed" reconnects that interrupt the meta-group
|
||||
// RAFT heartbeats, causing perpetual leader re-elections so the JetStream
|
||||
// meta never becomes current and stream/KV creation hangs (issue 0006g).
|
||||
// PoolSize=-1 forces the classic single route per peer, which is stable for
|
||||
// the 3-node unibus cluster.
|
||||
PoolSize: -1,
|
||||
// NoAdvertise stops the server from gossiping its locally-discovered IPs to
|
||||
// peers. The cluster nodes are Docker hosts, so without this NATS advertises
|
||||
// the docker bridge addresses (172.x / 10.0.x) as reachable routes; peers
|
||||
// then try to dial those private, mutually-unreachable IPs, churning the
|
||||
// route layer and destabilizing the JetStream meta-group. With NoAdvertise
|
||||
// the nodes use ONLY the explicit public-IP routes we configure (issue 0006g).
|
||||
NoAdvertise: true,
|
||||
}
|
||||
if c.TLS != nil {
|
||||
opts.Cluster.TLSConfig = c.TLS
|
||||
// A generous handshake budget: route TLS does a mutual handshake and the
|
||||
// peer may still be booting. The default 2s can flap on a cold cluster.
|
||||
opts.Cluster.TLSTimeout = 5.0
|
||||
}
|
||||
for _, r := range c.Routes {
|
||||
u, err := url.Parse(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("embeddednats: parse route %q: %w", r, err)
|
||||
}
|
||||
opts.Routes = append(opts.Routes, u)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ClientURL returns a NATS connection URL for the running embedded server.
|
||||
func ClientURL(ns *server.Server) string {
|
||||
return ns.ClientURL()
|
||||
|
||||
+20
-9
@@ -36,6 +36,10 @@ const (
|
||||
KICK
|
||||
// ACK acknowledges receipt of a previous frame.
|
||||
ACK
|
||||
// REACT is a reaction to a previous message (an emoji/shortcode). The target
|
||||
// message id travels in ReplyTo; the reaction content rides Payload, so in
|
||||
// encrypted rooms the reaction is sealed exactly like any other message.
|
||||
REACT
|
||||
)
|
||||
|
||||
// BlobRef references an out-of-band encrypted blob stored in the object store.
|
||||
@@ -47,16 +51,23 @@ type BlobRef struct {
|
||||
}
|
||||
|
||||
// Frame is the unit of transport on the unibus message bus.
|
||||
//
|
||||
// Threading metadata (ThreadID, ReplyTo) is additive and optional: it travels in
|
||||
// the cleartext envelope (these are message-id references, not secret content)
|
||||
// and is omitted entirely when unset, so the wire format and signatures of
|
||||
// non-threaded frames are byte-for-byte identical to before this field existed.
|
||||
type Frame struct {
|
||||
Type FrameType `json:"t"`
|
||||
Subject string `json:"s"`
|
||||
Sender string `json:"from"` // endpoint id = EndpointID(signPub)
|
||||
MsgID string `json:"id"` // ULID
|
||||
Epoch int `json:"e"` // epoch of the room key K used to encrypt
|
||||
Nonce []byte `json:"n,omitempty"` // AEAD nonce (encrypted rooms only)
|
||||
Payload []byte `json:"p,omitempty"` // AEAD ciphertext (or cleartext if the room does not encrypt)
|
||||
Blob *BlobRef `json:"b,omitempty"`
|
||||
Sig []byte `json:"sig,omitempty"` // Ed25519 signature over SigningBytes()
|
||||
Type FrameType `json:"t"`
|
||||
Subject string `json:"s"`
|
||||
Sender string `json:"from"` // endpoint id = EndpointID(signPub)
|
||||
MsgID string `json:"id"` // ULID
|
||||
Epoch int `json:"e"` // epoch of the room key K used to encrypt
|
||||
ThreadID string `json:"thr,omitempty"` // root message id of the thread this frame belongs to
|
||||
ReplyTo string `json:"re,omitempty"` // message id this frame replies to / reacts to
|
||||
Nonce []byte `json:"n,omitempty"` // AEAD nonce (encrypted rooms only)
|
||||
Payload []byte `json:"p,omitempty"` // AEAD ciphertext (or cleartext if the room does not encrypt)
|
||||
Blob *BlobRef `json:"b,omitempty"`
|
||||
Sig []byte `json:"sig,omitempty"` // Ed25519 signature over SigningBytes()
|
||||
}
|
||||
|
||||
// Marshal serializes the frame to its wire representation (JSON in v1).
|
||||
|
||||
@@ -2,6 +2,7 @@ package frame
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -40,6 +41,67 @@ func TestMarshalUnmarshalRoundTrip(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestThreadingRoundTrip (golden) verifies that the additive threading fields
|
||||
// survive a marshal/unmarshal cycle and that a REACT frame keeps its target.
|
||||
func TestThreadingRoundTrip(t *testing.T) {
|
||||
orig := Frame{
|
||||
Type: REACT,
|
||||
Subject: "room.general",
|
||||
Sender: "abc123",
|
||||
MsgID: "01J000000000000000000002",
|
||||
Epoch: 1,
|
||||
ThreadID: "01J000000000000000000000",
|
||||
ReplyTo: "01J000000000000000000001",
|
||||
Payload: []byte("👍"),
|
||||
}
|
||||
b, err := orig.Marshal()
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal: %v", err)
|
||||
}
|
||||
got, err := Unmarshal(b)
|
||||
if err != nil {
|
||||
t.Fatalf("Unmarshal: %v", err)
|
||||
}
|
||||
if got.Type != REACT {
|
||||
t.Fatalf("type mismatch: got %d want REACT(%d)", got.Type, REACT)
|
||||
}
|
||||
if got.ThreadID != orig.ThreadID || got.ReplyTo != orig.ReplyTo {
|
||||
t.Fatalf("threading fields lost: got thr=%q re=%q", got.ThreadID, got.ReplyTo)
|
||||
}
|
||||
if !bytes.Equal(got.Payload, orig.Payload) {
|
||||
t.Fatalf("reaction payload mismatch: got %q", got.Payload)
|
||||
}
|
||||
}
|
||||
|
||||
// TestNonThreadedWireBackCompat (edge) asserts that a frame without threading
|
||||
// metadata serializes with NO thr/re keys at all, so its bytes — and therefore
|
||||
// its signature — are identical to a pre-threading frame. This is the
|
||||
// guarantee that makes the new fields a non-breaking, additive change.
|
||||
func TestNonThreadedWireBackCompat(t *testing.T) {
|
||||
f := Frame{Type: PUB, Subject: "room.general", Sender: "x", MsgID: "id", Epoch: 2, Payload: []byte("hi")}
|
||||
b, err := f.Marshal()
|
||||
if err != nil {
|
||||
t.Fatalf("Marshal: %v", err)
|
||||
}
|
||||
s := string(b)
|
||||
if strings.Contains(s, "\"thr\"") || strings.Contains(s, "\"re\"") {
|
||||
t.Fatalf("threading keys leaked into a non-threaded frame: %s", s)
|
||||
}
|
||||
// SigningBytes of a non-threaded frame must also be free of the keys, so old
|
||||
// signatures over equivalent frames still verify.
|
||||
if sb := f.SigningBytes(); strings.Contains(string(sb), "\"thr\"") || strings.Contains(string(sb), "\"re\"") {
|
||||
t.Fatalf("threading keys leaked into SigningBytes: %s", sb)
|
||||
}
|
||||
}
|
||||
|
||||
// TestUnmarshalRejectsGarbage (error path) verifies that malformed wire bytes
|
||||
// surface as an error rather than a silently zero-valued frame.
|
||||
func TestUnmarshalRejectsGarbage(t *testing.T) {
|
||||
if _, err := Unmarshal([]byte("{not valid json")); err == nil {
|
||||
t.Fatalf("expected error unmarshaling garbage, got nil")
|
||||
}
|
||||
}
|
||||
|
||||
func TestEndpointIDDeterministic(t *testing.T) {
|
||||
pub := []byte("some-ed25519-public-key-bytes-32")
|
||||
a := EndpointID(pub)
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
package membership
|
||||
|
||||
// Per-subject data-plane access control derived from room membership (issue
|
||||
// 0003e, audit H4 residual; tightened in issue 0006b for audit 0008 N2). The
|
||||
// control plane already authorizes metadata by membership; this is the matching
|
||||
// restriction on the NATS data plane so a registered peer can only
|
||||
// publish/subscribe on the subjects of the rooms it actually belongs to — and can
|
||||
// only reach the JetStream API of ITS OWN rooms' streams, never the control-plane
|
||||
// KV buckets.
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
)
|
||||
|
||||
// clientInfraSubjects lists the subjects every authorized peer is granted
// irrespective of room membership. The set is intentionally MINIMAL (issue
// 0006b, audit 0008 N2):
//
//   - "_INBOX.>"     — request/reply, plus the inboxes JetStream uses for
//     pull-consumer delivery and publish acks.
//   - "$JS.API.INFO" — account-level JetStream info (limits/usage counters). It
//     reveals NO room/user/key contents, so granting it leaks nothing.
//
// "$JS.API.>" is deliberately ABSENT. That broad grant was the N2 leak: any
// registered peer could drive the entire JetStream API and read the
// control-plane KV buckets (KV_UNIBUS_users/rooms/members/room_keys) and the
// object store straight over NATS, bypassing the HTTP authorization
// (requireMember and the own-endpoint checks). JetStream API access is instead
// granted PER ROOM, limited to the stream of each room the peer belongs to
// (jsSubjectsFor). The control-plane KV streams (KV_UNIBUS_*) and the object
// store (OBJ_UNIBUS_*) are never a room stream, so they stay outside the closed
// allow set and are denied by default.
var clientInfraSubjects = []string{
	"_INBOX.>",
	"$JS.API.INFO",
}
|
||||
|
||||
// roomStreamName maps a room id to the JetStream stream name of its persisted
// room: "UNIBUS_" followed by the room id with every rune outside
// [A-Za-z0-9_] replaced by '_'. It MUST stay in lockstep with
// pkg/client.streamName so that the per-room ACL grants exactly the subjects
// the client's JetStream calls use. Room ids are ULIDs (no '.'), so the
// replacement is normally a no-op; the rule is duplicated here defensively so
// the producer (client) and the authorizer (this ACL) cannot drift apart.
func roomStreamName(roomID string) string {
	sanitize := func(r rune) rune {
		isLower := r >= 'a' && r <= 'z'
		isUpper := r >= 'A' && r <= 'Z'
		isDigit := r >= '0' && r <= '9'
		if isLower || isUpper || isDigit || r == '_' {
			return r
		}
		return '_'
	}
	return "UNIBUS_" + strings.Map(sanitize, roomID)
}
|
||||
|
||||
// jsSubjectsFor returns the MINIMAL JetStream API subjects a peer needs to use the
|
||||
// durable stream of ONE persisted room: create/update/info the stream, manage and
|
||||
// pull from its durable consumer, and ack deliveries. Every subject embeds this
|
||||
// room's stream name, so the grant cannot reach another room's stream nor any
|
||||
// control-plane stream (KV_UNIBUS_* / OBJ_UNIBUS_*). The wildcard layout matches
|
||||
// the NATS JetStream API subject grammar (the stream name is the trailing token
|
||||
// of single-verb requests and follows a two-token verb for MSG.GET / MSG.NEXT /
|
||||
// DURABLE.CREATE):
|
||||
//
|
||||
// $JS.API.STREAM.<verb>.<stream> verb in {CREATE,UPDATE,INFO,DELETE,PURGE,...}
|
||||
// $JS.API.STREAM.MSG.<op>.<stream> op in {GET,DELETE}
|
||||
// $JS.API.CONSUMER.<verb>.<stream> verb in {LIST,NAMES,CREATE(ephemeral)}
|
||||
// $JS.API.CONSUMER.<verb>.<stream>.<consumer>... verb in {CREATE,INFO,DELETE}
|
||||
// $JS.API.CONSUMER.<v1>.<v2>.<stream>.<cons> {MSG.NEXT, DURABLE.CREATE}
|
||||
// $JS.ACK.<stream>.> message acknowledgements
|
||||
func jsSubjectsFor(roomID string) []string {
|
||||
s := roomStreamName(roomID)
|
||||
return []string{
|
||||
"$JS.API.STREAM.*." + s,
|
||||
"$JS.API.STREAM.*.*." + s,
|
||||
"$JS.API.CONSUMER.*." + s,
|
||||
"$JS.API.CONSUMER.*." + s + ".>",
|
||||
"$JS.API.CONSUMER.*.*." + s + ".>",
|
||||
"$JS.ACK." + s + ".>",
|
||||
}
|
||||
}
|
||||
|
||||
// SubjectACLFor returns a function that maps a signing public key (lowercase hex)
|
||||
// to the data-plane subjects that identity may publish and subscribe to: the
|
||||
// subject of every room it belongs to, the per-room JetStream API subjects of
|
||||
// those rooms (so persisted-room history keeps working), plus the minimal client
|
||||
// infrastructure subjects. It reads the live membership store, so the permissions
|
||||
// reflect the identity's rooms at the moment it connects. A decode error or a
|
||||
// store failure is returned as an error so the caller can fail closed (deny the
|
||||
// connection) rather than grant open access.
|
||||
//
|
||||
// Because NATS freezes permissions at connect time, a peer invited to a new room
|
||||
// after connecting must reconnect (client.RefreshSession) to pick up the new
|
||||
// room's subject. The bus is the authoritative directory of subjects, so an
|
||||
// unlisted subject is simply absent from the allow set.
|
||||
func SubjectACLFor(store Store) func(signPubHex string) ([]string, error) {
|
||||
return func(signPubHex string) ([]string, error) {
|
||||
pub, err := hex.DecodeString(signPubHex)
|
||||
if err != nil || len(pub) != 32 {
|
||||
return nil, fmt.Errorf("acl: malformed sign pub %q", signPubHex)
|
||||
}
|
||||
endpoint := frame.EndpointID(pub)
|
||||
rooms, err := store.ListRoomsForEndpoint(endpoint)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("acl: list rooms for %s: %w", endpoint, err)
|
||||
}
|
||||
// clientInfra + per room: the room subject + that room's JetStream API.
|
||||
subjects := make([]string, 0, len(clientInfraSubjects)+len(rooms)*7)
|
||||
subjects = append(subjects, clientInfraSubjects...)
|
||||
for _, r := range rooms {
|
||||
subjects = append(subjects, r.Subject)
|
||||
subjects = append(subjects, jsSubjectsFor(r.RoomID)...)
|
||||
}
|
||||
return subjects, nil
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,379 @@
|
||||
package membership_test
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"net"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// aclFreePort asks the OS for an unused loopback TCP port by listening on
// 127.0.0.1:0, then closes the listener and returns the port it was given.
// A listen failure aborts the test via t.Fatalf.
func aclFreePort(t *testing.T) int {
	t.Helper()

	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	defer ln.Close()

	addr := ln.Addr().(*net.TCPAddr)
	return addr.Port
}
|
||||
|
||||
func mustID(t *testing.T) cs.Identity {
|
||||
t.Helper()
|
||||
id, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
return id
|
||||
}
|
||||
|
||||
// aclPermsFunc builds the per-subject PermissionsFunc the ACL authenticator
|
||||
// expects. It delegates to the SAME production wiring membershipd uses
|
||||
// (busauth.PermissionsFromSubjects over membership.SubjectACLFor), so this test
|
||||
// exercises the real path rather than a test-only reimplementation.
|
||||
func aclPermsFunc(store membership.Store) busauth.PermissionsFunc {
|
||||
return busauth.PermissionsFromSubjects(membership.SubjectACLFor(store))
|
||||
}
|
||||
|
||||
// startACLNats boots an embedded NATS whose authenticator confines each peer to
|
||||
// the subjects of the rooms it belongs to (audit H4 residual).
|
||||
func startACLNats(t *testing.T, store membership.Store) *server.Server {
|
||||
t.Helper()
|
||||
auth := busauth.NewNkeyAuthenticatorACL(store.IsAuthorized, aclPermsFunc(store))
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: aclFreePort(t), Auth: auth,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("acl nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return ns
|
||||
}
|
||||
|
||||
func nkeyConn(t *testing.T, natsURL string, id cs.Identity, errCh chan error) *nats.Conn {
|
||||
t.Helper()
|
||||
pub, sign, err := busauth.ClientNkey(id.SignPriv)
|
||||
if err != nil {
|
||||
t.Fatalf("nkey: %v", err)
|
||||
}
|
||||
nc, err := nats.Connect(natsURL,
|
||||
nats.Nkey(pub, sign),
|
||||
nats.ErrorHandler(func(_ *nats.Conn, _ *nats.Subscription, e error) {
|
||||
select {
|
||||
case errCh <- e:
|
||||
default:
|
||||
}
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("connect nkey: %v", err)
|
||||
}
|
||||
t.Cleanup(nc.Close)
|
||||
return nc
|
||||
}
|
||||
|
||||
func mustAddUser(t *testing.T, store membership.Store, id cs.Identity, handle string) {
|
||||
t.Helper()
|
||||
if err := store.AddUser(hex.EncodeToString(id.SignPub), handle, membership.RoleMember); err != nil {
|
||||
t.Fatalf("add user %s: %v", handle, err)
|
||||
}
|
||||
}
|
||||
|
||||
func mustCreateRoom(t *testing.T, store membership.Store, roomID, subject, ownerEP string, owner cs.Identity) {
|
||||
t.Helper()
|
||||
info := membership.RoomInfo{RoomID: roomID, Subject: subject, OwnerEndpoint: ownerEP}
|
||||
if err := store.CreateRoom(info, owner.SignPub, owner.KexPub, nil); err != nil {
|
||||
t.Fatalf("create room %s: %v", roomID, err)
|
||||
}
|
||||
}
|
||||
|
||||
func newCtrl(t *testing.T, store membership.Store, blobs blobstore.Store) string {
|
||||
t.Helper()
|
||||
ts := httptest.NewServer(membership.NewServer(store, blobs, membership.AuthOff))
|
||||
t.Cleanup(ts.Close)
|
||||
return ts.URL
|
||||
}
|
||||
|
||||
// waitErr waits up to d for a value on ch and returns it; if nothing arrives
// within d it returns nil.
func waitErr(ch chan error, d time.Duration) error {
	timeout := time.NewTimer(d)
	defer timeout.Stop()

	select {
	case err := <-ch:
		return err
	case <-timeout.C:
		return nil
	}
}
|
||||
|
||||
// drain discards every value that can be received from ch without blocking
// and returns as soon as a receive would block.
func drain(ch chan error) {
	for {
		select {
		case <-ch:
			continue
		default:
		}
		return
	}
}
|
||||
|
||||
// TestSubjectACLIsolation closes the audit H4 residual: a registered peer is
|
||||
// confined to the subjects of the rooms it belongs to. alice (member of room.A)
|
||||
// may sub/pub room.A but is DENIED sub/pub on room.B, and never reads what bob
|
||||
// (member of room.B) publishes there.
|
||||
func TestSubjectACLIsolation(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
|
||||
alice, bob := mustID(t), mustID(t)
|
||||
aliceEP, bobEP := frame.EndpointID(alice.SignPub), frame.EndpointID(bob.SignPub)
|
||||
mustAddUser(t, store, alice, "alice")
|
||||
mustAddUser(t, store, bob, "bob")
|
||||
const subjA, subjB = "room.acl.a", "room.acl.b"
|
||||
mustCreateRoom(t, store, "ROOMA", subjA, aliceEP, alice)
|
||||
mustCreateRoom(t, store, "ROOMB", subjB, bobEP, bob)
|
||||
|
||||
srv := startACLNats(t, store)
|
||||
url := srv.ClientURL()
|
||||
aliceErr := make(chan error, 4)
|
||||
bobErr := make(chan error, 4)
|
||||
aliceNC := nkeyConn(t, url, alice, aliceErr)
|
||||
bobNC := nkeyConn(t, url, bob, bobErr)
|
||||
|
||||
// alice may subscribe to her own room (no error).
|
||||
aliceGot := make(chan string, 4)
|
||||
if _, err := aliceNC.Subscribe(subjA, func(m *nats.Msg) { aliceGot <- string(m.Data) }); err != nil {
|
||||
t.Fatalf("alice sub A: %v", err)
|
||||
}
|
||||
_ = aliceNC.Flush()
|
||||
if e := waitErr(aliceErr, 300*time.Millisecond); e != nil {
|
||||
t.Fatalf("alice sub to her OWN room raised an error: %v", e)
|
||||
}
|
||||
|
||||
// alice subscribing to bob's room is a permissions violation.
|
||||
if _, err := aliceNC.Subscribe(subjB, func(m *nats.Msg) { aliceGot <- "LEAK:" + string(m.Data) }); err != nil {
|
||||
t.Fatalf("alice sub B (queue): %v", err)
|
||||
}
|
||||
_ = aliceNC.Flush()
|
||||
if e := waitErr(aliceErr, 1*time.Second); e == nil {
|
||||
t.Fatalf("alice subscribing to bob's room should raise a permissions violation")
|
||||
}
|
||||
|
||||
// bob publishes in his room; alice (denied) must not receive it.
|
||||
bobGot := make(chan string, 4)
|
||||
if _, err := bobNC.Subscribe(subjB, func(m *nats.Msg) { bobGot <- string(m.Data) }); err != nil {
|
||||
t.Fatalf("bob sub B: %v", err)
|
||||
}
|
||||
_ = bobNC.Flush()
|
||||
if err := bobNC.Publish(subjB, []byte("internal-bob")); err != nil {
|
||||
t.Fatalf("bob pub B: %v", err)
|
||||
}
|
||||
_ = bobNC.Flush()
|
||||
select {
|
||||
case got := <-bobGot:
|
||||
if got != "internal-bob" {
|
||||
t.Fatalf("bob got %q", got)
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatalf("bob did not receive his own message")
|
||||
}
|
||||
select {
|
||||
case leak := <-aliceGot:
|
||||
t.Fatalf("alice received bob's room traffic despite the ACL: %q", leak)
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
// good: alice never got it
|
||||
}
|
||||
|
||||
// alice publishing into bob's room is denied; bob must not receive it.
|
||||
drain(aliceErr)
|
||||
if err := aliceNC.Publish(subjB, []byte("intruder")); err != nil {
|
||||
t.Fatalf("alice pub B (queue): %v", err)
|
||||
}
|
||||
_ = aliceNC.Flush()
|
||||
if e := waitErr(aliceErr, 1*time.Second); e == nil {
|
||||
t.Fatalf("alice publishing into bob's room should raise a permissions violation")
|
||||
}
|
||||
select {
|
||||
case got := <-bobGot:
|
||||
t.Fatalf("bob received alice's cross-room publish despite the ACL: %q", got)
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
// good
|
||||
}
|
||||
}
|
||||
|
||||
// TestReaudit_H4_WildcardMetadataLeak ports the re-auditor's H4 vector. Before
|
||||
// the per-subject ACL was WIRED into membershipd (it existed in pkg/membership and
|
||||
// pkg/busauth but the binary used the plain NewNkeyAuthenticator), a registered
|
||||
// NON-member could open a raw NATS connection, Subscribe(">"), and capture every
|
||||
// room's subject plus JetStream stream/advisory activity — the payload stayed E2E
|
||||
// ciphertext, but the metadata leaked. With NewNkeyAuthenticatorACL wired via the
|
||||
// production path (busauth.PermissionsFromSubjects(membership.SubjectACLFor)), a
|
||||
// non-member is confined to the client-infra subjects, so the wildcard and any
|
||||
// foreign room subject are denied.
|
||||
//
|
||||
// Coverage:
|
||||
// - error : a non-member's Subscribe(">") raises a permission violation;
|
||||
// - edge : a non-member subscribing to another room's exact subject is denied;
|
||||
// - golden: the member still pub/subs her own room, and the non-member never
|
||||
// captures that traffic.
|
||||
//
|
||||
// Residual now CLOSED (issue 0006b, audit 0008 N2): the client-infra grant no
|
||||
// longer includes "$JS.API.>". JetStream API access is granted per-room only
|
||||
// (membership.jsSubjectsFor), so a peer can reach the API of its OWN rooms'
|
||||
// streams but not the control-plane KV buckets (KV_UNIBUS_*) nor another room's
|
||||
// stream. See TestAttack0008_N2 for the closed-leak regression.
|
||||
func TestReaudit_H4_WildcardMetadataLeak(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
|
||||
alice, eve := mustID(t), mustID(t)
|
||||
aliceEP := frame.EndpointID(alice.SignPub)
|
||||
mustAddUser(t, store, alice, "alice")
|
||||
mustAddUser(t, store, eve, "eve") // eve is REGISTERED but never a member of alice's room
|
||||
const subject = "room.e2e.confidential"
|
||||
mustCreateRoom(t, store, "ROOMA", subject, aliceEP, alice)
|
||||
|
||||
srv := startACLNats(t, store)
|
||||
url := srv.ClientURL()
|
||||
|
||||
eveErr := make(chan error, 8)
|
||||
eveNC := nkeyConn(t, url, eve, eveErr)
|
||||
eveAll := make(chan *nats.Msg, 16)
|
||||
|
||||
// Error: eve's wildcard subscription is rejected. nats.go creates the local sub
|
||||
// object and the server rejects it asynchronously (delivered to ErrorHandler).
|
||||
if _, err := eveNC.Subscribe(">", func(m *nats.Msg) { eveAll <- m }); err != nil {
|
||||
t.Fatalf("eve sub >: %v", err)
|
||||
}
|
||||
_ = eveNC.Flush()
|
||||
if e := waitErr(eveErr, 1*time.Second); e == nil {
|
||||
t.Fatalf("a non-member's Subscribe(\">\") must raise a permissions violation (wildcard metadata leak still open)")
|
||||
}
|
||||
|
||||
// Edge: eve subscribing to the foreign room's EXACT subject is also denied.
|
||||
drain(eveErr)
|
||||
if _, err := eveNC.Subscribe(subject, func(m *nats.Msg) { eveAll <- m }); err != nil {
|
||||
t.Fatalf("eve sub subject: %v", err)
|
||||
}
|
||||
_ = eveNC.Flush()
|
||||
if e := waitErr(eveErr, 1*time.Second); e == nil {
|
||||
t.Fatalf("a non-member subscribing to another room's subject must be denied")
|
||||
}
|
||||
|
||||
// Golden: alice (the member) pub/subs her own room with no violation, and eve
|
||||
// never captured the traffic despite her (rejected) wildcard.
|
||||
aliceErr := make(chan error, 4)
|
||||
aliceNC := nkeyConn(t, url, alice, aliceErr)
|
||||
aliceGot := make(chan string, 4)
|
||||
if _, err := aliceNC.Subscribe(subject, func(m *nats.Msg) { aliceGot <- string(m.Data) }); err != nil {
|
||||
t.Fatalf("alice sub own room: %v", err)
|
||||
}
|
||||
_ = aliceNC.Flush()
|
||||
if e := waitErr(aliceErr, 300*time.Millisecond); e != nil {
|
||||
t.Fatalf("alice subscribing to her OWN room raised an error: %v", e)
|
||||
}
|
||||
if err := aliceNC.Publish(subject, []byte("members-only metadata")); err != nil {
|
||||
t.Fatalf("alice publish: %v", err)
|
||||
}
|
||||
_ = aliceNC.Flush()
|
||||
select {
|
||||
case got := <-aliceGot:
|
||||
if got != "members-only metadata" {
|
||||
t.Fatalf("alice got %q", got)
|
||||
}
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatalf("alice did not receive her own room's message")
|
||||
}
|
||||
select {
|
||||
case m := <-eveAll:
|
||||
t.Fatalf("eve captured room traffic despite the ACL: subject=%q data=%q", m.Subject, m.Data)
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
// good: eve captured nothing
|
||||
}
|
||||
}
|
||||
|
||||
// TestRefreshSessionGainsNewRoom is the "permissions refreshed on join" path:
|
||||
// alice is not in room B, so her connection has no permission for its subject;
|
||||
// after she is added to room B and calls RefreshSession, the reconnect
|
||||
// re-derives her permissions and she gains the room's subject.
|
||||
func TestRefreshSessionGainsNewRoom(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
|
||||
alice, bob := mustID(t), mustID(t)
|
||||
aliceEP, bobEP := frame.EndpointID(alice.SignPub), frame.EndpointID(bob.SignPub)
|
||||
mustAddUser(t, store, alice, "alice")
|
||||
mustAddUser(t, store, bob, "bob")
|
||||
const subjB = "room.refresh.b"
|
||||
mustCreateRoom(t, store, "ROOMB", subjB, bobEP, bob)
|
||||
|
||||
srv := startACLNats(t, store)
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
ctrl := newCtrl(t, store, blobs)
|
||||
|
||||
aliceC, err := client.NewWithOptions(srv.ClientURL(), ctrl, alice, client.Options{UseNkey: true})
|
||||
if err != nil {
|
||||
t.Fatalf("connect alice: %v", err)
|
||||
}
|
||||
defer aliceC.Close()
|
||||
|
||||
// Add alice to room B (as if invited), then RefreshSession so the
|
||||
// authenticator re-derives her permissions on reconnect.
|
||||
if _, err := store.GetMember("ROOMB", aliceEP); err == nil {
|
||||
t.Fatalf("alice should not be a member yet")
|
||||
}
|
||||
if err := store.AddMember("ROOMB", membership.Member{Endpoint: aliceEP, Role: "member", SignPub: alice.SignPub, KexPub: alice.KexPub}, 1, nil); err != nil {
|
||||
t.Fatalf("add alice to room B: %v", err)
|
||||
}
|
||||
if err := aliceC.RefreshSession(); err != nil {
|
||||
t.Fatalf("refresh session: %v", err)
|
||||
}
|
||||
|
||||
bobErr := make(chan error, 2)
|
||||
bobNC := nkeyConn(t, srv.ClientURL(), bob, bobErr)
|
||||
|
||||
got := make(chan string, 2)
|
||||
sub, err := aliceC.Subscribe("ROOMB", func(_ frame.Frame, plaintext []byte) { got <- string(plaintext) })
|
||||
if err != nil {
|
||||
t.Fatalf("alice subscribe room B after refresh: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
|
||||
// bob publishes a minimal cleartext frame on subjB.
|
||||
f := frame.Frame{Type: frame.PUB, Subject: subjB, Sender: bobEP, MsgID: "m1", Payload: []byte("hello-after-join")}
|
||||
b, _ := f.Marshal()
|
||||
if err := bobNC.Publish(subjB, b); err != nil {
|
||||
t.Fatalf("bob publish: %v", err)
|
||||
}
|
||||
_ = bobNC.Flush()
|
||||
|
||||
select {
|
||||
case msg := <-got:
|
||||
if msg != "hello-after-join" {
|
||||
t.Fatalf("alice got %q", msg)
|
||||
}
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatalf("alice did not receive room B traffic after RefreshSession (permissions not refreshed)")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,241 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
)
|
||||
|
||||
// AuthMode is the rollout state of control-plane authentication (feature flag
// bus-auth). It decides what the HTTP middleware does with a request whose
// signature is missing, invalid, replayed, skewed, or made by an unregistered
// identity.
//
//	AuthOff     — verify nothing (legacy behavior; the default).
//	AuthSoft    — verify and LOG rejections, but still let the request through,
//	              so clients can migrate to signing without an outage.
//	AuthEnforce — reject unauthenticated requests with 401.
type AuthMode int

const (
	AuthOff AuthMode = iota
	AuthSoft
	AuthEnforce
)

// String returns the flag spelling of m ("off", "soft" or "enforce"), or
// "unknown" for any value outside the declared modes.
func (m AuthMode) String() string {
	names := [...]string{
		AuthOff:     "off",
		AuthSoft:    "soft",
		AuthEnforce: "enforce",
	}
	if m < AuthOff || m > AuthEnforce {
		return "unknown"
	}
	return names[m]
}
|
||||
|
||||
// ParseAuthMode maps the bus-auth flag string to an AuthMode.
|
||||
func ParseAuthMode(s string) (AuthMode, error) {
|
||||
switch s {
|
||||
case "off", "":
|
||||
return AuthOff, nil
|
||||
case "soft":
|
||||
return AuthSoft, nil
|
||||
case "enforce":
|
||||
return AuthEnforce, nil
|
||||
default:
|
||||
return AuthOff, fmt.Errorf("membership: invalid bus-auth mode %q (want off|soft|enforce)", s)
|
||||
}
|
||||
}
|
||||
|
||||
// Names of the HTTP headers that carry a control-plane signature. The client
// signs the canonical bytes of its request and sends these four headers; the
// server rebuilds the same canonical bytes and verifies the signature. The
// byte layout is defined by CanonicalRequest.
const (
	hdrPub   = "X-Unibus-Pub"   // signer Ed25519 public key, lowercase hex
	hdrTs    = "X-Unibus-Ts"    // unix seconds (string)
	hdrNonce = "X-Unibus-Nonce" // 16 random bytes, std base64
	hdrSig   = "X-Unibus-Sig"   // Ed25519 signature over canonical, std base64
)
|
||||
|
||||
// Anti-replay parameters. A request is accepted only if its timestamp is within
// clockSkew of now; nonces are remembered for nonceTTL so a captured request
// cannot be replayed inside its acceptance window.
//
// nonceTTL must cover the WHOLE acceptance window. Timestamps are whole unix
// seconds and the skew check compares against now.Unix(), which truncates, so
// a request stamped ts is accepted from ts-clockSkew up to (but excluding)
// ts+clockSkew+1s. A nonce first seen at the very start of that window must
// therefore be remembered for 2*clockSkew plus one extra second; with exactly
// 2*clockSkew a maximally future-stamped request could be replayed during the
// final sub-second of its window, after its nonce had already expired.
const (
	clockSkew = 30 * time.Second
	nonceTTL  = 2*clockSkew + time.Second
	// maxNonceCacheEntries bounds the replay cache so it cannot grow without limit
	// (audit H7). With IsAuthorized now gating insertion, only authorized traffic
	// is cached, so this ceiling is only approached under a legitimate burst; at
	// the cap the oldest nonce is evicted (its TTL is nearly up anyway).
	maxNonceCacheEntries = 100_000
)
|
||||
|
||||
// CanonicalRequest builds the byte string that is signed by the client and
// re-derived by the server for a control-plane request:
//
//	method "\n" path "\n" ts "\n" nonce "\n" hex(sha256(body))
//
// path is the request URI (path plus raw query), so query parameters such as
// endpoint and epoch are covered by the signature. The function is exported so
// the client library and the tests sign with the very same construction; this
// is the single place where the format is defined.
func CanonicalRequest(method, path, ts, nonce string, body []byte) []byte {
	digest := sha256.Sum256(body)
	bodyHex := hex.EncodeToString(digest[:])

	// Four newline-terminated fields followed by the body digest.
	out := make([]byte, 0, len(method)+len(path)+len(ts)+len(nonce)+len(bodyHex)+4)
	for _, field := range [...]string{method, path, ts, nonce} {
		out = append(out, field...)
		out = append(out, '\n')
	}
	return append(out, bodyHex...)
}
|
||||
|
||||
// nonceStore abstracts the anti-replay backend. rememberOrReject records a
// nonce and reports true when it had not been seen (accept the request) or
// false when it was already known (reject the replay).
//
// It is an interface (issue 0003e) so that the single-node in-memory cache can
// be replaced by a replicated KV store. A per-process cache is BROKEN under
// multi-node failover: a request captured and replayed against a DIFFERENT
// node, whose cache never saw the nonce, would be accepted. A cluster
// therefore MUST share its nonce state. Every implementation fails CLOSED: if
// it cannot reach its backend it rejects rather than admits.
type nonceStore interface {
	rememberOrReject(nonce string, now time.Time) bool
}
|
||||
|
||||
// memNonceCache remembers recently-seen nonces to reject replays. It is an
// in-memory store guarded by a mutex — sufficient for a SINGLE membershipd
// process. A clustered deployment uses kvNonceStore instead (issue 0003e).
//
// Pruning is O(expired), not O(n): because the TTL is constant, insertion order
// equals expiry order (as long as callers pass a non-decreasing `now`), so the
// oldest entries (front of `order`) are exactly the ones that expire first
// (audit H7 — a full scan under the mutex is a CPU-amplification vector).
// Expired entries are dropped by re-slicing `order`, never by copying the live
// tail forward. A size cap bounds memory.
type memNonceCache struct {
	mu    sync.Mutex
	seen  map[string]time.Time // nonce -> expiry
	order []string             // nonces in insertion order == expiry order
	ttl   time.Duration
	cap   int
}

// newMemNonceCache returns an empty cache that remembers each nonce for ttl and
// starts evicting the oldest entries once it holds capacity nonces.
func newMemNonceCache(ttl time.Duration, capacity int) *memNonceCache {
	return &memNonceCache{seen: make(map[string]time.Time), ttl: ttl, cap: capacity}
}

// rememberOrReject records nonce and returns true if it was unseen, or false if
// it is a replay (still live in the cache). An entry is live while its expiry
// is not before now.
func (n *memNonceCache) rememberOrReject(nonce string, now time.Time) bool {
	n.mu.Lock()
	defer n.mu.Unlock()

	// Prune expired entries from the front (oldest first). The first live entry
	// ends the scan — everything behind it was inserted later and is newer.
	cut := 0
	for cut < len(n.order) {
		key := n.order[cut]
		exp, ok := n.seen[key]
		if ok && !exp.Before(now) {
			break
		}
		if ok {
			delete(n.seen, key)
		}
		// Clear the slot so the dropped prefix of the backing array does not
		// keep the nonce string alive until the next reallocation.
		n.order[cut] = ""
		cut++
	}
	// Re-slice instead of shifting the live tail to the front: shifting would
	// copy every live nonce on each prune, making the cost O(live).
	n.order = n.order[cut:]

	if exp, ok := n.seen[nonce]; ok && !exp.Before(now) {
		return false // a live replay
	}

	// Bound memory: at capacity, evict the oldest entry (its TTL is nearly up).
	for len(n.seen) >= n.cap && len(n.order) > 0 {
		delete(n.seen, n.order[0])
		n.order[0] = ""
		n.order = n.order[1:]
	}

	n.seen[nonce] = now.Add(n.ttl)
	n.order = append(n.order, nonce)
	return true
}
|
||||
|
||||
// authResult is what a successful authentication yields: the verified signing
|
||||
// key (hex), the endpoint id derived from it, and the authorized user record.
|
||||
// Handlers use endpoint for membership authorization (only a member of a room
|
||||
// may read its metadata/keys); user is available for role checks.
|
||||
type authResult struct {
|
||||
pubHex string
|
||||
endpoint string
|
||||
user User
|
||||
}
|
||||
|
||||
// authenticate verifies the signature headers on r against body and the user
|
||||
// allowlist. It returns an error describing the first failing check; the
|
||||
// middleware decides whether that error blocks (enforce) or only logs (soft).
|
||||
//
|
||||
// Order matters: cheap, non-cryptographic checks (header presence, key shape,
|
||||
// clock skew) run first; the Ed25519 verification runs before the replay cache
|
||||
// is touched so an attacker cannot poison the cache with unsigned nonces; the
|
||||
// allowlist lookup runs last.
|
||||
func (s *Server) authenticate(r *http.Request, body []byte, now time.Time) (authResult, error) {
|
||||
pubHex := r.Header.Get(hdrPub)
|
||||
ts := r.Header.Get(hdrTs)
|
||||
nonce := r.Header.Get(hdrNonce)
|
||||
sigB64 := r.Header.Get(hdrSig)
|
||||
if pubHex == "" || ts == "" || nonce == "" || sigB64 == "" {
|
||||
return authResult{}, fmt.Errorf("missing auth headers")
|
||||
}
|
||||
|
||||
pub, err := hex.DecodeString(pubHex)
|
||||
if err != nil || len(pub) != 32 {
|
||||
return authResult{}, fmt.Errorf("malformed %s (want 32-byte Ed25519 hex)", hdrPub)
|
||||
}
|
||||
|
||||
tsInt, err := strconv.ParseInt(ts, 10, 64)
|
||||
if err != nil {
|
||||
return authResult{}, fmt.Errorf("malformed %s", hdrTs)
|
||||
}
|
||||
if d := now.Unix() - tsInt; d > int64(clockSkew/time.Second) || d < -int64(clockSkew/time.Second) {
|
||||
return authResult{}, fmt.Errorf("timestamp out of range (skew %ds)", d)
|
||||
}
|
||||
|
||||
sig, err := base64.StdEncoding.DecodeString(sigB64)
|
||||
if err != nil {
|
||||
return authResult{}, fmt.Errorf("malformed %s", hdrSig)
|
||||
}
|
||||
|
||||
canonical := CanonicalRequest(r.Method, r.URL.RequestURI(), ts, nonce, body)
|
||||
if !cs.VerifyEd25519(pub, canonical, sig) {
|
||||
return authResult{}, fmt.Errorf("invalid signature")
|
||||
}
|
||||
|
||||
// Authorize BEFORE touching the replay cache (audit H7): an unregistered
|
||||
// identity can mint valid signatures for free, so caching its nonces would let
|
||||
// it poison/grow the cache pre-auth. Only authorized identities are remembered.
|
||||
if !s.store.IsAuthorized(pubHex) {
|
||||
return authResult{}, fmt.Errorf("identity not authorized")
|
||||
}
|
||||
|
||||
user, err := s.store.GetUser(pubHex)
|
||||
if err != nil {
|
||||
// IsAuthorized passed but the row vanished (race with revoke): fail closed.
|
||||
return authResult{}, fmt.Errorf("identity not authorized")
|
||||
}
|
||||
|
||||
// Anti-replay last: a replayed request from an authorized identity is still
|
||||
// rejected here (the nonce is already live in the cache from its first use).
|
||||
if !s.nonces.rememberOrReject(nonce, now) {
|
||||
return authResult{}, fmt.Errorf("replayed nonce")
|
||||
}
|
||||
|
||||
return authResult{pubHex: pubHex, endpoint: frame.EndpointID(pub), user: user}, nil
|
||||
}
|
||||
@@ -0,0 +1,206 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
)
|
||||
|
||||
// authHarness boots an in-process membershipd HTTP server in the given auth mode
|
||||
// with a fresh store + blob store, and seeds one active admin ("alice").
|
||||
type authHarness struct {
|
||||
ts *httptest.Server
|
||||
store Store
|
||||
alice cs.Identity
|
||||
alicePub string // hex
|
||||
}
|
||||
|
||||
func newAuthHarness(t *testing.T, mode AuthMode) *authHarness {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
store, err := Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open store: %v", err)
|
||||
}
|
||||
blobs, err := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
if err != nil {
|
||||
t.Fatalf("open blobs: %v", err)
|
||||
}
|
||||
alice, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
alicePub := hex.EncodeToString(alice.SignPub)
|
||||
if err := store.AddUser(alicePub, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("seed admin: %v", err)
|
||||
}
|
||||
srv := NewServer(store, blobs, mode)
|
||||
ts := httptest.NewServer(srv)
|
||||
t.Cleanup(func() {
|
||||
ts.Close()
|
||||
store.Close()
|
||||
})
|
||||
return &authHarness{ts: ts, store: store, alice: alice, alicePub: alicePub}
|
||||
}
|
||||
|
||||
// signedReq builds a control-plane request signed by id, with explicit ts/nonce
|
||||
// so tests can force skew and replay. It signs via the same CanonicalRequest the
|
||||
// production client uses, so the test verifies the real wire contract.
|
||||
func signedReq(t *testing.T, base, method, path string, body []byte, id cs.Identity, ts int64, nonce string) *http.Request {
|
||||
t.Helper()
|
||||
var rdr io.Reader
|
||||
if body != nil {
|
||||
rdr = bytes.NewReader(body)
|
||||
}
|
||||
req, err := http.NewRequest(method, base+path, rdr)
|
||||
if err != nil {
|
||||
t.Fatalf("new request: %v", err)
|
||||
}
|
||||
tss := strconv.FormatInt(ts, 10)
|
||||
canonical := CanonicalRequest(method, path, tss, nonce, body)
|
||||
sig := cs.SignEd25519(id.SignPriv, canonical)
|
||||
req.Header.Set(hdrPub, hex.EncodeToString(id.SignPub))
|
||||
req.Header.Set(hdrTs, tss)
|
||||
req.Header.Set(hdrNonce, nonce)
|
||||
req.Header.Set(hdrSig, base64.StdEncoding.EncodeToString(sig))
|
||||
return req
|
||||
}
|
||||
|
||||
// do sends req with the default HTTP client and returns the response status
// code together with the response body as a string. A transport error fails
// the test. An error while reading the body is deliberately ignored, in which
// case whatever was read so far is returned.
func do(t *testing.T, req *http.Request) (int, string) {
	t.Helper()

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		t.Fatalf("do request: %v", err)
	}
	defer resp.Body.Close()

	payload, _ := io.ReadAll(resp.Body)
	status := resp.StatusCode
	return status, string(payload)
}
|
||||
|
||||
// okPath is a room-directory path that answers 200 with an empty list when the
// request carries NO membership-bound signer (the AuthOff, soft and
// missing-headers tests). Under enforce the per-endpoint room directory is
// restricted to the signer's own endpoint (audit H3), so tests that sign as
// alice use aliceRoomsPath instead.
const okPath = "/members/alice-endpoint/rooms"
|
||||
|
||||
// aliceRoomsPath is alice's own room directory — the canonical "authenticated
|
||||
// and authorized" 200 path under enforce after H3.
|
||||
func aliceRoomsPath(h *authHarness) string {
|
||||
return "/members/" + frame.EndpointID(h.alice.SignPub) + "/rooms"
|
||||
}
|
||||
|
||||
// Golden: a request signed by a registered, active identity is accepted.
|
||||
func TestAuthGoldenAccepted(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
now := time.Now().Unix()
|
||||
code, _ := do(t, signedReq(t, h.ts.URL, "GET", aliceRoomsPath(h), nil, h.alice, now, "nonce-golden"))
|
||||
if code != http.StatusOK {
|
||||
t.Fatalf("golden signed request should be 200, got %d", code)
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: a structurally valid signature from an identity that is NOT in the
|
||||
// allowlist is rejected with 401.
|
||||
func TestAuthUnregisteredRejected(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
bob, _ := cs.GenerateIdentity()
|
||||
now := time.Now().Unix()
|
||||
code, body := do(t, signedReq(t, h.ts.URL, "GET", okPath, nil, bob, now, "nonce-bob"))
|
||||
if code != http.StatusUnauthorized {
|
||||
t.Fatalf("unregistered identity should be 401, got %d (%s)", code, body)
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: replaying a captured request (same nonce + signature) is rejected.
|
||||
func TestAuthReplayRejected(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
now := time.Now().Unix()
|
||||
first := signedReq(t, h.ts.URL, "GET", aliceRoomsPath(h), nil, h.alice, now, "nonce-replay")
|
||||
if code, body := do(t, first); code != http.StatusOK {
|
||||
t.Fatalf("first request should be 200, got %d (%s)", code, body)
|
||||
}
|
||||
// Identical ts + nonce + signature: a replay.
|
||||
second := signedReq(t, h.ts.URL, "GET", aliceRoomsPath(h), nil, h.alice, now, "nonce-replay")
|
||||
if code, body := do(t, second); code != http.StatusUnauthorized {
|
||||
t.Fatalf("replayed request should be 401, got %d (%s)", code, body)
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: a timestamp outside the ±30s window is rejected even with a valid
|
||||
// signature (defends against long-delayed captured requests).
|
||||
func TestAuthClockSkewRejected(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
stale := time.Now().Unix() - 120
|
||||
code, body := do(t, signedReq(t, h.ts.URL, "GET", okPath, nil, h.alice, stale, "nonce-skew"))
|
||||
if code != http.StatusUnauthorized {
|
||||
t.Fatalf("clock-skewed request should be 401, got %d (%s)", code, body)
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: tampering the body after signing invalidates the signature.
|
||||
func TestAuthTamperedBodyRejected(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
now := time.Now().Unix()
|
||||
req := signedReq(t, h.ts.URL, "POST", "/rooms", []byte(`{"subject":"x"}`), h.alice, now, "nonce-tamper")
|
||||
// Swap the body for different bytes the signature does not cover.
|
||||
req.Body = io.NopCloser(bytes.NewReader([]byte(`{"subject":"evil"}`)))
|
||||
req.ContentLength = int64(len(`{"subject":"evil"}`))
|
||||
code, body := do(t, req)
|
||||
if code != http.StatusUnauthorized {
|
||||
t.Fatalf("tampered body should be 401, got %d (%s)", code, body)
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: missing auth headers under enforce are rejected.
|
||||
func TestAuthMissingHeadersRejected(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
req, _ := http.NewRequest("GET", h.ts.URL+okPath, nil)
|
||||
code, _ := do(t, req)
|
||||
if code != http.StatusUnauthorized {
|
||||
t.Fatalf("unsigned request under enforce should be 401, got %d", code)
|
||||
}
|
||||
}
|
||||
|
||||
// Exemption: the health probe bypasses auth even under enforce.
|
||||
func TestAuthHealthExempt(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
req, _ := http.NewRequest("GET", h.ts.URL+"/healthz", nil)
|
||||
code, _ := do(t, req)
|
||||
if code != http.StatusOK {
|
||||
t.Fatalf("/healthz must be reachable without auth, got %d", code)
|
||||
}
|
||||
}
|
||||
|
||||
// Soft mode: an unauthenticated request is logged but allowed through, so
|
||||
// clients can migrate without an outage.
|
||||
func TestAuthSoftAllowsUnauthenticated(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthSoft)
|
||||
req, _ := http.NewRequest("GET", h.ts.URL+okPath, nil)
|
||||
code, _ := do(t, req)
|
||||
if code != http.StatusOK {
|
||||
t.Fatalf("soft mode should allow unsigned request, got %d", code)
|
||||
}
|
||||
}
|
||||
|
||||
// Off mode (default for legacy callers): no verification at all.
|
||||
func TestAuthOffNoVerification(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthOff)
|
||||
req, _ := http.NewRequest("GET", h.ts.URL+okPath, nil)
|
||||
code, _ := do(t, req)
|
||||
if code != http.StatusOK {
|
||||
t.Fatalf("off mode should allow unsigned request, got %d", code)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
)
|
||||
|
||||
// seedRoom inserts an encrypted room owned by alice with a sealed key for her,
|
||||
// directly through the store so the test controls membership precisely. It
|
||||
// returns the room id and alice's endpoint.
|
||||
func seedRoom(t *testing.T, h *authHarness, subject string) (string, string) {
|
||||
t.Helper()
|
||||
aliceEp := frame.EndpointID(h.alice.SignPub)
|
||||
roomID := newULID()
|
||||
info := RoomInfo{RoomID: roomID, Subject: subject, OwnerEndpoint: aliceEp, Encrypt: true}
|
||||
if err := h.store.CreateRoom(info, h.alice.SignPub, h.alice.KexPub, []byte("alice-sealed-key")); err != nil {
|
||||
t.Fatalf("seed room: %v", err)
|
||||
}
|
||||
return roomID, aliceEp
|
||||
}
|
||||
|
||||
// register adds id to the bus allowlist so its signed requests clear auth and
|
||||
// reach the handler, where membership authorization (not mere registration) is
|
||||
// what the test exercises.
|
||||
func register(t *testing.T, h *authHarness, id cs.Identity, handle string) {
|
||||
t.Helper()
|
||||
if err := h.store.AddUser(hex.EncodeToString(id.SignPub), handle, RoleMember); err != nil {
|
||||
t.Fatalf("register %s: %v", handle, err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAudit_HorizontalMetadataLeak ports the auditor's H3 (Alto) finding: bob is
|
||||
// REGISTERED on the bus but is NOT a member of alice's room. Before the fix the
|
||||
// GET endpoints checked registration, not membership, so bob could read the
|
||||
// room's subject, the full member list (with everyone's public keys), alice's
|
||||
// room directory, and even alice's sealed key. Now every one of those returns
|
||||
// 403 to bob, while alice (owner/member) and carol (plain member) get 200.
|
||||
func TestAudit_HorizontalMetadataLeak(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
roomID, aliceEp := seedRoom(t, h, "secret.subject.payroll")
|
||||
|
||||
// bob: registered, never invited.
|
||||
bob, _ := cs.GenerateIdentity()
|
||||
register(t, h, bob, "bob")
|
||||
|
||||
// carol: registered AND a plain (non-owner) member — the legitimate-member edge.
|
||||
carol, _ := cs.GenerateIdentity()
|
||||
register(t, h, carol, "carol")
|
||||
carolEp := frame.EndpointID(carol.SignPub)
|
||||
if err := h.store.AddMember(roomID, Member{Endpoint: carolEp, Role: RoleMember, SignPub: carol.SignPub, KexPub: carol.KexPub}, 1, []byte("carol-sealed")); err != nil {
|
||||
t.Fatalf("add carol: %v", err)
|
||||
}
|
||||
|
||||
n := 0
|
||||
get := func(id cs.Identity, path string) int {
|
||||
n++
|
||||
code, _ := do(t, signedReq(t, h.ts.URL, "GET", path, nil, id, time.Now().Unix(), nonceN(n)))
|
||||
return code
|
||||
}
|
||||
|
||||
// Error path: bob (non-member) is forbidden on every room endpoint.
|
||||
bobChecks := []struct {
|
||||
name string
|
||||
path string
|
||||
}{
|
||||
{"get room", "/rooms/" + roomID},
|
||||
{"list members", "/rooms/" + roomID + "/members"},
|
||||
{"alice room directory", "/members/" + aliceEp + "/rooms"},
|
||||
{"alice sealed key", "/rooms/" + roomID + "/key?endpoint=" + aliceEp},
|
||||
{"bob sealed key in alices room", "/rooms/" + roomID + "/key?endpoint=" + frame.EndpointID(bob.SignPub)},
|
||||
}
|
||||
for _, c := range bobChecks {
|
||||
if code := get(bob, c.path); code != http.StatusForbidden {
|
||||
t.Fatalf("bob (non-member) %s should be 403, got %d", c.name, code)
|
||||
}
|
||||
}
|
||||
|
||||
// Golden: alice (owner/member) reads her room's metadata, members, directory, key.
|
||||
aliceChecks := []string{
|
||||
"/rooms/" + roomID,
|
||||
"/rooms/" + roomID + "/members",
|
||||
"/members/" + aliceEp + "/rooms",
|
||||
"/rooms/" + roomID + "/key?endpoint=" + aliceEp,
|
||||
}
|
||||
for _, p := range aliceChecks {
|
||||
if code := get(h.alice, p); code != http.StatusOK {
|
||||
t.Fatalf("alice (owner) %s should be 200, got %d", p, code)
|
||||
}
|
||||
}
|
||||
|
||||
// Edge: carol is a plain member, not the owner — she may still read the room.
|
||||
if code := get(carol, "/rooms/"+roomID); code != http.StatusOK {
|
||||
t.Fatalf("carol (member) get room should be 200, got %d", code)
|
||||
}
|
||||
if code := get(carol, "/rooms/"+roomID+"/members"); code != http.StatusOK {
|
||||
t.Fatalf("carol (member) list members should be 200, got %d", code)
|
||||
}
|
||||
|
||||
// Edge: carol may fetch her OWN sealed key but not alice's.
|
||||
if code := get(carol, "/rooms/"+roomID+"/key?endpoint="+carolEp); code != http.StatusOK {
|
||||
t.Fatalf("carol fetching her own key should be 200, got %d", code)
|
||||
}
|
||||
if code := get(carol, "/rooms/"+roomID+"/key?endpoint="+aliceEp); code != http.StatusForbidden {
|
||||
t.Fatalf("carol fetching alice's key should be 403, got %d", code)
|
||||
}
|
||||
}
|
||||
|
||||
// nonceN builds the nonce for the i-th signed request of a test. Each index maps
// to a different string, so the anti-replay cache never mistakes a fresh,
// legitimately-different request for a replay within one test.
func nonceN(i int) string {
	const prefix = "authz-nonce-"
	return string(strconv.AppendInt([]byte(prefix), int64(i), 10))
}
|
||||
@@ -0,0 +1,148 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// readRSSkBRaw reports this process's resident set size in kB, taken from the
// VmRSS line of /proc/self/status. It takes no *testing.T, so a background
// sampling goroutine may call it (vmRSSkB calls t.Skip, which may only run on
// the test's own goroutine). It returns 0 when the file cannot be read or no
// VmRSS line with a value field is present.
func readRSSkBRaw() int64 {
	status, err := os.ReadFile("/proc/self/status")
	if err != nil {
		return 0
	}
	for _, row := range strings.Split(string(status), "\n") {
		if !strings.HasPrefix(row, "VmRSS:") {
			continue
		}
		// The value is the second whitespace-separated field of the line.
		cols := strings.Fields(row)
		if len(cols) < 2 {
			continue
		}
		// The parse error is deliberately ignored: whatever ParseInt yields
		// is returned as-is.
		kb, _ := strconv.ParseInt(cols[1], 10, 64)
		return kb
	}
	return 0
}
|
||||
|
||||
// TestReaudit_DoSConcurrency ports the re-auditor's N2 (Medio-Alto) finding: the
|
||||
// per-request body ceiling and the per-IP rate limit do not bound the AGGREGATE
|
||||
// memory of many concurrent uploads. The auditor drove RSS to ~1.42 GB with 40
|
||||
// concurrent 16 MiB blob uploads. With the global in-flight byte limiter, the
|
||||
// number of simultaneously-buffered uploads is capped, so the resident set stays
|
||||
// bounded regardless of how many connections arrive at once.
|
||||
//
|
||||
// Coverage:
|
||||
// - golden: a normal upload succeeds, and the server is still healthy after the
|
||||
// storm (the limiter did not wedge it);
|
||||
// - edge : concurrency right at the cap is admitted;
|
||||
// - error : a concurrent flood far past the cap sheds the excess with 503
|
||||
// (backpressure) instead of buffering it all, and the RSS spike stays bounded
|
||||
// and does NOT scale with the number of requests.
|
||||
func TestReaudit_DoSConcurrency(t *testing.T) {
|
||||
if runtime.GOOS != "linux" {
|
||||
t.Skip("RSS probe is Linux-only")
|
||||
}
|
||||
srv := dosServer(t, AuthOff)
|
||||
// Force a small aggregate cap so the bound is observable in a unit test: with
|
||||
// a 16 MiB blob ceiling, 48 MiB admits ~3 concurrent uploads. Production uses
|
||||
// maxInflightBytes (128 MiB); the mechanism under test is identical.
|
||||
const cap = int64(48) << 20
|
||||
srv.inflight = newInflightLimiter(cap)
|
||||
|
||||
const blob = maxBlobBytes // 16 MiB, the per-request ceiling
|
||||
const n = 40 // the auditor's figure
|
||||
|
||||
// A spike bound: with the cap admitting ~3 concurrent 16 MiB uploads and a
|
||||
// ~2x copy factor (auth buffer + handler buffer) plus Go runtime slack, the
|
||||
// delta should stay well under this. Without the limiter, 40 concurrent
|
||||
// uploads admitted at once would add hundreds of MB (the auditor saw ~1.4 GB).
|
||||
const maxSpikeKB = int64(256) << 10 // 256 MiB
|
||||
|
||||
runtime.GC()
|
||||
before := readRSSkBRaw()
|
||||
|
||||
// Sample peak RSS while the storm runs.
|
||||
var peak int64
|
||||
atomic.StoreInt64(&peak, before)
|
||||
stop := make(chan struct{})
|
||||
var sampler sync.WaitGroup
|
||||
sampler.Add(1)
|
||||
go func() {
|
||||
defer sampler.Done()
|
||||
for {
|
||||
select {
|
||||
case <-stop:
|
||||
return
|
||||
default:
|
||||
if v := readRSSkBRaw(); v > atomic.LoadInt64(&peak) {
|
||||
atomic.StoreInt64(&peak, v)
|
||||
}
|
||||
time.Sleep(2 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
var got503, got200 int64
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < n; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
req := httptest.NewRequest(http.MethodPost, "/blobs", &zeroReader{remaining: blob})
|
||||
req.ContentLength = blob
|
||||
// Distinct source IP per request: this is the multi-IP (botnet) shape the
|
||||
// auditor flagged, where the per-IP rate limit gives no aggregate defense.
|
||||
// The in-flight byte limiter is the global bound that must hold here.
|
||||
req.RemoteAddr = "198.51.100." + strconv.Itoa(i%254+1) + ":1234"
|
||||
rec := httptest.NewRecorder()
|
||||
srv.ServeHTTP(rec, req)
|
||||
switch rec.Code {
|
||||
case http.StatusServiceUnavailable:
|
||||
atomic.AddInt64(&got503, 1)
|
||||
case http.StatusOK:
|
||||
atomic.AddInt64(&got200, 1)
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
close(stop)
|
||||
sampler.Wait()
|
||||
|
||||
runtime.GC()
|
||||
delta := atomic.LoadInt64(&peak) - before
|
||||
|
||||
// Error path: the flood must have hit the cap and shed the excess with 503.
|
||||
if got503 == 0 {
|
||||
t.Fatalf("a concurrent flood of %d uploads past the cap should shed some with 503; got 200=%d 503=%d", n, got200, got503)
|
||||
}
|
||||
// The aggregate memory must stay bounded — not scale with n.
|
||||
if delta > maxSpikeKB {
|
||||
t.Fatalf("aggregate RSS spiked %d kB under %d concurrent uploads (bound %d kB): in-flight limiter not bounding memory", delta, n, maxSpikeKB)
|
||||
}
|
||||
// All reservations released after the storm.
|
||||
if f := srv.inflight.inFlight(); f != 0 {
|
||||
t.Fatalf("after the storm inFlight = %d, want 0 (reservations leaked)", f)
|
||||
}
|
||||
|
||||
// Golden: the server is still healthy and serves a normal upload (from a fresh
|
||||
// IP so the per-IP rate limiter, untouched here, is not what we measure).
|
||||
rec := httptest.NewRecorder()
|
||||
gReq := httptest.NewRequest(http.MethodPost, "/blobs", strings.NewReader("hello after storm"))
|
||||
gReq.RemoteAddr = "203.0.113.9:9999"
|
||||
srv.ServeHTTP(rec, gReq)
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("a normal upload after the storm should be 200, got %d (%s)", rec.Code, rec.Body.String())
|
||||
}
|
||||
|
||||
t.Logf("N2 bound: %d uploads -> 200=%d 503=%d, RSS delta %d kB (bound %d kB), cap %d MiB",
|
||||
n, got200, got503, delta, maxSpikeKB, cap>>20)
|
||||
}
|
||||
@@ -0,0 +1,206 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
)
|
||||
|
||||
// dosServer builds a Server backed by a fresh store + blob store so a test can
|
||||
// drive ServeHTTP in-process (white-box) and observe its memory behavior without
|
||||
// a network round trip — the same in-process technique the auditor used.
|
||||
func dosServer(t *testing.T, mode AuthMode) *Server {
|
||||
t.Helper()
|
||||
dir := t.TempDir()
|
||||
store, err := Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open store: %v", err)
|
||||
}
|
||||
blobs, err := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
if err != nil {
|
||||
t.Fatalf("open blobs: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
return NewServer(store, blobs, mode)
|
||||
}
|
||||
|
||||
// zeroReader is a reader that hands out at most `remaining` zero bytes and then
// reports io.EOF. It only zero-fills the caller's buffer and never allocates the
// payload itself, so the test process does not materialize a huge buffer (which
// would taint the RSS measurement being made about the SERVER).
type zeroReader struct{ remaining int64 }

// Read zero-fills as much of p as the remaining budget allows, deducts that
// amount from the budget and returns it. Once the budget is exhausted it returns
// 0, io.EOF.
func (z *zeroReader) Read(p []byte) (int, error) {
	if z.remaining <= 0 {
		return 0, io.EOF
	}
	// Restrict the destination to what is still owed.
	chunk := p
	if int64(len(chunk)) > z.remaining {
		chunk = chunk[:z.remaining]
	}
	for i := range chunk {
		chunk[i] = 0
	}
	z.remaining -= int64(len(chunk))
	return len(chunk), nil
}
|
||||
|
||||
// vmRSSkB returns the resident set size (kB) of this process, read from the
// VmRSS line of /proc/self/status. The probe is Linux-only: the test is skipped
// when the file cannot be read or carries no VmRSS value. Because it may call
// t.Skip, it must run on the test's own goroutine.
func vmRSSkB(t *testing.T) int64 {
	t.Helper()
	status, err := os.ReadFile("/proc/self/status")
	if err != nil {
		t.Skipf("cannot read /proc/self/status: %v", err)
	}
	for _, row := range strings.Split(string(status), "\n") {
		if !strings.HasPrefix(row, "VmRSS:") {
			continue
		}
		// The value is the second whitespace-separated field of the line.
		cols := strings.Fields(row)
		if len(cols) < 2 {
			continue
		}
		// The parse error is deliberately ignored; the parsed value is returned as-is.
		kb, _ := strconv.ParseInt(cols[1], 10, 64)
		return kb
	}
	t.Skip("VmRSS not present in /proc/self/status")
	return 0
}
|
||||
|
||||
// TestAudit_DoSBodyLimitNoAuth ports the auditor's H1 (Critical) vector: a peer
|
||||
// with NO valid signature posts an oversized body. Before the fix the middleware
|
||||
// io.ReadAll'd it unbounded (the auditor sent 400 MB and watched RSS jump from
|
||||
// 18 MB to 898 MB). Now the request is rejected 413 and the resident set does NOT
|
||||
// spike. Two shapes are covered:
|
||||
//
|
||||
// (1) a truthful, over-ceiling Content-Length -> rejected before any byte is read;
|
||||
// (2) a lying / unknown length (chunked) -> MaxBytesReader trips mid-read,
|
||||
// capping the buffered bytes at the ceiling instead of the attacker's 400 MB.
|
||||
func TestAudit_DoSBodyLimitNoAuth(t *testing.T) {
|
||||
if runtime.GOOS != "linux" {
|
||||
t.Skip("RSS probe is Linux-only")
|
||||
}
|
||||
srv := dosServer(t, AuthEnforce) // enforce: the request carries no signature
|
||||
|
||||
const huge = int64(400) << 20 // 400 MiB — the auditor's figure
|
||||
// A spike threshold an order of magnitude below the attack. The old code would
|
||||
// add ~400 MB+; the fix keeps the delta to at most one bounded buffer.
|
||||
const maxSpikeKB = int64(96) << 10 // 96 MiB
|
||||
|
||||
// Shape 1: declared Content-Length over the blob ceiling -> early 413, no read.
|
||||
runtime.GC()
|
||||
before := vmRSSkB(t)
|
||||
req := httptest.NewRequest(http.MethodPost, "/blobs", &zeroReader{remaining: huge})
|
||||
req.ContentLength = huge
|
||||
rec := httptest.NewRecorder()
|
||||
srv.ServeHTTP(rec, req)
|
||||
if rec.Code != http.StatusRequestEntityTooLarge {
|
||||
t.Fatalf("over-declared body should be 413, got %d", rec.Code)
|
||||
}
|
||||
runtime.GC()
|
||||
if d := vmRSSkB(t) - before; d > maxSpikeKB {
|
||||
t.Fatalf("RSS spiked %d kB on a pre-declared oversized body (limit %d kB)", d, maxSpikeKB)
|
||||
}
|
||||
|
||||
// Shape 2: unknown length (chunked-style). The middleware cannot reject by
|
||||
// Content-Length, so MaxBytesReader must cap the read at maxBlobBytes.
|
||||
runtime.GC()
|
||||
before = vmRSSkB(t)
|
||||
req = httptest.NewRequest(http.MethodPost, "/blobs", &zeroReader{remaining: huge})
|
||||
req.ContentLength = -1
|
||||
rec = httptest.NewRecorder()
|
||||
srv.ServeHTTP(rec, req)
|
||||
if rec.Code != http.StatusRequestEntityTooLarge {
|
||||
t.Fatalf("unknown-length oversized body should be 413, got %d", rec.Code)
|
||||
}
|
||||
runtime.GC()
|
||||
if d := vmRSSkB(t) - before; d > maxSpikeKB {
|
||||
t.Fatalf("RSS spiked %d kB on a chunked oversized body (limit %d kB)", d, maxSpikeKB)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBlobLimitGoldenAndBoundary covers the golden path (a normal blob is stored)
|
||||
// and the boundary (a body exactly at the ceiling is accepted; one byte over by
|
||||
// truthful Content-Length is rejected before buffering).
|
||||
func TestBlobLimitGoldenAndBoundary(t *testing.T) {
|
||||
srv := dosServer(t, AuthOff) // AuthOff: the limits apply regardless of auth mode
|
||||
|
||||
// Golden: a small blob is accepted and hashed.
|
||||
rec := httptest.NewRecorder()
|
||||
srv.ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/blobs", strings.NewReader("hello blob")))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("normal blob should be 200, got %d (%s)", rec.Code, rec.Body.String())
|
||||
}
|
||||
|
||||
// Boundary: exactly at the ceiling is allowed (MaxBytesReader permits N bytes).
|
||||
atLimit := strings.Repeat("a", maxBlobBytes)
|
||||
rec = httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodPost, "/blobs", strings.NewReader(atLimit))
|
||||
req.ContentLength = int64(len(atLimit))
|
||||
srv.ServeHTTP(rec, req)
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("blob exactly at the ceiling should be 200, got %d", rec.Code)
|
||||
}
|
||||
|
||||
// Error: one byte over the ceiling (truthful Content-Length) -> 413 pre-read.
|
||||
rec = httptest.NewRecorder()
|
||||
req = httptest.NewRequest(http.MethodPost, "/blobs", &zeroReader{remaining: maxBlobBytes + 1})
|
||||
req.ContentLength = maxBlobBytes + 1
|
||||
srv.ServeHTTP(rec, req)
|
||||
if rec.Code != http.StatusRequestEntityTooLarge {
|
||||
t.Fatalf("blob one byte over the ceiling should be 413, got %d", rec.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestControlBodyLimit checks the smaller JSON ceiling on a non-blob route: a body
|
||||
// over maxControlBodyBytes is rejected 413 before the handler runs.
|
||||
func TestControlBodyLimit(t *testing.T) {
|
||||
srv := dosServer(t, AuthOff)
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodPost, "/rooms", &zeroReader{remaining: maxControlBodyBytes + 1})
|
||||
req.ContentLength = maxControlBodyBytes + 1
|
||||
srv.ServeHTTP(rec, req)
|
||||
if rec.Code != http.StatusRequestEntityTooLarge {
|
||||
t.Fatalf("control body over 1 MiB should be 413, got %d", rec.Code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestRateLimitPerIP exercises the per-IP throttle: a burst from one IP eventually
|
||||
// gets 429 (error path), while a spread across distinct IPs is never throttled
|
||||
// (edge — the bucket is keyed per source, not global).
|
||||
func TestRateLimitPerIP(t *testing.T) {
|
||||
srv := dosServer(t, AuthOff)
|
||||
|
||||
// Same IP: well past the burst -> at least one 429.
|
||||
got429 := false
|
||||
for i := 0; i < defaultRateBurst+50; i++ {
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, "/rooms/none", nil)
|
||||
req.RemoteAddr = "203.0.113.7:5555"
|
||||
srv.ServeHTTP(rec, req)
|
||||
if rec.Code == http.StatusTooManyRequests {
|
||||
got429 = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !got429 {
|
||||
t.Fatalf("a flood from one IP should eventually be rate-limited (429)")
|
||||
}
|
||||
|
||||
// Distinct IPs: each gets a fresh bucket, so none is throttled.
|
||||
for i := 0; i < 100; i++ {
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest(http.MethodGet, "/rooms/none", nil)
|
||||
req.RemoteAddr = "198.51.100." + strconv.Itoa(i%254+1) + ":4444"
|
||||
srv.ServeHTTP(rec, req)
|
||||
if rec.Code == http.StatusTooManyRequests {
|
||||
t.Fatalf("distinct IPs must not share a rate bucket; IP #%d got 429", i)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
package membership
|
||||
|
||||
import "sync/atomic"
|
||||
|
||||
// inflightLimiter is a non-blocking, byte-counting concurrency limiter: a global
// cap on how many bytes of request body the server will buffer simultaneously.
//
// The per-request body ceilings (maxControlBodyBytes / maxBlobBytes) bound a
// single request, and the per-IP rate limiter throttles a single source, but
// neither bounds the AGGREGATE memory across many concurrent uploads: the
// re-audit (report 0006, N2) showed 40 concurrent 16 MiB blob uploads driving
// RSS to ~1.42 GB, and a distributed (multi-IP) flood scales without a ceiling
// because the rate limiter is per-IP. This limiter is the missing aggregate
// bound: ServeHTTP reserves a request's worst-case buffered size before reading
// the body and releases it when the request finishes, so the total bytes in
// flight can never exceed max regardless of how many connections or source IPs
// arrive at once.
//
// It is intentionally NON-blocking: when a reservation does not fit, the caller
// sheds the request with backpressure (503) rather than parking a goroutine,
// which would let an attacker exhaust goroutines/connections instead of RAM. The
// counter is maintained with sync/atomic (a CAS loop), so it is safe for
// concurrent use without a mutex.
//
// Implementation note: this lives inside unibus rather than the fn-registry
// (where a generic concurrency primitive would normally belong) because the
// registry's functions/core package pulls in transitive dependencies that
// require CGO (mattn/go-sqlite3) and external modules, which are incompatible
// with unibus's CGO_ENABLED=0 build, and because this work is scoped to the
// unibus sub-repo.
type inflightLimiter struct {
	max  int64 // immutable after construction; <= 0 disables the limiter
	used int64 // bytes currently reserved; accessed ONLY via sync/atomic
}

// newInflightLimiter builds a limiter with a cap of maxBytes bytes in flight.
// maxBytes <= 0 disables the cap (tryAcquire always grants), which is the
// loopback/dev posture where an aggregate memory ceiling is not wanted.
func newInflightLimiter(maxBytes int64) *inflightLimiter {
	return &inflightLimiter{max: maxBytes}
}

// tryAcquire reserves n bytes without blocking. It returns true and reserves the
// bytes when they fit within the cap (used+n <= max), or false (reserving
// nothing) when they do not. n <= 0 is granted without reserving, and a disabled
// limiter (max <= 0) always grants. Safe for concurrent use.
func (l *inflightLimiter) tryAcquire(n int64) bool {
	if l.max <= 0 || n <= 0 {
		return true
	}
	for {
		cur := atomic.LoadInt64(&l.used)
		// Compare n against the remaining headroom rather than computing cur+n:
		// for a huge n the sum overflows int64, wraps negative, slips under the
		// cap and corrupts the counter. used stays within [0, max] (acquire never
		// exceeds max, release clamps at 0), so max-cur cannot overflow.
		if n > l.max-cur {
			return false
		}
		// CAS so a concurrent acquire/release between the load and the store
		// forces a retry instead of a lost update.
		if atomic.CompareAndSwapInt64(&l.used, cur, cur+n) {
			return true
		}
	}
}

// release returns n previously reserved bytes. It must be paired with a
// tryAcquire that granted. A disabled limiter or n <= 0 is a no-op. The counter
// never drops below zero (a defensive clamp against an accidental double release).
func (l *inflightLimiter) release(n int64) {
	if l.max <= 0 || n <= 0 {
		return
	}
	for {
		cur := atomic.LoadInt64(&l.used)
		nv := cur - n
		if nv < 0 {
			nv = 0
		}
		if atomic.CompareAndSwapInt64(&l.used, cur, nv) {
			return
		}
	}
}

// inFlight returns the bytes currently reserved. It is observability for tests
// and metrics.
func (l *inflightLimiter) inFlight() int64 {
	return atomic.LoadInt64(&l.used)
}
|
||||
@@ -0,0 +1,97 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestInflightLimiterBasics covers the limiter contract: granting within the cap
|
||||
// (golden), the exact boundary (edge), refusal over the cap without mutating the
|
||||
// counter (error), the disabled mode, and the defensive clamp on over-release.
|
||||
func TestInflightLimiterBasics(t *testing.T) {
|
||||
l := newInflightLimiter(100)
|
||||
|
||||
// Golden: a reservation within the cap is granted and reflected.
|
||||
if !l.tryAcquire(60) {
|
||||
t.Fatalf("acquire 60 within cap 100 should grant")
|
||||
}
|
||||
if l.inFlight() != 60 {
|
||||
t.Fatalf("inFlight = %d, want 60", l.inFlight())
|
||||
}
|
||||
|
||||
// Edge: exactly reaching the cap (60+40 == 100) is granted.
|
||||
if !l.tryAcquire(40) {
|
||||
t.Fatalf("acquire to the exact cap should grant")
|
||||
}
|
||||
if l.inFlight() != 100 {
|
||||
t.Fatalf("inFlight = %d, want 100", l.inFlight())
|
||||
}
|
||||
|
||||
// Error: one more byte over the full cap is refused, and the counter is left
|
||||
// untouched (a refused reservation reserves nothing).
|
||||
if l.tryAcquire(1) {
|
||||
t.Fatalf("acquire over a full cap must be refused")
|
||||
}
|
||||
if l.inFlight() != 100 {
|
||||
t.Fatalf("a refused acquire must not change inFlight; got %d", l.inFlight())
|
||||
}
|
||||
|
||||
// Release frees capacity again.
|
||||
l.release(100)
|
||||
if l.inFlight() != 0 {
|
||||
t.Fatalf("inFlight after full release = %d, want 0", l.inFlight())
|
||||
}
|
||||
|
||||
// Defensive: an over-release never drives the counter negative.
|
||||
l.release(50)
|
||||
if l.inFlight() != 0 {
|
||||
t.Fatalf("over-release must clamp at 0; got %d", l.inFlight())
|
||||
}
|
||||
}
|
||||
|
||||
// TestInflightLimiterDisabled verifies that a non-positive cap disables the
|
||||
// limiter: every reservation is granted and nothing is tracked (the loopback/dev
|
||||
// posture).
|
||||
func TestInflightLimiterDisabled(t *testing.T) {
|
||||
for _, max := range []int64{0, -1} {
|
||||
l := newInflightLimiter(max)
|
||||
if !l.tryAcquire(1 << 30) {
|
||||
t.Fatalf("disabled limiter (max=%d) must always grant", max)
|
||||
}
|
||||
if l.inFlight() != 0 {
|
||||
t.Fatalf("disabled limiter must not track usage; got %d", l.inFlight())
|
||||
}
|
||||
l.release(1 << 30) // no-op, must not panic
|
||||
}
|
||||
}
|
||||
|
||||
// TestInflightLimiterConcurrent hammers the limiter from many goroutines with
|
||||
// equal-sized acquire/release pairs and asserts the invariant never breaks: the
|
||||
// counter returns to 0 and never exceeds the cap. Run with -race for the memory
|
||||
// model guarantee.
|
||||
func TestInflightLimiterConcurrent(t *testing.T) {
|
||||
const cap = 1000
|
||||
const chunk = 7
|
||||
l := newInflightLimiter(cap)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for g := 0; g < 64; g++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for i := 0; i < 2000; i++ {
|
||||
if l.tryAcquire(chunk) {
|
||||
if f := l.inFlight(); f > cap {
|
||||
t.Errorf("inFlight %d exceeded cap %d", f, cap)
|
||||
return
|
||||
}
|
||||
l.release(chunk)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
wg.Wait()
|
||||
if l.inFlight() != 0 {
|
||||
t.Fatalf("after all goroutines, inFlight = %d, want 0", l.inFlight())
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,656 @@
|
||||
package membership
|
||||
|
||||
// jetstreamStore is the JetStream KV implementation of Store (issue 0003b): the
|
||||
// control-plane state (rooms, members, sealed room keys, the user allowlist)
|
||||
// lives in replicated JetStream Key/Value buckets instead of a process-local
|
||||
// SQLite file. Any node in the cluster reads and writes the same buckets, and
|
||||
// JetStream's RAFT layer keeps them consistent across replicas, so the HTTP
|
||||
// control plane becomes effectively stateless: any membershipd can serve any
|
||||
// request. It is selected only when the `decentralized` flag is on; sqliteStore
|
||||
// stays the default.
|
||||
//
|
||||
// Key layout (every path segment is a single KV token — ULIDs, RawURL endpoint
|
||||
// ids and lowercase-hex keys never contain a '.', so '.' is a safe separator and
|
||||
// a "<prefix>.*" watch enumerates exactly one trailing token):
|
||||
//
|
||||
// rooms roomID -> RoomInfo (JSON)
|
||||
// members roomID.endpoint -> Member (JSON, carries Role)
|
||||
// rooms_by_member endpoint.roomID -> role (reverse index for ListRoomsForEndpoint)
|
||||
// room_keys roomID.endpoint.epoch -> sealed_key bytes
|
||||
// users signPubHex -> User (JSON)
|
||||
//
|
||||
// Consistency caveat: KV has no multi-key transaction, so a multi-write op
|
||||
// (CreateRoom, AddMember) is a short sequence of single-key writes. The order is
|
||||
// chosen so a partial failure leaves a recoverable state (the room/member row
|
||||
// before its reverse index or sealed key), and writes are idempotent (Put
|
||||
// overwrites), which is also what makes the SQLite->KV migration (0003c) safe to
|
||||
// re-run.
|
||||
//
|
||||
// Fail-closed: every read uses a bounded context, and IsAuthorized/HasAdmin
|
||||
// return false on ANY backend error (a KV quorum loss or timeout denies access
|
||||
// rather than admitting it), mirroring the SQLite store's behavior.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// Names of the five KV buckets. Only alphanumerics, dashes and underscores are
// used — no dots, per KV rules.
const (
	bucketRooms    = "UNIBUS_rooms"
	bucketMembers  = "UNIBUS_members"
	bucketByMember = "UNIBUS_rooms_by_member"
	bucketRoomKeys = "UNIBUS_room_keys"
	bucketUsers    = "UNIBUS_users"
)

// defaultKVOpTime is the per-operation timeout applied when
// JetStreamConfig.OpTimeout is not set.
const defaultKVOpTime = 5 * time.Second
|
||||
|
||||
// JetStreamConfig carries the tunables of the KV-backed store opened by
// OpenJetStream.
type JetStreamConfig struct {
	// Replicas is the replication factor applied to every bucket (R1..R5).
	// Pick 1 for a single node or a 1-2 node rollout and 3 for real HA
	// (quorum 2/3). Growing R1->R3 in place is an operational step
	// (nats kv update) performed when the third node joins; the store does not
	// need to be reopened for it. OpenJetStream treats a value <= 0 as 1.
	Replicas int

	// OpTimeout is the deadline given to each KV operation, so a stalled
	// backend fails closed instead of hanging a request. A value <= 0 selects
	// defaultKVOpTime.
	OpTimeout time.Duration
}
|
||||
|
||||
type jetstreamStore struct {
|
||||
rooms jetstream.KeyValue
|
||||
members jetstream.KeyValue
|
||||
byMember jetstream.KeyValue
|
||||
keys jetstream.KeyValue
|
||||
users jetstream.KeyValue
|
||||
opTimeout time.Duration
|
||||
}
|
||||
|
||||
// OpenJetStream creates (or opens) the five KV buckets on js with the configured
|
||||
// replication factor and returns a Store backed by them. The JetStream context
|
||||
// belongs to the caller (it owns the NATS connection); Close is a no-op.
|
||||
func OpenJetStream(js jetstream.JetStream, cfg JetStreamConfig) (Store, error) {
|
||||
if cfg.Replicas <= 0 {
|
||||
cfg.Replicas = 1
|
||||
}
|
||||
opTimeout := cfg.OpTimeout
|
||||
if opTimeout <= 0 {
|
||||
opTimeout = defaultKVOpTime
|
||||
}
|
||||
// Bootstrap budget for creating/opening the buckets. On a single node JetStream
|
||||
// is ready the instant the server starts, so the first attempt succeeds. On a
|
||||
// COLD multi-node cluster the JetStream meta-group must first elect a leader and
|
||||
// each node must establish contact with it before its $JS.API responds. A KV
|
||||
// op is a NATS request/reply: if it is published before the node's JetStream is
|
||||
// ready the request is dropped (not queued), and a single long-context call then
|
||||
// just blocks until it times out (issue 0006g). So we RETRY each bucket op with
|
||||
// short per-attempt contexts until it succeeds or the overall bootstrap budget
|
||||
// is exhausted; once the cluster is ready the next retry lands and the buckets
|
||||
// are created, after which they persist and every node opens them quickly.
|
||||
bootstrapBudget := 120 * time.Second
|
||||
deadline := time.Now().Add(bootstrapBudget)
|
||||
|
||||
s := &jetstreamStore{opTimeout: opTimeout}
|
||||
for _, b := range []struct {
|
||||
name string
|
||||
dst *jetstream.KeyValue
|
||||
}{
|
||||
{bucketRooms, &s.rooms},
|
||||
{bucketMembers, &s.members},
|
||||
{bucketByMember, &s.byMember},
|
||||
{bucketRoomKeys, &s.keys},
|
||||
{bucketUsers, &s.users},
|
||||
} {
|
||||
var kv jetstream.KeyValue
|
||||
var lastErr error
|
||||
for {
|
||||
opCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
kv, lastErr = js.CreateOrUpdateKeyValue(opCtx, jetstream.KeyValueConfig{
|
||||
Bucket: b.name,
|
||||
Replicas: cfg.Replicas,
|
||||
History: 1,
|
||||
Storage: jetstream.FileStorage,
|
||||
})
|
||||
cancel()
|
||||
if lastErr == nil {
|
||||
break
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
return nil, fmt.Errorf("membership: open KV bucket %q (replicas=%d) after %s: %w", b.name, cfg.Replicas, bootstrapBudget, lastErr)
|
||||
}
|
||||
// JetStream not ready yet (no meta leader / request dropped). Wait and
|
||||
// re-publish the op; in a cluster cold start this lands once the meta
|
||||
// group settles.
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
*b.dst = kv
|
||||
}
|
||||
return s, nil
|
||||
}
|
||||
|
||||
// Close releases nothing: the JetStream context and NATS connection are owned by
|
||||
// the caller, which closes them on shutdown.
|
||||
func (s *jetstreamStore) Close() error { return nil }
|
||||
|
||||
func (s *jetstreamStore) ctx() (context.Context, context.CancelFunc) {
|
||||
return context.WithTimeout(context.Background(), s.opTimeout)
|
||||
}
|
||||
|
||||
// ---- key helpers ----------------------------------------------------------
|
||||
|
||||
// memberKey builds the members-bucket key "<roomID>.<endpoint>".
func memberKey(roomID, endpoint string) string {
	return strings.Join([]string{roomID, endpoint}, ".")
}
|
||||
// byMemberKey builds the reverse-index key "<endpoint>.<roomID>".
func byMemberKey(endpoint, roomID string) string {
	return strings.Join([]string{endpoint, roomID}, ".")
}
|
||||
// sealedKey builds the room_keys-bucket key "<roomID>.<endpoint>.<epoch>", with
// the epoch e rendered in decimal.
func sealedKey(roomID, endpoint string, e int) string {
	parts := []string{roomID, endpoint, strconv.Itoa(e)}
	return strings.Join(parts, ".")
}
|
||||
|
||||
// watchEntries collects every current entry whose key matches pattern (a KV
|
||||
// watch with a "<prefix>.*" wildcard), draining the watcher until the nil marker
|
||||
// that signals "all initial values delivered". Tombstones are skipped.
|
||||
func (s *jetstreamStore) watchEntries(kv jetstream.KeyValue, pattern string) ([]jetstream.KeyValueEntry, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
w, err := kv.Watch(ctx, pattern, jetstream.IgnoreDeletes())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer w.Stop()
|
||||
var out []jetstream.KeyValueEntry
|
||||
for {
|
||||
select {
|
||||
case e := <-w.Updates():
|
||||
if e == nil {
|
||||
return out, nil // initial snapshot complete
|
||||
}
|
||||
out = append(out, e)
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---- rooms / members / keys ----------------------------------------------
|
||||
|
||||
func (s *jetstreamStore) CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealedKey []byte) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
|
||||
info.Epoch = 1
|
||||
roomJSON, err := json.Marshal(info)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal room: %w", err)
|
||||
}
|
||||
// Create (not Put) so a duplicate room id is rejected, matching SQLite's
|
||||
// PRIMARY KEY behavior.
|
||||
if _, err := s.rooms.Create(ctx, info.RoomID, roomJSON); err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyExists) {
|
||||
return fmt.Errorf("membership: room %q already exists", info.RoomID)
|
||||
}
|
||||
return fmt.Errorf("membership: create room: %w", err)
|
||||
}
|
||||
|
||||
owner := Member{Endpoint: info.OwnerEndpoint, Role: "owner", SignPub: ownerSignPub, KexPub: ownerKexPub}
|
||||
if err := s.putMember(ctx, info.RoomID, owner); err != nil {
|
||||
return err
|
||||
}
|
||||
if info.Encrypt {
|
||||
if _, err := s.keys.Put(ctx, sealedKey(info.RoomID, info.OwnerEndpoint, 1), ownerSealedKey); err != nil {
|
||||
return fmt.Errorf("membership: put owner key: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// putMember writes the member row and its reverse index together.
|
||||
func (s *jetstreamStore) putMember(ctx context.Context, roomID string, m Member) error {
|
||||
mb, err := json.Marshal(m)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal member: %w", err)
|
||||
}
|
||||
if _, err := s.members.Put(ctx, memberKey(roomID, m.Endpoint), mb); err != nil {
|
||||
return fmt.Errorf("membership: put member: %w", err)
|
||||
}
|
||||
if _, err := s.byMember.Put(ctx, byMemberKey(m.Endpoint, roomID), []byte(m.Role)); err != nil {
|
||||
return fmt.Errorf("membership: put member index: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) GetRoom(roomID string) (RoomInfo, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.rooms.Get(ctx, roomID)
|
||||
if err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return RoomInfo{}, fmt.Errorf("membership: get room %q: %w", roomID, ErrNotFound)
|
||||
}
|
||||
return RoomInfo{}, fmt.Errorf("membership: get room %q: %w", roomID, err)
|
||||
}
|
||||
var info RoomInfo
|
||||
if err := json.Unmarshal(e.Value(), &info); err != nil {
|
||||
return RoomInfo{}, fmt.Errorf("membership: unmarshal room %q: %w", roomID, err)
|
||||
}
|
||||
return info, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) AddMember(roomID string, m Member, epoch int, sealedKeyBytes []byte) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if err := s.putMember(ctx, roomID, m); err != nil {
|
||||
return err
|
||||
}
|
||||
if len(sealedKeyBytes) > 0 {
|
||||
if _, err := s.keys.Put(ctx, sealedKey(roomID, m.Endpoint, epoch), sealedKeyBytes); err != nil {
|
||||
return fmt.Errorf("membership: put member key: %w", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) GetMember(roomID, endpoint string) (Member, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.members.Get(ctx, memberKey(roomID, endpoint))
|
||||
if err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return Member{}, fmt.Errorf("membership: get member %q/%q: %w", roomID, endpoint, ErrNotFound)
|
||||
}
|
||||
return Member{}, fmt.Errorf("membership: get member %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
var m Member
|
||||
if err := json.Unmarshal(e.Value(), &m); err != nil {
|
||||
return Member{}, fmt.Errorf("membership: unmarshal member: %w", err)
|
||||
}
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) ListMembers(roomID string) ([]Member, error) {
|
||||
entries, err := s.watchEntries(s.members, roomID+".*")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: list members %q: %w", roomID, err)
|
||||
}
|
||||
out := make([]Member, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
var m Member
|
||||
if err := json.Unmarshal(e.Value(), &m); err != nil {
|
||||
return nil, fmt.Errorf("membership: unmarshal member: %w", err)
|
||||
}
|
||||
out = append(out, m)
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool { return out[i].Endpoint < out[j].Endpoint })
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) ListRoomsForEndpoint(endpoint string) ([]RoomMembership, error) {
|
||||
entries, err := s.watchEntries(s.byMember, endpoint+".*")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: list rooms for endpoint %q: %w", endpoint, err)
|
||||
}
|
||||
out := make([]RoomMembership, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
// Key is "<endpoint>.<roomID>"; the roomID is everything after the dot.
|
||||
roomID := e.Key()[len(endpoint)+1:]
|
||||
info, err := s.GetRoom(roomID)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNotFound) {
|
||||
continue // index points at a removed room: skip, stay consistent
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, RoomMembership{RoomInfo: info, Role: string(e.Value())})
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool { return out[i].RoomID < out[j].RoomID })
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, error) {
|
||||
if epoch > 0 {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.keys.Get(ctx, sealedKey(roomID, endpoint, epoch))
|
||||
if err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return 0, nil, fmt.Errorf("membership: get sealed key %q/%q@%d: %w", roomID, endpoint, epoch, ErrNotFound)
|
||||
}
|
||||
return 0, nil, fmt.Errorf("membership: get sealed key %q/%q@%d: %w", roomID, endpoint, epoch, err)
|
||||
}
|
||||
return epoch, e.Value(), nil
|
||||
}
|
||||
// epoch <= 0: latest. Enumerate "<roomID>.<endpoint>.*" and take the max.
|
||||
entries, err := s.watchEntries(s.keys, roomID+"."+endpoint+".*")
|
||||
if err != nil {
|
||||
return 0, nil, fmt.Errorf("membership: get latest sealed key %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
bestEpoch, bestVal := -1, []byte(nil)
|
||||
for _, e := range entries {
|
||||
k := e.Key()
|
||||
ep, perr := strconv.Atoi(k[len(roomID)+1+len(endpoint)+1:])
|
||||
if perr != nil {
|
||||
continue
|
||||
}
|
||||
if ep > bestEpoch {
|
||||
bestEpoch, bestVal = ep, e.Value()
|
||||
}
|
||||
}
|
||||
if bestEpoch < 0 {
|
||||
return 0, nil, fmt.Errorf("membership: get latest sealed key %q/%q: %w", roomID, endpoint, ErrNotFound)
|
||||
}
|
||||
return bestEpoch, bestVal, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) PutSealedKeys(roomID string, epoch int, keys map[string][]byte) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
for endpoint, sealed := range keys {
|
||||
if _, err := s.keys.Put(ctx, sealedKey(roomID, endpoint, epoch), sealed); err != nil {
|
||||
return fmt.Errorf("membership: put sealed key for %q: %w", endpoint, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) BumpEpoch(roomID string, newEpoch int) error {
|
||||
// Read-modify-write the room's epoch. The control plane serializes rekeys per
|
||||
// room (owner-signed), so the lost-update window is not exercised in practice.
|
||||
info, err := s.GetRoom(roomID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: bump epoch %q->%d: %w", roomID, newEpoch, err)
|
||||
}
|
||||
info.Epoch = newEpoch
|
||||
b, err := json.Marshal(info)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal room: %w", err)
|
||||
}
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if _, err := s.rooms.Put(ctx, roomID, b); err != nil {
|
||||
return fmt.Errorf("membership: bump epoch %q->%d: %w", roomID, newEpoch, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) RemoveMember(roomID, endpoint string) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
// Drop the member row and its reverse index. Past-epoch sealed keys are left
|
||||
// intact (they only decrypt data the member could already read), matching the
|
||||
// SQLite store.
|
||||
if err := s.members.Delete(ctx, memberKey(roomID, endpoint)); err != nil && !errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return fmt.Errorf("membership: remove member %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
if err := s.byMember.Delete(ctx, byMemberKey(endpoint, roomID)); err != nil && !errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return fmt.Errorf("membership: remove member index %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- users (the bus allowlist) -------------------------------------------
|
||||
|
||||
func (s *jetstreamStore) AddUser(signPub, handle, role string) error {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
if signPub == "" || handle == "" {
|
||||
return fmt.Errorf("membership: AddUser: sign_pub and handle required")
|
||||
}
|
||||
if role == "" {
|
||||
role = RoleMember
|
||||
}
|
||||
if role != RoleAdmin && role != RoleMember {
|
||||
return fmt.Errorf("membership: AddUser: invalid role %q (want %q or %q)", role, RoleAdmin, RoleMember)
|
||||
}
|
||||
u := User{SignPub: signPub, Handle: handle, Role: role, Status: StatusActive, CreatedAt: nowRFC3339()}
|
||||
b, err := json.Marshal(u)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal user: %w", err)
|
||||
}
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if _, err := s.users.Create(ctx, signPub, b); err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyExists) {
|
||||
return ErrUserExists
|
||||
}
|
||||
return fmt.Errorf("membership: insert user: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) GetUser(signPub string) (User, error) {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.users.Get(ctx, signPub)
|
||||
if err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyNotFound) {
|
||||
return User{}, fmt.Errorf("membership: get user %q: %w", signPub, ErrNotFound)
|
||||
}
|
||||
return User{}, fmt.Errorf("membership: get user %q: %w", signPub, err)
|
||||
}
|
||||
var u User
|
||||
if err := json.Unmarshal(e.Value(), &u); err != nil {
|
||||
return User{}, fmt.Errorf("membership: unmarshal user: %w", err)
|
||||
}
|
||||
return u, nil
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) ListUsers() ([]User, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
w, err := s.users.WatchAll(ctx, jetstream.IgnoreDeletes())
|
||||
if err != nil {
|
||||
cancel()
|
||||
return nil, fmt.Errorf("membership: list users: %w", err)
|
||||
}
|
||||
defer cancel()
|
||||
defer w.Stop()
|
||||
var out []User
|
||||
for {
|
||||
select {
|
||||
case e := <-w.Updates():
|
||||
if e == nil {
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
if out[i].Handle != out[j].Handle {
|
||||
return out[i].Handle < out[j].Handle
|
||||
}
|
||||
return out[i].SignPub < out[j].SignPub
|
||||
})
|
||||
return out, nil
|
||||
}
|
||||
var u User
|
||||
if err := json.Unmarshal(e.Value(), &u); err != nil {
|
||||
return nil, fmt.Errorf("membership: unmarshal user: %w", err)
|
||||
}
|
||||
out = append(out, u)
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *jetstreamStore) RevokeUser(signPub string) error {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
u, err := s.GetUser(signPub)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNotFound) {
|
||||
return fmt.Errorf("membership: revoke user %q: no active user with that key", signPub)
|
||||
}
|
||||
return fmt.Errorf("membership: revoke user %q: %w", signPub, err)
|
||||
}
|
||||
if u.Status != StatusActive {
|
||||
return fmt.Errorf("membership: revoke user %q: no active user with that key", signPub)
|
||||
}
|
||||
u.Status = StatusRevoked
|
||||
u.RevokedAt = nowRFC3339()
|
||||
b, err := json.Marshal(u)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: marshal user: %w", err)
|
||||
}
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
if _, err := s.users.Put(ctx, signPub, b); err != nil {
|
||||
return fmt.Errorf("membership: revoke user %q: %w", signPub, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsAuthorized reports whether signPub is an active bus user. Any backend error
|
||||
// (including a KV quorum loss or timeout) yields false: fail closed.
|
||||
func (s *jetstreamStore) IsAuthorized(signPub string) bool {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
if signPub == "" {
|
||||
return false
|
||||
}
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
e, err := s.users.Get(ctx, signPub)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
var u User
|
||||
if err := json.Unmarshal(e.Value(), &u); err != nil {
|
||||
return false
|
||||
}
|
||||
return u.Status == StatusActive
|
||||
}
|
||||
|
||||
// HasAdmin reports whether at least one active admin exists. On any backend
|
||||
// error it returns false, keeping the admin-gated endpoints closed (conservative).
|
||||
func (s *jetstreamStore) HasAdmin() bool {
|
||||
users, err := s.ListUsers()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
for _, u := range users {
|
||||
if u.Role == RoleAdmin && u.Status == StatusActive {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ---- snapshot import / export (issue 0003c migration) ---------------------
|
||||
|
||||
// importSnapshot writes a full Snapshot into the KV buckets, preserving each
|
||||
// room's epoch and each user's status (Put, not CreateRoom/AddUser, so the exact
|
||||
// state is reproduced rather than reset to defaults). Idempotent: every write is
|
||||
// an overwrite, so re-running the migration converges.
|
||||
func (s *jetstreamStore) importSnapshot(snap *Snapshot) error {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
for _, r := range snap.Rooms {
|
||||
b, err := json.Marshal(r)
|
||||
if err != nil {
|
||||
return fmt.Errorf("import: marshal room %q: %w", r.RoomID, err)
|
||||
}
|
||||
if _, err := s.rooms.Put(ctx, r.RoomID, b); err != nil {
|
||||
return fmt.Errorf("import: put room %q: %w", r.RoomID, err)
|
||||
}
|
||||
}
|
||||
for roomID, members := range snap.Members {
|
||||
for _, m := range members {
|
||||
if err := s.putMember(ctx, roomID, m); err != nil {
|
||||
return fmt.Errorf("import: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, rec := range snap.Keys {
|
||||
if _, err := s.keys.Put(ctx, sealedKey(rec.RoomID, rec.Endpoint, rec.Epoch), rec.Sealed); err != nil {
|
||||
return fmt.Errorf("import: put key %q/%q@%d: %w", rec.RoomID, rec.Endpoint, rec.Epoch, err)
|
||||
}
|
||||
}
|
||||
for _, u := range snap.Users {
|
||||
b, err := json.Marshal(u)
|
||||
if err != nil {
|
||||
return fmt.Errorf("import: marshal user %q: %w", u.SignPub, err)
|
||||
}
|
||||
if _, err := s.users.Put(ctx, normalizeSignPub(u.SignPub), b); err != nil {
|
||||
return fmt.Errorf("import: put user %q: %w", u.SignPub, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ExportSnapshot reads the entire KV control-plane state back into a Snapshot,
|
||||
// so the migration's parity test can compare it against the SQLite source.
|
||||
func (s *jetstreamStore) ExportSnapshot() (*Snapshot, error) {
|
||||
snap := &Snapshot{Members: map[string][]Member{}}
|
||||
|
||||
roomEntries, err := s.watchAll(s.rooms)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export kv: rooms: %w", err)
|
||||
}
|
||||
for _, e := range roomEntries {
|
||||
var r RoomInfo
|
||||
if err := json.Unmarshal(e.Value(), &r); err != nil {
|
||||
return nil, fmt.Errorf("export kv: unmarshal room: %w", err)
|
||||
}
|
||||
snap.Rooms = append(snap.Rooms, r)
|
||||
}
|
||||
|
||||
memberEntries, err := s.watchAll(s.members)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export kv: members: %w", err)
|
||||
}
|
||||
for _, e := range memberEntries {
|
||||
// Key is "<roomID>.<endpoint>"; neither segment contains a dot.
|
||||
roomID := strings.SplitN(e.Key(), ".", 2)[0]
|
||||
var m Member
|
||||
if err := json.Unmarshal(e.Value(), &m); err != nil {
|
||||
return nil, fmt.Errorf("export kv: unmarshal member: %w", err)
|
||||
}
|
||||
snap.Members[roomID] = append(snap.Members[roomID], m)
|
||||
}
|
||||
|
||||
keyEntries, err := s.watchAll(s.keys)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export kv: keys: %w", err)
|
||||
}
|
||||
for _, e := range keyEntries {
|
||||
// Key is "<roomID>.<endpoint>.<epoch>".
|
||||
parts := strings.Split(e.Key(), ".")
|
||||
if len(parts) != 3 {
|
||||
continue
|
||||
}
|
||||
epoch, err := strconv.Atoi(parts[2])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
snap.Keys = append(snap.Keys, SealedKeyRecord{RoomID: parts[0], Endpoint: parts[1], Epoch: epoch, Sealed: e.Value()})
|
||||
}
|
||||
|
||||
users, err := s.ListUsers()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export kv: users: %w", err)
|
||||
}
|
||||
snap.Users = users
|
||||
return snap, nil
|
||||
}
|
||||
|
||||
// watchAll collects every current entry of a bucket (no key filter), draining
|
||||
// the watcher to its initial-snapshot nil marker.
|
||||
func (s *jetstreamStore) watchAll(kv jetstream.KeyValue) ([]jetstream.KeyValueEntry, error) {
|
||||
ctx, cancel := s.ctx()
|
||||
defer cancel()
|
||||
w, err := kv.WatchAll(ctx, jetstream.IgnoreDeletes())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer w.Stop()
|
||||
var out []jetstream.KeyValueEntry
|
||||
for {
|
||||
select {
|
||||
case e := <-w.Updates():
|
||||
if e == nil {
|
||||
return out, nil
|
||||
}
|
||||
out = append(out, e)
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,275 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"net"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// kvFreePort asks the kernel for an unused loopback TCP port by listening on
// port 0, then closes the listener and returns the port number. The port is
// free again by the time the caller binds it, so another process could in
// principle take it in between.
func kvFreePort(t *testing.T) int {
	t.Helper()

	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("free port: %v", err)
	}
	port := ln.Addr().(*net.TCPAddr).Port
	ln.Close()
	return port
}
|
||||
|
||||
// newKVStore boots a single-node embedded NATS with JetStream and opens a
|
||||
// jetstreamStore (R1) over it, returning the store plus the server and
|
||||
// connection so a test can shut the backend down to exercise fail-closed paths.
|
||||
func newKVStore(t *testing.T) (*jetstreamStore, *server.Server, *nats.Conn) {
|
||||
t.Helper()
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: kvFreePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("embedded nats: %v", err)
|
||||
}
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
t.Fatalf("nats connect: %v", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
st, err := OpenJetStream(js, JetStreamConfig{Replicas: 1, OpTimeout: 2 * time.Second})
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("open jetstream store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
ns.WaitForShutdown()
|
||||
})
|
||||
return st.(*jetstreamStore), ns, nc
|
||||
}
|
||||
|
||||
// TestJetStreamStoreRoomsCRUD is the golden path: an encrypted room with an owner
|
||||
// and an invited member round-trips through every room/member/key method.
|
||||
func TestJetStreamStoreRoomsCRUD(t *testing.T) {
|
||||
s, _, _ := newKVStore(t)
|
||||
|
||||
roomID := newULID()
|
||||
owner := "owner-ep-1"
|
||||
info := RoomInfo{RoomID: roomID, Subject: "room.kv", Encrypt: true, Persist: true, SignMsgs: true, OwnerEndpoint: owner}
|
||||
ownerSealed := []byte("sealed-owner-epoch1")
|
||||
if err := s.CreateRoom(info, []byte("owner-sign"), []byte("owner-kex"), ownerSealed); err != nil {
|
||||
t.Fatalf("CreateRoom: %v", err)
|
||||
}
|
||||
|
||||
// GetRoom returns epoch 1 and the policy.
|
||||
got, err := s.GetRoom(roomID)
|
||||
if err != nil {
|
||||
t.Fatalf("GetRoom: %v", err)
|
||||
}
|
||||
if got.Epoch != 1 || got.Subject != "room.kv" || !got.Encrypt || got.OwnerEndpoint != owner {
|
||||
t.Fatalf("GetRoom mismatch: %+v", got)
|
||||
}
|
||||
|
||||
// Owner is a member with role "owner".
|
||||
om, err := s.GetMember(roomID, owner)
|
||||
if err != nil {
|
||||
t.Fatalf("GetMember owner: %v", err)
|
||||
}
|
||||
if om.Role != "owner" || !bytes.Equal(om.SignPub, []byte("owner-sign")) {
|
||||
t.Fatalf("owner member mismatch: %+v", om)
|
||||
}
|
||||
|
||||
// Owner's sealed key at epoch 1.
|
||||
ep, sealed, err := s.GetSealedKey(roomID, owner, 1)
|
||||
if err != nil || ep != 1 || !bytes.Equal(sealed, ownerSealed) {
|
||||
t.Fatalf("GetSealedKey owner: ep=%d sealed=%q err=%v", ep, sealed, err)
|
||||
}
|
||||
|
||||
// Invite a member with a sealed key at epoch 1.
|
||||
bob := "member-ep-bob"
|
||||
bobSealed := []byte("sealed-bob-epoch1")
|
||||
if err := s.AddMember(roomID, Member{Endpoint: bob, Role: "member", SignPub: []byte("bob-sign"), KexPub: []byte("bob-kex")}, 1, bobSealed); err != nil {
|
||||
t.Fatalf("AddMember: %v", err)
|
||||
}
|
||||
|
||||
// ListMembers returns both, sorted by endpoint.
|
||||
members, err := s.ListMembers(roomID)
|
||||
if err != nil {
|
||||
t.Fatalf("ListMembers: %v", err)
|
||||
}
|
||||
if len(members) != 2 {
|
||||
t.Fatalf("ListMembers want 2, got %d (%+v)", len(members), members)
|
||||
}
|
||||
|
||||
// Bob can find the room via the reverse index.
|
||||
rooms, err := s.ListRoomsForEndpoint(bob)
|
||||
if err != nil {
|
||||
t.Fatalf("ListRoomsForEndpoint: %v", err)
|
||||
}
|
||||
if len(rooms) != 1 || rooms[0].RoomID != roomID || rooms[0].Role != "member" {
|
||||
t.Fatalf("ListRoomsForEndpoint mismatch: %+v", rooms)
|
||||
}
|
||||
|
||||
// Latest sealed key (epoch <= 0) resolves to epoch 1 for bob.
|
||||
lep, lsealed, err := s.GetSealedKey(roomID, bob, 0)
|
||||
if err != nil || lep != 1 || !bytes.Equal(lsealed, bobSealed) {
|
||||
t.Fatalf("GetSealedKey latest bob: ep=%d err=%v", lep, err)
|
||||
}
|
||||
|
||||
// Rekey to epoch 2 (bump + new sealed keys), then latest resolves to 2.
|
||||
if err := s.BumpEpoch(roomID, 2); err != nil {
|
||||
t.Fatalf("BumpEpoch: %v", err)
|
||||
}
|
||||
if err := s.PutSealedKeys(roomID, 2, map[string][]byte{owner: []byte("owner-epoch2")}); err != nil {
|
||||
t.Fatalf("PutSealedKeys: %v", err)
|
||||
}
|
||||
got2, _ := s.GetRoom(roomID)
|
||||
if got2.Epoch != 2 {
|
||||
t.Fatalf("after BumpEpoch want epoch 2, got %d", got2.Epoch)
|
||||
}
|
||||
lep2, _, err := s.GetSealedKey(roomID, owner, 0)
|
||||
if err != nil || lep2 != 2 {
|
||||
t.Fatalf("latest owner key after rekey: ep=%d err=%v", lep2, err)
|
||||
}
|
||||
|
||||
// Remove bob; he disappears from members and his reverse index.
|
||||
if err := s.RemoveMember(roomID, bob); err != nil {
|
||||
t.Fatalf("RemoveMember: %v", err)
|
||||
}
|
||||
if _, err := s.GetMember(roomID, bob); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetMember after remove want ErrNotFound, got %v", err)
|
||||
}
|
||||
rooms2, _ := s.ListRoomsForEndpoint(bob)
|
||||
if len(rooms2) != 0 {
|
||||
t.Fatalf("ListRoomsForEndpoint after remove want 0, got %d", len(rooms2))
|
||||
}
|
||||
}
|
||||
|
||||
// TestJetStreamStoreUsers exercises the allowlist: add, lookup, authorize,
|
||||
// revoke (which flips IsAuthorized), and the admin gate.
|
||||
func TestJetStreamStoreUsers(t *testing.T) {
|
||||
s, _, _ := newKVStore(t)
|
||||
|
||||
const aliceHex = "aa11"
|
||||
if s.HasAdmin() {
|
||||
t.Fatalf("fresh store should have no admin")
|
||||
}
|
||||
if err := s.AddUser(aliceHex, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("AddUser: %v", err)
|
||||
}
|
||||
if !s.HasAdmin() {
|
||||
t.Fatalf("HasAdmin should be true after adding an admin")
|
||||
}
|
||||
if !s.IsAuthorized(aliceHex) {
|
||||
t.Fatalf("alice should be authorized")
|
||||
}
|
||||
// Case-insensitive lookup (keys are normalized lowercase).
|
||||
if !s.IsAuthorized("AA11") {
|
||||
t.Fatalf("uppercase hex should normalize and authorize")
|
||||
}
|
||||
u, err := s.GetUser(aliceHex)
|
||||
if err != nil || u.Handle != "alice" || u.Role != RoleAdmin || u.Status != StatusActive {
|
||||
t.Fatalf("GetUser mismatch: %+v err=%v", u, err)
|
||||
}
|
||||
|
||||
// Duplicate add is rejected with ErrUserExists.
|
||||
if err := s.AddUser(aliceHex, "alice2", RoleMember); !errors.Is(err, ErrUserExists) {
|
||||
t.Fatalf("duplicate AddUser want ErrUserExists, got %v", err)
|
||||
}
|
||||
|
||||
if err := s.AddUser("bb22", "bob", RoleMember); err != nil {
|
||||
t.Fatalf("AddUser bob: %v", err)
|
||||
}
|
||||
users, err := s.ListUsers()
|
||||
if err != nil || len(users) != 2 {
|
||||
t.Fatalf("ListUsers want 2, got %d err=%v", len(users), err)
|
||||
}
|
||||
|
||||
// Revoke alice: authorization flips off immediately.
|
||||
if err := s.RevokeUser(aliceHex); err != nil {
|
||||
t.Fatalf("RevokeUser: %v", err)
|
||||
}
|
||||
if s.IsAuthorized(aliceHex) {
|
||||
t.Fatalf("revoked user must not be authorized")
|
||||
}
|
||||
if s.HasAdmin() {
|
||||
t.Fatalf("after revoking the only admin, HasAdmin must be false")
|
||||
}
|
||||
// Revoking again is an error (no active user).
|
||||
if err := s.RevokeUser(aliceHex); err == nil {
|
||||
t.Fatalf("re-revoke should error")
|
||||
}
|
||||
}
|
||||
|
||||
// TestJetStreamStoreNotFound checks the ErrNotFound mapping for misses.
|
||||
func TestJetStreamStoreNotFound(t *testing.T) {
|
||||
s, _, _ := newKVStore(t)
|
||||
if _, err := s.GetRoom("nope"); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetRoom miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
if _, err := s.GetMember("nope", "x"); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetMember miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
if _, _, err := s.GetSealedKey("nope", "x", 1); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetSealedKey miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
if _, _, err := s.GetSealedKey("nope", "x", 0); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetSealedKey latest miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
if _, err := s.GetUser("ffff"); !errors.Is(err, ErrNotFound) {
|
||||
t.Fatalf("GetUser miss want ErrNotFound, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// TestJetStreamStoreIsAuthorizedFailClosed is the error path mandated by the
|
||||
// issue: when the KV backend is unavailable (here the NATS server is shut down),
|
||||
// IsAuthorized must DENY, never admit. A previously-authorized identity flips to
|
||||
// unauthorized once the backend cannot be reached.
|
||||
func TestJetStreamStoreIsAuthorizedFailClosed(t *testing.T) {
|
||||
s, ns, nc := newKVStore(t)
|
||||
|
||||
const aliceHex = "abcd"
|
||||
if err := s.AddUser(aliceHex, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("AddUser: %v", err)
|
||||
}
|
||||
if !s.IsAuthorized(aliceHex) {
|
||||
t.Fatalf("alice should be authorized while the backend is up")
|
||||
}
|
||||
|
||||
// Take the KV backend away: close the client and stop the server. Every
|
||||
// subsequent KV Get fails, and the store must fail closed.
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
ns.WaitForShutdown()
|
||||
|
||||
// Bound the assertion: IsAuthorized internally caps each op at OpTimeout, so
|
||||
// this returns well before the test deadline.
|
||||
done := make(chan bool, 1)
|
||||
go func() { done <- s.IsAuthorized(aliceHex) }()
|
||||
select {
|
||||
case authorized := <-done:
|
||||
if authorized {
|
||||
t.Fatalf("KV backend down but IsAuthorized returned true: NOT fail-closed")
|
||||
}
|
||||
case <-time.After(10 * time.Second):
|
||||
t.Fatalf("IsAuthorized hung when the backend was down (no bounded timeout)")
|
||||
}
|
||||
|
||||
// HasAdmin is likewise conservative: backend down -> false (gates stay closed).
|
||||
if s.HasAdmin() {
|
||||
t.Fatalf("KV backend down but HasAdmin returned true: NOT fail-closed")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,152 @@
|
||||
package membership_test
|
||||
|
||||
// Regression for audit report 0008, vector N2: with the broad "$JS.API.>" grant
|
||||
// removed (issue 0006b), a registered peer that belongs to no room can no longer
|
||||
// read the control-plane KV buckets over NATS, while the per-room JetStream API of
|
||||
// a peer's OWN rooms keeps working. The auditor's ephemeral attack populated the
|
||||
// KV control plane and had a registered non-member harvest the allowlist, the room
|
||||
// graph and the sealed-key metadata directly through "$JS.API.>".
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/hex"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/busauth"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
server "github.com/nats-io/nats-server/v2/server"
|
||||
)
|
||||
|
||||
// startACLNatsInternal is startACLNats plus a recognized internal service identity
|
||||
// (so the test can seed the KV control plane with full permissions, exactly as the
|
||||
// decentralized membershipd does at bootstrap).
|
||||
func startACLNatsInternal(t *testing.T, store membership.Store, internalPubHex string) *server.Server {
|
||||
t.Helper()
|
||||
auth := busauth.NewNkeyAuthenticatorACLInternal(store.IsAuthorized, aclPermsFunc(store), internalPubHex)
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: aclFreePort(t), Auth: auth,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("acl nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return ns
|
||||
}
|
||||
|
||||
// TestAttack0008_N2 reproduces the control-plane KV leak and proves it is closed.
|
||||
//
|
||||
// error : eve (registered, member of no room) cannot read the KV buckets — the
|
||||
// JetStream KV API and the raw $KV subject space are both denied.
|
||||
// golden: the owner of a persisted room can still drive the JetStream API of HER
|
||||
// OWN room's stream (so persisted-room history keeps working).
|
||||
// edge : eve cannot reach another room's stream API either (cross-room JS deny).
|
||||
func TestAttack0008_N2(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
// The HTTP control-plane store stays SQLite; the KV buckets below stand in for
|
||||
// the decentralized control plane the attack targets.
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
|
||||
ceo, eve, internalID := mustID(t), mustID(t), mustID(t)
|
||||
ceoEP := frame.EndpointID(ceo.SignPub)
|
||||
mustAddUser(t, store, ceo, "ceo-root-admin")
|
||||
mustAddUser(t, store, eve, "eve") // registered, member of nothing
|
||||
// A persisted room owned by ceo: ceo is a member, so her per-room JS is allowed.
|
||||
if err := store.CreateRoom(
|
||||
membership.RoomInfo{RoomID: "PRIVROOM", Subject: "room.board.ma-deal", Encrypt: true, Persist: true, OwnerEndpoint: ceoEP},
|
||||
ceo.SignPub, ceo.KexPub, []byte("sealed-self"),
|
||||
); err != nil {
|
||||
t.Fatalf("create room: %v", err)
|
||||
}
|
||||
|
||||
internalPubHex := hex.EncodeToString(internalID.SignPub)
|
||||
ns := startACLNatsInternal(t, store, internalPubHex)
|
||||
url := ns.ClientURL()
|
||||
|
||||
// Seed the KV control plane with the privileged internal identity (full perms),
|
||||
// simulating the decentralized buckets the attack reads.
|
||||
intErr := make(chan error, 4)
|
||||
intNC := nkeyConn(t, url, internalID, intErr)
|
||||
intJS, err := jetstream.New(intNC)
|
||||
if err != nil {
|
||||
t.Fatalf("internal jetstream: %v", err)
|
||||
}
|
||||
kvStore, err := membership.OpenJetStream(intJS, membership.JetStreamConfig{Replicas: 1, OpTimeout: 3 * time.Second})
|
||||
if err != nil {
|
||||
t.Fatalf("open kv buckets: %v", err)
|
||||
}
|
||||
if err := kvStore.AddUser(hex.EncodeToString(ceo.SignPub), "ceo-root-admin", membership.RoleAdmin); err != nil {
|
||||
t.Fatalf("seed kv user: %v", err)
|
||||
}
|
||||
|
||||
// Each JetStream op gets its own short context: a DENIED request never gets a
|
||||
// reply, so it blocks until its own deadline — a shared context would be
|
||||
// exhausted by the first denied call and starve the rest.
|
||||
freshCtx := func(d time.Duration) (context.Context, context.CancelFunc) {
|
||||
return context.WithTimeout(context.Background(), d)
|
||||
}
|
||||
|
||||
// --- error: eve cannot read the control-plane KV buckets ------------------
|
||||
eveErr := make(chan error, 8)
|
||||
eveNC := nkeyConn(t, url, eve, eveErr)
|
||||
eveJS, err := jetstream.New(eveNC)
|
||||
if err != nil {
|
||||
t.Fatalf("eve jetstream: %v", err)
|
||||
}
|
||||
// (a) The KV API: binding the bucket requires STREAM.INFO.KV_UNIBUS_users, which
|
||||
// eve has no permission for, so this must fail (no leak of users).
|
||||
kvCtx, kvCancel := freshCtx(2 * time.Second)
|
||||
if kv, err := eveJS.KeyValue(kvCtx, "UNIBUS_users"); err == nil {
|
||||
if e, gerr := kv.Get(kvCtx, hex.EncodeToString(ceo.SignPub)); gerr == nil {
|
||||
kvCancel()
|
||||
t.Fatalf("eve read the control-plane KV users bucket: %q (N2 leak still open)", string(e.Value()))
|
||||
}
|
||||
kvCancel()
|
||||
t.Fatalf("eve was able to BIND the KV users bucket (N2 leak still open)")
|
||||
}
|
||||
kvCancel()
|
||||
// (b) The raw KV subject space: a direct subscribe must be a permissions
|
||||
// violation (delivered async to the error handler).
|
||||
drain(eveErr)
|
||||
if _, err := eveNC.Subscribe("$KV.UNIBUS_users.>", func(*nats.Msg) {}); err != nil {
|
||||
t.Fatalf("eve sub $KV: %v", err)
|
||||
}
|
||||
_ = eveNC.Flush()
|
||||
if e := waitErr(eveErr, 1*time.Second); e == nil {
|
||||
t.Fatalf("eve subscribing to $KV.UNIBUS_users.> must raise a permissions violation")
|
||||
}
|
||||
|
||||
// --- edge: eve cannot reach another room's stream API ---------------------
|
||||
edgeCtx, edgeCancel := freshCtx(2 * time.Second)
|
||||
if _, err := eveJS.Stream(edgeCtx, "UNIBUS_PRIVROOM"); err == nil {
|
||||
edgeCancel()
|
||||
t.Fatalf("eve reached the foreign room stream API (cross-room JS not isolated)")
|
||||
}
|
||||
edgeCancel()
|
||||
|
||||
// --- golden: ceo can drive the JetStream API of HER OWN room's stream ------
|
||||
ceoErr := make(chan error, 4)
|
||||
ceoNC := nkeyConn(t, url, ceo, ceoErr)
|
||||
ceoJS, err := jetstream.New(ceoNC)
|
||||
if err != nil {
|
||||
t.Fatalf("ceo jetstream: %v", err)
|
||||
}
|
||||
goldenCtx, goldenCancel := freshCtx(5 * time.Second)
|
||||
defer goldenCancel()
|
||||
if _, err := ceoJS.CreateOrUpdateStream(goldenCtx, jetstream.StreamConfig{
|
||||
Name: "UNIBUS_PRIVROOM",
|
||||
Subjects: []string{"room.board.ma-deal"},
|
||||
Storage: jetstream.FileStorage,
|
||||
}); err != nil {
|
||||
t.Fatalf("ceo could not manage her OWN room stream (per-room JS broken): %v", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,176 @@
|
||||
package membership
|
||||
|
||||
// Migration from the local SQLite control plane to replicated JetStream KV
|
||||
// (issue 0003c). It is the one-time, idempotent data move that decentralization
|
||||
// needs: read the entire SQLite state, write it into the KV buckets. Re-running
|
||||
// it is safe (every KV write is an overwrite), so a partial/interrupted run is
|
||||
// recovered by running again, and a parity test can assert the two stores hold
|
||||
// the same state before and after.
|
||||
|
||||
import (
	"database/sql"
	"fmt"
	"os"
	"strings"
	"time"

	"github.com/nats-io/nats.go/jetstream"
)
|
||||
|
||||
// SealedKeyRecord mirrors a single room_keys row: the sealed room key held by
// one endpoint for one key epoch. Snapshots carry these records individually so
// an import can rebuild the exact per-epoch key history of a room, which
// replaying CreateRoom/AddMember alone could not do for a multi-epoch room.
type SealedKeyRecord struct {
	RoomID   string // room the key belongs to
	Endpoint string // endpoint the key is sealed for
	Epoch    int    // key epoch the sealed key belongs to
	Sealed   []byte // sealed key bytes as stored
}
|
||||
|
||||
// Snapshot is the complete control-plane state, backend-agnostic. It is what
|
||||
// ExportSnapshot produces and importSnapshot consumes, so the SQLite->KV
|
||||
// migration and the parity test both work in terms of it.
|
||||
type Snapshot struct {
|
||||
Rooms []RoomInfo
|
||||
Members map[string][]Member // roomID -> members
|
||||
Keys []SealedKeyRecord
|
||||
Users []User
|
||||
}
|
||||
|
||||
// MigrateReport is the operator-facing summary of one migration run: how many
// records of each kind were moved, plus the path of the backup if one is
// recorded.
type MigrateReport struct {
	BackupPath string // location of the pre-migration backup, when set
	Rooms      int    // number of rooms moved
	Members    int    // number of room members moved, summed over all rooms
	Keys       int    // number of sealed key records moved
	Users      int    // number of users moved
}
|
||||
|
||||
// MigrateSQLiteToKV reads the SQLite store at sqlitePath and writes its entire
|
||||
// state into the JetStream KV buckets on js (created with cfg.Replicas). It is
|
||||
// idempotent: re-running converges to the same state. The caller is responsible
|
||||
// for backing up the SQLite file first (BackupSQLite) — this function only
|
||||
// reads it.
|
||||
func MigrateSQLiteToKV(sqlitePath string, js jetstream.JetStream, cfg JetStreamConfig) (*MigrateReport, error) {
|
||||
src, err := openSQLite(sqlitePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("migrate: open sqlite %q: %w", sqlitePath, err)
|
||||
}
|
||||
defer src.Close()
|
||||
|
||||
snap, err := src.ExportSnapshot()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("migrate: export sqlite: %w", err)
|
||||
}
|
||||
|
||||
dst, err := OpenJetStream(js, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("migrate: open kv: %w", err)
|
||||
}
|
||||
kv := dst.(*jetstreamStore)
|
||||
if err := kv.importSnapshot(snap); err != nil {
|
||||
return nil, fmt.Errorf("migrate: import to kv: %w", err)
|
||||
}
|
||||
|
||||
members := 0
|
||||
for _, ms := range snap.Members {
|
||||
members += len(ms)
|
||||
}
|
||||
return &MigrateReport{
|
||||
Rooms: len(snap.Rooms),
|
||||
Members: members,
|
||||
Keys: len(snap.Keys),
|
||||
Users: len(snap.Users),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// BackupSQLite makes a copy of the SQLite database at path next to it, named
// "<path>.bak.<unixnano>", using SQLite's own VACUUM INTO, and returns the
// backup path. Always call this before MigrateSQLiteToKV so a botched migration
// can be undone.
//
// The source must already exist: opening a missing path can create a fresh,
// empty database, which would then be "backed up" successfully and hide an
// operator typo. A missing or unreadable source is reported as an error that
// wraps the underlying os error (e.g. fs.ErrNotExist).
func BackupSQLite(path string) (string, error) {
	// Guard against silently creating (and backing up) an empty database.
	if _, err := os.Stat(path); err != nil {
		return "", fmt.Errorf("backup: stat %q: %w", path, err)
	}

	dst := fmt.Sprintf("%s.bak.%d", path, time.Now().UnixNano())
	db, err := sql.Open("sqlite", "file:"+path+"?_pragma=busy_timeout(5000)")
	if err != nil {
		return "", fmt.Errorf("backup: open %q: %w", path, err)
	}
	defer db.Close()
	// sql.Open may not connect; Ping forces a real connection so open failures
	// are reported here rather than from the VACUUM statement.
	if err := db.Ping(); err != nil {
		return "", fmt.Errorf("backup: ping %q: %w", path, err)
	}
	// VACUUM INTO takes the destination as a SQL string literal, so embedded
	// single quotes are doubled. The path is operator-supplied, never network
	// input.
	if _, err := db.Exec("VACUUM INTO '" + strings.ReplaceAll(dst, "'", "''") + "'"); err != nil {
		return "", fmt.Errorf("backup: VACUUM INTO %q: %w", dst, err)
	}
	return dst, nil
}
|
||||
|
||||
// ---- SQLite export --------------------------------------------------------
|
||||
|
||||
// ExportSnapshot reads the entire SQLite control-plane state into a Snapshot.
|
||||
func (s *sqliteStore) ExportSnapshot() (*Snapshot, error) {
|
||||
snap := &Snapshot{Members: map[string][]Member{}}
|
||||
|
||||
rows, err := s.db.Query(`SELECT room_id, subject, key_epoch, encrypt, persist, sign_msgs, owner_endpoint FROM rooms ORDER BY room_id`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export: query rooms: %w", err)
|
||||
}
|
||||
for rows.Next() {
|
||||
var r RoomInfo
|
||||
var enc, per, sgn int
|
||||
if err := rows.Scan(&r.RoomID, &r.Subject, &r.Epoch, &enc, &per, &sgn, &r.OwnerEndpoint); err != nil {
|
||||
rows.Close()
|
||||
return nil, fmt.Errorf("export: scan room: %w", err)
|
||||
}
|
||||
r.Encrypt, r.Persist, r.SignMsgs = enc != 0, per != 0, sgn != 0
|
||||
snap.Rooms = append(snap.Rooms, r)
|
||||
}
|
||||
rows.Close()
|
||||
if err := rows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
mrows, err := s.db.Query(`SELECT room_id, endpoint, role, sign_pub, kex_pub FROM members ORDER BY room_id, endpoint`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export: query members: %w", err)
|
||||
}
|
||||
for mrows.Next() {
|
||||
var roomID string
|
||||
var m Member
|
||||
if err := mrows.Scan(&roomID, &m.Endpoint, &m.Role, &m.SignPub, &m.KexPub); err != nil {
|
||||
mrows.Close()
|
||||
return nil, fmt.Errorf("export: scan member: %w", err)
|
||||
}
|
||||
snap.Members[roomID] = append(snap.Members[roomID], m)
|
||||
}
|
||||
mrows.Close()
|
||||
if err := mrows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
krows, err := s.db.Query(`SELECT room_id, epoch, endpoint, sealed_key FROM room_keys ORDER BY room_id, endpoint, epoch`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export: query room_keys: %w", err)
|
||||
}
|
||||
for krows.Next() {
|
||||
var rec SealedKeyRecord
|
||||
if err := krows.Scan(&rec.RoomID, &rec.Epoch, &rec.Endpoint, &rec.Sealed); err != nil {
|
||||
krows.Close()
|
||||
return nil, fmt.Errorf("export: scan room_key: %w", err)
|
||||
}
|
||||
snap.Keys = append(snap.Keys, rec)
|
||||
}
|
||||
krows.Close()
|
||||
if err := krows.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
users, err := s.ListUsers()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("export: list users: %w", err)
|
||||
}
|
||||
snap.Users = users
|
||||
return snap, nil
|
||||
}
|
||||
@@ -0,0 +1,195 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// seedSQLite populates a SQLite store with a representative control plane: two
|
||||
// rooms (one rekeyed to epoch 2 with a removed member's keys left behind), a few
|
||||
// members and sealed keys, and a user allowlist with one revoked entry. It
|
||||
// returns the populated *sqliteStore and its file path.
|
||||
func seedSQLite(t *testing.T) (*sqliteStore, string) {
|
||||
t.Helper()
|
||||
path := filepath.Join(t.TempDir(), "seed.db")
|
||||
s, err := openSQLite(path)
|
||||
if err != nil {
|
||||
t.Fatalf("openSQLite: %v", err)
|
||||
}
|
||||
|
||||
r1 := RoomInfo{RoomID: newULID(), Subject: "room.alpha", Encrypt: true, Persist: true, SignMsgs: true, OwnerEndpoint: "ep-owner1"}
|
||||
if err := s.CreateRoom(r1, []byte("o1-sign"), []byte("o1-kex"), []byte("o1-sealed-e1")); err != nil {
|
||||
t.Fatalf("create r1: %v", err)
|
||||
}
|
||||
if err := s.AddMember(r1.RoomID, Member{Endpoint: "ep-bob", Role: "member", SignPub: []byte("bob-sign"), KexPub: []byte("bob-kex")}, 1, []byte("bob-sealed-e1")); err != nil {
|
||||
t.Fatalf("add bob: %v", err)
|
||||
}
|
||||
// Rekey r1 to epoch 2 (owner keeps a key at the new epoch).
|
||||
if err := s.BumpEpoch(r1.RoomID, 2); err != nil {
|
||||
t.Fatalf("bump: %v", err)
|
||||
}
|
||||
if err := s.PutSealedKeys(r1.RoomID, 2, map[string][]byte{"ep-owner1": []byte("o1-sealed-e2")}); err != nil {
|
||||
t.Fatalf("put keys e2: %v", err)
|
||||
}
|
||||
|
||||
r2 := RoomInfo{RoomID: newULID(), Subject: "room.beta", Encrypt: false, Persist: false, SignMsgs: false, OwnerEndpoint: "ep-owner2"}
|
||||
if err := s.CreateRoom(r2, []byte("o2-sign"), []byte("o2-kex"), nil); err != nil {
|
||||
t.Fatalf("create r2: %v", err)
|
||||
}
|
||||
|
||||
if err := s.AddUser("aa11", "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("add alice: %v", err)
|
||||
}
|
||||
if err := s.AddUser("bb22", "bob", RoleMember); err != nil {
|
||||
t.Fatalf("add bob user: %v", err)
|
||||
}
|
||||
if err := s.AddUser("cc33", "carol", RoleMember); err != nil {
|
||||
t.Fatalf("add carol: %v", err)
|
||||
}
|
||||
if err := s.RevokeUser("cc33"); err != nil {
|
||||
t.Fatalf("revoke carol: %v", err)
|
||||
}
|
||||
return s, path
|
||||
}
|
||||
|
||||
// normalizeSnapshot sorts every slice in a Snapshot so two snapshots from
|
||||
// different backends can be compared regardless of enumeration order.
|
||||
func normalizeSnapshot(snap *Snapshot) {
|
||||
sort.Slice(snap.Rooms, func(i, j int) bool { return snap.Rooms[i].RoomID < snap.Rooms[j].RoomID })
|
||||
for _, ms := range snap.Members {
|
||||
sort.Slice(ms, func(i, j int) bool { return ms[i].Endpoint < ms[j].Endpoint })
|
||||
}
|
||||
sort.Slice(snap.Keys, func(i, j int) bool {
|
||||
a, b := snap.Keys[i], snap.Keys[j]
|
||||
if a.RoomID != b.RoomID {
|
||||
return a.RoomID < b.RoomID
|
||||
}
|
||||
if a.Endpoint != b.Endpoint {
|
||||
return a.Endpoint < b.Endpoint
|
||||
}
|
||||
return a.Epoch < b.Epoch
|
||||
})
|
||||
sort.Slice(snap.Users, func(i, j int) bool { return snap.Users[i].SignPub < snap.Users[j].SignPub })
|
||||
}
|
||||
|
||||
func newJS(t *testing.T) jetstream.JetStream {
|
||||
t.Helper()
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(),
|
||||
Host: "127.0.0.1",
|
||||
Port: kvFreePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("embedded nats: %v", err)
|
||||
}
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
t.Fatalf("nats connect: %v", err)
|
||||
}
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
nc.Close()
|
||||
ns.Shutdown()
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { nc.Close(); ns.Shutdown(); ns.WaitForShutdown() })
|
||||
return js
|
||||
}
|
||||
|
||||
// TestMigrateSQLiteToKVParity is the parity test the issue mandates: after the
|
||||
// migration, the KV store holds exactly the SQLite source's state.
|
||||
func TestMigrateSQLiteToKVParity(t *testing.T) {
|
||||
src, path := seedSQLite(t)
|
||||
srcSnap, err := src.ExportSnapshot()
|
||||
if err != nil {
|
||||
t.Fatalf("export sqlite: %v", err)
|
||||
}
|
||||
src.Close() // release the file before the migration reopens it
|
||||
|
||||
js := newJS(t)
|
||||
report, err := MigrateSQLiteToKV(path, js, JetStreamConfig{Replicas: 1, OpTimeout: 5 * time.Second})
|
||||
if err != nil {
|
||||
t.Fatalf("migrate: %v", err)
|
||||
}
|
||||
if report.Rooms != 2 || report.Users != 3 {
|
||||
t.Fatalf("report mismatch: %+v", report)
|
||||
}
|
||||
|
||||
kv, err := OpenJetStream(js, JetStreamConfig{Replicas: 1, OpTimeout: 5 * time.Second})
|
||||
if err != nil {
|
||||
t.Fatalf("open kv: %v", err)
|
||||
}
|
||||
kvSnap, err := kv.(*jetstreamStore).ExportSnapshot()
|
||||
if err != nil {
|
||||
t.Fatalf("export kv: %v", err)
|
||||
}
|
||||
|
||||
normalizeSnapshot(srcSnap)
|
||||
normalizeSnapshot(kvSnap)
|
||||
if !reflect.DeepEqual(srcSnap, kvSnap) {
|
||||
t.Fatalf("parity mismatch after migration:\n sqlite=%+v\n kv= %+v", srcSnap, kvSnap)
|
||||
}
|
||||
}
|
||||
|
||||
// TestMigrateSQLiteToKVIdempotent: running the migration twice converges to the
|
||||
// same KV state (every write is an overwrite). A second run must not duplicate
|
||||
// or corrupt anything.
|
||||
func TestMigrateSQLiteToKVIdempotent(t *testing.T) {
|
||||
src, path := seedSQLite(t)
|
||||
srcSnap, _ := src.ExportSnapshot()
|
||||
src.Close()
|
||||
|
||||
js := newJS(t)
|
||||
if _, err := MigrateSQLiteToKV(path, js, JetStreamConfig{Replicas: 1}); err != nil {
|
||||
t.Fatalf("migrate run 1: %v", err)
|
||||
}
|
||||
if _, err := MigrateSQLiteToKV(path, js, JetStreamConfig{Replicas: 1}); err != nil {
|
||||
t.Fatalf("migrate run 2: %v", err)
|
||||
}
|
||||
|
||||
kv, _ := OpenJetStream(js, JetStreamConfig{Replicas: 1})
|
||||
kvSnap, err := kv.(*jetstreamStore).ExportSnapshot()
|
||||
if err != nil {
|
||||
t.Fatalf("export kv: %v", err)
|
||||
}
|
||||
normalizeSnapshot(srcSnap)
|
||||
normalizeSnapshot(kvSnap)
|
||||
if !reflect.DeepEqual(srcSnap, kvSnap) {
|
||||
t.Fatalf("idempotency broken: a second migration changed the KV state\n sqlite=%+v\n kv= %+v", srcSnap, kvSnap)
|
||||
}
|
||||
}
|
||||
|
||||
// TestBackupSQLiteCreatesConsistentCopy verifies the pre-migration backup is a
|
||||
// real, openable copy holding the same data.
|
||||
func TestBackupSQLiteCreatesConsistentCopy(t *testing.T) {
|
||||
src, path := seedSQLite(t)
|
||||
srcSnap, _ := src.ExportSnapshot()
|
||||
src.Close()
|
||||
|
||||
bak, err := BackupSQLite(path)
|
||||
if err != nil {
|
||||
t.Fatalf("backup: %v", err)
|
||||
}
|
||||
restored, err := openSQLite(bak)
|
||||
if err != nil {
|
||||
t.Fatalf("open backup: %v", err)
|
||||
}
|
||||
defer restored.Close()
|
||||
bakSnap, err := restored.ExportSnapshot()
|
||||
if err != nil {
|
||||
t.Fatalf("export backup: %v", err)
|
||||
}
|
||||
normalizeSnapshot(srcSnap)
|
||||
normalizeSnapshot(bakSnap)
|
||||
if !reflect.DeepEqual(srcSnap, bakSnap) {
|
||||
t.Fatalf("backup is not a faithful copy")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
-- 002_users.sql — bus-level user directory (issue 0001a).
|
||||
--
|
||||
-- The authoritative allowlist of identities permitted to use the bus, independent
|
||||
-- of room membership. A user is identified by its Ed25519 signing public key (the
|
||||
-- same key that derives the endpoint via frame.EndpointID); roles gate admin-only
|
||||
-- control-plane operations; status enables revocation without deleting history.
|
||||
--
|
||||
-- Additive and idempotent: safe to apply repeatedly. Never modify this file;
|
||||
-- further schema changes go in new numbered migrations (see
|
||||
-- .claude/rules/db_migrations.md). The embedded copy under
|
||||
-- pkg/membership/migrations/002_users.sql mirrors this file byte-for-byte.
|
||||
|
||||
CREATE TABLE IF NOT EXISTS users (
|
||||
sign_pub TEXT PRIMARY KEY, -- Ed25519 public key in lowercase hex (peer identity)
|
||||
handle TEXT NOT NULL, -- human-readable name (unique recommended, not enforced as PK)
|
||||
role TEXT NOT NULL DEFAULT 'member', -- 'admin' | 'member'
|
||||
status TEXT NOT NULL DEFAULT 'active', -- 'active' | 'revoked'
|
||||
created_at TEXT NOT NULL,
|
||||
revoked_at TEXT
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_users_status ON users(status);
|
||||
@@ -0,0 +1,51 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TestNonceCacheRememberPrune covers the replay/expiry behavior directly on the
|
||||
// cache: a fresh nonce is accepted (golden), an immediate repeat is rejected
|
||||
// (error), and after the TTL the same nonce is accepted again because its entry
|
||||
// was pruned (edge).
|
||||
func TestNonceCacheRememberPrune(t *testing.T) {
|
||||
nc := newMemNonceCache(50*time.Millisecond, 1000)
|
||||
base := time.Now()
|
||||
|
||||
if !nc.rememberOrReject("a", base) {
|
||||
t.Fatalf("first sighting should be accepted")
|
||||
}
|
||||
if nc.rememberOrReject("a", base) {
|
||||
t.Fatalf("an immediate replay should be rejected")
|
||||
}
|
||||
if !nc.rememberOrReject("a", base.Add(60*time.Millisecond)) {
|
||||
t.Fatalf("after the TTL the nonce should be accepted again (pruned)")
|
||||
}
|
||||
}
|
||||
|
||||
// TestNonceCacheCapBounded covers the memory bound (audit H7): with a long TTL so
|
||||
// nothing expires, inserting far more nonces than the cap must still keep the
|
||||
// cache at or under the cap (oldest evicted), and the order queue must not drift
|
||||
// from the map.
|
||||
func TestNonceCacheCapBounded(t *testing.T) {
|
||||
const capacity = 100
|
||||
nc := newMemNonceCache(time.Hour, capacity)
|
||||
base := time.Now()
|
||||
for i := 0; i < 500; i++ {
|
||||
nc.rememberOrReject("n"+strconv.Itoa(i), base)
|
||||
}
|
||||
|
||||
nc.mu.Lock()
|
||||
size := len(nc.seen)
|
||||
orderLen := len(nc.order)
|
||||
nc.mu.Unlock()
|
||||
|
||||
if size > capacity {
|
||||
t.Fatalf("cache exceeded its cap: %d > %d", size, capacity)
|
||||
}
|
||||
if orderLen != size {
|
||||
t.Fatalf("order queue drifted from the map: order=%d seen=%d", orderLen, size)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
package membership
|
||||
|
||||
// kvNonceStore is the replicated anti-replay backend (issue 0003e): seen nonces
|
||||
// live in a JetStream KV bucket shared by every node, with a per-key TTL so they
|
||||
// expire on their own. This closes the multi-node replay hole the auditor
|
||||
// flagged: the per-process memNonceCache let an attacker replay a captured
|
||||
// request to a DIFFERENT node, whose local cache never saw the nonce. With the
|
||||
// shared bucket the first node to see a nonce wins the atomic Create, and every
|
||||
// other node rejects the replay.
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
const bucketNonces = "UNIBUS_nonces"
|
||||
|
||||
type kvNonceStore struct {
|
||||
kv jetstream.KeyValue
|
||||
opTimeout time.Duration
|
||||
}
|
||||
|
||||
// newKVNonceStore creates (or opens) the replicated nonce bucket. ttl is the
|
||||
// per-key expiry — it must be >= the request acceptance window (2*clockSkew) so
|
||||
// a replay can never outlive its memory, exactly like the in-memory cache's TTL.
|
||||
func newKVNonceStore(js jetstream.JetStream, ttl time.Duration, replicas int, opTimeout time.Duration) (*kvNonceStore, error) {
|
||||
if replicas <= 0 {
|
||||
replicas = 1
|
||||
}
|
||||
if opTimeout <= 0 {
|
||||
opTimeout = defaultKVOpTime
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||||
defer cancel()
|
||||
kv, err := js.CreateOrUpdateKeyValue(ctx, jetstream.KeyValueConfig{
|
||||
Bucket: bucketNonces,
|
||||
TTL: ttl,
|
||||
Replicas: replicas,
|
||||
History: 1,
|
||||
Storage: jetstream.FileStorage,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: open nonce KV bucket (replicas=%d): %w", replicas, err)
|
||||
}
|
||||
return &kvNonceStore{kv: kv, opTimeout: opTimeout}, nil
|
||||
}
|
||||
|
||||
// nonceKVKey turns a raw nonce into a KV-safe key. Nonces arrive as std-base64,
// whose '+', '/' and '=' characters KV keys forbid, so the key is the lowercase
// hex of the nonce's SHA-256 digest: the same nonce always yields the same
// 64-character key, and distinct nonces collide only if SHA-256 does.
func nonceKVKey(nonce string) string {
	digest := sha256.New()
	digest.Write([]byte(nonce)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(digest.Sum(nil))
}
|
||||
|
||||
// rememberOrReject atomically claims the nonce: Create succeeds only if the key
|
||||
// is absent, so the first sight returns true (accept) and any later sight (a
|
||||
// replay, on this or any other node sharing the bucket) returns false. A backend
|
||||
// error fails CLOSED — reject — so a KV outage never silently disables
|
||||
// anti-replay. The TTL on the bucket expires the key, reopening the window.
|
||||
func (s *kvNonceStore) rememberOrReject(nonce string, _ time.Time) bool {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), s.opTimeout)
|
||||
defer cancel()
|
||||
if _, err := s.kv.Create(ctx, nonceKVKey(nonce), nil); err != nil {
|
||||
if errors.Is(err, jetstream.ErrKeyExists) {
|
||||
return false // replay: already claimed
|
||||
}
|
||||
return false // backend unreachable: fail closed
|
||||
}
|
||||
return true // first sight: accept
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/embeddednats"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/nats-io/nats.go"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// TestReplicatedNonceRejectsCrossNodeReplay is the issue's mandated error path:
|
||||
// with the shared KV nonce store, a request accepted on node A is rejected as a
|
||||
// replay when the SAME signed bytes are sent to node B. This closes the
|
||||
// multi-node replay hole that the per-process cache left open.
|
||||
func TestReplicatedNonceRejectsCrossNodeReplay(t *testing.T) {
|
||||
// One NATS+JetStream backing the shared nonce bucket.
|
||||
ns, err := embeddednats.StartServer(embeddednats.ServerConfig{
|
||||
StoreDir: t.TempDir(), Host: "127.0.0.1", Port: kvFreePort(t),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("nats: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { ns.Shutdown(); ns.WaitForShutdown() })
|
||||
nc, err := nats.Connect(ns.ClientURL())
|
||||
if err != nil {
|
||||
t.Fatalf("connect: %v", err)
|
||||
}
|
||||
t.Cleanup(nc.Close)
|
||||
js, err := jetstream.New(nc)
|
||||
if err != nil {
|
||||
t.Fatalf("jetstream: %v", err)
|
||||
}
|
||||
|
||||
// One shared SQLite store (simulating the replicated control-plane state) and
|
||||
// two membershipd servers (two nodes) that BOTH use the shared KV nonce store.
|
||||
dir := t.TempDir()
|
||||
store, err := Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
alice, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
t.Fatalf("identity: %v", err)
|
||||
}
|
||||
alicePub := hex.EncodeToString(alice.SignPub)
|
||||
if err := store.AddUser(alicePub, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("add alice: %v", err)
|
||||
}
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
|
||||
mkNode := func() *httptest.Server {
|
||||
srv := NewServer(store, blobs, AuthEnforce)
|
||||
if err := srv.UseReplicatedNonces(js, 1); err != nil {
|
||||
t.Fatalf("UseReplicatedNonces: %v", err)
|
||||
}
|
||||
return httptest.NewServer(srv)
|
||||
}
|
||||
nodeA := mkNode()
|
||||
t.Cleanup(nodeA.Close)
|
||||
nodeB := mkNode()
|
||||
t.Cleanup(nodeB.Close)
|
||||
|
||||
// Build ONE signed request (fixed ts+nonce) and send the identical bytes to
|
||||
// both nodes. Authenticated path: alice listing her own rooms (200, empty).
|
||||
ts := time.Now().Unix()
|
||||
nonceRaw := make([]byte, 16)
|
||||
if _, err := rand.Read(nonceRaw); err != nil {
|
||||
t.Fatalf("nonce: %v", err)
|
||||
}
|
||||
nonce := base64.StdEncoding.EncodeToString(nonceRaw)
|
||||
path := "/members/" + frame.EndpointID(alice.SignPub) + "/rooms"
|
||||
|
||||
reqA := signedReq(t, nodeA.URL, "GET", path, nil, alice, ts, nonce)
|
||||
respA, err := http.DefaultClient.Do(reqA)
|
||||
if err != nil {
|
||||
t.Fatalf("do A: %v", err)
|
||||
}
|
||||
respA.Body.Close()
|
||||
if respA.StatusCode != http.StatusOK {
|
||||
t.Fatalf("node A first use: status %d, want 200 (auth should pass, nonce fresh)", respA.StatusCode)
|
||||
}
|
||||
|
||||
// Replay the SAME ts+nonce to node B: the shared bucket already holds the
|
||||
// nonce, so node B must reject it.
|
||||
reqB := signedReq(t, nodeB.URL, "GET", path, nil, alice, ts, nonce)
|
||||
respB, err := http.DefaultClient.Do(reqB)
|
||||
if err != nil {
|
||||
t.Fatalf("do B: %v", err)
|
||||
}
|
||||
respB.Body.Close()
|
||||
if respB.StatusCode != http.StatusUnauthorized {
|
||||
t.Fatalf("cross-node replay to node B: status %d, want 401 (replayed nonce)", respB.StatusCode)
|
||||
}
|
||||
|
||||
// And replaying to node A again is likewise rejected (same bucket).
|
||||
reqA2 := signedReq(t, nodeA.URL, "GET", path, nil, alice, ts, nonce)
|
||||
respA2, err := http.DefaultClient.Do(reqA2)
|
||||
if err != nil {
|
||||
t.Fatalf("do A2: %v", err)
|
||||
}
|
||||
respA2.Body.Close()
|
||||
if respA2.StatusCode != http.StatusUnauthorized {
|
||||
t.Fatalf("replay to node A: status %d, want 401", respA2.StatusCode)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
)
|
||||
|
||||
// TestAudit_OwnerSpoof ports the auditor's H6 finding: handleCreateRoom did not
|
||||
// bind the body's declared owner to the request signer, so a registered peer
|
||||
// could create rooms in another identity's name. Now the owner endpoint AND the
|
||||
// owner signing key must both be the authenticated signer's.
|
||||
func TestAudit_OwnerSpoof(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
|
||||
bob, _ := cs.GenerateIdentity()
|
||||
register(t, h, bob, "bob")
|
||||
bobEp := frame.EndpointID(bob.SignPub)
|
||||
victim, _ := cs.GenerateIdentity()
|
||||
|
||||
post := func(id cs.Identity, owner endpointJSON, nonce string) int {
|
||||
body, _ := json.Marshal(createRoomReq{Subject: "some.room", Owner: owner})
|
||||
code, _ := do(t, signedReq(t, h.ts.URL, "POST", "/rooms", body, id, time.Now().Unix(), nonce))
|
||||
return code
|
||||
}
|
||||
|
||||
// Error path: bob signs, body claims victim as owner -> 403.
|
||||
if code := post(bob, endpointJSON{Endpoint: frame.EndpointID(victim.SignPub), SignPub: victim.SignPub, KexPub: victim.KexPub}, "spoof-1"); code != http.StatusForbidden {
|
||||
t.Fatalf("owner-spoofed create should be 403, got %d", code)
|
||||
}
|
||||
|
||||
// Edge: bob declares his own endpoint but a foreign signing key -> 403 (the
|
||||
// key, not just the endpoint string, is bound to the signer).
|
||||
if code := post(bob, endpointJSON{Endpoint: bobEp, SignPub: victim.SignPub, KexPub: victim.KexPub}, "spoof-2"); code != http.StatusForbidden {
|
||||
t.Fatalf("create with a foreign owner key should be 403, got %d", code)
|
||||
}
|
||||
|
||||
// Golden: alice creates a room owned by herself -> 201.
|
||||
aliceEp := frame.EndpointID(h.alice.SignPub)
|
||||
if code := post(h.alice, endpointJSON{Endpoint: aliceEp, SignPub: h.alice.SignPub, KexPub: h.alice.KexPub}, "owner-ok"); code != http.StatusCreated {
|
||||
t.Fatalf("self-owned create should be 201, got %d", code)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAudit_NonceCachePoisonPreAuth ports the auditor's H7 finding: the replay
|
||||
// cache was populated BEFORE the allowlist check, so any unregistered identity
|
||||
// (Ed25519 keys are free) could seed nonces into it. Now IsAuthorized runs first,
|
||||
// so an unauthorized identity's nonce is never cached: a repeat of the same nonce
|
||||
// still fails as "not authorized", not "replayed nonce".
|
||||
func TestAudit_NonceCachePoisonPreAuth(t *testing.T) {
|
||||
h := newAuthHarness(t, AuthEnforce)
|
||||
|
||||
eve, _ := cs.GenerateIdentity() // valid signatures, NOT on the allowlist
|
||||
now := time.Now().Unix()
|
||||
|
||||
code1, body1 := do(t, signedReq(t, h.ts.URL, "GET", "/rooms/x", nil, eve, now, "poison-nonce"))
|
||||
if code1 != http.StatusUnauthorized || !strings.Contains(body1, "not authorized") {
|
||||
t.Fatalf("unregistered first request should be 401 not-authorized, got %d (%s)", code1, body1)
|
||||
}
|
||||
|
||||
// Same nonce again: if the nonce had been cached, this would report "replayed
|
||||
// nonce". It must still be "not authorized" — proving the nonce was NOT cached.
|
||||
code2, body2 := do(t, signedReq(t, h.ts.URL, "GET", "/rooms/x", nil, eve, now, "poison-nonce"))
|
||||
if code2 != http.StatusUnauthorized {
|
||||
t.Fatalf("unregistered replay should still be 401, got %d", code2)
|
||||
}
|
||||
if strings.Contains(body2, "replayed") {
|
||||
t.Fatalf("an unauthorized identity's nonce was cached pre-auth: %s", body2)
|
||||
}
|
||||
if !strings.Contains(body2, "not authorized") {
|
||||
t.Fatalf("second unregistered request should still be not-authorized, got: %s", body2)
|
||||
}
|
||||
|
||||
// Positive control: an AUTHORIZED identity's replay IS still rejected, so the
|
||||
// reorder did not weaken anti-replay for legitimate traffic.
|
||||
if code, _ := do(t, signedReq(t, h.ts.URL, "GET", aliceRoomsPath(h), nil, h.alice, now, "alice-live")); code != http.StatusOK {
|
||||
t.Fatalf("alice's first request should be 200, got %d", code)
|
||||
}
|
||||
if code, body := do(t, signedReq(t, h.ts.URL, "GET", aliceRoomsPath(h), nil, h.alice, now, "alice-live")); code != http.StatusUnauthorized || !strings.Contains(body, "replayed") {
|
||||
t.Fatalf("alice's replay should be 401 replayed nonce, got %d (%s)", code, body)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
package membership_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
)
|
||||
|
||||
// TestHealthExposesPosture: /healthz publishes the node's security posture so a
|
||||
// monitor (or a peer) can detect a cluster member that is not enforce+ACL+TLS
|
||||
// (audit 0008 N1). The probe stays unauthenticated.
|
||||
func TestHealthExposesPosture(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
|
||||
srv := membership.NewServer(store, blobs, membership.AuthEnforce)
|
||||
srv.Posture = membership.Posture{Enforce: true, ACL: true, TLS: true, Cluster: true, Store: "kv"}
|
||||
ts := httptest.NewServer(srv)
|
||||
t.Cleanup(ts.Close)
|
||||
|
||||
resp, err := http.Get(ts.URL + "/healthz")
|
||||
if err != nil {
|
||||
t.Fatalf("get healthz: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("healthz status %d, want 200", resp.StatusCode)
|
||||
}
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
var got struct {
|
||||
Status string `json:"status"`
|
||||
Posture membership.Posture `json:"posture"`
|
||||
}
|
||||
if err := json.Unmarshal(body, &got); err != nil {
|
||||
t.Fatalf("decode healthz %q: %v", string(body), err)
|
||||
}
|
||||
if got.Status != "ok" {
|
||||
t.Fatalf("status = %q, want ok", got.Status)
|
||||
}
|
||||
if !got.Posture.Enforce || !got.Posture.ACL || !got.Posture.TLS || !got.Posture.Cluster {
|
||||
t.Fatalf("posture not surfaced correctly: %+v", got.Posture)
|
||||
}
|
||||
if got.Posture.Store != "kv" {
|
||||
t.Fatalf("posture.store = %q, want kv", got.Posture.Store)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"net"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
)
|
||||
|
||||
// ipRateLimiter is a per-source-IP token-bucket rate limiter for the control
|
||||
// plane. It exists to blunt pre-auth flooding: an unauthenticated peer that
|
||||
// hammers the HTTP API (signature verification is not free, and io is bounded
|
||||
// but still real) is throttled before it can amplify load. Like the nonceCache,
|
||||
// this is transport glue specific to unibus, not a registry primitive — the
|
||||
// report 0003 made the same call for the nonce cache (it would only drag a NATS
|
||||
// dependency into the multi-domain registry go.mod for one helper).
|
||||
//
|
||||
// Each distinct IP gets its own golang.org/x/time/rate.Limiter (a standard
|
||||
// token bucket already in the module graph, so no new dependency). Idle buckets
|
||||
// are reaped so the map cannot grow without bound under a churn of source IPs.
|
||||
type ipRateLimiter struct {
|
||||
mu sync.Mutex
|
||||
buckets map[string]*ipBucket
|
||||
r rate.Limit
|
||||
burst int
|
||||
ttl time.Duration
|
||||
}
|
||||
|
||||
type ipBucket struct {
|
||||
lim *rate.Limiter
|
||||
seen time.Time
|
||||
}
|
||||
|
||||
// newIPRateLimiter builds a limiter granting r tokens/second with the given
|
||||
// burst per IP. ttl bounds how long an idle bucket is retained before being
|
||||
// reaped. r<=0 disables limiting (Allow always true) so dev/loopback stacks are
|
||||
// unaffected.
|
||||
func newIPRateLimiter(r rate.Limit, burst int, ttl time.Duration) *ipRateLimiter {
|
||||
return &ipRateLimiter{
|
||||
buckets: make(map[string]*ipBucket),
|
||||
r: r,
|
||||
burst: burst,
|
||||
ttl: ttl,
|
||||
}
|
||||
}
|
||||
|
||||
// allow reports whether a request from ip may proceed now, consuming one token
|
||||
// on success. A disabled limiter (r<=0) always allows. Reaping of stale buckets
|
||||
// is amortized: it runs only when the map has grown past a small threshold, so
|
||||
// the common path is a single map lookup under the mutex.
|
||||
func (l *ipRateLimiter) allow(ip string, now time.Time) bool {
|
||||
if l == nil || l.r <= 0 {
|
||||
return true
|
||||
}
|
||||
l.mu.Lock()
|
||||
defer l.mu.Unlock()
|
||||
|
||||
if len(l.buckets) > 1024 {
|
||||
l.reapLocked(now)
|
||||
}
|
||||
b, ok := l.buckets[ip]
|
||||
if !ok {
|
||||
b = &ipBucket{lim: rate.NewLimiter(l.r, l.burst)}
|
||||
l.buckets[ip] = b
|
||||
}
|
||||
b.seen = now
|
||||
return b.lim.AllowN(now, 1)
|
||||
}
|
||||
|
||||
// reapLocked drops buckets idle for longer than ttl. The caller holds l.mu.
|
||||
func (l *ipRateLimiter) reapLocked(now time.Time) {
|
||||
for ip, b := range l.buckets {
|
||||
if now.Sub(b.seen) > l.ttl {
|
||||
delete(l.buckets, ip)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// clientIP returns the host part of r.RemoteAddr, i.e. the peer address with
// the port removed. Only the transport-level RemoteAddr is consulted; headers
// such as X-Forwarded-For are deliberately ignored, because honoring an
// attacker-supplied header would let a single IP fan its quota across forged
// identities. When RemoteAddr is not a valid host:port pair it is returned
// unchanged, which still gives a stable key for that connection.
func clientIP(r *http.Request) string {
	addr := r.RemoteAddr
	if host, _, err := net.SplitHostPort(addr); err == nil {
		return host
	}
	return addr
}
|
||||
@@ -0,0 +1,88 @@
|
||||
package membership_test
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/client"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/enmanuel/unibus/pkg/membership"
|
||||
"github.com/enmanuel/unibus/pkg/room"
|
||||
)
|
||||
|
||||
// TestClientCreateRoomRefreshPublishFlow is the issue 0006e DoD: under enforce+ACL
|
||||
// a peer creates a room AFTER connecting, and pub/sub works without manual
|
||||
// intervention because the client follows the membership-change contract
|
||||
// (CreateRoom -> RefreshSession -> Subscribe/Publish), exactly as cmd/chat and
|
||||
// cmd/worker now do. This is the end-to-end flow through the client API, proving
|
||||
// the ACL is usable under enforce rather than something an operator must disable.
|
||||
func TestClientCreateRoomRefreshPublishFlow(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
store, err := membership.Open(filepath.Join(dir, "unibus.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("store: %v", err)
|
||||
}
|
||||
t.Cleanup(func() { store.Close() })
|
||||
|
||||
alice, bob := mustID(t), mustID(t)
|
||||
mustAddUser(t, store, alice, "alice")
|
||||
mustAddUser(t, store, bob, "bob")
|
||||
|
||||
srv := startACLNats(t, store) // data plane: enforce + per-subject ACL
|
||||
blobs, _ := blobstore.New(filepath.Join(dir, "blobs"))
|
||||
ctrl := newCtrl(t, store, blobs)
|
||||
|
||||
aliceC, err := client.NewWithOptions(srv.ClientURL(), ctrl, alice, client.Options{UseNkey: true})
|
||||
if err != nil {
|
||||
t.Fatalf("connect alice: %v", err)
|
||||
}
|
||||
defer aliceC.Close()
|
||||
bobC, err := client.NewWithOptions(srv.ClientURL(), ctrl, bob, client.Options{UseNkey: true})
|
||||
if err != nil {
|
||||
t.Fatalf("connect bob: %v", err)
|
||||
}
|
||||
defer bobC.Close()
|
||||
|
||||
// alice creates a room AFTER connecting: the subject was not in her ACL at
|
||||
// connect time, so she must refresh to publish on it (the worker contract).
|
||||
roomID, err := aliceC.CreateRoom("room.flow.x", room.ModeNATS)
|
||||
if err != nil {
|
||||
t.Fatalf("alice create room: %v", err)
|
||||
}
|
||||
if err := aliceC.RefreshSession(); err != nil {
|
||||
t.Fatalf("alice refresh: %v", err)
|
||||
}
|
||||
|
||||
// alice invites bob; bob joins then refreshes to gain the subject (the chat
|
||||
// subscriber contract), and only then subscribes.
|
||||
if err := aliceC.Invite(roomID, bobC.Endpoint()); err != nil {
|
||||
t.Fatalf("alice invite bob: %v", err)
|
||||
}
|
||||
if err := bobC.Join(roomID); err != nil {
|
||||
t.Fatalf("bob join: %v", err)
|
||||
}
|
||||
if err := bobC.RefreshSession(); err != nil {
|
||||
t.Fatalf("bob refresh: %v", err)
|
||||
}
|
||||
got := make(chan string, 4)
|
||||
sub, err := bobC.Subscribe(roomID, func(_ frame.Frame, plaintext []byte) { got <- string(plaintext) })
|
||||
if err != nil {
|
||||
t.Fatalf("bob subscribe after refresh: %v", err)
|
||||
}
|
||||
defer sub.Unsubscribe()
|
||||
time.Sleep(200 * time.Millisecond) // let the subscription settle
|
||||
|
||||
if err := aliceC.Publish(roomID, []byte("hello-under-acl")); err != nil {
|
||||
t.Fatalf("alice publish after refresh: %v", err)
|
||||
}
|
||||
select {
|
||||
case msg := <-got:
|
||||
if msg != "hello-under-acl" {
|
||||
t.Fatalf("bob got %q", msg)
|
||||
}
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatalf("bob did not receive the message: the create->refresh->subscribe flow is broken under enforce+ACL")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestRequireEncryptedRoomsRejectsCleartext is the control-plane half of the
|
||||
// audit H4 minimum defense: with RequireEncryptedRooms on (the public posture),
|
||||
// creating a cleartext (ModeNATS) room is refused 403, while an encrypted room is
|
||||
// created normally. This is what guarantees no message ever rides the un-ACL'd
|
||||
// NATS subject in the clear on a public deployment.
|
||||
func TestRequireEncryptedRoomsRejectsCleartext(t *testing.T) {
|
||||
srv := dosServer(t, AuthOff)
|
||||
srv.RequireEncryptedRooms = true
|
||||
|
||||
create := func(encrypt bool) int {
|
||||
body, _ := json.Marshal(createRoomReq{
|
||||
Subject: "payroll.subject",
|
||||
Policy: policyJSON{Encrypt: encrypt, Persist: encrypt, SignMsgs: encrypt},
|
||||
Owner: endpointJSON{Endpoint: "owner-ep", SignPub: []byte("sp"), KexPub: []byte("kp")},
|
||||
SealedKeySelf: []byte("sealed"),
|
||||
})
|
||||
rec := httptest.NewRecorder()
|
||||
srv.ServeHTTP(rec, httptest.NewRequest(http.MethodPost, "/rooms", bytes.NewReader(body)))
|
||||
return rec.Code
|
||||
}
|
||||
|
||||
// Error path: a cleartext room is refused.
|
||||
if code := create(false); code != http.StatusForbidden {
|
||||
t.Fatalf("cleartext room under RequireEncryptedRooms should be 403, got %d", code)
|
||||
}
|
||||
// Golden: an encrypted room is created.
|
||||
if code := create(true); code != http.StatusCreated {
|
||||
t.Fatalf("encrypted room should be 201, got %d", code)
|
||||
}
|
||||
|
||||
// Edge: with the flag OFF (loopback/dev), cleartext rooms are allowed again.
|
||||
srv.RequireEncryptedRooms = false
|
||||
if code := create(false); code != http.StatusCreated {
|
||||
t.Fatalf("cleartext room with the flag off should be 201, got %d", code)
|
||||
}
|
||||
}
|
||||
+350
-23
@@ -1,18 +1,58 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
cs "fn-registry/functions/cybersecurity"
|
||||
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/enmanuel/unibus/pkg/blobstore"
|
||||
"github.com/enmanuel/unibus/pkg/frame"
|
||||
"github.com/nats-io/nats.go/jetstream"
|
||||
)
|
||||
|
||||
// Size ceilings for the control plane. The signature covers the whole body, so
// the body has to be read before a request is authenticated; these limits bound
// how much RAM an unauthenticated peer can make the server spend on that read.
// A request whose declared Content-Length already exceeds its ceiling is
// rejected before any byte is buffered.
const (
	// maxControlBodyBytes is the ceiling for JSON control-plane requests (1 MiB).
	maxControlBodyBytes = 1 << 20

	// maxBlobBytes is the ceiling for one media blob upload on /blobs (16 MiB);
	// it is larger because media ciphertext is legitimately bigger than JSON
	// metadata.
	maxBlobBytes = 16 << 20

	// MaxHeaderBytes caps request header size (1 MiB). The command wires it into
	// the http.Server; it is exported so the bound lives beside the body-size
	// limits.
	MaxHeaderBytes = 1 << 20

	// maxInflightBytes is the GLOBAL cap on request-body bytes buffered across
	// all concurrent requests (audit N2), 128 MiB. The limits above bound a
	// single request; this one bounds their sum, so a concurrent (even multi-IP)
	// flood of max-size uploads cannot grow the resident set without limit. It
	// leaves room for ~8 concurrent 16 MiB blob uploads or ~128 concurrent
	// control requests before further POSTs are shed with 503 — generous for an
	// interactive bus, bounded for an attacker.
	maxInflightBytes = 128 << 20
)
|
||||
|
||||
// Per-IP rate-limit defaults for the control plane. Tuned for an interactive
|
||||
// human/agent bus rather than a high-QPS API: a steady ~20 req/s with a burst of
|
||||
// 40 absorbs a chat client's bursty polling while throttling a flood. Loopback
|
||||
// dev stacks pass r<=0 to disable limiting entirely.
|
||||
const (
|
||||
defaultRatePerSec = rate.Limit(20)
|
||||
defaultRateBurst = 40
|
||||
rateBucketTTL = 10 * time.Minute
|
||||
)
|
||||
|
||||
// Server is the HTTP control plane: the authoritative source of room metadata,
|
||||
@@ -24,20 +64,210 @@ import (
|
||||
// rate limiting, and read endpoints (GET) are unauthenticated. Hardening
|
||||
// (mTLS, capabilities, rate limits) is a later phase.
|
||||
type Server struct {
|
||||
store *Store
|
||||
blobs *blobstore.Store
|
||||
mux *http.ServeMux
|
||||
store Store
|
||||
blobs blobstore.Store
|
||||
mux *http.ServeMux
|
||||
authMode AuthMode
|
||||
nonces nonceStore
|
||||
limiter *ipRateLimiter
|
||||
inflight *inflightLimiter
|
||||
|
||||
// RequireEncryptedRooms, when true, refuses to create cleartext (ModeNATS)
|
||||
// rooms. It is the minimum-defensive control for the data plane (audit H4):
|
||||
// the embedded NATS has no per-subject ACL, so a cleartext room is readable by
|
||||
// any registered peer that knows (or guesses) its subject. Forcing every room
|
||||
// to be end-to-end encrypted keeps message CONTENT confidential even when the
|
||||
// transport offers no subject isolation. The command sets this on a public
|
||||
// (non-loopback) bind. See dev/0004d-dataplane-acl.md for the full rationale
|
||||
// and the residual metadata exposure this does NOT close.
|
||||
RequireEncryptedRooms bool
|
||||
|
||||
// Posture is the node's security posture, surfaced on /healthz so an operator
|
||||
// or a peer can detect a node NOT running the homogeneous enforce+ACL+TLS
|
||||
// posture a secure cluster requires (audit 0008 N1). It is set by the command;
|
||||
// the zero value (all false) reflects an unsecured dev node.
|
||||
Posture Posture
|
||||
}
|
||||
|
||||
// NewServer wires the membership store and blob store into an http.Handler.
|
||||
func NewServer(store *Store, blobs *blobstore.Store) *Server {
|
||||
s := &Server{store: store, blobs: blobs, mux: http.NewServeMux()}
|
||||
// Posture is the security posture a membershipd node is running with. It holds
// no secrets — four booleans and the name of the store backend — and is
// published on /healthz so that a monitor can spot a cluster member that is not
// enforce+ACL+TLS: the weak node through which an unauthenticated peer could
// harvest the cluster's forwarded traffic (audit 0008 N1).
type Posture struct {
	// Enforce is serialized as "enforce".
	Enforce bool `json:"enforce"`
	// ACL is serialized as "acl".
	ACL bool `json:"acl"`
	// TLS is serialized as "tls".
	TLS bool `json:"tls"`
	// Cluster is serialized as "cluster".
	Cluster bool `json:"cluster"`
	// Store names the store backend: "sqlite" | "kv".
	Store string `json:"store"`
}
|
||||
|
||||
// NewServer wires the membership store and blob store into an http.Handler. The
|
||||
// authMode selects the control-plane auth rollout state (AuthOff for callers and
|
||||
// tests that have not migrated to signed requests yet). It installs a per-IP
|
||||
// rate limiter with the package defaults; loopback dev behavior is unchanged
|
||||
// because the burst comfortably exceeds any single client's request rate.
|
||||
func NewServer(store Store, blobs blobstore.Store, authMode AuthMode) *Server {
|
||||
s := &Server{
|
||||
store: store,
|
||||
blobs: blobs,
|
||||
mux: http.NewServeMux(),
|
||||
authMode: authMode,
|
||||
nonces: newMemNonceCache(nonceTTL, maxNonceCacheEntries),
|
||||
limiter: newIPRateLimiter(defaultRatePerSec, defaultRateBurst, rateBucketTTL),
|
||||
inflight: newInflightLimiter(maxInflightBytes),
|
||||
}
|
||||
s.routes()
|
||||
return s
|
||||
}
|
||||
|
||||
// ServeHTTP satisfies http.Handler.
|
||||
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) { s.mux.ServeHTTP(w, r) }
|
||||
// UseReplicatedNonces switches the server's anti-replay store from the
|
||||
// per-process in-memory cache to a JetStream KV bucket shared across the cluster
|
||||
// (issue 0003e). It MUST be called on every node of a multi-node deployment:
|
||||
// otherwise a request captured on one node can be replayed to another whose
|
||||
// local cache never saw the nonce. replicas is the bucket's replication factor
|
||||
// (R1..R3). The TTL matches the in-memory cache (nonceTTL = 2*clockSkew), so a
|
||||
// replay can never outlive its memory.
|
||||
func (s *Server) UseReplicatedNonces(js jetstream.JetStream, replicas int) error {
|
||||
ns, err := newKVNonceStore(js, nonceTTL, replicas, 0)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.nonces = ns
|
||||
return nil
|
||||
}
|
||||
|
||||
// ServeHTTP satisfies http.Handler. It runs the control-plane auth middleware
|
||||
// (signature verification + anti-replay + allowlist) ahead of the router
|
||||
// according to authMode, then dispatches to the matched handler.
|
||||
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
now := time.Now()
|
||||
|
||||
// Per-IP rate limit runs first, ahead of auth and body reads, so a flood is
|
||||
// shed at the cheapest possible point. The health probe is exempt so liveness
|
||||
// checks are never throttled.
|
||||
if !isAuthExempt(r) && !s.limiter.allow(clientIP(r), now) {
|
||||
writeErr(w, http.StatusTooManyRequests, "rate limit exceeded")
|
||||
return
|
||||
}
|
||||
|
||||
// Cap how much body we will buffer, BEFORE reading a single byte. The ceiling
|
||||
// is per-route: /blobs may legitimately carry a media ciphertext, everything
|
||||
// else is small JSON. A declared Content-Length over the ceiling is rejected
|
||||
// outright (no buffering); MaxBytesReader then guards against a lying or
|
||||
// chunked sender by failing the read once the limit is crossed. This is the
|
||||
// fix for the pre-auth DoS: without it an unauthenticated peer could make the
|
||||
// server buffer an unbounded body in RAM before authenticate() ever ran.
|
||||
limit := int64(maxControlBodyBytes)
|
||||
if r.Method == http.MethodPost && r.URL.Path == "/blobs" {
|
||||
limit = int64(maxBlobBytes)
|
||||
}
|
||||
if r.ContentLength > limit {
|
||||
writeErr(w, http.StatusRequestEntityTooLarge, "request body too large")
|
||||
return
|
||||
}
|
||||
r.Body = http.MaxBytesReader(w, r.Body, limit)
|
||||
|
||||
// Aggregate memory bound (audit N2): the per-request ceiling above and the
|
||||
// per-IP rate limit do not cap the TOTAL bytes buffered across concurrent
|
||||
// requests. A POST reserves its worst-case buffered size (its route ceiling)
|
||||
// from a global limiter before the body is read, and is shed with 503 when the
|
||||
// cap is reached, so the resident set stays bounded under a concurrent (even
|
||||
// multi-IP) upload flood instead of growing linearly with the number of
|
||||
// connections. Reservation is released when the request finishes. Only POSTs
|
||||
// buffer a body; GETs carry none, so they do not consume the budget.
|
||||
if r.Method == http.MethodPost {
|
||||
if !s.inflight.tryAcquire(limit) {
|
||||
writeErr(w, http.StatusServiceUnavailable, "server busy: too many concurrent uploads in flight")
|
||||
return
|
||||
}
|
||||
defer s.inflight.release(limit)
|
||||
}
|
||||
|
||||
if s.authMode == AuthOff || isAuthExempt(r) {
|
||||
s.mux.ServeHTTP(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
// Buffer the (now bounded) body so the signature can be verified over it and
|
||||
// the handler still reads it.
|
||||
body, err := io.ReadAll(r.Body)
|
||||
if err != nil {
|
||||
if isBodyTooLarge(err) {
|
||||
writeErr(w, http.StatusRequestEntityTooLarge, "request body too large")
|
||||
return
|
||||
}
|
||||
writeErr(w, http.StatusBadRequest, "read body")
|
||||
return
|
||||
}
|
||||
_ = r.Body.Close()
|
||||
r.Body = io.NopCloser(bytes.NewReader(body))
|
||||
|
||||
res, err := s.authenticate(r, body, now)
|
||||
if err != nil {
|
||||
if s.authMode == AuthSoft {
|
||||
log.Printf("[auth] soft: would reject %s %s: %v", r.Method, r.URL.Path, err)
|
||||
s.mux.ServeHTTP(w, r)
|
||||
return
|
||||
}
|
||||
writeErr(w, http.StatusUnauthorized, "unauthorized: "+err.Error())
|
||||
return
|
||||
}
|
||||
// Carry the authenticated signer's endpoint into the handler so room handlers
|
||||
// can authorize by membership (audit H3). Only set on a verified identity.
|
||||
s.mux.ServeHTTP(w, r.WithContext(withSigner(r.Context(), res.endpoint)))
|
||||
}
|
||||
|
||||
// isBodyTooLarge reports whether err is, or wraps, the *http.MaxBytesError that
// an http.MaxBytesReader yields once the body passes its limit. The middleware
// uses it to answer such reads with 413.
func isBodyTooLarge(err error) bool {
	target := new(*http.MaxBytesError)
	return errors.As(err, target)
}
|
||||
|
||||
// ctxKey is the private type of every request-context key this package sets.
// Being unexported, its values cannot collide with keys from other packages.
type ctxKey int

// ctxSignerEndpoint keys the authenticated signer's endpoint id.
const ctxSignerEndpoint ctxKey = iota

// withSigner derives a context from ctx that carries endpoint as the
// authenticated signer's endpoint id.
func withSigner(ctx context.Context, endpoint string) context.Context {
	return context.WithValue(ctx, ctxSignerEndpoint, endpoint)
}

// signerEndpoint looks up the authenticated signer's endpoint id on r's context.
// The boolean is true only when a non-empty string is stored there. No signer
// is present under AuthOff (nothing is verified) or when a soft-mode request
// was let through unauthenticated; callers skip membership authorization in
// both cases, preserving dev/legacy behavior.
func signerEndpoint(r *http.Request) (string, bool) {
	endpoint, isString := r.Context().Value(ctxSignerEndpoint).(string)
	if !isString || endpoint == "" {
		return endpoint, false
	}
	return endpoint, true
}
|
||||
|
||||
// requireMember authorizes a room request by membership (audit H3): it returns
|
||||
// the signer endpoint and true when the request may proceed, or writes 403 and
|
||||
// returns false when an authenticated signer is not a member of roomID. When no
|
||||
// authenticated signer is present (AuthOff/dev, or soft pass-through) it allows
|
||||
// the request — membership is only enforced once the caller's identity is known.
|
||||
func (s *Server) requireMember(w http.ResponseWriter, r *http.Request, roomID string) (string, bool) {
|
||||
signer, ok := signerEndpoint(r)
|
||||
if !ok {
|
||||
return "", true
|
||||
}
|
||||
if _, err := s.store.GetMember(roomID, signer); err != nil {
|
||||
writeErr(w, http.StatusForbidden, "forbidden: not a member of this room")
|
||||
return signer, false
|
||||
}
|
||||
return signer, true
|
||||
}
|
||||
|
||||
// isAuthExempt reports whether r skips control-plane auth even under enforce.
// The only such request is GET /healthz: the health probe carries no data and
// has to work for load balancers, smoke checks and systemd before any identity
// exists.
func isAuthExempt(r *http.Request) bool {
	if r.Method != http.MethodGet {
		return false
	}
	return r.URL.Path == "/healthz"
}
|
||||
|
||||
func (s *Server) routes() {
|
||||
s.mux.HandleFunc("GET /healthz", s.handleHealth)
|
||||
@@ -45,6 +275,7 @@ func (s *Server) routes() {
|
||||
s.mux.HandleFunc("POST /rooms/{id}/invite", s.handleInvite)
|
||||
s.mux.HandleFunc("GET /rooms/{id}/key", s.handleGetKey)
|
||||
s.mux.HandleFunc("GET /rooms/{id}/members", s.handleListMembers)
|
||||
s.mux.HandleFunc("GET /members/{endpoint}/rooms", s.handleListMemberRooms)
|
||||
s.mux.HandleFunc("POST /rooms/{id}/rekey", s.handleRekey)
|
||||
s.mux.HandleFunc("GET /rooms/{id}", s.handleGetRoom)
|
||||
s.mux.HandleFunc("POST /blobs", s.handlePutBlob)
|
||||
@@ -101,6 +332,14 @@ type roomResp struct {
|
||||
Policy policyJSON `json:"policy"`
|
||||
}
|
||||
|
||||
type memberRoomJSON struct {
|
||||
RoomID string `json:"room_id"`
|
||||
Subject string `json:"subject"`
|
||||
Epoch int `json:"epoch"`
|
||||
Policy policyJSON `json:"policy"`
|
||||
Role string `json:"role"`
|
||||
}
|
||||
|
||||
type rekeyKey struct {
|
||||
Endpoint string `json:"endpoint"`
|
||||
SealedKey []byte `json:"sealed_key"`
|
||||
@@ -130,6 +369,15 @@ func writeErr(w http.ResponseWriter, code int, msg string) {
|
||||
writeJSON(w, code, map[string]string{"error": msg})
|
||||
}
|
||||
|
||||
// writeServerErr logs the internal error detail and returns ONLY a generic
|
||||
// message to the client (audit H12): raw store/blob errors embed SQL fragments
|
||||
// and filesystem paths, which must not leak to a caller. Use it for any error
|
||||
// that originates inside the server (5xx, or a not-found wrapping a store error).
|
||||
func writeServerErr(w http.ResponseWriter, r *http.Request, code int, publicMsg string, err error) {
|
||||
log.Printf("[handler] %s %s -> %d: %v", r.Method, r.URL.Path, code, err)
|
||||
writeErr(w, code, publicMsg)
|
||||
}
|
||||
|
||||
// canonicalSig returns the bytes to verify for a request: the request struct
|
||||
// re-marshaled with its Sig field cleared. The caller passes a copy with Sig
|
||||
// already zeroed. This is symmetric with how the client signs.
|
||||
@@ -161,7 +409,7 @@ func (s *Server) verifyOwnerSig(roomID, by string, sig, canonical []byte) (Membe
|
||||
// ---- handlers -------------------------------------------------------------
|
||||
|
||||
func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
|
||||
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
|
||||
writeJSON(w, http.StatusOK, map[string]any{"status": "ok", "posture": s.Posture})
|
||||
}
|
||||
|
||||
func (s *Server) handleCreateRoom(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -174,6 +422,24 @@ func (s *Server) handleCreateRoom(w http.ResponseWriter, r *http.Request) {
|
||||
writeErr(w, http.StatusBadRequest, "subject and owner.endpoint required")
|
||||
return
|
||||
}
|
||||
// Data-plane minimum defense (audit H4): on a public deployment cleartext
|
||||
// rooms are disabled, so no message ever rides the un-ACL'd NATS subject in
|
||||
// the clear for another registered peer to sniff.
|
||||
if s.RequireEncryptedRooms && !req.Policy.Encrypt {
|
||||
writeErr(w, http.StatusForbidden,
|
||||
"cleartext rooms are disabled on this deployment; create an encrypted (Matrix-policy) room")
|
||||
return
|
||||
}
|
||||
// Owner binding (audit H6): the declared owner must BE the authenticated
|
||||
// signer — both the endpoint id and the signing key. Otherwise a registered
|
||||
// peer could create rooms in another identity's name. Enforced only when an
|
||||
// authenticated signer is present (AuthOff/dev trusts the caller).
|
||||
if signer, ok := signerEndpoint(r); ok {
|
||||
if req.Owner.Endpoint != signer || frame.EndpointID(req.Owner.SignPub) != signer {
|
||||
writeErr(w, http.StatusForbidden, "forbidden: room owner must be the authenticated signer")
|
||||
return
|
||||
}
|
||||
}
|
||||
roomID := newULID()
|
||||
info := RoomInfo{
|
||||
RoomID: roomID,
|
||||
@@ -184,7 +450,7 @@ func (s *Server) handleCreateRoom(w http.ResponseWriter, r *http.Request) {
|
||||
OwnerEndpoint: req.Owner.Endpoint,
|
||||
}
|
||||
if err := s.store.CreateRoom(info, req.Owner.SignPub, req.Owner.KexPub, req.SealedKeySelf); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
writeServerErr(w, r, http.StatusInternalServerError, "internal error", err)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusCreated, createRoomResp{RoomID: roomID})
|
||||
@@ -206,7 +472,7 @@ func (s *Server) handleInvite(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
info, err := s.store.GetRoom(roomID)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusNotFound, err.Error())
|
||||
writeServerErr(w, r, http.StatusNotFound, "room not found", err)
|
||||
return
|
||||
}
|
||||
m := Member{
|
||||
@@ -216,7 +482,7 @@ func (s *Server) handleInvite(w http.ResponseWriter, r *http.Request) {
|
||||
KexPub: req.Member.KexPub,
|
||||
}
|
||||
if err := s.store.AddMember(roomID, m, info.Epoch, req.SealedKey); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
writeServerErr(w, r, http.StatusInternalServerError, "internal error", err)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]string{"status": "invited"})
|
||||
@@ -229,6 +495,20 @@ func (s *Server) handleGetKey(w http.ResponseWriter, r *http.Request) {
|
||||
writeErr(w, http.StatusBadRequest, "endpoint query param required")
|
||||
return
|
||||
}
|
||||
// A sealed room key is sealed to one identity's X25519 key. Serving it only to
|
||||
// that identity (the signer) stops a registered peer from harvesting another
|
||||
// member's sealed key (audit H3). Membership is implied by owning a sealed key,
|
||||
// but we also require the signer to be a member for defense in depth.
|
||||
if signer, ok := signerEndpoint(r); ok {
|
||||
if endpoint != signer {
|
||||
writeErr(w, http.StatusForbidden, "forbidden: may only fetch your own sealed key")
|
||||
return
|
||||
}
|
||||
if _, err := s.store.GetMember(roomID, signer); err != nil {
|
||||
writeErr(w, http.StatusForbidden, "forbidden: not a member of this room")
|
||||
return
|
||||
}
|
||||
}
|
||||
epoch := 0
|
||||
if e := r.URL.Query().Get("epoch"); e != "" {
|
||||
if n, err := strconv.Atoi(e); err == nil {
|
||||
@@ -237,12 +517,12 @@ func (s *Server) handleGetKey(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
ep, sealed, err := s.store.GetSealedKey(roomID, endpoint, epoch)
|
||||
if err != nil {
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
if errors.Is(err, ErrNotFound) {
|
||||
writeErr(w, http.StatusForbidden,
|
||||
"not invited to this encrypted room: no key has been sealed for your identity. Ask the room owner to invite you before joining.")
|
||||
return
|
||||
}
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
writeServerErr(w, r, http.StatusInternalServerError, "internal error", err)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, keyResp{Epoch: ep, SealedKey: sealed})
|
||||
@@ -250,9 +530,14 @@ func (s *Server) handleGetKey(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
func (s *Server) handleListMembers(w http.ResponseWriter, r *http.Request) {
|
||||
roomID := r.PathValue("id")
|
||||
// Membership authorization (audit H3): the member list exposes every member's
|
||||
// sign_pub + kex_pub, so it must not be served to a non-member.
|
||||
if _, ok := s.requireMember(w, r, roomID); !ok {
|
||||
return
|
||||
}
|
||||
members, err := s.store.ListMembers(roomID)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
writeErr(w, http.StatusInternalServerError, "internal error")
|
||||
return
|
||||
}
|
||||
out := make([]memberJSON, 0, len(members))
|
||||
@@ -262,11 +547,44 @@ func (s *Server) handleListMembers(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
func (s *Server) handleListMemberRooms(w http.ResponseWriter, r *http.Request) {
|
||||
endpoint := r.PathValue("endpoint")
|
||||
if endpoint == "" {
|
||||
writeErr(w, http.StatusBadRequest, "endpoint required")
|
||||
return
|
||||
}
|
||||
// A peer may only enumerate its OWN room directory (audit H3): otherwise any
|
||||
// registered identity could map another's entire social graph of rooms.
|
||||
if signer, ok := signerEndpoint(r); ok && endpoint != signer {
|
||||
writeErr(w, http.StatusForbidden, "forbidden: may only list your own rooms")
|
||||
return
|
||||
}
|
||||
rooms, err := s.store.ListRoomsForEndpoint(endpoint)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, "internal error")
|
||||
return
|
||||
}
|
||||
out := make([]memberRoomJSON, 0, len(rooms))
|
||||
for _, rm := range rooms {
|
||||
out = append(out, memberRoomJSON{
|
||||
RoomID: rm.RoomID,
|
||||
Subject: rm.Subject,
|
||||
Epoch: rm.Epoch,
|
||||
Policy: policyJSON{Encrypt: rm.Encrypt, Persist: rm.Persist, SignMsgs: rm.SignMsgs},
|
||||
Role: rm.Role,
|
||||
})
|
||||
}
|
||||
writeJSON(w, http.StatusOK, out)
|
||||
}
|
||||
|
||||
func (s *Server) handleGetRoom(w http.ResponseWriter, r *http.Request) {
|
||||
roomID := r.PathValue("id")
|
||||
if _, ok := s.requireMember(w, r, roomID); !ok {
|
||||
return
|
||||
}
|
||||
info, err := s.store.GetRoom(roomID)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusNotFound, err.Error())
|
||||
writeErr(w, http.StatusNotFound, "room not found")
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, roomResp{
|
||||
@@ -296,7 +614,7 @@ func (s *Server) handleRekey(w http.ResponseWriter, r *http.Request) {
|
||||
// Bump epoch, then store the fresh sealed keys for the remaining members,
|
||||
// then remove the kicked/left members.
|
||||
if err := s.store.BumpEpoch(roomID, req.NewEpoch); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
writeServerErr(w, r, http.StatusInternalServerError, "internal error", err)
|
||||
return
|
||||
}
|
||||
keys := make(map[string][]byte, len(req.Keys))
|
||||
@@ -305,13 +623,13 @@ func (s *Server) handleRekey(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
if len(keys) > 0 {
|
||||
if err := s.store.PutSealedKeys(roomID, req.NewEpoch, keys); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
writeServerErr(w, r, http.StatusInternalServerError, "internal error", err)
|
||||
return
|
||||
}
|
||||
}
|
||||
for _, ep := range req.Remove {
|
||||
if err := s.store.RemoveMember(roomID, ep); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
writeServerErr(w, r, http.StatusInternalServerError, "internal error", err)
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -319,14 +637,23 @@ func (s *Server) handleRekey(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
func (s *Server) handlePutBlob(w http.ResponseWriter, r *http.Request) {
|
||||
// The body arrives already bounded: ServeHTTP wraps it in a MaxBytesReader
|
||||
// (maxBlobBytes) and rejects an over-declared Content-Length before this
|
||||
// handler runs, in every auth mode. Reading here therefore cannot buffer
|
||||
// more than the ceiling; a sender that lies about its length (e.g. chunked)
|
||||
// trips MaxBytesReader and we map that to 413 rather than a generic 400.
|
||||
data, err := io.ReadAll(r.Body)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusBadRequest, "read body: "+err.Error())
|
||||
if isBodyTooLarge(err) {
|
||||
writeErr(w, http.StatusRequestEntityTooLarge, "request body too large")
|
||||
return
|
||||
}
|
||||
writeErr(w, http.StatusBadRequest, "read body")
|
||||
return
|
||||
}
|
||||
hash, err := s.blobs.Put(data)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
writeServerErr(w, r, http.StatusInternalServerError, "internal error", err)
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, blobResp{Hash: hash})
|
||||
@@ -340,7 +667,7 @@ func (s *Server) handleGetBlob(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
data, err := s.blobs.Get(hash)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusNotFound, err.Error())
|
||||
writeServerErr(w, r, http.StatusNotFound, "not found", err)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/octet-stream")
|
||||
|
||||
+112
-17
@@ -13,6 +13,7 @@ package membership
|
||||
import (
|
||||
"database/sql"
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"sort"
|
||||
@@ -26,6 +27,14 @@ import (
|
||||
//go:embed migrations/*.sql
|
||||
var migrationsFS embed.FS
|
||||
|
||||
// ErrNotFound is the store-agnostic "no such record" sentinel. Both backends
|
||||
// (SQLite and JetStream KV) return it, wrapped, when a lookup misses, so callers
|
||||
// distinguish "not invited / no key yet" from a genuine backend failure without
|
||||
// depending on a specific driver's error (the SQLite store maps sql.ErrNoRows to
|
||||
// it; the KV store maps a missing key to it). This is what lets the control
|
||||
// plane stay storage-agnostic under the branch-by-abstraction of issue 0003b.
|
||||
var ErrNotFound = errors.New("membership: not found")
|
||||
|
||||
// Member is a participant of a room with their published public keys.
|
||||
type Member struct {
|
||||
Endpoint string `json:"endpoint"`
|
||||
@@ -45,14 +54,58 @@ type RoomInfo struct {
|
||||
OwnerEndpoint string
|
||||
}
|
||||
|
||||
// Store is the SQLite-backed membership/key store.
|
||||
type Store struct {
|
||||
// Store is the membership/key control-plane store: the authoritative source of
|
||||
// room metadata, the member directory, per-epoch sealed room keys, and the bus
|
||||
// user allowlist. It is an interface (branch-by-abstraction, issue 0003b) with
|
||||
// two implementations: sqliteStore (the default, single-node, local SQLite) and
|
||||
// jetstreamStore (rooms/members/keys/users on replicated JetStream KV, selected
|
||||
// when the `decentralized` flag is on). Every lookup miss returns ErrNotFound
|
||||
// (wrapped); every implementation MUST fail closed (IsAuthorized returns false
|
||||
// on any backend error), so a KV quorum loss denies rather than admits.
|
||||
type Store interface {
|
||||
// Rooms / members / keys.
|
||||
CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealedKey []byte) error
|
||||
GetRoom(roomID string) (RoomInfo, error)
|
||||
AddMember(roomID string, m Member, epoch int, sealedKey []byte) error
|
||||
GetMember(roomID, endpoint string) (Member, error)
|
||||
ListMembers(roomID string) ([]Member, error)
|
||||
ListRoomsForEndpoint(endpoint string) ([]RoomMembership, error)
|
||||
GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, error)
|
||||
PutSealedKeys(roomID string, epoch int, keys map[string][]byte) error
|
||||
BumpEpoch(roomID string, newEpoch int) error
|
||||
RemoveMember(roomID, endpoint string) error
|
||||
|
||||
// Users (the bus allowlist).
|
||||
AddUser(signPub, handle, role string) error
|
||||
GetUser(signPub string) (User, error)
|
||||
ListUsers() ([]User, error)
|
||||
RevokeUser(signPub string) error
|
||||
IsAuthorized(signPub string) bool
|
||||
HasAdmin() bool
|
||||
|
||||
// Lifecycle.
|
||||
Close() error
|
||||
}
|
||||
|
||||
// sqliteStore is the SQLite-backed implementation of Store (the default,
|
||||
// single-node backend). It stays the production default while the
|
||||
// `decentralized` flag is off.
|
||||
type sqliteStore struct {
|
||||
db *sql.DB
|
||||
}
|
||||
|
||||
// Open opens (creating if needed) the SQLite database at path and applies all
|
||||
// embedded migrations idempotently.
|
||||
func Open(path string) (*Store, error) {
|
||||
// Open opens (creating if needed) the SQLite database at path, applies all
|
||||
// embedded migrations idempotently, and returns it as a Store. It remains the
|
||||
// default control-plane backend; the JetStream KV store is opened separately
|
||||
// (OpenJetStream) when decentralization is enabled.
|
||||
func Open(path string) (Store, error) {
|
||||
return openSQLite(path)
|
||||
}
|
||||
|
||||
// openSQLite is the concrete constructor, returning *sqliteStore so internal
|
||||
// callers (e.g. the SQLite->KV migration) can use SQLite-specific helpers that
|
||||
// are not part of the storage-agnostic Store interface.
|
||||
func openSQLite(path string) (*sqliteStore, error) {
|
||||
// _pragma busy_timeout avoids spurious "database is locked" under concurrent
|
||||
// HTTP handlers; foreign_keys kept off — we manage referential integrity in code.
|
||||
dsn := fmt.Sprintf("file:%s?_pragma=busy_timeout(5000)&_pragma=journal_mode(WAL)", path)
|
||||
@@ -64,7 +117,7 @@ func Open(path string) (*Store, error) {
|
||||
db.Close()
|
||||
return nil, fmt.Errorf("membership: ping db: %w", err)
|
||||
}
|
||||
s := &Store{db: db}
|
||||
s := &sqliteStore{db: db}
|
||||
if err := s.applyMigrations(); err != nil {
|
||||
db.Close()
|
||||
return nil, err
|
||||
@@ -73,11 +126,11 @@ func Open(path string) (*Store, error) {
|
||||
}
|
||||
|
||||
// Close closes the underlying database.
|
||||
func (s *Store) Close() error { return s.db.Close() }
|
||||
func (s *sqliteStore) Close() error { return s.db.Close() }
|
||||
|
||||
// applyMigrations runs every embedded migration in lexical order, tolerating
|
||||
// the "already applied" errors that SQLite's non-idempotent DDL produces.
|
||||
func (s *Store) applyMigrations() error {
|
||||
func (s *sqliteStore) applyMigrations() error {
|
||||
files, err := fs.Glob(migrationsFS, "migrations/*.sql")
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: glob migrations: %w", err)
|
||||
@@ -103,7 +156,7 @@ func nowRFC3339() string { return time.Now().UTC().Format(time.RFC3339Nano) }
|
||||
// CreateRoom inserts a room at epoch 1, registers the owner as a member with
|
||||
// role "owner", and stores the owner's sealed key for epoch 1. Idempotent
|
||||
// inserts are not used: a duplicate room_id returns an error.
|
||||
func (s *Store) CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealedKey []byte) error {
|
||||
func (s *sqliteStore) CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealedKey []byte) error {
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: begin: %w", err)
|
||||
@@ -142,7 +195,7 @@ func (s *Store) CreateRoom(info RoomInfo, ownerSignPub, ownerKexPub, ownerSealed
|
||||
}
|
||||
|
||||
// GetRoom returns room metadata (including current epoch).
|
||||
func (s *Store) GetRoom(roomID string) (RoomInfo, error) {
|
||||
func (s *sqliteStore) GetRoom(roomID string) (RoomInfo, error) {
|
||||
var info RoomInfo
|
||||
var enc, per, sgn int
|
||||
err := s.db.QueryRow(
|
||||
@@ -158,7 +211,7 @@ func (s *Store) GetRoom(roomID string) (RoomInfo, error) {
|
||||
|
||||
// AddMember inserts a member at the given role and stores their sealed key for
|
||||
// the supplied epoch.
|
||||
func (s *Store) AddMember(roomID string, m Member, epoch int, sealedKey []byte) error {
|
||||
func (s *sqliteStore) AddMember(roomID string, m Member, epoch int, sealedKey []byte) error {
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: begin: %w", err)
|
||||
@@ -185,7 +238,7 @@ func (s *Store) AddMember(roomID string, m Member, epoch int, sealedKey []byte)
|
||||
}
|
||||
|
||||
// GetMember returns a single member of a room.
|
||||
func (s *Store) GetMember(roomID, endpoint string) (Member, error) {
|
||||
func (s *sqliteStore) GetMember(roomID, endpoint string) (Member, error) {
|
||||
var m Member
|
||||
err := s.db.QueryRow(
|
||||
`SELECT endpoint, role, sign_pub, kex_pub FROM members WHERE room_id = ? AND endpoint = ?`,
|
||||
@@ -198,7 +251,7 @@ func (s *Store) GetMember(roomID, endpoint string) (Member, error) {
|
||||
}
|
||||
|
||||
// ListMembers returns all members of a room ordered by endpoint.
|
||||
func (s *Store) ListMembers(roomID string) ([]Member, error) {
|
||||
func (s *sqliteStore) ListMembers(roomID string) ([]Member, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT endpoint, role, sign_pub, kex_pub FROM members WHERE room_id = ? ORDER BY endpoint`,
|
||||
roomID,
|
||||
@@ -219,9 +272,45 @@ func (s *Store) ListMembers(roomID string) ([]Member, error) {
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// RoomMembership is a room an endpoint belongs to, with that endpoint's role.
|
||||
// It is the per-endpoint view used for room discovery (a peer asking "which
|
||||
// rooms am I in?") so a freshly-invited member can find and join its rooms.
|
||||
type RoomMembership struct {
|
||||
RoomInfo
|
||||
Role string
|
||||
}
|
||||
|
||||
// ListRoomsForEndpoint returns every room the given endpoint is a member of,
|
||||
// with the room's current metadata and the endpoint's role, ordered by room id.
|
||||
// An endpoint that is in no rooms yields an empty slice (not an error).
|
||||
func (s *sqliteStore) ListRoomsForEndpoint(endpoint string) ([]RoomMembership, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT r.room_id, r.subject, r.key_epoch, r.encrypt, r.persist, r.sign_msgs, r.owner_endpoint, m.role
|
||||
FROM members m JOIN rooms r ON r.room_id = m.room_id
|
||||
WHERE m.endpoint = ? ORDER BY r.room_id`,
|
||||
endpoint,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: list rooms for endpoint %q: %w", endpoint, err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var out []RoomMembership
|
||||
for rows.Next() {
|
||||
var rm RoomMembership
|
||||
var enc, per, sgn int
|
||||
if err := rows.Scan(&rm.RoomID, &rm.Subject, &rm.Epoch, &enc, &per, &sgn, &rm.OwnerEndpoint, &rm.Role); err != nil {
|
||||
return nil, fmt.Errorf("membership: scan room membership: %w", err)
|
||||
}
|
||||
rm.Encrypt, rm.Persist, rm.SignMsgs = enc != 0, per != 0, sgn != 0
|
||||
out = append(out, rm)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// GetSealedKey returns the sealed room key for an endpoint at a given epoch.
|
||||
// If epoch <= 0, the latest epoch for that endpoint is returned.
|
||||
func (s *Store) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, error) {
|
||||
func (s *sqliteStore) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, error) {
|
||||
var ep int
|
||||
var sealed []byte
|
||||
var err error
|
||||
@@ -239,6 +328,12 @@ func (s *Store) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, e
|
||||
).Scan(&ep, &sealed)
|
||||
}
|
||||
if err != nil {
|
||||
// Map "no such row" to the store-agnostic sentinel so the control plane
|
||||
// can tell "not invited / no key yet" (-> 403 with a helpful message) from
|
||||
// a genuine backend failure, the same way the KV store will.
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return 0, nil, fmt.Errorf("membership: get sealed key %q/%q@%d: %w", roomID, endpoint, epoch, ErrNotFound)
|
||||
}
|
||||
return 0, nil, fmt.Errorf("membership: get sealed key %q/%q@%d: %w", roomID, endpoint, epoch, err)
|
||||
}
|
||||
return ep, sealed, nil
|
||||
@@ -246,7 +341,7 @@ func (s *Store) GetSealedKey(roomID, endpoint string, epoch int) (int, []byte, e
|
||||
|
||||
// PutSealedKeys stores a batch of sealed keys for the given epoch (endpoint ->
|
||||
// sealed bytes), upserting on conflict so a rekey can overwrite stale entries.
|
||||
func (s *Store) PutSealedKeys(roomID string, epoch int, keys map[string][]byte) error {
|
||||
func (s *sqliteStore) PutSealedKeys(roomID string, epoch int, keys map[string][]byte) error {
|
||||
tx, err := s.db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: begin: %w", err)
|
||||
@@ -265,7 +360,7 @@ func (s *Store) PutSealedKeys(roomID string, epoch int, keys map[string][]byte)
|
||||
}
|
||||
|
||||
// BumpEpoch sets the room's current key_epoch to newEpoch.
|
||||
func (s *Store) BumpEpoch(roomID string, newEpoch int) error {
|
||||
func (s *sqliteStore) BumpEpoch(roomID string, newEpoch int) error {
|
||||
if _, err := s.db.Exec(`UPDATE rooms SET key_epoch = ? WHERE room_id = ?`, newEpoch, roomID); err != nil {
|
||||
return fmt.Errorf("membership: bump epoch %q->%d: %w", roomID, newEpoch, err)
|
||||
}
|
||||
@@ -274,7 +369,7 @@ func (s *Store) BumpEpoch(roomID string, newEpoch int) error {
|
||||
|
||||
// RemoveMember deletes a member from a room. Their sealed keys for past epochs
|
||||
// are left intact (they encrypt only data that member could already read).
|
||||
func (s *Store) RemoveMember(roomID, endpoint string) error {
|
||||
func (s *sqliteStore) RemoveMember(roomID, endpoint string) error {
|
||||
if _, err := s.db.Exec(`DELETE FROM members WHERE room_id = ? AND endpoint = ?`, roomID, endpoint); err != nil {
|
||||
return fmt.Errorf("membership: remove member %q/%q: %w", roomID, endpoint, err)
|
||||
}
|
||||
|
||||
@@ -6,10 +6,10 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func openTestStore(t *testing.T) *Store {
|
||||
func openTestStore(t *testing.T) *sqliteStore {
|
||||
t.Helper()
|
||||
path := filepath.Join(t.TempDir(), "test.db")
|
||||
s, err := Open(path)
|
||||
s, err := openSQLite(path)
|
||||
if err != nil {
|
||||
t.Fatalf("Open: %v", err)
|
||||
}
|
||||
@@ -35,6 +35,58 @@ func TestMigrationsCreateSchema(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestListRoomsForEndpoint(t *testing.T) {
|
||||
s := openTestStore(t)
|
||||
|
||||
// Owner of two rooms; a member in only the first.
|
||||
owner, member := "owner-ep", "member-ep"
|
||||
mk := func(id, subj string) RoomInfo {
|
||||
return RoomInfo{RoomID: id, Subject: subj, Encrypt: true, Persist: true, SignMsgs: true, OwnerEndpoint: owner}
|
||||
}
|
||||
if err := s.CreateRoom(mk("room-a", "room.a"), []byte("os"), []byte("ok"), []byte("k")); err != nil {
|
||||
t.Fatalf("CreateRoom a: %v", err)
|
||||
}
|
||||
if err := s.CreateRoom(mk("room-b", "room.b"), []byte("os"), []byte("ok"), []byte("k")); err != nil {
|
||||
t.Fatalf("CreateRoom b: %v", err)
|
||||
}
|
||||
if err := s.AddMember("room-a", Member{Endpoint: member, Role: "member", SignPub: []byte("s"), KexPub: []byte("k")}, 1, []byte("mk")); err != nil {
|
||||
t.Fatalf("AddMember: %v", err)
|
||||
}
|
||||
|
||||
// Owner is in both rooms, as owner, ordered by room id.
|
||||
ownerRooms, err := s.ListRoomsForEndpoint(owner)
|
||||
if err != nil {
|
||||
t.Fatalf("ListRoomsForEndpoint owner: %v", err)
|
||||
}
|
||||
if len(ownerRooms) != 2 {
|
||||
t.Fatalf("owner: expected 2 rooms, got %d", len(ownerRooms))
|
||||
}
|
||||
if ownerRooms[0].RoomID != "room-a" || ownerRooms[1].RoomID != "room-b" {
|
||||
t.Fatalf("owner rooms not ordered: %+v", ownerRooms)
|
||||
}
|
||||
if ownerRooms[0].Role != "owner" || !ownerRooms[0].Encrypt || ownerRooms[0].Subject != "room.a" {
|
||||
t.Fatalf("owner room metadata wrong: %+v", ownerRooms[0])
|
||||
}
|
||||
|
||||
// Member is in exactly one room, as member.
|
||||
memberRooms, err := s.ListRoomsForEndpoint(member)
|
||||
if err != nil {
|
||||
t.Fatalf("ListRoomsForEndpoint member: %v", err)
|
||||
}
|
||||
if len(memberRooms) != 1 || memberRooms[0].RoomID != "room-a" || memberRooms[0].Role != "member" {
|
||||
t.Fatalf("member rooms wrong: %+v", memberRooms)
|
||||
}
|
||||
|
||||
// An unknown endpoint yields an empty slice, not an error.
|
||||
none, err := s.ListRoomsForEndpoint("nobody")
|
||||
if err != nil {
|
||||
t.Fatalf("ListRoomsForEndpoint nobody: %v", err)
|
||||
}
|
||||
if len(none) != 0 {
|
||||
t.Fatalf("expected no rooms for unknown endpoint, got %+v", none)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRoomMemberKeyRoundTrip(t *testing.T) {
|
||||
s := openTestStore(t)
|
||||
|
||||
|
||||
@@ -0,0 +1,164 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// User roles and statuses. They are stored as free text in the users table so
|
||||
// new values can be introduced without a schema change; these constants name
|
||||
// the ones the code reasons about today.
|
||||
const (
|
||||
RoleAdmin = "admin"
|
||||
RoleMember = "member"
|
||||
StatusActive = "active"
|
||||
StatusRevoked = "revoked"
|
||||
)
|
||||
|
||||
// ErrUserExists is returned by AddUser when a user with the same sign_pub is
|
||||
// already registered. Callers that want upsert semantics should branch on it.
|
||||
var ErrUserExists = errors.New("membership: user already exists")
|
||||
|
||||
// User is one entry of the bus allowlist. It is keyed by the Ed25519 signing
// public key that authenticates a peer on both the control plane (request
// signatures) and the data plane (NATS nkey), and carries the peer's role and
// revocation status. SignPub is the lowercase hex of the 32-byte Ed25519 public
// key — the same key that derives the endpoint id via frame.EndpointID.
type User struct {
	// SignPub is the Ed25519 public key, lowercase hex.
	SignPub string
	// Handle is the name the user was registered under.
	Handle string
	// Role is RoleAdmin or RoleMember.
	Role string
	// Status is StatusActive or StatusRevoked.
	Status string
	// CreatedAt is the timestamp recorded when the user was added.
	CreatedAt string
	// RevokedAt is the revocation timestamp; empty unless the user is revoked.
	RevokedAt string
}
|
||||
|
||||
// normalizeSignPub canonicalizes a hex-encoded signing key for storage and
// lookup: surrounding whitespace is stripped and the remainder is lowercased.
// The primary key is stored in this form and every query normalizes its input
// the same way, so a caller passing uppercase hex still matches.
func normalizeSignPub(signPub string) string {
	trimmed := strings.TrimSpace(signPub)
	return strings.ToLower(trimmed)
}
|
||||
|
||||
// AddUser inserts a new bus user. role defaults to RoleMember when empty. It
|
||||
// returns ErrUserExists if the sign_pub is already registered (the caller may
|
||||
// choose to revoke+re-add or ignore). handle and signPub must be non-empty.
|
||||
func (s *sqliteStore) AddUser(signPub, handle, role string) error {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
if signPub == "" || handle == "" {
|
||||
return fmt.Errorf("membership: AddUser: sign_pub and handle required")
|
||||
}
|
||||
if role == "" {
|
||||
role = RoleMember
|
||||
}
|
||||
if role != RoleAdmin && role != RoleMember {
|
||||
return fmt.Errorf("membership: AddUser: invalid role %q (want %q or %q)", role, RoleAdmin, RoleMember)
|
||||
}
|
||||
_, err := s.db.Exec(
|
||||
`INSERT INTO users (sign_pub, handle, role, status, created_at) VALUES (?, ?, ?, ?, ?)`,
|
||||
signPub, handle, role, StatusActive, nowRFC3339(),
|
||||
)
|
||||
if err != nil {
|
||||
// modernc.org/sqlite surfaces a UNIQUE/PRIMARY KEY violation as a message
|
||||
// containing "UNIQUE constraint failed"; translate it into a typed error so
|
||||
// callers do not have to string-match.
|
||||
if strings.Contains(err.Error(), "UNIQUE constraint") || strings.Contains(err.Error(), "PRIMARY KEY") {
|
||||
return ErrUserExists
|
||||
}
|
||||
return fmt.Errorf("membership: insert user: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetUser returns the user with the given signing public key. It returns
|
||||
// sql.ErrNoRows (wrapped) when there is no such user.
|
||||
func (s *sqliteStore) GetUser(signPub string) (User, error) {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
var u User
|
||||
var revoked sql.NullString
|
||||
err := s.db.QueryRow(
|
||||
`SELECT sign_pub, handle, role, status, created_at, revoked_at FROM users WHERE sign_pub = ?`,
|
||||
signPub,
|
||||
).Scan(&u.SignPub, &u.Handle, &u.Role, &u.Status, &u.CreatedAt, &revoked)
|
||||
if err != nil {
|
||||
return User{}, fmt.Errorf("membership: get user %q: %w", signPub, err)
|
||||
}
|
||||
u.RevokedAt = revoked.String
|
||||
return u, nil
|
||||
}
|
||||
|
||||
// ListUsers returns every user ordered by handle then sign_pub (stable output).
|
||||
func (s *sqliteStore) ListUsers() ([]User, error) {
|
||||
rows, err := s.db.Query(
|
||||
`SELECT sign_pub, handle, role, status, created_at, revoked_at FROM users ORDER BY handle, sign_pub`,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("membership: list users: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var out []User
|
||||
for rows.Next() {
|
||||
var u User
|
||||
var revoked sql.NullString
|
||||
if err := rows.Scan(&u.SignPub, &u.Handle, &u.Role, &u.Status, &u.CreatedAt, &revoked); err != nil {
|
||||
return nil, fmt.Errorf("membership: scan user: %w", err)
|
||||
}
|
||||
u.RevokedAt = revoked.String
|
||||
out = append(out, u)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// RevokeUser marks a user as revoked and stamps revoked_at. Revocation is a
|
||||
// status flip (not a delete) so the identity stays auditable and IsAuthorized
|
||||
// immediately denies it on both planes. Revoking an unknown or already-revoked
|
||||
// user returns an error / is a no-op respectively.
|
||||
func (s *sqliteStore) RevokeUser(signPub string) error {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
res, err := s.db.Exec(
|
||||
`UPDATE users SET status = ?, revoked_at = ? WHERE sign_pub = ? AND status = ?`,
|
||||
StatusRevoked, nowRFC3339(), signPub, StatusActive,
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: revoke user %q: %w", signPub, err)
|
||||
}
|
||||
n, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
return fmt.Errorf("membership: revoke user %q: rows affected: %w", signPub, err)
|
||||
}
|
||||
if n == 0 {
|
||||
return fmt.Errorf("membership: revoke user %q: no active user with that key", signPub)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsAuthorized reports whether signPub belongs to an active (non-revoked) bus
|
||||
// user. It is the single authorization predicate consulted by both the control
|
||||
// plane (HTTP request middleware) and the data plane (NATS nkey authenticator),
|
||||
// so revoking a user denies access on both without restarting anything. An
|
||||
// unknown key, a revoked key, or any query error all yield false (fail closed).
|
||||
func (s *sqliteStore) IsAuthorized(signPub string) bool {
|
||||
signPub = normalizeSignPub(signPub)
|
||||
if signPub == "" {
|
||||
return false
|
||||
}
|
||||
var one int
|
||||
err := s.db.QueryRow(
|
||||
`SELECT 1 FROM users WHERE sign_pub = ? AND status = ?`, signPub, StatusActive,
|
||||
).Scan(&one)
|
||||
return err == nil && one == 1
|
||||
}
|
||||
|
||||
// HasAdmin reports whether at least one active admin exists. The control plane
|
||||
// uses it to gate user-management endpoints: until the host operator seeds the
|
||||
// first admin via the local CLI, those endpoints stay closed (chicken-egg).
|
||||
func (s *sqliteStore) HasAdmin() bool {
|
||||
var one int
|
||||
err := s.db.QueryRow(
|
||||
`SELECT 1 FROM users WHERE role = ? AND status = ? LIMIT 1`, RoleAdmin, StatusActive,
|
||||
).Scan(&one)
|
||||
return err == nil && one == 1
|
||||
}
|
||||
@@ -0,0 +1,164 @@
|
||||
package membership
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// a valid-shape Ed25519 public key in hex (64 hex chars). The bytes are
|
||||
// arbitrary: the store treats sign_pub as an opaque identifier and only the CLI
|
||||
// validates the length, so any 64-hex string round-trips through the store.
|
||||
const (
|
||||
pubAlice = "1111111111111111111111111111111111111111111111111111111111111111"
|
||||
pubBob = "2222222222222222222222222222222222222222222222222222222222222222"
|
||||
)
|
||||
|
||||
// Golden: add a user, read it back, and confirm it authorizes.
|
||||
func TestAddGetIsAuthorized(t *testing.T) {
|
||||
s := openTestStore(t)
|
||||
|
||||
if err := s.AddUser(pubAlice, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("AddUser: %v", err)
|
||||
}
|
||||
u, err := s.GetUser(pubAlice)
|
||||
if err != nil {
|
||||
t.Fatalf("GetUser: %v", err)
|
||||
}
|
||||
if u.Handle != "alice" || u.Role != RoleAdmin || u.Status != StatusActive {
|
||||
t.Fatalf("GetUser mismatch: %+v", u)
|
||||
}
|
||||
if u.CreatedAt == "" {
|
||||
t.Fatalf("CreatedAt not stamped")
|
||||
}
|
||||
if u.RevokedAt != "" {
|
||||
t.Fatalf("RevokedAt should be empty for an active user, got %q", u.RevokedAt)
|
||||
}
|
||||
if !s.IsAuthorized(pubAlice) {
|
||||
t.Fatalf("active user should be authorized")
|
||||
}
|
||||
if !s.HasAdmin() {
|
||||
t.Fatalf("HasAdmin should be true after seeding an admin")
|
||||
}
|
||||
}
|
||||
|
||||
// Edge: an empty role defaults to member; case-insensitive lookup; list order.
|
||||
func TestAddDefaultsAndListing(t *testing.T) {
|
||||
s := openTestStore(t)
|
||||
|
||||
if err := s.AddUser(pubBob, "bob", ""); err != nil {
|
||||
t.Fatalf("AddUser bob: %v", err)
|
||||
}
|
||||
u, err := s.GetUser(pubBob)
|
||||
if err != nil {
|
||||
t.Fatalf("GetUser bob: %v", err)
|
||||
}
|
||||
if u.Role != RoleMember {
|
||||
t.Fatalf("empty role should default to member, got %q", u.Role)
|
||||
}
|
||||
// Adding bob (a member only) must not make HasAdmin true.
|
||||
if s.HasAdmin() {
|
||||
t.Fatalf("HasAdmin should be false with only a member registered")
|
||||
}
|
||||
|
||||
// Lookup is case-insensitive: uppercase hex matches the lowercase-stored key.
|
||||
if !s.IsAuthorized(strings.ToUpper(pubBob)) {
|
||||
t.Fatalf("IsAuthorized should be case-insensitive on the hex key")
|
||||
}
|
||||
|
||||
if err := s.AddUser(pubAlice, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("AddUser alice: %v", err)
|
||||
}
|
||||
users, err := s.ListUsers()
|
||||
if err != nil {
|
||||
t.Fatalf("ListUsers: %v", err)
|
||||
}
|
||||
// Ordered by handle: alice before bob.
|
||||
if len(users) != 2 || users[0].Handle != "alice" || users[1].Handle != "bob" {
|
||||
t.Fatalf("ListUsers order/content wrong: %+v", users)
|
||||
}
|
||||
}
|
||||
|
||||
// Edge: revocation flips status, stamps revoked_at, and denies authorization on
|
||||
// the spot — the property both planes rely on for revoke-without-restart.
|
||||
func TestRevokeDeniesAuthorization(t *testing.T) {
|
||||
s := openTestStore(t)
|
||||
|
||||
if err := s.AddUser(pubAlice, "alice", RoleMember); err != nil {
|
||||
t.Fatalf("AddUser: %v", err)
|
||||
}
|
||||
if !s.IsAuthorized(pubAlice) {
|
||||
t.Fatalf("precondition: user should be authorized before revoke")
|
||||
}
|
||||
if err := s.RevokeUser(pubAlice); err != nil {
|
||||
t.Fatalf("RevokeUser: %v", err)
|
||||
}
|
||||
if s.IsAuthorized(pubAlice) {
|
||||
t.Fatalf("revoked user must NOT be authorized")
|
||||
}
|
||||
u, err := s.GetUser(pubAlice)
|
||||
if err != nil {
|
||||
t.Fatalf("GetUser after revoke: %v", err)
|
||||
}
|
||||
if u.Status != StatusRevoked || u.RevokedAt == "" {
|
||||
t.Fatalf("revoke should set status=revoked and stamp revoked_at, got %+v", u)
|
||||
}
|
||||
}
|
||||
|
||||
// Error path: duplicate key, unknown user, invalid role, revoke of unknown.
|
||||
func TestUserErrorPaths(t *testing.T) {
|
||||
s := openTestStore(t)
|
||||
|
||||
if err := s.AddUser(pubAlice, "alice", RoleAdmin); err != nil {
|
||||
t.Fatalf("AddUser: %v", err)
|
||||
}
|
||||
// Duplicate sign_pub -> typed ErrUserExists.
|
||||
if err := s.AddUser(pubAlice, "alice2", RoleMember); !errors.Is(err, ErrUserExists) {
|
||||
t.Fatalf("duplicate AddUser should return ErrUserExists, got %v", err)
|
||||
}
|
||||
// Invalid role rejected.
|
||||
if err := s.AddUser(pubBob, "bob", "superuser"); err == nil {
|
||||
t.Fatalf("invalid role should error")
|
||||
}
|
||||
// Missing handle/sign_pub rejected.
|
||||
if err := s.AddUser("", "nobody", RoleMember); err == nil {
|
||||
t.Fatalf("empty sign_pub should error")
|
||||
}
|
||||
// Unknown user is not authorized (fail closed) and GetUser errors.
|
||||
if s.IsAuthorized(pubBob) {
|
||||
t.Fatalf("unknown user must not be authorized")
|
||||
}
|
||||
if _, err := s.GetUser(pubBob); err == nil {
|
||||
t.Fatalf("GetUser of unknown user should error")
|
||||
}
|
||||
// Revoking an unknown (or already-revoked) user errors (no active row).
|
||||
if err := s.RevokeUser(pubBob); err == nil {
|
||||
t.Fatalf("revoking unknown user should error")
|
||||
}
|
||||
if err := s.RevokeUser(pubAlice); err != nil {
|
||||
t.Fatalf("first revoke should succeed: %v", err)
|
||||
}
|
||||
if err := s.RevokeUser(pubAlice); err == nil {
|
||||
t.Fatalf("second revoke of same user should error (already revoked)")
|
||||
}
|
||||
}
|
||||
|
||||
// Migration safety: the users table and its index exist after Open, and the
|
||||
// users migration is idempotent on re-apply (mirrors TestMigrationsCreateSchema).
|
||||
func TestUsersMigrationIdempotent(t *testing.T) {
|
||||
s := openTestStore(t)
|
||||
var name string
|
||||
if err := s.db.QueryRow(
|
||||
`SELECT name FROM sqlite_master WHERE type='table' AND name='users'`,
|
||||
).Scan(&name); err != nil {
|
||||
t.Fatalf("users table not created: %v", err)
|
||||
}
|
||||
if err := s.db.QueryRow(
|
||||
`SELECT name FROM sqlite_master WHERE type='index' AND name='idx_users_status'`,
|
||||
).Scan(&name); err != nil {
|
||||
t.Fatalf("idx_users_status not created: %v", err)
|
||||
}
|
||||
if err := s.applyMigrations(); err != nil {
|
||||
t.Fatalf("re-apply migrations: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -1,119 +0,0 @@
|
||||
# unibus playground
|
||||
|
||||
An all-in-one, web-based sandbox for the **unibus** message bus. One command
|
||||
brings up the entire stack embedded — no NATS to install, no services to wire —
|
||||
and a browser UI lets you exercise the bus visually: create peers, create and
|
||||
join rooms (cleartext or end-to-end encrypted), invite, publish, watch messages
|
||||
arrive live, and kick members (forward secrecy).
|
||||
|
||||
This is a **playground** (see `.claude/rules/playgrounds.md`): it lives inside
|
||||
the `unibus` app, reuses the parent Go module (no separate `go.mod`), is not
|
||||
indexed, and keeps all runtime state under `playground/local_files/` (ephemeral,
|
||||
safe to delete).
|
||||
|
||||
## Run
|
||||
|
||||
From the `unibus` app directory:
|
||||
|
||||
```bash
|
||||
cd /home/enmanuel/fn_registry/projects/message_bus/apps/unibus
|
||||
go run ./playground
|
||||
```
|
||||
|
||||
Then open **http://localhost:7700** in your browser.
|
||||
|
||||
Stop with `Ctrl-C` — the server tears down the web UI, every bus client, the
|
||||
control plane, and the embedded NATS cleanly (no orphaned processes).
|
||||
|
||||
## Architecture
|
||||
|
||||
The browser never speaks NATS. The Go server is the actual bus peer:
|
||||
|
||||
```
|
||||
browser ──fetch/SSE──▶ playground server (:7700)
|
||||
│ holds one unibus client per named peer
|
||||
├──HTTP──▶ membership control plane (127.0.0.1:8480)
|
||||
└──NATS──▶ embedded NATS + JetStream (:4260)
|
||||
```
|
||||
|
||||
- **:7700** — web UI (the only browser-facing port).
|
||||
- **127.0.0.1:8480** — membership control plane (rooms, members, sealed keys,
|
||||
rekey, blobs). Internal only.
|
||||
- **:4260** — embedded NATS + JetStream (the data plane). Internal only.
|
||||
|
||||
Each named peer gets its own long-term identity, persisted to
|
||||
`playground/local_files/<name>.id`, so a peer keeps the same endpoint across
|
||||
restarts. When a peer creates or joins a room, the server subscribes on its
|
||||
behalf and streams every received frame to that peer's open browser tabs over
|
||||
Server-Sent Events.
|
||||
|
||||
The playground only orchestrates the public unibus client API
|
||||
(`CreateRoom`, `Join`, `Subscribe`, `Publish`, `Invite`, `Kick`); it never
|
||||
reimplements bus or crypto logic.
|
||||
|
||||
## Try it: 2 peers + encryption + kick
|
||||
|
||||
1. Open **two browser tabs** on http://localhost:7700.
|
||||
2. Tab A: type `alice`, click **Connect**.
|
||||
3. Tab B: type `bob`, click **Connect**.
|
||||
4. Tab A (alice): type a subject like `room.general`, tick **🔒 encrypted
|
||||
(E2E)**, click **Create room**. Copy the resulting `room_id`.
|
||||
5. Tab A (alice): in the Action panel, pick `bob` as the target peer (use the
|
||||
↻ button to refresh the peer list if needed) and click **Invite to this
|
||||
room**.
|
||||
6. Tab B (bob): paste the `room_id` into the join field and click **Join**.
|
||||
7. Type messages in **both** tabs and hit Send — each message appears live in
|
||||
both tabs, tagged with subject, sender, time, and 🔒 (encrypted) or `clear`.
|
||||
8. Tab A (alice): click **Kick from this room** with `bob` selected. The room
|
||||
key rotates to a new epoch. New messages alice sends are no longer visible to
|
||||
bob — **forward secrecy**: bob no longer holds the current key.
|
||||
|
||||
Cleartext rooms (leave the checkbox unticked) behave like plain NATS fan-out:
|
||||
fast, ephemeral, unsigned. Encrypted rooms are the Matrix-like mode: E2E
|
||||
encrypted, persisted, and per-message signed.
|
||||
|
||||
## Benchmark: throughput simulator
|
||||
|
||||
The bottom panel of the UI is a performance simulator. Press **▶ Ejecutar
|
||||
benchmark** and one publisher floods a fresh room with thousands of messages
|
||||
that N subscribers receive (fan-out); a live canvas chart animates the sent vs
|
||||
received totals while it runs.
|
||||
|
||||
The two policy axes are exposed as **independent flags**, so the benchmark
|
||||
measures the cost of each layer in isolation:
|
||||
|
||||
| JetStream | Encryption | Room policy | What it costs |
|
||||
|---|---|---|---|
|
||||
| off | off | `{Encrypt:false, Persist:false}` | plain core NATS fan-out |
|
||||
| **on** | off | `{Encrypt:false, Persist:true}` | durable JetStream (publish ack per message) |
|
||||
| off | **on** | `{Encrypt:true, Persist:false}` | AEAD + Ed25519 signature per message, core transport |
|
||||
| **on** | **on** | `{Encrypt:true, Persist:true}` | full E2E + durable history |
|
||||
|
||||
A **payload size** slider (16 B – 8 KiB) sets the message size. Encrypted or
|
||||
persistent runs are capped to 30 000 messages (each message pays per-message
|
||||
crypto and/or a JetStream ack, so they run much slower than plain NATS).
|
||||
|
||||
The benchmark uses its own ephemeral peers (fresh identities, never persisted),
|
||||
so it never touches the named peers of the manual sandbox.
|
||||
|
||||
It is driven by an SSE endpoint that streams progress samples:
|
||||
|
||||
```bash
|
||||
curl -N "http://localhost:7700/api/bench?n_msgs=20000&n_subs=3&payload=128&encrypt=0&persist=0"
|
||||
# emits: data: {"type":"start",...} data: {"type":"sample",...} data: {"type":"done",...}
|
||||
```
|
||||
|
||||
Query params: `n_msgs`, `n_subs` (1–16), `payload` (bytes), `encrypt` (0/1),
|
||||
`persist` (0/1).
|
||||
|
||||
## State / cleanup
|
||||
|
||||
All writable state lives under `playground/local_files/`:
|
||||
|
||||
- `<name>.id` — per-peer identity (private keys; treat like an SSH key).
|
||||
- `play.db` — membership store (rooms, members, sealed keys).
|
||||
- `blobs/` — media blob store.
|
||||
- `js/` — embedded JetStream store.
|
||||
|
||||
Delete the whole `playground/local_files/` directory to reset to a clean slate.
|
||||
It is gitignored and never distributed.
|
||||
@@ -1,594 +0,0 @@
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>unibus playground</title>
|
||||
<style>
|
||||
:root {
|
||||
--bg: #0d1117;
|
||||
--panel: #161b22;
|
||||
--panel2: #1c2230;
|
||||
--border: #2b333f;
|
||||
--fg: #e6edf3;
|
||||
--muted: #8b98a5;
|
||||
--accent: #2f81f7;
|
||||
--green: #3fb950;
|
||||
--gold: #d29922;
|
||||
--red: #f85149;
|
||||
--mono: ui-monospace, "SF Mono", "Cascadia Code", Menlo, Consolas, monospace;
|
||||
}
|
||||
* { box-sizing: border-box; }
|
||||
body {
|
||||
margin: 0;
|
||||
background: var(--bg);
|
||||
color: var(--fg);
|
||||
font-family: var(--mono);
|
||||
font-size: 14px;
|
||||
line-height: 1.5;
|
||||
}
|
||||
header {
|
||||
padding: 14px 20px;
|
||||
border-bottom: 1px solid var(--border);
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
gap: 12px;
|
||||
}
|
||||
header h1 { margin: 0; font-size: 18px; letter-spacing: 0.5px; }
|
||||
header .sub { color: var(--muted); font-size: 12px; }
|
||||
.wrap {
|
||||
display: grid;
|
||||
grid-template-columns: 360px 1fr;
|
||||
gap: 16px;
|
||||
padding: 16px 20px;
|
||||
max-width: 1200px;
|
||||
}
|
||||
.col { display: flex; flex-direction: column; gap: 14px; }
|
||||
.card {
|
||||
background: var(--panel);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 8px;
|
||||
padding: 14px;
|
||||
}
|
||||
.card h2 {
|
||||
margin: 0 0 10px;
|
||||
font-size: 13px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 1px;
|
||||
color: var(--muted);
|
||||
}
|
||||
label { display: block; font-size: 12px; color: var(--muted); margin: 8px 0 3px; }
|
||||
input[type=text], select {
|
||||
width: 100%;
|
||||
background: var(--panel2);
|
||||
border: 1px solid var(--border);
|
||||
color: var(--fg);
|
||||
padding: 7px 9px;
|
||||
border-radius: 6px;
|
||||
font-family: var(--mono);
|
||||
font-size: 13px;
|
||||
}
|
||||
input:focus, select:focus { outline: none; border-color: var(--accent); }
|
||||
.row { display: flex; gap: 8px; align-items: center; }
|
||||
.row > * { flex: 1; }
|
||||
.checkrow { display: flex; align-items: center; gap: 6px; margin: 10px 0; }
|
||||
.checkrow input { flex: 0 0 auto; width: auto; }
|
||||
.checkrow label { margin: 0; flex: 0 0 auto; }
|
||||
button {
|
||||
background: var(--accent);
|
||||
border: none;
|
||||
color: #fff;
|
||||
padding: 7px 12px;
|
||||
border-radius: 6px;
|
||||
cursor: pointer;
|
||||
font-family: var(--mono);
|
||||
font-size: 13px;
|
||||
margin-top: 8px;
|
||||
}
|
||||
button:hover { filter: brightness(1.12); }
|
||||
button.ghost { background: var(--panel2); border: 1px solid var(--border); color: var(--fg); }
|
||||
button.danger { background: #3a1d1d; border: 1px solid var(--red); color: var(--red); }
|
||||
button:disabled { opacity: 0.4; cursor: not-allowed; }
|
||||
.pill {
|
||||
display: inline-block;
|
||||
background: var(--panel2);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 12px;
|
||||
padding: 2px 9px;
|
||||
font-size: 11px;
|
||||
color: var(--muted);
|
||||
}
|
||||
.pill.on { color: var(--green); border-color: var(--green); }
|
||||
.ident { word-break: break-all; font-size: 11px; color: var(--gold); margin-top: 6px; }
|
||||
.copy {
|
||||
cursor: pointer; color: var(--accent); font-size: 11px;
|
||||
margin-left: 6px; text-decoration: underline;
|
||||
}
|
||||
#log {
|
||||
background: #08090c;
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 8px;
|
||||
padding: 10px 12px;
|
||||
height: 520px;
|
||||
overflow-y: auto;
|
||||
font-size: 12.5px;
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
.msg { padding: 2px 0; border-bottom: 1px solid #11151b; }
|
||||
.msg .subj { color: var(--accent); }
|
||||
.msg .from { color: var(--gold); }
|
||||
.msg .meta { color: var(--muted); font-size: 11px; }
|
||||
.msg .enc { color: var(--green); }
|
||||
.msg .clear { color: var(--muted); }
|
||||
.sys { color: var(--muted); font-style: italic; }
|
||||
.err { color: var(--red); }
|
||||
.help {
|
||||
background: var(--panel2);
|
||||
border-left: 3px solid var(--accent);
|
||||
padding: 10px 12px;
|
||||
border-radius: 4px;
|
||||
font-size: 12px;
|
||||
color: var(--muted);
|
||||
line-height: 1.6;
|
||||
}
|
||||
.help b { color: var(--fg); }
|
||||
.help code { color: var(--gold); }
|
||||
.status { font-size: 11px; color: var(--muted); margin-top: 6px; min-height: 14px; }
|
||||
.status.ok { color: var(--green); }
|
||||
.status.bad { color: var(--red); }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<h1>unibus playground</h1>
|
||||
<span class="sub">embedded NATS + JetStream · E2E rooms · forward secrecy · SSE</span>
|
||||
</header>
|
||||
|
||||
<div class="wrap">
|
||||
<!-- LEFT COLUMN: controls -->
|
||||
<div class="col">
|
||||
<div class="card">
|
||||
<h2>1 · Identity</h2>
|
||||
<label>Peer name</label>
|
||||
<div class="row">
|
||||
<input id="peerName" type="text" placeholder="alice" autocomplete="off" />
|
||||
<button id="connectBtn" style="flex:0 0 auto">Connect</button>
|
||||
</div>
|
||||
<div id="peerIdent" class="ident"></div>
|
||||
<div id="connStatus" class="status"></div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<h2>2 · Rooms</h2>
|
||||
<label>Subject (e.g. room.general)</label>
|
||||
<input id="roomSubject" type="text" placeholder="room.general" autocomplete="off" />
|
||||
<div class="checkrow">
|
||||
<input id="roomEncrypt" type="checkbox" />
|
||||
<label for="roomEncrypt">🔒 encrypted (E2E)</label>
|
||||
</div>
|
||||
<div class="checkrow">
|
||||
<input id="roomPersist" type="checkbox" />
|
||||
<label for="roomPersist">🗂 persistente (historial)</label>
|
||||
</div>
|
||||
<div class="help" style="margin:-4px 0 8px; font-size:12px; color:var(--muted)">
|
||||
persistente = quien se une después ve el historial; sin persistir = solo mensajes nuevos (NATS simple).
|
||||
</div>
|
||||
<button id="createRoomBtn" disabled>Create room</button>
|
||||
<div style="border-top:1px solid var(--border); margin:12px 0"></div>
|
||||
<label>Join by room_id</label>
|
||||
<input id="joinRoomId" type="text" placeholder="01J..." autocomplete="off" />
|
||||
<button id="joinBtn" class="ghost" disabled>Join</button>
|
||||
<div id="roomStatus" class="status"></div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<h2>3 · Action</h2>
|
||||
<label>Active room</label>
|
||||
<select id="activeRoom"></select>
|
||||
<label>Message</label>
|
||||
<div class="row">
|
||||
<input id="msgText" type="text" placeholder="hello bus" autocomplete="off" />
|
||||
<button id="sendBtn" style="flex:0 0 auto" disabled>Send</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--border); margin:12px 0"></div>
|
||||
<label>Target peer</label>
|
||||
<div class="row">
|
||||
<select id="targetPeer"></select>
|
||||
<button id="refreshPeersBtn" class="ghost" style="flex:0 0 auto" title="reload peer list">↻</button>
|
||||
</div>
|
||||
<button id="inviteBtn" disabled>Invite to this room</button>
|
||||
<button id="kickBtn" class="danger" disabled>Kick from this room</button>
|
||||
<div id="actionStatus" class="status"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- RIGHT COLUMN: live messages + help -->
|
||||
<div class="col">
|
||||
<div class="card" style="padding-bottom:8px">
|
||||
<h2>Live messages <span id="streamPill" class="pill">disconnected</span></h2>
|
||||
<div id="log"></div>
|
||||
</div>
|
||||
<div class="help">
|
||||
<b>ⓘ How to try it</b><br />
|
||||
Open <b>2 tabs</b>. Connect as <code>alice</code> in one and <code>bob</code> in the other.
|
||||
In alice: create a <code>🔒 encrypted</code> room, copy the <code>room_id</code>,
|
||||
then pick <code>bob</code> as target and <b>Invite to this room</b>.
|
||||
In bob: paste that <code>room_id</code> and <b>Join</b>.
|
||||
Type in both → messages appear live on each side.
|
||||
In alice: <b>Kick</b> bob → bob stops seeing new messages (forward secrecy: the room
|
||||
key rotates and bob no longer holds it).
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- BENCHMARK: full-width performance simulator -->
|
||||
<div style="padding: 0 20px 32px; max-width: 1200px;">
|
||||
<div class="card">
|
||||
<h2>Benchmark de rendimiento · 1 publisher → N subscribers</h2>
|
||||
<div style="display:flex; gap:26px; flex-wrap:wrap; align-items:flex-end; margin-bottom:6px;">
|
||||
<div style="min-width:230px;">
|
||||
<label>Mensajes a publicar · <span id="bMsgsVal" style="color:var(--fg)">20 000</span></label>
|
||||
<input id="bMsgs" type="range" min="1000" max="200000" step="1000" value="20000" style="width:100%; accent-color:var(--accent);" />
|
||||
</div>
|
||||
<div style="min-width:160px;">
|
||||
<label>Subscribers · <span id="bSubsVal" style="color:var(--fg)">3</span></label>
|
||||
<input id="bSubs" type="range" min="1" max="16" step="1" value="3" style="width:100%; accent-color:var(--accent);" />
|
||||
</div>
|
||||
<div style="min-width:200px;">
|
||||
<label>Tamaño payload · <span id="bPayVal" style="color:var(--fg)">128 B</span></label>
|
||||
<input id="bPay" type="range" min="16" max="8192" step="16" value="128" style="width:100%; accent-color:var(--accent);" />
|
||||
</div>
|
||||
<div class="checkrow" style="margin:0;">
|
||||
<input id="bPersist" type="checkbox" />
|
||||
<label for="bPersist">🗂 JetStream (persistente)</label>
|
||||
</div>
|
||||
<div class="checkrow" style="margin:0;">
|
||||
<input id="bEncrypt" type="checkbox" />
|
||||
<label for="bEncrypt">🔒 Encriptación E2E</label>
|
||||
</div>
|
||||
<button id="bRun" style="margin:0;">▶ Ejecutar benchmark</button>
|
||||
</div>
|
||||
<div class="help" style="margin:6px 0 12px;">
|
||||
<b>JetStream</b> y <b>Encriptación</b> son ejes independientes: NATS core (ambos off) · JetStream durable · E2E (AEAD + firma Ed25519 por mensaje) · E2E + JetStream. Los modos con cripto o persistencia se limitan a 30 000 mensajes (cada mensaje paga cifrado/firma/ack).
|
||||
</div>
|
||||
<div style="display:flex; gap:30px; flex-wrap:wrap; margin:4px 2px 8px;">
|
||||
<div><div style="font-size:11px; color:var(--muted); text-transform:uppercase; letter-spacing:.05em;">Enviados</div><div id="bSent" style="font-size:22px; color:var(--accent);">0</div></div>
|
||||
<div><div style="font-size:11px; color:var(--muted); text-transform:uppercase; letter-spacing:.05em;">Recibidos (Σ subs)</div><div id="bRecv" style="font-size:22px; color:var(--green);">0</div></div>
|
||||
<div><div style="font-size:11px; color:var(--muted); text-transform:uppercase; letter-spacing:.05em;">Throughput recv</div><div id="bTps" style="font-size:22px; color:var(--gold);">0</div></div>
|
||||
<div><div style="font-size:11px; color:var(--muted); text-transform:uppercase; letter-spacing:.05em;">Tiempo</div><div id="bTime" style="font-size:22px;">0.00 s</div></div>
|
||||
</div>
|
||||
<canvas id="bChart" style="width:100%; height:300px; display:block; background:#08090c; border:1px solid var(--border); border-radius:8px;"></canvas>
|
||||
<div style="display:flex; gap:18px; font-size:12px; color:var(--muted); margin-top:6px;">
|
||||
<span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:var(--accent);margin-right:6px;"></span>enviados (publisher)</span>
|
||||
<span><span style="display:inline-block;width:10px;height:10px;border-radius:50%;background:var(--green);margin-right:6px;"></span>recibidos (suma de subscribers)</span>
|
||||
</div>
|
||||
<div id="bStatus" class="status" style="margin-top:8px;"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
"use strict";
|
||||
|
||||
const state = {
|
||||
peer: null, // connected peer name
|
||||
rooms: {}, // room_id -> {subject, encrypt}
|
||||
es: null, // EventSource
|
||||
};
|
||||
|
||||
const $ = (id) => document.getElementById(id);
|
||||
|
||||
// POSTs to `path` with a JSON content type and, when `body` is provided, the
// JSON-serialized body. Resolves with the parsed JSON response; a response
// that cannot be parsed as JSON is treated as {}. Throws an Error carrying the
// server's `error` field, or "HTTP <status>" when there is none, on any non-OK
// status.
async function api(path, body) {
  const request = {
    method: "POST",
    headers: { "Content-Type": "application/json" },
  };
  if (body !== undefined) {
    request.body = JSON.stringify(body);
  }

  const response = await fetch(path, request);
  let payload;
  try {
    payload = await response.json();
  } catch (_) {
    payload = {};
  }

  if (response.ok) return payload;
  throw new Error(payload.error || ("HTTP " + response.status));
}
|
||||
// Fetches `path` with the default (GET) request and resolves with the parsed
// JSON response; a response that cannot be parsed as JSON is treated as {}.
// Throws an Error carrying the server's `error` field, or "HTTP <status>" when
// there is none, on any non-OK status.
async function apiGet(path) {
  const response = await fetch(path);
  let payload;
  try {
    payload = await response.json();
  } catch (_) {
    payload = {};
  }

  if (response.ok) return payload;
  throw new Error(payload.error || ("HTTP " + response.status));
}
|
||||
|
||||
// Writes `msg` (or an empty string) into the element with the given id and
// sets its class to "status", plus the optional `kind` modifier when given.
function setStatus(id, msg, kind) {
  const target = $(id);
  const classes = ["status"];
  if (kind) classes.push(kind);
  target.textContent = msg ? msg : "";
  target.className = classes.join(" ");
}
|
||||
|
||||
// Abbreviates a long identifier for display. Falsy input yields "". Strings of
// at most 2*n characters are returned unchanged; longer ones become the first
// n characters, an ellipsis, and the last 4 characters.
function short(s, n = 10) {
  if (!s) {
    return "";
  }
  if (s.length <= n * 2) {
    return s;
  }
  const head = s.slice(0, n);
  const tail = s.slice(-4);
  return head + "…" + tail;
}
|
||||
|
||||
// Formats a millisecond timestamp as local-time "HH:MM:SS", zero-padding each
// field to two digits.
function hhmmss(ms) {
  const when = new Date(ms);
  return [when.getHours(), when.getMinutes(), when.getSeconds()]
    .map((field) => String(field).padStart(2, "0"))
    .join(":");
}
|
||||
|
||||
// Appends a plain-text line to the #log pane and scrolls the pane to the
// bottom. The line gets class "msg" plus `cls`, defaulting to "sys".
function logSys(text, cls) {
  const pane = $("log");
  const entry = document.createElement("div");
  const variant = cls || "sys";
  entry.className = "msg " + variant;
  // textContent, not innerHTML: the text is inserted without HTML parsing.
  entry.textContent = text;
  pane.appendChild(entry);
  pane.scrollTop = pane.scrollHeight;
}
|
||||
|
||||
// Renders one received bus event as a row in the #log pane: subject,
// abbreviated sender, text, local time, and an encrypted/clear badge. The row
// is built as HTML, so the event's subject, sender and text are passed through
// escapeHtml before insertion. Scrolls the pane to the bottom afterwards.
function logMsg(ev) {
  const pane = $("log");
  const row = document.createElement("div");
  row.className = "msg";

  let badge;
  if (ev.encrypted) {
    badge = '<span class="enc">🔒</span>';
  } else {
    badge = '<span class="clear">clear</span>';
  }

  const pieces = [
    '<span class="subj">[' + escapeHtml(ev.subject) + ']</span> ',
    '<span class="from">' + escapeHtml(short(ev.sender)) + '</span> ↦ ',
    escapeHtml(ev.text),
    ' <span class="meta">· ' + hhmmss(ev.ts) + ' · ' + badge + '</span>',
  ];
  row.innerHTML = pieces.join("");

  pane.appendChild(row);
  pane.scrollTop = pane.scrollHeight;
}
|
||||
|
||||
// Escapes a value for safe insertion into HTML markup (element content or a
// quoted attribute). The value is coerced to a string, then the five
// characters that are significant in HTML (& < > " ') are replaced by their
// character references. Callers use it before assigning to innerHTML.
function escapeHtml(s) {
  // Each key must map to its entity, not to itself: an identity map would
  // leave markup characters untouched and let them be parsed as HTML.
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return String(s).replace(/[&<>"']/g, (c) => entities[c]);
}
|
||||
|
||||
// Rebuilds the #activeRoom dropdown from state.rooms, keeping the previous
// selection when that room is still present, and enables the Send / Invite /
// Kick buttons only when at least one room exists.
function refreshRoomSelect() {
  const picker = $("activeRoom");
  const previous = picker.value;
  picker.innerHTML = "";

  Object.keys(state.rooms).forEach((roomId) => {
    const room = state.rooms[roomId];
    const option = document.createElement("option");
    option.value = roomId;
    option.textContent =
      room.subject + " (" + short(roomId, 6) + ")" + (room.encrypt ? " 🔒" : "");
    picker.appendChild(option);
  });

  if (state.rooms[previous]) {
    picker.value = previous;
  }

  const noRooms = Object.keys(state.rooms).length === 0;
  for (const buttonId of ["sendBtn", "inviteBtn", "kickBtn"]) {
    $(buttonId).disabled = noRooms;
  }
}
|
||||
|
||||
// Reloads the #targetPeer dropdown from GET /api/peers, leaving out the peer
// this tab is connected as and keeping the previous selection when it is still
// listed. Any failure is reported in the action status line instead of being
// thrown.
async function refreshPeers() {
  try {
    const peers = await apiGet("/api/peers");
    const picker = $("targetPeer");
    const previous = picker.value;
    picker.innerHTML = "";

    for (const peer of peers) {
      // A peer cannot invite or kick itself, so it is not offered as a target.
      if (peer.name !== state.peer) {
        const option = document.createElement("option");
        option.value = peer.name;
        option.textContent = peer.name + " (" + short(peer.endpoint_id, 6) + ")";
        picker.appendChild(option);
      }
    }

    const stillListed = Array.from(picker.options).some((o) => o.value === previous);
    if (stillListed) {
      picker.value = previous;
    }
  } catch (e) {
    setStatus("actionStatus", "peers: " + e.message, "bad");
  }
}
|
||||
|
||||
// Opens the Server-Sent Events stream for peer `name`, closing any stream this
// tab already holds. Each incoming message is parsed as JSON and rendered with
// logMsg; malformed payloads are ignored. The #streamPill badge tracks the
// connection state.
function openStream(name) {
  if (state.es) {
    state.es.close();
  }

  const showPill = (text, cls) => {
    $("streamPill").textContent = text;
    $("streamPill").className = cls;
  };

  const source = new EventSource("/api/stream?peer=" + encodeURIComponent(name));
  source.onopen = () => showPill("live: " + name, "pill on");
  source.onerror = () => showPill("reconnecting…", "pill");
  source.onmessage = (e) => {
    try {
      logMsg(JSON.parse(e.data));
    } catch (_) {
      // Deliberately dropped: a frame that fails to parse or render is skipped.
    }
  };

  state.es = source;
}
|
||||
|
||||
// ---- handlers ----
|
||||
|
||||
$("connectBtn").onclick = async () => {
|
||||
const name = $("peerName").value.trim();
|
||||
if (!name) { setStatus("connStatus", "enter a name", "bad"); return; }
|
||||
try {
|
||||
const res = await api("/api/peer", { name });
|
||||
state.peer = res.name;
|
||||
state.rooms = {};
|
||||
refreshRoomSelect();
|
||||
$("peerIdent").innerHTML =
|
||||
'endpoint: ' + escapeHtml(res.endpoint_id) +
|
||||
' <span class="copy" id="copyId">copy</span>';
|
||||
$("copyId").onclick = () => navigator.clipboard.writeText(res.endpoint_id);
|
||||
setStatus("connStatus", "connected as " + res.name, "ok");
|
||||
$("createRoomBtn").disabled = false;
|
||||
$("joinBtn").disabled = false;
|
||||
$("log").innerHTML = "";
|
||||
logSys("connected as " + res.name + " — listening for messages");
|
||||
openStream(res.name);
|
||||
refreshPeers();
|
||||
} catch (e) {
|
||||
setStatus("connStatus", e.message, "bad");
|
||||
}
|
||||
};
|
||||
|
||||
$("createRoomBtn").onclick = async () => {
|
||||
const subject = $("roomSubject").value.trim();
|
||||
const encrypt = $("roomEncrypt").checked;
|
||||
const persist = $("roomPersist").checked;
|
||||
if (!subject) { setStatus("roomStatus", "subject required", "bad"); return; }
|
||||
try {
|
||||
const res = await api("/api/room", { peer: state.peer, subject, encrypt, persist });
|
||||
state.rooms[res.room_id] = { subject: res.subject, encrypt: res.encrypt };
|
||||
refreshRoomSelect();
|
||||
$("activeRoom").value = res.room_id;
|
||||
setStatus("roomStatus", "created " + res.room_id + " (click to copy)", "ok");
|
||||
$("roomStatus").style.cursor = "pointer";
|
||||
$("roomStatus").onclick = () => navigator.clipboard.writeText(res.room_id);
|
||||
logSys("created room " + res.subject + " [" + short(res.room_id) + "]" + (encrypt ? " 🔒" : "") + (res.persist ? " 🗄" : ""));
|
||||
} catch (e) {
|
||||
setStatus("roomStatus", e.message, "bad");
|
||||
}
|
||||
};
|
||||
|
||||
$("joinBtn").onclick = async () => {
|
||||
const roomId = $("joinRoomId").value.trim();
|
||||
if (!roomId) { setStatus("roomStatus", "room_id required", "bad"); return; }
|
||||
try {
|
||||
const res = await api("/api/join", { peer: state.peer, room_id: roomId });
|
||||
state.rooms[roomId] = { subject: res.subject, encrypt: res.encrypt };
|
||||
refreshRoomSelect();
|
||||
$("activeRoom").value = roomId;
|
||||
setStatus("roomStatus", "joined " + res.subject + (res.encrypt ? " 🔒" : ""), "ok");
|
||||
logSys("joined room " + res.subject + " [" + short(roomId) + "]");
|
||||
} catch (e) {
|
||||
setStatus("roomStatus", e.message, "bad");
|
||||
}
|
||||
};
|
||||
|
||||
$("sendBtn").onclick = async () => {
|
||||
const roomId = $("activeRoom").value;
|
||||
const text = $("msgText").value;
|
||||
if (!roomId) { setStatus("actionStatus", "select a room", "bad"); return; }
|
||||
try {
|
||||
await api("/api/publish", { peer: state.peer, room_id: roomId, text });
|
||||
$("msgText").value = "";
|
||||
setStatus("actionStatus", "sent", "ok");
|
||||
} catch (e) {
|
||||
setStatus("actionStatus", e.message, "bad");
|
||||
}
|
||||
};
|
||||
$("msgText").addEventListener("keydown", (e) => { if (e.key === "Enter") $("sendBtn").click(); });
|
||||
|
||||
$("inviteBtn").onclick = async () => {
|
||||
const roomId = $("activeRoom").value;
|
||||
const target = $("targetPeer").value;
|
||||
if (!roomId) { setStatus("actionStatus", "select a room", "bad"); return; }
|
||||
if (!target) { setStatus("actionStatus", "no target peer (connect another peer first)", "bad"); return; }
|
||||
try {
|
||||
await api("/api/invite", { peer: state.peer, room_id: roomId, target });
|
||||
setStatus("actionStatus", "invited " + target, "ok");
|
||||
logSys("invited " + target + " to " + short(roomId));
|
||||
} catch (e) {
|
||||
setStatus("actionStatus", e.message, "bad");
|
||||
}
|
||||
};
|
||||
|
||||
$("kickBtn").onclick = async () => {
|
||||
const roomId = $("activeRoom").value;
|
||||
const target = $("targetPeer").value;
|
||||
if (!roomId) { setStatus("actionStatus", "select a room", "bad"); return; }
|
||||
if (!target) { setStatus("actionStatus", "no target peer", "bad"); return; }
|
||||
try {
|
||||
await api("/api/kick", { peer: state.peer, room_id: roomId, target });
|
||||
setStatus("actionStatus", "kicked " + target + " (key rotated)", "ok");
|
||||
logSys("kicked " + target + " from " + short(roomId) + " — key rotated (forward secrecy)");
|
||||
} catch (e) {
|
||||
setStatus("actionStatus", e.message, "bad");
|
||||
}
|
||||
};
|
||||
|
||||
$("refreshPeersBtn").onclick = refreshPeers;
|
||||
$("peerName").addEventListener("keydown", (e) => { if (e.key === "Enter") $("connectBtn").click(); });
|
||||
|
||||
// ---- benchmark ----
|
||||
const fmtN = (n) => Number(n).toLocaleString("es-ES");
|
||||
const bMsgs = $("bMsgs"), bSubs = $("bSubs"), bPay = $("bPay");
|
||||
bMsgs.oninput = () => $("bMsgsVal").textContent = fmtN(+bMsgs.value);
|
||||
bSubs.oninput = () => $("bSubsVal").textContent = bSubs.value;
|
||||
bPay.oninput = () => $("bPayVal").textContent = fmtN(+bPay.value) + " B";
|
||||
|
||||
let bSamples = [], bRunning = false, bES = null;
|
||||
const bCanvas = $("bChart"), bCtx = bCanvas.getContext("2d");
|
||||
function cssVar(n) { return getComputedStyle(document.documentElement).getPropertyValue(n).trim(); }
|
||||
|
||||
// Sizes the chart canvas backing store to its on-screen box multiplied by the
// device pixel ratio, scales the drawing context by the same ratio so drawing
// code can keep using CSS pixels, and redraws the chart.
function bResize() {
  const ratio = window.devicePixelRatio || 1;
  const box = bCanvas.getBoundingClientRect();
  bCanvas.width = box.width * ratio;
  bCanvas.height = box.height * ratio;
  bCtx.setTransform(ratio, 0, 0, ratio, 0, 0);
  bDraw();
}
|
||||
window.addEventListener("resize", bResize);
|
||||
|
||||
// Redraws the benchmark chart from bSamples (entries of {t, sent, recv}).
// Always paints the grid, the y-axis labels and the time-axis end labels; the
// "sent" and "recv" series are drawn only once at least two samples exist.
// The x axis spans 0..t of the last sample, the y axis 0..the largest
// sent/recv value seen (at least 1).
function bDraw() {
  const box = bCanvas.getBoundingClientRect();
  const W = box.width, H = box.height;
  const padL = 70, padR = 14, padT = 12, padB = 26;
  bCtx.clearRect(0, 0, W, H);

  const last = bSamples.length ? bSamples[bSamples.length - 1] : null;
  const tMax = last ? Math.max(last.t, 0.001) : 1;
  // reduce instead of Math.max(...array): no risk of exceeding the engine's
  // argument-count limit on long runs, and an empty array still yields 1.
  const yMax = bSamples.reduce((m, s) => Math.max(m, s.sent, s.recv), 1);

  // Canvas state persists between calls: the series below are stroked at 2.2,
  // so the width must be reset or every later redraw paints a thick grid.
  bCtx.lineWidth = 1;
  bCtx.strokeStyle = "#2b333f";
  bCtx.fillStyle = "#8b98a5";
  bCtx.font = "11px ui-monospace";
  for (let i = 0; i <= 5; i++) {
    const yy = (H - padB) - (i / 5) * (H - padT - padB);
    bCtx.beginPath();
    bCtx.moveTo(padL, yy);
    bCtx.lineTo(W - padR, yy);
    bCtx.stroke();
    bCtx.textAlign = "right";
    bCtx.fillText(fmtN(Math.round((i / 5) * yMax)), padL - 8, yy + 3);
  }

  bCtx.textAlign = "center";
  bCtx.fillText("0 s", padL, H - padB + 15);
  bCtx.fillText(tMax.toFixed(2) + " s", W - padR, H - padB + 15);

  // A polyline needs two points; with fewer samples only the axes are shown.
  if (bSamples.length < 2) return;

  const x = (t) => padL + (t / tMax) * (W - padL - padR);
  const y = (v) => (H - padB) - (v / yMax) * (H - padT - padB);
  const line = (key, color) => {
    bCtx.beginPath();
    bCtx.lineWidth = 2.2;
    bCtx.strokeStyle = color;
    bSamples.forEach((s, i) => {
      const px = x(s.t), py = y(s[key]);
      if (i) bCtx.lineTo(px, py);
      else bCtx.moveTo(px, py);
    });
    bCtx.stroke();
  };
  line("sent", cssVar("--accent"));
  line("recv", cssVar("--green"));
}
|
||||
|
||||
// Records whether a benchmark is in progress and disables the run button for
// as long as it is.
function bSetRunning(v) {
  bRunning = v;
  $("bRun").disabled = v;
}
|
||||
|
||||
// Start a benchmark run: reset the counters, open an EventSource on /api/bench
// with the form values as query parameters, and update the counters, status
// line and chart from the streamed "start" / "sample" / "done" / "error"
// messages. The stream is closed on the server's "end" event, on "done" /
// "error", or when the connection drops while a run is active.
$("bRun").onclick = () => {
  if (bRunning) return;
  bSamples = []; bSetRunning(true);
  $("bSent").textContent = "0"; $("bRecv").textContent = "0"; $("bTps").textContent = "0"; $("bTime").textContent = "0.00 s";
  setStatus("bStatus", "conectando…");

  const qs = new URLSearchParams({
    n_msgs: bMsgs.value, n_subs: bSubs.value, payload: bPay.value,
    encrypt: $("bEncrypt").checked ? "1" : "0", persist: $("bPersist").checked ? "1" : "0",
  });
  const es = new EventSource("/api/bench?" + qs.toString());
  bES = es;

  // Close this run's stream and re-enable the Run button.
  const finish = () => { try { es.close(); } catch (_) {} bSetRunning(false); };
  es.addEventListener("end", finish);

  es.onmessage = (e) => {
    let m; try { m = JSON.parse(e.data); } catch (_) { return; }
    if (m.type === "start") {
      // The server reports the effective message count in n_msgs, so show
      // that value as the cap instead of assuming one particular limit.
      const capNote = m.capped ? " · (limitado a " + fmtN(m.n_msgs) + ")" : "";
      setStatus("bStatus",
        "corriendo… " + fmtN(m.n_msgs) + " msgs → " + m.n_subs + " subs · payload " + fmtN(m.payload) + "B"
        + (m.encrypt ? " · \u{1F512} E2E" : "") + (m.persist ? " · \u{1F5C4} JetStream" : "")
        + capNote, "");
    } else if (m.type === "sample") {
      bSamples.push({ t: m.t, sent: m.sent, recv: m.recv });
      $("bSent").textContent = fmtN(m.sent); $("bRecv").textContent = fmtN(m.recv); $("bTime").textContent = m.t.toFixed(2) + " s";
      if (bSamples.length >= 2) {
        // Instantaneous receive rate from the last two samples.
        const a = bSamples[bSamples.length - 2], b = bSamples[bSamples.length - 1], dt = b.t - a.t;
        if (dt > 0) $("bTps").textContent = fmtN(Math.round((b.recv - a.recv) / dt));
      }
      bDraw();
    } else if (m.type === "done") {
      bSamples.push({ t: m.t, sent: m.sent, recv: m.recv });
      $("bSent").textContent = fmtN(m.sent); $("bRecv").textContent = fmtN(m.recv);
      $("bTps").textContent = fmtN(m.recv_tps); $("bTime").textContent = m.t.toFixed(2) + " s";
      setStatus("bStatus",
        "✓ " + m.t.toFixed(2) + "s · pub " + fmtN(m.pub_tps) + "/s · recv " + fmtN(m.recv_tps) + "/s · fan-out ×"
        + m.n_subs + " · por sub [" + (m.per_sub || []).map(fmtN).join(", ") + "]", "ok");
      bDraw(); finish();
    } else if (m.type === "error") {
      setStatus("bStatus", "error: " + m.msg, "bad"); finish();
    }
  };

  // Only report a lost connection while a run is still in progress.
  es.onerror = () => { if (bRunning) { setStatus("bStatus", "conexión SSE perdida", "bad"); finish(); } };
};
|
||||
|
||||
bResize();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,853 +0,0 @@
|
||||
// Command playground is an all-in-one, web-based sandbox for the unibus message
|
||||
// bus. A single `go run ./playground` launches the entire stack embedded:
|
||||
//
|
||||
// - an embedded NATS server with JetStream (the data plane),
|
||||
// - the membership control plane (rooms, members, sealed keys, rekey) over an
|
||||
// internal HTTP server,
|
||||
// - the media blob store, and
|
||||
// - a browser-facing web UI on :7700.
|
||||
//
|
||||
// The browser never speaks NATS. The Go server is the actual bus peer: it holds
|
||||
// one unibus client per named peer, subscribes to rooms on the peer's behalf,
|
||||
// and streams received messages to the browser over Server-Sent Events. The
|
||||
// browser drives everything with plain fetch() + EventSource() — no build step,
|
||||
// no JS framework, no external libraries.
|
||||
//
|
||||
// This is a playground (see .claude/rules/playgrounds.md): it lives inside the
|
||||
// unibus app, reuses the parent module (no new go.mod), is not indexed, and
|
||||
// stores ephemeral state under playground/local_files/.
|
||||
package main
|
||||
|
||||
import (
	"bytes"
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"os/signal"
	"path/filepath"
	"strconv"
	"sync"
	"sync/atomic"
	"syscall"
	"time"

	_ "embed"

	cs "fn-registry/functions/cybersecurity"
	"github.com/enmanuel/unibus/pkg/blobstore"
	"github.com/enmanuel/unibus/pkg/client"
	"github.com/enmanuel/unibus/pkg/embeddednats"
	"github.com/enmanuel/unibus/pkg/frame"
	"github.com/enmanuel/unibus/pkg/membership"
	"github.com/enmanuel/unibus/pkg/room"
)
|
||||
|
||||
// Fixed local endpoints and paths of the playground (ports verified free
// before assignment — do not change without reason).
const (
	// webAddr is the listen address of the browser-facing web UI.
	webAddr = "127.0.0.1:7700"

	// ctrlAddr is the listen address of the internal membership control
	// plane; ctrlURL is the base URL clients use to reach it.
	ctrlAddr = "127.0.0.1:8480"
	ctrlURL  = "http://" + ctrlAddr

	// natsPort is the port of the internal embedded NATS server; natsURL is
	// the matching client URL.
	natsPort = 4260
	natsURL  = "nats://127.0.0.1:4260"

	// localFiles is the directory that holds the playground's on-disk state.
	localFiles = "playground/local_files"
)
|
||||
|
||||
// indexHTML holds the contents of index.html, embedded at build time; it is
// the page handleIndex writes for requests to "/".
//
//go:embed index.html
var indexHTML []byte
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Event: a message received by a peer on one of its subscribed rooms. Fanned
|
||||
// out to every SSE listener attached to that peer.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Event is the JSON payload streamed to the browser for one message a peer
// received on a subscribed room.
type Event struct {
	// RoomID and Subject identify the room the message arrived on.
	RoomID  string `json:"room_id"`
	Subject string `json:"subject"`

	// Sender is taken from the received frame; Text is the message body.
	Sender string `json:"sender"`
	Text   string `json:"text"`

	// Encrypted mirrors the room's encryption flag.
	Encrypted bool `json:"encrypted"`

	// TS is the receive time in Unix milliseconds.
	TS int64 `json:"ts"`
}
|
||||
|
||||
// roomInfo is the per-room metadata a peer keeps so it can label the frames
// it receives: the room's subject and whether the room is encrypted.
type roomInfo struct {
	subject string
	encrypt bool
}
|
||||
|
||||
// peerState holds everything about one named peer: its bus client, its public
|
||||
// endpoint, its live subscriptions, the rooms it knows, and the set of SSE
|
||||
// listener channels currently attached to it.
|
||||
type peerState struct {
|
||||
name string
|
||||
client *client.Client
|
||||
endpoint client.Endpoint
|
||||
|
||||
mu sync.Mutex
|
||||
subs map[string]*client.Sub // roomID -> subscription
|
||||
rooms map[string]roomInfo // roomID -> subject/encrypt
|
||||
listeners map[chan Event]struct{} // attached SSE channels
|
||||
}
|
||||
|
||||
// emit fans an event out to all attached listeners without blocking on a slow
|
||||
// or disconnected consumer.
|
||||
func (p *peerState) emit(ev Event) {
|
||||
p.mu.Lock()
|
||||
defer p.mu.Unlock()
|
||||
for ch := range p.listeners {
|
||||
select {
|
||||
case ch <- ev:
|
||||
default: // listener buffer full: drop rather than block the NATS callback
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (p *peerState) addListener(ch chan Event) {
|
||||
p.mu.Lock()
|
||||
p.listeners[ch] = struct{}{}
|
||||
p.mu.Unlock()
|
||||
}
|
||||
|
||||
func (p *peerState) removeListener(ch chan Event) {
|
||||
p.mu.Lock()
|
||||
delete(p.listeners, ch)
|
||||
p.mu.Unlock()
|
||||
}
|
||||
|
||||
func (p *peerState) setRoom(roomID string, info roomInfo) {
|
||||
p.mu.Lock()
|
||||
p.rooms[roomID] = info
|
||||
p.mu.Unlock()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Hub: the registry of peers, protected by a single mutex.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type Hub struct {
|
||||
mu sync.Mutex
|
||||
peers map[string]*peerState
|
||||
}
|
||||
|
||||
// newHub returns an empty Hub ready for use.
func newHub() *Hub {
	return &Hub{
		peers: make(map[string]*peerState),
	}
}
|
||||
|
||||
// getOrCreate returns the peer for name, creating its identity + bus client on
|
||||
// first use. Identities persist to playground/local_files/<name>.id so a peer
|
||||
// keeps the same endpoint across restarts.
|
||||
func (h *Hub) getOrCreate(name string) (*peerState, error) {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
if p, ok := h.peers[name]; ok {
|
||||
return p, nil
|
||||
}
|
||||
idPath := filepath.Join(localFiles, name+".id")
|
||||
id, err := client.LoadOrCreateIdentity(idPath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("identity for %q: %w", name, err)
|
||||
}
|
||||
c, err := client.New(natsURL, ctrlURL, id)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("client for %q: %w", name, err)
|
||||
}
|
||||
p := &peerState{
|
||||
name: name,
|
||||
client: c,
|
||||
endpoint: c.Endpoint(),
|
||||
subs: map[string]*client.Sub{},
|
||||
rooms: map[string]roomInfo{},
|
||||
listeners: map[chan Event]struct{}{},
|
||||
}
|
||||
h.peers[name] = p
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// lookup returns an already-created peer or false.
|
||||
func (h *Hub) lookup(name string) (*peerState, bool) {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
p, ok := h.peers[name]
|
||||
return p, ok
|
||||
}
|
||||
|
||||
// list returns a snapshot of all peers (name + endpoint id).
|
||||
func (h *Hub) list() []map[string]string {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
out := make([]map[string]string, 0, len(h.peers))
|
||||
for name, p := range h.peers {
|
||||
out = append(out, map[string]string{"name": name, "endpoint_id": p.endpoint.ID})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (h *Hub) closeAll() {
|
||||
h.mu.Lock()
|
||||
defer h.mu.Unlock()
|
||||
for _, p := range h.peers {
|
||||
p.mu.Lock()
|
||||
for _, sub := range p.subs {
|
||||
_ = sub.Unsubscribe()
|
||||
}
|
||||
p.mu.Unlock()
|
||||
_ = p.client.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// subscribeRoom subscribes the peer to a room (idempotent) and wires the frame
|
||||
// handler to fan incoming messages out as Events. info labels each event with
|
||||
// the room's subject and encryption flag.
|
||||
func (p *peerState) subscribeRoom(roomID string, info roomInfo) error {
|
||||
p.mu.Lock()
|
||||
if _, already := p.subs[roomID]; already {
|
||||
p.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
p.mu.Unlock()
|
||||
|
||||
sub, err := p.client.Subscribe(roomID, func(f frame.Frame, plaintext []byte) {
|
||||
p.emit(Event{
|
||||
RoomID: roomID,
|
||||
Subject: info.subject,
|
||||
Sender: f.Sender,
|
||||
Text: string(plaintext),
|
||||
Encrypted: info.encrypt,
|
||||
TS: time.Now().UnixMilli(),
|
||||
})
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("subscribe room %s: %w", roomID, err)
|
||||
}
|
||||
p.mu.Lock()
|
||||
p.subs[roomID] = sub
|
||||
p.mu.Unlock()
|
||||
p.setRoom(roomID, info)
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Control-plane helper: fetch a room's subject + policy from membershipd. The
|
||||
// client package keeps fetchRoom private, so the playground talks to the
|
||||
// control plane directly (read endpoints are unauthenticated by design).
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// ctrlRoomResp is the subset of the control plane's room response that the
// playground decodes: the subject, the epoch, and the room policy flags.
type ctrlRoomResp struct {
	Subject string `json:"subject"`
	Epoch   int    `json:"epoch"`

	// Policy carries the per-room policy flags.
	Policy struct {
		Encrypt  bool `json:"encrypt"`
		Persist  bool `json:"persist"`
		SignMsgs bool `json:"sign_msgs"`
	} `json:"policy"`
}
|
||||
|
||||
func fetchRoomInfo(roomID string) (roomInfo, error) {
|
||||
resp, err := http.Get(ctrlURL + "/rooms/" + roomID)
|
||||
if err != nil {
|
||||
return roomInfo{}, fmt.Errorf("fetch room %s: %w", roomID, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 300 {
|
||||
return roomInfo{}, fmt.Errorf("room %s not found (status %d)", roomID, resp.StatusCode)
|
||||
}
|
||||
var r ctrlRoomResp
|
||||
if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
|
||||
return roomInfo{}, fmt.Errorf("decode room %s: %w", roomID, err)
|
||||
}
|
||||
return roomInfo{subject: r.Subject, encrypt: r.Policy.Encrypt}, nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HTTP handlers (web UI on :7700).
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// writeJSON sends v as a JSON response with the given status code. The
// Content-Type header is set before the status line is written; an encoding
// error is ignored because the header has already gone out by then.
func writeJSON(w http.ResponseWriter, code int, v any) {
	hdr := w.Header()
	hdr.Set("Content-Type", "application/json")
	w.WriteHeader(code)

	enc := json.NewEncoder(w)
	_ = enc.Encode(v)
}
|
||||
|
||||
func writeErr(w http.ResponseWriter, code int, msg string) {
|
||||
writeJSON(w, code, map[string]string{"error": msg})
|
||||
}
|
||||
|
||||
// decodeBody decodes the request body as JSON into out and closes the body
// before returning. It returns the decoder's error, if any.
func decodeBody(r *http.Request, out any) error {
	body := r.Body
	defer body.Close()

	dec := json.NewDecoder(body)
	return dec.Decode(out)
}
|
||||
|
||||
func (h *Hub) handleIndex(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
_, _ = w.Write(indexHTML)
|
||||
}
|
||||
|
||||
func (h *Hub) handlePeer(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Name string `json:"name"`
|
||||
}
|
||||
if err := decodeBody(r, &req); err != nil || req.Name == "" {
|
||||
writeErr(w, http.StatusBadRequest, "name required")
|
||||
return
|
||||
}
|
||||
p, err := h.getOrCreate(req.Name)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]string{"name": p.name, "endpoint_id": p.endpoint.ID})
|
||||
}
|
||||
|
||||
func (h *Hub) handlePeers(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, http.StatusOK, h.list())
|
||||
}
|
||||
|
||||
func (h *Hub) handleRoom(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Peer string `json:"peer"`
|
||||
Subject string `json:"subject"`
|
||||
Encrypt bool `json:"encrypt"`
|
||||
Persist bool `json:"persist"`
|
||||
}
|
||||
if err := decodeBody(r, &req); err != nil || req.Peer == "" || req.Subject == "" {
|
||||
writeErr(w, http.StatusBadRequest, "peer and subject required")
|
||||
return
|
||||
}
|
||||
p, ok := h.lookup(req.Peer)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusBadRequest, "unknown peer "+req.Peer)
|
||||
return
|
||||
}
|
||||
// The two checkboxes map to an explicit per-room policy. encrypt drives both
|
||||
// encryption and per-message signing; persist (default false) independently
|
||||
// toggles durable JetStream history. persist=false keeps plain ephemeral NATS.
|
||||
policy := room.Policy{Encrypt: req.Encrypt, Persist: req.Persist, SignMsgs: req.Encrypt}
|
||||
roomID, err := p.client.CreateRoom(req.Subject, policy)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
info := roomInfo{subject: req.Subject, encrypt: req.Encrypt}
|
||||
if err := p.subscribeRoom(roomID, info); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{
|
||||
"room_id": roomID, "subject": req.Subject, "encrypt": req.Encrypt, "persist": req.Persist,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *Hub) handleJoin(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Peer string `json:"peer"`
|
||||
RoomID string `json:"room_id"`
|
||||
}
|
||||
if err := decodeBody(r, &req); err != nil || req.Peer == "" || req.RoomID == "" {
|
||||
writeErr(w, http.StatusBadRequest, "peer and room_id required")
|
||||
return
|
||||
}
|
||||
p, ok := h.lookup(req.Peer)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusBadRequest, "unknown peer "+req.Peer)
|
||||
return
|
||||
}
|
||||
if err := p.client.Join(req.RoomID); err != nil {
|
||||
writeErr(w, http.StatusBadRequest, "join failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
info, err := fetchRoomInfo(req.RoomID)
|
||||
if err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := p.subscribeRoom(req.RoomID, info); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"subject": info.subject, "encrypt": info.encrypt})
|
||||
}
|
||||
|
||||
func (h *Hub) handleInvite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Peer string `json:"peer"`
|
||||
RoomID string `json:"room_id"`
|
||||
Target string `json:"target"`
|
||||
}
|
||||
if err := decodeBody(r, &req); err != nil || req.Peer == "" || req.RoomID == "" || req.Target == "" {
|
||||
writeErr(w, http.StatusBadRequest, "peer, room_id and target required")
|
||||
return
|
||||
}
|
||||
p, ok := h.lookup(req.Peer)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusBadRequest, "unknown peer "+req.Peer)
|
||||
return
|
||||
}
|
||||
target, ok := h.lookup(req.Target)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusBadRequest, "target peer "+req.Target+" does not exist; connect it first")
|
||||
return
|
||||
}
|
||||
if err := p.client.Invite(req.RoomID, target.endpoint); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]string{"status": "invited", "target": req.Target})
|
||||
}
|
||||
|
||||
func (h *Hub) handlePublish(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Peer string `json:"peer"`
|
||||
RoomID string `json:"room_id"`
|
||||
Text string `json:"text"`
|
||||
}
|
||||
if err := decodeBody(r, &req); err != nil || req.Peer == "" || req.RoomID == "" {
|
||||
writeErr(w, http.StatusBadRequest, "peer and room_id required")
|
||||
return
|
||||
}
|
||||
p, ok := h.lookup(req.Peer)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusBadRequest, "unknown peer "+req.Peer)
|
||||
return
|
||||
}
|
||||
if err := p.client.Publish(req.RoomID, []byte(req.Text)); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]string{"status": "published"})
|
||||
}
|
||||
|
||||
func (h *Hub) handleKick(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Peer string `json:"peer"`
|
||||
RoomID string `json:"room_id"`
|
||||
Target string `json:"target"`
|
||||
}
|
||||
if err := decodeBody(r, &req); err != nil || req.Peer == "" || req.RoomID == "" || req.Target == "" {
|
||||
writeErr(w, http.StatusBadRequest, "peer, room_id and target required")
|
||||
return
|
||||
}
|
||||
p, ok := h.lookup(req.Peer)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusBadRequest, "unknown peer "+req.Peer)
|
||||
return
|
||||
}
|
||||
target, ok := h.lookup(req.Target)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusBadRequest, "target peer "+req.Target+" does not exist")
|
||||
return
|
||||
}
|
||||
if err := p.client.Kick(req.RoomID, target.endpoint.ID); err != nil {
|
||||
writeErr(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]string{"status": "kicked", "target": req.Target})
|
||||
}
|
||||
|
||||
// handleStream is the SSE endpoint. The browser opens one EventSource per peer;
|
||||
// each received Event is emitted as a `data: <json>\n\n` block. The listener is
|
||||
// cleaned up when the HTTP request context is cancelled (tab closed / reload).
|
||||
func (h *Hub) handleStream(w http.ResponseWriter, r *http.Request) {
|
||||
name := r.URL.Query().Get("peer")
|
||||
if name == "" {
|
||||
writeErr(w, http.StatusBadRequest, "peer query param required")
|
||||
return
|
||||
}
|
||||
p, ok := h.lookup(name)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusBadRequest, "unknown peer "+name)
|
||||
return
|
||||
}
|
||||
flusher, ok := w.(http.Flusher)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusInternalServerError, "streaming unsupported")
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/event-stream")
|
||||
w.Header().Set("Cache-Control", "no-cache")
|
||||
w.Header().Set("Connection", "keep-alive")
|
||||
|
||||
ch := make(chan Event, 64)
|
||||
p.addListener(ch)
|
||||
defer p.removeListener(ch)
|
||||
|
||||
// Initial comment so the browser marks the stream open immediately.
|
||||
fmt.Fprintf(w, ": connected to %s\n\n", name)
|
||||
flusher.Flush()
|
||||
|
||||
ctx := r.Context()
|
||||
ping := time.NewTicker(20 * time.Second)
|
||||
defer ping.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ping.C:
|
||||
fmt.Fprintf(w, ": ping\n\n")
|
||||
flusher.Flush()
|
||||
case ev := <-ch:
|
||||
b, err := json.Marshal(ev)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(w, "data: %s\n\n", b)
|
||||
flusher.Flush()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Benchmark: one publisher floods a room with thousands of messages that N
|
||||
// subscribers receive. The two policy axes are exposed as independent flags:
|
||||
// encrypt (AEAD payload + Ed25519 per-message signature) and persist (durable
|
||||
// JetStream history vs ephemeral core NATS). Payload size is configurable. The
|
||||
// benchmark uses its own ephemeral peers (not the hub's named peers) so it never
|
||||
// interferes with the manual sandbox, and streams progress samples over SSE so
|
||||
// the browser can animate a live throughput chart.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// benchSample is the JSON payload of one Server-Sent Event emitted by a
// running benchmark. Type, T, Sent and Recv are always present; the remaining
// fields are omitted from the JSON when they hold their zero value.
type benchSample struct {
	// Type is "start", "sample", "done" or "error".
	Type string `json:"type"`

	// Running totals at elapsed time T (seconds).
	T    float64 `json:"t"`
	Sent int64   `json:"sent"`
	Recv int64   `json:"recv"`

	// Run parameters.
	NMsgs   int  `json:"n_msgs,omitempty"`
	NSubs   int  `json:"n_subs,omitempty"`
	Payload int  `json:"payload,omitempty"`
	Encrypt bool `json:"encrypt,omitempty"`
	Persist bool `json:"persist,omitempty"`
	Capped  bool `json:"capped,omitempty"`

	// Final results.
	PubTps  int64   `json:"pub_tps,omitempty"`
	RecvTps int64   `json:"recv_tps,omitempty"`
	PerSub  []int64 `json:"per_sub,omitempty"`

	// Msg is the error text of an "error" sample.
	Msg string `json:"msg,omitempty"`
}
|
||||
|
||||
// runBench wires up one publisher + nSubs subscribers, publishes nMsgs payloads,
|
||||
// and calls emit periodically with the running totals. emit is only ever called
|
||||
// from the calling goroutine (the SSE handler), so it needs no locking.
|
||||
func runBench(ctx context.Context, emit func(benchSample), nMsgs, nSubs, payloadBytes int, encrypt, persist bool) {
|
||||
policy := room.Policy{Encrypt: encrypt, Persist: persist, SignMsgs: encrypt}
|
||||
subject := fmt.Sprintf("bench.%d", time.Now().UnixNano())
|
||||
|
||||
newPeer := func() (*client.Client, error) {
|
||||
id, err := cs.GenerateIdentity()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return client.New(natsURL, ctrlURL, id)
|
||||
}
|
||||
|
||||
pub, err := newPeer()
|
||||
if err != nil {
|
||||
emit(benchSample{Type: "error", Msg: "publisher: " + err.Error()})
|
||||
return
|
||||
}
|
||||
defer pub.Close()
|
||||
|
||||
roomID, err := pub.CreateRoom(subject, policy)
|
||||
if err != nil {
|
||||
emit(benchSample{Type: "error", Msg: "create room: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
counters := make([]int64, nSubs)
|
||||
subClients := make([]*client.Client, 0, nSubs)
|
||||
defer func() {
|
||||
for _, c := range subClients {
|
||||
_ = c.Close()
|
||||
}
|
||||
}()
|
||||
|
||||
// One room, N subscribers. For encrypted rooms each subscriber must be invited
|
||||
// (sealed key) and join before subscribing; for cleartext rooms Subscribe on
|
||||
// the shared roomID is enough.
|
||||
for i := 0; i < nSubs; i++ {
|
||||
c, err := newPeer()
|
||||
if err != nil {
|
||||
emit(benchSample{Type: "error", Msg: fmt.Sprintf("subscriber %d: %v", i, err)})
|
||||
return
|
||||
}
|
||||
subClients = append(subClients, c)
|
||||
if encrypt {
|
||||
if err := pub.Invite(roomID, c.Endpoint()); err != nil {
|
||||
emit(benchSample{Type: "error", Msg: fmt.Sprintf("invite %d: %v", i, err)})
|
||||
return
|
||||
}
|
||||
if err := c.Join(roomID); err != nil {
|
||||
emit(benchSample{Type: "error", Msg: fmt.Sprintf("join %d: %v", i, err)})
|
||||
return
|
||||
}
|
||||
}
|
||||
idx := i
|
||||
if _, err := c.Subscribe(roomID, func(_ frame.Frame, _ []byte) {
|
||||
atomic.AddInt64(&counters[idx], 1)
|
||||
}); err != nil {
|
||||
emit(benchSample{Type: "error", Msg: fmt.Sprintf("subscribe %d: %v", i, err)})
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
sumRecv := func() int64 {
|
||||
var s int64
|
||||
for i := range counters {
|
||||
s += atomic.LoadInt64(&counters[i])
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
payload := bytes.Repeat([]byte{'x'}, payloadBytes)
|
||||
var sent int64
|
||||
|
||||
emit(benchSample{Type: "start", NMsgs: nMsgs, NSubs: nSubs, Payload: payloadBytes, Encrypt: encrypt, Persist: persist})
|
||||
|
||||
t0 := time.Now()
|
||||
done := make(chan struct{})
|
||||
var pubErr atomic.Value
|
||||
go func() {
|
||||
defer close(done)
|
||||
for k := 0; k < nMsgs; k++ {
|
||||
if err := pub.Publish(roomID, payload); err != nil {
|
||||
pubErr.Store(err)
|
||||
return
|
||||
}
|
||||
atomic.AddInt64(&sent, 1)
|
||||
if k%256 == 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
ticker := time.NewTicker(60 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
deadline := time.After(120 * time.Second)
|
||||
target := int64(nMsgs) * int64(nSubs)
|
||||
|
||||
sampleLoop:
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-deadline:
|
||||
break sampleLoop
|
||||
case <-done:
|
||||
break sampleLoop
|
||||
case <-ticker.C:
|
||||
emit(benchSample{Type: "sample", T: time.Since(t0).Seconds(), Sent: atomic.LoadInt64(&sent), Recv: sumRecv()})
|
||||
}
|
||||
}
|
||||
if v := pubErr.Load(); v != nil {
|
||||
emit(benchSample{Type: "error", Msg: "publish: " + v.(error).Error()})
|
||||
return
|
||||
}
|
||||
|
||||
// Final drain: keep sampling until every subscriber has caught up (or we give up).
|
||||
for i := 0; i < 240; i++ {
|
||||
if sumRecv() >= target {
|
||||
break
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(25 * time.Millisecond):
|
||||
}
|
||||
emit(benchSample{Type: "sample", T: time.Since(t0).Seconds(), Sent: atomic.LoadInt64(&sent), Recv: sumRecv()})
|
||||
}
|
||||
|
||||
dur := time.Since(t0).Seconds()
|
||||
finalSent := atomic.LoadInt64(&sent)
|
||||
finalRecv := sumRecv()
|
||||
per := make([]int64, nSubs)
|
||||
for i := range counters {
|
||||
per[i] = atomic.LoadInt64(&counters[i])
|
||||
}
|
||||
var pubTps, recvTps int64
|
||||
if dur > 0 {
|
||||
pubTps = int64(float64(finalSent) / dur)
|
||||
recvTps = int64(float64(finalRecv) / dur)
|
||||
}
|
||||
emit(benchSample{Type: "done", T: dur, Sent: finalSent, Recv: finalRecv, PerSub: per, PubTps: pubTps, RecvTps: recvTps, NSubs: nSubs})
|
||||
}
|
||||
|
||||
// handleBench is the SSE endpoint that drives a benchmark from query params:
|
||||
//
|
||||
// GET /api/bench?n_msgs=20000&n_subs=3&payload=128&encrypt=0&persist=0
|
||||
//
|
||||
// Encrypted/persistent runs are capped to a lower message count (the per-message
|
||||
// crypto + JetStream ack make them far slower); the cap is reported in the start
|
||||
// sample so the UI can show it.
|
||||
func (h *Hub) handleBench(w http.ResponseWriter, r *http.Request) {
|
||||
q := r.URL.Query()
|
||||
atoiDef := func(k string, def int) int {
|
||||
if v, err := strconv.Atoi(q.Get(k)); err == nil {
|
||||
return v
|
||||
}
|
||||
return def
|
||||
}
|
||||
truthy := func(k string) bool { v := q.Get(k); return v == "1" || v == "true" }
|
||||
|
||||
nMsgs := atoiDef("n_msgs", 20000)
|
||||
nSubs := atoiDef("n_subs", 3)
|
||||
payload := atoiDef("payload", 128)
|
||||
encrypt := truthy("encrypt")
|
||||
persist := truthy("persist")
|
||||
|
||||
if nSubs < 1 {
|
||||
nSubs = 1
|
||||
} else if nSubs > 16 {
|
||||
nSubs = 16
|
||||
}
|
||||
if payload < 1 {
|
||||
payload = 1
|
||||
} else if payload > 8192 {
|
||||
payload = 8192
|
||||
}
|
||||
if nMsgs < 100 {
|
||||
nMsgs = 100
|
||||
}
|
||||
maxMsgs := 200000
|
||||
if encrypt || persist {
|
||||
maxMsgs = 30000 // crypto + JetStream ack are much slower; keep the run bounded
|
||||
}
|
||||
capped := false
|
||||
if nMsgs > maxMsgs {
|
||||
nMsgs, capped = maxMsgs, true
|
||||
}
|
||||
|
||||
flusher, ok := w.(http.Flusher)
|
||||
if !ok {
|
||||
writeErr(w, http.StatusInternalServerError, "streaming unsupported")
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "text/event-stream")
|
||||
w.Header().Set("Cache-Control", "no-cache")
|
||||
w.Header().Set("Connection", "keep-alive")
|
||||
fmt.Fprintf(w, ": bench start\n\n")
|
||||
flusher.Flush()
|
||||
|
||||
emit := func(s benchSample) {
|
||||
if s.Type == "start" {
|
||||
s.Capped = capped
|
||||
}
|
||||
b, err := json.Marshal(s)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
fmt.Fprintf(w, "data: %s\n\n", b)
|
||||
flusher.Flush()
|
||||
}
|
||||
|
||||
runBench(r.Context(), emit, nMsgs, nSubs, payload, encrypt, persist)
|
||||
fmt.Fprintf(w, "event: end\ndata: {}\n\n")
|
||||
flusher.Flush()
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// main: bring up NATS, control plane, and the web server; tear them all down
|
||||
// cleanly on signal.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func main() {
|
||||
log.SetFlags(log.LstdFlags | log.Lmsgprefix)
|
||||
log.SetPrefix("[playground] ")
|
||||
|
||||
if err := os.MkdirAll(localFiles, 0o755); err != nil {
|
||||
log.Fatalf("mkdir %s: %v", localFiles, err)
|
||||
}
|
||||
|
||||
// 1. Data plane: embedded NATS + JetStream on the fixed internal port.
|
||||
ns, err := embeddednats.Start(filepath.Join(localFiles, "js"), natsPort)
|
||||
if err != nil {
|
||||
log.Fatalf("start embedded nats: %v", err)
|
||||
}
|
||||
log.Printf("embedded NATS (JetStream) ready: %s", embeddednats.ClientURL(ns))
|
||||
|
||||
// 2. Control plane: membership store + blob store + internal HTTP server.
|
||||
store, err := membership.Open(filepath.Join(localFiles, "play.db"))
|
||||
if err != nil {
|
||||
ns.Shutdown()
|
||||
log.Fatalf("open membership store: %v", err)
|
||||
}
|
||||
blobs, err := blobstore.New(filepath.Join(localFiles, "blobs"))
|
||||
if err != nil {
|
||||
store.Close()
|
||||
ns.Shutdown()
|
||||
log.Fatalf("open blob store: %v", err)
|
||||
}
|
||||
ctrlSrv := &http.Server{Addr: ctrlAddr, Handler: membership.NewServer(store, blobs)}
|
||||
go func() {
|
||||
if err := ctrlSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
log.Fatalf("control plane: %v", err)
|
||||
}
|
||||
}()
|
||||
if err := waitHealthy(ctrlURL+"/healthz", 5*time.Second); err != nil {
|
||||
log.Fatalf("control plane not healthy: %v", err)
|
||||
}
|
||||
log.Printf("control plane ready: %s", ctrlURL)
|
||||
|
||||
// 3. Web UI on :7700.
|
||||
hub := newHub()
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/", hub.handleIndex)
|
||||
mux.HandleFunc("POST /api/peer", hub.handlePeer)
|
||||
mux.HandleFunc("GET /api/peers", hub.handlePeers)
|
||||
mux.HandleFunc("POST /api/room", hub.handleRoom)
|
||||
mux.HandleFunc("POST /api/join", hub.handleJoin)
|
||||
mux.HandleFunc("POST /api/invite", hub.handleInvite)
|
||||
mux.HandleFunc("POST /api/publish", hub.handlePublish)
|
||||
mux.HandleFunc("POST /api/kick", hub.handleKick)
|
||||
mux.HandleFunc("GET /api/stream", hub.handleStream)
|
||||
mux.HandleFunc("GET /api/bench", hub.handleBench)
|
||||
webSrv := &http.Server{Addr: webAddr, Handler: mux}
|
||||
go func() {
|
||||
if err := webSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||
log.Fatalf("web server: %v", err)
|
||||
}
|
||||
}()
|
||||
log.Printf("web UI ready: http://%s", webAddr)
|
||||
log.Printf("open http://localhost:7700 in two browser tabs to try the bus")
|
||||
|
||||
// 4. Graceful shutdown.
|
||||
stop := make(chan os.Signal, 1)
|
||||
signal.Notify(stop, syscall.SIGINT, syscall.SIGTERM)
|
||||
<-stop
|
||||
log.Printf("shutting down...")
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
_ = webSrv.Shutdown(ctx)
|
||||
hub.closeAll()
|
||||
_ = ctrlSrv.Shutdown(ctx)
|
||||
store.Close()
|
||||
ns.Shutdown()
|
||||
ns.WaitForShutdown()
|
||||
log.Printf("bye")
|
||||
}
|
||||
|
||||
// waitHealthy polls url roughly every 100 ms until a GET answers with a status
// code below 400, or until timeout has elapsed. Each probe is limited to
// 500 ms. It returns nil on the first healthy answer and a timeout error
// otherwise.
func waitHealthy(url string, timeout time.Duration) error {
	probe := &http.Client{Timeout: 500 * time.Millisecond}

	// The post statement sleeps between attempts, including after a failed one.
	for stopAt := time.Now().Add(timeout); time.Now().Before(stopAt); time.Sleep(100 * time.Millisecond) {
		resp, err := probe.Get(url)
		if err != nil {
			continue
		}
		resp.Body.Close()
		if resp.StatusCode < 400 {
			return nil
		}
	}
	return fmt.Errorf("timeout waiting for %s", url)
}
|
||||
@@ -0,0 +1,5 @@
|
||||
node_modules/
|
||||
dist/
|
||||
*.local
|
||||
.vite/
|
||||
*.tsbuildinfo
|
||||
@@ -0,0 +1,12 @@
|
||||
<!doctype html>
|
||||
<html lang="es">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>unibus</title>
|
||||
</head>
|
||||
<body>
|
||||
<div id="root"></div>
|
||||
<script type="module" src="/src/main.tsx"></script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"name": "unibus-web",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "vite",
|
||||
"build": "tsc -b && vite build",
|
||||
"preview": "vite preview"
|
||||
},
|
||||
"dependencies": {
|
||||
"@mantine/core": "^9.3.0",
|
||||
"@mantine/hooks": "^9.3.0",
|
||||
"@tabler/icons-react": "^3.36.0",
|
||||
"react": "^19.2.0",
|
||||
"react-dom": "^19.2.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/react": "^19.2.0",
|
||||
"@types/react-dom": "^19.2.0",
|
||||
"@vitejs/plugin-react": "^4.3.4",
|
||||
"postcss": "^8.4.49",
|
||||
"postcss-preset-mantine": "^1.17.0",
|
||||
"postcss-simple-vars": "^7.0.1",
|
||||
"typescript": "~5.6.3",
|
||||
"vite": "^6.0.3"
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user