Files
unibus/deploy/cluster/generate-cluster-certs.sh
T
egutierrez 48a3d6be33 docs(0006g): cluster deploy material for magnus+homer+datardos (R3 HA)
Parameterized, NO-VPS-touched material to bring up unibus as a 3-node cluster.
The authoring agent ran none of it on a host; every remote-changing step is
marked HUMAN and deploy-cluster.sh defaults to a dry run.

deploy/cluster/:
- nodes.env — topology (cluster name, ports, per-node rows). Public IPs known
  (homer 141.94.69.66, datardos 51.91.100.142) pre-filled; magnus public IP and
  all WireGuard IPs are <PLACEHOLDER> for the human; scripts refuse to run while
  any remain.
- generate-cluster-certs.sh — mints a SEPARATE cluster route CA + a route cert per
  node (server+clientAuth, mutual routes) and a data-plane server cert per node
  signed by the reused client CA (../tls/ca.*); SAN = public + WG + hostname.
- membershipd-cluster.service — one unit, parameterized per node via
  /opt/unibus/cluster.env: enforce + per-subject ACL + TLS + --store kv,
  --cluster-pass-file (secret out of argv), Restart=always.
- deploy-cluster.sh — cross-build linux/amd64, generate each node's cluster.env
  (routes to the other two on the WG mesh, no userinfo), rsync + install (only
  with --yes); staggered start is manual.
- README.md — runbook: prerequisites, loopback bootstrap to seed the first admin
  into the KV (works around the user-CLI/KV chicken-and-egg), staggered bring-up,
  verify posture+quorum, scale R1->R3 in place, and the chaos test (left to 0003f
  on the real VPS).
- .gitignore — out/, build/, secrets/, *.key never committed.

bash -n passes on both scripts; go build/test unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 17:31:13 +02:00

121 lines
4.6 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# generate-cluster-certs.sh — mint the TLS material for a unibus 3-node cluster
# (issue 0006g). Run ONCE on a trusted machine (e.g. om, which custodies the bus
# CA); distribute the per-node output to each node over a secure channel. This
# script touches NO remote host.
#
# It produces two trust roots, kept SEPARATE on purpose (audit 0008 N1-low):
#
#   1. The CLUSTER route CA (cluster-ca.crt/key, generated here): signs each
#      node's ROUTE certificate. The route layer authenticates NODES, not bus
#      users, so it must NOT share the client data-plane CA — a client cert can
#      then never be presented to the route port.
#   2. The CLIENT data-plane CA (../tls/ca.crt/key, the one clients pin): signs
#      each node's DATA-PLANE server certificate. Reused, not regenerated, so
#      existing clients keep trusting the bus.
#
# Per node it emits, under out/<name>/:
#   route-<name>.crt/key    route cert (cluster CA), EKU server+clientAuth
#                           (each node is BOTH server and dialer to its peers)
#   server-<name>.crt/key   data-plane cert (client CA), EKU serverAuth
#   cluster-ca.crt          the route CA cert (for --route-tls-ca)
#   ca.crt                  the client CA cert (for clients / control-plane TLS)
#
# SANs per node = its public IP + its WireGuard IP + its hostname + localhost
# (both DNS:localhost and IP:127.0.0.1).
#
# Key material: EC P-256 (Go crypto/tls + nats-server friendly), matching
# ../tls/generate-certs.sh.
set -euo pipefail

# Work from the script's own directory so every relative path below
# (nodes.env, ../tls/, cluster-ca.*) resolves the same way from any cwd.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$DIR"

if [[ ! -f nodes.env ]]; then
  echo "ERROR: nodes.env not found in ${DIR}." >&2
  exit 2
fi

# Refuse to run while any placeholder remains (HUMAN must fill nodes.env first).
# This is checked BEFORE sourcing: an unquoted <PLACEHOLDER> would otherwise be
# parsed by the shell as a redirection and fail with a confusing error.
# grep -E is used because '\+' in a basic regex is not portable.
if grep -Eq '<[A-Z_]+>' nodes.env; then
  echo "ERROR: nodes.env still has <PLACEHOLDER> values — fill them in first." >&2
  grep -En '<[A-Z_]+>' nodes.env >&2
  exit 2
fi

# shellcheck source=/dev/null
source ./nodes.env

# The client data-plane CA is reused, never generated here.
CLIENT_CA_CRT="../tls/ca.crt"
CLIENT_CA_KEY="../tls/ca.key"
if [[ ! -f "$CLIENT_CA_CRT" || ! -f "$CLIENT_CA_KEY" ]]; then
  echo "ERROR: client data-plane CA not found at ../tls/ca.{crt,key}." >&2
  echo "       Run ../tls/generate-certs.sh first (it mints the client CA)." >&2
  exit 2
fi

# Validity periods, in days, for the cluster CA and for the per-node certs.
DAYS_CA=3650
DAYS_CRT=825

# --force regenerates the cluster route CA even if one already exists.
# Anything else is reported (a mistyped --force would otherwise silently
# reuse the old CA) but does not abort the run.
force=0
for arg in "$@"; do
  case "$arg" in
    --force) force=1 ;;
    *) echo "WARNING: ignoring unknown argument '${arg}' (only --force is recognised)." >&2 ;;
  esac
done

# --- cluster route CA (separate trust root) ---
if [[ ! -f cluster-ca.crt || ! -f cluster-ca.key || $force -eq 1 ]]; then
  echo "==> generating cluster route CA (separate from the client CA)"
  # Create the key under a restrictive umask so it is never readable by others,
  # not even briefly; the chmod still tightens a pre-existing file that openssl
  # merely overwrote.
  (umask 077 && openssl ecparam -name prime256v1 -genkey -noout -out cluster-ca.key)
  chmod 600 cluster-ca.key
  openssl req -x509 -new -key cluster-ca.key -sha256 -days "$DAYS_CA" \
    -subj "/CN=unibus-cluster-ca" -out cluster-ca.crt
else
  echo "==> reusing existing cluster route CA (pass --force to regenerate)"
fi
# mint_cert <out_key> <out_crt> <subject_cn> <san> <eku> <ca_crt> <ca_key>
#
# Generate a fresh EC P-256 private key and a certificate for it, signed by the
# given CA.
#
# Globals:
#   DAYS_CRT (read)  certificate validity, in days
# Arguments:
#   $1 out_key  path to write the private key (left at mode 600)
#   $2 out_crt  path to write the signed certificate
#   $3 cn       subject common name
#   $4 san      subjectAltName value, e.g. "IP:10.0.0.1,DNS:host"
#   $5 eku      extendedKeyUsage value, e.g. "serverAuth,clientAuth"
#   $6 ca_crt   signing CA certificate
#   $7 ca_key   signing CA private key
# Returns:
#   0 on success, otherwise the status of the first step that failed. The
#   temporary CSR and extension files are removed on both paths.
mint_cert() {
  local out_key="$1" out_crt="$2" cn="$3" san="$4" eku="$5" ca_crt="$6" ca_key="$7"
  local csr ext rc=0

  csr="$(mktemp)" || return
  ext="$(mktemp)" || {
    rc=$?
    rm -f -- "$csr"
    return "$rc"
  }

  # The steps are chained with && (rather than relying on `set -e`) so that a
  # failure falls through to the cleanup below instead of exiting the script
  # with the temp files still on disk.
  {
    # Restrictive umask: the key is never readable by others, even briefly.
    # The chmod still tightens a pre-existing file that was only overwritten.
    (umask 077 && openssl ecparam -name prime256v1 -genkey -noout -out "$out_key") &&
      chmod 600 "$out_key" &&
      openssl req -new -key "$out_key" -subj "/CN=${cn}" -out "$csr" &&
      printf '%s\n' \
        "subjectAltName=${san}" \
        "extendedKeyUsage=${eku}" \
        "keyUsage=digitalSignature,keyEncipherment" >"$ext" &&
      openssl x509 -req -in "$csr" -CA "$ca_crt" -CAkey "$ca_key" -CAcreateserial \
        -sha256 -days "$DAYS_CRT" -extfile "$ext" -out "$out_crt"
  } || rc=$?

  rm -f -- "$csr" "$ext"
  return "$rc"
}
# --- per-node certificates ---
# Validate every row up front, before anything is minted, so a malformed row
# cannot leave a half-populated out/ tree or surface as an opaque openssl
# error about a bad SAN.
if [[ -z "${CLUSTER_NODES+x}" ]]; then
  echo "ERROR: CLUSTER_NODES is empty or unset — define the node rows in nodes.env." >&2
  exit 2
fi
for row in "${CLUSTER_NODES[@]}"; do
  read -r name _ssh pub wg extra <<<"$row"
  if [[ -z "$wg" || -n "$extra" ]]; then
    echo "ERROR: malformed CLUSTER_NODES row (want 4 fields: name ssh public-ip wg-ip): ${row}" >&2
    exit 2
  fi
done

for row in "${CLUSTER_NODES[@]}"; do
  # Row layout: name, ssh target (unused here), public IP, WireGuard IP.
  read -r name _ssh pub wg <<<"$row"
  echo "==> node ${name}: SAN IP:${pub}, IP:${wg}, DNS:${name}, localhost, 127.0.0.1"
  nodedir="out/${name}"
  mkdir -p "$nodedir"
  san="IP:${pub},IP:${wg},DNS:${name},DNS:localhost,IP:127.0.0.1"

  # Route cert: signed by the cluster CA; server+client auth (mutual routes).
  mint_cert "${nodedir}/route-${name}.key" "${nodedir}/route-${name}.crt" \
    "unibus-route-${name}" "$san" "serverAuth,clientAuth" \
    cluster-ca.crt cluster-ca.key

  # Data-plane server cert: signed by the client CA; serverAuth only.
  mint_cert "${nodedir}/server-${name}.key" "${nodedir}/server-${name}.crt" \
    "unibus-${name}" "$san" "serverAuth" \
    "$CLIENT_CA_CRT" "$CLIENT_CA_KEY"

  # Co-locate the two CA certs each node needs.
  cp cluster-ca.crt "${nodedir}/cluster-ca.crt"
  cp "$CLIENT_CA_CRT" "${nodedir}/ca.crt"
done

# Best-effort removal of the serial files left beside each CA cert by
# -CAcreateserial; a missing file is not an error.
rm -f cluster-ca.srl ../tls/ca.srl 2>/dev/null || true

echo
echo "==> done. Per-node material under out/<name>/ (KEYS ARE SECRET — never git):"
for row in "${CLUSTER_NODES[@]}"; do
  read -r name _rest <<<"$row"
  echo "    out/${name}/  (route-${name}.*, server-${name}.*, cluster-ca.crt, ca.crt)"
done
echo
echo "verify a SAN with:"
echo "  openssl x509 -in out/<name>/server-<name>.crt -noout -text | grep -A1 'Subject Alternative Name'"