48a3d6be33
Parameterized, NO-VPS-touched material to bring up unibus as a 3-node cluster. The authoring agent ran none of it on a host; every remote-changing step is marked HUMAN and deploy-cluster.sh defaults to a dry run. deploy/cluster/: - nodes.env — topology (cluster name, ports, per-node rows). Public IPs known (homer 141.94.69.66, datardos 51.91.100.142) pre-filled; magnus public IP and all WireGuard IPs are <PLACEHOLDER> for the human; scripts refuse to run while any remain. - generate-cluster-certs.sh — mints a SEPARATE cluster route CA + a route cert per node (server+clientAuth, mutual routes) and a data-plane server cert per node signed by the reused client CA (../tls/ca.*); SAN = public + WG + hostname. - membershipd-cluster.service — one unit, parameterized per node via /opt/unibus/cluster.env: enforce + per-subject ACL + TLS + --store kv, --cluster-pass-file (secret out of argv), Restart=always. - deploy-cluster.sh — cross-build linux/amd64, generate each node's cluster.env (routes to the other two on the WG mesh, no userinfo), rsync + install (only with --yes); staggered start is manual. - README.md — runbook: prerequisites, loopback bootstrap to seed the first admin into the KV (works around the user-CLI/KV chicken-and-egg), staggered bring-up, verify posture+quorum, scale R1->R3 in place, and the chaos test (left to 0003f on the real VPS). - .gitignore — out/, build/, secrets/, *.key never committed. bash -n passes on both scripts; go build/test unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
127 lines
4.7 KiB
Bash
Executable File
127 lines
4.7 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# deploy-cluster.sh — cross-build membershipd and stage it onto the three cluster
# nodes (issue 0006g). DEFAULT IS DRY-RUN: it prints the plan and touches nothing.
# (A dry run still writes the generated build/cluster-<name>.env files locally.)
# Pass --yes to actually rsync + run remote commands. Steps that a HUMAN must run
# (or confirm) are marked "HUMAN:".
#
# Prerequisites (HUMAN, once):
#   1. Fill nodes.env (no <PLACEHOLDER> left).
#   2. ./generate-cluster-certs.sh (mints out/<name>/ TLS material)
#   3. Create the route secret locally:
#        mkdir -p secrets && openssl rand -hex 32 > secrets/cluster.pass
#      (secrets/ is gitignored; it is rsynced to each node as cluster.pass)
#   4. SSH access to every node's SSH_HOST with sudo-less root (SSH_USER=root).
#
# What it does per node (with --yes):
#   - rsync the membershipd binary, the node's TLS material, the unit, the
#     generated cluster.env and the route secret into REMOTE_DIR.
#   - install + daemon-reload the systemd unit.
# Start is STAGGERED and left to the human (see README): start the seed node,
# seed the admin, then start the rest.
set -euo pipefail

# Run from the script's own directory so every relative path below
# (nodes.env, secrets/, out/, build/) resolves next to this file.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$DIR"

# Refuse to continue while nodes.env still carries <PLACEHOLDER>-style tokens.
# This runs BEFORE sourcing the file: an unquoted <TOKEN> would otherwise be
# parsed by the shell as redirections and fail with a confusing error.
# -E is used because '\+' is a GNU extension to basic regular expressions.
if grep -Eq '<[A-Z_]+>' nodes.env; then
  echo "ERROR: nodes.env still has <PLACEHOLDER> values — fill them in first." >&2
  exit 2
fi

# shellcheck source=/dev/null
source ./nodes.env

# APPLY=1 only when the first argument is exactly --yes; anything else is a dry run.
APPLY=0
if [[ "${1:-}" == "--yes" ]]; then
  APPLY=1
fi

# The cluster route secret must exist locally; it is shipped to every node.
SECRET_FILE="secrets/cluster.pass"
if [[ ! -f "$SECRET_FILE" ]]; then
  echo "ERROR: $SECRET_FILE missing. HUMAN: mkdir -p secrets && openssl rand -hex 32 > $SECRET_FILE" >&2
  exit 2
fi

#######################################
# Print a command, and execute it only when applying for real.
# Globals:   APPLY (read) — 1 executes the command; anything else (or unset)
#            is treated as a dry run.
# Arguments: the command and its arguments, exactly as they should be executed.
# Outputs:   " + <command words joined by spaces>" on stdout, followed by the
#            command's own output when it is executed.
# Returns:   the command's exit status when executed, 0 on a dry run.
#######################################
run() {
  # Echo every action; only execute it under --yes.
  printf ' + %s\n' "$*"
  if [[ "${APPLY:-0}" -eq 1 ]]; then
    "$@"
  fi
}

# Step 1: cross-compile a static linux/amd64 membershipd into build/.
# Goes through run(), so the build is only performed under --yes.
echo "==> [1/3] cross-build membershipd (linux/amd64, CGO disabled)"
run env CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o build/membershipd ../../cmd/membershipd

#######################################
# Build the comma-separated route list for a node = the OTHER nodes' addresses on
# the chosen network, with NO userinfo (the secret is injected by membershipd from
# the file).
# Globals:   CLUSTER_NODES (read) — rows of "<name> <ssh host> <public> <wg>"
#            ROUTE_NETWORK (read) — "public" selects the public address, any
#                                   other value selects the WireGuard address
#            NATS_ROUTE_PORT (read)
# Arguments: $1 - name of the node the list is built for (excluded from it)
# Outputs:   the route list on stdout (empty line when there are no peers)
# Returns:   0 on success, 1 when a peer row has no address for the network
#######################################
routes_for() {
  local self="$1" out=""
  local row name _ssh pub wg _rest addr
  for row in "${CLUSTER_NODES[@]}"; do
    # _rest soaks up any extra columns so they cannot leak into the wg address.
    read -r name _ssh pub wg _rest <<<"$row"
    if [[ "$name" == "$self" ]]; then
      continue
    fi
    if [[ "$ROUTE_NETWORK" == "public" ]]; then
      addr="$pub"
    else
      addr="$wg"
    fi
    # A short row would otherwise yield a bogus "nats://:PORT" route.
    if [[ -z "$addr" ]]; then
      echo "ERROR: node '${name}' has no ${ROUTE_NETWORK} address in CLUSTER_NODES." >&2
      return 1
    fi
    out+="nats://${addr}:${NATS_ROUTE_PORT},"
  done
  # Drop the trailing comma left by the loop.
  printf '%s\n' "${out%,}"
}

# Step 2: for every node row, generate its cluster.env locally and (under --yes)
# push binary, TLS material, secret, env file and systemd unit to the host.
echo "==> [2/3] stage each node (REMOTE_DIR=$REMOTE_DIR)"

# Generated env files are written under build/ even on a dry run (local only).
mkdir -p build

for row in "${CLUSTER_NODES[@]}"; do
  # Row layout: <name> <ssh host> <public addr> <wireguard addr>; only the first
  # two are needed here, the addresses are consumed by routes_for.
  read -r name ssh_host _pub _wg <<<"$row"
  target="${SSH_USER}@${ssh_host}"
  nodedir="out/${name}"
  if [[ ! -d "$nodedir" ]]; then
    echo "ERROR: $nodedir missing — run ./generate-cluster-certs.sh first." >&2
    exit 2
  fi
  routes="$(routes_for "$name")"

  echo "-- node ${name} (ssh ${ssh_host}) routes=${routes}"

  # Generate this node's cluster.env locally, then ship it.
  envfile="build/cluster-${name}.env"
  cat > "$envfile" <<EOF
NODE_NAME=${name}
CLUSTER_NAME=${CLUSTER_NAME}
CLUSTER_USER=${CLUSTER_USER}
KV_REPLICAS=${KV_REPLICAS}
HTTP_PORT=${HTTP_PORT}
NATS_CLIENT_PORT=${NATS_CLIENT_PORT}
NATS_ROUTE_PORT=${NATS_ROUTE_PORT}
ROUTES=${routes}
CLUSTER_PASS_FILE=${REMOTE_DIR}/secrets/cluster.pass
TLS_CERT=${REMOTE_DIR}/tls/server-${name}.crt
TLS_KEY=${REMOTE_DIR}/tls/server-${name}.key
ROUTE_TLS_CERT=${REMOTE_DIR}/tls/route-${name}.crt
ROUTE_TLS_KEY=${REMOTE_DIR}/tls/route-${name}.key
ROUTE_TLS_CA=${REMOTE_DIR}/tls/cluster-ca.crt
EOF

  run ssh "$target" "mkdir -p ${REMOTE_DIR}/tls ${REMOTE_DIR}/secrets"
  run rsync -az build/membershipd "${target}:${REMOTE_DIR}/membershipd"
  run rsync -az "${nodedir}/" "${target}:${REMOTE_DIR}/tls/"
  # --chmod makes the secret land as 0600 right away: if a later step fails,
  # set -e aborts before the chmod below and the file must not stay readable.
  run rsync -az --chmod=F600 "$SECRET_FILE" "${target}:${REMOTE_DIR}/secrets/cluster.pass"
  run rsync -az "$envfile" "${target}:${REMOTE_DIR}/cluster.env"
  run rsync -az membershipd-cluster.service "${target}:/etc/systemd/system/membershipd-cluster.service"
  # The *.key glob is expanded by the remote shell, not locally.
  run ssh "$target" "chmod 600 ${REMOTE_DIR}/secrets/cluster.pass ${REMOTE_DIR}/tls/*.key && systemctl daemon-reload"
done

# Step 3: summary, plus the manual start-up steps the human runs next.
echo "==> [3/3] staged."
if [[ $APPLY -eq 0 ]]; then
  echo " DRY-RUN: nothing was sent. Re-run with --yes to apply."
fi

# Quoted delimiter: the text below is printed literally, with no expansion.
cat <<'NEXT'

HUMAN — staggered start (do NOT enable all at once; see README "Bring up"):
1. Seed node first (e.g. magnus):
ssh root@magnus 'systemctl enable --now membershipd-cluster'
ssh root@magnus '/opt/unibus/membershipd user add --admin ...' # seed admin
2. Then the other two, one at a time, checking quorum after each:
ssh root@homer 'systemctl enable --now membershipd-cluster'
ssh root@datardos 'systemctl enable --now membershipd-cluster'
3. Verify posture + quorum (README "Verify").
4. Scale replicas 1 -> 3 once all three are up (README "Scale to R3").
NEXT