unibus/deploy/cluster/deploy-cluster.sh

#!/usr/bin/env bash
#
# deploy-cluster.sh — cross-build membershipd and stage it onto the three cluster
# nodes (issue 0006g). DEFAULT IS DRY-RUN: it prints the plan and touches no
# remote host (it still writes the generated build/cluster-<name>.env files
# locally). Pass --yes to actually build, rsync and run remote commands. Steps
# that a HUMAN must run (or confirm) are marked "HUMAN:".
#
# Prerequisites (HUMAN, once):
#   1. Fill nodes.env (no <PLACEHOLDER> left).
#   2. ./generate-cluster-certs.sh   (mints out/<name>/ TLS material)
#   3. Create the route secret locally:  mkdir -p secrets && openssl rand -hex 32 > secrets/cluster.pass
#      (secrets/ is gitignored; it is rsynced to each node as cluster.pass)
#   4. SSH access to every node's SSH_HOST with sudo-less root (SSH_USER=root).
#
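# What it does per node (with --yes):
#   - create REMOTE_DIR/tls and REMOTE_DIR/secrets over ssh.
#   - rsync the membershipd binary, the node's TLS material, the generated
#     cluster.env and the route secret into REMOTE_DIR, and the systemd unit
#     into /etc/systemd/system.
#   - chmod 600 the route secret and the TLS keys, then systemctl daemon-reload.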
# Start is STAGGERED and left to the human (see README): start the seed node,
# seed the admin, then start the rest.
set -euo pipefail

# Work from the script's own directory so the relative paths used below
# (nodes.env, secrets/, build/, out/) resolve the same way from any cwd.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$DIR"

if [[ ! -f nodes.env ]]; then
  echo "ERROR: nodes.env not found in $DIR." >&2
  exit 2
fi

# Reject a still-templated nodes.env BEFORE sourcing it: an unquoted
# <PLACEHOLDER> would be parsed by bash as redirections and abort the source
# with a syntax error instead of this message. -E keeps the pattern portable
# ('\+' in a basic regex is a GNU extension).
if grep -Eq '<[A-Z_]+>' nodes.env; then
  echo "ERROR: nodes.env still has <PLACEHOLDER> values — fill them in first." >&2
  exit 2
fi

# shellcheck source=/dev/null
source ./nodes.env

# Dry-run unless the first argument is exactly --yes.
APPLY=0
if [[ "${1:-}" == "--yes" ]]; then
  APPLY=1
fi

# The route secret must exist locally and be non-empty; an empty file would be
# shipped to every node as an empty cluster password.
SECRET_FILE="secrets/cluster.pass"
if [[ ! -f "$SECRET_FILE" || ! -s "$SECRET_FILE" ]]; then
  echo "ERROR: $SECRET_FILE missing or empty. HUMAN: mkdir -p secrets && openssl rand -hex 32 > $SECRET_FILE" >&2
  exit 2
fi

run() {
  # Dry-run gate: always print the command line, but only execute it when
  # APPLY is 1 (--yes). In apply mode the return status is the command's own;
  # in dry-run mode it is 0.
  printf '  + %s\n' "$*"
  [[ $APPLY -eq 1 ]] || return 0
  "$@"
}

# Step 1: static linux/amd64 build of membershipd into build/. The command goes
# through run, so it is only printed unless --yes was given.
printf '%s\n' "==> [1/3] cross-build membershipd (linux/amd64, CGO disabled)"
build_cmd=(env CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o build/membershipd ../../cmd/membershipd)
run "${build_cmd[@]}"

# routes_for NODE — print the comma-separated NATS route list for NODE on
# stdout: one nats://ADDR:NATS_ROUTE_PORT entry for every OTHER row of
# CLUSTER_NODES. ADDR is the row's third field when ROUTE_NETWORK is "public",
# otherwise its fourth field. The URLs carry NO userinfo (the secret is injected
# by membershipd from the file). Prints an empty line when there are no peers.
routes_for() {
  local self="$1"
  local entry node _ssh pub_addr wg_addr chosen
  local list="" sep=""
  for entry in "${CLUSTER_NODES[@]}"; do
    read -r node _ssh pub_addr wg_addr <<<"$entry"
    if [[ "$node" == "$self" ]]; then
      continue
    fi
    case "$ROUTE_NETWORK" in
      public) chosen="$pub_addr" ;;
      *) chosen="$wg_addr" ;;
    esac
    list+="${sep}nats://${chosen}:${NATS_ROUTE_PORT}"
    sep=","
  done
  printf '%s\n' "$list"
}

# [2/3] Stage every node listed in CLUSTER_NODES. Each row is split on
# whitespace into: name, ssh host, and the two route addresses (only name and
# ssh host are used here; routes_for picks the address).

# Refuse an empty REMOTE_DIR: every remote path below is built as
# ${REMOTE_DIR}/..., so an empty value would stage straight into /.
: "${REMOTE_DIR:?REMOTE_DIR must be set to a non-empty path in nodes.env}"

echo "==> [2/3] stage each node (REMOTE_DIR=$REMOTE_DIR)"

# Preflight: check every node's TLS material exists BEFORE touching any host,
# so a missing out/<name> cannot leave the cluster half-staged under --yes.
for row in "${CLUSTER_NODES[@]}"; do
  read -r name _ssh _pub _wg <<<"$row"
  if [[ ! -d "out/${name}" ]]; then
    echo "ERROR: out/${name} missing — run ./generate-cluster-certs.sh first." >&2
    exit 2
  fi
done

# The generated env files are written locally even in dry-run, so the plan can
# be inspected before anything is sent.
mkdir -p build

for row in "${CLUSTER_NODES[@]}"; do
  read -r name ssh _pub _wg <<<"$row"
  target="${SSH_USER}@${ssh}"
  nodedir="out/${name}"
  routes="$(routes_for "$name")"

  echo "-- node ${name} (ssh ${ssh}) routes=${routes}"

  # Generate this node's cluster.env locally, then ship it. It holds only
  # paths, ports and names; the secret itself stays in its own file.
  envfile="build/cluster-${name}.env"
  cat > "$envfile" <<EOF
NODE_NAME=${name}
CLUSTER_NAME=${CLUSTER_NAME}
CLUSTER_USER=${CLUSTER_USER}
KV_REPLICAS=${KV_REPLICAS}
HTTP_PORT=${HTTP_PORT}
NATS_CLIENT_PORT=${NATS_CLIENT_PORT}
NATS_ROUTE_PORT=${NATS_ROUTE_PORT}
ROUTES=${routes}
CLUSTER_PASS_FILE=${REMOTE_DIR}/secrets/cluster.pass
TLS_CERT=${REMOTE_DIR}/tls/server-${name}.crt
TLS_KEY=${REMOTE_DIR}/tls/server-${name}.key
ROUTE_TLS_CERT=${REMOTE_DIR}/tls/route-${name}.crt
ROUTE_TLS_KEY=${REMOTE_DIR}/tls/route-${name}.key
ROUTE_TLS_CA=${REMOTE_DIR}/tls/cluster-ca.crt
EOF

  run ssh "$target" "mkdir -p ${REMOTE_DIR}/tls ${REMOTE_DIR}/secrets"
  run rsync -az build/membershipd "${target}:${REMOTE_DIR}/membershipd"
  run rsync -az "${nodedir}/" "${target}:${REMOTE_DIR}/tls/"
  # --chmod=F600 makes the secret land with mode 600 instead of inheriting the
  # local file's mode and only being tightened by the chmod further down.
  run rsync -az --chmod=F600 "$SECRET_FILE" "${target}:${REMOTE_DIR}/secrets/cluster.pass"
  run rsync -az "$envfile" "${target}:${REMOTE_DIR}/cluster.env"
  run rsync -az membershipd-cluster.service "${target}:/etc/systemd/system/membershipd-cluster.service"
  # Lock down the secret and private keys, then let systemd pick up the unit.
  run ssh "$target" "chmod 600 ${REMOTE_DIR}/secrets/cluster.pass ${REMOTE_DIR}/tls/*.key && systemctl daemon-reload"
done

# [3/3] Wrap-up: report completion, remind the operator when this was only a
# dry-run, then print the manual staggered-start checklist verbatim.
printf '%s\n' "==> [3/3] staged."
if (( APPLY == 0 )); then
  printf '%s\n' "    DRY-RUN: nothing was sent. Re-run with --yes to apply."
fi
cat <<'STAGGERED_START'

HUMAN — staggered start (do NOT enable all at once; see README "Bring up"):
  1. Seed node first (e.g. magnus):
       ssh root@magnus 'systemctl enable --now membershipd-cluster'
       ssh root@magnus '/opt/unibus/membershipd user add --admin ...'   # seed admin
  2. Then the other two, one at a time, checking quorum after each:
       ssh root@homer    'systemctl enable --now membershipd-cluster'
       ssh root@datardos 'systemctl enable --now membershipd-cluster'
  3. Verify posture + quorum (README "Verify").
  4. Scale replicas 1 -> 3 once all three are up (README "Scale to R3").
STAGGERED_START