From d2c6b332717952ce284ca5764d3921db51b43094 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 16 Apr 2026 07:21:56 +0000
Subject: [PATCH 1/2] =?UTF-8?q?fix:=20[nomad-step-0]=20S0.4=20=E2=80=94=20?=
 =?UTF-8?q?disinto=20init=20--backend=3Dnomad=20--empty=20orchestrator=20(?=
 =?UTF-8?q?cluster-up)=20(#824)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires S0.1–S0.3 into a single idempotent bring-up script and replaces
the S0.1 stub in _disinto_init_nomad so `disinto init --backend=nomad
--empty` produces a running empty single-node cluster on a fresh box.

lib/init/nomad/cluster-up.sh (new):
  1. install.sh                (nomad + vault binaries)
  2. systemd-nomad.sh          (unit + enable, not started)
  3. systemd-vault.sh          (unit + vault.hcl + enable)
  4. host-volume dirs under /srv/disinto/* (matching nomad/client.hcl)
  5. /etc/nomad.d/{server,client}.hcl (content-compare before write)
  6. vault-init.sh             (first-run init + unseal + persist keys)
  7. systemctl start vault     (poll until unsealed; fail-fast on
                                is-failed)
  8. systemctl start nomad     (poll until ≥1 node ready)
  9. /etc/profile.d/disinto-nomad.sh (VAULT_ADDR + NOMAD_ADDR for
                                      interactive shells)
  Re-running on a healthy box is a no-op — each sub-step is itself
  idempotent and steps 7/8 fast-path when already active + healthy.
  `--dry-run` prints the full step list and exits 0.

bin/disinto:
  - _disinto_init_nomad: replaces the S0.1 stub. Invokes cluster-up.sh
    directly (as root) or via `sudo -n` otherwise. Both `--empty` and
    the default (no flag) call cluster-up.sh today; Step 1 will branch
    on $empty to gate job deployment. --dry-run forwards through.
  - disinto_init: adds `--empty` flag parsing; rejects `--empty`
    combined with `--backend=docker` explicitly instead of silently
    ignoring it.
  - usage: documents `--empty` and drops the "stub, S0.1" annotation
    from --backend.

Closes #824.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 bin/disinto                  |  84 +++++++--
 lib/init/nomad/cluster-up.sh | 337 +++++++++++++++++++++++++++++++++++
 2 files changed, 406 insertions(+), 15 deletions(-)
 create mode 100755 lib/init/nomad/cluster-up.sh
diff --git a/bin/disinto b/bin/disinto
index 00404e6..75d7bab 100755
--- a/bin/disinto
+++ b/bin/disinto
@@ -81,7 +81,8 @@ Init options:
   --repo-root <path>   Local clone path (default: ~/name)
   --ci-id <n>          Woodpecker CI repo ID (default: 0 = no CI)
   --forge-url <url>    Forge base URL (default: http://localhost:3000)
-  --backend <value>    Orchestration backend: docker (default) | nomad (stub, S0.1)
+  --backend <value>    Orchestration backend: docker (default) | nomad
+  --empty              (nomad) Bring up cluster only, no jobs (S0.4)
   --bare               Skip compose generation (bare-metal setup)
   --build              Use local docker build instead of registry images (dev mode)
   --yes                Skip confirmation prompts
@@ -645,17 +646,61 @@ prompt_admin_password() {
 
 # ── init command ─────────────────────────────────────────────────────────────
 
-# Nomad backend init — stub for the Nomad+Vault migration (issue #821, S0.1).
-# Real implementation lands across S0.2–S0.5. Exists so --backend=nomad fails
-# loud instead of silently routing through the docker path.
+# Nomad backend init — dispatcher (Nomad+Vault migration, S0.4, issue #824).
+#
+# Today `--empty` and the default (no flag) both bring up an empty
+# single-node Nomad+Vault cluster via lib/init/nomad/cluster-up.sh. Step 1
+# will extend the default path to also deploy jobs; `--empty` will remain
+# the "cluster only, no workloads" escape hatch.
+#
+# Uses `sudo -n` when not already root — cluster-up.sh mutates /etc/,
+# /srv/, and systemd state, so it has to run as root. The `-n` keeps the
+# failure mode legible (no hanging TTY-prompted sudo inside a factory
+# init run); operators running without sudo-NOPASSWD should invoke
+# `sudo disinto init ...` directly.
 _disinto_init_nomad() {
-  local dry_run="${1:-false}"
-  if [ "$dry_run" = "true" ]; then
-    echo "nomad backend: stub — will be implemented by S0.2–S0.5"
-    exit 0
+  local dry_run="${1:-false}" empty="${2:-false}"
+  local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh"
+
+  if [ ! -x "$cluster_up" ]; then
+    echo "Error: ${cluster_up} not found or not executable" >&2
+    exit 1
   fi
-  echo "ERROR: nomad backend not yet implemented (stub)" >&2
-  exit 99
+
+  # --empty and default both invoke cluster-up today. Log the requested
+  # mode so the dispatch is visible in factory bootstrap logs — Step 1
+  # will branch on $empty to gate the job-deployment path.
+  if [ "$empty" = "true" ]; then
+    echo "nomad backend: --empty (cluster-up only, no jobs)"
+  else
+    echo "nomad backend: default (cluster-up; jobs deferred to Step 1)"
+  fi
+
+  # Dry-run forwards straight through and needs no sudo: cluster-up.sh prints
+  # its step list and exits 0 before its root check, without touching the box.
+  local -a cmd=("$cluster_up")
+  if [ "$dry_run" = "true" ]; then
+    cmd+=("--dry-run")
+    "${cmd[@]}"
+    exit $?
+  fi
+
+  # Real run — needs root. Invoke via sudo if we're not already root so
+  # the command's exit code propagates directly. We don't distinguish
+  # "sudo denied" from "cluster-up.sh failed" here; both surface as a
+  # non-zero exit, and cluster-up.sh's own error messages cover the
+  # latter case.
+  local rc=0
+  if [ "$(id -u)" -eq 0 ]; then
+    "${cmd[@]}" || rc=$?
+  else
+    if ! command -v sudo >/dev/null 2>&1; then
+      echo "Error: cluster-up.sh must run as root and sudo is not installed" >&2
+      exit 1
+    fi
+    sudo -n -- "${cmd[@]}" || rc=$?
+  fi
+  exit "$rc"
 }
 
 disinto_init() {
@@ -668,7 +713,7 @@ disinto_init() {
   shift
 
   # Parse flags
-  local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker"
+  local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false
   while [ $# -gt 0 ]; do
     case "$1" in
       --branch)        branch="$2"; shift 2 ;;
@@ -679,6 +724,7 @@ disinto_init() {
       --backend=*)     backend="${1#--backend=}"; shift ;;
       --bare)          bare=true; shift ;;
       --build)         use_build=true; shift ;;
+      --empty)         empty=true; shift ;;
       --yes)           auto_yes=true; shift ;;
       --rotate-tokens) rotate_tokens=true; shift ;;
       --dry-run)       dry_run=true; shift ;;
@@ -692,11 +738,19 @@ disinto_init() {
     *) echo "Error: invalid --backend value '${backend}' (expected: docker|nomad)" >&2; exit 1 ;;
   esac
 
-  # Dispatch on backend — nomad path is a stub for now (issue #821, S0.1).
-  # Subsequent S0.x issues will replace _disinto_init_nomad with real logic
-  # without touching flag parsing or this dispatch.
+  # --empty is nomad-only today (the docker path has no concept of an
+  # "empty cluster"). Reject explicitly rather than letting it silently
+  # do nothing on --backend=docker.
+  if [ "$empty" = true ] && [ "$backend" != "nomad" ]; then
+    echo "Error: --empty is only valid with --backend=nomad" >&2
+    exit 1
+  fi
+
+  # Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh
+  # (S0.4). The default and --empty variants are identical today; Step 1
+  # will branch on $empty to add job deployment to the default path.
   if [ "$backend" = "nomad" ]; then
-    _disinto_init_nomad "$dry_run"
+    _disinto_init_nomad "$dry_run" "$empty"
     # shellcheck disable=SC2317  # _disinto_init_nomad always exits today;
     # `return` is defensive against future refactors.
     return
diff --git a/lib/init/nomad/cluster-up.sh b/lib/init/nomad/cluster-up.sh
new file mode 100755
index 0000000..a1b02ff
--- /dev/null
+++ b/lib/init/nomad/cluster-up.sh
@@ -0,0 +1,337 @@
+#!/usr/bin/env bash
+# =============================================================================
+# lib/init/nomad/cluster-up.sh — Empty Nomad+Vault cluster orchestrator (S0.4)
+#
+# Wires together the S0.1–S0.3 building blocks into one idempotent
+# "bring up a single-node Nomad+Vault cluster" script:
+#
+#   1. install.sh                  (nomad + vault binaries)
+#   2. systemd-nomad.sh            (nomad.service — unit + enable, not started)
+#   3. systemd-vault.sh            (vault.service — unit + vault.hcl + enable)
+#   4. Host-volume dirs            (/srv/disinto/* matching nomad/client.hcl)
+#   5. /etc/nomad.d/*.hcl          (server.hcl + client.hcl from repo)
+#   6. vault-init.sh               (first-run init + unseal + persist keys)
+#   7. systemctl start vault       (auto-unseal via ExecStartPost; poll)
+#   8. systemctl start nomad       (poll until ≥1 ready node)
+#   9. /etc/profile.d/disinto-nomad.sh  (VAULT_ADDR + NOMAD_ADDR for shells)
+#
+# This is the "empty cluster" orchestrator — no jobs deployed. Subsequent
+# Step-1 issues layer job deployment on top of this checkpoint.
+#
+# Idempotency contract:
+#   Running twice back-to-back on a healthy box is a no-op. Each sub-step
+#   is itself idempotent — see install.sh / systemd-*.sh / vault-init.sh
+#   headers for the per-step contract. Fast-paths in steps 7 and 8 skip
+#   the systemctl start when the service is already active + healthy.
+#
+# Usage:
+#   sudo lib/init/nomad/cluster-up.sh            # bring cluster up
+#   sudo lib/init/nomad/cluster-up.sh --dry-run  # print step list, exit 0
+#
+# Environment (override polling for slow boxes):
+#   VAULT_POLL_SECS  max seconds to wait for vault to unseal (default: 30)
+#   NOMAD_POLL_SECS  max seconds to wait for nomad node=ready (default: 60)
+#
+# Exit codes:
+#   0  success (cluster up, or already up)
+#   1  precondition or step failure
+# =============================================================================
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+
+# Sub-scripts (siblings in this directory).
+INSTALL_SH="${SCRIPT_DIR}/install.sh"
+SYSTEMD_NOMAD_SH="${SCRIPT_DIR}/systemd-nomad.sh"
+SYSTEMD_VAULT_SH="${SCRIPT_DIR}/systemd-vault.sh"
+VAULT_INIT_SH="${SCRIPT_DIR}/vault-init.sh"
+
+# In-repo Nomad configs copied to /etc/nomad.d/.
+NOMAD_CONFIG_DIR="/etc/nomad.d"
+NOMAD_SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl"
+NOMAD_CLIENT_HCL_SRC="${REPO_ROOT}/nomad/client.hcl"
+
+# /etc/profile.d entry — makes VAULT_ADDR + NOMAD_ADDR available to
+# interactive shells without requiring the operator to source anything.
+PROFILE_D_FILE="/etc/profile.d/disinto-nomad.sh"
+
+# Host-volume paths — MUST match the `host_volume "..."` declarations
+# in nomad/client.hcl. Adding a host_volume block there requires adding
+# its path here so the dir exists before nomad starts (otherwise client
+# fingerprinting fails and the node stays in "initializing").
+HOST_VOLUME_DIRS=(
+  "/srv/disinto/forgejo-data"
+  "/srv/disinto/woodpecker-data"
+  "/srv/disinto/agent-data"
+  "/srv/disinto/project-repos"
+  "/srv/disinto/caddy-data"
+  "/srv/disinto/chat-history"
+  "/srv/disinto/ops-repo"
+)
+
+# Default API addresses — matches the listener bindings in
+# nomad/server.hcl and nomad/vault.hcl. If either file ever moves
+# off 127.0.0.1 / default port, update both places together.
+VAULT_ADDR_DEFAULT="http://127.0.0.1:8200"
+NOMAD_ADDR_DEFAULT="http://127.0.0.1:4646"
+
+VAULT_POLL_SECS="${VAULT_POLL_SECS:-30}"
+NOMAD_POLL_SECS="${NOMAD_POLL_SECS:-60}"
+
+log() { printf '[cluster-up] %s\n' "$*"; }
+die() { printf '[cluster-up] ERROR: %s\n' "$*" >&2; exit 1; }
+
+# ── Flag parsing ─────────────────────────────────────────────────────────────
+dry_run=false
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --dry-run) dry_run=true; shift ;;
+    -h|--help)
+      cat <<EOF
+Usage: sudo $(basename "$0") [--dry-run]
+
+Brings up an empty single-node Nomad+Vault cluster (idempotent).
+
+  --dry-run   Print the step list without performing any action.
+EOF
+      exit 0
+      ;;
+    *) die "unknown flag: $1" ;;
+  esac
+done
+
+# ── Dry-run: print step list + exit ──────────────────────────────────────────
+if [ "$dry_run" = true ]; then
+  cat <<EOF
+[dry-run] Step 1/9: install nomad + vault binaries
+  → sudo ${INSTALL_SH}
+
+[dry-run] Step 2/9: write + enable nomad.service (NOT started)
+  → sudo ${SYSTEMD_NOMAD_SH}
+
+[dry-run] Step 3/9: write + enable vault.service + vault.hcl (NOT started)
+  → sudo ${SYSTEMD_VAULT_SH}
+
+[dry-run] Step 4/9: create host-volume dirs under /srv/disinto/
+EOF
+  for d in "${HOST_VOLUME_DIRS[@]}"; do
+    printf '  → install -d -m 0755 %s\n' "$d"
+  done
+  cat <<EOF
+
+[dry-run] Step 5/9: install /etc/nomad.d/server.hcl + client.hcl from repo
+  → ${NOMAD_SERVER_HCL_SRC} → ${NOMAD_CONFIG_DIR}/server.hcl
+  → ${NOMAD_CLIENT_HCL_SRC} → ${NOMAD_CONFIG_DIR}/client.hcl
+
+[dry-run] Step 6/9: first-run vault init + persist unseal.key + root.token
+  → sudo ${VAULT_INIT_SH}
+
+[dry-run] Step 7/9: systemctl start vault + poll until unsealed (≤${VAULT_POLL_SECS}s)
+
+[dry-run] Step 8/9: systemctl start nomad + poll until ≥1 node ready (≤${NOMAD_POLL_SECS}s)
+
+[dry-run] Step 9/9: write ${PROFILE_D_FILE}
+  → export VAULT_ADDR=${VAULT_ADDR_DEFAULT}
+  → export NOMAD_ADDR=${NOMAD_ADDR_DEFAULT}
+
+Dry run complete — no changes made.
+EOF
+  exit 0
+fi
+
+# ── Preconditions ────────────────────────────────────────────────────────────
+if [ "$(id -u)" -ne 0 ]; then
+  die "must run as root (spawns install/systemd/vault-init sub-scripts)"
+fi
+
+command -v systemctl >/dev/null 2>&1 \
+  || die "systemctl not found (systemd required)"
+
+for f in "$INSTALL_SH" "$SYSTEMD_NOMAD_SH" "$SYSTEMD_VAULT_SH" "$VAULT_INIT_SH"; do
+  [ -x "$f" ] || die "sub-script missing or non-executable: ${f}"
+done
+
+[ -f "$NOMAD_SERVER_HCL_SRC" ] \
+  || die "source config not found: ${NOMAD_SERVER_HCL_SRC}"
+[ -f "$NOMAD_CLIENT_HCL_SRC" ] \
+  || die "source config not found: ${NOMAD_CLIENT_HCL_SRC}"
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+
+# install_file_if_differs SRC DST MODE
+#   Install SRC at DST (root:root, MODE) only when DST is missing or differs
+#   byte-for-byte; otherwise log "unchanged" and leave DST (and its mtime) alone.
+install_file_if_differs() {
+  local from="$1" to="$2" perms="$3"
+  if [ ! -f "$to" ] || ! cmp -s "$from" "$to"; then
+    log "writing: ${to}"
+    install -m "$perms" -o root -g root "$from" "$to"
+    return
+  fi
+  log "unchanged: ${to}"
+}
+
+# vault_status_json — echo `vault status -format=json`, or '' on unreachable.
+#   vault status exit codes: 0 = unsealed, 2 = sealed/uninit, 1 = unreachable.
+#   We treat all of 0/2 as "reachable with state"; 1 yields empty output.
+#   Wrapped in `|| true` so set -e doesn't abort on exit 2 (the expected
+#   sealed-state case during first-boot polling).
+vault_status_json() {
+  VAULT_ADDR="$VAULT_ADDR_DEFAULT" vault status -format=json 2>/dev/null || true
+}
+
+# vault_is_unsealed — true iff vault reachable AND initialized AND unsealed.
+vault_is_unsealed() {
+  local out
+  out="$(vault_status_json)"
+  [ -n "$out" ] || return 1
+  # Single jq pass (this runs once per poll tick): -e turns the boolean into
+  # the exit status; unparseable output or a missing jq reads as "not unsealed".
+  printf '%s' "$out" | jq -e '.initialized == true and .sealed == false' >/dev/null 2>&1
+}
+
+# nomad_ready_count — echo the number of ready nodes, or 0 on error.
+#   `nomad node status -json` returns a JSON array of nodes, each with a
+#   .Status field ("initializing" | "ready" | "down" | "disconnected").
+nomad_ready_count() {
+  local out
+  out="$(NOMAD_ADDR="$NOMAD_ADDR_DEFAULT" nomad node status -json 2>/dev/null || true)"
+  if [ -z "$out" ]; then
+    printf '0'
+    return 0
+  fi
+  printf '%s' "$out" \
+    | jq '[.[] | select(.Status == "ready")] | length' 2>/dev/null \
+    || printf '0'
+}
+
+# ── Step 1/9: install.sh (nomad + vault binaries) ────────────────────────────
+log "── Step 1/9: install nomad + vault binaries ──"
+"$INSTALL_SH"
+
+# ── Step 2/9: systemd-nomad.sh (unit + enable, not started) ──────────────────
+log "── Step 2/9: install nomad.service (enable, not start) ──"
+"$SYSTEMD_NOMAD_SH"
+
+# ── Step 3/9: systemd-vault.sh (unit + vault.hcl + enable) ───────────────────
+log "── Step 3/9: install vault.service + vault.hcl (enable, not start) ──"
+"$SYSTEMD_VAULT_SH"
+
+# ── Step 4/9: host-volume dirs matching nomad/client.hcl ─────────────────────
+log "── Step 4/9: host-volume dirs under /srv/disinto/ ──"
+# Parent /srv/disinto/ first (install -d handles missing parents, but being
+# explicit makes the log output read naturally as a top-down creation).
+install -d -m 0755 -o root -g root "/srv/disinto"
+for d in "${HOST_VOLUME_DIRS[@]}"; do
+  if [ -d "$d" ]; then
+    log "unchanged: ${d}"
+  else
+    log "creating: ${d}"
+    install -d -m 0755 -o root -g root "$d"
+  fi
+done
+
+# ── Step 5/9: /etc/nomad.d/server.hcl + client.hcl ───────────────────────────
+log "── Step 5/9: install /etc/nomad.d/{server,client}.hcl ──"
+# systemd-nomad.sh already created /etc/nomad.d/. Re-assert for clarity +
+# in case someone runs cluster-up.sh with an exotic step ordering later.
+install -d -m 0755 -o root -g root "$NOMAD_CONFIG_DIR"
+install_file_if_differs "$NOMAD_SERVER_HCL_SRC" "${NOMAD_CONFIG_DIR}/server.hcl" 0644
+install_file_if_differs "$NOMAD_CLIENT_HCL_SRC" "${NOMAD_CONFIG_DIR}/client.hcl" 0644
+
+# ── Step 6/9: vault-init (first-run init + unseal + persist keys) ────────────
+log "── Step 6/9: vault-init (no-op after first run) ──"
+# vault-init.sh spawns a temporary vault server if systemd isn't managing
+# one, runs `operator init`, writes unseal.key + root.token, unseals once,
+# then stops the temp server (EXIT trap). After it returns, port 8200 is
+# free for systemctl-managed vault to take in step 7.
+"$VAULT_INIT_SH"
+
+# ── Step 7/9: systemctl start vault + poll until unsealed ────────────────────
+log "── Step 7/9: start vault + poll until unsealed ──"
+if systemctl is-active --quiet vault && vault_is_unsealed; then
+  log "vault already active + unsealed — skip start"
+else
+  systemctl start vault
+  ready=0
+  for i in $(seq 1 "$VAULT_POLL_SECS"); do
+    # Fail fast if systemd has already marked the unit as failed — usually
+    # ExecStartPost tripping because unseal.key is absent / corrupted.
+    if systemctl is-failed --quiet vault; then
+      log "vault.service entered failed state — systemctl status follows:"
+      systemctl --no-pager --full status vault >&2 || true
+      die "vault.service failed to start"
+    fi
+    if vault_is_unsealed; then
+      log "vault unsealed after ${i}s"
+      ready=1
+      break
+    fi
+    sleep 1
+  done
+  if [ "$ready" -ne 1 ]; then
+    log "vault did not unseal within ${VAULT_POLL_SECS}s — status follows:"
+    systemctl --no-pager --full status vault >&2 || true
+    die "vault failed to become unsealed"
+  fi
+fi
+
+# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ───────────────
+log "── Step 8/9: start nomad + poll until ≥1 node ready ──"
+if systemctl is-active --quiet nomad && [ "$(nomad_ready_count)" -ge 1 ]; then
+  log "nomad already active + ≥1 node ready — skip start"
+else
+  systemctl start nomad
+  ready=0
+  for i in $(seq 1 "$NOMAD_POLL_SECS"); do
+    if systemctl is-failed --quiet nomad; then
+      log "nomad.service entered failed state — systemctl status follows:"
+      systemctl --no-pager --full status nomad >&2 || true
+      die "nomad.service failed to start"
+    fi
+    if [ "$(nomad_ready_count)" -ge 1 ]; then
+      log "nomad has ready node after ${i}s"
+      ready=1
+      break
+    fi
+    sleep 1
+  done
+  if [ "$ready" -ne 1 ]; then
+    log "nomad had no ready nodes within ${NOMAD_POLL_SECS}s — status follows:"
+    systemctl --no-pager --full status nomad >&2 || true
+    die "nomad failed to reach ≥1 ready node"
+  fi
+fi
+
+# ── Step 9/9: /etc/profile.d/disinto-nomad.sh ────────────────────────────────
+log "── Step 9/9: write ${PROFILE_D_FILE} ──"
+# Shell rc fragments in /etc/profile.d/ are sourced by /etc/profile for
+# every interactive login shell. Setting VAULT_ADDR + NOMAD_ADDR here means
+# the operator can run `vault status` / `nomad node status` straight after
+# `ssh factory-box` without fumbling env vars.
+desired_profile="# /etc/profile.d/disinto-nomad.sh — written by lib/init/nomad/cluster-up.sh
+# Interactive-shell defaults for Vault + Nomad clients on this box.
+export VAULT_ADDR=${VAULT_ADDR_DEFAULT}
+export NOMAD_ADDR=${NOMAD_ADDR_DEFAULT}
+"
+if [ -f "$PROFILE_D_FILE" ] \
+   && printf '%s' "$desired_profile" | cmp -s - "$PROFILE_D_FILE"; then
+  log "unchanged: ${PROFILE_D_FILE}"
+else
+  log "writing: ${PROFILE_D_FILE}"
+  # Subshell + EXIT trap: guarantees the tempfile is cleaned up on both
+  # success AND set-e-induced failure of `install`. A function-scoped
+  # RETURN trap does NOT fire on errexit-abort in bash — the subshell is
+  # the reliable cleanup boundary here.
+  (
+    tmp="$(mktemp)"
+    trap 'rm -f "$tmp"' EXIT
+    printf '%s' "$desired_profile" > "$tmp"
+    install -m 0644 -o root -g root "$tmp" "$PROFILE_D_FILE"
+  )
+fi
+
+log "── done: empty nomad+vault cluster is up ──"
+log "   Vault:  ${VAULT_ADDR_DEFAULT}  (Sealed=false Initialized=true)"
+log "   Nomad:  ${NOMAD_ADDR_DEFAULT}  (≥1 node ready)"
-- 
2.49.1


From 481175e04386eadb24454cd4960524417d1b1bc2 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 16 Apr 2026 07:26:54 +0000
Subject: [PATCH 2/2] fix: dedupe cluster-up.sh polling via poll_until_healthy
 helper (#824)

CI duplicate-detection flagged the in-line vault + nomad polling loops
in cluster-up.sh as matching a 5-line window in vault-init.sh (the
`ready=1 / break / fi / sleep 1 / done` boilerplate).

Extracts the repeated pattern into three helpers in the Helpers
section of the file (directly after nomad_ready_count):

  - nomad_has_ready_node       wrapper so poll_until_healthy can take a
                               bare command name.
  - _die_with_service_status   shared "log + dump systemctl status +
                               die" path (factored out of the two
                               callsites + the timeout branch).
  - poll_until_healthy         ticks once per second up to TIMEOUT,
                               fail-fasts on systemd "failed" state,
                               and returns 0 on first successful check.

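Step 7 (vault unseal) and Step 8 (nomad ready node) each collapse from
~20 lines of explicit for-loop bookkeeping to a one-line call. Polling
behaviour is unchanged: same one-second tick, same fail-fast on systemd
"failed" state, same status dump on timeout. Only the log/die wording
changes — both services now share the generic "<svc> healthy after Ns"
and "<svc>.service not healthy within Ns" messages, and the reported
elapsed count starts at 0 instead of 1. Local detect-duplicates.py run
against main confirms no new duplicates introduced.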

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 lib/init/nomad/cluster-up.sh | 83 ++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/lib/init/nomad/cluster-up.sh b/lib/init/nomad/cluster-up.sh
index a1b02ff..7c802c6 100755
--- a/lib/init/nomad/cluster-up.sh
+++ b/lib/init/nomad/cluster-up.sh
@@ -206,6 +206,43 @@ nomad_ready_count() {
     || printf '0'
 }
 
+# nomad_has_ready_node — true iff nomad_ready_count ≥ 1. Wrapper exists
+# so poll_until_healthy can call it as a single-arg command name.
+nomad_has_ready_node() { [ "$(nomad_ready_count)" -ge 1 ]; }
+
+# _die_with_service_status SVC REASON
+#   Shared fatal path for service start-up problems: log a headline, write
+#   `systemctl status SVC` to stderr (best-effort), then die with REASON.
+_die_with_service_status() {
+  local unit="$1" headline="${1}.service ${2}"
+  log "${headline} — systemctl status follows:"
+  systemctl --no-pager --full status "$unit" >&2 || true
+  die "$headline"
+}
+
+# poll_until_healthy SVC CHECK_CMD TIMEOUT
+#   Tick once per second for up to TIMEOUT seconds, invoking CHECK_CMD as a
+#   command name (no arguments). Returns 0 on the first successful check.
+#   Fails fast via _die_with_service_status if SVC enters systemd "failed"
+#   state, and dies with a status dump if TIMEOUT elapses before CHECK_CMD
+#   succeeds. TIMEOUT must be a non-negative integer: anything else makes
+#   `[ -ge ]` exit 2, which `until` would read as "keep polling" forever.
+poll_until_healthy() {
+  local svc="$1" check="$2" timeout="$3" waited=0
+  case "$timeout" in '' | *[!0-9]*) die "invalid poll timeout: '${timeout}'" ;; esac
+  until [ "$waited" -ge "$timeout" ]; do
+    systemctl is-failed --quiet "$svc" \
+      && _die_with_service_status "$svc" "entered failed state during startup"
+    if "$check"; then
+      log "${svc} healthy after ${waited}s"
+      return 0
+    fi
+    waited=$((waited + 1))
+    sleep 1
+  done
+  _die_with_service_status "$svc" "not healthy within ${timeout}s"
+}
+
 # ── Step 1/9: install.sh (nomad + vault binaries) ────────────────────────────
 log "── Step 1/9: install nomad + vault binaries ──"
 "$INSTALL_SH"
@@ -250,58 +287,22 @@ log "── Step 6/9: vault-init (no-op after first run) ──"
 
 # ── Step 7/9: systemctl start vault + poll until unsealed ────────────────────
 log "── Step 7/9: start vault + poll until unsealed ──"
+# Fast-path when vault.service is already active and Vault reports
+# initialized=true,sealed=false — re-runs are a no-op.
 if systemctl is-active --quiet vault && vault_is_unsealed; then
   log "vault already active + unsealed — skip start"
 else
   systemctl start vault
-  ready=0
-  for i in $(seq 1 "$VAULT_POLL_SECS"); do
-    # Fail fast if systemd has already marked the unit as failed — usually
-    # ExecStartPost tripping because unseal.key is absent / corrupted.
-    if systemctl is-failed --quiet vault; then
-      log "vault.service entered failed state — systemctl status follows:"
-      systemctl --no-pager --full status vault >&2 || true
-      die "vault.service failed to start"
-    fi
-    if vault_is_unsealed; then
-      log "vault unsealed after ${i}s"
-      ready=1
-      break
-    fi
-    sleep 1
-  done
-  if [ "$ready" -ne 1 ]; then
-    log "vault did not unseal within ${VAULT_POLL_SECS}s — status follows:"
-    systemctl --no-pager --full status vault >&2 || true
-    die "vault failed to become unsealed"
-  fi
+  poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS"
 fi
 
 # ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ───────────────
 log "── Step 8/9: start nomad + poll until ≥1 node ready ──"
-if systemctl is-active --quiet nomad && [ "$(nomad_ready_count)" -ge 1 ]; then
+if systemctl is-active --quiet nomad && nomad_has_ready_node; then
   log "nomad already active + ≥1 node ready — skip start"
 else
   systemctl start nomad
-  ready=0
-  for i in $(seq 1 "$NOMAD_POLL_SECS"); do
-    if systemctl is-failed --quiet nomad; then
-      log "nomad.service entered failed state — systemctl status follows:"
-      systemctl --no-pager --full status nomad >&2 || true
-      die "nomad.service failed to start"
-    fi
-    if [ "$(nomad_ready_count)" -ge 1 ]; then
-      log "nomad has ready node after ${i}s"
-      ready=1
-      break
-    fi
-    sleep 1
-  done
-  if [ "$ready" -ne 1 ]; then
-    log "nomad had no ready nodes within ${NOMAD_POLL_SECS}s — status follows:"
-    systemctl --no-pager --full status nomad >&2 || true
-    die "nomad failed to reach ≥1 ready node"
-  fi
+  poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS"
 fi
 
 # ── Step 9/9: /etc/profile.d/disinto-nomad.sh ────────────────────────────────
-- 
2.49.1