fix: dedupe cluster-up.sh polling via poll_until_healthy helper (#824)
CI duplicate-detection flagged the in-line vault + nomad polling loops
in cluster-up.sh as matching a 5-line window in vault-init.sh (the
`ready=1 / break / fi / sleep 1 / done` boilerplate).
Extracts the repeated pattern into three helpers at the top of the
file:
- nomad_has_ready_node wrapper so poll_until_healthy can take a
bare command name.
- _die_with_service_status shared "log + dump systemctl status +
die" path (factored out of the two
callsites + the timeout branch).
- poll_until_healthy ticks once per second up to TIMEOUT,
fails fast on systemd "failed" state,
and returns 0 on first successful check.
Step 7 (vault unseal) and Step 8 (nomad ready node) each collapse from
~15 lines of explicit for-loop bookkeeping to a one-line call. No
behavioural change: same tick cadence, same fail-fast, same status
dump on timeout. Local detect-duplicates.py run against main confirms
no new duplicates introduced.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d2c6b33271
commit
481175e043
1 changed files with 42 additions and 41 deletions
|
|
@ -206,6 +206,43 @@ nomad_ready_count() {
|
||||||
|| printf '0'
|
|| printf '0'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# nomad_has_ready_node — succeed when at least one nomad node is ready.
# Exists only so poll_until_healthy can be handed a bare command name.
nomad_has_ready_node() {
  local count
  count=$(nomad_ready_count)
  [ "$count" -ge 1 ]
}
|
||||||
|
|
||||||
|
# _die_with_service_status SVC REASON
# Report the failure through log, dump `systemctl status SVC` to stderr
# (best-effort — a dump failure must not mask the real error), then die
# with the same REASON. Shared by both failure paths in poll_until_healthy.
_die_with_service_status() {
  local unit="$1"
  local why="$2"
  log "${unit}.service ${why} — systemctl status follows:"
  if ! systemctl --no-pager --full status "$unit" >&2; then
    : # status dump is advisory only
  fi
  die "${unit}.service ${why}"
}
|
||||||
|
|
||||||
|
# poll_until_healthy SVC CHECK_CMD TIMEOUT
# Tick once per second for at most TIMEOUT seconds, invoking CHECK_CMD as a
# bare command name (no arguments) each tick. Returns 0 on the first tick
# where CHECK_CMD succeeds. Fails fast via _die_with_service_status if SVC
# is in systemd "failed" state, and dies with a status dump when TIMEOUT
# elapses without a successful check. Replaces the per-service in-line
# ready=1/break/sleep loops that duplicated the pattern in vault-init.sh.
poll_until_healthy() {
  local unit="$1"
  local probe="$2"
  local limit="$3"
  local elapsed=0
  while [ "$elapsed" -lt "$limit" ]; do
    # Fail fast: no point waiting out the timeout on a unit systemd
    # has already given up on.
    if systemctl is-failed --quiet "$unit"; then
      _die_with_service_status "$unit" "entered failed state during startup"
    fi
    if "$probe"; then
      log "${unit} healthy after ${elapsed}s"
      return 0
    fi
    elapsed=$((elapsed + 1))
    sleep 1
  done
  _die_with_service_status "$unit" "not healthy within ${limit}s"
}
|
||||||
|
|
||||||
# ── Step 1/9: install.sh (nomad + vault binaries) ────────────────────────────
|
# ── Step 1/9: install.sh (nomad + vault binaries) ────────────────────────────
|
||||||
log "── Step 1/9: install nomad + vault binaries ──"
|
log "── Step 1/9: install nomad + vault binaries ──"
|
||||||
"$INSTALL_SH"
|
"$INSTALL_SH"
|
||||||
|
|
@ -250,58 +287,22 @@ log "── Step 6/9: vault-init (no-op after first run) ──"
|
||||||
|
|
||||||
# ── Step 7/9: systemctl start vault + poll until unsealed ────────────────────
|
# ── Step 7/9: systemctl start vault + poll until unsealed ────────────────────
|
||||||
log "── Step 7/9: start vault + poll until unsealed ──"
|
log "── Step 7/9: start vault + poll until unsealed ──"
|
||||||
|
# Fast-path when vault.service is already active and Vault reports
|
||||||
|
# initialized=true,sealed=false — re-runs are a no-op.
|
||||||
if systemctl is-active --quiet vault && vault_is_unsealed; then
|
if systemctl is-active --quiet vault && vault_is_unsealed; then
|
||||||
log "vault already active + unsealed — skip start"
|
log "vault already active + unsealed — skip start"
|
||||||
else
|
else
|
||||||
systemctl start vault
|
systemctl start vault
|
||||||
ready=0
|
poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS"
|
||||||
for i in $(seq 1 "$VAULT_POLL_SECS"); do
|
|
||||||
# Fail fast if systemd has already marked the unit as failed — usually
|
|
||||||
# ExecStartPost tripping because unseal.key is absent / corrupted.
|
|
||||||
if systemctl is-failed --quiet vault; then
|
|
||||||
log "vault.service entered failed state — systemctl status follows:"
|
|
||||||
systemctl --no-pager --full status vault >&2 || true
|
|
||||||
die "vault.service failed to start"
|
|
||||||
fi
|
|
||||||
if vault_is_unsealed; then
|
|
||||||
log "vault unsealed after ${i}s"
|
|
||||||
ready=1
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
if [ "$ready" -ne 1 ]; then
|
|
||||||
log "vault did not unseal within ${VAULT_POLL_SECS}s — status follows:"
|
|
||||||
systemctl --no-pager --full status vault >&2 || true
|
|
||||||
die "vault failed to become unsealed"
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ───────────────
|
# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ───────────────
|
||||||
log "── Step 8/9: start nomad + poll until ≥1 node ready ──"
|
log "── Step 8/9: start nomad + poll until ≥1 node ready ──"
|
||||||
if systemctl is-active --quiet nomad && [ "$(nomad_ready_count)" -ge 1 ]; then
|
if systemctl is-active --quiet nomad && nomad_has_ready_node; then
|
||||||
log "nomad already active + ≥1 node ready — skip start"
|
log "nomad already active + ≥1 node ready — skip start"
|
||||||
else
|
else
|
||||||
systemctl start nomad
|
systemctl start nomad
|
||||||
ready=0
|
poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS"
|
||||||
for i in $(seq 1 "$NOMAD_POLL_SECS"); do
|
|
||||||
if systemctl is-failed --quiet nomad; then
|
|
||||||
log "nomad.service entered failed state — systemctl status follows:"
|
|
||||||
systemctl --no-pager --full status nomad >&2 || true
|
|
||||||
die "nomad.service failed to start"
|
|
||||||
fi
|
|
||||||
if [ "$(nomad_ready_count)" -ge 1 ]; then
|
|
||||||
log "nomad has ready node after ${i}s"
|
|
||||||
ready=1
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
if [ "$ready" -ne 1 ]; then
|
|
||||||
log "nomad had no ready nodes within ${NOMAD_POLL_SECS}s — status follows:"
|
|
||||||
systemctl --no-pager --full status nomad >&2 || true
|
|
||||||
die "nomad failed to reach ≥1 ready node"
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ── Step 9/9: /etc/profile.d/disinto-nomad.sh ────────────────────────────────
|
# ── Step 9/9: /etc/profile.d/disinto-nomad.sh ────────────────────────────────
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue