fix: dedupe cluster-up.sh polling via poll_until_healthy helper (#824)
CI duplicate-detection flagged the in-line vault + nomad polling loops
in cluster-up.sh as matching a 5-line window in vault-init.sh (the
`ready=1 / break / fi / sleep 1 / done` boilerplate).
Extracts the repeated pattern into three helpers at the top of the
file:
- nomad_has_ready_node wrapper so poll_until_healthy can take a
bare command name.
- _die_with_service_status shared "log + dump systemctl status +
die" path (factored out of the two
callsites + the timeout branch).
- poll_until_healthy ticks once per second up to TIMEOUT,
fails fast on systemd "failed" state,
and returns 0 on first successful check.
Step 7 (vault unseal) and Step 8 (nomad ready node) each collapse from
~15 lines of explicit for-loop bookkeeping to a one-line call. No
behavioural change: same tick cadence, same fail-fast, same status
dump on timeout. Local detect-duplicates.py run against main confirms
no new duplicates introduced.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d2c6b33271
commit
481175e043
1 changed files with 42 additions and 41 deletions
|
|
@ -206,6 +206,43 @@ nomad_ready_count() {
|
||||||
|| printf '0'
|
|| printf '0'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# nomad_has_ready_node — succeed when at least one nomad node is ready.
# Exists only so poll_until_healthy can be handed a bare command name.
nomad_has_ready_node() {
  local count
  count=$(nomad_ready_count)
  [ "$count" -ge 1 ]
}
|
||||||
|
|
||||||
|
# _die_with_service_status SVC REASON
# Report the failure through log, dump `systemctl status SVC` to stderr
# (best-effort — a dump failure must not mask the real error), then die
# with the same REASON. Shared by both failure paths in poll_until_healthy.
_die_with_service_status() {
  local unit="$1"
  local why="$2"
  log "${unit}.service ${why} — systemctl status follows:"
  if ! systemctl --no-pager --full status "$unit" >&2; then
    : # status dump is advisory only
  fi
  die "${unit}.service ${why}"
}
|
||||||
|
|
||||||
|
# poll_until_healthy SVC CHECK_CMD TIMEOUT
# Tick once per second for at most TIMEOUT seconds, invoking CHECK_CMD as a
# bare command name (no arguments) each tick. Returns 0 on the first tick
# where CHECK_CMD succeeds. Fails fast via _die_with_service_status if SVC
# is in systemd "failed" state, and dies with a status dump when TIMEOUT
# elapses without a successful check. Replaces the per-service in-line
# ready=1/break/sleep loops that duplicated the pattern in vault-init.sh.
poll_until_healthy() {
  local unit="$1"
  local probe="$2"
  local limit="$3"
  local elapsed=0
  while [ "$elapsed" -lt "$limit" ]; do
    # Fail fast: no point waiting out the timeout on a unit systemd
    # has already given up on.
    if systemctl is-failed --quiet "$unit"; then
      _die_with_service_status "$unit" "entered failed state during startup"
    fi
    if "$probe"; then
      log "${unit} healthy after ${elapsed}s"
      return 0
    fi
    elapsed=$((elapsed + 1))
    sleep 1
  done
  _die_with_service_status "$unit" "not healthy within ${limit}s"
}
|
||||||
|
|
||||||
# ── Step 1/9: install.sh (nomad + vault binaries) ────────────────────────────
|
# ── Step 1/9: install.sh (nomad + vault binaries) ────────────────────────────
|
||||||
log "── Step 1/9: install nomad + vault binaries ──"
|
log "── Step 1/9: install nomad + vault binaries ──"
|
||||||
"$INSTALL_SH"
|
"$INSTALL_SH"
|
||||||
|
|
@ -250,58 +287,22 @@ log "── Step 6/9: vault-init (no-op after first run) ──"
|
||||||
|
|
||||||
# ── Step 7/9: systemctl start vault + poll until unsealed ────────────────────
|
# ── Step 7/9: systemctl start vault + poll until unsealed ────────────────────
|
||||||
log "── Step 7/9: start vault + poll until unsealed ──"
|
log "── Step 7/9: start vault + poll until unsealed ──"
|
||||||
|
# Fast-path when vault.service is already active and Vault reports
|
||||||
|
# initialized=true,sealed=false — re-runs are a no-op.
|
||||||
if systemctl is-active --quiet vault && vault_is_unsealed; then
|
if systemctl is-active --quiet vault && vault_is_unsealed; then
|
||||||
log "vault already active + unsealed — skip start"
|
log "vault already active + unsealed — skip start"
|
||||||
else
|
else
|
||||||
systemctl start vault
|
systemctl start vault
|
||||||
ready=0
|
poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS"
|
||||||
for i in $(seq 1 "$VAULT_POLL_SECS"); do
|
|
||||||
# Fail fast if systemd has already marked the unit as failed — usually
|
|
||||||
# ExecStartPost tripping because unseal.key is absent / corrupted.
|
|
||||||
if systemctl is-failed --quiet vault; then
|
|
||||||
log "vault.service entered failed state — systemctl status follows:"
|
|
||||||
systemctl --no-pager --full status vault >&2 || true
|
|
||||||
die "vault.service failed to start"
|
|
||||||
fi
|
|
||||||
if vault_is_unsealed; then
|
|
||||||
log "vault unsealed after ${i}s"
|
|
||||||
ready=1
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
if [ "$ready" -ne 1 ]; then
|
|
||||||
log "vault did not unseal within ${VAULT_POLL_SECS}s — status follows:"
|
|
||||||
systemctl --no-pager --full status vault >&2 || true
|
|
||||||
die "vault failed to become unsealed"
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ───────────────
|
# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ───────────────
|
||||||
log "── Step 8/9: start nomad + poll until ≥1 node ready ──"
|
log "── Step 8/9: start nomad + poll until ≥1 node ready ──"
|
||||||
if systemctl is-active --quiet nomad && [ "$(nomad_ready_count)" -ge 1 ]; then
|
if systemctl is-active --quiet nomad && nomad_has_ready_node; then
|
||||||
log "nomad already active + ≥1 node ready — skip start"
|
log "nomad already active + ≥1 node ready — skip start"
|
||||||
else
|
else
|
||||||
systemctl start nomad
|
systemctl start nomad
|
||||||
ready=0
|
poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS"
|
||||||
for i in $(seq 1 "$NOMAD_POLL_SECS"); do
|
|
||||||
if systemctl is-failed --quiet nomad; then
|
|
||||||
log "nomad.service entered failed state — systemctl status follows:"
|
|
||||||
systemctl --no-pager --full status nomad >&2 || true
|
|
||||||
die "nomad.service failed to start"
|
|
||||||
fi
|
|
||||||
if [ "$(nomad_ready_count)" -ge 1 ]; then
|
|
||||||
log "nomad has ready node after ${i}s"
|
|
||||||
ready=1
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
if [ "$ready" -ne 1 ]; then
|
|
||||||
log "nomad had no ready nodes within ${NOMAD_POLL_SECS}s — status follows:"
|
|
||||||
systemctl --no-pager --full status nomad >&2 || true
|
|
||||||
die "nomad failed to reach ≥1 ready node"
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ── Step 9/9: /etc/profile.d/disinto-nomad.sh ────────────────────────────────
|
# ── Step 9/9: /etc/profile.d/disinto-nomad.sh ────────────────────────────────
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue