From d2c6b332717952ce284ca5764d3921db51b43094 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 16 Apr 2026 07:21:56 +0000 Subject: [PATCH 1/2] =?UTF-8?q?fix:=20[nomad-step-0]=20S0.4=20=E2=80=94=20?= =?UTF-8?q?disinto=20init=20--backend=3Dnomad=20--empty=20orchestrator=20(?= =?UTF-8?q?cluster-up)=20(#824)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires S0.1–S0.3 into a single idempotent bring-up script and replaces the S0.1 stub in _disinto_init_nomad so `disinto init --backend=nomad --empty` produces a running empty single-node cluster on a fresh box. lib/init/nomad/cluster-up.sh (new): 1. install.sh (nomad + vault binaries) 2. systemd-nomad.sh (unit + enable, not started) 3. systemd-vault.sh (unit + vault.hcl + enable) 4. host-volume dirs under /srv/disinto/* (matching nomad/client.hcl) 5. /etc/nomad.d/{server,client}.hcl (content-compare before write) 6. vault-init.sh (first-run init + unseal + persist keys) 7. systemctl start vault (poll until unsealed; fail-fast on is-failed) 8. systemctl start nomad (poll until ≥1 node ready) 9. /etc/profile.d/disinto-nomad.sh (VAULT_ADDR + NOMAD_ADDR for interactive shells) Re-running on a healthy box is a no-op — each sub-step is itself idempotent and steps 7/8 fast-path when already active + healthy. `--dry-run` prints the full step list and exits 0. bin/disinto: - _disinto_init_nomad: replaces the S0.1 stub. Invokes cluster-up.sh directly (as root) or via `sudo -n` otherwise. Both `--empty` and the default (no flag) call cluster-up.sh today; Step 1 will branch on $empty to gate job deployment. --dry-run forwards through. - disinto_init: adds `--empty` flag parsing; rejects `--empty` combined with `--backend=docker` explicitly instead of silently ignoring it. - usage: documents `--empty` and drops the "stub, S0.1" annotation from --backend. Closes #824. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- bin/disinto | 84 +++++++-- lib/init/nomad/cluster-up.sh | 337 +++++++++++++++++++++++++++++++++++ 2 files changed, 406 insertions(+), 15 deletions(-) create mode 100755 lib/init/nomad/cluster-up.sh diff --git a/bin/disinto b/bin/disinto index 00404e6..75d7bab 100755 --- a/bin/disinto +++ b/bin/disinto @@ -81,7 +81,8 @@ Init options: --repo-root Local clone path (default: ~/name) --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) - --backend Orchestration backend: docker (default) | nomad (stub, S0.1) + --backend Orchestration backend: docker (default) | nomad + --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) --yes Skip confirmation prompts @@ -645,17 +646,61 @@ prompt_admin_password() { # ── init command ───────────────────────────────────────────────────────────── -# Nomad backend init — stub for the Nomad+Vault migration (issue #821, S0.1). -# Real implementation lands across S0.2–S0.5. Exists so --backend=nomad fails -# loud instead of silently routing through the docker path. +# Nomad backend init — dispatcher (Nomad+Vault migration, S0.4, issue #824). +# +# Today `--empty` and the default (no flag) both bring up an empty +# single-node Nomad+Vault cluster via lib/init/nomad/cluster-up.sh. Step 1 +# will extend the default path to also deploy jobs; `--empty` will remain +# the "cluster only, no workloads" escape hatch. +# +# Uses `sudo -n` when not already root — cluster-up.sh mutates /etc/, +# /srv/, and systemd state, so it has to run as root. The `-n` keeps the +# failure mode legible (no hanging TTY-prompted sudo inside a factory +# init run); operators running without sudo-NOPASSWD should invoke +# `sudo disinto init ...` directly. 
_disinto_init_nomad() { - local dry_run="${1:-false}" - if [ "$dry_run" = "true" ]; then - echo "nomad backend: stub — will be implemented by S0.2–S0.5" - exit 0 + local dry_run="${1:-false}" empty="${2:-false}" + local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" + + if [ ! -x "$cluster_up" ]; then + echo "Error: ${cluster_up} not found or not executable" >&2 + exit 1 fi - echo "ERROR: nomad backend not yet implemented (stub)" >&2 - exit 99 + + # --empty and default both invoke cluster-up today. Log the requested + # mode so the dispatch is visible in factory bootstrap logs — Step 1 + # will branch on $empty to gate the job-deployment path. + if [ "$empty" = "true" ]; then + echo "nomad backend: --empty (cluster-up only, no jobs)" + else + echo "nomad backend: default (cluster-up; jobs deferred to Step 1)" + fi + + # Dry-run forwards straight through; cluster-up.sh prints its own step + # list and exits 0 without touching the box. + local -a cmd=("$cluster_up") + if [ "$dry_run" = "true" ]; then + cmd+=("--dry-run") + "${cmd[@]}" + exit $? + fi + + # Real run — needs root. Invoke via sudo if we're not already root so + # the command's exit code propagates directly. We don't distinguish + # "sudo denied" from "cluster-up.sh failed" here; both surface as a + # non-zero exit, and cluster-up.sh's own error messages cover the + # latter case. + local rc=0 + if [ "$(id -u)" -eq 0 ]; then + "${cmd[@]}" || rc=$? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: cluster-up.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${cmd[@]}" || rc=$? 
+ fi + exit "$rc" } disinto_init() { @@ -668,7 +713,7 @@ disinto_init() { shift # Parse flags - local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" + local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false while [ $# -gt 0 ]; do case "$1" in --branch) branch="$2"; shift 2 ;; @@ -679,6 +724,7 @@ disinto_init() { --backend=*) backend="${1#--backend=}"; shift ;; --bare) bare=true; shift ;; --build) use_build=true; shift ;; + --empty) empty=true; shift ;; --yes) auto_yes=true; shift ;; --rotate-tokens) rotate_tokens=true; shift ;; --dry-run) dry_run=true; shift ;; @@ -692,11 +738,19 @@ disinto_init() { *) echo "Error: invalid --backend value '${backend}' (expected: docker|nomad)" >&2; exit 1 ;; esac - # Dispatch on backend — nomad path is a stub for now (issue #821, S0.1). - # Subsequent S0.x issues will replace _disinto_init_nomad with real logic - # without touching flag parsing or this dispatch. + # --empty is nomad-only today (the docker path has no concept of an + # "empty cluster"). Reject explicitly rather than letting it silently + # do nothing on --backend=docker. + if [ "$empty" = true ] && [ "$backend" != "nomad" ]; then + echo "Error: --empty is only valid with --backend=nomad" >&2 + exit 1 + fi + + # Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh + # (S0.4). The default and --empty variants are identical today; Step 1 + # will branch on $empty to add job deployment to the default path. if [ "$backend" = "nomad" ]; then - _disinto_init_nomad "$dry_run" + _disinto_init_nomad "$dry_run" "$empty" # shellcheck disable=SC2317 # _disinto_init_nomad always exits today; # `return` is defensive against future refactors. 
return diff --git a/lib/init/nomad/cluster-up.sh b/lib/init/nomad/cluster-up.sh new file mode 100755 index 0000000..a1b02ff --- /dev/null +++ b/lib/init/nomad/cluster-up.sh @@ -0,0 +1,337 @@ +#!/usr/bin/env bash +# ============================================================================= +# lib/init/nomad/cluster-up.sh — Empty Nomad+Vault cluster orchestrator (S0.4) +# +# Wires together the S0.1–S0.3 building blocks into one idempotent +# "bring up a single-node Nomad+Vault cluster" script: +# +# 1. install.sh (nomad + vault binaries) +# 2. systemd-nomad.sh (nomad.service — unit + enable, not started) +# 3. systemd-vault.sh (vault.service — unit + vault.hcl + enable) +# 4. Host-volume dirs (/srv/disinto/* matching nomad/client.hcl) +# 5. /etc/nomad.d/*.hcl (server.hcl + client.hcl from repo) +# 6. vault-init.sh (first-run init + unseal + persist keys) +# 7. systemctl start vault (auto-unseal via ExecStartPost; poll) +# 8. systemctl start nomad (poll until ≥1 ready node) +# 9. /etc/profile.d/disinto-nomad.sh (VAULT_ADDR + NOMAD_ADDR for shells) +# +# This is the "empty cluster" orchestrator — no jobs deployed. Subsequent +# Step-1 issues layer job deployment on top of this checkpoint. +# +# Idempotency contract: +# Running twice back-to-back on a healthy box is a no-op. Each sub-step +# is itself idempotent — see install.sh / systemd-*.sh / vault-init.sh +# headers for the per-step contract. Fast-paths in steps 7 and 8 skip +# the systemctl start when the service is already active + healthy. 
+# +# Usage: +# sudo lib/init/nomad/cluster-up.sh # bring cluster up +# sudo lib/init/nomad/cluster-up.sh --dry-run # print step list, exit 0 +# +# Environment (override polling for slow boxes): +# VAULT_POLL_SECS max seconds to wait for vault to unseal (default: 30) +# NOMAD_POLL_SECS max seconds to wait for nomad node=ready (default: 60) +# +# Exit codes: +# 0 success (cluster up, or already up) +# 1 precondition or step failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" + +# Sub-scripts (siblings in this directory). +INSTALL_SH="${SCRIPT_DIR}/install.sh" +SYSTEMD_NOMAD_SH="${SCRIPT_DIR}/systemd-nomad.sh" +SYSTEMD_VAULT_SH="${SCRIPT_DIR}/systemd-vault.sh" +VAULT_INIT_SH="${SCRIPT_DIR}/vault-init.sh" + +# In-repo Nomad configs copied to /etc/nomad.d/. +NOMAD_CONFIG_DIR="/etc/nomad.d" +NOMAD_SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl" +NOMAD_CLIENT_HCL_SRC="${REPO_ROOT}/nomad/client.hcl" + +# /etc/profile.d entry — makes VAULT_ADDR + NOMAD_ADDR available to +# interactive shells without requiring the operator to source anything. +PROFILE_D_FILE="/etc/profile.d/disinto-nomad.sh" + +# Host-volume paths — MUST match the `host_volume "..."` declarations +# in nomad/client.hcl. Adding a host_volume block there requires adding +# its path here so the dir exists before nomad starts (otherwise client +# fingerprinting fails and the node stays in "initializing"). +HOST_VOLUME_DIRS=( + "/srv/disinto/forgejo-data" + "/srv/disinto/woodpecker-data" + "/srv/disinto/agent-data" + "/srv/disinto/project-repos" + "/srv/disinto/caddy-data" + "/srv/disinto/chat-history" + "/srv/disinto/ops-repo" +) + +# Default API addresses — matches the listener bindings in +# nomad/server.hcl and nomad/vault.hcl. If either file ever moves +# off 127.0.0.1 / default port, update both places together. 
+VAULT_ADDR_DEFAULT="http://127.0.0.1:8200" +NOMAD_ADDR_DEFAULT="http://127.0.0.1:4646" + +VAULT_POLL_SECS="${VAULT_POLL_SECS:-30}" +NOMAD_POLL_SECS="${NOMAD_POLL_SECS:-60}" + +log() { printf '[cluster-up] %s\n' "$*"; } +die() { printf '[cluster-up] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Flag parsing ───────────────────────────────────────────────────────────── +dry_run=false +while [ $# -gt 0 ]; do + case "$1" in + --dry-run) dry_run=true; shift ;; + -h|--help) + cat </dev/null 2>&1 \ + || die "systemctl not found (systemd required)" + +for f in "$INSTALL_SH" "$SYSTEMD_NOMAD_SH" "$SYSTEMD_VAULT_SH" "$VAULT_INIT_SH"; do + [ -x "$f" ] || die "sub-script missing or non-executable: ${f}" +done + +[ -f "$NOMAD_SERVER_HCL_SRC" ] \ + || die "source config not found: ${NOMAD_SERVER_HCL_SRC}" +[ -f "$NOMAD_CLIENT_HCL_SRC" ] \ + || die "source config not found: ${NOMAD_CLIENT_HCL_SRC}" + +# ── Helpers ────────────────────────────────────────────────────────────────── + +# install_file_if_differs SRC DST MODE +# Copy SRC to DST (root:root with MODE) iff on-disk content differs. +# No-op + log otherwise — preserves mtime, avoids spurious reloads. +install_file_if_differs() { + local src="$1" dst="$2" mode="$3" + if [ -f "$dst" ] && cmp -s "$src" "$dst"; then + log "unchanged: ${dst}" + return 0 + fi + log "writing: ${dst}" + install -m "$mode" -o root -g root "$src" "$dst" +} + +# vault_status_json — echo `vault status -format=json`, or '' on unreachable. +# vault status exit codes: 0 = unsealed, 2 = sealed/uninit, 1 = unreachable. +# We treat all of 0/2 as "reachable with state"; 1 yields empty output. +# Wrapped in `|| true` so set -e doesn't abort on exit 2 (the expected +# sealed-state case during first-boot polling). +vault_status_json() { + VAULT_ADDR="$VAULT_ADDR_DEFAULT" vault status -format=json 2>/dev/null || true +} + +# vault_is_unsealed — true iff vault reachable AND initialized AND unsealed. 
+vault_is_unsealed() { + local out init sealed + out="$(vault_status_json)" + [ -n "$out" ] || return 1 + init="$(printf '%s' "$out" | jq -r '.initialized' 2>/dev/null)" || init="" + sealed="$(printf '%s' "$out" | jq -r '.sealed' 2>/dev/null)" || sealed="" + [ "$init" = "true" ] && [ "$sealed" = "false" ] +} + +# nomad_ready_count — echo the number of ready nodes, or 0 on error. +# `nomad node status -json` returns a JSON array of nodes, each with a +# .Status field ("initializing" | "ready" | "down" | "disconnected"). +nomad_ready_count() { + local out + out="$(NOMAD_ADDR="$NOMAD_ADDR_DEFAULT" nomad node status -json 2>/dev/null || true)" + if [ -z "$out" ]; then + printf '0' + return 0 + fi + printf '%s' "$out" \ + | jq '[.[] | select(.Status == "ready")] | length' 2>/dev/null \ + || printf '0' +} + +# ── Step 1/9: install.sh (nomad + vault binaries) ──────────────────────────── +log "── Step 1/9: install nomad + vault binaries ──" +"$INSTALL_SH" + +# ── Step 2/9: systemd-nomad.sh (unit + enable, not started) ────────────────── +log "── Step 2/9: install nomad.service (enable, not start) ──" +"$SYSTEMD_NOMAD_SH" + +# ── Step 3/9: systemd-vault.sh (unit + vault.hcl + enable) ─────────────────── +log "── Step 3/9: install vault.service + vault.hcl (enable, not start) ──" +"$SYSTEMD_VAULT_SH" + +# ── Step 4/9: host-volume dirs matching nomad/client.hcl ───────────────────── +log "── Step 4/9: host-volume dirs under /srv/disinto/ ──" +# Parent /srv/disinto/ first (install -d handles missing parents, but being +# explicit makes the log output read naturally as a top-down creation). 
+install -d -m 0755 -o root -g root "/srv/disinto" +for d in "${HOST_VOLUME_DIRS[@]}"; do + if [ -d "$d" ]; then + log "unchanged: ${d}" + else + log "creating: ${d}" + install -d -m 0755 -o root -g root "$d" + fi +done + +# ── Step 5/9: /etc/nomad.d/server.hcl + client.hcl ─────────────────────────── +log "── Step 5/9: install /etc/nomad.d/{server,client}.hcl ──" +# systemd-nomad.sh already created /etc/nomad.d/. Re-assert for clarity + +# in case someone runs cluster-up.sh with an exotic step ordering later. +install -d -m 0755 -o root -g root "$NOMAD_CONFIG_DIR" +install_file_if_differs "$NOMAD_SERVER_HCL_SRC" "${NOMAD_CONFIG_DIR}/server.hcl" 0644 +install_file_if_differs "$NOMAD_CLIENT_HCL_SRC" "${NOMAD_CONFIG_DIR}/client.hcl" 0644 + +# ── Step 6/9: vault-init (first-run init + unseal + persist keys) ──────────── +log "── Step 6/9: vault-init (no-op after first run) ──" +# vault-init.sh spawns a temporary vault server if systemd isn't managing +# one, runs `operator init`, writes unseal.key + root.token, unseals once, +# then stops the temp server (EXIT trap). After it returns, port 8200 is +# free for systemctl-managed vault to take in step 7. +"$VAULT_INIT_SH" + +# ── Step 7/9: systemctl start vault + poll until unsealed ──────────────────── +log "── Step 7/9: start vault + poll until unsealed ──" +if systemctl is-active --quiet vault && vault_is_unsealed; then + log "vault already active + unsealed — skip start" +else + systemctl start vault + ready=0 + for i in $(seq 1 "$VAULT_POLL_SECS"); do + # Fail fast if systemd has already marked the unit as failed — usually + # ExecStartPost tripping because unseal.key is absent / corrupted. 
+ if systemctl is-failed --quiet vault; then + log "vault.service entered failed state — systemctl status follows:" + systemctl --no-pager --full status vault >&2 || true + die "vault.service failed to start" + fi + if vault_is_unsealed; then + log "vault unsealed after ${i}s" + ready=1 + break + fi + sleep 1 + done + if [ "$ready" -ne 1 ]; then + log "vault did not unseal within ${VAULT_POLL_SECS}s — status follows:" + systemctl --no-pager --full status vault >&2 || true + die "vault failed to become unsealed" + fi +fi + +# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ─────────────── +log "── Step 8/9: start nomad + poll until ≥1 node ready ──" +if systemctl is-active --quiet nomad && [ "$(nomad_ready_count)" -ge 1 ]; then + log "nomad already active + ≥1 node ready — skip start" +else + systemctl start nomad + ready=0 + for i in $(seq 1 "$NOMAD_POLL_SECS"); do + if systemctl is-failed --quiet nomad; then + log "nomad.service entered failed state — systemctl status follows:" + systemctl --no-pager --full status nomad >&2 || true + die "nomad.service failed to start" + fi + if [ "$(nomad_ready_count)" -ge 1 ]; then + log "nomad has ready node after ${i}s" + ready=1 + break + fi + sleep 1 + done + if [ "$ready" -ne 1 ]; then + log "nomad had no ready nodes within ${NOMAD_POLL_SECS}s — status follows:" + systemctl --no-pager --full status nomad >&2 || true + die "nomad failed to reach ≥1 ready node" + fi +fi + +# ── Step 9/9: /etc/profile.d/disinto-nomad.sh ──────────────────────────────── +log "── Step 9/9: write ${PROFILE_D_FILE} ──" +# Shell rc fragments in /etc/profile.d/ are sourced by /etc/profile for +# every interactive login shell. Setting VAULT_ADDR + NOMAD_ADDR here means +# the operator can run `vault status` / `nomad node status` straight after +# `ssh factory-box` without fumbling env vars. 
+desired_profile="# /etc/profile.d/disinto-nomad.sh — written by lib/init/nomad/cluster-up.sh +# Interactive-shell defaults for Vault + Nomad clients on this box. +export VAULT_ADDR=${VAULT_ADDR_DEFAULT} +export NOMAD_ADDR=${NOMAD_ADDR_DEFAULT} +" +if [ -f "$PROFILE_D_FILE" ] \ + && printf '%s' "$desired_profile" | cmp -s - "$PROFILE_D_FILE"; then + log "unchanged: ${PROFILE_D_FILE}" +else + log "writing: ${PROFILE_D_FILE}" + # Subshell + EXIT trap: guarantees the tempfile is cleaned up on both + # success AND set-e-induced failure of `install`. A function-scoped + # RETURN trap does NOT fire on errexit-abort in bash — the subshell is + # the reliable cleanup boundary here. + ( + tmp="$(mktemp)" + trap 'rm -f "$tmp"' EXIT + printf '%s' "$desired_profile" > "$tmp" + install -m 0644 -o root -g root "$tmp" "$PROFILE_D_FILE" + ) +fi + +log "── done: empty nomad+vault cluster is up ──" +log " Vault: ${VAULT_ADDR_DEFAULT} (Sealed=false Initialized=true)" +log " Nomad: ${NOMAD_ADDR_DEFAULT} (≥1 node ready)" -- 2.49.1 From 481175e04386eadb24454cd4960524417d1b1bc2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 16 Apr 2026 07:26:54 +0000 Subject: [PATCH 2/2] fix: dedupe cluster-up.sh polling via poll_until_healthy helper (#824) CI duplicate-detection flagged the in-line vault + nomad polling loops in cluster-up.sh as matching a 5-line window in vault-init.sh (the `ready=1 / break / fi / sleep 1 / done` boilerplate). Extracts the repeated pattern into three helpers in the Helpers section of the file (inserted directly after nomad_ready_count): - nomad_has_ready_node wrapper so poll_until_healthy can take a bare command name. - _die_with_service_status shared "log + dump systemctl status + die" path (factored out of the two callsites + the timeout branch). - poll_until_healthy ticks once per second up to TIMEOUT, fail-fasts on systemd "failed" state, and returns 0 on first successful check. Step 7 (vault unseal) and Step 8 (nomad ready node) each collapse from ~15 lines of explicit for-loop bookkeeping to a one-line call. 
No functional change: same tick cadence, same fail-fast, same status dump on timeout. Only the wording of the log and error messages changes (it is now shared between vault and nomad), and the elapsed-seconds figure in the success log is now zero-based. Local detect-duplicates.py run against main confirms no new duplicates introduced. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/init/nomad/cluster-up.sh | 83 ++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/lib/init/nomad/cluster-up.sh b/lib/init/nomad/cluster-up.sh index a1b02ff..7c802c6 100755 --- a/lib/init/nomad/cluster-up.sh +++ b/lib/init/nomad/cluster-up.sh @@ -206,6 +206,43 @@ nomad_ready_count() { || printf '0' } +# nomad_has_ready_node — true iff nomad_ready_count ≥ 1. Wrapper exists +# so poll_until_healthy can call it as a single-arg command name. +nomad_has_ready_node() { [ "$(nomad_ready_count)" -ge 1 ]; } + +# _die_with_service_status SVC REASON +# Log + dump `systemctl status SVC` to stderr + die with REASON. Factored +# out so the poll helper doesn't carry three copies of the same dump. +_die_with_service_status() { + local svc="$1" reason="$2" + log "${svc}.service ${reason} — systemctl status follows:" + systemctl --no-pager --full status "$svc" >&2 || true + die "${svc}.service ${reason}" +} + +# poll_until_healthy SVC CHECK_CMD TIMEOUT +# Tick once per second for up to TIMEOUT seconds, invoking CHECK_CMD as a +# command name (no arguments). Returns 0 on the first successful check. +# Fails fast via _die_with_service_status if SVC enters systemd "failed" +# state, and dies with a status dump if TIMEOUT elapses before CHECK_CMD +# succeeds. Replaces the two in-line ready=1/break/sleep poll loops that +# would otherwise each duplicate the same pattern already in vault-init.sh. 
+poll_until_healthy() { + local svc="$1" check="$2" timeout="$3" + local waited=0 + until [ "$waited" -ge "$timeout" ]; do + systemctl is-failed --quiet "$svc" \ + && _die_with_service_status "$svc" "entered failed state during startup" + if "$check"; then + log "${svc} healthy after ${waited}s" + return 0 + fi + waited=$((waited + 1)) + sleep 1 + done + _die_with_service_status "$svc" "not healthy within ${timeout}s" +} + # ── Step 1/9: install.sh (nomad + vault binaries) ──────────────────────────── log "── Step 1/9: install nomad + vault binaries ──" "$INSTALL_SH" @@ -250,58 +287,22 @@ log "── Step 6/9: vault-init (no-op after first run) ──" # ── Step 7/9: systemctl start vault + poll until unsealed ──────────────────── log "── Step 7/9: start vault + poll until unsealed ──" +# Fast-path when vault.service is already active and Vault reports +# initialized=true,sealed=false — re-runs are a no-op. if systemctl is-active --quiet vault && vault_is_unsealed; then log "vault already active + unsealed — skip start" else systemctl start vault - ready=0 - for i in $(seq 1 "$VAULT_POLL_SECS"); do - # Fail fast if systemd has already marked the unit as failed — usually - # ExecStartPost tripping because unseal.key is absent / corrupted. 
- if systemctl is-failed --quiet vault; then - log "vault.service entered failed state — systemctl status follows:" - systemctl --no-pager --full status vault >&2 || true - die "vault.service failed to start" - fi - if vault_is_unsealed; then - log "vault unsealed after ${i}s" - ready=1 - break - fi - sleep 1 - done - if [ "$ready" -ne 1 ]; then - log "vault did not unseal within ${VAULT_POLL_SECS}s — status follows:" - systemctl --no-pager --full status vault >&2 || true - die "vault failed to become unsealed" - fi + poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS" fi # ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ─────────────── log "── Step 8/9: start nomad + poll until ≥1 node ready ──" -if systemctl is-active --quiet nomad && [ "$(nomad_ready_count)" -ge 1 ]; then +if systemctl is-active --quiet nomad && nomad_has_ready_node; then log "nomad already active + ≥1 node ready — skip start" else systemctl start nomad - ready=0 - for i in $(seq 1 "$NOMAD_POLL_SECS"); do - if systemctl is-failed --quiet nomad; then - log "nomad.service entered failed state — systemctl status follows:" - systemctl --no-pager --full status nomad >&2 || true - die "nomad.service failed to start" - fi - if [ "$(nomad_ready_count)" -ge 1 ]; then - log "nomad has ready node after ${i}s" - ready=1 - break - fi - sleep 1 - done - if [ "$ready" -ne 1 ]; then - log "nomad had no ready nodes within ${NOMAD_POLL_SECS}s — status follows:" - systemctl --no-pager --full status nomad >&2 || true - die "nomad failed to reach ≥1 ready node" - fi + poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS" fi # ── Step 9/9: /etc/profile.d/disinto-nomad.sh ──────────────────────────────── -- 2.49.1