fix: [nomad-step-0] S0.2-fix — install.sh must also install docker daemon (block step 1 placement) (#871) #876

Merged
dev-qwen merged 1 commit from fix/issue-871 into main 2026-04-16 14:19:45 +00:00
3 changed files with 143 additions and 72 deletions
Showing only changes of commit b77bae9c2a - Show all commits

View file

@ -5,7 +5,7 @@
# Wires together the S0.1–S0.3 building blocks into one idempotent # Wires together the S0.1–S0.3 building blocks into one idempotent
# "bring up a single-node Nomad+Vault cluster" script: # "bring up a single-node Nomad+Vault cluster" script:
# #
# 1. install.sh (nomad + vault binaries) # 1. install.sh (nomad + vault binaries + docker daemon)
# 2. systemd-nomad.sh (nomad.service — unit + enable, not started) # 2. systemd-nomad.sh (nomad.service — unit + enable, not started)
# 3. systemd-vault.sh (vault.service — unit + vault.hcl + enable) # 3. systemd-vault.sh (vault.service — unit + vault.hcl + enable)
# 4. Host-volume dirs (/srv/disinto/* matching nomad/client.hcl) # 4. Host-volume dirs (/srv/disinto/* matching nomad/client.hcl)
@ -104,7 +104,7 @@ done
# ── Dry-run: print step list + exit ────────────────────────────────────────── # ── Dry-run: print step list + exit ──────────────────────────────────────────
if [ "$dry_run" = true ]; then if [ "$dry_run" = true ]; then
cat <<EOF cat <<EOF
[dry-run] Step 1/9: install nomad + vault binaries [dry-run] Step 1/9: install nomad + vault binaries + docker daemon
→ sudo ${INSTALL_SH} → sudo ${INSTALL_SH}
[dry-run] Step 2/9: write + enable nomad.service (NOT started) [dry-run] Step 2/9: write + enable nomad.service (NOT started)
@ -129,7 +129,7 @@ EOF
[dry-run] Step 7/9: systemctl start vault + poll until unsealed (${VAULT_POLL_SECS}s) [dry-run] Step 7/9: systemctl start vault + poll until unsealed (${VAULT_POLL_SECS}s)
[dry-run] Step 8/9: systemctl start nomad + poll until ≥1 node ready (${NOMAD_POLL_SECS}s) [dry-run] Step 8/9: systemctl start nomad + poll until ≥1 node ready + docker driver healthy (${NOMAD_POLL_SECS}s each)
[dry-run] Step 9/9: write ${PROFILE_D_FILE} [dry-run] Step 9/9: write ${PROFILE_D_FILE}
export VAULT_ADDR=${VAULT_ADDR_DEFAULT} export VAULT_ADDR=${VAULT_ADDR_DEFAULT}
@ -210,6 +210,21 @@ nomad_ready_count() {
# so poll_until_healthy can call it as a single-arg command name. # so poll_until_healthy can call it as a single-arg command name.
nomad_has_ready_node() { [ "$(nomad_ready_count)" -ge 1 ]; } nomad_has_ready_node() { [ "$(nomad_ready_count)" -ge 1 ]; }
# nomad_docker_driver_healthy
#   Exit 0 only when `nomad node status -self -json` reports the docker
#   driver with BOTH Detected=true and Healthy=true. Every other outcome
#   (empty nomad output, jq failure, missing/false field) is exit 1.
#
#   Why: Step-1's forgejo jobspec is the first docker-driver consumer. A
#   node can reach "ready" while docker fingerprinting is still in flight,
#   and the first `nomad job run forgejo` then times out with an opaque
#   "missing drivers" placement failure (#871).
#
#   Takes no arguments so poll_until_healthy can use it as a command name.
nomad_docker_driver_healthy() {
  local node_json field value

  # Best-effort query: a failing/absent nomad just yields empty output.
  node_json="$(NOMAD_ADDR="$NOMAD_ADDR_DEFAULT" nomad node status -self -json 2>/dev/null || true)"
  if [ -z "$node_json" ]; then
    return 1
  fi

  # Both driver attributes must read back as the literal string "true";
  # `// false` maps a missing/null field to "false", and a jq error maps
  # to the empty string — neither passes the comparison.
  for field in Detected Healthy; do
    value="$(printf '%s' "$node_json" | jq -r ".Drivers.docker.${field} // false" 2>/dev/null)" || value=""
    [ "$value" = "true" ] || return 1
  done
  return 0
}
# _die_with_service_status SVC REASON # _die_with_service_status SVC REASON
# Log + dump `systemctl status SVC` to stderr + die with REASON. Factored # Log + dump `systemctl status SVC` to stderr + die with REASON. Factored
# out so the poll helper doesn't carry three copies of the same dump. # out so the poll helper doesn't carry three copies of the same dump.
@ -243,8 +258,8 @@ poll_until_healthy() {
_die_with_service_status "$svc" "not healthy within ${timeout}s" _die_with_service_status "$svc" "not healthy within ${timeout}s"
} }
# ── Step 1/9: install.sh (nomad + vault binaries) ──────────────────────────── # ── Step 1/9: install.sh (nomad + vault binaries + docker daemon) ────────────
log "── Step 1/9: install nomad + vault binaries ──" log "── Step 1/9: install nomad + vault binaries + docker daemon ──"
"$INSTALL_SH" "$INSTALL_SH"
# ── Step 2/9: systemd-nomad.sh (unit + enable, not started) ────────────────── # ── Step 2/9: systemd-nomad.sh (unit + enable, not started) ──────────────────
@ -296,13 +311,25 @@ else
poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS" poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS"
fi fi
# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ─────────────── # ── Step 8/9: systemctl start nomad + poll until ≥1 node ready + docker up ──
log "── Step 8/9: start nomad + poll until ≥1 node ready ──" log "── Step 8/9: start nomad + poll until ≥1 node ready + docker driver healthy ──"
if systemctl is-active --quiet nomad && nomad_has_ready_node; then # Three conditions gate this step:
log "nomad already active + ≥1 node ready — skip start" # (a) nomad.service active
# (b) ≥1 nomad node in "ready" state
# (c) nomad's docker task driver fingerprinted as Detected+Healthy
# (c) can lag (a)+(b) briefly because driver fingerprinting races with
# dockerd startup — polling it explicitly prevents Step-1 deploys from
# hitting "missing drivers" placement failures on a cold-booted host (#871).
if systemctl is-active --quiet nomad \
&& nomad_has_ready_node \
&& nomad_docker_driver_healthy; then
log "nomad already active + ≥1 node ready + docker driver healthy — skip start"
else else
if ! systemctl is-active --quiet nomad; then
systemctl start nomad systemctl start nomad
fi
poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS" poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS"
poll_until_healthy nomad nomad_docker_driver_healthy "$NOMAD_POLL_SECS"
fi fi
# ── Step 9/9: /etc/profile.d/disinto-nomad.sh ──────────────────────────────── # ── Step 9/9: /etc/profile.d/disinto-nomad.sh ────────────────────────────────

View file

@ -1,20 +1,33 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# ============================================================================= # =============================================================================
# lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad + Vault # lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad + Vault
# + Ubuntu-native Docker for Nomad's docker driver
# #
# Part of the Nomad+Vault migration. Installs both the `nomad` binary (S0.2, # Part of the Nomad+Vault migration. Installs the `nomad` binary (S0.2,
# issue #822) and the `vault` binary (S0.3, issue #823) from the same # issue #822), the `vault` binary (S0.3, issue #823), and the `docker`
# HashiCorp apt repository. Does NOT configure, start, or enable any systemd # daemon (S0.2-fix, issue #871) needed by Nomad's docker task driver.
# unit — lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh # Nomad + Vault come from the pinned HashiCorp apt repo; docker comes from
# own that. Does NOT wire this script into `disinto init` — S0.4 owns that. # Ubuntu's default apt repo (docker.io) — matches the existing factory
# dev-box setup and avoids adding a second apt source with pinning.
#
# Does NOT configure, start, or enable nomad.service or vault.service —
# lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh own
# those. The docker.service unit ships with the docker.io package and is
# enabled+started here directly (not a disinto-owned unit), because Nomad's
# docker driver reports Healthy=false without a running dockerd — that
# silently blocks job placement at Step 1 with a confusing "missing
# drivers" error (issue #871). Does NOT wire this script into `disinto
# init` — S0.4 owns that.
# #
# Idempotency contract: # Idempotency contract:
# - Running twice back-to-back is a no-op once both target versions are # - Running twice back-to-back is a no-op once all three targets are
# installed and the apt source is in place. # installed and the HashiCorp apt source is in place.
# - Adds the HashiCorp apt keyring only if it is absent. # - Adds the HashiCorp apt keyring only if it is absent.
# - Adds the HashiCorp apt sources list only if it is absent. # - Adds the HashiCorp apt sources list only if it is absent.
# - Skips `apt-get install` for any package whose installed version already # - Skips `apt-get install` for any package whose installed version already
# matches the pin. If both are at pin, exits before touching apt. # matches the pin. If all three are satisfied, exits before touching apt.
# - `command -v docker` is the docker install sentinel; `systemctl
# enable --now` is a no-op on an already-enabled+active unit.
# #
# Configuration: # Configuration:
# NOMAD_VERSION — pinned Nomad version (default: see below). Apt package # NOMAD_VERSION — pinned Nomad version (default: see below). Apt package
@ -85,12 +98,24 @@ else
need_pkgs+=("vault=${VAULT_VERSION}-1") need_pkgs+=("vault=${VAULT_VERSION}-1")
fi fi
if [ "${#need_pkgs[@]}" -eq 0 ]; then # Docker isn't version-pinned (Ubuntu's docker.io tracks the distro's
# ship-stable release — good enough for a dev box and avoids a second
# apt source). Sentinel is binary presence, not a semver match.
# Assume docker is missing until the binary is found on PATH; the flag is
# consumed later to decide whether docker.io has to be installed.
docker_needs_install=1
if command -v docker >/dev/null 2>&1; then
  log "docker already installed"
  docker_needs_install=0
fi
if [ "${#need_pkgs[@]}" -eq 0 ] && [ "$docker_needs_install" -eq 0 ]; then
log "nothing to do" log "nothing to do"
exit 0 exit 0
fi fi
# ── Ensure HashiCorp apt keyring ───────────────────────────────────────────── # ── HashiCorp apt setup + nomad/vault install (skipped if both at pin) ───────
if [ "${#need_pkgs[@]}" -gt 0 ]; then
# Ensure HashiCorp apt keyring.
if [ ! -f "$HASHICORP_KEYRING" ]; then if [ ! -f "$HASHICORP_KEYRING" ]; then
log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}" log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}"
tmpkey="$(mktemp)" tmpkey="$(mktemp)"
@ -106,7 +131,7 @@ else
log "HashiCorp apt keyring already present" log "HashiCorp apt keyring already present"
fi fi
# ── Ensure HashiCorp apt sources list ──────────────────────────────────────── # Ensure HashiCorp apt sources list.
desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main" desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main"
if [ ! -f "$HASHICORP_SOURCES" ] \ if [ ! -f "$HASHICORP_SOURCES" ] \
|| ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then
@ -118,7 +143,7 @@ else
apt_update_needed=0 apt_update_needed=0
fi fi
# ── Install the pinned versions ────────────────────────────────────────────── # Install the pinned versions.
if [ "$apt_update_needed" -eq 1 ]; then if [ "$apt_update_needed" -eq 1 ]; then
log "running apt-get update" log "running apt-get update"
DEBIAN_FRONTEND=noninteractive apt-get update -qq \ DEBIAN_FRONTEND=noninteractive apt-get update -qq \
@ -130,7 +155,7 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
"${need_pkgs[@]}" \ "${need_pkgs[@]}" \
|| die "apt-get install ${need_pkgs[*]} failed" || die "apt-get install ${need_pkgs[*]} failed"
# ── Verify ─────────────────────────────────────────────────────────────────── # Verify pinned versions.
final_nomad="$(_installed_version nomad)" final_nomad="$(_installed_version nomad)"
if [ "$final_nomad" != "$NOMAD_VERSION" ]; then if [ "$final_nomad" != "$NOMAD_VERSION" ]; then
die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'" die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'"
@ -139,5 +164,24 @@ final_vault="$(_installed_version vault)"
if [ "$final_vault" != "$VAULT_VERSION" ]; then if [ "$final_vault" != "$VAULT_VERSION" ]; then
die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'" die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'"
fi fi
fi
log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} installed successfully" # ── Install docker.io + enable+start docker.service (if missing) ─────────────
# Without a running dockerd, Nomad's docker task driver reports
# Healthy=false. The factory dev box had docker pre-installed, so Step 0's
# cluster-up passed silently there; on a fresh LXC the first docker-driver
# jobspec (forgejo, Step 1) fails placement with "missing drivers".
# docker.io comes from Ubuntu's default apt repo — no second source, no
# pinning. The package ships `docker.service`; `enable --now` is idempotent.
if [ "$docker_needs_install" -eq 1 ]; then
  log "installing docker.io"
  if ! DEBIAN_FRONTEND=noninteractive apt-get install -y -q docker.io; then
    die "apt-get install docker.io failed"
  fi

  log "enabling + starting docker.service"
  if ! systemctl enable --now docker; then
    die "failed to enable/start docker.service"
  fi

  # Final sanity check: the binary must now resolve on PATH.
  if ! command -v docker >/dev/null 2>&1; then
    die "post-install check: docker binary still not found"
  fi
fi
log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} + docker installed successfully"

View file

@ -34,7 +34,7 @@ setup_file() {
[[ "$output" == *"nomad backend: default (cluster-up; jobs deferred to Step 1)"* ]] [[ "$output" == *"nomad backend: default (cluster-up; jobs deferred to Step 1)"* ]]
# All nine cluster-up dry-run steps, in order. # All nine cluster-up dry-run steps, in order.
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]]
[[ "$output" == *"[dry-run] Step 2/9: write + enable nomad.service (NOT started)"* ]] [[ "$output" == *"[dry-run] Step 2/9: write + enable nomad.service (NOT started)"* ]]
[[ "$output" == *"[dry-run] Step 3/9: write + enable vault.service + vault.hcl (NOT started)"* ]] [[ "$output" == *"[dry-run] Step 3/9: write + enable vault.service + vault.hcl (NOT started)"* ]]
[[ "$output" == *"[dry-run] Step 4/9: create host-volume dirs under /srv/disinto/"* ]] [[ "$output" == *"[dry-run] Step 4/9: create host-volume dirs under /srv/disinto/"* ]]
@ -57,7 +57,7 @@ setup_file() {
# of the migration will branch on $empty to gate job deployment; today # of the migration will branch on $empty to gate job deployment; today
# both modes invoke the same cluster-up dry-run. # both modes invoke the same cluster-up dry-run.
[[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]] [[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]]
[[ "$output" == *"Dry run complete — no changes made."* ]] [[ "$output" == *"Dry run complete — no changes made."* ]]
} }
@ -69,7 +69,7 @@ setup_file() {
# Negative assertion: the nomad dispatcher banners must be absent. # Negative assertion: the nomad dispatcher banners must be absent.
[[ "$output" != *"nomad backend:"* ]] [[ "$output" != *"nomad backend:"* ]]
[[ "$output" != *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] [[ "$output" != *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]]
# Positive assertion: docker-path output still appears — the existing # Positive assertion: docker-path output still appears — the existing
# docker dry-run printed "=== disinto init ===" before listing the # docker dry-run printed "=== disinto init ===" before listing the
@ -88,7 +88,7 @@ setup_file() {
run "$DISINTO_BIN" init placeholder/repo --backend nomad --dry-run run "$DISINTO_BIN" init placeholder/repo --backend nomad --dry-run
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
[[ "$output" == *"nomad backend: default"* ]] [[ "$output" == *"nomad backend: default"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]]
} }
# ── Flag validation ────────────────────────────────────────────────────────── # ── Flag validation ──────────────────────────────────────────────────────────
@ -118,7 +118,7 @@ setup_file() {
run "$DISINTO_BIN" init --backend=nomad --empty --dry-run run "$DISINTO_BIN" init --backend=nomad --empty --dry-run
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
[[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]] [[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]]
# The bug symptom must be absent — backend was misdetected as docker # The bug symptom must be absent — backend was misdetected as docker
# when --backend=nomad got swallowed as repo_url. # when --backend=nomad got swallowed as repo_url.
[[ "$output" != *"--empty is only valid with --backend=nomad"* ]] [[ "$output" != *"--empty is only valid with --backend=nomad"* ]]
@ -128,7 +128,7 @@ setup_file() {
run "$DISINTO_BIN" init --backend nomad --dry-run run "$DISINTO_BIN" init --backend nomad --dry-run
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
[[ "$output" == *"nomad backend: default"* ]] [[ "$output" == *"nomad backend: default"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]]
} }
@test "disinto init (no args) still errors with 'repo URL required'" { @test "disinto init (no args) still errors with 'repo URL required'" {