diff --git a/lib/init/nomad/cluster-up.sh b/lib/init/nomad/cluster-up.sh index 7c802c6..4aab42d 100755 --- a/lib/init/nomad/cluster-up.sh +++ b/lib/init/nomad/cluster-up.sh @@ -5,7 +5,7 @@ # Wires together the S0.1–S0.3 building blocks into one idempotent # "bring up a single-node Nomad+Vault cluster" script: # -# 1. install.sh (nomad + vault binaries) +# 1. install.sh (nomad + vault binaries + docker daemon) # 2. systemd-nomad.sh (nomad.service — unit + enable, not started) # 3. systemd-vault.sh (vault.service — unit + vault.hcl + enable) # 4. Host-volume dirs (/srv/disinto/* matching nomad/client.hcl) @@ -104,7 +104,7 @@ done # ── Dry-run: print step list + exit ────────────────────────────────────────── if [ "$dry_run" = true ]; then cat </dev/null || true)" + [ -n "$out" ] || return 1 + detected="$(printf '%s' "$out" | jq -r '.Drivers.docker.Detected // false' 2>/dev/null)" || detected="" + healthy="$(printf '%s' "$out" | jq -r '.Drivers.docker.Healthy // false' 2>/dev/null)" || healthy="" + [ "$detected" = "true" ] && [ "$healthy" = "true" ] +} + # _die_with_service_status SVC REASON # Log + dump `systemctl status SVC` to stderr + die with REASON. Factored # out so the poll helper doesn't carry three copies of the same dump. @@ -243,8 +258,8 @@ poll_until_healthy() { _die_with_service_status "$svc" "not healthy within ${timeout}s" } -# ── Step 1/9: install.sh (nomad + vault binaries) ──────────────────────────── -log "── Step 1/9: install nomad + vault binaries ──" +# ── Step 1/9: install.sh (nomad + vault binaries + docker daemon) ──────────── +log "── Step 1/9: install nomad + vault binaries + docker daemon ──" "$INSTALL_SH" # ── Step 2/9: systemd-nomad.sh (unit + enable, not started) ────────────────── @@ -296,13 +311,25 @@ else poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS" fi -# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ─────────────── -log "── Step 8/9: start nomad + poll until ≥1 node ready ──" -if systemctl is-active --quiet nomad && nomad_has_ready_node; then - log "nomad already active + ≥1 node ready — skip start" +# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready + docker up ── +log "── Step 8/9: start nomad + poll until ≥1 node ready + docker driver healthy ──" +# Three conditions gate this step: +# (a) nomad.service active +# (b) ≥1 nomad node in "ready" state +# (c) nomad's docker task driver fingerprinted as Detected+Healthy +# (c) can lag (a)+(b) briefly because driver fingerprinting races with +# dockerd startup — polling it explicitly prevents Step-1 deploys from +# hitting "missing drivers" placement failures on a cold-booted host (#871). +if systemctl is-active --quiet nomad \ + && nomad_has_ready_node \ + && nomad_docker_driver_healthy; then + log "nomad already active + ≥1 node ready + docker driver healthy — skip start" else - systemctl start nomad + if ! systemctl is-active --quiet nomad; then + systemctl start nomad + fi poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS" + poll_until_healthy nomad nomad_docker_driver_healthy "$NOMAD_POLL_SECS" fi # ── Step 9/9: /etc/profile.d/disinto-nomad.sh ──────────────────────────────── diff --git a/lib/init/nomad/install.sh b/lib/init/nomad/install.sh index 6f1ffed..ea9ac17 100755 --- a/lib/init/nomad/install.sh +++ b/lib/init/nomad/install.sh @@ -1,20 +1,33 @@ #!/usr/bin/env bash # ============================================================================= # lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad + Vault +# + Ubuntu-native Docker for Nomad's docker driver # -# Part of the Nomad+Vault migration. Installs both the `nomad` binary (S0.2, -# issue #822) and the `vault` binary (S0.3, issue #823) from the same -# HashiCorp apt repository. Does NOT configure, start, or enable any systemd -# unit — lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh -# own that. Does NOT wire this script into `disinto init` — S0.4 owns that. +# Part of the Nomad+Vault migration. Installs the `nomad` binary (S0.2, +# issue #822), the `vault` binary (S0.3, issue #823), and the `docker` +# daemon (S0.2-fix, issue #871) needed by Nomad's docker task driver. +# Nomad + Vault come from the pinned HashiCorp apt repo; docker comes from +# Ubuntu's default apt repo (docker.io) — matches the existing factory +# dev-box setup and avoids adding a second apt source with pinning. +# +# Does NOT configure, start, or enable nomad.service or vault.service — +# lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh own +# those. The docker.service unit ships with the docker.io package and is +# enabled+started here directly (not a disinto-owned unit), because Nomad's +# docker driver reports Healthy=false without a running dockerd — that +# silently blocks job placement at Step 1 with a confusing "missing +# drivers" error (issue #871). Does NOT wire this script into `disinto +# init` — S0.4 owns that. # # Idempotency contract: -# - Running twice back-to-back is a no-op once both target versions are -# installed and the apt source is in place. +# - Running twice back-to-back is a no-op once all three targets are +# installed and the HashiCorp apt source is in place. # - Adds the HashiCorp apt keyring only if it is absent. # - Adds the HashiCorp apt sources list only if it is absent. # - Skips `apt-get install` for any package whose installed version already -# matches the pin. If both are at pin, exits before touching apt. +# matches the pin. If all three are satisfied, exits before touching apt. +# - `command -v docker` is the docker install sentinel; `systemctl +# enable --now` is a no-op on an already-enabled+active unit. # # Configuration: # NOMAD_VERSION — pinned Nomad version (default: see below). Apt package @@ -85,59 +98,90 @@ else need_pkgs+=("vault=${VAULT_VERSION}-1") fi -if [ "${#need_pkgs[@]}" -eq 0 ]; then +# Docker isn't version-pinned (Ubuntu's docker.io tracks the distro's +# ship-stable release — good enough for a dev box and avoids a second +# apt source). Sentinel is binary presence, not a semver match. +if command -v docker >/dev/null 2>&1; then + log "docker already installed" + docker_needs_install=0 +else + docker_needs_install=1 +fi + +if [ "${#need_pkgs[@]}" -eq 0 ] && [ "$docker_needs_install" -eq 0 ]; then log "nothing to do" exit 0 fi -# ── Ensure HashiCorp apt keyring ───────────────────────────────────────────── -if [ ! -f "$HASHICORP_KEYRING" ]; then - log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}" - tmpkey="$(mktemp)" - trap 'rm -f "$tmpkey"' EXIT - curl -fsSL "$HASHICORP_GPG_URL" -o "$tmpkey" \ - || die "failed to fetch HashiCorp GPG key from ${HASHICORP_GPG_URL}" - gpg --dearmor -o "$HASHICORP_KEYRING" < "$tmpkey" \ - || die "failed to dearmor HashiCorp GPG key" - chmod 0644 "$HASHICORP_KEYRING" - rm -f "$tmpkey" - trap - EXIT -else - log "HashiCorp apt keyring already present" +# ── HashiCorp apt setup + nomad/vault install (skipped if both at pin) ─────── +if [ "${#need_pkgs[@]}" -gt 0 ]; then + # Ensure HashiCorp apt keyring. + if [ ! -f "$HASHICORP_KEYRING" ]; then + log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}" + tmpkey="$(mktemp)" + trap 'rm -f "$tmpkey"' EXIT + curl -fsSL "$HASHICORP_GPG_URL" -o "$tmpkey" \ + || die "failed to fetch HashiCorp GPG key from ${HASHICORP_GPG_URL}" + gpg --dearmor -o "$HASHICORP_KEYRING" < "$tmpkey" \ + || die "failed to dearmor HashiCorp GPG key" + chmod 0644 "$HASHICORP_KEYRING" + rm -f "$tmpkey" + trap - EXIT + else + log "HashiCorp apt keyring already present" + fi + + # Ensure HashiCorp apt sources list. + desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main" + if [ ! -f "$HASHICORP_SOURCES" ] \ + || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then + log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}" + printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES" + apt_update_needed=1 + else + log "HashiCorp apt sources list already present" + apt_update_needed=0 + fi + + # Install the pinned versions. + if [ "$apt_update_needed" -eq 1 ]; then + log "running apt-get update" + DEBIAN_FRONTEND=noninteractive apt-get update -qq \ + || die "apt-get update failed" + fi + + log "installing ${need_pkgs[*]}" + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + "${need_pkgs[@]}" \ + || die "apt-get install ${need_pkgs[*]} failed" + + # Verify pinned versions. + final_nomad="$(_installed_version nomad)" + if [ "$final_nomad" != "$NOMAD_VERSION" ]; then + die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'" + fi + final_vault="$(_installed_version vault)" + if [ "$final_vault" != "$VAULT_VERSION" ]; then + die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'" + fi fi -# ── Ensure HashiCorp apt sources list ──────────────────────────────────────── -desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main" -if [ ! -f "$HASHICORP_SOURCES" ] \ - || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then - log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}" - printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES" - apt_update_needed=1 -else - log "HashiCorp apt sources list already present" - apt_update_needed=0 +# ── Install docker.io + enable+start docker.service (if missing) ───────────── +# Nomad's docker task driver reports Healthy=false without a running +# dockerd. On the factory dev box docker was pre-installed so Step 0's +# cluster-up passed silently; on a fresh LXC the first docker-driver +# jobspec (forgejo, Step 1) fails placement with "missing drivers". +# Install from Ubuntu's default apt repo — no second source, no pinning. +# `docker.service` ships with the package; `enable --now` is idempotent. +if [ "$docker_needs_install" -eq 1 ]; then + log "installing docker.io" + DEBIAN_FRONTEND=noninteractive apt-get install -y -q docker.io \ + || die "apt-get install docker.io failed" + log "enabling + starting docker.service" + systemctl enable --now docker \ + || die "failed to enable/start docker.service" + command -v docker >/dev/null 2>&1 \ + || die "post-install check: docker binary still not found" fi -# ── Install the pinned versions ────────────────────────────────────────────── -if [ "$apt_update_needed" -eq 1 ]; then - log "running apt-get update" - DEBIAN_FRONTEND=noninteractive apt-get update -qq \ - || die "apt-get update failed" -fi - -log "installing ${need_pkgs[*]}" -DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - "${need_pkgs[@]}" \ - || die "apt-get install ${need_pkgs[*]} failed" - -# ── Verify ─────────────────────────────────────────────────────────────────── -final_nomad="$(_installed_version nomad)" -if [ "$final_nomad" != "$NOMAD_VERSION" ]; then - die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'" -fi -final_vault="$(_installed_version vault)" -if [ "$final_vault" != "$VAULT_VERSION" ]; then - die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'" -fi - -log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} installed successfully" +log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} + docker installed successfully" diff --git a/tests/disinto-init-nomad.bats b/tests/disinto-init-nomad.bats index 8616e2d..84cfa10 100644 --- a/tests/disinto-init-nomad.bats +++ b/tests/disinto-init-nomad.bats @@ -34,7 +34,7 @@ setup_file() { [[ "$output" == *"nomad backend: default (cluster-up; jobs deferred to Step 1)"* ]] # All nine cluster-up dry-run steps, in order. - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"[dry-run] Step 2/9: write + enable nomad.service (NOT started)"* ]] [[ "$output" == *"[dry-run] Step 3/9: write + enable vault.service + vault.hcl (NOT started)"* ]] [[ "$output" == *"[dry-run] Step 4/9: create host-volume dirs under /srv/disinto/"* ]] @@ -57,7 +57,7 @@ setup_file() { # of the migration will branch on $empty to gate job deployment; today # both modes invoke the same cluster-up dry-run. [[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]] - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"Dry run complete — no changes made."* ]] } @@ -69,7 +69,7 @@ setup_file() { # Negative assertion: the nomad dispatcher banners must be absent. [[ "$output" != *"nomad backend:"* ]] - [[ "$output" != *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" != *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] # Positive assertion: docker-path output still appears — the existing # docker dry-run printed "=== disinto init ===" before listing the @@ -88,7 +88,7 @@ setup_file() { run "$DISINTO_BIN" init placeholder/repo --backend nomad --dry-run [ "$status" -eq 0 ] [[ "$output" == *"nomad backend: default"* ]] - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] } # ── Flag validation ────────────────────────────────────────────────────────── @@ -118,7 +118,7 @@ setup_file() { run "$DISINTO_BIN" init --backend=nomad --empty --dry-run [ "$status" -eq 0 ] [[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]] - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] # The bug symptom must be absent — backend was misdetected as docker # when --backend=nomad got swallowed as repo_url. [[ "$output" != *"--empty is only valid with --backend=nomad"* ]] @@ -128,7 +128,7 @@ setup_file() { run "$DISINTO_BIN" init --backend nomad --dry-run [ "$status" -eq 0 ] [[ "$output" == *"nomad backend: default"* ]] - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] } @test "disinto init (no args) still errors with 'repo URL required'" {