From fce4d83176a3e155619cfbfecfd1c5be8ca35565 Mon Sep 17 00:00:00 2001
From: Agent <agent@example.com>
Date: Thu, 16 Apr 2026 10:23:16 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20[nomad-step-1]=20S1.2=20=E2=80=94=20add?=
 =?UTF-8?q?=20lib/init/nomad/deploy.sh=20(dependency-ordered=20nomad=20job?=
 =?UTF-8?q?=20run=20+=20wait)=20(#841)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/init/nomad/deploy.sh | 195 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100755 lib/init/nomad/deploy.sh
diff --git a/lib/init/nomad/deploy.sh b/lib/init/nomad/deploy.sh
new file mode 100755
index 0000000..f6a48a9
--- /dev/null
+++ b/lib/init/nomad/deploy.sh
@@ -0,0 +1,195 @@
+#!/usr/bin/env bash
+# =============================================================================
+# lib/init/nomad/deploy.sh — Dependency-ordered Nomad job deploy + wait
+#
+# Runs a list of jobspecs in order, waiting for each to reach "running" state
+# before starting the next. Step-1 uses it for forgejo-only; Steps 3–6 extend
+# the job list.
+#
+# Usage:
+#   lib/init/nomad/deploy.sh <jobname> [jobname2 ...] [--dry-run]
+#
+# Arguments:
+#   jobname  — basename of jobspec (without .hcl), resolved to
+#              ${REPO_ROOT}/nomad/jobs/<jobname>.hcl
+#
+# Environment:
+#   REPO_ROOT              — absolute path to repo root (derived from this
+#                            script's location when unset)
+#   JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 120)
+#
+# Exit codes:
+#   0  success (all jobs deployed and running, or dry-run completed)
+#   1  failure (validation error, timeout, or nomad command failure)
+#
+# Idempotency:
+#   Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
+#   already running print "[deploy] <name> already running" and continue.
+# =============================================================================
+set -euo pipefail
+
+# ── Configuration ────────────────────────────────────────────────────────────
+# This script lives in <repo>/lib/init/nomad: the repo root is three levels up.
+SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
+JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-120}"  # seconds per job
+DRY_RUN=0  # switched to 1 by --dry-run
+
+log() { printf '[deploy] %s\n' "$*" 1>&2; }   # tagged progress line on stderr
+die() { log "ERROR: $*"; exit 1; }             # tagged error on stderr, exit 1
+
+# ── Parse arguments ───────────────────────────────────────────────────────────
+# Every word is consumed left to right:
+#   --dry-run        sets DRY_RUN (accepted at any position)
+#   other -<word>    rejected via die
+#   anything else    appended to JOBS, preserving command-line order
+JOBS=()
+while (( $# > 0 )); do
+  if [[ "$1" == "--dry-run" ]]; then
+    DRY_RUN=1
+  elif [[ "$1" == -* ]]; then
+    die "Unknown option: $1"
+  else
+    JOBS+=("$1")
+  fi
+  shift
+done
+
+# Nothing to deploy without at least one job name.
+if (( ${#JOBS[@]} == 0 )); then
+  die "Usage: $0 <jobname> [jobname2 ...] [--dry-run]"
+fi
+
+# ── Helper: _wait_job_running <name> <timeout> ───────────────────────────────
+# Polls `nomad job status -json <name>` every 5 seconds until the job is up.
+#
+# Arguments:
+#   $1  job name
+#   $2  timeout in seconds, compared against the accumulated sleep time
+# Returns:
+#   0  Status is "running"; or Status is complete/dead/failed and every
+#      allocation is running; or Status is "complete" with none running
+#   1  Status is dead/failed with no allocation running, or the timeout
+#      elapsed. The last 50 stderr lines of each allocation are printed first.
+#
+# NOTE(review): the `.Evaluations[].Allocations[]` jq path is kept as written;
+# confirm it against the `-json` output of the Nomad version in use.
+#
+# This is a named, reusable helper for future init scripts.
+_wait_job_running() {
+  local job_name="$1"
+  local timeout="$2"
+  local elapsed=0
+  local status_json status allocs_running allocs_total alloc_ids alloc_id
+  # Reported when the loop ends without success; replaced on a hard failure.
+  local reason="TIMEOUT: job '${job_name}' did not reach running state within ${timeout}s"
+
+  log "waiting for job '${job_name}' to become running (timeout: ${timeout}s)..."
+
+  while [ "$elapsed" -lt "$timeout" ]; do
+    # An empty status means "not registered yet" or "unparsable": retry quietly.
+    status_json=$(nomad job status -json "$job_name" 2>/dev/null) || status_json=""
+    status=$(printf '%s' "$status_json" | jq -r '.Status' 2>/dev/null) || status=""
+
+    case "$status" in
+      running)
+        log "job '${job_name}' is now running"
+        return 0
+        ;;
+      complete|dead|failed)
+        # The job status alone is not trusted: count the allocations.
+        allocs_running=$(printf '%s' "$status_json" \
+          | jq '[.Evaluations[].Allocations[]? | select(.Status == "running")] | length' 2>/dev/null) || allocs_running=0
+        # Number of allocations (not the summed key count of each object).
+        allocs_total=$(printf '%s' "$status_json" \
+          | jq '[.Evaluations[].Allocations[]?] | length' 2>/dev/null) || allocs_total=0
+
+        if [ "$allocs_running" -gt 0 ]; then
+          log "job '${job_name}' has ${allocs_running}/${allocs_total} allocations running"
+          if [ "$allocs_running" -ge "$allocs_total" ]; then
+            log "job '${job_name}' reached terminal state: ${status}"
+            return 0
+          fi
+          # Only partly up: fall through to the sleep below and poll again.
+        elif [ "$status" = "complete" ]; then
+          log "job '${job_name}' reached terminal state: ${status}"
+          return 0
+        else
+          # Nothing runs: fail now so dependent jobs are not deployed on top.
+          reason="FAILED: job '${job_name}' is ${status} with no running allocations"
+          break
+        fi
+        ;;
+      "") ;;  # no status yet, see above
+      *)
+        log "job '${job_name}' status: ${status} (waiting...)"
+        ;;
+    esac
+
+    sleep 5
+    elapsed=$((elapsed + 5))
+  done
+
+  log "$reason"
+  log "showing last 50 lines of allocation logs (stderr):"
+
+  alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \
+    | jq -r '.Evaluations[].Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids=""
+
+  # Unquoted on purpose: jq emits one ID per line, one loop pass per ID.
+  for alloc_id in $alloc_ids; do
+    log "--- Allocation ${alloc_id} logs (stderr) ---"
+    # Best effort; `-short` is not an `alloc logs` flag and made this fail.
+    nomad alloc logs -stderr "$alloc_id" 2>/dev/null | tail -n 50 || true
+  done
+
+  return 1
+}
+
+# ── Main: deploy each job in order ───────────────────────────────────────────
+# Per job: locate the jobspec, validate it, skip it when Nomad already reports
+# the job as running, otherwise register it and block until it is up. The
+# first failure aborts the whole run via die, so later jobs never start.
+for job_name in "${JOBS[@]}"; do
+  jobspec_path="${REPO_ROOT}/nomad/jobs/${job_name}.hcl"
+
+  if [ ! -f "$jobspec_path" ]; then
+    die "Jobspec not found: ${jobspec_path}"
+  fi
+
+  # Dry run: describe the steps without invoking nomad.
+  if [ "$DRY_RUN" -eq 1 ]; then
+    log "[dry-run] nomad job validate ${jobspec_path}"
+    log "[dry-run] nomad job run -detach ${jobspec_path}"
+    log "[dry-run] (would wait for '${job_name}' to become running for ${JOB_READY_TIMEOUT_SECS}s)"
+    continue
+  fi
+
+  log "processing job: ${job_name}"
+
+  # 1. Validate the jobspec
+  log "validating: ${jobspec_path}"
+  nomad job validate "$jobspec_path" || die "validation failed for: ${jobspec_path}"
+
+  # 2. Check if already running (idempotency). The first line of the status
+  #    output is the "ID = <name>" field, so the "Status = ..." line has to be
+  #    matched explicitly. A failed query (job unknown) leaves the text empty.
+  job_status=$(nomad job status "$job_name" 2>/dev/null || true)
+  if grep -Eqi '^Status[[:space:]]*=[[:space:]]*running[[:space:]]*$' <<<"$job_status"; then
+    log "${job_name} already running"
+    continue
+  fi
+
+  # 3. Run the job (idempotent registration)
+  log "running: ${jobspec_path}"
+  nomad job run -detach "$jobspec_path" || die "failed to run job: ${job_name}"
+
+  # 4. Wait for running state
+  _wait_job_running "$job_name" "$JOB_READY_TIMEOUT_SECS" || die "timeout waiting for job '${job_name}' to become running"
+done
+
+if [ "$DRY_RUN" -eq 1 ]; then
+  log "dry-run complete"
+fi
+
+exit 0