From fce4d83176a3e155619cfbfecfd1c5be8ca35565 Mon Sep 17 00:00:00 2001
From: Agent
Date: Thu, 16 Apr 2026 10:23:16 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20[nomad-step-1]=20S1.2=20=E2=80=94=20add?=
 =?UTF-8?q?=20lib/init/nomad/deploy.sh=20(dependency-ordered=20nomad=20job?=
 =?UTF-8?q?=20run=20+=20wait)=20(#841)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 lib/init/nomad/deploy.sh | 260 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100755 lib/init/nomad/deploy.sh

diff --git a/lib/init/nomad/deploy.sh b/lib/init/nomad/deploy.sh
new file mode 100755
index 0000000..f6a48a9
--- /dev/null
+++ b/lib/init/nomad/deploy.sh
@@ -0,0 +1,260 @@
+#!/usr/bin/env bash
+# =============================================================================
+# lib/init/nomad/deploy.sh — Dependency-ordered Nomad job deploy + wait
+#
+# Runs a list of jobspecs in order, waiting for each to reach "running" state
+# before starting the next. Step-1 uses it for forgejo-only; Steps 3–6 extend
+# the job list.
+#
+# Usage:
+#   lib/init/nomad/deploy.sh <jobname> [jobname2 ...] [--dry-run]
+#
+# Arguments:
+#   jobname — basename of jobspec (without .hcl), resolved to
+#             ${REPO_ROOT}/nomad/jobs/<jobname>.hcl
+#
+# Environment:
+#   REPO_ROOT              — absolute path to repo root (defaults to three
+#                            directories above this script's directory)
+#   JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 120)
+#
+# Requires (unless --dry-run): nomad, jq
+#
+# Exit codes:
+#   0  success (all jobs deployed and running, or dry-run completed)
+#   1  failure (validation error, timeout, failed job, or nomad command
+#      failure)
+#
+# Idempotency:
+#   Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
+#   already running print "[deploy] <jobname> already running" and continue.
+# =============================================================================
+set -euo pipefail
+
+# ── Configuration ────────────────────────────────────────────────────────────
+SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# This script lives in lib/init/nomad/, so the repo root is three levels up.
+REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
+JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-120}"
+POLL_INTERVAL_SECS=5
+
+DRY_RUN=0
+
+log() { printf '[deploy] %s\n' "$*" >&2; }
+die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }
+
+# ── Parse arguments ───────────────────────────────────────────────────────────
+JOBS=()
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -*)
+      die "Unknown option: $1"
+      ;;
+    *)
+      JOBS+=("$1")
+      shift
+      ;;
+  esac
+done
+
+if [ "${#JOBS[@]}" -eq 0 ]; then
+  die "Usage: $0 <jobname> [jobname2 ...] [--dry-run]"
+fi
+
+# A non-numeric timeout would make the poll loop's integer test error out and
+# report an immediate, misleading TIMEOUT, so reject it up front.
+case "$JOB_READY_TIMEOUT_SECS" in
+  ''|*[!0-9]*)
+    die "JOB_READY_TIMEOUT_SECS must be a non-negative integer, got: '${JOB_READY_TIMEOUT_SECS}'"
+    ;;
+esac
+
+# jq failures inside the poll loop are deliberately swallowed, so a missing
+# binary would otherwise only surface as an unexplained timeout.
+if [ "$DRY_RUN" -eq 0 ]; then
+  for required_cmd in nomad jq; do
+    command -v "$required_cmd" >/dev/null 2>&1 \
+      || die "required command not found: ${required_cmd}"
+  done
+fi
+
+# ── Helper: _dump_alloc_stderr <name> ─────────────────────────────────────────
+# Best-effort diagnostics: writes the last 50 stderr lines of each allocation
+# of job <name> to stderr. Always returns 0 so it never masks the caller's
+# own failure.
+#
+# NOTE(review): the jq path assumes allocations are nested under
+# .Evaluations[] in `nomad job status -json` output — confirm against the
+# Nomad version in use.
+_dump_alloc_stderr() {
+  local job_name="$1"
+  local alloc_ids alloc_id
+
+  alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \
+    | jq -r '.Evaluations[].Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids=""
+
+  if [ -z "$alloc_ids" ]; then
+    log "no allocations found for job '${job_name}'"
+    return 0
+  fi
+
+  while IFS= read -r alloc_id; do
+    [ -n "$alloc_id" ] || continue
+    log "--- Allocation ${alloc_id} logs (stderr) ---"
+    # Fold nomad's own errors into the output so a failed log fetch is
+    # visible instead of silently empty; stdin is detached so nomad cannot
+    # consume the ID list feeding this loop.
+    nomad alloc logs -stderr "$alloc_id" < /dev/null 2>&1 | tail -n 50 >&2 || true
+  done <<< "$alloc_ids"
+
+  return 0
+}
+
+# ── Helper: _wait_job_running <name> <timeout> ───────────────────────────────
+# Polls `nomad job status -json <name>` every POLL_INTERVAL_SECS seconds until:
+#   - Status == "running", OR
+#   - the job is in a terminal state (complete/dead/failed) and either all of
+#     its allocations are running or none of them has failed
+#
+# Returns:
+#   0  job is running, or is terminal without failed allocations
+#   1  job failed, or <timeout> seconds elapsed; the last 50 lines of stderr
+#      from all allocations are printed first
+#
+# This is a named, reusable helper for future init scripts.
+_wait_job_running() {
+  local job_name="$1"
+  local timeout="$2"
+  local elapsed=0
+  local status_json status counts
+  local allocs_total allocs_running allocs_failed
+
+  log "waiting for job '${job_name}' to become running (timeout: ${timeout}s)..."
+
+  while [ "$elapsed" -lt "$timeout" ]; do
+    # A failed lookup (the job may not be registered yet) or unparsable
+    # output leaves status empty, which just means "poll again".
+    status_json=$(nomad job status -json "$job_name" 2>/dev/null) || status_json=""
+    status=$(printf '%s' "$status_json" | jq -r '.Status' 2>/dev/null) || status=""
+
+    case "$status" in
+      running)
+        log "job '${job_name}' is now running"
+        return 0
+        ;;
+      complete|dead|failed)
+        # One jq pass yields "<total> <running> <failed>" (tab-separated).
+        # The total is the number of allocations (array length), not the
+        # sum of each allocation object's key count.
+        counts=$(printf '%s' "$status_json" | jq -r '
+          [.Evaluations[].Allocations[]?]
+          | [ length,
+              (map(select(.Status == "running")) | length),
+              (map(select(.Status == "failed")) | length) ]
+          | @tsv' 2>/dev/null) || counts=""
+        read -r allocs_total allocs_running allocs_failed <<< "${counts:-0 0 0}"
+
+        if [ "$allocs_running" -gt 0 ]; then
+          log "job '${job_name}' has ${allocs_running}/${allocs_total} allocations running"
+        fi
+
+        if [ "$allocs_running" -gt 0 ] && [ "$allocs_running" -lt "$allocs_total" ]; then
+          : # some, but not all, allocations are up — keep polling
+        elif [ "$allocs_running" -eq 0 ] \
+          && { [ "$status" = "failed" ] || [ "$allocs_failed" -gt 0 ]; }; then
+          log "job '${job_name}' failed (status: ${status}, failed allocations: ${allocs_failed})"
+          log "showing last 50 lines of allocation logs (stderr):"
+          _dump_alloc_stderr "$job_name"
+          return 1
+        else
+          # All allocations running, or a terminal job with no failed
+          # allocations (e.g. a finished batch job): treat as success.
+          log "job '${job_name}' reached terminal state: ${status}"
+          return 0
+        fi
+        ;;
+      '')
+        # Status unavailable — wait silently.
+        ;;
+      *)
+        log "job '${job_name}' status: ${status} (waiting...)"
+        ;;
+    esac
+
+    sleep "$POLL_INTERVAL_SECS"
+    elapsed=$((elapsed + POLL_INTERVAL_SECS))
+  done
+
+  log "TIMEOUT: job '${job_name}' did not reach running state within ${timeout}s"
+  log "showing last 50 lines of allocation logs (stderr):"
+  _dump_alloc_stderr "$job_name"
+  return 1
+}
+
+# ── Helper: _job_is_running <name> ────────────────────────────────────────────
+# Returns 0 when the job-level "Status = ..." line of `nomad job status <name>`
+# reads "running", non-zero otherwise (including when the lookup fails).
+_job_is_running() {
+  local job_name="$1"
+  local status
+
+  # Scan every line for the Status field rather than looking only at the
+  # first line of output. Only the first match counts, in case later
+  # sections print a "Status = ..." line of their own. awk reads all of its
+  # input, so nomad is never hit by SIGPIPE under pipefail.
+  status=$(nomad job status "$job_name" 2>/dev/null \
+    | awk '$1 == "Status" && $2 == "=" && !seen { print $3; seen = 1 }') || status=""
+
+  [ "$status" = "running" ]
+}
+
+# ── Main: deploy each job in order ───────────────────────────────────────────
+for job_name in "${JOBS[@]}"; do
+  jobspec_path="${REPO_ROOT}/nomad/jobs/${job_name}.hcl"
+
+  if [ ! -f "$jobspec_path" ]; then
+    die "Jobspec not found: ${jobspec_path}"
+  fi
+
+  if [ "$DRY_RUN" -eq 1 ]; then
+    log "[dry-run] nomad job validate ${jobspec_path}"
+    log "[dry-run] nomad job run -detach ${jobspec_path}"
+    log "[dry-run] (would wait for '${job_name}' to become running for ${JOB_READY_TIMEOUT_SECS}s)"
+    continue
+  fi
+
+  log "processing job: ${job_name}"
+
+  # 1. Validate the jobspec
+  log "validating: ${jobspec_path}"
+  if ! nomad job validate "$jobspec_path"; then
+    die "validation failed for: ${jobspec_path}"
+  fi
+
+  # 2. Check if already running (idempotency)
+  if _job_is_running "$job_name"; then
+    log "${job_name} already running"
+    continue
+  fi
+
+  # 3. Run the job (idempotent registration)
+  log "running: ${jobspec_path}"
+  if ! nomad job run -detach "$jobspec_path"; then
+    die "failed to run job: ${job_name}"
+  fi
+
+  # 4. Wait for running state
+  if ! _wait_job_running "$job_name" "$JOB_READY_TIMEOUT_SECS"; then
+    die "job '${job_name}' did not reach running state"
+  fi
+done
+
+if [ "$DRY_RUN" -eq 1 ]; then
+  log "dry-run complete"
+fi
+
+exit 0