disinto/lib/init/nomad/deploy.sh
Agent fce4d83176
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/push/nomad-validate Pipeline was successful
ci/woodpecker/pr/ci Pipeline was successful
ci/woodpecker/pr/nomad-validate Pipeline was successful
fix: [nomad-step-1] S1.2 — add lib/init/nomad/deploy.sh (dependency-ordered nomad job run + wait) (#841)
2026-04-16 10:23:16 +00:00

195 lines
6.3 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/deploy.sh — Dependency-ordered Nomad job deploy + wait
#
# Runs a list of jobspecs in order, waiting for each to reach "running" state
# before starting the next. Step-1 uses it for forgejo-only; Steps 3-6 extend
# the job list.
#
# Usage:
# lib/init/nomad/deploy.sh <jobname> [jobname2 ...] [--dry-run]
#
# Arguments:
# jobname — basename of jobspec (without .hcl), resolved to
# ${REPO_ROOT}/nomad/jobs/<jobname>.hcl
#
# Environment:
# REPO_ROOT — absolute path to repo root (default: derived from
# this script's own location)
# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 120)
#
# Exit codes:
# 0 success (all jobs deployed and running, or dry-run completed)
# 1 failure (validation error, timeout, or nomad command failure)
#
# Idempotency:
# Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
# already running print "[deploy] <name> already running" and continue.
# =============================================================================
set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────
# Directory that contains this script (i.e. <repo>/lib/init/nomad).
SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Repo root. This script sits three directories below it (lib/init/nomad), so
# the default must climb three levels; stopping at two would resolve to
# <repo>/lib and every jobspec lookup under ${REPO_ROOT}/nomad/jobs would fail.
# An explicit REPO_ROOT from the environment always wins.
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
# Per-job readiness timeout in seconds; overridable from the environment.
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-120}"
# Set to 1 by --dry-run during argument parsing.
DRY_RUN=0
# log <words...> — emit one "[deploy] "-prefixed line on stderr; the arguments
# are joined into a single message.
log() {
  local message="$*"
  printf '[deploy] %s\n' "$message" >&2
}
# die <words...> — emit one "[deploy] ERROR: "-prefixed line on stderr, then
# terminate the current shell with exit status 1.
die() {
  local reason="$*"
  printf '[deploy] ERROR: %s\n' "$reason" >&2
  exit 1
}
# ── Parse arguments ──────────────────────────────────────────────────────────
# Walk the positional parameters left to right, consuming each one:
#   --dry-run          → sets DRY_RUN=1 (accepted at any position)
#   anything else "-…" → fatal "Unknown option"
#   everything else    → appended to JOBS as a job name, order preserved
# At least one job name is required; otherwise the usage line is the error.
JOBS=()
until (( $# == 0 )); do
  if [[ "$1" == "--dry-run" ]]; then
    DRY_RUN=1
  elif [[ "$1" == -* ]]; then
    die "Unknown option: $1"
  else
    JOBS+=("$1")
  fi
  shift
done
(( ${#JOBS[@]} > 0 )) || die "Usage: $0 <jobname> [jobname2 ...] [--dry-run]"
# ── Helper: _dump_job_alloc_logs <name> ──────────────────────────────────────
# Best-effort diagnostics: for every allocation ID found in
# `nomad job status -json <name>`, print the last 50 lines of its stderr log.
# Progress markers go through log(); the log lines themselves go to stdout.
# Always returns 0 so it can never mask the caller's own failure status.
_dump_job_alloc_logs() {
  local job_name="$1"
  local alloc_ids alloc_id

  log "showing last 50 lines of allocation logs (stderr):"
  # NOTE(review): the jq path assumes allocations are nested under
  # .Evaluations[] in the -json payload — confirm against the Nomad version
  # in use. Any nomad/jq failure degrades to "no allocations".
  alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \
    | jq -r '.Evaluations[].Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids=""

  while IFS= read -r alloc_id; do
    [ -n "$alloc_id" ] || continue
    log "--- Allocation ${alloc_id} logs (stderr) ---"
    # `-short` is not an `alloc logs` flag (it belongs to `alloc status`) and
    # made this command fail silently; the trimming is done by tail instead.
    # stdin is detached so nomad cannot swallow the remaining IDs.
    nomad alloc logs -stderr "$alloc_id" </dev/null 2>/dev/null | tail -n 50 || true
  done <<<"$alloc_ids"
  return 0
}

# ── Helper: _wait_job_running <name> <timeout> ───────────────────────────────
# Polls `nomad job status -json <name>` every 5 seconds until the job counts
# as running.
#
# Arguments:
#   $1  job name as registered in Nomad
#   $2  timeout in seconds (only the sleep intervals are counted, so the real
#       wall-clock wait is slightly longer)
#
# Returns:
#   0  Status == "running", or the job reports a terminal status
#      (complete/dead/failed) while every one of its allocations is running
#   1  the job is in a terminal status with no running allocation, or the
#      timeout elapsed; in both cases the last 50 stderr lines of each
#      allocation are printed first
#
# A failing `nomad job status` call (e.g. job not registered yet) or an
# unparsable payload is treated as "not ready yet" and retried.
#
# This is a named, reusable helper for future init scripts.
_wait_job_running() {
  local job_name="$1"
  local timeout="$2"
  local poll_secs=5
  local elapsed=0
  local status_json status allocs_running allocs_total

  log "waiting for job '${job_name}' to become running (timeout: ${timeout}s)..."

  while [ "$elapsed" -lt "$timeout" ]; do
    # Job may not exist yet — keep waiting
    if ! status_json=$(nomad job status -json "$job_name" 2>/dev/null); then
      sleep "$poll_secs"
      elapsed=$((elapsed + poll_secs))
      continue
    fi

    if ! status=$(printf '%s' "$status_json" | jq -r '.Status' 2>/dev/null); then
      sleep "$poll_secs"
      elapsed=$((elapsed + poll_secs))
      continue
    fi

    case "$status" in
      running)
        log "job '${job_name}' is now running"
        return 0
        ;;
      complete|dead|failed)
        # Terminal job status: decide based on what the allocations are doing.
        # NOTE(review): same .Evaluations[].Allocations[] shape assumption as
        # in _dump_job_alloc_logs — confirm against the Nomad version in use.
        allocs_running=$(printf '%s' "$status_json" \
          | jq '[.Evaluations[].Allocations[]? | select(.Status == "running")] | length' 2>/dev/null) \
          || allocs_running=0
        # Count allocations. (Summing `length` of each allocation object would
        # add up their key counts and could never equal the running count.)
        allocs_total=$(printf '%s' "$status_json" \
          | jq '[.Evaluations[].Allocations[]?] | length' 2>/dev/null) \
          || allocs_total=0
        # Guard the integer comparisons below against non-numeric jq output.
        [[ "$allocs_running" =~ ^[0-9]+$ ]] || allocs_running=0
        [[ "$allocs_total" =~ ^[0-9]+$ ]] || allocs_total=0

        if [ "$allocs_running" -gt 0 ]; then
          log "job '${job_name}' has ${allocs_running}/${allocs_total} allocations running"
          if [ "$allocs_running" -ge "$allocs_total" ]; then
            return 0
          fi
          # Some, but not all, are running: fall through and poll again.
        else
          # Nothing is running and the job is terminal: this is a failed
          # deploy, not a success — report it instead of returning 0.
          log "job '${job_name}' reached terminal state: ${status}"
          _dump_job_alloc_logs "$job_name"
          return 1
        fi
        ;;
      *)
        log "job '${job_name}' status: ${status} (waiting...)"
        ;;
    esac

    sleep "$poll_secs"
    elapsed=$((elapsed + poll_secs))
  done

  # Timeout — print last 50 lines of alloc logs
  log "TIMEOUT: job '${job_name}' did not reach running state within ${timeout}s"
  _dump_job_alloc_logs "$job_name"
  return 1
}
# ── Main: deploy each job in order ───────────────────────────────────────────
# For every requested job, in the order given on the command line:
#   - require the jobspec file to exist (also enforced in dry-run mode)
#   - dry-run: only log what would be executed, then move on
#   - otherwise: validate, skip if Nomad already reports the job as running,
#     else register it detached and block until _wait_job_running succeeds
# The first failure aborts the whole run through die().
for job_name in "${JOBS[@]}"; do
  jobspec_path="${REPO_ROOT}/nomad/jobs/${job_name}.hcl"
  if [ ! -f "$jobspec_path" ]; then
    die "Jobspec not found: ${jobspec_path}"
  fi

  if [ "$DRY_RUN" -eq 1 ]; then
    log "[dry-run] nomad job validate ${jobspec_path}"
    log "[dry-run] nomad job run -detach ${jobspec_path}"
    log "[dry-run] (would wait for '${job_name}' to become running for ${JOB_READY_TIMEOUT_SECS}s)"
    continue
  fi

  log "processing job: ${job_name}"

  # 1. Validate the jobspec
  log "validating: ${jobspec_path}"
  if ! nomad job validate "$jobspec_path"; then
    die "validation failed for: ${jobspec_path}"
  fi

  # 2. Check if already running (idempotency)
  # The human-readable `nomad job status <job>` output is a "Key = value"
  # table whose first row is the job ID, so the Status row has to be picked
  # out explicitly; matching "running" against the first line would never see
  # the status and would false-positive on job names containing "running".
  # A failing lookup (unknown job, SIGPIPE after awk exits) yields an empty
  # status, which simply means "not running" here.
  job_status=$(nomad job status "$job_name" 2>/dev/null \
    | awk -F'=' '$1 ~ /^Status[[:space:]]*$/ { gsub(/[[:space:]]/, "", $2); print $2; exit }' \
    || true)
  if [ "$job_status" = "running" ]; then
    log "${job_name} already running"
    continue
  fi

  # 3. Run the job (idempotent registration)
  log "running: ${jobspec_path}"
  if ! nomad job run -detach "$jobspec_path"; then
    die "failed to run job: ${job_name}"
  fi

  # 4. Wait for running state
  if ! _wait_job_running "$job_name" "$JOB_READY_TIMEOUT_SECS"; then
    die "timeout waiting for job '${job_name}' to become running"
  fi
done

if [ "$DRY_RUN" -eq 1 ]; then
  log "dry-run complete"
fi
exit 0