#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/deploy.sh — Dependency-ordered Nomad job deploy + wait
#
# Runs a list of jobspecs in order, waiting for each to reach "running" state
# before starting the next. Step-1 uses it for forgejo-only; Steps 3–6 extend
# the job list.
#
# Usage:
#   lib/init/nomad/deploy.sh <jobname> [jobname2 ...] [--dry-run]
#
# Arguments:
#   jobname — basename of jobspec (without .hcl), resolved to
#             ${REPO_ROOT}/nomad/jobs/<jobname>.hcl
#
# Environment:
#   REPO_ROOT              — absolute path to repo root (when unset, derived
#                            from this script's own location)
#   JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 120)
#
# Exit codes:
#   0  success (all jobs deployed and running, or dry-run completed)
#   1  failure (validation error, timeout, or nomad command failure)
#
# Idempotency:
#   Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
#   already running print "[deploy] <name> already running" and continue.
# =============================================================================
set -euo pipefail

# ── Logging helpers ──────────────────────────────────────────────────────────
# Both helpers write to stderr. die() additionally terminates the script with
# exit status 1.
log() { printf '[deploy] %s\n' "$*" >&2; }
die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }

# ── Configuration ────────────────────────────────────────────────────────────
# Directory that contains this script (lib/init/nomad per the file header).
SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# The script sits three directories below the repository root
# (lib/init/nomad), so the default climbs three levels. An exported REPO_ROOT
# always takes precedence.
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"

# Per-job poll budget in seconds. It is compared numerically later on, so a
# non-integer value is rejected here instead of degrading into an instant
# timeout.
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-120}"
if ! [[ "$JOB_READY_TIMEOUT_SECS" =~ ^[0-9]+$ ]]; then
  die "JOB_READY_TIMEOUT_SECS must be a non-negative integer (got '${JOB_READY_TIMEOUT_SECS}')"
fi

# Set to 1 by --dry-run.
DRY_RUN=0
# ── Parse arguments ───────────────────────────────────────────────────────────
# JOBS collects the job names in the order given on the command line.
JOBS=()

# _parse_args <args...>
# Appends every non-option word to JOBS and sets DRY_RUN=1 when --dry-run
# appears anywhere in the list. Any other word starting with '-' aborts the
# script via die. Runs as a function so the script's own positional
# parameters are left untouched.
_parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --dry-run)
        DRY_RUN=1
        ;;
      -*)
        die "Unknown option: ${arg}"
        ;;
      *)
        JOBS+=("$arg")
        ;;
    esac
  done
}

_parse_args "$@"

# At least one job name is required.
if [ "${#JOBS[@]}" -eq 0 ]; then
  die "Usage: $0 <jobname> [jobname2 ...] [--dry-run]"
fi
# ── Helper: _dump_alloc_logs <name> ──────────────────────────────────────────
# Best-effort diagnostics: prints the last 50 stderr lines of every allocation
# ID found in `nomad job status -json <name>`. Lookup or log-fetch failures
# are swallowed on purpose so that diagnostics never mask the real error.
_dump_alloc_logs() {
  local job_name="$1"
  local alloc_ids alloc_id

  log "showing last 50 lines of allocation logs (stderr):"

  alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \
    | jq -r '.Evaluations[].Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids=""

  while IFS= read -r alloc_id; do
    [[ -n "$alloc_id" ]] || continue
    log "--- Allocation ${alloc_id} logs (stderr) ---"
    # stdin is detached so the command cannot eat the ID list feeding the loop.
    nomad alloc logs -stderr "$alloc_id" </dev/null 2>/dev/null | tail -n 50 || true
  done <<<"$alloc_ids"
}

# ── Helper: _wait_job_running <name> <timeout> ───────────────────────────────
# Polls `nomad job status -json <name>` every 5 seconds.
#
# Arguments:
#   $1  job name
#   $2  timeout in seconds (integer)
#
# Returns 0 when:
#   - .Status == "running", OR
#   - .Status is complete/dead/failed but every listed allocation is running,
#     OR
#   - .Status == "complete" with no running allocations
# Returns 1 when:
#   - .Status is dead/failed and no allocation is running, OR
#   - the timeout elapses (a partially running job keeps polling until then)
# On either failure the allocation stderr logs are dumped first.
#
# The timeout is counted in poll intervals; time spent inside the nomad/jq
# calls themselves is not included.
#
# NOTE(review): the jq paths assume allocations live under
# .Evaluations[].Allocations[] and carry a .Status field — confirm against the
# JSON your Nomad version emits.
#
# This is a named, reusable helper for future init scripts.
_wait_job_running() {
  local job_name="$1"
  local timeout="$2"
  local -r poll_interval=5
  local elapsed=0
  local status_json status allocs_running allocs_total

  log "waiting for job '${job_name}' to become running (timeout: ${timeout}s)..."

  while [ "$elapsed" -lt "$timeout" ]; do
    # The job may not be registered yet, or the output may not be parseable
    # yet — either way, try again on the next tick.
    if ! status_json=$(nomad job status -json "$job_name" 2>/dev/null) \
      || ! status=$(printf '%s' "$status_json" | jq -r '.Status' 2>/dev/null); then
      sleep "$poll_interval"
      elapsed=$((elapsed + poll_interval))
      continue
    fi

    case "$status" in
      running)
        log "job '${job_name}' is now running"
        return 0
        ;;
      complete | dead | failed)
        # Count allocations (not the keys inside each allocation object).
        allocs_running=$(printf '%s' "$status_json" \
          | jq '[.Evaluations[].Allocations[]? | select(.Status == "running")] | length' 2>/dev/null) \
          || allocs_running=0
        allocs_total=$(printf '%s' "$status_json" \
          | jq '[.Evaluations[].Allocations[]?] | length' 2>/dev/null) \
          || allocs_total=0

        if [ "$allocs_running" -gt 0 ]; then
          log "job '${job_name}' has ${allocs_running}/${allocs_total} allocations running"
          if [ "$allocs_running" -lt "$allocs_total" ]; then
            # Some, but not all, allocations are up — keep waiting.
            sleep "$poll_interval"
            elapsed=$((elapsed + poll_interval))
            continue
          fi
          return 0
        fi

        log "job '${job_name}' reached terminal state: ${status}"
        if [ "$status" = "complete" ]; then
          return 0
        fi
        # dead/failed with nothing running is a failed deploy, not a success.
        _dump_alloc_logs "$job_name"
        return 1
        ;;
      *)
        log "job '${job_name}' status: ${status} (waiting...)"
        ;;
    esac

    sleep "$poll_interval"
    elapsed=$((elapsed + poll_interval))
  done

  log "TIMEOUT: job '${job_name}' did not reach running state within ${timeout}s"
  _dump_alloc_logs "$job_name"
  return 1
}
# ── Helper: _job_is_running <name> ───────────────────────────────────────────
# Returns 0 only when the "Status = ..." line of `nomad job status <name>`
# reads exactly "running". The whole output is scanned for that line, so a job
# whose *name* happens to contain "running" is not mistaken for a live job.
# A failing nomad call (e.g. job not registered yet) yields a non-zero return.
_job_is_running() {
  local name="$1"
  local -r status_re='^Status[[:space:]]*=[[:space:]]*([^[:space:]]+)'
  local line status=""

  while IFS= read -r line; do
    if [[ "$line" =~ $status_re ]]; then
      status="${BASH_REMATCH[1]}"
      break
    fi
  done < <(nomad job status "$name" 2>/dev/null)

  [[ "$status" == "running" ]]
}

# ── Helper: _deploy_job <name> ───────────────────────────────────────────────
# Deploys one job from ${REPO_ROOT}/nomad/jobs/<name>.hcl:
#   1. validate the jobspec
#   2. skip if the job is already running (idempotency)
#   3. register it with `nomad job run -detach`
#   4. wait up to JOB_READY_TIMEOUT_SECS for it to run
# In dry-run mode only the planned commands are logged. Any failure aborts the
# whole script through die.
_deploy_job() {
  local job_name="$1"
  local jobspec_path="${REPO_ROOT}/nomad/jobs/${job_name}.hcl"

  if [ ! -f "$jobspec_path" ]; then
    die "Jobspec not found: ${jobspec_path}"
  fi

  if [ "$DRY_RUN" -eq 1 ]; then
    log "[dry-run] nomad job validate ${jobspec_path}"
    log "[dry-run] nomad job run -detach ${jobspec_path}"
    log "[dry-run] (would wait for '${job_name}' to become running for ${JOB_READY_TIMEOUT_SECS}s)"
    return 0
  fi

  log "processing job: ${job_name}"

  # 1. Validate the jobspec
  log "validating: ${jobspec_path}"
  if ! nomad job validate "$jobspec_path"; then
    die "validation failed for: ${jobspec_path}"
  fi

  # 2. Check if already running (idempotency)
  if _job_is_running "$job_name"; then
    log "${job_name} already running"
    return 0
  fi

  # 3. Run the job (idempotent registration)
  log "running: ${jobspec_path}"
  if ! nomad job run -detach "$jobspec_path"; then
    die "failed to run job: ${job_name}"
  fi

  # 4. Wait for running state
  if ! _wait_job_running "$job_name" "$JOB_READY_TIMEOUT_SECS"; then
    die "timeout waiting for job '${job_name}' to become running"
  fi
}

# ── Main: deploy each job in order ───────────────────────────────────────────
for job_name in "${JOBS[@]}"; do
  _deploy_job "$job_name"
done

# Falling off the end leaves exit status 0: every failure path above has
# already exited through die.
if [[ "$DRY_RUN" -eq 1 ]]; then
  log "dry-run complete"
fi