#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/deploy.sh — Dependency-ordered Nomad job deploy + wait
#
# Runs a list of jobspecs in order, waiting for each to reach a healthy state
# before starting the next. Step 1 uses it for forgejo only; Steps 3-6 extend
# the job list.
#
# Usage:
#   lib/init/nomad/deploy.sh <jobname> [jobname2 ...] [--dry-run]
#
# Arguments:
#   jobname — basename of jobspec (without .hcl), resolved to
#             ${REPO_ROOT}/nomad/jobs/<jobname>.hcl
#
# Environment:
#   REPO_ROOT                    — absolute path to repo root (defaults to
#                                  parent of this script's parent directory)
#   JOB_READY_TIMEOUT_SECS       — poll timeout in seconds (default: 360)
#   JOB_READY_TIMEOUT_<JOBNAME>  — per-job timeout override (e.g.,
#                                  JOB_READY_TIMEOUT_FORGEJO=300)
#                                  Built-in: JOB_READY_TIMEOUT_CHAT=600
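#
# Examples (illustrative invocations; job names are the ones referenced in
# the defaults above):
#   lib/init/nomad/deploy.sh forgejo
#   lib/init/nomad/deploy.sh forgejo chat --dry-run
#   JOB_READY_TIMEOUT_FORGEJO=300 lib/init/nomad/deploy.sh forgejo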
#
# Exit codes:
#   0  success (all jobs deployed and healthy, or dry-run completed)
#   1  failure (validation error, or one or more jobs unhealthy after all
#      jobs submitted — deploy does NOT cascade-skip on timeout)
#
# Idempotency:
#   Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
#   already healthy print "[deploy] <name> already healthy" and continue.
# =============================================================================
set -euo pipefail
# ── Configuration ────────────────────────────────────────────────────────────
SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"
# Per-job built-in defaults (override with JOB_READY_TIMEOUT_<JOBNAME> env var)
JOB_READY_TIMEOUT_CHAT="${JOB_READY_TIMEOUT_CHAT:-600}"
DRY_RUN=0
FAILED_JOBS=() # jobs that timed out or failed deployment
log() { printf '[deploy] %s\n' "$*" >&2; }
die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }
# ── Parse arguments ───────────────────────────────────────────────────────────
JOBS=()
while [ $# -gt 0 ]; do
  case "$1" in
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -*)
      die "Unknown option: $1"
      ;;
    *)
      JOBS+=("$1")
      shift
      ;;
  esac
done

if [ "${#JOBS[@]}" -eq 0 ]; then
  die "Usage: $0 <jobname> [jobname2 ...] [--dry-run]"
fi
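# Note: flags and job names may be interleaved on the command line; e.g.
# `deploy.sh forgejo --dry-run chat` parses the same as
# `deploy.sh forgejo chat --dry-run`.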
# ── Helper: _dump_alloc_logs <job_name> ──────────────────────────────────────
# Prints the last 50 lines of stderr from each of a job's allocations.
# Shared by the failure and timeout paths of _wait_job_running.
_dump_alloc_logs() {
  local job_name="$1"
  local alloc_ids
  alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \
    | jq -r '.Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids=""
  if [ -n "$alloc_ids" ]; then
    for alloc_id in $alloc_ids; do
      log "--- Allocation ${alloc_id} logs (stderr) ---"
      nomad alloc logs -stderr -tail -n 50 "$alloc_id" 2>/dev/null || true
    done
  fi
}

# ── Helper: _wait_job_running <name> <timeout> ───────────────────────────────
# Polls `nomad deployment status -json <deployment-id>` until Status is
# "successful" (returns 0) or "failed" (returns 1).
#
# On deployment failure or timeout: prints the last 50 lines of stderr from
# the job's allocations and returns 1.
#
# This is a named, reusable helper for future init scripts.
_wait_job_running() {
  local job_name="$1"
  local timeout="$2"
  local elapsed=0
  log "waiting for job '${job_name}' to become healthy (timeout: ${timeout}s)..."
  # Get the latest deployment ID for this job (retry until available).
  # `nomad job deployments -json` returns an array, newest first; `// empty`
  # keeps $deployment_id empty (rather than the literal string "null") while
  # no deployment exists yet.
  local deployment_id=""
  local retry_count=0
  local max_retries=12
  while [ -z "$deployment_id" ] && [ "$retry_count" -lt "$max_retries" ]; do
    deployment_id=$(nomad job deployments -json "$job_name" 2>/dev/null \
      | jq -r '.[0].ID // empty' 2>/dev/null) || deployment_id=""
    if [ -z "$deployment_id" ]; then
      sleep 5
      retry_count=$((retry_count + 1))
    fi
  done
  if [ -z "$deployment_id" ]; then
    log "ERROR: no deployment found for job '${job_name}' after ${max_retries} attempts"
    return 1
  fi
  log "tracking deployment '${deployment_id}'..."
  while [ "$elapsed" -lt "$timeout" ]; do
    local deploy_status_json
    deploy_status_json=$(nomad deployment status -json "$deployment_id" 2>/dev/null) || {
      # Deployment may not exist yet — keep waiting
      sleep 5
      elapsed=$((elapsed + 5))
      continue
    }
    local status
    status=$(printf '%s' "$deploy_status_json" | jq -r '.Status' 2>/dev/null) || {
      sleep 5
      elapsed=$((elapsed + 5))
      continue
    }
    case "$status" in
      successful)
        log "${job_name} healthy after ${elapsed}s"
        return 0
        ;;
      failed)
        log "deployment '${deployment_id}' failed for job '${job_name}'"
        log "showing last 50 lines of allocation logs (stderr):"
        _dump_alloc_logs "$job_name"
        return 1
        ;;
      *)
        # Any other status ("running", "pending", ...) means still in progress
        log "deployment '${deployment_id}' status: ${status} (waiting for ${job_name}...)"
        ;;
    esac
    sleep 5
    elapsed=$((elapsed + 5))
  done
  # Timeout — print last 50 lines of alloc logs
  log "TIMEOUT: deployment '${deployment_id}' did not reach successful state within ${timeout}s"
  log "showing last 50 lines of allocation logs (stderr):"
  _dump_alloc_logs "$job_name"
  return 1
}
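
# A minimal reuse sketch for future init scripts (values illustrative):
#   _wait_job_running "forgejo" 300 || log "forgejo not healthy within 300s"
# The second argument is a plain seconds value; callers resolve any
# JOB_READY_TIMEOUT_<JOBNAME> override themselves (see the main loop below).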
# ── Helper: _run_post_deploy <job_name> ──────────────────────────────────────
# Runs post-deploy scripts for a job after it becomes healthy.
# Currently supports: forgejo → run forgejo-bootstrap.sh
#
# Args:
#   job_name — name of the deployed job
#
# Returns:
#   0 on success (script ran or not applicable)
#   1 on failure
# ─────────────────────────────────────────────────────────────────────────────
_run_post_deploy() {
  local job_name="$1"
  local post_deploy_script
  case "$job_name" in
    forgejo)
      post_deploy_script="${SCRIPT_ROOT}/forgejo-bootstrap.sh"
      if [ -x "$post_deploy_script" ]; then
        log "running post-deploy script for ${job_name}"
        if ! "$post_deploy_script"; then
          log "ERROR: post-deploy script failed for ${job_name}"
          return 1
        fi
        log "post-deploy script completed for ${job_name}"
      else
        log "no post-deploy script found for ${job_name}, skipping"
      fi
      ;;
    *)
      log "no post-deploy script for ${job_name}, skipping"
      ;;
  esac
  return 0
}
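
# To wire up another hook (hypothetical job/script names), mirror the forgejo
# arm inside _run_post_deploy's case statement, e.g.:
#   chat)
#     post_deploy_script="${SCRIPT_ROOT}/chat-bootstrap.sh"
#     # ...same -x guard and error handling as the forgejo arm...
#     ;;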
# ── Main: deploy each job in order ───────────────────────────────────────────
for job_name in "${JOBS[@]}"; do
  jobspec_path="${REPO_ROOT}/nomad/jobs/${job_name}.hcl"
  if [ ! -f "$jobspec_path" ]; then
    die "Jobspec not found: ${jobspec_path}"
  fi

  # Per-job timeout override: JOB_READY_TIMEOUT_<UPPERCASE_JOBNAME>
  # Sanitize job name: replace hyphens and spaces with underscores (bash
  # variable names can't contain hyphens), then uppercase.
  job_upper=$(printf '%s' "$job_name" | tr '[:lower:]-' '[:upper:]_' | tr ' ' '_')
  timeout_var="JOB_READY_TIMEOUT_${job_upper}"
  job_timeout="${!timeout_var:-$JOB_READY_TIMEOUT_SECS}"
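  # Worked example (hypothetical job name): "my-chat" resolves to the variable
  # JOB_READY_TIMEOUT_MY_CHAT; ${!timeout_var} is bash indirect expansion and
  # falls back to JOB_READY_TIMEOUT_SECS when that variable is unset or empty.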
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] nomad job validate ${jobspec_path}"
log "[dry-run] nomad job run -detach ${jobspec_path}"
log "[dry-run] (would wait for '${job_name}' to become healthy for ${job_timeout}s)"
case "$job_name" in
forgejo) log "[dry-run] [post-deploy] would run forgejo-bootstrap.sh" ;;
esac
continue
fi
log "processing job: ${job_name}"
# 1. Validate the jobspec
log "validating: ${jobspec_path}"
if ! nomad job validate "$jobspec_path"; then
die "validation failed for: ${jobspec_path}"
fi
# 2. Check if already healthy (idempotency)
job_status_json=$(nomad job status -json "$job_name" 2>/dev/null || true)
if [ -n "$job_status_json" ]; then
current_status=$(printf '%s' "$job_status_json" | jq -r '.Status' 2>/dev/null || true)
if [ "$current_status" = "running" ]; then
log "${job_name} already healthy"
continue
fi
fi
# 3. Run the job (idempotent registration)
log "running: ${jobspec_path}"
if ! nomad job run -detach "$jobspec_path"; then
die "failed to run job: ${job_name}"
fi
# 4. Wait for healthy state
if ! _wait_job_running "$job_name" "$job_timeout"; then
log "WARNING: deployment for job '${job_name}' did not reach successful state — continuing with remaining jobs"
FAILED_JOBS+=("$job_name")
else
# 5. Run post-deploy scripts (only if job reached healthy state)
if ! _run_post_deploy "$job_name"; then
die "post-deploy script failed for job '${job_name}'"
fi
fi
done
if [ "$DRY_RUN" -eq 1 ]; then
log "dry-run complete"
fi
# ── Final health summary ─────────────────────────────────────────────────────
if [ "${#FAILED_JOBS[@]}" -gt 0 ]; then
  log ""
  log "=== DEPLOY SUMMARY ==="
  log "The following jobs did NOT reach healthy state:"
  for failed in "${FAILED_JOBS[@]}"; do
    log " - ${failed}"
  done
  log "All other jobs were submitted and healthy."
  log "======================"
  exit 1
fi
exit 0