From d1a026c702837d510d722c57e7118dcf9f005d7e Mon Sep 17 00:00:00 2001
From: Claude
Date: Mon, 20 Apr 2026 07:56:30 +0000
Subject: [PATCH] fix: deploy.sh 360s still too tight for chat cold-start +
 cascade-skip masks edge/vault-runner (#1070)

Two changes:

- Set JOB_READY_TIMEOUT_CHAT=600 (chat cold-start takes ~5-6 min on fresh
  LXC)
- On deploy timeout/failure, log WARNING and continue submitting remaining
  jobs instead of dying immediately; print final health summary with failed
  jobs list

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 lib/init/nomad/deploy.sh | 24 ++++++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/lib/init/nomad/deploy.sh b/lib/init/nomad/deploy.sh
index f9a3805..997fcda 100755
--- a/lib/init/nomad/deploy.sh
+++ b/lib/init/nomad/deploy.sh
@@ -19,10 +19,12 @@
 #   JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360)
 #   JOB_READY_TIMEOUT_<JOB> — per-job timeout override (e.g.,
 #     JOB_READY_TIMEOUT_FORGEJO=300)
+#   Built-in: JOB_READY_TIMEOUT_CHAT=600
 #
 # Exit codes:
 #   0 success (all jobs deployed and healthy, or dry-run completed)
-#   1 failure (validation error, timeout, or nomad command failure)
+#   1 failure (validation error, or one or more jobs unhealthy after all
+#     jobs submitted — deploy does NOT cascade-skip on timeout)
 #
 # Idempotency:
 #   Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
@@ -35,7 +37,11 @@
 SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
 JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"
+# Per-job built-in defaults (override with JOB_READY_TIMEOUT_<JOB> env var)
+JOB_READY_TIMEOUT_CHAT="${JOB_READY_TIMEOUT_CHAT:-600}"
+
 DRY_RUN=0
+FAILED_JOBS=() # jobs that timed out or failed deployment
 
 log() { printf '[deploy] %s\n' "$*" >&2; }
 die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }
@@ -215,7 +221,8 @@ for job_name in "${JOBS[@]}"; do
 
   # 4. Wait for healthy state
   if ! _wait_job_running "$job_name" "$job_timeout"; then
-    die "deployment for job '${job_name}' did not reach successful state"
+    log "WARNING: deployment for job '${job_name}' did not reach successful state — continuing with remaining jobs"
+    FAILED_JOBS+=("$job_name")
   fi
 done
 
@@ -223,4 +230,17 @@ if [ "$DRY_RUN" -eq 1 ]; then
   log "dry-run complete"
 fi
 
+# ── Final health summary ─────────────────────────────────────────────────────
+if [ "${#FAILED_JOBS[@]}" -gt 0 ]; then
+  log ""
+  log "=== DEPLOY SUMMARY ==="
+  log "The following jobs did NOT reach healthy state:"
+  for failed in "${FAILED_JOBS[@]}"; do
+    log " - ${failed}"
+  done
+  log "All other jobs were submitted and healthy."
+  log "======================"
+  exit 1
+fi
+
 exit 0
-- 
2.49.1