fix: deploy.sh 360s still too tight for chat cold-start + cascade-skip masks edge/vault-runner (#1070)

Two changes: - Set JOB_READY_TIMEOUT_CHAT=600 (chat cold-start takes ~5-6 min on fresh LXC) - On deploy timeout/failure, log WARNING and continue submitting remaining jobs instead of dying immediately; print final health summary with failed jobs list Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 07:56:30 +00:00 · 2026-04-20 07:56:30 +00:00 · d1a026c702
commit d1a026c702
parent fbd66dd4ea
1 changed files with 22 additions and 2 deletions
--- a/lib/init/nomad/deploy.sh
+++ b/lib/init/nomad/deploy.sh
@ -19,10 +19,12 @@
 #   JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360)
 #   JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
 #                            JOB_READY_TIMEOUT_FORGEJO=300)
 #                            Built-in: JOB_READY_TIMEOUT_CHAT=600
 #
 # Exit codes:
 #   0  success (all jobs deployed and healthy, or dry-run completed)
-#   1  failure (validation error, timeout, or nomad command failure)
+#   1  failure (validation error, or one or more jobs unhealthy after all
 #      jobs submitted — deploy does NOT cascade-skip on timeout)
 #
 # Idempotency:
 #   Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
@ -35,7 +37,11 @@ SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
 JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"
 # Per-job built-in defaults (override with JOB_READY_TIMEOUT_<JOBNAME> env var)
 JOB_READY_TIMEOUT_CHAT="${JOB_READY_TIMEOUT_CHAT:-600}"
 DRY_RUN=0
 FAILED_JOBS=()  # jobs that timed out or failed deployment
 log() { printf '[deploy] %s\n' "$*" >&2; }
 die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }
@ -215,7 +221,8 @@ for job_name in "${JOBS[@]}"; do
  # 4. Wait for healthy state
  if ! _wait_job_running "$job_name" "$job_timeout"; then
-    die "deployment for job '${job_name}' did not reach successful state"
+    log "WARNING: deployment for job '${job_name}' did not reach successful state — continuing with remaining jobs"
    FAILED_JOBS+=("$job_name")
  fi
 done
@ -223,4 +230,17 @@ if [ "$DRY_RUN" -eq 1 ]; then
  log "dry-run complete"
 fi
 # ── Final health summary ─────────────────────────────────────────────────────
 if [ "${#FAILED_JOBS[@]}" -gt 0 ]; then
  log ""
  log "=== DEPLOY SUMMARY ==="
  log "The following jobs did NOT reach healthy state:"
  for failed in "${FAILED_JOBS[@]}"; do
    log "  - ${failed}"
  done
  log "All other jobs were submitted and healthy."
  log "======================"
  exit 1
 fi
 exit 0