fix: deploy.sh 360s still too tight for chat cold-start + cascade-skip masks edge/vault-runner (#1070)

Two changes:
- Set JOB_READY_TIMEOUT_CHAT=600 (chat cold-start takes ~5-6 min on fresh LXC)
- On deploy timeout/failure, log WARNING and continue submitting remaining jobs instead of dying immediately; print final health summary with failed jobs list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-20 07:56:30 +00:00 · 2026-04-20 07:56:30 +00:00 · d1a026c702
commit d1a026c702
parent fbd66dd4ea
1 changed files with 22 additions and 2 deletions
--- a/lib/init/nomad/deploy.sh
+++ b/lib/init/nomad/deploy.sh
@@ -19,10 +19,12 @@
 #   JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360)
 #   JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
 #                            JOB_READY_TIMEOUT_FORGEJO=300)
+#                            Built-in: JOB_READY_TIMEOUT_CHAT=600
 #
 # Exit codes:
 #   0  success (all jobs deployed and healthy, or dry-run completed)
-#   1  failure (validation error, timeout, or nomad command failure)
+#   1  failure (validation error, or one or more jobs unhealthy after all
+#      jobs submitted — deploy does NOT cascade-skip on timeout)
 #
 # Idempotency:
 #   Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
@@ -35,7 +37,11 @@ SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
 JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"

+# Per-job built-in defaults (override with JOB_READY_TIMEOUT_<JOBNAME> env var)
+JOB_READY_TIMEOUT_CHAT="${JOB_READY_TIMEOUT_CHAT:-600}"
+
 DRY_RUN=0
+FAILED_JOBS=()  # jobs that timed out or failed deployment

 log() { printf '[deploy] %s\n' "$*" >&2; }
 die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }
@@ -215,7 +221,8 @@ for job_name in "${JOBS[@]}"; do

  # 4. Wait for healthy state
  if ! _wait_job_running "$job_name" "$job_timeout"; then
-    die "deployment for job '${job_name}' did not reach successful state"
+    log "WARNING: deployment for job '${job_name}' did not reach successful state — continuing with remaining jobs"
+    FAILED_JOBS+=("$job_name")
  fi
 done

@@ -223,4 +230,17 @@ if [ "$DRY_RUN" -eq 1 ]; then
  log "dry-run complete"
 fi

+# ── Final health summary ─────────────────────────────────────────────────────
+if [ "${#FAILED_JOBS[@]}" -gt 0 ]; then
+  log ""
+  log "=== DEPLOY SUMMARY ==="
+  log "The following jobs did NOT reach healthy state:"
+  for failed in "${FAILED_JOBS[@]}"; do
+    log "  - ${failed}"
+  done
+  log "All other jobs were submitted and healthy."
+  log "======================"
+  exit 1
+fi
+
 exit 0