fix: deploy.sh 360s still too tight for chat cold-start + cascade-skip masks edge/vault-runner (#1070)
Two changes:
- Set JOB_READY_TIMEOUT_CHAT=600 (chat cold-start takes ~5-6 min on a fresh LXC).
- On deploy timeout/failure, log a WARNING and continue submitting the remaining jobs instead of dying immediately; print a final health summary listing the failed jobs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
fbd66dd4ea
commit
d1a026c702
1 changed file with 22 additions and 2 deletions
|
|
@ -19,10 +19,12 @@
|
||||||
# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360)
|
# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360)
|
||||||
# JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
|
# JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
|
||||||
# JOB_READY_TIMEOUT_FORGEJO=300)
|
# JOB_READY_TIMEOUT_FORGEJO=300)
|
||||||
|
# Built-in: JOB_READY_TIMEOUT_CHAT=600
|
||||||
#
|
#
|
||||||
# Exit codes:
|
# Exit codes:
|
||||||
# 0 success (all jobs deployed and healthy, or dry-run completed)
|
# 0 success (all jobs deployed and healthy, or dry-run completed)
|
||||||
# 1 failure (validation error, timeout, or nomad command failure)
|
# 1 failure (validation error, or one or more jobs unhealthy after all
|
||||||
|
# jobs submitted — deploy does NOT cascade-skip on timeout)
|
||||||
#
|
#
|
||||||
# Idempotency:
|
# Idempotency:
|
||||||
# Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
|
# Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
|
||||||
|
|
@ -35,7 +37,11 @@ SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
|
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
|
||||||
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"
|
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"
|
||||||
|
|
||||||
|
# Per-job built-in defaults (override with JOB_READY_TIMEOUT_<JOBNAME> env var)
|
||||||
|
JOB_READY_TIMEOUT_CHAT="${JOB_READY_TIMEOUT_CHAT:-600}"
|
||||||
|
|
||||||
DRY_RUN=0
|
DRY_RUN=0
|
||||||
|
FAILED_JOBS=() # jobs that timed out or failed deployment
|
||||||
|
|
||||||
log() { printf '[deploy] %s\n' "$*" >&2; }
|
log() { printf '[deploy] %s\n' "$*" >&2; }
|
||||||
die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }
|
die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }
|
||||||
|
|
@ -215,7 +221,8 @@ for job_name in "${JOBS[@]}"; do
|
||||||
|
|
||||||
# 4. Wait for healthy state
|
# 4. Wait for healthy state
|
||||||
if ! _wait_job_running "$job_name" "$job_timeout"; then
|
if ! _wait_job_running "$job_name" "$job_timeout"; then
|
||||||
die "deployment for job '${job_name}' did not reach successful state"
|
log "WARNING: deployment for job '${job_name}' did not reach successful state — continuing with remaining jobs"
|
||||||
|
FAILED_JOBS+=("$job_name")
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
@ -223,4 +230,17 @@ if [ "$DRY_RUN" -eq 1 ]; then
|
||||||
log "dry-run complete"
|
log "dry-run complete"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# ── Final health summary ─────────────────────────────────────────────────────
|
||||||
|
if [ "${#FAILED_JOBS[@]}" -gt 0 ]; then
|
||||||
|
log ""
|
||||||
|
log "=== DEPLOY SUMMARY ==="
|
||||||
|
log "The following jobs did NOT reach healthy state:"
|
||||||
|
for failed in "${FAILED_JOBS[@]}"; do
|
||||||
|
log " - ${failed}"
|
||||||
|
done
|
||||||
|
log "All other jobs were submitted and healthy."
|
||||||
|
log "======================"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
exit 0
|
exit 0
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue