fix: Preserve worktrees on crash for debugging (#726)
On crash (PHASE:crashed or non-zero exit), preserve the worktree and log its location instead of destroying it unconditionally. Successful sessions still clean up normally. Supervisor runs housekeeping to remove stale crashed worktrees older than 24h. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b4c053b3ed
commit
f716a75351
4 changed files with 66 additions and 6 deletions
|
|
@ -100,6 +100,7 @@ fi
|
||||||
echo $$ > "$LOCKFILE"
|
echo $$ > "$LOCKFILE"
|
||||||
|
|
||||||
cleanup() {
|
cleanup() {
|
||||||
|
local exit_code=$?
|
||||||
# Kill lifetime watchdog if running
|
# Kill lifetime watchdog if running
|
||||||
if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then
|
if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then
|
||||||
kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
|
kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
|
||||||
|
|
@ -118,8 +119,14 @@ cleanup() {
|
||||||
fi
|
fi
|
||||||
# Best-effort docker cleanup for containers started during this action
|
# Best-effort docker cleanup for containers started during this action
|
||||||
(cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true
|
(cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true
|
||||||
# Destroy the worktree — the runtime owns the lifecycle
|
# Preserve worktree on crash for debugging; clean up on success
|
||||||
cleanup_worktree
|
local final_phase=""
|
||||||
|
[ -f "$PHASE_FILE" ] && final_phase=$(head -1 "$PHASE_FILE" 2>/dev/null || true)
|
||||||
|
if [ "${final_phase:-}" = "PHASE:crashed" ] || [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ] || [ "$exit_code" -ne 0 ]; then
|
||||||
|
log "PRESERVED crashed worktree for debugging: $WORKTREE"
|
||||||
|
else
|
||||||
|
cleanup_worktree
|
||||||
|
fi
|
||||||
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT"
|
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT"
|
||||||
}
|
}
|
||||||
trap cleanup EXIT
|
trap cleanup EXIT
|
||||||
|
|
|
||||||
|
|
@ -817,8 +817,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
|
||||||
"session crashed unexpectedly — marking blocked" \
|
"session crashed unexpectedly — marking blocked" \
|
||||||
"session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
|
"session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
|
||||||
post_blocked_diagnostic "crashed"
|
post_blocked_diagnostic "crashed"
|
||||||
[ -z "${PR_NUMBER:-}" ] && cleanup_worktree
|
log "PRESERVED crashed worktree for debugging: $WORKTREE"
|
||||||
[ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)"
|
|
||||||
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
|
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
|
||||||
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
||||||
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||||
|
|
|
||||||
|
|
@ -174,6 +174,53 @@ formula_phase_callback() {
|
||||||
esac
|
esac
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Stale crashed worktree cleanup ─────────────────────────────────────────

# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS] [SCAN_ROOT]
# Removes preserved crashed worktrees older than MAX_AGE_HOURS (default 24).
# Scans SCAN_ROOT (default /tmp) for directories that match the agent naming
# patterns (*-worktree-*, action-*-[0-9]*, disinto-*) and contain a .git
# file or directory. Age is taken from the directory mtime.
# Worktrees that an active tmux pane is currently sitting in are skipped.
# Safe to call from any agent; intended for supervisor/gardener housekeeping.
# Globals: PROJECT_REPO_ROOT (read, optional). When it is unset or empty the
#          directories are removed with rm -rf only and no
#          `git worktree prune` is run.
# Calls:   log (must be defined by the sourcing script).
# Returns: always 0. Housekeeping is best-effort and must never abort a
#          caller that runs under `set -e`.
cleanup_stale_crashed_worktrees() {
  local max_age_hours="${1:-24}"
  local scan_root="${2:-/tmp}"
  local repo_root="${PROJECT_REPO_ROOT:-}"

  # Keep non-numeric input away from arithmetic expansion.
  case "$max_age_hours" in
    '' | *[!0-9]*)
      log "cleanup_stale_crashed_worktrees: invalid max age '${max_age_hours}', using 24h"
      max_age_hours=24
      ;;
  esac

  # 10# forces base 10 so a zero-padded value such as "08" is not read as octal.
  local max_age_seconds=$((10#$max_age_hours * 3600))
  local now
  now=$(date +%s)
  local cleaned=0

  # Drop one trailing slash so the glob prefixes below stay well-formed.
  scan_root="${scan_root%/}"

  # Collect active tmux pane working directories for safety check
  local active_dirs=""
  active_dirs=$(tmux list-panes -a -F '#{pane_current_path}' 2>/dev/null || true)

  local wt_dir dir_mtime age pane_dir in_use
  for wt_dir in "$scan_root"/*-worktree-* "$scan_root"/action-*-[0-9]* "$scan_root"/disinto-*; do
    # Unmatched globs stay literal, and a directory matched by two patterns
    # is already gone on its second visit; both cases are filtered here.
    [ -d "$wt_dir" ] || continue
    # Must be a git worktree (has .git file or directory)
    [ -f "$wt_dir/.git" ] || [ -d "$wt_dir/.git" ] || continue

    # Check age (use directory mtime); an unreadable mtime counts as "new"
    # so the directory is kept rather than deleted on bad data.
    dir_mtime=$(stat -c %Y "$wt_dir" 2>/dev/null || echo "$now")
    case "$dir_mtime" in
      '' | *[!0-9]*) dir_mtime=$now ;;
    esac
    age=$((now - dir_mtime))
    if [ "$age" -lt "$max_age_seconds" ]; then
      continue
    fi

    # Skip if an active tmux pane is using this worktree. The match is
    # anchored on a path boundary (the worktree itself or something below
    # it) so a pane in ".../foo-10" does not protect ".../foo-1"; the
    # leading wildcard still tolerates a resolved prefix in front of the path.
    in_use=0
    if [ -n "$active_dirs" ]; then
      while IFS= read -r pane_dir; do
        case "$pane_dir" in
          *"$wt_dir" | *"$wt_dir"/*)
            in_use=1
            break
            ;;
        esac
      done <<< "$active_dirs"
    fi
    if [ "$in_use" -eq 1 ]; then
      continue
    fi

    # Remove the worktree: prefer git so its bookkeeping is updated, fall
    # back to plain deletion. ${wt_dir:?} guards rm -rf against an empty path.
    if [ -n "$repo_root" ]; then
      git -C "$repo_root" worktree remove --force "$wt_dir" 2>/dev/null \
        || rm -rf -- "${wt_dir:?}" 2>/dev/null \
        || true
    else
      rm -rf -- "${wt_dir:?}" 2>/dev/null || true
    fi

    # Only report and count directories that are really gone.
    if [ -e "$wt_dir" ]; then
      log "failed to remove stale crashed worktree: ${wt_dir}"
      continue
    fi
    log "cleaned stale crashed worktree: ${wt_dir} (age: $((age / 3600))h)"
    cleaned=$((cleaned + 1))
  done

  # Prune any dangling worktree references
  if [ -n "$repo_root" ]; then
    git -C "$repo_root" worktree prune 2>/dev/null || true
  fi

  if [ "$cleaned" -gt 0 ]; then
    log "cleaned ${cleaned} stale crashed worktree(s)"
  fi
  return 0
}
|
||||||
|
|
||||||
# ── Scratch file helpers (compaction survival) ────────────────────────────
|
# ── Scratch file helpers (compaction survival) ────────────────────────────
|
||||||
|
|
||||||
# build_scratch_instruction SCRATCH_FILE
|
# build_scratch_instruction SCRATCH_FILE
|
||||||
|
|
@ -306,8 +353,12 @@ run_formula_and_monitor() {
|
||||||
|
|
||||||
matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true
|
matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true
|
||||||
|
|
||||||
# Clean up per-agent worktree — "the runtime creates and destroys"
|
# Preserve worktree on crash for debugging; clean up on success
|
||||||
remove_formula_worktree
|
if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then
|
||||||
|
log "PRESERVED crashed worktree for debugging: ${_FORMULA_SESSION_WORKDIR:-}"
|
||||||
|
else
|
||||||
|
remove_formula_worktree
|
||||||
|
fi
|
||||||
|
|
||||||
log "--- ${agent_name^} run done ---"
|
log "--- ${agent_name^} run done ---"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,9 @@ check_memory 2000
|
||||||
|
|
||||||
log "--- Supervisor run start ---"
|
log "--- Supervisor run start ---"
|
||||||
|
|
||||||
|
# ── Housekeeping: clean up stale crashed worktrees (>24h) ────────────────
|
||||||
|
cleanup_stale_crashed_worktrees 24
|
||||||
|
|
||||||
# ── Collect pre-flight metrics ────────────────────────────────────────────
|
# ── Collect pre-flight metrics ────────────────────────────────────────────
|
||||||
log "Running preflight.sh"
|
log "Running preflight.sh"
|
||||||
PREFLIGHT_OUTPUT=""
|
PREFLIGHT_OUTPUT=""
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue