fix: Preserve worktrees on crash for debugging (#726)

On crash (PHASE:crashed or non-zero exit), preserve the worktree and log
its location instead of destroying it unconditionally. Successful sessions
still clean up normally. Supervisor runs housekeeping to remove stale
crashed worktrees older than 24h.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-26 13:41:33 +00:00
parent b4c053b3ed
commit f716a75351
4 changed files with 66 additions and 6 deletions

View file

@ -100,6 +100,7 @@ fi
echo $$ > "$LOCKFILE" echo $$ > "$LOCKFILE"
cleanup() { cleanup() {
local exit_code=$?
# Kill lifetime watchdog if running # Kill lifetime watchdog if running
if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then
kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
@ -118,8 +119,14 @@ cleanup() {
fi fi
# Best-effort docker cleanup for containers started during this action # Best-effort docker cleanup for containers started during this action
(cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true (cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true
# Destroy the worktree — the runtime owns the lifecycle # Preserve worktree on crash for debugging; clean up on success
cleanup_worktree local final_phase=""
[ -f "$PHASE_FILE" ] && final_phase=$(head -1 "$PHASE_FILE" 2>/dev/null || true)
if [ "${final_phase:-}" = "PHASE:crashed" ] || [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ] || [ "$exit_code" -ne 0 ]; then
log "PRESERVED crashed worktree for debugging: $WORKTREE"
else
cleanup_worktree
fi
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT" rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT"
} }
trap cleanup EXIT trap cleanup EXIT

View file

@ -817,8 +817,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
"session crashed unexpectedly — marking blocked" \ "session crashed unexpectedly — marking blocked" \
"session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}" "session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
post_blocked_diagnostic "crashed" post_blocked_diagnostic "crashed"
[ -z "${PR_NUMBER:-}" ] && cleanup_worktree log "PRESERVED crashed worktree for debugging: $WORKTREE"
[ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)"
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \ rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt" "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}" [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"

View file

@ -174,6 +174,53 @@ formula_phase_callback() {
esac esac
} }
# ── Stale crashed worktree cleanup ─────────────────────────────────────────
# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
# Removes preserved crashed worktrees older than MAX_AGE_HOURS (default 24).
# Scans /tmp for directories matching the agent naming patterns
# (/tmp/*-worktree-*, /tmp/action-*-[0-9]*, /tmp/disinto-*) that contain a
# .git file or directory.
# Safe to call from any agent; intended for supervisor/gardener housekeeping.
# Requires globals: PROJECT_REPO_ROOT.
# Returns: 0 when the scan completes, including when nothing was stale, so a
# bare call does not trip `set -e` in the caller.
cleanup_stale_crashed_worktrees() {
  local max_age_hours="${1:-24}"
  local max_age_seconds=$((max_age_hours * 3600))
  local now
  now=$(date +%s)
  local cleaned=0

  # Collect active tmux pane working directories for safety check.
  # Best-effort: if tmux is unavailable the list is empty and the check below
  # is skipped.
  local active_dirs=""
  active_dirs=$(tmux list-panes -a -F '#{pane_current_path}' 2>/dev/null || true)

  local wt_dir dir_mtime age
  for wt_dir in /tmp/*-worktree-* /tmp/action-*-[0-9]* /tmp/disinto-*; do
    # Also discards unmatched glob patterns, which are left as literal strings.
    [ -d "$wt_dir" ] || continue
    # Must be a git worktree (has .git file or directory)
    [ -f "$wt_dir/.git" ] || [ -d "$wt_dir/.git" ] || continue

    # Check age (use directory mtime). If stat fails, fall back to "now" so
    # the computed age is 0 and the directory is kept.
    # NOTE(review): a directory's mtime only changes when its direct entries
    # change, so edits deep inside the tree may not refresh it — confirm this
    # is an acceptable staleness signal.
    dir_mtime=$(stat -c %Y "$wt_dir" 2>/dev/null || echo "$now")
    age=$((now - dir_mtime))
    if [ "$age" -lt "$max_age_seconds" ]; then
      continue
    fi

    # Skip if an active tmux pane is using this worktree. The match is a
    # deliberate fixed-string substring test: it errs on the side of keeping
    # a directory. A here-string avoids an `echo | grep -q` pipeline, whose
    # status could be affected by SIGPIPE under `pipefail`.
    if [ -n "$active_dirs" ] && grep -qF -- "$wt_dir" <<<"$active_dirs"; then
      continue
    fi

    # Remove the worktree; fall back to plain deletion when git refuses
    # (e.g. the directory is not registered with PROJECT_REPO_ROOT).
    git -C "${PROJECT_REPO_ROOT}" worktree remove "$wt_dir" --force 2>/dev/null \
      || rm -rf -- "${wt_dir:?}"
    log "cleaned stale crashed worktree: ${wt_dir} (age: $((age / 3600))h)"
    cleaned=$((cleaned + 1))
  done

  # Prune any dangling worktree references
  git -C "${PROJECT_REPO_ROOT}" worktree prune 2>/dev/null || true

  # Use `if` rather than `[ ... ] && log`: as the last statement, a failed
  # test would become the function's exit status (1) when nothing was cleaned.
  if [ "$cleaned" -gt 0 ]; then
    log "cleaned ${cleaned} stale crashed worktree(s)"
  fi
  return 0
}
# ── Scratch file helpers (compaction survival) ──────────────────────────── # ── Scratch file helpers (compaction survival) ────────────────────────────
# build_scratch_instruction SCRATCH_FILE # build_scratch_instruction SCRATCH_FILE
@ -306,8 +353,12 @@ run_formula_and_monitor() {
matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true
# Clean up per-agent worktree — "the runtime creates and destroys" # Preserve worktree on crash for debugging; clean up on success
remove_formula_worktree if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then
log "PRESERVED crashed worktree for debugging: ${_FORMULA_SESSION_WORKDIR:-}"
else
remove_formula_worktree
fi
log "--- ${agent_name^} run done ---" log "--- ${agent_name^} run done ---"
} }

View file

@ -50,6 +50,9 @@ check_memory 2000
log "--- Supervisor run start ---" log "--- Supervisor run start ---"
# ── Housekeeping: clean up stale crashed worktrees (>24h) ────────────────
cleanup_stale_crashed_worktrees 24
# ── Collect pre-flight metrics ──────────────────────────────────────────── # ── Collect pre-flight metrics ────────────────────────────────────────────
log "Running preflight.sh" log "Running preflight.sh"
PREFLIGHT_OUTPUT="" PREFLIGHT_OUTPUT=""