fix: Preserve worktrees on crash for debugging (#726)

On crash (PHASE:crashed or non-zero exit), preserve the worktree and log
its location instead of destroying it unconditionally. Successful sessions
still clean up normally. Supervisor runs housekeeping to remove stale
crashed worktrees older than 24h.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-26 13:41:33 +00:00
parent b4c053b3ed
commit f716a75351
4 changed files with 66 additions and 6 deletions

View file

@ -174,6 +174,53 @@ formula_phase_callback() {
esac
}
# ── Stale crashed worktree cleanup ─────────────────────────────────────────
# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
# Removes preserved crashed worktrees older than MAX_AGE_HOURS (default 24).
# Scans /tmp for orphaned worktrees matching agent naming patterns.
# Safe to call from any agent; intended for supervisor/gardener housekeeping.
# Requires globals: PROJECT_REPO_ROOT.
cleanup_stale_crashed_worktrees() {
local max_age_hours="${1:-24}"
local max_age_seconds=$((max_age_hours * 3600))
local now
now=$(date +%s)
local cleaned=0
# Collect active tmux pane working directories for safety check
local active_dirs=""
active_dirs=$(tmux list-panes -a -F '#{pane_current_path}' 2>/dev/null || true)
local wt_dir
for wt_dir in /tmp/*-worktree-* /tmp/action-*-[0-9]* /tmp/disinto-*; do
[ -d "$wt_dir" ] || continue
# Must be a git worktree (has .git file or directory)
[ -f "$wt_dir/.git" ] || [ -d "$wt_dir/.git" ] || continue
# Check age (use directory mtime)
local dir_mtime
dir_mtime=$(stat -c %Y "$wt_dir" 2>/dev/null || echo "$now")
local age=$((now - dir_mtime))
[ "$age" -lt "$max_age_seconds" ] && continue
# Skip if an active tmux pane is using this worktree
if [ -n "$active_dirs" ] && echo "$active_dirs" | grep -qF "$wt_dir"; then
continue
fi
# Remove the worktree
git -C "${PROJECT_REPO_ROOT}" worktree remove "$wt_dir" --force 2>/dev/null || rm -rf "$wt_dir"
log "cleaned stale crashed worktree: ${wt_dir} (age: $((age / 3600))h)"
cleaned=$((cleaned + 1))
done
# Prune any dangling worktree references
git -C "${PROJECT_REPO_ROOT}" worktree prune 2>/dev/null || true
[ "$cleaned" -gt 0 ] && log "cleaned ${cleaned} stale crashed worktree(s)"
}
# ── Scratch file helpers (compaction survival) ────────────────────────────
# build_scratch_instruction SCRATCH_FILE
@ -306,8 +353,12 @@ run_formula_and_monitor() {
matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true
# Clean up per-agent worktree — "the runtime creates and destroys"
remove_formula_worktree
# Preserve worktree on crash for debugging; clean up on success
if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then
log "PRESERVED crashed worktree for debugging: ${_FORMULA_SESSION_WORKDIR:-}"
else
remove_formula_worktree
fi
log "--- ${agent_name^} run done ---"
}