Merge pull request 'fix: Preserve worktrees on crash for debugging (#726)' (#754) from fix/issue-726 into main

This commit is contained in:
johba 2026-03-26 14:54:08 +01:00
commit fcf25b5bb2
4 changed files with 66 additions and 6 deletions

View file

@ -100,6 +100,7 @@ fi
echo $$ > "$LOCKFILE" echo $$ > "$LOCKFILE"
cleanup() { cleanup() {
local exit_code=$?
# Kill lifetime watchdog if running # Kill lifetime watchdog if running
if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then
kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
@ -118,8 +119,14 @@ cleanup() {
fi fi
# Best-effort docker cleanup for containers started during this action # Best-effort docker cleanup for containers started during this action
(cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true (cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true
# Destroy the worktree — the runtime owns the lifecycle # Preserve worktree on crash for debugging; clean up on success
cleanup_worktree local final_phase=""
[ -f "$PHASE_FILE" ] && final_phase=$(head -1 "$PHASE_FILE" 2>/dev/null || true)
if [ "${final_phase:-}" = "PHASE:crashed" ] || [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ] || [ "$exit_code" -ne 0 ]; then
log "PRESERVED crashed worktree for debugging: $WORKTREE"
else
cleanup_worktree
fi
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT" rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT"
} }
trap cleanup EXIT trap cleanup EXIT

View file

@ -817,8 +817,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
"session crashed unexpectedly — marking blocked" \ "session crashed unexpectedly — marking blocked" \
"session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}" "session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
post_blocked_diagnostic "crashed" post_blocked_diagnostic "crashed"
[ -z "${PR_NUMBER:-}" ] && cleanup_worktree log "PRESERVED crashed worktree for debugging: $WORKTREE"
[ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)"
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \ rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt" "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}" [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"

View file

@ -174,6 +174,53 @@ formula_phase_callback() {
esac esac
} }
# ── Stale crashed worktree cleanup ─────────────────────────────────────────
# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
# Removes preserved crashed worktrees older than MAX_AGE_HOURS (default 24).
# Scans /tmp for directories matching the agent naming patterns
# (/tmp/*-worktree-*, /tmp/action-*-[0-9]*, /tmp/disinto-*) that contain a
# .git file or directory. A candidate is skipped when its directory mtime is
# younger than the limit, or when a tmux pane's current path is the worktree
# itself or lies inside it.
# Safe to call from any agent; intended for supervisor/gardener housekeeping.
# Requires globals: PROJECT_REPO_ROOT. Requires function: log.
# Returns: 0 always (best-effort housekeeping). Finding nothing to clean is
# the normal case and must not look like a failure to a caller using set -e.
cleanup_stale_crashed_worktrees() {
  local max_age_hours="${1:-24}"
  local max_age_seconds=$((max_age_hours * 3600))
  local now
  now=$(date +%s)
  local cleaned=0

  # Collect active tmux pane working directories for the in-use safety check.
  # tmux being absent or having no server is fine: the list is simply empty.
  local active_dirs=""
  active_dirs=$(tmux list-panes -a -F '#{pane_current_path}' 2>/dev/null || true)

  local wt_dir dir_mtime age pane_dir in_use
  for wt_dir in /tmp/*-worktree-* /tmp/action-*-[0-9]* /tmp/disinto-*; do
    # Unmatched globs stay literal; the -d test filters those out too.
    [ -d "$wt_dir" ] || continue
    # Must be a git worktree (has .git file or directory)
    [ -f "$wt_dir/.git" ] || [ -d "$wt_dir/.git" ] || continue

    # Check age (use directory mtime). If stat fails, fall back to "now" so
    # the age is 0 and the directory is kept rather than removed blindly.
    dir_mtime=$(stat -c %Y "$wt_dir" 2>/dev/null || echo "$now")
    age=$((now - dir_mtime))
    if [ "$age" -lt "$max_age_seconds" ]; then
      continue
    fi

    # Skip if an active tmux pane is inside this worktree. Compare whole
    # paths (exact dir or a subdirectory) rather than substrings, so a pane
    # in /tmp/disinto-12 does not shield /tmp/disinto-1 from cleanup.
    in_use=0
    while IFS= read -r pane_dir; do
      case "$pane_dir" in
        "$wt_dir" | "$wt_dir"/*)
          in_use=1
          break
          ;;
      esac
    done <<< "$active_dirs"
    if [ "$in_use" -eq 1 ]; then
      continue
    fi

    # Remove the worktree: ask git first so its bookkeeping is updated, and
    # fall back to plain deletion. Only report success if one of them worked.
    if git -C "${PROJECT_REPO_ROOT}" worktree remove --force "$wt_dir" 2>/dev/null \
      || rm -rf -- "${wt_dir:?}"; then
      log "cleaned stale crashed worktree: ${wt_dir} (age: $((age / 3600))h)"
      cleaned=$((cleaned + 1))
    else
      log "failed to remove stale crashed worktree: ${wt_dir}"
    fi
  done

  # Prune any dangling worktree references
  git -C "${PROJECT_REPO_ROOT}" worktree prune 2>/dev/null || true

  if [ "$cleaned" -gt 0 ]; then
    log "cleaned ${cleaned} stale crashed worktree(s)"
  fi
  return 0
}
# ── Scratch file helpers (compaction survival) ──────────────────────────── # ── Scratch file helpers (compaction survival) ────────────────────────────
# build_scratch_instruction SCRATCH_FILE # build_scratch_instruction SCRATCH_FILE
@ -306,8 +353,12 @@ run_formula_and_monitor() {
matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true
# Clean up per-agent worktree — "the runtime creates and destroys" # Preserve worktree on crash for debugging; clean up on success
remove_formula_worktree if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then
log "PRESERVED crashed worktree for debugging: ${_FORMULA_SESSION_WORKDIR:-}"
else
remove_formula_worktree
fi
log "--- ${agent_name^} run done ---" log "--- ${agent_name^} run done ---"
} }

View file

@ -50,6 +50,9 @@ check_memory 2000
log "--- Supervisor run start ---" log "--- Supervisor run start ---"
# ── Housekeeping: clean up stale crashed worktrees (>24h) ────────────────
cleanup_stale_crashed_worktrees 24
# ── Collect pre-flight metrics ──────────────────────────────────────────── # ── Collect pre-flight metrics ────────────────────────────────────────────
log "Running preflight.sh" log "Running preflight.sh"
PREFLIGHT_OUTPUT="" PREFLIGHT_OUTPUT=""