diff --git a/action/action-agent.sh b/action/action-agent.sh index d9daea5..87a3c5b 100755 --- a/action/action-agent.sh +++ b/action/action-agent.sh @@ -100,6 +100,7 @@ fi echo $$ > "$LOCKFILE" cleanup() { + local exit_code=$? # Kill lifetime watchdog if running if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true @@ -118,8 +119,14 @@ cleanup() { fi # Best-effort docker cleanup for containers started during this action (cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true - # Destroy the worktree — the runtime owns the lifecycle - cleanup_worktree + # Preserve worktree on crash for debugging; clean up on success + local final_phase="" + [ -f "$PHASE_FILE" ] && final_phase=$(head -1 "$PHASE_FILE" 2>/dev/null || true) + if [ "${final_phase:-}" = "PHASE:crashed" ] || [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ] || [ "$exit_code" -ne 0 ]; then + log "PRESERVED crashed worktree for debugging: $WORKTREE" + else + cleanup_worktree + fi rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT" } trap cleanup EXIT diff --git a/dev/phase-handler.sh b/dev/phase-handler.sh index d2f3ce8..e1b06d6 100644 --- a/dev/phase-handler.sh +++ b/dev/phase-handler.sh @@ -817,8 +817,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000) "session crashed unexpectedly — marking blocked" \ "session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR #${PR_NUMBER}}" post_blocked_diagnostic "crashed" - [ -z "${PR_NUMBER:-}" ] && cleanup_worktree - [ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)" + log "PRESERVED crashed worktree for debugging: $WORKTREE" rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \ "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt" [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}" diff --git a/lib/formula-session.sh b/lib/formula-session.sh index 
# ── Stale crashed worktree cleanup ─────────────────────────────────────────

# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
# Removes preserved crashed worktrees older than MAX_AGE_HOURS (default 24).
# Scans /tmp for orphaned worktrees matching agent naming patterns.
# Safe to call from any agent; intended for supervisor/gardener housekeeping.
#
# Arguments: $1 - maximum age in whole hours (non-negative integer, default 24).
#                 An invalid value is logged and nothing is removed.
# Requires globals: PROJECT_REPO_ROOT.
# Requires functions: log.
# Returns: always 0 (best-effort housekeeping), so a bare call does not abort
#          a caller running under `set -e` when nothing needed cleaning.
cleanup_stale_crashed_worktrees() {
  local max_age_hours="${1:-24}"

  # A non-numeric value would evaluate to 0 in arithmetic context and make
  # every matching directory eligible for deletion, so refuse it outright.
  case "$max_age_hours" in
    '' | *[!0-9]*)
      log "cleanup_stale_crashed_worktrees: invalid MAX_AGE_HOURS '${max_age_hours}' — skipping"
      return 0
      ;;
  esac

  # 10# forces base 10 so values such as "08" are not parsed as octal.
  local max_age_seconds=$((10#$max_age_hours * 3600))
  local now
  now=$(date +%s)
  local cleaned=0

  # Collect active tmux pane working directories for safety check
  local active_dirs=""
  active_dirs=$(tmux list-panes -a -F '#{pane_current_path}' 2>/dev/null || true)

  local wt_dir dir_mtime age pane_dir in_use
  for wt_dir in /tmp/*-worktree-* /tmp/action-*-[0-9]* /tmp/disinto-*; do
    # Unmatched globs stay literal; the -d test filters those out too.
    [ -d "$wt_dir" ] || continue
    # Must be a git worktree (has .git file or directory)
    [ -f "$wt_dir/.git" ] || [ -d "$wt_dir/.git" ] || continue

    # Check age (use directory mtime). If stat fails, fall back to "now" so
    # the directory counts as brand new and is left alone.
    dir_mtime=$(stat -c %Y "$wt_dir" 2>/dev/null || echo "$now")
    age=$((now - dir_mtime))
    if [ "$age" -lt "$max_age_seconds" ]; then
      continue
    fi

    # Skip if an active tmux pane is using this worktree: the pane's cwd must
    # end with the worktree path or continue below it. The leading wildcard
    # tolerates a resolved prefix in front of /tmp; the "/" boundary prevents
    # /tmp/disinto-12 from protecting /tmp/disinto-1. A read loop replaces
    # `echo | grep -q`, which can misreport under `set -o pipefail`.
    in_use=0
    while IFS= read -r pane_dir; do
      case "$pane_dir" in
        *"$wt_dir" | *"$wt_dir"/*)
          in_use=1
          break
          ;;
      esac
    done <<< "$active_dirs"
    if [ "$in_use" -eq 1 ]; then
      continue
    fi

    # Remove the worktree; fall back to plain deletion when git refuses
    # (e.g. the directory is no longer registered with the repository).
    if ! git -C "${PROJECT_REPO_ROOT}" worktree remove "$wt_dir" --force 2>/dev/null; then
      rm -rf -- "$wt_dir" 2>/dev/null || true
    fi

    # Only report and count directories that are actually gone.
    if [ -e "$wt_dir" ]; then
      log "failed to clean stale crashed worktree: ${wt_dir}"
      continue
    fi
    log "cleaned stale crashed worktree: ${wt_dir} (age: $((age / 3600))h)"
    cleaned=$((cleaned + 1))
  done

  # Prune any dangling worktree references
  git -C "${PROJECT_REPO_ROOT}" worktree prune 2>/dev/null || true

  if [ "$cleaned" -gt 0 ]; then
    log "cleaned ${cleaned} stale crashed worktree(s)"
  fi
  return 0
}
SCRATCH_FILE @@ -306,8 +353,12 @@ run_formula_and_monitor() { matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true - # Clean up per-agent worktree — "the runtime creates and destroys" - remove_formula_worktree + # Preserve worktree on crash for debugging; clean up on success + if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then + log "PRESERVED crashed worktree for debugging: ${_FORMULA_SESSION_WORKDIR:-}" + else + remove_formula_worktree + fi log "--- ${agent_name^} run done ---" } diff --git a/supervisor/supervisor-run.sh b/supervisor/supervisor-run.sh index 5b5c223..b4ac052 100755 --- a/supervisor/supervisor-run.sh +++ b/supervisor/supervisor-run.sh @@ -50,6 +50,9 @@ check_memory 2000 log "--- Supervisor run start ---" +# ── Housekeeping: clean up stale crashed worktrees (>24h) ──────────────── +cleanup_stale_crashed_worktrees 24 + # ── Collect pre-flight metrics ──────────────────────────────────────────── log "Running preflight.sh" PREFLIGHT_OUTPUT=""