Merge pull request 'fix: Preserve worktrees on crash for debugging (#726)' (#754) from fix/issue-726 into main
This commit is contained in:
commit
fcf25b5bb2
4 changed files with 66 additions and 6 deletions
|
|
@ -100,6 +100,7 @@ fi
|
|||
echo $$ > "$LOCKFILE"
|
||||
|
||||
cleanup() {
|
||||
local exit_code=$?
|
||||
# Kill lifetime watchdog if running
|
||||
if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then
|
||||
kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
|
||||
|
|
@ -118,8 +119,14 @@ cleanup() {
|
|||
fi
|
||||
# Best-effort docker cleanup for containers started during this action
|
||||
(cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true
|
||||
# Destroy the worktree — the runtime owns the lifecycle
|
||||
cleanup_worktree
|
||||
# Preserve worktree on crash for debugging; clean up on success
|
||||
local final_phase=""
|
||||
[ -f "$PHASE_FILE" ] && final_phase=$(head -1 "$PHASE_FILE" 2>/dev/null || true)
|
||||
if [ "${final_phase:-}" = "PHASE:crashed" ] || [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ] || [ "$exit_code" -ne 0 ]; then
|
||||
log "PRESERVED crashed worktree for debugging: $WORKTREE"
|
||||
else
|
||||
cleanup_worktree
|
||||
fi
|
||||
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
|
|
|||
|
|
@ -817,8 +817,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
|
|||
"session crashed unexpectedly — marking blocked" \
|
||||
"session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
|
||||
post_blocked_diagnostic "crashed"
|
||||
[ -z "${PR_NUMBER:-}" ] && cleanup_worktree
|
||||
[ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)"
|
||||
log "PRESERVED crashed worktree for debugging: $WORKTREE"
|
||||
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
|
||||
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
||||
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||
|
|
|
|||
|
|
@ -174,6 +174,53 @@ formula_phase_callback() {
|
|||
esac
|
||||
}
|
||||
|
||||
# ── Stale crashed worktree cleanup ─────────────────────────────────────────
|
||||
|
||||
# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
# Removes preserved crashed worktrees older than MAX_AGE_HOURS (default 24).
# Scans /tmp for orphaned worktrees matching agent naming patterns.
# Safe to call from any agent; intended for supervisor/gardener housekeeping.
# Requires globals: PROJECT_REPO_ROOT. Uses log() for progress messages.
# Arguments: $1 - maximum age in whole hours (optional; a non-numeric value
#                 falls back to 24 instead of silently becoming 0).
# Returns:   0 always, including when nothing was cleaned, so a bare call
#            from a script running under `set -e` does not abort the caller.
cleanup_stale_crashed_worktrees() {
  local max_age_hours="${1:-24}"
  # A non-numeric value would evaluate to 0 in arithmetic context and make
  # every matching worktree look stale; fall back to the default instead.
  case "$max_age_hours" in
    *[!0-9]*)
      log "invalid max age '${max_age_hours}' — using 24h"
      max_age_hours=24
      ;;
  esac
  # 10# forces base 10 so values such as "08" are not parsed as octal.
  local max_age_seconds=$((10#$max_age_hours * 3600))
  local now
  now=$(date +%s)
  local cleaned=0

  # Collect active tmux pane working directories for safety check
  local active_dirs=""
  active_dirs=$(tmux list-panes -a -F '#{pane_current_path}' 2>/dev/null || true)

  local wt_dir dir_mtime age pane_dir in_use
  for wt_dir in /tmp/*-worktree-* /tmp/action-*-[0-9]* /tmp/disinto-*; do
    # Unmatched globs stay literal; the directory test filters them out too.
    [ -d "$wt_dir" ] || continue
    # Must be a git worktree (has .git file or directory)
    [ -f "$wt_dir/.git" ] || [ -d "$wt_dir/.git" ] || continue

    # Check age (use directory mtime). If stat fails the mtime is treated as
    # "now", which makes the age 0 and leaves the directory alone.
    dir_mtime=$(stat -c %Y "$wt_dir" 2>/dev/null || echo "$now")
    age=$((now - dir_mtime))
    if [ "$age" -lt "$max_age_seconds" ]; then
      continue
    fi

    # Skip if an active tmux pane is inside this worktree. Match the path as
    # a whole component (end of line or followed by '/') so that
    # /tmp/disinto-1 is not confused with /tmp/disinto-12; a leading prefix
    # on the pane path is still tolerated to stay on the conservative side.
    in_use=0
    while IFS= read -r pane_dir; do
      case "$pane_dir" in
        *"$wt_dir" | *"$wt_dir"/*)
          in_use=1
          break
          ;;
      esac
    done <<< "$active_dirs"
    if [ "$in_use" -eq 1 ]; then
      continue
    fi

    # Remove the worktree; fall back to a plain delete when git refuses.
    # Only report and count it when one of the two actually succeeded.
    if git -C "${PROJECT_REPO_ROOT}" worktree remove --force "$wt_dir" 2>/dev/null \
      || rm -rf -- "${wt_dir:?}"; then
      log "cleaned stale crashed worktree: ${wt_dir} (age: $((age / 3600))h)"
      cleaned=$((cleaned + 1))
    else
      log "failed to remove stale crashed worktree: ${wt_dir}"
    fi
  done

  # Prune any dangling worktree references
  git -C "${PROJECT_REPO_ROOT}" worktree prune 2>/dev/null || true

  if [ "$cleaned" -gt 0 ]; then
    log "cleaned ${cleaned} stale crashed worktree(s)"
  fi
  return 0
}
|
||||
|
||||
# ── Scratch file helpers (compaction survival) ────────────────────────────
|
||||
|
||||
# build_scratch_instruction SCRATCH_FILE
|
||||
|
|
@ -306,8 +353,12 @@ run_formula_and_monitor() {
|
|||
|
||||
matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true
|
||||
|
||||
# Clean up per-agent worktree — "the runtime creates and destroys"
|
||||
remove_formula_worktree
|
||||
# Preserve worktree on crash for debugging; clean up on success
|
||||
if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then
|
||||
log "PRESERVED crashed worktree for debugging: ${_FORMULA_SESSION_WORKDIR:-}"
|
||||
else
|
||||
remove_formula_worktree
|
||||
fi
|
||||
|
||||
log "--- ${agent_name^} run done ---"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -50,6 +50,9 @@ check_memory 2000
|
|||
|
||||
log "--- Supervisor run start ---"
|
||||
|
||||
# ── Housekeeping: clean up stale crashed worktrees (>24h) ────────────────
|
||||
cleanup_stale_crashed_worktrees 24
|
||||
|
||||
# ── Collect pre-flight metrics ────────────────────────────────────────────
|
||||
log "Running preflight.sh"
|
||||
PREFLIGHT_OUTPUT=""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue