Merge pull request 'fix: Preserve worktrees on crash for debugging (#726)' (#754) from fix/issue-726 into main
This commit is contained in:
commit
fcf25b5bb2
4 changed files with 66 additions and 6 deletions
|
|
@ -100,6 +100,7 @@ fi
|
||||||
echo $$ > "$LOCKFILE"
|
echo $$ > "$LOCKFILE"
|
||||||
|
|
||||||
cleanup() {
|
cleanup() {
|
||||||
|
local exit_code=$?
|
||||||
# Kill lifetime watchdog if running
|
# Kill lifetime watchdog if running
|
||||||
if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then
|
if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then
|
||||||
kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
|
kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
|
||||||
|
|
@ -118,8 +119,14 @@ cleanup() {
|
||||||
fi
|
fi
|
||||||
# Best-effort docker cleanup for containers started during this action
|
# Best-effort docker cleanup for containers started during this action
|
||||||
(cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true
|
(cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true
|
||||||
# Destroy the worktree — the runtime owns the lifecycle
|
# Preserve worktree on crash for debugging; clean up on success
|
||||||
cleanup_worktree
|
local final_phase=""
|
||||||
|
[ -f "$PHASE_FILE" ] && final_phase=$(head -1 "$PHASE_FILE" 2>/dev/null || true)
|
||||||
|
if [ "${final_phase:-}" = "PHASE:crashed" ] || [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ] || [ "$exit_code" -ne 0 ]; then
|
||||||
|
log "PRESERVED crashed worktree for debugging: $WORKTREE"
|
||||||
|
else
|
||||||
|
cleanup_worktree
|
||||||
|
fi
|
||||||
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT"
|
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT"
|
||||||
}
|
}
|
||||||
trap cleanup EXIT
|
trap cleanup EXIT
|
||||||
|
|
|
||||||
|
|
@ -817,8 +817,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
|
||||||
"session crashed unexpectedly — marking blocked" \
|
"session crashed unexpectedly — marking blocked" \
|
||||||
"session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
|
"session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${FORGE_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
|
||||||
post_blocked_diagnostic "crashed"
|
post_blocked_diagnostic "crashed"
|
||||||
[ -z "${PR_NUMBER:-}" ] && cleanup_worktree
|
log "PRESERVED crashed worktree for debugging: $WORKTREE"
|
||||||
[ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)"
|
|
||||||
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
|
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
|
||||||
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
||||||
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||||
|
|
|
||||||
|
|
@ -174,6 +174,53 @@ formula_phase_callback() {
|
||||||
esac
|
esac
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Stale crashed worktree cleanup ─────────────────────────────────────────
|
||||||
|
|
||||||
|
# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS] [SCAN_ROOT]
# Removes preserved crashed worktrees older than MAX_AGE_HOURS (default 24).
# Scans SCAN_ROOT (default /tmp) for orphaned worktrees matching agent naming
# patterns. A candidate is skipped when it:
#   - is not a directory containing a .git file or directory,
#   - is younger than the cutoff (judged by the directory's mtime), or
#   - is the working directory of an active tmux pane, or contains it.
# Safe to call from any agent; intended for supervisor/gardener housekeeping.
# Always returns 0, so a bare call cannot abort a caller that runs with errexit.
# Requires globals: PROJECT_REPO_ROOT. Requires function: log.
cleanup_stale_crashed_worktrees() {
  local max_age_hours="${1:-24}"
  local scan_root="${2:-/tmp}"
  scan_root="${scan_root%/}"

  # A non-numeric age would evaluate to 0 in arithmetic context and make every
  # candidate look stale; fall back to the default instead of deleting eagerly.
  case "$max_age_hours" in
    '' | *[!0-9]*)
      log "invalid max age '${max_age_hours}' — using default 24h"
      max_age_hours=24
      ;;
  esac
  # 10# forces base 10 so values such as "08" are not parsed as octal.
  local max_age_seconds=$((10#$max_age_hours * 3600))

  local now
  now=$(date +%s)
  local cleaned=0

  # Collect active tmux pane working directories for safety check.
  # Best-effort: no tmux server (or no tmux at all) yields an empty list.
  local active_dirs=""
  active_dirs=$(tmux list-panes -a -F '#{pane_current_path}' 2>/dev/null || true)

  local wt_dir dir_mtime age pane_dir in_use
  for wt_dir in "$scan_root"/*-worktree-* "$scan_root"/action-*-[0-9]* "$scan_root"/disinto-*; do
    # Unmatched globs stay literal; the -d test filters those out too.
    [ -d "$wt_dir" ] || continue
    # Must be a git worktree (has .git file or directory)
    [ -f "$wt_dir/.git" ] || [ -d "$wt_dir/.git" ] || continue

    # Check age (use directory mtime). If stat fails, treat the directory as
    # brand new so it is never removed on incomplete information.
    dir_mtime=$(stat -c %Y "$wt_dir" 2>/dev/null || echo "$now")
    age=$((now - dir_mtime))
    if [ "$age" -lt "$max_age_seconds" ]; then
      continue
    fi

    # Skip if an active tmux pane is using this worktree. Compare whole path
    # components: a pane in ".../foo-10" must not protect ".../foo-1".
    in_use=0
    while IFS= read -r pane_dir; do
      case "$pane_dir" in
        "$wt_dir" | "$wt_dir"/*)
          in_use=1
          break
          ;;
      esac
    done <<<"$active_dirs"
    if [ "$in_use" -eq 1 ]; then
      continue
    fi

    # Remove the worktree: ask git first so its bookkeeping stays consistent,
    # fall back to a plain recursive delete for directories git does not know.
    git -C "${PROJECT_REPO_ROOT}" worktree remove "$wt_dir" --force 2>/dev/null \
      || rm -rf -- "${wt_dir:?}" 2>/dev/null \
      || true
    if [ -d "$wt_dir" ]; then
      log "failed to remove stale crashed worktree: ${wt_dir}"
      continue
    fi
    log "cleaned stale crashed worktree: ${wt_dir} (age: $((age / 3600))h)"
    cleaned=$((cleaned + 1))
  done

  # Prune any dangling worktree references
  git -C "${PROJECT_REPO_ROOT}" worktree prune 2>/dev/null || true

  if [ "$cleaned" -gt 0 ]; then
    log "cleaned ${cleaned} stale crashed worktree(s)"
  fi
  return 0
}
|
||||||
|
|
||||||
# ── Scratch file helpers (compaction survival) ────────────────────────────
|
# ── Scratch file helpers (compaction survival) ────────────────────────────
|
||||||
|
|
||||||
# build_scratch_instruction SCRATCH_FILE
|
# build_scratch_instruction SCRATCH_FILE
|
||||||
|
|
@ -306,8 +353,12 @@ run_formula_and_monitor() {
|
||||||
|
|
||||||
matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true
|
matrix_send "$agent_name" "${agent_name^} session finished (${FINAL_PHASE:-no phase})" 2>/dev/null || true
|
||||||
|
|
||||||
# Clean up per-agent worktree — "the runtime creates and destroys"
|
# Preserve worktree on crash for debugging; clean up on success
|
||||||
remove_formula_worktree
|
if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then
|
||||||
|
log "PRESERVED crashed worktree for debugging: ${_FORMULA_SESSION_WORKDIR:-}"
|
||||||
|
else
|
||||||
|
remove_formula_worktree
|
||||||
|
fi
|
||||||
|
|
||||||
log "--- ${agent_name^} run done ---"
|
log "--- ${agent_name^} run done ---"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -50,6 +50,9 @@ check_memory 2000
|
||||||
|
|
||||||
log "--- Supervisor run start ---"
|
log "--- Supervisor run start ---"
|
||||||
|
|
||||||
|
# ── Housekeeping: clean up stale crashed worktrees (>24h) ────────────────
|
||||||
|
cleanup_stale_crashed_worktrees 24
|
||||||
|
|
||||||
# ── Collect pre-flight metrics ────────────────────────────────────────────
|
# ── Collect pre-flight metrics ────────────────────────────────────────────
|
||||||
log "Running preflight.sh"
|
log "Running preflight.sh"
|
||||||
PREFLIGHT_OUTPUT=""
|
PREFLIGHT_OUTPUT=""
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue