From 7f9cefa8470430c8c4e81c10321396013a2eddeb Mon Sep 17 00:00:00 2001
From: openhands
Date: Sat, 21 Mar 2026 01:31:20 +0000
Subject: [PATCH] fix: PHASE:crashed unhandled in _on_phase_change / dev-agent callback (#339)

Add explicit PHASE:crashed case to _on_phase_change in phase-handler.sh:
logs crash, notifies Matrix, escalates to supervisor, restores backlog
label, preserves worktree if PR exists, cleans up temp files.

Add crashed case to dev-agent.sh post-loop case statement for
belt-and-suspenders cleanup matching the callback behavior. Replaces the
dead crash_recovery_failed case that was never triggered.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 dev/dev-agent.sh     | 18 +++++++++++++++++-
 dev/phase-handler.sh | 29 +++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/dev/dev-agent.sh b/dev/dev-agent.sh
index 4beaf95..c01511b 100755
--- a/dev/dev-agent.sh
+++ b/dev/dev-agent.sh
@@ -767,8 +767,24 @@ case "${_MONITOR_LOOP_EXIT:-}" in
       "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
     [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
     ;;
-  crash_recovery_failed)
+  crashed)
+    # Belt-and-suspenders: _on_phase_change(PHASE:crashed) handles primary
+    # cleanup, but ensure labels and files are cleaned up if callback was
+    # interrupted (e.g. set -e propagation).
     cleanup_labels
+    curl -sf -X POST \
+      -H "Authorization: token ${CODEBERG_TOKEN}" \
+      -H "Content-Type: application/json" \
+      "${API}/issues/${ISSUE}/labels" \
+      -d "{\"labels\":[${BACKLOG_LABEL_ID}]}" >/dev/null 2>&1 || true
+    CLAIMED=false
+    if [ -z "${PR_NUMBER:-}" ]; then
+      cleanup_worktree
+    fi
+    rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" \
+      "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "$SCRATCH_FILE" \
+      "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
+    [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
     ;;
   done)
     # Belt-and-suspenders: callback in phase-handler.sh handles primary cleanup,
diff --git a/dev/phase-handler.sh b/dev/phase-handler.sh
index 68d2ab6..86b2e90 100644
--- a/dev/phase-handler.sh
+++ b/dev/phase-handler.sh
@@ -714,6 +714,35 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
       return 1
     fi

+  # ── PHASE: crashed ──────────────────────────────────────────────────────────
+  # Session died unexpectedly (OOM kill, tmux crash, etc.). Escalate to
+  # supervisor and restore issue to backlog so it can be retried.
+  elif [ "$phase" = "PHASE:crashed" ]; then
+    log "session crashed for issue #${ISSUE}"
+    notify_ctx \
+      "session crashed unexpectedly — escalating" \
+      "session crashed unexpectedly — escalating${PR_NUMBER:+ | PR #${PR_NUMBER}}"
+    echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"crashed\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
+      >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
+
+    # Restore backlog label so issue can be retried
+    cleanup_labels
+    curl -sf -X POST \
+      -H "Authorization: token ${CODEBERG_TOKEN}" \
+      -H "Content-Type: application/json" \
+      "${API}/issues/${ISSUE}/labels" \
+      -d "{\"labels\":[${BACKLOG_LABEL_ID}]}" >/dev/null 2>&1 || true
+
+    CLAIMED=false  # Don't unclaim again in cleanup()
+    if [ -n "${PR_NUMBER:-}" ]; then
+      log "keeping worktree (PR #${PR_NUMBER} still open)"
+    else
+      cleanup_worktree
+    fi
+    rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
+      "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
+    [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
+
   else
     log "WARNING: unknown phase value: ${phase}"
   fi