Merge pull request 'fix: PHASE:crashed unhandled in _on_phase_change / dev-agent callback (#339)' (#443) from fix/issue-339 into main

This commit is contained in:
johba 2026-03-21 04:39:02 +01:00
commit f511a6c7a7
2 changed files with 34 additions and 9 deletions

View file

@ -131,6 +131,16 @@ cleanup_labels() {
"${API}/issues/${ISSUE}/labels/${IN_PROGRESS_LABEL_ID}" >/dev/null 2>&1 || true "${API}/issues/${ISSUE}/labels/${IN_PROGRESS_LABEL_ID}" >/dev/null 2>&1 || true
} }
#######################################
# Hand the current issue back to the backlog.
# First runs cleanup_labels (presumably removes the in-progress label —
# confirm against its definition), then POSTs the backlog label ID to the
# issue's labels endpoint. The POST is best-effort: its output is discarded
# and a failure is deliberately ignored.
# Globals:
#   API, ISSUE, CODEBERG_TOKEN, BACKLOG_LABEL_ID (read)
#   CLAIMED (written, set to false)
# Arguments: none
#######################################
restore_to_backlog() {
  cleanup_labels

  local labels_url="${API}/issues/${ISSUE}/labels"
  local label_body="{\"labels\":[${BACKLOG_LABEL_ID}]}"

  curl -sf -X POST \
    -H "Authorization: token ${CODEBERG_TOKEN}" \
    -H "Content-Type: application/json" \
    "${labels_url}" \
    -d "${label_body}" >/dev/null 2>&1 || true

  # Don't unclaim again in cleanup()
  CLAIMED=false
}
CLAIMED=false CLAIMED=false
cleanup() { cleanup() {
rm -f "$LOCKFILE" "$STATUSFILE" rm -f "$LOCKFILE" "$STATUSFILE"
@ -750,13 +760,7 @@ case "${_MONITOR_LOOP_EXIT:-}" in
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${_MONITOR_LOOP_EXIT:-idle_timeout}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${_MONITOR_LOOP_EXIT:-idle_timeout}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
# Restore labels: remove in-progress, add backlog # Restore labels: remove in-progress, add backlog
cleanup_labels restore_to_backlog
curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${API}/issues/${ISSUE}/labels" \
-d "{\"labels\":[${BACKLOG_LABEL_ID}]}" >/dev/null 2>&1 || true
CLAIMED=false # Don't unclaim again in cleanup()
if [ -n "${PR_NUMBER:-}" ]; then if [ -n "${PR_NUMBER:-}" ]; then
log "keeping worktree (PR #${PR_NUMBER} still open)" log "keeping worktree (PR #${PR_NUMBER} still open)"
else else
@ -767,8 +771,10 @@ case "${_MONITOR_LOOP_EXIT:-}" in
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt" "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}" [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
;; ;;
crash_recovery_failed) crashed)
cleanup_labels # Belt-and-suspenders: _on_phase_change(PHASE:crashed) handles primary
# cleanup (escalation, notification, labels, worktree, files).
restore_to_backlog
;; ;;
done) done)
# Belt-and-suspenders: callback in phase-handler.sh handles primary cleanup, # Belt-and-suspenders: callback in phase-handler.sh handles primary cleanup,

View file

@ -714,6 +714,25 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
return 1 return 1
fi fi
# ── PHASE: crashed ──────────────────────────────────────────────────────────
# Session died unexpectedly (OOM kill, tmux crash, etc.). Escalate to
# supervisor and restore issue to backlog so it can be retried.
elif [ "$phase" = "PHASE:crashed" ]; then
log "session crashed for issue #${ISSUE}"
notify_ctx \
"session crashed unexpectedly — escalating" \
"session crashed unexpectedly — escalating${PR_NUMBER:+ | PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"crashed\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
# Restore backlog label, clean up worktree + temp files
restore_to_backlog
[ -z "${PR_NUMBER:-}" ] && cleanup_worktree
[ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)"
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
else else
log "WARNING: unknown phase value: ${phase}" log "WARNING: unknown phase value: ${phase}"
fi fi