fix: escalate once then continue to backlog (#59)

Two bugs after #53 merged:
1. Escalation written every poll cycle (4 entries in 30min) — now writes once, bumps counter to 4 to skip
2. Exit after escalation blocked backlog work — now falls through to pick up next issue

Co-authored-by: openhands <openhands@all-hands.dev>
Reviewed-on: https://codeberg.org/johba/disinto/pulls/59
Reviewed-by: review_bot <review_bot@noreply.codeberg.org>
This commit is contained in:
johba 2026-03-17 15:14:48 +01:00
parent c24adc4ea2
commit 531ae5cf71

View file

@ -233,17 +233,23 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
FIX_ATTEMPTS=$(ci_fix_count "$HAS_PR")
if [ "$FIX_ATTEMPTS" -ge 3 ]; then
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — exhausted ${FIX_ATTEMPTS} fix attempts, escalating"
echo "{\"issue\":${ISSUE_NUM},\"pr\":${HAS_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
matrix_send "dev" "🚨 PR #${HAS_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated to supervisor" 2>/dev/null || true
# Already escalated — skip silently, let pipeline continue to backlog
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI exhausted (${FIX_ATTEMPTS} attempts) — skipping"
# Only write escalation + alert once (first time hitting 3)
if [ "$FIX_ATTEMPTS" -eq 3 ]; then
echo "{\"issue\":${ISSUE_NUM},\"pr\":${HAS_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
matrix_send "dev" "🚨 PR #${HAS_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated to supervisor" 2>/dev/null || true
ci_fix_increment "$HAS_PR" # bump to 4 so we don't re-alert
fi
# Fall through to backlog scan instead of exit
else
ci_fix_increment "$HAS_PR"
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt $((FIX_ATTEMPTS+1))/3)"
nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
exit 0
fi
exit 0
else
log "issue #${ISSUE_NUM} has open PR #${HAS_PR} (CI: ${CI_STATE}, waiting)"
@ -308,17 +314,22 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
FIX_ATTEMPTS=$(ci_fix_count "$PR_NUM")
if [ "$FIX_ATTEMPTS" -ge 3 ]; then
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — exhausted ${FIX_ATTEMPTS} attempts, escalating"
echo "{\"issue\":${STUCK_ISSUE},\"pr\":${PR_NUM},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
matrix_send "dev" "🚨 PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed after ${FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
# Already escalated — skip to let pipeline continue
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI exhausted (${FIX_ATTEMPTS} attempts) — skipping"
if [ "$FIX_ATTEMPTS" -eq 3 ]; then
echo "{\"issue\":${STUCK_ISSUE},\"pr\":${PR_NUM},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
matrix_send "dev" "🚨 PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed after ${FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
ci_fix_increment "$PR_NUM" # bump to 4 to prevent re-alert
fi
continue # skip this PR, check next stuck PR or fall through to backlog
else
ci_fix_increment "$PR_NUM"
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt $((FIX_ATTEMPTS+1))/3)"
nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
log "started dev-agent PID $! for stuck PR #${PR_NUM}"
exit 0
fi
exit 0
fi
done