fix: limit CI fix respawn to 3 attempts, then escalate to supervisor (#53)
Dev-poll spawned a fresh agent every 10min for CI failures. Each agent started with CI_FIX_COUNT=0 — infinite loop.
Now tracks attempts per PR in `dev/ci-fixes-{project}.json` (under the factory root). After 3 failed rounds:
- Writes escalation to `supervisor/escalations.jsonl`
- Sends Matrix alert
- Stops respawning
Part of #52 (supervisor escalation pipeline).
Co-authored-by: openhands <openhands@all-hands.dev>
Reviewed-on: https://codeberg.org/johba/disinto/pulls/53
Reviewed-by: review_bot <review_bot@noreply.codeberg.org>
This commit is contained in:
parent
740bddb2db
commit
c24adc4ea2
2 changed files with 69 additions and 7 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -16,3 +16,4 @@ metrics/supervisor-metrics.jsonl
|
||||||
|
|
||||||
# OS
|
# OS
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
dev/ci-fixes-*.json
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,33 @@ ci_passed() {
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Track CI fix attempts per PR to avoid infinite respawn loops
|
||||||
|
# Path of the per-project JSON tracker file (an object keyed by PR number,
# value = number of CI fix attempts). PROJECT_NAME falls back to "harb".
CI_FIX_TRACKER="${FACTORY_ROOT}/dev/ci-fixes-${PROJECT_NAME:-harb}.json"
|
||||||
|
#######################################
# Print how many CI fix attempts are recorded for a PR.
# Globals:   CI_FIX_TRACKER (read) - path to the JSON tracker file
# Arguments: $1 - PR number
# Outputs:   the attempt count on stdout; 0 when the tracker is missing,
#            unreadable, not valid JSON, or has no entry for the PR
# Returns:   0 always
#######################################
ci_fix_count() {
  local pr="$1"
  # The path and PR number are passed through argv instead of being spliced
  # into the Python source, so quotes or other special characters in the
  # path cannot break (or inject into) the snippet.
  python3 -c '
import json, sys
path, key = sys.argv[1], str(int(sys.argv[2]))
try:
    with open(path) as fh:
        data = json.load(fh)
except (OSError, ValueError):
    # Missing or corrupt tracker: behave as if nothing was recorded.
    data = {}
count = data.get(key, 0) if isinstance(data, dict) else 0
print(count if isinstance(count, int) else 0)
' "$CI_FIX_TRACKER" "$pr" 2>/dev/null || echo 0
}
|
||||||
|
#######################################
# Record one more CI fix attempt for a PR in the tracker file.
# Globals:   CI_FIX_TRACKER (read) - path to the JSON tracker file
# Arguments: $1 - PR number
# Outputs:   a one-line warning on stderr if the tracker could not be updated
# Returns:   0 always (best-effort; a tracker failure must not abort the poll)
#######################################
ci_fix_increment() {
  local pr="$1"
  # Values go through argv rather than into the Python source so that a path
  # containing quotes cannot break the snippet.
  python3 -c '
import json, os, sys
path, key = sys.argv[1], str(int(sys.argv[2]))
try:
    with open(path) as fh:
        data = json.load(fh)
except (OSError, ValueError):
    # Missing or corrupt tracker: start over instead of failing forever,
    # otherwise the attempt count would stay at 0 and the limit never trips.
    data = {}
if not isinstance(data, dict):
    data = {}
prev = data.get(key, 0)
data[key] = (prev if isinstance(prev, int) else 0) + 1
# Write to a sibling temp file and rename it into place so a crash or a
# concurrent reader never sees a truncated tracker.
tmp = "%s.%d.tmp" % (path, os.getpid())
try:
    with open(tmp, "w") as fh:
        json.dump(data, fh)
    os.replace(tmp, path)
except BaseException:
    if os.path.exists(tmp):
        os.unlink(tmp)
    raise
' "$CI_FIX_TRACKER" "$pr" 2>/dev/null \
    || printf 'ci_fix_increment: could not update %s for PR %s\n' \
      "$CI_FIX_TRACKER" "$pr" >&2
}
|
||||||
|
#######################################
# Forget the recorded CI fix attempts for a PR.
# Globals:   CI_FIX_TRACKER (read) - path to the JSON tracker file
# Arguments: $1 - PR number
# Outputs:   a one-line warning on stderr if the tracker could not be updated
# Returns:   0 always (best-effort; a tracker failure must not abort the caller)
#######################################
ci_fix_reset() {
  local pr="$1"
  # Values go through argv rather than into the Python source so that a path
  # containing quotes cannot break the snippet.
  python3 -c '
import json, os, sys
path, key = sys.argv[1], str(int(sys.argv[2]))
try:
    with open(path) as fh:
        data = json.load(fh)
except (OSError, ValueError):
    # Missing or corrupt tracker: rewrite it as an empty map.
    data = {}
if not isinstance(data, dict):
    data = {}
data.pop(key, None)
# Write to a sibling temp file and rename it into place so a crash or a
# concurrent reader never sees a truncated tracker.
tmp = "%s.%d.tmp" % (path, os.getpid())
try:
    with open(tmp, "w") as fh:
        json.dump(data, fh)
    os.replace(tmp, path)
except BaseException:
    if os.path.exists(tmp):
        os.unlink(tmp)
    raise
' "$CI_FIX_TRACKER" "$pr" 2>/dev/null \
    || printf 'ci_fix_reset: could not update %s for PR %s\n' \
      "$CI_FIX_TRACKER" "$pr" >&2
}
|
||||||
|
|
||||||
REPO="${CODEBERG_REPO}"
|
REPO="${CODEBERG_REPO}"
|
||||||
|
|
||||||
API="${CODEBERG_API}"
|
API="${CODEBERG_API}"
|
||||||
|
|
@ -61,6 +88,7 @@ try_merge_or_rebase() {
|
||||||
curl -sf -X DELETE -H "Authorization: token ${CODEBERG_TOKEN}" \
|
curl -sf -X DELETE -H "Authorization: token ${CODEBERG_TOKEN}" \
|
||||||
"${API}/issues/${issue_num}/labels/in-progress" >/dev/null 2>&1 || true
|
"${API}/issues/${issue_num}/labels/in-progress" >/dev/null 2>&1 || true
|
||||||
matrix_send "dev" "✅ PR #${pr_num} merged! Issue #${issue_num} done." 2>/dev/null || true
|
matrix_send "dev" "✅ PR #${pr_num} merged! Issue #${issue_num} done." 2>/dev/null || true
|
||||||
|
ci_fix_reset "$pr_num"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
@ -203,9 +231,18 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
|
||||||
exit 0
|
exit 0
|
||||||
|
|
||||||
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
||||||
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix"
|
FIX_ATTEMPTS=$(ci_fix_count "$HAS_PR")
|
||||||
nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
|
if [ "$FIX_ATTEMPTS" -ge 3 ]; then
|
||||||
log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
|
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — exhausted ${FIX_ATTEMPTS} fix attempts, escalating"
|
||||||
|
echo "{\"issue\":${ISSUE_NUM},\"pr\":${HAS_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
||||||
|
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
|
||||||
|
matrix_send "dev" "🚨 PR #${HAS_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated to supervisor" 2>/dev/null || true
|
||||||
|
else
|
||||||
|
ci_fix_increment "$HAS_PR"
|
||||||
|
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt $((FIX_ATTEMPTS+1))/3)"
|
||||||
|
nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
|
||||||
|
log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
|
||||||
|
fi
|
||||||
exit 0
|
exit 0
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|
@ -269,9 +306,18 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
|
||||||
log "started dev-agent PID $! for stuck PR #${PR_NUM}"
|
log "started dev-agent PID $! for stuck PR #${PR_NUM}"
|
||||||
exit 0
|
exit 0
|
||||||
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
||||||
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing first"
|
FIX_ATTEMPTS=$(ci_fix_count "$PR_NUM")
|
||||||
nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
|
if [ "$FIX_ATTEMPTS" -ge 3 ]; then
|
||||||
log "started dev-agent PID $! for stuck PR #${PR_NUM}"
|
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — exhausted ${FIX_ATTEMPTS} attempts, escalating"
|
||||||
|
echo "{\"issue\":${STUCK_ISSUE},\"pr\":${PR_NUM},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
||||||
|
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
|
||||||
|
matrix_send "dev" "🚨 PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed after ${FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
|
||||||
|
else
|
||||||
|
ci_fix_increment "$PR_NUM"
|
||||||
|
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt $((FIX_ATTEMPTS+1))/3)"
|
||||||
|
nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
|
||||||
|
log "started dev-agent PID $! for stuck PR #${PR_NUM}"
|
||||||
|
fi
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
@ -332,8 +378,18 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
|
||||||
break
|
break
|
||||||
|
|
||||||
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
||||||
log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up"
|
FIX_ATTEMPTS=$(ci_fix_count "$EXISTING_PR")
|
||||||
|
if [ "$FIX_ATTEMPTS" -ge 3 ]; then
|
||||||
|
log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — exhausted ${FIX_ATTEMPTS} attempts, escalated (not blocking pipeline)"
|
||||||
|
echo "{\"issue\":${ISSUE_NUM},\"pr\":${EXISTING_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
||||||
|
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
|
||||||
|
matrix_send "dev" "🚨 PR #${EXISTING_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
|
||||||
|
# Don't add to WAITING_PRS — escalated PRs should not block new work
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt $((FIX_ATTEMPTS+1))/3)"
|
||||||
READY_ISSUE="$ISSUE_NUM"
|
READY_ISSUE="$ISSUE_NUM"
|
||||||
|
READY_PR_FOR_INCREMENT="$EXISTING_PR"
|
||||||
break
|
break
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|
@ -363,6 +419,11 @@ fi
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# LAUNCH: start dev-agent for the ready issue
|
# LAUNCH: start dev-agent for the ready issue
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
# Deferred CI fix increment — only now that we're actually launching
|
||||||
|
if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then
|
||||||
|
ci_fix_increment "$READY_PR_FOR_INCREMENT"
|
||||||
|
fi
|
||||||
|
|
||||||
log "launching dev-agent for #${READY_ISSUE}"
|
log "launching dev-agent for #${READY_ISSUE}"
|
||||||
matrix_send "dev" "🚀 Starting dev-agent on issue #${READY_ISSUE}" 2>/dev/null || true
|
matrix_send "dev" "🚀 Starting dev-agent on issue #${READY_ISSUE}" 2>/dev/null || true
|
||||||
rm -f "$PREFLIGHT_RESULT"
|
rm -f "$PREFLIGHT_RESULT"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue