Merge pull request 'fix: fix: idle timeout does not escalate — session dies silently (#123)' (#128) from fix/issue-123 into main

This commit is contained in:
johba 2026-03-18 08:29:25 +01:00
commit bf337aba20
3 changed files with 55 additions and 5 deletions

2
.gitignore vendored
View file

@@ -11,7 +11,7 @@ RESOURCES.md
state.json state.json
*.lock *.lock
*.pid *.pid
escalations.jsonl escalations*.jsonl
metrics/supervisor-metrics.jsonl metrics/supervisor-metrics.jsonl
# OS # OS

View file

@@ -968,14 +968,30 @@ Phase file: ${PHASE_FILE}"
# No phase change — check idle timeout # No phase change — check idle timeout
if [ "$IDLE_ELAPSED" -ge "$IDLE_TIMEOUT" ]; then if [ "$IDLE_ELAPSED" -ge "$IDLE_TIMEOUT" ]; then
log "TIMEOUT: no phase update for ${IDLE_TIMEOUT}s — killing session" log "TIMEOUT: no phase update for ${IDLE_TIMEOUT}s — killing session"
notify "session idle for 2h — killed" notify_ctx \
"session idle for 2h — killed. Escalating to gardener." \
"session idle for 2h — killed. Escalating to gardener.${PR_NUMBER:+ PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
kill_tmux_session kill_tmux_session
# Escalate: write to project-suffixed escalation file so gardener picks it up
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"idle_timeout\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
# Restore labels: remove in-progress, add backlog
cleanup_labels cleanup_labels
curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${API}/issues/${ISSUE}/labels" \
-d '{"labels":["backlog"]}' >/dev/null 2>&1 || true
CLAIMED=false # Don't unclaim again in cleanup()
if [ -n "${PR_NUMBER:-}" ]; then if [ -n "${PR_NUMBER:-}" ]; then
log "keeping worktree (PR #${PR_NUMBER} still open)" log "keeping worktree (PR #${PR_NUMBER} still open)"
else else
cleanup_worktree cleanup_worktree
fi fi
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE"
break break
fi fi
continue continue
@@ -1138,7 +1154,7 @@ Write PHASE:awaiting_review to the phase file, then stop and wait for review fee
if [ "$CI_FIX_COUNT" -gt "$MAX_CI_FIXES" ]; then if [ "$CI_FIX_COUNT" -gt "$MAX_CI_FIXES" ]; then
log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — escalating" log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — escalating"
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER},\"reason\":\"ci_exhausted\",\"step\":\"${FAILED_STEP:-unknown}\",\"attempts\":${CI_FIX_COUNT},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER},\"reason\":\"ci_exhausted\",\"step\":\"${FAILED_STEP:-unknown}\",\"attempts\":${CI_FIX_COUNT},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl" >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
notify_ctx \ notify_ctx \
"CI exhausted after ${CI_FIX_COUNT} attempts — escalated to supervisor" \ "CI exhausted after ${CI_FIX_COUNT} attempts — escalated to supervisor" \
"CI exhausted after ${CI_FIX_COUNT} attempts on PR <a href='${PR_URL:-${CODEBERG_WEB}/pulls/${PR_NUMBER}}'>#${PR_NUMBER}</a> | <a href='${_ci_pipeline_url}'>Pipeline</a><br>Step: <code>${FAILED_STEP:-unknown}</code> — escalated to supervisor" "CI exhausted after ${CI_FIX_COUNT} attempts on PR <a href='${PR_URL:-${CODEBERG_WEB}/pulls/${PR_NUMBER}}'>#${PR_NUMBER}</a> | <a href='${_ci_pipeline_url}'>Pipeline</a><br>Step: <code>${FAILED_STEP:-unknown}</code> — escalated to supervisor"
@@ -1462,7 +1478,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
"❌ Issue #${ISSUE} session failed: ${FAILURE_REASON}" \ "❌ Issue #${ISSUE} session failed: ${FAILURE_REASON}" \
"❌ <a href='${CODEBERG_WEB}/issues/${ISSUE}'>Issue #${ISSUE}</a> session failed: ${FAILURE_REASON}${PR_NUMBER:+ | PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}" "❌ <a href='${CODEBERG_WEB}/issues/${ISSUE}'>Issue #${ISSUE}</a> session failed: ${FAILURE_REASON}${PR_NUMBER:+ | PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${FAILURE_REASON}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${FAILURE_REASON}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl" >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
# Restore backlog label # Restore backlog label
cleanup_labels cleanup_labels

View file

@@ -990,13 +990,47 @@ if [ -s "$ESCALATION_FILE" ]; then
ESC_ISSUE=$(echo "$esc_entry" | jq -r '.issue // empty') ESC_ISSUE=$(echo "$esc_entry" | jq -r '.issue // empty')
ESC_PR=$(echo "$esc_entry" | jq -r '.pr // empty') ESC_PR=$(echo "$esc_entry" | jq -r '.pr // empty')
ESC_ATTEMPTS=$(echo "$esc_entry" | jq -r '.attempts // 3') ESC_ATTEMPTS=$(echo "$esc_entry" | jq -r '.attempts // 3')
ESC_REASON=$(echo "$esc_entry" | jq -r '.reason // empty')
if [ -z "$ESC_ISSUE" ] || [ -z "$ESC_PR" ]; then if [ -z "$ESC_ISSUE" ] || [ -z "$ESC_PR" ]; then
echo "$esc_entry" >> "$ESCALATION_DONE" echo "$esc_entry" >> "$ESCALATION_DONE"
continue continue
fi fi
log "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} (${ESC_ATTEMPTS} CI attempt(s))" log "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} reason=${ESC_REASON} (${ESC_ATTEMPTS} CI attempt(s))"
# Handle idle_timeout escalations — no CI steps to inspect, just notify
if [[ "$ESC_REASON" == idle_timeout* ]]; then
_issue_url="https://codeberg.org/${CODEBERG_REPO}/issues/${ESC_ISSUE}"
sub_title="chore: investigate idle timeout for issue #${ESC_ISSUE}"
sub_body="## Dev-agent idle timeout
The dev-agent session for issue #${ESC_ISSUE} was idle for 2h without a phase update and was killed.$([ "${ESC_PR:-0}" != "0" ] && printf '\n\nPR #%s may still be open.' "$ESC_PR")
### What to check
1. Was the agent stuck waiting for input? Check the issue spec for ambiguity.
2. Was there an infrastructure issue (tmux crash, disk full, etc.)?
3. Re-run the issue by restoring the \`backlog\` label if the spec is clear.
### Context
- Issue: [#${ESC_ISSUE}](${_issue_url})$([ "${ESC_PR:-0}" != "0" ] && printf '\n- PR: #%s' "$ESC_PR")"
new_issue=$(curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${CODEBERG_API}/issues" \
-d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \
'{"title":$t,"body":$b,"labels":["backlog"]}')" 2>/dev/null | jq -r '.number // ""') || true
if [ -n "$new_issue" ]; then
log "Created idle-timeout sub-issue #${new_issue} for #${ESC_ISSUE}"
_esc_total_created=$((_esc_total_created + 1))
matrix_send "gardener" "⏱ Created #${new_issue}: idle timeout on #${ESC_ISSUE}" 2>/dev/null || true
fi
echo "$esc_entry" >> "$ESCALATION_DONE"
continue
fi
# Fetch PR metadata (SHA, mergeable status) # Fetch PR metadata (SHA, mergeable status)
ESC_PR_DATA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \ ESC_PR_DATA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \