fix: idle timeout does not escalate — session dies silently (#123)
1. Timeout handler (dev-agent.sh): write escalation to project-suffixed
file, restore backlog label, clean up phase file on idle timeout.
2. Fix escalation file naming: escalations.jsonl → escalations-${PROJECT_NAME}.jsonl
everywhere in dev-agent.sh so gardener actually picks them up.
3. Gardener (gardener-poll.sh): handle idle_timeout reason before CI-specific
recipe logic — create investigation sub-issue instead of silently returning.
4. Update .gitignore to match new escalations-*.jsonl pattern.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0aec024f78
commit
88f2268bc6
3 changed files with 55 additions and 5 deletions
|
|
@@ -990,13 +990,47 @@ if [ -s "$ESCALATION_FILE" ]; then
|
|||
ESC_ISSUE=$(echo "$esc_entry" | jq -r '.issue // empty')
|
||||
ESC_PR=$(echo "$esc_entry" | jq -r '.pr // empty')
|
||||
ESC_ATTEMPTS=$(echo "$esc_entry" | jq -r '.attempts // 3')
|
||||
ESC_REASON=$(echo "$esc_entry" | jq -r '.reason // empty')
|
||||
|
||||
if [ -z "$ESC_ISSUE" ] || [ -z "$ESC_PR" ]; then
|
||||
echo "$esc_entry" >> "$ESCALATION_DONE"
|
||||
continue
|
||||
fi
|
||||
|
||||
log "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} (${ESC_ATTEMPTS} CI attempt(s))"
|
||||
log "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} reason=${ESC_REASON} (${ESC_ATTEMPTS} CI attempt(s))"
|
||||
|
||||
# Handle idle_timeout escalations — no CI steps to inspect, just notify
|
||||
if [[ "$ESC_REASON" == idle_timeout* ]]; then
|
||||
_issue_url="${CODEBERG_WEB}/issues/${ESC_ISSUE}"
|
||||
sub_title="fix: investigate idle timeout for issue #${ESC_ISSUE}"
|
||||
sub_body="## Dev-agent idle timeout
|
||||
|
||||
The dev-agent session for issue #${ESC_ISSUE} was idle for 2h without a phase update and was killed.$([ "${ESC_PR:-0}" != "0" ] && printf '\n\nPR #%s may still be open.' "$ESC_PR")
|
||||
|
||||
### What to check
|
||||
1. Was the agent stuck waiting for input? Check the issue spec for ambiguity.
|
||||
2. Was there an infrastructure issue (tmux crash, disk full, etc.)?
|
||||
3. Re-run the issue by restoring the \`backlog\` label if the spec is clear.
|
||||
|
||||
### Context
|
||||
- Issue: [#${ESC_ISSUE}](${_issue_url})$([ "${ESC_PR:-0}" != "0" ] && printf '\n- PR: #%s' "$ESC_PR")"
|
||||
|
||||
new_issue=$(curl -sf -X POST \
|
||||
-H "Authorization: token ${CODEBERG_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${CODEBERG_API}/issues" \
|
||||
-d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \
|
||||
'{"title":$t,"body":$b,"labels":["backlog"]}')" 2>/dev/null | jq -r '.number // ""') || true
|
||||
|
||||
if [ -n "$new_issue" ]; then
|
||||
log "Created idle-timeout sub-issue #${new_issue} for #${ESC_ISSUE}"
|
||||
_esc_total_created=$((_esc_total_created + 1))
|
||||
matrix_send "gardener" "Created #${new_issue}: idle timeout on #${ESC_ISSUE}" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
echo "$esc_entry" >> "$ESCALATION_DONE"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Fetch PR metadata (SHA, mergeable status)
|
||||
ESC_PR_DATA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue