diff --git a/.gitignore b/.gitignore
index ca4b688..c3464d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,7 @@ RESOURCES.md
 state.json
 *.lock
 *.pid
-escalations.jsonl
+escalations*.jsonl
 metrics/supervisor-metrics.jsonl
 
 # OS
diff --git a/dev/dev-agent.sh b/dev/dev-agent.sh
index 14ad448..0e43171 100755
--- a/dev/dev-agent.sh
+++ b/dev/dev-agent.sh
@@ -968,14 +968,30 @@ Phase file: ${PHASE_FILE}"
       # No phase change — check idle timeout
       if [ "$IDLE_ELAPSED" -ge "$IDLE_TIMEOUT" ]; then
         log "TIMEOUT: no phase update for ${IDLE_TIMEOUT}s — killing session"
-        notify "session idle for 2h — killed"
+        notify_ctx \
+          "session idle for 2h — killed. Escalating to gardener." \
+          "session idle for 2h — killed. Escalating to gardener.${PR_NUMBER:+ PR #${PR_NUMBER}}"
         kill_tmux_session
+
+        # Escalate: write to project-suffixed escalation file so gardener picks it up
+        echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"idle_timeout\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
+          >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
+
+        # Restore labels: remove in-progress, add backlog
         cleanup_labels
+        curl -sf -X POST \
+          -H "Authorization: token ${CODEBERG_TOKEN}" \
+          -H "Content-Type: application/json" \
+          "${API}/issues/${ISSUE}/labels" \
+          -d '{"labels":["backlog"]}' >/dev/null 2>&1 || true
+
+        CLAIMED=false # Don't unclaim again in cleanup()
         if [ -n "${PR_NUMBER:-}" ]; then
           log "keeping worktree (PR #${PR_NUMBER} still open)"
         else
           cleanup_worktree
         fi
+        rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE"
         break
       fi
       continue
@@ -1138,7 +1154,7 @@ Write PHASE:awaiting_review to the phase file, then stop and wait for review fee
         if [ "$CI_FIX_COUNT" -gt "$MAX_CI_FIXES" ]; then
           log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — escalating"
           echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER},\"reason\":\"ci_exhausted\",\"step\":\"${FAILED_STEP:-unknown}\",\"attempts\":${CI_FIX_COUNT},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
-            >> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
+            >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
           notify_ctx \
             "CI exhausted after ${CI_FIX_COUNT} attempts — escalated to supervisor" \
             "CI exhausted after ${CI_FIX_COUNT} attempts on PR #${PR_NUMBER} | Pipeline Step: ${FAILED_STEP:-unknown} — escalated to supervisor"
@@ -1462,7 +1478,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
       "❌ Issue #${ISSUE} session failed: ${FAILURE_REASON}" \
       "❌ Issue #${ISSUE} session failed: ${FAILURE_REASON}${PR_NUMBER:+ | PR #${PR_NUMBER}}"
     echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${FAILURE_REASON}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
-      >> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
+      >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
 
     # Restore backlog label
     cleanup_labels
diff --git a/gardener/gardener-poll.sh b/gardener/gardener-poll.sh
index e5f7839..4be31d6 100755
--- a/gardener/gardener-poll.sh
+++ b/gardener/gardener-poll.sh
@@ -990,13 +990,58 @@ if [ -s "$ESCALATION_FILE" ]; then
     ESC_ISSUE=$(echo "$esc_entry" | jq -r '.issue // empty')
     ESC_PR=$(echo "$esc_entry" | jq -r '.pr // empty')
     ESC_ATTEMPTS=$(echo "$esc_entry" | jq -r '.attempts // 3')
+    ESC_REASON=$(echo "$esc_entry" | jq -r '.reason // empty')
 
     if [ -z "$ESC_ISSUE" ] || [ -z "$ESC_PR" ]; then
       echo "$esc_entry" >> "$ESCALATION_DONE"
       continue
     fi
 
-    log "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} (${ESC_ATTEMPTS} CI attempt(s))"
+    log "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} reason=${ESC_REASON} (${ESC_ATTEMPTS} CI attempt(s))"
+
+    # Handle idle_timeout escalations — no CI steps to inspect, just notify
+    if [[ "$ESC_REASON" == idle_timeout* ]]; then
+      _issue_url="${CODEBERG_WEB}/issues/${ESC_ISSUE}"
+      sub_title="fix: investigate idle timeout for issue #${ESC_ISSUE}"
+      # Build the optional PR fragments up front. A `[ … ] && printf …` command
+      # substitution embedded in the assignment returns 1 when the test is
+      # false (pr=0), and that status would abort the script under `set -e`.
+      _pr_note=""
+      _pr_ctx=""
+      if [ "${ESC_PR:-0}" != "0" ]; then
+        _pr_note=$'\n\n'"PR #${ESC_PR} may still be open."
+        _pr_ctx=$'\n'"- PR: #${ESC_PR}"
+      fi
+      sub_body="## Dev-agent idle timeout
+
+The dev-agent session for issue #${ESC_ISSUE} was idle for 2h without a phase update and was killed.${_pr_note}
+
+### What to check
+1. Was the agent stuck waiting for input? Check the issue spec for ambiguity.
+2. Was there an infrastructure issue (tmux crash, disk full, etc.)?
+3. Re-run the issue by restoring the \`backlog\` label if the spec is clear.
+
+### Context
+- Issue: [#${ESC_ISSUE}](${_issue_url})${_pr_ctx}"
+
+      new_issue=$(curl -sf -X POST \
+        -H "Authorization: token ${CODEBERG_TOKEN}" \
+        -H "Content-Type: application/json" \
+        "${CODEBERG_API}/issues" \
+        -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \
+          '{"title":$t,"body":$b,"labels":["backlog"]}')" 2>/dev/null | jq -r '.number // ""') || true
+
+      if [ -n "$new_issue" ]; then
+        log "Created idle-timeout sub-issue #${new_issue} for #${ESC_ISSUE}"
+        _esc_total_created=$((_esc_total_created + 1))
+        matrix_send "gardener" "Created #${new_issue}: idle timeout on #${ESC_ISSUE}" 2>/dev/null || true
+      else
+        log "WARNING: could not create idle-timeout sub-issue for #${ESC_ISSUE}"
+      fi
+
+      echo "$esc_entry" >> "$ESCALATION_DONE"
+      continue
+    fi
 
     # Fetch PR metadata (SHA, mergeable status)
     ESC_PR_DATA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \