fix: refactor: replace escalation JSONL with blocked label + diagnostic comment (#352)

Replace the unreliable escalation JSONL system (supervisor/escalations-*.jsonl
consumed by gardener) with direct blocked label + diagnostic comment on the
original issue.

When a dev-agent or action-agent session fails (PHASE:failed, idle timeout,
crash, CI exhausted):
- Capture last 50 lines from tmux pane via tmux capture-pane
- Post a structured diagnostic comment on the issue (exit reason, timestamp,
  PR number, tmux output)
- Label the issue "blocked" (instead of restoring "backlog")
- Remove in-progress label

Removed:
- Escalation JSONL write paths in dev-agent.sh, phase-handler.sh, dev-poll.sh,
  action-agent.sh
- is_escalated() helper in dev-poll.sh
- Escalation triage (P2f section) in supervisor-poll.sh
- Escalation processing + recipe engine in gardener-poll.sh
- ci-escalation-recipes step from run-gardener.toml formula
- escalations*.jsonl from .gitignore

Added:
- post_blocked_diagnostic() shared helper in phase-handler.sh
- ensure_blocked_label_id() helper (creates label via API if not exists)
- is_blocked() helper in dev-poll.sh (replaces is_escalated)
- Blocked issues listing in supervisor/preflight.sh

Kept:
- Matrix notifications on failure (unchanged)
- CI fix counter logic (still tracks attempts)
- needs_human injection in supervisor/gardener (not escalation-related)
- Gardener grooming (gardener-agent.sh still invoked)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-21 04:18:43 +00:00
parent 0109f0b0c3
commit 61c44d31b1
10 changed files with 181 additions and 990 deletions

View file

@ -749,18 +749,15 @@ case "${_MONITOR_LOOP_EXIT:-}" in
idle_timeout|idle_prompt)
if [ "${_MONITOR_LOOP_EXIT:-}" = "idle_prompt" ]; then
notify_ctx \
"session finished without phase signal — killed. Escalating to gardener." \
"session finished without phase signal — killed. Escalating to gardener.${PR_NUMBER:+ PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
"session finished without phase signal — killed. Marking blocked." \
"session finished without phase signal — killed. Marking blocked.${PR_NUMBER:+ PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
else
notify_ctx \
"session idle for 2h — killed. Escalating to gardener." \
"session idle for 2h — killed. Escalating to gardener.${PR_NUMBER:+ PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
"session idle for 2h — killed. Marking blocked." \
"session idle for 2h — killed. Marking blocked.${PR_NUMBER:+ PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
fi
# Escalate: write to project-suffixed escalation file so gardener picks it up
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${_MONITOR_LOOP_EXIT:-idle_timeout}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
# Restore labels: remove in-progress, add backlog
restore_to_backlog
# Post diagnostic comment + label issue blocked (replaces escalation JSONL)
post_blocked_diagnostic "${_MONITOR_LOOP_EXIT:-idle_timeout}"
if [ -n "${PR_NUMBER:-}" ]; then
log "keeping worktree (PR #${PR_NUMBER} still open)"
else
@ -773,8 +770,8 @@ case "${_MONITOR_LOOP_EXIT:-}" in
;;
crashed)
# Belt-and-suspenders: _on_phase_change(PHASE:crashed) handles primary
# cleanup (escalation, notification, labels, worktree, files).
restore_to_backlog
# cleanup (diagnostic comment, blocked label, worktree, files).
post_blocked_diagnostic "crashed"
;;
done)
# Belt-and-suspenders: callback in phase-handler.sh handles primary cleanup,

View file

@ -77,40 +77,59 @@ else:
" 2>/dev/null || echo "exhausted:99"
}
# Check whether an issue/PR has been escalated (unprocessed or processed)
is_escalated() {
local issue="$1" pr="$2"
python3 -c "
import json, sys
try:
issue, pr = int('${issue}'), int('${pr}')
except (ValueError, TypeError):
sys.exit(1)
for path in ['${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl',
'${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.done.jsonl']:
try:
with open(path) as fh:
for line in fh:
line = line.strip()
if not line:
continue
d = json.loads(line)
if d.get('issue') == issue and d.get('pr') == pr:
sys.exit(0)
except OSError:
pass
sys.exit(1)
" 2>/dev/null && return 0 || return 1
# Check whether an issue already has the "blocked" label
is_blocked() {
local issue="$1"
codeberg_api GET "/issues/${issue}/labels" 2>/dev/null \
| jq -e '.[] | select(.name == "blocked")' >/dev/null 2>&1
}
# Post a CI-exhaustion diagnostic comment and label issue as blocked.
# Args: issue_num pr_num attempts
_post_ci_blocked_comment() {
local issue_num="$1" pr_num="$2" attempts="$3"
local blocked_id
blocked_id=$(codeberg_api GET "/labels" 2>/dev/null \
| jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || true)
if [ -z "$blocked_id" ]; then
blocked_id=$(curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${CODEBERG_API}/labels" \
-d '{"name":"blocked","color":"#e11d48"}' 2>/dev/null \
| jq -r '.id // empty' 2>/dev/null || true)
fi
[ -z "$blocked_id" ] && return 0
local comment
comment="### Session failure diagnostic
| Field | Value |
|---|---|
| Exit reason | \`ci_exhausted_poll (${attempts} attempts)\` |
| Timestamp | \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\` |
| PR | #${pr_num} |"
curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${CODEBERG_API}/issues/${issue_num}/comments" \
-d "$(jq -nc --arg b "$comment" '{body:$b}')" >/dev/null 2>&1 || true
curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${CODEBERG_API}/issues/${issue_num}/labels" \
-d "{\"labels\":[${blocked_id}]}" >/dev/null 2>&1 || true
}
# =============================================================================
# HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites)
# HELPER: handle CI-exhaustion check/block (DRY for 3 call sites)
# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
#
# Pass "check_only" as third arg for the backlog scan path: ok-counts are
# returned without incrementing (deferred to launch time so a WAITING_PRS
# exit cannot waste a fix attempt). The 3→4 sentinel bump is always atomic
# regardless of mode, preventing duplicate escalation writes from concurrent
# regardless of mode, preventing duplicate blocked labels from concurrent
# pollers.
# =============================================================================
handle_ci_exhaustion() {
@ -118,18 +137,18 @@ handle_ci_exhaustion() {
local check_only="${3:-}"
local result
# Fast path: already in the escalation file — skip without touching counter.
if is_escalated "$issue_num" "$pr_num"; then
# Fast path: already blocked — skip without touching counter.
if is_blocked "$issue_num"; then
CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
log "PR #${pr_num} (issue #${issue_num}) already escalated (${CI_FIX_ATTEMPTS} attempts) — skipping"
log "PR #${pr_num} (issue #${issue_num}) already blocked (${CI_FIX_ATTEMPTS} attempts) — skipping"
return 0
fi
# Single flock-protected call: read + threshold-check + conditional bump.
# In check_only mode, ok-counts are returned without incrementing (deferred
# to launch time). In both modes, the 3→4 sentinel bump is atomic, so only
# one concurrent poller can ever receive exhausted_first_time:3 and write
# the escalation entry.
# one concurrent poller can ever receive exhausted_first_time:3 and label
# the issue blocked.
result=$(ci_fix_check_and_increment "$pr_num" "$check_only")
case "$result" in
ok:*)
@ -138,18 +157,17 @@ handle_ci_exhaustion() {
;;
exhausted_first_time:*)
CI_FIX_ATTEMPTS="${result#exhausted_first_time:}"
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — marking blocked"
_post_ci_blocked_comment "$issue_num" "$pr_num" "$CI_FIX_ATTEMPTS"
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — marked blocked" 2>/dev/null || true
;;
exhausted:*)
CI_FIX_ATTEMPTS="${result#exhausted:}"
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — already blocked, skipping"
;;
*)
CI_FIX_ATTEMPTS=99
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — already blocked, skipping"
;;
esac
return 0

View file

@ -34,6 +34,86 @@
: "${CLAIMED:=false}"
: "${PHASE_POLL_INTERVAL:=30}"
# --- Look up (or create) the "blocked" label ID ---
ensure_blocked_label_id() {
if [ -n "${_BLOCKED_LABEL_ID:-}" ]; then
printf '%s' "$_BLOCKED_LABEL_ID"
return 0
fi
_BLOCKED_LABEL_ID=$(codeberg_api GET "/labels" 2>/dev/null \
| jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || true)
if [ -z "$_BLOCKED_LABEL_ID" ]; then
_BLOCKED_LABEL_ID=$(curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${CODEBERG_API}/labels" \
-d '{"name":"blocked","color":"#e11d48"}' 2>/dev/null \
| jq -r '.id // empty' 2>/dev/null || true)
fi
printf '%s' "$_BLOCKED_LABEL_ID"
}
# --- Post diagnostic comment + label issue as blocked ---
# Replaces the old escalation JSONL write path.
# Captures tmux pane output, posts a structured comment on the issue, removes
# in-progress label, and adds the "blocked" label.
#
# Args: reason [session_name]
# Uses globals: ISSUE, SESSION_NAME, PR_NUMBER, CODEBERG_TOKEN, API
post_blocked_diagnostic() {
local reason="$1"
local session="${2:-${SESSION_NAME:-}}"
# Capture last 50 lines from tmux pane (before kill)
local tmux_output=""
if [ -n "$session" ] && tmux has-session -t "$session" 2>/dev/null; then
tmux_output=$(tmux capture-pane -p -t "$session" -S -50 2>/dev/null || true)
fi
# Build diagnostic comment body
local comment
comment="### Session failure diagnostic
| Field | Value |
|---|---|
| Exit reason | \`${reason}\` |
| Timestamp | \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\` |"
[ -n "${PR_NUMBER:-}" ] && [ "${PR_NUMBER:-0}" != "0" ] && \
comment="${comment}
| PR | #${PR_NUMBER} |"
if [ -n "$tmux_output" ]; then
comment="${comment}
<details><summary>Last 50 lines from tmux pane</summary>
\`\`\`
${tmux_output}
\`\`\`
</details>"
fi
# Post comment to issue
curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${API}/issues/${ISSUE}/comments" \
-d "$(jq -nc --arg b "$comment" '{body:$b}')" >/dev/null 2>&1 || true
# Remove in-progress, add blocked
cleanup_labels
local blocked_id
blocked_id=$(ensure_blocked_label_id)
if [ -n "$blocked_id" ]; then
curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${API}/issues/${ISSUE}/labels" \
-d "{\"labels\":[${blocked_id}]}" >/dev/null 2>&1 || true
fi
CLAIMED=false
}
# --- Build phase protocol prompt (shared across agents) ---
# Generates the phase-signaling instructions for Claude prompts.
# Args: phase_file summary_file branch
@ -319,12 +399,11 @@ Write PHASE:awaiting_review to the phase file, then stop and wait for review fee
CI_FIX_COUNT=$(( CI_FIX_COUNT + 1 ))
_ci_pipeline_url="${WOODPECKER_SERVER}/repos/${WOODPECKER_REPO_ID}/pipeline/${PIPELINE_NUM:-0}"
if [ "$CI_FIX_COUNT" -gt "$MAX_CI_FIXES" ]; then
log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — escalating"
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER},\"reason\":\"ci_exhausted\",\"step\":\"${FAILED_STEP:-unknown}\",\"attempts\":${CI_FIX_COUNT},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — marking blocked"
post_blocked_diagnostic "ci_exhausted after ${CI_FIX_COUNT} attempts (step: ${FAILED_STEP:-unknown})"
notify_ctx \
"CI exhausted after ${CI_FIX_COUNT} attempts — escalated to supervisor" \
"CI exhausted after ${CI_FIX_COUNT} attempts on PR <a href='${PR_URL:-${CODEBERG_WEB}/pulls/${PR_NUMBER}}'>#${PR_NUMBER}</a> | <a href='${_ci_pipeline_url}'>Pipeline</a><br>Step: <code>${FAILED_STEP:-unknown}</code> — escalated to supervisor"
"CI exhausted after ${CI_FIX_COUNT} attempts — issue marked blocked" \
"CI exhausted after ${CI_FIX_COUNT} attempts on PR <a href='${PR_URL:-${CODEBERG_WEB}/pulls/${PR_NUMBER}}'>#${PR_NUMBER}</a> | <a href='${_ci_pipeline_url}'>Pipeline</a><br>Step: <code>${FAILED_STEP:-unknown}</code> — issue marked blocked"
printf 'PHASE:failed\nReason: ci_exhausted after %d attempts\n' "$CI_FIX_COUNT" > "$PHASE_FILE"
# Do NOT update LAST_PHASE_MTIME here — let the main loop detect PHASE:failed
return 0
@ -685,23 +764,13 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
return 1
else
# Genuine unrecoverable failure — escalate to supervisor
# Genuine unrecoverable failure — label blocked with diagnostic
log "session failed: ${FAILURE_REASON}"
notify_ctx \
"❌ Issue #${ISSUE} session failed: ${FAILURE_REASON}" \
"❌ <a href='${CODEBERG_WEB}/issues/${ISSUE}'>Issue #${ISSUE}</a> session failed: ${FAILURE_REASON}${PR_NUMBER:+ | PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${FAILURE_REASON}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
post_blocked_diagnostic "$FAILURE_REASON"
# Restore backlog label
cleanup_labels
curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${API}/issues/${ISSUE}/labels" \
-d "{\"labels\":[${BACKLOG_LABEL_ID}]}" >/dev/null 2>&1 || true
CLAIMED=false # Don't unclaim again in cleanup()
agent_kill_session "$SESSION_NAME"
if [ -n "${PR_NUMBER:-}" ]; then
log "keeping worktree (PR #${PR_NUMBER} still open)"
@ -715,18 +784,14 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000)
fi
# ── PHASE: crashed ──────────────────────────────────────────────────────────
# Session died unexpectedly (OOM kill, tmux crash, etc.). Escalate to
# supervisor and restore issue to backlog so it can be retried.
# Session died unexpectedly (OOM kill, tmux crash, etc.). Label blocked with
# diagnostic comment so humans can triage directly on the issue.
elif [ "$phase" = "PHASE:crashed" ]; then
log "session crashed for issue #${ISSUE}"
notify_ctx \
"session crashed unexpectedly — escalating" \
"session crashed unexpectedly — escalating${PR_NUMBER:+ | PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"crashed\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
# Restore backlog label, clean up worktree + temp files
restore_to_backlog
"session crashed unexpectedly — marking blocked" \
"session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR <a href='${CODEBERG_WEB}/pulls/${PR_NUMBER}'>#${PR_NUMBER}</a>}"
post_blocked_diagnostic "crashed"
[ -z "${PR_NUMBER:-}" ] && cleanup_worktree
[ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)"
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \