fix: refactor: replace escalation JSONL with blocked label + diagnostic comment (#352)

Replace the unreliable escalation JSONL system (supervisor/escalations-*.jsonl consumed by gardener) with direct blocked label + diagnostic comment on the original issue.

When a dev-agent or action-agent session fails (PHASE:failed, idle timeout, crash, CI exhausted):
- Capture last 50 lines from tmux pane via tmux capture-pane
- Post a structured diagnostic comment on the issue (exit reason, timestamp, PR number, tmux output)
- Label the issue "blocked" (instead of restoring "backlog")
- Remove in-progress label

Removed:
- Escalation JSONL write paths in dev-agent.sh, phase-handler.sh, dev-poll.sh, action-agent.sh
- is_escalated() helper in dev-poll.sh
- Escalation triage (P2f section) in supervisor-poll.sh
- Escalation processing + recipe engine in gardener-poll.sh
- ci-escalation-recipes step from run-gardener.toml formula
- escalations*.jsonl from .gitignore

Added:
- post_blocked_diagnostic() shared helper in phase-handler.sh
- ensure_blocked_label_id() helper (creates label via API if not exists)
- is_blocked() helper in dev-poll.sh (replaces is_escalated)
- Blocked issues listing in supervisor/preflight.sh

Kept:
- Matrix notifications on failure (unchanged)
- CI fix counter logic (still tracks attempts)
- needs_human injection in supervisor/gardener (not escalation-related)
- Gardener grooming (gardener-agent.sh still invoked)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 04:18:43 +00:00 · 2026-03-21 04:18:43 +00:00 · 61c44d31b1
commit 61c44d31b1
parent 0109f0b0c3
10 changed files with 181 additions and 990 deletions
--- a/supervisor/supervisor-poll.sh
+++ b/supervisor/supervisor-poll.sh
@@ -231,26 +231,6 @@ for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
  fi
 done

-# Report pending escalations (processing has moved to gardener-poll.sh per-project)
-for _esc_file in "${FACTORY_ROOT}/supervisor/escalations-"*.jsonl; do
-  [ -f "$_esc_file" ] || continue
-  [[ "$_esc_file" == *.done.jsonl ]] && continue
-  _esc_count=$(wc -l < "$_esc_file" 2>/dev/null || true)
-  [ "${_esc_count:-0}" -gt 0 ] || continue
-  _esc_proj=$(basename "$_esc_file" .jsonl)
-  _esc_proj="${_esc_proj#escalations-}"
-  flog "${_esc_proj}: ${_esc_count} escalation(s) pending (gardener will process)"
-done
-
-# Pick up escalation resolutions handled by gardener
-_gesc_log="${FACTORY_ROOT}/supervisor/gardener-esc-resolved.log"
-if [ -f "$_gesc_log" ]; then
-  while IFS=' ' read -r _gn _gp; do
-    [ -n "${_gn:-}" ] && fixed "${_gp:-unknown}: gardener created ${_gn} sub-issue(s) from escalations"
-  done < "$_gesc_log"
-  rm -f "$_gesc_log"
-fi
-
 # #############################################################################
 #                      LAYER 2: PER-PROJECT CHECKS
 #               (iterated over projects/*.toml, config-driven)
@@ -342,149 +322,6 @@ check_project() {
    find "$_RETRY_DIR" -type f -mmin +1440 -delete 2>/dev/null || true
  fi

-  # ===========================================================================
-  # P2f: ESCALATION TRIAGE — auto-retrigger ci_exhausted if infra-only
-  # ===========================================================================
-  if [ "${CHECK_INFRA_RETRY:-true}" = "true" ]; then
-    status "P2f: ${proj_name}: triaging ci_exhausted escalations"
-
-    _esc_file="${FACTORY_ROOT}/supervisor/escalations-${proj_name}.jsonl"
-    if [ -f "$_esc_file" ] && [ -s "$_esc_file" ]; then
-      _esc_tmp="${_esc_file}.sup.$$"
-      : > "$_esc_tmp"
-
-      while IFS= read -r _esc_line; do
-        [ -z "$_esc_line" ] && continue
-
-        _esc_reason=$(printf '%s' "$_esc_line" | jq -r '.reason // ""' 2>/dev/null)
-
-        # Only triage ci_exhausted entries (from dev-agent or dev-poll)
-        case "$_esc_reason" in
-          ci_exhausted)      ;;
-          ci_exhausted_poll) ;;
-          *) printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue ;;
-        esac
-
-        _esc_pr=$(printf '%s' "$_esc_line" | jq -r '.pr // 0' 2>/dev/null)
-        _esc_issue=$(printf '%s' "$_esc_line" | jq -r '.issue // 0' 2>/dev/null)
-        _esc_ts=$(printf '%s' "$_esc_line" | jq -r '.ts // ""' 2>/dev/null)
-
-        # Validate pr/issue are numeric
-        [[ "$_esc_pr" =~ ^[0-9]+$ ]] || { printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue; }
-        [[ "$_esc_issue" =~ ^[0-9]+$ ]] || { printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue; }
-
-        # Cooldown: 30 min from last escalation timestamp
-        _esc_epoch=0
-        [ -n "$_esc_ts" ] && _esc_epoch=$(date -d "$_esc_ts" +%s 2>/dev/null || echo 0)
-        _esc_age_min=$(( ($(date +%s) - _esc_epoch) / 60 ))
-
-        if [ "$_esc_age_min" -lt 30 ]; then
-          flog "${proj_name}: PR #${_esc_pr} ci_exhausted cooldown (${_esc_age_min}/30min)"
-          printf '%s\n' "$_esc_line" >> "$_esc_tmp"
-          continue
-        fi
-
-        # Get the PR's branch and state from Codeberg
-        _esc_pr_json=$(codeberg_api GET "/pulls/${_esc_pr}" 2>/dev/null) || {
-          flog "${proj_name}: PR #${_esc_pr}: failed to fetch PR info, keeping escalation"
-          printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
-        }
-        _esc_pr_state=$(printf '%s' "$_esc_pr_json" | jq -r '.state // ""' 2>/dev/null)
-        if [ "$_esc_pr_state" != "open" ]; then
-          flog "${proj_name}: PR #${_esc_pr} is ${_esc_pr_state:-unknown} — discarding stale escalation"
-          continue  # PR merged/closed externally; escalation no longer actionable
-        fi
-        _esc_branch=$(printf '%s' "$_esc_pr_json" | jq -r '.head.ref // ""' 2>/dev/null)
-        if [ -z "$_esc_branch" ]; then
-          printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
-        fi
-
-        # Validate branch name to prevent SQL injection
-        if ! [[ "$_esc_branch" =~ ^[a-zA-Z0-9/_.-]+$ ]]; then
-          flog "${proj_name}: PR #${_esc_pr}: unsafe branch name, keeping escalation"
-          printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
-        fi
-
-        # Find the latest failed pipeline for this PR's branch via Woodpecker DB
-        _esc_pip=$(wpdb -A -c "
-          SELECT number FROM pipelines
-          WHERE repo_id = ${WOODPECKER_REPO_ID}
-            AND branch = '${_esc_branch}'
-            AND status IN ('failure', 'error')
-            AND finished > 0
-          ORDER BY number DESC LIMIT 1;" 2>/dev/null \
-          | tr -d ' ' | grep -E '^[0-9]+$' | head -1 || true)
-
-        if [ -z "$_esc_pip" ]; then
-          flog "${proj_name}: PR #${_esc_pr}: no failed pipeline for branch ${_esc_branch}, keeping escalation"
-          printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
-        fi
-
-        # Classify failure type via ci-helpers
-        _esc_failure=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_esc_pip" 2>/dev/null || echo "code")
-
-        if [[ "$_esc_failure" != infra* ]]; then
-          flog "${proj_name}: PR #${_esc_pr} pipeline #${_esc_pip}: code failure — leaving escalation for human"
-          printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
-        fi
-
-        # Infra-only — push empty commit to retrigger CI via temporary worktree
-        _esc_wt="/tmp/${proj_name}-sup-retry-${_esc_pr}"
-        _esc_retrigger_ok=false
-        if [ -d "${PROJECT_REPO_ROOT:-}" ]; then
-          # Clean up any leftover temp worktree from a previous failed run
-          git -C "${PROJECT_REPO_ROOT}" worktree remove --force "${_esc_wt}" 2>/dev/null || true
-
-          if git -C "${PROJECT_REPO_ROOT}" fetch origin "${_esc_branch}" --quiet 2>/dev/null && \
-             git -C "${PROJECT_REPO_ROOT}" worktree add --quiet --detach \
-               "${_esc_wt}" "origin/${_esc_branch}" 2>/dev/null; then
-            if git -C "${_esc_wt}" \
-                 -c user.email="supervisor@factory" \
-                 -c user.name="Supervisor" \
-                 commit --allow-empty --no-verify \
-                 -m "chore: retrigger CI after infra-only exhaustion" \
-                 --quiet 2>/dev/null && \
-               git -C "${_esc_wt}" push origin \
-                 "HEAD:refs/heads/${_esc_branch}" --quiet 2>/dev/null; then
-              _esc_retrigger_ok=true
-            fi
-            git -C "${PROJECT_REPO_ROOT}" worktree remove --force "${_esc_wt}" 2>/dev/null || true
-          fi
-        fi
-
-        if [ "$_esc_retrigger_ok" = true ]; then
-          # Reset CI fix counter so dev-poll can spawn the agent again if needed
-          _ci_fix_file="${FACTORY_ROOT}/dev/ci-fixes-${proj_name}.json"
-          _ci_fix_lock="${_ci_fix_file}.lock"
-          flock "$_ci_fix_lock" python3 -c "
-import json, os
-f='${_ci_fix_file}'
-if not os.path.exists(f):
-    exit()
-d = json.load(open(f))
-d.pop(str(${_esc_pr}), None)
-json.dump(d, open(f, 'w'))
-" 2>/dev/null || true
-
-          fixed "${proj_name}: auto-retriggered CI for PR #${_esc_pr} after infra-only exhaustion"
-          flog "${proj_name}: auto-retriggered CI for PR #${_esc_pr} (issue #${_esc_issue}) after infra-only exhaustion"
-          matrix_send "supervisor" "♻️ auto-retriggered CI for PR #${_esc_pr} (issue #${_esc_issue}) after infra-only exhaustion" 2>/dev/null || true
-          # Escalation removed — do NOT write to _esc_tmp
-        else
-          p2 "${proj_name}: PR #${_esc_pr}: infra-only CI exhaustion but retrigger push failed"
-          # Bump timestamp to now so the 30-min cooldown resets; prevents alert flood
-          # on persistent push failures (SSH key issue, Codeberg outage, etc.)
-          _esc_now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
-          _esc_bumped=$(printf '%s' "$_esc_line" | jq -c --arg ts "$_esc_now" '.ts = $ts' 2>/dev/null \
-            || printf '%s' "$_esc_line")
-          printf '%s\n' "$_esc_bumped" >> "$_esc_tmp"
-        fi
-      done < "$_esc_file"
-
-      mv "$_esc_tmp" "$_esc_file"
-    fi
-  fi
-
  # Dev-agent health (only if monitoring enabled)
  if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
    DEV_LOCK="/tmp/dev-agent-${PROJECT_NAME}.lock"