From 61c44d31b17d8f76eecb68e4c94b917243cdc642 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 21 Mar 2026 04:18:43 +0000 Subject: [PATCH] fix: refactor: replace escalation JSONL with blocked label + diagnostic comment (#352) Replace the unreliable escalation JSONL system (supervisor/escalations-*.jsonl consumed by gardener) with direct blocked label + diagnostic comment on the original issue. When a dev-agent or action-agent session fails (PHASE:failed, idle timeout, crash, CI exhausted): - Capture last 50 lines from tmux pane via tmux capture-pane - Post a structured diagnostic comment on the issue (exit reason, timestamp, PR number, tmux output) - Label the issue "blocked" (instead of restoring "backlog") - Remove in-progress label Removed: - Escalation JSONL write paths in dev-agent.sh, phase-handler.sh, dev-poll.sh, action-agent.sh - is_escalated() helper in dev-poll.sh - Escalation triage (P2f section) in supervisor-poll.sh - Escalation processing + recipe engine in gardener-poll.sh - ci-escalation-recipes step from run-gardener.toml formula - escalations*.jsonl from .gitignore Added: - post_blocked_diagnostic() shared helper in phase-handler.sh - ensure_blocked_label_id() helper (creates label via API if not exists) - is_blocked() helper in dev-poll.sh (replaces is_escalated) - Blocked issues listing in supervisor/preflight.sh Kept: - Matrix notifications on failure (unchanged) - CI fix counter logic (still tracks attempts) - needs_human injection in supervisor/gardener (not escalation-related) - Gardener grooming (gardener-agent.sh still invoked) Co-Authored-By: Claude Opus 4.6 (1M context) --- .gitignore | 1 - AGENTS.md | 10 +- action/action-agent.sh | 10 +- dev/dev-agent.sh | 19 +- dev/dev-poll.sh | 92 +++-- dev/phase-handler.sh | 117 ++++-- formulas/run-gardener.toml | 46 +-- gardener/gardener-poll.sh | 690 +--------------------------------- supervisor/preflight.sh | 23 +- supervisor/supervisor-poll.sh | 163 -------- 10 files changed, 181 insertions(+), 990 deletions(-) mode change 100755 => 100644 gardener/gardener-poll.sh diff --git a/.gitignore b/.gitignore index c3464d8..efd39db 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,6 @@ RESOURCES.md state.json *.lock *.pid -escalations*.jsonl metrics/supervisor-metrics.jsonl # OS diff --git a/AGENTS.md b/AGENTS.md index 5e512dd..1b6480b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -130,9 +130,9 @@ issue is filed against). **Key files**: - `gardener/gardener-run.sh` — Cron wrapper: lock, memory guard, dedup check, files action issue -- `gardener/gardener-poll.sh` — Recipe engine: escalation-reply injection for dev sessions, processes dev-agent CI escalations via recipe engine (invoked by formula step ci-escalation-recipes) +- `gardener/gardener-poll.sh` — Escalation-reply injection for dev sessions, invokes gardener-agent.sh for grooming - `gardener/gardener-agent.sh` — Orchestrator: bash pre-analysis, creates tmux session (`gardener-{project}`) with interactive `claude`, monitors phase file, parses result file (ACTION:/DUST:/ESCALATE), handles dust bundling -- `formulas/run-gardener.toml` — Execution spec: preflight, grooming, blocked-review, CI escalation recipes, agents-update, commit-and-pr +- `formulas/run-gardener.toml` — Execution spec: preflight, grooming, blocked-review, agents-update, commit-and-pr **Environment variables consumed**: - `CODEBERG_TOKEN`, `CODEBERG_REPO`, `CODEBERG_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT` @@ -159,8 +159,8 @@ runs directly from cron like the planner and predictor. `run_formula_and_monitor` - `supervisor/preflight.sh` — Data collection: system resources (RAM, disk, swap, load), Docker status, active tmux sessions + phase files, lock files, agent log - tails, CI pipeline status, open PRs, issue counts, stale worktrees, pending - escalations, Matrix escalation replies + tails, CI pipeline status, open PRs, issue counts, stale worktrees, blocked + issues, Matrix escalation replies - `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review, health-assessment, decide-actions, report, journal) with `needs` dependencies. Claude evaluates all metrics and takes actions in a single interactive session @@ -373,7 +373,7 @@ Issues flow through these states: |---|---|---| | `backlog` | Issue is queued for implementation. Dev-poll picks the first ready one. | Planner, gardener, humans | | `in-progress` | Dev-agent is actively working on this issue. Only one issue per project is in-progress at a time. | dev-agent.sh (claims issue) | -| `blocked` | Issue has unmet dependencies (other open issues). | gardener, supervisor (detected) | +| `blocked` | Issue is stuck — agent session failed, crashed, timed out, or CI exhausted. Diagnostic comment on the issue has details. Also used for unmet dependencies. | dev-agent.sh, action-agent.sh, dev-poll.sh (on failure) | | `tech-debt` | Pre-existing issue flagged by AI reviewer, not introduced by a PR. | review-pr.sh (auto-created follow-ups) | | `underspecified` | Dev-agent refused the issue as too large or vague. | dev-poll.sh (on preflight `too_large`), dev-agent.sh (on mid-run `too_large` refusal) | | `vision` | Goal anchors — high-level objectives from VISION.md. | Planner, humans | diff --git a/action/action-agent.sh b/action/action-agent.sh index d1a4fcb..1c6e3a1 100644 --- a/action/action-agent.sh +++ b/action/action-agent.sh @@ -304,13 +304,12 @@ case "${_MONITOR_LOOP_EXIT:-}" in notify_ctx \ "session idle for $((IDLE_TIMEOUT / 3600))h — killed" \ "session idle for $((IDLE_TIMEOUT / 3600))h — killed" - # Escalate to supervisor (idle_prompt already escalated via _on_phase_change callback) - echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"idle_timeout\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" + # Post diagnostic comment + label blocked (replaces escalation JSONL) + post_blocked_diagnostic "idle_timeout" rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "$SCRATCH_FILE" ;; idle_prompt) - # Notification + escalation already handled by _on_phase_change(PHASE:failed) callback + # Notification + blocked label already handled by _on_phase_change(PHASE:failed) callback rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "$SCRATCH_FILE" ;; PHASE:failed) @@ -319,8 +318,7 @@ case "${_MONITOR_LOOP_EXIT:-}" in notify_ctx \ "session killed — wall-clock cap ($((MAX_LIFETIME / 3600))h) reached" \ "session killed — wall-clock cap ($((MAX_LIFETIME / 3600))h) reached" - echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"max_lifetime\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" + post_blocked_diagnostic "max_lifetime" fi rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "$SCRATCH_FILE" ;; diff --git a/dev/dev-agent.sh b/dev/dev-agent.sh index 477de5f..ec8791e 100755 --- a/dev/dev-agent.sh +++ b/dev/dev-agent.sh @@ -749,18 +749,15 @@ case "${_MONITOR_LOOP_EXIT:-}" in idle_timeout|idle_prompt) if [ "${_MONITOR_LOOP_EXIT:-}" = "idle_prompt" ]; then notify_ctx \ - "session finished without phase signal — killed. Escalating to gardener." \ - "session finished without phase signal — killed. Escalating to gardener.${PR_NUMBER:+ PR #${PR_NUMBER}}" + "session finished without phase signal — killed. Marking blocked." \ + "session finished without phase signal — killed. Marking blocked.${PR_NUMBER:+ PR #${PR_NUMBER}}" else notify_ctx \ - "session idle for 2h — killed. Escalating to gardener." \ - "session idle for 2h — killed. Escalating to gardener.${PR_NUMBER:+ PR #${PR_NUMBER}}" + "session idle for 2h — killed. Marking blocked." \ + "session idle for 2h — killed. Marking blocked.${PR_NUMBER:+ PR #${PR_NUMBER}}" fi - # Escalate: write to project-suffixed escalation file so gardener picks it up - echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${_MONITOR_LOOP_EXIT:-idle_timeout}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" - # Restore labels: remove in-progress, add backlog - restore_to_backlog + # Post diagnostic comment + label issue blocked (replaces escalation JSONL) + post_blocked_diagnostic "${_MONITOR_LOOP_EXIT:-idle_timeout}" if [ -n "${PR_NUMBER:-}" ]; then log "keeping worktree (PR #${PR_NUMBER} still open)" else @@ -773,8 +770,8 @@ case "${_MONITOR_LOOP_EXIT:-}" in ;; crashed) # Belt-and-suspenders: _on_phase_change(PHASE:crashed) handles primary - # cleanup (escalation, notification, labels, worktree, files). - restore_to_backlog + # cleanup (diagnostic comment, blocked label, worktree, files). + post_blocked_diagnostic "crashed" ;; done) # Belt-and-suspenders: callback in phase-handler.sh handles primary cleanup, diff --git a/dev/dev-poll.sh b/dev/dev-poll.sh index fa2f273..e09c49e 100755 --- a/dev/dev-poll.sh +++ b/dev/dev-poll.sh @@ -77,40 +77,59 @@ else: " 2>/dev/null || echo "exhausted:99" } -# Check whether an issue/PR has been escalated (unprocessed or processed) -is_escalated() { - local issue="$1" pr="$2" - python3 -c " -import json, sys -try: - issue, pr = int('${issue}'), int('${pr}') -except (ValueError, TypeError): - sys.exit(1) -for path in ['${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl', - '${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.done.jsonl']: - try: - with open(path) as fh: - for line in fh: - line = line.strip() - if not line: - continue - d = json.loads(line) - if d.get('issue') == issue and d.get('pr') == pr: - sys.exit(0) - except OSError: - pass -sys.exit(1) -" 2>/dev/null && return 0 || return 1 +# Check whether an issue already has the "blocked" label +is_blocked() { + local issue="$1" + codeberg_api GET "/issues/${issue}/labels" 2>/dev/null \ + | jq -e '.[] | select(.name == "blocked")' >/dev/null 2>&1 +} + +# Post a CI-exhaustion diagnostic comment and label issue as blocked. +# Args: issue_num pr_num attempts +_post_ci_blocked_comment() { + local issue_num="$1" pr_num="$2" attempts="$3" + local blocked_id + blocked_id=$(codeberg_api GET "/labels" 2>/dev/null \ + | jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || true) + if [ -z "$blocked_id" ]; then + blocked_id=$(curl -sf -X POST \ + -H "Authorization: token ${CODEBERG_TOKEN}" \ + -H "Content-Type: application/json" \ + "${CODEBERG_API}/labels" \ + -d '{"name":"blocked","color":"#e11d48"}' 2>/dev/null \ + | jq -r '.id // empty' 2>/dev/null || true) + fi + [ -z "$blocked_id" ] && return 0 + + local comment + comment="### Session failure diagnostic + +| Field | Value | +|---|---| +| Exit reason | \`ci_exhausted_poll (${attempts} attempts)\` | +| Timestamp | \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\` | +| PR | #${pr_num} |" + + curl -sf -X POST \ + -H "Authorization: token ${CODEBERG_TOKEN}" \ + -H "Content-Type: application/json" \ + "${CODEBERG_API}/issues/${issue_num}/comments" \ + -d "$(jq -nc --arg b "$comment" '{body:$b}')" >/dev/null 2>&1 || true + curl -sf -X POST \ + -H "Authorization: token ${CODEBERG_TOKEN}" \ + -H "Content-Type: application/json" \ + "${CODEBERG_API}/issues/${issue_num}/labels" \ + -d "{\"labels\":[${blocked_id}]}" >/dev/null 2>&1 || true } # ============================================================================= -# HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites) +# HELPER: handle CI-exhaustion check/block (DRY for 3 call sites) # Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not. # # Pass "check_only" as third arg for the backlog scan path: ok-counts are # returned without incrementing (deferred to launch time so a WAITING_PRS # exit cannot waste a fix attempt). The 3→4 sentinel bump is always atomic -# regardless of mode, preventing duplicate escalation writes from concurrent +# regardless of mode, preventing duplicate blocked labels from concurrent # pollers. # ============================================================================= handle_ci_exhaustion() { @@ -118,18 +137,18 @@ handle_ci_exhaustion() { local check_only="${3:-}" local result - # Fast path: already in the escalation file — skip without touching counter. - if is_escalated "$issue_num" "$pr_num"; then + # Fast path: already blocked — skip without touching counter. + if is_blocked "$issue_num"; then CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num") - log "PR #${pr_num} (issue #${issue_num}) already escalated (${CI_FIX_ATTEMPTS} attempts) — skipping" + log "PR #${pr_num} (issue #${issue_num}) already blocked (${CI_FIX_ATTEMPTS} attempts) — skipping" return 0 fi # Single flock-protected call: read + threshold-check + conditional bump. # In check_only mode, ok-counts are returned without incrementing (deferred # to launch time). In both modes, the 3→4 sentinel bump is atomic, so only - # one concurrent poller can ever receive exhausted_first_time:3 and write - # the escalation entry. + # one concurrent poller can ever receive exhausted_first_time:3 and label + # the issue blocked. result=$(ci_fix_check_and_increment "$pr_num" "$check_only") case "$result" in ok:*) @@ -138,18 +157,17 @@ handle_ci_exhaustion() { ;; exhausted_first_time:*) CI_FIX_ATTEMPTS="${result#exhausted_first_time:}" - log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping" - echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" - matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true + log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — marking blocked" + _post_ci_blocked_comment "$issue_num" "$pr_num" "$CI_FIX_ATTEMPTS" + matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — marked blocked" 2>/dev/null || true ;; exhausted:*) CI_FIX_ATTEMPTS="${result#exhausted:}" - log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping" + log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — already blocked, skipping" ;; *) CI_FIX_ATTEMPTS=99 - log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping" + log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — already blocked, skipping" ;; esac return 0 diff --git a/dev/phase-handler.sh b/dev/phase-handler.sh index e7e1568..6641184 100644 --- a/dev/phase-handler.sh +++ b/dev/phase-handler.sh @@ -34,6 +34,86 @@ : "${CLAIMED:=false}" : "${PHASE_POLL_INTERVAL:=30}" +# --- Look up (or create) the "blocked" label ID --- +ensure_blocked_label_id() { + if [ -n "${_BLOCKED_LABEL_ID:-}" ]; then + printf '%s' "$_BLOCKED_LABEL_ID" + return 0 + fi + _BLOCKED_LABEL_ID=$(codeberg_api GET "/labels" 2>/dev/null \ + | jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || true) + if [ -z "$_BLOCKED_LABEL_ID" ]; then + _BLOCKED_LABEL_ID=$(curl -sf -X POST \ + -H "Authorization: token ${CODEBERG_TOKEN}" \ + -H "Content-Type: application/json" \ + "${CODEBERG_API}/labels" \ + -d '{"name":"blocked","color":"#e11d48"}' 2>/dev/null \ + | jq -r '.id // empty' 2>/dev/null || true) + fi + printf '%s' "$_BLOCKED_LABEL_ID" +} + +# --- Post diagnostic comment + label issue as blocked --- +# Replaces the old escalation JSONL write path. +# Captures tmux pane output, posts a structured comment on the issue, removes +# in-progress label, and adds the "blocked" label. +# +# Args: reason [session_name] +# Uses globals: ISSUE, SESSION_NAME, PR_NUMBER, CODEBERG_TOKEN, API +post_blocked_diagnostic() { + local reason="$1" + local session="${2:-${SESSION_NAME:-}}" + + # Capture last 50 lines from tmux pane (before kill) + local tmux_output="" + if [ -n "$session" ] && tmux has-session -t "$session" 2>/dev/null; then + tmux_output=$(tmux capture-pane -p -t "$session" -S -50 2>/dev/null || true) + fi + + # Build diagnostic comment body + local comment + comment="### Session failure diagnostic + +| Field | Value | +|---|---| +| Exit reason | \`${reason}\` | +| Timestamp | \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\` |" + [ -n "${PR_NUMBER:-}" ] && [ "${PR_NUMBER:-0}" != "0" ] && \ + comment="${comment} +| PR | #${PR_NUMBER} |" + + if [ -n "$tmux_output" ]; then + comment="${comment} + +
Last 50 lines from tmux pane + +\`\`\` +${tmux_output} +\`\`\` +
" + fi + + # Post comment to issue + curl -sf -X POST \ + -H "Authorization: token ${CODEBERG_TOKEN}" \ + -H "Content-Type: application/json" \ + "${API}/issues/${ISSUE}/comments" \ + -d "$(jq -nc --arg b "$comment" '{body:$b}')" >/dev/null 2>&1 || true + + # Remove in-progress, add blocked + cleanup_labels + local blocked_id + blocked_id=$(ensure_blocked_label_id) + if [ -n "$blocked_id" ]; then + curl -sf -X POST \ + -H "Authorization: token ${CODEBERG_TOKEN}" \ + -H "Content-Type: application/json" \ + "${API}/issues/${ISSUE}/labels" \ + -d "{\"labels\":[${blocked_id}]}" >/dev/null 2>&1 || true + fi + CLAIMED=false +} + # --- Build phase protocol prompt (shared across agents) --- # Generates the phase-signaling instructions for Claude prompts. # Args: phase_file summary_file branch @@ -319,12 +399,11 @@ Write PHASE:awaiting_review to the phase file, then stop and wait for review fee CI_FIX_COUNT=$(( CI_FIX_COUNT + 1 )) _ci_pipeline_url="${WOODPECKER_SERVER}/repos/${WOODPECKER_REPO_ID}/pipeline/${PIPELINE_NUM:-0}" if [ "$CI_FIX_COUNT" -gt "$MAX_CI_FIXES" ]; then - log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — escalating" - echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER},\"reason\":\"ci_exhausted\",\"step\":\"${FAILED_STEP:-unknown}\",\"attempts\":${CI_FIX_COUNT},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" + log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — marking blocked" + post_blocked_diagnostic "ci_exhausted after ${CI_FIX_COUNT} attempts (step: ${FAILED_STEP:-unknown})" notify_ctx \ - "CI exhausted after ${CI_FIX_COUNT} attempts — escalated to supervisor" \ - "CI exhausted after ${CI_FIX_COUNT} attempts on PR #${PR_NUMBER} | Pipeline
Step: ${FAILED_STEP:-unknown} — escalated to supervisor" + "CI exhausted after ${CI_FIX_COUNT} attempts — issue marked blocked" \ + "CI exhausted after ${CI_FIX_COUNT} attempts on PR #${PR_NUMBER} | Pipeline
Step: ${FAILED_STEP:-unknown} — issue marked blocked" printf 'PHASE:failed\nReason: ci_exhausted after %d attempts\n' "$CI_FIX_COUNT" > "$PHASE_FILE" # Do NOT update LAST_PHASE_MTIME here — let the main loop detect PHASE:failed return 0 @@ -685,23 +764,13 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000) return 1 else - # Genuine unrecoverable failure — escalate to supervisor + # Genuine unrecoverable failure — label blocked with diagnostic log "session failed: ${FAILURE_REASON}" notify_ctx \ "❌ Issue #${ISSUE} session failed: ${FAILURE_REASON}" \ "❌ Issue #${ISSUE} session failed: ${FAILURE_REASON}${PR_NUMBER:+ | PR #${PR_NUMBER}}" - echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"${FAILURE_REASON}\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" + post_blocked_diagnostic "$FAILURE_REASON" - # Restore backlog label - cleanup_labels - curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${API}/issues/${ISSUE}/labels" \ - -d "{\"labels\":[${BACKLOG_LABEL_ID}]}" >/dev/null 2>&1 || true - - CLAIMED=false # Don't unclaim again in cleanup() agent_kill_session "$SESSION_NAME" if [ -n "${PR_NUMBER:-}" ]; then log "keeping worktree (PR #${PR_NUMBER} still open)" @@ -715,18 +784,14 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000) fi # ── PHASE: crashed ────────────────────────────────────────────────────────── - # Session died unexpectedly (OOM kill, tmux crash, etc.). Escalate to - # supervisor and restore issue to backlog so it can be retried. + # Session died unexpectedly (OOM kill, tmux crash, etc.). Label blocked with + # diagnostic comment so humans can triage directly on the issue. elif [ "$phase" = "PHASE:crashed" ]; then log "session crashed for issue #${ISSUE}" notify_ctx \ - "session crashed unexpectedly — escalating" \ - "session crashed unexpectedly — escalating${PR_NUMBER:+ | PR #${PR_NUMBER}}" - echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"crashed\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" - - # Restore backlog label, clean up worktree + temp files - restore_to_backlog + "session crashed unexpectedly — marking blocked" \ + "session crashed unexpectedly — marking blocked${PR_NUMBER:+ | PR #${PR_NUMBER}}" + post_blocked_diagnostic "crashed" [ -z "${PR_NUMBER:-}" ] && cleanup_worktree [ -n "${PR_NUMBER:-}" ] && log "keeping worktree (PR #${PR_NUMBER} still open)" rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "${SCRATCH_FILE:-}" \ diff --git a/formulas/run-gardener.toml b/formulas/run-gardener.toml index 8505990..95ed054 100644 --- a/formulas/run-gardener.toml +++ b/formulas/run-gardener.toml @@ -1,17 +1,16 @@ # formulas/run-gardener.toml — Gardener housekeeping formula # # Defines the gardener's complete run: grooming (Claude session via -# gardener-agent.sh) + CI escalation recipes (bash, gardener-poll.sh) -# + AGENTS.md maintenance + final commit-and-pr. +# gardener-agent.sh) + blocked-review + AGENTS.md maintenance + final +# commit-and-pr. # # No memory, no journal. The gardener does mechanical housekeeping # based on current state — it doesn't need to remember past runs. # -# Steps: preflight → grooming → blocked-review → ci-escalation-recipes -# → agents-update → commit-and-pr +# Steps: preflight → grooming → blocked-review → agents-update → commit-and-pr name = "run-gardener" -description = "Mechanical housekeeping: grooming, blocked review, CI escalation recipes, docs update" +description = "Mechanical housekeeping: grooming, blocked review, docs update" version = 1 [context] @@ -184,40 +183,7 @@ CRITICAL: If this step fails, log the failure and move on. needs = ["grooming"] # ───────────────────────────────────────────────────────────────────── -# Step 4: ci-escalation-recipes — recipe-driven CI failure handling -# ───────────────────────────────────────────────────────────────────── - -[[steps]] -id = "ci-escalation-recipes" -title = "CI escalation recipes (bash — gardener-poll.sh)" -executor = "bash" -script = "gardener/gardener-poll.sh --recipes-only" -description = """ -NOT a Claude step — executed by gardener-poll.sh before/after the Claude session. -Documented here so the formula covers the full gardener run. - -gardener-poll.sh processes CI escalation entries from -supervisor/escalations-{project}.jsonl. Each entry is a dev-agent session -that exhausted its CI fix attempts and was escalated to the gardener. - -The recipe engine (match_recipe function in gardener-poll.sh) matches each -escalation against gardener/recipes/*.toml by priority order, then executes -the matched recipe's playbook actions via bash functions. - -Recipes (see gardener/recipes/*.toml for definitions): -- chicken-egg-ci (priority 10): non-blocking bypass + per-file fix issues -- cascade-rebase (priority 20): rebase via Gitea API, re-approve, retry merge -- flaky-test (priority 30): retrigger CI or quarantine -- shellcheck-violations (priority 40): per-file ShellCheck fix issues -- Generic fallback: one combined CI failure issue - -Special cases: -- idle_timeout / idle_prompt: investigation issues (no recipe matching) -""" -needs = ["grooming"] - -# ───────────────────────────────────────────────────────────────────── -# Step 5: agents-update — AGENTS.md watermark staleness check +# Step 4: agents-update — AGENTS.md watermark staleness check # ───────────────────────────────────────────────────────────────────── [[steps]] @@ -254,7 +220,7 @@ This keeps documentation fresh — runs 2x/day so drift stays small. CRITICAL: If this step fails for any reason, log the failure and move on. Do NOT let an AGENTS.md failure prevent the commit-and-pr step. """ -needs = ["ci-escalation-recipes"] +needs = ["blocked-review"] # ───────────────────────────────────────────────────────────────────── # Step 6: commit-and-pr — single commit with all file changes diff --git a/gardener/gardener-poll.sh b/gardener/gardener-poll.sh old mode 100755 new mode 100644 index 3e5dc43..95a4ba4 --- a/gardener/gardener-poll.sh +++ b/gardener/gardener-poll.sh @@ -25,16 +25,8 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" FACTORY_ROOT="$(dirname "$SCRIPT_DIR")" -# --recipes-only: skip grooming (used by formula ci-escalation-recipes step -# to avoid double-running grooming which the formula handles as its own step) -RECIPES_ONLY=0 -if [ "${1:-}" = "--recipes-only" ]; then - RECIPES_ONLY=1 - shift -fi - # Load shared environment (with optional project TOML override) -# Usage: gardener-poll.sh [--recipes-only] [projects/harb.toml] +# Usage: gardener-poll.sh [projects/harb.toml] export PROJECT_TOML="${1:-}" # shellcheck source=../lib/env.sh source "$FACTORY_ROOT/lib/env.sh" @@ -116,684 +108,8 @@ Instructions: done # ── Backlog grooming (delegated to gardener-agent.sh) ──────────────────── -# Skipped with --recipes-only (formula's grooming step handles this) -if [ "$RECIPES_ONLY" -eq 0 ]; then - log "Invoking gardener-agent.sh for backlog grooming" - bash "$SCRIPT_DIR/gardener-agent.sh" "${1:-}" || log "WARNING: gardener-agent.sh exited with error" -else - log "Skipping grooming (--recipes-only mode)" -fi +log "Invoking gardener-agent.sh for backlog grooming" +bash "$SCRIPT_DIR/gardener-agent.sh" "${1:-}" || log "WARNING: gardener-agent.sh exited with error" -# ── Recipe matching engine ──────────────────────────────────────────────── -RECIPE_DIR="$SCRIPT_DIR/recipes" - -# match_recipe — Find first matching recipe for escalation context -# Args: $1=step_names_json $2=output_file_path $3=pr_info_json -# Stdout: JSON {name, playbook} — "generic" fallback if no match -match_recipe() { - _mr_stderr=$(mktemp /tmp/recipe-match-err-XXXXXX) - _mr_result=$(RECIPE_DIR="$RECIPE_DIR" python3 - "$1" "$2" "$3" 2>"$_mr_stderr" <<'PYEOF' -import sys, os, re, json, glob -try: - import tomllib -except ModuleNotFoundError: - import tomli as tomllib # Python < 3.11 fallback (pip install tomli) - -recipe_dir = os.environ["RECIPE_DIR"] -recipes = [] -for path in sorted(glob.glob(os.path.join(recipe_dir, "*.toml"))): - with open(path, "rb") as f: - recipes.append(tomllib.load(f)) - -recipes.sort(key=lambda r: r.get("priority", 50)) - -step_names = json.loads(sys.argv[1]) -output_path = sys.argv[2] -pr_info = json.loads(sys.argv[3]) - -step_output = "" -if os.path.isfile(output_path): - with open(output_path) as f: - step_output = f.read() - -for recipe in recipes: - trigger = recipe.get("trigger", {}) - matched = True - - if matched and "step_name" in trigger: - if not any(re.search(trigger["step_name"], n) for n in step_names): - matched = False - - if matched and "output" in trigger: - if not re.search(trigger["output"], step_output): - matched = False - - if matched and "pr_mergeable" in trigger: - if pr_info.get("mergeable") != trigger["pr_mergeable"]: - matched = False - - if matched and "pr_files" in trigger: - changed = pr_info.get("changed_files", []) - if not any(re.search(trigger["pr_files"], f) for f in changed): - matched = False - - if matched and "min_attempts" in trigger: - if pr_info.get("attempts", 1) < trigger["min_attempts"]: - matched = False - - if matched and trigger.get("failures_on_unchanged"): - # Check if errors reference files NOT changed in the PR - # Patterns: ShellCheck "In file.sh line 5:", generic "file.sh:5:10: error", - # ESLint/pylint "file.py:10:5: E123", Go "file.go:5:3:" - error_files = set() - error_files.update(re.findall(r"(?<=In )\S+(?= line \d+:)", step_output)) - error_files.update(re.findall(r"^(\S+\.\w+):\d+", step_output, re.MULTILINE)) - changed = set(pr_info.get("changed_files", [])) - if not error_files or error_files <= changed: - matched = False - - if matched: - print(json.dumps({"name": recipe["name"], "playbook": recipe.get("playbook", [])})) - sys.exit(0) - -print(json.dumps({"name": "generic", "playbook": [{"action": "create-generic-issue"}]})) -PYEOF -) || true - if [ -s "$_mr_stderr" ]; then - log "WARNING: match_recipe error: $(head -3 "$_mr_stderr" | tr '\n' ' ')" - fi - rm -f "$_mr_stderr" - if [ -z "$_mr_result" ] || ! echo "$_mr_result" | jq -e '.name' >/dev/null 2>&1; then - echo '{"name":"generic","playbook":[{"action":"create-generic-issue"}]}' - else - echo "$_mr_result" - fi -} - -# ── Playbook action functions ──────────────────────────────────────────── -# Globals used by playbook functions (set by escalation loop): -# ESC_ISSUE, ESC_PR, ESC_ATTEMPTS, ESC_PIPELINE — escalation context -# _PB_FAILED_STEPS — "pid\tname" per line of failed CI steps -# _PB_LOG_DIR — temp dir with step-{pid}.log files -# _PB_SUB_CREATED — sub-issue counter for current escalation -# _esc_total_created — running total across all escalations - -# Create per-file ShellCheck sub-issues from CI output -playbook_shellcheck_per_file() { - local step_pid step_name step_log_file step_logs - while IFS=$'\t' read -r step_pid step_name; do - [ -z "$step_pid" ] && continue - echo "$step_name" | grep -qi "shellcheck" || continue - step_log_file="${_PB_LOG_DIR}/step-${step_pid}.log" - [ -f "$step_log_file" ] || continue - step_logs=$(cat "$step_log_file") - - local sc_files - sc_files=$(echo "$step_logs" | grep -oP '(?<=In )\S+(?= line \d+:)' | sort -u || true) - - local sc_file file_errors sc_codes sub_title sub_body new_issue - while IFS= read -r sc_file; do - [ -z "$sc_file" ] && continue - # grep -F for literal filename match (dots in filenames are regex wildcards) - file_errors=$(echo "$step_logs" | grep -F -A3 "In ${sc_file} line" | head -30) - # SC codes only from this file's errors, not the whole step log - sc_codes=$(echo "$file_errors" | grep -oP 'SC\d+' | sort -u | tr '\n' ' ' | sed 's/ $//' || true) - - sub_title="fix: ShellCheck errors in ${sc_file} (from PR #${ESC_PR})" - sub_body="## ShellCheck CI failure — \`${sc_file}\` - -Spawned by gardener from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). - -### Errors -\`\`\` -${file_errors} -\`\`\` - -Fix all ShellCheck errors${sc_codes:+ (${sc_codes})} in \`${sc_file}\` so PR #${ESC_PR} CI passes. - -### Context -- Parent issue: #${ESC_ISSUE} -- PR: #${ESC_PR} -- Pipeline: #${ESC_PIPELINE} (step: ${step_name})" - - new_issue=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/issues" \ - -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \ - --argjson lid "$BACKLOG_LABEL_ID" '{"title":$t,"body":$b,"labels":[$lid]}')" 2>/dev/null | jq -r '.number // ""') || true - - if [ -n "$new_issue" ]; then - log "Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from #${ESC_ISSUE})" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - _esc_total_created=$((_esc_total_created + 1)) - matrix_send "gardener" "📋 Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from escalated #${ESC_ISSUE})" 2>/dev/null || true - fi - done <<< "$sc_files" - done <<< "$_PB_FAILED_STEPS" -} - -# Create per-file issues from any lint/check CI output (generic — no step name filter) -playbook_lint_per_file() { - local step_pid step_name step_log_file step_logs - while IFS=$'\t' read -r step_pid step_name; do - [ -z "$step_pid" ] && continue - step_log_file="${_PB_LOG_DIR}/step-${step_pid}.log" - [ -f "$step_log_file" ] || continue - step_logs=$(cat "$step_log_file") - - # Extract unique file paths from lint output (multiple formats): - # ShellCheck: "In file.sh line 5:" - # Generic: "file.sh:5:10: error" - local lint_files - lint_files=$( { - echo "$step_logs" | grep -oP '(?<=In )\S+(?= line \d+:)' || true - echo "$step_logs" | grep -oP '^\S+\.\w+(?=:\d+)' || true - } | sort -u) - - local lint_file file_errors sc_codes sub_title sub_body new_issue - while IFS= read -r lint_file; do - [ -z "$lint_file" ] && continue - # Extract errors for this file (try both formats) - file_errors=$(echo "$step_logs" | grep -F -A3 "In ${lint_file} line" 2>/dev/null | head -30 || true) - if [ -z "$file_errors" ]; then - file_errors=$(echo "$step_logs" | grep -F "${lint_file}:" | head -30 || true) - fi - [ -z "$file_errors" ] && continue - # Extract SC codes if present (harmless for non-ShellCheck output) - sc_codes=$(echo "$file_errors" | grep -oP 'SC\d+' | sort -u | tr '\n' ' ' | sed 's/ $//' || true) - - sub_title="fix: lint errors in ${lint_file} (from PR #${ESC_PR})" - sub_body="## Lint CI failure — \`${lint_file}\` - -Spawned by gardener from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). - -### Errors -\`\`\` -${file_errors} -\`\`\` - -Fix all errors${sc_codes:+ (${sc_codes})} in \`${lint_file}\` so PR #${ESC_PR} CI passes. - -### Context -- Parent issue: #${ESC_ISSUE} -- PR: #${ESC_PR} -- Pipeline: #${ESC_PIPELINE} (step: ${step_name})" - - new_issue=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/issues" \ - -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \ - --argjson lid "$BACKLOG_LABEL_ID" '{"title":$t,"body":$b,"labels":[$lid]}')" 2>/dev/null | jq -r '.number // ""') || true - - if [ -n "$new_issue" ]; then - log "Created sub-issue #${new_issue}: lint in ${lint_file} (from #${ESC_ISSUE})" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - _esc_total_created=$((_esc_total_created + 1)) - matrix_send "gardener" "📋 Created sub-issue #${new_issue}: lint in ${lint_file} (from escalated #${ESC_ISSUE})" 2>/dev/null || true - fi - done <<< "$lint_files" - done <<< "$_PB_FAILED_STEPS" -} - -# Create one combined issue for non-ShellCheck CI failures -playbook_create_generic_issue() { - local generic_fail="" step_pid step_name step_log_file step_logs esc_section - while IFS=$'\t' read -r step_pid step_name; do - [ -z "$step_pid" ] && continue - # Skip shellcheck steps (handled by shellcheck-per-file action) - echo "$step_name" | grep -qi "shellcheck" && continue - step_log_file="${_PB_LOG_DIR}/step-${step_pid}.log" - [ -f "$step_log_file" ] || continue - step_logs=$(cat "$step_log_file") - - esc_section="=== ${step_name} === -$(echo "$step_logs" | tail -50)" - if [ -z "$generic_fail" ]; then - generic_fail="$esc_section" - else - generic_fail="${generic_fail} -${esc_section}" - fi - done <<< "$_PB_FAILED_STEPS" - - [ -z "$generic_fail" ] && return 0 - - local sub_title sub_body new_issue - sub_title="fix: CI failures in PR #${ESC_PR} (from issue #${ESC_ISSUE})" - sub_body="## CI failure — fix required - -Spawned by gardener from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). - -### Failed step output -\`\`\` -${generic_fail} -\`\`\` - -### Context -- Parent issue: #${ESC_ISSUE} -- PR: #${ESC_PR}${ESC_PIPELINE:+ -- Pipeline: #${ESC_PIPELINE}}" - - new_issue=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/issues" \ - -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \ - --argjson lid "$BACKLOG_LABEL_ID" '{"title":$t,"body":$b,"labels":[$lid]}')" 2>/dev/null | jq -r '.number // ""') || true - - if [ -n "$new_issue" ]; then - log "Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from #${ESC_ISSUE})" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - _esc_total_created=$((_esc_total_created + 1)) - matrix_send "gardener" "📋 Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true - fi -} - -# Create issue to make failing CI step non-blocking (chicken-egg-ci) -playbook_make_step_non_blocking() { - local failing_steps sub_title sub_body new_issue - failing_steps=$(echo "$_PB_FAILED_STEPS" | cut -f2 | tr '\n' ', ' | sed 's/,$//' || true) - - sub_title="fix: make CI step non-blocking for pre-existing failures (PR #${ESC_PR})" - sub_body="## Chicken-egg CI failure - -PR #${ESC_PR} (issue #${ESC_ISSUE}) introduces a CI step that fails on pre-existing code. - -Failing step(s): ${failing_steps} - -### Playbook -1. Add \`|| true\` to the failing step(s) in the Woodpecker config -2. This makes the step advisory (non-blocking) until pre-existing violations are fixed - -### Context -- Parent issue: #${ESC_ISSUE} -- PR: #${ESC_PR}${ESC_PIPELINE:+ -- Pipeline: #${ESC_PIPELINE}}" - - new_issue=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/issues" \ - -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \ - --argjson lid "$BACKLOG_LABEL_ID" '{"title":$t,"body":$b,"labels":[$lid]}')" 2>/dev/null | jq -r '.number // ""') || true - - if [ -n "$new_issue" ]; then - log "Created #${new_issue}: make step non-blocking (chicken-egg from #${ESC_ISSUE})" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - _esc_total_created=$((_esc_total_created + 1)) - matrix_send "gardener" "📋 Created #${new_issue}: make CI step non-blocking (chicken-egg, from #${ESC_ISSUE})" 2>/dev/null || true - fi -} - -# Create follow-up issue to remove || true bypass (chicken-egg-ci) -playbook_create_followup_remove_bypass() { - local sub_title sub_body new_issue - sub_title="fix: remove || true bypass once pre-existing violations are fixed (PR #${ESC_PR})" - sub_body="## Follow-up: remove CI bypass - -After all pre-existing violation issues from PR #${ESC_PR} are resolved, remove the \`|| true\` bypass from the CI step to make it blocking again. - -### Depends on -All per-file fix issues created from escalated issue #${ESC_ISSUE}. - -### Context -- Parent issue: #${ESC_ISSUE} -- PR: #${ESC_PR}" - - new_issue=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/issues" \ - -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \ - --argjson lid "$BACKLOG_LABEL_ID" '{"title":$t,"body":$b,"labels":[$lid]}')" 2>/dev/null | jq -r '.number // ""') || true - - if [ -n "$new_issue" ]; then - log "Created follow-up #${new_issue}: remove bypass (from #${ESC_ISSUE})" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - _esc_total_created=$((_esc_total_created + 1)) - fi -} - -# Rebase PR onto main branch (cascade-rebase) -playbook_rebase_pr() { - log "Rebasing PR #${ESC_PR} onto ${PRIMARY_BRANCH}" - local result - local http_code - http_code=$(curl -s -o /dev/null -w '%{http_code}' -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/pulls/${ESC_PR}/update" \ - -d '{"style":"rebase"}' 2>/dev/null) || true - - if [ "${http_code:-0}" -ge 200 ] && [ "${http_code:-0}" -lt 300 ]; then - log "Rebase initiated for PR #${ESC_PR} (HTTP ${http_code})" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - matrix_send "gardener" "🔄 Rebased PR #${ESC_PR} onto ${PRIMARY_BRANCH} (cascade-rebase, from #${ESC_ISSUE})" 2>/dev/null || true - else - log "WARNING: rebase API call failed for PR #${ESC_PR} (HTTP ${http_code:-error})" - fi -} - -# Re-approve PR if review was dismissed by force-push (cascade-rebase) -playbook_re_approve_if_dismissed() { - local reviews dismissed - reviews=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \ - "${CODEBERG_API}/pulls/${ESC_PR}/reviews" 2>/dev/null || true) - [ -z "$reviews" ] || [ "$reviews" = "null" ] && return 0 - - dismissed=$(echo "$reviews" | jq -r '[.[] | select(.state == "APPROVED" and .dismissed == true)] | length' 2>/dev/null || true) - if [ "${dismissed:-0}" -gt 0 ]; then - curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/pulls/${ESC_PR}/reviews" \ - -d '{"event":"APPROVED","body":"Re-approved after rebase (cascade-rebase recipe)"}' 2>/dev/null || true - log "Re-approved PR #${ESC_PR} after rebase" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - fi -} - -# Retry merging the PR (cascade-rebase) -playbook_retry_merge() { - local result - result=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/pulls/${ESC_PR}/merge" \ - -d '{"Do":"rebase","delete_branch_after_merge":true}' 2>/dev/null) || true - - if [ -n "$result" ]; then - log "Merge retry initiated for PR #${ESC_PR}" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - matrix_send "gardener" "✅ Merge retry for PR #${ESC_PR} (cascade-rebase, from #${ESC_ISSUE})" 2>/dev/null || true - else - log "WARNING: merge retry failed for PR #${ESC_PR}" - fi -} - -# Retrigger CI pipeline (flaky-test) -playbook_retrigger_ci() { - [ -z "$ESC_PIPELINE" ] && return 0 - # Max 2 retriggers per issue spec - if [ "${ESC_ATTEMPTS:-1}" -ge 3 ]; then - log "Max retriggers reached for pipeline #${ESC_PIPELINE} (${ESC_ATTEMPTS} attempts)" - return 0 - fi - log "Retriggering CI pipeline #${ESC_PIPELINE} (attempt ${ESC_ATTEMPTS})" - local http_code - http_code=$(curl -s -o /dev/null -w '%{http_code}' -X POST \ - -H "Authorization: Bearer ${WOODPECKER_TOKEN}" \ - "${WOODPECKER_SERVER}/api/repos/${WOODPECKER_REPO_ID}/pipelines/${ESC_PIPELINE}" 2>/dev/null) || true - - if [ "${http_code:-0}" -ge 200 ] && [ "${http_code:-0}" -lt 300 ]; then - log "Pipeline #${ESC_PIPELINE} retriggered (HTTP ${http_code})" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - matrix_send "gardener" "🔄 Retriggered CI for PR #${ESC_PR} (flaky-test, attempt ${ESC_ATTEMPTS})" 2>/dev/null || true - else - log "WARNING: retrigger failed for pipeline #${ESC_PIPELINE} (HTTP ${http_code:-error})" - fi -} - -# Quarantine flaky test and create fix issue (flaky-test) -playbook_quarantine_test() { - # Only quarantine if retriggers exhausted - if [ "${ESC_ATTEMPTS:-1}" -lt 3 ]; then - return 0 - fi - - local failing_steps sub_title sub_body new_issue - failing_steps=$(echo "$_PB_FAILED_STEPS" | cut -f2 | tr '\n' ', ' | sed 's/,$//' || true) - - sub_title="fix: quarantine flaky test (PR #${ESC_PR}, from #${ESC_ISSUE})" - sub_body="## Flaky test detected - -CI for PR #${ESC_PR} (issue #${ESC_ISSUE}) failed intermittently across ${ESC_ATTEMPTS} attempts. - -Failing step(s): ${failing_steps:-unknown} - -### Playbook -1. Identify the flaky test(s) from CI output -2. Quarantine (skip/mark pending) the flaky test(s) -3. Create targeted fix for the root cause - -### Context -- Parent issue: #${ESC_ISSUE} -- PR: #${ESC_PR}${ESC_PIPELINE:+ -- Pipeline: #${ESC_PIPELINE}}" - - new_issue=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/issues" \ - -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \ - --argjson lid "$BACKLOG_LABEL_ID" '{"title":$t,"body":$b,"labels":[$lid]}')" 2>/dev/null | jq -r '.number // ""') || true - - if [ -n "$new_issue" ]; then - log "Created quarantine issue #${new_issue} for flaky test (from #${ESC_ISSUE})" - _PB_SUB_CREATED=$((_PB_SUB_CREATED + 1)) - _esc_total_created=$((_esc_total_created + 1)) - matrix_send "gardener" "📋 Created #${new_issue}: quarantine flaky test (from #${ESC_ISSUE})" 2>/dev/null || true - fi -} - -# run_playbook — Execute matched recipe's playbook actions -# Args: $1=recipe_json from match_recipe -run_playbook() { - local recipe_json="$1" - local recipe_name actions action - recipe_name=$(echo "$recipe_json" | jq -r '.name') - actions=$(echo "$recipe_json" | jq -r '.playbook[].action' 2>/dev/null || true) - - while IFS= read -r action; do - [ -z "$action" ] && continue - case "$action" in - shellcheck-per-file) playbook_shellcheck_per_file ;; - lint-per-file) playbook_lint_per_file ;; - create-generic-issue) playbook_create_generic_issue ;; - make-step-non-blocking) playbook_make_step_non_blocking ;; - create-followup-remove-bypass) playbook_create_followup_remove_bypass ;; - rebase-pr) playbook_rebase_pr ;; - re-approve-if-dismissed) playbook_re_approve_if_dismissed ;; - retry-merge) playbook_retry_merge ;; - retrigger-ci) playbook_retrigger_ci ;; - quarantine-test) playbook_quarantine_test ;; - label-backlog) ;; # default label, no-op (issues created with backlog) - *) log "WARNING: unknown playbook action '${action}' in recipe '${recipe_name}'" ;; - esac - done <<< "$actions" -} - -# ── Process dev-agent escalations (per-project, recipe-driven) ─────────── -ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" -ESCALATION_DONE="${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.done.jsonl" - -if [ -s "$ESCALATION_FILE" ]; then - # Atomically snapshot the file before processing to prevent race with - # concurrent dev-poll appends: new entries go to a fresh ESCALATION_FILE - # while we process the snapshot, so nothing is ever silently dropped. - ESCALATION_SNAP="${ESCALATION_FILE}.processing.$$" - mv "$ESCALATION_FILE" "$ESCALATION_SNAP" - - ESCALATION_COUNT=$(wc -l < "$ESCALATION_SNAP") - log "Processing ${ESCALATION_COUNT} escalation(s) for ${PROJECT_NAME}" - _esc_total_created=0 - - while IFS= read -r esc_entry; do - [ -z "$esc_entry" ] && continue - - ESC_ISSUE=$(echo "$esc_entry" | jq -r '.issue // empty') - ESC_PR=$(echo "$esc_entry" | jq -r '.pr // empty') - ESC_ATTEMPTS=$(echo "$esc_entry" | jq -r '.attempts // 3') - ESC_REASON=$(echo "$esc_entry" | jq -r '.reason // empty') - - if [ -z "$ESC_ISSUE" ] || [ -z "$ESC_PR" ]; then - echo "$esc_entry" >> "$ESCALATION_DONE" - continue - fi - - log "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} reason=${ESC_REASON} (${ESC_ATTEMPTS} CI attempt(s))" - - # Handle idle_timeout / idle_prompt escalations — no CI steps to inspect, just notify - if [[ "$ESC_REASON" == idle_timeout* || "$ESC_REASON" == idle_prompt* ]]; then - _issue_url="https://codeberg.org/${CODEBERG_REPO}/issues/${ESC_ISSUE}" - if [[ "$ESC_REASON" == idle_prompt* ]]; then - sub_title="chore: investigate idle prompt for issue #${ESC_ISSUE}" - sub_body="## Dev-agent idle prompt - -The dev-agent session for issue #${ESC_ISSUE} returned to the prompt without writing a phase signal.$([ "${ESC_PR:-0}" != "0" ] && printf '\n\nPR #%s may still be open.' "$ESC_PR") - -### What to check -1. Did Claude finish without signalling a phase? Check for missing phase-file writes. -2. Was the issue spec ambiguous or missing acceptance criteria? -3. Re-run the issue by restoring the \`backlog\` label if the spec is clear. - -### Context -- Issue: [#${ESC_ISSUE}](${_issue_url})$([ "${ESC_PR:-0}" != "0" ] && printf '\n- PR: #%s' "$ESC_PR")" - else - sub_title="chore: investigate idle timeout for issue #${ESC_ISSUE}" - sub_body="## Dev-agent idle timeout - -The dev-agent session for issue #${ESC_ISSUE} was idle for 2h without a phase update and was killed.$([ "${ESC_PR:-0}" != "0" ] && printf '\n\nPR #%s may still be open.' "$ESC_PR") - -### What to check -1. Was the agent stuck waiting for input? Check the issue spec for ambiguity. -2. Was there an infrastructure issue (tmux crash, disk full, etc.)? -3. Re-run the issue by restoring the \`backlog\` label if the spec is clear. - -### Context -- Issue: [#${ESC_ISSUE}](${_issue_url})$([ "${ESC_PR:-0}" != "0" ] && printf '\n- PR: #%s' "$ESC_PR")" - fi - - new_issue=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/issues" \ - -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \ - --argjson lid "$BACKLOG_LABEL_ID" '{"title":$t,"body":$b,"labels":[$lid]}')" 2>/dev/null | jq -r '.number // ""') || true - - if [ -n "$new_issue" ]; then - log "Created idle sub-issue #${new_issue} for #${ESC_ISSUE} (${ESC_REASON})" - _esc_total_created=$((_esc_total_created + 1)) - matrix_send "gardener" "⏱ Created #${new_issue}: ${ESC_REASON} on #${ESC_ISSUE}" 2>/dev/null || true - fi - - echo "$esc_entry" >> "$ESCALATION_DONE" - continue - fi - - # Fetch PR metadata (SHA, mergeable status) - ESC_PR_DATA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \ - "${CODEBERG_API}/pulls/${ESC_PR}" 2>/dev/null || true) - ESC_PR_SHA=$(echo "$ESC_PR_DATA" | jq -r '.head.sha // ""' 2>/dev/null || true) - _PB_PR_MERGEABLE=$(echo "$ESC_PR_DATA" | jq '.mergeable // null' 2>/dev/null || true) - - ESC_PIPELINE="" - if [ -n "$ESC_PR_SHA" ]; then - # Validate SHA is a 40-char hex string before interpolating into SQL - if [[ "$ESC_PR_SHA" =~ ^[0-9a-fA-F]{40}$ ]]; then - ESC_PIPELINE=$(wpdb -c "SELECT number FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND commit='${ESC_PR_SHA}' ORDER BY created DESC LIMIT 1;" 2>/dev/null | xargs || true) - else - log "WARNING: ESC_PR_SHA '${ESC_PR_SHA}' is not a valid hex SHA — skipping pipeline lookup" - fi - fi - - # Fetch failed CI steps and their logs into temp dir - _PB_FAILED_STEPS="" - _PB_LOG_DIR=$(mktemp -d /tmp/recipe-logs-XXXXXX) - _PB_SUB_CREATED=0 - _PB_LOGS_AVAILABLE=0 - - if [ -n "$ESC_PIPELINE" ]; then - _PB_FAILED_STEPS=$(curl -sf \ - -H "Authorization: Bearer ${WOODPECKER_TOKEN}" \ - "${WOODPECKER_SERVER}/api/repos/${WOODPECKER_REPO_ID}/pipelines/${ESC_PIPELINE}" 2>/dev/null | \ - jq -r '.workflows[]?.children[]? | select(.state=="failure") | "\(.pid)\t\(.name)"' 2>/dev/null || true) - - while IFS=$'\t' read -r step_pid step_name; do - [ -z "$step_pid" ] && continue - [[ "$step_pid" =~ ^[0-9]+$ ]] || { log "WARNING: invalid step_pid '${step_pid}' — skipping"; continue; } - step_logs=$(woodpecker-cli pipeline log show "${CODEBERG_REPO}" "${ESC_PIPELINE}" "${step_pid}" 2>/dev/null | tail -150 || true) - if [ -n "$step_logs" ]; then - echo "$step_logs" > "${_PB_LOG_DIR}/step-${step_pid}.log" - _PB_LOGS_AVAILABLE=1 - fi - done <<< "$_PB_FAILED_STEPS" - fi - - # Fetch PR changed files for recipe matching - _PB_PR_FILES_JSON="[]" - _PB_PR_FILES=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \ - "${CODEBERG_API}/pulls/${ESC_PR}/files" 2>/dev/null | jq -r '.[].filename // empty' 2>/dev/null || true) - if [ -n "$_PB_PR_FILES" ]; then - _PB_PR_FILES_JSON=$(echo "$_PB_PR_FILES" | jq -Rsc 'split("\n") | map(select(length > 0))') - fi - - # Build recipe matching context - _RECIPE_STEP_NAMES=$(echo "$_PB_FAILED_STEPS" | cut -f2 | jq -Rsc 'split("\n") | map(select(length > 0))') - _RECIPE_OUTPUT_FILE="${_PB_LOG_DIR}/all-output.txt" - cat "${_PB_LOG_DIR}"/step-*.log > "$_RECIPE_OUTPUT_FILE" 2>/dev/null || touch "$_RECIPE_OUTPUT_FILE" - _RECIPE_PR_INFO=$(jq -nc \ - --argjson m "${_PB_PR_MERGEABLE:-null}" \ - --argjson a "${ESC_ATTEMPTS}" \ - --argjson files "${_PB_PR_FILES_JSON}" \ - '{mergeable:$m, attempts:$a, changed_files:$files}') - - # Match escalation against recipes and execute playbook - MATCHED_RECIPE=$(match_recipe "$_RECIPE_STEP_NAMES" "$_RECIPE_OUTPUT_FILE" "$_RECIPE_PR_INFO") - RECIPE_NAME=$(echo "$MATCHED_RECIPE" | jq -r '.name') - log "Recipe matched: ${RECIPE_NAME} for #${ESC_ISSUE} PR #${ESC_PR}" - - run_playbook "$MATCHED_RECIPE" - - # Fallback: no sub-issues created — create investigation issue - if [ "$_PB_SUB_CREATED" -eq 0 ]; then - sub_title="fix: investigate CI failure for PR #${ESC_PR} (from issue #${ESC_ISSUE})" - if [ "$_PB_LOGS_AVAILABLE" -eq 1 ]; then - sub_body="## CI failure — investigation required - -Spawned by gardener from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). Recipe '${RECIPE_NAME}' matched but produced no sub-issues. - -Check PR #${ESC_PR} CI output, identify the failing checks, and fix them so the PR can merge." - else - sub_body="## CI failure — investigation required - -Spawned by gardener from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). CI logs were unavailable at escalation time. - -Check PR #${ESC_PR} CI output, identify the failing checks, and fix them so the PR can merge." - fi - - new_issue=$(curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${CODEBERG_API}/issues" \ - -d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \ - --argjson lid "$BACKLOG_LABEL_ID" '{"title":$t,"body":$b,"labels":[$lid]}')" 2>/dev/null | jq -r '.number // ""') || true - - if [ -n "$new_issue" ]; then - log "Created fallback sub-issue #${new_issue} for escalated #${ESC_ISSUE}" - _esc_total_created=$((_esc_total_created + 1)) - matrix_send "gardener" "📋 Created sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true - fi - fi - - # Cleanup temp files - rm -rf "$_PB_LOG_DIR" - - # Mark as processed - echo "$esc_entry" >> "$ESCALATION_DONE" - done < "$ESCALATION_SNAP" - - rm -f "$ESCALATION_SNAP" - log "Escalations processed — moved to $(basename "$ESCALATION_DONE")" - - # Report resolution count to supervisor for its fixed() summary - if [ "${_esc_total_created:-0}" -gt 0 ]; then - printf '%d %s\n' "$_esc_total_created" "$PROJECT_NAME" \ - >> "${FACTORY_ROOT}/supervisor/gardener-esc-resolved.log" - fi -fi - log "--- Gardener poll done ---" diff --git a/supervisor/preflight.sh b/supervisor/preflight.sh index cab8543..a03ccba 100755 --- a/supervisor/preflight.sh +++ b/supervisor/preflight.sh @@ -158,21 +158,16 @@ done [ "$_found_wt" = false ] && echo " None" echo "" -# ── Pending Escalations ────────────────────────────────────────────────── +# ── Blocked Issues ──────────────────────────────────────────────────────── -echo "## Pending Escalations" -_found_esc=false -for _esc_file in "${FACTORY_ROOT}/supervisor/escalations-"*.jsonl; do - [ -f "$_esc_file" ] || continue - [[ "$_esc_file" == *.done.jsonl ]] && continue - _esc_count=$(wc -l < "$_esc_file" 2>/dev/null || echo 0) - [ "${_esc_count:-0}" -gt 0 ] || continue - _found_esc=true - echo "### $(basename "$_esc_file") (${_esc_count} entries)" - cat "$_esc_file" - echo "" -done -[ "$_found_esc" = false ] && echo " None" +echo "## Blocked Issues" +_blocked_issues=$(codeberg_api GET "/issues?state=open&labels=blocked&type=issues&limit=50" 2>/dev/null || echo "[]") +_blocked_n=$(echo "$_blocked_issues" | jq 'length' 2>/dev/null || echo 0) +if [ "${_blocked_n:-0}" -gt 0 ]; then + echo "$_blocked_issues" | jq -r '.[] | " #\(.number): \(.title)"' 2>/dev/null || echo " (query failed)" +else + echo " None" +fi echo "" # ── Escalation Replies from Matrix ──────────────────────────────────────── diff --git a/supervisor/supervisor-poll.sh b/supervisor/supervisor-poll.sh index 8468c81..7e7b211 100755 --- a/supervisor/supervisor-poll.sh +++ b/supervisor/supervisor-poll.sh @@ -231,26 +231,6 @@ for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do fi done -# Report pending escalations (processing has moved to gardener-poll.sh per-project) -for _esc_file in "${FACTORY_ROOT}/supervisor/escalations-"*.jsonl; do - [ -f "$_esc_file" ] || continue - [[ "$_esc_file" == *.done.jsonl ]] && continue - _esc_count=$(wc -l < "$_esc_file" 2>/dev/null || true) - [ "${_esc_count:-0}" -gt 0 ] || continue - _esc_proj=$(basename "$_esc_file" .jsonl) - _esc_proj="${_esc_proj#escalations-}" - flog "${_esc_proj}: ${_esc_count} escalation(s) pending (gardener will process)" -done - -# Pick up escalation resolutions handled by gardener -_gesc_log="${FACTORY_ROOT}/supervisor/gardener-esc-resolved.log" -if [ -f "$_gesc_log" ]; then - while IFS=' ' read -r _gn _gp; do - [ -n "${_gn:-}" ] && fixed "${_gp:-unknown}: gardener created ${_gn} sub-issue(s) from escalations" - done < "$_gesc_log" - rm -f "$_gesc_log" -fi - # ############################################################################# # LAYER 2: PER-PROJECT CHECKS # (iterated over projects/*.toml, config-driven) @@ -342,149 +322,6 @@ check_project() { find "$_RETRY_DIR" -type f -mmin +1440 -delete 2>/dev/null || true fi - # =========================================================================== - # P2f: ESCALATION TRIAGE — auto-retrigger ci_exhausted if infra-only - # =========================================================================== - if [ "${CHECK_INFRA_RETRY:-true}" = "true" ]; then - status "P2f: ${proj_name}: triaging ci_exhausted escalations" - - _esc_file="${FACTORY_ROOT}/supervisor/escalations-${proj_name}.jsonl" - if [ -f "$_esc_file" ] && [ -s "$_esc_file" ]; then - _esc_tmp="${_esc_file}.sup.$$" - : > "$_esc_tmp" - - while IFS= read -r _esc_line; do - [ -z "$_esc_line" ] && continue - - _esc_reason=$(printf '%s' "$_esc_line" | jq -r '.reason // ""' 2>/dev/null) - - # Only triage ci_exhausted entries (from dev-agent or dev-poll) - case "$_esc_reason" in - ci_exhausted) ;; - ci_exhausted_poll) ;; - *) printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue ;; - esac - - _esc_pr=$(printf '%s' "$_esc_line" | jq -r '.pr // 0' 2>/dev/null) - _esc_issue=$(printf '%s' "$_esc_line" | jq -r '.issue // 0' 2>/dev/null) - _esc_ts=$(printf '%s' "$_esc_line" | jq -r '.ts // ""' 2>/dev/null) - - # Validate pr/issue are numeric - [[ "$_esc_pr" =~ ^[0-9]+$ ]] || { printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue; } - [[ "$_esc_issue" =~ ^[0-9]+$ ]] || { printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue; } - - # Cooldown: 30 min from last escalation timestamp - _esc_epoch=0 - [ -n "$_esc_ts" ] && _esc_epoch=$(date -d "$_esc_ts" +%s 2>/dev/null || echo 0) - _esc_age_min=$(( ($(date +%s) - _esc_epoch) / 60 )) - - if [ "$_esc_age_min" -lt 30 ]; then - flog "${proj_name}: PR #${_esc_pr} ci_exhausted cooldown (${_esc_age_min}/30min)" - printf '%s\n' "$_esc_line" >> "$_esc_tmp" - continue - fi - - # Get the PR's branch and state from Codeberg - _esc_pr_json=$(codeberg_api GET "/pulls/${_esc_pr}" 2>/dev/null) || { - flog "${proj_name}: PR #${_esc_pr}: failed to fetch PR info, keeping escalation" - printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue - } - _esc_pr_state=$(printf '%s' "$_esc_pr_json" | jq -r '.state // ""' 2>/dev/null) - if [ "$_esc_pr_state" != "open" ]; then - flog "${proj_name}: PR #${_esc_pr} is ${_esc_pr_state:-unknown} — discarding stale escalation" - continue # PR merged/closed externally; escalation no longer actionable - fi - _esc_branch=$(printf '%s' "$_esc_pr_json" | jq -r '.head.ref // ""' 2>/dev/null) - if [ -z "$_esc_branch" ]; then - printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue - fi - - # Validate branch name to prevent SQL injection - if ! [[ "$_esc_branch" =~ ^[a-zA-Z0-9/_.-]+$ ]]; then - flog "${proj_name}: PR #${_esc_pr}: unsafe branch name, keeping escalation" - printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue - fi - - # Find the latest failed pipeline for this PR's branch via Woodpecker DB - _esc_pip=$(wpdb -A -c " - SELECT number FROM pipelines - WHERE repo_id = ${WOODPECKER_REPO_ID} - AND branch = '${_esc_branch}' - AND status IN ('failure', 'error') - AND finished > 0 - ORDER BY number DESC LIMIT 1;" 2>/dev/null \ - | tr -d ' ' | grep -E '^[0-9]+$' | head -1 || true) - - if [ -z "$_esc_pip" ]; then - flog "${proj_name}: PR #${_esc_pr}: no failed pipeline for branch ${_esc_branch}, keeping escalation" - printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue - fi - - # Classify failure type via ci-helpers - _esc_failure=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_esc_pip" 2>/dev/null || echo "code") - - if [[ "$_esc_failure" != infra* ]]; then - flog "${proj_name}: PR #${_esc_pr} pipeline #${_esc_pip}: code failure — leaving escalation for human" - printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue - fi - - # Infra-only — push empty commit to retrigger CI via temporary worktree - _esc_wt="/tmp/${proj_name}-sup-retry-${_esc_pr}" - _esc_retrigger_ok=false - if [ -d "${PROJECT_REPO_ROOT:-}" ]; then - # Clean up any leftover temp worktree from a previous failed run - git -C "${PROJECT_REPO_ROOT}" worktree remove --force "${_esc_wt}" 2>/dev/null || true - - if git -C "${PROJECT_REPO_ROOT}" fetch origin "${_esc_branch}" --quiet 2>/dev/null && \ - git -C "${PROJECT_REPO_ROOT}" worktree add --quiet --detach \ - "${_esc_wt}" "origin/${_esc_branch}" 2>/dev/null; then - if git -C "${_esc_wt}" \ - -c user.email="supervisor@factory" \ - -c user.name="Supervisor" \ - commit --allow-empty --no-verify \ - -m "chore: retrigger CI after infra-only exhaustion" \ - --quiet 2>/dev/null && \ - git -C "${_esc_wt}" push origin \ - "HEAD:refs/heads/${_esc_branch}" --quiet 2>/dev/null; then - _esc_retrigger_ok=true - fi - git -C "${PROJECT_REPO_ROOT}" worktree remove --force "${_esc_wt}" 2>/dev/null || true - fi - fi - - if [ "$_esc_retrigger_ok" = true ]; then - # Reset CI fix counter so dev-poll can spawn the agent again if needed - _ci_fix_file="${FACTORY_ROOT}/dev/ci-fixes-${proj_name}.json" - _ci_fix_lock="${_ci_fix_file}.lock" - flock "$_ci_fix_lock" python3 -c " -import json, os -f='${_ci_fix_file}' -if not os.path.exists(f): - exit() -d = json.load(open(f)) -d.pop(str(${_esc_pr}), None) -json.dump(d, open(f, 'w')) -" 2>/dev/null || true - - fixed "${proj_name}: auto-retriggered CI for PR #${_esc_pr} after infra-only exhaustion" - flog "${proj_name}: auto-retriggered CI for PR #${_esc_pr} (issue #${_esc_issue}) after infra-only exhaustion" - matrix_send "supervisor" "♻️ auto-retriggered CI for PR #${_esc_pr} (issue #${_esc_issue}) after infra-only exhaustion" 2>/dev/null || true - # Escalation removed — do NOT write to _esc_tmp - else - p2 "${proj_name}: PR #${_esc_pr}: infra-only CI exhaustion but retrigger push failed" - # Bump timestamp to now so the 30-min cooldown resets; prevents alert flood - # on persistent push failures (SSH key issue, Codeberg outage, etc.) - _esc_now=$(date -u +%Y-%m-%dT%H:%M:%SZ) - _esc_bumped=$(printf '%s' "$_esc_line" | jq -c --arg ts "$_esc_now" '.ts = $ts' 2>/dev/null \ - || printf '%s' "$_esc_line") - printf '%s\n' "$_esc_bumped" >> "$_esc_tmp" - fi - done < "$_esc_file" - - mv "$_esc_tmp" "$_esc_file" - fi - fi - # Dev-agent health (only if monitoring enabled) if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then DEV_LOCK="/tmp/dev-agent-${PROJECT_NAME}.lock"