Merge pull request 'fix: incident: WP gRPC flake burned dev-qwen CI retry budget on #842 (2026-04-16) (#867)' (#933) from fix/issue-867 into main

All checks were successful
ci/woodpecker/push/ci Pipeline was successful

commit 40ffffed73

4 changed files with 286 additions and 3 deletions
@@ -29,7 +29,7 @@ and injected into your prompt above. Review them now.
 1. Read the injected metrics data carefully (System Resources, Docker,
    Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs,
-   CI Pipelines, Open PRs, Issue Status, Stale Worktrees).
+   CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**).
    Note: preflight.sh auto-removes PHASE:escalate files for closed issues
    (24h grace period). Check the "Stale Phase Cleanup" section for any
    files cleaned or in grace period this run.
@@ -75,6 +75,10 @@ Categorize every finding from the metrics into priority levels.
 - Dev/action sessions in PHASE:escalate for > 24h (session timeout)
   (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight;
   this check covers sessions where the issue is still open)
+- **Woodpecker agent unhealthy** — see "Woodpecker Agent Health" section in preflight:
+  - Container not running or in unhealthy state
+  - gRPC errors >= 3 in last 20 minutes
+  - Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes

 ### P3 — Factory degraded
 - PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed
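The three unhealthy conditions above reduce to a pure decision function. A minimal sketch under stated assumptions (the helper name and message strings are hypothetical, not the repo's implementation; the real check lives in preflight.sh further down this PR):

```shell
# Hypothetical sketch of the P2 thresholds above. "running" comes from
# .State.Status and "healthy" from .State.Health.Status; both mean alive.
wp_agent_p2_reason() {
    status="$1"; grpc_errors="$2"; fast_failures="$3"
    if [ "$status" != "running" ] && [ "$status" != "healthy" ]; then
        echo "container not running or unhealthy: $status"
    elif [ "$grpc_errors" -ge 3 ]; then
        echo "gRPC errors >= 3 in last 20 minutes"
    elif [ "$fast_failures" -ge 3 ]; then
        echo "fast-failure pipelines >= 3 in last 15 minutes"
    else
        echo "healthy"
    fi
}
```

Keeping the thresholds in one function like this makes them trivially unit-testable, which a scattered chain of inline `if` checks is not.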
@@ -100,6 +104,15 @@ For each finding from the health assessment, decide and execute an action.

 ### Auto-fixable (execute these directly)

+**P2 Woodpecker agent unhealthy:**
+The supervisor-run.sh script automatically handles WP agent recovery:
+- Detects unhealthy state via preflight.sh health checks
+- Restarts container via `docker restart`
+- Scans for `blocked: ci_exhausted` issues updated in last 30 minutes
+- Unassigns and removes blocked label from affected issues
+- Posts recovery comment with infra-flake context
+- Avoids duplicate restarts via 5-minute cooldown in history file
+
 **P0 Memory crisis:**
 # Kill stale one-shot claude processes (>3h old)
 pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
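The 5-minute cooldown in the last bullet can be sketched as a small helper. This is an illustrative sketch, not the repo's code; the function name is hypothetical, and the `LAST_RESTART_TS=` history-file format is assumed from the history file shown later in this PR:

```shell
# Allow a restart only if more than the threshold (default 300s) has passed
# since the LAST_RESTART_TS recorded in the history file (0 if absent).
restart_allowed() {
    history_file="$1"; now_epoch="$2"; threshold="${3:-300}"
    last_ts=0
    if [ -f "$history_file" ]; then
        # Piping through cut keeps the pipeline's exit status 0 even when
        # the key is missing, so this is safe under `set -e`
        last_ts=$(grep -m1 '^LAST_RESTART_TS=' "$history_file" | cut -d= -f2)
        last_ts="${last_ts:-0}"
    fi
    [ $(( now_epoch - last_ts )) -gt "$threshold" ]
}
```

A missing history file behaves like "never restarted", so the first restart is always allowed.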
@@ -248,6 +261,11 @@ Format:
 - <what was fixed>
 (or "No actions needed")

+### WP Agent Recovery (if applicable)
+- WP agent restart: <time of restart or "none">
+- Issues recovered: <count>
+- Reason: <health check reason or "healthy">
+
 ### Vault items filed
 - vault/pending/<id>.md — <reason>
 (or "None")
@@ -24,7 +24,9 @@ Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `chec
 files for `PHASE:escalate` entries and auto-removes any whose linked issue
 is confirmed closed (24h grace period after closure to avoid races). Reports
 **stale crashed worktrees** (worktrees preserved after crash) — supervisor
-housekeeping removes them after 24h
+housekeeping removes them after 24h. Also collects **Woodpecker agent health**:
+container status, gRPC error count (last 20m), fast-failure pipelines (<60s,
+last 15m), and overall health determination.
 - `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review,
   health-assessment, decide-actions, report, journal) with `needs` dependencies.
   Claude evaluates all metrics and takes actions in a single interactive session
@@ -47,5 +49,6 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).
 - Logs a WARNING message at startup indicating degraded mode

 **Lifecycle**: supervisor-run.sh (invoked by polling loop every 20min, `check_active supervisor`)
-→ lock + memory guard → run preflight.sh (collect metrics) → load formula + context → run
+→ lock + memory guard → run preflight.sh (collect metrics) → **WP agent health recovery**
+(if unhealthy: restart container + recover ci_exhausted issues) → load formula + context → run
 claude -p via agent-sdk.sh → Claude assesses health, auto-fixes, writes journal → `PHASE:done`.
@@ -224,3 +224,108 @@ for _vf in "${_va_root}"/*.md; do
done
[ "$_found_vault" = false ] && echo " None"
echo ""

# ── Woodpecker Agent Health ────────────────────────────────────────────────

echo "## Woodpecker Agent Health"

# Check WP agent container health status
_wp_container="disinto-woodpecker-agent"
_wp_health_status="unknown"
_wp_health_start=""

if command -v docker &>/dev/null; then
  # Get health status via docker inspect
  _wp_health_status=$(docker inspect "$_wp_container" --format '{{.State.Health.Status}}' 2>/dev/null || echo "not_found")
  if [ "$_wp_health_status" = "not_found" ] || [ -z "$_wp_health_status" ]; then
    # Container may not exist or may not have a health check configured;
    # fall back to the plain container state
    _wp_health_status=$(docker inspect "$_wp_container" --format '{{.State.Status}}' 2>/dev/null || echo "not_found")
  fi

  # Get container start time for age calculation
  _wp_start_time=$(docker inspect "$_wp_container" --format '{{.State.StartedAt}}' 2>/dev/null || echo "")
  if [ -n "$_wp_start_time" ] && [ "$_wp_start_time" != "0001-01-01T00:00:00Z" ]; then
    _wp_health_start=$(date -d "$_wp_start_time" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_start_time")
  fi
fi

echo "Container: $_wp_container"
echo "Status: $_wp_health_status"
[ -n "$_wp_health_start" ] && echo "Started: $_wp_health_start"

# Check for gRPC errors in agent logs (last 20 minutes).
# Note: grep -c prints 0 itself on no match, so guard only the exit status;
# an `|| echo "0"` here would emit a second "0" line and corrupt the count.
_wp_grpc_errors=0
if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
  _wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 | grep -c 'grpc error' || true)
  echo "gRPC errors (last 20m): $_wp_grpc_errors"
fi

# Fast-failure heuristic: check for pipelines completing in <60s
_wp_fast_failures=0
_wp_recent_failures=""
if [ -n "${WOODPECKER_REPO_ID:-}" ] && [ "${WOODPECKER_REPO_ID}" != "0" ]; then
  _now=$(date +%s)
  _pipelines=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines?perPage=100" 2>/dev/null || echo '[]')

  # Count failures with duration < 60s in the last 15 minutes
  _wp_fast_failures=$(echo "$_pipelines" | jq --argjson now "$_now" '
    [.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]
    | length' 2>/dev/null || echo "0")

  if [ "$_wp_fast_failures" -gt 0 ]; then
    _wp_recent_failures=$(echo "$_pipelines" | jq -r --argjson now "$_now" '
      [.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]
      | .[] | "\(.number)\t\((.finished - .started))s"' 2>/dev/null || echo "")
  fi
fi

echo "Fast-fail pipelines (<60s, last 15m): $_wp_fast_failures"
if [ -n "$_wp_recent_failures" ] && [ "$_wp_fast_failures" -gt 0 ]; then
  echo "Recent failures:"
  echo "$_wp_recent_failures" | while IFS=$'\t' read -r _num _dur; do
    echo " #$_num: ${_dur}"
  done
fi

# Determine overall WP agent health
_wp_agent_healthy=true
_wp_health_reason=""

if [ "$_wp_health_status" = "not_found" ]; then
  _wp_agent_healthy=false
  _wp_health_reason="Container not running"
elif [ "$_wp_health_status" = "unhealthy" ]; then
  _wp_agent_healthy=false
  _wp_health_reason="Container health check failed"
elif [ "$_wp_health_status" != "running" ] && [ "$_wp_health_status" != "healthy" ]; then
  # "running" comes from .State.Status, "healthy" from .State.Health.Status;
  # both indicate a live container, so neither may be flagged here
  _wp_agent_healthy=false
  _wp_health_reason="Container not in running state: $_wp_health_status"
elif [ "$_wp_grpc_errors" -ge 3 ]; then
  _wp_agent_healthy=false
  _wp_health_reason="High gRPC error count (>=3 in 20m)"
elif [ "$_wp_fast_failures" -ge 3 ]; then
  _wp_agent_healthy=false
  _wp_health_reason="High fast-failure count (>=3 in 15m)"
fi

echo ""
echo "WP Agent Health: $([ "$_wp_agent_healthy" = true ] && echo "healthy" || echo "UNHEALTHY")"
[ -n "$_wp_health_reason" ] && echo "Reason: $_wp_health_reason"
echo ""

# ── WP Agent Health History (for idempotency) ──────────────────────────────

echo "## WP Agent Health History"
# Track the last restart timestamp so the supervisor can enforce its cooldown
_WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
_wp_last_restart="never"
_wp_last_restart_ts=0

if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
  _wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
  if [ -n "$_wp_last_restart_ts" ] && [ "$_wp_last_restart_ts" -gt 0 ] 2>/dev/null; then
    _wp_last_restart=$(date -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
  fi
fi
echo "Last restart: $_wp_last_restart"
echo ""
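One subtlety in the gRPC counting above: `grep -c` already prints `0` when nothing matches, so only its non-zero exit status needs guarding. A minimal standalone demonstration against a static log sample (the sample lines are illustrative, not real agent output):

```shell
# Count 'grpc error' lines the same way the preflight check does, but from a
# static sample instead of `docker logs`.
count_grpc_errors() {
    grep -c 'grpc error' || true   # `|| echo "0"` here would print a second line
}

sample_log='time=10:00 msg="grpc error: code = Unavailable"
time=10:01 msg="pipeline 841 finished"
time=10:02 msg="grpc error: code = Unavailable"'

printf '%s\n' "$sample_log" | count_grpc_errors   # prints 2
```

With the `|| echo "0"` variant, an error-free log would yield the two-line string `0` + newline + `0`, which later numeric comparisons like `[ "$n" -ge 3 ]` would reject.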
@@ -47,6 +47,9 @@ SID_FILE="/tmp/supervisor-session-${PROJECT_NAME}.sid"
 SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md"
 WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run"

+# WP agent container name (configurable via env var)
+export WP_AGENT_CONTAINER_NAME="${WP_AGENT_CONTAINER_NAME:-disinto-woodpecker-agent}"
+
 # Override LOG_AGENT for consistent agent identification
 # shellcheck disable=SC2034 # consumed by agent-sdk.sh and env.sh log()
 LOG_AGENT="supervisor"
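The `${VAR:-default}` expansion used above keeps an operator override while providing a default, and `export` makes the result visible to child processes. A minimal illustration (the `staging-wp-agent` value is a hypothetical override, not a real container name):

```shell
# Default applies only when the variable is unset or empty
unset WP_AGENT_CONTAINER_NAME
export WP_AGENT_CONTAINER_NAME="${WP_AGENT_CONTAINER_NAME:-disinto-woodpecker-agent}"
echo "$WP_AGENT_CONTAINER_NAME"   # prints disinto-woodpecker-agent

# An operator-provided value wins over the default
WP_AGENT_CONTAINER_NAME="staging-wp-agent"
export WP_AGENT_CONTAINER_NAME="${WP_AGENT_CONTAINER_NAME:-disinto-woodpecker-agent}"
echo "$WP_AGENT_CONTAINER_NAME"   # prints staging-wp-agent
```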
@@ -166,6 +169,160 @@ ${FORMULA_CONTENT}
${SCRATCH_INSTRUCTION}
${PROMPT_FOOTER}"

# ── WP Agent Health Recovery ──────────────────────────────────────────────
# Check preflight output for WP agent health issues and trigger recovery if needed
_WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"

# Extract WP agent health status from preflight output.
# Anchored match with -q: match exactly "healthy" (not the substring inside
# "UNHEALTHY") and suppress grep's output so the variable holds only true/false
_wp_agent_healthy=$(grep -q "^WP Agent Health: healthy$" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null && echo "true" || echo "false")
_wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")

if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
  log "WP agent detected as UNHEALTHY: $_wp_health_reason"

  # Idempotency guard: skip if a restart happened within the cooldown window
  _WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
  _wp_last_restart_ts=0
  _wp_last_restart="never"
  if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
    _wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
    if [ -n "$_wp_last_restart_ts" ] && [ "$_wp_last_restart_ts" -gt 0 ] 2>/dev/null; then
      _wp_last_restart=$(date -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
    fi
  fi

  _current_ts=$(date +%s)
  _restart_threshold=300  # 5 minutes between restarts

  if [ -z "$_wp_last_restart_ts" ] || [ "$_wp_last_restart_ts" = "0" ] || [ $((_current_ts - _wp_last_restart_ts)) -gt $_restart_threshold ]; then
    log "Triggering WP agent restart..."

    # Restart the WP agent container
    if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
      _restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
      log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"

      # Update history file
      echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
      echo "LAST_RESTART_TIME=$_restart_time" >> "$_WP_HEALTH_HISTORY_FILE"

      # Post recovery notice to journal
      _journal_file="${OPS_JOURNAL_ROOT}/$(date -u +%Y-%m-%d).md"
      if [ -f "$_journal_file" ]; then
        {
          echo ""
          echo "### WP Agent Recovery - $_restart_time"
          echo ""
          echo "WP agent was unhealthy: $_wp_health_reason"
          echo "Container restarted automatically."
        } >> "$_journal_file"
      fi

      # Scan for issues updated in the last 30 minutes with blocked: ci_exhausted label
      log "Scanning for ci_exhausted issues updated in last 30 minutes..."
      _now_epoch=$(date +%s)
      _thirty_min_ago=$(( _now_epoch - 1800 ))

      # Fetch open issues with blocked label
      _blocked_issues=$(forge_api GET "/issues?state=open&labels=blocked&type=issues&limit=100" 2>/dev/null || echo "[]")
      _blocked_count=$(echo "$_blocked_issues" | jq 'length' 2>/dev/null || echo "0")

      _issues_processed=0
      _issues_recovered=0

      if [ "$_blocked_count" -gt 0 ]; then
        # Process each blocked issue. Feed the loop via process substitution
        # rather than a pipe: a piped `while` runs in a subshell and would
        # discard the counter updates made inside the loop.
        while IFS= read -r issue_json; do
          [ -z "$issue_json" ] && continue

          _issue_num=$(echo "$issue_json" | jq -r '.number // empty')
          _issue_updated=$(echo "$issue_json" | jq -r '.updated_at // empty')
          _issue_labels=$(echo "$issue_json" | jq -r '.labels | map(.name) | join(",")' 2>/dev/null || echo "")

          # Check if issue has ci_exhausted label
          if ! echo "$_issue_labels" | grep -q "ci_exhausted"; then
            continue
          fi

          # Parse updated_at timestamp
          _issue_updated_epoch=$(date -d "$_issue_updated" +%s 2>/dev/null || echo "0")
          _time_since_update=$(( _now_epoch - _issue_updated_epoch ))

          # Check if updated in last 30 minutes
          if [ "$_time_since_update" -lt 1800 ] && [ "$_time_since_update" -ge 0 ]; then
            _issues_processed=$(( _issues_processed + 1 ))

            # Idempotency guard: skip issues already swept by the supervisor
            _issue_body=$(echo "$issue_json" | jq -r '.body // ""' 2>/dev/null || echo "")
            if echo "$_issue_body" | grep -q "<!-- supervisor-swept -->"; then
              log "Issue #$_issue_num already swept by supervisor, skipping"
              continue
            fi

            log "Processing ci_exhausted issue #$_issue_num (updated $_time_since_update seconds ago)"

            # Get issue assignee
            _issue_assignee=$(echo "$issue_json" | jq -r '.assignee.login // empty' 2>/dev/null || echo "")

            # Unassign the issue
            if [ -n "$_issue_assignee" ]; then
              log "Unassigning issue #$_issue_num from $_issue_assignee"
              curl -sf -X PATCH \
                -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
                -H "Content-Type: application/json" \
                "${FORGE_API}/issues/$_issue_num" \
                -d '{"assignees":[]}' >/dev/null 2>&1 || true
            fi

            # Remove blocked label
            _blocked_label_id=$(forge_api GET "/labels" 2>/dev/null | jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || echo "")
            if [ -n "$_blocked_label_id" ]; then
              log "Removing blocked label from issue #$_issue_num"
              curl -sf -X DELETE \
                -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
                "${FORGE_API}/issues/$_issue_num/labels/$_blocked_label_id" >/dev/null 2>&1 || true
            fi

            # Add comment about infra-flake recovery
            _recovery_comment=$(cat <<EOF
<!-- supervisor-swept -->

**Automated Recovery — $(date -u '+%Y-%m-%d %H:%M UTC')**

CI agent was unhealthy between $_restart_time and now. The prior retry budget may have been spent on infra flake, not real failures.

**Recovery Actions:**
- Unassigned from pool and returned for fresh attempt
- CI agent container restarted
- Related pipelines will be retriggered automatically

**Next Steps:**
Please re-attempt this issue. The CI environment has been refreshed.
EOF
)

            curl -sf -X POST \
              -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
              -H "Content-Type: application/json" \
              "${FORGE_API}/issues/$_issue_num/comments" \
              -d "$(jq -n --arg body "$_recovery_comment" '{body: $body}')" >/dev/null 2>&1 || true

            _issues_recovered=$(( _issues_recovered + 1 ))
            log "Recovered issue #$_issue_num - returned to pool"
          fi
        done < <(echo "$_blocked_issues" | jq -c '.[]' 2>/dev/null)
      fi

      log "WP agent restart and issue recovery complete (processed: $_issues_processed, recovered: $_issues_recovered)"
    else
      log "ERROR: Failed to restart WP agent container"
    fi
  else
    log "WP agent restart skipped: last restart at $_wp_last_restart is within the 5-minute cooldown"
  fi
fi

# ── Run agent ─────────────────────────────────────────────────────────────
agent_run --worktree "$WORKTREE" "$PROMPT"
log "agent_run complete"
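The 30-minute recency filter used in the sweep above reduces to a small window check. A sketch under stated assumptions (the helper name is hypothetical; the `-ge 0` guard mirrors the script's protection against `updated_at` timestamps that sit in the future relative to the local clock):

```shell
# True when the update happened within the window (default 1800s = 30 min)
# and not in the future relative to the local clock.
updated_within_window() {
    updated_epoch="$1"; now_epoch="$2"; window="${3:-1800}"
    delta=$(( now_epoch - updated_epoch ))
    [ "$delta" -ge 0 ] && [ "$delta" -lt "$window" ]
}
```

Factoring the window check out like this keeps the epoch arithmetic in one place instead of repeating it for each issue field.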