fix: incident: WP gRPC flake burned dev-qwen CI retry budget on #842 (2026-04-16) (#867)

2026-04-17 01:22:59 +00:00 · 2026-04-17 01:22:59 +00:00 · 04ead1fbdc
commit 04ead1fbdc
parent c3e58e88ed
4 changed files with 287 additions and 3 deletions
--- a/supervisor/AGENTS.md
+++ b/supervisor/AGENTS.md
@ -24,7 +24,9 @@ Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `chec
  files for `PHASE:escalate` entries and auto-removes any whose linked issue
  is confirmed closed (24h grace period after closure to avoid races). Reports
  **stale crashed worktrees** (worktrees preserved after crash) — supervisor
-  housekeeping removes them after 24h
+  housekeeping removes them after 24h. Also collects **Woodpecker agent health**:
+  container status, gRPC error count (last 20m), fast-failure pipelines (<60s,
+  last 15m), and overall health determination.
 - `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review,
  health-assessment, decide-actions, report, journal) with `needs` dependencies.
  Claude evaluates all metrics and takes actions in a single interactive session
@ -47,5 +49,6 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).
 - Logs a WARNING message at startup indicating degraded mode

 **Lifecycle**: supervisor-run.sh (invoked by polling loop every 20min, `check_active supervisor`)
-→ lock + memory guard → run preflight.sh (collect metrics) → load formula + context → run
+→ lock + memory guard → run preflight.sh (collect metrics) → **WP agent health recovery**
+(if unhealthy: restart container + recover ci_exhausted issues) → load formula + context → run
 claude -p via agent-sdk.sh → Claude assesses health, auto-fixes, writes journal → `PHASE:done`.
--- a/supervisor/preflight.sh
+++ b/supervisor/preflight.sh
@ -224,3 +224,108 @@ for _vf in "${_va_root}"/*.md; do
 done
 [ "$_found_vault" = false ] && echo "  None"
 echo ""
+
+# ── Woodpecker Agent Health ────────────────────────────────────────────────
+
+echo "## Woodpecker Agent Health"
+
+# Check WP agent container health status.
+# Honour WP_AGENT_CONTAINER_NAME (the name supervisor-run.sh passes to
+# `docker restart`) so the container inspected here is the same one that gets
+# restarted; fall back to the default name when it is unset or empty.
+_wp_container="${WP_AGENT_CONTAINER_NAME:-disinto-woodpecker-agent}"
+_wp_health_status="unknown"
+_wp_health_start=""
+
+if command -v docker &>/dev/null; then
+  # Get health status via docker inspect
+  _wp_health_status=$(docker inspect "$_wp_container" --format '{{.State.Health.Status}}' 2>/dev/null || echo "not_found")
+  if [ "$_wp_health_status" = "not_found" ] || [ -z "$_wp_health_status" ]; then
+    # Container may not exist or not have health check configured:
+    # fall back to the plain container state.
+    _wp_health_status=$(docker inspect "$_wp_container" --format '{{.State.Status}}' 2>/dev/null || echo "not_found")
+  fi
+
+  # Get container start time for age calculation
+  _wp_start_time=$(docker inspect "$_wp_container" --format '{{.State.StartedAt}}' 2>/dev/null || echo "")
+  # 0001-01-01T00:00:00Z is skipped rather than shown as a start time.
+  if [ -n "$_wp_start_time" ] && [ "$_wp_start_time" != "0001-01-01T00:00:00Z" ]; then
+    # If date cannot parse the value, show the raw timestamp instead.
+    _wp_health_start=$(date -d "$_wp_start_time" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_start_time")
+  fi
+fi
+
+echo "Container: $_wp_container"
+echo "Status: $_wp_health_status"
+[ -n "$_wp_health_start" ] && echo "Started: $_wp_health_start"
+
+# Check for gRPC errors in agent logs (last 20 minutes)
+_wp_grpc_errors=0
+if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
+  # `docker logs` replays the container's stderr on its own stderr, so merge it
+  # into the pipe (2>&1) instead of discarding it. `grep -c` already prints 0
+  # when nothing matches but exits 1, so only neutralise the exit status here;
+  # appending another "0" would yield a two-line value.
+  _wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 | grep -c 'grpc error' || true)
+  # Guard the later numeric comparison against an empty/garbled count.
+  case "$_wp_grpc_errors" in
+    ''|*[!0-9]*) _wp_grpc_errors=0 ;;
+  esac
+  echo "gRPC errors (last 20m): $_wp_grpc_errors"
+fi
+
+# Fast-failure heuristic: failed pipelines whose (finished - started) is under
+# 60s and which finished less than 900s (15 minutes) ago.
+_wp_fast_failures=0
+_wp_recent_failures=""
+if [ -n "${WOODPECKER_REPO_ID:-}" ] && [ "${WOODPECKER_REPO_ID}" != "0" ]; then
+  _now=$(date +%s)
+  _pipelines=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines?perPage=100" 2>/dev/null || echo '[]')
+
+  # Shared jq selection, used for both the count and the per-pipeline listing.
+  _wp_fast_filter='
+    [.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]'
+
+  # Count matching pipelines; any jq failure is reported as 0.
+  _wp_fast_failures=$(echo "$_pipelines" \
+    | jq --argjson now "$_now" "${_wp_fast_filter}"' | length' 2>/dev/null || echo "0")
+
+  if [ "$_wp_fast_failures" -gt 0 ]; then
+    # One "<number>\t<duration>s" line per matching pipeline.
+    _wp_recent_failures=$(echo "$_pipelines" \
+      | jq -r --argjson now "$_now" "${_wp_fast_filter}"' | .[] | "\(.number)\t\((.finished - .started))s"' 2>/dev/null || echo "")
+  fi
+fi
+
+echo "Fast-fail pipelines (<60s, last 15m): $_wp_fast_failures"
+if [ -n "$_wp_recent_failures" ] && [ "$_wp_fast_failures" -gt 0 ]; then
+  echo "Recent failures:"
+  printf '%s\n' "$_wp_recent_failures" | while IFS=$'\t' read -r _num _dur; do
+    printf '  #%s: %s\n' "$_num" "$_dur"
+  done
+fi
+
+# Determine overall WP agent health.
+# Prints "WP Agent Health: healthy|UNHEALTHY" and, when unhealthy, a
+# "Reason: ..." line.
+_wp_agent_healthy=true
+_wp_health_reason=""
+
+# The counters feed numeric tests below; coerce anything that is not a plain
+# non-negative integer to 0 so a malformed value cannot break the comparison.
+case "$_wp_grpc_errors" in
+  ''|*[!0-9]*) _wp_grpc_errors=0 ;;
+esac
+case "$_wp_fast_failures" in
+  ''|*[!0-9]*) _wp_fast_failures=0 ;;
+esac
+
+case "$_wp_health_status" in
+  not_found)
+    _wp_agent_healthy=false
+    _wp_health_reason="Container not running"
+    ;;
+  unhealthy)
+    _wp_agent_healthy=false
+    _wp_health_reason="Container health check failed"
+    ;;
+  running|healthy|starting)
+    # "running" comes from .State.Status (no health check configured);
+    # "healthy"/"starting" come from .State.Health.Status and also mean the
+    # container is up, so they must not be reported as "not running".
+    if [ "$_wp_grpc_errors" -ge 3 ]; then
+      _wp_agent_healthy=false
+      _wp_health_reason="High gRPC error count (>=3 in 20m)"
+    elif [ "$_wp_fast_failures" -ge 3 ]; then
+      _wp_agent_healthy=false
+      _wp_health_reason="High fast-failure count (>=3 in 15m)"
+    fi
+    ;;
+  *)
+    _wp_agent_healthy=false
+    _wp_health_reason="Container not in running state: $_wp_health_status"
+    ;;
+esac
+
+echo ""
+if [ "$_wp_agent_healthy" = true ]; then
+  echo "WP Agent Health: healthy"
+else
+  echo "WP Agent Health: UNHEALTHY"
+fi
+[ -n "$_wp_health_reason" ] && echo "Reason: $_wp_health_reason"
+echo ""
+
+# ── WP Agent Health History (for idempotency) ──────────────────────────────
+
+echo "## WP Agent Health History"
+# Report when the agent container was last restarted, read from the
+# LAST_RESTART_TS= line of the history file. Prints "never" when the file is
+# missing or holds no usable timestamp.
+_WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
+_wp_last_restart_ts=0
+_wp_last_restart="never"
+
+if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
+  _wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
+  if [ -n "$_wp_last_restart_ts" ]; then
+    # A non-numeric value makes the test fail quietly and leaves "never".
+    if [ "$_wp_last_restart_ts" -gt 0 ] 2>/dev/null; then
+      # Fall back to the raw epoch value if date cannot format it.
+      _wp_last_restart=$(date -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
+    fi
+  fi
+fi
+echo "Last restart: $_wp_last_restart"
+echo ""
--- a/supervisor/supervisor-run.sh
+++ b/supervisor/supervisor-run.sh
@ -47,6 +47,9 @@ SID_FILE="/tmp/supervisor-session-${PROJECT_NAME}.sid"
 SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md"
 WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run"

+# WP agent container name (configurable via env var).
+# Keeps a caller-supplied value; falls back to "disinto-woodpecker-agent" when
+# the variable is unset or empty. Exported so child processes inherit it.
+export WP_AGENT_CONTAINER_NAME="${WP_AGENT_CONTAINER_NAME:-disinto-woodpecker-agent}"
+
 # Override LOG_AGENT for consistent agent identification
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh and env.sh log()
 LOG_AGENT="supervisor"
@ -166,6 +169,159 @@ ${FORMULA_CONTENT}
 ${SCRATCH_INSTRUCTION}
 ${PROMPT_FOOTER}"

+# ── WP Agent Health Recovery ──────────────────────────────────────────────
+# Reads the "WP Agent Health:" / "Reason:" lines from the preflight output and,
+# when the agent is reported UNHEALTHY:
+#   1. restarts the agent container (skipped if the last recorded restart is
+#      no more than _restart_threshold seconds old),
+#   2. records the restart in the history file and, if it exists, today's journal,
+#   3. returns open blocked/ci_exhausted issues updated in the last 30 minutes
+#      to the pool (unassign, drop "blocked" label, post a recovery comment).
+_WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
+echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
+
+# Extract WP agent health status from preflight output. The match is
+# case-sensitive, so "UNHEALTHY" does not count as "healthy".
+if grep "^WP Agent Health:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | grep -q "healthy"; then
+  _wp_agent_healthy="true"
+else
+  _wp_agent_healthy="false"
+fi
+_wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
+
+if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
+  log "WP agent detected as UNHEALTHY: $_wp_health_reason"
+
+  # Idempotency guard - when did we last restart the agent?
+  _WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
+  _wp_last_restart_ts=0
+  _wp_last_restart="never"
+  if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
+    _wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
+    # An empty or corrupt value must never reach the arithmetic below.
+    case "$_wp_last_restart_ts" in
+      ''|*[!0-9]*) _wp_last_restart_ts=0 ;;
+    esac
+    if [ "$_wp_last_restart_ts" != "0" ]; then
+      _wp_last_restart=$(date -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
+    fi
+  fi
+
+  _current_ts=$(date +%s)
+  _restart_threshold=300  # 5 minutes between restarts
+
+  if [ "$_wp_last_restart_ts" = "0" ] || [ $((_current_ts - _wp_last_restart_ts)) -gt "$_restart_threshold" ]; then
+    log "Triggering WP agent restart..."
+
+    # Restart the WP agent container
+    if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
+      _restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
+      log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"
+
+      # Update history file
+      echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
+      echo "LAST_RESTART_TIME=$_restart_time" >> "$_WP_HEALTH_HISTORY_FILE"
+
+      # Post recovery notice to journal (only if today's journal already exists)
+      _journal_file="${OPS_JOURNAL_ROOT}/$(date -u +%Y-%m-%d).md"
+      if [ -f "$_journal_file" ]; then
+        {
+          echo ""
+          echo "### WP Agent Recovery - $_restart_time"
+          echo ""
+          echo "WP agent was unhealthy: $_wp_health_reason"
+          echo "Container restarted automatically."
+        } >> "$_journal_file"
+      fi
+
+      # Scan for issues updated in the last 30 minutes with blocked: ci_exhausted label
+      log "Scanning for ci_exhausted issues updated in last 30 minutes..."
+      _now_epoch=$(date +%s)
+
+      # Fetch open issues with blocked label
+      _blocked_issues=$(forge_api GET "/issues?state=open&labels=blocked&type=issues&limit=100" 2>/dev/null || echo "[]")
+      _blocked_count=$(echo "$_blocked_issues" | jq 'length' 2>/dev/null || echo "0")
+      case "$_blocked_count" in
+        ''|*[!0-9]*) _blocked_count=0 ;;
+      esac
+
+      if [ "$_blocked_count" -gt 0 ]; then
+        # The label id does not depend on the issue: look it up once.
+        _blocked_label_id=$(forge_api GET "/labels" 2>/dev/null | jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || echo "")
+
+        # Process each blocked issue (the loop runs in a pipeline subshell, so
+        # nothing assigned inside it is visible afterwards).
+        echo "$_blocked_issues" | jq -c '.[]' 2>/dev/null | while IFS= read -r issue_json; do
+          [ -z "$issue_json" ] && continue
+
+          _issue_num=$(echo "$issue_json" | jq -r '.number // empty')
+          [ -z "$_issue_num" ] && continue
+          _issue_updated=$(echo "$issue_json" | jq -r '.updated_at // empty')
+          _issue_labels=$(echo "$issue_json" | jq -r '.labels | map(.name) | join(",")' 2>/dev/null || echo "")
+
+          # Check if issue has ci_exhausted label
+          if ! echo "$_issue_labels" | grep -q "ci_exhausted"; then
+            continue
+          fi
+
+          # Parse updated_at timestamp; an unparsable value becomes epoch 0 and
+          # therefore falls outside the 30-minute window.
+          _issue_updated_epoch=$(date -d "$_issue_updated" +%s 2>/dev/null || echo "0")
+          _time_since_update=$(( _now_epoch - _issue_updated_epoch ))
+
+          # Check if updated in last 30 minutes
+          if [ "$_time_since_update" -lt 1800 ] && [ "$_time_since_update" -ge 0 ]; then
+            # Idempotency guard - already swept by supervisor? The marker is
+            # posted in a comment (see below), so the comments have to be
+            # checked as well as the issue body.
+            _issue_body=$(echo "$issue_json" | jq -r '.body // ""' 2>/dev/null || echo "")
+            _issue_comments=$(forge_api GET "/issues/${_issue_num}/comments" 2>/dev/null || echo "[]")
+            if echo "$_issue_body" | grep -qF -- "<!-- supervisor-swept -->" \
+              || echo "$_issue_comments" | jq -e 'any(.[]; (.body // "") | contains("<!-- supervisor-swept -->"))' >/dev/null 2>&1; then
+              log "Issue #$_issue_num already swept by supervisor, skipping"
+              continue
+            fi
+
+            log "Processing ci_exhausted issue #$_issue_num (updated $_time_since_update seconds ago)"
+
+            # Get issue assignee
+            _issue_assignee=$(echo "$issue_json" | jq -r '.assignee.login // empty' 2>/dev/null || echo "")
+
+            # Unassign the issue (best effort)
+            if [ -n "$_issue_assignee" ]; then
+              log "Unassigning issue #$_issue_num from $_issue_assignee"
+              curl -sf -X PATCH \
+                -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
+                -H "Content-Type: application/json" \
+                "${FORGE_API}/issues/$_issue_num" \
+                -d '{"assignees":[]}' >/dev/null 2>&1 || true
+            fi
+
+            # Remove blocked label (best effort)
+            if [ -n "$_blocked_label_id" ]; then
+              log "Removing blocked label from issue #$_issue_num"
+              curl -sf -X DELETE \
+                -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
+                "${FORGE_API}/issues/$_issue_num/labels/$_blocked_label_id" >/dev/null 2>&1 || true
+            fi
+
+            # Add comment about infra-flake recovery
+            _recovery_comment=$(cat <<EOF
+<!-- supervisor-swept -->
+
+**Automated Recovery — $(date -u '+%Y-%m-%d %H:%M UTC')**
+
+CI agent was unhealthy (${_wp_health_reason}) and was restarted at $_restart_time. The prior retry budget may have been spent on infra flake, not real failures.
+
+**Recovery Actions:**
+- Unassigned from pool and returned for fresh attempt
+- CI agent container restarted
+- Related pipelines will be retriggered automatically
+
+**Next Steps:**
+Please re-attempt this issue. The CI environment has been refreshed.
+EOF
+)
+
+            # The comment is multi-line Markdown: let jq JSON-encode it instead
+            # of splicing raw text into the request body.
+            _comment_payload=$(jq -n --arg body "$_recovery_comment" '{body: $body}')
+
+            curl -sf -X POST \
+              -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
+              -H "Content-Type: application/json" \
+              "${FORGE_API}/issues/$_issue_num/comments" \
+              -d "$_comment_payload" >/dev/null 2>&1 || true
+
+            log "Recovered issue #$_issue_num - returned to pool"
+          fi
+        done
+      fi
+
+      log "WP agent restart and issue recovery complete"
+    else
+      log "ERROR: Failed to restart WP agent container"
+    fi
+  else
+    log "WP agent restart already performed in this run (since $_wp_last_restart), skipping"
+  fi
+fi
+
 # ── Run agent ─────────────────────────────────────────────────────────────
 agent_run --worktree "$WORKTREE" "$PROMPT"
 log "agent_run complete"