fix: incident: WP gRPC flake burned dev-qwen CI retry budget on #842 (2026-04-16) (#867)

2026-04-17 01:22:59 +00:00 · 2026-04-17 01:22:59 +00:00 · 31b5e11006
commit 31b5e11006
parent f93600a1cf
4 changed files with 287 additions and 3 deletions
--- a/formulas/run-supervisor.toml
+++ b/formulas/run-supervisor.toml
@ -29,7 +29,7 @@ and injected into your prompt above. Review them now.
 1. Read the injected metrics data carefully (System Resources, Docker,
   Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs,
-   CI Pipelines, Open PRs, Issue Status, Stale Worktrees).
+   CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**).
   Note: preflight.sh auto-removes PHASE:escalate files for closed issues
   (24h grace period). Check the "Stale Phase Cleanup" section for any
   files cleaned or in grace period this run.
@ -75,6 +75,10 @@ Categorize every finding from the metrics into priority levels.
 - Dev/action sessions in PHASE:escalate for > 24h (session timeout)
  (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight;
  this check covers sessions where the issue is still open)
 - **Woodpecker agent unhealthy** — see "Woodpecker Agent Health" section in preflight:
  - Container not running or in unhealthy state
  - gRPC errors >= 3 in last 20 minutes
  - Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes
 ### P3 — Factory degraded
 - PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed
@ -100,6 +104,17 @@ For each finding from the health assessment, decide and execute an action.
 ### Auto-fixable (execute these directly)
 **P2 Woodpecker agent unhealthy:**
 The supervisor-run.sh script automatically handles WP agent recovery:
 - Detects unhealthy state via preflight.sh health checks
 - Restarts container via `docker restart`
 - Scans for `blocked: ci_exhausted` issues updated in last 30 minutes
 - Unassigns and removes blocked label from affected issues
 - Posts recovery comment with infra-flake context
 - Avoids duplicate restarts via 5-minute cooldown in history file
 **P0 Memory crisis:**
 **P0 Memory crisis:**
  # Kill stale one-shot claude processes (>3h old)
  pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
@ -248,6 +263,11 @@ Format:
  - <what was fixed>
  (or "No actions needed")
  ### WP Agent Recovery (if applicable)
  - WP agent restart: <time of restart or "none">
  - Issues recovered: <count>
  - Reason: <health check reason or "healthy">
  ### Vault items filed
  - vault/pending/<id>.md — <reason>
  (or "None")
--- a/supervisor/AGENTS.md
+++ b/supervisor/AGENTS.md
@ -24,7 +24,9 @@ Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `chec
  files for `PHASE:escalate` entries and auto-removes any whose linked issue
  is confirmed closed (24h grace period after closure to avoid races). Reports
  **stale crashed worktrees** (worktrees preserved after crash) — supervisor
-  housekeeping removes them after 24h
+  housekeeping removes them after 24h. Also collects **Woodpecker agent health**:
  container status, gRPC error count (last 20m), fast-failure pipelines (<60s,
  last 15m), and overall health determination.
 - `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review,
  health-assessment, decide-actions, report, journal) with `needs` dependencies.
  Claude evaluates all metrics and takes actions in a single interactive session
@ -47,5 +49,6 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).
 - Logs a WARNING message at startup indicating degraded mode
 **Lifecycle**: supervisor-run.sh (invoked by polling loop every 20min, `check_active supervisor`)
-→ lock + memory guard → run preflight.sh (collect metrics) → load formula + context → run
+→ lock + memory guard → run preflight.sh (collect metrics) → load formula + context →
 **WP agent health recovery** (if unhealthy: restart container + recover ci_exhausted issues) → run
 claude -p via agent-sdk.sh → Claude assesses health, auto-fixes, writes journal → `PHASE:done`.
--- a/supervisor/preflight.sh
+++ b/supervisor/preflight.sh
@ -224,3 +224,108 @@ for _vf in "${_va_root}"/*.md; do
 done
 [ "$_found_vault" = false ] && echo "  None"
 echo ""
 # ── Woodpecker Agent Health ────────────────────────────────────────────────
 echo "## Woodpecker Agent Health"
 # Reports the Woodpecker agent container's Docker status and start time.
 # The container name honours WP_AGENT_CONTAINER_NAME (the override that
 # supervisor-run.sh exports) so the container inspected here is the same one
 # the supervisor restarts; it falls back to the default name when unset.
 _wp_container="${WP_AGENT_CONTAINER_NAME:-disinto-woodpecker-agent}"
 _wp_health_status="unknown"
 _wp_health_start=""
 # _wp_container_status <container>
 # Prints the container's health-check status, or its plain state when that
 # query fails or prints nothing; prints "not_found" when neither query works.
 _wp_container_status() {
  local _status
  _status=$(docker inspect "$1" --format '{{.State.Health.Status}}' 2>/dev/null || echo "not_found")
  if [ "$_status" = "not_found" ] || [ -z "$_status" ]; then
    # Container may not exist or not have a health check configured
    _status=$(docker inspect "$1" --format '{{.State.Status}}' 2>/dev/null || echo "not_found")
  fi
  echo "$_status"
 }
 # _wp_format_started <timestamp>
 # Prints <timestamp> as 'YYYY-MM-DD HH:MM UTC'. -u is required: the label says
 # UTC, and without it date renders the host's local time. Falls back to the
 # raw timestamp when date cannot parse it.
 _wp_format_started() {
  date -u -d "$1" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$1"
 }
 if command -v docker &>/dev/null; then
  _wp_health_status=$(_wp_container_status "$_wp_container")
  # 0001-01-01T00:00:00Z is treated as "no start time recorded"
  _wp_start_time=$(docker inspect "$_wp_container" --format '{{.State.StartedAt}}' 2>/dev/null || echo "")
  if [ -n "$_wp_start_time" ] && [ "$_wp_start_time" != "0001-01-01T00:00:00Z" ]; then
    _wp_health_start=$(_wp_format_started "$_wp_start_time")
  fi
 fi
 echo "Container: $_wp_container"
 echo "Status: $_wp_health_status"
 [ -n "$_wp_health_start" ] && echo "Started: $_wp_health_start"
 # Check for gRPC errors in agent logs (last 20 minutes)
 # _wp_count_grpc_errors <container>
 # Prints how many of the container's log lines from the last 20 minutes
 # contain 'grpc error' — always exactly one non-negative integer.
 _wp_count_grpc_errors() {
  local _count
  # 2>&1 alone keeps the stderr stream in the pipe; the former trailing
  # 2>/dev/null threw it away again. grep -c already prints "0" when nothing
  # matches but exits 1, so its status is ignored instead of appending a
  # second "0" to the captured value.
  _count=$(docker logs --since 20m "$1" 2>&1 | grep -c 'grpc error' || true)
  case "$_count" in
    ''|*[!0-9]*) _count=0 ;;
  esac
  echo "$_count"
 }
 _wp_grpc_errors=0
 if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
  _wp_grpc_errors=$(_wp_count_grpc_errors "$_wp_container")
  echo "gRPC errors (last 20m): $_wp_grpc_errors"
 fi
 # Fast-failure heuristic: failed pipelines that ran for under 60s and finished
 # within the last 15 minutes. Skipped when no Woodpecker repo id is configured.
 _wp_fast_failures=0
 _wp_recent_failures=""
 if [ -n "${WOODPECKER_REPO_ID:-}" ] && [ "${WOODPECKER_REPO_ID}" != "0" ]; then
  _now=$(date +%s)
  _pipelines=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines?perPage=100" 2>/dev/null || echo '[]')
  # Tally the matching pipelines; any jq failure counts as zero
  _wp_fast_failures=$(jq --argjson now "$_now" '
    [.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]
    | length' 2>/dev/null <<<"$_pipelines" || echo "0")
  if [ "$_wp_fast_failures" -gt 0 ]; then
    # One "<number><TAB><duration>s" row per matching pipeline
    _wp_recent_failures=$(jq -r --argjson now "$_now" '
      [.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]
      | .[] | "\(.number)\t\((.finished - .started))s"' 2>/dev/null <<<"$_pipelines" || echo "")
  fi
 fi
 echo "Fast-fail pipelines (<60s, last 15m): $_wp_fast_failures"
 if [ -n "$_wp_recent_failures" ] && [ "$_wp_fast_failures" -gt 0 ]; then
  echo "Recent failures:"
  while IFS=$'\t' read -r _pipeline_no _elapsed; do
    echo "  #$_pipeline_no: ${_elapsed}"
  done <<<"$_wp_recent_failures"
 fi
 # Determine overall WP agent health
 # _wp_evaluate_health <status> <grpc_errors> <fast_failures>
 # Sets _wp_agent_healthy (true|false) and _wp_health_reason (empty when
 # healthy). <status> is either a plain container state or a health-check
 # status, so "healthy" and "starting" are accepted alongside "running";
 # previously a passing health check was reported as UNHEALTHY. Only then are
 # the gRPC and fast-failure thresholds (>= 3 each) applied.
 _wp_evaluate_health() {
  local _status="$1" _grpc="$2" _fast="$3"
  # A non-numeric counter would make the -ge tests error out; count it as 0
  case "$_grpc" in
    ''|*[!0-9]*) _grpc=0 ;;
  esac
  case "$_fast" in
    ''|*[!0-9]*) _fast=0 ;;
  esac
  _wp_agent_healthy=true
  _wp_health_reason=""
  case "$_status" in
    not_found)
      _wp_agent_healthy=false
      _wp_health_reason="Container not running"
      ;;
    unhealthy)
      _wp_agent_healthy=false
      _wp_health_reason="Container health check failed"
      ;;
    running|healthy|starting)
      if [ "$_grpc" -ge 3 ]; then
        _wp_agent_healthy=false
        _wp_health_reason="High gRPC error count (>=3 in 20m)"
      elif [ "$_fast" -ge 3 ]; then
        _wp_agent_healthy=false
        _wp_health_reason="High fast-failure count (>=3 in 15m)"
      fi
      ;;
    *)
      _wp_agent_healthy=false
      _wp_health_reason="Container not in running state: $_status"
      ;;
  esac
 }
 _wp_evaluate_health "$_wp_health_status" "$_wp_grpc_errors" "$_wp_fast_failures"
 echo ""
 echo "WP Agent Health: $([ "$_wp_agent_healthy" = true ] && echo "healthy" || echo "UNHEALTHY")"
 [ -n "$_wp_health_reason" ] && echo "Reason: $_wp_health_reason"
 echo ""
 # ── WP Agent Health History (for idempotency) ──────────────────────────────
 echo "## WP Agent Health History"
 # Reports when the WP agent was last restarted, read from the history file
 # named below. The file outlives a single run, so it reflects earlier runs too.
 #
 # _wp_last_restart_epoch <history-file>
 # Prints the LAST_RESTART_TS value from <history-file>, or 0 when the file is
 # missing, has no such entry, or the value is not a plain integer.
 _wp_last_restart_epoch() {
  local _ts=""
  if [ -f "$1" ]; then
    _ts=$(grep -m1 '^LAST_RESTART_TS=' "$1" 2>/dev/null | cut -d= -f2 || true)
  fi
  case "$_ts" in
    ''|*[!0-9]*) _ts=0 ;;
  esac
  echo "$_ts"
 }
 # _wp_format_restart_epoch <epoch>
 # Prints "never" for 0, otherwise the epoch as 'YYYY-MM-DD HH:MM UTC'. -u is
 # required so the printed time really is UTC rather than host-local time.
 _wp_format_restart_epoch() {
  if [ "$1" -gt 0 ] 2>/dev/null; then
    date -u -d "@$1" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$1"
  else
    echo "never"
  fi
 }
 _WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
 _wp_last_restart_ts=$(_wp_last_restart_epoch "$_WP_HEALTH_HISTORY_FILE")
 _wp_last_restart=$(_wp_format_restart_epoch "$_wp_last_restart_ts")
 echo "Last restart: $_wp_last_restart"
 echo ""
--- a/supervisor/supervisor-run.sh
+++ b/supervisor/supervisor-run.sh
@ -47,6 +47,9 @@ SID_FILE="/tmp/supervisor-session-${PROJECT_NAME}.sid"
 SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md"
 WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run"
 # Name of the Woodpecker agent container. A value already present in the
 # environment wins; otherwise the default below is assigned. Exported either way.
 : "${WP_AGENT_CONTAINER_NAME:=disinto-woodpecker-agent}"
 export WP_AGENT_CONTAINER_NAME
 # Override LOG_AGENT for consistent agent identification
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh and env.sh log()
 LOG_AGENT="supervisor"
@ -166,6 +169,159 @@ ${FORMULA_CONTENT}
 ${SCRATCH_INSTRUCTION}
 ${PROMPT_FOOTER}"
 # ── WP Agent Health Recovery ──────────────────────────────────────────────
 # Reads the WP agent verdict from the preflight output. When the agent is
 # UNHEALTHY and no restart was recorded within the cooldown, this restarts the
 # container, records the restart, notes it in today's journal (if that file
 # exists), and returns recently updated ci_exhausted issues to the pool.
 #
 # _wp_extract_health_reason
 # Reads preflight output on stdin and prints the "Reason:" text found on the
 # line directly after "WP Agent Health:". "Reason:" lines elsewhere in the
 # output are ignored. Prints nothing when there is no such line.
 _wp_extract_health_reason() {
  awk '
    seen { if (sub(/^Reason: /, "")) print; exit }
    /^WP Agent Health:/ { seen = 1 }
  '
 }
 # _wp_restart_due <last_restart_epoch> <now_epoch> <cooldown_seconds>
 # Succeeds when a restart is allowed: no usable previous timestamp (empty,
 # non-numeric or 0), or more than <cooldown_seconds> since the last restart.
 _wp_restart_due() {
  case "$1" in
    ''|*[!0-9]*) return 0 ;;
  esac
  [ "$1" -eq 0 ] && return 0
  [ $(( $2 - 10#$1 )) -gt "$3" ]
 }
 # _wp_ci_exhausted_candidates
 # Reads a JSON array of issues on stdin and prints one tab-separated row per
 # issue that has a label containing "ci_exhausted":
 #   number, updated_at ("-" if absent), yes|no (body has the sweep marker),
 #   assignee login (may be empty — kept last so the row still splits cleanly).
 _wp_ci_exhausted_candidates() {
  jq -r '
    .[]
    | select(.number != null)
    | select(any(.labels[]?; (.name // "") | contains("ci_exhausted")))
    | [ .number,
        (.updated_at // "-"),
        (if ((.body // "") | contains("<!-- supervisor-swept -->")) then "yes" else "no" end),
        (.assignee.login // "")
      ]
    | @tsv' 2>/dev/null
 }
 # _wp_issue_has_sweep_comment <issue_number>
 # Succeeds when one of the issue's comments carries the sweep marker. The
 # marker is posted as a comment (see below), so the issue body alone never
 # shows it. NOTE(review): assumes forge_api GET on the comments path returns
 # a JSON array of objects with a "body" field — confirm against the forge API.
 _wp_issue_has_sweep_comment() {
  forge_api GET "/issues/$1/comments" 2>/dev/null \
    | jq -e 'any(.[]?; (.body // "") | contains("<!-- supervisor-swept -->"))' >/dev/null 2>&1
 }
 # _wp_recovery_payload <restart_time> <reason>
 # Prints the JSON request body for the recovery comment. jq does the string
 # encoding; splicing the raw multi-line text into JSON produced an invalid body.
 _wp_recovery_payload() {
  local _text
  _text=$(printf '%s\n' \
    '<!-- supervisor-swept -->' \
    "**Automated Recovery — $(date -u '+%Y-%m-%d %H:%M UTC')**" \
    '' \
    "CI agent was unhealthy ($2) and was restarted at $1. The prior retry budget may have been spent on infra flake, not real failures." \
    '' \
    '**Recovery Actions:**' \
    '- Unassigned from pool and returned for fresh attempt' \
    '- CI agent container restarted' \
    '- Related pipelines will be retriggered automatically' \
    '' \
    '**Next Steps:**' \
    'Please re-attempt this issue. The CI environment has been refreshed.')
  jq -n --arg body "$_text" '{body: $body}'
 }
 # Keep a copy of the preflight output on disk
 _WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
 echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
 # Extract WP agent health status from preflight output
 if grep -q '^WP Agent Health: healthy' <<<"$PREFLIGHT_OUTPUT"; then
  _wp_agent_healthy="true"
 else
  _wp_agent_healthy="false"
 fi
 _wp_health_reason=$(_wp_extract_health_reason <<<"$PREFLIGHT_OUTPUT")
 if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
  log "WP agent detected as UNHEALTHY: $_wp_health_reason"
  # Cooldown guard: the history file persists between supervisor runs
  _WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
  _wp_last_restart_ts=0
  _wp_last_restart="never"
  if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
    _wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
    # Anything but a plain integer is treated as "no previous restart"
    case "$_wp_last_restart_ts" in
      ''|*[!0-9]*) _wp_last_restart_ts=0 ;;
    esac
    if [ "$_wp_last_restart_ts" != "0" ]; then
      _wp_last_restart=$(date -u -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
    fi
  fi
  _current_ts=$(date +%s)
  _restart_threshold=300  # 5 minutes between restarts
  if _wp_restart_due "$_wp_last_restart_ts" "$_current_ts" "$_restart_threshold"; then
    log "Triggering WP agent restart..."
    # Restart the WP agent container
    if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
      _restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
      log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"
      # Update history file
      echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
      echo "LAST_RESTART_TIME=$_restart_time" >> "$_WP_HEALTH_HISTORY_FILE"
      # Post recovery notice to journal (only when today's journal already exists)
      _journal_file="${OPS_JOURNAL_ROOT}/$(date -u +%Y-%m-%d).md"
      if [ -f "$_journal_file" ]; then
        {
          echo ""
          echo "### WP Agent Recovery - $_restart_time"
          echo ""
          echo "WP agent was unhealthy: $_wp_health_reason"
          echo "Container restarted automatically."
        } >> "$_journal_file"
      fi
      # Scan for issues updated in the last 30 minutes with blocked: ci_exhausted label
      log "Scanning for ci_exhausted issues updated in last 30 minutes..."
      _now_epoch=$(date +%s)
      # Fetch open issues with blocked label
      _blocked_issues=$(forge_api GET "/issues?state=open&labels=blocked&type=issues&limit=100" 2>/dev/null || echo "[]")
      # Looked up once here instead of once per issue
      _blocked_label_id=$(forge_api GET "/labels" 2>/dev/null | jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || echo "")
      _recovery_payload=$(_wp_recovery_payload "$_restart_time" "$_wp_health_reason" 2>/dev/null || echo "")
      _issues_processed=0
      _issues_recovered=0
      # Process substitution keeps the loop in this shell so the counters survive
      while IFS=$'\t' read -r _issue_num _issue_updated _issue_swept _issue_assignee; do
        [ -n "$_issue_num" ] || continue
        # Only issues updated within the last 30 minutes qualify
        _issue_updated_epoch=$(date -d "$_issue_updated" +%s 2>/dev/null || echo "0")
        _time_since_update=$(( _now_epoch - _issue_updated_epoch ))
        if [ "$_time_since_update" -ge 1800 ] || [ "$_time_since_update" -lt 0 ]; then
          continue
        fi
        _issues_processed=$(( _issues_processed + 1 ))
        # Idempotency guard - already swept by supervisor?
        if [ "$_issue_swept" = "yes" ] || _wp_issue_has_sweep_comment "$_issue_num"; then
          log "Issue #$_issue_num already swept by supervisor, skipping"
          continue
        fi
        log "Processing ci_exhausted issue #$_issue_num (updated $_time_since_update seconds ago)"
        # Unassign the issue
        if [ -n "$_issue_assignee" ]; then
          log "Unassigning issue #$_issue_num from $_issue_assignee"
          curl -sf -X PATCH \
            -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
            -H "Content-Type: application/json" \
            "${FORGE_API}/issues/$_issue_num" \
            -d '{"assignees":[]}' >/dev/null 2>&1 || true
        fi
        # Remove blocked label
        if [ -n "$_blocked_label_id" ]; then
          log "Removing blocked label from issue #$_issue_num"
          curl -sf -X DELETE \
            -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
            "${FORGE_API}/issues/$_issue_num/labels/$_blocked_label_id" >/dev/null 2>&1 || true
        fi
        # Add comment about infra-flake recovery (best effort, like the calls above)
        if [ -n "$_recovery_payload" ]; then
          curl -sf -X POST \
            -H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
            -H "Content-Type: application/json" \
            "${FORGE_API}/issues/$_issue_num/comments" \
            -d "$_recovery_payload" >/dev/null 2>&1 || true
        fi
        _issues_recovered=$(( _issues_recovered + 1 ))
        log "Recovered issue #$_issue_num - returned to pool"
      done < <(printf '%s\n' "$_blocked_issues" | _wp_ci_exhausted_candidates)
      log "Issues recovered: $_issues_recovered of $_issues_processed recent ci_exhausted issue(s)"
      log "WP agent restart and issue recovery complete"
    else
      log "ERROR: Failed to restart WP agent container"
    fi
  else
    log "WP agent restarted recently (last restart: $_wp_last_restart), within ${_restart_threshold}s cooldown, skipping"
  fi
 fi
 # ── Run agent ─────────────────────────────────────────────────────────────
 # Pass the assembled prompt to agent_run with the supervisor worktree path,
 # then log that the call returned.
 agent_run --worktree "$WORKTREE" "$PROMPT"
 log "agent_run complete"