From 3cd047a7e066312c10df97cec1469d8b7213f8e9 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 20 Mar 2026 19:19:29 +0000 Subject: [PATCH] fix: P2e and classify_pipeline_failure() use divergent infra heuristics (#251) Extract shared is_infra_step() in lib/ci-helpers.sh capturing the union of infra-detection heuristics from both P2e and classify_pipeline_failure(): - Clone/git step exit 128 (connection failure) - Any step exit 137 (OOM/signal 9) - Log-pattern matching (timeouts, connection failures) Update classify_pipeline_failure() to use is_infra_step() with log fetching and "any infra step" aggregation (matching P2e semantics). Simplify P2e to delegate to classify_pipeline_failure(). Update P2f caller for new output format ("infra <reason>"). Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 2 +- lib/ci-helpers.sh | 76 ++++++++++++++++++++++++++--------- supervisor/supervisor-poll.sh | 50 +++-------------------- 3 files changed, 64 insertions(+), 64 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0cdac18..8a40b23 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -303,7 +303,7 @@ sourced as needed. | File | What it provides | Sourced by | |---|---|---| | `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`CODEBERG_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `codeberg_api()`, `codeberg_api_all()` (accepts optional second TOKEN parameter, defaults to `$CODEBERG_TOKEN`), `woodpecker_api()`, `wpdb()`, `matrix_send()`, `matrix_send_ctx()`. Auto-loads project TOML if `PROJECT_TOML` is set. | Every agent | -| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `classify_pipeline_failure()` — returns "infra" if all failed Woodpecker steps are git-step exit 128/137, else "code". | dev-poll, review-poll, review-pr, supervisor-poll | +| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). 
`is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \<reason>" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". | dev-poll, review-poll, review-pr, supervisor-poll | | `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) | | `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `CODEBERG_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, Matrix config, etc.). | env.sh (when `PROJECT_TOML` is set), supervisor-poll (per-project iteration) | | `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` patterns. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll, supervisor-poll | diff --git a/lib/ci-helpers.sh b/lib/ci-helpers.sh index 6deb229..40f79e1 100644 --- a/lib/ci-helpers.sh +++ b/lib/ci-helpers.sh @@ -45,46 +45,86 @@ ci_passed() { return 1 } +# is_infra_step <step_name> <exit_code> [log_data] +# Checks whether a single CI step failure matches infra heuristics. +# Returns 0 (infra) with reason on stdout, or 1 (not infra). 
+# +# Heuristics (union of P2e and classify_pipeline_failure patterns): +# - Clone/git step with exit 128 → connection failure / rate limit +# - Any step with exit 137 → OOM / killed by signal 9 +# - Log patterns: connection timeout, docker pull timeout, TLS handshake timeout +is_infra_step() { + local sname="$1" ecode="$2" log_data="${3:-}" + + # Clone/git step exit 128 → Codeberg connection failure / rate limit + if { [[ "$sname" == *clone* ]] || [[ "$sname" == git* ]]; } && [ "$ecode" = "128" ]; then + echo "${sname} exit 128 (connection failure)" + return 0 + fi + + # Exit 137 → OOM / killed by signal 9 + if [ "$ecode" = "137" ]; then + echo "${sname} exit 137 (OOM/signal 9)" + return 0 + fi + + # Log-pattern matching for infra issues + if [ -n "$log_data" ] && \ + printf '%s' "$log_data" | grep -qiE 'Failed to connect|connection timed out|docker pull.*timeout|TLS handshake timeout'; then + echo "${sname}: log matches infra pattern (timeout/connection)" + return 0 + fi + + return 1 +} + # classify_pipeline_failure <repo_id> <pipeline_num> -# Classifies a pipeline's failure type by inspecting all failed steps. -# Outputs "infra" if every failed step is a git step with exit code 128 or 137. +# Classifies a pipeline's failure type by inspecting failed steps. +# Uses is_infra_step() for per-step classification (exit codes + log patterns). +# Outputs "infra <reason>" if any failed step matches infra heuristics. # Outputs "code" otherwise (including when steps cannot be determined). # Returns 0 for infra, 1 for code or unclassifiable. classify_pipeline_failure() { local repo_id="$1" pip_num="$2" - local pip_json failed_steps all_infra _sname _ecode + local pip_json failed_steps _sname _ecode _spid _reason _log_data pip_json=$(woodpecker_api "/repos/${repo_id}/pipelines/${pip_num}" 2>/dev/null) || { echo "code"; return 1 } + # Extract failed steps: name, exit_code, pid failed_steps=$(printf '%s' "$pip_json" | jq -r ' .workflows[]?.children[]? 
| select(.state == "failure" or .state == "error" or .state == "killed") | - "\(.name)\t\(.exit_code)"' 2>/dev/null) + "\(.name)\t\(.exit_code)\t\(.pid)"' 2>/dev/null) if [ -z "$failed_steps" ]; then echo "code"; return 1 fi - all_infra=true - _infra_count=0 - while IFS=$'\t' read -r _sname _ecode; do + while IFS=$'\t' read -r _sname _ecode _spid; do [ -z "$_sname" ] && continue - # git step with exit 128 (connection/rate-limit) or 137 (OOM) → infra - if [[ "$_sname" == git* ]] && { [ "$_ecode" = "128" ] || [ "$_ecode" = "137" ]; }; then - _infra_count=$(( _infra_count + 1 )) - else - all_infra=false - break + + # Check name+exit_code patterns (no log fetch needed) + if _reason=$(is_infra_step "$_sname" "$_ecode"); then + echo "infra ${_reason}" + return 0 + fi + + # Fetch step logs and check log patterns + if [ -n "$_spid" ] && [ "$_spid" != "null" ]; then + _log_data=$(woodpecker_api "/repos/${repo_id}/logs/${pip_num}/${_spid}" \ + --max-time 15 2>/dev/null \ + | jq -r '.[].data // empty' 2>/dev/null | tail -200 || true) + if [ -n "$_log_data" ]; then + if _reason=$(is_infra_step "$_sname" "$_ecode" "$_log_data"); then + echo "infra ${_reason}" + return 0 + fi + fi fi done <<< "$failed_steps" - # Require at least one confirmed infra step (guards against all-empty-name steps) - if [ "$all_infra" = true ] && [ "$_infra_count" -gt 0 ]; then - echo "infra" - return 0 - fi echo "code" return 1 } diff --git a/supervisor/supervisor-poll.sh b/supervisor/supervisor-poll.sh index 48def66..4e6a3c4 100755 --- a/supervisor/supervisor-poll.sh +++ b/supervisor/supervisor-poll.sh @@ -321,51 +321,11 @@ check_project() { continue fi - # Get pipeline details via Woodpecker API - _pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" 2>/dev/null || true) - [ -z "$_pip_json" ] && continue + # Classify failure type via shared helper + _classification=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_pip_num" 2>/dev/null || echo "code") - # Extract failed 
steps: name, exit_code, pid - _failed_steps=$(echo "$_pip_json" | jq -r ' - .workflows[]?.children[]? | - select(.state == "failure" or .state == "error" or .state == "killed") | - "\(.name)\t\(.exit_code)\t\(.pid)"' 2>/dev/null || true) - [ -z "$_failed_steps" ] && continue - - _is_infra=false - _infra_reason="" - - while IFS=$'\t' read -r _sname _ecode _spid; do - [ -z "$_sname" ] && continue - - # Clone step exit 128 → Codeberg connection failure / rate limit - if [[ "$_sname" == *clone* ]] && [ "$_ecode" = "128" ]; then - _is_infra=true - _infra_reason="clone exit 128 (connection failure)" - break - fi - - # Exit 137 → OOM / killed by signal 9 - if [ "$_ecode" = "137" ]; then - _is_infra=true - _infra_reason="${_sname} exit 137 (OOM/signal 9)" - break - fi - - # Check step logs for docker pull / connection timeout patterns - if [ -n "$_spid" ] && [ "$_spid" != "null" ]; then - _log_data=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${_pip_num}/${_spid}" \ - --max-time 15 2>/dev/null \ - | jq -r '.[].data // empty' 2>/dev/null | tail -200 || true) - if echo "$_log_data" | grep -qiE 'Failed to connect|connection timed out|docker pull.*timeout|TLS handshake timeout'; then - _is_infra=true - _infra_reason="${_sname}: log matches infra pattern (timeout/connection)" - break - fi - fi - done <<< "$_failed_steps" - - if [ "$_is_infra" = true ]; then + if [[ "$_classification" == infra* ]]; then + _infra_reason="${_classification#infra }" _new_retries=$(( _retries + 1 )) if woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" \ -X POST >/dev/null 2>&1; then @@ -463,7 +423,7 @@ check_project() { # Classify failure type via ci-helpers _esc_failure=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_esc_pip" 2>/dev/null || echo "code") - if [ "$_esc_failure" != "infra" ]; then + if [[ "$_esc_failure" != infra* ]]; then flog "${proj_name}: PR #${_esc_pr} pipeline #${_esc_pip}: code failure — leaving escalation for human" printf '%s\n' "$_esc_line" >> 
"$_esc_tmp"; continue fi