fix: P2e and classify_pipeline_failure() use divergent infra heuristics (#251)
Extract shared is_infra_step() in lib/ci-helpers.sh capturing the union of
infra-detection heuristics from both P2e and classify_pipeline_failure():
- Clone/git step exit 128 (connection failure)
- Any step exit 137 (OOM/signal 9)
- Log-pattern matching (timeouts, connection failures)
Update classify_pipeline_failure() to use is_infra_step() with log fetching
and "any infra step" aggregation (matching P2e semantics). Simplify P2e to
delegate to classify_pipeline_failure(). Update P2f caller for new output
format ("infra <reason>").
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a1ee1242ab
commit
3cd047a7e0
3 changed files with 64 additions and 64 deletions
|
|
@ -303,7 +303,7 @@ sourced as needed.
|
||||||
| File | What it provides | Sourced by |
|
| File | What it provides | Sourced by |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`CODEBERG_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `codeberg_api()`, `codeberg_api_all()` (accepts optional second TOKEN parameter, defaults to `$CODEBERG_TOKEN`), `woodpecker_api()`, `wpdb()`, `matrix_send()`, `matrix_send_ctx()`. Auto-loads project TOML if `PROJECT_TOML` is set. | Every agent |
|
| `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`CODEBERG_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `codeberg_api()`, `codeberg_api_all()` (accepts optional second TOKEN parameter, defaults to `$CODEBERG_TOKEN`), `woodpecker_api()`, `wpdb()`, `matrix_send()`, `matrix_send_ctx()`. Auto-loads project TOML if `PROJECT_TOML` is set. | Every agent |
|
||||||
| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `classify_pipeline_failure()` — returns "infra" if all failed Woodpecker steps are git-step exit 128/137, else "code". | dev-poll, review-poll, review-pr, supervisor-poll |
|
| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `is_infra_step()` — returns 0 and prints the reason on stdout if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, timeout/connection log patterns); returns 1 otherwise. `classify_pipeline_failure()` — prints "infra \<reason>" and returns 0 if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else prints "code" and returns 1. | dev-poll, review-poll, review-pr, supervisor-poll |
|
||||||
| `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) |
|
| `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) |
|
||||||
| `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `CODEBERG_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, Matrix config, etc.). | env.sh (when `PROJECT_TOML` is set), supervisor-poll (per-project iteration) |
|
| `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `CODEBERG_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, Matrix config, etc.). | env.sh (when `PROJECT_TOML` is set), supervisor-poll (per-project iteration) |
|
||||||
| `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` patterns. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll, supervisor-poll |
|
| `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` patterns. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll, supervisor-poll |
|
||||||
|
|
|
||||||
|
|
@ -45,46 +45,86 @@ ci_passed() {
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# is_infra_step <step_name> <exit_code> [log_data]
# Checks whether a single CI step failure matches infra heuristics.
#
# Arguments:
#   $1 - step name
#   $2 - step exit code (compared as a string)
#   $3 - optional step log text; log patterns are only checked when non-empty
# Outputs:
#   On a match, one line on stdout describing the reason. Nothing otherwise.
# Returns:
#   0 if the failure looks like an infrastructure problem, 1 if not.
#
# Heuristics (union of P2e and classify_pipeline_failure patterns), checked
# in this order; the first match wins:
#   - Clone/git step (name contains "clone" or starts with "git") with
#     exit 128 → connection failure / rate limit
#   - Any step with exit 137 → OOM / killed by signal 9
#   - Log patterns (case-insensitive): "Failed to connect",
#     "connection timed out", "docker pull.*timeout", "TLS handshake timeout"
is_infra_step() {
  local sname="$1" ecode="$2" log_data="${3:-}"

  # Clone/git step exit 128 → Codeberg connection failure / rate limit
  if [[ "$sname" == *clone* || "$sname" == git* ]] && [[ "$ecode" == "128" ]]; then
    printf '%s\n' "${sname} exit 128 (connection failure)"
    return 0
  fi

  # Exit 137 → OOM / killed by signal 9
  if [[ "$ecode" == "137" ]]; then
    printf '%s\n' "${sname} exit 137 (OOM/signal 9)"
    return 0
  fi

  # Log-pattern matching for infra issues.
  # The log is fed to grep via a here-string instead of `printf ... | grep -q`:
  # grep -q exits on the first match, and with a large log the writer side of
  # a pipe would then die from SIGPIPE. Under `set -o pipefail` that turns a
  # real match into a non-zero pipeline status (a false "not infra").
  if [[ -n "$log_data" ]] &&
    grep -qiE 'Failed to connect|connection timed out|docker pull.*timeout|TLS handshake timeout' <<< "$log_data"; then
    printf '%s\n' "${sname}: log matches infra pattern (timeout/connection)"
    return 0
  fi

  return 1
}
|
||||||
|
|
||||||
# classify_pipeline_failure <repo_id> <pipeline_num>
|
# classify_pipeline_failure <repo_id> <pipeline_num>
|
||||||
# Classifies a pipeline's failure type by inspecting all failed steps.
|
# Classifies a pipeline's failure type by inspecting failed steps.
|
||||||
# Outputs "infra" if every failed step is a git step with exit code 128 or 137.
|
# Uses is_infra_step() for per-step classification (exit codes + log patterns).
|
||||||
|
# Outputs "infra <reason>" if any failed step matches infra heuristics.
|
||||||
# Outputs "code" otherwise (including when steps cannot be determined).
|
# Outputs "code" otherwise (including when steps cannot be determined).
|
||||||
# Returns 0 for infra, 1 for code or unclassifiable.
|
# Returns 0 for infra, 1 for code or unclassifiable.
|
||||||
classify_pipeline_failure() {
|
classify_pipeline_failure() {
|
||||||
local repo_id="$1" pip_num="$2"
|
local repo_id="$1" pip_num="$2"
|
||||||
local pip_json failed_steps all_infra _sname _ecode
|
local pip_json failed_steps _sname _ecode _spid _reason _log_data
|
||||||
|
|
||||||
pip_json=$(woodpecker_api "/repos/${repo_id}/pipelines/${pip_num}" 2>/dev/null) || {
|
pip_json=$(woodpecker_api "/repos/${repo_id}/pipelines/${pip_num}" 2>/dev/null) || {
|
||||||
echo "code"; return 1
|
echo "code"; return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Extract failed steps: name, exit_code, pid
|
||||||
failed_steps=$(printf '%s' "$pip_json" | jq -r '
|
failed_steps=$(printf '%s' "$pip_json" | jq -r '
|
||||||
.workflows[]?.children[]? |
|
.workflows[]?.children[]? |
|
||||||
select(.state == "failure" or .state == "error" or .state == "killed") |
|
select(.state == "failure" or .state == "error" or .state == "killed") |
|
||||||
"\(.name)\t\(.exit_code)"' 2>/dev/null)
|
"\(.name)\t\(.exit_code)\t\(.pid)"' 2>/dev/null)
|
||||||
|
|
||||||
if [ -z "$failed_steps" ]; then
|
if [ -z "$failed_steps" ]; then
|
||||||
echo "code"; return 1
|
echo "code"; return 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
all_infra=true
|
while IFS=$'\t' read -r _sname _ecode _spid; do
|
||||||
_infra_count=0
|
|
||||||
while IFS=$'\t' read -r _sname _ecode; do
|
|
||||||
[ -z "$_sname" ] && continue
|
[ -z "$_sname" ] && continue
|
||||||
# git step with exit 128 (connection/rate-limit) or 137 (OOM) → infra
|
|
||||||
if [[ "$_sname" == git* ]] && { [ "$_ecode" = "128" ] || [ "$_ecode" = "137" ]; }; then
|
# Check name+exit_code patterns (no log fetch needed)
|
||||||
_infra_count=$(( _infra_count + 1 ))
|
if _reason=$(is_infra_step "$_sname" "$_ecode"); then
|
||||||
else
|
echo "infra ${_reason}"
|
||||||
all_infra=false
|
return 0
|
||||||
break
|
fi
|
||||||
|
|
||||||
|
# Fetch step logs and check log patterns
|
||||||
|
if [ -n "$_spid" ] && [ "$_spid" != "null" ]; then
|
||||||
|
_log_data=$(woodpecker_api "/repos/${repo_id}/logs/${pip_num}/${_spid}" \
|
||||||
|
--max-time 15 2>/dev/null \
|
||||||
|
| jq -r '.[].data // empty' 2>/dev/null | tail -200 || true)
|
||||||
|
if [ -n "$_log_data" ]; then
|
||||||
|
if _reason=$(is_infra_step "$_sname" "$_ecode" "$_log_data"); then
|
||||||
|
echo "infra ${_reason}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
done <<< "$failed_steps"
|
done <<< "$failed_steps"
|
||||||
|
|
||||||
# Require at least one confirmed infra step (guards against all-empty-name steps)
|
|
||||||
if [ "$all_infra" = true ] && [ "$_infra_count" -gt 0 ]; then
|
|
||||||
echo "infra"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
echo "code"
|
echo "code"
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -321,51 +321,11 @@ check_project() {
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Get pipeline details via Woodpecker API
|
# Classify failure type via shared helper
|
||||||
_pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" 2>/dev/null || true)
|
_classification=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_pip_num" 2>/dev/null || echo "code")
|
||||||
[ -z "$_pip_json" ] && continue
|
|
||||||
|
|
||||||
# Extract failed steps: name, exit_code, pid
|
if [[ "$_classification" == infra* ]]; then
|
||||||
_failed_steps=$(echo "$_pip_json" | jq -r '
|
_infra_reason="${_classification#infra }"
|
||||||
.workflows[]?.children[]? |
|
|
||||||
select(.state == "failure" or .state == "error" or .state == "killed") |
|
|
||||||
"\(.name)\t\(.exit_code)\t\(.pid)"' 2>/dev/null || true)
|
|
||||||
[ -z "$_failed_steps" ] && continue
|
|
||||||
|
|
||||||
_is_infra=false
|
|
||||||
_infra_reason=""
|
|
||||||
|
|
||||||
while IFS=$'\t' read -r _sname _ecode _spid; do
|
|
||||||
[ -z "$_sname" ] && continue
|
|
||||||
|
|
||||||
# Clone step exit 128 → Codeberg connection failure / rate limit
|
|
||||||
if [[ "$_sname" == *clone* ]] && [ "$_ecode" = "128" ]; then
|
|
||||||
_is_infra=true
|
|
||||||
_infra_reason="clone exit 128 (connection failure)"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Exit 137 → OOM / killed by signal 9
|
|
||||||
if [ "$_ecode" = "137" ]; then
|
|
||||||
_is_infra=true
|
|
||||||
_infra_reason="${_sname} exit 137 (OOM/signal 9)"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Check step logs for docker pull / connection timeout patterns
|
|
||||||
if [ -n "$_spid" ] && [ "$_spid" != "null" ]; then
|
|
||||||
_log_data=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${_pip_num}/${_spid}" \
|
|
||||||
--max-time 15 2>/dev/null \
|
|
||||||
| jq -r '.[].data // empty' 2>/dev/null | tail -200 || true)
|
|
||||||
if echo "$_log_data" | grep -qiE 'Failed to connect|connection timed out|docker pull.*timeout|TLS handshake timeout'; then
|
|
||||||
_is_infra=true
|
|
||||||
_infra_reason="${_sname}: log matches infra pattern (timeout/connection)"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
done <<< "$_failed_steps"
|
|
||||||
|
|
||||||
if [ "$_is_infra" = true ]; then
|
|
||||||
_new_retries=$(( _retries + 1 ))
|
_new_retries=$(( _retries + 1 ))
|
||||||
if woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" \
|
if woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" \
|
||||||
-X POST >/dev/null 2>&1; then
|
-X POST >/dev/null 2>&1; then
|
||||||
|
|
@ -463,7 +423,7 @@ check_project() {
|
||||||
# Classify failure type via ci-helpers
|
# Classify failure type via ci-helpers
|
||||||
_esc_failure=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_esc_pip" 2>/dev/null || echo "code")
|
_esc_failure=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_esc_pip" 2>/dev/null || echo "code")
|
||||||
|
|
||||||
if [ "$_esc_failure" != "infra" ]; then
|
if [[ "$_esc_failure" != infra* ]]; then
|
||||||
flog "${proj_name}: PR #${_esc_pr} pipeline #${_esc_pip}: code failure — leaving escalation for human"
|
flog "${proj_name}: PR #${_esc_pr} pipeline #${_esc_pip}: code failure — leaving escalation for human"
|
||||||
printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
|
printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
|
||||||
fi
|
fi
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue