diff --git a/AGENTS.md b/AGENTS.md index 73ab287..ca7096e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -227,7 +227,7 @@ sourced as needed. | File | What it provides | Sourced by | |---|---|---| | `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`CODEBERG_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `codeberg_api()`, `codeberg_api_all()` (accepts optional second TOKEN parameter, defaults to `$CODEBERG_TOKEN`), `woodpecker_api()`, `wpdb()`, `matrix_send()`, `matrix_send_ctx()`. Auto-loads project TOML if `PROJECT_TOML` is set. | Every agent | -| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). | dev-poll, review-poll, review-pr, supervisor-poll | +| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `classify_pipeline_failure()` — returns "infra" if all failed Woodpecker steps are git-step exit 128/137, else "code". | dev-poll, review-poll, review-pr, supervisor-poll | | `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) | | `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `CODEBERG_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, Matrix config, etc.). | env.sh (when `PROJECT_TOML` is set), supervisor-poll (per-project iteration) | | `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` patterns. Not sourced — executed via `bash lib/parse-deps.sh`. 
# classify_pipeline_failure <repo_id> <pipeline_number>
# Classifies a pipeline's failure type by inspecting all failed steps.
#
# Requires: woodpecker_api() (defined in env.sh) and jq.
# Arguments:
#   $1 - Woodpecker repo id
#   $2 - pipeline number within that repo
# Outputs (stdout):
#   "infra" if every failed step is a git step with exit code 128 or 137.
#   "code"  otherwise (including when the pipeline cannot be fetched or no
#           failed steps can be determined).
# Returns: 0 for infra, 1 for code or unclassifiable.
classify_pipeline_failure() {
  local repo_id="$1" pip_num="$2"
  # Every working variable is local so that calling this function directly
  # (not via command substitution) never leaks state into the sourcing script.
  local pip_json failed_steps all_infra _infra_count _sname _ecode

  pip_json=$(woodpecker_api "/repos/${repo_id}/pipelines/${pip_num}" 2>/dev/null) || {
    echo "code"; return 1
  }

  # One "<name>TAB<exit_code>" line per step in a failed/error/killed state.
  failed_steps=$(printf '%s' "$pip_json" | jq -r '
    .workflows[]?.children[]?
    | select(.state == "failure" or .state == "error" or .state == "killed")
    | "\(.name)\t\(.exit_code)"' 2>/dev/null)

  if [ -z "$failed_steps" ]; then
    echo "code"; return 1
  fi

  all_infra=true
  _infra_count=0
  while IFS=$'\t' read -r _sname _ecode; do
    [ -z "$_sname" ] && continue
    # git step with exit 128 (connection/rate-limit) or 137 (OOM) → infra
    if [[ "$_sname" == git* ]] && { [ "$_ecode" = "128" ] || [ "$_ecode" = "137" ]; }; then
      _infra_count=$(( _infra_count + 1 ))
    else
      # A single non-infra failure makes the whole pipeline a code failure.
      all_infra=false
      break
    fi
  done <<< "$failed_steps"

  # Require at least one confirmed infra step (guards against all-empty-name steps)
  if [ "$all_infra" = true ] && [ "$_infra_count" -gt 0 ]; then
    echo "infra"
    return 0
  fi
  echo "code"
  return 1
}
# ===========================================================================
# P2f: ESCALATION TRIAGE — auto-retrigger ci_exhausted if infra-only
# ===========================================================================
# For every ci_exhausted / ci_exhausted_poll entry in the project's escalation
# file: if the PR is still open, the entry is older than the 30-minute
# cooldown, and the latest failed pipeline on the PR branch is classified as
# "infra", push an empty commit to retrigger CI and drop the escalation.
# Every other entry is copied unchanged into a temp file which then replaces
# the escalation file.
#
# NOTE(review): entries appended to the escalation file by another process
# while this loop runs appear to be lost when the temp file is moved over it;
# confirm whether writers and this block share a lock.
if [ "${CHECK_INFRA_RETRY:-true}" = "true" ]; then
  status "P2f: ${proj_name}: triaging ci_exhausted escalations"

  _esc_file="${FACTORY_ROOT}/supervisor/escalations-${proj_name}.jsonl"
  if [ -f "$_esc_file" ] && [ -s "$_esc_file" ]; then
    _esc_tmp="${_esc_file}.sup.$$"
    : > "$_esc_tmp"

    # Read the file on fd 3 instead of stdin: commands in the loop body
    # (wpdb, git, codeberg_api, python3) inherit stdin and any of them reading
    # it would silently swallow the remaining escalation lines, which the mv
    # below would then delete. The `|| [ -n ... ]` clause keeps a final line
    # that has no trailing newline.
    while IFS= read -r -u 3 _esc_line || [ -n "$_esc_line" ]; do
      [ -z "$_esc_line" ] && continue

      _esc_reason=$(printf '%s' "$_esc_line" | jq -r '.reason // ""' 2>/dev/null)

      # Only triage ci_exhausted entries (from dev-agent or dev-poll)
      case "$_esc_reason" in
        ci_exhausted) ;;
        ci_exhausted_poll) ;;
        *) printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue ;;
      esac

      _esc_pr=$(printf '%s' "$_esc_line" | jq -r '.pr // 0' 2>/dev/null)
      _esc_issue=$(printf '%s' "$_esc_line" | jq -r '.issue // 0' 2>/dev/null)
      _esc_ts=$(printf '%s' "$_esc_line" | jq -r '.ts // ""' 2>/dev/null)

      # Validate pr/issue are numeric
      [[ "$_esc_pr" =~ ^[0-9]+$ ]] || { printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue; }
      [[ "$_esc_issue" =~ ^[0-9]+$ ]] || { printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue; }

      # Cooldown: 30 min from last escalation timestamp.
      # A missing or unparseable timestamp yields epoch 0, i.e. "old enough".
      _esc_epoch=0
      [ -n "$_esc_ts" ] && _esc_epoch=$(date -d "$_esc_ts" +%s 2>/dev/null || echo 0)
      _esc_age_min=$(( ($(date +%s) - _esc_epoch) / 60 ))

      if [ "$_esc_age_min" -lt 30 ]; then
        flog "${proj_name}: PR #${_esc_pr} ci_exhausted cooldown (${_esc_age_min}/30min)"
        printf '%s\n' "$_esc_line" >> "$_esc_tmp"
        continue
      fi

      # Get the PR's branch and state from Codeberg
      _esc_pr_json=$(codeberg_api GET "/pulls/${_esc_pr}" 2>/dev/null) || {
        flog "${proj_name}: PR #${_esc_pr}: failed to fetch PR info, keeping escalation"
        printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
      }
      _esc_pr_state=$(printf '%s' "$_esc_pr_json" | jq -r '.state // ""' 2>/dev/null)
      if [ "$_esc_pr_state" != "open" ]; then
        flog "${proj_name}: PR #${_esc_pr} is ${_esc_pr_state:-unknown} — discarding stale escalation"
        continue  # PR merged/closed externally; escalation no longer actionable
      fi
      _esc_branch=$(printf '%s' "$_esc_pr_json" | jq -r '.head.ref // ""' 2>/dev/null)
      if [ -z "$_esc_branch" ]; then
        printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
      fi

      # Validate branch name to prevent SQL injection
      if ! [[ "$_esc_branch" =~ ^[a-zA-Z0-9/_.-]+$ ]]; then
        flog "${proj_name}: PR #${_esc_pr}: unsafe branch name, keeping escalation"
        printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
      fi

      # Find the latest failed pipeline for this PR's branch via Woodpecker DB
      _esc_pip=$(wpdb -A -c "
        SELECT number FROM pipelines
        WHERE repo_id = ${WOODPECKER_REPO_ID}
          AND branch = '${_esc_branch}'
          AND status IN ('failure', 'error')
          AND finished > 0
        ORDER BY number DESC LIMIT 1;" 2>/dev/null \
        | tr -d ' ' | grep -E '^[0-9]+$' | head -1 || true)

      if [ -z "$_esc_pip" ]; then
        flog "${proj_name}: PR #${_esc_pr}: no failed pipeline for branch ${_esc_branch}, keeping escalation"
        printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
      fi

      # Classify failure type via ci-helpers
      _esc_failure=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_esc_pip" 2>/dev/null || echo "code")

      if [ "$_esc_failure" != "infra" ]; then
        flog "${proj_name}: PR #${_esc_pr} pipeline #${_esc_pip}: code failure — leaving escalation for human"
        printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
      fi

      # Infra-only — push empty commit to retrigger CI via temporary worktree
      _esc_wt="/tmp/${proj_name}-sup-retry-${_esc_pr}"
      _esc_retrigger_ok=false
      if [ -d "${PROJECT_REPO_ROOT:-}" ]; then
        # Clean up any leftover temp worktree from a previous failed run
        git -C "${PROJECT_REPO_ROOT}" worktree remove --force "${_esc_wt}" 2>/dev/null || true

        if git -C "${PROJECT_REPO_ROOT}" fetch origin "${_esc_branch}" --quiet 2>/dev/null && \
           git -C "${PROJECT_REPO_ROOT}" worktree add --quiet --detach \
             "${_esc_wt}" "origin/${_esc_branch}" 2>/dev/null; then
          if git -C "${_esc_wt}" \
               -c user.email="supervisor@factory" \
               -c user.name="Supervisor" \
               commit --allow-empty --no-verify \
               -m "chore: retrigger CI after infra-only exhaustion" \
               --quiet 2>/dev/null && \
             git -C "${_esc_wt}" push origin \
               "HEAD:refs/heads/${_esc_branch}" --quiet 2>/dev/null; then
            _esc_retrigger_ok=true
          fi
          git -C "${PROJECT_REPO_ROOT}" worktree remove --force "${_esc_wt}" 2>/dev/null || true
        fi
      fi

      if [ "$_esc_retrigger_ok" = true ]; then
        # Reset CI fix counter so dev-poll can spawn the agent again if needed.
        # The path and PR number are passed as argv rather than interpolated
        # into the Python source, so quotes in the path or a zero-padded PR
        # number cannot break (or inject into) the snippet.
        _ci_fix_file="${FACTORY_ROOT}/dev/ci-fixes-${proj_name}.json"
        _ci_fix_lock="${_ci_fix_file}.lock"
        flock "$_ci_fix_lock" python3 -c '
import json, os, sys
path, pr_key = sys.argv[1], str(int(sys.argv[2]))
if not os.path.exists(path):
    sys.exit()
with open(path) as fh:
    d = json.load(fh)
d.pop(pr_key, None)
with open(path, "w") as fh:
    json.dump(d, fh)
' "$_ci_fix_file" "$_esc_pr" 2>/dev/null || true

        fixed "${proj_name}: auto-retriggered CI for PR #${_esc_pr} after infra-only exhaustion"
        flog "${proj_name}: auto-retriggered CI for PR #${_esc_pr} (issue #${_esc_issue}) after infra-only exhaustion"
        matrix_send "supervisor" "♻️ auto-retriggered CI for PR #${_esc_pr} (issue #${_esc_issue}) after infra-only exhaustion" 2>/dev/null || true
        # Escalation removed — do NOT write to _esc_tmp
      else
        p2 "${proj_name}: PR #${_esc_pr}: infra-only CI exhaustion but retrigger push failed"
        # Bump timestamp to now so the 30-min cooldown resets; prevents alert flood
        # on persistent push failures (SSH key issue, Codeberg outage, etc.)
        _esc_now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
        _esc_bumped=$(printf '%s' "$_esc_line" | jq -c --arg ts "$_esc_now" '.ts = $ts' 2>/dev/null \
          || printf '%s' "$_esc_line")
        printf '%s\n' "$_esc_bumped" >> "$_esc_tmp"
      fi
    done 3< "$_esc_file"

    mv "$_esc_tmp" "$_esc_file"
  fi
fi