From 57fdec95046469d84ddc0f0aa71b3bc826cfac1a Mon Sep 17 00:00:00 2001
From: openhands
Date: Wed, 18 Mar 2026 01:08:35 +0000
Subject: [PATCH] fix: feat: supervisor auto-retriggers infra CI failures (#75)

Co-Authored-By: Claude Opus 4.6
---
 supervisor/supervisor-poll.sh | 109 +++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/supervisor/supervisor-poll.sh b/supervisor/supervisor-poll.sh
index 0b0ba9b..5a897c6 100755
--- a/supervisor/supervisor-poll.sh
+++ b/supervisor/supervisor-poll.sh
@@ -273,6 +273,115 @@ check_project() {
       '{ts:$ts,type:"ci",project:$proj,pipeline:$pipeline,duration_min:$duration,status:$status}' 2>/dev/null)" 2>/dev/null || true
   fi
 
+  # ===========================================================================
+  # P2e: INFRA FAILURES — auto-retrigger pipelines with infra failures
+  #
+  # For each of the (up to 5) most recent pipelines that failed in the last 6h,
+  # inspect the failed steps and retrigger the pipeline when the failure looks
+  # infrastructure-related: clone step exit 128, any step exit 137, or a
+  # timeout/connection pattern in the tail of a failed step's log. Each pipeline
+  # number is retriggered at most twice; the count is kept in a per-pipeline
+  # file under $_RETRY_DIR. Set CHECK_INFRA_RETRY to anything other than "true"
+  # to disable the section.
+  #
+  # NOTE(review): the retry cap is keyed on the pipeline number. If a Woodpecker
+  # restart creates a pipeline with a new number, a re-failed restart starts
+  # again at 0 retries — confirm and consider keying on the commit instead.
+  # ===========================================================================
+  if [ "${CHECK_INFRA_RETRY:-true}" = "true" ]; then
+    status "P2: ${proj_name}: checking infra failures"
+
+    _RETRY_DIR="/tmp/supervisor-infra-retries"
+    mkdir -p "$_RETRY_DIR"
+
+    # Recent failed pipelines (last 6h), newest first. The output is filtered
+    # down to bare integers, so word-splitting it in the for loop is safe.
+    # shellcheck disable=SC2086
+    _failed_nums=$(wpdb -A -t -c "
+      SELECT number FROM pipelines
+      WHERE repo_id = ${WOODPECKER_REPO_ID}
+        AND status IN ('failure', 'error')
+        AND finished > 0
+        AND to_timestamp(finished) > now() - interval '6 hours'
+      ORDER BY number DESC LIMIT 5;" 2>/dev/null \
+      | tr -d ' ' | grep -E '^[0-9]+$' || true)
+
+    for _pip_num in $_failed_nums; do
+      [ -z "$_pip_num" ] && continue
+
+      # Skip if already retried twice for this pipeline. The counter lives in
+      # a predictable /tmp path, so anything that is not a short run of digits
+      # is treated as 0 rather than being fed to [ -ge ] / $(( )).
+      _retry_file="${_RETRY_DIR}/${WOODPECKER_REPO_ID}-${_pip_num}"
+      _retries=0
+      if [ -f "$_retry_file" ]; then
+        _retries=$(cat "$_retry_file" 2>/dev/null || true)
+        [[ "$_retries" =~ ^[0-9]{1,4}$ ]] || _retries=0
+      fi
+      [ "$_retries" -ge 2 ] && continue
+
+      # Get pipeline details via Woodpecker API
+      _pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" 2>/dev/null || true)
+      [ -z "$_pip_json" ] && continue
+
+      # Extract failed steps as tab-separated lines: name, exit_code, pid
+      _failed_steps=$(echo "$_pip_json" | jq -r '
+        .workflows[]?.children[]? |
+        select(.state == "failure" or .state == "error" or .state == "killed") |
+        "\(.name)\t\(.exit_code)\t\(.pid)"' 2>/dev/null || true)
+      [ -z "$_failed_steps" ] && continue
+
+      _is_infra=false
+      _infra_reason=""
+
+      while IFS=$'\t' read -r _sname _ecode _spid; do
+        [ -z "$_sname" ] && continue
+
+        # Clone step exit 128 → Codeberg connection failure / rate limit
+        if [[ "$_sname" == *clone* ]] && [ "$_ecode" = "128" ]; then
+          _is_infra=true
+          _infra_reason="clone exit 128 (connection failure)"
+          break
+        fi
+
+        # Exit 137 → OOM / killed by signal 9
+        if [ "$_ecode" = "137" ]; then
+          _is_infra=true
+          _infra_reason="${_sname} exit 137 (OOM/signal 9)"
+          break
+        fi
+
+        # Check step logs for docker pull / connection timeout patterns
+        if [ -n "$_spid" ] && [ "$_spid" != "null" ]; then
+          _log_data=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${_pip_num}/${_spid}" 2>/dev/null \
+            | jq -r '.[].data // empty' 2>/dev/null | tail -200 || true)
+          # Here-string instead of `echo | grep -q`: grep -q exits on the first
+          # match, which can SIGPIPE the writer and, if pipefail is enabled,
+          # turn a successful match into a non-zero pipeline status.
+          if grep -qiE 'Failed to connect|connection timed out|docker pull.*timeout|TLS handshake timeout' <<< "$_log_data"; then
+            _is_infra=true
+            _infra_reason="${_sname}: log matches infra pattern (timeout/connection)"
+            break
+          fi
+        fi
+      done <<< "$_failed_steps"
+
+      if [ "$_is_infra" = true ]; then
+        _new_retries=$(( 10#$_retries + 1 ))
+        if woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" \
+          -X POST >/dev/null 2>&1; then
+          echo "$_new_retries" > "$_retry_file"
+          fixed "${proj_name}: Retriggered pipeline #${_pip_num} (${_infra_reason}, retry ${_new_retries}/2)"
+        else
+          flog "${proj_name}: Failed to retrigger pipeline #${_pip_num}: API error"
+        fi
+      fi
+    done
+
+    # Clean up stale retry tracking files (>24h)
+    find "$_RETRY_DIR" -type f -mmin +1440 -delete 2>/dev/null || true
+  fi
+
   # Dev-agent health (only if monitoring enabled)
   if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
     DEV_LOCK="/tmp/dev-agent.lock"