From 57fdec95046469d84ddc0f0aa71b3bc826cfac1a Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Mar 2026 01:08:35 +0000 Subject: [PATCH 1/2] fix: feat: supervisor auto-retriggers infra CI failures (#75) Co-Authored-By: Claude Opus 4.6 --- supervisor/supervisor-poll.sh | 88 +++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/supervisor/supervisor-poll.sh b/supervisor/supervisor-poll.sh index 0b0ba9b..5a897c6 100755 --- a/supervisor/supervisor-poll.sh +++ b/supervisor/supervisor-poll.sh @@ -273,6 +273,94 @@ check_project() { '{ts:$ts,type:"ci",project:$proj,pipeline:$pipeline,duration_min:$duration,status:$status}' 2>/dev/null)" 2>/dev/null || true fi + # =========================================================================== + # P2e: INFRA FAILURES — auto-retrigger pipelines with infra failures + # =========================================================================== + if [ "${CHECK_INFRA_RETRY:-true}" = "true" ]; then + status "P2: ${proj_name}: checking infra failures" + + _RETRY_DIR="/tmp/supervisor-infra-retries" + mkdir -p "$_RETRY_DIR" + + # Recent failed pipelines (last 6h) + # shellcheck disable=SC2086 + _failed_nums=$(wpdb -A -t -c " + SELECT number FROM pipelines + WHERE repo_id = ${WOODPECKER_REPO_ID} + AND status IN ('failure', 'error') + AND finished > 0 + AND to_timestamp(finished) > now() - interval '6 hours' + ORDER BY number DESC LIMIT 5;" 2>/dev/null \ + | tr -d ' ' | grep -E '^[0-9]+$' || true) + + for _pip_num in $_failed_nums; do + [ -z "$_pip_num" ] && continue + + # Skip if already retried twice for this pipeline + _retry_file="${_RETRY_DIR}/${WOODPECKER_REPO_ID}-${_pip_num}" + _retries=0 + [ -f "$_retry_file" ] && _retries=$(cat "$_retry_file" 2>/dev/null || echo 0) + [ "${_retries:-0}" -ge 2 ] && continue + + # Get pipeline details via Woodpecker API + _pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" 2>/dev/null || true) + [ -z "$_pip_json" ] && continue + + # Extract failed steps: name, exit_code, pid + _failed_steps=$(echo "$_pip_json" | jq -r ' + .workflows[]?.children[]? | + select(.state == "failure" or .state == "error" or .state == "killed") | + "\(.name)\t\(.exit_code)\t\(.pid)"' 2>/dev/null || true) + [ -z "$_failed_steps" ] && continue + + _is_infra=false + _infra_reason="" + + while IFS=$'\t' read -r _sname _ecode _spid; do + [ -z "$_sname" ] && continue + + # Clone step exit 128 → Codeberg connection failure / rate limit + if [[ "$_sname" == *clone* ]] && [ "$_ecode" = "128" ]; then + _is_infra=true + _infra_reason="clone exit 128 (connection failure)" + break + fi + + # Exit 137 → OOM / killed by signal 9 + if [ "$_ecode" = "137" ]; then + _is_infra=true + _infra_reason="${_sname} exit 137 (OOM/signal 9)" + break + fi + + # Check step logs for docker pull / connection timeout patterns + if [ -n "$_spid" ] && [ "$_spid" != "null" ]; then + _log_data=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${_pip_num}/${_spid}" 2>/dev/null \ + | jq -r '.[].data // empty' 2>/dev/null | tail -200 || true) + if echo "$_log_data" | grep -qiE 'Failed to connect|connection timed out|docker pull.*timeout|TLS handshake timeout'; then + _is_infra=true + _infra_reason="${_sname}: log matches infra pattern (timeout/connection)" + break + fi + fi + done <<< "$_failed_steps" + + if [ "$_is_infra" = true ]; then + _new_retries=$(( _retries + 1 )) + if woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" \ + -X POST >/dev/null 2>&1; then + echo "$_new_retries" > "$_retry_file" + fixed "${proj_name}: Retriggered pipeline #${_pip_num} (${_infra_reason}, retry ${_new_retries}/2)" + else + flog "${proj_name}: Failed to retrigger pipeline #${_pip_num}: API error" + fi + fi + done + + # Clean up stale retry tracking files (>24h) + find "$_RETRY_DIR" -type f -mmin +1440 -delete 2>/dev/null || true + fi + # Dev-agent health (only if monitoring enabled) if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then DEV_LOCK="/tmp/dev-agent.lock" From 1c5d3e7bbd724f8989b22694f59e40960ea06f2f Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Mar 2026 01:18:34 +0000 Subject: [PATCH 2/2] fix: address review findings from issue #75 Co-Authored-By: Claude Opus 4.6 --- supervisor/best-practices/ci.md | 7 ++++++- supervisor/supervisor-poll.sh | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/supervisor/best-practices/ci.md b/supervisor/best-practices/ci.md index 5bafa86..ce32086 100644 --- a/supervisor/best-practices/ci.md +++ b/supervisor/best-practices/ci.md @@ -7,7 +7,12 @@ - Example (harb): CI images pre-built at `registry.niovi.voyage/harb/*:latest` ## Safe Fixes -- Retrigger CI: push empty commit to PR branch +- Retrigger CI (preferred, automated): Woodpecker API POST + ```bash + woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${PIPELINE_NUMBER}" -X POST + ``` + supervisor-poll.sh does this automatically for infra failures (max 2 retries). +- Retrigger CI (manual fallback): push empty commit to PR branch ```bash cd /tmp/${PROJECT_NAME}-worktree- && git commit --allow-empty -m "ci: retrigger" --no-verify && git push origin --force ``` diff --git a/supervisor/supervisor-poll.sh b/supervisor/supervisor-poll.sh index 5a897c6..87adbab 100755 --- a/supervisor/supervisor-poll.sh +++ b/supervisor/supervisor-poll.sh @@ -241,6 +241,10 @@ fi # (iterated over projects/*.toml, config-driven) # ############################################################################# +# Infra retry tracking (shared across projects, created once) +_RETRY_DIR="/tmp/supervisor-infra-retries" +mkdir -p "$_RETRY_DIR" + # Function: run all per-project checks for the currently loaded project config check_project() { local proj_name="${PROJECT_NAME:-unknown}" @@ -277,14 +281,10 @@ check_project() { # P2e: INFRA FAILURES — auto-retrigger pipelines with infra failures # =========================================================================== if [ "${CHECK_INFRA_RETRY:-true}" = "true" ]; then - status "P2: ${proj_name}: checking infra failures" - - _RETRY_DIR="/tmp/supervisor-infra-retries" - mkdir -p "$_RETRY_DIR" + status "P2e: ${proj_name}: checking infra failures" # Recent failed pipelines (last 6h) - # shellcheck disable=SC2086 - _failed_nums=$(wpdb -A -t -c " + _failed_nums=$(wpdb -A -c " SELECT number FROM pipelines WHERE repo_id = ${WOODPECKER_REPO_ID} AND status IN ('failure', 'error') @@ -293,14 +293,18 @@ check_project() { ORDER BY number DESC LIMIT 5;" 2>/dev/null \ | tr -d ' ' | grep -E '^[0-9]+$' || true) + # shellcheck disable=SC2086 for _pip_num in $_failed_nums; do [ -z "$_pip_num" ] && continue - # Skip if already retried twice for this pipeline + # Check retry count; alert if retries exhausted _retry_file="${_RETRY_DIR}/${WOODPECKER_REPO_ID}-${_pip_num}" _retries=0 [ -f "$_retry_file" ] && _retries=$(cat "$_retry_file" 2>/dev/null || echo 0) - [ "${_retries:-0}" -ge 2 ] && continue + if [ "${_retries:-0}" -ge 2 ]; then + p2 "${proj_name}: Pipeline #${_pip_num}: infra retries exhausted (2/2), needs manual investigation" + continue + fi # Get pipeline details via Woodpecker API _pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" 2>/dev/null || true) @@ -335,7 +339,8 @@ check_project() { # Check step logs for docker pull / connection timeout patterns if [ -n "$_spid" ] && [ "$_spid" != "null" ]; then - _log_data=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${_pip_num}/${_spid}" 2>/dev/null \ + _log_data=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${_pip_num}/${_spid}" \ + --max-time 15 2>/dev/null \ | jq -r '.[].data // empty' 2>/dev/null | tail -200 || true) if echo "$_log_data" | grep -qiE 'Failed to connect|connection timed out|docker pull.*timeout|TLS handshake timeout'; then _is_infra=true @@ -352,6 +357,7 @@ check_project() { echo "$_new_retries" > "$_retry_file" fixed "${proj_name}: Retriggered pipeline #${_pip_num} (${_infra_reason}, retry ${_new_retries}/2)" else + p2 "${proj_name}: Pipeline #${_pip_num}: infra failure (${_infra_reason}) but retrigger API call failed" flog "${proj_name}: Failed to retrigger pipeline #${_pip_num}: API error" fi fi