Merge pull request 'fix: feat: supervisor auto-retriggers infra CI failures (#75)' (#104) from fix/issue-75 into main

This commit is contained in:
johba 2026-03-18 02:27:22 +01:00
commit e5f075eb97
2 changed files with 100 additions and 1 deletions

View file

@ -7,7 +7,12 @@
- Example (harb): CI images pre-built at `registry.niovi.voyage/harb/*:latest` - Example (harb): CI images pre-built at `registry.niovi.voyage/harb/*:latest`
## Safe Fixes ## Safe Fixes
- Retrigger CI: push empty commit to PR branch - Retrigger CI (preferred, automated): Woodpecker API POST
```bash
woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${PIPELINE_NUMBER}" -X POST
```
supervisor-poll.sh does this automatically for infra failures (max 2 retries).
- Retrigger CI (manual fallback): push empty commit to PR branch
```bash ```bash
cd /tmp/${PROJECT_NAME}-worktree-<issue> && git commit --allow-empty -m "ci: retrigger" --no-verify && git push origin <branch> --force cd /tmp/${PROJECT_NAME}-worktree-<issue> && git commit --allow-empty -m "ci: retrigger" --no-verify && git push origin <branch> --force
``` ```

View file

@ -241,6 +241,10 @@ fi
# (iterated over projects/*.toml, config-driven) # (iterated over projects/*.toml, config-driven)
# ############################################################################# # #############################################################################
# Infra retry tracking (shared across projects, created once)
_RETRY_DIR="/tmp/supervisor-infra-retries"
mkdir -p "$_RETRY_DIR"
# Function: run all per-project checks for the currently loaded project config # Function: run all per-project checks for the currently loaded project config
check_project() { check_project() {
local proj_name="${PROJECT_NAME:-unknown}" local proj_name="${PROJECT_NAME:-unknown}"
@ -273,6 +277,96 @@ check_project() {
'{ts:$ts,type:"ci",project:$proj,pipeline:$pipeline,duration_min:$duration,status:$status}' 2>/dev/null)" 2>/dev/null || true '{ts:$ts,type:"ci",project:$proj,pipeline:$pipeline,duration_min:$duration,status:$status}' 2>/dev/null)" 2>/dev/null || true
fi fi
# ===========================================================================
# P2e: INFRA FAILURES — auto-retrigger pipelines with infra failures
# ===========================================================================
if [ "${CHECK_INFRA_RETRY:-true}" = "true" ]; then
status "P2e: ${proj_name}: checking infra failures"
# Recent failed pipelines (last 6h)
_failed_nums=$(wpdb -A -c "
SELECT number FROM pipelines
WHERE repo_id = ${WOODPECKER_REPO_ID}
AND status IN ('failure', 'error')
AND finished > 0
AND to_timestamp(finished) > now() - interval '6 hours'
ORDER BY number DESC LIMIT 5;" 2>/dev/null \
| tr -d ' ' | grep -E '^[0-9]+$' || true)
# shellcheck disable=SC2086
for _pip_num in $_failed_nums; do
[ -z "$_pip_num" ] && continue
# Check retry count; alert if retries exhausted
_retry_file="${_RETRY_DIR}/${WOODPECKER_REPO_ID}-${_pip_num}"
_retries=0
[ -f "$_retry_file" ] && _retries=$(cat "$_retry_file" 2>/dev/null || echo 0)
if [ "${_retries:-0}" -ge 2 ]; then
p2 "${proj_name}: Pipeline #${_pip_num}: infra retries exhausted (2/2), needs manual investigation"
continue
fi
# Get pipeline details via Woodpecker API
_pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" 2>/dev/null || true)
[ -z "$_pip_json" ] && continue
# Extract failed steps: name, exit_code, pid
_failed_steps=$(echo "$_pip_json" | jq -r '
.workflows[]?.children[]? |
select(.state == "failure" or .state == "error" or .state == "killed") |
"\(.name)\t\(.exit_code)\t\(.pid)"' 2>/dev/null || true)
[ -z "$_failed_steps" ] && continue
_is_infra=false
_infra_reason=""
while IFS=$'\t' read -r _sname _ecode _spid; do
[ -z "$_sname" ] && continue
# Clone step exit 128 → Codeberg connection failure / rate limit
if [[ "$_sname" == *clone* ]] && [ "$_ecode" = "128" ]; then
_is_infra=true
_infra_reason="clone exit 128 (connection failure)"
break
fi
# Exit 137 → OOM / killed by signal 9
if [ "$_ecode" = "137" ]; then
_is_infra=true
_infra_reason="${_sname} exit 137 (OOM/signal 9)"
break
fi
# Check step logs for docker pull / connection timeout patterns
if [ -n "$_spid" ] && [ "$_spid" != "null" ]; then
_log_data=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${_pip_num}/${_spid}" \
--max-time 15 2>/dev/null \
| jq -r '.[].data // empty' 2>/dev/null | tail -200 || true)
if echo "$_log_data" | grep -qiE 'Failed to connect|connection timed out|docker pull.*timeout|TLS handshake timeout'; then
_is_infra=true
_infra_reason="${_sname}: log matches infra pattern (timeout/connection)"
break
fi
fi
done <<< "$_failed_steps"
if [ "$_is_infra" = true ]; then
_new_retries=$(( _retries + 1 ))
if woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_pip_num}" \
-X POST >/dev/null 2>&1; then
echo "$_new_retries" > "$_retry_file"
fixed "${proj_name}: Retriggered pipeline #${_pip_num} (${_infra_reason}, retry ${_new_retries}/2)"
else
p2 "${proj_name}: Pipeline #${_pip_num}: infra failure (${_infra_reason}) but retrigger API call failed"
flog "${proj_name}: Failed to retrigger pipeline #${_pip_num}: API error"
fi
fi
done
# Clean up stale retry tracking files (>24h)
find "$_RETRY_DIR" -type f -mmin +1440 -delete 2>/dev/null || true
fi
# Dev-agent health (only if monitoring enabled) # Dev-agent health (only if monitoring enabled)
if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
DEV_LOCK="/tmp/dev-agent.lock" DEV_LOCK="/tmp/dev-agent.lock"