fix: feat: supervisor breaks down escalated CI failures into sub-issues (#52)
- supervisor-poll.sh: replace P3 escalation log with actionable sub-issue creation. For each entry in escalations.jsonl: fetch CI logs via woodpecker-cli, create one sub-issue per file for ShellCheck failures, one combined issue for other CI failures, or a fallback investigation issue if logs are unavailable. Move processed entries to escalations.done.jsonl and clear escalations.jsonl.
- dev-poll.sh: add is_escalated() helper that checks both escalations.jsonl and escalations.done.jsonl; use it (alongside ci_fix_count >= 3) in all three CI-fix spawn paths so escalated PRs are skipped even if the ci-fixes tracker is reset.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
531ae5cf71
commit
d9520f48a6
2 changed files with 189 additions and 11 deletions
|
|
@ -57,6 +57,30 @@ json.dump(d,open(f,'w'))
|
||||||
" 2>/dev/null || true
|
" 2>/dev/null || true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Check whether an issue/PR pair has been escalated to the supervisor.
# Scans both the pending queue (escalations.jsonl) and the processed archive
# (escalations.done.jsonl) under ${FACTORY_ROOT}/supervisor, one JSON object
# per line.
# Globals:   FACTORY_ROOT (read)
# Arguments: $1 - issue number, $2 - PR number
# Returns:   0 if any line has matching numeric "issue" and "pr" fields;
#            1 otherwise (no match, missing files, non-numeric arguments).
is_escalated() {
  local issue="$1" pr="$2"
  local esc_file="${FACTORY_ROOT}/supervisor/escalations.jsonl"
  local done_file="${FACTORY_ROOT}/supervisor/escalations.done.jsonl"

  # The values travel via argv instead of being spliced into the Python
  # source, so an empty or hostile argument cannot alter the comparison.
  if python3 -c '
import json, sys

try:
    issue, pr = int(sys.argv[1]), int(sys.argv[2])
except ValueError:
    sys.exit(1)

for path in sys.argv[3:]:
    try:
        with open(path) as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    d = json.loads(line)
                except ValueError:
                    # One corrupt line must not hide the entries after it.
                    continue
                if isinstance(d, dict) and d.get("issue") == issue and d.get("pr") == pr:
                    sys.exit(0)
    except (OSError, UnicodeError):
        # Missing or unreadable file: treat as "no entries here".
        pass
sys.exit(1)
' "$issue" "$pr" "$esc_file" "$done_file" 2>/dev/null; then
    return 0
  fi
  return 1
}
|
||||||
|
|
||||||
REPO="${CODEBERG_REPO}"
|
REPO="${CODEBERG_REPO}"
|
||||||
|
|
||||||
API="${CODEBERG_API}"
|
API="${CODEBERG_API}"
|
||||||
|
|
@ -232,9 +256,9 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
|
||||||
|
|
||||||
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
||||||
FIX_ATTEMPTS=$(ci_fix_count "$HAS_PR")
|
FIX_ATTEMPTS=$(ci_fix_count "$HAS_PR")
|
||||||
if [ "$FIX_ATTEMPTS" -ge 3 ]; then
|
if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$ISSUE_NUM" "$HAS_PR"; then
|
||||||
# Already escalated — skip silently, let pipeline continue to backlog
|
# Already escalated — skip silently, let pipeline continue to backlog
|
||||||
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI exhausted (${FIX_ATTEMPTS} attempts) — skipping"
|
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI exhausted (${FIX_ATTEMPTS} attempts) — escalated to supervisor, skipping"
|
||||||
# Only write escalation + alert once (first time hitting 3)
|
# Only write escalation + alert once (first time hitting 3)
|
||||||
if [ "$FIX_ATTEMPTS" -eq 3 ]; then
|
if [ "$FIX_ATTEMPTS" -eq 3 ]; then
|
||||||
echo "{\"issue\":${ISSUE_NUM},\"pr\":${HAS_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
echo "{\"issue\":${ISSUE_NUM},\"pr\":${HAS_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
||||||
|
|
@ -313,9 +337,9 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
|
||||||
exit 0
|
exit 0
|
||||||
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
||||||
FIX_ATTEMPTS=$(ci_fix_count "$PR_NUM")
|
FIX_ATTEMPTS=$(ci_fix_count "$PR_NUM")
|
||||||
if [ "$FIX_ATTEMPTS" -ge 3 ]; then
|
if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$STUCK_ISSUE" "$PR_NUM"; then
|
||||||
# Already escalated — skip to let pipeline continue
|
# Already escalated — skip to let pipeline continue
|
||||||
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI exhausted (${FIX_ATTEMPTS} attempts) — skipping"
|
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI exhausted (${FIX_ATTEMPTS} attempts) — escalated to supervisor, skipping"
|
||||||
if [ "$FIX_ATTEMPTS" -eq 3 ]; then
|
if [ "$FIX_ATTEMPTS" -eq 3 ]; then
|
||||||
echo "{\"issue\":${STUCK_ISSUE},\"pr\":${PR_NUM},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
echo "{\"issue\":${STUCK_ISSUE},\"pr\":${PR_NUM},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
||||||
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
|
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
|
||||||
|
|
@ -390,11 +414,14 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
|
||||||
|
|
||||||
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
||||||
FIX_ATTEMPTS=$(ci_fix_count "$EXISTING_PR")
|
FIX_ATTEMPTS=$(ci_fix_count "$EXISTING_PR")
|
||||||
if [ "$FIX_ATTEMPTS" -ge 3 ]; then
|
if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$ISSUE_NUM" "$EXISTING_PR"; then
|
||||||
log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — exhausted ${FIX_ATTEMPTS} attempts, escalated (not blocking pipeline)"
|
log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — escalated to supervisor, skipping (not blocking pipeline)"
|
||||||
echo "{\"issue\":${ISSUE_NUM},\"pr\":${EXISTING_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
if [ "$FIX_ATTEMPTS" -eq 3 ]; then
|
||||||
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
|
echo "{\"issue\":${ISSUE_NUM},\"pr\":${EXISTING_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
||||||
matrix_send "dev" "🚨 PR #${EXISTING_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
|
>> "${FACTORY_ROOT}/supervisor/escalations.jsonl"
|
||||||
|
matrix_send "dev" "🚨 PR #${EXISTING_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
|
||||||
|
ci_fix_increment "$EXISTING_PR" # bump to 4 to prevent re-alert
|
||||||
|
fi
|
||||||
# Don't add to WAITING_PRS — escalated PRs should not block new work
|
# Don't add to WAITING_PRS — escalated PRs should not block new work
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
|
||||||
|
|
@ -216,11 +216,162 @@ for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# Process dev-agent escalations — create sub-issues for each CI failure.
#
# Each line of escalations.jsonl is a JSON object with "issue", "pr" and
# (optionally) "attempts". For every entry this section:
#   1. looks up the PR head commit and its most recent pipeline,
#   2. fetches the logs of every failed step,
#   3. files one sub-issue per file for ShellCheck steps, one combined
#      sub-issue for all other failed steps, or a fallback "investigate"
#      sub-issue when nothing else could be filed,
#   4. appends the entry to escalations.done.jsonl once at least one
#      sub-issue exists; entries for which no issue could be created stay in
#      escalations.jsonl so the next poll retries them instead of losing them.
ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations.jsonl"
ESCALATION_DONE="${FACTORY_ROOT}/supervisor/escalations.done.jsonl"

# Create one sub-issue and print its number on stdout (nothing on failure).
# Arguments: $1 - title, $2 - markdown body,
#            $3 - label as a JSON value (numeric id, or "backlog" as fallback)
create_escalation_sub_issue() {
  local title="$1" body="$2" label_json="${3:-\"backlog\"}" payload
  payload=$(jq -nc --arg t "$title" --arg b "$body" --argjson l "$label_json" \
    '{"title":$t,"body":$b,"labels":[$l]}') || return 1
  curl -sf -X POST \
    -H "Authorization: token ${CODEBERG_TOKEN}" \
    -H "Content-Type: application/json" \
    "${CODEBERG_API}/issues" \
    -d "$payload" 2>/dev/null | jq -r '.number // ""'
}

if [ -s "$ESCALATION_FILE" ]; then
  ESCALATION_COUNT=$(wc -l < "$ESCALATION_FILE")
  flog "Processing ${ESCALATION_COUNT} escalation(s) from dev-agent"

  # The Forgejo/Gitea create-issue API documents "labels" as a list of label
  # ids, so resolve the id of "backlog" once. If the lookup fails, fall back
  # to sending the label name as the original request did.
  ESC_BACKLOG_LABEL='"backlog"'
  ESC_LABEL_ID=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \
    "${CODEBERG_API}/labels?limit=50" 2>/dev/null | \
    jq -r '[.[] | select(.name=="backlog") | .id][0] // empty' 2>/dev/null) || true
  case "$ESC_LABEL_ID" in
    ''|*[!0-9]*) ;;
    *) ESC_BACKLOG_LABEL="$ESC_LABEL_ID" ;;
  esac

  # Entries that could not be turned into any sub-issue; written back below.
  ESC_RETRY=""

  while IFS= read -r esc_entry || [ -n "$esc_entry" ]; do
    [ -z "$esc_entry" ] && continue

    # A malformed line must not abort the whole run; it ends up with empty
    # fields and is archived as-is below.
    ESC_ISSUE=$(printf '%s\n' "$esc_entry" | jq -r '.issue // empty' 2>/dev/null) || true
    ESC_PR=$(printf '%s\n' "$esc_entry" | jq -r '.pr // empty' 2>/dev/null) || true
    ESC_ATTEMPTS=$(printf '%s\n' "$esc_entry" | jq -r '.attempts // 3' 2>/dev/null) || true

    # These values are interpolated into URLs and issue text: numbers only.
    case "$ESC_ISSUE" in ''|*[!0-9]*) ESC_ISSUE="" ;; esac
    case "$ESC_PR" in ''|*[!0-9]*) ESC_PR="" ;; esac
    case "$ESC_ATTEMPTS" in ''|*[!0-9]*) ESC_ATTEMPTS=3 ;; esac

    if [ -z "$ESC_ISSUE" ] || [ -z "$ESC_PR" ]; then
      printf '%s\n' "$esc_entry" >> "$ESCALATION_DONE"
      continue
    fi

    flog "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} (${ESC_ATTEMPTS} CI attempt(s))"

    # Fetch the failing pipeline for this PR
    ESC_PR_SHA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \
      "${CODEBERG_API}/pulls/${ESC_PR}" 2>/dev/null | jq -r '.head.sha // ""') || true
    # The SHA goes into a SQL literal below; accept hex digits only.
    case "$ESC_PR_SHA" in ''|*[!0-9a-fA-F]*) ESC_PR_SHA="" ;; esac

    ESC_PIPELINE=""
    ESC_SUB_ISSUES_CREATED=0
    ESC_GENERIC_FAIL=""

    if [ -n "$ESC_PR_SHA" ]; then
      ESC_PIPELINE=$(wpdb -c "SELECT number FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND commit='${ESC_PR_SHA}' ORDER BY created DESC LIMIT 1;" 2>/dev/null | xargs || true)
      case "$ESC_PIPELINE" in *[!0-9]*) ESC_PIPELINE="" ;; esac
    fi

    if [ -n "$ESC_PIPELINE" ]; then
      FAILED_STEPS=$(curl -sf \
        -H "Authorization: Bearer ${WOODPECKER_TOKEN}" \
        "${WOODPECKER_SERVER}/api/repos/${WOODPECKER_REPO_ID}/pipelines/${ESC_PIPELINE}" 2>/dev/null | \
        jq -r '.workflows[]?.children[]? | select(.state=="failure") | "\(.pid)\t\(.name)"' 2>/dev/null || true)

      while IFS=$'\t' read -r step_pid step_name; do
        [ -z "$step_pid" ] && continue
        # stdin is detached so the CLI cannot consume the loop's input.
        step_logs=$(woodpecker-cli pipeline log show "${CODEBERG_REPO}" "${ESC_PIPELINE}" "${step_pid}" 2>/dev/null < /dev/null | tail -150 || true)
        [ -z "$step_logs" ] && continue

        if printf '%s\n' "$step_name" | grep -qi "shellcheck"; then
          # Create one sub-issue per file with ShellCheck errors
          sc_files=$(printf '%s\n' "$step_logs" | grep -oP '(?<=In )\S+(?= line \d+:)' | sort -u || true)

          while IFS= read -r sc_file; do
            [ -z "$sc_file" ] && continue
            # -F: the file name is literal text, not a regex.
            file_errors=$(printf '%s\n' "$step_logs" | grep -F -A3 -- "In ${sc_file} line" | head -30 || true)
            # Codes are taken from this file's errors only, so each
            # sub-issue lists just the codes that apply to it.
            sc_codes=$(printf '%s\n' "$file_errors" | grep -oP 'SC\d+' | sort -u | tr '\n' ' ' | sed 's/ $//' || true)

            sub_title="fix: ShellCheck errors in ${sc_file} (from PR #${ESC_PR})"
            sub_body="## ShellCheck CI failure — \`${sc_file}\`

Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)).

### Errors
\`\`\`
${file_errors}
\`\`\`

Fix all ShellCheck errors${sc_codes:+ (${sc_codes})} in \`${sc_file}\` so PR #${ESC_PR} CI passes.

### Context
- Parent issue: #${ESC_ISSUE}
- PR: #${ESC_PR}
- Pipeline: #${ESC_PIPELINE} (step: ${step_name})"

            new_issue=$(create_escalation_sub_issue "$sub_title" "$sub_body" "$ESC_BACKLOG_LABEL") || true

            if [ -n "$new_issue" ]; then
              flog "Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from #${ESC_ISSUE})"
              fixed "Sub-issue #${new_issue}: ShellCheck errors in ${sc_file} (escalated from #${ESC_ISSUE})"
              ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
              matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
            fi
          done <<< "$sc_files"

        else
          # Accumulate non-ShellCheck failures for one combined issue
          ESC_GENERIC_FAIL="${ESC_GENERIC_FAIL}
=== ${step_name} ===
$(printf '%s\n' "$step_logs" | tail -50)"
        fi
      done <<< "$FAILED_STEPS"
    fi

    # Create one sub-issue for all non-ShellCheck CI failures
    if [ -n "$ESC_GENERIC_FAIL" ]; then
      sub_title="fix: CI failures in PR #${ESC_PR} (from issue #${ESC_ISSUE})"
      sub_body="## CI failure — fix required

Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)).

### Failed step output
\`\`\`${ESC_GENERIC_FAIL}
\`\`\`

### Context
- Parent issue: #${ESC_ISSUE}
- PR: #${ESC_PR}${ESC_PIPELINE:+
- Pipeline: #${ESC_PIPELINE}}"

      new_issue=$(create_escalation_sub_issue "$sub_title" "$sub_body" "$ESC_BACKLOG_LABEL") || true

      if [ -n "$new_issue" ]; then
        flog "Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from #${ESC_ISSUE})"
        fixed "Sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (escalated from #${ESC_ISSUE})"
        ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
        matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
      fi
    fi

    # Fallback: no CI logs available — create a generic investigation issue
    if [ "$ESC_SUB_ISSUES_CREATED" -eq 0 ]; then
      sub_title="fix: investigate CI failure for PR #${ESC_PR} (from issue #${ESC_ISSUE})"
      sub_body="## CI failure — investigation required

Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). CI logs were unavailable at escalation time.

Check PR #${ESC_PR} CI output, identify the failing checks, and fix them so the PR can merge."

      new_issue=$(create_escalation_sub_issue "$sub_title" "$sub_body" "$ESC_BACKLOG_LABEL") || true

      if [ -n "$new_issue" ]; then
        flog "Created fallback sub-issue #${new_issue} for escalated #${ESC_ISSUE}"
        fixed "Fallback sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (escalated from #${ESC_ISSUE})"
        ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
        matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
      fi
    fi

    if [ "$ESC_SUB_ISSUES_CREATED" -gt 0 ]; then
      # Mark as processed
      printf '%s\n' "$esc_entry" >> "$ESCALATION_DONE"
    else
      # Nothing could be filed (e.g. the issue API was unreachable): keep the
      # entry queued so the escalation is retried rather than dropped.
      flog "Escalation for issue #${ESC_ISSUE} PR #${ESC_PR}: no sub-issue could be created — keeping entry for retry"
      ESC_RETRY="${ESC_RETRY}${esc_entry}"$'\n'
    fi
  done < "$ESCALATION_FILE"

  # Clear processed escalations, keeping only entries that must be retried.
  # NOTE(review): an entry appended by dev-poll between the end of the loop
  # and this rewrite would be lost — the same window the plain truncate had.
  printf '%s' "$ESC_RETRY" > "$ESCALATION_FILE"
  flog "Escalations processed — moved to $(basename "$ESCALATION_DONE")"
fi
|
||||||
|
|
||||||
# #############################################################################
|
# #############################################################################
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue