# Check whether an issue/PR pair has been escalated to the supervisor
# (either still queued or already processed).
#
# Scans ${FACTORY_ROOT}/supervisor/escalations.jsonl (queued) and
# ${FACTORY_ROOT}/supervisor/escalations.done.jsonl (processed) for a JSON
# object whose "issue" and "pr" fields both equal the given numbers.
#
# NOTE(review): while the supervisor is processing, queued entries live in a
# temporary "escalations.jsonl.processing.<pid>" snapshot that is not scanned
# here — confirm that this short window is acceptable.
#
# Globals:   FACTORY_ROOT (read)
# Arguments: $1 - issue number
#            $2 - PR number
# Returns:   0 if a matching entry exists; 1 otherwise (no match, non-numeric
#            arguments, unreadable files, or python3 could not be run)
is_escalated() {
  local issue="$1" pr="$2"

  # All values are handed to Python through argv instead of being spliced into
  # the program text, so quotes or backslashes in FACTORY_ROOT (or a malformed
  # issue/PR value) cannot corrupt or inject into the Python source.
  python3 -c '
import json
import sys

try:
    issue, pr = int(sys.argv[1]), int(sys.argv[2])
except (IndexError, ValueError):
    sys.exit(1)

for path in sys.argv[3:]:
    try:
        with open(path) as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except ValueError:
                    # One corrupt line must not hide the entries after it.
                    continue
                if not isinstance(entry, dict):
                    continue
                if entry.get("issue") == issue and entry.get("pr") == pr:
                    sys.exit(0)
    except (OSError, UnicodeError):
        # Missing or unreadable file: treat as "no entries".
        pass
sys.exit(1)
' "$issue" "$pr" \
    "${FACTORY_ROOT}/supervisor/escalations.jsonl" \
    "${FACTORY_ROOT}/supervisor/escalations.done.jsonl" 2>/dev/null || return 1
}
ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then FIX_ATTEMPTS=$(ci_fix_count "$PR_NUM") - if [ "$FIX_ATTEMPTS" -ge 3 ]; then + if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$STUCK_ISSUE" "$PR_NUM"; then # Already escalated — skip to let pipeline continue - log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI exhausted (${FIX_ATTEMPTS} attempts) — skipping" + log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI exhausted (${FIX_ATTEMPTS} attempts) — escalated to supervisor, skipping" if [ "$FIX_ATTEMPTS" -eq 3 ]; then echo "{\"issue\":${STUCK_ISSUE},\"pr\":${PR_NUM},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ >> "${FACTORY_ROOT}/supervisor/escalations.jsonl" @@ -390,11 +416,14 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then FIX_ATTEMPTS=$(ci_fix_count "$EXISTING_PR") - if [ "$FIX_ATTEMPTS" -ge 3 ]; then - log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — exhausted ${FIX_ATTEMPTS} attempts, escalated (not blocking pipeline)" - echo "{\"issue\":${ISSUE_NUM},\"pr\":${EXISTING_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations.jsonl" - matrix_send "dev" "🚨 PR #${EXISTING_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true + if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$ISSUE_NUM" "$EXISTING_PR"; then + log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — escalated to supervisor, skipping (not blocking pipeline)" + if [ "$FIX_ATTEMPTS" -eq 3 ]; then + echo "{\"issue\":${ISSUE_NUM},\"pr\":${EXISTING_PR},\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ + >> "${FACTORY_ROOT}/supervisor/escalations.jsonl" + matrix_send "dev" "🚨 PR 
# Process dev-agent escalations — create sub-issues for each CI failure.
#
# Every line of ESCALATION_FILE is one JSON object with "issue", "pr" and
# (optionally) "attempts". For each entry this section:
#   1. looks up the PR head SHA and the matching Woodpecker pipeline,
#   2. creates one backlog issue per file reported by a failed ShellCheck step,
#   3. creates one combined backlog issue for all other failed steps,
#   4. creates a generic "investigate" issue if nothing else was created,
#   5. appends the entry to ESCALATION_DONE.
ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations.jsonl"
ESCALATION_DONE="${FACTORY_ROOT}/supervisor/escalations.done.jsonl"
ESCALATION_SNAP="${ESCALATION_FILE}.processing.$$"

# Create a backlog-labelled issue through the Codeberg API.
# Globals:   CODEBERG_TOKEN, CODEBERG_API (read)
# Arguments: $1 - issue title
#            $2 - issue body (markdown)
# Outputs:   the new issue number on stdout; nothing if the request failed
# Returns:   always 0, so callers test the output rather than the status
# NOTE(review): the Gitea/Forgejo create-issue endpoint documents "labels" as
# numeric label IDs — confirm that passing the label name is accepted here.
esc_create_sub_issue() {
  local title="$1" body="$2" payload
  payload=$(jq -nc --arg t "$title" --arg b "$body" \
    '{"title":$t,"body":$b,"labels":["backlog"]}') || return 0
  curl -sf -X POST \
    -H "Authorization: token ${CODEBERG_TOKEN}" \
    -H "Content-Type: application/json" \
    "${CODEBERG_API}/issues" \
    -d "$payload" 2>/dev/null | jq -r '.number // ""' || true
}

# Atomically snapshot the file before processing to prevent a race with
# concurrent dev-poll appends: new entries go to a fresh ESCALATION_FILE
# while we process the snapshot, so nothing is ever silently dropped.
# If the rename fails the queue is left untouched for the next run.
if [ -s "$ESCALATION_FILE" ] && mv "$ESCALATION_FILE" "$ESCALATION_SNAP"; then
  ESCALATION_COUNT=$(wc -l < "$ESCALATION_SNAP")
  flog "Processing ${ESCALATION_COUNT} escalation(s) from dev-agent"

  # "|| [ -n ... ]" keeps a final line that lacks a trailing newline.
  while IFS= read -r esc_entry || [ -n "$esc_entry" ]; do
    [ -z "$esc_entry" ] && continue

    # A corrupt line makes jq fail; fall back to empty/default values instead
    # of letting the failure propagate (the entry is archived just below).
    ESC_ISSUE=$(jq -r '.issue // empty' <<< "$esc_entry" 2>/dev/null) || ESC_ISSUE=""
    ESC_PR=$(jq -r '.pr // empty' <<< "$esc_entry" 2>/dev/null) || ESC_PR=""
    ESC_ATTEMPTS=$(jq -r '.attempts // 3' <<< "$esc_entry" 2>/dev/null) || ESC_ATTEMPTS=3

    # Both numbers are interpolated into API URLs and issue text, so accept
    # plain integers only; anything else is archived without processing.
    if ! [[ "$ESC_ISSUE" =~ ^[0-9]+$ && "$ESC_PR" =~ ^[0-9]+$ ]]; then
      flog "WARNING: skipping malformed escalation entry (issue='${ESC_ISSUE}' pr='${ESC_PR}')"
      printf '%s\n' "$esc_entry" >> "$ESCALATION_DONE"
      continue
    fi

    flog "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} (${ESC_ATTEMPTS} CI attempt(s))"

    # Fetch the head commit of the PR so the failing pipeline can be located
    ESC_PR_SHA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \
      "${CODEBERG_API}/pulls/${ESC_PR}" 2>/dev/null | jq -r '.head.sha // ""') || true

    ESC_PIPELINE=""
    ESC_SUB_ISSUES_CREATED=0
    ESC_GENERIC_FAIL=""
    ESC_LOGS_AVAILABLE=0

    # Validate SHA is a 40-char hex string before interpolating into SQL
    if [[ "$ESC_PR_SHA" =~ ^[0-9a-fA-F]{40}$ ]]; then
      ESC_PIPELINE=$(wpdb -c "SELECT number FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND commit='${ESC_PR_SHA}' ORDER BY created DESC LIMIT 1;" 2>/dev/null | xargs || true)
      # The pipeline number goes into a URL and a CLI call: integers only
      if [ -n "$ESC_PIPELINE" ] && ! [[ "$ESC_PIPELINE" =~ ^[0-9]+$ ]]; then
        flog "WARNING: unexpected pipeline number '${ESC_PIPELINE}' — skipping log retrieval"
        ESC_PIPELINE=""
      fi
    elif [ -n "$ESC_PR_SHA" ]; then
      flog "WARNING: ESC_PR_SHA '${ESC_PR_SHA}' is not a valid hex SHA — skipping pipeline lookup"
    fi

    if [ -n "$ESC_PIPELINE" ]; then
      # One "<pid>\t<name>" line per failed step of the pipeline
      FAILED_STEPS=$(curl -sf \
        -H "Authorization: Bearer ${WOODPECKER_TOKEN}" \
        "${WOODPECKER_SERVER}/api/repos/${WOODPECKER_REPO_ID}/pipelines/${ESC_PIPELINE}" 2>/dev/null | \
        jq -r '.workflows[]?.children[]? | select(.state=="failure") | "\(.pid)\t\(.name)"' 2>/dev/null || true)

      while IFS=$'\t' read -r step_pid step_name; do
        [ -z "$step_pid" ] && continue
        [[ "$step_pid" =~ ^[0-9]+$ ]] || { flog "WARNING: invalid step_pid '${step_pid}' — skipping"; continue; }
        step_logs=$(woodpecker-cli pipeline log show "${CODEBERG_REPO}" "${ESC_PIPELINE}" "${step_pid}" 2>/dev/null | tail -n 150 || true)
        [ -z "$step_logs" ] && continue
        ESC_LOGS_AVAILABLE=1

        # For ShellCheck steps, extract the files named in "In <file> line N:"
        sc_files=""
        if grep -qi "shellcheck" <<< "$step_name"; then
          sc_files=$(grep -oP '(?<=In )\S+(?= line \d+:)' <<< "$step_logs" | sort -u || true)
        fi

        if [ -z "$sc_files" ]; then
          # Not a ShellCheck step, or a ShellCheck step whose output named no
          # files: keep the log tail for the combined issue so it is not lost.
          esc_section="=== ${step_name} ===
$(tail -n 50 <<< "$step_logs")"
          if [ -z "$ESC_GENERIC_FAIL" ]; then
            ESC_GENERIC_FAIL="$esc_section"
          else
            ESC_GENERIC_FAIL="${ESC_GENERIC_FAIL}
${esc_section}"
          fi
          continue
        fi

        # Create one sub-issue per file with ShellCheck errors
        while IFS= read -r sc_file; do
          [ -z "$sc_file" ] && continue
          # grep -F for literal filename match (dots in filenames are regex wildcards)
          file_errors=$(grep -F -A3 "In ${sc_file} line" <<< "$step_logs" | head -n 30 || true)
          # SC codes only from this file's errors, not the whole step log
          sc_codes=$(grep -oP 'SC\d+' <<< "$file_errors" | sort -u | tr '\n' ' ' | sed 's/ $//' || true)

          sub_title="fix: ShellCheck errors in ${sc_file} (from PR #${ESC_PR})"
          sub_body="## ShellCheck CI failure — \`${sc_file}\`

Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)).

### Errors
\`\`\`
${file_errors}
\`\`\`

Fix all ShellCheck errors${sc_codes:+ (${sc_codes})} in \`${sc_file}\` so PR #${ESC_PR} CI passes.

### Context
- Parent issue: #${ESC_ISSUE}
- PR: #${ESC_PR}
- Pipeline: #${ESC_PIPELINE} (step: ${step_name})"

          new_issue=$(esc_create_sub_issue "$sub_title" "$sub_body")

          if [ -n "$new_issue" ]; then
            flog "Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from #${ESC_ISSUE})"
            fixed "Sub-issue #${new_issue}: ShellCheck errors in ${sc_file} (escalated from #${ESC_ISSUE})"
            ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
            matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
          fi
        done <<< "$sc_files"
      done <<< "$FAILED_STEPS"
    fi

    # Create one sub-issue for all non-ShellCheck CI failures
    if [ -n "$ESC_GENERIC_FAIL" ]; then
      sub_title="fix: CI failures in PR #${ESC_PR} (from issue #${ESC_ISSUE})"
      sub_body="## CI failure — fix required

Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)).

### Failed step output
\`\`\`
${ESC_GENERIC_FAIL}
\`\`\`

### Context
- Parent issue: #${ESC_ISSUE}
- PR: #${ESC_PR}${ESC_PIPELINE:+
- Pipeline: #${ESC_PIPELINE}}"

      new_issue=$(esc_create_sub_issue "$sub_title" "$sub_body")

      if [ -n "$new_issue" ]; then
        flog "Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from #${ESC_ISSUE})"
        fixed "Sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (escalated from #${ESC_ISSUE})"
        ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
        matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
      fi
    fi

    # Fallback: no sub-issues created — differentiate logs-unavailable from creation failure
    if [ "$ESC_SUB_ISSUES_CREATED" -eq 0 ]; then
      sub_title="fix: investigate CI failure for PR #${ESC_PR} (from issue #${ESC_ISSUE})"
      if [ "$ESC_LOGS_AVAILABLE" -eq 1 ]; then
        # Logs were fetched but all issue creation API calls failed
        esc_fallback_reason="CI logs were retrieved but sub-issue creation failed (API error)."
      else
        # Could not retrieve CI logs at all
        esc_fallback_reason="CI logs were unavailable at escalation time."
      fi
      sub_body="## CI failure — investigation required

Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). ${esc_fallback_reason}

Check PR #${ESC_PR} CI output, identify the failing checks, and fix them so the PR can merge."

      new_issue=$(esc_create_sub_issue "$sub_title" "$sub_body")

      if [ -n "$new_issue" ]; then
        flog "Created fallback sub-issue #${new_issue} for escalated #${ESC_ISSUE}"
        fixed "Fallback sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (escalated from #${ESC_ISSUE})"
        matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
      fi
    fi

    # Mark as processed
    printf '%s\n' "$esc_entry" >> "$ESCALATION_DONE"
  done < "$ESCALATION_SNAP"

  rm -f "$ESCALATION_SNAP"
  flog "Escalations processed — moved to $(basename "$ESCALATION_DONE")"
fi

# #############################################################################