Merge pull request 'fix: TOCTOU in handle_ci_exhaustion: check-then-act not atomic (#125)' (#152) from fix/issue-125 into main
This commit is contained in:
commit
b008f07861
1 changed files with 83 additions and 21 deletions
|
|
@ -47,6 +47,22 @@ d.pop(str($pr),None)
|
||||||
json.dump(d,open(f,'w'))
|
json.dump(d,open(f,'w'))
|
||||||
" 2>/dev/null || true
|
" 2>/dev/null || true
|
||||||
}
|
}
|
||||||
|
ci_fix_check_and_increment() {
|
||||||
|
local pr="$1"
|
||||||
|
flock "$CI_FIX_LOCK" python3 -c "
|
||||||
|
import json,os
|
||||||
|
f='$CI_FIX_TRACKER'
|
||||||
|
d=json.load(open(f)) if os.path.exists(f) else {}
|
||||||
|
count=d.get(str($pr),0)
|
||||||
|
if count>=3:
|
||||||
|
print('exhausted:'+str(count))
|
||||||
|
else:
|
||||||
|
count+=1
|
||||||
|
d[str($pr)]=count
|
||||||
|
json.dump(d,open(f,'w'))
|
||||||
|
print('ok:'+str(count))
|
||||||
|
" 2>/dev/null || echo "exhausted:99"
|
||||||
|
}
|
||||||
|
|
||||||
# Check whether an issue/PR has been escalated (unprocessed or processed)
|
# Check whether an issue/PR has been escalated (unprocessed or processed)
|
||||||
is_escalated() {
|
is_escalated() {
|
||||||
|
|
@ -77,12 +93,58 @@ sys.exit(1)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites)
|
# HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites)
|
||||||
# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
|
# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
|
||||||
|
#
|
||||||
|
# Pass "check_only" as third arg to skip the atomic increment (backlog scan
|
||||||
|
# path). The caller must then call handle_ci_exhaustion again at launch time
|
||||||
|
# (without check_only) to atomically increment and confirm the decision.
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
handle_ci_exhaustion() {
|
handle_ci_exhaustion() {
|
||||||
local pr_num="$1" issue_num="$2"
|
local pr_num="$1" issue_num="$2"
|
||||||
CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
|
local check_only="${3:-}"
|
||||||
|
local result
|
||||||
|
|
||||||
|
# Fast path: already in the escalation file — skip without touching counter.
|
||||||
|
if is_escalated "$issue_num" "$pr_num"; then
|
||||||
|
CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
|
||||||
|
log "PR #${pr_num} (issue #${issue_num}) already escalated (${CI_FIX_ATTEMPTS} attempts) — skipping"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$check_only" = "check_only" ]; then
|
||||||
|
# Read-only path for the backlog scan: do not increment here — the counter
|
||||||
|
# is bumped atomically at LAUNCH time so a WAITING_PRS exit cannot waste an
|
||||||
|
# attempt on a poll cycle that never spawns a dev-agent.
|
||||||
|
CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
|
||||||
|
if [ "$CI_FIX_ATTEMPTS" -ge 3 ]; then
|
||||||
|
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
|
||||||
|
if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
|
||||||
|
echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
||||||
|
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
|
||||||
|
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
|
||||||
|
ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert
|
||||||
|
fi
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Single flock-protected call: read + threshold-check + conditional increment.
|
||||||
|
# Prevents two concurrent pollers from both passing the threshold and spawning
|
||||||
|
# duplicate dev-agents for the same PR.
|
||||||
|
result=$(ci_fix_check_and_increment "$pr_num")
|
||||||
|
case "$result" in
|
||||||
|
ok:*)
|
||||||
|
CI_FIX_ATTEMPTS="${result#ok:}"
|
||||||
|
return 1
|
||||||
|
;;
|
||||||
|
exhausted:*)
|
||||||
|
CI_FIX_ATTEMPTS="${result#exhausted:}"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
CI_FIX_ATTEMPTS=99
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
if [ "$CI_FIX_ATTEMPTS" -ge 3 ] || is_escalated "$issue_num" "$pr_num"; then
|
|
||||||
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
|
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
|
||||||
# Only write escalation + alert once (first time hitting 3)
|
# Only write escalation + alert once (first time hitting 3)
|
||||||
if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
|
if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
|
||||||
|
|
@ -92,9 +154,6 @@ handle_ci_exhaustion() {
|
||||||
ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert
|
ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert
|
||||||
fi
|
fi
|
||||||
return 0
|
return 0
|
||||||
fi
|
|
||||||
|
|
||||||
return 1
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# shellcheck disable=SC2034
|
# shellcheck disable=SC2034
|
||||||
|
|
@ -150,9 +209,8 @@ try_merge_or_rebase() {
|
||||||
log "PR #${pr_num} rebase failed — CI exhausted, not spawning"
|
log "PR #${pr_num} rebase failed — CI exhausted, not spawning"
|
||||||
return 1
|
return 1
|
||||||
fi
|
fi
|
||||||
ci_fix_increment "$pr_num"
|
log "PR #${pr_num} rebase failed — spawning dev-agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)"
|
||||||
log "PR #${pr_num} rebase failed — spawning dev-agent to fix (attempt $((CI_FIX_ATTEMPTS+1))/3)"
|
matrix_send "dev" "❌ PR #${pr_num} rebase failed — spawning dev-agent (attempt ${CI_FIX_ATTEMPTS}/3)" 2>/dev/null || true
|
||||||
matrix_send "dev" "❌ PR #${pr_num} rebase failed — spawning dev-agent (attempt $((CI_FIX_ATTEMPTS+1))/3)" 2>/dev/null || true
|
|
||||||
nohup "${SCRIPT_DIR}/dev-agent.sh" "$issue_num" >> "$LOGFILE" 2>&1 &
|
nohup "${SCRIPT_DIR}/dev-agent.sh" "$issue_num" >> "$LOGFILE" 2>&1 &
|
||||||
log "started dev-agent PID $! for PR #${pr_num} (rebase fix)"
|
log "started dev-agent PID $! for PR #${pr_num} (rebase fix)"
|
||||||
fi
|
fi
|
||||||
|
|
@ -291,8 +349,7 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
|
||||||
# Fall through to backlog scan instead of exit
|
# Fall through to backlog scan instead of exit
|
||||||
:
|
:
|
||||||
else
|
else
|
||||||
ci_fix_increment "$HAS_PR"
|
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)"
|
||||||
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt $((CI_FIX_ATTEMPTS+1))/3)"
|
|
||||||
nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
|
nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
|
||||||
log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
|
log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
|
||||||
exit 0
|
exit 0
|
||||||
|
|
@ -362,8 +419,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
|
||||||
if handle_ci_exhaustion "$PR_NUM" "$STUCK_ISSUE"; then
|
if handle_ci_exhaustion "$PR_NUM" "$STUCK_ISSUE"; then
|
||||||
continue # skip this PR, check next stuck PR or fall through to backlog
|
continue # skip this PR, check next stuck PR or fall through to backlog
|
||||||
else
|
else
|
||||||
ci_fix_increment "$PR_NUM"
|
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt ${CI_FIX_ATTEMPTS}/3)"
|
||||||
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt $((CI_FIX_ATTEMPTS+1))/3)"
|
|
||||||
nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
|
nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
|
||||||
log "started dev-agent PID $! for stuck PR #${PR_NUM}"
|
log "started dev-agent PID $! for stuck PR #${PR_NUM}"
|
||||||
exit 0
|
exit 0
|
||||||
|
|
@ -435,7 +491,7 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
|
||||||
break
|
break
|
||||||
|
|
||||||
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
|
||||||
if handle_ci_exhaustion "$EXISTING_PR" "$ISSUE_NUM"; then
|
if handle_ci_exhaustion "$EXISTING_PR" "$ISSUE_NUM" "check_only"; then
|
||||||
# Don't add to WAITING_PRS — escalated PRs should not block new work
|
# Don't add to WAITING_PRS — escalated PRs should not block new work
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
@ -471,9 +527,15 @@ fi
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# LAUNCH: start dev-agent for the ready issue
|
# LAUNCH: start dev-agent for the ready issue
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Deferred CI fix increment — only now that we're actually launching
|
# Deferred CI fix increment — only now that we're certain we are launching.
|
||||||
|
# Uses the atomic ci_fix_check_and_increment (inside handle_ci_exhaustion) so
|
||||||
|
# the counter is bumped exactly once even under concurrent poll invocations,
|
||||||
|
# and a WAITING_PRS exit above cannot silently consume a fix attempt.
|
||||||
if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then
|
if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then
|
||||||
ci_fix_increment "$READY_PR_FOR_INCREMENT"
|
if handle_ci_exhaustion "$READY_PR_FOR_INCREMENT" "$READY_ISSUE"; then
|
||||||
|
# exhausted (another poller incremented between scan and launch) — bail out
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log "launching dev-agent for #${READY_ISSUE}"
|
log "launching dev-agent for #${READY_ISSUE}"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue