From 7bf13567fdba0ef77d554299662ab8868420a435 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Mar 2026 10:22:24 +0000 Subject: [PATCH 1/2] fix: TOCTOU in handle_ci_exhaustion: check-then-act not atomic (#125) Add ci_fix_check_and_increment() that performs read + threshold-check + conditional increment in a single flock-protected Python call, replacing the prior three-step sequence (ci_fix_count / bash check / ci_fix_increment) that allowed two concurrent poll invocations to both pass the threshold and spawn duplicate dev-agents for the same PR. handle_ci_exhaustion now calls ci_fix_check_and_increment atomically and returns the new count in CI_FIX_ATTEMPTS; all separate ci_fix_increment calls after handle_ci_exhaustion (including the deferred READY_PR_FOR_INCREMENT mechanism) are removed. Log messages updated from CI_FIX_ATTEMPTS+1 to CI_FIX_ATTEMPTS to reflect the post-increment count. Co-Authored-By: Claude Sonnet 4.6 --- dev/dev-poll.sh | 82 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/dev/dev-poll.sh b/dev/dev-poll.sh index 7a39707..c401155 100755 --- a/dev/dev-poll.sh +++ b/dev/dev-poll.sh @@ -47,6 +47,22 @@ d.pop(str($pr),None) json.dump(d,open(f,'w')) " 2>/dev/null || true } +ci_fix_check_and_increment() { + local pr="$1" + flock "$CI_FIX_LOCK" python3 -c " +import json,os +f='$CI_FIX_TRACKER' +d=json.load(open(f)) if os.path.exists(f) else {} +count=d.get(str($pr),0) +if count>=3: + print('exhausted:'+str(count)) +else: + count+=1 + d[str($pr)]=count + json.dump(d,open(f,'w')) + print('ok:'+str(count)) +" 2>/dev/null || echo "exhausted:99" +} # Check whether an issue/PR has been escalated (unprocessed or processed) is_escalated() { @@ -76,25 +92,46 @@ sys.exit(1) # ============================================================================= # HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites) -# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not. +# Atomically checks and increments the fix counter via ci_fix_check_and_increment; +# sets CI_FIX_ATTEMPTS to the new count for caller use. +# Returns 0 if exhausted, 1 if not. # ============================================================================= handle_ci_exhaustion() { - local pr_num="$1" issue_num="$2" - CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num") + local pr_num="$1" issue_num="$2" result - if [ "$CI_FIX_ATTEMPTS" -ge 3 ] || is_escalated "$issue_num" "$pr_num"; then - log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping" - # Only write escalation + alert once (first time hitting 3) - if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then - echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" - matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true - ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert - fi + # Fast path: already in the escalation file — skip without touching counter. + if is_escalated "$issue_num" "$pr_num"; then + CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num") + log "PR #${pr_num} (issue #${issue_num}) already escalated (${CI_FIX_ATTEMPTS} attempts) — skipping" return 0 fi - return 1 + # Single flock-protected call: read + threshold-check + conditional increment. + # Prevents two concurrent pollers from both passing the threshold and spawning + # duplicate dev-agents for the same PR. + result=$(ci_fix_check_and_increment "$pr_num") + case "$result" in + ok:*) + CI_FIX_ATTEMPTS="${result#ok:}" + return 1 + ;; + exhausted:*) + CI_FIX_ATTEMPTS="${result#exhausted:}" + ;; + *) + CI_FIX_ATTEMPTS=99 + ;; + esac + + log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping" + # Only write escalation + alert once (first time hitting 3) + if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then + echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ + >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" + matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true + ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert + fi + return 0 } # shellcheck disable=SC2034 @@ -150,9 +187,8 @@ try_merge_or_rebase() { log "PR #${pr_num} rebase failed — CI exhausted, not spawning" return 1 fi - ci_fix_increment "$pr_num" - log "PR #${pr_num} rebase failed — spawning dev-agent to fix (attempt $((CI_FIX_ATTEMPTS+1))/3)" - matrix_send "dev" "❌ PR #${pr_num} rebase failed — spawning dev-agent (attempt $((CI_FIX_ATTEMPTS+1))/3)" 2>/dev/null || true + log "PR #${pr_num} rebase failed — spawning dev-agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)" + matrix_send "dev" "❌ PR #${pr_num} rebase failed — spawning dev-agent (attempt ${CI_FIX_ATTEMPTS}/3)" 2>/dev/null || true nohup "${SCRIPT_DIR}/dev-agent.sh" "$issue_num" >> "$LOGFILE" 2>&1 & log "started dev-agent PID $! for PR #${pr_num} (rebase fix)" fi @@ -291,8 +327,7 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then # Fall through to backlog scan instead of exit : else - ci_fix_increment "$HAS_PR" - log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt $((CI_FIX_ATTEMPTS+1))/3)" + log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)" nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 & log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)" exit 0 @@ -362,8 +397,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do if handle_ci_exhaustion "$PR_NUM" "$STUCK_ISSUE"; then continue # skip this PR, check next stuck PR or fall through to backlog else - ci_fix_increment "$PR_NUM" - log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt $((CI_FIX_ATTEMPTS+1))/3)" + log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt ${CI_FIX_ATTEMPTS}/3)" nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 & log "started dev-agent PID $! for stuck PR #${PR_NUM}" exit 0 @@ -439,9 +473,8 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do # Don't add to WAITING_PRS — escalated PRs should not block new work continue fi - log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt $((CI_FIX_ATTEMPTS+1))/3)" + log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt ${CI_FIX_ATTEMPTS}/3)" READY_ISSUE="$ISSUE_NUM" - READY_PR_FOR_INCREMENT="$EXISTING_PR" break else @@ -471,11 +504,6 @@ fi # ============================================================================= # LAUNCH: start dev-agent for the ready issue # ============================================================================= -# Deferred CI fix increment — only now that we're actually launching -if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then - ci_fix_increment "$READY_PR_FOR_INCREMENT" -fi - log "launching dev-agent for #${READY_ISSUE}" matrix_send "dev" "🚀 Starting dev-agent on issue #${READY_ISSUE}" 2>/dev/null || true rm -f "$PREFLIGHT_RESULT" From 4dc64ea65bac0d6935c79bb292a24eee6e9058d1 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 18 Mar 2026 10:34:41 +0000 Subject: [PATCH 2/2] fix: restore deferred increment for backlog path to prevent counter leak The previous commit introduced a counter leak in the backlog scan path: handle_ci_exhaustion (without check_only) atomically incremented the CI fix counter before the WAITING_PRS guard, so an exit 0 that never spawned a dev-agent would silently consume one of the three allowed fix attempts. Restore the READY_PR_FOR_INCREMENT / deferred-increment mechanism: - Backlog scan calls handle_ci_exhaustion with "check_only" (read-only, no increment) to detect exhaustion without touching the counter. - The counter is bumped atomically at LAUNCH time via handle_ci_exhaustion (without check_only), so the increment only happens when we are certain a dev-agent is being spawned. If a concurrent poller already exhausted the counter between scan and launch, the LAUNCH call returns 0 and we bail out cleanly without double-spawning. The in-progress, stuck-PR, and try_merge_or_rebase paths are unaffected: they call handle_ci_exhaustion without check_only, which continues to use the atomic ci_fix_check_and_increment to prevent concurrent double-spawning. Co-Authored-By: Claude Sonnet 4.6 --- dev/dev-poll.sh | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/dev/dev-poll.sh b/dev/dev-poll.sh index c401155..222fab5 100755 --- a/dev/dev-poll.sh +++ b/dev/dev-poll.sh @@ -92,12 +92,16 @@ sys.exit(1) # ============================================================================= # HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites) -# Atomically checks and increments the fix counter via ci_fix_check_and_increment; -# sets CI_FIX_ATTEMPTS to the new count for caller use. -# Returns 0 if exhausted, 1 if not. +# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not. +# +# Pass "check_only" as third arg to skip the atomic increment (backlog scan +# path). The caller must then call handle_ci_exhaustion again at launch time +# (without check_only) to atomically increment and confirm the decision. # ============================================================================= handle_ci_exhaustion() { - local pr_num="$1" issue_num="$2" result + local pr_num="$1" issue_num="$2" + local check_only="${3:-}" + local result # Fast path: already in the escalation file — skip without touching counter. if is_escalated "$issue_num" "$pr_num"; then @@ -106,6 +110,24 @@ handle_ci_exhaustion() { return 0 fi + if [ "$check_only" = "check_only" ]; then + # Read-only path for the backlog scan: do not increment here — the counter + # is bumped atomically at LAUNCH time so a WAITING_PRS exit cannot waste an + # attempt on a poll cycle that never spawns a dev-agent. + CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num") + if [ "$CI_FIX_ATTEMPTS" -ge 3 ]; then + log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping" + if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then + echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ + >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" + matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true + ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert + fi + return 0 + fi + return 1 + fi + # Single flock-protected call: read + threshold-check + conditional increment. # Prevents two concurrent pollers from both passing the threshold and spawning # duplicate dev-agents for the same PR. @@ -469,12 +491,13 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do break elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then - if handle_ci_exhaustion "$EXISTING_PR" "$ISSUE_NUM"; then + if handle_ci_exhaustion "$EXISTING_PR" "$ISSUE_NUM" "check_only"; then # Don't add to WAITING_PRS — escalated PRs should not block new work continue fi - log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt ${CI_FIX_ATTEMPTS}/3)" + log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt $((CI_FIX_ATTEMPTS+1))/3)" READY_ISSUE="$ISSUE_NUM" + READY_PR_FOR_INCREMENT="$EXISTING_PR" break else @@ -504,6 +527,17 @@ fi # ============================================================================= # LAUNCH: start dev-agent for the ready issue # ============================================================================= +# Deferred CI fix increment — only now that we're certain we are launching. +# Uses the atomic ci_fix_check_and_increment (inside handle_ci_exhaustion) so +# the counter is bumped exactly once even under concurrent poll invocations, +# and a WAITING_PRS exit above cannot silently consume a fix attempt. +if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then + if handle_ci_exhaustion "$READY_PR_FOR_INCREMENT" "$READY_ISSUE"; then + # exhausted (another poller incremented between scan and launch) — bail out + exit 0 + fi +fi + log "launching dev-agent for #${READY_ISSUE}" matrix_send "dev" "🚀 Starting dev-agent on issue #${READY_ISSUE}" 2>/dev/null || true rm -f "$PREFLIGHT_RESULT"