From 7bf13567fdba0ef77d554299662ab8868420a435 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 18 Mar 2026 10:22:24 +0000
Subject: [PATCH 1/2] fix: TOCTOU in handle_ci_exhaustion: check-then-act not
 atomic (#125)

Add ci_fix_check_and_increment() that performs read + threshold-check +
conditional increment in a single flock-protected Python call, replacing
the prior three-step sequence (ci_fix_count / bash check / ci_fix_increment)
that allowed two concurrent poll invocations to both pass the threshold and
spawn duplicate dev-agents for the same PR.

handle_ci_exhaustion now calls ci_fix_check_and_increment atomically and
returns the new count in CI_FIX_ATTEMPTS; all separate ci_fix_increment
calls after handle_ci_exhaustion (including the deferred READY_PR_FOR_INCREMENT
mechanism) are removed. Log messages updated from CI_FIX_ATTEMPTS+1 to
CI_FIX_ATTEMPTS to reflect the post-increment count.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 dev/dev-poll.sh | 82 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/dev/dev-poll.sh b/dev/dev-poll.sh
index 7a39707..c401155 100755
--- a/dev/dev-poll.sh
+++ b/dev/dev-poll.sh
@@ -47,6 +47,22 @@ d.pop(str($pr),None)
 json.dump(d,open(f,'w'))
 " 2>/dev/null || true
 }
+ci_fix_check_and_increment() {
+  local pr="$1"
+  flock "$CI_FIX_LOCK" python3 -c "
+import json,os
+f='$CI_FIX_TRACKER'
+d=json.load(open(f)) if os.path.exists(f) else {}
+count=d.get(str($pr),0)
+if count>=3:
+    print('exhausted:'+str(count))
+else:
+    count+=1
+    d[str($pr)]=count
+    json.dump(d,open(f,'w'))
+    print('ok:'+str(count))
+" 2>/dev/null || echo "exhausted:99"
+}
 
 # Check whether an issue/PR has been escalated (unprocessed or processed)
 is_escalated() {
@@ -76,25 +92,46 @@ sys.exit(1)
 
 # =============================================================================
 # HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites)
-# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
+# Atomically checks and increments the fix counter via ci_fix_check_and_increment;
+# sets CI_FIX_ATTEMPTS to the new count for caller use.
+# Returns 0 if exhausted, 1 if not.
 # =============================================================================
 handle_ci_exhaustion() {
-  local pr_num="$1" issue_num="$2"
-  CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
+  local pr_num="$1" issue_num="$2" result
 
-  if [ "$CI_FIX_ATTEMPTS" -ge 3 ] || is_escalated "$issue_num" "$pr_num"; then
-    log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
-    # Only write escalation + alert once (first time hitting 3)
-    if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
-      echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
-        >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
-      matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
-      ci_fix_increment "$pr_num"  # bump to 4 to prevent re-alert
-    fi
+  # Fast path: already in the escalation file — skip without touching counter.
+  if is_escalated "$issue_num" "$pr_num"; then
+    CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
+    log "PR #${pr_num} (issue #${issue_num}) already escalated (${CI_FIX_ATTEMPTS} attempts) — skipping"
     return 0
   fi
 
-  return 1
+  # Single flock-protected call: read + threshold-check + conditional increment.
+  # Prevents two concurrent pollers from both passing the threshold and spawning
+  # duplicate dev-agents for the same PR.
+  result=$(ci_fix_check_and_increment "$pr_num")
+  case "$result" in
+    ok:*)
+      CI_FIX_ATTEMPTS="${result#ok:}"
+      return 1
+      ;;
+    exhausted:*)
+      CI_FIX_ATTEMPTS="${result#exhausted:}"
+      ;;
+    *)
+      CI_FIX_ATTEMPTS=99
+      ;;
+  esac
+
+  log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
+  # Only write escalation + alert once (first time hitting 3)
+  if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
+    echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
+      >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
+    matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
+    ci_fix_increment "$pr_num"  # bump to 4 to prevent re-alert
+  fi
+  return 0
 }
 
 # shellcheck disable=SC2034
@@ -150,9 +187,8 @@ try_merge_or_rebase() {
       log "PR #${pr_num} rebase failed — CI exhausted, not spawning"
       return 1
     fi
-    ci_fix_increment "$pr_num"
-    log "PR #${pr_num} rebase failed — spawning dev-agent to fix (attempt $((CI_FIX_ATTEMPTS+1))/3)"
-    matrix_send "dev" "❌ PR #${pr_num} rebase failed — spawning dev-agent (attempt $((CI_FIX_ATTEMPTS+1))/3)" 2>/dev/null || true
+    log "PR #${pr_num} rebase failed — spawning dev-agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)"
+    matrix_send "dev" "❌ PR #${pr_num} rebase failed — spawning dev-agent (attempt ${CI_FIX_ATTEMPTS}/3)" 2>/dev/null || true
     nohup "${SCRIPT_DIR}/dev-agent.sh" "$issue_num" >> "$LOGFILE" 2>&1 &
     log "started dev-agent PID $! for PR #${pr_num} (rebase fix)"
   fi
@@ -291,8 +327,7 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
         # Fall through to backlog scan instead of exit
         :
       else
-        ci_fix_increment "$HAS_PR"
-        log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt $((CI_FIX_ATTEMPTS+1))/3)"
+        log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)"
         nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
         log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
         exit 0
@@ -362,8 +397,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
     if handle_ci_exhaustion "$PR_NUM" "$STUCK_ISSUE"; then
       continue  # skip this PR, check next stuck PR or fall through to backlog
     else
-      ci_fix_increment "$PR_NUM"
-      log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt $((CI_FIX_ATTEMPTS+1))/3)"
+      log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt ${CI_FIX_ATTEMPTS}/3)"
       nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
       log "started dev-agent PID $! for stuck PR #${PR_NUM}"
       exit 0
@@ -439,9 +473,8 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
         # Don't add to WAITING_PRS — escalated PRs should not block new work
         continue
       fi
-      log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt $((CI_FIX_ATTEMPTS+1))/3)"
+      log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt ${CI_FIX_ATTEMPTS}/3)"
       READY_ISSUE="$ISSUE_NUM"
-      READY_PR_FOR_INCREMENT="$EXISTING_PR"
       break
 
     else
@@ -471,11 +504,6 @@ fi
 # =============================================================================
 # LAUNCH: start dev-agent for the ready issue
 # =============================================================================
-# Deferred CI fix increment — only now that we're actually launching
-if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then
-  ci_fix_increment "$READY_PR_FOR_INCREMENT"
-fi
-
 log "launching dev-agent for #${READY_ISSUE}"
 matrix_send "dev" "🚀 Starting dev-agent on issue #${READY_ISSUE}" 2>/dev/null || true
 rm -f "$PREFLIGHT_RESULT"

From 4dc64ea65bac0d6935c79bb292a24eee6e9058d1 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 18 Mar 2026 10:34:41 +0000
Subject: [PATCH 2/2] fix: restore deferred increment for backlog path to
 prevent counter leak

The previous commit introduced a counter leak in the backlog scan path:
handle_ci_exhaustion (without check_only) atomically incremented the CI
fix counter before the WAITING_PRS guard, so an exit 0 that never spawned
a dev-agent would silently consume one of the three allowed fix attempts.

Restore the READY_PR_FOR_INCREMENT / deferred-increment mechanism:
- Backlog scan calls handle_ci_exhaustion with "check_only" (read-only,
  no increment) to detect exhaustion without touching the counter.
- The counter is bumped atomically at LAUNCH time via handle_ci_exhaustion
  (without check_only), so the increment only happens when we are certain
  a dev-agent is being spawned. If a concurrent poller already exhausted
  the counter between scan and launch, the LAUNCH call returns 0 and we
  bail out cleanly without double-spawning.

The in-progress, stuck-PR, and try_merge_or_rebase paths are unaffected:
they call handle_ci_exhaustion without check_only, which continues to use
the atomic ci_fix_check_and_increment to prevent concurrent double-spawning.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 dev/dev-poll.sh | 46 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/dev/dev-poll.sh b/dev/dev-poll.sh
index c401155..222fab5 100755
--- a/dev/dev-poll.sh
+++ b/dev/dev-poll.sh
@@ -92,12 +92,16 @@ sys.exit(1)
 
 # =============================================================================
 # HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites)
-# Atomically checks and increments the fix counter via ci_fix_check_and_increment;
-# sets CI_FIX_ATTEMPTS to the new count for caller use.
-# Returns 0 if exhausted, 1 if not.
+# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
+#
+# Pass "check_only" as third arg to skip the atomic increment (backlog scan
+# path). The caller must then call handle_ci_exhaustion again at launch time
+# (without check_only) to atomically increment and confirm the decision.
 # =============================================================================
 handle_ci_exhaustion() {
-  local pr_num="$1" issue_num="$2" result
+  local pr_num="$1" issue_num="$2"
+  local check_only="${3:-}"
+  local result
 
   # Fast path: already in the escalation file — skip without touching counter.
   if is_escalated "$issue_num" "$pr_num"; then
@@ -106,6 +110,24 @@ handle_ci_exhaustion() {
     return 0
   fi
 
+  if [ "$check_only" = "check_only" ]; then
+    # Read-only path for the backlog scan: do not increment here — the counter
+    # is bumped atomically at LAUNCH time so a WAITING_PRS exit cannot waste an
+    # attempt on a poll cycle that never spawns a dev-agent.
+    CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
+    if [ "$CI_FIX_ATTEMPTS" -ge 3 ]; then
+      log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
+      if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
+        echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
+          >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
+        matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
+        ci_fix_increment "$pr_num"  # bump to 4 to prevent re-alert
+      fi
+      return 0
+    fi
+    return 1
+  fi
+
   # Single flock-protected call: read + threshold-check + conditional increment.
   # Prevents two concurrent pollers from both passing the threshold and spawning
   # duplicate dev-agents for the same PR.
@@ -469,12 +491,13 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
       break
 
     elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
-      if handle_ci_exhaustion "$EXISTING_PR" "$ISSUE_NUM"; then
+      if handle_ci_exhaustion "$EXISTING_PR" "$ISSUE_NUM" "check_only"; then
         # Don't add to WAITING_PRS — escalated PRs should not block new work
         continue
       fi
-      log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt ${CI_FIX_ATTEMPTS}/3)"
+      log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — picking up (attempt $((CI_FIX_ATTEMPTS+1))/3)"
       READY_ISSUE="$ISSUE_NUM"
+      READY_PR_FOR_INCREMENT="$EXISTING_PR"
       break
 
     else
@@ -504,6 +527,17 @@ fi
 # =============================================================================
 # LAUNCH: start dev-agent for the ready issue
 # =============================================================================
+# Deferred CI fix increment — only now that we're certain we are launching.
+# Uses the atomic ci_fix_check_and_increment (inside handle_ci_exhaustion) so
+# the counter is bumped exactly once even under concurrent poll invocations,
+# and a WAITING_PRS exit above cannot silently consume a fix attempt.
+if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then
+  if handle_ci_exhaustion "$READY_PR_FOR_INCREMENT" "$READY_ISSUE"; then
+    # exhausted (another poller incremented between scan and launch) — bail out
+    exit 0
+  fi
+fi
+
 log "launching dev-agent for #${READY_ISSUE}"
 matrix_send "dev" "🚀 Starting dev-agent on issue #${READY_ISSUE}" 2>/dev/null || true
 rm -f "$PREFLIGHT_RESULT"