fix: Escalation write-once guard is not atomic (pre-existing) (#154)

- `ci_fix_check_and_increment` now accepts an optional `check_only` arg:
  - count < 3, check_only: returns `ok:N` without incrementing (deferred
    to launch time, preserving the WAITING_PRS protection)
  - count < 3, non-check_only: increments and returns `ok:N` (unchanged)
  - count == 3 (any mode): atomically bumps to 4 and returns
    `exhausted_first_time:3` — only one concurrent poller can win this
  - count > 3 (any mode): returns `exhausted:N` with no write

- `handle_ci_exhaustion` unified to a single code path for both
  check_only and non-check_only:
  - Writes escalation JSONL + matrix_send only when sentinel is
    `exhausted_first_time` — never on a bare integer comparison outside
    a lock
  - Removes the two separate `ci_fix_increment` bump-to-4 calls that
    were racy (the sentinel bump is now inside the flock in Python)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-18 11:44:30 +00:00
parent 9f44c512b6
commit d904192ab7

View file

@ -49,13 +49,21 @@ json.dump(d,open(f,'w'))
}
ci_fix_check_and_increment() {
local pr="$1"
local check_only="${2:-}"
flock "$CI_FIX_LOCK" python3 -c "
import json,os
f='$CI_FIX_TRACKER'
check_only = '${check_only}' == 'check_only'
d=json.load(open(f)) if os.path.exists(f) else {}
count=d.get(str($pr),0)
if count>=3:
if count>3:
print('exhausted:'+str(count))
elif count==3:
d[str($pr)]=4
json.dump(d,open(f,'w'))
print('exhausted_first_time:3')
elif check_only:
print('ok:'+str(count))
else:
count+=1
d[str($pr)]=count
@ -94,9 +102,11 @@ sys.exit(1)
# HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites)
# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
#
# Pass "check_only" as third arg to skip the atomic increment (backlog scan
# path). The caller must then call handle_ci_exhaustion again at launch time
# (without check_only) to atomically increment and confirm the decision.
# Pass "check_only" as third arg for the backlog scan path: ok-counts are
# returned without incrementing (deferred to launch time so a WAITING_PRS
# exit cannot waste a fix attempt). The 3→4 sentinel bump is always atomic
# regardless of mode, preventing duplicate escalation writes from concurrent
# pollers.
# =============================================================================
handle_ci_exhaustion() {
local pr_num="$1" issue_num="$2"
@ -110,49 +120,33 @@ handle_ci_exhaustion() {
return 0
fi
if [ "$check_only" = "check_only" ]; then
# Read-only path for the backlog scan: do not increment here — the counter
# is bumped atomically at LAUNCH time so a WAITING_PRS exit cannot waste an
# attempt on a poll cycle that never spawns a dev-agent.
CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
if [ "$CI_FIX_ATTEMPTS" -ge 3 ]; then
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert
fi
return 0
fi
return 1
fi
# Single flock-protected call: read + threshold-check + conditional increment.
# Prevents two concurrent pollers from both passing the threshold and spawning
# duplicate dev-agents for the same PR.
result=$(ci_fix_check_and_increment "$pr_num")
# Single flock-protected call: read + threshold-check + conditional bump.
# In check_only mode, ok-counts are returned without incrementing (deferred
# to launch time). In both modes, the 3→4 sentinel bump is atomic, so only
# one concurrent poller can ever receive exhausted_first_time:3 and write
# the escalation entry.
result=$(ci_fix_check_and_increment "$pr_num" "$check_only")
case "$result" in
ok:*)
CI_FIX_ATTEMPTS="${result#ok:}"
return 1
;;
exhausted_first_time:*)
CI_FIX_ATTEMPTS="${result#exhausted_first_time:}"
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
;;
exhausted:*)
CI_FIX_ATTEMPTS="${result#exhausted:}"
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
;;
*)
CI_FIX_ATTEMPTS=99
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
;;
esac
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
# Only write escalation + alert once (first time hitting 3)
if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert
fi
return 0
}