fix: Escalation write-once guard is not atomic (pre-existing) (#154)
- `ci_fix_check_and_increment` now accepts an optional `check_only` arg:
- count < 3, check_only: returns `ok:N` without incrementing (deferred
to launch time, preserving the WAITING_PRS protection)
- count < 3, non-check_only: increments and returns `ok:N` (unchanged)
- count == 3 (any mode): atomically bumps to 4 and returns
`exhausted_first_time:3` — only one concurrent poller can win this
- count > 3 (any mode): returns `exhausted:N` with no write
- `handle_ci_exhaustion` unified to a single code path for both
check_only and non-check_only:
- Writes escalation JSONL + matrix_send only when sentinel is
`exhausted_first_time` — never on a bare integer comparison outside
a lock
- Removes the two separate `ci_fix_increment` bump-to-4 calls that
were racy (the sentinel bump is now inside the flock in Python)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
9f44c512b6
commit
d904192ab7
1 changed files with 29 additions and 35 deletions
|
|
@ -49,13 +49,21 @@ json.dump(d,open(f,'w'))
|
||||||
}
|
}
|
||||||
ci_fix_check_and_increment() {
|
ci_fix_check_and_increment() {
|
||||||
local pr="$1"
|
local pr="$1"
|
||||||
|
local check_only="${2:-}"
|
||||||
flock "$CI_FIX_LOCK" python3 -c "
|
flock "$CI_FIX_LOCK" python3 -c "
|
||||||
import json,os
|
import json,os
|
||||||
f='$CI_FIX_TRACKER'
|
f='$CI_FIX_TRACKER'
|
||||||
|
check_only = '${check_only}' == 'check_only'
|
||||||
d=json.load(open(f)) if os.path.exists(f) else {}
|
d=json.load(open(f)) if os.path.exists(f) else {}
|
||||||
count=d.get(str($pr),0)
|
count=d.get(str($pr),0)
|
||||||
if count>=3:
|
if count>3:
|
||||||
print('exhausted:'+str(count))
|
print('exhausted:'+str(count))
|
||||||
|
elif count==3:
|
||||||
|
d[str($pr)]=4
|
||||||
|
json.dump(d,open(f,'w'))
|
||||||
|
print('exhausted_first_time:3')
|
||||||
|
elif check_only:
|
||||||
|
print('ok:'+str(count))
|
||||||
else:
|
else:
|
||||||
count+=1
|
count+=1
|
||||||
d[str($pr)]=count
|
d[str($pr)]=count
|
||||||
|
|
@ -94,9 +102,11 @@ sys.exit(1)
|
||||||
# HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites)
|
# HELPER: handle CI-exhaustion check/escalate (DRY for 3 call sites)
|
||||||
# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
|
# Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
|
||||||
#
|
#
|
||||||
# Pass "check_only" as third arg to skip the atomic increment (backlog scan
|
# Pass "check_only" as third arg for the backlog scan path: ok-counts are
|
||||||
# path). The caller must then call handle_ci_exhaustion again at launch time
|
# returned without incrementing (deferred to launch time so a WAITING_PRS
|
||||||
# (without check_only) to atomically increment and confirm the decision.
|
# exit cannot waste a fix attempt). The 3→4 sentinel bump is always atomic
|
||||||
|
# regardless of mode, preventing duplicate escalation writes from concurrent
|
||||||
|
# pollers.
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
handle_ci_exhaustion() {
|
handle_ci_exhaustion() {
|
||||||
local pr_num="$1" issue_num="$2"
|
local pr_num="$1" issue_num="$2"
|
||||||
|
|
@ -110,49 +120,33 @@ handle_ci_exhaustion() {
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$check_only" = "check_only" ]; then
|
# Single flock-protected call: read + threshold-check + conditional bump.
|
||||||
# Read-only path for the backlog scan: do not increment here — the counter
|
# In check_only mode, ok-counts are returned without incrementing (deferred
|
||||||
# is bumped atomically at LAUNCH time so a WAITING_PRS exit cannot waste an
|
# to launch time). In both modes, the 3→4 sentinel bump is atomic, so only
|
||||||
# attempt on a poll cycle that never spawns a dev-agent.
|
# one concurrent poller can ever receive exhausted_first_time:3 and write
|
||||||
CI_FIX_ATTEMPTS=$(ci_fix_count "$pr_num")
|
# the escalation entry.
|
||||||
if [ "$CI_FIX_ATTEMPTS" -ge 3 ]; then
|
result=$(ci_fix_check_and_increment "$pr_num" "$check_only")
|
||||||
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
|
|
||||||
if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
|
|
||||||
echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
|
||||||
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
|
|
||||||
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
|
|
||||||
ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert
|
|
||||||
fi
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Single flock-protected call: read + threshold-check + conditional increment.
|
|
||||||
# Prevents two concurrent pollers from both passing the threshold and spawning
|
|
||||||
# duplicate dev-agents for the same PR.
|
|
||||||
result=$(ci_fix_check_and_increment "$pr_num")
|
|
||||||
case "$result" in
|
case "$result" in
|
||||||
ok:*)
|
ok:*)
|
||||||
CI_FIX_ATTEMPTS="${result#ok:}"
|
CI_FIX_ATTEMPTS="${result#ok:}"
|
||||||
return 1
|
return 1
|
||||||
;;
|
;;
|
||||||
|
exhausted_first_time:*)
|
||||||
|
CI_FIX_ATTEMPTS="${result#exhausted_first_time:}"
|
||||||
|
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
|
||||||
|
echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
||||||
|
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
|
||||||
|
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
|
||||||
|
;;
|
||||||
exhausted:*)
|
exhausted:*)
|
||||||
CI_FIX_ATTEMPTS="${result#exhausted:}"
|
CI_FIX_ATTEMPTS="${result#exhausted:}"
|
||||||
|
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
CI_FIX_ATTEMPTS=99
|
CI_FIX_ATTEMPTS=99
|
||||||
|
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
log "PR #${pr_num} (issue #${issue_num}) CI exhausted (${CI_FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
|
|
||||||
# Only write escalation + alert once (first time hitting 3)
|
|
||||||
if [ "$CI_FIX_ATTEMPTS" -eq 3 ]; then
|
|
||||||
echo "{\"issue\":${issue_num},\"pr\":${pr_num},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${CI_FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
|
|
||||||
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
|
|
||||||
matrix_send "dev" "🚨 PR #${pr_num} (issue #${issue_num}) CI failed after ${CI_FIX_ATTEMPTS} attempts — escalated" 2>/dev/null || true
|
|
||||||
ci_fix_increment "$pr_num" # bump to 4 to prevent re-alert
|
|
||||||
fi
|
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue