fix: address review findings from issue #67 escalation refactor

- supervisor: skip *.done.jsonl in escalation glob (bug: wildcard matched
  harb.done.jsonl producing spurious 'pending' log noise every cycle)
- supervisor: use wc -l instead of grep -c . for line counting (style nit;
  note that wc -l counts blank lines too, whereas grep -c . skipped them)
- supervisor: consume gardener-esc-resolved.log via fixed() so escalation
  resolutions appear in end-of-cycle supervisor reporting
- dev-poll: update all 'escalated to supervisor' log/matrix strings to
  'escalated to gardener' (lines 263, 268, 344, 420)
- gardener: track _esc_total_created across all escalation entries and
  write count to supervisor/gardener-esc-resolved.log after processing

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-17 18:30:57 +00:00
parent 150ede5605
commit df2522a7cb
3 changed files with 25 additions and 5 deletions

View file

@ -260,12 +260,12 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
FIX_ATTEMPTS=$(ci_fix_count "$HAS_PR") FIX_ATTEMPTS=$(ci_fix_count "$HAS_PR")
if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$ISSUE_NUM" "$HAS_PR"; then if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$ISSUE_NUM" "$HAS_PR"; then
# Already escalated — skip silently, let pipeline continue to backlog # Already escalated — skip silently, let pipeline continue to backlog
log "issue #${ISSUE_NUM} PR #${HAS_PR} CI exhausted (${FIX_ATTEMPTS} attempts) — escalated to supervisor, skipping" log "issue #${ISSUE_NUM} PR #${HAS_PR} CI exhausted (${FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
# Only write escalation + alert once (first time hitting 3) # Only write escalation + alert once (first time hitting 3)
if [ "$FIX_ATTEMPTS" -eq 3 ]; then if [ "$FIX_ATTEMPTS" -eq 3 ]; then
echo "{\"issue\":${ISSUE_NUM},\"pr\":${HAS_PR},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ echo "{\"issue\":${ISSUE_NUM},\"pr\":${HAS_PR},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
matrix_send "dev" "🚨 PR #${HAS_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated to supervisor" 2>/dev/null || true matrix_send "dev" "🚨 PR #${HAS_PR} (issue #${ISSUE_NUM}) CI failed after ${FIX_ATTEMPTS} attempts — escalated to gardener" 2>/dev/null || true
ci_fix_increment "$HAS_PR" # bump to 4 so we don't re-alert ci_fix_increment "$HAS_PR" # bump to 4 so we don't re-alert
fi fi
# Fall through to backlog scan instead of exit # Fall through to backlog scan instead of exit
@ -341,7 +341,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
FIX_ATTEMPTS=$(ci_fix_count "$PR_NUM") FIX_ATTEMPTS=$(ci_fix_count "$PR_NUM")
if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$STUCK_ISSUE" "$PR_NUM"; then if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$STUCK_ISSUE" "$PR_NUM"; then
# Already escalated — skip to let pipeline continue # Already escalated — skip to let pipeline continue
log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI exhausted (${FIX_ATTEMPTS} attempts) — escalated to supervisor, skipping" log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI exhausted (${FIX_ATTEMPTS} attempts) — escalated to gardener, skipping"
if [ "$FIX_ATTEMPTS" -eq 3 ]; then if [ "$FIX_ATTEMPTS" -eq 3 ]; then
echo "{\"issue\":${STUCK_ISSUE},\"pr\":${PR_NUM},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ echo "{\"issue\":${STUCK_ISSUE},\"pr\":${PR_NUM},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
@ -417,7 +417,7 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then elif ! ci_passed "$CI_STATE" && [ "$CI_STATE" != "" ] && [ "$CI_STATE" != "pending" ] && [ "$CI_STATE" != "unknown" ]; then
FIX_ATTEMPTS=$(ci_fix_count "$EXISTING_PR") FIX_ATTEMPTS=$(ci_fix_count "$EXISTING_PR")
if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$ISSUE_NUM" "$EXISTING_PR"; then if [ "$FIX_ATTEMPTS" -ge 3 ] || is_escalated "$ISSUE_NUM" "$EXISTING_PR"; then
log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — escalated to supervisor, skipping (not blocking pipeline)" log "#${ISSUE_NUM} PR #${EXISTING_PR} CI failed — escalated to gardener, skipping (not blocking pipeline)"
if [ "$FIX_ATTEMPTS" -eq 3 ]; then if [ "$FIX_ATTEMPTS" -eq 3 ]; then
echo "{\"issue\":${ISSUE_NUM},\"pr\":${EXISTING_PR},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ echo "{\"issue\":${ISSUE_NUM},\"pr\":${EXISTING_PR},\"project\":\"${PROJECT_NAME}\",\"reason\":\"ci_exhausted_poll\",\"attempts\":${FIX_ATTEMPTS},\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
>> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"

View file

@ -318,6 +318,7 @@ if [ -s "$ESCALATION_FILE" ]; then
ESCALATION_COUNT=$(wc -l < "$ESCALATION_SNAP") ESCALATION_COUNT=$(wc -l < "$ESCALATION_SNAP")
log "Processing ${ESCALATION_COUNT} escalation(s) for ${PROJECT_NAME}" log "Processing ${ESCALATION_COUNT} escalation(s) for ${PROJECT_NAME}"
_esc_total_created=0
while IFS= read -r esc_entry; do while IFS= read -r esc_entry; do
[ -z "$esc_entry" ] && continue [ -z "$esc_entry" ] && continue
@ -402,6 +403,7 @@ Fix all ShellCheck errors${sc_codes:+ (${sc_codes})} in \`${sc_file}\` so PR #${
if [ -n "$new_issue" ]; then if [ -n "$new_issue" ]; then
log "Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from #${ESC_ISSUE})" log "Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from #${ESC_ISSUE})"
ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1)) ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
_esc_total_created=$((_esc_total_created + 1))
matrix_send "gardener" "📋 Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from escalated #${ESC_ISSUE})" 2>/dev/null || true matrix_send "gardener" "📋 Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
fi fi
done <<< "$sc_files" done <<< "$sc_files"
@ -447,6 +449,7 @@ ${ESC_GENERIC_FAIL}
if [ -n "$new_issue" ]; then if [ -n "$new_issue" ]; then
log "Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from #${ESC_ISSUE})" log "Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from #${ESC_ISSUE})"
ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1)) ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
_esc_total_created=$((_esc_total_created + 1))
matrix_send "gardener" "📋 Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true matrix_send "gardener" "📋 Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
fi fi
fi fi
@ -479,6 +482,7 @@ Check PR #${ESC_PR} CI output, identify the failing checks, and fix them so the
if [ -n "$new_issue" ]; then if [ -n "$new_issue" ]; then
log "Created fallback sub-issue #${new_issue} for escalated #${ESC_ISSUE}" log "Created fallback sub-issue #${new_issue} for escalated #${ESC_ISSUE}"
_esc_total_created=$((_esc_total_created + 1))
matrix_send "gardener" "📋 Created sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true matrix_send "gardener" "📋 Created sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
fi fi
fi fi
@ -489,6 +493,12 @@ Check PR #${ESC_PR} CI output, identify the failing checks, and fix them so the
rm -f "$ESCALATION_SNAP" rm -f "$ESCALATION_SNAP"
log "Escalations processed — moved to $(basename "$ESCALATION_DONE")" log "Escalations processed — moved to $(basename "$ESCALATION_DONE")"
# Hand this run's sub-issue tally to the supervisor: one "<count> <project>"
# line is appended to the shared log, and nothing is written when no
# sub-issue was created.
_esc_resolved_log="${FACTORY_ROOT}/supervisor/gardener-esc-resolved.log"
if [ "${_esc_total_created:-0}" -gt 0 ]; then
  printf '%d %s\n' "$_esc_total_created" "$PROJECT_NAME" >> "$_esc_resolved_log"
fi
fi fi
log "--- Gardener poll done ---" log "--- Gardener poll done ---"

View file

@ -219,13 +219,23 @@ done
# Report pending escalations (processing has moved to gardener-poll.sh per-project) # Report pending escalations (processing has moved to gardener-poll.sh per-project)
for _esc_file in "${FACTORY_ROOT}/supervisor/escalations-"*.jsonl; do for _esc_file in "${FACTORY_ROOT}/supervisor/escalations-"*.jsonl; do
[ -f "$_esc_file" ] || continue [ -f "$_esc_file" ] || continue
_esc_count=$(grep -c . "$_esc_file" 2>/dev/null || true) [[ "$_esc_file" == *.done.jsonl ]] && continue
_esc_count=$(wc -l < "$_esc_file" 2>/dev/null || true)
[ "${_esc_count:-0}" -gt 0 ] || continue [ "${_esc_count:-0}" -gt 0 ] || continue
_esc_proj=$(basename "$_esc_file" .jsonl) _esc_proj=$(basename "$_esc_file" .jsonl)
_esc_proj="${_esc_proj#escalations-}" _esc_proj="${_esc_proj#escalations-}"
flog "${_esc_proj}: ${_esc_count} escalation(s) pending (gardener will process)" flog "${_esc_proj}: ${_esc_count} escalation(s) pending (gardener will process)"
done done
# Pick up escalation resolutions handled by gardener.
# Each line of the log is "<count> <project>"; every non-empty line is
# forwarded to fixed(). A missing project field is reported as "unknown".
_gesc_log="${FACTORY_ROOT}/supervisor/gardener-esc-resolved.log"
_gesc_snap="${_gesc_log}.snap.$$"
# Rename the log to a private snapshot BEFORE reading it. Reading in place and
# deleting afterwards would silently discard any line appended between the
# read and the rm; after the rename, new appends land in a fresh log that the
# next cycle picks up.
if [ -f "$_gesc_log" ] && mv -f -- "$_gesc_log" "$_gesc_snap" 2>/dev/null; then
  # The `|| [ -n ... ]` keeps a final line that lacks a trailing newline.
  while IFS=' ' read -r _gn _gp || [ -n "${_gn:-}" ]; do
    [ -n "${_gn:-}" ] || continue
    fixed "${_gp:-unknown}: gardener created ${_gn} sub-issue(s) from escalations"
  done < "$_gesc_snap"
  rm -f -- "$_gesc_snap"
fi
# ############################################################################# # #############################################################################
# LAYER 2: PER-PROJECT CHECKS # LAYER 2: PER-PROJECT CHECKS
# (iterated over projects/*.toml, config-driven) # (iterated over projects/*.toml, config-driven)