fix: refactor: replace escalation JSONL with blocked label + diagnostic comment (#352)
Replace the unreliable escalation JSONL system (supervisor/escalations-*.jsonl, consumed by the gardener) with a direct "blocked" label plus a diagnostic comment on the original issue.

When a dev-agent or action-agent session fails (PHASE:failed, idle timeout, crash, CI exhausted):
- Capture the last 50 lines from the tmux pane via tmux capture-pane
- Post a structured diagnostic comment on the issue (exit reason, timestamp, PR number, tmux output)
- Label the issue "blocked" (instead of restoring "backlog")
- Remove the in-progress label

Removed:
- Escalation JSONL write paths in dev-agent.sh, phase-handler.sh, dev-poll.sh, action-agent.sh
- is_escalated() helper in dev-poll.sh
- Escalation triage (P2f section) in supervisor-poll.sh
- Escalation processing + recipe engine in gardener-poll.sh
- ci-escalation-recipes step from the run-gardener.toml formula
- escalations*.jsonl from .gitignore

Added:
- post_blocked_diagnostic() shared helper in phase-handler.sh
- ensure_blocked_label_id() helper (creates the label via the API if it does not exist)
- is_blocked() helper in dev-poll.sh (replaces is_escalated)
- Blocked issues listing in supervisor/preflight.sh

Kept:
- Matrix notifications on failure (unchanged)
- CI fix counter logic (still tracks attempts)
- needs_human injection in supervisor/gardener (not escalation-related)
- Gardener grooming (gardener-agent.sh is still invoked)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0109f0b0c3
commit
61c44d31b1
10 changed files with 181 additions and 990 deletions
|
|
@ -231,26 +231,6 @@ for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
|
|||
fi
|
||||
done
|
||||
|
||||
# Report pending escalations (processing has moved to gardener-poll.sh per-project)
|
||||
for _esc_file in "${FACTORY_ROOT}/supervisor/escalations-"*.jsonl; do
|
||||
[ -f "$_esc_file" ] || continue
|
||||
[[ "$_esc_file" == *.done.jsonl ]] && continue
|
||||
_esc_count=$(wc -l < "$_esc_file" 2>/dev/null || true)
|
||||
[ "${_esc_count:-0}" -gt 0 ] || continue
|
||||
_esc_proj=$(basename "$_esc_file" .jsonl)
|
||||
_esc_proj="${_esc_proj#escalations-}"
|
||||
flog "${_esc_proj}: ${_esc_count} escalation(s) pending (gardener will process)"
|
||||
done
|
||||
|
||||
# Pick up escalation resolutions handled by gardener
|
||||
_gesc_log="${FACTORY_ROOT}/supervisor/gardener-esc-resolved.log"
|
||||
if [ -f "$_gesc_log" ]; then
|
||||
while IFS=' ' read -r _gn _gp; do
|
||||
[ -n "${_gn:-}" ] && fixed "${_gp:-unknown}: gardener created ${_gn} sub-issue(s) from escalations"
|
||||
done < "$_gesc_log"
|
||||
rm -f "$_gesc_log"
|
||||
fi
|
||||
|
||||
# #############################################################################
|
||||
# LAYER 2: PER-PROJECT CHECKS
|
||||
# (iterated over projects/*.toml, config-driven)
|
||||
|
|
@ -342,149 +322,6 @@ check_project() {
|
|||
find "$_RETRY_DIR" -type f -mmin +1440 -delete 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# ===========================================================================
|
||||
# P2f: ESCALATION TRIAGE — auto-retrigger ci_exhausted if infra-only
|
||||
# ===========================================================================
|
||||
if [ "${CHECK_INFRA_RETRY:-true}" = "true" ]; then
|
||||
status "P2f: ${proj_name}: triaging ci_exhausted escalations"
|
||||
|
||||
_esc_file="${FACTORY_ROOT}/supervisor/escalations-${proj_name}.jsonl"
|
||||
if [ -f "$_esc_file" ] && [ -s "$_esc_file" ]; then
|
||||
_esc_tmp="${_esc_file}.sup.$$"
|
||||
: > "$_esc_tmp"
|
||||
|
||||
while IFS= read -r _esc_line; do
|
||||
[ -z "$_esc_line" ] && continue
|
||||
|
||||
_esc_reason=$(printf '%s' "$_esc_line" | jq -r '.reason // ""' 2>/dev/null)
|
||||
|
||||
# Only triage ci_exhausted entries (from dev-agent or dev-poll)
|
||||
case "$_esc_reason" in
|
||||
ci_exhausted) ;;
|
||||
ci_exhausted_poll) ;;
|
||||
*) printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue ;;
|
||||
esac
|
||||
|
||||
_esc_pr=$(printf '%s' "$_esc_line" | jq -r '.pr // 0' 2>/dev/null)
|
||||
_esc_issue=$(printf '%s' "$_esc_line" | jq -r '.issue // 0' 2>/dev/null)
|
||||
_esc_ts=$(printf '%s' "$_esc_line" | jq -r '.ts // ""' 2>/dev/null)
|
||||
|
||||
# Validate pr/issue are numeric
|
||||
[[ "$_esc_pr" =~ ^[0-9]+$ ]] || { printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue; }
|
||||
[[ "$_esc_issue" =~ ^[0-9]+$ ]] || { printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue; }
|
||||
|
||||
# Cooldown: 30 min from last escalation timestamp
|
||||
_esc_epoch=0
|
||||
[ -n "$_esc_ts" ] && _esc_epoch=$(date -d "$_esc_ts" +%s 2>/dev/null || echo 0)
|
||||
_esc_age_min=$(( ($(date +%s) - _esc_epoch) / 60 ))
|
||||
|
||||
if [ "$_esc_age_min" -lt 30 ]; then
|
||||
flog "${proj_name}: PR #${_esc_pr} ci_exhausted cooldown (${_esc_age_min}/30min)"
|
||||
printf '%s\n' "$_esc_line" >> "$_esc_tmp"
|
||||
continue
|
||||
fi
|
||||
|
||||
# Get the PR's branch and state from Codeberg
|
||||
_esc_pr_json=$(codeberg_api GET "/pulls/${_esc_pr}" 2>/dev/null) || {
|
||||
flog "${proj_name}: PR #${_esc_pr}: failed to fetch PR info, keeping escalation"
|
||||
printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
|
||||
}
|
||||
_esc_pr_state=$(printf '%s' "$_esc_pr_json" | jq -r '.state // ""' 2>/dev/null)
|
||||
if [ "$_esc_pr_state" != "open" ]; then
|
||||
flog "${proj_name}: PR #${_esc_pr} is ${_esc_pr_state:-unknown} — discarding stale escalation"
|
||||
continue # PR merged/closed externally; escalation no longer actionable
|
||||
fi
|
||||
_esc_branch=$(printf '%s' "$_esc_pr_json" | jq -r '.head.ref // ""' 2>/dev/null)
|
||||
if [ -z "$_esc_branch" ]; then
|
||||
printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
|
||||
fi
|
||||
|
||||
# Validate branch name to prevent SQL injection
|
||||
if ! [[ "$_esc_branch" =~ ^[a-zA-Z0-9/_.-]+$ ]]; then
|
||||
flog "${proj_name}: PR #${_esc_pr}: unsafe branch name, keeping escalation"
|
||||
printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
|
||||
fi
|
||||
|
||||
# Find the latest failed pipeline for this PR's branch via Woodpecker DB
|
||||
_esc_pip=$(wpdb -A -c "
|
||||
SELECT number FROM pipelines
|
||||
WHERE repo_id = ${WOODPECKER_REPO_ID}
|
||||
AND branch = '${_esc_branch}'
|
||||
AND status IN ('failure', 'error')
|
||||
AND finished > 0
|
||||
ORDER BY number DESC LIMIT 1;" 2>/dev/null \
|
||||
| tr -d ' ' | grep -E '^[0-9]+$' | head -1 || true)
|
||||
|
||||
if [ -z "$_esc_pip" ]; then
|
||||
flog "${proj_name}: PR #${_esc_pr}: no failed pipeline for branch ${_esc_branch}, keeping escalation"
|
||||
printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
|
||||
fi
|
||||
|
||||
# Classify failure type via ci-helpers
|
||||
_esc_failure=$(classify_pipeline_failure "${WOODPECKER_REPO_ID}" "$_esc_pip" 2>/dev/null || echo "code")
|
||||
|
||||
if [[ "$_esc_failure" != infra* ]]; then
|
||||
flog "${proj_name}: PR #${_esc_pr} pipeline #${_esc_pip}: code failure — leaving escalation for human"
|
||||
printf '%s\n' "$_esc_line" >> "$_esc_tmp"; continue
|
||||
fi
|
||||
|
||||
# Infra-only — push empty commit to retrigger CI via temporary worktree
|
||||
_esc_wt="/tmp/${proj_name}-sup-retry-${_esc_pr}"
|
||||
_esc_retrigger_ok=false
|
||||
if [ -d "${PROJECT_REPO_ROOT:-}" ]; then
|
||||
# Clean up any leftover temp worktree from a previous failed run
|
||||
git -C "${PROJECT_REPO_ROOT}" worktree remove --force "${_esc_wt}" 2>/dev/null || true
|
||||
|
||||
if git -C "${PROJECT_REPO_ROOT}" fetch origin "${_esc_branch}" --quiet 2>/dev/null && \
|
||||
git -C "${PROJECT_REPO_ROOT}" worktree add --quiet --detach \
|
||||
"${_esc_wt}" "origin/${_esc_branch}" 2>/dev/null; then
|
||||
if git -C "${_esc_wt}" \
|
||||
-c user.email="supervisor@factory" \
|
||||
-c user.name="Supervisor" \
|
||||
commit --allow-empty --no-verify \
|
||||
-m "chore: retrigger CI after infra-only exhaustion" \
|
||||
--quiet 2>/dev/null && \
|
||||
git -C "${_esc_wt}" push origin \
|
||||
"HEAD:refs/heads/${_esc_branch}" --quiet 2>/dev/null; then
|
||||
_esc_retrigger_ok=true
|
||||
fi
|
||||
git -C "${PROJECT_REPO_ROOT}" worktree remove --force "${_esc_wt}" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$_esc_retrigger_ok" = true ]; then
|
||||
# Reset CI fix counter so dev-poll can spawn the agent again if needed
|
||||
_ci_fix_file="${FACTORY_ROOT}/dev/ci-fixes-${proj_name}.json"
|
||||
_ci_fix_lock="${_ci_fix_file}.lock"
|
||||
flock "$_ci_fix_lock" python3 -c "
|
||||
import json, os
|
||||
f='${_ci_fix_file}'
|
||||
if not os.path.exists(f):
|
||||
exit()
|
||||
d = json.load(open(f))
|
||||
d.pop(str(${_esc_pr}), None)
|
||||
json.dump(d, open(f, 'w'))
|
||||
" 2>/dev/null || true
|
||||
|
||||
fixed "${proj_name}: auto-retriggered CI for PR #${_esc_pr} after infra-only exhaustion"
|
||||
flog "${proj_name}: auto-retriggered CI for PR #${_esc_pr} (issue #${_esc_issue}) after infra-only exhaustion"
|
||||
matrix_send "supervisor" "♻️ auto-retriggered CI for PR #${_esc_pr} (issue #${_esc_issue}) after infra-only exhaustion" 2>/dev/null || true
|
||||
# Escalation removed — do NOT write to _esc_tmp
|
||||
else
|
||||
p2 "${proj_name}: PR #${_esc_pr}: infra-only CI exhaustion but retrigger push failed"
|
||||
# Bump timestamp to now so the 30-min cooldown resets; prevents alert flood
|
||||
# on persistent push failures (SSH key issue, Codeberg outage, etc.)
|
||||
_esc_now=$(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
_esc_bumped=$(printf '%s' "$_esc_line" | jq -c --arg ts "$_esc_now" '.ts = $ts' 2>/dev/null \
|
||||
|| printf '%s' "$_esc_line")
|
||||
printf '%s\n' "$_esc_bumped" >> "$_esc_tmp"
|
||||
fi
|
||||
done < "$_esc_file"
|
||||
|
||||
mv "$_esc_tmp" "$_esc_file"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Dev-agent health (only if monitoring enabled)
|
||||
if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
|
||||
DEV_LOCK="/tmp/dev-agent-${PROJECT_NAME}.lock"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue