disinto/supervisor/supervisor-poll.sh
openhands 13bc948b1d fix: address review findings for escalation race condition, SQL injection, and sc_codes scope
- Race condition: mv escalations.jsonl to a PID-stamped snapshot before
  processing so concurrent dev-poll appends go to a fresh file; rm snapshot
  after loop — no entries are ever silently dropped
- SQL injection: validate ESC_PR_SHA is a 40-char hex string before
  interpolating into the wpdb query
- sc_codes scope: compute per-file from file_errors (already filtered to
  that file) instead of the entire step log; also switch grep to -F so
  dots in filenames are not treated as regex wildcards
- step_pid validation: reject non-integer values from Woodpecker API before
  passing as CLI argument
- Fallback body now distinguishes "CI logs unavailable" from "logs found
  but issue creation API calls failed"
- ESC_GENERIC_FAIL: avoid leading blank line by using conditional separator
  and fix code-block opening newline
- is_escalated(): remove dead esc_file/done_file locals; add Python-level
  int() guard so empty/non-numeric issue or pr values fail cleanly instead
  of producing a syntax error suppressed by 2>/dev/null

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-17 15:11:53 +00:00

771 lines
32 KiB
Bash
Executable file

#!/usr/bin/env bash
# supervisor-poll.sh — Supervisor agent: bash checks + claude -p for fixes
#
# Two-layer architecture:
# 1. Factory infrastructure (project-agnostic): RAM, disk, swap, docker, stale processes
# 2. Per-project checks (config-driven): CI, PRs, dev-agent, deps — iterated over projects/*.toml
#
# Runs every 10min via cron.
#
# Cron: */10 * * * * /path/to/disinto/supervisor/supervisor-poll.sh
#
# Peek: cat /tmp/supervisor-status
# Log: tail -f /path/to/disinto/supervisor/supervisor.log
# Load shared configuration from lib/env.sh, resolved relative to this script.
# NOTE(review): FACTORY_ROOT and helpers used below (codeberg_api, wpdb,
# matrix_send) are not defined in this file — presumably env.sh provides
# them; confirm.
source "$(dirname "$0")/../lib/env.sh"
# Append-only supervisor log written by flog().
LOGFILE="${FACTORY_ROOT}/supervisor/supervisor.log"
# Single-line "current activity" file overwritten by status(); removed on exit.
STATUSFILE="/tmp/supervisor-status"
# PID file used to prevent overlapping runs.
LOCKFILE="/tmp/supervisor-poll.lock"
# Base prompt prepended to the alert report handed to claude.
PROMPT_FILE="${FACTORY_ROOT}/supervisor/PROMPT.md"
# Directory scanned for per-project *.toml configs.
PROJECTS_DIR="${FACTORY_ROOT}/projects"
# JSONL metrics sink: appended by emit_metric(), pruned by rotate_metrics().
METRICS_FILE="${FACTORY_ROOT}/metrics/supervisor-metrics.jsonl"
# Append one metric record (a single pre-serialised JSON line) to $METRICS_FILE.
# Arguments: $1 - JSON object as a string
# Returns:   0 when the record was written or skipped
# Empty records are skipped: callers build the argument with
# "$(jq ... 2>/dev/null)", so a jq failure would otherwise append a blank
# line to the JSONL file.
emit_metric() {
  local record="${1:-}"
  [ -n "$record" ] || return 0
  printf '%s\n' "$record" >> "$METRICS_FILE"
}
# Sum the number of items returned by a paginated Codeberg API endpoint.
# Usage:   codeberg_count_paginated "/issues?state=open&labels=backlog&type=issues"
# Output:  total item count on stdout
# Pages are requested 50 items at a time; the walk stops at the first page
# holding fewer than 50 items, or after 20 pages (1000 items) at most.
codeberg_count_paginated() {
  local endpoint="$1"
  local total=0
  local page count
  for ((page = 1; page <= 20; page++)); do
    count=$(codeberg_api GET "${endpoint}&limit=50&page=${page}" 2>/dev/null | jq 'length' 2>/dev/null || echo 0)
    total=$((total + ${count:-0}))
    # A short page means there is nothing further to fetch.
    if [ "${count:-0}" -lt 50 ]; then
      break
    fi
  done
  echo "$total"
}
# Drop metric records older than 30 days from $METRICS_FILE.
# Globals:  METRICS_FILE (read, rewritten in place)
# Returns:  0 (also when the metrics file does not exist)
#
# The filtered output is written to "${METRICS_FILE}.tmp" and only moved over
# the original when jq exits successfully. Testing for non-empty output is not
# enough: jq emits every record preceding a malformed line before failing, so
# a single bad line would silently truncate the file to that prefix.
rotate_metrics() {
  [ -f "$METRICS_FILE" ] || return 0
  local cutoff tmpfile
  cutoff=$(date -u -d '30 days ago' +%Y-%m-%dT%H:%M)
  tmpfile="${METRICS_FILE}.tmp"
  if jq -c --arg cutoff "$cutoff" 'select(.ts >= $cutoff)' \
    "$METRICS_FILE" > "$tmpfile" 2>/dev/null; then
    mv "$tmpfile" "$METRICS_FILE"
  else
    # jq failed (parse error, jq missing, ...): keep the original untouched.
    rm -f "$tmpfile"
  fi
}
# Single-instance guard: a lock file holding the PID of the running poll.
if [ -f "$LOCKFILE" ]; then
  LOCK_PID=$(cat "$LOCKFILE" 2>/dev/null)
  # A live owner means a previous poll is still in flight — leave quietly.
  kill -0 "$LOCK_PID" 2>/dev/null && exit 0
  # Owner is gone, so the lock is stale.
  rm -f "$LOCKFILE"
fi
echo $$ > "$LOCKFILE"
# Release the lock and clear the status peek file on every exit path.
trap 'rm -f "$LOCKFILE" "$STATUSFILE"' EXIT
# Make sure the metrics directory exists, then prune old records.
mkdir -p "$(dirname "$METRICS_FILE")"
rotate_metrics
# Append a UTC-timestamped line to $LOGFILE.
# Arguments: all arguments are joined into the message text
flog() {
  local stamp
  stamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
  printf '[%s] %s\n' "$stamp" "$*" >> "$LOGFILE"
}
# Publish the current activity: overwrite $STATUSFILE with one timestamped
# line and mirror the same message into the log via flog.
# Arguments: all arguments are joined into the message text
status() {
  local stamp
  stamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC')
  printf '[%s] supervisor: %s\n' "$stamp" "$*" > "$STATUSFILE"
  flog "$*"
}
# ── Pick up a pending escalation reply (delivered from Matrix) ─────────────
# The reply file is consumed: its content is kept in ESCALATION_REPLY and the
# file is removed so the same reply is not picked up again on the next run.
ESCALATION_REPLY=""
_ESC_REPLY_FILE=/tmp/supervisor-escalation-reply
if [ -s "$_ESC_REPLY_FILE" ]; then
  ESCALATION_REPLY=$(cat "$_ESC_REPLY_FILE")
  rm -f "$_ESC_REPLY_FILE"
  # Only the first line goes to the log.
  flog "Got escalation reply: $(echo "$ESCALATION_REPLY" | head -1)"
fi
# Alert buffers, one per priority. Entries are bullet lines separated by a
# literal "\n" sequence, expanded later by the consumer with `echo -e`.
P0_ALERTS=""
P1_ALERTS=""
P2_ALERTS=""
P3_ALERTS=""
P4_ALERTS=""

# Append "• [<level>] <message>\n" to <level>_ALERTS and log the alert.
# Arguments: $1 - level (P0..P4); remaining arguments - message words
_append_alert() {
  local level="$1"
  shift
  local bucket="${level}_ALERTS"
  printf -v "$bucket" '%s' "${!bucket}• [${level}] $*\n"
  flog "${level}: $*"
}

p0() { _append_alert P0 "$@"; }
p1() { _append_alert P1 "$@"; }
p2() { _append_alert P2 "$@"; }
p3() { _append_alert P3 "$@"; }
p4() { _append_alert P4 "$@"; }

# Auto-fixes applied during this run, in the same bullet format.
FIXES=""
fixed() {
  FIXES+="• ✅ $*\n"
  flog "FIXED: $*"
}
# #############################################################################
# LAYER 1: FACTORY INFRASTRUCTURE
# (project-agnostic, runs once)
# #############################################################################
# =============================================================================
# P0: MEMORY — check first, fix first
# =============================================================================
# Reads memory figures from `free -m`, attempts two automatic remedies when a
# crisis is detected, then re-measures and raises a P0 alert if still critical.
status "P0: checking memory"
# Column 7 of the "Mem:" row and column 3 of the "Swap:" row of `free -m`
# (presumably "available" and swap "used" in the procps layout — confirm).
AVAIL_MB=$(free -m | awk '/Mem:/{print $7}')
SWAP_USED_MB=$(free -m | awk '/Swap:/{print $3}')
# Crisis: available < 500 MB, or swap used > 3000 MB while available < 2000 MB.
# Missing readings default to healthy values (9999 / 0), so no action is taken.
if [ "${AVAIL_MB:-9999}" -lt 500 ] || { [ "${SWAP_USED_MB:-0}" -gt 3000 ] && [ "${AVAIL_MB:-9999}" -lt 2000 ]; }; then
flog "MEMORY CRISIS: avail=${AVAIL_MB}MB swap_used=${SWAP_USED_MB}MB — auto-fixing"
# Kill stale agent-spawned claude processes (>3h old) — skip interactive sessions
# (matches command lines containing "claude -p"; 10800 s = 3 h)
STALE_CLAUDES=$(pgrep -f "claude -p" --older 10800 2>/dev/null || true)
if [ -n "$STALE_CLAUDES" ]; then
echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true
fixed "Killed stale claude processes: ${STALE_CLAUDES}"
fi
# Drop filesystem caches
# The fix is recorded unconditionally: the result of the tee write is discarded.
sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1
fixed "Dropped filesystem caches"
# Re-check after fixes
AVAIL_MB_AFTER=$(free -m | awk '/Mem:/{print $7}')
SWAP_AFTER=$(free -m | awk '/Swap:/{print $3}')
# NOTE(review): this re-check treats swap > 3000 MB as critical on its own,
# while the trigger above also requires available < 2000 MB; a missing
# available reading also defaults to 0 (critical) here rather than 9999.
# Confirm the asymmetry is intended.
if [ "${AVAIL_MB_AFTER:-0}" -lt 500 ] || [ "${SWAP_AFTER:-0}" -gt 3000 ]; then
p0 "Memory still critical after auto-fix: avail=${AVAIL_MB_AFTER}MB swap=${SWAP_AFTER}MB"
else
flog "Memory recovered: avail=${AVAIL_MB_AFTER}MB swap=${SWAP_AFTER}MB"
fi
fi
# =============================================================================
# P1: DISK
# =============================================================================
# When the root filesystem is above 80% used: prune docker, truncate large
# agent logs, trim the Woodpecker log table, then re-measure and raise a P1
# alert if usage is still above 80%.
status "P1: checking disk"
# Use% column of `df -h /` with the percent sign stripped.
DISK_PERCENT=$(df -h / | awk 'NR==2{print $5}' | tr -d '%')
if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
flog "DISK PRESSURE: ${DISK_PERCENT}% — auto-cleaning"
# Docker cleanup (safe — keeps images)
# Recorded as a fix only when the prune command itself succeeds.
sudo docker system prune -f >/dev/null 2>&1 && fixed "Docker prune"
# Truncate logs >10MB
# Covers *.log directly under the dev, review and supervisor directories.
for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
if [ -f "$logfile" ]; then
SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
if [ "${SIZE_KB:-0}" -gt 10240 ]; then
truncate -s 0 "$logfile"
fixed "Truncated $(basename "$logfile") (was ${SIZE_KB}KB)"
fi
fi
done
# Woodpecker log_entries cleanup
# pg_size_pretty yields a human-readable size; xargs trims the whitespace.
# NOTE(review): the number/unit extraction below assumes the wpdb output
# holds a single "<n> MB|GB" value and nothing else numeric — confirm that
# wpdb prints tuples only.
LOG_ENTRIES_MB=$(wpdb -c "SELECT pg_size_pretty(pg_total_relation_size('log_entries'));" 2>/dev/null | xargs)
if echo "$LOG_ENTRIES_MB" | grep -qP '\d+\s*(GB|MB)'; then
SIZE_NUM=$(echo "$LOG_ENTRIES_MB" | grep -oP '\d+')
SIZE_UNIT=$(echo "$LOG_ENTRIES_MB" | grep -oP '(GB|MB)')
# Trim when the table is in the GB range, or above 500 MB.
if [ "$SIZE_UNIT" = "GB" ] || { [ "$SIZE_UNIT" = "MB" ] && [ "$SIZE_NUM" -gt 500 ]; }; then
# Keeps only rows whose id is within 100000 of the current maximum id.
wpdb -c "DELETE FROM log_entries WHERE id < (SELECT max(id) - 100000 FROM log_entries);" 2>/dev/null
fixed "Trimmed Woodpecker log_entries (was ${LOG_ENTRIES_MB})"
fi
fi
# Re-measure after cleanup.
DISK_AFTER=$(df -h / | awk 'NR==2{print $5}' | tr -d '%')
if [ "${DISK_AFTER:-0}" -gt 80 ]; then
p1 "Disk still ${DISK_AFTER}% after auto-clean"
else
flog "Disk recovered: ${DISK_AFTER}%"
fi
fi
# Emit infra metric: RAM used (percent), root disk used (percent), swap (MB).
_RAM_TOTAL_MB=$(free -m | awk '/Mem:/{print $2}')
# Used% is derived from total minus the available figure read earlier;
# it falls back to 0 when the total could not be read.
if (( ${_RAM_TOTAL_MB:-0} > 0 )); then
  _RAM_USED_PCT=$(( (${_RAM_TOTAL_MB:-0} - ${AVAIL_MB:-0}) * 100 / ${_RAM_TOTAL_MB:-1} ))
else
  _RAM_USED_PCT=0
fi
_INFRA_TS=$(date -u +%Y-%m-%dT%H:%MZ)
_INFRA_JSON=$(jq -nc \
  --arg ts "$_INFRA_TS" \
  --argjson ram "${_RAM_USED_PCT:-0}" \
  --argjson disk "${DISK_PERCENT:-0}" \
  --argjson swap "${SWAP_USED_MB:-0}" \
  '{ts:$ts,type:"infra",ram_used_pct:$ram,disk_used_pct:$disk,swap_mb:$swap}' 2>/dev/null)
# Best effort: a failed metric write must never abort the poll.
emit_metric "$_INFRA_JSON" 2>/dev/null || true
# =============================================================================
# P4-INFRA: HOUSEKEEPING — stale processes, log rotation (project-agnostic)
# =============================================================================
# Runs on every poll regardless of memory/disk pressure.
status "P4: infra housekeeping"
# Stale agent-spawned claude processes (>3h) — skip interactive sessions
# (matches command lines containing "claude -p"; 10800 s = 3 h)
STALE_CLAUDES=$(pgrep -f "claude -p" --older 10800 2>/dev/null || true)
if [ -n "$STALE_CLAUDES" ]; then
echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true
# $STALE_CLAUDES is left unquoted on purpose so `wc -w` counts the PIDs.
fixed "Killed stale claude processes: $(echo $STALE_CLAUDES | wc -w) procs"
fi
# Rotate logs >5MB
# Each oversized log is renamed to "<name>.old", replacing any earlier .old.
# The fix is recorded even if the mv fails (its stderr is discarded).
for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
if [ -f "$logfile" ]; then
SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
if [ "${SIZE_KB:-0}" -gt 5120 ]; then
mv "$logfile" "${logfile}.old" 2>/dev/null
fixed "Rotated $(basename "$logfile")"
fi
fi
done
# Process dev-agent escalations — create sub-issues for each CI failure
#
# Each line of escalations.jsonl is a JSON object read for .issue, .pr and
# .attempts. For every entry the failing pipeline of the PR head commit is
# looked up, the logs of its failed steps are fetched, and backlog issues are
# created: one per file for ShellCheck steps, one combined issue for all other
# failed steps, or a generic "investigate" issue when nothing else was created.
# Processed entries are appended to escalations.done.jsonl.
#
# NOTE(review): this section runs before any projects/*.toml is loaded, so
# CODEBERG_API, CODEBERG_REPO and WOODPECKER_REPO_ID hold whatever was set
# before this point — confirm this is correct for multi-project setups.
ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations.jsonl"
ESCALATION_DONE="${FACTORY_ROOT}/supervisor/escalations.done.jsonl"
if [ -s "$ESCALATION_FILE" ]; then
# Atomically snapshot the file before processing to prevent race with
# concurrent dev-poll appends: new entries go to a fresh ESCALATION_FILE
# while we process the snapshot, so nothing is ever silently dropped.
# NOTE(review): a snapshot left behind by an interrupted run
# (escalations.jsonl.processing.<pid>) is not picked up by later runs.
ESCALATION_SNAP="${ESCALATION_FILE}.processing.$$"
mv "$ESCALATION_FILE" "$ESCALATION_SNAP"
ESCALATION_COUNT=$(wc -l < "$ESCALATION_SNAP")
flog "Processing ${ESCALATION_COUNT} escalation(s) from dev-agent"
while IFS= read -r esc_entry; do
[ -z "$esc_entry" ] && continue
ESC_ISSUE=$(echo "$esc_entry" | jq -r '.issue // empty')
ESC_PR=$(echo "$esc_entry" | jq -r '.pr // empty')
ESC_ATTEMPTS=$(echo "$esc_entry" | jq -r '.attempts // 3')
# Entries without an issue or PR number cannot be acted on: archive and skip.
if [ -z "$ESC_ISSUE" ] || [ -z "$ESC_PR" ]; then
echo "$esc_entry" >> "$ESCALATION_DONE"
continue
fi
flog "Escalation: issue #${ESC_ISSUE} PR #${ESC_PR} (${ESC_ATTEMPTS} CI attempt(s))"
# Fetch the failing pipeline for this PR
ESC_PR_SHA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \
"${CODEBERG_API}/pulls/${ESC_PR}" 2>/dev/null | jq -r '.head.sha // ""') || true
# Per-entry state: pipeline number, count of issues created, accumulated
# non-ShellCheck output, and whether any step log could be fetched.
ESC_PIPELINE=""
ESC_SUB_ISSUES_CREATED=0
ESC_GENERIC_FAIL=""
ESC_LOGS_AVAILABLE=0
if [ -n "$ESC_PR_SHA" ]; then
# Validate SHA is a 40-char hex string before interpolating into SQL
if [[ "$ESC_PR_SHA" =~ ^[0-9a-fA-F]{40}$ ]]; then
ESC_PIPELINE=$(wpdb -c "SELECT number FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND commit='${ESC_PR_SHA}' ORDER BY created DESC LIMIT 1;" 2>/dev/null | xargs || true)
else
flog "WARNING: ESC_PR_SHA '${ESC_PR_SHA}' is not a valid hex SHA — skipping pipeline lookup"
fi
fi
if [ -n "$ESC_PIPELINE" ]; then
# One "<pid><TAB><name>" line per workflow child step in state "failure".
FAILED_STEPS=$(curl -sf \
-H "Authorization: Bearer ${WOODPECKER_TOKEN}" \
"${WOODPECKER_SERVER}/api/repos/${WOODPECKER_REPO_ID}/pipelines/${ESC_PIPELINE}" 2>/dev/null | \
jq -r '.workflows[]?.children[]? | select(.state=="failure") | "\(.pid)\t\(.name)"' 2>/dev/null || true)
while IFS=$'\t' read -r step_pid step_name; do
[ -z "$step_pid" ] && continue
# The pid is passed as a CLI argument below, so only plain integers are accepted.
[[ "$step_pid" =~ ^[0-9]+$ ]] || { flog "WARNING: invalid step_pid '${step_pid}' — skipping"; continue; }
# Only the last 150 log lines of the step are kept.
step_logs=$(woodpecker-cli pipeline log show "${CODEBERG_REPO}" "${ESC_PIPELINE}" "${step_pid}" 2>/dev/null | tail -150 || true)
[ -z "$step_logs" ] && continue
ESC_LOGS_AVAILABLE=1
if echo "$step_name" | grep -qi "shellcheck"; then
# Create one sub-issue per file with ShellCheck errors
# File names are taken from "In <file> line <n>:" markers in the log.
sc_files=$(echo "$step_logs" | grep -oP '(?<=In )\S+(?= line \d+:)' | sort -u || true)
while IFS= read -r sc_file; do
[ -z "$sc_file" ] && continue
# grep -F for literal filename match (dots in filenames are regex wildcards)
# Each match keeps 3 lines of trailing context; the result is capped at 30 lines.
file_errors=$(echo "$step_logs" | grep -F -A3 "In ${sc_file} line" | head -30)
# SC codes only from this file's errors, not the whole step log
sc_codes=$(echo "$file_errors" | grep -oP 'SC\d+' | sort -u | tr '\n' ' ' | sed 's/ $//' || true)
sub_title="fix: ShellCheck errors in ${sc_file} (from PR #${ESC_PR})"
sub_body="## ShellCheck CI failure — \`${sc_file}\`
Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)).
### Errors
\`\`\`
${file_errors}
\`\`\`
Fix all ShellCheck errors${sc_codes:+ (${sc_codes})} in \`${sc_file}\` so PR #${ESC_PR} CI passes.
### Context
- Parent issue: #${ESC_ISSUE}
- PR: #${ESC_PR}
- Pipeline: #${ESC_PIPELINE} (step: ${step_name})"
# jq builds the JSON payload so title and body are escaped correctly;
# an empty $new_issue afterwards means the API call failed.
new_issue=$(curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${CODEBERG_API}/issues" \
-d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \
'{"title":$t,"body":$b,"labels":["backlog"]}')" 2>/dev/null | jq -r '.number // ""') || true
if [ -n "$new_issue" ]; then
flog "Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from #${ESC_ISSUE})"
fixed "Sub-issue #${new_issue}: ShellCheck errors in ${sc_file} (escalated from #${ESC_ISSUE})"
ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: ShellCheck in ${sc_file} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
fi
done <<< "$sc_files"
else
# Accumulate non-ShellCheck failures for one combined issue
# Each step contributes a "=== name ===" header plus its last 50 log lines.
esc_section="=== ${step_name} ===
$(echo "$step_logs" | tail -50)"
if [ -z "$ESC_GENERIC_FAIL" ]; then
ESC_GENERIC_FAIL="$esc_section"
else
ESC_GENERIC_FAIL="${ESC_GENERIC_FAIL}
${esc_section}"
fi
fi
done <<< "$FAILED_STEPS"
fi
# Create one sub-issue for all non-ShellCheck CI failures
if [ -n "$ESC_GENERIC_FAIL" ]; then
sub_title="fix: CI failures in PR #${ESC_PR} (from issue #${ESC_ISSUE})"
sub_body="## CI failure — fix required
Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)).
### Failed step output
\`\`\`
${ESC_GENERIC_FAIL}
\`\`\`
### Context
- Parent issue: #${ESC_ISSUE}
- PR: #${ESC_PR}${ESC_PIPELINE:+
- Pipeline: #${ESC_PIPELINE}}"
new_issue=$(curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${CODEBERG_API}/issues" \
-d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \
'{"title":$t,"body":$b,"labels":["backlog"]}')" 2>/dev/null | jq -r '.number // ""') || true
if [ -n "$new_issue" ]; then
flog "Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from #${ESC_ISSUE})"
fixed "Sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (escalated from #${ESC_ISSUE})"
ESC_SUB_ISSUES_CREATED=$((ESC_SUB_ISSUES_CREATED + 1))
matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: CI failures for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
fi
fi
# Fallback: no sub-issues created — differentiate logs-unavailable from creation failure
if [ "$ESC_SUB_ISSUES_CREATED" -eq 0 ]; then
sub_title="fix: investigate CI failure for PR #${ESC_PR} (from issue #${ESC_ISSUE})"
if [ "$ESC_LOGS_AVAILABLE" -eq 1 ]; then
# Logs were fetched but all issue creation API calls failed
sub_body="## CI failure — investigation required
Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). CI logs were retrieved but sub-issue creation failed (API error).
Check PR #${ESC_PR} CI output, identify the failing checks, and fix them so the PR can merge."
else
# Could not retrieve CI logs at all
sub_body="## CI failure — investigation required
Spawned by supervisor from escalated issue #${ESC_ISSUE} (PR #${ESC_PR} failed CI after ${ESC_ATTEMPTS} attempt(s)). CI logs were unavailable at escalation time.
Check PR #${ESC_PR} CI output, identify the failing checks, and fix them so the PR can merge."
fi
new_issue=$(curl -sf -X POST \
-H "Authorization: token ${CODEBERG_TOKEN}" \
-H "Content-Type: application/json" \
"${CODEBERG_API}/issues" \
-d "$(jq -nc --arg t "$sub_title" --arg b "$sub_body" \
'{"title":$t,"body":$b,"labels":["backlog"]}')" 2>/dev/null | jq -r '.number // ""') || true
if [ -n "$new_issue" ]; then
flog "Created fallback sub-issue #${new_issue} for escalated #${ESC_ISSUE}"
fixed "Fallback sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (escalated from #${ESC_ISSUE})"
matrix_send "supervisor" "📋 Created sub-issue #${new_issue}: investigate CI for PR #${ESC_PR} (from escalated #${ESC_ISSUE})" 2>/dev/null || true
fi
fi
# Mark as processed
# The entry is archived even when no issue could be created for it.
echo "$esc_entry" >> "$ESCALATION_DONE"
done < "$ESCALATION_SNAP"
rm -f "$ESCALATION_SNAP"
flog "Escalations processed — moved to $(basename "$ESCALATION_DONE")"
fi
# #############################################################################
# LAYER 2: PER-PROJECT CHECKS
# (iterated over projects/*.toml, config-driven)
# #############################################################################
# Function: run all per-project checks for the currently loaded project config
#
# Globals read:    PROJECT_NAME, CODEBERG_REPO, WOODPECKER_REPO_ID,
#                  PROJECT_REPO_ROOT, PRIMARY_BRANCH, FACTORY_ROOT,
#                  CHECK_DEV_AGENT, CHECK_PIPELINE_STALL, CHECK_PRS
# Globals written: alert/fix buffers (through p2, p3 and fixed) plus the
#                  scratch variables assigned below
# Side effects:    may remove a stale dev-agent lock, abort a rebase or switch
#                  branch in PROJECT_REPO_ROOT (and cd into it), start
#                  review-pr.sh in the background, remove stale worktrees,
#                  and append metric records via emit_metric.
# Arguments:       none
check_project() {
  local proj_name="${PROJECT_NAME:-unknown}"
  flog "── checking project: ${proj_name} (${CODEBERG_REPO}) ──"

  # ===========================================================================
  # P2: FACTORY STOPPED — CI, dev-agent, git
  # ===========================================================================
  status "P2: ${proj_name}: checking pipeline"

  # CI stuck: pipelines running for more than 1200 s / pending for more than 1800 s
  STUCK_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='running' AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;" 2>/dev/null | xargs || true)
  [ "${STUCK_CI:-0}" -gt 0 ] 2>/dev/null && p2 "${proj_name}: CI: ${STUCK_CI} pipeline(s) running >20min"
  PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs || true)
  [ "${PENDING_CI:-0}" -gt 0 ] && p2 "${proj_name}: CI: ${PENDING_CI} pipeline(s) pending >30min"

  # Emit CI metric (last completed pipeline within 24h — skip if project has no recent CI)
  _CI_ROW=$(wpdb -A -F ',' -c "SELECT id, COALESCE(ROUND(EXTRACT(EPOCH FROM (to_timestamp(finished) - to_timestamp(started)))/60)::int, 0), status FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status IN ('success','failure','error') AND finished > 0 AND to_timestamp(finished) > now() - interval '24 hours' ORDER BY id DESC LIMIT 1;" 2>/dev/null | grep -E '^[0-9]' | head -1 || true)
  if [ -n "$_CI_ROW" ]; then
    # Row format: "<id>,<duration in minutes>,<status>"
    _CI_ID=$(echo "$_CI_ROW" | cut -d',' -f1 | tr -d ' ')
    _CI_DUR=$(echo "$_CI_ROW" | cut -d',' -f2 | tr -d ' ')
    _CI_STAT=$(echo "$_CI_ROW" | cut -d',' -f3 | tr -d ' ')
    emit_metric "$(jq -nc \
      --arg ts "$(date -u +%Y-%m-%dT%H:%MZ)" \
      --arg proj "$proj_name" \
      --argjson pipeline "${_CI_ID:-0}" \
      --argjson duration "${_CI_DUR:-0}" \
      --arg status "${_CI_STAT:-unknown}" \
      '{ts:$ts,type:"ci",project:$proj,pipeline:$pipeline,duration_min:$duration,status:$status}' 2>/dev/null)" 2>/dev/null || true
  fi

  # Dev-agent health (only if monitoring enabled)
  if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
    DEV_LOCK="/tmp/dev-agent.lock"
    if [ -f "$DEV_LOCK" ]; then
      DEV_PID=$(cat "$DEV_LOCK" 2>/dev/null)
      if ! kill -0 "$DEV_PID" 2>/dev/null; then
        # Lock owner is dead: clear the lock.
        rm -f "$DEV_LOCK"
        fixed "${proj_name}: Removed stale dev-agent lock (PID ${DEV_PID} dead)"
      else
        # Agent is alive: alert if its status file has not changed for >30 min.
        DEV_STATUS_AGE=$(stat -c %Y /tmp/dev-agent-status 2>/dev/null || echo 0)
        NOW_EPOCH=$(date +%s)
        STATUS_AGE_MIN=$(( (NOW_EPOCH - DEV_STATUS_AGE) / 60 ))
        if [ "$STATUS_AGE_MIN" -gt 30 ]; then
          p2 "${proj_name}: Dev-agent: status unchanged for ${STATUS_AGE_MIN}min"
        fi
      fi
    fi
  fi

  # Git repo health
  if [ -d "${PROJECT_REPO_ROOT}" ]; then
    cd "${PROJECT_REPO_ROOT}" 2>/dev/null || true
    GIT_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
    GIT_REBASE=$([ -d .git/rebase-merge ] || [ -d .git/rebase-apply ] && echo "yes" || echo "no")
    if [ "$GIT_REBASE" = "yes" ]; then
      git rebase --abort 2>/dev/null && git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \
        fixed "${proj_name}: Aborted stale rebase, switched to ${PRIMARY_BRANCH}" || \
        p2 "${proj_name}: Git: stale rebase, auto-abort failed"
    fi
    if [ "$GIT_BRANCH" != "${PRIMARY_BRANCH}" ] && [ "$GIT_BRANCH" != "unknown" ]; then
      git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \
        fixed "${proj_name}: Switched repo from '${GIT_BRANCH}' to ${PRIMARY_BRANCH}" || \
        p2 "${proj_name}: Git: on '${GIT_BRANCH}' instead of ${PRIMARY_BRANCH}"
    fi
  fi

  # ===========================================================================
  # P2b: FACTORY STALLED — backlog exists but no agent running
  # ===========================================================================
  if [ "${CHECK_PIPELINE_STALL:-true}" = "true" ]; then
    status "P2: ${proj_name}: checking pipeline stall"
    # limit=1: these are existence probes (0 or 1), not real counts.
    BACKLOG_COUNT=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0")
    IN_PROGRESS=$(codeberg_api GET "/issues?state=open&labels=in-progress&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0")
    if [ "${BACKLOG_COUNT:-0}" -gt 0 ] && [ "${IN_PROGRESS:-0}" -eq 0 ]; then
      DEV_LOG="${FACTORY_ROOT}/dev/dev-agent.log"
      if [ -f "$DEV_LOG" ]; then
        LAST_LOG_EPOCH=$(stat -c %Y "$DEV_LOG" 2>/dev/null || echo 0)
      else
        LAST_LOG_EPOCH=0
      fi
      NOW_EPOCH=$(date +%s)
      IDLE_MIN=$(( (NOW_EPOCH - LAST_LOG_EPOCH) / 60 ))
      if [ "$IDLE_MIN" -gt 20 ]; then
        p2 "${proj_name}: Pipeline stalled: ${BACKLOG_COUNT} backlog issue(s), no agent ran for ${IDLE_MIN}min"
      fi
    fi
  fi

  # ===========================================================================
  # P2c: DEV-AGENT PRODUCTIVITY — all backlog blocked for too long
  # ===========================================================================
  if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
    status "P2: ${proj_name}: checking dev-agent productivity"
    DEV_LOG_FILE="${FACTORY_ROOT}/dev/dev-agent.log"
    if [ -f "$DEV_LOG_FILE" ]; then
      # Look at the last 6 "poll:" lines within the final 100 log lines.
      RECENT_POLLS=$(tail -100 "$DEV_LOG_FILE" | grep "poll:" | tail -6)
      TOTAL_RECENT=$(echo "$RECENT_POLLS" | grep -c "." || true)
      BLOCKED_IN_RECENT=$(echo "$RECENT_POLLS" | grep -c "no ready issues" || true)
      if [ "$TOTAL_RECENT" -ge 6 ] && [ "$BLOCKED_IN_RECENT" -eq "$TOTAL_RECENT" ]; then
        p2 "${proj_name}: Dev-agent blocked: last ${BLOCKED_IN_RECENT} polls all report 'no ready issues'"
      fi
    fi
  fi

  # ===========================================================================
  # P3: FACTORY DEGRADED — derailed PRs, unreviewed PRs
  # ===========================================================================
  if [ "${CHECK_PRS:-true}" = "true" ]; then
    status "P3: ${proj_name}: checking PRs"
    OPEN_PRS=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null | jq -r '.[].number' 2>/dev/null || true)
    for pr in $OPEN_PRS; do
      PR_JSON=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null || true)
      [ -z "$PR_JSON" ] && continue
      PR_SHA=$(echo "$PR_JSON" | jq -r '.head.sha // ""')
      [ -z "$PR_SHA" ] && continue
      CI_STATE=$(codeberg_api GET "/commits/${PR_SHA}/status" 2>/dev/null | jq -r '.state // "unknown"' 2>/dev/null || true)
      MERGEABLE=$(echo "$PR_JSON" | jq -r '.mergeable // true')
      if [ "$MERGEABLE" = "false" ] && [ "$CI_STATE" = "success" ]; then
        p3 "${proj_name}: PR #${pr}: CI pass but merge conflict — needs rebase"
      elif [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then
        UPDATED=$(echo "$PR_JSON" | jq -r '.updated_at // ""')
        if [ -n "$UPDATED" ]; then
          UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0)
          NOW_EPOCH=$(date +%s)
          AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 ))
          [ "$AGE_MIN" -gt 30 ] && p3 "${proj_name}: PR #${pr}: CI=${CI_STATE}, stale ${AGE_MIN}min"
        fi
      elif [ "$CI_STATE" = "success" ]; then
        # A review counts only if a comment carries the marker for this exact head SHA.
        HAS_REVIEW=$(codeberg_api GET "/issues/${pr}/comments?limit=50" 2>/dev/null | \
          jq -r --arg sha "$PR_SHA" '[.[] | select(.body | contains("<!-- reviewed: " + $sha))] | length' 2>/dev/null || echo "0")
        if [ "${HAS_REVIEW:-0}" -eq 0 ]; then
          UPDATED=$(echo "$PR_JSON" | jq -r '.updated_at // ""')
          if [ -n "$UPDATED" ]; then
            UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0)
            NOW_EPOCH=$(date +%s)
            AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 ))
            if [ "$AGE_MIN" -gt 60 ]; then
              p3 "${proj_name}: PR #${pr}: CI passed, no review for ${AGE_MIN}min"
              # Fire-and-forget: the review runs in the background.
              bash "${FACTORY_ROOT}/review/review-pr.sh" "$pr" >> "${FACTORY_ROOT}/review/review.log" 2>&1 &
              fixed "${proj_name}: Auto-triggered review for PR #${pr}"
            fi
          fi
        fi
      fi
    done
  fi

  # ===========================================================================
  # P3b: CIRCULAR DEPENDENCIES — deadlock detection
  # ===========================================================================
  status "P3: ${proj_name}: checking for circular dependencies"
  BACKLOG_FOR_DEPS=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=50" 2>/dev/null || true)
  if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo "$BACKLOG_FOR_DEPS" | jq 'length' 2>/dev/null || echo 0)" -gt 0 ]; then
    PARSE_DEPS="${FACTORY_ROOT}/lib/parse-deps.sh"
    ISSUE_COUNT=$(echo "$BACKLOG_FOR_DEPS" | jq 'length')
    # Dependency graph: issue number -> whitespace-separated dependency numbers.
    declare -A DEPS_OF
    declare -A BACKLOG_NUMS
    for i in $(seq 0 $((ISSUE_COUNT - 1))); do
      NUM=$(echo "$BACKLOG_FOR_DEPS" | jq -r ".[$i].number")
      BODY=$(echo "$BACKLOG_FOR_DEPS" | jq -r ".[$i].body // \"\"")
      # Dependencies come from parse-deps.sh; self-references are dropped.
      ISSUE_DEPS=$(echo "$BODY" | bash "$PARSE_DEPS" | grep -v "^${NUM}$" || true)
      [ -n "$ISSUE_DEPS" ] && DEPS_OF[$NUM]="$ISSUE_DEPS"
      BACKLOG_NUMS[$NUM]=1
    done

    # Three-colour DFS: 0 = unvisited, 1 = on the current path, 2 = finished.
    declare -A NODE_COLOR
    for node in "${!DEPS_OF[@]}"; do NODE_COLOR[$node]=0; done
    FOUND_CYCLES=""
    declare -A SEEN_CYCLES

    # Depth-first walk from $1; $2 is the space-separated path walked so far.
    # Appends each newly seen cycle to FOUND_CYCLES as "#a -> #b -> #a\n".
    dfs_detect_cycle() {
      local node="$1" path="$2"
      local dep p
      NODE_COLOR[$node]=1
      for dep in ${DEPS_OF[$node]:-}; do
        # Dependencies that have no dependencies of their own are not graph nodes.
        [ -z "${NODE_COLOR[$dep]+x}" ] && continue
        if [ "${NODE_COLOR[$dep]}" = "1" ]; then
          # Back edge: the sorted member list serves as a de-duplication key.
          local cycle_key
          cycle_key=$(echo "$path $dep" | tr ' ' '\n' | sort -n | tr '\n' ' ')
          if [ -z "${SEEN_CYCLES[$cycle_key]+x}" ]; then
            SEEN_CYCLES[$cycle_key]=1
            local in_cycle=0 cycle_str=""
            # Report only the part of the path from the first occurrence of $dep.
            for p in $path $dep; do
              [ "$p" = "$dep" ] && in_cycle=1
              [ "$in_cycle" = "1" ] && cycle_str="${cycle_str:+$cycle_str -> }#${p}"
            done
            FOUND_CYCLES="${FOUND_CYCLES}${cycle_str}\n"
          fi
        elif [ "${NODE_COLOR[$dep]}" = "0" ]; then
          dfs_detect_cycle "$dep" "$path $dep"
        fi
      done
      NODE_COLOR[$node]=2
    }

    for node in "${!DEPS_OF[@]}"; do
      [ "${NODE_COLOR[$node]:-2}" = "0" ] && dfs_detect_cycle "$node" "$node"
    done

    if [ -n "$FOUND_CYCLES" ]; then
      # Feed the loop through a here-string rather than a pipe: a piped
      # `while` runs in a subshell, where p3's append to P3_ALERTS is lost
      # and the alert never reaches the final report.
      while IFS= read -r cycle; do
        [ -z "$cycle" ] && continue
        p3 "${proj_name}: Circular dependency deadlock: ${cycle}"
      done <<< "$(echo -e "$FOUND_CYCLES")"
    fi

    # =========================================================================
    # P3c: STALE DEPENDENCIES — blocked by old open issues (>30 days)
    # =========================================================================
    status "P3: ${proj_name}: checking for stale dependencies"
    NOW_EPOCH=$(date +%s)
    # Cache of "state|created_at|title" per dependency to avoid repeat API calls.
    declare -A DEP_CACHE
    for issue_num in "${!DEPS_OF[@]}"; do
      for dep in ${DEPS_OF[$issue_num]}; do
        if [ -n "${DEP_CACHE[$dep]+x}" ]; then
          DEP_INFO="${DEP_CACHE[$dep]}"
        else
          DEP_JSON=$(codeberg_api GET "/issues/${dep}" 2>/dev/null || true)
          [ -z "$DEP_JSON" ] && continue
          DEP_STATE=$(echo "$DEP_JSON" | jq -r '.state // "unknown"')
          DEP_CREATED=$(echo "$DEP_JSON" | jq -r '.created_at // ""')
          DEP_TITLE=$(echo "$DEP_JSON" | jq -r '.title // ""' | head -c 50)
          DEP_INFO="${DEP_STATE}|${DEP_CREATED}|${DEP_TITLE}"
          DEP_CACHE[$dep]="$DEP_INFO"
        fi
        # Split on the first two '|' only, so the title may itself contain '|'.
        DEP_STATE="${DEP_INFO%%|*}"
        [ "$DEP_STATE" != "open" ] && continue
        DEP_REST="${DEP_INFO#*|}"
        DEP_CREATED="${DEP_REST%%|*}"
        DEP_TITLE="${DEP_REST#*|}"
        [ -z "$DEP_CREATED" ] && continue
        CREATED_EPOCH=$(date -d "$DEP_CREATED" +%s 2>/dev/null || echo 0)
        AGE_DAYS=$(( (NOW_EPOCH - CREATED_EPOCH) / 86400 ))
        if [ "$AGE_DAYS" -gt 30 ]; then
          p3 "${proj_name}: Stale dependency: #${issue_num} blocked by #${dep} \"${DEP_TITLE}\" (open ${AGE_DAYS} days)"
        fi
      done
    done
    unset DEPS_OF BACKLOG_NUMS NODE_COLOR SEEN_CYCLES DEP_CACHE
  fi

  # Emit dev metric (paginated to avoid silent cap at 50)
  _BACKLOG_COUNT=$(codeberg_count_paginated "/issues?state=open&labels=backlog&type=issues")
  _BLOCKED_COUNT=$(codeberg_count_paginated "/issues?state=open&labels=blocked&type=issues")
  _PR_COUNT=$(codeberg_count_paginated "/pulls?state=open")
  emit_metric "$(jq -nc \
    --arg ts "$(date -u +%Y-%m-%dT%H:%MZ)" \
    --arg proj "$proj_name" \
    --argjson backlog "${_BACKLOG_COUNT:-0}" \
    --argjson blocked "${_BLOCKED_COUNT:-0}" \
    --argjson prs "${_PR_COUNT:-0}" \
    '{ts:$ts,type:"dev",project:$proj,issues_in_backlog:$backlog,issues_blocked:$blocked,pr_open:$prs}' 2>/dev/null)" 2>/dev/null || true

  # ===========================================================================
  # P4-PROJECT: Clean stale worktrees for this project
  # ===========================================================================
  NOW_TS=$(date +%s)
  # PROJECT_NAME is intentionally unquoted inside the glob patterns.
  for wt in /tmp/${PROJECT_NAME}-worktree-* /tmp/${PROJECT_NAME}-review-*; do
    [ -d "$wt" ] || continue
    WT_AGE_MIN=$(( (NOW_TS - $(stat -c %Y "$wt")) / 60 ))
    if [ "$WT_AGE_MIN" -gt 120 ]; then
      WT_BASE=$(basename "$wt")
      # Leave worktrees that some running process still mentions on its command line.
      if ! pgrep -f "$WT_BASE" >/dev/null 2>&1; then
        git -C "$PROJECT_REPO_ROOT" worktree remove --force "$wt" 2>/dev/null && \
          fixed "${proj_name}: Removed stale worktree: $wt (${WT_AGE_MIN}min old)" || true
      fi
    fi
  done
  git -C "$PROJECT_REPO_ROOT" worktree prune 2>/dev/null || true
}
# =============================================================================
# Run the per-project checks once for every registered project
# =============================================================================
status "checking projects"
PROJECT_COUNT=0
if [ -d "$PROJECTS_DIR" ]; then
  for project_toml in "${PROJECTS_DIR}"/*.toml; do
    # An unmatched glob stays literal, so only act on real files.
    if [ -f "$project_toml" ]; then
      PROJECT_COUNT=$((PROJECT_COUNT + 1))
      # Load project config (overrides CODEBERG_REPO, PROJECT_REPO_ROOT, etc.)
      source "${FACTORY_ROOT}/lib/load-project.sh" "$project_toml"
      check_project
    fi
  done
fi
# Fallback: no project TOML files, use .env config (backwards compatible)
if [ "$PROJECT_COUNT" -eq 0 ]; then
  flog "No projects/*.toml found, using .env defaults"
  check_project
fi
# #############################################################################
# RESULT
# #############################################################################
# Concatenate the alert buffers in priority order (P0 first).
ALL_ALERTS="${P0_ALERTS}${P1_ALERTS}${P2_ALERTS}${P3_ALERTS}${P4_ALERTS}"
if [ -n "$ALL_ALERTS" ]; then
# The buffers hold literal "\n" separators; echo -e expands them to newlines.
ALERT_TEXT=$(echo -e "$ALL_ALERTS")
# Notify Matrix
matrix_send "supervisor" "⚠️ Supervisor alerts:
${ALERT_TEXT}" 2>/dev/null || true
flog "Invoking claude -p for alerts"
# Prompt = PROMPT.md (or a one-line fallback if it cannot be read), followed
# by the alerts, the fixes already applied, a live system snapshot and, when
# present, the human reply to a previous escalation.
# NOTE(review): ESCALATION_REPLY is only forwarded on this branch; the reply
# file was already deleted when it was read, so a reply arriving on a run
# with no alerts is dropped — confirm this is intended.
CLAUDE_PROMPT="$(cat "$PROMPT_FILE" 2>/dev/null || echo "You are a supervisor agent. Fix the issue below.")
## Current Alerts
${ALERT_TEXT}
## Auto-fixes already applied by bash
$(echo -e "${FIXES:-None}")
## System State
RAM: $(free -m | awk '/Mem:/{printf "avail=%sMB", $7}') $(free -m | awk '/Swap:/{printf "swap=%sMB", $3}')
Disk: $(df -h / | awk 'NR==2{printf "%s used of %s (%s)", $3, $2, $5}')
Docker: $(sudo docker ps --format '{{.Names}}' 2>/dev/null | wc -l) containers running
Claude procs: $(pgrep -f "claude" 2>/dev/null | wc -l)
$(if [ -n "$ESCALATION_REPLY" ]; then echo "
## Human Response to Previous Escalation
${ESCALATION_REPLY}
Act on this response."; fi)
Fix what you can. Escalate what you can't. Read the relevant best-practices file first."
# Bounded to 300 s; stderr is captured together with stdout, and a failure or
# timeout is tolerated rather than aborting the script.
CLAUDE_OUTPUT=$(timeout 300 claude -p --model sonnet --dangerously-skip-permissions \
"$CLAUDE_PROMPT" 2>&1) || true
# Only the last 20 lines of the response are kept in the log.
flog "claude output: $(echo "$CLAUDE_OUTPUT" | tail -20)"
status "claude responded"
else
# No alerts: just record any housekeeping fixes that were applied.
[ -n "$FIXES" ] && flog "Housekeeping: $(echo -e "$FIXES")"
status "all clear"
fi