Single source of truth for dependency parsing, replacing three copies: - dev-poll.sh get_deps() now calls parse-deps.py - supervisor P3b/P3c import parse_deps() via importlib Supports stdin, argument, and --json modes for different callers. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
503 lines
19 KiB
Bash
Executable file
503 lines
19 KiB
Bash
Executable file
#!/usr/bin/env bash
# supervisor-poll.sh — Supervisor agent: bash checks + claude -p for fixes
#
# Runs every 10min via cron. Does all health checks in bash (zero tokens).
# Only invokes claude -p when auto-fix fails or issue is complex.
#
# Cron: */10 * * * * /path/to/disinto/supervisor/supervisor-poll.sh
#
# Peek: cat /tmp/supervisor-status
# Log:  tail -f /path/to/disinto/supervisor/supervisor.log

# Load the shared environment. The rest of this script uses names that are not
# defined in this file (FACTORY_ROOT, PROJECT_NAME, PROJECT_REPO_ROOT,
# PRIMARY_BRANCH, WOODPECKER_REPO_ID, wpdb, codeberg_api, matrix_send, ...);
# presumably lib/env.sh provides them — TODO confirm.
# shellcheck source=/dev/null
source "$(dirname "$0")/../lib/env.sh"

LOGFILE="${FACTORY_ROOT}/supervisor/supervisor.log"
STATUSFILE="/tmp/supervisor-status"
LOCKFILE="/tmp/supervisor-poll.lock"
PROMPT_FILE="${FACTORY_ROOT}/supervisor/PROMPT.md"

# Prevent overlapping runs.
# The lock file holds the PID of the run that owns it. It is created with
# noclobber so that "does it exist?" and "create it" are a single step; the
# redirection fails if the file is already there.
if ! ( set -o noclobber; echo "$$" > "$LOCKFILE" ) 2>/dev/null; then
  LOCK_PID=$(cat "$LOCKFILE" 2>/dev/null)
  if [ -n "$LOCK_PID" ] && kill -0 "$LOCK_PID" 2>/dev/null; then
    # Another run is still alive: leave its lock alone and bow out quietly.
    exit 0
  fi
  # The recorded owner is gone (or the file was empty): take over the lock.
  rm -f "$LOCKFILE"
  echo "$$" > "$LOCKFILE"
fi
# Installed only after the lock is ours, so the early exit above never
# removes a lock that belongs to a different run.
trap 'rm -f "$LOCKFILE" "$STATUSFILE"' EXIT

#######################################
# Append one timestamped line to the supervisor log.
# Globals:   LOGFILE (read) — file the line is appended to
# Arguments: message words; joined with single spaces via "$*"
# Outputs:   "[YYYY-MM-DD HH:MM:SS UTC] message" appended to $LOGFILE
#######################################
flog() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE"
}

#######################################
# Publish the current activity and log it.
# Globals:   STATUSFILE (read) — overwritten (not appended) on every call
# Arguments: message words; joined with single spaces via "$*"
# Outputs:   one "[timestamp] supervisor: message" line in $STATUSFILE,
#            plus the same message in the log via flog
#######################################
status() {
  printf '[%s] supervisor: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" > "$STATUSFILE"
  flog "$*"
}

# ── Check for escalation replies from Matrix ──────────────────────────────
# A non-empty /tmp/supervisor-escalation-reply is consumed exactly once: its
# content is kept in ESCALATION_REPLY for the claude prompt built at the end
# of the run, and the file is removed so the next run does not see it again.
# NOTE(review): the writer of this file is not visible here, and the path is
# a fixed name in /tmp — confirm only the Matrix bridge can create it.
ESCALATION_REPLY=""
if [ -s /tmp/supervisor-escalation-reply ]; then
  ESCALATION_REPLY=$(cat /tmp/supervisor-escalation-reply)
  rm -f /tmp/supervisor-escalation-reply
  # Log only the first line; printf is used because the reply is free text
  # and could begin with something echo would treat as an option.
  flog "Got escalation reply: $(printf '%s\n' "$ESCALATION_REPLY" | head -1)"
fi

# Alerts by priority
# Each accumulator is a string of bullet entries. Entries are terminated by a
# literal backslash-n (two characters), not a real newline; the text is turned
# into real lines when the final report is assembled.
P0_ALERTS=""
P1_ALERTS=""
P2_ALERTS=""
P3_ALERTS=""
P4_ALERTS=""

# pN MESSAGE... — append "• [PN] MESSAGE" to PN_ALERTS and write "PN: MESSAGE"
# to the log via flog.
p0() { P0_ALERTS="${P0_ALERTS}• [P0] $*\n"; flog "P0: $*"; }
p1() { P1_ALERTS="${P1_ALERTS}• [P1] $*\n"; flog "P1: $*"; }
p2() { P2_ALERTS="${P2_ALERTS}• [P2] $*\n"; flog "P2: $*"; }
p3() { P3_ALERTS="${P3_ALERTS}• [P3] $*\n"; flog "P3: $*"; }
p4() { P4_ALERTS="${P4_ALERTS}• [P4] $*\n"; flog "P4: $*"; }

# Auto-fixes applied during this run, accumulated in the same entry format.
FIXES=""
# fixed MESSAGE... — append "• ✅ MESSAGE" to FIXES and write "FIXED: MESSAGE"
# to the log via flog.
fixed() { FIXES="${FIXES}• ✅ $*\n"; flog "FIXED: $*"; }

# =============================================================================
# P0: MEMORY — check first, fix first
# =============================================================================
status "P0: checking memory"

# Field 7 of the "Mem:" row and field 3 of the "Swap:" row of `free -m`
# (logged below as "avail" and "swap_used").
AVAIL_MB=$(free -m | awk '/Mem:/{print $7}')
SWAP_USED_MB=$(free -m | awk '/Swap:/{print $3}')

# Crisis = under 500MB available, or more than 3000MB of swap in use while
# under 2000MB is available. If `free` produced nothing, the defaults make
# both tests false, so an unreadable value never triggers the auto-fix.
if [ "${AVAIL_MB:-9999}" -lt 500 ] || { [ "${SWAP_USED_MB:-0}" -gt 3000 ] && [ "${AVAIL_MB:-9999}" -lt 2000 ]; }; then
  flog "MEMORY CRISIS: avail=${AVAIL_MB}MB swap_used=${SWAP_USED_MB}MB — auto-fixing"

  # Kill stale agent-spawned claude processes (>3h old) — skip interactive sessions
  # (the "claude -p" pattern only matches processes started with the -p flag).
  STALE_CLAUDES=$(pgrep -f "claude -p" --older 10800 2>/dev/null || true)
  if [ -n "$STALE_CLAUDES" ]; then
    echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true
    # pgrep prints one PID per line; join them so the entry stays on one line.
    fixed "Killed stale claude processes: ${STALE_CLAUDES//$'\n'/ }"
  fi

  # Drop filesystem caches. Only record the fix when the write succeeded.
  if sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1; then
    fixed "Dropped filesystem caches"
  else
    flog "Could not drop filesystem caches"
  fi

  # Restart Anvil if it's bloated (>1GB RSS)
  ANVIL_CONTAINER="${ANVIL_CONTAINER:-${PROJECT_NAME}-anvil-1}"
  # First whitespace-delimited token of docker's MemUsage column.
  ANVIL_RSS=$(sudo docker stats "$ANVIL_CONTAINER" --no-stream --format '{{.MemUsage}}' 2>/dev/null | grep -oP '^\S+' | head -1 || echo "0")
  # Any value reported in GiB counts as bloated.
  if echo "$ANVIL_RSS" | grep -qP '\dGiB'; then
    sudo docker restart "$ANVIL_CONTAINER" >/dev/null 2>&1 && fixed "Restarted bloated Anvil (${ANVIL_RSS})"
  fi

  # Re-check after fixes
  AVAIL_MB_AFTER=$(free -m | awk '/Mem:/{print $7}')
  SWAP_AFTER=$(free -m | awk '/Swap:/{print $3}')

  # NOTE(review): this test is stricter than the trigger above — swap >3000MB
  # alone raises P0 here regardless of available memory, and an unreadable
  # "available" value defaults to 0 (critical). Confirm that is intended.
  if [ "${AVAIL_MB_AFTER:-0}" -lt 500 ] || [ "${SWAP_AFTER:-0}" -gt 3000 ]; then
    p0 "Memory still critical after auto-fix: avail=${AVAIL_MB_AFTER}MB swap=${SWAP_AFTER}MB"
  else
    flog "Memory recovered: avail=${AVAIL_MB_AFTER}MB swap=${SWAP_AFTER}MB"
  fi
fi

# =============================================================================
# P1: DISK
# =============================================================================
status "P1: checking disk"

# Use% of the root filesystem, as a bare number.
DISK_PERCENT=$(df -h / | awk 'NR==2{print $5}' | tr -d '%')

if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
  flog "DISK PRESSURE: ${DISK_PERCENT}% — auto-cleaning"

  # Docker cleanup (safe — keeps images)
  sudo docker system prune -f >/dev/null 2>&1 && fixed "Docker prune"

  # Truncate agent and supervisor logs >10MB.
  # supervisor/ is listed because that is where this script's own LOGFILE
  # lives. A directory without *.log files leaves the pattern unexpanded,
  # which the -f test skips.
  for logfile in "${FACTORY_ROOT}"/{dev,review,factory,supervisor}/*.log; do
    [ -f "$logfile" ] || continue
    SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
    if [ "${SIZE_KB:-0}" -gt 10240 ]; then
      truncate -s 0 "$logfile"
      fixed "Truncated $(basename "$logfile") (was ${SIZE_KB}KB)"
    fi
  done

  # Clean old worktrees: /tmp/<project>-worktree-<issue> entries not modified
  # for more than 360 minutes. Paths are read NUL-delimited so unusual names
  # cannot be split or re-globbed.
  WT_REMOVED=0
  while IFS= read -r -d '' wt; do
    # Only remove if dev-agent is not running on it
    ISSUE_NUM=$(basename "$wt")
    ISSUE_NUM=${ISSUE_NUM#"${PROJECT_NAME}-worktree-"}
    # Anchor the issue number so that issue 12 does not match a running
    # "dev-agent.sh 123".
    if ! pgrep -f "dev-agent\.sh ${ISSUE_NUM}( |\$)" >/dev/null 2>&1; then
      if rm -rf -- "$wt"; then
        WT_REMOVED=1
        fixed "Removed stale worktree: $wt"
      fi
    fi
  done < <(find /tmp -mindepth 1 -maxdepth 1 -name "${PROJECT_NAME}-worktree-*" -mmin +360 -print0 2>/dev/null)
  if [ "$WT_REMOVED" -eq 1 ]; then
    # Let git forget the directories just deleted. -C avoids changing this
    # script's working directory.
    git -C "${PROJECT_REPO_ROOT}" worktree prune 2>/dev/null
  fi

  # Woodpecker log_entries cleanup
  LOG_ENTRIES_MB=$(wpdb -c "SELECT pg_size_pretty(pg_total_relation_size('log_entries'));" 2>/dev/null | xargs)
  if echo "$LOG_ENTRIES_MB" | grep -qP '\d+\s*(GB|MB)'; then
    SIZE_NUM=$(echo "$LOG_ENTRIES_MB" | grep -oP '\d+(?=\s*(GB|MB))' | head -1)
    SIZE_UNIT=$(echo "$LOG_ENTRIES_MB" | grep -oP '(GB|MB)' | head -1)
    # Trim when the table is in the GB range or above 500MB.
    if [ "$SIZE_UNIT" = "GB" ] || { [ "$SIZE_UNIT" = "MB" ] && [ "${SIZE_NUM:-0}" -gt 500 ]; }; then
      # Keep the newest 100000 ids.
      # NOTE(review): the exit status of wpdb is not checked because its
      # semantics are not visible here — the fix is recorded unconditionally.
      wpdb -c "DELETE FROM log_entries WHERE id < (SELECT max(id) - 100000 FROM log_entries);" 2>/dev/null
      fixed "Trimmed Woodpecker log_entries (was ${LOG_ENTRIES_MB})"
    fi
  fi

  DISK_AFTER=$(df -h / | awk 'NR==2{print $5}' | tr -d '%')
  if [ "${DISK_AFTER:-0}" -gt 80 ]; then
    p1 "Disk still ${DISK_AFTER}% after auto-clean"
  else
    flog "Disk recovered: ${DISK_AFTER}%"
  fi
fi

# =============================================================================
# P2: FACTORY STOPPED — CI, dev-agent, git
# =============================================================================
status "P2: checking pipeline"

# CI stuck
# Pipelines of this repo that have been 'running' for more than 1200s.
STUCK_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='running' AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;" 2>/dev/null | xargs || true)
# 2>/dev/null: a non-numeric result makes the test fail quietly.
[ "${STUCK_CI:-0}" -gt 0 ] 2>/dev/null && p2 "CI: ${STUCK_CI} pipeline(s) running >20min"

# Pipelines of this repo that have been 'pending' for more than 1800s.
PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs || true)
[ "${PENDING_CI:-0}" -gt 0 ] 2>/dev/null && p2 "CI: ${PENDING_CI} pipeline(s) pending >30min"

# Dev-agent health
DEV_LOCK="/tmp/dev-agent.lock"
if [ -f "$DEV_LOCK" ]; then
  DEV_PID=$(cat "$DEV_LOCK" 2>/dev/null)
  if ! kill -0 "$DEV_PID" 2>/dev/null; then
    # Lock left behind by a process that no longer exists.
    rm -f "$DEV_LOCK"
    fixed "Removed stale dev-agent lock (PID ${DEV_PID} dead)"
  elif DEV_STATUS_AGE=$(stat -c %Y /tmp/dev-agent-status 2>/dev/null); then
    # Agent is alive: alert when its status file has not been modified for
    # more than 30 minutes.
    NOW_EPOCH=$(date +%s)
    STATUS_AGE_MIN=$(( (NOW_EPOCH - DEV_STATUS_AGE) / 60 ))
    if [ "$STATUS_AGE_MIN" -gt 30 ]; then
      p2 "Dev-agent: status unchanged for ${STATUS_AGE_MIN}min"
    fi
  else
    # Without a status file there is no age to compute; say so instead of
    # reporting minutes since the epoch.
    p2 "Dev-agent: running (PID ${DEV_PID}) but /tmp/dev-agent-status is missing"
  fi
fi

# Git repo health
# Only touch git state after actually entering the project repo; otherwise
# the rebase/checkout commands below would act on whatever directory the
# script happens to be in.
if [ -n "${PROJECT_REPO_ROOT}" ] && cd "${PROJECT_REPO_ROOT}" 2>/dev/null; then
  # Empty when HEAD is detached (for example in the middle of a rebase).
  GIT_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
  GIT_REBASE="no"
  if [ -d .git/rebase-merge ] || [ -d .git/rebase-apply ]; then
    GIT_REBASE="yes"
  fi

  if [ "$GIT_REBASE" = "yes" ]; then
    if git rebase --abort 2>/dev/null && git checkout "${PRIMARY_BRANCH}" 2>/dev/null; then
      fixed "Aborted stale rebase, switched to ${PRIMARY_BRANCH}"
    else
      p2 "Git: stale rebase, auto-abort failed"
    fi
  # elif: the rebase branch above already attempted the checkout; GIT_BRANCH
  # was read before that and must not trigger a second attempt or message.
  elif [ "$GIT_BRANCH" != "${PRIMARY_BRANCH}" ] && [ "$GIT_BRANCH" != "unknown" ]; then
    if git checkout "${PRIMARY_BRANCH}" 2>/dev/null; then
      fixed "Switched main repo from '${GIT_BRANCH:-detached HEAD}' to ${PRIMARY_BRANCH}"
    else
      p2 "Git: on '${GIT_BRANCH:-detached HEAD}' instead of ${PRIMARY_BRANCH}"
    fi
  fi
else
  p2 "Git: cannot enter repo root '${PROJECT_REPO_ROOT}' — git health checks skipped"
fi

# =============================================================================
# P2b: FACTORY STALLED — backlog exists but no agent running
# =============================================================================
status "P2: checking pipeline stall"

# Both queries use limit=1, so each value is at most 1: they answer
# "is there any such issue?", not "how many?".
BACKLOG_COUNT=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0")
IN_PROGRESS=$(codeberg_api GET "/issues?state=open&labels=in-progress&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0")

if [ "${BACKLOG_COUNT:-0}" -gt 0 ] && [ "${IN_PROGRESS:-0}" -eq 0 ]; then
  # Backlog exists but nothing in progress — check if dev-agent ran recently
  # (the modification time of its log is used as "last ran").
  DEV_LOG="${FACTORY_ROOT}/dev/dev-agent.log"
  LAST_LOG_EPOCH=0
  if [ -f "$DEV_LOG" ]; then
    LAST_LOG_EPOCH=$(stat -c %Y "$DEV_LOG" 2>/dev/null || echo 0)
  fi

  if [ "$LAST_LOG_EPOCH" -eq 0 ]; then
    # No log (or unreadable mtime): there is no meaningful idle time to
    # report, so alert without a minutes-since-epoch figure.
    p2 "Pipeline stalled: ${BACKLOG_COUNT} backlog issue(s), no dev-agent log found at ${DEV_LOG}"
  else
    NOW_EPOCH=$(date +%s)
    IDLE_MIN=$(( (NOW_EPOCH - LAST_LOG_EPOCH) / 60 ))
    if [ "$IDLE_MIN" -gt 20 ]; then
      p2 "Pipeline stalled: ${BACKLOG_COUNT} backlog issue(s), no agent ran for ${IDLE_MIN}min"
    fi
  fi
fi

# =============================================================================
# P2c: DEV-AGENT PRODUCTIVITY — all backlog blocked for too long
# =============================================================================
status "P2: checking dev-agent productivity"

DEV_LOG_FILE="${FACTORY_ROOT}/dev/dev-agent.log"
if [ -f "$DEV_LOG_FILE" ]; then
  # Check if last 6 poll entries all report "no ready issues" (~1 hour at 10min intervals)
  # Only the last 100 log lines are searched for "poll:" entries.
  RECENT_POLLS=$(tail -100 "$DEV_LOG_FILE" | grep "poll:" | tail -6)
  # grep -c exits non-zero when the count is 0; "|| true" keeps that from
  # being treated as a failure (the printed count is still captured).
  TOTAL_RECENT=$(grep -c "." <<< "$RECENT_POLLS" || true)
  BLOCKED_IN_RECENT=$(grep -c "no ready issues" <<< "$RECENT_POLLS" || true)
  # Alert only with a full window of 6 entries, all of them blocked.
  if [ "$TOTAL_RECENT" -ge 6 ] && [ "$BLOCKED_IN_RECENT" -eq "$TOTAL_RECENT" ]; then
    p2 "Dev-agent blocked: last ${BLOCKED_IN_RECENT} polls all report 'no ready issues' — all backlog issues may be dep-blocked or have circular deps"
  fi
fi

# =============================================================================
# P3: FACTORY DEGRADED — derailed PRs, unreviewed PRs
# =============================================================================
status "P3: checking PRs"

# At most 10 open PRs are inspected per run (limit=10).
OPEN_PRS=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null | jq -r '.[].number' 2>/dev/null || true)
# Unquoted on purpose: the value is a whitespace-separated list of numbers.
for pr in $OPEN_PRS; do
  PR_JSON=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null || true)
  [ -z "$PR_JSON" ] && continue
  PR_SHA=$(jq -r '.head.sha // ""' <<< "$PR_JSON")
  [ -z "$PR_SHA" ] && continue

  CI_STATE=$(codeberg_api GET "/commits/${PR_SHA}/status" 2>/dev/null | jq -r '.state // "unknown"' 2>/dev/null || true)

  # Check for merge conflicts first (approved + CI pass but unmergeable)
  # jq's `//` operator falls through to the right-hand side for false as well
  # as null, so `.mergeable // true` can never produce "false". Compare
  # explicitly; a missing or null field still counts as mergeable.
  MERGEABLE=$(jq -r 'if .mergeable == false then "false" else "true" end' <<< "$PR_JSON")
  if [ "$MERGEABLE" = "false" ] && [ "$CI_STATE" = "success" ]; then
    p3 "PR #${pr}: CI pass but merge conflict — needs rebase"
  elif [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then
    # Failing CI that nobody has touched for more than 30 minutes.
    UPDATED=$(jq -r '.updated_at // ""' <<< "$PR_JSON")
    if [ -n "$UPDATED" ]; then
      UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0)
      NOW_EPOCH=$(date +%s)
      AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 ))
      if [ "$AGE_MIN" -gt 30 ]; then
        p3 "PR #${pr}: CI=${CI_STATE}, stale ${AGE_MIN}min"
      fi
    fi
  elif [ "$CI_STATE" = "success" ]; then
    # Check if reviewed at this SHA
    # A review is recognised by a comment containing "<!-- reviewed: <sha>";
    # only the first 50 comments are fetched.
    HAS_REVIEW=$(codeberg_api GET "/issues/${pr}/comments?limit=50" 2>/dev/null | \
      jq -r --arg sha "$PR_SHA" '[.[] | select(.body | contains("<!-- reviewed: " + $sha))] | length' 2>/dev/null || echo "0")

    if [ "${HAS_REVIEW:-0}" -eq 0 ]; then
      UPDATED=$(jq -r '.updated_at // ""' <<< "$PR_JSON")
      if [ -n "$UPDATED" ]; then
        UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0)
        NOW_EPOCH=$(date +%s)
        AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 ))
        if [ "$AGE_MIN" -gt 60 ]; then
          p3 "PR #${pr}: CI passed, no review for ${AGE_MIN}min"
          # Auto-trigger review
          # Runs in the background; its outcome is not awaited, so the
          # "fixed" entry records that the review was started, not finished.
          bash "${FACTORY_ROOT}/review/review-pr.sh" "$pr" >> "${FACTORY_ROOT}/review/review.log" 2>&1 &
          fixed "Auto-triggered review for PR #${pr}"
        fi
      fi
    fi
  fi
done

# =============================================================================
# P3b: CIRCULAR DEPENDENCIES — deadlock detection
# =============================================================================
status "P3: checking for circular dependencies"

# Up to 50 open backlog issues, as raw JSON. The same payload feeds both the
# cycle check (P3b) and the stale-dependency check (P3c) below.
BACKLOG_FOR_DEPS=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=50" 2>/dev/null || true)
if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo "$BACKLOG_FOR_DEPS" | jq 'length' 2>/dev/null || echo 0)" -gt 0 ]; then

  # Shared dependency parser; both Python snippets load it by file path and
  # call its parse_deps() on each issue body.
  PARSE_DEPS="${FACTORY_ROOT}/lib/parse-deps.py"

  # Prints one line per distinct cycle, e.g. "#1 -> #2 -> #1".
  # The Python source is single-quoted for the shell, so it must not contain
  # a single-quote character. Any failure yields an empty result.
  CYCLES=$(echo "$BACKLOG_FOR_DEPS" | python3 -c '
import sys, json, importlib.util

# Load parse-deps.py from the path given as the first argument.
spec = importlib.util.spec_from_file_location("parse_deps", sys.argv[1])
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

issues = json.load(sys.stdin)

# graph[n] = set of issue numbers that n depends on (self-references dropped).
graph = {}
for issue in issues:
    num = issue["number"]
    # "or" guards against a null body in the JSON.
    deps = [d for d in mod.parse_deps(issue.get("body") or "") if d != num]
    if deps:
        graph[num] = set(deps)

# Depth-first search with three colours: an edge into a GRAY node (one that
# is still on the current path) closes a cycle.
WHITE, GRAY, BLACK = 0, 1, 2
color = {n: WHITE for n in graph}
cycles = []

def dfs(u, path):
    color[u] = GRAY
    path.append(u)
    for v in graph.get(u, set()):
        # Dependencies that have no outgoing edges of their own in this graph
        # cannot be part of a cycle.
        if v not in color:
            continue
        if color[v] == GRAY:
            cycles.append(path[path.index(v):] + [v])
        elif color[v] == WHITE:
            dfs(v, path)
    path.pop()
    color[u] = BLACK

for node in list(graph.keys()):
    if color.get(node) == WHITE:
        dfs(node, [])

# Report each set of participating issues once.
seen = set()
for cycle in cycles:
    key = tuple(sorted(set(cycle)))
    if key not in seen:
        seen.add(key)
        print(" -> ".join(f"#{n}" for n in cycle))
' "$PARSE_DEPS" 2>/dev/null || true)

  if [ -n "$CYCLES" ]; then
    while IFS= read -r cycle; do
      [ -z "$cycle" ] && continue
      p3 "Circular dependency deadlock: ${cycle}"
    done <<< "$CYCLES"
  fi

  # ===========================================================================
  # P3c: STALE DEPENDENCIES — blocked by old open issues (>30 days)
  # ===========================================================================
  status "P3: checking for stale dependencies"

  # Prints one line per (issue, dependency) pair whose dependency is still
  # open and was created more than 30 days ago. The token and API base are
  # handed over through the environment, not the command line.
  STALE_DEPS=$(echo "$BACKLOG_FOR_DEPS" | CODEBERG_TOKEN="$CODEBERG_TOKEN" CODEBERG_API="$CODEBERG_API" python3 -c '
import sys, json, os, importlib.util
from datetime import datetime, timezone
from urllib.request import Request, urlopen

# Load parse-deps.py from the path given as the first argument.
spec = importlib.util.spec_from_file_location("parse_deps", sys.argv[1])
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)

issues = json.load(sys.stdin)
token = os.environ.get("CODEBERG_TOKEN", "")
api = os.environ.get("CODEBERG_API", "")
issue_map = {i["number"]: i for i in issues}
now = datetime.now(timezone.utc)

# Cache of dependency data so each dependency is resolved at most once.
checked = {}
for issue in issues:
    num = issue["number"]
    # "or" guards against a null body in the JSON.
    deps = [d for d in mod.parse_deps(issue.get("body") or "") if d != num]
    for dep in deps:
        if dep in checked:
            dep_data = checked[dep]
        elif dep in issue_map:
            # Already present in the backlog payload: no request needed.
            dep_data = issue_map[dep]
            checked[dep] = dep_data
        else:
            # Not in the backlog payload: fetch it. A dependency that cannot
            # be fetched is skipped (best effort).
            try:
                req = Request(f"{api}/issues/{dep}",
                              headers={"Authorization": f"token {token}"})
                with urlopen(req, timeout=5) as resp:
                    dep_data = json.loads(resp.read())
                checked[dep] = dep_data
            except Exception:
                continue
        if dep_data.get("state") != "open":
            continue
        created = dep_data.get("created_at", "")
        try:
            created_dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
            age_days = (now - created_dt).days
            if age_days > 30:
                dep_title = dep_data.get("title", "")[:50]
                print(f"#{num} blocked by #{dep} \"{dep_title}\" (open {age_days} days)")
        except Exception:
            # Unparseable or missing creation date: nothing to report.
            pass
' "$PARSE_DEPS" 2>/dev/null || true)

  if [ -n "$STALE_DEPS" ]; then
    while IFS= read -r stale; do
      [ -z "$stale" ] && continue
      p3 "Stale dependency: ${stale}"
    done <<< "$STALE_DEPS"
  fi
fi

# =============================================================================
# P4: HOUSEKEEPING — stale processes
# =============================================================================
# Check for dev-agent escalations
# One line of the JSONL file is counted as one escalated issue. Note that
# these are raised at P3, not P4.
ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations.jsonl"
if [ -s "$ESCALATION_FILE" ]; then
  ESCALATION_COUNT=$(wc -l < "$ESCALATION_FILE")
  p3 "Dev-agent escalated ${ESCALATION_COUNT} issue(s) — see ${ESCALATION_FILE}"
fi

status "P4: housekeeping"

# Stale agent-spawned claude processes (>3h, not caught by P0) — skip interactive sessions
STALE_CLAUDES=$(pgrep -f "claude -p" --older 10800 2>/dev/null || true)
if [ -n "$STALE_CLAUDES" ]; then
  echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true
  # Report how many PIDs were signalled, not the PIDs themselves.
  fixed "Killed stale claude processes: $(wc -w <<< "$STALE_CLAUDES") procs"
fi

# Clean stale git worktrees (>2h, no active agent)
NOW_TS=$(date +%s)
# Only the variable part is quoted so the trailing * still globs. A pattern
# that matches nothing stays literal and is rejected by the -d test.
for wt in "/tmp/${PROJECT_NAME}"-worktree-* "/tmp/${PROJECT_NAME}"-review-*; do
  [ -d "$wt" ] || continue
  WT_AGE_MIN=$(( (NOW_TS - $(stat -c %Y "$wt")) / 60 ))
  [ "$WT_AGE_MIN" -gt 120 ] || continue
  # Skip if an agent is still using it (any process whose command line
  # mentions the directory name).
  WT_BASE=$(basename "$wt")
  if pgrep -f -- "$WT_BASE" >/dev/null 2>&1; then
    continue
  fi
  # A failed removal is ignored on purpose (best effort).
  if git -C "$PROJECT_REPO_ROOT" worktree remove --force "$wt" 2>/dev/null; then
    fixed "Removed stale worktree: $wt (${WT_AGE_MIN}min old)"
  fi
done
git -C "$PROJECT_REPO_ROOT" worktree prune 2>/dev/null || true

# Rotate agent and supervisor logs if >5MB.
# supervisor/ is listed because that is where this script's own LOGFILE
# lives. Only one previous generation is kept: an existing <name>.log.old is
# overwritten by mv. Writers that append (as flog does) start a fresh file
# on their next write.
for logfile in "${FACTORY_ROOT}"/{dev,review,factory,supervisor}/*.log; do
  # A directory without *.log files leaves the pattern unexpanded; skip it.
  [ -f "$logfile" ] || continue
  SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
  if [ "${SIZE_KB:-0}" -gt 5120 ]; then
    # Record the rotation only when the rename actually happened.
    if mv "$logfile" "${logfile}.old" 2>/dev/null; then
      fixed "Rotated $(basename "$logfile")"
    fi
  fi
done

# =============================================================================
# RESULT
# =============================================================================

#######################################
# Turn the accumulator format into plain text.
# Arguments: $1 - text whose entries end in a literal backslash-n
# Outputs:   the text with each backslash-n replaced by a real newline.
#            Nothing else is interpreted, so escapes that arrive inside
#            alert text (for example from an issue title) stay literal.
#######################################
expand_entries() {
  local text=$1
  local nl=$'\n'
  text=${text//\\n/$nl}
  printf '%s' "$text"
}

ALL_ALERTS="${P0_ALERTS}${P1_ALERTS}${P2_ALERTS}${P3_ALERTS}${P4_ALERTS}"

if [ -n "$ALL_ALERTS" ]; then
  # Command substitution drops the trailing newline left by the last entry.
  ALERT_TEXT=$(expand_entries "$ALL_ALERTS")
  FIXES_TEXT=$(expand_entries "${FIXES:-None}")

  # Notify Matrix
  matrix_send "supervisor" "⚠️ Supervisor alerts:
${ALERT_TEXT}" 2>/dev/null || true

  flog "Invoking claude -p for alerts"

  # Optional prompt section, present only when a human reply was picked up
  # at the start of this run. The leading newline is part of the section.
  ESCALATION_SECTION=""
  if [ -n "$ESCALATION_REPLY" ]; then
    ESCALATION_SECTION="
## Human Response to Previous Escalation
${ESCALATION_REPLY}

Act on this response."
  fi

  # Prompt = PROMPT.md (or a one-line fallback if it cannot be read), then
  # the alerts, the fixes already applied, and a snapshot of system state.
  CLAUDE_PROMPT="$(cat "$PROMPT_FILE" 2>/dev/null || echo "You are a supervisor agent. Fix the issue below.")

## Current Alerts
${ALERT_TEXT}

## Auto-fixes already applied by bash
${FIXES_TEXT}

## System State
RAM: $(free -m | awk '/Mem:/{printf "avail=%sMB", $7}') $(free -m | awk '/Swap:/{printf "swap=%sMB", $3}')
Disk: $(df -h / | awk 'NR==2{printf "%s used of %s (%s)", $3, $2, $5}')
Docker: $(sudo docker ps --format '{{.Names}}' 2>/dev/null | wc -l) containers running
Claude procs: $(pgrep -f "claude" 2>/dev/null | wc -l)

${ESCALATION_SECTION}

Fix what you can. Escalate what you can't. Read the relevant best-practices file first."

  # Hard 300s limit; a timeout or non-zero exit is tolerated and whatever
  # output was produced is still logged (last 20 lines only).
  CLAUDE_OUTPUT=$(timeout 300 claude -p --model sonnet --dangerously-skip-permissions \
    "$CLAUDE_PROMPT" 2>&1) || true
  flog "claude output: $(printf '%s\n' "$CLAUDE_OUTPUT" | tail -20)"
  status "claude responded"
else
  # Nothing to alert on; still leave a trace of any fixes that were applied.
  if [ -n "$FIXES" ]; then
    flog "Housekeeping: $(expand_entries "$FIXES")"
  fi
  status "all clear"
fi