refactor: split supervisor into infra + per-project, make poll scripts config-driven

Supervisor split (#26):
- Layer 1 (infra): P0 memory, P1 disk, P4 housekeeping — runs once, project-agnostic
- Layer 2 (per-project): P2 CI/dev-agent, P3 PRs/deps — iterates projects/*.toml
- Adding a new project requires only a new TOML file, no code changes

Poll scripts accept project TOML arg (#27):
- dev-poll.sh, review-poll.sh, gardener-poll.sh accept optional project TOML as $1
- env.sh loads PROJECT_TOML if set, overriding .env defaults
- Cron: `dev-poll.sh projects/versi.toml` targets that project

New files:
- lib/load-project.sh: TOML to env var loader (Python tomllib)
- projects/versi.toml: current project config extracted from .env

Backwards compatible: scripts without a TOML arg fall back to .env config.

Closes #26, Closes #27

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
johba 2026-03-17 08:57:18 +01:00
parent e503273fba
commit 9050413994
7 changed files with 438 additions and 297 deletions

View file

@ -9,11 +9,14 @@
# 1. Orphaned "in-progress" issues (agent died or PR needs attention)
# 2. Ready "backlog" issues (all deps merged)
#
# Usage: cron every 10min
# Usage:
# cron every 10min
# dev-poll.sh [projects/harb.toml] # optional project config
set -euo pipefail
# Load shared environment
# Load shared environment (with optional project TOML override)
export PROJECT_TOML="${1:-}"
source "$(dirname "$0")/../lib/env.sh"

View file

@ -30,6 +30,9 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
# Load shared environment (with optional project TOML override)
# Usage: gardener-poll.sh [projects/harb.toml]
export PROJECT_TOML="${1:-}"
# shellcheck source=../lib/env.sh
source "$FACTORY_ROOT/lib/env.sh"

View file

@ -19,6 +19,11 @@ fi
export PATH="${HOME}/.local/bin:${HOME}/.foundry/bin:${HOME}/.nvm/versions/node/v22.20.0/bin:/usr/local/bin:/usr/bin:/bin:${PATH}"
export HOME="${HOME:-/home/debian}"
# Load project TOML if PROJECT_TOML is set (by poll scripts that accept a
# project arg). An empty/unset PROJECT_TOML keeps the plain .env config.
if [ -n "${PROJECT_TOML:-}" ]; then
  # The caller's working directory is not necessarily the factory checkout
  # (e.g. when started from cron), so a relative path such as
  # "projects/versi.toml" is also tried relative to FACTORY_ROOT.
  if [ ! -f "$PROJECT_TOML" ] && [ -f "${FACTORY_ROOT}/${PROJECT_TOML}" ]; then
    export PROJECT_TOML="${FACTORY_ROOT}/${PROJECT_TOML}"
  fi
  if [ ! -f "$PROJECT_TOML" ]; then
    # A project config that was explicitly requested but cannot be found must
    # not silently fall back to .env: the agent would act on another project.
    echo "ERROR: project TOML not found: ${PROJECT_TOML}" >&2
    return 1 2>/dev/null || exit 1
  fi
  source "${FACTORY_ROOT}/lib/load-project.sh" "$PROJECT_TOML"
fi
# Codeberg token: env var > ~/.netrc
if [ -z "${CODEBERG_TOKEN:-}" ]; then
CODEBERG_TOKEN="$(awk '/codeberg.org/{getline;getline;print $2}' ~/.netrc 2>/dev/null || true)"

83
lib/load-project.sh Executable file
View file

@ -0,0 +1,83 @@
#!/usr/bin/env bash
# load-project.sh — Load project config from a TOML file into env vars
#
# Usage (source, don't execute):
#   source lib/load-project.sh projects/harb.toml
#
# Exports:
#   PROJECT_NAME, CODEBERG_REPO, CODEBERG_API, PROJECT_REPO_ROOT,
#   PRIMARY_BRANCH, WOODPECKER_REPO_ID, PROJECT_CONTAINERS,
#   CHECK_PRS, CHECK_DEV_AGENT, CHECK_PIPELINE_STALL, CI_STALE_MINUTES
#
# PROJECT_NAME and CODEBERG_REPO are always exported (empty when the TOML
# omits name/repo). Every other key is exported only when present in the
# TOML, so a value already in the environment is kept otherwise.
# CODEBERG_API is derived from a non-empty CODEBERG_REPO; PROJECT_REPO_ROOT is
# derived from PROJECT_NAME only when it is still empty after loading.
#
# If no argument is given (or the file does not exist), does nothing (allows
# poll scripts to work with plain .env fallback for backwards compatibility).
#
# Returns non-zero (after a WARNING on stderr) when the TOML cannot be parsed.
# Requires python3 with the tomllib module. Leaves the helper function
# _load_project_toml defined in the sourcing shell.

# _load_project_toml <toml-path> — parse the file and export its settings.
_load_project_toml() {
  local toml="${1:-}"
  if [ -z "$toml" ] || [ ! -f "$toml" ]; then
    return 0
  fi

  # Python prints one KEY=value line per setting. Diagnostics go to stderr and
  # are deliberately not suppressed, so a broken TOML is easy to locate.
  local py_emit='
import sys

try:
    import tomllib
except ImportError as exc:
    sys.exit(f"load-project: python3 tomllib module unavailable: {exc}")

try:
    with open(sys.argv[1], "rb") as fh:
        cfg = tomllib.load(fh)
except (OSError, tomllib.TOMLDecodeError) as exc:
    sys.exit(f"load-project: {sys.argv[1]}: {exc}")


def emit(key, val):
    if isinstance(val, bool):
        text = str(val).lower()
    elif isinstance(val, list):
        text = " ".join(str(v) for v in val)
    else:
        text = str(val)
    # The shell side reads one assignment per line; an embedded line break
    # would be taken for an additional KEY=value pair.
    if "\n" in text or "\r" in text:
        sys.exit(f"load-project: {key}: multi-line values are not supported")
    print(f"{key}={text}")


# Top-level
emit("PROJECT_NAME", cfg.get("name", ""))
emit("CODEBERG_REPO", cfg.get("repo", ""))
if "repo_root" in cfg:
    emit("PROJECT_REPO_ROOT", cfg["repo_root"])
if "primary_branch" in cfg:
    emit("PRIMARY_BRANCH", cfg["primary_branch"])

# [ci] section
ci = cfg.get("ci", {})
if "woodpecker_repo_id" in ci:
    emit("WOODPECKER_REPO_ID", ci["woodpecker_repo_id"])
if "stale_minutes" in ci:
    emit("CI_STALE_MINUTES", ci["stale_minutes"])

# [services] section
svc = cfg.get("services", {})
if "containers" in svc:
    emit("PROJECT_CONTAINERS", svc["containers"])

# [monitoring] section
mon = cfg.get("monitoring", {})
for key in ["check_prs", "check_dev_agent", "check_pipeline_stall"]:
    if key in mon:
        emit(key.upper(), mon[key])
'

  local assignments
  if ! assignments=$(python3 -c "$py_emit" "$toml"); then
    echo "WARNING: failed to parse project TOML: $toml" >&2
    return 1
  fi

  # Export parsed variables. Split on the first "=" with parameter expansion:
  # `IFS== read key val` would drop a single trailing "=" from the value.
  local line key
  while IFS= read -r line; do
    case "$line" in
      *=*) ;;
      *) continue ;;
    esac
    key="${line%%=*}"
    if [ -z "$key" ]; then
      continue
    fi
    export "${key}=${line#*=}"
  done <<< "$assignments"

  # Derive CODEBERG_API if repo changed
  if [ -n "${CODEBERG_REPO:-}" ]; then
    export CODEBERG_API="https://codeberg.org/api/v1/repos/${CODEBERG_REPO}"
  fi

  # Derive PROJECT_REPO_ROOT if not explicitly set. USER may be missing from
  # the environment; fall back to `id -un` so callers using `set -u` do not
  # abort on an unbound variable.
  if [ -z "${PROJECT_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" ]; then
    export PROJECT_REPO_ROOT="/home/${USER:-$(id -un)}/${PROJECT_NAME}"
  fi
}

# `return` works when sourced; `exit` covers direct execution.
_load_project_toml "${1:-}" || return 1 2>/dev/null || exit 1

21
projects/versi.toml Normal file
View file

@ -0,0 +1,21 @@
# projects/versi.toml — Project config for johba/versi
#
# This file defines project-specific settings for disinto agents.
# Drop a new TOML file here to add another project — no code changes needed.
name = "versi"
repo = "johba/versi"
repo_root = "/home/admin/versi"
primary_branch = "main"
[ci]
woodpecker_repo_id = 3
stale_minutes = 60
[services]
containers = []
[monitoring]
check_prs = true
check_dev_agent = true
check_pipeline_stall = true

View file

@ -6,7 +6,9 @@
set -euo pipefail
# Load shared environment
# Load shared environment (with optional project TOML override)
# Usage: review-poll.sh [projects/harb.toml]
export PROJECT_TOML="${1:-}"
source "$(dirname "$0")/../lib/env.sh"

View file

@ -1,8 +1,11 @@
#!/usr/bin/env bash
# supervisor-poll.sh — Supervisor agent: bash checks + claude -p for fixes
#
# Runs every 10min via cron. Does all health checks in bash (zero tokens).
# Only invokes claude -p when auto-fix fails or issue is complex.
# Two-layer architecture:
# 1. Factory infrastructure (project-agnostic): RAM, disk, swap, docker, stale processes
# 2. Per-project checks (config-driven): CI, PRs, dev-agent, deps — iterated over projects/*.toml
#
# Runs every 10min via cron.
#
# Cron: */10 * * * * /path/to/disinto/supervisor/supervisor-poll.sh
#
@ -15,6 +18,7 @@ LOGFILE="${FACTORY_ROOT}/supervisor/supervisor.log"
STATUSFILE="/tmp/supervisor-status"
LOCKFILE="/tmp/supervisor-poll.lock"
PROMPT_FILE="${FACTORY_ROOT}/supervisor/PROMPT.md"
PROJECTS_DIR="${FACTORY_ROOT}/projects"
# Prevent overlapping runs
if [ -f "$LOCKFILE" ]; then
@ -60,6 +64,11 @@ p4() { P4_ALERTS="${P4_ALERTS}• [P4] $*\n"; flog "P4: $*"; }
FIXES=""
fixed() { FIXES="${FIXES}• ✅ $*\n"; flog "FIXED: $*"; }
# #############################################################################
# LAYER 1: FACTORY INFRASTRUCTURE
# (project-agnostic, runs once)
# #############################################################################
# =============================================================================
# P0: MEMORY — check first, fix first
# =============================================================================
@ -82,13 +91,6 @@ if [ "${AVAIL_MB:-9999}" -lt 500 ] || { [ "${SWAP_USED_MB:-0}" -gt 3000 ] && [ "
sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1
fixed "Dropped filesystem caches"
# Restart Anvil if it's bloated (>1GB RSS)
ANVIL_CONTAINER="${ANVIL_CONTAINER:-${PROJECT_NAME}-anvil-1}"
ANVIL_RSS=$(sudo docker stats "$ANVIL_CONTAINER" --no-stream --format '{{.MemUsage}}' 2>/dev/null | grep -oP '^\S+' | head -1 || echo "0")
if echo "$ANVIL_RSS" | grep -qP '\dGiB'; then
sudo docker restart "$ANVIL_CONTAINER" >/dev/null 2>&1 && fixed "Restarted bloated Anvil (${ANVIL_RSS})"
fi
# Re-check after fixes
AVAIL_MB_AFTER=$(free -m | awk '/Mem:/{print $7}')
SWAP_AFTER=$(free -m | awk '/Swap:/{print $3}')
@ -113,8 +115,8 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
# Docker cleanup (safe — keeps images)
sudo docker system prune -f >/dev/null 2>&1 && fixed "Docker prune"
# Truncate supervisor logs >10MB
for logfile in "${FACTORY_ROOT}"/{dev,review,factory}/*.log; do
# Truncate logs >10MB
for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
if [ -f "$logfile" ]; then
SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
if [ "${SIZE_KB:-0}" -gt 10240 ]; then
@ -124,19 +126,6 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
fi
done
# Clean old worktrees
IDLE_WORKTREES=$(find /tmp/${PROJECT_NAME}-worktree-* -maxdepth 0 -mmin +360 2>/dev/null || true)
if [ -n "$IDLE_WORKTREES" ]; then
cd "${PROJECT_REPO_ROOT}" && git worktree prune 2>/dev/null
for wt in $IDLE_WORKTREES; do
# Only remove if dev-agent is not running on it
ISSUE_NUM=$(basename "$wt" | sed "s/${PROJECT_NAME}-worktree-//")
if ! pgrep -f "dev-agent.sh ${ISSUE_NUM}" >/dev/null 2>&1; then
rm -rf "$wt" && fixed "Removed stale worktree: $wt"
fi
done
fi
# Woodpecker log_entries cleanup
LOG_ENTRIES_MB=$(wpdb -c "SELECT pg_size_pretty(pg_total_relation_size('log_entries'));" 2>/dev/null | xargs)
if echo "$LOG_ENTRIES_MB" | grep -qP '\d+\s*(GB|MB)'; then
@ -157,60 +146,104 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
fi
# =============================================================================
# P2: FACTORY STOPPED — CI, dev-agent, git
# P4-INFRA: HOUSEKEEPING — stale processes, log rotation (project-agnostic)
# =============================================================================
status "P2: checking pipeline"
status "P4: infra housekeeping"
# Stale agent-spawned claude processes (>3h) — skip interactive sessions
STALE_CLAUDES=$(pgrep -f "claude -p" --older 10800 2>/dev/null || true)
if [ -n "$STALE_CLAUDES" ]; then
echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true
fixed "Killed stale claude processes: $(echo $STALE_CLAUDES | wc -w) procs"
fi
# Rotate logs >5MB
for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
if [ -f "$logfile" ]; then
SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
if [ "${SIZE_KB:-0}" -gt 5120 ]; then
mv "$logfile" "${logfile}.old" 2>/dev/null
fixed "Rotated $(basename "$logfile")"
fi
fi
done
# Check for dev-agent escalations
ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations.jsonl"
if [ -s "$ESCALATION_FILE" ]; then
ESCALATION_COUNT=$(wc -l < "$ESCALATION_FILE")
p3 "Dev-agent escalated ${ESCALATION_COUNT} issue(s) — see ${ESCALATION_FILE}"
fi
# #############################################################################
# LAYER 2: PER-PROJECT CHECKS
# (iterated over projects/*.toml, config-driven)
# #############################################################################
# Function: run all per-project checks for the currently loaded project config
check_project() {
local proj_name="${PROJECT_NAME:-unknown}"
flog "── checking project: ${proj_name} (${CODEBERG_REPO}) ──"
# ===========================================================================
# P2: FACTORY STOPPED — CI, dev-agent, git
# ===========================================================================
status "P2: ${proj_name}: checking pipeline"
# CI stuck
STUCK_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='running' AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;" 2>/dev/null | xargs || true)
[ "${STUCK_CI:-0}" -gt 0 ] 2>/dev/null && p2 "CI: ${STUCK_CI} pipeline(s) running >20min"
[ "${STUCK_CI:-0}" -gt 0 ] 2>/dev/null && p2 "${proj_name}: CI: ${STUCK_CI} pipeline(s) running >20min"
PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs || true)
[ "${PENDING_CI:-0}" -gt 0 ] && p2 "CI: ${PENDING_CI} pipeline(s) pending >30min"
[ "${PENDING_CI:-0}" -gt 0 ] && p2 "${proj_name}: CI: ${PENDING_CI} pipeline(s) pending >30min"
# Dev-agent health
# Dev-agent health (only if monitoring enabled)
if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
DEV_LOCK="/tmp/dev-agent.lock"
if [ -f "$DEV_LOCK" ]; then
DEV_PID=$(cat "$DEV_LOCK" 2>/dev/null)
if ! kill -0 "$DEV_PID" 2>/dev/null; then
rm -f "$DEV_LOCK"
fixed "Removed stale dev-agent lock (PID ${DEV_PID} dead)"
fixed "${proj_name}: Removed stale dev-agent lock (PID ${DEV_PID} dead)"
else
DEV_STATUS_AGE=$(stat -c %Y /tmp/dev-agent-status 2>/dev/null || echo 0)
NOW_EPOCH=$(date +%s)
STATUS_AGE_MIN=$(( (NOW_EPOCH - DEV_STATUS_AGE) / 60 ))
if [ "$STATUS_AGE_MIN" -gt 30 ]; then
p2 "Dev-agent: status unchanged for ${STATUS_AGE_MIN}min"
p2 "${proj_name}: Dev-agent: status unchanged for ${STATUS_AGE_MIN}min"
fi
fi
fi
fi
# Git repo health
if [ -d "${PROJECT_REPO_ROOT}" ]; then
cd "${PROJECT_REPO_ROOT}" 2>/dev/null || true
GIT_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
GIT_REBASE=$([ -d .git/rebase-merge ] || [ -d .git/rebase-apply ] && echo "yes" || echo "no")
if [ "$GIT_REBASE" = "yes" ]; then
git rebase --abort 2>/dev/null && git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \
fixed "Aborted stale rebase, switched to ${PRIMARY_BRANCH}" || \
p2 "Git: stale rebase, auto-abort failed"
fixed "${proj_name}: Aborted stale rebase, switched to ${PRIMARY_BRANCH}" || \
p2 "${proj_name}: Git: stale rebase, auto-abort failed"
fi
if [ "$GIT_BRANCH" != "${PRIMARY_BRANCH}" ] && [ "$GIT_BRANCH" != "unknown" ]; then
git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \
fixed "Switched main repo from '${GIT_BRANCH}' to ${PRIMARY_BRANCH}" || \
p2 "Git: on '${GIT_BRANCH}' instead of ${PRIMARY_BRANCH}"
fixed "${proj_name}: Switched repo from '${GIT_BRANCH}' to ${PRIMARY_BRANCH}" || \
p2 "${proj_name}: Git: on '${GIT_BRANCH}' instead of ${PRIMARY_BRANCH}"
fi
fi
# =============================================================================
# ===========================================================================
# P2b: FACTORY STALLED — backlog exists but no agent running
# =============================================================================
status "P2: checking pipeline stall"
# ===========================================================================
if [ "${CHECK_PIPELINE_STALL:-true}" = "true" ]; then
status "P2: ${proj_name}: checking pipeline stall"
BACKLOG_COUNT=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0")
IN_PROGRESS=$(codeberg_api GET "/issues?state=open&labels=in-progress&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0")
if [ "${BACKLOG_COUNT:-0}" -gt 0 ] && [ "${IN_PROGRESS:-0}" -eq 0 ]; then
# Backlog exists but nothing in progress — check if dev-agent ran recently
DEV_LOG="${FACTORY_ROOT}/dev/dev-agent.log"
if [ -f "$DEV_LOG" ]; then
LAST_LOG_EPOCH=$(stat -c %Y "$DEV_LOG" 2>/dev/null || echo 0)
@ -221,30 +254,33 @@ if [ "${BACKLOG_COUNT:-0}" -gt 0 ] && [ "${IN_PROGRESS:-0}" -eq 0 ]; then
IDLE_MIN=$(( (NOW_EPOCH - LAST_LOG_EPOCH) / 60 ))
if [ "$IDLE_MIN" -gt 20 ]; then
p2 "Pipeline stalled: ${BACKLOG_COUNT} backlog issue(s), no agent ran for ${IDLE_MIN}min"
p2 "${proj_name}: Pipeline stalled: ${BACKLOG_COUNT} backlog issue(s), no agent ran for ${IDLE_MIN}min"
fi
fi
fi
# =============================================================================
# ===========================================================================
# P2c: DEV-AGENT PRODUCTIVITY — all backlog blocked for too long
# =============================================================================
status "P2: checking dev-agent productivity"
# ===========================================================================
if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
status "P2: ${proj_name}: checking dev-agent productivity"
DEV_LOG_FILE="${FACTORY_ROOT}/dev/dev-agent.log"
if [ -f "$DEV_LOG_FILE" ]; then
# Check if last 6 poll entries all report "no ready issues" (~1 hour at 10min intervals)
RECENT_POLLS=$(tail -100 "$DEV_LOG_FILE" | grep "poll:" | tail -6)
TOTAL_RECENT=$(echo "$RECENT_POLLS" | grep -c "." || true)
BLOCKED_IN_RECENT=$(echo "$RECENT_POLLS" | grep -c "no ready issues" || true)
if [ "$TOTAL_RECENT" -ge 6 ] && [ "$BLOCKED_IN_RECENT" -eq "$TOTAL_RECENT" ]; then
p2 "Dev-agent blocked: last ${BLOCKED_IN_RECENT} polls all report 'no ready issues' — all backlog issues may be dep-blocked or have circular deps"
p2 "${proj_name}: Dev-agent blocked: last ${BLOCKED_IN_RECENT} polls all report 'no ready issues'"
fi
fi
fi
# =============================================================================
# ===========================================================================
# P3: FACTORY DEGRADED — derailed PRs, unreviewed PRs
# =============================================================================
status "P3: checking PRs"
# ===========================================================================
if [ "${CHECK_PRS:-true}" = "true" ]; then
status "P3: ${proj_name}: checking PRs"
OPEN_PRS=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null | jq -r '.[].number' 2>/dev/null || true)
for pr in $OPEN_PRS; do
@ -255,20 +291,18 @@ for pr in $OPEN_PRS; do
CI_STATE=$(codeberg_api GET "/commits/${PR_SHA}/status" 2>/dev/null | jq -r '.state // "unknown"' 2>/dev/null || true)
# Check for merge conflicts first (approved + CI pass but unmergeable)
MERGEABLE=$(echo "$PR_JSON" | jq -r '.mergeable // true')
if [ "$MERGEABLE" = "false" ] && [ "$CI_STATE" = "success" ]; then
p3 "PR #${pr}: CI pass but merge conflict — needs rebase"
p3 "${proj_name}: PR #${pr}: CI pass but merge conflict — needs rebase"
elif [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then
UPDATED=$(echo "$PR_JSON" | jq -r '.updated_at // ""')
if [ -n "$UPDATED" ]; then
UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0)
NOW_EPOCH=$(date +%s)
AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 ))
[ "$AGE_MIN" -gt 30 ] && p3 "PR #${pr}: CI=${CI_STATE}, stale ${AGE_MIN}min"
[ "$AGE_MIN" -gt 30 ] && p3 "${proj_name}: PR #${pr}: CI=${CI_STATE}, stale ${AGE_MIN}min"
fi
elif [ "$CI_STATE" = "success" ]; then
# Check if reviewed at this SHA
HAS_REVIEW=$(codeberg_api GET "/issues/${pr}/comments?limit=50" 2>/dev/null | \
jq -r --arg sha "$PR_SHA" '[.[] | select(.body | contains("<!-- reviewed: " + $sha))] | length' 2>/dev/null || echo "0")
@ -279,20 +313,20 @@ for pr in $OPEN_PRS; do
NOW_EPOCH=$(date +%s)
AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 ))
if [ "$AGE_MIN" -gt 60 ]; then
p3 "PR #${pr}: CI passed, no review for ${AGE_MIN}min"
# Auto-trigger review
p3 "${proj_name}: PR #${pr}: CI passed, no review for ${AGE_MIN}min"
bash "${FACTORY_ROOT}/review/review-pr.sh" "$pr" >> "${FACTORY_ROOT}/review/review.log" 2>&1 &
fixed "Auto-triggered review for PR #${pr}"
fixed "${proj_name}: Auto-triggered review for PR #${pr}"
fi
fi
fi
fi
done
fi
# =============================================================================
# ===========================================================================
# P3b: CIRCULAR DEPENDENCIES — deadlock detection
# =============================================================================
status "P3: checking for circular dependencies"
# ===========================================================================
status "P3: ${proj_name}: checking for circular dependencies"
BACKLOG_FOR_DEPS=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=50" 2>/dev/null || true)
if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo "$BACKLOG_FOR_DEPS" | jq 'length' 2>/dev/null || echo 0)" -gt 0 ]; then
@ -300,7 +334,6 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
PARSE_DEPS="${FACTORY_ROOT}/lib/parse-deps.sh"
ISSUE_COUNT=$(echo "$BACKLOG_FOR_DEPS" | jq 'length')
# Build dep graph: DEPS_OF[issue_num]="dep1 dep2 ..."
declare -A DEPS_OF
declare -A BACKLOG_NUMS
for i in $(seq 0 $((ISSUE_COUNT - 1))); do
@ -311,7 +344,6 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
BACKLOG_NUMS[$NUM]=1
done
# DFS cycle detection using color marking (0=white, 1=gray, 2=black)
declare -A NODE_COLOR
for node in "${!DEPS_OF[@]}"; do NODE_COLOR[$node]=0; done
@ -322,13 +354,11 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
local node="$1" path="$2"
NODE_COLOR[$node]=1
for dep in ${DEPS_OF[$node]:-}; do
[ -z "${NODE_COLOR[$dep]+x}" ] && continue # not in graph
[ -z "${NODE_COLOR[$dep]+x}" ] && continue
if [ "${NODE_COLOR[$dep]}" = "1" ]; then
# Cycle found — normalize for dedup
local cycle_key=$(echo "$path $dep" | tr ' ' '\n' | sort -n | tr '\n' ' ')
if [ -z "${SEEN_CYCLES[$cycle_key]+x}" ]; then
SEEN_CYCLES[$cycle_key]=1
# Extract cycle portion from path (from $dep onward)
local in_cycle=0 cycle_str=""
for p in $path $dep; do
[ "$p" = "$dep" ] && in_cycle=1
@ -350,22 +380,20 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
if [ -n "$FOUND_CYCLES" ]; then
  # Report each detected cycle as a P3 alert. The loop is fed through process
  # substitution instead of a pipe: a piped `while` runs in a subshell, so any
  # shell variable p3 updates would be discarded when the pipeline ends.
  # NOTE(review): p3 is assumed to accumulate into P3_ALERTS like the visible
  # p4 helper does with P4_ALERTS — confirm against its definition.
  while IFS= read -r cycle; do
    [ -z "$cycle" ] && continue
    p3 "${proj_name}: Circular dependency deadlock: ${cycle}"
  done < <(echo -e "$FOUND_CYCLES")
fi
# ===========================================================================
# =========================================================================
# P3c: STALE DEPENDENCIES — blocked by old open issues (>30 days)
# ===========================================================================
status "P3: checking for stale dependencies"
# =========================================================================
status "P3: ${proj_name}: checking for stale dependencies"
NOW_EPOCH=$(date +%s)
THIRTY_DAYS=$((30 * 86400))
declare -A DEP_CACHE
for issue_num in "${!DEPS_OF[@]}"; do
for dep in ${DEPS_OF[$issue_num]}; do
# Check cache first
if [ -n "${DEP_CACHE[$dep]+x}" ]; then
DEP_INFO="${DEP_CACHE[$dep]}"
else
@ -389,7 +417,7 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
CREATED_EPOCH=$(date -d "$DEP_CREATED" +%s 2>/dev/null || echo 0)
AGE_DAYS=$(( (NOW_EPOCH - CREATED_EPOCH) / 86400 ))
if [ "$AGE_DAYS" -gt 30 ]; then
p3 "Stale dependency: #${issue_num} blocked by #${dep} \"${DEP_TITLE}\" (open ${AGE_DAYS} days)"
p3 "${proj_name}: Stale dependency: #${issue_num} blocked by #${dep} \"${DEP_TITLE}\" (open ${AGE_DAYS} days)"
fi
done
done
@ -397,55 +425,51 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
unset DEPS_OF BACKLOG_NUMS NODE_COLOR SEEN_CYCLES DEP_CACHE
fi
# =============================================================================
# P4: HOUSEKEEPING — stale processes
# =============================================================================
# Check for dev-agent escalations
ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations.jsonl"
if [ -s "$ESCALATION_FILE" ]; then
ESCALATION_COUNT=$(wc -l < "$ESCALATION_FILE")
p3 "Dev-agent escalated ${ESCALATION_COUNT} issue(s) — see ${ESCALATION_FILE}"
fi
status "P4: housekeeping"
# Stale agent-spawned claude processes (>3h, not caught by P0) — skip interactive sessions
STALE_CLAUDES=$(pgrep -f "claude -p" --older 10800 2>/dev/null || true)
if [ -n "$STALE_CLAUDES" ]; then
echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true
fixed "Killed stale claude processes: $(echo $STALE_CLAUDES | wc -w) procs"
fi
# Clean stale git worktrees (>2h, no active agent)
# ===========================================================================
# P4-PROJECT: Clean stale worktrees for this project
# ===========================================================================
NOW_TS=$(date +%s)
for wt in /tmp/${PROJECT_NAME}-worktree-* /tmp/${PROJECT_NAME}-review-*; do
[ -d "$wt" ] || continue
WT_AGE_MIN=$(( (NOW_TS - $(stat -c %Y "$wt")) / 60 ))
if [ "$WT_AGE_MIN" -gt 120 ]; then
# Skip if an agent is still using it
WT_BASE=$(basename "$wt")
if ! pgrep -f "$WT_BASE" >/dev/null 2>&1; then
git -C "$PROJECT_REPO_ROOT" worktree remove --force "$wt" 2>/dev/null && \
fixed "Removed stale worktree: $wt (${WT_AGE_MIN}min old)" || true
fixed "${proj_name}: Removed stale worktree: $wt (${WT_AGE_MIN}min old)" || true
fi
fi
done
git -C "$PROJECT_REPO_ROOT" worktree prune 2>/dev/null || true
}
# Rotate supervisor log if >5MB
for logfile in "${FACTORY_ROOT}"/{dev,review,factory}/*.log; do
if [ -f "$logfile" ]; then
SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
if [ "${SIZE_KB:-0}" -gt 5120 ]; then
mv "$logfile" "${logfile}.old" 2>/dev/null
fixed "Rotated $(basename "$logfile")"
fi
fi
# =============================================================================
# Iterate over all registered projects
# =============================================================================
status "checking projects"

# Run check_project once per projects/*.toml. PROJECT_COUNT counts the TOML
# files found (loadable or not); the .env fallback below runs only when there
# are none at all.
PROJECT_COUNT=0
if [ -d "$PROJECTS_DIR" ]; then
  for project_toml in "${PROJECTS_DIR}"/*.toml; do
    # An unmatched glob stays literal; skip it (and anything not a regular file)
    [ -f "$project_toml" ] || continue
    PROJECT_COUNT=$((PROJECT_COUNT + 1))

    # Load project config (overrides CODEBERG_REPO, PROJECT_REPO_ROOT, etc.).
    # A config that fails to load is reported and skipped: running
    # check_project anyway would reuse whatever the previous project exported.
    # NOTE(review): keys a TOML omits still keep the values exported by the
    # previously loaded project — confirm every project file sets all keys.
    if ! source "${FACTORY_ROOT}/lib/load-project.sh" "$project_toml"; then
      p2 "$(basename "$project_toml"): failed to load project config — checks skipped"
      continue
    fi
    check_project
  done
fi

if [ "$PROJECT_COUNT" -eq 0 ]; then
  # Fallback: no project TOML files, use .env config (backwards compatible)
  flog "No projects/*.toml found, using .env defaults"
  check_project
fi
# #############################################################################
# RESULT
# =============================================================================
# #############################################################################
ALL_ALERTS="${P0_ALERTS}${P1_ALERTS}${P2_ALERTS}${P3_ALERTS}${P4_ALERTS}"