Merge feat/multi-project: split supervisor + project-agnostic poll scripts (#26, #27)

This commit is contained in:
openhands 2026-03-17 08:03:45 +00:00
commit fce5e66b90
8 changed files with 458 additions and 297 deletions

View file

@ -9,11 +9,14 @@
# 1. Orphaned "in-progress" issues (agent died or PR needs attention) # 1. Orphaned "in-progress" issues (agent died or PR needs attention)
# 2. Ready "backlog" issues (all deps merged) # 2. Ready "backlog" issues (all deps merged)
# #
# Usage: cron every 10min # Usage:
# cron every 10min
# dev-poll.sh [projects/harb.toml] # optional project config
set -euo pipefail set -euo pipefail
# Load shared environment # Load shared environment (with optional project TOML override)
export PROJECT_TOML="${1:-}"
source "$(dirname "$0")/../lib/env.sh" source "$(dirname "$0")/../lib/env.sh"

View file

@ -30,6 +30,9 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
FACTORY_ROOT="$(dirname "$SCRIPT_DIR")" FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
# Load shared environment (with optional project TOML override)
# Usage: gardener-poll.sh [projects/harb.toml]
export PROJECT_TOML="${1:-}"
# shellcheck source=../lib/env.sh # shellcheck source=../lib/env.sh
source "$FACTORY_ROOT/lib/env.sh" source "$FACTORY_ROOT/lib/env.sh"

View file

@ -19,6 +19,11 @@ fi
export PATH="${HOME}/.local/bin:${HOME}/.foundry/bin:${HOME}/.nvm/versions/node/v22.20.0/bin:/usr/local/bin:/usr/bin:/bin:${PATH}" export PATH="${HOME}/.local/bin:${HOME}/.foundry/bin:${HOME}/.nvm/versions/node/v22.20.0/bin:/usr/local/bin:/usr/bin:/bin:${PATH}"
export HOME="${HOME:-/home/debian}" export HOME="${HOME:-/home/debian}"
# Load project TOML if PROJECT_TOML is set (by poll scripts that accept project arg)
if [ -n "${PROJECT_TOML:-}" ] && [ -f "$PROJECT_TOML" ]; then
source "${FACTORY_ROOT}/lib/load-project.sh" "$PROJECT_TOML"
fi
# Codeberg token: env var > ~/.netrc # Codeberg token: env var > ~/.netrc
if [ -z "${CODEBERG_TOKEN:-}" ]; then if [ -z "${CODEBERG_TOKEN:-}" ]; then
CODEBERG_TOKEN="$(awk '/codeberg.org/{getline;getline;print $2}' ~/.netrc 2>/dev/null || true)" CODEBERG_TOKEN="$(awk '/codeberg.org/{getline;getline;print $2}' ~/.netrc 2>/dev/null || true)"

83
lib/load-project.sh Executable file
View file

@ -0,0 +1,83 @@
#!/usr/bin/env bash
# load-project.sh — Load project config from a TOML file into env vars
#
# Usage (source, don't execute):
#   source lib/load-project.sh projects/harb.toml
#
# Exports:
#   PROJECT_NAME, CODEBERG_REPO, CODEBERG_API, PROJECT_REPO_ROOT,
#   PRIMARY_BRANCH, WOODPECKER_REPO_ID, PROJECT_CONTAINERS,
#   CHECK_PRS, CHECK_DEV_AGENT, CHECK_PIPELINE_STALL, CI_STALE_MINUTES
#
# If no argument is given, or the argument is not a regular file, does
# nothing (allows poll scripts to work with plain .env fallback for
# backwards compatibility).
#
# Requires python3 with the standard-library tomllib module (Python 3.11+).
# Safe to source from scripts running under `set -euo pipefail`.
#
# Returns (as the status of `source`): 0 on success or no-op, 1 when the
# TOML file cannot be parsed.
#
# NOTE(review): keys that are absent from the TOML (repo_root,
# primary_branch, [ci], [services], [monitoring]) leave any previously
# exported value untouched. When several project files are loaded one
# after another in the same shell, such values carry over from the
# previous project; confirm whether that is intended.

#######################################
# Parse a project TOML file and export its settings.
# Globals:   exports the variables listed in the file header;
#            reads USER / LOGNAME when deriving PROJECT_REPO_ROOT
# Arguments: $1 - path to the project TOML file (optional)
# Outputs:   diagnostics on stderr when parsing fails
# Returns:   0 on success or when there is nothing to load, 1 on parse error
#######################################
load_project_toml() {
  local toml="${1:-}"
  local vars line key

  if [ -z "$toml" ] || [ ! -f "$toml" ]; then
    return 0
  fi

  # Parse TOML to KEY=value lines via Python. The program is single-quoted
  # so the shell never interprets it; it therefore uses double quotes only.
  # Python's own error text goes to stderr so the cause of a failure is
  # visible in the caller's log.
  vars=$(python3 -c '
import sys

try:
    import tomllib
except ImportError:
    sys.exit("load-project.sh: python3 has no tomllib module (Python 3.11+ required)")


def emit(key, val):
    if isinstance(val, bool):
        text = str(val).lower()
    elif isinstance(val, list):
        text = " ".join(str(v) for v in val)
    else:
        text = str(val)
    # The output is consumed line by line; a line break inside a value
    # would be read as an additional KEY=value assignment.
    if "\n" in text or "\r" in text:
        sys.exit(f"load-project.sh: value of {key} must not contain line breaks")
    print(f"{key}={text}")


try:
    with open(sys.argv[1], "rb") as f:
        cfg = tomllib.load(f)
except (OSError, tomllib.TOMLDecodeError) as exc:
    sys.exit(f"load-project.sh: {exc}")

# Top-level
emit("PROJECT_NAME", cfg.get("name", ""))
emit("CODEBERG_REPO", cfg.get("repo", ""))
if "repo_root" in cfg:
    emit("PROJECT_REPO_ROOT", cfg["repo_root"])
if "primary_branch" in cfg:
    emit("PRIMARY_BRANCH", cfg["primary_branch"])

# [ci] section
ci = cfg.get("ci", {})
if "woodpecker_repo_id" in ci:
    emit("WOODPECKER_REPO_ID", ci["woodpecker_repo_id"])
if "stale_minutes" in ci:
    emit("CI_STALE_MINUTES", ci["stale_minutes"])

# [services] section
svc = cfg.get("services", {})
if "containers" in svc:
    emit("PROJECT_CONTAINERS", svc["containers"])

# [monitoring] section
mon = cfg.get("monitoring", {})
for key in ["check_prs", "check_dev_agent", "check_pipeline_stall"]:
    if key in mon:
        emit(key.upper(), mon[key])
' "$toml") || {
    echo "WARNING: failed to parse project TOML: $toml" >&2
    return 1
  }

  # Export parsed variables. Split on the first "=" only, so a value may
  # itself contain "=" characters.
  while IFS= read -r line; do
    key=${line%%=*}
    if [ -z "$line" ] || [ -z "$key" ]; then
      continue
    fi
    export "$key=${line#*=}"
  done <<< "$vars"

  # Derive CODEBERG_API if repo changed
  if [ -n "${CODEBERG_REPO:-}" ]; then
    export CODEBERG_API="https://codeberg.org/api/v1/repos/${CODEBERG_REPO}"
  fi

  # Derive PROJECT_REPO_ROOT if not explicitly set. USER may be unset in
  # minimal environments such as cron; fall back to LOGNAME and then to
  # `id -un` so this does not abort under `set -u`.
  if [ -z "${PROJECT_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" ]; then
    export PROJECT_REPO_ROOT="/home/${USER:-${LOGNAME:-$(id -un)}}/${PROJECT_NAME}"
  fi

  return 0
}

# Run for the path given to `source`. `return` propagates the failure to
# the sourcing script; `exit` covers the case where the file is executed.
load_project_toml "${1:-}" || return 1 2>/dev/null || exit 1

20
projects/harb.toml Normal file
View file

@ -0,0 +1,20 @@
# projects/harb.toml — Project config for johba/harb
#
# This file defines project-specific settings for disinto agents.
# It is parsed by lib/load-project.sh, which exports each key below as an
# environment variable for the poll scripts.
#
# name           -> PROJECT_NAME (also used in the /tmp/<name>-worktree-* and
#                   /tmp/<name>-review-* paths the supervisor cleans up)
# repo           -> CODEBERG_REPO; CODEBERG_API is derived from it
# repo_root      -> PROJECT_REPO_ROOT; when omitted the loader derives
#                   /home/$USER/<name>
# primary_branch -> PRIMARY_BRANCH; the supervisor switches the repo back to
#                   this branch when it finds it on another one
name = "harb"
repo = "johba/harb"
repo_root = "/home/debian/harb"
primary_branch = "master"
# woodpecker_repo_id -> WOODPECKER_REPO_ID, the repo_id the supervisor filters
#                       on when querying stuck/pending pipelines
# stale_minutes      -> CI_STALE_MINUTES
[ci]
woodpecker_repo_id = 2
stale_minutes = 60
# containers -> PROJECT_CONTAINERS (list entries are joined with spaces)
[services]
containers = ["ponder"]
# Exported as CHECK_PRS, CHECK_DEV_AGENT and CHECK_PIPELINE_STALL. The
# supervisor runs the matching check only when the value is "true"; a key
# that is not set anywhere is treated as true.
[monitoring]
check_prs = true
check_dev_agent = true
check_pipeline_stall = true

21
projects/versi.toml Normal file
View file

@ -0,0 +1,21 @@
# projects/versi.toml — Project config for johba/versi
#
# This file defines project-specific settings for disinto agents.
# Drop a new TOML file here to add another project — no code changes needed.
# It is parsed by lib/load-project.sh, which exports each key below as an
# environment variable for the poll scripts.
#
# name           -> PROJECT_NAME (also used in the /tmp/<name>-worktree-* and
#                   /tmp/<name>-review-* paths the supervisor cleans up)
# repo           -> CODEBERG_REPO; CODEBERG_API is derived from it
# repo_root      -> PROJECT_REPO_ROOT; when omitted the loader derives
#                   /home/$USER/<name>
# primary_branch -> PRIMARY_BRANCH; the supervisor switches the repo back to
#                   this branch when it finds it on another one
name = "versi"
repo = "johba/versi"
repo_root = "/home/admin/versi"
primary_branch = "main"
# woodpecker_repo_id -> WOODPECKER_REPO_ID, the repo_id the supervisor filters
#                       on when querying stuck/pending pipelines
# stale_minutes      -> CI_STALE_MINUTES
[ci]
woodpecker_repo_id = 3
stale_minutes = 60
# containers -> PROJECT_CONTAINERS (list entries are joined with spaces;
#               an empty list exports an empty string)
[services]
containers = []
# Exported as CHECK_PRS, CHECK_DEV_AGENT and CHECK_PIPELINE_STALL. The
# supervisor runs the matching check only when the value is "true"; a key
# that is not set anywhere is treated as true.
[monitoring]
check_prs = true
check_dev_agent = true
check_pipeline_stall = true

View file

@ -6,7 +6,9 @@
set -euo pipefail set -euo pipefail
# Load shared environment # Load shared environment (with optional project TOML override)
# Usage: review-poll.sh [projects/harb.toml]
export PROJECT_TOML="${1:-}"
source "$(dirname "$0")/../lib/env.sh" source "$(dirname "$0")/../lib/env.sh"

View file

@ -1,8 +1,11 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# supervisor-poll.sh — Supervisor agent: bash checks + claude -p for fixes # supervisor-poll.sh — Supervisor agent: bash checks + claude -p for fixes
# #
# Runs every 10min via cron. Does all health checks in bash (zero tokens). # Two-layer architecture:
# Only invokes claude -p when auto-fix fails or issue is complex. # 1. Factory infrastructure (project-agnostic): RAM, disk, swap, docker, stale processes
# 2. Per-project checks (config-driven): CI, PRs, dev-agent, deps — iterated over projects/*.toml
#
# Runs every 10min via cron.
# #
# Cron: */10 * * * * /path/to/disinto/supervisor/supervisor-poll.sh # Cron: */10 * * * * /path/to/disinto/supervisor/supervisor-poll.sh
# #
@ -15,6 +18,7 @@ LOGFILE="${FACTORY_ROOT}/supervisor/supervisor.log"
STATUSFILE="/tmp/supervisor-status" STATUSFILE="/tmp/supervisor-status"
LOCKFILE="/tmp/supervisor-poll.lock" LOCKFILE="/tmp/supervisor-poll.lock"
PROMPT_FILE="${FACTORY_ROOT}/supervisor/PROMPT.md" PROMPT_FILE="${FACTORY_ROOT}/supervisor/PROMPT.md"
PROJECTS_DIR="${FACTORY_ROOT}/projects"
# Prevent overlapping runs # Prevent overlapping runs
if [ -f "$LOCKFILE" ]; then if [ -f "$LOCKFILE" ]; then
@ -60,6 +64,11 @@ p4() { P4_ALERTS="${P4_ALERTS}• [P4] $*\n"; flog "P4: $*"; }
FIXES="" FIXES=""
fixed() { FIXES="${FIXES}• ✅ $*\n"; flog "FIXED: $*"; } fixed() { FIXES="${FIXES}• ✅ $*\n"; flog "FIXED: $*"; }
# #############################################################################
# LAYER 1: FACTORY INFRASTRUCTURE
# (project-agnostic, runs once)
# #############################################################################
# ============================================================================= # =============================================================================
# P0: MEMORY — check first, fix first # P0: MEMORY — check first, fix first
# ============================================================================= # =============================================================================
@ -82,13 +91,6 @@ if [ "${AVAIL_MB:-9999}" -lt 500 ] || { [ "${SWAP_USED_MB:-0}" -gt 3000 ] && [ "
sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1
fixed "Dropped filesystem caches" fixed "Dropped filesystem caches"
# Restart Anvil if it's bloated (>1GB RSS)
ANVIL_CONTAINER="${ANVIL_CONTAINER:-${PROJECT_NAME}-anvil-1}"
ANVIL_RSS=$(sudo docker stats "$ANVIL_CONTAINER" --no-stream --format '{{.MemUsage}}' 2>/dev/null | grep -oP '^\S+' | head -1 || echo "0")
if echo "$ANVIL_RSS" | grep -qP '\dGiB'; then
sudo docker restart "$ANVIL_CONTAINER" >/dev/null 2>&1 && fixed "Restarted bloated Anvil (${ANVIL_RSS})"
fi
# Re-check after fixes # Re-check after fixes
AVAIL_MB_AFTER=$(free -m | awk '/Mem:/{print $7}') AVAIL_MB_AFTER=$(free -m | awk '/Mem:/{print $7}')
SWAP_AFTER=$(free -m | awk '/Swap:/{print $3}') SWAP_AFTER=$(free -m | awk '/Swap:/{print $3}')
@ -113,8 +115,8 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
# Docker cleanup (safe — keeps images) # Docker cleanup (safe — keeps images)
sudo docker system prune -f >/dev/null 2>&1 && fixed "Docker prune" sudo docker system prune -f >/dev/null 2>&1 && fixed "Docker prune"
# Truncate supervisor logs >10MB # Truncate logs >10MB
for logfile in "${FACTORY_ROOT}"/{dev,review,factory}/*.log; do for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
if [ -f "$logfile" ]; then if [ -f "$logfile" ]; then
SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1) SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
if [ "${SIZE_KB:-0}" -gt 10240 ]; then if [ "${SIZE_KB:-0}" -gt 10240 ]; then
@ -124,19 +126,6 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
fi fi
done done
# Clean old worktrees
IDLE_WORKTREES=$(find /tmp/${PROJECT_NAME}-worktree-* -maxdepth 0 -mmin +360 2>/dev/null || true)
if [ -n "$IDLE_WORKTREES" ]; then
cd "${PROJECT_REPO_ROOT}" && git worktree prune 2>/dev/null
for wt in $IDLE_WORKTREES; do
# Only remove if dev-agent is not running on it
ISSUE_NUM=$(basename "$wt" | sed "s/${PROJECT_NAME}-worktree-//")
if ! pgrep -f "dev-agent.sh ${ISSUE_NUM}" >/dev/null 2>&1; then
rm -rf "$wt" && fixed "Removed stale worktree: $wt"
fi
done
fi
# Woodpecker log_entries cleanup # Woodpecker log_entries cleanup
LOG_ENTRIES_MB=$(wpdb -c "SELECT pg_size_pretty(pg_total_relation_size('log_entries'));" 2>/dev/null | xargs) LOG_ENTRIES_MB=$(wpdb -c "SELECT pg_size_pretty(pg_total_relation_size('log_entries'));" 2>/dev/null | xargs)
if echo "$LOG_ENTRIES_MB" | grep -qP '\d+\s*(GB|MB)'; then if echo "$LOG_ENTRIES_MB" | grep -qP '\d+\s*(GB|MB)'; then
@ -157,60 +146,104 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then
fi fi
# ============================================================================= # =============================================================================
# P2: FACTORY STOPPED — CI, dev-agent, git # P4-INFRA: HOUSEKEEPING — stale processes, log rotation (project-agnostic)
# ============================================================================= # =============================================================================
status "P2: checking pipeline" status "P4: infra housekeeping"
# CI stuck # Stale agent-spawned claude processes (>3h) — skip interactive sessions
STUCK_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='running' AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;" 2>/dev/null | xargs || true) STALE_CLAUDES=$(pgrep -f "claude -p" --older 10800 2>/dev/null || true)
[ "${STUCK_CI:-0}" -gt 0 ] 2>/dev/null && p2 "CI: ${STUCK_CI} pipeline(s) running >20min" if [ -n "$STALE_CLAUDES" ]; then
echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true
fixed "Killed stale claude processes: $(echo $STALE_CLAUDES | wc -w) procs"
fi
PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs || true) # Rotate logs >5MB
[ "${PENDING_CI:-0}" -gt 0 ] && p2 "CI: ${PENDING_CI} pipeline(s) pending >30min" for logfile in "${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do
if [ -f "$logfile" ]; then
SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
if [ "${SIZE_KB:-0}" -gt 5120 ]; then
mv "$logfile" "${logfile}.old" 2>/dev/null
fixed "Rotated $(basename "$logfile")"
fi
fi
done
# Dev-agent health # Check for dev-agent escalations
DEV_LOCK="/tmp/dev-agent.lock" ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations.jsonl"
if [ -f "$DEV_LOCK" ]; then if [ -s "$ESCALATION_FILE" ]; then
ESCALATION_COUNT=$(wc -l < "$ESCALATION_FILE")
p3 "Dev-agent escalated ${ESCALATION_COUNT} issue(s) — see ${ESCALATION_FILE}"
fi
# #############################################################################
# LAYER 2: PER-PROJECT CHECKS
# (iterated over projects/*.toml, config-driven)
# #############################################################################
# Function: run all per-project checks for the currently loaded project config
check_project() {
local proj_name="${PROJECT_NAME:-unknown}"
flog "── checking project: ${proj_name} (${CODEBERG_REPO}) ──"
# ===========================================================================
# P2: FACTORY STOPPED — CI, dev-agent, git
# ===========================================================================
status "P2: ${proj_name}: checking pipeline"
# CI stuck
STUCK_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='running' AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;" 2>/dev/null | xargs || true)
[ "${STUCK_CI:-0}" -gt 0 ] 2>/dev/null && p2 "${proj_name}: CI: ${STUCK_CI} pipeline(s) running >20min"
PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs || true)
[ "${PENDING_CI:-0}" -gt 0 ] && p2 "${proj_name}: CI: ${PENDING_CI} pipeline(s) pending >30min"
# Dev-agent health (only if monitoring enabled)
if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
DEV_LOCK="/tmp/dev-agent.lock"
if [ -f "$DEV_LOCK" ]; then
DEV_PID=$(cat "$DEV_LOCK" 2>/dev/null) DEV_PID=$(cat "$DEV_LOCK" 2>/dev/null)
if ! kill -0 "$DEV_PID" 2>/dev/null; then if ! kill -0 "$DEV_PID" 2>/dev/null; then
rm -f "$DEV_LOCK" rm -f "$DEV_LOCK"
fixed "Removed stale dev-agent lock (PID ${DEV_PID} dead)" fixed "${proj_name}: Removed stale dev-agent lock (PID ${DEV_PID} dead)"
else else
DEV_STATUS_AGE=$(stat -c %Y /tmp/dev-agent-status 2>/dev/null || echo 0) DEV_STATUS_AGE=$(stat -c %Y /tmp/dev-agent-status 2>/dev/null || echo 0)
NOW_EPOCH=$(date +%s) NOW_EPOCH=$(date +%s)
STATUS_AGE_MIN=$(( (NOW_EPOCH - DEV_STATUS_AGE) / 60 )) STATUS_AGE_MIN=$(( (NOW_EPOCH - DEV_STATUS_AGE) / 60 ))
if [ "$STATUS_AGE_MIN" -gt 30 ]; then if [ "$STATUS_AGE_MIN" -gt 30 ]; then
p2 "Dev-agent: status unchanged for ${STATUS_AGE_MIN}min" p2 "${proj_name}: Dev-agent: status unchanged for ${STATUS_AGE_MIN}min"
fi
fi
fi fi
fi fi
fi
# Git repo health # Git repo health
cd "${PROJECT_REPO_ROOT}" 2>/dev/null || true if [ -d "${PROJECT_REPO_ROOT}" ]; then
GIT_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") cd "${PROJECT_REPO_ROOT}" 2>/dev/null || true
GIT_REBASE=$([ -d .git/rebase-merge ] || [ -d .git/rebase-apply ] && echo "yes" || echo "no") GIT_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
GIT_REBASE=$([ -d .git/rebase-merge ] || [ -d .git/rebase-apply ] && echo "yes" || echo "no")
if [ "$GIT_REBASE" = "yes" ]; then if [ "$GIT_REBASE" = "yes" ]; then
git rebase --abort 2>/dev/null && git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \ git rebase --abort 2>/dev/null && git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \
fixed "Aborted stale rebase, switched to ${PRIMARY_BRANCH}" || \ fixed "${proj_name}: Aborted stale rebase, switched to ${PRIMARY_BRANCH}" || \
p2 "Git: stale rebase, auto-abort failed" p2 "${proj_name}: Git: stale rebase, auto-abort failed"
fi fi
if [ "$GIT_BRANCH" != "${PRIMARY_BRANCH}" ] && [ "$GIT_BRANCH" != "unknown" ]; then if [ "$GIT_BRANCH" != "${PRIMARY_BRANCH}" ] && [ "$GIT_BRANCH" != "unknown" ]; then
git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \ git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \
fixed "Switched main repo from '${GIT_BRANCH}' to ${PRIMARY_BRANCH}" || \ fixed "${proj_name}: Switched repo from '${GIT_BRANCH}' to ${PRIMARY_BRANCH}" || \
p2 "Git: on '${GIT_BRANCH}' instead of ${PRIMARY_BRANCH}" p2 "${proj_name}: Git: on '${GIT_BRANCH}' instead of ${PRIMARY_BRANCH}"
fi fi
fi
# ============================================================================= # ===========================================================================
# P2b: FACTORY STALLED — backlog exists but no agent running # P2b: FACTORY STALLED — backlog exists but no agent running
# ============================================================================= # ===========================================================================
status "P2: checking pipeline stall" if [ "${CHECK_PIPELINE_STALL:-true}" = "true" ]; then
status "P2: ${proj_name}: checking pipeline stall"
BACKLOG_COUNT=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0") BACKLOG_COUNT=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0")
IN_PROGRESS=$(codeberg_api GET "/issues?state=open&labels=in-progress&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0") IN_PROGRESS=$(codeberg_api GET "/issues?state=open&labels=in-progress&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0")
if [ "${BACKLOG_COUNT:-0}" -gt 0 ] && [ "${IN_PROGRESS:-0}" -eq 0 ]; then if [ "${BACKLOG_COUNT:-0}" -gt 0 ] && [ "${IN_PROGRESS:-0}" -eq 0 ]; then
# Backlog exists but nothing in progress — check if dev-agent ran recently
DEV_LOG="${FACTORY_ROOT}/dev/dev-agent.log" DEV_LOG="${FACTORY_ROOT}/dev/dev-agent.log"
if [ -f "$DEV_LOG" ]; then if [ -f "$DEV_LOG" ]; then
LAST_LOG_EPOCH=$(stat -c %Y "$DEV_LOG" 2>/dev/null || echo 0) LAST_LOG_EPOCH=$(stat -c %Y "$DEV_LOG" 2>/dev/null || echo 0)
@ -221,33 +254,36 @@ if [ "${BACKLOG_COUNT:-0}" -gt 0 ] && [ "${IN_PROGRESS:-0}" -eq 0 ]; then
IDLE_MIN=$(( (NOW_EPOCH - LAST_LOG_EPOCH) / 60 )) IDLE_MIN=$(( (NOW_EPOCH - LAST_LOG_EPOCH) / 60 ))
if [ "$IDLE_MIN" -gt 20 ]; then if [ "$IDLE_MIN" -gt 20 ]; then
p2 "Pipeline stalled: ${BACKLOG_COUNT} backlog issue(s), no agent ran for ${IDLE_MIN}min" p2 "${proj_name}: Pipeline stalled: ${BACKLOG_COUNT} backlog issue(s), no agent ran for ${IDLE_MIN}min"
fi
fi
fi fi
fi
# ============================================================================= # ===========================================================================
# P2c: DEV-AGENT PRODUCTIVITY — all backlog blocked for too long # P2c: DEV-AGENT PRODUCTIVITY — all backlog blocked for too long
# ============================================================================= # ===========================================================================
status "P2: checking dev-agent productivity" if [ "${CHECK_DEV_AGENT:-true}" = "true" ]; then
status "P2: ${proj_name}: checking dev-agent productivity"
DEV_LOG_FILE="${FACTORY_ROOT}/dev/dev-agent.log" DEV_LOG_FILE="${FACTORY_ROOT}/dev/dev-agent.log"
if [ -f "$DEV_LOG_FILE" ]; then if [ -f "$DEV_LOG_FILE" ]; then
# Check if last 6 poll entries all report "no ready issues" (~1 hour at 10min intervals)
RECENT_POLLS=$(tail -100 "$DEV_LOG_FILE" | grep "poll:" | tail -6) RECENT_POLLS=$(tail -100 "$DEV_LOG_FILE" | grep "poll:" | tail -6)
TOTAL_RECENT=$(echo "$RECENT_POLLS" | grep -c "." || true) TOTAL_RECENT=$(echo "$RECENT_POLLS" | grep -c "." || true)
BLOCKED_IN_RECENT=$(echo "$RECENT_POLLS" | grep -c "no ready issues" || true) BLOCKED_IN_RECENT=$(echo "$RECENT_POLLS" | grep -c "no ready issues" || true)
if [ "$TOTAL_RECENT" -ge 6 ] && [ "$BLOCKED_IN_RECENT" -eq "$TOTAL_RECENT" ]; then if [ "$TOTAL_RECENT" -ge 6 ] && [ "$BLOCKED_IN_RECENT" -eq "$TOTAL_RECENT" ]; then
p2 "Dev-agent blocked: last ${BLOCKED_IN_RECENT} polls all report 'no ready issues' — all backlog issues may be dep-blocked or have circular deps" p2 "${proj_name}: Dev-agent blocked: last ${BLOCKED_IN_RECENT} polls all report 'no ready issues'"
fi
fi
fi fi
fi
# ============================================================================= # ===========================================================================
# P3: FACTORY DEGRADED — derailed PRs, unreviewed PRs # P3: FACTORY DEGRADED — derailed PRs, unreviewed PRs
# ============================================================================= # ===========================================================================
status "P3: checking PRs" if [ "${CHECK_PRS:-true}" = "true" ]; then
status "P3: ${proj_name}: checking PRs"
OPEN_PRS=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null | jq -r '.[].number' 2>/dev/null || true) OPEN_PRS=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null | jq -r '.[].number' 2>/dev/null || true)
for pr in $OPEN_PRS; do for pr in $OPEN_PRS; do
PR_JSON=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null || true) PR_JSON=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null || true)
[ -z "$PR_JSON" ] && continue [ -z "$PR_JSON" ] && continue
PR_SHA=$(echo "$PR_JSON" | jq -r '.head.sha // ""') PR_SHA=$(echo "$PR_JSON" | jq -r '.head.sha // ""')
@ -255,20 +291,18 @@ for pr in $OPEN_PRS; do
CI_STATE=$(codeberg_api GET "/commits/${PR_SHA}/status" 2>/dev/null | jq -r '.state // "unknown"' 2>/dev/null || true) CI_STATE=$(codeberg_api GET "/commits/${PR_SHA}/status" 2>/dev/null | jq -r '.state // "unknown"' 2>/dev/null || true)
# Check for merge conflicts first (approved + CI pass but unmergeable)
MERGEABLE=$(echo "$PR_JSON" | jq -r '.mergeable // true') MERGEABLE=$(echo "$PR_JSON" | jq -r '.mergeable // true')
if [ "$MERGEABLE" = "false" ] && [ "$CI_STATE" = "success" ]; then if [ "$MERGEABLE" = "false" ] && [ "$CI_STATE" = "success" ]; then
p3 "PR #${pr}: CI pass but merge conflict — needs rebase" p3 "${proj_name}: PR #${pr}: CI pass but merge conflict — needs rebase"
elif [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then elif [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then
UPDATED=$(echo "$PR_JSON" | jq -r '.updated_at // ""') UPDATED=$(echo "$PR_JSON" | jq -r '.updated_at // ""')
if [ -n "$UPDATED" ]; then if [ -n "$UPDATED" ]; then
UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0) UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0)
NOW_EPOCH=$(date +%s) NOW_EPOCH=$(date +%s)
AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 )) AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 ))
[ "$AGE_MIN" -gt 30 ] && p3 "PR #${pr}: CI=${CI_STATE}, stale ${AGE_MIN}min" [ "$AGE_MIN" -gt 30 ] && p3 "${proj_name}: PR #${pr}: CI=${CI_STATE}, stale ${AGE_MIN}min"
fi fi
elif [ "$CI_STATE" = "success" ]; then elif [ "$CI_STATE" = "success" ]; then
# Check if reviewed at this SHA
HAS_REVIEW=$(codeberg_api GET "/issues/${pr}/comments?limit=50" 2>/dev/null | \ HAS_REVIEW=$(codeberg_api GET "/issues/${pr}/comments?limit=50" 2>/dev/null | \
jq -r --arg sha "$PR_SHA" '[.[] | select(.body | contains("<!-- reviewed: " + $sha))] | length' 2>/dev/null || echo "0") jq -r --arg sha "$PR_SHA" '[.[] | select(.body | contains("<!-- reviewed: " + $sha))] | length' 2>/dev/null || echo "0")
@ -279,28 +313,27 @@ for pr in $OPEN_PRS; do
NOW_EPOCH=$(date +%s) NOW_EPOCH=$(date +%s)
AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 )) AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 ))
if [ "$AGE_MIN" -gt 60 ]; then if [ "$AGE_MIN" -gt 60 ]; then
p3 "PR #${pr}: CI passed, no review for ${AGE_MIN}min" p3 "${proj_name}: PR #${pr}: CI passed, no review for ${AGE_MIN}min"
# Auto-trigger review
bash "${FACTORY_ROOT}/review/review-pr.sh" "$pr" >> "${FACTORY_ROOT}/review/review.log" 2>&1 & bash "${FACTORY_ROOT}/review/review-pr.sh" "$pr" >> "${FACTORY_ROOT}/review/review.log" 2>&1 &
fixed "Auto-triggered review for PR #${pr}" fixed "${proj_name}: Auto-triggered review for PR #${pr}"
fi fi
fi fi
fi fi
fi fi
done done
fi
# ============================================================================= # ===========================================================================
# P3b: CIRCULAR DEPENDENCIES — deadlock detection # P3b: CIRCULAR DEPENDENCIES — deadlock detection
# ============================================================================= # ===========================================================================
status "P3: checking for circular dependencies" status "P3: ${proj_name}: checking for circular dependencies"
BACKLOG_FOR_DEPS=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=50" 2>/dev/null || true) BACKLOG_FOR_DEPS=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=50" 2>/dev/null || true)
if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo "$BACKLOG_FOR_DEPS" | jq 'length' 2>/dev/null || echo 0)" -gt 0 ]; then if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo "$BACKLOG_FOR_DEPS" | jq 'length' 2>/dev/null || echo 0)" -gt 0 ]; then
PARSE_DEPS="${FACTORY_ROOT}/lib/parse-deps.sh" PARSE_DEPS="${FACTORY_ROOT}/lib/parse-deps.sh"
ISSUE_COUNT=$(echo "$BACKLOG_FOR_DEPS" | jq 'length') ISSUE_COUNT=$(echo "$BACKLOG_FOR_DEPS" | jq 'length')
# Build dep graph: DEPS_OF[issue_num]="dep1 dep2 ..."
declare -A DEPS_OF declare -A DEPS_OF
declare -A BACKLOG_NUMS declare -A BACKLOG_NUMS
for i in $(seq 0 $((ISSUE_COUNT - 1))); do for i in $(seq 0 $((ISSUE_COUNT - 1))); do
@ -311,7 +344,6 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
BACKLOG_NUMS[$NUM]=1 BACKLOG_NUMS[$NUM]=1
done done
# DFS cycle detection using color marking (0=white, 1=gray, 2=black)
declare -A NODE_COLOR declare -A NODE_COLOR
for node in "${!DEPS_OF[@]}"; do NODE_COLOR[$node]=0; done for node in "${!DEPS_OF[@]}"; do NODE_COLOR[$node]=0; done
@ -322,13 +354,11 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
local node="$1" path="$2" local node="$1" path="$2"
NODE_COLOR[$node]=1 NODE_COLOR[$node]=1
for dep in ${DEPS_OF[$node]:-}; do for dep in ${DEPS_OF[$node]:-}; do
[ -z "${NODE_COLOR[$dep]+x}" ] && continue # not in graph [ -z "${NODE_COLOR[$dep]+x}" ] && continue
if [ "${NODE_COLOR[$dep]}" = "1" ]; then if [ "${NODE_COLOR[$dep]}" = "1" ]; then
# Cycle found — normalize for dedup
local cycle_key=$(echo "$path $dep" | tr ' ' '\n' | sort -n | tr '\n' ' ') local cycle_key=$(echo "$path $dep" | tr ' ' '\n' | sort -n | tr '\n' ' ')
if [ -z "${SEEN_CYCLES[$cycle_key]+x}" ]; then if [ -z "${SEEN_CYCLES[$cycle_key]+x}" ]; then
SEEN_CYCLES[$cycle_key]=1 SEEN_CYCLES[$cycle_key]=1
# Extract cycle portion from path (from $dep onward)
local in_cycle=0 cycle_str="" local in_cycle=0 cycle_str=""
for p in $path $dep; do for p in $path $dep; do
[ "$p" = "$dep" ] && in_cycle=1 [ "$p" = "$dep" ] && in_cycle=1
@ -350,22 +380,20 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
if [ -n "$FOUND_CYCLES" ]; then if [ -n "$FOUND_CYCLES" ]; then
echo -e "$FOUND_CYCLES" | while IFS= read -r cycle; do echo -e "$FOUND_CYCLES" | while IFS= read -r cycle; do
[ -z "$cycle" ] && continue [ -z "$cycle" ] && continue
p3 "Circular dependency deadlock: ${cycle}" p3 "${proj_name}: Circular dependency deadlock: ${cycle}"
done done
fi fi
# =========================================================================== # =========================================================================
# P3c: STALE DEPENDENCIES — blocked by old open issues (>30 days) # P3c: STALE DEPENDENCIES — blocked by old open issues (>30 days)
# =========================================================================== # =========================================================================
status "P3: checking for stale dependencies" status "P3: ${proj_name}: checking for stale dependencies"
NOW_EPOCH=$(date +%s) NOW_EPOCH=$(date +%s)
THIRTY_DAYS=$((30 * 86400))
declare -A DEP_CACHE declare -A DEP_CACHE
for issue_num in "${!DEPS_OF[@]}"; do for issue_num in "${!DEPS_OF[@]}"; do
for dep in ${DEPS_OF[$issue_num]}; do for dep in ${DEPS_OF[$issue_num]}; do
# Check cache first
if [ -n "${DEP_CACHE[$dep]+x}" ]; then if [ -n "${DEP_CACHE[$dep]+x}" ]; then
DEP_INFO="${DEP_CACHE[$dep]}" DEP_INFO="${DEP_CACHE[$dep]}"
else else
@ -389,63 +417,59 @@ if [ -n "$BACKLOG_FOR_DEPS" ] && [ "$BACKLOG_FOR_DEPS" != "null" ] && [ "$(echo
CREATED_EPOCH=$(date -d "$DEP_CREATED" +%s 2>/dev/null || echo 0) CREATED_EPOCH=$(date -d "$DEP_CREATED" +%s 2>/dev/null || echo 0)
AGE_DAYS=$(( (NOW_EPOCH - CREATED_EPOCH) / 86400 )) AGE_DAYS=$(( (NOW_EPOCH - CREATED_EPOCH) / 86400 ))
if [ "$AGE_DAYS" -gt 30 ]; then if [ "$AGE_DAYS" -gt 30 ]; then
p3 "Stale dependency: #${issue_num} blocked by #${dep} \"${DEP_TITLE}\" (open ${AGE_DAYS} days)" p3 "${proj_name}: Stale dependency: #${issue_num} blocked by #${dep} \"${DEP_TITLE}\" (open ${AGE_DAYS} days)"
fi fi
done done
done done
unset DEPS_OF BACKLOG_NUMS NODE_COLOR SEEN_CYCLES DEP_CACHE unset DEPS_OF BACKLOG_NUMS NODE_COLOR SEEN_CYCLES DEP_CACHE
fi fi
# ============================================================================= # ===========================================================================
# P4: HOUSEKEEPING — stale processes # P4-PROJECT: Clean stale worktrees for this project
# ============================================================================= # ===========================================================================
# Check for dev-agent escalations NOW_TS=$(date +%s)
ESCALATION_FILE="${FACTORY_ROOT}/supervisor/escalations.jsonl" for wt in /tmp/${PROJECT_NAME}-worktree-* /tmp/${PROJECT_NAME}-review-*; do
if [ -s "$ESCALATION_FILE" ]; then
ESCALATION_COUNT=$(wc -l < "$ESCALATION_FILE")
p3 "Dev-agent escalated ${ESCALATION_COUNT} issue(s) — see ${ESCALATION_FILE}"
fi
status "P4: housekeeping"
# Stale agent-spawned claude processes (>3h, not caught by P0) — skip interactive sessions
STALE_CLAUDES=$(pgrep -f "claude -p" --older 10800 2>/dev/null || true)
if [ -n "$STALE_CLAUDES" ]; then
echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true
fixed "Killed stale claude processes: $(echo $STALE_CLAUDES | wc -w) procs"
fi
# Clean stale git worktrees (>2h, no active agent)
NOW_TS=$(date +%s)
for wt in /tmp/${PROJECT_NAME}-worktree-* /tmp/${PROJECT_NAME}-review-*; do
[ -d "$wt" ] || continue [ -d "$wt" ] || continue
WT_AGE_MIN=$(( (NOW_TS - $(stat -c %Y "$wt")) / 60 )) WT_AGE_MIN=$(( (NOW_TS - $(stat -c %Y "$wt")) / 60 ))
if [ "$WT_AGE_MIN" -gt 120 ]; then if [ "$WT_AGE_MIN" -gt 120 ]; then
# Skip if an agent is still using it
WT_BASE=$(basename "$wt") WT_BASE=$(basename "$wt")
if ! pgrep -f "$WT_BASE" >/dev/null 2>&1; then if ! pgrep -f "$WT_BASE" >/dev/null 2>&1; then
git -C "$PROJECT_REPO_ROOT" worktree remove --force "$wt" 2>/dev/null && \ git -C "$PROJECT_REPO_ROOT" worktree remove --force "$wt" 2>/dev/null && \
fixed "Removed stale worktree: $wt (${WT_AGE_MIN}min old)" || true fixed "${proj_name}: Removed stale worktree: $wt (${WT_AGE_MIN}min old)" || true
fi fi
fi fi
done done
git -C "$PROJECT_REPO_ROOT" worktree prune 2>/dev/null || true git -C "$PROJECT_REPO_ROOT" worktree prune 2>/dev/null || true
}
# Rotate supervisor log if >5MB
for logfile in "${FACTORY_ROOT}"/{dev,review,factory}/*.log; do
if [ -f "$logfile" ]; then
SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1)
if [ "${SIZE_KB:-0}" -gt 5120 ]; then
mv "$logfile" "${logfile}.old" 2>/dev/null
fixed "Rotated $(basename "$logfile")"
fi
fi
done
# ============================================================================= # =============================================================================
# Iterate over all registered projects
# =============================================================================
status "checking projects"

# Run check_project once per projects/*.toml. PROJECT_COUNT counts every
# TOML file found, including ones that fail to load, so a broken config
# does not trigger the .env fallback below.
PROJECT_COUNT=0
if [ -d "$PROJECTS_DIR" ]; then
  for project_toml in "${PROJECTS_DIR}"/*.toml; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [ -f "$project_toml" ] || continue
    PROJECT_COUNT=$((PROJECT_COUNT + 1))

    # Load project config (overrides CODEBERG_REPO, PROJECT_REPO_ROOT, etc.)
    # If loading fails the environment still holds the previous project's
    # values, so running check_project would inspect the wrong project.
    # Skip this project and raise an alert instead.
    if ! source "${FACTORY_ROOT}/lib/load-project.sh" "$project_toml"; then
      p3 "Project config failed to load, checks skipped: ${project_toml}"
      continue
    fi

    check_project
  done
fi

if [ "$PROJECT_COUNT" -eq 0 ]; then
  # Fallback: no project TOML files, use .env config (backwards compatible)
  flog "No projects/*.toml found, using .env defaults"
  check_project
fi
# #############################################################################
# RESULT # RESULT
# ============================================================================= # #############################################################################
ALL_ALERTS="${P0_ALERTS}${P1_ALERTS}${P2_ALERTS}${P3_ALERTS}${P4_ALERTS}" ALL_ALERTS="${P0_ALERTS}${P1_ALERTS}${P2_ALERTS}${P3_ALERTS}${P4_ALERTS}"