From 905041399474ecd378c16ae09a73a69a4ec3b73a Mon Sep 17 00:00:00 2001 From: johba Date: Tue, 17 Mar 2026 08:57:18 +0100 Subject: [PATCH] refactor: split supervisor into infra + per-project, make poll scripts config-driven MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supervisor split (#26): - Layer 1 (infra): P0 memory, P1 disk, P4 housekeeping — runs once, project-agnostic - Layer 2 (per-project): P2 CI/dev-agent, P3 PRs/deps — iterates projects/*.toml - Adding a new project requires only a new TOML file, no code changes Poll scripts accept project TOML arg (#27): - dev-poll.sh, review-poll.sh, gardener-poll.sh accept optional project TOML as $1 - env.sh loads PROJECT_TOML if set, overriding .env defaults - Cron: `dev-poll.sh projects/versi.toml` targets that project New files: - lib/load-project.sh: TOML to env var loader (Python tomllib) - projects/versi.toml: current project config extracted from .env Backwards compatible: scripts without a TOML arg fall back to .env config. Closes #26, Closes #27 Co-Authored-By: Claude Opus 4.6 (1M context) --- dev/dev-poll.sh | 7 +- gardener/gardener-poll.sh | 3 + lib/env.sh | 5 + lib/load-project.sh | 83 +++++ projects/versi.toml | 21 ++ review/review-poll.sh | 4 +- supervisor/supervisor-poll.sh | 612 ++++++++++++++++++---------------- 7 files changed, 438 insertions(+), 297 deletions(-) create mode 100755 lib/load-project.sh create mode 100644 projects/versi.toml diff --git a/dev/dev-poll.sh b/dev/dev-poll.sh index d08e08d..7551e88 100755 --- a/dev/dev-poll.sh +++ b/dev/dev-poll.sh @@ -9,11 +9,14 @@ # 1. Orphaned "in-progress" issues (agent died or PR needs attention) # 2. 
Ready "backlog" issues (all deps merged) # -# Usage: cron every 10min +# Usage: +# cron every 10min +# dev-poll.sh [projects/harb.toml] # optional project config set -euo pipefail -# Load shared environment +# Load shared environment (with optional project TOML override) +export PROJECT_TOML="${1:-}" source "$(dirname "$0")/../lib/env.sh" diff --git a/gardener/gardener-poll.sh b/gardener/gardener-poll.sh index a63d308..a820dba 100755 --- a/gardener/gardener-poll.sh +++ b/gardener/gardener-poll.sh @@ -30,6 +30,9 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" FACTORY_ROOT="$(dirname "$SCRIPT_DIR")" +# Load shared environment (with optional project TOML override) +# Usage: gardener-poll.sh [projects/harb.toml] +export PROJECT_TOML="${1:-}" # shellcheck source=../lib/env.sh source "$FACTORY_ROOT/lib/env.sh" diff --git a/lib/env.sh b/lib/env.sh index 7e732fd..e05fdc5 100755 --- a/lib/env.sh +++ b/lib/env.sh @@ -19,6 +19,11 @@ fi export PATH="${HOME}/.local/bin:${HOME}/.foundry/bin:${HOME}/.nvm/versions/node/v22.20.0/bin:/usr/local/bin:/usr/bin:/bin:${PATH}" export HOME="${HOME:-/home/debian}" +# Load project TOML if PROJECT_TOML is set (by poll scripts that accept project arg) +if [ -n "${PROJECT_TOML:-}" ] && [ -f "$PROJECT_TOML" ]; then + source "${FACTORY_ROOT}/lib/load-project.sh" "$PROJECT_TOML" +fi + # Codeberg token: env var > ~/.netrc if [ -z "${CODEBERG_TOKEN:-}" ]; then CODEBERG_TOKEN="$(awk '/codeberg.org/{getline;getline;print $2}' ~/.netrc 2>/dev/null || true)" diff --git a/lib/load-project.sh b/lib/load-project.sh new file mode 100755 index 0000000..acfe20c --- /dev/null +++ b/lib/load-project.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# load-project.sh — Load project config from a TOML file into env vars +# +# Usage (source, don't execute): +# source lib/load-project.sh projects/harb.toml +# +# Exports: +# PROJECT_NAME, CODEBERG_REPO, CODEBERG_API, PROJECT_REPO_ROOT, +# PRIMARY_BRANCH, WOODPECKER_REPO_ID, PROJECT_CONTAINERS, +# CHECK_PRS, 
CHECK_DEV_AGENT, CHECK_PIPELINE_STALL, CI_STALE_MINUTES +# +# If no argument given, does nothing (allows poll scripts to work with +# plain .env fallback for backwards compatibility). + +_PROJECT_TOML="${1:-}" + +if [ -z "$_PROJECT_TOML" ] || [ ! -f "$_PROJECT_TOML" ]; then + return 0 2>/dev/null || exit 0 +fi + +# Parse TOML to shell variable assignments via Python (tomllib needs Python 3.11+; parser errors stay on stderr for diagnosis) +_PROJECT_VARS=$(python3 -c " +import sys, tomllib + +with open(sys.argv[1], 'rb') as f: + cfg = tomllib.load(f) + +def emit(key, val): + if isinstance(val, bool): + print(f'{key}={str(val).lower()}') + elif isinstance(val, list): + print(f'{key}={\" \".join(str(v) for v in val)}') + else: + print(f'{key}={val}') + +# Top-level +emit('PROJECT_NAME', cfg.get('name', '')) +emit('CODEBERG_REPO', cfg.get('repo', '')) + +if 'repo_root' in cfg: + emit('PROJECT_REPO_ROOT', cfg['repo_root']) +if 'primary_branch' in cfg: + emit('PRIMARY_BRANCH', cfg['primary_branch']) + +# [ci] section +ci = cfg.get('ci', {}) +if 'woodpecker_repo_id' in ci: + emit('WOODPECKER_REPO_ID', ci['woodpecker_repo_id']) +if 'stale_minutes' in ci: + emit('CI_STALE_MINUTES', ci['stale_minutes']) + +# [services] section +svc = cfg.get('services', {}) +if 'containers' in svc: + emit('PROJECT_CONTAINERS', svc['containers']) + +# [monitoring] section +mon = cfg.get('monitoring', {}) +for key in ['check_prs', 'check_dev_agent', 'check_pipeline_stall']: + if key in mon: + emit(key.upper(), mon[key]) +" "$_PROJECT_TOML") || { + echo "WARNING: failed to parse project TOML: $_PROJECT_TOML" >&2 + return 1 2>/dev/null || exit 1 +} + +# Export parsed variables (one KEY=value per line; the value keeps any further '=' characters) +while IFS='=' read -r _key _val; do + [ -z "$_key" ] && continue + export "$_key=$_val" +done <<< "$_PROJECT_VARS" + +# Derive CODEBERG_API if repo changed (guarded with :- so callers running under set -u do not abort) +if [ -n "${CODEBERG_REPO:-}" ]; then + export CODEBERG_API="https://codeberg.org/api/v1/repos/${CODEBERG_REPO}" +fi + +# Derive PROJECT_REPO_ROOT if not explicitly set (USER can be unset under cron, so fall back to id -un) +if [ -z "${PROJECT_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" 
]; then + export PROJECT_REPO_ROOT="/home/${USER:-$(id -un)}/${PROJECT_NAME}" +fi + +unset _PROJECT_TOML _PROJECT_VARS _key _val diff --git a/projects/versi.toml b/projects/versi.toml new file mode 100644 index 0000000..c39afee --- /dev/null +++ b/projects/versi.toml @@ -0,0 +1,21 @@ +# projects/versi.toml — Project config for johba/versi +# +# This file defines project-specific settings for disinto agents. +# Drop a new TOML file here to add another project — no code changes needed. + +name = "versi" +repo = "johba/versi" +repo_root = "/home/admin/versi" +primary_branch = "main" + +[ci] +woodpecker_repo_id = 3 +stale_minutes = 60 + +[services] +containers = [] + +[monitoring] +check_prs = true +check_dev_agent = true +check_pipeline_stall = true diff --git a/review/review-poll.sh b/review/review-poll.sh index 202fe6a..71b4d8c 100755 --- a/review/review-poll.sh +++ b/review/review-poll.sh @@ -6,7 +6,9 @@ set -euo pipefail -# Load shared environment +# Load shared environment (with optional project TOML override) +# Usage: review-poll.sh [projects/harb.toml] +export PROJECT_TOML="${1:-}" source "$(dirname "$0")/../lib/env.sh" diff --git a/supervisor/supervisor-poll.sh b/supervisor/supervisor-poll.sh index b1d44df..aae8c3b 100755 --- a/supervisor/supervisor-poll.sh +++ b/supervisor/supervisor-poll.sh @@ -1,8 +1,11 @@ #!/usr/bin/env bash # supervisor-poll.sh — Supervisor agent: bash checks + claude -p for fixes # -# Runs every 10min via cron. Does all health checks in bash (zero tokens). -# Only invokes claude -p when auto-fix fails or issue is complex. +# Two-layer architecture: +# 1. Factory infrastructure (project-agnostic): RAM, disk, swap, docker, stale processes +# 2. Per-project checks (config-driven): CI, PRs, dev-agent, deps — iterated over projects/*.toml +# +# Runs every 10min via cron. 
# # Cron: */10 * * * * /path/to/disinto/supervisor/supervisor-poll.sh # @@ -15,6 +18,7 @@ LOGFILE="${FACTORY_ROOT}/supervisor/supervisor.log" STATUSFILE="/tmp/supervisor-status" LOCKFILE="/tmp/supervisor-poll.lock" PROMPT_FILE="${FACTORY_ROOT}/supervisor/PROMPT.md" +PROJECTS_DIR="${FACTORY_ROOT}/projects" # Prevent overlapping runs if [ -f "$LOCKFILE" ]; then @@ -60,6 +64,11 @@ p4() { P4_ALERTS="${P4_ALERTS}• [P4] $*\n"; flog "P4: $*"; } FIXES="" fixed() { FIXES="${FIXES}• ✅ $*\n"; flog "FIXED: $*"; } +# ############################################################################# +# LAYER 1: FACTORY INFRASTRUCTURE +# (project-agnostic, runs once) +# ############################################################################# + # ============================================================================= # P0: MEMORY — check first, fix first # ============================================================================= @@ -82,13 +91,6 @@ if [ "${AVAIL_MB:-9999}" -lt 500 ] || { [ "${SWAP_USED_MB:-0}" -gt 3000 ] && [ " sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 fixed "Dropped filesystem caches" - # Restart Anvil if it's bloated (>1GB RSS) - ANVIL_CONTAINER="${ANVIL_CONTAINER:-${PROJECT_NAME}-anvil-1}" - ANVIL_RSS=$(sudo docker stats "$ANVIL_CONTAINER" --no-stream --format '{{.MemUsage}}' 2>/dev/null | grep -oP '^\S+' | head -1 || echo "0") - if echo "$ANVIL_RSS" | grep -qP '\dGiB'; then - sudo docker restart "$ANVIL_CONTAINER" >/dev/null 2>&1 && fixed "Restarted bloated Anvil (${ANVIL_RSS})" - fi - # Re-check after fixes AVAIL_MB_AFTER=$(free -m | awk '/Mem:/{print $7}') SWAP_AFTER=$(free -m | awk '/Swap:/{print $3}') @@ -113,8 +115,8 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then # Docker cleanup (safe — keeps images) sudo docker system prune -f >/dev/null 2>&1 && fixed "Docker prune" - # Truncate supervisor logs >10MB - for logfile in "${FACTORY_ROOT}"/{dev,review,factory}/*.log; do + # Truncate logs >10MB + for logfile in 
"${FACTORY_ROOT}"/{dev,review,supervisor}/*.log; do if [ -f "$logfile" ]; then SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1) if [ "${SIZE_KB:-0}" -gt 10240 ]; then @@ -124,19 +126,6 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then fi done - # Clean old worktrees - IDLE_WORKTREES=$(find /tmp/${PROJECT_NAME}-worktree-* -maxdepth 0 -mmin +360 2>/dev/null || true) - if [ -n "$IDLE_WORKTREES" ]; then - cd "${PROJECT_REPO_ROOT}" && git worktree prune 2>/dev/null - for wt in $IDLE_WORKTREES; do - # Only remove if dev-agent is not running on it - ISSUE_NUM=$(basename "$wt" | sed "s/${PROJECT_NAME}-worktree-//") - if ! pgrep -f "dev-agent.sh ${ISSUE_NUM}" >/dev/null 2>&1; then - rm -rf "$wt" && fixed "Removed stale worktree: $wt" - fi - done - fi - # Woodpecker log_entries cleanup LOG_ENTRIES_MB=$(wpdb -c "SELECT pg_size_pretty(pg_total_relation_size('log_entries'));" 2>/dev/null | xargs) if echo "$LOG_ENTRIES_MB" | grep -qP '\d+\s*(GB|MB)'; then @@ -157,283 +146,19 @@ if [ "${DISK_PERCENT:-0}" -gt 80 ]; then fi # ============================================================================= -# P2: FACTORY STOPPED — CI, dev-agent, git +# P4-INFRA: HOUSEKEEPING — stale processes, log rotation (project-agnostic) # ============================================================================= -status "P2: checking pipeline" +status "P4: infra housekeeping" -# CI stuck -STUCK_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='running' AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;" 2>/dev/null | xargs || true) -[ "${STUCK_CI:-0}" -gt 0 ] 2>/dev/null && p2 "CI: ${STUCK_CI} pipeline(s) running >20min" - -PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=${WOODPECKER_REPO_ID} AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs || true) -[ "${PENDING_CI:-0}" -gt 0 ] && p2 "CI: ${PENDING_CI} pipeline(s) pending >30min" - -# Dev-agent health 
-DEV_LOCK="/tmp/dev-agent.lock" -if [ -f "$DEV_LOCK" ]; then - DEV_PID=$(cat "$DEV_LOCK" 2>/dev/null) - if ! kill -0 "$DEV_PID" 2>/dev/null; then - rm -f "$DEV_LOCK" - fixed "Removed stale dev-agent lock (PID ${DEV_PID} dead)" - else - DEV_STATUS_AGE=$(stat -c %Y /tmp/dev-agent-status 2>/dev/null || echo 0) - NOW_EPOCH=$(date +%s) - STATUS_AGE_MIN=$(( (NOW_EPOCH - DEV_STATUS_AGE) / 60 )) - if [ "$STATUS_AGE_MIN" -gt 30 ]; then - p2 "Dev-agent: status unchanged for ${STATUS_AGE_MIN}min" - fi - fi -fi - -# Git repo health -cd "${PROJECT_REPO_ROOT}" 2>/dev/null || true -GIT_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") -GIT_REBASE=$([ -d .git/rebase-merge ] || [ -d .git/rebase-apply ] && echo "yes" || echo "no") - -if [ "$GIT_REBASE" = "yes" ]; then - git rebase --abort 2>/dev/null && git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \ - fixed "Aborted stale rebase, switched to ${PRIMARY_BRANCH}" || \ - p2 "Git: stale rebase, auto-abort failed" -fi -if [ "$GIT_BRANCH" != "${PRIMARY_BRANCH}" ] && [ "$GIT_BRANCH" != "unknown" ]; then - git checkout "${PRIMARY_BRANCH}" 2>/dev/null && \ - fixed "Switched main repo from '${GIT_BRANCH}' to ${PRIMARY_BRANCH}" || \ - p2 "Git: on '${GIT_BRANCH}' instead of ${PRIMARY_BRANCH}" -fi - -# ============================================================================= -# P2b: FACTORY STALLED — backlog exists but no agent running -# ============================================================================= -status "P2: checking pipeline stall" - -BACKLOG_COUNT=$(codeberg_api GET "/issues?state=open&labels=backlog&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0") -IN_PROGRESS=$(codeberg_api GET "/issues?state=open&labels=in-progress&type=issues&limit=1" 2>/dev/null | jq -r 'length' 2>/dev/null || echo "0") - -if [ "${BACKLOG_COUNT:-0}" -gt 0 ] && [ "${IN_PROGRESS:-0}" -eq 0 ]; then - # Backlog exists but nothing in progress — check if dev-agent ran recently - 
DEV_LOG="${FACTORY_ROOT}/dev/dev-agent.log" - if [ -f "$DEV_LOG" ]; then - LAST_LOG_EPOCH=$(stat -c %Y "$DEV_LOG" 2>/dev/null || echo 0) - else - LAST_LOG_EPOCH=0 - fi - NOW_EPOCH=$(date +%s) - IDLE_MIN=$(( (NOW_EPOCH - LAST_LOG_EPOCH) / 60 )) - - if [ "$IDLE_MIN" -gt 20 ]; then - p2 "Pipeline stalled: ${BACKLOG_COUNT} backlog issue(s), no agent ran for ${IDLE_MIN}min" - fi -fi - -# ============================================================================= -# P2c: DEV-AGENT PRODUCTIVITY — all backlog blocked for too long -# ============================================================================= -status "P2: checking dev-agent productivity" - -DEV_LOG_FILE="${FACTORY_ROOT}/dev/dev-agent.log" -if [ -f "$DEV_LOG_FILE" ]; then - # Check if last 6 poll entries all report "no ready issues" (~1 hour at 10min intervals) - RECENT_POLLS=$(tail -100 "$DEV_LOG_FILE" | grep "poll:" | tail -6) - TOTAL_RECENT=$(echo "$RECENT_POLLS" | grep -c "." || true) - BLOCKED_IN_RECENT=$(echo "$RECENT_POLLS" | grep -c "no ready issues" || true) - if [ "$TOTAL_RECENT" -ge 6 ] && [ "$BLOCKED_IN_RECENT" -eq "$TOTAL_RECENT" ]; then - p2 "Dev-agent blocked: last ${BLOCKED_IN_RECENT} polls all report 'no ready issues' — all backlog issues may be dep-blocked or have circular deps" - fi -fi - -# ============================================================================= -# P3: FACTORY DEGRADED — derailed PRs, unreviewed PRs -# ============================================================================= -status "P3: checking PRs" - -OPEN_PRS=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null | jq -r '.[].number' 2>/dev/null || true) -for pr in $OPEN_PRS; do - PR_JSON=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null || true) - [ -z "$PR_JSON" ] && continue - PR_SHA=$(echo "$PR_JSON" | jq -r '.head.sha // ""') - [ -z "$PR_SHA" ] && continue - - CI_STATE=$(codeberg_api GET "/commits/${PR_SHA}/status" 2>/dev/null | jq -r '.state // "unknown"' 2>/dev/null || true) - - # Check for merge 
conflicts first (approved + CI pass but unmergeable) - MERGEABLE=$(echo "$PR_JSON" | jq -r '.mergeable // true') - if [ "$MERGEABLE" = "false" ] && [ "$CI_STATE" = "success" ]; then - p3 "PR #${pr}: CI pass but merge conflict — needs rebase" - elif [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then - UPDATED=$(echo "$PR_JSON" | jq -r '.updated_at // ""') - if [ -n "$UPDATED" ]; then - UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0) - NOW_EPOCH=$(date +%s) - AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 )) - [ "$AGE_MIN" -gt 30 ] && p3 "PR #${pr}: CI=${CI_STATE}, stale ${AGE_MIN}min" - fi - elif [ "$CI_STATE" = "success" ]; then - # Check if reviewed at this SHA - HAS_REVIEW=$(codeberg_api GET "/issues/${pr}/comments?limit=50" 2>/dev/null | \ - jq -r --arg sha "$PR_SHA" '[.[] | select(.body | contains("