From cb7dd398c726c8cf2cdce2802e74ee689467ec40 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 12 Mar 2026 13:00:17 +0000 Subject: [PATCH] feat: factory supervisor with priorities, auto-fix, and claude -p escalation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P0: memory crisis (auto-kill stale claude, drop caches, restart Anvil) - P1: disk pressure (docker prune, log truncate, worktree cleanup, WP log trim) - P2: factory stopped (CI stuck, dev-agent dead, git broken — auto-fix where possible) - P3: factory degraded (derailed PRs, auto-trigger reviews) - P4: housekeeping (stale processes, log rotation) Calls claude -p only for P0/P1 issues that auto-fix couldn't resolve. PROMPT.md contains distilled operational knowledge + self-update mechanism. --- factory/PROMPT.md | 88 +++++++++++ factory/factory-poll.sh | 327 +++++++++++++++++++++++++++------------ factory/update-prompt.sh | 34 ++++ 3 files changed, 347 insertions(+), 102 deletions(-) create mode 100644 factory/PROMPT.md create mode 100755 factory/update-prompt.sh diff --git a/factory/PROMPT.md b/factory/PROMPT.md new file mode 100644 index 0000000..389adac --- /dev/null +++ b/factory/PROMPT.md @@ -0,0 +1,88 @@ +# Factory Supervisor — System Prompt + +You are the factory supervisor for the `johba/harb` DeFi protocol repo. You were +called because `factory-poll.sh` detected an issue it couldn't auto-fix. + +## Your Environment + +- **VPS:** 8GB RAM, 4GB swap, Debian +- **Repo:** `/home/debian/harb` (Codeberg: johba/harb, branch: master, protected) +- **CI:** Woodpecker at localhost:8000 (Docker backend) +- **Stack:** Docker containers (anvil, ponder, webapp, landing, caddy, postgres, txn-bot, otterscan) +- **Tools:** Foundry at `~/.foundry/bin/`, Node at `~/.nvm/versions/node/v22.20.0/bin/` +- **Factory scripts:** See FACTORY_ROOT env var + +## Priority Order + +1. **P0 — Memory crisis:** RAM <500MB available OR swap >3GB. Fix IMMEDIATELY. +2. 
**P1 — Disk pressure:** Disk >80%. Clean up before builds fail. +3. **P2 — Factory stopped:** Dev-agent dead, CI down, git repo broken. +4. **P3 — Factory degraded:** Derailed PR, stuck pipeline, unreviewed PRs. +5. **P4 — Housekeeping:** Stale processes, log rotation, docker cleanup. + +## What You Can Do (no permission needed) + +- Kill stale `claude` processes older than 3h (`pgrep -f "claude" --older 10800 | xargs kill`) — never a bare `pgrep -f "claude"`, which also matches your own session +- Clean docker: `sudo docker system prune -f` (NOT `-a --volumes` — that kills CI images) +- Truncate large logs: `truncate -s 0 <logfile>` for factory logs +- Remove stale lock files (`/tmp/dev-agent.lock` if PID is dead) +- Restart dev-agent on a derailed PR: `bash ${FACTORY_ROOT}/dev/dev-agent.sh &` +- Restart frozen Anvil: `sudo docker restart harb-anvil-1` +- Retrigger CI: empty commit + push on a PR branch +- Clean Woodpecker log_entries: `wpdb -c "DELETE FROM log_entries WHERE id < (SELECT max(id)-100000 FROM log_entries);"` +- Drop filesystem caches: `sync && echo 3 | sudo tee /proc/sys/vm/drop_caches` +- Prune git worktrees: `cd /home/debian/harb && git worktree prune` +- Kill orphan worktree processes + +## What You CANNOT Do (escalate to Clawy) + +- Merge PRs +- Close/reopen issues +- Make architecture decisions +- Modify production contracts +- Run `docker system prune -a --volumes` (kills CI images, hours to rebuild) +- Anything you're unsure about + +## Best Practices (distilled from experience) + +### Memory Management +- Docker containers grow: Anvil reaches 12GB+ within hours. Restart is the fix. +- `claude` processes from dev-agent can zombie at 200MB+ each. Kill any older than 3h. +- `forge build` with via_ir OOMs on 8GB. Never compile full test suite — use `--skip test script`. +- After killing processes, run `sync && echo 3 | sudo tee /proc/sys/vm/drop_caches`. + +### Disk Management +- Woodpecker `log_entries` table grows to 5GB+. Truncate periodically, then `VACUUM FULL`. +- Docker overlay layers survive normal prune. 
Use `docker system prune -f` (NOT `-a`). +- Git worktrees in `/tmp/harb-worktree-*` accumulate. Prune if dev-agent is idle. +- Node module caches in worktrees eat disk. Remove `/tmp/harb-worktree-*/node_modules/`. + +### CI +- Codeberg rate-limits SSH clones. If `git` step fails with exit 128, retrigger (empty commit). +- CI images are pre-built. `docker system prune -a` deletes them — hours to rebuild. +- Running CI + harb stack = 14+ containers. Only run one pipeline at a time. +- `log_entries` table: truncate when >1GB. + +### Dev-Agent +- Lock file at `/tmp/dev-agent.lock`. If PID is dead, remove lock file. +- Worktrees at `/tmp/harb-worktree-<issue-number>`. Preserved for session continuity. +- `claude` subprocess timeout is 2h. Kill if running longer. +- After killing dev-agent, ensure the issue is unclaimed (remove `in-progress` label). + +### Git +- Main repo must be on `master`. If detached HEAD or mid-rebase: `git rebase --abort && git checkout master`. +- Never delete remote branches before confirmed merged. +- Stale worktrees break `git worktree add`. Run `git worktree prune` to fix. + +## Output Format + +After fixing, output a SHORT summary: +``` +FIXED: <what you fixed> +REMAINING: <what still needs attention, or "nothing"> +``` + +If you can't fix it: +``` +ESCALATE: <what is wrong and why you could not fix it> +``` diff --git a/factory/factory-poll.sh b/factory/factory-poll.sh index 15145e0..c74be55 100755 --- a/factory/factory-poll.sh +++ b/factory/factory-poll.sh @@ -2,7 +2,7 @@ # factory-poll.sh — Factory supervisor: bash checks + claude -p for fixes # # Runs every 10min via cron. Does all health checks in bash (zero tokens). -# Only invokes claude -p when intervention is needed. +# Only invokes claude -p when auto-fix fails or issue is complex. 
# # Cron: */10 * * * * /path/to/dark-factory/factory/factory-poll.sh # @@ -14,6 +14,7 @@ source "$(dirname "$0")/../lib/env.sh" LOGFILE="${FACTORY_ROOT}/factory/factory.log" STATUSFILE="/tmp/factory-status" LOCKFILE="/tmp/factory-poll.lock" +PROMPT_FILE="${FACTORY_ROOT}/factory/PROMPT.md" # Prevent overlapping runs if [ -f "$LOCKFILE" ]; then @@ -26,170 +27,292 @@ fi echo $$ > "$LOCKFILE" trap 'rm -f "$LOCKFILE" "$STATUSFILE"' EXIT +flog() { + printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE" +} + status() { printf '[%s] factory: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" > "$STATUSFILE" - log "$*" >> "$LOGFILE" + flog "$*" } -ALERTS="" -alert() { - ALERTS="${ALERTS}• $*\n" - log "ALERT: $*" >> "$LOGFILE" -} +# Alerts by priority +P0_ALERTS="" +P1_ALERTS="" +P2_ALERTS="" +P3_ALERTS="" +P4_ALERTS="" + +p0() { P0_ALERTS="${P0_ALERTS}• [P0] $*\n"; flog "P0: $*"; } +p1() { P1_ALERTS="${P1_ALERTS}• [P1] $*\n"; flog "P1: $*"; } +p2() { P2_ALERTS="${P2_ALERTS}• [P2] $*\n"; flog "P2: $*"; } +p3() { P3_ALERTS="${P3_ALERTS}• [P3] $*\n"; flog "P3: $*"; } +p4() { P4_ALERTS="${P4_ALERTS}• [P4] $*\n"; flog "P4: $*"; } + +FIXES="" +fixed() { FIXES="${FIXES}• ✅ $*\n"; flog "FIXED: $*"; } # ============================================================================= -# CHECK 1: Stuck/failed CI pipelines +# P0: MEMORY — check first, fix first # ============================================================================= -status "checking CI" +status "P0: checking memory" -STUCK_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=2 AND status='running' AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;" 2>/dev/null | xargs) -[ "${STUCK_CI:-0}" -gt 0 ] && alert "CI: ${STUCK_CI} pipeline(s) running >20min" +AVAIL_MB=$(free -m | awk '/Mem:/{print $7}') +SWAP_USED_MB=$(free -m | awk '/Swap:/{print $3}') -PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=2 AND status='pending' AND EXTRACT(EPOCH FROM now() - 
to_timestamp(created)) > 1800;" 2>/dev/null | xargs) -[ "${PENDING_CI:-0}" -gt 0 ] && alert "CI: ${PENDING_CI} pipeline(s) pending >30min" +if [ "${AVAIL_MB:-9999}" -lt 500 ] || [ "${SWAP_USED_MB:-0}" -gt 3000 ]; then + flog "MEMORY CRISIS: avail=${AVAIL_MB}MB swap_used=${SWAP_USED_MB}MB — auto-fixing" + + # Kill stale claude processes (>3h old) + STALE_CLAUDES=$(pgrep -f "claude" --older 10800 2>/dev/null || true) + if [ -n "$STALE_CLAUDES" ]; then + echo "$STALE_CLAUDES" | xargs kill 2>/dev/null || true + fixed "Killed stale claude processes: ${STALE_CLAUDES}" + fi + + # Drop filesystem caches + sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 + fixed "Dropped filesystem caches" + + # Restart Anvil if it's bloated (>1GB RSS) + ANVIL_RSS=$(sudo docker stats harb-anvil-1 --no-stream --format '{{.MemUsage}}' 2>/dev/null | grep -oP '^\S+' | head -1 || echo "0") + if echo "$ANVIL_RSS" | grep -qP '\dGiB'; then + sudo docker restart harb-anvil-1 >/dev/null 2>&1 && fixed "Restarted bloated Anvil (${ANVIL_RSS})" + fi + + # Re-check after fixes + AVAIL_MB_AFTER=$(free -m | awk '/Mem:/{print $7}') + SWAP_AFTER=$(free -m | awk '/Swap:/{print $3}') + + if [ "${AVAIL_MB_AFTER:-0}" -lt 500 ] || [ "${SWAP_AFTER:-0}" -gt 3000 ]; then + p0 "Memory still critical after auto-fix: avail=${AVAIL_MB_AFTER}MB swap=${SWAP_AFTER}MB" + else + flog "Memory recovered: avail=${AVAIL_MB_AFTER}MB swap=${SWAP_AFTER}MB" + fi +fi # ============================================================================= -# CHECK 2: Derailed PRs — open with CI failure + no push in 30min +# P1: DISK # ============================================================================= -status "checking PRs" +status "P1: checking disk" -OPEN_PRS=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null | jq -r '.[].number' 2>/dev/null || true) -for pr in $OPEN_PRS; do - PR_SHA=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null | jq -r '.head.sha' 2>/dev/null || true) - [ -z "$PR_SHA" ] && continue 
+DISK_PERCENT=$(df -h / | awk 'NR==2{print $5}' | tr -d '%') - CI_STATE=$(codeberg_api GET "/commits/${PR_SHA}/status" 2>/dev/null | jq -r '.state // "unknown"' 2>/dev/null || true) - if [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then - # Check when last push happened - UPDATED=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null | jq -r '.updated_at // ""' 2>/dev/null || true) - if [ -n "$UPDATED" ]; then - UPDATED_EPOCH=$(date -d "$UPDATED" +%s 2>/dev/null || echo 0) - NOW_EPOCH=$(date +%s) - AGE_MIN=$(( (NOW_EPOCH - UPDATED_EPOCH) / 60 )) - if [ "$AGE_MIN" -gt 30 ]; then - alert "PR #${pr}: CI=${CI_STATE}, no activity for ${AGE_MIN}min" +if [ "${DISK_PERCENT:-0}" -gt 80 ]; then + flog "DISK PRESSURE: ${DISK_PERCENT}% — auto-cleaning" + + # Docker cleanup (safe — keeps images) + sudo docker system prune -f >/dev/null 2>&1 && fixed "Docker prune" + + # Truncate factory logs >10MB + for logfile in "${FACTORY_ROOT}"/{dev,review,factory}/*.log; do + if [ -f "$logfile" ]; then + SIZE_KB=$(du -k "$logfile" 2>/dev/null | cut -f1) + if [ "${SIZE_KB:-0}" -gt 10240 ]; then + truncate -s 0 "$logfile" + fixed "Truncated $(basename "$logfile") (was ${SIZE_KB}KB)" fi fi + done + + # Clean old worktrees + IDLE_WORKTREES=$(find /tmp/harb-worktree-* -maxdepth 0 -mmin +360 2>/dev/null || true) + if [ -n "$IDLE_WORKTREES" ]; then + cd "${HARB_REPO_ROOT}" && git worktree prune 2>/dev/null + for wt in $IDLE_WORKTREES; do + # Only remove if dev-agent is not running on it + ISSUE_NUM=$(basename "$wt" | sed 's/harb-worktree-//') + if ! 
pgrep -f "dev-agent.sh ${ISSUE_NUM}" >/dev/null 2>&1; then + rm -rf "$wt" && fixed "Removed stale worktree: $wt" + fi + done fi -done + + # Woodpecker log_entries cleanup + LOG_ENTRIES_MB=$(wpdb -c "SELECT pg_size_pretty(pg_total_relation_size('log_entries'));" 2>/dev/null | xargs) + if echo "$LOG_ENTRIES_MB" | grep -qP '\d+\s*(GB|MB)'; then + SIZE_NUM=$(echo "$LOG_ENTRIES_MB" | grep -oP '\d+') + SIZE_UNIT=$(echo "$LOG_ENTRIES_MB" | grep -oP '(GB|MB)') + if [ "$SIZE_UNIT" = "GB" ] || { [ "$SIZE_UNIT" = "MB" ] && [ "$SIZE_NUM" -gt 500 ]; }; then + wpdb -c "DELETE FROM log_entries WHERE id < (SELECT max(id) - 100000 FROM log_entries);" 2>/dev/null + fixed "Trimmed Woodpecker log_entries (was ${LOG_ENTRIES_MB})" + fi + fi + + DISK_AFTER=$(df -h / | awk 'NR==2{print $5}' | tr -d '%') + if [ "${DISK_AFTER:-0}" -gt 80 ]; then + p1 "Disk still ${DISK_AFTER}% after auto-clean" + else + flog "Disk recovered: ${DISK_AFTER}%" + fi +fi # ============================================================================= -# CHECK 3: Dev-agent health +# P2: FACTORY STOPPED — CI, dev-agent, git # ============================================================================= -status "checking dev-agent" +status "P2: checking factory" +# CI stuck +STUCK_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=2 AND status='running' AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;" 2>/dev/null | xargs) +[ "${STUCK_CI:-0}" -gt 0 ] && p2 "CI: ${STUCK_CI} pipeline(s) running >20min" + +PENDING_CI=$(wpdb -c "SELECT count(*) FROM pipelines WHERE repo_id=2 AND status='pending' AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;" 2>/dev/null | xargs) +[ "${PENDING_CI:-0}" -gt 0 ] && p2 "CI: ${PENDING_CI} pipeline(s) pending >30min" + +# Dev-agent health DEV_LOCK="/tmp/dev-agent.lock" if [ -f "$DEV_LOCK" ]; then DEV_PID=$(cat "$DEV_LOCK" 2>/dev/null) if ! 
kill -0 "$DEV_PID" 2>/dev/null; then - alert "Dev-agent: lock file exists but PID ${DEV_PID} is dead (stale lock)" + rm -f "$DEV_LOCK" + fixed "Removed stale dev-agent lock (PID ${DEV_PID} dead)" else - # Check if it's making progress — same status for >30min? - DEV_STATUS=$(cat /tmp/dev-agent-status 2>/dev/null || echo "") DEV_STATUS_AGE=$(stat -c %Y /tmp/dev-agent-status 2>/dev/null || echo 0) NOW_EPOCH=$(date +%s) STATUS_AGE_MIN=$(( (NOW_EPOCH - DEV_STATUS_AGE) / 60 )) if [ "$STATUS_AGE_MIN" -gt 30 ]; then - alert "Dev-agent: status unchanged for ${STATUS_AGE_MIN}min — possibly stuck" + p2 "Dev-agent: status unchanged for ${STATUS_AGE_MIN}min" fi fi fi -# ============================================================================= -# CHECK 4: Git repo health -# ============================================================================= -status "checking git repo" - +# Git repo health cd "${HARB_REPO_ROOT}" 2>/dev/null || true -GIT_STATUS=$(git status --porcelain 2>/dev/null | wc -l) GIT_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") GIT_REBASE=$([ -d .git/rebase-merge ] || [ -d .git/rebase-apply ] && echo "yes" || echo "no") if [ "$GIT_REBASE" = "yes" ]; then - alert "Git: stale rebase in progress on main repo" + git rebase --abort 2>/dev/null && git checkout master 2>/dev/null && \ + fixed "Aborted stale rebase, switched to master" || \ + p2 "Git: stale rebase, auto-abort failed" fi -if [ "$GIT_BRANCH" != "master" ]; then - alert "Git: main repo on branch '${GIT_BRANCH}' instead of master" +if [ "$GIT_BRANCH" != "master" ] && [ "$GIT_BRANCH" != "unknown" ]; then + git checkout master 2>/dev/null && \ + fixed "Switched main repo from '${GIT_BRANCH}' to master" || \ + p2 "Git: on '${GIT_BRANCH}' instead of master" fi # ============================================================================= -# CHECK 5: Infra — RAM, swap, disk, docker +# P3: FACTORY DEGRADED — derailed PRs, unreviewed PRs # 
============================================================================= -status "checking infra" - -AVAIL_MB=$(free -m | awk '/Mem:/{print $7}') -SWAP_USED_MB=$(free -m | awk '/Swap:/{print $3}') -DISK_PERCENT=$(df -h / | awk 'NR==2{print $5}' | tr -d '%') - -if [ "${AVAIL_MB:-0}" -lt 500 ]; then - alert "RAM: only ${AVAIL_MB}MB available" -fi -if [ "${SWAP_USED_MB:-0}" -gt 3000 ]; then - alert "Swap: ${SWAP_USED_MB}MB used (>3GB)" -fi -if [ "${DISK_PERCENT:-0}" -gt 85 ]; then - alert "Disk: ${DISK_PERCENT}% full" -fi - -# Check if Anvil is responsive -ANVIL_OK=$(curl -sf -m 5 -X POST -H "Content-Type: application/json" \ - -d '{"jsonrpc":"2.0","method":"eth_chainId","params":[],"id":1}' \ - http://localhost:8545 2>/dev/null | jq -r '.result // "fail"' 2>/dev/null || echo "fail") -if [ "$ANVIL_OK" = "fail" ]; then - # Try to auto-fix - sudo docker restart harb-anvil-1 2>/dev/null && \ - log "Auto-fixed: restarted frozen Anvil" >> "$LOGFILE" || \ - alert "Anvil: unresponsive and restart failed" -fi - -# ============================================================================= -# CHECK 6: Review bot — unreviewed PRs older than 1h -# ============================================================================= -status "checking review backlog" +status "P3: checking PRs" +OPEN_PRS=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null | jq -r '.[].number' 2>/dev/null || true) for pr in $OPEN_PRS; do - PR_SHA=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null | jq -r '.head.sha' 2>/dev/null || true) + PR_JSON=$(codeberg_api GET "/pulls/${pr}" 2>/dev/null || true) + [ -z "$PR_JSON" ] && continue + PR_SHA=$(echo "$PR_JSON" | jq -r '.head.sha // ""') [ -z "$PR_SHA" ] && continue CI_STATE=$(codeberg_api GET "/commits/${PR_SHA}/status" 2>/dev/null | jq -r '.state // "unknown"' 2>/dev/null || true) - [ "$CI_STATE" != "success" ] && continue - # CI passed — check if reviewed at this SHA - HAS_REVIEW=$(codeberg_api GET "/issues/${pr}/comments?limit=50" 2>/dev/null | \ 
- jq -r --arg sha "$PR_SHA" '[.[] | select(.body | contains("