disinto/supervisor/supervisor-run.sh
openhands f716a75351 fix: Preserve worktrees on crash for debugging (#726)
On crash (PHASE:crashed or non-zero exit), preserve the worktree and log
its location instead of destroying it unconditionally. Successful sessions
still clean up normally. Supervisor runs housekeeping to remove stale
crashed worktrees older than 24h.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 13:41:33 +00:00

104 lines
4.7 KiB
Bash
Executable file

#!/usr/bin/env bash
# =============================================================================
# supervisor-run.sh — Cron wrapper: supervisor execution via Claude + formula
#
# Runs every 20 minutes (or on-demand). Guards against concurrent runs and
# low memory. Collects metrics via preflight.sh, then creates a tmux session
# with Claude (sonnet) reading formulas/run-supervisor.toml.
#
# Replaces supervisor-poll.sh (bash orchestrator + claude -p one-shot) with a
# formula-driven interactive Claude session matching the planner/predictor
# pattern.
#
# Usage:
# supervisor-run.sh [projects/disinto.toml] # project config (default: disinto)
#
# Cron: */20 * * * * cd /path/to/dark-factory && bash supervisor/supervisor-run.sh
# =============================================================================
# Strict mode: abort on a failing command, an unset variable, or a failing
# pipeline stage.
set -euo pipefail
# Absolute directory containing this script; FACTORY_ROOT is its parent.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
# Accept project config from argument; default to disinto.
# Exported so that child processes inherit the selected config path.
export PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}"
# Shared libraries, sourced after PROJECT_TOML is exported.
# NOTE(review): PROJECT_NAME and FORGE_REPO are used later in this script but
# are never assigned here — presumably lib/env.sh derives them from
# PROJECT_TOML; confirm.
# shellcheck source=../lib/env.sh
source "$FACTORY_ROOT/lib/env.sh"
# shellcheck source=../lib/agent-session.sh
source "$FACTORY_ROOT/lib/agent-session.sh"
# shellcheck source=../lib/formula-session.sh
source "$FACTORY_ROOT/lib/formula-session.sh"
# shellcheck source=../lib/guard.sh
source "$FACTORY_ROOT/lib/guard.sh"
# Run log lives next to this script; log() appends to it.
LOG_FILE="$SCRIPT_DIR/supervisor.log"
# Per-project session name.
# shellcheck disable=SC2034 # consumed by run_formula_and_monitor
SESSION_NAME="supervisor-${PROJECT_NAME}"
# File the agent is told (in the prompt) to write PHASE:done to when finished.
PHASE_FILE="/tmp/supervisor-session-${PROJECT_NAME}.phase"
# NOTE(review): interval is presumably in seconds — confirm in
# lib/agent-session.sh.
# shellcheck disable=SC2034 # read by monitor_phase_loop in lib/agent-session.sh
PHASE_POLL_INTERVAL=15
# Scratch notes file: passed to the scratch helpers when building the prompt
# and removed at the end of the script once the final phase is PHASE:done.
SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md"
# Append one line to $LOG_FILE: a UTC ISO-8601 timestamp in square brackets,
# followed by all arguments joined into a single message.
log() {
  printf '[%sZ] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%S)" "$*" >> "$LOG_FILE"
}
# ── Guards ────────────────────────────────────────────────────────────────
# The guard helpers are defined in the sourced libs, not in this file.
# NOTE(review): presumably each one exits the script early when its condition
# is not met — confirm in lib/guard.sh.
check_active supervisor
# NOTE(review): the lock path is not project-specific, unlike SESSION_NAME and
# PHASE_FILE — presumably intentional so only one supervisor runs per host;
# confirm.
acquire_cron_lock "/tmp/supervisor-run.lock"
# NOTE(review): 2000 is presumably a minimum amount of available memory in
# MB — confirm units against check_memory.
check_memory 2000
log "--- Supervisor run start ---"
# ── Housekeeping: clean up stale crashed worktrees (>24h) ────────────────
# The argument is the age threshold in hours, per the heading above.
cleanup_stale_crashed_worktrees 24
# ── Collect pre-flight metrics ────────────────────────────────────────────
# Run preflight.sh for the selected project, capturing stdout and stderr
# together. A failing preflight is not fatal: the exit status is captured so
# strict mode does not abort, and whatever output was produced is kept.
log "Running preflight.sh"
PREFLIGHT_OUTPUT=""
preflight_rc=0
PREFLIGHT_OUTPUT=$(bash "$SCRIPT_DIR/preflight.sh" "$PROJECT_TOML" 2>&1) || preflight_rc=$?
if [ "$preflight_rc" -ne 0 ]; then
  log "WARNING: preflight.sh failed, continuing with partial data"
else
  log "Preflight collected ($(echo "$PREFLIGHT_OUTPUT" | wc -l) lines)"
fi
# ── Load formula + context ───────────────────────────────────────────────
# These helpers come from the sourced libs.
# NOTE(review): FORMULA_CONTENT and CONTEXT_BLOCK are interpolated into the
# prompt later but never assigned in this file — presumably load_formula and
# build_context_block set them as globals; confirm in lib/formula-session.sh.
load_formula "$FACTORY_ROOT/formulas/run-supervisor.toml"
build_context_block AGENTS.md
# ── Read scratch file (compaction survival) ───────────────────────────────
# Both helpers receive the scratch file path and return text on stdout, which
# is captured here for inclusion in the prompt.
SCRATCH_CONTEXT=$(read_scratch_context "$SCRATCH_FILE")
SCRATCH_INSTRUCTION=$(build_scratch_instruction "$SCRATCH_FILE")
# ── Build prompt ─────────────────────────────────────────────────────────
# NOTE(review): presumably sets PROMPT_FOOTER, which the prompt appends as its
# final section — confirm.
build_prompt_footer
# Assemble the full agent prompt as one multi-line string. In order it
# contains: role and completion contract (write PHASE:done to $PHASE_FILE),
# operating rules, the captured pre-flight output stamped with the current UTC
# time, the project context block, the scratch context, the priority order,
# the formula content, the scratch instruction, and the prompt footer.
# The ${SCRATCH_CONTEXT:+…} expansion emits the scratch context plus a trailing
# newline only when SCRATCH_CONTEXT is non-empty.
# shellcheck disable=SC2034 # consumed by run_formula_and_monitor
PROMPT="You are the supervisor agent for ${FORGE_REPO}. Work through the formula below. You MUST write PHASE:done to '${PHASE_FILE}' when finished — the orchestrator will time you out if you return to the prompt without signalling.
You have full shell access and --dangerously-skip-permissions.
Fix what you can. File vault items for what you cannot. Do NOT ask permission — act first, report after.
## Pre-flight metrics (collected $(date -u +%H:%M) UTC)
${PREFLIGHT_OUTPUT}
## Project context
${CONTEXT_BLOCK}
${SCRATCH_CONTEXT:+${SCRATCH_CONTEXT}
}
Priority order: P0 memory > P1 disk > P2 stopped > P3 degraded > P4 housekeeping
${FORMULA_CONTENT}
${SCRATCH_INSTRUCTION}
${PROMPT_FOOTER}"
# ── Run session ──────────────────────────────────────────────────────────
# Publish the model choice through the environment, then hand control to the
# shared formula runner.
# NOTE(review): 1200 is presumably a timeout in seconds (20 minutes, matching
# the cron cadence) — confirm against run_formula_and_monitor.
CLAUDE_MODEL="sonnet"
export CLAUDE_MODEL
run_formula_and_monitor "supervisor" 1200
# ── Cleanup scratch file on normal exit ──────────────────────────────────
# FINAL_PHASE already set by run_formula_and_monitor
# The scratch file is removed only when the final phase is PHASE:done; for any
# other phase it is left in place.
case "${FINAL_PHASE:-}" in
  "PHASE:done")
    rm -f "$SCRATCH_FILE"
    ;;
esac