fix: feat: supervisor as formula-driven agent — cron + Matrix escalation (#245)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7fe5ed0381
commit
d8244742f1
5 changed files with 585 additions and 12 deletions
0
supervisor/journal/.gitkeep
Normal file
0
supervisor/journal/.gitkeep
Normal file
188
supervisor/preflight.sh
Executable file
188
supervisor/preflight.sh
Executable file
|
|
@ -0,0 +1,188 @@
|
|||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# preflight.sh — Collect system and project metrics for the supervisor formula
|
||||
#
|
||||
# Outputs structured text to stdout. Called by supervisor-run.sh before
|
||||
# launching the Claude session. The output is injected into the prompt.
|
||||
#
|
||||
# Usage:
|
||||
# bash supervisor/preflight.sh [projects/disinto.toml]
|
||||
# =============================================================================
|
||||
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -euo pipefail

# Absolute directory containing this script, and its parent directory.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
FACTORY_ROOT=$(dirname "$SCRIPT_DIR")

# Project config: first CLI argument, otherwise projects/disinto.toml under
# the factory root. Exported before the libraries are sourced.
PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}"
export PROJECT_TOML

# shellcheck source=../lib/env.sh
source "$FACTORY_ROOT/lib/env.sh"
# shellcheck source=../lib/ci-helpers.sh
source "$FACTORY_ROOT/lib/ci-helpers.sh"
|
||||
|
||||
# ── System Resources ─────────────────────────────────────────────────────
# Prints RAM, swap, root-filesystem usage and load average.
# `free` and `df` are each run once so the figures on a line come from the
# same snapshot. A value that cannot be read is printed as "?" rather than
# letting a missing tool abort the whole report under `set -e`.

echo "## System Resources"

# `free -m` columns used: Mem: total=$2, available=$7; Swap: used=$3.
# Fields are joined with "|" so an empty field keeps its position.
_mem_stats=$(free -m 2>/dev/null \
  | awk '/Mem:/{total=$2; avail=$7} /Swap:/{swap=$3} END{print total "|" avail "|" swap}') \
  || _mem_stats="||"
IFS='|' read -r _total_mb _avail_mb _swap_used <<<"$_mem_stats" || true

# -P forces one output line per filesystem, so a long device name cannot push
# the figures onto a continuation line and break the NR==2 selection.
_disk_stats=$(df -hP / 2>/dev/null \
  | awk 'NR==2{gsub(/%/, "", $5); print $5 "|" $3 "|" $2}') \
  || _disk_stats="||"
IFS='|' read -r _disk_pct _disk_used _disk_total <<<"$_disk_stats" || true

_load=$(cat /proc/loadavg 2>/dev/null || echo "unknown")

echo "RAM: ${_avail_mb:-?}MB available / ${_total_mb:-?}MB total, Swap: ${_swap_used:-?}MB used"
echo "Disk: ${_disk_pct:-?}% used (${_disk_used:-?}/${_disk_total:-?} on /)"
echo "Load: ${_load}"
echo ""
|
||||
|
||||
# ── Docker ────────────────────────────────────────────────────────────────
# Prints a name/status table of running containers, or a one-line note when
# the docker CLI is absent or the query itself fails.

echo "## Docker"
if ! command -v docker >/dev/null 2>&1; then
  echo "Docker not available"
elif ! docker ps --format 'table {{.Names}}\t{{.Status}}' 2>/dev/null; then
  echo "Docker query failed"
fi
echo ""
|
||||
|
||||
# ── Active Sessions + Phase Files ─────────────────────────────────────────

echo "## Active Sessions"
# tmux writes the session list to stdout itself; any non-zero exit is
# replaced by the placeholder line.
tmux list-sessions 2>/dev/null || echo "No tmux sessions"
echo ""
|
||||
|
||||
echo "## Phase Files"
# One line per /tmp/*-session-*.phase file: its first line and the minutes
# since it was last modified.
_found_phase=false
for _pf in /tmp/*-session-*.phase; do
  # An unmatched glob stays literal, so skip anything that is not a file.
  if [ ! -f "$_pf" ]; then
    continue
  fi
  _found_phase=true
  _phase_content=$(head -1 "$_pf" 2>/dev/null || echo "unreadable")
  _phase_now=$(date +%s)
  # A failed stat yields 0, i.e. the age is measured from the epoch.
  _phase_mtime=$(stat -c %Y "$_pf" 2>/dev/null || echo 0)
  _phase_age_min=$(( (_phase_now - _phase_mtime) / 60 ))
  echo " $(basename "$_pf"): ${_phase_content} (${_phase_age_min}min ago)"
done
if [ "$_found_phase" = false ]; then
  echo " None"
fi
echo ""
|
||||
|
||||
# ── Lock Files ────────────────────────────────────────────────────────────
# For each agent lock file in /tmp, prints the PID it records, whether that
# process still exists, and the file's age in minutes.

#######################################
# Report whether a PID read from a lock file names an existing process.
# Arguments: $1 - candidate PID (may be empty or garbage)
# Outputs:   "alive" or "dead" on stdout
# Returns:   0 always
#######################################
_lock_pid_state() {
  local _p=${1:-}
  # Only a plain positive integer may reach kill: 0 or a negative value
  # addresses a process group (kill -0 would succeed and report a false
  # "alive"), and arbitrary text is an error.
  # kill -0 also fails with EPERM for a process owned by another user, so
  # an existing /proc/<pid> directory is accepted as proof of life too.
  if [[ "$_p" =~ ^[1-9][0-9]*$ ]] \
    && { kill -0 "$_p" 2>/dev/null || [ -d "/proc/$_p" ]; }; then
    echo "alive"
  else
    echo "dead"
  fi
}

echo "## Lock Files"
_found_lock=false
for _lf in /tmp/*-poll.lock /tmp/*-run.lock /tmp/dev-agent-*.lock; do
  [ -f "$_lf" ] || continue
  _found_lock=true
  # Use the first line only, with surrounding whitespace removed.
  _pid=$(head -n 1 "$_lf" 2>/dev/null | tr -d '[:space:]' || true)
  _age_min=$(( ($(date +%s) - $(stat -c %Y "$_lf" 2>/dev/null || echo 0)) / 60 ))
  _alive=$(_lock_pid_state "${_pid:-}")
  echo " $(basename "$_lf"): PID=${_pid:-?} ${_alive} age=${_age_min}min"
done
if [ "$_found_lock" = false ]; then
  echo " None"
fi
echo ""
|
||||
|
||||
# ── Agent Logs (last 15 lines each) ──────────────────────────────────────
# For every known agent log that exists under FACTORY_ROOT, prints a heading
# with the minutes since last modification, then the final 15 lines.

echo "## Recent Agent Logs"
_agent_logs=(
  supervisor/supervisor.log
  dev/dev-agent.log
  review/review.log
  gardener/gardener.log
  planner/planner.log
  predictor/predictor.log
  action/action.log
)
for _log in "${_agent_logs[@]}"; do
  _logpath="${FACTORY_ROOT}/${_log}"
  # Logs that do not exist are skipped silently.
  [ -f "$_logpath" ] || continue
  _log_now=$(date +%s)
  _log_mtime=$(stat -c %Y "$_logpath" 2>/dev/null || echo 0)
  _log_age_min=$(( (_log_now - _log_mtime) / 60 ))
  echo "### ${_log} (last modified ${_log_age_min}min ago)"
  tail -15 "$_logpath" 2>/dev/null || echo "(read failed)"
  echo ""
done
|
||||
|
||||
# ── CI Pipelines ──────────────────────────────────────────────────────────
# Prints the pipelines that finished in the last 24 hours (at most 10), then
# the number of pipelines running for over 20 minutes and pending for over
# 30 minutes. `wpdb` is not defined in this file — presumably a SQL client
# wrapper provided by the sourced libraries; confirm.

#######################################
# Run a single-value SQL query through wpdb and print the result with
# whitespace collapsed.
# Arguments: $1 - SQL text
# Outputs:   the collapsed query output, or exactly "?" if wpdb fails
# Returns:   0 always
#######################################
_ci_count() {
  local _raw
  # Capture first and decide afterwards. Piping straight into xargs with a
  # trailing `|| echo "?"` produced a blank line followed by "?" on failure
  # when pipefail is on.
  if _raw=$(wpdb -c "$1" 2>/dev/null); then
    xargs <<<"$_raw"
  else
    echo "?"
  fi
}

echo "## CI Pipelines (${PROJECT_NAME})"

_recent_ci=$(wpdb -A -c "
  SELECT number, status, branch,
    ROUND(EXTRACT(EPOCH FROM (to_timestamp(finished) - to_timestamp(started)))/60)::int as dur_min
  FROM pipelines
  WHERE repo_id = ${WOODPECKER_REPO_ID}
    AND finished > 0
    AND to_timestamp(finished) > now() - interval '24 hours'
  ORDER BY number DESC LIMIT 10;" 2>/dev/null || echo "CI database query failed")
echo "$_recent_ci"

# Running for more than 1200 s (20 min).
_stuck=$(_ci_count "
  SELECT count(*) FROM pipelines
  WHERE repo_id=${WOODPECKER_REPO_ID}
    AND status='running'
    AND EXTRACT(EPOCH FROM now() - to_timestamp(started)) > 1200;")

# Pending for more than 1800 s (30 min) since creation.
_pending=$(_ci_count "
  SELECT count(*) FROM pipelines
  WHERE repo_id=${WOODPECKER_REPO_ID}
    AND status='pending'
    AND EXTRACT(EPOCH FROM now() - to_timestamp(created)) > 1800;")

echo "Stuck (>20min): ${_stuck}"
echo "Pending (>30min): ${_pending}"
echo ""
|
||||
|
||||
# ── Open PRs ──────────────────────────────────────────────────────────────
# Lists up to 10 open pull requests as "#N [branch] title — updated <time>".
# `codeberg_api` is not defined in this file — presumably provided by the
# sourced libraries; confirm.

echo "## Open PRs (${PROJECT_NAME})"
_open_prs=$(codeberg_api GET "/pulls?state=open&limit=10" 2>/dev/null || echo "[]")

# jq exits 0 and prints nothing for an empty list, so its exit status alone
# cannot trigger the fallback line; test the formatted output instead.
_pr_lines=$(jq -r '.[] | "#\(.number) [\(.head.ref)] \(.title) — updated \(.updated_at)"' \
  <<<"$_open_prs" 2>/dev/null || true)
if [ -n "$_pr_lines" ]; then
  printf '%s\n' "$_pr_lines"
else
  echo "No open PRs or query failed"
fi
echo ""
|
||||
|
||||
# ── Backlog + In-Progress ─────────────────────────────────────────────────
# Prints how many open issues carry the backlog, in-progress and blocked
# labels. `codeberg_api` is not defined in this file — presumably provided by
# the sourced libraries; confirm.

# Number of issues requested per label. The count is the length of the
# returned page, so a page size of 1 could only ever report 0 or 1.
# NOTE(review): assumes the server accepts page sizes up to 50 (the
# Gitea/Forgejo default maximum) — confirm for the target instance.
_ISSUE_PAGE_SIZE=50

#######################################
# Count open issues carrying a label.
# Arguments: $1 - label name
# Outputs:   the count; "<n>+" when the page came back full (more may
#            exist); "?" when the request or JSON parsing fails
# Returns:   0 always
#######################################
_issue_count() {
  local _label=$1 _json _n
  _json=$(codeberg_api GET \
    "/issues?state=open&labels=${_label}&type=issues&limit=${_ISSUE_PAGE_SIZE}" \
    2>/dev/null) || { echo "?"; return 0; }
  # Reject non-array bodies: `length` of an error object would otherwise be
  # reported as an issue count.
  _n=$(jq 'if type == "array" then length else error("not a list") end' \
    <<<"$_json" 2>/dev/null) || { echo "?"; return 0; }
  if [[ ! "$_n" =~ ^[0-9]+$ ]]; then
    echo "?"
  elif (( _n >= _ISSUE_PAGE_SIZE )); then
    echo "${_n}+"
  else
    echo "$_n"
  fi
}

echo "## Issue Status (${PROJECT_NAME})"
_backlog_count=$(_issue_count backlog)
_in_progress_count=$(_issue_count in-progress)
_blocked_count=$(_issue_count blocked)
echo "Backlog: ${_backlog_count}, In-progress: ${_in_progress_count}, Blocked: ${_blocked_count}"
echo ""
|
||||
|
||||
# ── Stale Worktrees ───────────────────────────────────────────────────────
# Lists every worktree/review directory left in /tmp with the minutes since
# it was last modified.

echo "## Stale Worktrees"
_found_wt=false
for _wt in /tmp/*-worktree-* /tmp/*-review-*; do
  # Unmatched globs stay literal and plain files may match; directories only.
  if [ -d "$_wt" ]; then
    _found_wt=true
    _wt_now=$(date +%s)
    _wt_mtime=$(stat -c %Y "$_wt" 2>/dev/null || echo 0)
    _wt_age_min=$(( (_wt_now - _wt_mtime) / 60 ))
    echo " $(basename "$_wt"): ${_wt_age_min}min old"
  fi
done
if [ "$_found_wt" = false ]; then
  echo " None"
fi
echo ""
|
||||
|
||||
# ── Pending Escalations ──────────────────────────────────────────────────
# Dumps every non-empty supervisor/escalations-*.jsonl file that is not a
# *.done.jsonl archive, each under a heading with its entry count.

#######################################
# Count the lines of a file, including a final line with no trailing
# newline. `wc -l` counts newline characters, so a file holding a single
# unterminated entry would be reported as 0 and skipped.
# Arguments: $1 - path to the file
# Outputs:   the line count; 0 if the file cannot be read
# Returns:   0 always
#######################################
_jsonl_entry_count() {
  local _n
  _n=$(awk 'END { print NR }' "$1" 2>/dev/null) || _n=0
  echo "${_n:-0}"
}

echo "## Pending Escalations"
_found_esc=false
for _esc_file in "${FACTORY_ROOT}/supervisor/escalations-"*.jsonl; do
  [ -f "$_esc_file" ] || continue
  # Files ending in .done.jsonl are skipped.
  [[ "$_esc_file" == *.done.jsonl ]] && continue
  _esc_count=$(_jsonl_entry_count "$_esc_file")
  [ "${_esc_count:-0}" -gt 0 ] || continue
  _found_esc=true
  echo "### $(basename "$_esc_file") (${_esc_count} entries)"
  cat "$_esc_file"
  echo ""
done
if [ "$_found_esc" = false ]; then
  echo " None"
fi
echo ""
|
||||
|
||||
# ── Escalation Replies from Matrix ────────────────────────────────────────
# Shows the content of /tmp/supervisor-escalation-reply when it exists and is
# non-empty. This script only reads the file; it does not remove it.

echo "## Escalation Replies (from Matrix)"
_reply_file=/tmp/supervisor-escalation-reply
if [ ! -s "$_reply_file" ]; then
  echo " None"
else
  cat "$_reply_file"
  echo ""
  echo "(This reply will be consumed after this run)"
fi
echo ""
|
||||
115
supervisor/supervisor-run.sh
Executable file
115
supervisor/supervisor-run.sh
Executable file
|
|
@ -0,0 +1,115 @@
|
|||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# supervisor-run.sh — Cron wrapper: supervisor execution via Claude + formula
|
||||
#
|
||||
# Runs every 20 minutes (or on-demand). Guards against concurrent runs and
|
||||
# low memory. Collects metrics via preflight.sh, then creates a tmux session
|
||||
# with Claude (sonnet) reading formulas/run-supervisor.toml.
|
||||
#
|
||||
# Replaces supervisor-poll.sh (bash orchestrator + claude -p one-shot) with
|
||||
# formula-driven interactive Claude session matching the planner/predictor
|
||||
# pattern.
|
||||
#
|
||||
# Usage:
|
||||
# supervisor-run.sh [projects/disinto.toml] # project config (default: disinto)
|
||||
#
|
||||
# Cron: */20 * * * * cd /path/to/dark-factory && bash supervisor/supervisor-run.sh
|
||||
# =============================================================================
|
||||
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -euo pipefail

# Absolute directory containing this script, and its parent directory.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
FACTORY_ROOT=$(dirname "$SCRIPT_DIR")

# Project config comes from the first argument; the default is disinto.
# Exported before the libraries are sourced.
PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}"
export PROJECT_TOML

# shellcheck source=../lib/env.sh
source "$FACTORY_ROOT/lib/env.sh"
# shellcheck source=../lib/agent-session.sh
source "$FACTORY_ROOT/lib/agent-session.sh"
# shellcheck source=../lib/formula-session.sh
source "$FACTORY_ROOT/lib/formula-session.sh"
|
||||
|
||||
# Run log, kept next to this script.
LOG_FILE="${SCRIPT_DIR}/supervisor.log"

# Per-project session name, phase file and scratch file.
# shellcheck disable=SC2034 # consumed by run_formula_and_monitor
SESSION_NAME="supervisor-${PROJECT_NAME}"
PHASE_FILE="/tmp/supervisor-session-${PROJECT_NAME}.phase"

# shellcheck disable=SC2034 # read by monitor_phase_loop in lib/agent-session.sh
PHASE_POLL_INTERVAL=15

SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md"

#######################################
# Append one UTC-timestamped line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: the message words, joined into one line
#######################################
log() {
  local _stamp
  _stamp=$(date -u +%Y-%m-%dT%H:%M:%S)
  printf '[%sZ] %s\n' "$_stamp" "$*" >> "$LOG_FILE"
}
|
||||
|
||||
# ── Guards ────────────────────────────────────────────────────────────────
# Both helpers come from the sourced libraries. Per the file header they
# guard against concurrent runs and low memory — exact behaviour not visible
# here; confirm in lib/.
acquire_cron_lock "/tmp/supervisor-run.lock"
check_memory 2000

log "--- Supervisor run start ---"

# ── Collect pre-flight metrics ────────────────────────────────────────────
# stderr is merged into the capture. A failing preflight is logged but does
# not stop the run; whatever it printed before failing is still used.
log "Running preflight.sh"
PREFLIGHT_OUTPUT=""
if ! PREFLIGHT_OUTPUT=$(bash "$SCRIPT_DIR/preflight.sh" "$PROJECT_TOML" 2>&1); then
  log "WARNING: preflight.sh failed, continuing with partial data"
else
  log "Preflight collected ($(echo "$PREFLIGHT_OUTPUT" | wc -l) lines)"
fi
|
||||
|
||||
# ── Consume escalation replies ────────────────────────────────────────────
# The reply file is renamed to a per-process name before it is read, which
# frees the original path so matrix_listener can write a new reply.
ESCALATION_REPLY=""
_reply_src="/tmp/supervisor-escalation-reply"
_reply_tmp="${_reply_src}.consumed.$$"
# Proceed only when a non-empty reply exists and the rename succeeded.
if [ -s "$_reply_src" ] && mv "$_reply_src" "$_reply_tmp" 2>/dev/null; then
  ESCALATION_REPLY=$(cat "$_reply_tmp")
  rm -f "$_reply_tmp"
  log "Consumed escalation reply: $(echo "$ESCALATION_REPLY" | head -1)"
fi
|
||||
|
||||
# ── Load formula + context ───────────────────────────────────────────────
# These helpers come from the sourced libraries. The prompt below reads
# FORMULA_CONTENT, CONTEXT_BLOCK and PROMPT_FOOTER — presumably set by
# load_formula, build_context_block and build_prompt_footer; confirm in lib/.
load_formula "$FACTORY_ROOT/formulas/run-supervisor.toml"
build_context_block AGENTS.md

# ── Read scratch file (compaction survival) ───────────────────────────────
SCRATCH_CONTEXT=$(read_scratch_context "$SCRATCH_FILE")
SCRATCH_INSTRUCTION=$(build_scratch_instruction "$SCRATCH_FILE")

# ── Build prompt ─────────────────────────────────────────────────────────
build_prompt_footer

# Optional sections are assembled first. Each expands to nothing when its
# source variable is empty, so the prompt carries no empty headings.
_escalation_section=""
if [ -n "${ESCALATION_REPLY:-}" ]; then
  _escalation_section="
## Escalation Reply (from Matrix — human message)
${ESCALATION_REPLY}

Act on this reply in the decide-actions step.
"
fi

_scratch_section=""
if [ -n "${SCRATCH_CONTEXT:-}" ]; then
  _scratch_section="${SCRATCH_CONTEXT}
"
fi

# shellcheck disable=SC2034 # consumed by run_formula_and_monitor
PROMPT="You are the supervisor agent for ${CODEBERG_REPO}. Work through the formula below. You MUST write PHASE:done to '${PHASE_FILE}' when finished — the orchestrator will time you out if you return to the prompt without signalling.

You have full shell access and --dangerously-skip-permissions.
Fix what you can. Escalate what you cannot. Do NOT ask permission — act first, report after.

## Pre-flight metrics (collected $(date -u +%H:%M) UTC)
${PREFLIGHT_OUTPUT}
${_escalation_section}
## Project context
${CONTEXT_BLOCK}
${_scratch_section}
## Formula
${FORMULA_CONTENT}

${SCRATCH_INSTRUCTION}

${PROMPT_FOOTER}"
|
||||
|
||||
# ── Run session ──────────────────────────────────────────────────────────
# CLAUDE_MODEL is exported before the run helper (from the sourced
# libraries) is called with the agent name and the value 1200.
export CLAUDE_MODEL="sonnet"
run_formula_and_monitor "supervisor" 1200

# ── Cleanup scratch file on normal exit ──────────────────────────────────
# FINAL_PHASE already set by run_formula_and_monitor. The scratch file is
# removed only after a clean PHASE:done; in every other case it is kept.
case "${FINAL_PHASE:-}" in
  "PHASE:done")
    rm -f "$SCRATCH_FILE"
    ;;
esac
|
||||
Loading…
Add table
Add a link
Reference in a new issue