diff --git a/.woodpecker/agent-smoke.sh b/.woodpecker/agent-smoke.sh index 40d95b8..a6a1bda 100644 --- a/.woodpecker/agent-smoke.sh +++ b/.woodpecker/agent-smoke.sh @@ -91,7 +91,7 @@ echo "=== 2/2 Function resolution ===" # Functions provided by shared lib files (available to all agent scripts via source) LIB_FUNS=$( - for f in lib/agent-session.sh lib/env.sh lib/ci-helpers.sh lib/load-project.sh lib/file-action-issue.sh; do + for f in lib/agent-session.sh lib/env.sh lib/ci-helpers.sh lib/load-project.sh lib/file-action-issue.sh lib/formula-session.sh; do if [ -f "$f" ]; then get_fns "$f"; fi done | sort -u ) @@ -162,7 +162,7 @@ check_script gardener/gardener-poll.sh check_script gardener/gardener-run.sh check_script review/review-pr.sh check_script review/review-poll.sh -check_script planner/planner-poll.sh +check_script planner/planner-run.sh check_script supervisor/supervisor-poll.sh check_script supervisor/update-prompt.sh check_script vault/vault-agent.sh diff --git a/AGENTS.md b/AGENTS.md index 646752e..d9d9249 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -291,6 +291,7 @@ sourced as needed. | `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `CODEBERG_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, Matrix config, etc.). | env.sh (when `PROJECT_TOML` is set), supervisor-poll (per-project iteration) | | `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` patterns. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll, supervisor-poll | | `lib/matrix_listener.sh` | Long-poll Matrix sync daemon. Dispatches thread replies to the correct agent via well-known files (`/tmp/{agent}-escalation-reply`). Handles supervisor, gardener, dev, review, vault, and action reply routing. Run as systemd service. 
| Standalone daemon | +| `lib/formula-session.sh` | `acquire_cron_lock()`, `check_memory()`, `load_formula()`, `build_context_block()`, `start_formula_session()`, `formula_phase_callback()` — shared helpers for formula-driven cron agents (lock, memory guard, formula loading, tmux session, crash recovery). | planner-run.sh, gardener-agent.sh | | `lib/file-action-issue.sh` | `file_action_issue()` — dedup check, label lookup, and issue creation for formula-driven cron wrappers. Sets `FILED_ISSUE_NUM` on success. | gardener-run.sh | | `lib/agent-session.sh` | Shared tmux + Claude session helpers: `create_agent_session()`, `inject_formula()`, `agent_wait_for_claude_ready()`, `agent_inject_into_session()`, `agent_kill_session()`, `monitor_phase_loop()`, `read_phase()`. `create_agent_session(session, workdir, [phase_file])` optionally installs a PostToolUse hook (matcher `Bash\|Write`) that detects phase file writes in real-time — when Claude writes to the phase file, the hook writes a marker so `monitor_phase_loop` reacts on the next poll instead of waiting for mtime changes. Also installs a StopFailure hook (matcher `rate_limit\|server_error\|authentication_failed\|billing_error`) that writes `PHASE:failed` with an `api_error` reason to the phase file and touches the phase-changed marker, so the orchestrator discovers API errors within one poll cycle instead of waiting for idle timeout. When `MATRIX_THREAD_ID` is exported, also installs a Stop hook (`on-stop-matrix.sh`) that streams each Claude response to the Matrix thread. `monitor_phase_loop` sets `_MONITOR_LOOP_EXIT` to one of: `done`, `idle_timeout`, `idle_prompt` (Claude returned to `❯` for 3 consecutive polls without writing any phase — callback invoked with `PHASE:failed`, session already dead), `crashed`, or a `PHASE:*` string. Agents must handle `idle_prompt` in both their callback and their post-loop exit handler. 
| dev-agent.sh, gardener-agent.sh, action-agent.sh | diff --git a/gardener/gardener-agent.sh b/gardener/gardener-agent.sh index 3a3f658..699e8be 100644 --- a/gardener/gardener-agent.sh +++ b/gardener/gardener-agent.sh @@ -29,6 +29,8 @@ export PROJECT_TOML="${1:-}" source "$FACTORY_ROOT/lib/env.sh" # shellcheck source=../lib/agent-session.sh source "$FACTORY_ROOT/lib/agent-session.sh" +# shellcheck source=../lib/formula-session.sh +source "$FACTORY_ROOT/lib/formula-session.sh" LOG_FILE="$SCRIPT_DIR/gardener.log" SESSION_NAME="gardener-${PROJECT_NAME}" @@ -275,32 +277,15 @@ matrix_send "gardener" "🌱 Gardener session started for ${CODEBERG_REPO}" 2>/d # ── Phase monitoring loop ───────────────────────────────────────────────── log "Monitoring phase file: ${PHASE_FILE}" -GARDENER_CRASH_COUNT=0 +_FORMULA_CRASH_COUNT=0 gardener_phase_callback() { - local phase="$1" - log "phase: ${phase}" - case "$phase" in - PHASE:crashed) - if [ "$GARDENER_CRASH_COUNT" -gt 0 ]; then - log "ERROR: session crashed again after recovery — giving up" - return 0 - fi - GARDENER_CRASH_COUNT=$((GARDENER_CRASH_COUNT + 1)) - log "WARNING: tmux session died unexpectedly — attempting recovery" - rm -f "$RESULT_FILE" - touch "$RESULT_FILE" - if create_agent_session "${_MONITOR_SESSION:-$SESSION_NAME}" "$PROJECT_REPO_ROOT" "$PHASE_FILE" 2>/dev/null; then - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" "$PROMPT" - log "Recovery session started" - else - log "ERROR: could not restart session after crash" - fi - ;; - PHASE:done|PHASE:failed|PHASE:needs_human|PHASE:merged) - agent_kill_session "${_MONITOR_SESSION:-$SESSION_NAME}" - ;; - esac + # Gardener-specific cleanup: reset the result file only when a recovery attempt will follow (first crash); on a repeat crash formula_phase_callback gives up, so the file is left untouched + if [ "$1" = "PHASE:crashed" ] && [ "${_FORMULA_CRASH_COUNT:-0}" -eq 0 ]; then + rm -f "$RESULT_FILE" + touch "$RESULT_FILE" + fi + formula_phase_callback "$1" } monitor_phase_loop "$PHASE_FILE" 7200 "gardener_phase_callback" diff --git a/lib/formula-session.sh b/lib/formula-session.sh new file mode 100644 index 0000000..3d73983 
--- /dev/null +++ b/lib/formula-session.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# formula-session.sh — Shared helpers for formula-driven cron agents +# +# Provides reusable functions for the common cron-wrapper + tmux-session +# pattern used by planner-run.sh and gardener-agent.sh. +# +# Functions: +# acquire_cron_lock LOCK_FILE — PID lock with stale cleanup +# check_memory [MIN_MB] — skip if available RAM too low +# load_formula FORMULA_FILE — sets FORMULA_CONTENT +# build_context_block FILE [FILE ...] — sets CONTEXT_BLOCK +# start_formula_session SESSION WORKDIR PHASE_FILE — create tmux + claude +# formula_phase_callback PHASE — standard crash-recovery callback +# +# Requires: lib/agent-session.sh sourced first (for create_agent_session, +# agent_kill_session, agent_inject_into_session). The caller must also define a log() function before invoking any helper; every helper here calls it and this file does not define it. +# Globals used by formula_phase_callback: SESSION_NAME, PHASE_FILE, +# PROJECT_REPO_ROOT, PROMPT (set by the calling script); _MONITOR_SESSION, when set, is used instead of SESSION_NAME. build_context_block also reads PROJECT_REPO_ROOT. + +# ── Cron guards ────────────────────────────────────────────────────────── + +# acquire_cron_lock LOCK_FILE +# Acquires a PID lock. Exits 0 if the PID recorded in LOCK_FILE is still alive; otherwise a leftover lock file is removed and rewritten with the current PID. +# Sets an EXIT trap to clean up the lock file. The path is kept in the global _CRON_LOCK_FILE because the single-quoted trap expands it at exit time. The trap replaces any EXIT trap installed earlier. +acquire_cron_lock() { + _CRON_LOCK_FILE="$1" + if [ -f "$_CRON_LOCK_FILE" ]; then + local lock_pid + lock_pid=$(cat "$_CRON_LOCK_FILE" 2>/dev/null || true) + if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then + log "run: already running (PID $lock_pid)" + exit 0 + fi + rm -f "$_CRON_LOCK_FILE" + fi + echo $$ > "$_CRON_LOCK_FILE" + trap 'rm -f "$_CRON_LOCK_FILE"' EXIT +} + +# check_memory [MIN_MB] +# Exits 0 (skip) if available memory is below MIN_MB (default 2000). The value is field 7 of the "Mem:" row printed by `free -m`; an empty reading counts as 0 and therefore also skips. 
+check_memory() { + local min_mb="${1:-2000}" + local avail_mb + avail_mb=$(free -m | awk '/Mem:/{print $7}') + if [ "${avail_mb:-0}" -lt "$min_mb" ]; then + log "run: skipping — only ${avail_mb}MB available (need ${min_mb})" + exit 0 + fi +} + +# ── Formula loading ────────────────────────────────────────────────────── + +# load_formula FORMULA_FILE +# Reads the formula file verbatim (no TOML parsing here) into the global FORMULA_CONTENT. Logs an error and exits 1 if the file does not exist. +load_formula() { + local formula_file="$1" + if [ ! -f "$formula_file" ]; then + log "ERROR: formula not found: $formula_file" + exit 1 + fi + # shellcheck disable=SC2034 # consumed by the calling script + FORMULA_CONTENT=$(cat "$formula_file") +} + +# build_context_block FILE [FILE ...] +# Resets the global CONTEXT_BLOCK, then appends each FILE (path relative to $PROJECT_REPO_ROOT) under a "### FILE" heading. Files that do not exist are skipped without logging. +build_context_block() { + CONTEXT_BLOCK="" + local ctx ctx_path + for ctx in "$@"; do + ctx_path="${PROJECT_REPO_ROOT}/${ctx}" + if [ -f "$ctx_path" ]; then + CONTEXT_BLOCK="${CONTEXT_BLOCK} +### ${ctx} +$(cat "$ctx_path") +" + fi + done +} + +# ── Session management ─────────────────────────────────────────────────── + +# start_formula_session SESSION WORKDIR PHASE_FILE +# Kills any existing session named SESSION, deletes PHASE_FILE, then creates a new session via create_agent_session. +# Returns 0 on success; logs an error and returns 1 if create_agent_session fails. +start_formula_session() { + local session="$1" workdir="$2" phase_file="$3" + agent_kill_session "$session" + rm -f "$phase_file" + log "Creating tmux session: ${session}" + if ! create_agent_session "$session" "$workdir" "$phase_file"; then + log "ERROR: failed to create tmux session ${session}" + return 1 + fi +} + +# formula_phase_callback PHASE +# Standard crash-recovery phase callback for formula sessions. PHASE:done, PHASE:failed, PHASE:needs_human and PHASE:merged kill the session; other phases are only logged. +# Requires globals: SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT. _MONITOR_SESSION, when set, is used instead of SESSION_NAME. +# Uses _FORMULA_CRASH_COUNT (auto-initialized) for single-retry limit: the first PHASE:crashed recreates the session and re-injects PROMPT, any later one is logged and ignored. 
+# shellcheck disable=SC2154 # SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT set by caller +formula_phase_callback() { + local phase="$1" + log "phase: ${phase}" + case "$phase" in + PHASE:crashed) + if [ "${_FORMULA_CRASH_COUNT:-0}" -gt 0 ]; then + log "ERROR: session crashed again after recovery — giving up" + return 0 + fi + _FORMULA_CRASH_COUNT=$(( ${_FORMULA_CRASH_COUNT:-0} + 1 )) + log "WARNING: tmux session died unexpectedly — attempting recovery" + if create_agent_session "${_MONITOR_SESSION:-$SESSION_NAME}" "$PROJECT_REPO_ROOT" "$PHASE_FILE" 2>/dev/null; then + agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" "$PROMPT" + log "Recovery session started" + else + log "ERROR: could not restart session after crash" + fi + ;; + PHASE:done|PHASE:failed|PHASE:needs_human|PHASE:merged) + agent_kill_session "${_MONITOR_SESSION:-$SESSION_NAME}" + ;; + esac +} diff --git a/planner/planner-run.sh b/planner/planner-run.sh index dbc913f..2ad01cc 100755 --- a/planner/planner-run.sh +++ b/planner/planner-run.sh @@ -20,9 +20,10 @@ export PROJECT_TOML="$FACTORY_ROOT/projects/disinto.toml" source "$FACTORY_ROOT/lib/env.sh" # shellcheck source=../lib/agent-session.sh source "$FACTORY_ROOT/lib/agent-session.sh" +# shellcheck source=../lib/formula-session.sh +source "$FACTORY_ROOT/lib/formula-session.sh" LOG_FILE="$SCRIPT_DIR/planner.log" -LOCK_FILE="/tmp/planner-run.lock" SESSION_NAME="planner-${PROJECT_NAME}" PHASE_FILE="/tmp/planner-session-${PROJECT_NAME}.phase" @@ -31,46 +32,15 @@ PHASE_POLL_INTERVAL=15 log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } -# ── Lock ────────────────────────────────────────────────────────────────── -if [ -f "$LOCK_FILE" ]; then - LOCK_PID=$(cat "$LOCK_FILE" 2>/dev/null || true) - if [ -n "$LOCK_PID" ] && kill -0 "$LOCK_PID" 2>/dev/null; then - log "run: planner running (PID $LOCK_PID)" - exit 0 - fi - rm -f "$LOCK_FILE" -fi -echo $$ > "$LOCK_FILE" -trap 'rm -f "$LOCK_FILE"' EXIT - -# ── Memory guard 
────────────────────────────────────────────────────────── -AVAIL_MB=$(free -m | awk '/Mem:/{print $7}') -if [ "${AVAIL_MB:-0}" -lt 2000 ]; then - log "run: skipping — only ${AVAIL_MB}MB available (need 2000)" - exit 0 -fi +# ── Guards ──────────────────────────────────────────────────────────────── +acquire_cron_lock "/tmp/planner-run.lock" +check_memory 2000 log "--- Planner run start ---" -# ── Load formula ───────────────────────────────────────────────────────── -FORMULA_FILE="$FACTORY_ROOT/formulas/run-planner.toml" -if [ ! -f "$FORMULA_FILE" ]; then - log "ERROR: formula not found: $FORMULA_FILE" - exit 1 -fi -FORMULA_CONTENT=$(cat "$FORMULA_FILE") - -# ── Read context files ─────────────────────────────────────────────────── -CONTEXT_BLOCK="" -for ctx in VISION.md AGENTS.md RESOURCES.md; do - ctx_path="${PROJECT_REPO_ROOT}/${ctx}" - if [ -f "$ctx_path" ]; then - CONTEXT_BLOCK="${CONTEXT_BLOCK} -### ${ctx} -$(cat "$ctx_path") -" - fi -done +# ── Load formula + context ─────────────────────────────────────────────── +load_formula "$FACTORY_ROOT/formulas/run-planner.toml" +build_context_block VISION.md AGENTS.md RESOURCES.md # ── Read planner memory ───────────────────────────────────────────────── MEMORY_BLOCK="" @@ -113,15 +83,9 @@ When all work is done: On unrecoverable error: printf 'PHASE:failed\nReason: %s\n' 'describe error' > '${PHASE_FILE}'" -# ── Reset phase file + kill stale session ──────────────────────────────── -agent_kill_session "$SESSION_NAME" -rm -f "$PHASE_FILE" - # ── Create tmux session ───────────────────────────────────────────────── -log "Creating tmux session: ${SESSION_NAME}" export CLAUDE_MODEL="opus" -if ! create_agent_session "$SESSION_NAME" "$PROJECT_REPO_ROOT" "$PHASE_FILE"; then - log "ERROR: failed to create tmux session ${SESSION_NAME}" +if ! 
start_formula_session "$SESSION_NAME" "$PROJECT_REPO_ROOT" "$PHASE_FILE"; then exit 1 fi @@ -131,33 +95,9 @@ matrix_send "planner" "Planner session started for ${CODEBERG_REPO}" 2>/dev/null # ── Phase monitoring loop ──────────────────────────────────────────────── log "Monitoring phase file: ${PHASE_FILE}" -PLANNER_CRASH_COUNT=0 +_FORMULA_CRASH_COUNT=0 -planner_phase_callback() { - local phase="$1" - log "phase: ${phase}" - case "$phase" in - PHASE:crashed) - if [ "$PLANNER_CRASH_COUNT" -gt 0 ]; then - log "ERROR: session crashed again after recovery — giving up" - return 0 - fi - PLANNER_CRASH_COUNT=$((PLANNER_CRASH_COUNT + 1)) - log "WARNING: tmux session died unexpectedly — attempting recovery" - if create_agent_session "${_MONITOR_SESSION:-$SESSION_NAME}" "$PROJECT_REPO_ROOT" "$PHASE_FILE" 2>/dev/null; then - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" "$PROMPT" - log "Recovery session started" - else - log "ERROR: could not restart session after crash" - fi - ;; - PHASE:done|PHASE:failed|PHASE:needs_human|PHASE:merged) - agent_kill_session "${_MONITOR_SESSION:-$SESSION_NAME}" - ;; - esac -} - -monitor_phase_loop "$PHASE_FILE" 7200 "planner_phase_callback" +monitor_phase_loop "$PHASE_FILE" 7200 "formula_phase_callback" FINAL_PHASE=$(read_phase "$PHASE_FILE") log "Final phase: ${FINAL_PHASE:-none}"