diff --git a/dev/dev-agent.sh b/dev/dev-agent.sh index ac0693f..930b624 100755 --- a/dev/dev-agent.sh +++ b/dev/dev-agent.sh @@ -22,6 +22,7 @@ set -euo pipefail # Load shared environment source "$(dirname "$0")/../lib/env.sh" +source "$(dirname "$0")/../lib/agent-session.sh" # Auto-pull factory code to pick up merged fixes before any logic runs git -C "$FACTORY_ROOT" pull --ff-only origin main 2>/dev/null || true @@ -50,7 +51,7 @@ IMPL_SUMMARY_FILE="/tmp/dev-impl-summary-${PROJECT_NAME}-${ISSUE}.txt" THREAD_FILE="/tmp/dev-thread-${PROJECT_NAME}-${ISSUE}" # Timing -PHASE_POLL_INTERVAL=30 # seconds between phase checks +export PHASE_POLL_INTERVAL=30 # seconds between phase checks (read by agent-session.sh) IDLE_TIMEOUT=7200 # 2h: kill session if phase stale this long CI_POLL_TIMEOUT=1800 # 30min max for CI to complete REVIEW_POLL_TIMEOUT=10800 # 3h max wait for review @@ -65,74 +66,6 @@ CI_FIX_COUNT=0 REVIEW_ROUND=0 PR_NUMBER="" -# --- Logging --- -log() { - printf '[%s] #%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$ISSUE" "$*" >> "$LOGFILE" -} - -status() { - printf '[%s] dev-agent #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$ISSUE" "$*" > "$STATUSFILE" - log "$*" -} - -notify() { - local thread_id="" - [ -f "${THREAD_FILE}" ] && thread_id=$(cat "$THREAD_FILE" 2>/dev/null || true) - matrix_send "dev" "🔧 #${ISSUE}: $*" "${thread_id}" 2>/dev/null || true -} - -# notify_ctx — Send rich notification with HTML links/context into the issue thread -# Falls back to plain matrix_send (which registers a thread root) when no thread exists. 
-notify_ctx() { - local plain="$1" html="$2" - local thread_id="" - [ -f "${THREAD_FILE}" ] && thread_id=$(cat "$THREAD_FILE" 2>/dev/null || true) - if [ -n "$thread_id" ]; then - matrix_send_ctx "dev" "🔧 #${ISSUE}: ${plain}" "🔧 #${ISSUE}: ${html}" "${thread_id}" 2>/dev/null || true - else - # No thread — fall back to plain send so a thread root is registered - matrix_send "dev" "🔧 #${ISSUE}: ${plain}" "" "${ISSUE}" 2>/dev/null || true - fi -} - -# --- Phase helpers --- -read_phase() { - { cat "$PHASE_FILE" 2>/dev/null || true; } | head -1 | tr -d '[:space:]' -} - -wait_for_claude_ready() { - local timeout="${1:-120}" - local elapsed=0 - while [ "$elapsed" -lt "$timeout" ]; do - # Claude Code shows ❯ when ready for input - if tmux capture-pane -t "${SESSION_NAME}" -p 2>/dev/null | grep -q '❯'; then - return 0 - fi - sleep 2 - elapsed=$((elapsed + 2)) - done - log "WARNING: claude not ready after ${timeout}s — proceeding anyway" - return 1 -} - -inject_into_session() { - local text="$1" - local tmpfile - wait_for_claude_ready 120 - tmpfile=$(mktemp /tmp/tmux-inject-XXXXXX) - printf '%s' "$text" > "$tmpfile" - tmux load-buffer -b "inject-${ISSUE}" "$tmpfile" - tmux paste-buffer -t "${SESSION_NAME}" -b "inject-${ISSUE}" - sleep 0.5 - tmux send-keys -t "${SESSION_NAME}" "" Enter - tmux delete-buffer -b "inject-${ISSUE}" 2>/dev/null || true - rm -f "$tmpfile" -} - -kill_tmux_session() { - tmux kill-session -t "${SESSION_NAME}" 2>/dev/null || true -} - # --- Refusal comment helper (used in PHASE:failed handler) --- post_refusal_comment() { local emoji="$1" title="$2" body="$3" @@ -855,37 +788,15 @@ fi # ============================================================================= status "creating tmux session: ${SESSION_NAME}" -# Reuse existing session if still alive (orchestrator may have been restarted) -if ! 
tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then - # Kill any stale entry - tmux kill-session -t "${SESSION_NAME}" 2>/dev/null || true - - # Create new detached session running interactive claude in the worktree - tmux new-session -d -s "${SESSION_NAME}" -c "${WORKTREE}" \ - "claude --dangerously-skip-permissions" - - if ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then - log "ERROR: failed to create tmux session ${SESSION_NAME}" - cleanup_labels - cleanup_worktree - exit 1 - fi - log "tmux session created: ${SESSION_NAME}" - - # Wait for Claude to be ready (polls for ❯ prompt) - if ! wait_for_claude_ready 120; then - log "ERROR: claude did not become ready in ${SESSION_NAME}" - kill_tmux_session - cleanup_labels - cleanup_worktree - exit 1 - fi -else - log "reusing existing tmux session: ${SESSION_NAME}" +if ! create_agent_session "${SESSION_NAME}" "${WORKTREE}"; then + log "ERROR: failed to create agent session" + cleanup_labels + cleanup_worktree + exit 1 fi -# Send initial prompt (inject_into_session waits for claude to be ready) -inject_into_session "$INITIAL_PROMPT" +# Send initial prompt into the session +inject_formula "${SESSION_NAME}" "${INITIAL_PROMPT}" log "initial prompt sent to tmux session" # Signal to dev-poll.sh that we're running (session is up) @@ -907,103 +818,15 @@ notify "tmux session ${SESSION_NAME} started for issue #${ISSUE}: ${ISSUE_TITLE} # ============================================================================= # PHASE MONITORING LOOP # ============================================================================= -status "monitoring phase: ${PHASE_FILE}" -LAST_PHASE_MTIME=0 -IDLE_ELAPSED=0 - -while true; do - sleep "$PHASE_POLL_INTERVAL" - IDLE_ELAPSED=$(( IDLE_ELAPSED + PHASE_POLL_INTERVAL )) - - # --- Session health check --- - if ! 
tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then - CURRENT_PHASE=$(read_phase) - case "$CURRENT_PHASE" in - PHASE:done|PHASE:failed) - # Expected terminal phases — fall through to phase handler below - ;; - *) - log "WARNING: tmux session died unexpectedly (phase: ${CURRENT_PHASE:-none})" - notify "session crashed (phase: ${CURRENT_PHASE:-none}), attempting recovery" - - # Attempt crash recovery: restart session with recovery context - CRASH_DIFF=$(git -C "${WORKTREE}" diff "origin/${PRIMARY_BRANCH}..HEAD" --stat 2>/dev/null | head -20 || echo "(no diff)") - RECOVERY_MSG="## Session Recovery - -Your Claude Code session for issue #${ISSUE} was interrupted unexpectedly. -The git worktree at ${WORKTREE} is intact — your changes survived. - -Last known phase: ${CURRENT_PHASE:-unknown} - -Work so far: -${CRASH_DIFF} - -Run: git log --oneline -5 && git status -Then resume from the last phase following the original phase protocol. -Phase file: ${PHASE_FILE}" - - if tmux new-session -d -s "${SESSION_NAME}" -c "${WORKTREE}" \ - "claude --dangerously-skip-permissions" 2>/dev/null; then - inject_into_session "$RECOVERY_MSG" - log "recovery session started" - IDLE_ELAPSED=0 - else - log "ERROR: could not restart session after crash" - notify "session crashed and could not recover — needs human attention" - cleanup_labels - break - fi - continue - ;; - esac - fi - - # --- Check phase file for changes --- - PHASE_MTIME=$(stat -c %Y "$PHASE_FILE" 2>/dev/null || echo 0) - CURRENT_PHASE=$(read_phase) - - if [ -z "$CURRENT_PHASE" ] || [ "$PHASE_MTIME" -le "$LAST_PHASE_MTIME" ]; then - # No phase change — check idle timeout - if [ "$IDLE_ELAPSED" -ge "$IDLE_TIMEOUT" ]; then - log "TIMEOUT: no phase update for ${IDLE_TIMEOUT}s — killing session" - notify_ctx \ - "session idle for 2h — killed. Escalating to gardener." \ - "session idle for 2h — killed. 
Escalating to gardener.${PR_NUMBER:+ PR #${PR_NUMBER}}" - kill_tmux_session - - # Escalate: write to project-suffixed escalation file so gardener picks it up - echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"idle_timeout\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ - >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" - - # Restore labels: remove in-progress, add backlog - cleanup_labels - curl -sf -X POST \ - -H "Authorization: token ${CODEBERG_TOKEN}" \ - -H "Content-Type: application/json" \ - "${API}/issues/${ISSUE}/labels" \ - -d '{"labels":["backlog"]}' >/dev/null 2>&1 || true - - CLAIMED=false # Don't unclaim again in cleanup() - if [ -n "${PR_NUMBER:-}" ]; then - log "keeping worktree (PR #${PR_NUMBER} still open)" - else - cleanup_worktree - fi - rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" - break - fi - continue - fi - - # Phase changed — handle it - LAST_PHASE_MTIME="$PHASE_MTIME" - IDLE_ELAPSED=0 - log "phase: ${CURRENT_PHASE}" - status "${CURRENT_PHASE}" +# _on_phase_change — Phase dispatch callback for monitor_phase_loop +# Receives the current phase as $1. +# Returns 0 to continue the loop, 1 to break (terminal phase reached). +_on_phase_change() { + local phase="$1" # ── PHASE: awaiting_ci ────────────────────────────────────────────────────── - if [ "$CURRENT_PHASE" = "PHASE:awaiting_ci" ]; then + if [ "$phase" = "PHASE:awaiting_ci" ]; then # Create PR if not yet created if [ -z "${PR_NUMBER:-}" ]; then @@ -1053,13 +876,13 @@ Phase file: ${PHASE_FILE}" else log "ERROR: PR creation got 409 but no existing PR found" inject_into_session "ERROR: Could not create PR (HTTP 409, no existing PR found). Check the Codeberg API. Retry by writing PHASE:awaiting_ci again after verifying the branch was pushed." - continue + return 0 fi else log "ERROR: PR creation failed (HTTP ${PR_HTTP_CODE})" notify "failed to create PR (HTTP ${PR_HTTP_CODE})" inject_into_session "ERROR: Could not create PR (HTTP ${PR_HTTP_CODE}). 
Check branch was pushed: git push origin ${BRANCH}. Then write PHASE:awaiting_ci again." - continue + return 0 fi fi @@ -1068,7 +891,7 @@ Phase file: ${PHASE_FILE}" log "no CI configured — treating as passed" inject_into_session "CI passed on PR #${PR_NUMBER} (no CI configured for this project). Write PHASE:awaiting_review to the phase file, then stop and wait for review feedback." - continue + return 0 fi # Poll CI until done or timeout @@ -1103,7 +926,7 @@ Write PHASE:awaiting_review to the phase file, then stop and wait for review fee log "TIMEOUT: CI didn't complete in ${CI_POLL_TIMEOUT}s" notify "CI timeout on PR #${PR_NUMBER}" inject_into_session "CI TIMEOUT: CI did not complete within 30 minutes for PR #${PR_NUMBER} (SHA: ${CI_CURRENT_SHA:0:7}). This may be an infrastructure issue. Write PHASE:needs_human if you cannot proceed." - continue + return 0 fi log "CI: ${CI_STATE}" @@ -1145,7 +968,7 @@ Write PHASE:awaiting_review to the phase file, then stop and wait for review fee # Do NOT update LAST_PHASE_MTIME here — let the main loop detect the fresh mtime touch "$PHASE_FILE" CI_CURRENT_SHA=$(git -C "${WORKTREE}" rev-parse HEAD 2>/dev/null || true) - continue + return 0 fi CI_FIX_COUNT=$(( CI_FIX_COUNT + 1 )) @@ -1159,7 +982,7 @@ Write PHASE:awaiting_review to the phase file, then stop and wait for review fee "CI exhausted after ${CI_FIX_COUNT} attempts on PR #${PR_NUMBER} | Pipeline
Step: ${FAILED_STEP:-unknown} — escalated to supervisor" printf 'PHASE:failed\nReason: ci_exhausted after %d attempts\n' "$CI_FIX_COUNT" > "$PHASE_FILE" # Do NOT update LAST_PHASE_MTIME here — let the main loop detect PHASE:failed - continue + return 0 fi CI_ERROR_LOG="" @@ -1199,7 +1022,7 @@ Instructions: fi # ── PHASE: awaiting_review ────────────────────────────────────────────────── - elif [ "$CURRENT_PHASE" = "PHASE:awaiting_review" ]; then + elif [ "$phase" = "PHASE:awaiting_review" ]; then status "waiting for review on PR #${PR_NUMBER:-?}" CI_FIX_COUNT=0 # Reset CI fix budget for this review cycle @@ -1214,7 +1037,7 @@ Instructions: log "found PR #${PR_NUMBER}" else inject_into_session "ERROR: Cannot find open PR for branch ${BRANCH}. Did you push? Verify with git status and git push origin ${BRANCH}, then write PHASE:awaiting_ci." - continue + return 0 fi fi @@ -1356,7 +1179,7 @@ Instructions: fi # ── PHASE: needs_human ────────────────────────────────────────────────────── - elif [ "$CURRENT_PHASE" = "PHASE:needs_human" ]; then + elif [ "$phase" = "PHASE:needs_human" ]; then status "needs human input on issue #${ISSUE}" HUMAN_REASON=$(sed -n '2p' "$PHASE_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "") _issue_url="${CODEBERG_WEB}/issues/${ISSUE}" @@ -1369,7 +1192,7 @@ Instructions: # Don't inject anything — supervisor-poll.sh (#81) injects human replies, gardener-poll.sh as backup # ── PHASE: done ───────────────────────────────────────────────────────────── - elif [ "$CURRENT_PHASE" = "PHASE:done" ]; then + elif [ "$phase" = "PHASE:done" ]; then status "phase done — merging PR #${PR_NUMBER:-?}" if [ -z "${PR_NUMBER:-}" ]; then @@ -1377,7 +1200,7 @@ Instructions: notify "PHASE:done but no PR known — needs human attention" kill_tmux_session cleanup_labels - break + return 1 fi MERGE_SHA=$(curl -sf -H "Authorization: token ${CODEBERG_TOKEN}" \ @@ -1391,7 +1214,7 @@ Instructions: inject_into_session "Merge failed for PR #${PR_NUMBER}. 
The orchestrator could not merge automatically. This may be due to merge conflicts or CI. Investigate the PR state and write PHASE:needs_human if human intervention is required." # ── PHASE: failed ─────────────────────────────────────────────────────────── - elif [ "$CURRENT_PHASE" = "PHASE:failed" ]; then + elif [ "$phase" = "PHASE:failed" ]; then FAILURE_REASON=$(sed -n '2p' "$PHASE_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "unspecified") log "phase: failed — reason: ${FAILURE_REASON}" @@ -1478,7 +1301,7 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000) kill_tmux_session cleanup_worktree rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" - break + return 1 else # Genuine unrecoverable failure — escalate to supervisor @@ -1505,12 +1328,44 @@ $(printf '%s' "$REFUSAL_JSON" | head -c 2000) cleanup_worktree fi rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" - break + return 1 fi else - log "WARNING: unknown phase value: ${CURRENT_PHASE}" + log "WARNING: unknown phase value: ${phase}" fi -done +} + +status "monitoring phase: ${PHASE_FILE}" +monitor_phase_loop "$PHASE_FILE" "$IDLE_TIMEOUT" _on_phase_change + +# Handle exit reason from monitor_phase_loop +case "${_MONITOR_LOOP_EXIT:-}" in + idle_timeout) + notify_ctx \ + "session idle for 2h — killed. Escalating to gardener." \ + "session idle for 2h — killed. 
Escalating to gardener.${PR_NUMBER:+ PR #${PR_NUMBER}}" + # Escalate: write to project-suffixed escalation file so gardener picks it up + echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"idle_timeout\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \ + >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl" + # Restore labels: remove in-progress, add backlog + cleanup_labels + curl -sf -X POST \ + -H "Authorization: token ${CODEBERG_TOKEN}" \ + -H "Content-Type: application/json" \ + "${API}/issues/${ISSUE}/labels" \ + -d '{"labels":["backlog"]}' >/dev/null 2>&1 || true + CLAIMED=false # Don't unclaim again in cleanup() + if [ -n "${PR_NUMBER:-}" ]; then + log "keeping worktree (PR #${PR_NUMBER} still open)" + else + cleanup_worktree + fi + rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" + ;; + crash_recovery_failed) + cleanup_labels + ;; +esac log "dev-agent finished for issue #${ISSUE}" diff --git a/lib/agent-session.sh b/lib/agent-session.sh new file mode 100644 index 0000000..447bf3a --- /dev/null +++ b/lib/agent-session.sh @@ -0,0 +1,255 @@ +#!/usr/bin/env bash +# lib/agent-session.sh — Reusable tmux + Claude agent runtime +# +# Source this in any agent script after lib/env.sh. 
#
# Required globals (set by the caller before using functions):
#   SESSION_NAME   — tmux session name (e.g., "dev-harb-935")
#   PHASE_FILE     — path to phase file
#   LOGFILE        — path to log file
#   ISSUE          — issue/context identifier (used in log prefix)
#   STATUSFILE     — path to status file
#   THREAD_FILE    — path to Matrix thread ID file
#   WORKTREE       — agent working directory (for crash recovery)
#   PRIMARY_BRANCH — primary git branch (for crash recovery diff)
#
# Optional globals:
#   PHASE_POLL_INTERVAL — seconds between phase polls (default: 30)
#
# Globals exported by monitor_phase_loop (readable by phase callbacks):
#   LAST_PHASE_MTIME   — mtime of the phase file when the current phase was dispatched
#   _MONITOR_LOOP_EXIT — set on return: "idle_timeout", "crash_recovery_failed",
#                        or "callback_break"

# log — Append a timestamped line to LOGFILE
# Usage: log <message...>
# Globals: ISSUE (read; "?" when unset), LOGFILE (read; /dev/null when unset)
log() {
  printf '[%s] #%s %s\n' \
    "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "${ISSUE:-?}" "$*" \
    >> "${LOGFILE:-/dev/null}"
}

# status — Overwrite STATUSFILE with the current status line, then log it
# Usage: status <message...>
# Globals: ISSUE (read; "?" when unset), STATUSFILE (read; /dev/null when unset)
status() {
  # ">" (not ">>"): the status file only ever holds the latest status.
  printf '[%s] agent #%s: %s\n' \
    "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "${ISSUE:-?}" "$*" \
    > "${STATUSFILE:-/dev/null}"
  log "$*"
}

# notify — Send plain-text Matrix notification into the issue thread
# Usage: notify <message...>
# Globals: THREAD_FILE (read; optional), ISSUE (read; "?" when unset)
# Best-effort: a failing or missing matrix_send never fails the caller.
notify() {
  local thread_id=""
  if [ -f "${THREAD_FILE:-}" ]; then
    thread_id=$(cat "$THREAD_FILE" 2>/dev/null || true)
  fi
  # ${ISSUE:-?} mirrors log/status so an unset ISSUE is not fatal under `set -u`.
  matrix_send "dev" "🔧 #${ISSUE:-?}: $*" "${thread_id}" 2>/dev/null || true
}

# notify_ctx — Send rich Matrix notification with HTML context into the issue thread
# Falls back to plain send (registering a thread root) when no thread exists.
# Usage: notify_ctx <plain_text> <html_text>
# Globals: THREAD_FILE (read; optional), ISSUE (read)
# Best-effort: a failing or missing matrix_send/matrix_send_ctx never fails the caller.
notify_ctx() {
  local plain="$1" html="$2"
  local thread_id=""
  if [ -f "${THREAD_FILE:-}" ]; then
    thread_id=$(cat "$THREAD_FILE" 2>/dev/null || true)
  fi
  if [ -n "$thread_id" ]; then
    matrix_send_ctx "dev" "🔧 #${ISSUE:-?}: ${plain}" "🔧 #${ISSUE:-?}: ${html}" \
      "${thread_id}" 2>/dev/null || true
  else
    # No thread — fall back to plain send so a thread root is registered
    matrix_send "dev" "🔧 #${ISSUE:-?}: ${plain}" "" "${ISSUE:-}" 2>/dev/null || true
  fi
}

# read_phase — Print the first line of PHASE_FILE with all whitespace removed
# Usage: read_phase
# Prints an empty string when the phase file is missing or unreadable.
read_phase() {
  # `cat … || true` keeps the pipeline successful under `pipefail` when the
  # file does not exist yet, so `var=$(read_phase)` is safe under `set -e`.
  { cat "${PHASE_FILE}" 2>/dev/null || true; } | head -n 1 | tr -d '[:space:]'
}

# wait_for_claude_ready — Poll SESSION_NAME tmux pane until Claude shows ❯ prompt
# Usage: wait_for_claude_ready [timeout_seconds]
# Returns: 0 if ready, 1 if timeout (a warning is logged)
wait_for_claude_ready() {
  local timeout="${1:-120}"
  local elapsed=0
  local pane
  while [ "$elapsed" -lt "$timeout" ]; do
    # Claude Code shows ❯ when ready for input
    pane=$(tmux capture-pane -t "${SESSION_NAME}" -p 2>/dev/null) || pane=""
    if [[ "$pane" == *'❯'* ]]; then
      return 0
    fi
    sleep 2
    elapsed=$((elapsed + 2))
  done
  log "WARNING: claude not ready after ${timeout}s — proceeding anyway"
  return 1
}

# inject_into_session — Paste text into the tmux session via tmux buffer
# Usage: inject_into_session <text>
# Globals: SESSION_NAME (read), ISSUE (read; names the tmux buffer)
# Returns: 0 when the text was pasted and Enter was sent, non-zero otherwise
inject_into_session() {
  local text="$1"
  local buffer="inject-${ISSUE:-session}"
  local tmpfile
  local rc=0

  # A readiness timeout is only a warning ("proceeding anyway"). Without the
  # `|| true` the non-zero return would abort callers running under `set -e`.
  wait_for_claude_ready 120 || true

  tmpfile=$(mktemp /tmp/tmux-inject-XXXXXX) || return 1
  printf '%s' "$text" > "$tmpfile"

  if tmux load-buffer -b "$buffer" "$tmpfile" \
    && tmux paste-buffer -t "${SESSION_NAME}" -b "$buffer"; then
    # Give the pane a moment to absorb the paste before submitting it
    sleep 0.5
    tmux send-keys -t "${SESSION_NAME}" "" Enter || rc=$?
  else
    rc=1
  fi

  # Always clean up, including on the failure paths above
  tmux delete-buffer -b "$buffer" 2>/dev/null || true
  rm -f "$tmpfile"
  return "$rc"
}

# kill_tmux_session — Kill SESSION_NAME tmux session
# Usage: kill_tmux_session
# Never fails: a missing session is ignored.
kill_tmux_session() {
  tmux kill-session -t "${SESSION_NAME}" 2>/dev/null || true
}

# create_agent_session — Create (or reuse) a detached tmux session running
# claude
# Sets SESSION_NAME to $1 and uses $2 as the working directory.
# Usage: create_agent_session <session_name> <workdir>
# Returns: 0 on success (session reused, or created and Claude is ready), 1 on failure
# NOTE(review): tmux falls back to prefix matching for -t targets, so a name
# like "dev-x-93" could match a live "dev-x-935" — consider "=${SESSION_NAME}"
# for exact matching; confirm against the tmux version in use.
create_agent_session() {
  SESSION_NAME="${1:-${SESSION_NAME:-}}"
  local workdir="${2:-${WORKTREE:-}}"

  if [ -z "$SESSION_NAME" ]; then
    log "ERROR: create_agent_session called without a session name"
    return 1
  fi

  if tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
    log "reusing existing tmux session: ${SESSION_NAME}"
    return 0
  fi

  # Both steps are tested explicitly so a tmux failure is logged and reported
  # via the return code instead of silently aborting a `set -e` caller. The
  # has-session probe is kept after new-session, as in the original flow;
  # presumably the session can vanish right away if claude exits at startup.
  if ! tmux new-session -d -s "${SESSION_NAME}" -c "${workdir}" \
      "claude --dangerously-skip-permissions" \
    || ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
    log "ERROR: failed to create tmux session ${SESSION_NAME}"
    return 1
  fi
  log "tmux session created: ${SESSION_NAME}"

  if ! wait_for_claude_ready 120; then
    log "ERROR: claude did not become ready in ${SESSION_NAME}"
    kill_tmux_session
    return 1
  fi
  return 0
}

# inject_formula — Send a formula/prompt into the agent session
# Sets SESSION_NAME to $1 before injecting.
# Usage: inject_formula <session_name> <formula_text> [context]
# Returns: status of inject_into_session, or 1 when no formula text is given
inject_formula() {
  SESSION_NAME="${1:-${SESSION_NAME:-}}"
  local formula_text="${2:-}"
  # $3 context is available for future use by callers
  if [ -z "$formula_text" ]; then
    log "ERROR: inject_formula called without formula text"
    return 1
  fi
  inject_into_session "$formula_text"
}

# Globals exported by monitor_phase_loop for use by phase callbacks.
# LAST_PHASE_MTIME: mtime of phase file at the time the current phase was dispatched.
# _MONITOR_LOOP_EXIT: reason monitor_phase_loop returned — check after the call.
LAST_PHASE_MTIME=0
_MONITOR_LOOP_EXIT=""

# monitor_phase_loop — Watch PHASE_FILE and dispatch phase changes to a callback
#
# Handles: phase change detection, idle timeout, and session crash recovery.
# The phase callback receives the current phase string as $1.
# Any non-zero return from the callback breaks the loop; 0 continues it.
# A callback without an explicit `return` yields the status of its last
# command, so callbacks should end with an explicit `return 0`.
#
# On idle timeout: kills the session, sets _MONITOR_LOOP_EXIT=idle_timeout, breaks.
# On crash recovery failure: sets _MONITOR_LOOP_EXIT=crash_recovery_failed, breaks.
# On callback return 1: sets _MONITOR_LOOP_EXIT=callback_break, breaks.
#
# LAST_PHASE_MTIME is updated before each callback invocation so callbacks can
# detect subsequent phase file changes (e.g., during inner polling loops).
#
# Crash recovery is capped: after MAX_CRASH_RECOVERIES (optional global,
# default 3) restarts with no phase change in between, the loop gives up with
# _MONITOR_LOOP_EXIT=crash_recovery_failed. Without the cap, a session that
# dies right after every restart would be restarted forever, because each
# recovery resets the idle timer.
#
# Usage: monitor_phase_loop <phase_file> <idle_timeout_seconds> <callback_fn>
# Globals read: SESSION_NAME, WORKTREE, PRIMARY_BRANCH, ISSUE,
#               PHASE_POLL_INTERVAL, MAX_CRASH_RECOVERIES
# Globals written: PHASE_FILE, LAST_PHASE_MTIME, _MONITOR_LOOP_EXIT
monitor_phase_loop() {
  local phase_file="${1:-${PHASE_FILE}}"
  local idle_timeout="${2:-7200}"
  local callback_fn="${3:-}"
  local poll_interval="${PHASE_POLL_INTERVAL:-30}"
  local max_recoveries="${MAX_CRASH_RECOVERIES:-3}"
  local current_phase phase_mtime crash_diff recovery_msg
  local idle_elapsed=0
  local recoveries=0 # restarts since the last dispatched phase change

  PHASE_FILE="$phase_file"
  LAST_PHASE_MTIME=0
  _MONITOR_LOOP_EXIT=""

  while true; do
    sleep "$poll_interval"
    idle_elapsed=$(( idle_elapsed + poll_interval ))

    # --- Session health check ---
    if ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
      current_phase=$(read_phase)
      case "$current_phase" in
        PHASE:done|PHASE:failed)
          # Expected terminal phases — fall through to phase dispatch below
          ;;
        *)
          log "WARNING: tmux session died unexpectedly (phase: ${current_phase:-none})"

          if [ "$recoveries" -ge "$max_recoveries" ]; then
            log "ERROR: session crashed again after ${recoveries} recovery attempts — giving up"
            notify "session crashed and could not recover — needs human attention"
            _MONITOR_LOOP_EXIT="crash_recovery_failed"
            break
          fi

          notify "session crashed (phase: ${current_phase:-none}), attempting recovery"

          # Attempt crash recovery: restart session with recovery context
          crash_diff=$(git -C "${WORKTREE}" diff "origin/${PRIMARY_BRANCH}..HEAD" --stat 2>/dev/null | head -n 20 || echo "(no diff)")
          recovery_msg="## Session Recovery

Your Claude Code session for issue #${ISSUE} was interrupted unexpectedly.
The git worktree at ${WORKTREE} is intact — your changes survived.

Last known phase: ${current_phase:-unknown}

Work so far:
${crash_diff}

Run: git log --oneline -5 && git status
Then resume from the last phase following the original phase protocol.
Phase file: ${PHASE_FILE}"

          if tmux new-session -d -s "${SESSION_NAME}" -c "${WORKTREE}" \
              "claude --dangerously-skip-permissions" 2>/dev/null; then
            recoveries=$(( recoveries + 1 ))
            # Tested explicitly so an injection failure is logged rather than
            # aborting a `set -e` caller with _MONITOR_LOOP_EXIT left unset.
            if inject_into_session "$recovery_msg"; then
              log "recovery session started"
            else
              log "WARNING: recovery session started but the recovery prompt could not be injected"
            fi
            idle_elapsed=0
          else
            log "ERROR: could not restart session after crash"
            notify "session crashed and could not recover — needs human attention"
            _MONITOR_LOOP_EXIT="crash_recovery_failed"
            break
          fi
          continue
          ;;
      esac
    fi

    # --- Check phase file for changes ---
    phase_mtime=$(stat -c %Y "$phase_file" 2>/dev/null || echo 0)
    current_phase=$(read_phase)

    if [ -z "$current_phase" ] || [ "$phase_mtime" -le "$LAST_PHASE_MTIME" ]; then
      # No phase change — check idle timeout
      if [ "$idle_elapsed" -ge "$idle_timeout" ]; then
        log "TIMEOUT: no phase update for ${idle_timeout}s — killing session"
        kill_tmux_session
        _MONITOR_LOOP_EXIT="idle_timeout"
        break
      fi
      continue
    fi

    # Phase changed — update tracking state and dispatch to callback
    LAST_PHASE_MTIME="$phase_mtime"
    idle_elapsed=0
    recoveries=0
    log "phase: ${current_phase}"
    status "${current_phase}"

    if [ -n "$callback_fn" ]; then
      # Any non-zero status from the callback ends monitoring
      if ! "$callback_fn" "$current_phase"; then
        _MONITOR_LOOP_EXIT="callback_break"
        break
      fi
    fi
  done
}