Move generic agent infrastructure from dev/dev-agent.sh into lib/agent-session.sh:

- log, status, notify, notify_ctx, read_phase, wait_for_claude_ready, inject_into_session, kill_tmux_session extracted verbatim
- create_agent_session(session_name, workdir) — new: tmux session creation
- inject_formula(session_name, formula_text, context) — new: prompt injection
- monitor_phase_loop(phase_file, idle_timeout, callback_fn) — new: phase loop with session health check, crash recovery, and idle timeout detection

dev-agent.sh: sources the library, implements the _on_phase_change() callback, and calls monitor_phase_loop(); idle-timeout and crash-recovery-failed cleanup are handled via the _MONITOR_LOOP_EXIT signal variable. Behavior unchanged.
255 lines
9.1 KiB
Bash
255 lines
9.1 KiB
Bash
#!/usr/bin/env bash
# lib/agent-session.sh — Reusable tmux + Claude agent runtime
#
# Source this in any agent script after lib/env.sh.
#
# Required globals (set by the caller before using functions):
#   SESSION_NAME   — tmux session name (e.g., "dev-harb-935")
#   PHASE_FILE     — path to phase file
#   LOGFILE        — path to log file
#   ISSUE          — issue/context identifier (used in log prefix)
#   STATUSFILE     — path to status file
#   THREAD_FILE    — path to Matrix thread ID file
#   WORKTREE       — agent working directory (for crash recovery)
#   PRIMARY_BRANCH — primary git branch (for crash recovery diff)
#
# Optional globals:
#   PHASE_POLL_INTERVAL — seconds between phase polls (default: 30)
#
# Globals exported by monitor_phase_loop (readable by phase callbacks):
#   LAST_PHASE_MTIME   — mtime of the phase file when the current phase was dispatched
#   _MONITOR_LOOP_EXIT — set on return: "idle_timeout", "crash_recovery_failed",
#                        or "callback_break"

# log — Append a UTC-timestamped line to LOGFILE
# Globals: LOGFILE (read; /dev/null is used when unset or empty)
#          ISSUE   (read; printed as "?" when unset or empty)
# Usage: log <message...>
log() {
  # "$*" folds all arguments into one message (joined by the first IFS
  # character, a space by default) so callers need not quote the whole line.
  printf '[%s] #%s %s\n' \
    "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" \
    "${ISSUE:-?}" \
    "$*" >> "${LOGFILE:-/dev/null}"
}
# status — Record the current status in STATUSFILE and mirror it to the log
# Globals: STATUSFILE (read; /dev/null is used when unset or empty)
#          ISSUE      (read; printed as "?" when unset or empty)
# Usage: status <message...>
status() {
  # ">" truncates: STATUSFILE always holds only the most recent status line.
  printf '[%s] agent #%s: %s\n' \
    "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" \
    "${ISSUE:-?}" \
    "$*" > "${STATUSFILE:-/dev/null}"
  log "$*"
}
# notify — Send plain-text Matrix notification into the issue thread
# Globals: THREAD_FILE (read; when it names an existing file its content is
#                       passed to matrix_send as the thread id)
#          ISSUE       (read; printed as "?" when unset or empty, matching log)
# Usage: notify <message...>
# Returns: always 0 — delivery is best-effort; matrix_send failures and its
#          stderr output are deliberately discarded.
# NOTE(review): matrix_send is not defined in this file — presumably provided
# by lib/env.sh; confirm.
notify() {
  local thread_id=""
  if [ -f "${THREAD_FILE:-}" ]; then
    thread_id=$(cat "$THREAD_FILE" 2>/dev/null || true)
  fi
  matrix_send "dev" "🔧 #${ISSUE:-?}: $*" "${thread_id}" 2>/dev/null || true
}
# notify_ctx — Send rich Matrix notification with HTML context into the issue thread
# Falls back to plain send (registering a thread root) when no thread exists.
# Globals: THREAD_FILE (read; source of the thread id, if the file exists)
#          ISSUE       (read; message prefix, and 4th matrix_send argument in
#                       the fallback path)
# Usage: notify_ctx <plain_text> <html_body>
# Returns: always 0 — delivery is best-effort; send failures are discarded.
# NOTE(review): matrix_send / matrix_send_ctx are not defined in this file —
# presumably provided by lib/env.sh; confirm.
notify_ctx() {
  local plain="${1:-}" html="${2:-}"
  local prefix="🔧 #${ISSUE:-?}: "
  local thread_id=""

  if [ -f "${THREAD_FILE:-}" ]; then
    thread_id=$(cat "$THREAD_FILE" 2>/dev/null || true)
  fi

  if [ -n "$thread_id" ]; then
    matrix_send_ctx "dev" "${prefix}${plain}" "${prefix}${html}" "${thread_id}" 2>/dev/null || true
  else
    # No thread — fall back to plain send so a thread root is registered
    matrix_send "dev" "${prefix}${plain}" "" "${ISSUE:-}" 2>/dev/null || true
  fi
}
# read_phase — Print the first line of PHASE_FILE with all whitespace removed
# Globals: PHASE_FILE (read)
# Outputs: the phase string on stdout, without a trailing newline; prints
#          nothing when the file is missing, unreadable, or empty.
# Usage: read_phase
read_phase() {
  # `|| true` keeps a missing phase file from failing the pipeline when the
  # caller runs with `set -o pipefail`.
  { head -n 1 -- "${PHASE_FILE:-}" 2>/dev/null || true; } | tr -d '[:space:]'
}
# wait_for_claude_ready — Poll SESSION_NAME tmux pane until Claude shows ❯ prompt
# Globals: SESSION_NAME (read)
# Usage: wait_for_claude_ready [timeout_seconds]   (default: 120)
# Returns: 0 if ready, 1 if timeout (a WARNING is logged on timeout)
wait_for_claude_ready() {
  local timeout="${1:-120}"
  local elapsed=0
  local pane

  while [ "$elapsed" -lt "$timeout" ]; do
    # Capture the pane into a variable instead of piping into `grep -q`, so the
    # check cannot be turned into a false "not ready" by tmux's exit status
    # when the caller runs with `set -o pipefail`.
    pane=$(tmux capture-pane -t "${SESSION_NAME}" -p 2>/dev/null || true)
    # Claude Code shows ❯ when ready for input
    case "$pane" in
      *'❯'*) return 0 ;;
    esac
    sleep 2
    elapsed=$((elapsed + 2))
  done

  log "WARNING: claude not ready after ${timeout}s — proceeding anyway"
  return 1
}
# inject_into_session — Paste text into the tmux session via tmux buffer
# Globals: SESSION_NAME (read; paste/send-keys target)
#          ISSUE        (read; used to name the temporary tmux buffer)
# Usage: inject_into_session <text>
# Returns: 0 when the text was pasted and Enter was sent; non-zero when the
#          temp file could not be created or a tmux step failed.
inject_into_session() {
  local text="${1:-}"
  local buffer="inject-${ISSUE:-}"
  local tmpfile
  local rc=0

  # Best-effort wait: on timeout wait_for_claude_ready logs "proceeding
  # anyway", so its non-zero status must not abort a `set -e` caller here.
  wait_for_claude_ready 120 || true

  tmpfile=$(mktemp "${TMPDIR:-/tmp}/tmux-inject-XXXXXX") || return 1
  printf '%s' "$text" > "$tmpfile"

  # Go through a tmux buffer rather than send-keys for the payload so the text
  # is delivered as one paste.
  if tmux load-buffer -b "$buffer" "$tmpfile" \
    && tmux paste-buffer -t "${SESSION_NAME}" -b "$buffer"; then
    # Brief pause before submitting — presumably lets the pane absorb the paste.
    sleep 0.5
    # Empty string + Enter: submits the pasted text.
    tmux send-keys -t "${SESSION_NAME}" "" Enter || rc=$?
  else
    rc=$?
  fi

  # Cleanup runs on every path so neither the buffer nor the temp file leaks.
  tmux delete-buffer -b "$buffer" 2>/dev/null || true
  rm -f -- "$tmpfile"
  return "$rc"
}
# kill_tmux_session — Kill SESSION_NAME tmux session
# Globals: SESSION_NAME (read)
# Usage: kill_tmux_session
# Returns: always 0 — a missing session is not an error.
kill_tmux_session() {
  # Never call tmux with an empty target.
  # NOTE(review): tmux may resolve an empty -t to the current/most recent
  # session, which would kill an unrelated session — confirm against tmux docs.
  [ -n "${SESSION_NAME:-}" ] || return 0
  tmux kill-session -t "${SESSION_NAME}" 2>/dev/null || true
}
# create_agent_session — Create (or reuse) a detached tmux session running claude
# Sets SESSION_NAME to $1 and uses $2 as the working directory.
# Globals: SESSION_NAME (written; keeps its current value when $1 is empty)
#          WORKTREE     (read; default working directory when $2 is empty)
# Usage: create_agent_session <session_name> <workdir>
# Returns: 0 on success (including reuse of an existing session), 1 on failure
create_agent_session() {
  SESSION_NAME="${1:-${SESSION_NAME:-}}"
  local workdir="${2:-${WORKTREE:-}}"

  if [ -z "$SESSION_NAME" ]; then
    log "ERROR: create_agent_session called without a session name"
    return 1
  fi

  if tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
    log "reusing existing tmux session: ${SESSION_NAME}"
    return 0
  fi

  # Kill any stale entry before creating
  tmux kill-session -t "${SESSION_NAME}" 2>/dev/null || true

  # `|| true`: a failed launch is detected by the has-session check below, so
  # it must not abort a `set -e` caller before this function can log and
  # return 1 as documented.
  tmux new-session -d -s "${SESSION_NAME}" -c "${workdir}" \
    "claude --dangerously-skip-permissions" || true

  if ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
    log "ERROR: failed to create tmux session ${SESSION_NAME}"
    return 1
  fi
  log "tmux session created: ${SESSION_NAME}"

  # A session whose claude never reaches its prompt is unusable: tear it down.
  if ! wait_for_claude_ready 120; then
    log "ERROR: claude did not become ready in ${SESSION_NAME}"
    kill_tmux_session
    return 1
  fi
  return 0
}
# inject_formula — Send a formula/prompt into the agent session
# Globals: SESSION_NAME (written; keeps its current value when $1 is empty)
# Usage: inject_formula <session_name> <formula_text> [context]
# Returns: the status of inject_into_session
inject_formula() {
  SESSION_NAME="${1:-${SESSION_NAME:-}}"
  local formula_text="${2:-}"
  # $3 context is available for future use by callers
  inject_into_session "$formula_text"
}
# Globals exported by monitor_phase_loop for use by phase callbacks.
#   LAST_PHASE_MTIME:   mtime of phase file at the time the current phase was dispatched.
#   _MONITOR_LOOP_EXIT: reason monitor_phase_loop returned — check after the call.
# Both are initialised here so they are defined as soon as the library is
# sourced; monitor_phase_loop resets them again on entry.
LAST_PHASE_MTIME=0
_MONITOR_LOOP_EXIT=""
# monitor_phase_loop — Watch PHASE_FILE and dispatch phase changes to a callback
#
# Handles: phase change detection, idle timeout, and session crash recovery.
# The phase callback receives the current phase string as $1.
# Return 1 from the callback to break the loop; return 0 (or default) to continue.
#
# On idle timeout: kills the session, sets _MONITOR_LOOP_EXIT=idle_timeout, breaks.
# On crash recovery failure: sets _MONITOR_LOOP_EXIT=crash_recovery_failed, breaks.
# On callback return 1: sets _MONITOR_LOOP_EXIT=callback_break, breaks.
#
# LAST_PHASE_MTIME is updated before each callback invocation so callbacks can
# detect subsequent phase file changes (e.g., during inner polling loops).
#
# Globals read:    SESSION_NAME, WORKTREE, PRIMARY_BRANCH, ISSUE,
#                  PHASE_POLL_INTERVAL (default: 30)
# Globals written: PHASE_FILE (set to <phase_file>), LAST_PHASE_MTIME,
#                  _MONITOR_LOOP_EXIT
#
# Notes:
#   - A change is detected by the phase file's mtime (whole seconds, via
#     `stat -c %Y`), and only while the file's first line is non-empty.
#   - The idle clock advances by the poll interval per iteration and is reset
#     on every dispatched phase and after a successful crash recovery.
#
# Usage: monitor_phase_loop <phase_file> <idle_timeout_secs> <phase_callback_fn>
monitor_phase_loop() {
  local phase_file="${1:-${PHASE_FILE:-}}"
  local idle_timeout="${2:-7200}"
  local callback_fn="${3:-}"
  local poll_interval="${PHASE_POLL_INTERVAL:-30}"
  local current_phase phase_mtime crash_diff recovery_msg
  local idle_elapsed=0

  PHASE_FILE="$phase_file"
  LAST_PHASE_MTIME=0
  _MONITOR_LOOP_EXIT=""

  while true; do
    sleep "$poll_interval"
    idle_elapsed=$(( idle_elapsed + poll_interval ))

    # --- Session health check ---
    if ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
      current_phase=$(read_phase)
      case "$current_phase" in
        PHASE:done | PHASE:failed)
          # Expected terminal phases — fall through to phase dispatch below
          ;;
        *)
          log "WARNING: tmux session died unexpectedly (phase: ${current_phase:-none})"
          notify "session crashed (phase: ${current_phase:-none}), attempting recovery"

          # Attempt crash recovery: restart session with recovery context.
          # The fallback text is applied on empty output rather than with
          # `|| echo`, because the pipeline's status is head's and is
          # therefore 0 even when git fails or prints nothing.
          crash_diff=$(git -C "${WORKTREE}" diff "origin/${PRIMARY_BRANCH}..HEAD" --stat 2>/dev/null | head -n 20) || true
          [ -n "$crash_diff" ] || crash_diff="(no diff)"

          recovery_msg="## Session Recovery

Your Claude Code session for issue #${ISSUE} was interrupted unexpectedly.
The git worktree at ${WORKTREE} is intact — your changes survived.

Last known phase: ${current_phase:-unknown}

Work so far:
${crash_diff}

Run: git log --oneline -5 && git status
Then resume from the last phase following the original phase protocol.
Phase file: ${PHASE_FILE}"

          if tmux new-session -d -s "${SESSION_NAME}" -c "${WORKTREE}" \
            "claude --dangerously-skip-permissions" 2>/dev/null; then
            inject_into_session "$recovery_msg"
            log "recovery session started"
            # Give the recovered session a fresh idle budget.
            idle_elapsed=0
          else
            log "ERROR: could not restart session after crash"
            notify "session crashed and could not recover — needs human attention"
            _MONITOR_LOOP_EXIT="crash_recovery_failed"
            break
          fi
          # Skip phase dispatch this round; re-check on the next poll.
          continue
          ;;
      esac
    fi

    # --- Check phase file for changes ---
    phase_mtime=$(stat -c %Y "$phase_file" 2>/dev/null || echo 0)
    current_phase=$(read_phase)

    if [ -z "$current_phase" ] || [ "$phase_mtime" -le "$LAST_PHASE_MTIME" ]; then
      # No phase change — check idle timeout
      if [ "$idle_elapsed" -ge "$idle_timeout" ]; then
        log "TIMEOUT: no phase update for ${idle_timeout}s — killing session"
        kill_tmux_session
        _MONITOR_LOOP_EXIT="idle_timeout"
        break
      fi
      continue
    fi

    # Phase changed — update tracking state and dispatch to callback
    LAST_PHASE_MTIME="$phase_mtime"
    idle_elapsed=0
    log "phase: ${current_phase}"
    status "${current_phase}"

    if [ -n "$callback_fn" ]; then
      if ! "$callback_fn" "$current_phase"; then
        _MONITOR_LOOP_EXIT="callback_break"
        break
      fi
    fi
  done
}