disinto/lib/agent-session.sh
openhands cbd8c81da8 refactor: extract lib/agent-session.sh — reusable tmux + Claude agent runtime (#158)
Move generic agent infrastructure from dev/dev-agent.sh into lib/agent-session.sh:
- log, status, notify, notify_ctx, read_phase, wait_for_claude_ready,
  inject_into_session, kill_tmux_session extracted verbatim
- create_agent_session(session_name, workdir) — new: tmux session creation
- inject_formula(session_name, formula_text, context) — new: prompt injection
- monitor_phase_loop(phase_file, idle_timeout, callback_fn) — new: phase loop
  with session health check, crash recovery, and idle timeout detection

dev-agent.sh: sources the library, implements _on_phase_change() callback,
calls monitor_phase_loop(); idle-timeout and crash-recovery-failed cleanup
handled via _MONITOR_LOOP_EXIT signal variable. Behavior unchanged.
2026-03-18 14:36:36 +00:00

255 lines
9.1 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# lib/agent-session.sh — Reusable tmux + Claude agent runtime
#
# Source this in any agent script after lib/env.sh.
#
# Required globals (set by the caller before using functions):
# SESSION_NAME — tmux session name (e.g., "dev-harb-935")
# PHASE_FILE — path to phase file
# LOGFILE — path to log file
# ISSUE — issue/context identifier (used in log prefix)
# STATUSFILE — path to status file
# THREAD_FILE — path to Matrix thread ID file
# WORKTREE — agent working directory (for crash recovery)
# PRIMARY_BRANCH — primary git branch (for crash recovery diff)
#
# Optional globals:
# PHASE_POLL_INTERVAL — seconds between phase polls (default: 30)
#
# Globals exported by monitor_phase_loop (readable by phase callbacks):
# LAST_PHASE_MTIME — mtime of the phase file when the current phase was dispatched
# _MONITOR_LOOP_EXIT — set on return: "idle_timeout", "crash_recovery_failed",
# or "callback_break"
# log — Append one timestamped line to LOGFILE
# The line has the form "[<UTC timestamp>] #<ISSUE> <message>"; ISSUE falls
# back to "?" and LOGFILE falls back to /dev/null when unset or empty.
# Usage: log <message>
log() {
  local stamp
  stamp=$(date -u '+%Y-%m-%d %H:%M:%S UTC') || true
  printf '[%s] #%s %s\n' "$stamp" "${ISSUE:-?}" "$*" >> "${LOGFILE:-/dev/null}"
}
# status — Overwrite STATUSFILE with the current status, then log it
# STATUSFILE is truncated on every call, so it only ever holds the latest
# status line; the same message is also appended to the log via log().
# Usage: status <message>
status() {
  local now
  now=$(date -u '+%Y-%m-%d %H:%M:%S UTC') || true
  printf '[%s] agent #%s: %s\n' "$now" "${ISSUE:-?}" "$*" > "${STATUSFILE:-/dev/null}"
  log "$*"
}
# notify — Best-effort plain-text Matrix message into the issue thread
# Reads the thread ID from THREAD_FILE when that file exists; otherwise an
# empty thread ID is passed. Failures of matrix_send are ignored and its
# stderr is discarded, so this always returns 0.
# Usage: notify <message>
notify() {
  local tid=""
  if [ -f "${THREAD_FILE:-}" ]; then
    tid=$(cat "$THREAD_FILE" 2>/dev/null || true)
  fi
  if ! matrix_send "dev" "🔧 #${ISSUE}: $*" "${tid}" 2>/dev/null; then
    # Notifications are best-effort: never fail the caller.
    return 0
  fi
}
# notify_ctx — Best-effort rich (plain + HTML) Matrix message into the issue thread
# When THREAD_FILE yields a thread ID, the message goes through matrix_send_ctx.
# When there is no thread ID, it falls back to matrix_send with ISSUE as the
# fourth argument so that a thread root gets registered.
# Send failures are ignored; this always returns 0.
# Usage: notify_ctx <plain_text> <html_body>
notify_ctx() {
  local plain="$1" html="$2"
  local tid=""
  if [ -f "${THREAD_FILE:-}" ]; then
    tid=$(cat "$THREAD_FILE" 2>/dev/null || true)
  fi

  if [ -z "$tid" ]; then
    # No thread — fall back to plain send so a thread root is registered
    matrix_send "dev" "🔧 #${ISSUE}: ${plain}" "" "${ISSUE}" 2>/dev/null || true
    return 0
  fi

  matrix_send_ctx "dev" "🔧 #${ISSUE}: ${plain}" "🔧 #${ISSUE}: ${html}" "${tid}" 2>/dev/null || true
}
# read_phase — Print the first line of PHASE_FILE with all whitespace removed
# Prints nothing (and still returns 0) when the file is missing or unreadable.
# Usage: read_phase
read_phase() {
  local first_line
  first_line=$(head -n 1 -- "${PHASE_FILE}" 2>/dev/null) || true
  printf '%s' "$first_line" | tr -d '[:space:]'
}
# wait_for_claude_ready — Poll the SESSION_NAME tmux pane until Claude shows its prompt
# Captures the pane every 2 seconds and looks for a fixed-string prompt marker.
# Optional global:
#   CLAUDE_READY_PATTERN — fixed string that marks "ready for input" (default: ❯)
# NOTE(review): the marker literal was empty in this copy of the file, which
# made the check succeed on any non-empty pane output. ❯ is assumed to be the
# Claude Code input prompt glyph — confirm against the running CLI.
# Usage: wait_for_claude_ready [timeout_seconds]
# Returns: 0 if ready, 1 if timeout (a warning is logged on timeout)
wait_for_claude_ready() {
  local timeout="${1:-120}"
  local ready_marker="${CLAUDE_READY_PATTERN:-❯}"
  local elapsed=0
  local pane
  while [ "$elapsed" -lt "$timeout" ]; do
    # Capture first, then grep: avoids a pipeline whose status could be
    # affected by pipefail/SIGPIPE when grep -q exits early.
    pane=$(tmux capture-pane -t "${SESSION_NAME}" -p 2>/dev/null) || pane=""
    if [ -n "$pane" ] && grep -qF -- "$ready_marker" <<<"$pane"; then
      return 0
    fi
    sleep 2
    elapsed=$((elapsed + 2))
  done
  log "WARNING: claude not ready after ${timeout}s — proceeding anyway"
  return 1
}
# inject_into_session — Paste text into the tmux session via a tmux buffer
# Waits (up to 120s) for Claude to be ready, loads the text into a named tmux
# buffer through a temp file, pastes it into SESSION_NAME, then presses Enter.
# Usage: inject_into_session <text>
# Returns: 0 on success, non-zero if the temp file or tmux buffer could not
#          be created
inject_into_session() {
  local text="$1"
  local tmpfile
  local buffer_name="inject-${ISSUE}"
  local load_rc=0

  # A readiness timeout is only a warning (wait_for_claude_ready logs
  # "proceeding anyway"), so it must not abort callers running under `set -e`.
  wait_for_claude_ready 120 || true

  if ! tmpfile=$(mktemp /tmp/tmux-inject-XXXXXX); then
    log "ERROR: could not create temp file for tmux injection"
    return 1
  fi
  printf '%s' "$text" > "$tmpfile"

  # Once the buffer is loaded the temp file is no longer needed; remove it
  # right away so it cannot leak if a later tmux call fails.
  tmux load-buffer -b "$buffer_name" "$tmpfile" || load_rc=$?
  rm -f -- "$tmpfile"
  if [ "$load_rc" -ne 0 ]; then
    log "ERROR: tmux load-buffer failed (exit ${load_rc})"
    return "$load_rc"
  fi

  tmux paste-buffer -t "${SESSION_NAME}" -b "$buffer_name"
  # Short pause between the paste and the Enter keypress.
  sleep 0.5
  tmux send-keys -t "${SESSION_NAME}" "" Enter
  tmux delete-buffer -b "$buffer_name" 2>/dev/null || true
}
# kill_tmux_session — Kill the SESSION_NAME tmux session, ignoring failures
# tmux's stderr is discarded and a failing kill (e.g. the session is already
# gone) is not treated as an error; this always returns 0.
# Usage: kill_tmux_session
kill_tmux_session() {
  if ! tmux kill-session -t "${SESSION_NAME}" 2>/dev/null; then
    return 0
  fi
}
# create_agent_session — Create (or reuse) a detached tmux session running claude
# Sets the global SESSION_NAME to $1 (keeping the current value when $1 is
# empty) and starts claude in $2, defaulting to WORKTREE. An already-running
# session of that name is reused as-is, without a readiness check.
# Usage: create_agent_session <session_name> <workdir>
# Returns: 0 on success (created or reused), 1 if the session could not be
#          created or claude did not become ready (the session is then killed)
create_agent_session() {
  SESSION_NAME="${1:-${SESSION_NAME}}"
  local session_dir="${2:-${WORKTREE}}"

  if ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
    # Kill any stale entry before creating
    tmux kill-session -t "${SESSION_NAME}" 2>/dev/null || true
    tmux new-session -d -s "${SESSION_NAME}" -c "${session_dir}" \
      "claude --dangerously-skip-permissions"

    # Verify that the session actually exists before waiting on it.
    if tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
      log "tmux session created: ${SESSION_NAME}"
      if wait_for_claude_ready 120; then
        return 0
      fi
      log "ERROR: claude did not become ready in ${SESSION_NAME}"
      kill_tmux_session
      return 1
    fi

    log "ERROR: failed to create tmux session ${SESSION_NAME}"
    return 1
  fi

  log "reusing existing tmux session: ${SESSION_NAME}"
  return 0
}
# inject_formula — Send a formula/prompt into the agent session
# Sets the global SESSION_NAME to $1 (keeping the current value when $1 is
# empty) and pastes $2 into that session via inject_into_session.
# The optional third argument (context) is accepted but currently unused.
# Usage: inject_formula <session_name> <formula_text> [context]
# Returns: the exit status of inject_into_session
inject_formula() {
  local prompt_body
  SESSION_NAME="${1:-${SESSION_NAME}}"
  prompt_body="$2"
  inject_into_session "$prompt_body"
}
# State shared between monitor_phase_loop and the phase callbacks it invokes.
# Both are (re)initialised here at source time and again at the start of
# every monitor_phase_loop call.
# _MONITOR_LOOP_EXIT: why monitor_phase_loop returned — inspect it after the call.
# LAST_PHASE_MTIME: mtime of the phase file when the current phase was dispatched.
_MONITOR_LOOP_EXIT=""
LAST_PHASE_MTIME=0
# monitor_phase_loop — Watch PHASE_FILE and dispatch phase changes to a callback
#
# Handles: phase change detection, idle timeout, and session crash recovery.
# Each iteration sleeps PHASE_POLL_INTERVAL seconds (default 30) first, then:
#   1. Checks that the SESSION_NAME tmux session is alive. If it is gone and
#      the phase is not PHASE:done / PHASE:failed, a new claude session is
#      started in WORKTREE and a recovery prompt is injected.
#   2. Compares the phase file's mtime with LAST_PHASE_MTIME. A newer mtime
#      with a non-empty phase counts as a phase change.
#
# The phase callback receives the current phase string as $1.
# Return 1 from the callback to break the loop; return 0 (or default) to continue.
#
# On idle timeout: kills the session, sets _MONITOR_LOOP_EXIT=idle_timeout, breaks.
# On crash recovery failure: sets _MONITOR_LOOP_EXIT=crash_recovery_failed, breaks.
# On callback return 1: sets _MONITOR_LOOP_EXIT=callback_break, breaks.
#
# The idle timer advances by the poll interval on each iteration and is reset
# on every dispatched phase change and on every successful crash recovery.
#
# LAST_PHASE_MTIME is updated before each callback invocation so callbacks can
# detect subsequent phase file changes (e.g., during inner polling loops).
# PHASE_FILE is overwritten with the <phase_file> argument.
#
# Usage: monitor_phase_loop <phase_file> <idle_timeout_secs> <phase_callback_fn>
monitor_phase_loop() {
  local phase_file="${1:-${PHASE_FILE}}"
  local idle_timeout="${2:-7200}"
  local callback_fn="${3:-}"
  local poll_interval="${PHASE_POLL_INTERVAL:-30}"
  local current_phase phase_mtime crash_diff recovery_msg
  PHASE_FILE="$phase_file"
  LAST_PHASE_MTIME=0
  _MONITOR_LOOP_EXIT=""
  local idle_elapsed=0

  while true; do
    sleep "$poll_interval"
    idle_elapsed=$(( idle_elapsed + poll_interval ))

    # --- Session health check ---
    if ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
      current_phase=$(read_phase)
      case "$current_phase" in
        PHASE:done|PHASE:failed)
          # Expected terminal phases — fall through to phase dispatch below
          ;;
        *)
          log "WARNING: tmux session died unexpectedly (phase: ${current_phase:-none})"
          notify "session crashed (phase: ${current_phase:-none}), attempting recovery"

          # Attempt crash recovery: restart session with recovery context.
          # The fallback is applied explicitly: `head` succeeds even when git
          # fails or prints nothing, so an `|| echo` after the pipeline would
          # not reliably produce the placeholder.
          crash_diff=$(git -C "${WORKTREE}" diff "origin/${PRIMARY_BRANCH}..HEAD" --stat 2>/dev/null | head -20) || true
          [ -n "$crash_diff" ] || crash_diff="(no diff)"

          recovery_msg="## Session Recovery
Your Claude Code session for issue #${ISSUE} was interrupted unexpectedly.
The git worktree at ${WORKTREE} is intact — your changes survived.
Last known phase: ${current_phase:-unknown}
Work so far:
${crash_diff}
Run: git log --oneline -5 && git status
Then resume from the last phase following the original phase protocol.
Phase file: ${PHASE_FILE}"

          if tmux new-session -d -s "${SESSION_NAME}" -c "${WORKTREE}" \
            "claude --dangerously-skip-permissions" 2>/dev/null; then
            inject_into_session "$recovery_msg"
            log "recovery session started"
            idle_elapsed=0
          else
            log "ERROR: could not restart session after crash"
            notify "session crashed and could not recover — needs human attention"
            _MONITOR_LOOP_EXIT="crash_recovery_failed"
            break
          fi
          # Skip phase dispatch this round; the recovered session needs time.
          continue
          ;;
      esac
    fi

    # --- Check phase file for changes ---
    # mtime 0 (stat failed / file missing) can never exceed LAST_PHASE_MTIME.
    phase_mtime=$(stat -c %Y "$phase_file" 2>/dev/null || echo 0)
    current_phase=$(read_phase)
    if [ -z "$current_phase" ] || [ "$phase_mtime" -le "$LAST_PHASE_MTIME" ]; then
      # No phase change — check idle timeout
      if [ "$idle_elapsed" -ge "$idle_timeout" ]; then
        log "TIMEOUT: no phase update for ${idle_timeout}s — killing session"
        kill_tmux_session
        _MONITOR_LOOP_EXIT="idle_timeout"
        break
      fi
      continue
    fi

    # Phase changed — update tracking state and dispatch to callback
    LAST_PHASE_MTIME="$phase_mtime"
    idle_elapsed=0
    log "phase: ${current_phase}"
    status "${current_phase}"
    if [ -n "$callback_fn" ]; then
      if ! "$callback_fn" "$current_phase"; then
        _MONITOR_LOOP_EXIT="callback_break"
        break
      fi
    fi
  done
}