disinto/lib/agent-session.sh
johba d5c2c213a3 fix: bug: gardener hangs forever when Claude finishes without writing phase file (#261) (#263)
Fixes #261

## Changes
Fixed gardener hanging forever when Claude skips phase protocol. Three changes: (1) gardener-agent.sh: replaced 999999s timeout with 7200s (2h, matching dev-agent); (2) lib/agent-session.sh: added idle-prompt detection to monitor_phase_loop — if Claude returns to the ❯ prompt for 3 consecutive polls with no phase file written, exits immediately with _MONITOR_LOOP_EXIT=idle_prompt (only fires when phase file is empty, so awaiting_ci/review waits are unaffected); (3) gardener prompt: removed 'no time limit' wording, replaced with explicit phase-write requirement.

Co-authored-by: openhands <openhands@all-hands.dev>
Reviewed-on: https://codeberg.org/johba/disinto/pulls/263
Reviewed-by: Disinto_bot <disinto_bot@noreply.codeberg.org>
2026-03-19 13:47:10 +01:00

195 lines
7 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# agent-session.sh — Shared tmux + Claude interactive session helpers
#
# Source this into agent orchestrator scripts for reusable session management.
#
# Functions:
# agent_wait_for_claude_ready SESSION_NAME [TIMEOUT_SECS]
# agent_inject_into_session SESSION_NAME TEXT
# agent_kill_session SESSION_NAME
# monitor_phase_loop PHASE_FILE IDLE_TIMEOUT_SECS CALLBACK_FN [SESSION_NAME]
# Wait for the Claude ready prompt in a tmux pane.
# Returns 0 if ready within TIMEOUT_SECS (default 120), 1 otherwise.
agent_wait_for_claude_ready() {
local session="$1"
local timeout="${2:-120}"
local elapsed=0
while [ "$elapsed" -lt "$timeout" ]; do
if tmux capture-pane -t "$session" -p 2>/dev/null | grep -q ''; then
return 0
fi
sleep 2
elapsed=$((elapsed + 2))
done
return 1
}
# Paste TEXT into SESSION (waits for Claude to be ready first), then press Enter.
agent_inject_into_session() {
local session="$1"
local text="$2"
local tmpfile
agent_wait_for_claude_ready "$session" 120 || true
tmpfile=$(mktemp /tmp/agent-inject-XXXXXX)
printf '%s' "$text" > "$tmpfile"
tmux load-buffer -b "agent-inject-$$" "$tmpfile"
tmux paste-buffer -t "$session" -b "agent-inject-$$"
sleep 0.5
tmux send-keys -t "$session" "" Enter
tmux delete-buffer -b "agent-inject-$$" 2>/dev/null || true
rm -f "$tmpfile"
}
# Create a tmux session running Claude in the given workdir.
# Returns 0 if session is ready, 1 otherwise.
create_agent_session() {
local session="$1"
local workdir="${2:-.}"
tmux new-session -d -s "$session" -c "$workdir" \
"claude --dangerously-skip-permissions" 2>/dev/null
sleep 1
tmux has-session -t "$session" 2>/dev/null || return 1
agent_wait_for_claude_ready "$session" 120 || return 1
return 0
}
# Inject a prompt/formula into a session (alias for agent_inject_into_session).
inject_formula() {
agent_inject_into_session "$@"
}
# Monitor a phase file, calling a callback on changes and handling idle timeout.
# Sets _MONITOR_LOOP_EXIT to the exit reason (idle_timeout, idle_prompt, done, failed, break).
# Sets _MONITOR_SESSION to the resolved session name (arg 4 or $SESSION_NAME).
# Callbacks should reference _MONITOR_SESSION instead of $SESSION_NAME directly.
# Args: phase_file idle_timeout_secs callback_fn [session_name]
# session_name — tmux session to health-check; falls back to $SESSION_NAME global
#
# Idle prompt detection: if Claude returns to the prompt for 3 consecutive polls
# WITHOUT having written any phase signal, the session is killed and the callback is
# invoked with "PHASE:failed". This handles the case where Claude completes its work
# but skips the phase protocol entirely.
monitor_phase_loop() {
local phase_file="$1"
local idle_timeout="$2"
local callback="$3"
local _session="${4:-${SESSION_NAME:-}}"
# Export resolved session name so callbacks can reference it regardless of
# which session was passed to monitor_phase_loop (analogous to _MONITOR_LOOP_EXIT).
export _MONITOR_SESSION="$_session"
local poll_interval="${PHASE_POLL_INTERVAL:-10}"
local last_mtime=0
local idle_elapsed=0
local idle_pane_count=0
while true; do
sleep "$poll_interval"
idle_elapsed=$(( idle_elapsed + poll_interval ))
# Session health check
if ! tmux has-session -t "${_session}" 2>/dev/null; then
local current_phase
current_phase=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]' || true)
case "$current_phase" in
PHASE:done|PHASE:failed|PHASE:merged)
;; # terminal — fall through to phase handler
*)
# Call callback with "crashed" — let agent-specific code handle recovery
if type "${callback}" &>/dev/null; then
"$callback" "PHASE:crashed"
fi
# If callback didn't restart session, break
if ! tmux has-session -t "${_session}" 2>/dev/null; then
_MONITOR_LOOP_EXIT="crashed"
return 1
fi
idle_elapsed=0
idle_pane_count=0
continue
;;
esac
fi
# Check phase file for changes
local phase_mtime
phase_mtime=$(stat -c %Y "$phase_file" 2>/dev/null || echo 0)
local current_phase
current_phase=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]' || true)
if [ -z "$current_phase" ] || [ "$phase_mtime" -le "$last_mtime" ]; then
# No phase change — check idle timeout
if [ "$idle_elapsed" -ge "$idle_timeout" ]; then
_MONITOR_LOOP_EXIT="idle_timeout"
agent_kill_session "${_session}"
return 0
fi
# Idle prompt detection: Claude finished without writing a phase signal.
# Only fires when current_phase is empty (no phase ever written).
# Note: tmux capture-pane captures the full visible pane area, not just the
# last line. Prior tool output containing (e.g. a zsh subshell prompt in
# Claude's output) could trigger a false positive — the same risk exists in
# agent_wait_for_claude_ready(). Requiring 3 consecutive polls (≥2 poll
# intervals of sustained idle) reduces but does not eliminate this risk.
if [ -z "$current_phase" ] && tmux has-session -t "${_session}" 2>/dev/null && \
tmux capture-pane -t "${_session}" -p 2>/dev/null | grep -q ''; then
idle_pane_count=$(( idle_pane_count + 1 ))
if [ "$idle_pane_count" -ge 3 ]; then
_MONITOR_LOOP_EXIT="idle_prompt"
# Session is already killed before the callback is invoked.
# Callbacks that handle PHASE:failed must not assume the session is alive.
agent_kill_session "${_session}"
if type "${callback}" &>/dev/null; then
"$callback" "PHASE:failed"
fi
return 0
fi
else
idle_pane_count=0
fi
continue
fi
# Phase changed
last_mtime="$phase_mtime"
# shellcheck disable=SC2034 # read by phase-handler.sh callback
LAST_PHASE_MTIME="$phase_mtime"
idle_elapsed=0
# Terminal phases
case "$current_phase" in
PHASE:done|PHASE:merged)
_MONITOR_LOOP_EXIT="done"
if type "${callback}" &>/dev/null; then
"$callback" "$current_phase"
fi
return 0
;;
PHASE:failed|PHASE:needs_human)
_MONITOR_LOOP_EXIT="$current_phase"
if type "${callback}" &>/dev/null; then
"$callback" "$current_phase"
fi
return 0
;;
esac
# Non-terminal phase — call callback
if type "${callback}" &>/dev/null; then
"$callback" "$current_phase"
fi
done
}
# Kill a tmux session gracefully (no-op if not found).
agent_kill_session() {
local session="${1:-}"
[ -n "$session" ] && tmux kill-session -t "$session" 2>/dev/null || true
}
# Read the current phase from a phase file, stripped of whitespace.
# Usage: read_phase [file] — defaults to $PHASE_FILE
read_phase() {
local file="${1:-${PHASE_FILE:-}}"
{ cat "$file" 2>/dev/null || true; } | head -1 | tr -d '[:space:]'
}