#!/usr/bin/env bash
# agent-sdk.sh — Shared SDK for synchronous Claude agent invocations
#
# Provides agent_run(): one-shot `claude -p` with session persistence.
# Source this from any agent script after defining:
#   SID_FILE — path to persist session ID (e.g. /tmp/dev-session-proj-123.sid)
#   LOGFILE  — path for log output
#   log()    — logging function
#
# Usage:
#   source "$(dirname "$0")/../lib/agent-sdk.sh"
#   agent_run [--resume SESSION_ID] [--worktree DIR] PROMPT
#
# After each call, _AGENT_SESSION_ID holds the session ID (also saved to SID_FILE).
# Call agent_recover_session() on startup to restore a previous session.

# Strict mode. NOTE: this file is sourced, so these options also apply to the
# sourcing script.
set -euo pipefail

# Session ID of the most recent Claude invocation; empty until agent_run or
# agent_recover_session sets it.
_AGENT_SESSION_ID=""

# Raw stdout of the most recent Claude invocation (written by agent_run).
# Initialised here so reading it before the first run is safe under `set -u`.
_AGENT_LAST_OUTPUT=""

# agent_recover_session — restore the session ID persisted in SID_FILE.
#
# Call this before `agent_run --resume "$_AGENT_SESSION_ID"` to enable session
# continuity.
#
# Globals:  SID_FILE (read), _AGENT_SESSION_ID (written when a saved ID is read)
# Requires: log()
# Returns:  0 always. Recovery is best-effort: a missing or unreadable
#           SID_FILE leaves _AGENT_SESSION_ID untouched.
agent_recover_session() {
  local saved_sid

  # Nothing persisted yet — keep whatever session ID is currently set.
  [ -f "$SID_FILE" ] || return 0

  # Guard the read: a bare failing $(cat ...) assignment would abort the
  # sourcing script under `set -e` (e.g. permission problem, or the file
  # disappearing between the -f test and the read).
  if ! saved_sid=$(cat -- "$SID_FILE" 2>/dev/null); then
    log "agent_recover_session: cannot read ${SID_FILE}; continuing without a saved session"
    return 0
  fi

  _AGENT_SESSION_ID="$saved_sid"
  # Only a 12-character prefix of the ID is logged.
  log "agent_recover_session: ${_AGENT_SESSION_ID:0:12}..."
  return 0
}

# claude_run_with_watchdog — run claude with idle-after-final-message watchdog
#
# Mitigates upstream Claude Code hang (#591) by detecting when the final
# assistant message has been written and terminating the process after a
# short grace period instead of waiting for CLAUDE_TIMEOUT.
#
# The watchdog:
#   1. Streams claude stdout to a temp file
#   2. Polls for the final result marker ("type":"result" for stream-json
#      or closing } for regular json output)
#   3. After detecting the final marker, starts a CLAUDE_IDLE_GRACE countdown
#   4. SIGTERM claude if it hasn't exited cleanly within the grace period
#   5. Falls back to CLAUDE_TIMEOUT as the absolute hard ceiling
#
# Usage:   claude_run_with_watchdog claude [args...]
# Expects: LOGFILE, log(), CLAUDE_TIMEOUT (default 7200),
#          CLAUDE_IDLE_GRACE (default 30)
# Outputs: the command's captured stdout
# Returns: the command's own exit code;
#          0   if the watchdog had to terminate it after its final result
#              was already written (the output is complete);
#          124 if CLAUDE_TIMEOUT expired;
#          2   if called without a command; 1 if the temp file cannot be made.
claude_run_with_watchdog() {
  local -a cmd=("$@")
  local out_file killed_flag pid grace_pid
  local rc=0 cmd_rc=0

  if [ "$#" -eq 0 ]; then
    log "claude_run_with_watchdog: no command given"
    return 2
  fi

  # Temp file for stdout capture. Cleanup is done explicitly at the single
  # exit point below rather than via `trap ... RETURN`, which avoids leaving
  # a RETURN trap installed in the caller's shell.
  out_file=$(mktemp) || return 1
  # Created by the watchdog right before it signals the command, so we can
  # tell "killed after finishing" apart from a genuine failure.
  killed_flag="${out_file}.killed"

  # Start claude in background, capturing stdout to temp file
  "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
  pid=$!

  # Background watchdog: poll for final result marker.
  # Every `sleep` gets stdout redirected: when this subshell is killed its
  # sleep is orphaned, and an orphan holding the caller's $(...) pipe open
  # would stall the caller until the sleep finishes.
  (
    local grace="${CLAUDE_IDLE_GRACE:-30}"
    local detected=0

    while kill -0 "$pid" 2>/dev/null; do
      # Check for stream-json result marker first (more reliable)
      if grep -q '"type":"result"' "$out_file" 2>/dev/null; then
        detected=1
        break
      fi
      # Fallback: check for closing brace of top-level result object
      if tail -c 100 "$out_file" 2>/dev/null | grep -q '}[[:space:]]*$'; then
        # Verify it looks like a JSON result (has session_id or result key)
        if grep -qE '"(session_id|result)":' "$out_file" 2>/dev/null; then
          detected=1
          break
        fi
      fi
      sleep 2 >/dev/null
    done

    # If we detected a final message, wait grace period then kill if still running
    if [ "$detected" -eq 1 ] && kill -0 "$pid" 2>/dev/null; then
      log "watchdog: final result detected, ${grace}s grace period before SIGTERM"
      sleep "$grace" >/dev/null
      if kill -0 "$pid" 2>/dev/null; then
        log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
        { : > "$killed_flag"; } 2>/dev/null || true
        kill -TERM "$pid" 2>/dev/null || true
        # Give it a moment to clean up
        sleep 5 >/dev/null
        if kill -0 "$pid" 2>/dev/null; then
          log "watchdog: force kill after SIGTERM timeout"
          kill -KILL "$pid" 2>/dev/null || true
        fi
      fi
    fi
  ) &
  grace_pid=$!

  # Hard ceiling timeout — tail --pid blocks until the process is gone.
  # `|| rc=$?` keeps a 124 from aborting the function under `set -e` before
  # the cleanup below has run.
  timeout --foreground "${CLAUDE_TIMEOUT:-7200}" \
    tail --pid="$pid" -f /dev/null 2>/dev/null || rc=$?

  # Clean up the watchdog
  kill "$grace_pid" 2>/dev/null || true
  wait "$grace_pid" 2>/dev/null || true

  if [ "$rc" -eq 124 ]; then
    # tail --pid is a passive waiter, not a supervisor: on timeout the
    # command is still running and must be killed (and reaped) explicitly.
    kill "$pid" 2>/dev/null || true
    sleep 1
    kill -KILL "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true
  elif [ "$rc" -eq 0 ]; then
    # tail only reports that the process is gone; collect its real status.
    wait "$pid" 2>/dev/null || cmd_rc=$?
    # A watchdog kill happens only after the final result was written, so
    # that case stays a success; otherwise report the command's own status.
    if [ ! -e "$killed_flag" ]; then
      rc=$cmd_rc
    fi
  fi

  # Output the captured stdout
  cat -- "$out_file" || true
  rm -f -- "$out_file" "$killed_flag"
  return "$rc"
}

# _agent_claude_locked — run claude (through claude_run_with_watchdog) from
# DIR while holding an exclusive lock on LOCK_FILE; prints claude's stdout.
# Usage:   _agent_claude_locked DIR LOCK_FILE [claude args...]
# Returns: 1 if DIR cannot be entered or the lock is not obtained within
#          600s, otherwise the status of claude_run_with_watchdog.
_agent_claude_locked() {
  local dir="$1" lock_file="$2"
  shift 2
  (
    local inner_rc=0
    cd "$dir" || exit 1
    # flock cannot exec bash functions, so use its file-descriptor form: the
    # lock is taken on fd 9 (opened on LOCK_FILE by the redirection below)
    # and is held while this subshell runs claude.
    flock -w 600 9 || exit 1
    claude_run_with_watchdog claude "$@" || inner_rc=$?
    # Release explicitly: any lingering child that inherited fd 9 would
    # otherwise keep the lock alive after claude has finished.
    flock -u 9 || true
    exit "$inner_rc"
  ) 9>>"$lock_file" 2>>"$LOGFILE"
}

# _agent_log_failure — log a timeout or non-zero exit of a claude invocation.
# Usage: _agent_log_failure LABEL RC OUTPUT   (LABEL is "" or "nudge ")
_agent_log_failure() {
  local label="$1" rc="$2" output="$3"
  if [ "$rc" -eq 124 ]; then
    log "agent_run: ${label}timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)"
  elif [ "$rc" -ne 0 ]; then
    log "agent_run: ${label}claude exited with code $rc"
    # Log last 3 lines of output for diagnostics
    if [ -n "$output" ]; then
      log "agent_run: ${label}last output lines: $(printf '%s\n' "$output" | tail -3)"
    fi
  fi
  return 0
}

# _agent_store_result — keep OUTPUT for diagnostics and persist its session ID.
# Usage:   _agent_store_result OUTPUT DIAG_FILE
# Sets:    _AGENT_LAST_OUTPUT; _AGENT_SESSION_ID and SID_FILE when OUTPUT
#          carries a .session_id
# Returns: 0 if a session ID was found, 1 otherwise.
_agent_store_result() {
  local output="$1" diag_file="$2" sid=""

  # Save output for diagnostics (no_push, crashes). Best-effort: create the
  # directory first and keep any redirection error off the terminal.
  _AGENT_LAST_OUTPUT="$output"
  mkdir -p -- "$(dirname -- "$diag_file")" 2>/dev/null || true
  { printf '%s' "$output" > "$diag_file"; } 2>/dev/null || true

  sid=$(printf '%s' "$output" | jq -r '.session_id // empty' 2>/dev/null) || true
  [ -n "$sid" ] || return 1
  _AGENT_SESSION_ID="$sid"
  printf '%s' "$sid" > "$SID_FILE"
  return 0
}

# agent_run — synchronous Claude invocation (one-shot claude -p)
#
# Usage: agent_run [--resume SESSION_ID] [--worktree DIR] [--] PROMPT
#   --resume SESSION_ID  continue an existing session (empty value = new session)
#   --worktree DIR       run claude from DIR instead of the current directory
#   --                   end of options; required when PROMPT starts with "--"
#
# Globals: LOGFILE, SID_FILE, HOME (read); optional CLAUDE_MODEL,
#          CLAUDE_TIMEOUT, DISINTO_LOG_DIR, FORGE_REMOTE, PRIMARY_BRANCH
# Sets:    _AGENT_SESSION_ID (updated each call, persisted to SID_FILE),
#          _AGENT_LAST_OUTPUT (raw output of the last claude invocation)
# Returns: 2 when --resume/--worktree is missing its value, otherwise 0;
#          claude failures are logged, not returned.
agent_run() {
  local resume_id="" worktree_dir=""
  while [[ "${1:-}" == --* ]]; do
    case "$1" in
      --resume | --worktree)
        # A bare trailing option would make `shift` fail and, under
        # `set -e`, abort the caller without any explanation.
        if [ "$#" -lt 2 ]; then
          log "agent_run: option $1 requires a value"
          return 2
        fi
        if [ "$1" = "--resume" ]; then
          resume_id="$2"
        else
          worktree_dir="$2"
        fi
        shift 2
        ;;
      --)
        shift
        break
        ;;
      *)
        # Unknown options are still skipped, but no longer silently.
        log "agent_run: ignoring unknown option $1"
        shift
        ;;
    esac
  done
  local prompt="${1:-}"

  local -a args=(-p "$prompt" --output-format json --dangerously-skip-permissions --max-turns 200)
  if [ -n "$resume_id" ]; then
    args+=(--resume "$resume_id")
  fi
  if [ -n "${CLAUDE_MODEL:-}" ]; then
    args+=(--model "$CLAUDE_MODEL")
  fi

  local run_dir="${worktree_dir:-$(pwd)}"
  local lock_file="${HOME}/.claude/session.lock"
  local diag_file="${DISINTO_LOG_DIR:-/tmp}/dev/agent-run-last.json"
  local output rc
  mkdir -p "$(dirname "$lock_file")"

  log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})"
  output=$(_agent_claude_locked "$run_dir" "$lock_file" "${args[@]}") && rc=0 || rc=$?
  _agent_log_failure "" "$rc" "$output"
  if [ -z "$output" ]; then
    log "agent_run: empty output (claude may have crashed or failed, exit code: $rc)"
  fi

  # Extract and persist session_id
  if _agent_store_result "$output" "$diag_file"; then
    log "agent_run: session_id=${_AGENT_SESSION_ID:0:12}..."
  fi

  # Nudge: if the model stopped without pushing, resume with encouragement.
  # Some models emit end_turn prematurely when confused. A nudge often unsticks them.
  if [ -z "$_AGENT_SESSION_ID" ] || [ -z "$output" ]; then
    return 0
  fi

  local has_changes ahead_commits
  # First line of `git status --porcelain`: non-empty when the tree is dirty.
  has_changes=$(cd "$run_dir" && git status --porcelain 2>/dev/null | head -1) || true
  # First commit on HEAD that is not on the remote primary branch, if any.
  ahead_commits=$(cd "$run_dir" && git log --oneline "${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH:-main}..HEAD" 2>/dev/null | head -1) || true

  if [ -n "$ahead_commits" ]; then
    return 0
  fi
  if [ -z "$has_changes" ]; then
    log "agent_run: no push and no changes — skipping nudge"
    return 0
  fi

  # Nudge: there are uncommitted changes
  local nudge="You stopped but did not push any code. You have uncommitted changes. Commit them and push."
  local -a nudge_args=(-p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50)
  if [ -n "${CLAUDE_MODEL:-}" ]; then
    nudge_args+=(--model "$CLAUDE_MODEL")
  fi
  local nudge_rc
  log "agent_run: nudging (uncommitted changes)"
  output=$(_agent_claude_locked "$run_dir" "$lock_file" "${nudge_args[@]}") && nudge_rc=0 || nudge_rc=$?
  _agent_log_failure "nudge " "$nudge_rc" "$output"
  _agent_store_result "$output" "$diag_file" || true
  return 0
}