fix: feat: StopFailure hook writes phase file on API error / rate limit (#275)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
openhands 2026-03-20 01:43:00 +00:00
parent 109758e86b
commit eaf2841494
4 changed files with 127 additions and 3 deletions

View file

@ -47,6 +47,7 @@ agent_inject_into_session() {
# Installs a Stop hook for idle detection (see monitor_phase_loop).
# Installs a PreToolUse hook to guard destructive Bash operations.
# Optionally installs a PostToolUse hook for phase file write detection.
# Optionally installs a StopFailure hook for immediate phase file update on API error.
# Args: session workdir [phase_file]
# Returns 0 if session is ready, 1 otherwise.
create_agent_session() {
@ -121,6 +122,38 @@ create_agent_session() {
fi
fi
# Install StopFailure hook for immediate phase file update on API error:
# when Claude hits a rate limit, server error, billing error, or auth failure,
# the hook writes PHASE:failed to the phase file and touches the phase-changed
# marker so monitor_phase_loop picks it up within one poll cycle instead of
# waiting for idle timeout (up to 2 hours).
# Skipped when no phase file was given or the hook script is not executable.
if [ -n "$phase_file" ]; then
  local stop_failure_hook_script="${FACTORY_ROOT}/lib/hooks/on-stop-failure.sh"
  if [ -x "$stop_failure_hook_script" ]; then
    # NOTE(review): the paths are joined unquoted, so a path containing
    # whitespace would be split when the hook command is executed. The string
    # is kept as-is because it is also the key of the duplicate check below.
    local stop_failure_hook_cmd="${stop_failure_hook_script} ${phase_file} ${phase_marker}"
    # Error types the hook should fire for (single definition for both paths).
    local stop_failure_matcher="rate_limit|server_error|authentication_failed|billing_error"
    # Always build the new settings in a temp file first: redirecting jq
    # straight into "$settings" would truncate it before jq has run.
    local stop_failure_tmp="${settings}.tmp"
    local stop_failure_rc=0
    if [ -f "$settings" ]; then
      # Append the hook entry unless an entry with the same command exists.
      jq --arg cmd "$stop_failure_hook_cmd" --arg matcher "$stop_failure_matcher" '
        if (.hooks.StopFailure // [] | any(.[]; .hooks[]?.command == $cmd))
        then .
        else .hooks.StopFailure = (.hooks.StopFailure // []) + [{
          matcher: $matcher,
          hooks: [{type: "command", command: $cmd}]
        }]
        end
      ' "$settings" > "$stop_failure_tmp" || stop_failure_rc=$?
    else
      # No settings file yet: create one holding only the StopFailure hook.
      jq -n --arg cmd "$stop_failure_hook_cmd" --arg matcher "$stop_failure_matcher" '{
        hooks: {
          StopFailure: [{
            matcher: $matcher,
            hooks: [{type: "command", command: $cmd}]
          }]
        }
      }' > "$stop_failure_tmp" || stop_failure_rc=$?
    fi
    # Install the result only if jq succeeded and produced output; otherwise
    # leave any existing settings untouched and drop the partial temp file.
    if [ "$stop_failure_rc" -eq 0 ] && [ -s "$stop_failure_tmp" ]; then
      mv -- "$stop_failure_tmp" "$settings"
    else
      rm -f -- "$stop_failure_tmp"
    fi
  fi
fi
# Install PreToolUse hook for destructive operation guard: blocks force push
# to primary branch, rm -rf outside worktree, direct API merge calls, and
# checkout/switch to primary branch. Claude sees the denial reason on exit 2

34
lib/hooks/on-stop-failure.sh Executable file
View file

@ -0,0 +1,34 @@
#!/bin/bash
# on-stop-failure.sh — StopFailure hook for immediate phase file update on API error.
#
# Called by Claude Code when a turn ends due to an API error (rate limit,
# server error, billing error, authentication failure). Writes PHASE:failed
# to the phase file and updates the phase-changed marker so the orchestrator
# picks up the failure within one poll cycle instead of waiting for idle
# timeout (up to 2 hours).
#
# Usage (in .claude/settings.json):
# {"type": "command", "command": "this-script /path/to/phase-file /path/to/marker"}
#
# Args:  $1 = phase file path, $2 = phase-changed marker path (optional)
# Stdin: hook JSON payload
# Exit:  0 when the phase file was written or no phase file was given;
#        non-zero when the phase file or the marker could not be written.

# Reads the hook JSON from stdin and prints the failure reason on one line.
# Prints "unknown" when jq is unavailable, the payload is not a JSON object,
# or none of the known fields is present.
stop_failure_reason() {
  local reason
  # .error is tried first; the remaining names are kept as fallbacks.
  # jq errors are silenced on purpose: a bad payload must not stop the hook.
  reason=$(jq -r '
    .error // .stop_reason // .matched_hook // .reason // .type // "unknown"
  ' 2>/dev/null)
  # Keep the reason on a single line so the phase file stays two lines long.
  reason=${reason//$'\n'/ }
  [ -z "$reason" ] && reason="unknown"
  printf '%s\n' "$reason"
}

# Entry point: records the failure in the phase file, then updates the marker.
main() {
  local phase_file="${1:-}"
  local marker_file="${2:-}"
  # Without a phase file there is nothing to record; stdin is left unread.
  [ -z "$phase_file" ] && return 0

  local input reason
  input=$(cat) # consume hook JSON from stdin
  reason=$(printf '%s' "$input" | stop_failure_reason)

  # Write phase file immediately — orchestrator reads first line as phase
  # sentinel. If the write fails the shell reports the error on stderr and
  # the marker is left alone, since no phase change actually happened.
  printf 'PHASE:failed\nReason: api_error: %s\n' "$reason" > "$phase_file" || return 1

  # Store the current epoch time in the marker so monitor_phase_loop picks
  # this up on the next poll cycle.
  if [ -n "$marker_file" ]; then
    date +%s > "$marker_file"
  fi
}

main "$@"