From eaf2841494188584344cee518929eaeab3106354 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 20 Mar 2026 01:43:00 +0000 Subject: [PATCH 1/2] fix: feat: StopFailure hook writes phase file on API error / rate limit (#275) Co-Authored-By: Claude Opus 4.6 (1M context) --- AGENTS.md | 2 +- dev/phase-test.sh | 61 ++++++++++++++++++++++++++++++++++-- lib/agent-session.sh | 33 +++++++++++++++++++ lib/hooks/on-stop-failure.sh | 34 ++++++++++++++++++++ 4 files changed, 127 insertions(+), 3 deletions(-) create mode 100755 lib/hooks/on-stop-failure.sh diff --git a/AGENTS.md b/AGENTS.md index d58f2d4..e843bde 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -267,7 +267,7 @@ sourced as needed. | `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `CODEBERG_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, Matrix config, etc.). | env.sh (when `PROJECT_TOML` is set), supervisor-poll (per-project iteration) | | `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` patterns. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll, supervisor-poll | | `lib/matrix_listener.sh` | Long-poll Matrix sync daemon. Dispatches thread replies to the correct agent via well-known files (`/tmp/{agent}-escalation-reply`). Handles supervisor, gardener, dev, review, vault, and action reply routing. Run as systemd service. | Standalone daemon | -| `lib/agent-session.sh` | Shared tmux + Claude session helpers: `create_agent_session()`, `inject_formula()`, `agent_wait_for_claude_ready()`, `agent_inject_into_session()`, `agent_kill_session()`, `monitor_phase_loop()`, `read_phase()`. `create_agent_session(session, workdir, [phase_file])` optionally installs a PostToolUse hook (matcher `Bash\|Write`) that detects phase file writes in real-time — when Claude writes to the phase file, the hook writes a marker so `monitor_phase_loop` reacts on the next poll instead of waiting for mtime changes. When `MATRIX_THREAD_ID` is exported, also installs a Stop hook (`on-stop-matrix.sh`) that streams each Claude response to the Matrix thread. `monitor_phase_loop` sets `_MONITOR_LOOP_EXIT` to one of: `done`, `idle_timeout`, `idle_prompt` (Claude returned to `❯` for 3 consecutive polls without writing any phase — callback invoked with `PHASE:failed`, session already dead), `crashed`, or a `PHASE:*` string. Agents must handle `idle_prompt` in both their callback and their post-loop exit handler. | dev-agent.sh, gardener-agent.sh, action-agent.sh | +| `lib/agent-session.sh` | Shared tmux + Claude session helpers: `create_agent_session()`, `inject_formula()`, `agent_wait_for_claude_ready()`, `agent_inject_into_session()`, `agent_kill_session()`, `monitor_phase_loop()`, `read_phase()`. `create_agent_session(session, workdir, [phase_file])` optionally installs a PostToolUse hook (matcher `Bash\|Write`) that detects phase file writes in real-time — when Claude writes to the phase file, the hook writes a marker so `monitor_phase_loop` reacts on the next poll instead of waiting for mtime changes. Also installs a StopFailure hook (matcher `rate_limit\|server_error\|authentication_failed\|billing_error`) that writes `PHASE:failed` with an `api_error` reason to the phase file and touches the phase-changed marker, so the orchestrator discovers API errors within one poll cycle instead of waiting for idle timeout. When `MATRIX_THREAD_ID` is exported, also installs a Stop hook (`on-stop-matrix.sh`) that streams each Claude response to the Matrix thread. `monitor_phase_loop` sets `_MONITOR_LOOP_EXIT` to one of: `done`, `idle_timeout`, `idle_prompt` (Claude returned to `❯` for 3 consecutive polls without writing any phase — callback invoked with `PHASE:failed`, session already dead), `crashed`, or a `PHASE:*` string. Agents must handle `idle_prompt` in both their callback and their post-loop exit handler. | dev-agent.sh, gardener-agent.sh, action-agent.sh | --- diff --git a/dev/phase-test.sh b/dev/phase-test.sh index 438a80c..6c07c6c 100755 --- a/dev/phase-test.sh +++ b/dev/phase-test.sh @@ -226,7 +226,64 @@ else fail "PostToolUse hook script not found or not executable: $HOOK_SCRIPT" fi -# ── Test 10: phase-changed marker resets mtime guard ───────────────────── +# ── Test 10: StopFailure hook writes phase file and marker on API error ─── +STOP_FAILURE_HOOK="$(dirname "$0")/../lib/hooks/on-stop-failure.sh" +SF_MARKER="/tmp/phase-changed-test-sf.marker" +rm -f "$SF_MARKER" "$PHASE_FILE" + +if [ -x "$STOP_FAILURE_HOOK" ]; then + # 10a: rate_limit stop reason → PHASE:failed with api_error reason + printf '{"stop_reason":"rate_limit"}' \ + | "$STOP_FAILURE_HOOK" "$PHASE_FILE" "$SF_MARKER" + sf_first=$(head -1 "$PHASE_FILE" 2>/dev/null) + sf_second=$(sed -n '2p' "$PHASE_FILE" 2>/dev/null) + if [ "$sf_first" = "PHASE:failed" ] && echo "$sf_second" | grep -q "api_error: rate_limit"; then + ok "StopFailure hook writes PHASE:failed with api_error: rate_limit" + else + fail "StopFailure hook phase file: first='$sf_first' second='$sf_second'" + fi + if [ -f "$SF_MARKER" ]; then + ok "StopFailure hook writes phase-changed marker" + else + fail "StopFailure hook did not write phase-changed marker" + fi + rm -f "$SF_MARKER" "$PHASE_FILE" + + # 10b: server_error stop reason + printf '{"stop_reason":"server_error"}' \ + | "$STOP_FAILURE_HOOK" "$PHASE_FILE" "$SF_MARKER" + sf_second=$(sed -n '2p' "$PHASE_FILE" 2>/dev/null) + if echo "$sf_second" | grep -q "api_error: server_error"; then + ok "StopFailure hook writes api_error: server_error" + else + fail "StopFailure hook server_error: got '$sf_second'" + fi + rm -f "$SF_MARKER" "$PHASE_FILE" + + # 10c: missing phase_file arg → no-op (exit 0, no crash) + printf '{"stop_reason":"rate_limit"}' | "$STOP_FAILURE_HOOK" "" "$SF_MARKER" + if [ ! -f "$PHASE_FILE" ]; then + ok "StopFailure hook no-ops when phase_file is empty" + else + fail "StopFailure hook should not write when phase_file is empty" + fi + rm -f "$SF_MARKER" + + # 10d: missing marker arg → phase file still written, no marker + printf '{"stop_reason":"billing_error"}' \ + | "$STOP_FAILURE_HOOK" "$PHASE_FILE" "" + sf_first=$(head -1 "$PHASE_FILE" 2>/dev/null) + if [ "$sf_first" = "PHASE:failed" ] && [ ! -f "$SF_MARKER" ]; then + ok "StopFailure hook writes phase without marker when marker arg is empty" + else + fail "StopFailure hook: first='$sf_first' marker_exists=$([ -f "$SF_MARKER" ] && echo yes || echo no)" + fi + rm -f "$PHASE_FILE" +else + fail "StopFailure hook script not found or not executable: $STOP_FAILURE_HOOK" +fi + +# ── Test 11: phase-changed marker resets mtime guard ───────────────────── # Simulates monitor_phase_loop behavior: when marker exists, last_mtime # is reset to 0 so the phase is processed even if mtime hasn't changed. echo "PHASE:awaiting_ci" > "$PHASE_FILE" @@ -241,7 +298,7 @@ else fi # Now simulate marker present — reset last_mtime to 0 -MARKER_FILE="/tmp/phase-changed-test-session.marker" +MARKER_FILE="/tmp/phase-changed-test-mtime.marker" date +%s > "$MARKER_FILE" if [ -f "$MARKER_FILE" ]; then rm -f "$MARKER_FILE" diff --git a/lib/agent-session.sh b/lib/agent-session.sh index 599b2c0..04638bb 100644 --- a/lib/agent-session.sh +++ b/lib/agent-session.sh @@ -47,6 +47,7 @@ agent_inject_into_session() { # Installs a Stop hook for idle detection (see monitor_phase_loop). # Installs a PreToolUse hook to guard destructive Bash operations. # Optionally installs a PostToolUse hook for phase file write detection. +# Optionally installs a StopFailure hook for immediate phase file update on API error. # Args: session workdir [phase_file] # Returns 0 if session is ready, 1 otherwise. create_agent_session() { @@ -121,6 +122,38 @@ create_agent_session() { fi fi + # Install StopFailure hook for immediate phase file update on API error: + # when Claude hits a rate limit, server error, billing error, or auth failure, + # the hook writes PHASE:failed to the phase file and touches the phase-changed + # marker so monitor_phase_loop picks it up within one poll cycle instead of + # waiting for idle timeout (up to 2 hours). + if [ -n "$phase_file" ]; then + local stop_failure_hook_script="${FACTORY_ROOT}/lib/hooks/on-stop-failure.sh" + if [ -x "$stop_failure_hook_script" ]; then + local stop_failure_hook_cmd="${stop_failure_hook_script} ${phase_file} ${phase_marker}" + if [ -f "$settings" ]; then + jq --arg cmd "$stop_failure_hook_cmd" ' + if (.hooks.StopFailure // [] | any(.[]; .hooks[]?.command == $cmd)) + then . + else .hooks.StopFailure = (.hooks.StopFailure // []) + [{ + matcher: "rate_limit|server_error|authentication_failed|billing_error", + hooks: [{type: "command", command: $cmd}] + }] + end + ' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings" + else + jq -n --arg cmd "$stop_failure_hook_cmd" '{ + hooks: { + StopFailure: [{ + matcher: "rate_limit|server_error|authentication_failed|billing_error", + hooks: [{type: "command", command: $cmd}] + }] + } + }' > "$settings" + fi + fi + fi + # Install PreToolUse hook for destructive operation guard: blocks force push # to primary branch, rm -rf outside worktree, direct API merge calls, and # checkout/switch to primary branch. Claude sees the denial reason on exit 2 diff --git a/lib/hooks/on-stop-failure.sh b/lib/hooks/on-stop-failure.sh new file mode 100755 index 0000000..01fb484 --- /dev/null +++ b/lib/hooks/on-stop-failure.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# on-stop-failure.sh — StopFailure hook for immediate phase file update on API error. +# +# Called by Claude Code when a turn ends due to an API error (rate limit, +# server error, billing error, authentication failure). Writes PHASE:failed +# to the phase file and touches the phase-changed marker so the orchestrator +# picks up the failure within one poll cycle instead of waiting for idle +# timeout (up to 2 hours). +# +# Usage (in .claude/settings.json): +# {"type": "command", "command": "this-script /path/to/phase-file /path/to/marker"} +# +# Args: $1 = phase file path, $2 = phase-changed marker path + +phase_file="${1:-}" +marker_file="${2:-}" + +[ -z "$phase_file" ] && exit 0 + +input=$(cat) # consume hook JSON from stdin + +# Extract the stop reason from the hook payload +reason=$(printf '%s' "$input" | jq -r ' + .stop_reason // .matched_hook // .reason // .type // "unknown" +' 2>/dev/null) +[ -z "$reason" ] && reason="unknown" + +# Write phase file immediately — orchestrator reads first line as phase sentinel +printf 'PHASE:failed\nReason: api_error: %s\n' "$reason" > "$phase_file" + +# Touch marker so monitor_phase_loop picks this up on the next poll cycle +if [ -n "$marker_file" ]; then + date +%s > "$marker_file" +fi From f66fcd666cc5a0bb7a71aff6363af1db1e06fdf4 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 20 Mar 2026 01:52:46 +0000 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20review=20=E2=80=94=20termi?= =?UTF-8?q?nal=20phase=20guard,=20explicit=20marker=20var,=20test=20covera?= =?UTF-8?q?ge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Guard against overwriting terminal phases (PHASE:done, PHASE:merged) in on-stop-failure.sh to prevent false failures from same-turn race - Declare sf_phase_marker explicitly in StopFailure block instead of relying on phase_marker from PostToolUse block - Add authentication_failed test (10c) and terminal phase guard tests (10g, 10h) - Fix fragile nested command substitution in test 10f fail() message Co-Authored-By: Claude Opus 4.6 (1M context) --- dev/phase-test.sh | 45 ++++++++++++++++++++++++++++++++---- lib/agent-session.sh | 5 +++- lib/hooks/on-stop-failure.sh | 8 +++++++ 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/dev/phase-test.sh b/dev/phase-test.sh index 6c07c6c..15c59b8 100755 --- a/dev/phase-test.sh +++ b/dev/phase-test.sh @@ -260,7 +260,18 @@ if [ -x "$STOP_FAILURE_HOOK" ]; then fi rm -f "$SF_MARKER" "$PHASE_FILE" - # 10c: missing phase_file arg → no-op (exit 0, no crash) + # 10c: authentication_failed stop reason + printf '{"stop_reason":"authentication_failed"}' \ + | "$STOP_FAILURE_HOOK" "$PHASE_FILE" "$SF_MARKER" + sf_second=$(sed -n '2p' "$PHASE_FILE" 2>/dev/null) + if echo "$sf_second" | grep -q "api_error: authentication_failed"; then + ok "StopFailure hook writes api_error: authentication_failed" + else + fail "StopFailure hook authentication_failed: got '$sf_second'" + fi + rm -f "$SF_MARKER" "$PHASE_FILE" + + # 10e: missing phase_file arg → no-op (exit 0, no crash) printf '{"stop_reason":"rate_limit"}' | "$STOP_FAILURE_HOOK" "" "$SF_MARKER" if [ ! -f "$PHASE_FILE" ]; then ok "StopFailure hook no-ops when phase_file is empty" @@ -269,16 +280,42 @@ if [ -x "$STOP_FAILURE_HOOK" ]; then fi rm -f "$SF_MARKER" - # 10d: missing marker arg → phase file still written, no marker + # 10f: missing marker arg → phase file still written, no marker printf '{"stop_reason":"billing_error"}' \ | "$STOP_FAILURE_HOOK" "$PHASE_FILE" "" sf_first=$(head -1 "$PHASE_FILE" 2>/dev/null) - if [ "$sf_first" = "PHASE:failed" ] && [ ! -f "$SF_MARKER" ]; then + sf_marker_exists="no" + [ -f "$SF_MARKER" ] && sf_marker_exists="yes" + if [ "$sf_first" = "PHASE:failed" ] && [ "$sf_marker_exists" = "no" ]; then ok "StopFailure hook writes phase without marker when marker arg is empty" else - fail "StopFailure hook: first='$sf_first' marker_exists=$([ -f "$SF_MARKER" ] && echo yes || echo no)" + fail "StopFailure hook: first='$sf_first' marker_exists=$sf_marker_exists" fi rm -f "$PHASE_FILE" + + # 10g: terminal phase guard — does not overwrite PHASE:done + echo "PHASE:done" > "$PHASE_FILE" + printf '{"stop_reason":"rate_limit"}' \ + | "$STOP_FAILURE_HOOK" "$PHASE_FILE" "$SF_MARKER" + sf_first=$(head -1 "$PHASE_FILE" 2>/dev/null) + if [ "$sf_first" = "PHASE:done" ] && [ ! -f "$SF_MARKER" ]; then + ok "StopFailure hook does not overwrite terminal PHASE:done" + else + fail "StopFailure hook overwrote PHASE:done: first='$sf_first'" + fi + rm -f "$SF_MARKER" "$PHASE_FILE" + + # 10h: terminal phase guard — does not overwrite PHASE:merged + echo "PHASE:merged" > "$PHASE_FILE" + printf '{"stop_reason":"server_error"}' \ + | "$STOP_FAILURE_HOOK" "$PHASE_FILE" "$SF_MARKER" + sf_first=$(head -1 "$PHASE_FILE" 2>/dev/null) + if [ "$sf_first" = "PHASE:merged" ] && [ ! -f "$SF_MARKER" ]; then + ok "StopFailure hook does not overwrite terminal PHASE:merged" + else + fail "StopFailure hook overwrote PHASE:merged: first='$sf_first'" + fi + rm -f "$SF_MARKER" "$PHASE_FILE" else fail "StopFailure hook script not found or not executable: $STOP_FAILURE_HOOK" fi diff --git a/lib/agent-session.sh b/lib/agent-session.sh index 04638bb..2377f7e 100644 --- a/lib/agent-session.sh +++ b/lib/agent-session.sh @@ -130,7 +130,10 @@ create_agent_session() { if [ -n "$phase_file" ]; then local stop_failure_hook_script="${FACTORY_ROOT}/lib/hooks/on-stop-failure.sh" if [ -x "$stop_failure_hook_script" ]; then - local stop_failure_hook_cmd="${stop_failure_hook_script} ${phase_file} ${phase_marker}" + # phase_marker is defined in the PostToolUse block above; redeclare so + # this block is self-contained if that block is ever removed. + local sf_phase_marker="/tmp/phase-changed-${session}.marker" + local stop_failure_hook_cmd="${stop_failure_hook_script} ${phase_file} ${sf_phase_marker}" if [ -f "$settings" ]; then jq --arg cmd "$stop_failure_hook_cmd" ' if (.hooks.StopFailure // [] | any(.[]; .hooks[]?.command == $cmd)) diff --git a/lib/hooks/on-stop-failure.sh b/lib/hooks/on-stop-failure.sh index 01fb484..9c61648 100755 --- a/lib/hooks/on-stop-failure.sh +++ b/lib/hooks/on-stop-failure.sh @@ -25,6 +25,14 @@ reason=$(printf '%s' "$input" | jq -r ' ' 2>/dev/null) [ -z "$reason" ] && reason="unknown" +# Guard: do not overwrite a terminal phase. If Claude wrote PHASE:done via a +# tool call and then hit a rate limit while generating the rest of its response, +# the PostToolUse hook already recorded the correct terminal phase. +existing=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]') +case "$existing" in + PHASE:done|PHASE:merged) exit 0 ;; +esac + # Write phase file immediately — orchestrator reads first line as phase sentinel printf 'PHASE:failed\nReason: api_error: %s\n' "$reason" > "$phase_file"