diff --git a/lib/agent-sdk.sh b/lib/agent-sdk.sh
index 1c1a69c..ca38dca 100644
--- a/lib/agent-sdk.sh
+++ b/lib/agent-sdk.sh
@@ -27,6 +27,96 @@ agent_recover_session() {
   fi
 }
 
+# claude_run_with_watchdog — run claude with idle-after-final-message watchdog
+#
+# Mitigates upstream Claude Code hang (#591) by detecting when the final
+# assistant message has been written and terminating the process after a
+# short grace period instead of waiting for CLAUDE_TIMEOUT.
+#
+# The watchdog:
+#   1. Streams claude stdout to a temp file
+#   2. Polls for the final result marker ("type":"result" for stream-json
+#      or closing } for regular json output)
+#   3. After detecting the final marker, starts a CLAUDE_IDLE_GRACE countdown
+#   4. SIGTERM claude if it hasn't exited cleanly within the grace period
+#   5. Falls back to CLAUDE_TIMEOUT as the absolute hard ceiling
+#
+# Usage: claude_run_with_watchdog claude [args...]
+# Expects: LOGFILE, CLAUDE_TIMEOUT, CLAUDE_IDLE_GRACE (default 30)
+# Returns: exit code from claude or timeout
+claude_run_with_watchdog() {
+  local -a cmd=("$@")
+  local out_file pid grace_pid rc
+
+  # Create temp file for stdout capture
+  out_file=$(mktemp) || return 1
+  trap 'rm -f "$out_file"' RETURN
+
+  # Start claude in background, capturing stdout to temp file
+  "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
+  pid=$!
+
+  # Background watchdog: poll for final result marker
+  (
+    local grace="${CLAUDE_IDLE_GRACE:-30}"
+    local detected=0
+
+    while kill -0 "$pid" 2>/dev/null; do
+      # Check for stream-json result marker first (more reliable)
+      if grep -q '"type":"result"' "$out_file" 2>/dev/null; then
+        detected=1
+        break
+      fi
+      # Fallback: check for closing brace of top-level result object
+      if tail -c 100 "$out_file" 2>/dev/null | grep -q '}[[:space:]]*$'; then
+        # Verify it looks like a JSON result (has session_id or result key)
+        if grep -qE '"(session_id|result)":' "$out_file" 2>/dev/null; then
+          detected=1
+          break
+        fi
+      fi
+      sleep 2
+    done
+
+    # If we detected a final message, wait grace period then kill if still running
+    if [ "$detected" -eq 1 ] && kill -0 "$pid" 2>/dev/null; then
+      log "watchdog: final result detected, ${grace}s grace period before SIGTERM"
+      sleep "$grace"
+      if kill -0 "$pid" 2>/dev/null; then
+        log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
+        kill -TERM "$pid" 2>/dev/null || true
+        # Give it a moment to clean up
+        sleep 5
+        if kill -0 "$pid" 2>/dev/null; then
+          log "watchdog: force kill after SIGTERM timeout"
+          kill -KILL "$pid" 2>/dev/null || true
+        fi
+      fi
+    fi
+  ) &
+  grace_pid=$!
+
+  # Hard ceiling timeout (existing behavior) — use tail --pid to wait for process
+  timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null
+  rc=$?
+
+  # Clean up the watchdog
+  kill "$grace_pid" 2>/dev/null || true
+  wait "$grace_pid" 2>/dev/null || true
+
+  # When timeout fires (rc=124), explicitly kill the orphaned claude process
+  # tail --pid is a passive waiter, not a supervisor
+  if [ "$rc" -eq 124 ]; then
+    kill "$pid" 2>/dev/null || true
+    sleep 1
+    kill -KILL "$pid" 2>/dev/null || true
+  fi
+
+  # Output the captured stdout
+  cat "$out_file"
+  return "$rc"
+}
+
 # agent_run — synchronous Claude invocation (one-shot claude -p)
 # Usage: agent_run [--resume SESSION_ID] [--worktree DIR] PROMPT
 # Sets: _AGENT_SESSION_ID (updated each call, persisted to SID_FILE)
@@ -50,7 +140,8 @@ agent_run() {
   mkdir -p "$(dirname "$lock_file")"
   local output rc
   log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})"
-  output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$?
+  # Acquire lock separately (flock cannot exec bash functions)
+  output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude "${args[@]}" ) 9>"$lock_file" 2>>"$LOGFILE") && rc=0 || rc=$?
   if [ "$rc" -eq 124 ]; then
     log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)"
   elif [ "$rc" -ne 0 ]; then
@@ -91,7 +182,7 @@ agent_run() {
         local nudge="You stopped but did not push any code. You have uncommitted changes. Commit them and push."
         log "agent_run: nudging (uncommitted changes)"
         local nudge_rc
-        output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
+        output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} ) 9>"$lock_file" 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
         if [ "$nudge_rc" -eq 124 ]; then
           log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $nudge_rc)"
         elif [ "$nudge_rc" -ne 0 ]; then
diff --git a/lib/formula-session.sh b/lib/formula-session.sh
index bf44ce7..450b655 100644
--- a/lib/formula-session.sh
+++ b/lib/formula-session.sh
@@ -28,7 +28,10 @@
 #   ops_commit_and_push MESSAGE [FILES]    — commit/push to ops repo
 #   cleanup_stale_crashed_worktrees [HOURS] — thin wrapper around worktree_cleanup_stale
 #
-# Requires: lib/env.sh, lib/worktree.sh sourced first for shared helpers.
+# Requires: lib/env.sh, lib/worktree.sh, lib/agent-sdk.sh sourced first for shared helpers.
+
+# Source agent-sdk for claude_run_with_watchdog watchdog helper
+source "$(dirname "${BASH_SOURCE[0]}")/agent-sdk.sh"
 
 # ── Run guards ───────────────────────────────────────────────────────────
 
@@ -248,7 +251,7 @@ Write the complete, rewritten lessons-learned.md content below. No preamble, no
 
   # Run claude -p one-shot with same model as agent
   local output
-  output=$(claude -p "$digest_prompt" \
+  output=$(claude_run_with_watchdog claude -p "$digest_prompt" \
     --output-format json \
     --dangerously-skip-permissions \
     ${model:+--model "$model"} \
@@ -442,7 +445,7 @@ Write the journal entry below. Use markdown format."
 
   # Run claude -p one-shot with same model as agent
   local output
-  output=$(claude -p "$reflection_prompt" \
+  output=$(claude_run_with_watchdog claude -p "$reflection_prompt" \
     --output-format json \
     --dangerously-skip-permissions \
     ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} \