fix: bug: claude_run_with_watchdog leaks orphan bash children — review-pr.sh lock stuck for 47 min when Claude Bash-tool command hangs (#1055)
Fixes the orphan process issue by:

1. lib/agent-sdk.sh: Use setsid to run claude in a new process group
   - All children of claude inherit this process group
   - Changed all kill calls to target the process group with -PID syntax
   - Affected lines: setsid invocation, SIGTERM kill, SIGKILL kill, watchdog cleanup
2. review/review-pr.sh: Add defensive cleanup trap
   - Added cleanup_on_exit() trap that removes lockfile if we own it
   - Kills any residual children (e.g., bash -c from Claude's Bash tool)
   - Added explicit lockfile removal on all early-exit paths
   - Added lockfile removal on successful completion
3. tests/test-watchdog-process-group.sh: New test to verify orphan cleanup
   - Creates fake claude stub that spawns sleep 3600 child
   - Verifies all children are killed when watchdog fires

Acceptance criteria met:
- [x] setsid is used for the Claude invocation
- [x] All three kill call sites target the process group (-PID)
- [x] review/review-pr.sh has EXIT/INT/TERM trap for lockfile removal
- [x] shellcheck clean on all modified files
This commit is contained in:
parent
e9aed747b5
commit
f878427866
3 changed files with 176 additions and 14 deletions
|
|
@ -52,8 +52,9 @@ claude_run_with_watchdog() {
|
|||
out_file=$(mktemp) || return 1
|
||||
trap 'rm -f "$out_file"' RETURN
|
||||
|
||||
# Start claude in background, capturing stdout to temp file
|
||||
"${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
|
||||
# Start claude in a new session and process group (setsid makes it the group leader, so $pid is also the PGID)
|
||||
# All children of claude will inherit this process group
|
||||
setsid "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
|
||||
pid=$!
|
||||
|
||||
# Background watchdog: poll for final result marker
|
||||
|
|
@ -84,12 +85,12 @@ claude_run_with_watchdog() {
|
|||
sleep "$grace"
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
|
||||
kill -TERM "$pid" 2>/dev/null || true
|
||||
kill -TERM -- "-$pid" 2>/dev/null || true
|
||||
# Give it a moment to clean up
|
||||
sleep 5
|
||||
if kill -0 "$pid" 2>/dev/null; then
|
||||
log "watchdog: force kill after SIGTERM timeout"
|
||||
kill -KILL "$pid" 2>/dev/null || true
|
||||
kill -KILL -- "-$pid" 2>/dev/null || true
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
|
@ -100,16 +101,16 @@ claude_run_with_watchdog() {
|
|||
timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null
|
||||
rc=$?
|
||||
|
||||
# Clean up the watchdog
|
||||
kill "$grace_pid" 2>/dev/null || true
|
||||
# Clean up the watchdog (target process group if it spawned children)
|
||||
kill -- "-$grace_pid" 2>/dev/null || true
|
||||
wait "$grace_pid" 2>/dev/null || true
|
||||
|
||||
# When timeout fires (rc=124), explicitly kill the orphaned claude process
|
||||
# When timeout fires (rc=124), explicitly kill the orphaned claude process group
|
||||
# tail --pid is a passive waiter, not a supervisor
|
||||
if [ "$rc" -eq 124 ]; then
|
||||
kill "$pid" 2>/dev/null || true
|
||||
kill -TERM -- "-$pid" 2>/dev/null || true
|
||||
sleep 1
|
||||
kill -KILL "$pid" 2>/dev/null || true
|
||||
kill -KILL -- "-$pid" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Output the captured stdout
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue