fix: bug: claude_run_with_watchdog leaks orphan bash children — review-pr.sh lock stuck for 47 min when Claude Bash-tool command hangs (#1055)
Fixes orphan process issue by: 1. lib/agent-sdk.sh: Use setsid to run claude in a new process group - All children of claude inherit this process group - Changed all kill calls to target the process group with -PID syntax - Affected lines: setsid invocation, SIGTERM kill, SIGKILL kill, watchdog cleanup 2. review/review-pr.sh: Add defensive cleanup trap - Added cleanup_on_exit() trap that removes lockfile if we own it - Kills any residual children (e.g., bash -c from Claude's Bash tool) - Added explicit lockfile removal on all early-exit paths - Added lockfile removal on successful completion 3. tests/test-watchdog-process-group.sh: New test to verify orphan cleanup - Creates fake claude stub that spawns sleep 3600 child - Verifies all children are killed when watchdog fires Acceptance criteria met: - [x] setsid is used for the Claude invocation - [x] All three kill call sites target the process group (-PID) - [x] review/review-pr.sh has EXIT/INT/TERM trap for lockfile removal - [x] shellcheck clean on all modified files
This commit is contained in:
parent
e9aed747b5
commit
f878427866
3 changed files with 176 additions and 14 deletions
|
|
@ -52,8 +52,35 @@ REVIEW_TMPDIR=$(mktemp -d)
|
|||
|
||||
log() { printf '[%s] PR#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" >> "$LOGFILE"; }
|
||||
status() { printf '[%s] PR #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" > "$STATUSFILE"; log "$*"; }
|
||||
cleanup() { rm -rf "$REVIEW_TMPDIR" "$LOCKFILE" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"; }
|
||||
trap cleanup EXIT
|
||||
|
||||
# cleanup — remove temp files (NOT lockfile — cleanup_on_exit handles that)
|
||||
cleanup() {
|
||||
rm -rf "$REVIEW_TMPDIR" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"
|
||||
}
|
||||
|
||||
# cleanup_on_exit — defensive cleanup: remove lockfile if we own it, kill residual children
|
||||
# This handles the case where review-pr.sh is terminated unexpectedly (e.g., watchdog SIGTERM)
|
||||
cleanup_on_exit() {
|
||||
local ec=$?
|
||||
# Remove lockfile only if we own it (PID matches $$)
|
||||
if [ -f "$LOCKFILE" ] && [ -n "$(cat "$LOCKFILE" 2>/dev/null)" ]; then
|
||||
if [ "$(cat "$LOCKFILE" 2>/dev/null)" = "$$" ]; then
|
||||
rm -f "$LOCKFILE"
|
||||
log "cleanup_on_exit: removed lockfile (we owned it)"
|
||||
fi
|
||||
fi
|
||||
# Kill any direct children that may have been spawned by this process
|
||||
# (e.g., bash -c commands from Claude's Bash tool that didn't get reaped)
|
||||
pkill -P $$ 2>/dev/null || true
|
||||
# Call the main cleanup function to remove temp files
|
||||
cleanup
|
||||
exit "$ec"
|
||||
}
|
||||
trap cleanup_on_exit EXIT INT TERM
|
||||
|
||||
# Note: EXIT trap is already set above. The cleanup function is still available for
|
||||
# non-error exits (e.g., normal completion via exit 0 after verdict posted).
|
||||
# When review succeeds, we want to skip lockfile removal since the verdict was posted.
|
||||
|
||||
# =============================================================================
|
||||
# LOG ROTATION
|
||||
|
|
@ -104,6 +131,7 @@ if [ "$PR_STATE" != "open" ]; then
|
|||
log "SKIP: state=${PR_STATE}"
|
||||
worktree_cleanup "$WORKTREE"
|
||||
rm -f "$OUTPUT_FILE" "$SID_FILE" 2>/dev/null || true
|
||||
rm -f "$LOCKFILE"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
|
|
@ -113,7 +141,7 @@ fi
|
|||
CI_STATE=$(ci_commit_status "$PR_SHA")
|
||||
CI_NOTE=""
|
||||
if ! ci_passed "$CI_STATE"; then
|
||||
ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; exit 0; }
|
||||
ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; rm -f "$LOCKFILE"; exit 0; }
|
||||
CI_NOTE=" (not required — non-code PR)"
|
||||
fi
|
||||
|
||||
|
|
@ -123,10 +151,10 @@ fi
|
|||
ALL_COMMENTS=$(forge_api_all "/issues/${PR_NUMBER}/comments")
|
||||
HAS_CMT=$(printf '%s' "$ALL_COMMENTS" | jq --arg s "$PR_SHA" \
|
||||
'[.[]|select(.body|contains("<!-- reviewed: "+$s+" -->"))]|length')
|
||||
[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; exit 0; }
|
||||
[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; rm -f "$LOCKFILE"; exit 0; }
|
||||
HAS_FML=$(forge_api_all "/pulls/${PR_NUMBER}/reviews" | jq --arg s "$PR_SHA" \
|
||||
'[.[]|select(.commit_id==$s)|select(.state!="COMMENT")]|length')
|
||||
[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; exit 0; }
|
||||
[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; rm -f "$LOCKFILE"; exit 0; }
|
||||
|
||||
# =============================================================================
|
||||
# RE-REVIEW DETECTION
|
||||
|
|
@ -324,3 +352,7 @@ esac
|
|||
profile_write_journal "review-${PR_NUMBER}" "Review PR #${PR_NUMBER} (${VERDICT})" "${VERDICT,,}" "" || true
|
||||
|
||||
log "DONE: ${VERDICT} (re-review: ${IS_RE_REVIEW})"
|
||||
|
||||
# Remove lockfile on successful completion (cleanup_on_exit will also do this,
|
||||
# but we do it here to avoid the trap running twice)
|
||||
rm -f "$LOCKFILE"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue