diff --git a/lib/agent-sdk.sh b/lib/agent-sdk.sh index 2522655..b968222 100644 --- a/lib/agent-sdk.sh +++ b/lib/agent-sdk.sh @@ -52,8 +52,9 @@ claude_run_with_watchdog() { out_file=$(mktemp) || return 1 trap 'rm -f "$out_file"' RETURN - # Start claude in background, capturing stdout to temp file - "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" & + # Start claude in new process group (setsid creates new session, $pid is PGID leader) + # All children of claude will inherit this process group + setsid "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" & pid=$! # Background watchdog: poll for final result marker @@ -84,12 +85,12 @@ claude_run_with_watchdog() { sleep "$grace" if kill -0 "$pid" 2>/dev/null; then log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM" - kill -TERM "$pid" 2>/dev/null || true + kill -TERM -- "-$pid" 2>/dev/null || true # Give it a moment to clean up sleep 5 if kill -0 "$pid" 2>/dev/null; then log "watchdog: force kill after SIGTERM timeout" - kill -KILL "$pid" 2>/dev/null || true + kill -KILL -- "-$pid" 2>/dev/null || true fi fi fi @@ -100,16 +101,16 @@ claude_run_with_watchdog() { timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null rc=$? - # Clean up the watchdog - kill "$grace_pid" 2>/dev/null || true + # Clean up the watchdog (target process group if it spawned children) + kill -- "-$grace_pid" 2>/dev/null || true wait "$grace_pid" 2>/dev/null || true - # When timeout fires (rc=124), explicitly kill the orphaned claude process + # When timeout fires (rc=124), explicitly kill the orphaned claude process group # tail --pid is a passive waiter, not a supervisor if [ "$rc" -eq 124 ]; then - kill "$pid" 2>/dev/null || true + kill -TERM -- "-$pid" 2>/dev/null || true sleep 1 - kill -KILL "$pid" 2>/dev/null || true + kill -KILL -- "-$pid" 2>/dev/null || true fi # Output the captured stdout diff --git a/lib/backup.sh b/lib/backup.sh index 8b4c858..8d7a827 100644 --- a/lib/backup.sh +++ b/lib/backup.sh @@ -128,4 +128,9 @@ backup_create() { local size size=$(du -h "$outfile" | cut -f1) echo "=== Backup complete: ${outfile} (${size}) ===" + + # Clean up before returning — the EXIT trap references the local $tmpdir + # which goes out of scope after return, causing 'unbound variable' under set -u. + trap - EXIT + rm -rf "$tmpdir" } diff --git a/lib/generators.sh b/lib/generators.sh index 5a3a002..eb223e8 100644 --- a/lib/generators.sh +++ b/lib/generators.sh @@ -405,6 +405,9 @@ services: WOODPECKER_SERVER: localhost:9000 WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-} WOODPECKER_GRPC_SECURE: "false" + WOODPECKER_GRPC_KEEPALIVE_TIME: "10s" + WOODPECKER_GRPC_KEEPALIVE_TIMEOUT: "20s" + WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS: "true" WOODPECKER_HEALTHCHECK_ADDR: ":3333" WOODPECKER_BACKEND_DOCKER_NETWORK: ${WOODPECKER_CI_NETWORK:-disinto_disinto-net} WOODPECKER_MAX_WORKFLOWS: 1 diff --git a/nomad/jobs/woodpecker-agent.hcl b/nomad/jobs/woodpecker-agent.hcl index c7779a2..a4111fe 100644 --- a/nomad/jobs/woodpecker-agent.hcl +++ b/nomad/jobs/woodpecker-agent.hcl @@ -57,7 +57,7 @@ job "woodpecker-agent" { check { type = "http" path = "/healthz" - interval = "15s" + interval = "10s" timeout = "3s" } } @@ -89,10 +89,13 @@ job "woodpecker-agent" { # Nomad's port stanza to the allocation's IP (not localhost), so the # agent must use the LXC's eth0 IP, not 127.0.0.1. env { - WOODPECKER_SERVER = "${attr.unique.network.ip-address}:9000" - WOODPECKER_GRPC_SECURE = "false" - WOODPECKER_MAX_WORKFLOWS = "1" - WOODPECKER_HEALTHCHECK_ADDR = ":3333" + WOODPECKER_SERVER = "${attr.unique.network.ip-address}:9000" + WOODPECKER_GRPC_SECURE = "false" + WOODPECKER_GRPC_KEEPALIVE_TIME = "10s" + WOODPECKER_GRPC_KEEPALIVE_TIMEOUT = "20s" + WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS = "true" + WOODPECKER_MAX_WORKFLOWS = "1" + WOODPECKER_HEALTHCHECK_ADDR = ":3333" } # ── Vault-templated agent secret ────────────────────────────────── diff --git a/review/review-pr.sh b/review/review-pr.sh index 091025f..09f6cb6 100755 --- a/review/review-pr.sh +++ b/review/review-pr.sh @@ -52,8 +52,35 @@ REVIEW_TMPDIR=$(mktemp -d) log() { printf '[%s] PR#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" >> "$LOGFILE"; } status() { printf '[%s] PR #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" > "$STATUSFILE"; log "$*"; } -cleanup() { rm -rf "$REVIEW_TMPDIR" "$LOCKFILE" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"; } -trap cleanup EXIT + +# cleanup — remove temp files (NOT lockfile — cleanup_on_exit handles that) +cleanup() { + rm -rf "$REVIEW_TMPDIR" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json" +} + +# cleanup_on_exit — defensive cleanup: remove lockfile if we own it, kill residual children +# This handles the case where review-pr.sh is terminated unexpectedly (e.g., watchdog SIGTERM) +cleanup_on_exit() { + local ec=$? + # Remove lockfile only if we own it (PID matches $$) + if [ -f "$LOCKFILE" ] && [ -n "$(cat "$LOCKFILE" 2>/dev/null)" ]; then + if [ "$(cat "$LOCKFILE" 2>/dev/null)" = "$$" ]; then + rm -f "$LOCKFILE" + log "cleanup_on_exit: removed lockfile (we owned it)" + fi + fi + # Kill any direct children that may have been spawned by this process + # (e.g., bash -c commands from Claude's Bash tool that didn't get reaped) + pkill -P $$ 2>/dev/null || true + # Call the main cleanup function to remove temp files + cleanup + exit "$ec" +} +trap cleanup_on_exit EXIT INT TERM + +# Note: EXIT trap is already set above. The cleanup function is still available for +# non-error exits (e.g., normal completion via exit 0 after verdict posted). +# When review succeeds, we want to skip lockfile removal since the verdict was posted. # ============================================================================= # LOG ROTATION @@ -104,6 +131,7 @@ if [ "$PR_STATE" != "open" ]; then log "SKIP: state=${PR_STATE}" worktree_cleanup "$WORKTREE" rm -f "$OUTPUT_FILE" "$SID_FILE" 2>/dev/null || true + rm -f "$LOCKFILE" exit 0 fi @@ -113,7 +141,7 @@ fi CI_STATE=$(ci_commit_status "$PR_SHA") CI_NOTE="" if ! ci_passed "$CI_STATE"; then - ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; exit 0; } + ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; rm -f "$LOCKFILE"; exit 0; } CI_NOTE=" (not required — non-code PR)" fi @@ -123,10 +151,10 @@ fi ALL_COMMENTS=$(forge_api_all "/issues/${PR_NUMBER}/comments") HAS_CMT=$(printf '%s' "$ALL_COMMENTS" | jq --arg s "$PR_SHA" \ '[.[]|select(.body|contains(""))]|length') -[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; exit 0; } +[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; rm -f "$LOCKFILE"; exit 0; } HAS_FML=$(forge_api_all "/pulls/${PR_NUMBER}/reviews" | jq --arg s "$PR_SHA" \ '[.[]|select(.commit_id==$s)|select(.state!="COMMENT")]|length') -[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; exit 0; } +[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; rm -f "$LOCKFILE"; exit 0; } # ============================================================================= # RE-REVIEW DETECTION @@ -324,3 +352,7 @@ esac profile_write_journal "review-${PR_NUMBER}" "Review PR #${PR_NUMBER} (${VERDICT})" "${VERDICT,,}" "" || true log "DONE: ${VERDICT} (re-review: ${IS_RE_REVIEW})" + +# Remove lockfile on successful completion (cleanup_on_exit will also do this, +# but we do it here to avoid the trap running twice) +rm -f "$LOCKFILE" diff --git a/tests/test-watchdog-process-group.sh b/tests/test-watchdog-process-group.sh new file mode 100755 index 0000000..54fedf9 --- /dev/null +++ b/tests/test-watchdog-process-group.sh @@ -0,0 +1,129 @@ +#!/usr/bin/env bash +# test-watchdog-process-group.sh — Test that claude_run_with_watchdog kills orphan children +# +# This test verifies that when claude_run_with_watchdog terminates the Claude process, +# all child processes (including those spawned by Claude's Bash tool) are also killed. +# +# Reproducer scenario: +# 1. Create a fake "claude" stub that: +# a. Spawns a long-running child process (sleep 3600) +# b. Writes a result marker to stdout to trigger idle detection +# c. Stays running +# 2. Run claude_run_with_watchdog with the stub +# 3. Before the fix: sleep child survives (orphaned to PID 1) +# 4. After the fix: sleep child dies (killed as part of process group with -PID) +# +# Usage: ./tests/test-watchdog-process-group.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)" +TEST_TMP="/tmp/test-watchdog-$$" +LOGFILE="${TEST_TMP}/log.txt" +PASS=true + +# shellcheck disable=SC2317 +cleanup_test() { + rm -rf "$TEST_TMP" +} +trap cleanup_test EXIT INT TERM + +mkdir -p "$TEST_TMP" + +log() { + printf '[TEST] %s\n' "$*" | tee -a "$LOGFILE" +} + +fail() { + printf '[TEST] FAIL: %s\n' "$*" | tee -a "$LOGFILE" + PASS=false +} + +pass() { + printf '[TEST] PASS: %s\n' "$*" | tee -a "$LOGFILE" +} + +# Export required environment variables +export CLAUDE_TIMEOUT=10 # Short timeout for testing +export CLAUDE_IDLE_GRACE=2 # Short grace period for testing +export LOGFILE="${LOGFILE}" # Required by agent-sdk.sh + +# Create a fake claude stub that: +# 1. Spawns a long-running child process (sleep 3600) that will become an orphan if parent is killed +# 2. Writes a result marker to stdout (to trigger the watchdog's idle-after-result path) +# 3. Stays running so the watchdog can kill it +cat > "${TEST_TMP}/fake-claude" << 'FAKE_CLAUDE_EOF' +#!/usr/bin/env bash +# Fake claude that spawns a child and stays running +# Simulates Claude's behavior when it spawns a Bash tool command + +# Write result marker to stdout (triggers watchdog idle detection) +echo '{"type":"result","session_id":"test-session-123","verdict":"APPROVE"}' + +# Spawn a child that simulates Claude's Bash tool hanging +# This is the process that should be killed when the parent is terminated +sleep 3600 & +CHILD_PID=$! + +# Log the child PID for debugging +echo "FAKE_CLAUDE_CHILD_PID=$CHILD_PID" >&2 + +# Stay running - sleep in a loop so the watchdog can kill us +while true; do + sleep 3600 & + wait $! 2>/dev/null || true +done +FAKE_CLAUDE_EOF +chmod +x "${TEST_TMP}/fake-claude" + +log "Testing claude_run_with_watchdog process group cleanup..." + +# Source the library and run claude_run_with_watchdog +cd "$SCRIPT_DIR" +source lib/agent-sdk.sh + +log "Starting claude_run_with_watchdog with fake claude..." + +# Run the function directly (not as a script) +# We need to capture output and redirect stderr +OUTPUT_FILE="${TEST_TMP}/output.txt" +timeout 35 bash -c " + source '${SCRIPT_DIR}/lib/agent-sdk.sh' + CLAUDE_TIMEOUT=10 CLAUDE_IDLE_GRACE=2 LOGFILE='${LOGFILE}' claude_run_with_watchdog '${TEST_TMP}/fake-claude' > '${OUTPUT_FILE}' 2>&1 + exit \$? +" || true + +# Give the watchdog a moment to clean up +log "Waiting for cleanup..." +sleep 5 + +# More precise check: look for sleep 3600 processes +# These would be the orphans from our fake claude +ORPHAN_COUNT=$(pgrep -a sleep 2>/dev/null | grep -c "sleep 3600" 2>/dev/null || echo "0") + +if [ "$ORPHAN_COUNT" -gt 0 ]; then + log "Found $ORPHAN_COUNT orphan sleep 3600 processes:" + pgrep -a sleep | grep "sleep 3600" + fail "Orphan children found - process group cleanup did not work" +else + pass "No orphan children found - process group cleanup worked" +fi + +# Also verify that the fake claude itself is not running +FAKE_CLAUDE_COUNT=$(pgrep -c -f "fake-claude" 2>/dev/null || echo "0") +if [ "$FAKE_CLAUDE_COUNT" -gt 0 ]; then + log "Found $FAKE_CLAUDE_COUNT fake-claude processes still running" + fail "Fake claude process(es) still running" +else + pass "Fake claude process terminated" +fi + +# Summary +echo "" +if [ "$PASS" = true ]; then + log "All tests passed!" + exit 0 +else + log "Some tests failed. See log at $LOGFILE" + exit 1 +fi