Compare commits

..

1 commit

Author SHA1 Message Date
Claude
526b7fb73a fix: tool: disinto backup create — export Forgejo issues + disinto-ops git bundle (#1057)
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/push/nomad-validate Pipeline was successful
ci/woodpecker/pr/ci Pipeline was successful
ci/woodpecker/pr/nomad-validate Pipeline was successful
ci/woodpecker/pr/smoke-init Pipeline was successful
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-19 20:12:12 +00:00
6 changed files with 19 additions and 192 deletions

View file

@ -52,9 +52,8 @@ claude_run_with_watchdog() {
out_file=$(mktemp) || return 1
trap 'rm -f "$out_file"' RETURN
# Start claude in new process group (setsid creates new session, $pid is PGID leader)
# All children of claude will inherit this process group
setsid "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
# Start claude in background, capturing stdout to temp file
"${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
pid=$!
# Background watchdog: poll for final result marker
@ -85,12 +84,12 @@ claude_run_with_watchdog() {
sleep "$grace"
if kill -0 "$pid" 2>/dev/null; then
log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
kill -TERM -- "-$pid" 2>/dev/null || true
kill -TERM "$pid" 2>/dev/null || true
# Give it a moment to clean up
sleep 5
if kill -0 "$pid" 2>/dev/null; then
log "watchdog: force kill after SIGTERM timeout"
kill -KILL -- "-$pid" 2>/dev/null || true
kill -KILL "$pid" 2>/dev/null || true
fi
fi
fi
@ -101,16 +100,16 @@ claude_run_with_watchdog() {
timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null
rc=$?
# Clean up the watchdog (target process group if it spawned children)
kill -- "-$grace_pid" 2>/dev/null || true
# Clean up the watchdog
kill "$grace_pid" 2>/dev/null || true
wait "$grace_pid" 2>/dev/null || true
# When timeout fires (rc=124), explicitly kill the orphaned claude process group
# When timeout fires (rc=124), explicitly kill the orphaned claude process
# tail --pid is a passive waiter, not a supervisor
if [ "$rc" -eq 124 ]; then
kill -TERM -- "-$pid" 2>/dev/null || true
kill "$pid" 2>/dev/null || true
sleep 1
kill -KILL -- "-$pid" 2>/dev/null || true
kill -KILL "$pid" 2>/dev/null || true
fi
# Output the captured stdout

View file

@ -128,9 +128,4 @@ backup_create() {
local size
size=$(du -h "$outfile" | cut -f1)
echo "=== Backup complete: ${outfile} (${size}) ==="
# Clean up before returning — the EXIT trap references the local $tmpdir
# which goes out of scope after return, causing 'unbound variable' under set -u.
trap - EXIT
rm -rf "$tmpdir"
}

View file

@ -405,9 +405,6 @@ services:
WOODPECKER_SERVER: localhost:9000
WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-}
WOODPECKER_GRPC_SECURE: "false"
WOODPECKER_GRPC_KEEPALIVE_TIME: "10s"
WOODPECKER_GRPC_KEEPALIVE_TIMEOUT: "20s"
WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS: "true"
WOODPECKER_HEALTHCHECK_ADDR: ":3333"
WOODPECKER_BACKEND_DOCKER_NETWORK: ${WOODPECKER_CI_NETWORK:-disinto_disinto-net}
WOODPECKER_MAX_WORKFLOWS: 1

View file

@ -57,7 +57,7 @@ job "woodpecker-agent" {
check {
type = "http"
path = "/healthz"
interval = "10s"
interval = "15s"
timeout = "3s"
}
}
@ -91,9 +91,6 @@ job "woodpecker-agent" {
env {
WOODPECKER_SERVER = "${attr.unique.network.ip-address}:9000"
WOODPECKER_GRPC_SECURE = "false"
WOODPECKER_GRPC_KEEPALIVE_TIME = "10s"
WOODPECKER_GRPC_KEEPALIVE_TIMEOUT = "20s"
WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS = "true"
WOODPECKER_MAX_WORKFLOWS = "1"
WOODPECKER_HEALTHCHECK_ADDR = ":3333"
}

View file

@ -52,35 +52,8 @@ REVIEW_TMPDIR=$(mktemp -d)
log() { printf '[%s] PR#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" >> "$LOGFILE"; }
status() { printf '[%s] PR #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" > "$STATUSFILE"; log "$*"; }
# cleanup — remove temp files (NOT lockfile — cleanup_on_exit handles that)
cleanup() {
rm -rf "$REVIEW_TMPDIR" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"
}
# cleanup_on_exit — defensive cleanup: remove lockfile if we own it, kill residual children
# This handles the case where review-pr.sh is terminated unexpectedly (e.g., watchdog SIGTERM)
cleanup_on_exit() {
local ec=$?
# Remove lockfile only if we own it (PID matches $$)
if [ -f "$LOCKFILE" ] && [ -n "$(cat "$LOCKFILE" 2>/dev/null)" ]; then
if [ "$(cat "$LOCKFILE" 2>/dev/null)" = "$$" ]; then
rm -f "$LOCKFILE"
log "cleanup_on_exit: removed lockfile (we owned it)"
fi
fi
# Kill any direct children that may have been spawned by this process
# (e.g., bash -c commands from Claude's Bash tool that didn't get reaped)
pkill -P $$ 2>/dev/null || true
# Call the main cleanup function to remove temp files
cleanup
exit "$ec"
}
trap cleanup_on_exit EXIT INT TERM
# Note: EXIT trap is already set above. The cleanup function is still available for
# non-error exits (e.g., normal completion via exit 0 after verdict posted).
# When review succeeds, we want to skip lockfile removal since the verdict was posted.
cleanup() { rm -rf "$REVIEW_TMPDIR" "$LOCKFILE" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"; }
trap cleanup EXIT
# =============================================================================
# LOG ROTATION
@ -131,7 +104,6 @@ if [ "$PR_STATE" != "open" ]; then
log "SKIP: state=${PR_STATE}"
worktree_cleanup "$WORKTREE"
rm -f "$OUTPUT_FILE" "$SID_FILE" 2>/dev/null || true
rm -f "$LOCKFILE"
exit 0
fi
@ -141,7 +113,7 @@ fi
CI_STATE=$(ci_commit_status "$PR_SHA")
CI_NOTE=""
if ! ci_passed "$CI_STATE"; then
ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; rm -f "$LOCKFILE"; exit 0; }
ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; exit 0; }
CI_NOTE=" (not required — non-code PR)"
fi
@ -151,10 +123,10 @@ fi
ALL_COMMENTS=$(forge_api_all "/issues/${PR_NUMBER}/comments")
HAS_CMT=$(printf '%s' "$ALL_COMMENTS" | jq --arg s "$PR_SHA" \
'[.[]|select(.body|contains("<!-- reviewed: "+$s+" -->"))]|length')
[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; rm -f "$LOCKFILE"; exit 0; }
[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; exit 0; }
HAS_FML=$(forge_api_all "/pulls/${PR_NUMBER}/reviews" | jq --arg s "$PR_SHA" \
'[.[]|select(.commit_id==$s)|select(.state!="COMMENT")]|length')
[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; rm -f "$LOCKFILE"; exit 0; }
[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; exit 0; }
# =============================================================================
# RE-REVIEW DETECTION
@ -352,7 +324,3 @@ esac
profile_write_journal "review-${PR_NUMBER}" "Review PR #${PR_NUMBER} (${VERDICT})" "${VERDICT,,}" "" || true
log "DONE: ${VERDICT} (re-review: ${IS_RE_REVIEW})"
# Remove lockfile on successful completion (cleanup_on_exit will also do this,
# but we do it here to avoid the trap running twice)
rm -f "$LOCKFILE"

View file

@ -1,129 +0,0 @@
#!/usr/bin/env bash
# test-watchdog-process-group.sh — Test that claude_run_with_watchdog kills orphan children
#
# This test verifies that when claude_run_with_watchdog terminates the Claude process,
# all child processes (including those spawned by Claude's Bash tool) are also killed.
#
# Reproducer scenario:
# 1. Create a fake "claude" stub that:
# a. Spawns a long-running child process (sleep 3600)
# b. Writes a result marker to stdout to trigger idle detection
# c. Stays running
# 2. Run claude_run_with_watchdog with the stub
# 3. Before the fix: sleep child survives (orphaned to PID 1)
# 4. After the fix: sleep child dies (killed as part of process group with -PID)
#
# Usage: ./tests/test-watchdog-process-group.sh
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
TEST_TMP="/tmp/test-watchdog-$$"
LOGFILE="${TEST_TMP}/log.txt"
PASS=true
# shellcheck disable=SC2317
cleanup_test() {
rm -rf "$TEST_TMP"
}
trap cleanup_test EXIT INT TERM
mkdir -p "$TEST_TMP"
log() {
printf '[TEST] %s\n' "$*" | tee -a "$LOGFILE"
}
fail() {
printf '[TEST] FAIL: %s\n' "$*" | tee -a "$LOGFILE"
PASS=false
}
pass() {
printf '[TEST] PASS: %s\n' "$*" | tee -a "$LOGFILE"
}
# Export required environment variables
export CLAUDE_TIMEOUT=10 # Short timeout for testing
export CLAUDE_IDLE_GRACE=2 # Short grace period for testing
export LOGFILE="${LOGFILE}" # Required by agent-sdk.sh
# Create a fake claude stub that:
# 1. Spawns a long-running child process (sleep 3600) that will become an orphan if parent is killed
# 2. Writes a result marker to stdout (to trigger the watchdog's idle-after-result path)
# 3. Stays running so the watchdog can kill it
cat > "${TEST_TMP}/fake-claude" << 'FAKE_CLAUDE_EOF'
#!/usr/bin/env bash
# Fake claude that spawns a child and stays running
# Simulates Claude's behavior when it spawns a Bash tool command
# Write result marker to stdout (triggers watchdog idle detection)
echo '{"type":"result","session_id":"test-session-123","verdict":"APPROVE"}'
# Spawn a child that simulates Claude's Bash tool hanging
# This is the process that should be killed when the parent is terminated
sleep 3600 &
CHILD_PID=$!
# Log the child PID for debugging
echo "FAKE_CLAUDE_CHILD_PID=$CHILD_PID" >&2
# Stay running - sleep in a loop so the watchdog can kill us
while true; do
sleep 3600 &
wait $! 2>/dev/null || true
done
FAKE_CLAUDE_EOF
chmod +x "${TEST_TMP}/fake-claude"
log "Testing claude_run_with_watchdog process group cleanup..."
# Source the library and run claude_run_with_watchdog
cd "$SCRIPT_DIR"
source lib/agent-sdk.sh
log "Starting claude_run_with_watchdog with fake claude..."
# Run the function directly (not as a script)
# We need to capture output and redirect stderr
OUTPUT_FILE="${TEST_TMP}/output.txt"
timeout 35 bash -c "
source '${SCRIPT_DIR}/lib/agent-sdk.sh'
CLAUDE_TIMEOUT=10 CLAUDE_IDLE_GRACE=2 LOGFILE='${LOGFILE}' claude_run_with_watchdog '${TEST_TMP}/fake-claude' > '${OUTPUT_FILE}' 2>&1
exit \$?
" || true
# Give the watchdog a moment to clean up
log "Waiting for cleanup..."
sleep 5
# More precise check: look for sleep 3600 processes
# These would be the orphans from our fake claude
ORPHAN_COUNT=$(pgrep -a sleep 2>/dev/null | grep -c "sleep 3600" 2>/dev/null || echo "0")
if [ "$ORPHAN_COUNT" -gt 0 ]; then
log "Found $ORPHAN_COUNT orphan sleep 3600 processes:"
pgrep -a sleep | grep "sleep 3600"
fail "Orphan children found - process group cleanup did not work"
else
pass "No orphan children found - process group cleanup worked"
fi
# Also verify that the fake claude itself is not running
FAKE_CLAUDE_COUNT=$(pgrep -c -f "fake-claude" 2>/dev/null || echo "0")
if [ "$FAKE_CLAUDE_COUNT" -gt 0 ]; then
log "Found $FAKE_CLAUDE_COUNT fake-claude processes still running"
fail "Fake claude process(es) still running"
else
pass "Fake claude process terminated"
fi
# Summary
echo ""
if [ "$PASS" = true ]; then
log "All tests passed!"
exit 0
else
log "Some tests failed. See log at $LOGFILE"
exit 1
fi