fix: fix: standardize logging across all agents — capture errors, log exit codes, consistent format (#367)
Some checks failed
ci/woodpecker/push/ci Pipeline failed
ci/woodpecker/pr/ci Pipeline failed
ci/woodpecker/pr/smoke-init Pipeline was successful

This commit is contained in:
Agent 2026-04-07 19:01:41 +00:00
parent f686d47a98
commit 91f971fe53
10 changed files with 116 additions and 53 deletions

View file

@ -52,12 +52,16 @@ agent_run() {
log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})"
output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$?
if [ "$rc" -eq 124 ]; then
log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s"
log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)"
elif [ "$rc" -ne 0 ]; then
log "agent_run: claude exited with code $rc"
# Log last 3 lines of output for diagnostics
if [ -n "$output" ]; then
log "agent_run: last output lines: $(echo "$output" | tail -3)"
fi
fi
if [ -z "$output" ]; then
log "agent_run: empty output (claude may have crashed or failed)"
log "agent_run: empty output (claude may have crashed or failed, exit code: $rc)"
fi
# Extract and persist session_id
@ -89,9 +93,13 @@ agent_run() {
local nudge_rc
output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
if [ "$nudge_rc" -eq 124 ]; then
log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s"
log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $nudge_rc)"
elif [ "$nudge_rc" -ne 0 ]; then
log "agent_run: nudge claude exited with code $nudge_rc"
# Log last 3 lines of output for diagnostics
if [ -n "$output" ]; then
log "agent_run: nudge last output lines: $(echo "$output" | tail -3)"
fi
fi
new_sid=$(printf '%s' "$output" | jq -r '.session_id // empty' 2>/dev/null) || true
if [ -n "$new_sid" ]; then

View file

@ -13,7 +13,7 @@ FACTORY_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
if [ "${DISINTO_CONTAINER:-}" = "1" ]; then
DISINTO_DATA_DIR="${HOME}/data"
DISINTO_LOG_DIR="${DISINTO_DATA_DIR}/logs"
mkdir -p "${DISINTO_DATA_DIR}" "${DISINTO_LOG_DIR}"/{dev,action,review,supervisor,vault,site,metrics,gardener}
mkdir -p "${DISINTO_DATA_DIR}" "${DISINTO_LOG_DIR}"/{dev,action,review,supervisor,vault,site,metrics,gardener,planner,predictor,architect,dispatcher}
else
DISINTO_LOG_DIR="${FACTORY_ROOT}"
fi
@ -138,8 +138,12 @@ unset CLAWHUB_TOKEN 2>/dev/null || true
export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1
# Shared log helper
# Usage: log "message"
# Output: [2026-04-03T14:00:00Z] agent: message
# Where agent is set via LOG_AGENT variable (defaults to caller's context)
log() {
printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*"
local agent="${LOG_AGENT:-agent}"
printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*"
}
# =============================================================================

View file

@ -357,11 +357,18 @@ pr_close() {
local pr_num="$1"
_prl_log "closing PR #${pr_num}"
curl -sf -X PATCH \
local resp http_code
resp=$(curl -sf -w "\n%{http_code}" -X PATCH \
-H "Authorization: token ${FORGE_TOKEN}" \
-H "Content-Type: application/json" \
"${FORGE_API}/pulls/${pr_num}" \
-d '{"state":"closed"}' >/dev/null 2>&1 || true
-d '{"state":"closed"}' 2>/dev/null) || true
http_code=$(printf '%s\n' "$resp" | tail -1)
if [ "$http_code" != "200" ] && [ "$http_code" != "204" ]; then
_prl_log "pr_close FAILED: HTTP ${http_code} for PR #${pr_num}"
return 1
fi
_prl_log "PR #${pr_num} closed"
}
# ---------------------------------------------------------------------------
@ -398,11 +405,18 @@ pr_walk_to_merge() {
if [ "${_PR_CI_FAILURE_TYPE:-}" = "infra" ] && [ "$ci_retry_count" -lt 1 ]; then
ci_retry_count=$((ci_retry_count + 1))
_prl_log "infra failure — retriggering CI (retry ${ci_retry_count})"
local rebase_output rebase_rc
( cd "$worktree" && \
git commit --allow-empty -m "ci: retrigger after infra failure" --no-verify && \
git fetch "$remote" "${PRIMARY_BRANCH}" 2>/dev/null && \
git rebase "${remote}/${PRIMARY_BRANCH}" && \
git push --force-with-lease "$remote" HEAD ) 2>&1 | tail -5 || true
git push --force-with-lease "$remote" HEAD ) > /tmp/rebase-output-$$ 2>&1
rebase_rc=$?
rebase_output=$(cat /tmp/rebase-output-$$)
rm -f /tmp/rebase-output-$$
if [ "$rebase_rc" -ne 0 ]; then
_prl_log "rebase/push failed (exit code $rebase_rc): $(echo "$rebase_output" | tail -5)"
fi
continue
fi