fix: fix: standardize logging across all agents — capture errors, log exit codes, consistent format (#367)
This commit is contained in:
parent
f686d47a98
commit
91f971fe53
10 changed files with 116 additions and 53 deletions
|
|
@ -52,12 +52,16 @@ agent_run() {
|
|||
log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})"
|
||||
output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$?
|
||||
if [ "$rc" -eq 124 ]; then
|
||||
log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s"
|
||||
log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)"
|
||||
elif [ "$rc" -ne 0 ]; then
|
||||
log "agent_run: claude exited with code $rc"
|
||||
# Log last 3 lines of output for diagnostics
|
||||
if [ -n "$output" ]; then
|
||||
log "agent_run: last output lines: $(echo "$output" | tail -3)"
|
||||
fi
|
||||
fi
|
||||
if [ -z "$output" ]; then
|
||||
log "agent_run: empty output (claude may have crashed or failed)"
|
||||
log "agent_run: empty output (claude may have crashed or failed, exit code: $rc)"
|
||||
fi
|
||||
|
||||
# Extract and persist session_id
|
||||
|
|
@ -89,9 +93,13 @@ agent_run() {
|
|||
local nudge_rc
|
||||
output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
|
||||
if [ "$nudge_rc" -eq 124 ]; then
|
||||
log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s"
|
||||
log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $nudge_rc)"
|
||||
elif [ "$nudge_rc" -ne 0 ]; then
|
||||
log "agent_run: nudge claude exited with code $nudge_rc"
|
||||
# Log last 3 lines of output for diagnostics
|
||||
if [ -n "$output" ]; then
|
||||
log "agent_run: nudge last output lines: $(echo "$output" | tail -3)"
|
||||
fi
|
||||
fi
|
||||
new_sid=$(printf '%s' "$output" | jq -r '.session_id // empty' 2>/dev/null) || true
|
||||
if [ -n "$new_sid" ]; then
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ FACTORY_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
|||
if [ "${DISINTO_CONTAINER:-}" = "1" ]; then
|
||||
DISINTO_DATA_DIR="${HOME}/data"
|
||||
DISINTO_LOG_DIR="${DISINTO_DATA_DIR}/logs"
|
||||
mkdir -p "${DISINTO_DATA_DIR}" "${DISINTO_LOG_DIR}"/{dev,action,review,supervisor,vault,site,metrics,gardener}
|
||||
mkdir -p "${DISINTO_DATA_DIR}" "${DISINTO_LOG_DIR}"/{dev,action,review,supervisor,vault,site,metrics,gardener,planner,predictor,architect,dispatcher}
|
||||
else
|
||||
DISINTO_LOG_DIR="${FACTORY_ROOT}"
|
||||
fi
|
||||
|
|
@ -138,8 +138,12 @@ unset CLAWHUB_TOKEN 2>/dev/null || true
|
|||
export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1
|
||||
|
||||
# Shared log helper
|
||||
# Usage: log "message"
|
||||
# Output: [2026-04-03T14:00:00Z] agent: message
|
||||
# Where agent is taken from the LOG_AGENT variable (defaults to the literal "agent" when LOG_AGENT is unset or empty)
|
||||
log() {
  # Emit exactly one line per call: "[<UTC timestamp>] <agent>: <message>".
  # The agent tag is read from LOG_AGENT and falls back to the literal
  # "agent" when that variable is unset or empty.
  local agent="${LOG_AGENT:-agent}"
  # "$*" joins all arguments into a single message string; the timestamp is
  # UTC in ISO-8601 form with a trailing "Z".
  printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*"
}
|
||||
|
||||
# =============================================================================
|
||||
|
|
|
|||
|
|
@ -357,11 +357,18 @@ pr_close() {
|
|||
local pr_num="$1"
|
||||
|
||||
_prl_log "closing PR #${pr_num}"
|
||||
curl -sf -X PATCH \
|
||||
local resp http_code
|
||||
resp=$(curl -sf -w "\n%{http_code}" -X PATCH \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${FORGE_API}/pulls/${pr_num}" \
|
||||
-d '{"state":"closed"}' >/dev/null 2>&1 || true
|
||||
-d '{"state":"closed"}' 2>/dev/null) || true
|
||||
http_code=$(printf '%s\n' "$resp" | tail -1)
|
||||
if [ "$http_code" != "200" ] && [ "$http_code" != "204" ]; then
|
||||
_prl_log "pr_close FAILED: HTTP ${http_code} for PR #${pr_num}"
|
||||
return 1
|
||||
fi
|
||||
_prl_log "PR #${pr_num} closed"
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -398,11 +405,18 @@ pr_walk_to_merge() {
|
|||
if [ "${_PR_CI_FAILURE_TYPE:-}" = "infra" ] && [ "$ci_retry_count" -lt 1 ]; then
|
||||
ci_retry_count=$((ci_retry_count + 1))
|
||||
_prl_log "infra failure — retriggering CI (retry ${ci_retry_count})"
|
||||
local rebase_output rebase_rc
|
||||
( cd "$worktree" && \
|
||||
git commit --allow-empty -m "ci: retrigger after infra failure" --no-verify && \
|
||||
git fetch "$remote" "${PRIMARY_BRANCH}" 2>/dev/null && \
|
||||
git rebase "${remote}/${PRIMARY_BRANCH}" && \
|
||||
git push --force-with-lease "$remote" HEAD ) 2>&1 | tail -5 || true
|
||||
git push --force-with-lease "$remote" HEAD ) > /tmp/rebase-output-$$ 2>&1
|
||||
rebase_rc=$?
|
||||
rebase_output=$(cat /tmp/rebase-output-$$)
|
||||
rm -f /tmp/rebase-output-$$
|
||||
if [ "$rebase_rc" -ne 0 ]; then
|
||||
_prl_log "rebase/push failed (exit code $rebase_rc): $(echo "$rebase_output" | tail -5)"
|
||||
fi
|
||||
continue
|
||||
fi
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue