fix: fix: standardize logging across all agents — capture errors, log exit codes, consistent format (#367)
Some checks failed
ci/woodpecker/push/ci Pipeline failed
ci/woodpecker/pr/ci Pipeline failed
ci/woodpecker/pr/smoke-init Pipeline was successful

This commit is contained in:
Agent 2026-04-07 18:51:34 +00:00
parent f686d47a98
commit d216e6294f
11 changed files with 190 additions and 136 deletions

View file

@ -47,9 +47,14 @@ VAULT_ENV="${SCRIPT_ROOT}/../vault/vault-env.sh"
# Comma-separated list of Forgejo usernames with admin role
ADMIN_USERS="${FORGE_ADMIN_USERS:-vault-bot,admin}"
# Log function
# Persistent log file for dispatcher
DISPATCHER_LOG_FILE="${DISINTO_LOG_DIR:-/tmp}/dispatcher/dispatcher.log"
mkdir -p "$(dirname "$DISPATCHER_LOG_FILE")"
# Log function with standardized format
log() {
printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*"
local agent="${LOG_AGENT:-dispatcher}"
printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" >> "$DISPATCHER_LOG_FILE"
}
# -----------------------------------------------------------------------------
@ -185,7 +190,7 @@ verify_admin_approver() {
# Fetch reviews for this PR
local reviews_json
reviews_json=$(get_pr_reviews "$pr_number") || {
log "WARNING: Could not fetch reviews for PR #${pr_number} — skipping"
log "WARNING: Could not fetch reviews for PR #${pr_number} — skipping" >> "$DISPATCHER_LOG_FILE"
return 1
}
@ -193,7 +198,7 @@ verify_admin_approver() {
local review_count
review_count=$(echo "$reviews_json" | jq 'length // 0')
if [ "$review_count" -eq 0 ]; then
log "WARNING: No reviews found for PR #${pr_number} — rejecting"
log "WARNING: No reviews found for PR #${pr_number} — rejecting" >> "$DISPATCHER_LOG_FILE"
return 1
fi
@ -216,12 +221,12 @@ verify_admin_approver() {
# Check if reviewer is admin
if is_allowed_admin "$reviewer"; then
log "Verified: PR #${pr_number} approved by admin '${reviewer}'"
log "Verified: PR #${pr_number} approved by admin '${reviewer}'" >> "$DISPATCHER_LOG_FILE"
return 0
fi
done < <(echo "$reviews_json" | jq -c '.[]')
log "WARNING: No admin approval found for PR #${pr_number} — rejecting"
log "WARNING: No admin approval found for PR #${pr_number} — rejecting" >> "$DISPATCHER_LOG_FILE"
return 1
}
@ -243,11 +248,11 @@ verify_admin_merged() {
# Get the PR that introduced this file
local pr_num
pr_num=$(get_pr_for_file "$toml_file") || {
log "WARNING: No PR found for action ${action_id} — skipping (possible direct push)"
log "WARNING: No PR found for action ${action_id} — skipping (possible direct push)" >> "$DISPATCHER_LOG_FILE"
return 1
}
log "Action ${action_id} arrived via PR #${pr_num}"
log "Action ${action_id} arrived via PR #${pr_num}" >> "$DISPATCHER_LOG_FILE"
# First, try admin approver check (for auto-merge workflow)
if verify_admin_approver "$pr_num" "$action_id"; then
@ -257,7 +262,7 @@ verify_admin_merged() {
# Fallback: Check merger (backwards compatibility for manual merges)
local merger_json
merger_json=$(get_pr_merger "$pr_num") || {
log "WARNING: Could not fetch PR #${pr_num} details — skipping"
log "WARNING: Could not fetch PR #${pr_num} details — skipping" >> "$DISPATCHER_LOG_FILE"
return 1
}
@ -267,22 +272,22 @@ verify_admin_merged() {
# Check if PR is merged
if [[ "$merged" != "true" ]]; then
log "WARNING: PR #${pr_num} is not merged — skipping"
log "WARNING: PR #${pr_num} is not merged — skipping" >> "$DISPATCHER_LOG_FILE"
return 1
fi
# Check if merger is admin
if [ -z "$merger_username" ]; then
log "WARNING: Could not determine PR #${pr_num} merger — skipping"
log "WARNING: Could not determine PR #${pr_num} merger — skipping" >> "$DISPATCHER_LOG_FILE"
return 1
fi
if ! is_allowed_admin "$merger_username"; then
log "WARNING: PR #${pr_num} merged by non-admin user '${merger_username}' — skipping"
log "WARNING: PR #${pr_num} merged by non-admin user '${merger_username}' — skipping" >> "$DISPATCHER_LOG_FILE"
return 1
fi
log "Verified: PR #${pr_num} merged by admin '${merger_username}' (fallback check)"
log "Verified: PR #${pr_num} merged by admin '${merger_username}' (fallback check)" >> "$DISPATCHER_LOG_FILE"
return 0
}
@ -343,7 +348,7 @@ write_result() {
'{id: $id, exit_code: $exit_code, timestamp: $timestamp, logs: $logs}' \
> "$result_file"
log "Result written: ${result_file}"
log "Result written: ${result_file}" >> "$DISPATCHER_LOG_FILE"
}
# Launch runner for the given action
@ -353,18 +358,18 @@ launch_runner() {
local action_id
action_id=$(basename "$toml_file" .toml)
log "Launching runner for action: ${action_id}"
log "Launching runner for action: ${action_id}" >> "$DISPATCHER_LOG_FILE"
# Validate TOML
if ! validate_action "$toml_file"; then
log "ERROR: Action validation failed for ${action_id}"
log "ERROR: Action validation failed for ${action_id}" >> "$DISPATCHER_LOG_FILE"
write_result "$action_id" 1 "Validation failed: see logs above"
return 1
fi
# Verify admin merge
if ! verify_admin_merged "$toml_file"; then
log "ERROR: Admin merge verification failed for ${action_id}"
log "ERROR: Admin merge verification failed for ${action_id}" >> "$DISPATCHER_LOG_FILE"
write_result "$action_id" 1 "Admin merge verification failed: see logs above"
return 1
fi
@ -400,7 +405,7 @@ launch_runner() {
fi
done
else
log "Action ${action_id} has no secrets declared — runner will execute without extra env vars"
log "Action ${action_id} has no secrets declared — runner will execute without extra env vars" >> "$DISPATCHER_LOG_FILE"
fi
# Add formula and action id as arguments (safe from shell injection)
@ -423,7 +428,7 @@ launch_runner() {
log_cmd+=("$arg")
fi
done
log "Running: ${log_cmd[*]}"
log "Running: ${log_cmd[*]}" >> "$DISPATCHER_LOG_FILE"
# Create temp file for logs
local log_file
@ -443,9 +448,9 @@ launch_runner() {
write_result "$action_id" "$exit_code" "$logs"
if [ $exit_code -eq 0 ]; then
log "Runner completed successfully for action: ${action_id}"
log "Runner completed successfully for action: ${action_id}" >> "$DISPATCHER_LOG_FILE"
else
log "Runner failed for action: ${action_id} (exit code: ${exit_code})"
log "Runner failed for action: ${action_id} (exit code: ${exit_code})" >> "$DISPATCHER_LOG_FILE"
fi
return $exit_code
@ -512,7 +517,7 @@ dispatch_reproduce() {
local issue_number="$1"
if is_reproduce_running "$issue_number"; then
log "Reproduce already running for issue #${issue_number}, skipping"
log "Reproduce already running for issue #${issue_number}, skipping" >> "$DISPATCHER_LOG_FILE"
return 0
fi
@ -523,11 +528,11 @@ dispatch_reproduce() {
done
if [ -z "$project_toml" ]; then
log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping reproduce for #${issue_number}"
log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping reproduce for #${issue_number}" >> "$DISPATCHER_LOG_FILE"
return 0
fi
log "Dispatching reproduce-agent for issue #${issue_number} (project: ${project_toml})"
log "Dispatching reproduce-agent for issue #${issue_number} (project: ${project_toml})" >> "$DISPATCHER_LOG_FILE"
# Build docker run command using array (safe from injection)
local -a cmd=(docker run --rm
@ -575,7 +580,7 @@ dispatch_reproduce() {
"${cmd[@]}" &
local bg_pid=$!
echo "$bg_pid" > "$(_reproduce_lockfile "$issue_number")"
log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}"
log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}" >> "$DISPATCHER_LOG_FILE"
}
# -----------------------------------------------------------------------------
@ -636,7 +641,7 @@ dispatch_triage() {
local issue_number="$1"
if is_triage_running "$issue_number"; then
log "Triage already running for issue #${issue_number}, skipping"
log "Triage already running for issue #${issue_number}, skipping" >> "$DISPATCHER_LOG_FILE"
return 0
fi
@ -647,7 +652,7 @@ dispatch_triage() {
done
if [ -z "$project_toml" ]; then
log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping triage for #${issue_number}"
log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping triage for #${issue_number}" >> "$DISPATCHER_LOG_FILE"
return 0
fi
@ -700,7 +705,7 @@ dispatch_triage() {
"${cmd[@]}" &
local bg_pid=$!
echo "$bg_pid" > "$(_triage_lockfile "$issue_number")"
log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}"
log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}" >> "$DISPATCHER_LOG_FILE"
}
# -----------------------------------------------------------------------------
@ -710,19 +715,19 @@ dispatch_triage() {
# Clone or pull the ops repo
ensure_ops_repo() {
if [ ! -d "${OPS_REPO_ROOT}/.git" ]; then
log "Cloning ops repo from ${FORGE_URL}/${FORGE_OPS_REPO}..."
log "Cloning ops repo from ${FORGE_URL}/${FORGE_OPS_REPO}..." >> "$DISPATCHER_LOG_FILE"
git clone "${FORGE_URL}/${FORGE_OPS_REPO}" "${OPS_REPO_ROOT}"
else
log "Pulling latest ops repo changes..."
log "Pulling latest ops repo changes..." >> "$DISPATCHER_LOG_FILE"
(cd "${OPS_REPO_ROOT}" && git pull --rebase)
fi
}
# Main dispatcher loop
main() {
log "Starting dispatcher..."
log "Polling ops repo: ${VAULT_ACTIONS_DIR}"
log "Admin users: ${ADMIN_USERS}"
log "Starting dispatcher..." >> "$DISPATCHER_LOG_FILE"
log "Polling ops repo: ${VAULT_ACTIONS_DIR}" >> "$DISPATCHER_LOG_FILE"
log "Admin users: ${ADMIN_USERS}" >> "$DISPATCHER_LOG_FILE"
while true; do
# Refresh ops repo at the start of each poll cycle
@ -730,7 +735,7 @@ main() {
# Check if actions directory exists
if [ ! -d "${VAULT_ACTIONS_DIR}" ]; then
log "Actions directory not found: ${VAULT_ACTIONS_DIR}"
log "Actions directory not found: ${VAULT_ACTIONS_DIR}" >> "$DISPATCHER_LOG_FILE"
sleep 60
continue
fi
@ -745,7 +750,7 @@ main() {
# Skip if already completed
if is_action_completed "$action_id"; then
log "Action ${action_id} already completed, skipping"
log "Action ${action_id} already completed, skipping" >> "$DISPATCHER_LOG_FILE"
continue
fi