From d216e6294f5d7c38ebfc72376ea3f7d16af388e4 Mon Sep 17 00:00:00 2001 From: Agent Date: Tue, 7 Apr 2026 18:51:34 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20standardize=20logging=20across=20?= =?UTF-8?q?all=20agents=20=E2=80=94=20capture=20errors,=20log=20exit=20cod?= =?UTF-8?q?es,=20consistent=20format=20(#367)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- architect/architect-run.sh | 5 +- disinto-factory/lessons-learned.md | 69 ++++++++++++++------------- docker/edge/dispatcher.sh | 75 ++++++++++++++++-------------- gardener/gardener-run.sh | 73 +++++++++++++++++------------ lib/agent-sdk.sh | 12 +++-- lib/env.sh | 6 ++- lib/pr-lifecycle.sh | 22 +++++++-- planner/planner-run.sh | 5 +- predictor/predictor-run.sh | 5 +- review/review-poll.sh | 47 ++++++++++--------- supervisor/supervisor-run.sh | 7 ++- 11 files changed, 190 insertions(+), 136 deletions(-) diff --git a/architect/architect-run.sh b/architect/architect-run.sh index 18de885..ce7498c 100755 --- a/architect/architect-run.sh +++ b/architect/architect-run.sh @@ -36,7 +36,7 @@ source "$FACTORY_ROOT/lib/guard.sh" # shellcheck source=../lib/agent-sdk.sh source "$FACTORY_ROOT/lib/agent-sdk.sh" -LOG_FILE="$SCRIPT_DIR/architect.log" +LOG_FILE="${DISINTO_LOG_DIR}/architect/architect.log" # shellcheck disable=SC2034 # consumed by agent-sdk.sh LOGFILE="$LOG_FILE" # shellcheck disable=SC2034 # consumed by agent-sdk.sh @@ -44,7 +44,8 @@ SID_FILE="/tmp/architect-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/architect-${PROJECT_NAME}-scratch.md" WORKTREE="/tmp/${PROJECT_NAME}-architect-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="architect" # ── Guards ──────────────────────────────────────────────────────────────── check_active architect diff --git a/disinto-factory/lessons-learned.md b/disinto-factory/lessons-learned.md index b1f6a3b..7f0f94a 100644 ---
a/disinto-factory/lessons-learned.md +++ b/disinto-factory/lessons-learned.md @@ -1,54 +1,57 @@ -# Working with the factory — lessons learned +# Lessons Learned -## Writing issues for the dev agent +## Issue Management -**Put everything in the issue body, not comments.** The dev agent reads the issue body when it starts work. It does not reliably read comments. If an issue fails and you need to add guidance for a retry, update the issue body. +**Put everything in the issue body.** Agents read the issue body; comments are unreliable. Add guidance by updating the body. -**One approach per issue, no choices.** The dev agent cannot make design decisions. If there are multiple ways to solve a problem, decide before filing. Issues with "Option A or Option B" will confuse the agent. +**One decision per issue.** Agents cannot choose between approaches. Decide before filing. -**Issues must fit the templates.** Every backlog issue needs: affected files (max 3), acceptance criteria (max 5 checkboxes), and a clear proposed solution. If you cannot fill these fields, the issue is too big — label it `vision` and break it down first. +**Explicit dependencies prevent ordering bugs.** Use `Depends-on: #N` so agents don't work on stale code. -**Explicit dependencies prevent ordering bugs.** Add `Depends-on: #N` in the issue body. dev-poll checks these before pickup. Without explicit deps, the agent may attempt work on a stale codebase. +**Fit issues to templates.** Max 3 files, 5 acceptance criteria, clear solution. If it doesn't fit, it's too big—label `vision` and decompose. -## Debugging CI failures +## Debugging CI Failures -**Check CI logs via Woodpecker SQLite when the API fails.** The Woodpecker v3 log API may return HTML instead of JSON. 
Reliable fallback: -```bash -sqlite3 /var/lib/docker/volumes/disinto_woodpecker-data/_data/woodpecker.sqlite \ - "SELECT le.data FROM log_entries le \ - JOIN steps s ON le.step_id = s.id \ - JOIN workflows w ON s.pipeline_id = w.id \ - JOIN pipelines p ON w.pipeline_id = p.id \ - WHERE p.number = AND s.name = '' ORDER BY le.id" -``` +**Agents can't see CI logs.** When an issue fails repeatedly, diagnose externally and put the exact error and fix in the issue body. -**When the agent fails repeatedly on CI, diagnose externally.** The dev agent cannot see CI log output (only pass/fail status). If the same step fails 3+ times, read the logs yourself and put the exact error and fix in the issue body. +**Blocked CI often means environment, not code.** Missing libraries, auth failures, or path assumptions usually point to infrastructure misconfiguration. -## Retrying failed issues +## Base Image and Environment -**Clean up stale branches before retrying.** Old branches cause recovery mode which inherits stale code. Close the PR, delete the branch on Forgejo, then relabel to backlog. +**Minimal images introduce hidden traps.** Alpine/musl libc breaks tools that work on glibc. Validate base image assumptions before committing to a path. -**After a dependency lands, stale branches miss the fix.** If issue B depends on A, and B's PR was created before A merged, B's branch is stale. Close the PR and delete the branch so the agent starts fresh from current main. +**Ask: "What system libraries does this tool actually need?"** Don't assume development environment constraints apply to production images. -## Environment gotchas +**Document base image tradeoffs.** Alpine saves space but costs compatibility. Ubuntu saves compatibility but costs size. Make decisions explicit. 
-**Alpine/BusyBox differs from Debian.** CI and edge containers use Alpine: -- `grep -P` (Perl regex) does not work — use `grep -E` -- `USER` variable is unset — set it explicitly: `USER=$(whoami); export USER` -- Network calls fail during `docker build` in LXD — download binaries on the host, COPY into images +## Authentication -**The host repo drifts from Forgejo main.** If factory code is bind-mounted, the host checkout goes stale. Pull regularly or use versioned releases. +**Treat auth as first-class.** Authentication assumptions are the most fragile part of any integration. -## Vault operations +**Audit all external calls.** Map every operation touching remote systems and verify each has proper auth—even internal repos can require credentials. -**The human merging a vault PR must be a Forgejo site admin.** The dispatcher verifies `is_admin` on the merger. Promote your user via the Forgejo CLI or database if needed. +**Test in failure mode.** Don't just verify the happy path. Simulate restricted access environments. -**Result files cache failures.** If a vault action fails, the dispatcher writes `.result.json` and skips it. To retry: delete the result file inside the edge container. +**Verify the entire chain.** Each layer in a call chain can strip credentials. Check every script, command, and library boundary. -## Breaking down large features +**Special cases often bypass standard flows.** "Edge" or "beta" builds frequently have auth gaps. -**Vision issues need structured decomposition.** When a feature touches multiple subsystems or has design forks, label it `vision`. Break it down by identifying what exists, what can be reused, where the design forks are, and resolve them before filing backlog issues. +## Concurrency and Locking -**Prefer gluecode over greenfield.** Check if Forgejo API, Woodpecker, Docker, or existing lib/ functions can do the job before building new components. 
+**Scope conditions to the actor, not the system.** In multi-agent systems, distinguish "in progress" from "in progress by me." -**Max 7 sub-issues per sprint.** If a breakdown produces more, split into two sprints. +**Design blocking points with "who does this affect?" comments.** Challenge any global-blocking condition: "What if five agents hit this simultaneously?" + +**State falls into three categories:** global (affects all), actor-local, and shared resources. Only block on global state or own ownership. + +## Branch and PR Hygiene + +**Clean up stale branches before retrying.** Old branches cause recovery mode with stale code. Close PR, delete branch, then relabel. + +**After a dependency lands, stale branches miss the fix.** If issue B depends on A and B's PR predates A's merge, close and recreate the branch. + +## Feature Decomposition + +**Prefer gluecode over greenfield.** Check existing APIs, libraries, or utilities before building new components. + +**Max 7 sub-issues per sprint.** If more, split into two sprints. 
diff --git a/docker/edge/dispatcher.sh b/docker/edge/dispatcher.sh index e3e57b7..c3f44b0 100755 --- a/docker/edge/dispatcher.sh +++ b/docker/edge/dispatcher.sh @@ -47,9 +47,14 @@ VAULT_ENV="${SCRIPT_ROOT}/../vault/vault-env.sh" # Comma-separated list of Forgejo usernames with admin role ADMIN_USERS="${FORGE_ADMIN_USERS:-vault-bot,admin}" -# Log function +# Persistent log file for dispatcher +DISPATCHER_LOG_FILE="${DISINTO_LOG_DIR:-/tmp}/dispatcher/dispatcher.log" +mkdir -p "$(dirname "$DISPATCHER_LOG_FILE")" + +# Log function with standardized format log() { - printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" + local agent="${LOG_AGENT:-dispatcher}" + printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" >> "$DISPATCHER_LOG_FILE" } # ----------------------------------------------------------------------------- @@ -185,7 +190,7 @@ verify_admin_approver() { # Fetch reviews for this PR local reviews_json reviews_json=$(get_pr_reviews "$pr_number") || { - log "WARNING: Could not fetch reviews for PR #${pr_number} — skipping" + log "WARNING: Could not fetch reviews for PR #${pr_number} — skipping" >> "$DISPATCHER_LOG_FILE" return 1 } @@ -193,7 +198,7 @@ verify_admin_approver() { local review_count review_count=$(echo "$reviews_json" | jq 'length // 0') if [ "$review_count" -eq 0 ]; then - log "WARNING: No reviews found for PR #${pr_number} — rejecting" + log "WARNING: No reviews found for PR #${pr_number} — rejecting" >> "$DISPATCHER_LOG_FILE" return 1 fi @@ -216,12 +221,12 @@ verify_admin_approver() { # Check if reviewer is admin if is_allowed_admin "$reviewer"; then - log "Verified: PR #${pr_number} approved by admin '${reviewer}'" + log "Verified: PR #${pr_number} approved by admin '${reviewer}'" >> "$DISPATCHER_LOG_FILE" return 0 fi done < <(echo "$reviews_json" | jq -c '.[]') - log "WARNING: No admin approval found for PR #${pr_number} — rejecting" + log "WARNING: No admin approval found for PR #${pr_number} — rejecting" >> 
"$DISPATCHER_LOG_FILE" return 1 } @@ -243,11 +248,11 @@ verify_admin_merged() { # Get the PR that introduced this file local pr_num pr_num=$(get_pr_for_file "$toml_file") || { - log "WARNING: No PR found for action ${action_id} — skipping (possible direct push)" + log "WARNING: No PR found for action ${action_id} — skipping (possible direct push)" >> "$DISPATCHER_LOG_FILE" return 1 } - log "Action ${action_id} arrived via PR #${pr_num}" + log "Action ${action_id} arrived via PR #${pr_num}" >> "$DISPATCHER_LOG_FILE" # First, try admin approver check (for auto-merge workflow) if verify_admin_approver "$pr_num" "$action_id"; then @@ -257,7 +262,7 @@ verify_admin_merged() { # Fallback: Check merger (backwards compatibility for manual merges) local merger_json merger_json=$(get_pr_merger "$pr_num") || { - log "WARNING: Could not fetch PR #${pr_num} details — skipping" + log "WARNING: Could not fetch PR #${pr_num} details — skipping" >> "$DISPATCHER_LOG_FILE" return 1 } @@ -267,22 +272,22 @@ verify_admin_merged() { # Check if PR is merged if [[ "$merged" != "true" ]]; then - log "WARNING: PR #${pr_num} is not merged — skipping" + log "WARNING: PR #${pr_num} is not merged — skipping" >> "$DISPATCHER_LOG_FILE" return 1 fi # Check if merger is admin if [ -z "$merger_username" ]; then - log "WARNING: Could not determine PR #${pr_num} merger — skipping" + log "WARNING: Could not determine PR #${pr_num} merger — skipping" >> "$DISPATCHER_LOG_FILE" return 1 fi if ! 
is_allowed_admin "$merger_username"; then - log "WARNING: PR #${pr_num} merged by non-admin user '${merger_username}' — skipping" + log "WARNING: PR #${pr_num} merged by non-admin user '${merger_username}' — skipping" >> "$DISPATCHER_LOG_FILE" return 1 fi - log "Verified: PR #${pr_num} merged by admin '${merger_username}' (fallback check)" + log "Verified: PR #${pr_num} merged by admin '${merger_username}' (fallback check)" >> "$DISPATCHER_LOG_FILE" return 0 } @@ -343,7 +348,7 @@ write_result() { '{id: $id, exit_code: $exit_code, timestamp: $timestamp, logs: $logs}' \ > "$result_file" - log "Result written: ${result_file}" + log "Result written: ${result_file}" >> "$DISPATCHER_LOG_FILE" } # Launch runner for the given action @@ -353,18 +358,18 @@ launch_runner() { local action_id action_id=$(basename "$toml_file" .toml) - log "Launching runner for action: ${action_id}" + log "Launching runner for action: ${action_id}" >> "$DISPATCHER_LOG_FILE" # Validate TOML if ! validate_action "$toml_file"; then - log "ERROR: Action validation failed for ${action_id}" + log "ERROR: Action validation failed for ${action_id}" >> "$DISPATCHER_LOG_FILE" write_result "$action_id" 1 "Validation failed: see logs above" return 1 fi # Verify admin merge if ! 
verify_admin_merged "$toml_file"; then - log "ERROR: Admin merge verification failed for ${action_id}" + log "ERROR: Admin merge verification failed for ${action_id}" >> "$DISPATCHER_LOG_FILE" write_result "$action_id" 1 "Admin merge verification failed: see logs above" return 1 fi @@ -400,7 +405,7 @@ launch_runner() { fi done else - log "Action ${action_id} has no secrets declared — runner will execute without extra env vars" + log "Action ${action_id} has no secrets declared — runner will execute without extra env vars" >> "$DISPATCHER_LOG_FILE" fi # Add formula and action id as arguments (safe from shell injection) @@ -423,7 +428,7 @@ launch_runner() { log_cmd+=("$arg") fi done - log "Running: ${log_cmd[*]}" + log "Running: ${log_cmd[*]}" >> "$DISPATCHER_LOG_FILE" # Create temp file for logs local log_file @@ -443,9 +448,9 @@ launch_runner() { write_result "$action_id" "$exit_code" "$logs" if [ $exit_code -eq 0 ]; then - log "Runner completed successfully for action: ${action_id}" + log "Runner completed successfully for action: ${action_id}" >> "$DISPATCHER_LOG_FILE" else - log "Runner failed for action: ${action_id} (exit code: ${exit_code})" + log "Runner failed for action: ${action_id} (exit code: ${exit_code})" >> "$DISPATCHER_LOG_FILE" fi return $exit_code @@ -512,7 +517,7 @@ dispatch_reproduce() { local issue_number="$1" if is_reproduce_running "$issue_number"; then - log "Reproduce already running for issue #${issue_number}, skipping" + log "Reproduce already running for issue #${issue_number}, skipping" >> "$DISPATCHER_LOG_FILE" return 0 fi @@ -523,11 +528,11 @@ dispatch_reproduce() { done if [ -z "$project_toml" ]; then - log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping reproduce for #${issue_number}" + log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping reproduce for #${issue_number}" >> "$DISPATCHER_LOG_FILE" return 0 fi - log "Dispatching reproduce-agent for issue #${issue_number} (project: 
${project_toml})" + log "Dispatching reproduce-agent for issue #${issue_number} (project: ${project_toml})" >> "$DISPATCHER_LOG_FILE" # Build docker run command using array (safe from injection) local -a cmd=(docker run --rm @@ -575,7 +580,7 @@ dispatch_reproduce() { "${cmd[@]}" & local bg_pid=$! echo "$bg_pid" > "$(_reproduce_lockfile "$issue_number")" - log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}" + log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}" >> "$DISPATCHER_LOG_FILE" } # ----------------------------------------------------------------------------- @@ -636,7 +641,7 @@ dispatch_triage() { local issue_number="$1" if is_triage_running "$issue_number"; then - log "Triage already running for issue #${issue_number}, skipping" + log "Triage already running for issue #${issue_number}, skipping" >> "$DISPATCHER_LOG_FILE" return 0 fi @@ -647,7 +652,7 @@ dispatch_triage() { done if [ -z "$project_toml" ]; then - log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping triage for #${issue_number}" + log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping triage for #${issue_number}" >> "$DISPATCHER_LOG_FILE" return 0 fi @@ -700,7 +705,7 @@ dispatch_triage() { "${cmd[@]}" & local bg_pid=$! echo "$bg_pid" > "$(_triage_lockfile "$issue_number")" - log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}" + log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}" >> "$DISPATCHER_LOG_FILE" } # ----------------------------------------------------------------------------- @@ -710,19 +715,19 @@ dispatch_triage() { # Clone or pull the ops repo ensure_ops_repo() { if [ ! -d "${OPS_REPO_ROOT}/.git" ]; then - log "Cloning ops repo from ${FORGE_URL}/${FORGE_OPS_REPO}..." + log "Cloning ops repo from ${FORGE_URL}/${FORGE_OPS_REPO}..." 
>> "$DISPATCHER_LOG_FILE" git clone "${FORGE_URL}/${FORGE_OPS_REPO}" "${OPS_REPO_ROOT}" else - log "Pulling latest ops repo changes..." + log "Pulling latest ops repo changes..." >> "$DISPATCHER_LOG_FILE" (cd "${OPS_REPO_ROOT}" && git pull --rebase) fi } # Main dispatcher loop main() { - log "Starting dispatcher..." - log "Polling ops repo: ${VAULT_ACTIONS_DIR}" - log "Admin users: ${ADMIN_USERS}" + log "Starting dispatcher..." >> "$DISPATCHER_LOG_FILE" + log "Polling ops repo: ${VAULT_ACTIONS_DIR}" >> "$DISPATCHER_LOG_FILE" + log "Admin users: ${ADMIN_USERS}" >> "$DISPATCHER_LOG_FILE" while true; do # Refresh ops repo at the start of each poll cycle @@ -730,7 +735,7 @@ main() { # Check if actions directory exists if [ ! -d "${VAULT_ACTIONS_DIR}" ]; then - log "Actions directory not found: ${VAULT_ACTIONS_DIR}" + log "Actions directory not found: ${VAULT_ACTIONS_DIR}" >> "$DISPATCHER_LOG_FILE" sleep 60 continue fi @@ -745,7 +750,7 @@ main() { # Skip if already completed if is_action_completed "$action_id"; then - log "Action ${action_id} already completed, skipping" + log "Action ${action_id} already completed, skipping" >> "$DISPATCHER_LOG_FILE" continue fi diff --git a/gardener/gardener-run.sh b/gardener/gardener-run.sh index 3b29987..7da8b5f 100755 --- a/gardener/gardener-run.sh +++ b/gardener/gardener-run.sh @@ -55,7 +55,8 @@ RESULT_FILE="/tmp/gardener-result-${PROJECT_NAME}.txt" GARDENER_PR_FILE="/tmp/gardener-pr-${PROJECT_NAME}.txt" WORKTREE="/tmp/${PROJECT_NAME}-gardener-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="gardener" # ── Guards ──────────────────────────────────────────────────────────────── check_active gardener @@ -156,19 +157,21 @@ _gardener_execute_manifest() { case "$action" in add_label) - local label label_id + local label label_id http_code resp label=$(jq -r ".[$i].label" "$manifest_file") label_id=$(curl -sf -H "Authorization: token 
${FORGE_TOKEN}" \ "${FORGE_API}/labels" | jq -r --arg n "$label" \ '.[] | select(.name == $n) | .id') || true if [ -n "$label_id" ]; then - if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/issues/${issue}/labels" \ - -d "{\"labels\":[${label_id}]}" >/dev/null 2>&1; then + -d "{\"labels\":[${label_id}]}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then log "manifest: add_label '${label}' to #${issue}" else - log "manifest: FAILED add_label '${label}' to #${issue}" + log "manifest: FAILED add_label '${label}' to #${issue}: HTTP ${http_code}" fi else log "manifest: FAILED add_label — label '${label}' not found" @@ -176,17 +179,19 @@ _gardener_execute_manifest() { ;; remove_label) - local label label_id + local label label_id http_code resp label=$(jq -r ".[$i].label" "$manifest_file") label_id=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ "${FORGE_API}/labels" | jq -r --arg n "$label" \ '.[] | select(.name == $n) | .id') || true if [ -n "$label_id" ]; then - if curl -sf -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/issues/${issue}/labels/${label_id}" >/dev/null 2>&1; then + resp=$(curl -sf -w "\n%{http_code}" -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \ + "${FORGE_API}/issues/${issue}/labels/${label_id}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then log "manifest: remove_label '${label}' from #${issue}" else - log "manifest: FAILED remove_label '${label}' from #${issue}" + log "manifest: FAILED remove_label '${label}' from #${issue}: HTTP ${http_code}" fi else log "manifest: FAILED remove_label — label '${label}' not found" @@ -194,34 +199,38 @@ _gardener_execute_manifest() { ;; close) - local reason + local reason http_code resp 
reason=$(jq -r ".[$i].reason // empty" "$manifest_file") - if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/issues/${issue}" \ - -d '{"state":"closed"}' >/dev/null 2>&1; then + -d '{"state":"closed"}' 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then log "manifest: closed #${issue} (${reason})" else - log "manifest: FAILED close #${issue}" + log "manifest: FAILED close #${issue}: HTTP ${http_code}" fi ;; comment) - local body escaped_body + local body escaped_body http_code resp body=$(jq -r ".[$i].body" "$manifest_file") escaped_body=$(printf '%s' "$body" | jq -Rs '.') - if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/issues/${issue}/comments" \ - -d "{\"body\":${escaped_body}}" >/dev/null 2>&1; then + -d "{\"body\":${escaped_body}}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then log "manifest: commented on #${issue}" else - log "manifest: FAILED comment on #${issue}" + log "manifest: FAILED comment on #${issue}: HTTP ${http_code}" fi ;; create_issue) - local title body labels escaped_title escaped_body label_ids + local title body labels escaped_title escaped_body label_ids http_code resp title=$(jq -r ".[$i].title" "$manifest_file") body=$(jq -r ".[$i].body" "$manifest_file") labels=$(jq -r ".[$i].labels // [] | .[]" "$manifest_file") @@ -241,40 +250,46 @@ _gardener_execute_manifest() { done <<< "$labels" [ -n "$ids_json" ] && label_ids="[${ids_json}]" fi - if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \ -H 
'Content-Type: application/json' \ "${FORGE_API}/issues" \ - -d "{\"title\":${escaped_title},\"body\":${escaped_body},\"labels\":${label_ids}}" >/dev/null 2>&1; then + -d "{\"title\":${escaped_title},\"body\":${escaped_body},\"labels\":${label_ids}}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then log "manifest: created issue '${title}'" else - log "manifest: FAILED create_issue '${title}'" + log "manifest: FAILED create_issue '${title}': HTTP ${http_code}" fi ;; edit_body) - local body escaped_body + local body escaped_body http_code resp body=$(jq -r ".[$i].body" "$manifest_file") escaped_body=$(printf '%s' "$body" | jq -Rs '.') - if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/issues/${issue}" \ - -d "{\"body\":${escaped_body}}" >/dev/null 2>&1; then + -d "{\"body\":${escaped_body}}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then log "manifest: edited body of #${issue}" else - log "manifest: FAILED edit_body #${issue}" + log "manifest: FAILED edit_body #${issue}: HTTP ${http_code}" fi ;; close_pr) - local pr + local pr http_code resp pr=$(jq -r ".[$i].pr" "$manifest_file") - if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/pulls/${pr}" \ - -d '{"state":"closed"}' >/dev/null 2>&1; then + -d '{"state":"closed"}' 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then log "manifest: closed PR #${pr}" else - log "manifest: FAILED close_pr #${pr}" + log "manifest: FAILED close_pr #${pr}: HTTP ${http_code}" fi ;; diff --git a/lib/agent-sdk.sh 
b/lib/agent-sdk.sh index 1180982..ef8e0b2 100644 --- a/lib/agent-sdk.sh +++ b/lib/agent-sdk.sh @@ -52,9 +52,11 @@ agent_run() { log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})" output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$? if [ "$rc" -eq 124 ]; then - log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s" + log "agent_run: TIMEOUT after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)" elif [ "$rc" -ne 0 ]; then - log "agent_run: claude exited with code $rc" + local tail_lines + tail_lines=$(echo "$output" | tail -3) + log "agent_run: FAILED with exit code $rc: ${tail_lines:-no output}" fi if [ -z "$output" ]; then log "agent_run: empty output (claude may have crashed or failed)" @@ -89,9 +91,11 @@ agent_run() { local nudge_rc output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$? 
if [ "$nudge_rc" -eq 124 ]; then - log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s" + log "agent_run: nudge TIMEOUT after ${CLAUDE_TIMEOUT:-7200}s (exit code $nudge_rc)" elif [ "$nudge_rc" -ne 0 ]; then - log "agent_run: nudge claude exited with code $nudge_rc" + local nudge_tail + nudge_tail=$(echo "$output" | tail -3) + log "agent_run: nudge FAILED with exit code $nudge_rc: ${nudge_tail:-no output}" fi new_sid=$(printf '%s' "$output" | jq -r '.session_id // empty' 2>/dev/null) || true if [ -n "$new_sid" ]; then diff --git a/lib/env.sh b/lib/env.sh index 7a82619..7c29d83 100755 --- a/lib/env.sh +++ b/lib/env.sh @@ -138,8 +138,12 @@ unset CLAWHUB_TOKEN 2>/dev/null || true export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 # Shared log helper +# Usage: log "message" +# Sets LOG_AGENT to control the agent name prefix (default: derived from SCRIPT_DIR) +# Output format: [2026-04-03T14:00:00Z] agent: message log() { - printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" + local agent="${LOG_AGENT:-$(basename "$(dirname "$(dirname "${BASH_SOURCE[0]}")")")}" + printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" } # ============================================================================= diff --git a/lib/pr-lifecycle.sh b/lib/pr-lifecycle.sh index 2bb64e1..d248d2d 100644 --- a/lib/pr-lifecycle.sh +++ b/lib/pr-lifecycle.sh @@ -357,11 +357,19 @@ pr_close() { local pr_num="$1" _prl_log "closing PR #${pr_num}" - curl -sf -X PATCH \ + local resp http_code + resp=$(curl -sf -w "\n%{http_code}" -X PATCH \ -H "Authorization: token ${FORGE_TOKEN}" \ -H "Content-Type: application/json" \ "${FORGE_API}/pulls/${pr_num}" \ - -d '{"state":"closed"}' >/dev/null 2>&1 || true + -d '{"state":"closed"}' 2>/dev/null) || true + http_code=$(printf '%s\n' "$resp" | tail -1) + if [ "$http_code" != "200" ] && [ "$http_code" != "204" ]; then + local body + body=$(printf '%s\n' "$resp" | sed '$d' | head -1) + _prl_log "pr_close FAILED for PR #${pr_num}: 
HTTP ${http_code} ${body:0:200}" + return 1 + fi } # --------------------------------------------------------------------------- @@ -398,11 +406,17 @@ pr_walk_to_merge() { if [ "${_PR_CI_FAILURE_TYPE:-}" = "infra" ] && [ "$ci_retry_count" -lt 1 ]; then ci_retry_count=$((ci_retry_count + 1)) _prl_log "infra failure — retriggering CI (retry ${ci_retry_count})" - ( cd "$worktree" && \ + local rebase_output rebase_rc + rebase_output=$( ( cd "$worktree" && \ git commit --allow-empty -m "ci: retrigger after infra failure" --no-verify && \ git fetch "$remote" "${PRIMARY_BRANCH}" 2>/dev/null && \ git rebase "${remote}/${PRIMARY_BRANCH}" && \ - git push --force-with-lease "$remote" HEAD ) 2>&1 | tail -5 || true + git push --force-with-lease "$remote" HEAD ) 2>&1 ) || rebase_rc=$? + if [ -n "$rebase_rc" ] && [ "$rebase_rc" -ne 0 ]; then + _prl_log "infra retrigger FAILED (exit code $rebase_rc): $(echo "$rebase_output" | tail -3)" + else + _prl_log "infra retrigger succeeded" + fi continue fi diff --git a/planner/planner-run.sh b/planner/planner-run.sh index 2bbfab8..1a13a60 100755 --- a/planner/planner-run.sh +++ b/planner/planner-run.sh @@ -35,7 +35,7 @@ source "$FACTORY_ROOT/lib/guard.sh" # shellcheck source=../lib/agent-sdk.sh source "$FACTORY_ROOT/lib/agent-sdk.sh" -LOG_FILE="$SCRIPT_DIR/planner.log" +LOG_FILE="${DISINTO_LOG_DIR}/planner/planner.log" # shellcheck disable=SC2034 # consumed by agent-sdk.sh LOGFILE="$LOG_FILE" # shellcheck disable=SC2034 # consumed by agent-sdk.sh @@ -43,7 +43,8 @@ SID_FILE="/tmp/planner-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/planner-${PROJECT_NAME}-scratch.md" WORKTREE="/tmp/${PROJECT_NAME}-planner-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="planner" # ── Guards ──────────────────────────────────────────────────────────────── check_active planner diff --git a/predictor/predictor-run.sh b/predictor/predictor-run.sh index 
f87001b..89050d7 100755 --- a/predictor/predictor-run.sh +++ b/predictor/predictor-run.sh @@ -36,7 +36,7 @@ source "$FACTORY_ROOT/lib/guard.sh" # shellcheck source=../lib/agent-sdk.sh source "$FACTORY_ROOT/lib/agent-sdk.sh" -LOG_FILE="$SCRIPT_DIR/predictor.log" +LOG_FILE="${DISINTO_LOG_DIR}/predictor/predictor.log" # shellcheck disable=SC2034 # consumed by agent-sdk.sh LOGFILE="$LOG_FILE" # shellcheck disable=SC2034 # consumed by agent-sdk.sh @@ -44,7 +44,8 @@ SID_FILE="/tmp/predictor-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/predictor-${PROJECT_NAME}-scratch.md" WORKTREE="/tmp/${PROJECT_NAME}-predictor-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="predictor" # ── Guards ──────────────────────────────────────────────────────────────── check_active predictor diff --git a/review/review-poll.sh b/review/review-poll.sh index 47d37df..dcda50d 100755 --- a/review/review-poll.sh +++ b/review/review-poll.sh @@ -23,20 +23,19 @@ LOGFILE="${DISINTO_LOG_DIR}/review/review-poll.log" MAX_REVIEWS=3 REVIEW_IDLE_TIMEOUT=14400 # 4h: kill review session if idle -log() { - printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE" -} +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="review" # Log rotation if [ -f "$LOGFILE" ]; then LOGSIZE=$(stat -c%s "$LOGFILE" 2>/dev/null || echo 0) if [ "$LOGSIZE" -gt 102400 ]; then mv "$LOGFILE" "$LOGFILE.old" - log "Log rotated" + log "Log rotated" >> "$LOGFILE" fi fi -log "--- Poll start ---" +log "--- Poll start ---" >> "$LOGFILE" # --- Clean up stale review sessions (.sid files + worktrees) --- # Remove .sid files, phase files, and worktrees for merged/closed PRs or idle > 4h @@ -54,7 +53,7 @@ if [ -n "$REVIEW_SIDS" ]; then "${API_BASE}/pulls/${pr_num}" | jq -r '.state // "unknown"' 2>/dev/null) || true if [ "$pr_state" != "open" ]; then - log "cleanup: PR #${pr_num} state=${pr_state} — removing 
sid/worktree" + log "cleanup: PR #${pr_num} state=${pr_state} — removing sid/worktree" >> "$LOGFILE" rm -f "$sid_file" "$phase_file" "/tmp/${PROJECT_NAME}-review-output-${pr_num}.json" cd "$REPO_ROOT" git worktree remove "$worktree" --force 2>/dev/null || true @@ -66,7 +65,7 @@ if [ -n "$REVIEW_SIDS" ]; then sid_mtime=$(stat -c %Y "$sid_file" 2>/dev/null || echo 0) now=$(date +%s) if [ "$sid_mtime" -gt 0 ] && [ $(( now - sid_mtime )) -gt "$REVIEW_IDLE_TIMEOUT" ]; then - log "cleanup: PR #${pr_num} idle > 4h — removing sid/worktree" + log "cleanup: PR #${pr_num} idle > 4h — removing sid/worktree" >> "$LOGFILE" rm -f "$sid_file" "$phase_file" "/tmp/${PROJECT_NAME}-review-output-${pr_num}.json" cd "$REPO_ROOT" git worktree remove "$worktree" --force 2>/dev/null || true @@ -86,7 +85,7 @@ if [ -z "$PRS" ]; then fi TOTAL=$(echo "$PRS" | wc -l) -log "Found ${TOTAL} open PRs" +log "Found ${TOTAL} open PRs" >> "$LOGFILE" REVIEWED=0 SKIPPED=0 @@ -119,17 +118,19 @@ if [ -n "$REVIEW_SIDS" ]; then if ! ci_passed "$ci_state"; then if ci_required_for_pr "$pr_num"; then - log " #${pr_num} awaiting_changes: new SHA ${current_sha:0:7} CI=${ci_state}, waiting" + log " #${pr_num} awaiting_changes: new SHA ${current_sha:0:7} CI=${ci_state}, waiting" >> "$LOGFILE" continue fi fi - log " #${pr_num} re-review: new commits (${reviewed_sha:0:7}→${current_sha:0:7})" + log " #${pr_num} re-review: new commits (${reviewed_sha:0:7}→${current_sha:0:7})" >> "$LOGFILE" - if "${SCRIPT_DIR}/review-pr.sh" "$pr_num" 2>&1; then + review_output=$("${SCRIPT_DIR}/review-pr.sh" "$pr_num" 2>&1) && review_rc=0 || review_rc=$? + if [ "$review_rc" -eq 0 ]; then REVIEWED=$((REVIEWED + 1)) else - log " #${pr_num} re-review failed" + tail_lines=$(echo "$review_output" | tail -3) + log " #${pr_num} re-review FAILED (exit code $review_rc): ${tail_lines}" fi [ "$REVIEWED" -lt "$MAX_REVIEWS" ] || break @@ -145,11 +146,11 @@ while IFS= read -r line; do # Skip if CI is running/failed. 
Allow "success", no CI configured, or non-code PRs if ! ci_passed "$CI_STATE"; then if ci_required_for_pr "$PR_NUM"; then - log " #${PR_NUM} CI=${CI_STATE}, skip" + log " #${PR_NUM} CI=${CI_STATE}, skip" >> "$LOGFILE" SKIPPED=$((SKIPPED + 1)) continue fi - log " #${PR_NUM} CI=${CI_STATE} but no code files — proceeding" + log " #${PR_NUM} CI=${CI_STATE} but no code files — proceeding" >> "$LOGFILE" fi # Check formal forge reviews (not comment markers) @@ -159,12 +160,12 @@ while IFS= read -r line; do '[.[] | select(.commit_id == $sha) | select(.state != "COMMENT")] | length') if [ "${HAS_REVIEW:-0}" -gt "0" ]; then - log " #${PR_NUM} formal review exists for ${PR_SHA:0:7}, skip" + log " #${PR_NUM} formal review exists for ${PR_SHA:0:7}, skip" >> "$LOGFILE" SKIPPED=$((SKIPPED + 1)) continue fi - log " #${PR_NUM} needs review (CI=success, SHA=${PR_SHA:0:7})" + log " #${PR_NUM} needs review (CI=success, SHA=${PR_SHA:0:7})" >> "$LOGFILE" # Circuit breaker: count existing review-error comments for this SHA ERROR_COMMENTS=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ @@ -173,21 +174,23 @@ while IFS= read -r line; do '[.[] | select(.body | contains(""))] | length') if [ "${ERROR_COMMENTS:-0}" -ge 3 ]; then - log " #${PR_NUM} blocked: ${ERROR_COMMENTS} consecutive error comments for ${PR_SHA:0:7}, skipping" + log " #${PR_NUM} blocked: ${ERROR_COMMENTS} consecutive error comments for ${PR_SHA:0:7}, skipping" >> "$LOGFILE" SKIPPED=$((SKIPPED + 1)) continue fi - log " #${PR_NUM} error check: ${ERROR_COMMENTS:-0} prior error(s) for ${PR_SHA:0:7}" + log " #${PR_NUM} error check: ${ERROR_COMMENTS:-0} prior error(s) for ${PR_SHA:0:7}" >> "$LOGFILE" - if "${SCRIPT_DIR}/review-pr.sh" "$PR_NUM" 2>&1; then + review_output=$("${SCRIPT_DIR}/review-pr.sh" "$PR_NUM" 2>&1) && review_rc=0 || review_rc=$? 
+ if [ "$review_rc" -eq 0 ]; then REVIEWED=$((REVIEWED + 1)) else - log " #${PR_NUM} review failed" + tail_lines=$(echo "$review_output" | tail -3) + log " #${PR_NUM} review FAILED (exit code $review_rc): ${tail_lines}" fi if [ "$REVIEWED" -ge "$MAX_REVIEWS" ]; then - log "Hit max reviews (${MAX_REVIEWS}), stopping" + log "Hit max reviews (${MAX_REVIEWS}), stopping" >> "$LOGFILE" break fi @@ -195,4 +198,4 @@ while IFS= read -r line; do done <<< "$PRS" -log "--- Poll done: ${REVIEWED} reviewed, ${SKIPPED} skipped ---" +log "--- Poll done: ${REVIEWED} reviewed, ${SKIPPED} skipped ---" >> "$LOGFILE" diff --git a/supervisor/supervisor-run.sh b/supervisor/supervisor-run.sh index f3e70dd..cc99610 100755 --- a/supervisor/supervisor-run.sh +++ b/supervisor/supervisor-run.sh @@ -46,7 +46,8 @@ SID_FILE="/tmp/supervisor-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md" WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="supervisor" # ── Guards ──────────────────────────────────────────────────────────────── check_active supervisor @@ -67,10 +68,12 @@ resolve_agent_identity || true # ── Collect pre-flight metrics ──────────────────────────────────────────── log "Running preflight.sh" PREFLIGHT_OUTPUT="" +PREFLIGHT_RC=0 if PREFLIGHT_OUTPUT=$(bash "$SCRIPT_DIR/preflight.sh" "$PROJECT_TOML" 2>&1); then log "Preflight collected ($(echo "$PREFLIGHT_OUTPUT" | wc -l) lines)" else - log "WARNING: preflight.sh failed, continuing with partial data" + PREFLIGHT_RC=$? + log "WARNING: preflight.sh failed (exit code $PREFLIGHT_RC), continuing with partial data" fi # ── Load formula + context ───────────────────────────────────────────────