From d216e6294f5d7c38ebfc72376ea3f7d16af388e4 Mon Sep 17 00:00:00 2001 From: Agent Date: Tue, 7 Apr 2026 18:51:34 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20standardize=20logging=20across=20?= =?UTF-8?q?all=20agents=20=E2=80=94=20capture=20errors,=20log=20exit=20cod?= =?UTF-8?q?es,=20consistent=20format=20(#367)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- architect/architect-run.sh | 5 +- disinto-factory/lessons-learned.md | 69 ++++++++++++++------------- docker/edge/dispatcher.sh | 75 ++++++++++++++++-------------- gardener/gardener-run.sh | 73 +++++++++++++++++------------ lib/agent-sdk.sh | 12 +++-- lib/env.sh | 6 ++- lib/pr-lifecycle.sh | 22 +++++++-- planner/planner-run.sh | 5 +- predictor/predictor-run.sh | 5 +- review/review-poll.sh | 47 ++++++++++--------- supervisor/supervisor-run.sh | 7 ++- 11 files changed, 190 insertions(+), 136 deletions(-) diff --git a/architect/architect-run.sh b/architect/architect-run.sh index 18de885..ce7498c 100755 --- a/architect/architect-run.sh +++ b/architect/architect-run.sh @@ -36,7 +36,7 @@ source "$FACTORY_ROOT/lib/guard.sh" # shellcheck source=../lib/agent-sdk.sh source "$FACTORY_ROOT/lib/agent-sdk.sh" -LOG_FILE="$SCRIPT_DIR/architect.log" +LOG_FILE="${DISINTO_LOG_DIR}/architect/architect.log" # shellcheck disable=SC2034 # consumed by agent-sdk.sh LOGFILE="$LOG_FILE" # shellcheck disable=SC2034 # consumed by agent-sdk.sh @@ -44,7 +44,8 @@ SID_FILE="/tmp/architect-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/architect-${PROJECT_NAME}-scratch.md" WORKTREE="/tmp/${PROJECT_NAME}-architect-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="architect" # ── Guards ──────────────────────────────────────────────────────────────── check_active architect diff --git a/disinto-factory/lessons-learned.md b/disinto-factory/lessons-learned.md index b1f6a3b..7f0f94a 100644 ---
a/disinto-factory/lessons-learned.md +++ b/disinto-factory/lessons-learned.md @@ -1,54 +1,57 @@ -# Working with the factory — lessons learned +# Lessons Learned -## Writing issues for the dev agent +## Issue Management -**Put everything in the issue body, not comments.** The dev agent reads the issue body when it starts work. It does not reliably read comments. If an issue fails and you need to add guidance for a retry, update the issue body. +**Put everything in the issue body.** Agents read the issue body; comments are unreliable. Add guidance by updating the body. -**One approach per issue, no choices.** The dev agent cannot make design decisions. If there are multiple ways to solve a problem, decide before filing. Issues with "Option A or Option B" will confuse the agent. +**One decision per issue.** Agents cannot choose between approaches. Decide before filing. -**Issues must fit the templates.** Every backlog issue needs: affected files (max 3), acceptance criteria (max 5 checkboxes), and a clear proposed solution. If you cannot fill these fields, the issue is too big — label it `vision` and break it down first. +**Explicit dependencies prevent ordering bugs.** Use `Depends-on: #N` so agents don't work on stale code. -**Explicit dependencies prevent ordering bugs.** Add `Depends-on: #N` in the issue body. dev-poll checks these before pickup. Without explicit deps, the agent may attempt work on a stale codebase. +**Fit issues to templates.** Max 3 files, 5 acceptance criteria, clear solution. If it doesn't fit, it's too big—label `vision` and decompose. -## Debugging CI failures +## Debugging CI Failures -**Check CI logs via Woodpecker SQLite when the API fails.** The Woodpecker v3 log API may return HTML instead of JSON. 
Reliable fallback: -```bash -sqlite3 /var/lib/docker/volumes/disinto_woodpecker-data/_data/woodpecker.sqlite \ - "SELECT le.data FROM log_entries le \ - JOIN steps s ON le.step_id = s.id \ - JOIN workflows w ON s.pipeline_id = w.id \ - JOIN pipelines p ON w.pipeline_id = p.id \ - WHERE p.number = AND s.name = '' ORDER BY le.id" -``` +**Agents can't see CI logs.** When an issue fails repeatedly, diagnose externally and put the exact error and fix in the issue body. -**When the agent fails repeatedly on CI, diagnose externally.** The dev agent cannot see CI log output (only pass/fail status). If the same step fails 3+ times, read the logs yourself and put the exact error and fix in the issue body. +**Blocked CI often means environment, not code.** Missing libraries, auth failures, or path assumptions usually point to infrastructure misconfiguration. -## Retrying failed issues +## Base Image and Environment -**Clean up stale branches before retrying.** Old branches cause recovery mode which inherits stale code. Close the PR, delete the branch on Forgejo, then relabel to backlog. +**Minimal images introduce hidden traps.** Alpine/musl libc breaks tools that work on glibc. Validate base image assumptions before committing to a path. -**After a dependency lands, stale branches miss the fix.** If issue B depends on A, and B's PR was created before A merged, B's branch is stale. Close the PR and delete the branch so the agent starts fresh from current main. +**Ask: "What system libraries does this tool actually need?"** Don't assume development environment constraints apply to production images. -## Environment gotchas +**Document base image tradeoffs.** Alpine saves space but costs compatibility. Ubuntu saves compatibility but costs size. Make decisions explicit. 
-**Alpine/BusyBox differs from Debian.** CI and edge containers use Alpine: -- `grep -P` (Perl regex) does not work — use `grep -E` -- `USER` variable is unset — set it explicitly: `USER=$(whoami); export USER` -- Network calls fail during `docker build` in LXD — download binaries on the host, COPY into images +## Authentication -**The host repo drifts from Forgejo main.** If factory code is bind-mounted, the host checkout goes stale. Pull regularly or use versioned releases. +**Treat auth as first-class.** Authentication assumptions are the most fragile part of any integration. -## Vault operations +**Audit all external calls.** Map every operation touching remote systems and verify each has proper auth—even internal repos can require credentials. -**The human merging a vault PR must be a Forgejo site admin.** The dispatcher verifies `is_admin` on the merger. Promote your user via the Forgejo CLI or database if needed. +**Test in failure mode.** Don't just verify the happy path. Simulate restricted access environments. -**Result files cache failures.** If a vault action fails, the dispatcher writes `.result.json` and skips it. To retry: delete the result file inside the edge container. +**Verify the entire chain.** Each layer in a call chain can strip credentials. Check every script, command, and library boundary. -## Breaking down large features +**Special cases often bypass standard flows.** "Edge" or "beta" builds frequently have auth gaps. -**Vision issues need structured decomposition.** When a feature touches multiple subsystems or has design forks, label it `vision`. Break it down by identifying what exists, what can be reused, where the design forks are, and resolve them before filing backlog issues. +## Concurrency and Locking -**Prefer gluecode over greenfield.** Check if Forgejo API, Woodpecker, Docker, or existing lib/ functions can do the job before building new components. 
+**Scope conditions to the actor, not the system.** In multi-agent systems, distinguish "in progress" from "in progress by me." -**Max 7 sub-issues per sprint.** If a breakdown produces more, split into two sprints. +**Design blocking points with "who does this affect?" comments.** Challenge any global-blocking condition: "What if five agents hit this simultaneously?" + +**State falls into three categories:** global (affects all), actor-local, and shared resources. Only block on global state or own ownership. + +## Branch and PR Hygiene + +**Clean up stale branches before retrying.** Old branches cause recovery mode with stale code. Close PR, delete branch, then relabel. + +**After a dependency lands, stale branches miss the fix.** If issue B depends on A and B's PR predates A's merge, close and recreate the branch. + +## Feature Decomposition + +**Prefer gluecode over greenfield.** Check existing APIs, libraries, or utilities before building new components. + +**Max 7 sub-issues per sprint.** If more, split into two sprints. 
diff --git a/docker/edge/dispatcher.sh b/docker/edge/dispatcher.sh index e3e57b7..c3f44b0 100755 --- a/docker/edge/dispatcher.sh +++ b/docker/edge/dispatcher.sh @@ -47,9 +47,14 @@ VAULT_ENV="${SCRIPT_ROOT}/../vault/vault-env.sh" # Comma-separated list of Forgejo usernames with admin role ADMIN_USERS="${FORGE_ADMIN_USERS:-vault-bot,admin}" -# Log function +# Persistent log file for dispatcher +DISPATCHER_LOG_FILE="${DISINTO_LOG_DIR:-/tmp}/dispatcher/dispatcher.log" +mkdir -p "$(dirname "$DISPATCHER_LOG_FILE")" + +# Log function with standardized format log() { - printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" + local agent="${LOG_AGENT:-dispatcher}" + printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" >> "$DISPATCHER_LOG_FILE" } # ----------------------------------------------------------------------------- @@ -185,7 +190,7 @@ verify_admin_approver() { # Fetch reviews for this PR local reviews_json reviews_json=$(get_pr_reviews "$pr_number") || { - log "WARNING: Could not fetch reviews for PR #${pr_number} — skipping" + log "WARNING: Could not fetch reviews for PR #${pr_number} — skipping" >> "$DISPATCHER_LOG_FILE" return 1 } @@ -193,7 +198,7 @@ verify_admin_approver() { local review_count review_count=$(echo "$reviews_json" | jq 'length // 0') if [ "$review_count" -eq 0 ]; then - log "WARNING: No reviews found for PR #${pr_number} — rejecting" + log "WARNING: No reviews found for PR #${pr_number} — rejecting" >> "$DISPATCHER_LOG_FILE" return 1 fi @@ -216,12 +221,12 @@ verify_admin_approver() { # Check if reviewer is admin if is_allowed_admin "$reviewer"; then - log "Verified: PR #${pr_number} approved by admin '${reviewer}'" + log "Verified: PR #${pr_number} approved by admin '${reviewer}'" >> "$DISPATCHER_LOG_FILE" return 0 fi done < <(echo "$reviews_json" | jq -c '.[]') - log "WARNING: No admin approval found for PR #${pr_number} — rejecting" + log "WARNING: No admin approval found for PR #${pr_number} — rejecting" >> 
"$DISPATCHER_LOG_FILE" return 1 } @@ -243,11 +248,11 @@ verify_admin_merged() { # Get the PR that introduced this file local pr_num pr_num=$(get_pr_for_file "$toml_file") || { - log "WARNING: No PR found for action ${action_id} — skipping (possible direct push)" + log "WARNING: No PR found for action ${action_id} — skipping (possible direct push)" >> "$DISPATCHER_LOG_FILE" return 1 } - log "Action ${action_id} arrived via PR #${pr_num}" + log "Action ${action_id} arrived via PR #${pr_num}" >> "$DISPATCHER_LOG_FILE" # First, try admin approver check (for auto-merge workflow) if verify_admin_approver "$pr_num" "$action_id"; then @@ -257,7 +262,7 @@ verify_admin_merged() { # Fallback: Check merger (backwards compatibility for manual merges) local merger_json merger_json=$(get_pr_merger "$pr_num") || { - log "WARNING: Could not fetch PR #${pr_num} details — skipping" + log "WARNING: Could not fetch PR #${pr_num} details — skipping" >> "$DISPATCHER_LOG_FILE" return 1 } @@ -267,22 +272,22 @@ verify_admin_merged() { # Check if PR is merged if [[ "$merged" != "true" ]]; then - log "WARNING: PR #${pr_num} is not merged — skipping" + log "WARNING: PR #${pr_num} is not merged — skipping" >> "$DISPATCHER_LOG_FILE" return 1 fi # Check if merger is admin if [ -z "$merger_username" ]; then - log "WARNING: Could not determine PR #${pr_num} merger — skipping" + log "WARNING: Could not determine PR #${pr_num} merger — skipping" >> "$DISPATCHER_LOG_FILE" return 1 fi if ! 
is_allowed_admin "$merger_username"; then - log "WARNING: PR #${pr_num} merged by non-admin user '${merger_username}' — skipping" + log "WARNING: PR #${pr_num} merged by non-admin user '${merger_username}' — skipping" >> "$DISPATCHER_LOG_FILE" return 1 fi - log "Verified: PR #${pr_num} merged by admin '${merger_username}' (fallback check)" + log "Verified: PR #${pr_num} merged by admin '${merger_username}' (fallback check)" >> "$DISPATCHER_LOG_FILE" return 0 } @@ -343,7 +348,7 @@ write_result() { '{id: $id, exit_code: $exit_code, timestamp: $timestamp, logs: $logs}' \ > "$result_file" - log "Result written: ${result_file}" + log "Result written: ${result_file}" >> "$DISPATCHER_LOG_FILE" } # Launch runner for the given action @@ -353,18 +358,18 @@ launch_runner() { local action_id action_id=$(basename "$toml_file" .toml) - log "Launching runner for action: ${action_id}" + log "Launching runner for action: ${action_id}" >> "$DISPATCHER_LOG_FILE" # Validate TOML if ! validate_action "$toml_file"; then - log "ERROR: Action validation failed for ${action_id}" + log "ERROR: Action validation failed for ${action_id}" >> "$DISPATCHER_LOG_FILE" write_result "$action_id" 1 "Validation failed: see logs above" return 1 fi # Verify admin merge if ! 
verify_admin_merged "$toml_file"; then - log "ERROR: Admin merge verification failed for ${action_id}" + log "ERROR: Admin merge verification failed for ${action_id}" >> "$DISPATCHER_LOG_FILE" write_result "$action_id" 1 "Admin merge verification failed: see logs above" return 1 fi @@ -400,7 +405,7 @@ launch_runner() { fi done else - log "Action ${action_id} has no secrets declared — runner will execute without extra env vars" + log "Action ${action_id} has no secrets declared — runner will execute without extra env vars" >> "$DISPATCHER_LOG_FILE" fi # Add formula and action id as arguments (safe from shell injection) @@ -423,7 +428,7 @@ launch_runner() { log_cmd+=("$arg") fi done - log "Running: ${log_cmd[*]}" + log "Running: ${log_cmd[*]}" >> "$DISPATCHER_LOG_FILE" # Create temp file for logs local log_file @@ -443,9 +448,9 @@ launch_runner() { write_result "$action_id" "$exit_code" "$logs" if [ $exit_code -eq 0 ]; then - log "Runner completed successfully for action: ${action_id}" + log "Runner completed successfully for action: ${action_id}" >> "$DISPATCHER_LOG_FILE" else - log "Runner failed for action: ${action_id} (exit code: ${exit_code})" + log "Runner failed for action: ${action_id} (exit code: ${exit_code})" >> "$DISPATCHER_LOG_FILE" fi return $exit_code @@ -512,7 +517,7 @@ dispatch_reproduce() { local issue_number="$1" if is_reproduce_running "$issue_number"; then - log "Reproduce already running for issue #${issue_number}, skipping" + log "Reproduce already running for issue #${issue_number}, skipping" >> "$DISPATCHER_LOG_FILE" return 0 fi @@ -523,11 +528,11 @@ dispatch_reproduce() { done if [ -z "$project_toml" ]; then - log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping reproduce for #${issue_number}" + log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping reproduce for #${issue_number}" >> "$DISPATCHER_LOG_FILE" return 0 fi - log "Dispatching reproduce-agent for issue #${issue_number} (project: 
${project_toml})" + log "Dispatching reproduce-agent for issue #${issue_number} (project: ${project_toml})" >> "$DISPATCHER_LOG_FILE" # Build docker run command using array (safe from injection) local -a cmd=(docker run --rm @@ -575,7 +580,7 @@ dispatch_reproduce() { "${cmd[@]}" & local bg_pid=$! echo "$bg_pid" > "$(_reproduce_lockfile "$issue_number")" - log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}" + log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}" >> "$DISPATCHER_LOG_FILE" } # ----------------------------------------------------------------------------- @@ -636,7 +641,7 @@ dispatch_triage() { local issue_number="$1" if is_triage_running "$issue_number"; then - log "Triage already running for issue #${issue_number}, skipping" + log "Triage already running for issue #${issue_number}, skipping" >> "$DISPATCHER_LOG_FILE" return 0 fi @@ -647,7 +652,7 @@ dispatch_triage() { done if [ -z "$project_toml" ]; then - log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping triage for #${issue_number}" + log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping triage for #${issue_number}" >> "$DISPATCHER_LOG_FILE" return 0 fi @@ -700,7 +705,7 @@ dispatch_triage() { "${cmd[@]}" & local bg_pid=$! echo "$bg_pid" > "$(_triage_lockfile "$issue_number")" - log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}" + log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}" >> "$DISPATCHER_LOG_FILE" } # ----------------------------------------------------------------------------- @@ -710,19 +715,19 @@ dispatch_triage() { # Clone or pull the ops repo ensure_ops_repo() { if [ ! -d "${OPS_REPO_ROOT}/.git" ]; then - log "Cloning ops repo from ${FORGE_URL}/${FORGE_OPS_REPO}..." + log "Cloning ops repo from ${FORGE_URL}/${FORGE_OPS_REPO}..." 
>> "$DISPATCHER_LOG_FILE" git clone "${FORGE_URL}/${FORGE_OPS_REPO}" "${OPS_REPO_ROOT}" else - log "Pulling latest ops repo changes..." + log "Pulling latest ops repo changes..." >> "$DISPATCHER_LOG_FILE" (cd "${OPS_REPO_ROOT}" && git pull --rebase) fi } # Main dispatcher loop main() { - log "Starting dispatcher..." - log "Polling ops repo: ${VAULT_ACTIONS_DIR}" - log "Admin users: ${ADMIN_USERS}" + log "Starting dispatcher..." >> "$DISPATCHER_LOG_FILE" + log "Polling ops repo: ${VAULT_ACTIONS_DIR}" >> "$DISPATCHER_LOG_FILE" + log "Admin users: ${ADMIN_USERS}" >> "$DISPATCHER_LOG_FILE" while true; do # Refresh ops repo at the start of each poll cycle @@ -730,7 +735,7 @@ main() { # Check if actions directory exists if [ ! -d "${VAULT_ACTIONS_DIR}" ]; then - log "Actions directory not found: ${VAULT_ACTIONS_DIR}" + log "Actions directory not found: ${VAULT_ACTIONS_DIR}" >> "$DISPATCHER_LOG_FILE" sleep 60 continue fi @@ -745,7 +750,7 @@ main() { # Skip if already completed if is_action_completed "$action_id"; then - log "Action ${action_id} already completed, skipping" + log "Action ${action_id} already completed, skipping" >> "$DISPATCHER_LOG_FILE" continue fi diff --git a/gardener/gardener-run.sh b/gardener/gardener-run.sh index 3b29987..7da8b5f 100755 --- a/gardener/gardener-run.sh +++ b/gardener/gardener-run.sh @@ -55,7 +55,8 @@ RESULT_FILE="/tmp/gardener-result-${PROJECT_NAME}.txt" GARDENER_PR_FILE="/tmp/gardener-pr-${PROJECT_NAME}.txt" WORKTREE="/tmp/${PROJECT_NAME}-gardener-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="gardener" # ── Guards ──────────────────────────────────────────────────────────────── check_active gardener @@ -156,19 +157,21 @@ _gardener_execute_manifest() { case "$action" in add_label) - local label label_id + local label label_id http_code resp label=$(jq -r ".[$i].label" "$manifest_file") label_id=$(curl -sf -H "Authorization: token 
${FORGE_TOKEN}" \ "${FORGE_API}/labels" | jq -r --arg n "$label" \ '.[] | select(.name == $n) | .id') || true if [ -n "$label_id" ]; then - if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/issues/${issue}/labels" \ - -d "{\"labels\":[${label_id}]}" >/dev/null 2>&1; then + -d "{\"labels\":[${label_id}]}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then log "manifest: add_label '${label}' to #${issue}" else - log "manifest: FAILED add_label '${label}' to #${issue}" + log "manifest: FAILED add_label '${label}' to #${issue}: HTTP ${http_code}" fi else log "manifest: FAILED add_label — label '${label}' not found" @@ -176,17 +179,19 @@ _gardener_execute_manifest() { ;; remove_label) - local label label_id + local label label_id http_code resp label=$(jq -r ".[$i].label" "$manifest_file") label_id=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ "${FORGE_API}/labels" | jq -r --arg n "$label" \ '.[] | select(.name == $n) | .id') || true if [ -n "$label_id" ]; then - if curl -sf -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/issues/${issue}/labels/${label_id}" >/dev/null 2>&1; then + resp=$(curl -sf -w "\n%{http_code}" -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \ + "${FORGE_API}/issues/${issue}/labels/${label_id}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then log "manifest: remove_label '${label}' from #${issue}" else - log "manifest: FAILED remove_label '${label}' from #${issue}" + log "manifest: FAILED remove_label '${label}' from #${issue}: HTTP ${http_code}" fi else log "manifest: FAILED remove_label — label '${label}' not found" @@ -194,34 +199,38 @@ _gardener_execute_manifest() { ;; close) - local reason + local reason http_code resp 
reason=$(jq -r ".[$i].reason // empty" "$manifest_file") - if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/issues/${issue}" \ - -d '{"state":"closed"}' >/dev/null 2>&1; then + -d '{"state":"closed"}' 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then log "manifest: closed #${issue} (${reason})" else - log "manifest: FAILED close #${issue}" + log "manifest: FAILED close #${issue}: HTTP ${http_code}" fi ;; comment) - local body escaped_body + local body escaped_body http_code resp body=$(jq -r ".[$i].body" "$manifest_file") escaped_body=$(printf '%s' "$body" | jq -Rs '.') - if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/issues/${issue}/comments" \ - -d "{\"body\":${escaped_body}}" >/dev/null 2>&1; then + -d "{\"body\":${escaped_body}}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then log "manifest: commented on #${issue}" else - log "manifest: FAILED comment on #${issue}" + log "manifest: FAILED comment on #${issue}: HTTP ${http_code}" fi ;; create_issue) - local title body labels escaped_title escaped_body label_ids + local title body labels escaped_title escaped_body label_ids http_code resp title=$(jq -r ".[$i].title" "$manifest_file") body=$(jq -r ".[$i].body" "$manifest_file") labels=$(jq -r ".[$i].labels // [] | .[]" "$manifest_file") @@ -241,40 +250,46 @@ _gardener_execute_manifest() { done <<< "$labels" [ -n "$ids_json" ] && label_ids="[${ids_json}]" fi - if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \ -H 
'Content-Type: application/json' \ "${FORGE_API}/issues" \ - -d "{\"title\":${escaped_title},\"body\":${escaped_body},\"labels\":${label_ids}}" >/dev/null 2>&1; then + -d "{\"title\":${escaped_title},\"body\":${escaped_body},\"labels\":${label_ids}}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then log "manifest: created issue '${title}'" else - log "manifest: FAILED create_issue '${title}'" + log "manifest: FAILED create_issue '${title}': HTTP ${http_code}" fi ;; edit_body) - local body escaped_body + local body escaped_body http_code resp body=$(jq -r ".[$i].body" "$manifest_file") escaped_body=$(printf '%s' "$body" | jq -Rs '.') - if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/issues/${issue}" \ - -d "{\"body\":${escaped_body}}" >/dev/null 2>&1; then + -d "{\"body\":${escaped_body}}" 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then log "manifest: edited body of #${issue}" else - log "manifest: FAILED edit_body #${issue}" + log "manifest: FAILED edit_body #${issue}: HTTP ${http_code}" fi ;; close_pr) - local pr + local pr http_code resp pr=$(jq -r ".[$i].pr" "$manifest_file") - if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ + resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \ -H 'Content-Type: application/json' \ "${FORGE_API}/pulls/${pr}" \ - -d '{"state":"closed"}' >/dev/null 2>&1; then + -d '{"state":"closed"}' 2>/dev/null) || true + http_code=$(echo "$resp" | tail -1) + if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then log "manifest: closed PR #${pr}" else - log "manifest: FAILED close_pr #${pr}" + log "manifest: FAILED close_pr #${pr}: HTTP ${http_code}" fi ;; diff --git a/lib/agent-sdk.sh 
b/lib/agent-sdk.sh index 1180982..ef8e0b2 100644 --- a/lib/agent-sdk.sh +++ b/lib/agent-sdk.sh @@ -52,9 +52,11 @@ agent_run() { log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})" output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$? if [ "$rc" -eq 124 ]; then - log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s" + log "agent_run: TIMEOUT after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)" elif [ "$rc" -ne 0 ]; then - log "agent_run: claude exited with code $rc" + local tail_lines + tail_lines=$(echo "$output" | tail -3) + log "agent_run: FAILED with exit code $rc: ${tail_lines:-no output}" fi if [ -z "$output" ]; then log "agent_run: empty output (claude may have crashed or failed)" @@ -89,9 +91,11 @@ agent_run() { local nudge_rc output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$? 
if [ "$nudge_rc" -eq 124 ]; then - log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s" + log "agent_run: nudge TIMEOUT after ${CLAUDE_TIMEOUT:-7200}s (exit code $nudge_rc)" elif [ "$nudge_rc" -ne 0 ]; then - log "agent_run: nudge claude exited with code $nudge_rc" + local nudge_tail + nudge_tail=$(echo "$output" | tail -3) + log "agent_run: nudge FAILED with exit code $nudge_rc: ${nudge_tail:-no output}" fi new_sid=$(printf '%s' "$output" | jq -r '.session_id // empty' 2>/dev/null) || true if [ -n "$new_sid" ]; then diff --git a/lib/env.sh b/lib/env.sh index 7a82619..7c29d83 100755 --- a/lib/env.sh +++ b/lib/env.sh @@ -138,8 +138,12 @@ unset CLAWHUB_TOKEN 2>/dev/null || true export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 # Shared log helper +# Usage: log "message" +# Sets LOG_AGENT to control the agent name prefix (default: derived from SCRIPT_DIR) +# Output format: [2026-04-03T14:00:00Z] agent: message log() { - printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" + local agent="${LOG_AGENT:-$(basename "$(dirname "$(dirname "${BASH_SOURCE[0]}")")")}" + printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" } # ============================================================================= diff --git a/lib/pr-lifecycle.sh b/lib/pr-lifecycle.sh index 2bb64e1..d248d2d 100644 --- a/lib/pr-lifecycle.sh +++ b/lib/pr-lifecycle.sh @@ -357,11 +357,19 @@ pr_close() { local pr_num="$1" _prl_log "closing PR #${pr_num}" - curl -sf -X PATCH \ + local resp http_code + resp=$(curl -sf -w "\n%{http_code}" -X PATCH \ -H "Authorization: token ${FORGE_TOKEN}" \ -H "Content-Type: application/json" \ "${FORGE_API}/pulls/${pr_num}" \ - -d '{"state":"closed"}' >/dev/null 2>&1 || true + -d '{"state":"closed"}' 2>/dev/null) || true + http_code=$(printf '%s\n' "$resp" | tail -1) + if [ "$http_code" != "200" ] && [ "$http_code" != "204" ]; then + local body + body=$(printf '%s\n' "$resp" | sed '$d' | head -1) + _prl_log "pr_close FAILED for PR #${pr_num}: 
HTTP ${http_code} ${body:0:200}" + return 1 + fi } # --------------------------------------------------------------------------- @@ -398,11 +406,17 @@ pr_walk_to_merge() { if [ "${_PR_CI_FAILURE_TYPE:-}" = "infra" ] && [ "$ci_retry_count" -lt 1 ]; then ci_retry_count=$((ci_retry_count + 1)) _prl_log "infra failure — retriggering CI (retry ${ci_retry_count})" - ( cd "$worktree" && \ + local rebase_output rebase_rc + rebase_output=$( ( cd "$worktree" && \ git commit --allow-empty -m "ci: retrigger after infra failure" --no-verify && \ git fetch "$remote" "${PRIMARY_BRANCH}" 2>/dev/null && \ git rebase "${remote}/${PRIMARY_BRANCH}" && \ - git push --force-with-lease "$remote" HEAD ) 2>&1 | tail -5 || true + git push --force-with-lease "$remote" HEAD ) 2>&1 ) || rebase_rc=$? + if [ -n "$rebase_rc" ] && [ "$rebase_rc" -ne 0 ]; then + _prl_log "infra retrigger FAILED (exit code $rebase_rc): $(echo "$rebase_output" | tail -3)" + else + _prl_log "infra retrigger succeeded" + fi continue fi diff --git a/planner/planner-run.sh b/planner/planner-run.sh index 2bbfab8..1a13a60 100755 --- a/planner/planner-run.sh +++ b/planner/planner-run.sh @@ -35,7 +35,7 @@ source "$FACTORY_ROOT/lib/guard.sh" # shellcheck source=../lib/agent-sdk.sh source "$FACTORY_ROOT/lib/agent-sdk.sh" -LOG_FILE="$SCRIPT_DIR/planner.log" +LOG_FILE="${DISINTO_LOG_DIR}/planner/planner.log" # shellcheck disable=SC2034 # consumed by agent-sdk.sh LOGFILE="$LOG_FILE" # shellcheck disable=SC2034 # consumed by agent-sdk.sh @@ -43,7 +43,8 @@ SID_FILE="/tmp/planner-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/planner-${PROJECT_NAME}-scratch.md" WORKTREE="/tmp/${PROJECT_NAME}-planner-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="planner" # ── Guards ──────────────────────────────────────────────────────────────── check_active planner diff --git a/predictor/predictor-run.sh b/predictor/predictor-run.sh index 
f87001b..89050d7 100755 --- a/predictor/predictor-run.sh +++ b/predictor/predictor-run.sh @@ -36,7 +36,7 @@ source "$FACTORY_ROOT/lib/guard.sh" # shellcheck source=../lib/agent-sdk.sh source "$FACTORY_ROOT/lib/agent-sdk.sh" -LOG_FILE="$SCRIPT_DIR/predictor.log" +LOG_FILE="${DISINTO_LOG_DIR}/predictor/predictor.log" # shellcheck disable=SC2034 # consumed by agent-sdk.sh LOGFILE="$LOG_FILE" # shellcheck disable=SC2034 # consumed by agent-sdk.sh @@ -44,7 +44,8 @@ SID_FILE="/tmp/predictor-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/predictor-${PROJECT_NAME}-scratch.md" WORKTREE="/tmp/${PROJECT_NAME}-predictor-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="predictor" # ── Guards ──────────────────────────────────────────────────────────────── check_active predictor diff --git a/review/review-poll.sh b/review/review-poll.sh index 47d37df..dcda50d 100755 --- a/review/review-poll.sh +++ b/review/review-poll.sh @@ -23,20 +23,19 @@ LOGFILE="${DISINTO_LOG_DIR}/review/review-poll.log" MAX_REVIEWS=3 REVIEW_IDLE_TIMEOUT=14400 # 4h: kill review session if idle -log() { - printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE" -} +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="review" # Log rotation if [ -f "$LOGFILE" ]; then LOGSIZE=$(stat -c%s "$LOGFILE" 2>/dev/null || echo 0) if [ "$LOGSIZE" -gt 102400 ]; then mv "$LOGFILE" "$LOGFILE.old" - log "Log rotated" + log "Log rotated" >> "$LOGFILE" fi fi -log "--- Poll start ---" +log "--- Poll start ---" >> "$LOGFILE" # --- Clean up stale review sessions (.sid files + worktrees) --- # Remove .sid files, phase files, and worktrees for merged/closed PRs or idle > 4h @@ -54,7 +53,7 @@ if [ -n "$REVIEW_SIDS" ]; then "${API_BASE}/pulls/${pr_num}" | jq -r '.state // "unknown"' 2>/dev/null) || true if [ "$pr_state" != "open" ]; then - log "cleanup: PR #${pr_num} state=${pr_state} — removing 
sid/worktree" + log "cleanup: PR #${pr_num} state=${pr_state} — removing sid/worktree" >> "$LOGFILE" rm -f "$sid_file" "$phase_file" "/tmp/${PROJECT_NAME}-review-output-${pr_num}.json" cd "$REPO_ROOT" git worktree remove "$worktree" --force 2>/dev/null || true @@ -66,7 +65,7 @@ if [ -n "$REVIEW_SIDS" ]; then sid_mtime=$(stat -c %Y "$sid_file" 2>/dev/null || echo 0) now=$(date +%s) if [ "$sid_mtime" -gt 0 ] && [ $(( now - sid_mtime )) -gt "$REVIEW_IDLE_TIMEOUT" ]; then - log "cleanup: PR #${pr_num} idle > 4h — removing sid/worktree" + log "cleanup: PR #${pr_num} idle > 4h — removing sid/worktree" >> "$LOGFILE" rm -f "$sid_file" "$phase_file" "/tmp/${PROJECT_NAME}-review-output-${pr_num}.json" cd "$REPO_ROOT" git worktree remove "$worktree" --force 2>/dev/null || true @@ -86,7 +85,7 @@ if [ -z "$PRS" ]; then fi TOTAL=$(echo "$PRS" | wc -l) -log "Found ${TOTAL} open PRs" +log "Found ${TOTAL} open PRs" >> "$LOGFILE" REVIEWED=0 SKIPPED=0 @@ -119,17 +118,19 @@ if [ -n "$REVIEW_SIDS" ]; then if ! ci_passed "$ci_state"; then if ci_required_for_pr "$pr_num"; then - log " #${pr_num} awaiting_changes: new SHA ${current_sha:0:7} CI=${ci_state}, waiting" + log " #${pr_num} awaiting_changes: new SHA ${current_sha:0:7} CI=${ci_state}, waiting" >> "$LOGFILE" continue fi fi - log " #${pr_num} re-review: new commits (${reviewed_sha:0:7}→${current_sha:0:7})" + log " #${pr_num} re-review: new commits (${reviewed_sha:0:7}→${current_sha:0:7})" >> "$LOGFILE" - if "${SCRIPT_DIR}/review-pr.sh" "$pr_num" 2>&1; then + review_output=$("${SCRIPT_DIR}/review-pr.sh" "$pr_num" 2>&1) && review_rc=0 || review_rc=$? + if [ "$review_rc" -eq 0 ]; then REVIEWED=$((REVIEWED + 1)) else - log " #${pr_num} re-review failed" + tail_lines=$(echo "$review_output" | tail -3) + log " #${pr_num} re-review FAILED (exit code $review_rc): ${tail_lines}" fi [ "$REVIEWED" -lt "$MAX_REVIEWS" ] || break @@ -145,11 +146,11 @@ while IFS= read -r line; do # Skip if CI is running/failed. 
Allow "success", no CI configured, or non-code PRs if ! ci_passed "$CI_STATE"; then if ci_required_for_pr "$PR_NUM"; then - log " #${PR_NUM} CI=${CI_STATE}, skip" + log " #${PR_NUM} CI=${CI_STATE}, skip" >> "$LOGFILE" SKIPPED=$((SKIPPED + 1)) continue fi - log " #${PR_NUM} CI=${CI_STATE} but no code files — proceeding" + log " #${PR_NUM} CI=${CI_STATE} but no code files — proceeding" >> "$LOGFILE" fi # Check formal forge reviews (not comment markers) @@ -159,12 +160,12 @@ while IFS= read -r line; do '[.[] | select(.commit_id == $sha) | select(.state != "COMMENT")] | length') if [ "${HAS_REVIEW:-0}" -gt "0" ]; then - log " #${PR_NUM} formal review exists for ${PR_SHA:0:7}, skip" + log " #${PR_NUM} formal review exists for ${PR_SHA:0:7}, skip" >> "$LOGFILE" SKIPPED=$((SKIPPED + 1)) continue fi - log " #${PR_NUM} needs review (CI=success, SHA=${PR_SHA:0:7})" + log " #${PR_NUM} needs review (CI=success, SHA=${PR_SHA:0:7})" >> "$LOGFILE" # Circuit breaker: count existing review-error comments for this SHA ERROR_COMMENTS=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ @@ -173,21 +174,23 @@ while IFS= read -r line; do '[.[] | select(.body | contains(""))] | length') if [ "${ERROR_COMMENTS:-0}" -ge 3 ]; then - log " #${PR_NUM} blocked: ${ERROR_COMMENTS} consecutive error comments for ${PR_SHA:0:7}, skipping" + log " #${PR_NUM} blocked: ${ERROR_COMMENTS} consecutive error comments for ${PR_SHA:0:7}, skipping" >> "$LOGFILE" SKIPPED=$((SKIPPED + 1)) continue fi - log " #${PR_NUM} error check: ${ERROR_COMMENTS:-0} prior error(s) for ${PR_SHA:0:7}" + log " #${PR_NUM} error check: ${ERROR_COMMENTS:-0} prior error(s) for ${PR_SHA:0:7}" >> "$LOGFILE" - if "${SCRIPT_DIR}/review-pr.sh" "$PR_NUM" 2>&1; then + review_output=$("${SCRIPT_DIR}/review-pr.sh" "$PR_NUM" 2>&1) && review_rc=0 || review_rc=$? 
+ if [ "$review_rc" -eq 0 ]; then REVIEWED=$((REVIEWED + 1)) else - log " #${PR_NUM} review failed" + tail_lines=$(echo "$review_output" | tail -3) + log " #${PR_NUM} review FAILED (exit code $review_rc): ${tail_lines}" fi if [ "$REVIEWED" -ge "$MAX_REVIEWS" ]; then - log "Hit max reviews (${MAX_REVIEWS}), stopping" + log "Hit max reviews (${MAX_REVIEWS}), stopping" >> "$LOGFILE" break fi @@ -195,4 +198,4 @@ while IFS= read -r line; do done <<< "$PRS" -log "--- Poll done: ${REVIEWED} reviewed, ${SKIPPED} skipped ---" +log "--- Poll done: ${REVIEWED} reviewed, ${SKIPPED} skipped ---" >> "$LOGFILE" diff --git a/supervisor/supervisor-run.sh b/supervisor/supervisor-run.sh index f3e70dd..cc99610 100755 --- a/supervisor/supervisor-run.sh +++ b/supervisor/supervisor-run.sh @@ -46,7 +46,8 @@ SID_FILE="/tmp/supervisor-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md" WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run" -log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } +# Override LOG_AGENT for consistent agent identification +LOG_AGENT="supervisor" # ── Guards ──────────────────────────────────────────────────────────────── check_active supervisor @@ -67,10 +68,12 @@ resolve_agent_identity || true # ── Collect pre-flight metrics ──────────────────────────────────────────── log "Running preflight.sh" PREFLIGHT_OUTPUT="" +PREFLIGHT_RC=0 if PREFLIGHT_OUTPUT=$(bash "$SCRIPT_DIR/preflight.sh" "$PROJECT_TOML" 2>&1); then log "Preflight collected ($(echo "$PREFLIGHT_OUTPUT" | wc -l) lines)" else - log "WARNING: preflight.sh failed, continuing with partial data" + PREFLIGHT_RC=$? + log "WARNING: preflight.sh failed (exit code $PREFLIGHT_RC), continuing with partial data" fi # ── Load formula + context ───────────────────────────────────────────────