Merge pull request 'fix: bug: profile journal digestion can hang for hours on local Qwen with many journals — blocks dev-agent (#702)' (#703) from fix/issue-702 into main
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
All checks were successful
ci/woodpecker/push/ci Pipeline was successful
This commit is contained in:
commit
c51cc9dba6
1 changed files with 79 additions and 18 deletions
|
|
@@ -18,7 +18,8 @@
|
|||
# ensure_profile_repo [AGENT_IDENTITY] — clone/pull .profile repo
|
||||
# _profile_has_repo — check if agent has .profile repo
|
||||
# _count_undigested_journals — count journal entries to digest
|
||||
# _profile_digest_journals — digest journals into lessons
|
||||
# _profile_digest_journals — digest journals into lessons (timeout + batch cap)
|
||||
# _profile_restore_lessons FILE BACKUP — restore lessons on digest failure
|
||||
# _profile_commit_and_push MESSAGE [FILES] — commit/push to .profile repo
|
||||
# resolve_agent_identity — resolve agent user login from FORGE_TOKEN
|
||||
# build_graph_section — run build-graph.py and set GRAPH_SECTION
|
||||
|
|
@@ -191,10 +192,14 @@ _count_undigested_journals() {
|
|||
|
||||
# _profile_digest_journals
|
||||
# Runs a claude -p one-shot to digest undigested journals into lessons-learned.md
|
||||
# Respects PROFILE_DIGEST_TIMEOUT (default 300s) and PROFILE_DIGEST_MAX_BATCH (default 5).
|
||||
# On failure/timeout, preserves the previous lessons-learned.md and does not archive journals.
|
||||
# Returns 0 on success, 1 on failure.
|
||||
_profile_digest_journals() {
|
||||
local agent_identity="${AGENT_IDENTITY:-}"
|
||||
local model="${CLAUDE_MODEL:-opus}"
|
||||
local digest_timeout="${PROFILE_DIGEST_TIMEOUT:-300}"
|
||||
local max_batch="${PROFILE_DIGEST_MAX_BATCH:-5}"
|
||||
|
||||
if [ -z "$agent_identity" ]; then
|
||||
if ! resolve_agent_identity; then
|
||||
|
|
@@ -207,19 +212,27 @@ _profile_digest_journals() {
|
|||
local knowledge_dir="${PROFILE_REPO_PATH}/knowledge"
|
||||
local lessons_file="${knowledge_dir}/lessons-learned.md"
|
||||
|
||||
# Collect undigested journal entries
|
||||
# Collect undigested journal entries (capped at max_batch)
|
||||
local journal_entries=""
|
||||
local batch_count=0
|
||||
local -a batchfiles=()
|
||||
if [ -d "$journal_dir" ]; then
|
||||
for jf in "$journal_dir"/*.md; do
|
||||
[ -f "$jf" ] || continue
|
||||
# Skip archived entries
|
||||
[[ "$jf" == */archive/* ]] && continue
|
||||
if [ "$batch_count" -ge "$max_batch" ]; then
|
||||
log "profile: capping digest batch at ${max_batch} journals (remaining will be digested in future runs)"
|
||||
break
|
||||
fi
|
||||
local basename
|
||||
basename=$(basename "$jf")
|
||||
journal_entries="${journal_entries}
|
||||
### ${basename}
|
||||
$(cat "$jf")
|
||||
"
|
||||
batchfiles+=("$jf")
|
||||
batch_count=$((batch_count + 1))
|
||||
done
|
||||
fi
|
||||
|
||||
|
|
@@ -228,9 +241,18 @@ $(cat "$jf")
|
|||
return 0
|
||||
fi
|
||||
|
||||
log "profile: digesting ${batch_count} journals (timeout ${digest_timeout}s)"
|
||||
|
||||
# Ensure knowledge directory exists
|
||||
mkdir -p "$knowledge_dir"
|
||||
|
||||
# Back up existing lessons-learned.md so we can restore on failure
|
||||
local lessons_backup=""
|
||||
if [ -f "$lessons_file" ]; then
|
||||
lessons_backup=$(mktemp)
|
||||
cp "$lessons_file" "$lessons_backup"
|
||||
fi
|
||||
|
||||
# Capture mtime so we can detect a Write-tool write afterwards
|
||||
local mtime_before=0
|
||||
[ -f "$lessons_file" ] && mtime_before=$(stat -c %Y "$lessons_file")
|
||||
|
|
@@ -257,27 +279,51 @@ Update the lessons-learned file at this exact absolute path:
|
|||
## Journal entries to digest
|
||||
${journal_entries}"
|
||||
|
||||
# Run claude -p one-shot with same model as agent
|
||||
local output
|
||||
# Run claude -p one-shot with digest-specific timeout
|
||||
local output digest_rc
|
||||
local saved_timeout="${CLAUDE_TIMEOUT:-7200}"
|
||||
CLAUDE_TIMEOUT="$digest_timeout"
|
||||
output=$(claude_run_with_watchdog claude -p "$digest_prompt" \
|
||||
--output-format json \
|
||||
--dangerously-skip-permissions \
|
||||
${model:+--model "$model"} \
|
||||
2>>"$LOGFILE" || echo '{"result":"error"}')
|
||||
2>>"$LOGFILE") && digest_rc=0 || digest_rc=$?
|
||||
CLAUDE_TIMEOUT="$saved_timeout"
|
||||
|
||||
if [ "$digest_rc" -eq 124 ]; then
|
||||
log "profile: digest timed out after ${digest_timeout}s — preserving previous lessons, skipping archive"
|
||||
_profile_restore_lessons "$lessons_file" "$lessons_backup"
|
||||
return 1
|
||||
fi
|
||||
|
||||
if [ "$digest_rc" -ne 0 ]; then
|
||||
log "profile: digest failed (exit code ${digest_rc}) — preserving previous lessons, skipping archive"
|
||||
_profile_restore_lessons "$lessons_file" "$lessons_backup"
|
||||
return 1
|
||||
fi
|
||||
|
||||
local mtime_after=0
|
||||
[ -f "$lessons_file" ] && mtime_after=$(stat -c %Y "$lessons_file")
|
||||
|
||||
if [ "$mtime_after" -gt "$mtime_before" ] && [ -s "$lessons_file" ]; then
|
||||
log "profile: lessons-learned.md written by model via Write tool ($(wc -c < "$lessons_file") bytes)"
|
||||
local file_size
|
||||
file_size=$(wc -c < "$lessons_file")
|
||||
# Treat tiny files (<=16 bytes) as failed digestion (e.g. "null", "{}", empty)
|
||||
if [ "$file_size" -le 16 ]; then
|
||||
log "profile: digest produced suspiciously small file (${file_size} bytes) — preserving previous lessons, skipping archive"
|
||||
_profile_restore_lessons "$lessons_file" "$lessons_backup"
|
||||
return 1
|
||||
fi
|
||||
log "profile: lessons-learned.md written by model via Write tool (${file_size} bytes)"
|
||||
else
|
||||
# Fallback: model didn't use Write tool — capture .result and strip any markdown code fence
|
||||
local lessons_content
|
||||
lessons_content=$(printf '%s' "$output" | jq -r '.result // empty' 2>/dev/null || echo "")
|
||||
lessons_content=$(printf '%s' "$lessons_content" | sed -E '1{/^```(markdown|md)?[[:space:]]*$/d;};${/^```[[:space:]]*$/d;}')
|
||||
|
||||
if [ -z "$lessons_content" ]; then
|
||||
log "profile: failed to digest journals (no Write tool call, empty .result)"
|
||||
if [ -z "$lessons_content" ] || [ "${#lessons_content}" -le 16 ]; then
|
||||
log "profile: failed to digest journals (no Write tool call, empty or tiny .result) — preserving previous lessons, skipping archive"
|
||||
_profile_restore_lessons "$lessons_file" "$lessons_backup"
|
||||
return 1
|
||||
fi
|
||||
|
||||
|
|
@@ -285,13 +331,14 @@ ${journal_entries}"
|
|||
log "profile: lessons-learned.md written from .result fallback (${#lessons_content} bytes)"
|
||||
fi
|
||||
|
||||
# Move digested journals to archive (if any were processed)
|
||||
if [ -d "$journal_dir" ]; then
|
||||
# Clean up backup on success
|
||||
[ -n "$lessons_backup" ] && rm -f "$lessons_backup"
|
||||
|
||||
# Move only the digested journals to archive (not all — only the batch we processed)
|
||||
if [ ${#batchfiles[@]} -gt 0 ]; then
|
||||
mkdir -p "${journal_dir}/archive"
|
||||
local archived=0
|
||||
for jf in "$journal_dir"/*.md; do
|
||||
[ -f "$jf" ] || continue
|
||||
[[ "$jf" == */archive/* ]] && continue
|
||||
for jf in "${batchfiles[@]}"; do
|
||||
local basename
|
||||
basename=$(basename "$jf")
|
||||
mv "$jf" "${journal_dir}/archive/${basename}" 2>/dev/null && archived=$((archived + 1))
|
||||
|
|
@@ -310,6 +357,18 @@ ${journal_entries}"
|
|||
return 0
|
||||
}
|
||||
|
||||
# _profile_restore_lessons LESSONS_FILE BACKUP_FILE
# Restores the previous lessons-learned.md from BACKUP_FILE after a failed
# or timed-out digest run.
# Arguments:
#   $1 - path to the live lessons-learned.md to restore into
#   $2 - path to the mktemp backup taken before digestion (may be empty when
#        no lessons file existed beforehand — then this is a silent no-op)
# Returns: 0 always (restore failure is logged, not fatal to the caller).
_profile_restore_lessons() {
  local lessons_file="$1"
  local backup="$2"
  if [ -n "$backup" ] && [ -f "$backup" ]; then
    # Only discard the backup once the copy actually succeeded; otherwise a
    # failed cp (disk full, permissions) would delete the sole surviving
    # copy of the previous lessons — the exact data this function protects.
    if cp "$backup" "$lessons_file"; then
      rm -f "$backup"
      log "profile: restored previous lessons-learned.md"
    else
      log "profile: warning — failed to restore lessons-learned.md from backup at ${backup} (backup preserved)"
    fi
  fi
}
|
||||
|
||||
# _profile_commit_and_push MESSAGE [FILE ...]
|
||||
# Commits and pushes changes to .profile repo.
|
||||
_profile_commit_and_push() {
|
||||
|
|
@@ -350,7 +409,8 @@ _profile_commit_and_push() {
|
|||
|
||||
# profile_load_lessons
|
||||
# Pre-session: loads lessons-learned.md into LESSONS_CONTEXT for prompt injection.
|
||||
# Lazy digestion: if >10 undigested journals exist, runs claude -p to digest them.
|
||||
# Lazy digestion: if undigested journals exceed PROFILE_DIGEST_THRESHOLD (default 10),
|
||||
# runs claude -p to digest them (bounded by PROFILE_DIGEST_MAX_BATCH and PROFILE_DIGEST_TIMEOUT).
|
||||
# Returns 0 on success, 1 if agent has no .profile repo (silent no-op).
|
||||
# Requires: ensure_profile_repo() called, AGENT_IDENTITY, FORGE_TOKEN, FORGE_URL, CLAUDE_MODEL.
|
||||
# Exports: LESSONS_CONTEXT (the lessons file content, hard-capped at 2KB).
|
||||
|
|
@@ -366,13 +426,14 @@ profile_load_lessons() {
|
|||
fi
|
||||
|
||||
# Check journal count for lazy digestion trigger
|
||||
local journal_count
|
||||
local journal_count digest_threshold
|
||||
journal_count=$(_count_undigested_journals)
|
||||
digest_threshold="${PROFILE_DIGEST_THRESHOLD:-10}"
|
||||
|
||||
if [ "${journal_count:-0}" -gt 10 ]; then
|
||||
log "profile: digesting ${journal_count} undigested journals"
|
||||
if [ "${journal_count:-0}" -gt "$digest_threshold" ]; then
|
||||
log "profile: ${journal_count} undigested journals (threshold ${digest_threshold})"
|
||||
if ! _profile_digest_journals; then
|
||||
log "profile: warning — journal digestion failed"
|
||||
log "profile: warning — journal digestion failed, continuing with existing lessons"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue