Merge pull request 'fix: bug: supervisor hardcodes ops repo expectation — fails silently on deployments without one (#544)' (#564) from fix/issue-544 into main

2026-04-10 08:19:07 +00:00 · 2026-04-10 08:19:07 +00:00 · a8b96d8211
commit a8b96d8211
parent be5957f127 f299bae77b
11 changed files with 278 additions and 16 deletions
--- a/formulas/run-supervisor.toml
+++ b/formulas/run-supervisor.toml
@ -34,13 +34,15 @@ and injected into your prompt above. Review them now.
   (24h grace period). Check the "Stale Phase Cleanup" section for any
   files cleaned or in grace period this run.
-2. Check vault state: read $OPS_REPO_ROOT/vault/pending/*.md for any procurement items
+2. Check vault state: read ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/*.md for any procurement items
   the planner has filed. Note items relevant to the health assessment
   (e.g. a blocked resource that explains why the pipeline is stalled).
   Note: In degraded mode, vault items are stored locally.
 3. Read the supervisor journal for recent history:
-     JOURNAL_FILE="$OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md"
+     JOURNAL_FILE="${OPS_JOURNAL_ROOT:-$OPS_REPO_ROOT/journal/supervisor}/$(date -u +%Y-%m-%d).md"
     if [ -f "$JOURNAL_FILE" ]; then cat "$JOURNAL_FILE"; fi
   Note: In degraded mode, the journal is stored locally and not committed to git.
 4. Note any values that cross these thresholds:
   - RAM available < 500MB or swap > 3GB → P0 (memory crisis)
@ -143,7 +145,7 @@ For each finding from the health assessment, decide and execute an action.
 **P3 Stale PRs (CI done >20min, no push since):**
  Do NOT read dev-poll.sh, push branches, attempt merges, or investigate pipeline code.
  Instead, file a vault item for the dev-agent to pick up:
-    Write $OPS_REPO_ROOT/vault/pending/stale-pr-${ISSUE_NUM}.md:
+    Write ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/stale-pr-${ISSUE_NUM}.md:
      # Stale PR: ${PR_TITLE}
      ## What
      CI finished >20min ago but no git push has been made to the PR branch.
@ -157,7 +159,7 @@ For each finding from the health assessment, decide and execute an action.
 For P0-P2 issues that persist after auto-fix attempts, or issues requiring
 human judgment, file a vault procurement item:
-  Write $OPS_REPO_ROOT/vault/pending/supervisor-<issue-slug>.md:
+  Write ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/supervisor-<issue-slug>.md:
    # <What is needed>
    ## What
    <description of the problem and why the supervisor cannot fix it>
@ -166,13 +168,23 @@ human judgment, file a vault procurement item:
    ## Unblocks
    - Factory health: <what this resolves>
  Vault PR filed on ops repo — human approves via PR review.
  Note: In degraded mode (no ops repo), vault items are written locally to ${OPS_VAULT_ROOT:-local path}.
-Read the relevant best-practices file before taking action:
+### Reading best-practices files
-  cat "$OPS_REPO_ROOT/knowledge/memory.md"    # P0
+
-  cat "$OPS_REPO_ROOT/knowledge/disk.md"      # P1
+Read the relevant best-practices file before taking action. In degraded mode,
-  cat "$OPS_REPO_ROOT/knowledge/ci.md"        # P2 CI
+use the bundled knowledge files from ${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}:
-  cat "$OPS_REPO_ROOT/knowledge/dev-agent.md" # P2 agent
+
-  cat "$OPS_REPO_ROOT/knowledge/git.md"       # P2 git
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/memory.md"    # P0
  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/disk.md"      # P1
  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/ci.md"        # P2 CI
  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/dev-agent.md" # P2 agent
  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/git.md"       # P2 git
  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/review-agent.md" # P2 review
  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/forge.md"     # P2 forge
 Note: If OPS_REPO_ROOT is not available (degraded mode), the bundled knowledge
 files in ${OPS_KNOWLEDGE_ROOT:-<unset>} provide fallback guidance.
 Track what you fixed and what vault items you filed for the report step.
 """
@ -214,7 +226,7 @@ description = """
 Append a timestamped entry to the supervisor journal.
 File path:
-  $OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md
+  ${OPS_JOURNAL_ROOT:-$OPS_REPO_ROOT/journal/supervisor}/$(date -u +%Y-%m-%d).md
 If the file already exists (multiple runs per day), append a new section.
 If it does not exist, create it.
@ -247,12 +259,20 @@ run-to-run context so future supervisor runs can detect trends
 IMPORTANT: Do NOT commit or push the journal — it is a local working file.
 The journal directory is committed to git periodically by other agents.
 Note: In degraded mode (no ops repo), the journal is written locally to
 ${OPS_JOURNAL_ROOT:-<unset>} and is NOT automatically committed to any repo.
 ## Learning
-If you discover something new during this run, append it to the relevant
+If you discover something new during this run:
-knowledge file in the ops repo:
+
-  echo "### Lesson title
+- In full mode (ops repo available): append to the relevant knowledge file:
-  Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/<file>.md"
+    echo "### Lesson title
    Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/<file>.md"
 - In degraded mode: write to the local knowledge directory for reference:
    echo "### Lesson title
    Description of what you learned." >> "${OPS_KNOWLEDGE_ROOT:-<unset>}/<file>.md"
 Knowledge files: memory.md, disk.md, ci.md, forge.md, dev-agent.md,
 review-agent.md, git.md.
--- a/knowledge/ci.md
+++ b/knowledge/ci.md
@ -0,0 +1,28 @@
 # CI/CD — Best Practices
 ## CI Pipeline Issues (P2)
 When CI pipelines are stuck running >20min or pending >30min:
 ### Investigation Steps
 1. Check pipeline status via Forgejo API:
   ```bash
   curl -sf -H "Authorization: token $FORGE_TOKEN" \
     "$FORGE_API/pipelines?limit=50" | jq '.[] | {number, status, created}'
   ```
 2. Check Woodpecker CI if configured:
   ```bash
   curl -sf -H "Authorization: Bearer $WOODPECKER_TOKEN" \
     "$WOODPECKER_SERVER/api/repos/${WOODPECKER_REPO_ID}/pipelines?limit=10"
   ```
 ### Common Fixes
 - **Stuck pipeline**: Cancel via Forgejo API, retrigger
 - **Pending pipeline**: Check queue depth, scale CI runners
 - **Failed pipeline**: Review logs, fix failing test/step
 ### Prevention
 - Set timeout limits on CI pipelines
 - Monitor runner capacity and scale as needed
 - Use caching for dependencies to reduce build time
--- a/knowledge/dev-agent.md
+++ b/knowledge/dev-agent.md
@ -0,0 +1,28 @@
 # Dev Agent — Best Practices
 ## Dev Agent Issues (P2)
 When dev-agent is stuck, blocked, or in bad state:
 ### Dead Lock File
 ```bash
 # Check if process still exists
 ps -p $(cat /path/to/lock.file) 2>/dev/null || rm -f /path/to/lock.file
 ```
 ### Stale Worktree Cleanup
 ```bash
 cd "$PROJECT_REPO_ROOT"
 git worktree remove --force /tmp/stale-worktree 2>/dev/null || true
 git worktree prune 2>/dev/null || true
 ```
 ### Blocked Pipeline
 - Check if PR is awaiting review or CI
 - Verify no other agent is actively working on same issue
 - Check for unmet dependencies (issues with `Depends on` refs)
 ### Prevention
 - Single-threaded pipeline per project (AD-002)
 - Clear lock files in EXIT traps
 - Use phase files to track agent state
--- a/knowledge/disk.md
+++ b/knowledge/disk.md
@ -0,0 +1,35 @@
 # Disk Management — Best Practices
 ## Disk Pressure Response (P1)
 When disk usage exceeds 80%, take these actions in order:
 ### Immediate Actions
 1. **Docker cleanup** (safe, low impact):
   ```bash
   sudo docker system prune -f
   ```
 2. **Aggressive Docker cleanup** (if still >80%):
   ```bash
   sudo docker system prune -a -f
   ```
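   This also removes all unused images (not only dangling ones), in addition to stopped containers, unused networks, and build cache. Volumes are not pruned unless `--volumes` is passed.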
 3. **Log rotation**:
   ```bash
   for f in "$FACTORY_ROOT"/{dev,review,supervisor,gardener,planner,predictor}/*.log; do
     [ -f "$f" ] && [ "$(du -k "$f" | cut -f1)" -gt 10240 ] && truncate -s 0 "$f"
   done
   ```
 ### Prevention
 - Monitor disk with alerts at 70% (warning) and 80% (critical)
 - Set up automatic log rotation for agent logs
 - Clean up old Docker images regularly
 - Consider using separate partitions for `/var/lib/docker`
 ### When to Escalate
 - Disk stays >80% after cleanup (indicates legitimate growth)
 - No unused Docker images to clean
 - Critical data filling disk (check /home, /var/log)
--- a/knowledge/forge.md
+++ b/knowledge/forge.md
@ -0,0 +1,25 @@
 # Forgejo Operations — Best Practices
 ## Forgejo Issues
 When Forgejo operations encounter issues:
 ### API Rate Limits
 - Monitor rate limit headers in API responses
 - Implement exponential backoff on 429 responses
 - Use agent-specific tokens (#747) to increase limits
 ### Authentication Issues
 - Verify FORGE_TOKEN is valid and not expired
 - Check agent identity matches token (#747)
 - Use FORGE_<AGENT>_TOKEN for agent-specific identities
 ### Repository Access
 - Verify FORGE_REMOTE matches actual git remote
 - Check token has appropriate permissions (repo, write)
 - Use `resolve_forge_remote()` to auto-detect remote
 ### Prevention
 - Set up monitoring for API failures
 - Rotate tokens before expiry
 - Document required permissions per agent
--- a/knowledge/git.md
+++ b/knowledge/git.md
@ -0,0 +1,28 @@
 # Git State Recovery — Best Practices
 ## Git State Issues (P2)
 When git repo is on wrong branch or in broken rebase state:
 ### Wrong Branch Recovery
 ```bash
 cd "$PROJECT_REPO_ROOT"
 git checkout "$PRIMARY_BRANCH" 2>/dev/null || git checkout master 2>/dev/null
 ```
 ### Broken Rebase Recovery
 ```bash
 cd "$PROJECT_REPO_ROOT"
 git rebase --abort 2>/dev/null || true
 git checkout "$PRIMARY_BRANCH" 2>/dev/null || git checkout master 2>/dev/null
 ```
 ### Stale Lock File Cleanup
 ```bash
 rm -f /path/to/stale.lock
 ```
 ### Prevention
 - Always checkout primary branch after rebase conflicts
 - Remove lock files after agent sessions complete
 - Use `git status` to verify repo state before operations
--- a/knowledge/memory.md
+++ b/knowledge/memory.md
@ -0,0 +1,27 @@
 # Memory Management — Best Practices
 ## Memory Crisis Response (P0)
 When RAM available drops below 500MB or swap usage exceeds 3GB, take these actions:
 ### Immediate Actions
 1. **Kill stale claude processes** (>3 hours old):
   ```bash
   pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
   ```
 2. **Drop filesystem caches**:
   ```bash
   sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true
   ```
 ### Prevention
 - Set memory_guard to 2000MB minimum (default in env.sh)
 - Configure swap usage alerts at 2GB
 - Monitor for memory leaks in long-running processes
 - Use cgroups for process memory limits
 ### When to Escalate
 - RAM stays <500MB after cache drop
 - Swap continues growing after process kills
 - System becomes unresponsive (OOM killer active)
--- a/knowledge/review-agent.md
+++ b/knowledge/review-agent.md
@ -0,0 +1,23 @@
 # Review Agent — Best Practices
 ## Review Agent Issues
 When review agent encounters issues with PRs:
 ### Stale PR Handling
 - PRs stale >20min (CI done, no push since) → file vault item for dev-agent
 - Do NOT push branches or attempt merges directly
 - File vault item with:
   - What: Stale PR requiring push
   - Why: Factory degraded
   - Unblocks: dev-agent will push the branch
 ### Circular Dependencies
 - Check backlog for issues with circular `Depends on` refs
 - Use `lib/parse-deps.sh` to analyze dependency graph
 - Report to planner for resolution
 ### Prevention
 - Review agent only reads PRs; it never modifies them
 - Use vault items for actions requiring dev-agent
 - Monitor for PRs stuck in review state
--- a/supervisor/AGENTS.md
+++ b/supervisor/AGENTS.md
@ -40,6 +40,12 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).
 - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by supervisor-run.sh)
 - `WOODPECKER_TOKEN`, `WOODPECKER_SERVER`, `WOODPECKER_DB_PASSWORD`, `WOODPECKER_DB_USER`, `WOODPECKER_DB_HOST`, `WOODPECKER_DB_NAME` — CI database queries
 **Degraded mode (Issue #544)**: When `OPS_REPO_ROOT` is not set or the directory doesn't exist, the supervisor runs in degraded mode:
 - Uses bundled knowledge files from `$FACTORY_ROOT/knowledge/` instead of ops repo playbooks
 - Writes journal locally to `$FACTORY_ROOT/state/supervisor-journal/` (not committed to git)
 - Files vault items locally to `$PROJECT_REPO_ROOT/vault/pending/`
 - Logs a WARNING message at startup indicating degraded mode
 **Lifecycle**: supervisor-run.sh (cron */20) → lock + memory guard → run
 preflight.sh (collect metrics) → load formula + context → run claude -p via agent-sdk.sh
 → Claude assesses health, auto-fixes, writes journal → `PHASE:done`.
--- a/supervisor/preflight.sh
+++ b/supervisor/preflight.sh
@ -214,7 +214,9 @@ echo ""
 echo "## Pending Vault Items"
 _found_vault=false
-for _vf in "${OPS_REPO_ROOT}/vault/pending/"*.md; do
+# Use OPS_VAULT_ROOT if set (exported by supervisor-run.sh in both full and degraded mode); otherwise fall back to ${OPS_REPO_ROOT}/vault/pending
 _va_root="${OPS_VAULT_ROOT:-${OPS_REPO_ROOT}/vault/pending}"
 for _vf in "${_va_root}"/*.md; do
  [ -f "$_vf" ] || continue
  _found_vault=true
  _vtitle=$(grep -m1 '^# ' "$_vf" | sed 's/^# //' || basename "$_vf")
--- a/supervisor/supervisor-run.sh
+++ b/supervisor/supervisor-run.sh
@ -50,6 +50,26 @@ WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh and env.sh log()
 LOG_AGENT="supervisor"
 # ── OPS Repo Detection (Issue #544) ──────────────────────────────────────
 # Decide whether the ops repo is usable and publish the result through
 # exported variables:
 #   OPS_REPO_DEGRADED  - 1 when OPS_REPO_ROOT is empty/unset or not a
 #                        directory, 0 otherwise
 #   OPS_KNOWLEDGE_ROOT - directory holding the best-practices files
 #   OPS_JOURNAL_ROOT   - directory for the supervisor journal
 #   OPS_VAULT_ROOT     - directory for pending vault items
 # In degraded mode the paths fall back to locations under FACTORY_ROOT and
 # PROJECT_REPO_ROOT so the run can continue without the ops repo.
 # NOTE(review): log() is not defined in this hunk — presumably provided by the
 # sourced environment before this point; confirm.
 if [ -z "${OPS_REPO_ROOT:-}" ] || [ ! -d "${OPS_REPO_ROOT}" ]; then
  log "WARNING: OPS_REPO_ROOT not set or directory missing — running in degraded mode (no playbooks, no journal continuity, no vault destination)"
  export OPS_REPO_DEGRADED=1
  # Set fallback paths for degraded mode
  export OPS_KNOWLEDGE_ROOT="${FACTORY_ROOT}/knowledge"
  export OPS_JOURNAL_ROOT="${FACTORY_ROOT}/state/supervisor-journal"
  export OPS_VAULT_ROOT="${PROJECT_REPO_ROOT}/vault/pending"
 else
  export OPS_REPO_DEGRADED=0
  export OPS_KNOWLEDGE_ROOT="${OPS_REPO_ROOT}/knowledge"
  export OPS_JOURNAL_ROOT="${OPS_REPO_ROOT}/journal/supervisor"
  export OPS_VAULT_ROOT="${OPS_REPO_ROOT}/vault/pending"
 fi
 # Create the writable directories once for both modes. This stays best-effort
 # (the run continues either way), but a failure is logged instead of being
 # discarded, so an unwritable journal/vault location does not go unnoticed.
 for _ops_dir in "$OPS_JOURNAL_ROOT" "$OPS_VAULT_ROOT"; do
  if ! mkdir -p -- "$_ops_dir" 2>/dev/null; then
   log "WARNING: could not create ${_ops_dir} — writes to it will fail"
  fi
 done
 unset _ops_dir
 # Override log() to append to supervisor-specific log file
 # shellcheck disable=SC2034
 log() {
@ -105,6 +125,25 @@ export CLAUDE_MODEL="sonnet"
 # ── Create worktree (before prompt assembly so trap is set early) ────────
 formula_worktree_setup "$WORKTREE"
 # Inject OPS repo status into prompt.
 # The two modes differ only in their headline; the three path bullets are
 # identical, so the headline is chosen first and the block assembled once.
 case "${OPS_REPO_DEGRADED:-0}" in
  1)
   _ops_headline="**DEGRADED MODE**: OPS repo is not available. Using bundled knowledge files and local journal/vault paths."
   ;;
  *)
   _ops_headline="**FULL MODE**: OPS repo available at ${OPS_REPO_ROOT}"
   ;;
 esac
 OPS_STATUS="
 ## OPS Repo Status
 ${_ops_headline}
 - Knowledge files: ${OPS_KNOWLEDGE_ROOT:-<unset>}
 - Journal: ${OPS_JOURNAL_ROOT:-<unset>}
 - Vault destination: ${OPS_VAULT_ROOT:-<unset>}
 "
 # The headline is only a building block; do not leave it in the script scope.
 unset _ops_headline
 PROMPT="You are the supervisor agent for ${FORGE_REPO}. Work through the formula below.
 You have full shell access and --dangerously-skip-permissions.
@ -117,6 +156,7 @@ ${PREFLIGHT_OUTPUT}
 ${CONTEXT_BLOCK}$(formula_lessons_block)
 ${SCRATCH_CONTEXT:+${SCRATCH_CONTEXT}
 }
 ${OPS_STATUS}
 Priority order: P0 memory > P1 disk > P2 stopped > P3 degraded > P4 housekeeping
 ${FORMULA_CONTENT}