Merge pull request 'fix: bug: supervisor hardcodes ops repo expectation — fails silently on deployments without one (#544)' (#564) from fix/issue-544 into main

2026-04-10 08:19:07 +00:00 · 2026-04-10 08:19:07 +00:00 · a8b96d8211
commit a8b96d8211
parent be5957f127 f299bae77b
11 changed files with 278 additions and 16 deletions
--- a/formulas/run-supervisor.toml
+++ b/formulas/run-supervisor.toml
@ -34,13 +34,15 @@ and injected into your prompt above. Review them now.
   (24h grace period). Check the "Stale Phase Cleanup" section for any
   files cleaned or in grace period this run.

-2. Check vault state: read $OPS_REPO_ROOT/vault/pending/*.md for any procurement items
+2. Check vault state: read ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/*.md for any procurement items
   the planner has filed. Note items relevant to the health assessment
   (e.g. a blocked resource that explains why the pipeline is stalled).
+   Note: In degraded mode, vault items are stored locally.

 3. Read the supervisor journal for recent history:
-     JOURNAL_FILE="$OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md"
+     JOURNAL_FILE="${OPS_JOURNAL_ROOT:-$OPS_REPO_ROOT/journal/supervisor}/$(date -u +%Y-%m-%d).md"
     if [ -f "$JOURNAL_FILE" ]; then cat "$JOURNAL_FILE"; fi
+   Note: In degraded mode, the journal is stored locally and not committed to git.

 4. Note any values that cross these thresholds:
   - RAM available < 500MB or swap > 3GB → P0 (memory crisis)
@ -143,7 +145,7 @@ For each finding from the health assessment, decide and execute an action.
 **P3 Stale PRs (CI done >20min, no push since):**
  Do NOT read dev-poll.sh, push branches, attempt merges, or investigate pipeline code.
  Instead, file a vault item for the dev-agent to pick up:
-    Write $OPS_REPO_ROOT/vault/pending/stale-pr-${ISSUE_NUM}.md:
+    Write ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/stale-pr-${ISSUE_NUM}.md:
      # Stale PR: ${PR_TITLE}
      ## What
      CI finished >20min ago but no git push has been made to the PR branch.
@ -157,7 +159,7 @@ For each finding from the health assessment, decide and execute an action.

 For P0-P2 issues that persist after auto-fix attempts, or issues requiring
 human judgment, file a vault procurement item:
-  Write $OPS_REPO_ROOT/vault/pending/supervisor-<issue-slug>.md:
+  Write ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/supervisor-<issue-slug>.md:
    # <What is needed>
    ## What
    <description of the problem and why the supervisor cannot fix it>
@ -166,13 +168,23 @@ human judgment, file a vault procurement item:
    ## Unblocks
    - Factory health: <what this resolves>
  Vault PR filed on ops repo — human approves via PR review.
+  Note: In degraded mode (no ops repo), no vault PR is filed; the item is only written locally to ${OPS_VAULT_ROOT:-local path}.

-Read the relevant best-practices file before taking action:
-  cat "$OPS_REPO_ROOT/knowledge/memory.md"    # P0
-  cat "$OPS_REPO_ROOT/knowledge/disk.md"      # P1
-  cat "$OPS_REPO_ROOT/knowledge/ci.md"        # P2 CI
-  cat "$OPS_REPO_ROOT/knowledge/dev-agent.md" # P2 agent
-  cat "$OPS_REPO_ROOT/knowledge/git.md"       # P2 git
+### Reading best-practices files
+
+Read the relevant best-practices file before taking action. OPS_KNOWLEDGE_ROOT
+points to the ops repo's knowledge directory in full mode and to the bundled
+knowledge files in degraded mode, so the same commands work in both modes:
+
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/memory.md"    # P0
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/disk.md"      # P1
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/ci.md"        # P2 CI
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/dev-agent.md" # P2 agent
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/git.md"       # P2 git
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/review-agent.md" # P2 review
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/forge.md"     # P2 forge
+
+Note: If OPS_REPO_ROOT is not available (degraded mode), the bundled knowledge
+files in ${OPS_KNOWLEDGE_ROOT:-<unset>} provide fallback guidance.

 Track what you fixed and what vault items you filed for the report step.
 """
@ -214,7 +226,7 @@ description = """
 Append a timestamped entry to the supervisor journal.

 File path:
-  $OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md
+  ${OPS_JOURNAL_ROOT:-$OPS_REPO_ROOT/journal/supervisor}/$(date -u +%Y-%m-%d).md

 If the file already exists (multiple runs per day), append a new section.
 If it does not exist, create it.
@ -247,12 +259,20 @@ run-to-run context so future supervisor runs can detect trends
 IMPORTANT: Do NOT commit or push the journal — it is a local working file.
 The journal directory is committed to git periodically by other agents.

+Note: In degraded mode (no ops repo), the journal is written locally to
+${OPS_JOURNAL_ROOT:-<unset>} and is NOT automatically committed to any repo.
+
 ## Learning

-If you discover something new during this run, append it to the relevant
-knowledge file in the ops repo:
-  echo "### Lesson title
-  Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/<file>.md"
+If you discover something new during this run:
+
+- In full mode (ops repo available): append to the relevant knowledge file:
+    echo "### Lesson title
+    Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/<file>.md"
+
+- In degraded mode: write to the local knowledge directory for reference:
+    echo "### Lesson title
+    Description of what you learned." >> "${OPS_KNOWLEDGE_ROOT:-<unset>}/<file>.md"

 Knowledge files: memory.md, disk.md, ci.md, forge.md, dev-agent.md,
 review-agent.md, git.md.
--- a/knowledge/ci.md
+++ b/knowledge/ci.md
@ -0,0 +1,28 @@
+# CI/CD — Best Practices
+
+## CI Pipeline Issues (P2)
+
+When CI pipelines are stuck running >20min or pending >30min:
+
+### Investigation Steps
+1. Check pipeline status via Forgejo API:
+   ```bash
+   curl -sf -H "Authorization: token $FORGE_TOKEN" \
+     "$FORGE_API/pipelines?limit=50" | jq '.[] | {number, status, created}'
+   ```
+
+2. Check Woodpecker CI if configured:
+   ```bash
+   curl -sf -H "Authorization: Bearer $WOODPECKER_TOKEN" \
+     "$WOODPECKER_SERVER/api/repos/${WOODPECKER_REPO_ID}/pipelines?limit=10"
+   ```
+
+### Common Fixes
+- **Stuck pipeline**: Cancel via Forgejo API, retrigger
+- **Pending pipeline**: Check queue depth, scale CI runners
+- **Failed pipeline**: Review logs, fix failing test/step
+
+### Prevention
+- Set timeout limits on CI pipelines
+- Monitor runner capacity and scale as needed
+- Use caching for dependencies to reduce build time
--- a/knowledge/dev-agent.md
+++ b/knowledge/dev-agent.md
@ -0,0 +1,28 @@
+# Dev Agent — Best Practices
+
+## Dev Agent Issues (P2)
+
+When the dev-agent is stuck, blocked, or in a bad state:
+
+### Dead Lock File
+```bash
+# Check if process still exists
+ps -p $(cat /path/to/lock.file) 2>/dev/null || rm -f /path/to/lock.file
+```
+
+### Stale Worktree Cleanup
+```bash
+cd "$PROJECT_REPO_ROOT"
+git worktree remove --force /tmp/stale-worktree 2>/dev/null || true
+git worktree prune 2>/dev/null || true
+```
+
+### Blocked Pipeline
+- Check if PR is awaiting review or CI
+- Verify no other agent is actively working on same issue
+- Check for unmet dependencies (issues with `Depends on` refs)
+
+### Prevention
+- Single-threaded pipeline per project (AD-002)
+- Clear lock files in EXIT traps
+- Use phase files to track agent state
--- a/knowledge/disk.md
+++ b/knowledge/disk.md
@ -0,0 +1,35 @@
+# Disk Management — Best Practices
+
+## Disk Pressure Response (P1)
+
+When disk usage exceeds 80%, take these actions in order:
+
+### Immediate Actions
+1. **Docker cleanup** (safe, low impact):
+   ```bash
+   sudo docker system prune -f
+   ```
+
+2. **Aggressive Docker cleanup** (if still >80%):
+   ```bash
+   sudo docker system prune -a -f
+   ```
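+   This removes all unused images (not only dangling ones) in addition to stopped
+   containers, unused networks, and build cache. Volumes are not removed unless
+   `--volumes` is passed.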
+
+3. **Log rotation**:
+   ```bash
+   for f in "$FACTORY_ROOT"/{dev,review,supervisor,gardener,planner,predictor}/*.log; do
+     [ -f "$f" ] && [ "$(du -k "$f" | cut -f1)" -gt 10240 ] && truncate -s 0 "$f"
+   done
+   ```
+
+### Prevention
+- Monitor disk with alerts at 70% (warning) and 80% (critical)
+- Set up automatic log rotation for agent logs
+- Clean up old Docker images regularly
+- Consider using separate partitions for `/var/lib/docker`
+
+### When to Escalate
+- Disk stays >80% after cleanup (indicates legitimate growth)
+- No unused Docker images to clean
+- Critical data filling disk (check /home, /var/log)
--- a/knowledge/forge.md
+++ b/knowledge/forge.md
@ -0,0 +1,25 @@
+# Forgejo Operations — Best Practices
+
+## Forgejo Issues
+
+When Forgejo operations encounter issues:
+
+### API Rate Limits
+- Monitor rate limit headers in API responses
+- Implement exponential backoff on 429 responses
+- Use agent-specific tokens (#747) to increase limits
+
+### Authentication Issues
+- Verify FORGE_TOKEN is valid and not expired
+- Check agent identity matches token (#747)
+- Use FORGE_<AGENT>_TOKEN for agent-specific identities
+
+### Repository Access
+- Verify FORGE_REMOTE matches actual git remote
+- Check token has appropriate permissions (repo, write)
+- Use `resolve_forge_remote()` to auto-detect remote
+
+### Prevention
+- Set up monitoring for API failures
+- Rotate tokens before expiry
+- Document required permissions per agent
--- a/knowledge/git.md
+++ b/knowledge/git.md
@ -0,0 +1,28 @@
+# Git State Recovery — Best Practices
+
+## Git State Issues (P2)
+
+When the git repo is on the wrong branch or in a broken rebase state:
+
+### Wrong Branch Recovery
+```bash
+cd "$PROJECT_REPO_ROOT"
+git checkout "$PRIMARY_BRANCH" 2>/dev/null || git checkout master 2>/dev/null
+```
+
+### Broken Rebase Recovery
+```bash
+cd "$PROJECT_REPO_ROOT"
+git rebase --abort 2>/dev/null || true
+git checkout "$PRIMARY_BRANCH" 2>/dev/null || git checkout master 2>/dev/null
+```
+
+### Stale Lock File Cleanup
+```bash
+rm -f /path/to/stale.lock
+```
+
+### Prevention
+- Always checkout primary branch after rebase conflicts
+- Remove lock files after agent sessions complete
+- Use `git status` to verify repo state before operations
--- a/knowledge/memory.md
+++ b/knowledge/memory.md
@ -0,0 +1,27 @@
+# Memory Management — Best Practices
+
+## Memory Crisis Response (P0)
+
+When RAM available drops below 500MB or swap usage exceeds 3GB, take these actions:
+
+### Immediate Actions
+1. **Kill stale claude processes** (>3 hours old):
+   ```bash
+   pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
+   ```
+
+2. **Drop filesystem caches**:
+   ```bash
+   sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true
+   ```
+
+### Prevention
+- Set memory_guard to 2000MB minimum (default in env.sh)
+- Configure swap usage alerts at 2GB
+- Monitor for memory leaks in long-running processes
+- Use cgroups for process memory limits
+
+### When to Escalate
+- RAM stays <500MB after cache drop
+- Swap continues growing after process kills
+- System becomes unresponsive (OOM killer active)
--- a/knowledge/review-agent.md
+++ b/knowledge/review-agent.md
@ -0,0 +1,23 @@
+# Review Agent — Best Practices
+
+## Review Agent Issues
+
+When review agent encounters issues with PRs:
+
+### Stale PR Handling
+- PRs stale >20min (CI done, no push since) → file vault item for dev-agent
+- Do NOT push branches or attempt merges directly
+- File vault item with:
+  - What: Stale PR requiring push
+  - Why: Factory degraded
+  - Unblocks: dev-agent will push the branch
+
+### Circular Dependencies
+- Check backlog for issues with circular `Depends on` refs
+- Use `lib/parse-deps.sh` to analyze dependency graph
+- Report to planner for resolution
+
+### Prevention
+- Review agent only reads PRs, never modifies
+- Use vault items for actions requiring dev-agent
+- Monitor for PRs stuck in review state
--- a/supervisor/AGENTS.md
+++ b/supervisor/AGENTS.md
@ -40,6 +40,12 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).
 - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by supervisor-run.sh)
 - `WOODPECKER_TOKEN`, `WOODPECKER_SERVER`, `WOODPECKER_DB_PASSWORD`, `WOODPECKER_DB_USER`, `WOODPECKER_DB_HOST`, `WOODPECKER_DB_NAME` — CI database queries

+**Degraded mode (Issue #544)**: When `OPS_REPO_ROOT` is not set or the directory doesn't exist, the supervisor runs in degraded mode:
+- Uses bundled knowledge files from `$FACTORY_ROOT/knowledge/` instead of ops repo playbooks
+- Writes journal locally to `$FACTORY_ROOT/state/supervisor-journal/` (not committed to git)
+- Files vault items locally to `$PROJECT_REPO_ROOT/vault/pending/`
+- Logs a WARNING message at startup indicating degraded mode
+
 **Lifecycle**: supervisor-run.sh (cron */20) → lock + memory guard → run
 preflight.sh (collect metrics) → load formula + context → run claude -p via agent-sdk.sh
 → Claude assesses health, auto-fixes, writes journal → `PHASE:done`.
--- a/supervisor/preflight.sh
+++ b/supervisor/preflight.sh
@ -214,7 +214,9 @@ echo ""

 echo "## Pending Vault Items"
 _found_vault=false
-for _vf in "${OPS_REPO_ROOT}/vault/pending/"*.md; do
+# Use OPS_VAULT_ROOT if set (exported by supervisor-run.sh in both full and degraded mode); otherwise fall back to ${OPS_REPO_ROOT}/vault/pending
+_va_root="${OPS_VAULT_ROOT:-${OPS_REPO_ROOT}/vault/pending}"
+for _vf in "${_va_root}"/*.md; do
  [ -f "$_vf" ] || continue
  _found_vault=true
  _vtitle=$(grep -m1 '^# ' "$_vf" | sed 's/^# //' || basename "$_vf")
--- a/supervisor/supervisor-run.sh
+++ b/supervisor/supervisor-run.sh
@ -50,6 +50,26 @@ WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh and env.sh log()
 LOG_AGENT="supervisor"

+# ── OPS Repo Detection (Issue #544) ──────────────────────────────────────
+# Choose full or degraded mode and export the knowledge, journal and vault
+# paths. Degraded mode (OPS_REPO_ROOT unset or not a directory) falls back to
+# the bundled knowledge files and to local journal/vault directories.
+if [ -z "${OPS_REPO_ROOT:-}" ] || [ ! -d "${OPS_REPO_ROOT}" ]; then
+  log "WARNING: OPS_REPO_ROOT not set or directory missing — running in degraded mode (no playbooks, no journal continuity, no vault destination)"
+  export OPS_REPO_DEGRADED=1
+  export OPS_KNOWLEDGE_ROOT="${FACTORY_ROOT}/knowledge"
+  export OPS_JOURNAL_ROOT="${FACTORY_ROOT}/state/supervisor-journal"
+  export OPS_VAULT_ROOT="${PROJECT_REPO_ROOT}/vault/pending"
+else
+  export OPS_REPO_DEGRADED=0
+  export OPS_KNOWLEDGE_ROOT="${OPS_REPO_ROOT}/knowledge"
+  export OPS_JOURNAL_ROOT="${OPS_REPO_ROOT}/journal/supervisor"
+  export OPS_VAULT_ROOT="${OPS_REPO_ROOT}/vault/pending"
+fi
+# Both modes need these directories; failure is non-fatal but must not be silent.
+mkdir -p "$OPS_JOURNAL_ROOT" "$OPS_VAULT_ROOT" 2>/dev/null \
+  || log "WARNING: cannot create $OPS_JOURNAL_ROOT or $OPS_VAULT_ROOT — journal/vault writes may fail"
+
 # Override log() to append to supervisor-specific log file
 # shellcheck disable=SC2034
 log() {
@ -105,6 +125,25 @@ export CLAUDE_MODEL="sonnet"
 # ── Create worktree (before prompt assembly so trap is set early) ────────
 formula_worktree_setup "$WORKTREE"

+# Build the "OPS Repo Status" prompt section. Only the mode line differs
+# between degraded and full mode; the path bullets are shared.
+if [ "${OPS_REPO_DEGRADED:-0}" = "1" ]; then
+  _ops_mode_line="**DEGRADED MODE**: OPS repo is not available. Using bundled knowledge files and local journal/vault paths."
+else
+  _ops_mode_line="**FULL MODE**: OPS repo available at ${OPS_REPO_ROOT}"
+fi
+OPS_STATUS="
+## OPS Repo Status
+${_ops_mode_line}
+- Knowledge files: ${OPS_KNOWLEDGE_ROOT:-<unset>}
+- Journal: ${OPS_JOURNAL_ROOT:-<unset>}
+- Vault destination: ${OPS_VAULT_ROOT:-<unset>}
+"
+
 PROMPT="You are the supervisor agent for ${FORGE_REPO}. Work through the formula below.

 You have full shell access and --dangerously-skip-permissions.
@ -117,6 +156,7 @@ ${PREFLIGHT_OUTPUT}
 ${CONTEXT_BLOCK}$(formula_lessons_block)
 ${SCRATCH_CONTEXT:+${SCRATCH_CONTEXT}
 }
+${OPS_STATUS}
 Priority order: P0 memory > P1 disk > P2 stopped > P3 degraded > P4 housekeeping

 ${FORMULA_CONTENT}