diff --git a/formulas/run-supervisor.toml b/formulas/run-supervisor.toml index 63262d1..7d43d36 100644 --- a/formulas/run-supervisor.toml +++ b/formulas/run-supervisor.toml @@ -34,13 +34,15 @@ and injected into your prompt above. Review them now. (24h grace period). Check the "Stale Phase Cleanup" section for any files cleaned or in grace period this run. -2. Check vault state: read $OPS_REPO_ROOT/vault/pending/*.md for any procurement items +2. Check vault state: read ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/*.md for any procurement items the planner has filed. Note items relevant to the health assessment (e.g. a blocked resource that explains why the pipeline is stalled). + Note: In degraded mode, vault items are stored locally. 3. Read the supervisor journal for recent history: - JOURNAL_FILE="$OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md" + JOURNAL_FILE="${OPS_JOURNAL_ROOT:-$OPS_REPO_ROOT/journal/supervisor}/$(date -u +%Y-%m-%d).md" if [ -f "$JOURNAL_FILE" ]; then cat "$JOURNAL_FILE"; fi + Note: In degraded mode, the journal is stored locally and not committed to git. 4. Note any values that cross these thresholds: - RAM available < 500MB or swap > 3GB → P0 (memory crisis) @@ -143,7 +145,7 @@ For each finding from the health assessment, decide and execute an action. **P3 Stale PRs (CI done >20min, no push since):** Do NOT read dev-poll.sh, push branches, attempt merges, or investigate pipeline code. Instead, file a vault item for the dev-agent to pick up: - Write $OPS_REPO_ROOT/vault/pending/stale-pr-${ISSUE_NUM}.md: + Write ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/stale-pr-${ISSUE_NUM}.md: # Stale PR: ${PR_TITLE} ## What CI finished >20min ago but no git push has been made to the PR branch. @@ -157,7 +159,7 @@ For each finding from the health assessment, decide and execute an action. 
For P0-P2 issues that persist after auto-fix attempts, or issues requiring human judgment, file a vault procurement item: - Write $OPS_REPO_ROOT/vault/pending/supervisor-.md: + Write ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/supervisor-.md: # ## What @@ -166,13 +168,23 @@ human judgment, file a vault procurement item: ## Unblocks - Factory health: Vault PR filed on ops repo — human approves via PR review. + Note: In degraded mode (no ops repo), vault items are written locally to $OPS_VAULT_ROOT ($PROJECT_REPO_ROOT/vault/pending). -Read the relevant best-practices file before taking action: - cat "$OPS_REPO_ROOT/knowledge/memory.md" # P0 - cat "$OPS_REPO_ROOT/knowledge/disk.md" # P1 - cat "$OPS_REPO_ROOT/knowledge/ci.md" # P2 CI - cat "$OPS_REPO_ROOT/knowledge/dev-agent.md" # P2 agent - cat "$OPS_REPO_ROOT/knowledge/git.md" # P2 git +### Reading best-practices files + +Read the relevant best-practices file before taking action. In degraded mode, +use the bundled knowledge files from ${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}: + + cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/memory.md" # P0 + cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/disk.md" # P1 + cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/ci.md" # P2 CI + cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/dev-agent.md" # P2 agent + cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/git.md" # P2 git + cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/review-agent.md" # P2 review + cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/forge.md" # P2 forge + +Note: If OPS_REPO_ROOT is not available (degraded mode), the bundled knowledge +files in $OPS_KNOWLEDGE_ROOT ($FACTORY_ROOT/knowledge) provide fallback guidance. Track what you fixed and what vault items you filed for the report step. """ @@ -214,7 +226,7 @@ description = """ Append a timestamped entry to the supervisor journal. 
File path: - $OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md + ${OPS_JOURNAL_ROOT:-$OPS_REPO_ROOT/journal/supervisor}/$(date -u +%Y-%m-%d).md If the file already exists (multiple runs per day), append a new section. If it does not exist, create it. @@ -247,12 +259,20 @@ run-to-run context so future supervisor runs can detect trends IMPORTANT: Do NOT commit or push the journal — it is a local working file. The journal directory is committed to git periodically by other agents. +Note: In degraded mode (no ops repo), the journal is written locally to +${OPS_JOURNAL_ROOT:-} and is NOT automatically committed to any repo. + ## Learning -If you discover something new during this run, append it to the relevant -knowledge file in the ops repo: - echo "### Lesson title - Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/.md" +If you discover something new during this run: + +- In full mode (ops repo available): append to the relevant knowledge file: + echo "### Lesson title + Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/.md" + +- In degraded mode: write to the local knowledge directory for reference: + echo "### Lesson title + Description of what you learned." >> "${OPS_KNOWLEDGE_ROOT:-}/.md" Knowledge files: memory.md, disk.md, ci.md, forge.md, dev-agent.md, review-agent.md, git.md. diff --git a/knowledge/ci.md b/knowledge/ci.md new file mode 100644 index 0000000..5132b57 --- /dev/null +++ b/knowledge/ci.md @@ -0,0 +1,28 @@ +# CI/CD — Best Practices + +## CI Pipeline Issues (P2) + +When CI pipelines are stuck running >20min or pending >30min: + +### Investigation Steps +1. Check pipeline status via Forgejo API: + ```bash + curl -sf -H "Authorization: token $FORGE_TOKEN" \ + "$FORGE_API/pipelines?limit=50" | jq '.[] | {number, status, created}' + ``` + +2. 
Check Woodpecker CI if configured: + ```bash + curl -sf -H "Authorization: Bearer $WOODPECKER_TOKEN" \ + "$WOODPECKER_SERVER/api/repos/${WOODPECKER_REPO_ID}/pipelines?limit=10" + ``` + +### Common Fixes +- **Stuck pipeline**: Cancel via Forgejo API, retrigger +- **Pending pipeline**: Check queue depth, scale CI runners +- **Failed pipeline**: Review logs, fix failing test/step + +### Prevention +- Set timeout limits on CI pipelines +- Monitor runner capacity and scale as needed +- Use caching for dependencies to reduce build time diff --git a/knowledge/dev-agent.md b/knowledge/dev-agent.md new file mode 100644 index 0000000..c32f519 --- /dev/null +++ b/knowledge/dev-agent.md @@ -0,0 +1,28 @@ +# Dev Agent — Best Practices + +## Dev Agent Issues (P2) + +When dev-agent is stuck, blocked, or in bad state: + +### Dead Lock File +```bash +# Check if process still exists +ps -p $(cat /path/to/lock.file) 2>/dev/null || rm -f /path/to/lock.file +``` + +### Stale Worktree Cleanup +```bash +cd "$PROJECT_REPO_ROOT" +git worktree remove --force /tmp/stale-worktree 2>/dev/null || true +git worktree prune 2>/dev/null || true +``` + +### Blocked Pipeline +- Check if PR is awaiting review or CI +- Verify no other agent is actively working on same issue +- Check for unmet dependencies (issues with `Depends on` refs) + +### Prevention +- Single-threaded pipeline per project (AD-002) +- Clear lock files in EXIT traps +- Use phase files to track agent state diff --git a/knowledge/disk.md b/knowledge/disk.md new file mode 100644 index 0000000..3c98e36 --- /dev/null +++ b/knowledge/disk.md @@ -0,0 +1,35 @@ +# Disk Management — Best Practices + +## Disk Pressure Response (P1) + +When disk usage exceeds 80%, take these actions in order: + +### Immediate Actions +1. **Docker cleanup** (safe, low impact): + ```bash + sudo docker system prune -f + ``` + +2. 
**Aggressive Docker cleanup** (if still >80%): + ```bash + sudo docker system prune -a -f + ``` + This removes all unused images (not only dangling ones) in addition to stopped containers, unused networks, and build cache; volumes are kept unless `--volumes` is passed. + +3. **Log rotation**: + ```bash + for f in "$FACTORY_ROOT"/{dev,review,supervisor,gardener,planner,predictor}/*.log; do + [ -f "$f" ] && [ "$(du -k "$f" | cut -f1)" -gt 10240 ] && truncate -s 0 "$f" + done + ``` + +### Prevention +- Monitor disk with alerts at 70% (warning) and 80% (critical) +- Set up automatic log rotation for agent logs +- Clean up old Docker images regularly +- Consider using separate partitions for `/var/lib/docker` + +### When to Escalate +- Disk stays >80% after cleanup (indicates legitimate growth) +- No unused Docker images to clean +- Critical data filling disk (check /home, /var/log) diff --git a/knowledge/forge.md b/knowledge/forge.md new file mode 100644 index 0000000..d85f164 --- /dev/null +++ b/knowledge/forge.md @@ -0,0 +1,25 @@ +# Forgejo Operations — Best Practices + +## Forgejo Issues + +When Forgejo operations encounter issues: + +### API Rate Limits +- Monitor rate limit headers in API responses +- Implement exponential backoff on 429 responses +- Use agent-specific tokens (#747) to increase limits + +### Authentication Issues +- Verify FORGE_TOKEN is valid and not expired +- Check agent identity matches token (#747) +- Use FORGE_<AGENT>_TOKEN for agent-specific identities + +### Repository Access +- Verify FORGE_REMOTE matches actual git remote +- Check token has appropriate permissions (repo, write) +- Use `resolve_forge_remote()` to auto-detect remote + +### Prevention +- Set up monitoring for API failures +- Rotate tokens before expiry +- Document required permissions per agent diff --git a/knowledge/git.md b/knowledge/git.md new file mode 100644 index 0000000..f382bce --- /dev/null +++ b/knowledge/git.md @@ -0,0 +1,28 @@ +# Git State Recovery — Best Practices + +## Git State Issues (P2) + +When git repo is on wrong branch or in broken rebase state: + +### Wrong Branch 
Recovery +```bash +cd "$PROJECT_REPO_ROOT" +git checkout "$PRIMARY_BRANCH" 2>/dev/null || git checkout master 2>/dev/null +``` + +### Broken Rebase Recovery +```bash +cd "$PROJECT_REPO_ROOT" +git rebase --abort 2>/dev/null || true +git checkout "$PRIMARY_BRANCH" 2>/dev/null || git checkout master 2>/dev/null +``` + +### Stale Lock File Cleanup +```bash +rm -f /path/to/stale.lock +``` + +### Prevention +- Always checkout primary branch after rebase conflicts +- Remove lock files after agent sessions complete +- Use `git status` to verify repo state before operations diff --git a/knowledge/memory.md b/knowledge/memory.md new file mode 100644 index 0000000..711ea94 --- /dev/null +++ b/knowledge/memory.md @@ -0,0 +1,27 @@ +# Memory Management — Best Practices + +## Memory Crisis Response (P0) + +When RAM available drops below 500MB or swap usage exceeds 3GB, take these actions: + +### Immediate Actions +1. **Kill stale claude processes** (>3 hours old): + ```bash + pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true + ``` + +2. 
**Drop filesystem caches**: + ```bash + sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true + ``` + +### Prevention +- Set memory_guard to 2000MB minimum (default in env.sh) +- Configure swap usage alerts at 2GB +- Monitor for memory leaks in long-running processes +- Use cgroups for process memory limits + +### When to Escalate +- RAM stays <500MB after cache drop +- Swap continues growing after process kills +- System becomes unresponsive (OOM killer active) diff --git a/knowledge/review-agent.md b/knowledge/review-agent.md new file mode 100644 index 0000000..6027652 --- /dev/null +++ b/knowledge/review-agent.md @@ -0,0 +1,23 @@ +# Review Agent — Best Practices + +## Review Agent Issues + +When review agent encounters issues with PRs: + +### Stale PR Handling +- PRs stale >20min (CI done, no push since) → file vault item for dev-agent +- Do NOT push branches or attempt merges directly +- File vault item with: + - What: Stale PR requiring push + - Why: Factory degraded + - Unblocks: dev-agent will push the branch + +### Circular Dependencies +- Check backlog for issues with circular `Depends on` refs +- Use `lib/parse-deps.sh` to analyze dependency graph +- Report to planner for resolution + +### Prevention +- Review agent only reads PRs, never modifies +- Use vault items for actions requiring dev-agent +- Monitor for PRs stuck in review state diff --git a/supervisor/AGENTS.md b/supervisor/AGENTS.md index e1a80dd..d8679df 100644 --- a/supervisor/AGENTS.md +++ b/supervisor/AGENTS.md @@ -40,6 +40,12 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping). 
- `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by supervisor-run.sh) - `WOODPECKER_TOKEN`, `WOODPECKER_SERVER`, `WOODPECKER_DB_PASSWORD`, `WOODPECKER_DB_USER`, `WOODPECKER_DB_HOST`, `WOODPECKER_DB_NAME` — CI database queries +**Degraded mode (Issue #544)**: When `OPS_REPO_ROOT` is not set or the directory doesn't exist, the supervisor runs in degraded mode: +- Uses bundled knowledge files from `$FACTORY_ROOT/knowledge/` instead of ops repo playbooks +- Writes journal locally to `$FACTORY_ROOT/state/supervisor-journal/` (not committed to git) +- Files vault items locally to `$PROJECT_REPO_ROOT/vault/pending/` +- Logs a WARNING message at startup indicating degraded mode + **Lifecycle**: supervisor-run.sh (cron */20) → lock + memory guard → run preflight.sh (collect metrics) → load formula + context → run claude -p via agent-sdk.sh → Claude assesses health, auto-fixes, writes journal → `PHASE:done`. diff --git a/supervisor/preflight.sh b/supervisor/preflight.sh index 6d12cfd..2ddf110 100755 --- a/supervisor/preflight.sh +++ b/supervisor/preflight.sh @@ -214,7 +214,9 @@ echo "" echo "## Pending Vault Items" _found_vault=false -for _vf in "${OPS_REPO_ROOT}/vault/pending/"*.md; do +# Use OPS_VAULT_ROOT if set (from supervisor-run.sh degraded mode detection), otherwise default to OPS_REPO_ROOT +_va_root="${OPS_VAULT_ROOT:-${OPS_REPO_ROOT}/vault/pending}" +for _vf in "${_va_root}"/*.md; do [ -f "$_vf" ] || continue _found_vault=true _vtitle=$(grep -m1 '^# ' "$_vf" | sed 's/^# //' || basename "$_vf") diff --git a/supervisor/supervisor-run.sh b/supervisor/supervisor-run.sh index 0411177..2a44b47 100755 --- a/supervisor/supervisor-run.sh +++ b/supervisor/supervisor-run.sh @@ -50,6 +50,26 @@ WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run" # shellcheck disable=SC2034 # consumed by agent-sdk.sh and env.sh log() LOG_AGENT="supervisor" +# ── OPS Repo Detection (Issue #544) ────────────────────────────────────── +# Detect if OPS_REPO_ROOT is available and set degraded mode flag 
if not. +# This allows the supervisor to run with fallback knowledge files and +# local journal/vault paths when the ops repo is absent. +if [ -z "${OPS_REPO_ROOT:-}" ] || [ ! -d "${OPS_REPO_ROOT}" ]; then + log "WARNING: OPS_REPO_ROOT not set or directory missing — running in degraded mode (no playbooks, no journal continuity, no vault destination)" + export OPS_REPO_DEGRADED=1 + # Set fallback paths for degraded mode + export OPS_KNOWLEDGE_ROOT="${FACTORY_ROOT}/knowledge" + export OPS_JOURNAL_ROOT="${FACTORY_ROOT}/state/supervisor-journal" + export OPS_VAULT_ROOT="${PROJECT_REPO_ROOT}/vault/pending" + mkdir -p "$OPS_JOURNAL_ROOT" "$OPS_VAULT_ROOT" 2>/dev/null || true +else + export OPS_REPO_DEGRADED=0 + export OPS_KNOWLEDGE_ROOT="${OPS_REPO_ROOT}/knowledge" + export OPS_JOURNAL_ROOT="${OPS_REPO_ROOT}/journal/supervisor" + export OPS_VAULT_ROOT="${OPS_REPO_ROOT}/vault/pending" + mkdir -p "$OPS_JOURNAL_ROOT" "$OPS_VAULT_ROOT" 2>/dev/null || true +fi + # Override log() to append to supervisor-specific log file # shellcheck disable=SC2034 log() { @@ -105,6 +125,25 @@ export CLAUDE_MODEL="sonnet" # ── Create worktree (before prompt assembly so trap is set early) ──────── formula_worktree_setup "$WORKTREE" +# Inject OPS repo status into prompt +if [ "${OPS_REPO_DEGRADED:-0}" = "1" ]; then + OPS_STATUS=" +## OPS Repo Status +**DEGRADED MODE**: OPS repo is not available. Using bundled knowledge files and local journal/vault paths. +- Knowledge files: ${OPS_KNOWLEDGE_ROOT:-} +- Journal: ${OPS_JOURNAL_ROOT:-} +- Vault destination: ${OPS_VAULT_ROOT:-} +" +else + OPS_STATUS=" +## OPS Repo Status +**FULL MODE**: OPS repo available at ${OPS_REPO_ROOT} +- Knowledge files: ${OPS_KNOWLEDGE_ROOT:-} +- Journal: ${OPS_JOURNAL_ROOT:-} +- Vault destination: ${OPS_VAULT_ROOT:-} +" +fi + PROMPT="You are the supervisor agent for ${FORGE_REPO}. Work through the formula below. You have full shell access and --dangerously-skip-permissions. 
@@ -117,6 +156,7 @@ ${PREFLIGHT_OUTPUT} ${CONTEXT_BLOCK}$(formula_lessons_block) ${SCRATCH_CONTEXT:+${SCRATCH_CONTEXT} } +${OPS_STATUS} Priority order: P0 memory > P1 disk > P2 stopped > P3 degraded > P4 housekeeping ${FORMULA_CONTENT}