# disinto/formulas/run-supervisor.toml

# formulas/run-supervisor.toml — Supervisor formula (health monitoring + remediation)
#
# Executed by supervisor/supervisor-run.sh via cron (every 20 minutes).
# supervisor-run.sh creates a tmux session with Claude (sonnet) and injects
# this formula with pre-collected metrics as context.
#
# Steps: preflight → health-assessment → decide-actions → report → journal
#
# Key differences from planner/gardener:
#   - Runs every 20min — lightweight health check
#   - Primarily READS state, rarely WRITES (no PRs — only local auto-fixes, vault items, Matrix + journal)
#   - Checks vault state for pending procurement items
#   - Conversation memory via Matrix thread and journal

# Formula metadata (root table).
name = "run-supervisor"
description = "Factory health monitoring: assess metrics, fix issues, report via Matrix, write journal"
version = 1
model = "sonnet"

# Files supplied as extra context for the run.
# NOTE(review): path is presumably resolved relative to the repo root by the
# runner script — confirm against supervisor/supervisor-run.sh.
[context]
files = ["AGENTS.md"]

# Step 1: review the metrics injected by supervisor/preflight.sh, the vault
# state, and recent journal history. The thresholds listed here are a quick
# reference and must stay in sync with the "health-assessment" step.
[[steps]]
id = "preflight"
title = "Review pre-collected metrics"
description = """
The pre-flight metrics have already been collected by supervisor/preflight.sh
and injected into your prompt above. Review them now.

1. Read the injected metrics data carefully (System Resources, Docker,
   Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs,
   CI Pipelines, Open PRs, Issue Status, Stale Worktrees).
   Note: preflight.sh auto-removes PHASE:escalate files for closed issues
   (24h grace period). Check the "Stale Phase Cleanup" section for any
   files cleaned or in grace period this run.

2. Check vault state: read vault/pending/*.md for any procurement items
   the planner has filed. Note items relevant to the health assessment
   (e.g. a blocked resource that explains why the pipeline is stalled).

3. Read the supervisor journal for recent history:
     JOURNAL_FILE="$FACTORY_ROOT/supervisor/journal/$(date -u +%Y-%m-%d).md"
     if [ -f "$JOURNAL_FILE" ]; then
       cat "$JOURNAL_FILE"
     else
       # First run after 00:00 UTC: today's file does not exist yet, so fall
       # back to the tail of the most recent earlier journal.
       PREV_JOURNAL=$(ls "$FACTORY_ROOT"/supervisor/journal/*.md 2>/dev/null | sort | tail -1)
       if [ -n "$PREV_JOURNAL" ]; then tail -n 60 "$PREV_JOURNAL"; fi
     fi

4. Note any values that cross these thresholds:
   - RAM available < 500MB, or swap used > 3GB with RAM available < 2000MB → P0 (memory crisis)
   - Disk > 80% → P1 (disk pressure)
   - Agent sessions dead, CI stuck/pending, git in bad state → P2 (factory stopped)
   - PRs stale >20min (CI done, no push since) → P3 (factory degraded)
   - Stale worktrees, old lock files → P4 (housekeeping)
"""

# Step 2: sort every finding from the metrics into priority levels P0–P4.
# Runs after "preflight".
[[steps]]
id = "health-assessment"
title = "Evaluate health of each subsystem"
needs = ["preflight"]
description = """
Categorize every finding from the metrics into priority levels.

### P0 — Memory crisis
- RAM available < 500MB
- Swap used > 3GB AND RAM available < 2000MB

### P1 — Disk pressure
- Disk usage > 80%

### P2 — Factory stopped / stalled
- CI pipelines stuck running > 20min or pending > 30min
- Dev-agent lock file present but process dead
- Dev-agent status unchanged for > 30min
- Git repo on wrong branch or in broken rebase state
- Pipeline stalled: backlog issues exist but no agent ran for > 20min
- Dev-agent blocked: last N polls all report "no ready issues"
- Dev/action sessions in PHASE:escalate for > 24h (session timeout)
  (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight;
  this check covers sessions where the issue is still open)

### P3 — Factory degraded
- PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed
  (Do NOT flag PRs that are actively being worked on — only truly inactive ones)
- Circular dependency deadlocks in backlog
- Stale dependencies (blocked by issues open > 30 days)

### P4 — Housekeeping
- Stale worktrees > 2h old with no active process
- Lock files for dead processes
- Stale claude processes (> 3h old)

List each finding with its priority level. If everything looks healthy,
note "All systems healthy" and proceed.
"""

# Step 3: apply the auto-fixes listed below and escalate everything else as
# vault procurement items. Runs after "health-assessment".
# The best-practices pointers come first because they must be read before
# any action is taken; the auto-fix sections follow in priority order.
[[steps]]
id = "decide-actions"
title = "Fix what you can, file vault items for what you cannot"
needs = ["health-assessment"]
description = """
For each finding from the health assessment, decide and execute an action.

Read the relevant best-practices file BEFORE taking action:
  cat "$FACTORY_ROOT/supervisor/best-practices/memory.md"    # P0
  cat "$FACTORY_ROOT/supervisor/best-practices/disk.md"      # P1
  cat "$FACTORY_ROOT/supervisor/best-practices/ci.md"        # P2 CI
  cat "$FACTORY_ROOT/supervisor/best-practices/dev-agent.md" # P2 agent
  cat "$FACTORY_ROOT/supervisor/best-practices/git.md"       # P2 git

### Auto-fixable (execute these directly)

Paths such as /path/to/stale.lock and /tmp/stale-worktree below are
placeholders — substitute the actual paths reported in the metrics.

**P0 Memory crisis:**
  # Kill stale one-shot claude processes (>3h old)
  # (xargs -r: do not run kill at all when pgrep finds nothing)
  pgrep -f "claude -p" --older 10800 2>/dev/null | xargs -r kill 2>/dev/null || true
  # Drop filesystem caches
  sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true

**P1 Disk pressure:**
  # Docker cleanup
  sudo docker system prune -f >/dev/null 2>&1 || true
  # Truncate logs > 10MB
  for f in "$FACTORY_ROOT"/{dev,review,supervisor,gardener,planner,predictor}/*.log; do
    [ -f "$f" ] && [ "$(du -k "$f" | cut -f1)" -gt 10240 ] && truncate -s 0 "$f"
  done

**P2 Dead lock files:**
  # Only remove a lock whose owning process is confirmed dead
  rm -f /path/to/stale.lock

**P2 Stale rebase:**
  cd "$PROJECT_REPO_ROOT"
  git rebase --abort 2>/dev/null
  git checkout "$PRIMARY_BRANCH" 2>/dev/null

**P2 Wrong branch:**
  cd "$PROJECT_REPO_ROOT"
  git checkout "$PRIMARY_BRANCH" 2>/dev/null

**P3 Stale PRs (CI done >20min, no push since):**
  Do NOT read dev-poll.sh, push branches, attempt merges, or investigate pipeline code.
  Instead, nudge the dev-agent via tmux injection if a session is alive:
    # Find the dev session for this issue (set ISSUE_NUM to the PR's issue number).
    # The trailing group stops issue 12 from matching a session for issue 123.
    SESSION=$(tmux list-sessions -F '#{session_name}' 2>/dev/null | grep -E "dev-.*-${ISSUE_NUM}([^0-9]|$)" | head -1)
    if [ -n "$SESSION" ]; then
      # Inject a nudge into the dev-agent session
      tmux send-keys -t "$SESSION" "# [supervisor] PR stale >20min — CI finished, please push or update" Enter
    fi
  If no active tmux session exists, note it in the journal for the next dev-poll cycle.
  Do NOT file vault items for stale PRs unless they remain stale for >3 consecutive runs.

**P4 Stale PHASE:escalate files (closed issues):**
  Already handled by preflight.sh auto-cleanup. Check "Stale Phase Cleanup"
  in the metrics for results. Log any cleanups in the journal.

**P4 Stale worktrees:**
  git -C "$PROJECT_REPO_ROOT" worktree remove --force /tmp/stale-worktree 2>/dev/null
  git -C "$PROJECT_REPO_ROOT" worktree prune 2>/dev/null

**P4 Stale claude processes:**
  pgrep -f "claude -p" --older 10800 2>/dev/null | xargs -r kill 2>/dev/null || true

### Cannot auto-fix → file vault item

For P0-P2 issues that persist after auto-fix attempts, or issues requiring
human judgment, file a vault procurement item:
  Write $PROJECT_REPO_ROOT/vault/pending/supervisor-<issue-slug>.md:
    # <What is needed>
    ## What
    <description of the problem and why the supervisor cannot fix it>
    ## Why
    <impact on factory health — reference the priority level>
    ## Unblocks
    - Factory health: <what this resolves>
  The vault-poll will notify the human and track the request.

Track what you fixed and what vault items you filed for the report step.
"""

# Step 4: post a Matrix summary of fixes, alerts, and filed vault items,
# staying silent when nothing changed. Runs after "decide-actions".
[[steps]]
id = "report"
title = "Post health summary to Matrix"
needs = ["decide-actions"]
description = """
Post a status summary to Matrix. Use the matrix_send function:
  source "$FACTORY_ROOT/lib/env.sh"
  matrix_send "supervisor" "<message>"

### When everything is healthy
Post a brief "all clear" only if the PREVIOUS run had alerts (check journal).
Do NOT post "all clear" every 20 minutes — that would be noise.

### When there are findings
Post a summary grouped by priority:
  matrix_send "supervisor" "Supervisor health check:

  Fixed:
  - <what was auto-fixed>

  Alerts:
  - [P2] <description>
  - [P3] <description>

  Status: RAM=<X>MB Disk=<Y>% Load=<Z>"

### When vault items were filed (P0-P2 unresolved)
Note the vault items in the status summary:
  matrix_send "supervisor" "Supervisor health check:

  Filed vault items:
  - vault/pending/<id>.md — <summary>

  Status: RAM=<X>MB Disk=<Y>% Load=<Z>"

Keep messages concise. Do not post identical messages to what was posted
in the previous run (check journal for prior messages).
"""

# Step 5: append this run's entry to the daily supervisor journal, then write
# the PHASE:done signal. Runs after "report".
# The entry records the Matrix message posted this run because the "report"
# step checks the journal to avoid repeating the previous run's message.
[[steps]]
id = "journal"
title = "Write health journal entry"
needs = ["report"]
description = """
Append a timestamped entry to the supervisor journal.

File path:
  $FACTORY_ROOT/supervisor/journal/$(date -u +%Y-%m-%d).md

If the file already exists (multiple runs per day), append a new section.
If it does not exist, create it (and the journal directory, if missing):
  mkdir -p "$FACTORY_ROOT/supervisor/journal"

Format:
  ## Supervisor run — HH:MM UTC

  ### Health status
  - RAM: <X>MB available, Swap: <X>MB
  - Disk: <X>%
  - Load: <X>
  - Docker: <N> containers

  ### Findings
  - [P<N>] <finding> — <action taken or "filed vault item">
  (or "No issues found — all systems healthy")

  ### Actions taken
  - <what was fixed>
  (or "No actions needed")

  ### Vault items filed
  - vault/pending/<id>.md — <reason>
  (or "None")

  ### Matrix
  - <message posted this run, condensed to one line>
  (or "Nothing posted")

Keep each entry concise — 15-25 lines max. This journal provides
run-to-run context so future supervisor runs can detect trends
(e.g., "disk has been >75% for 3 consecutive runs").

IMPORTANT: Do NOT commit or push the journal — it is a local working file.
The journal directory is committed to git periodically by other agents.

After writing the journal, write the phase signal:
  echo 'PHASE:done' > "$PHASE_FILE"
"""