From b417747f8973d6fa460ed00486ff9d9a5cfece77 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sun, 22 Mar 2026 11:47:13 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20feat:=20predictor=20v2=20=E2=80=94=20out?=
 =?UTF-8?q?come=20measurement=20+=20external=20signal=20scanning=20(#547)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 formulas/run-predictor.toml | 101 +++++++++++++++++++++++++++++++++---
 predictor/AGENTS.md         |  28 ++++++----
 predictor/predictor-run.sh  |  10 +++-
 3 files changed, 120 insertions(+), 19 deletions(-)

diff --git a/formulas/run-predictor.toml b/formulas/run-predictor.toml
index 63283b8..2508d72 100644
--- a/formulas/run-predictor.toml
+++ b/formulas/run-predictor.toml
@@ -6,15 +6,23 @@
 #
 # Steps: preflight → collect-signals → analyze-and-predict
 #
-# Disinto-specific signal sources:
-#   - CI pipeline trends (Woodpecker)
-#   - Stale issues (open issues with no recent activity)
-#   - Agent health (tmux sessions, recent logs)
-#   - Resource patterns (RAM, disk, load, containers)
+# Signal sources (three categories):
+#   Health signals:
+#     - CI pipeline trends (Woodpecker)
+#     - Stale issues (open issues with no recent activity)
+#     - Agent health (tmux sessions, recent logs)
+#     - Resource patterns (RAM, disk, load, containers)
+#   Outcome signals:
+#     - Output freshness (formula evidence/artifacts)
+#     - Capacity utilization (idle agents vs dispatchable work)
+#     - Throughput (recently closed issues, merged PRs)
+#   External signals:
+#     - Dependency security advisories
+#     - Upstream breaking changes and deprecations
 
 name        = "run-predictor"
-description = "Evidence-based prediction: CI trends, stale issues, agent health, resource patterns"
-version     = 1
+description = "Evidence-based prediction: health, outcome measurement, external environment signals"
+version     = 2
 model       = "sonnet"
 
 [context]
@@ -114,6 +122,59 @@ Also check prediction/backlog (watched but not yet actioned):
     "$CODEBERG_API/issues?state=open&type=issues&labels=prediction%2Fbacklog&limit=50"
 
 Record their titles so you can avoid duplicating them.
+
+### 6. Outcome measurement
+
+Check whether the factory is producing results, not just running:
+
+- Read RESOURCES.md for available formulas and capabilities
+- Read $PROJECT_REPO_ROOT/formulas/*.toml for dispatchable work
+- Check evidence/output directories for freshness:
+    find "$PROJECT_REPO_ROOT" -maxdepth 3 -name "*.log" -o -name "journal" -type d | \
+      while read -r f; do
+        echo "=== $f ==="
+        find "$f" -maxdepth 1 -type f -printf '%T+ %p\n' 2>/dev/null | sort -r | head -5
+      done
+- Check recently closed issues — is work completing or just cycling?
+    curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
+      "$CODEBERG_API/issues?state=closed&type=issues&limit=20&sort=updated&direction=desc"
+- Check recently merged PRs — what's the throughput?
+    curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
+      "$CODEBERG_API/pulls?state=closed&sort=recentupdate&limit=20" | \
+      jq '[.[] | select(.merged)]'
+- Compare available capacity vs actual utilization:
+    tmux list-sessions 2>/dev/null | wc -l  # active sessions
+    curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
+      "$CODEBERG_API/issues?state=open&type=issues&labels=backlog&limit=50" | jq 'length'
+
+Look for:
+- Formulas that haven't produced output recently (stale journals/logs)
+- Idle compute when dispatchable work exists (backlog items but no active sessions)
+- High churn (issues opened and closed rapidly without merged PRs)
+- Low throughput relative to available agents
+
+### 7. External environment scan
+
+Look outside the box for signals that could affect the project:
+
+- Identify key dependencies from the project (package.json, go.mod, Cargo.toml,
+  requirements.txt, or similar — whatever exists in $PROJECT_REPO_ROOT)
+- Identify key tools (Claude CLI version, Woodpecker CI, Caddy, Docker, etc.)
+- For each major dependency or tool, use web search to check for:
+    - Security advisories or CVEs
+    - Breaking changes in recent releases
+    - Deprecation notices
+    - Major version bumps that could require migration
+
+Use WebSearch to gather these signals. Be targeted — search for specific
+dependencies and tools used by the project, not general news.
+Run at most 5 web searches to keep the run fast.
+
+Look for:
+- CVEs or security advisories mentioning project dependencies
+- Major version releases of key tools (could break CI, require migration)
+- Deprecation notices for APIs or services in use
+- Ecosystem shifts that could obsolete current approaches
 """
 needs = ["preflight"]
 
@@ -149,6 +210,28 @@ Analyze the collected signals for patterns and file up to 5 prediction issues.
 - Box idle (RAM > 3000MB, load < 1.0, few active sessions) → good time
   for expensive operations if any are pending
 
+**Low throughput** — Factory running but not producing:
+- No issues closed in 7+ days despite available backlog → pipeline may be stuck
+- PRs merged but no issues closed → work not tracked properly
+- Agent sessions active but no PRs created → agents may be spinning
+- Formulas with no recent journal entries → agent may not be running
+
+**Idle capacity** — Dispatchable work not being picked up:
+- Backlog items available but no in-progress issues → dev-poll may be stuck
+- Multiple agents idle (few tmux sessions) with work queued → scheduling problem
+- High churn: issues opened and closed quickly without PRs → busy but not productive
+
+**External risk** — Threats or opportunities from outside:
+- CVE or security advisory for a project dependency → patch urgently
+- Major version release of a key tool → may require migration planning
+- Deprecation notice for an API or service in use → plan transition
+- Breaking change upstream that could affect CI or builds → investigate
+
+**External opportunity** — Beneficial changes in the ecosystem:
+- New tool release that could accelerate work → consider adoption
+- Upstream improvement that simplifies current workarounds → refactor opportunity
+- Security patch available for a known vulnerability → apply proactively
+
 ## Filing predictions
 
 For each prediction, create a Codeberg issue with the `prediction/unreviewed` label.
@@ -177,10 +260,12 @@ For each prediction, create a Codeberg issue with the `prediction/unreviewed` la
 
 ## Rules
 - Max 5 predictions total
-- Do NOT predict feature work — only infrastructure/health/metric observations
+- Do NOT predict feature work — only health observations, outcome measurements,
+  and external risk/opportunity signals
 - Do NOT duplicate existing open predictions (checked in collect-signals)
 - Be specific: name the metric, the value, the threshold
 - Prefer high-confidence predictions backed by concrete data
+- External signals must name the specific dependency/tool and the advisory/change
 - If no meaningful patterns found, file zero issues — that is a valid outcome
 
 """
diff --git a/predictor/AGENTS.md b/predictor/AGENTS.md
index c7c34ea..0f8e738 100644
--- a/predictor/AGENTS.md
+++ b/predictor/AGENTS.md
@@ -1,14 +1,22 @@
 <!-- last-reviewed: ac51497489abc5412bc47f451facc30b0455cbd2 -->
 # Predictor Agent
 
-**Role**: Infrastructure pattern detection (the "goblin"). Runs a 3-step
+**Role**: Risk oracle and opportunity spotter (the "goblin"). Runs a 3-step
 formula (preflight → collect-signals → analyze-and-predict) via interactive
-tmux Claude session (sonnet). Collects disinto-specific signals: CI pipeline
-trends (Woodpecker), stale issues, agent health (tmux sessions + logs), and
-resource patterns (RAM, disk, load, containers). Files up to 5
-`prediction/unreviewed` issues for the Planner to triage. The predictor MUST
-NOT emit feature work — only observations about CI health, issue staleness,
-agent status, and system conditions.
+tmux Claude session (sonnet). Collects three categories of signals:
+
+1. **Health signals** — CI pipeline trends (Woodpecker), stale issues, agent
+   health (tmux sessions + logs), resource patterns (RAM, disk, load, containers)
+2. **Outcome signals** — output freshness (formula journals/artifacts), capacity
+   utilization (idle agents vs dispatchable backlog), throughput (closed issues,
+   merged PRs, churn detection)
+3. **External signals** — dependency security advisories, upstream breaking
+   changes, deprecation notices, ecosystem shifts (via targeted web search)
+
+Files up to 5 `prediction/unreviewed` issues for the Planner to triage.
+Predictions cover both "things going wrong" and "opportunities being missed".
+The predictor MUST NOT emit feature work — only observations about health,
+outcomes, and external risks/opportunities.
 
 **Trigger**: `predictor-run.sh` runs daily at 06:00 UTC via cron (1h before
 the planner at 07:00). Guarded by PID lock (`/tmp/predictor-run.lock`) and
@@ -31,6 +39,8 @@ memory check (skips if available RAM < 2000 MB).
 
 **Lifecycle**: predictor-run.sh (daily 06:00 cron) → lock + memory guard →
 load formula + context → create tmux session → Claude collects signals
-(CI trends, stale issues, agent health, resources) → dedup against existing
-open predictions → file `prediction/unreviewed` issues → `PHASE:done`.
+(health: CI trends, stale issues, agent health, resources; outcomes: output
+freshness, capacity utilization, throughput; external: dependency advisories,
+ecosystem changes via web search) → dedup against existing open predictions →
+file `prediction/unreviewed` issues → `PHASE:done`.
 The planner's Phase 1 later triages these predictions.
diff --git a/predictor/predictor-run.sh b/predictor/predictor-run.sh
index 2c43a4f..d8b6556 100755
--- a/predictor/predictor-run.sh
+++ b/predictor/predictor-run.sh
@@ -57,10 +57,16 @@ build_prompt_footer
 # shellcheck disable=SC2034  # consumed by run_formula_and_monitor
 PROMPT="You are the prediction agent (goblin) for ${CODEBERG_REPO}. Work through the formula below. You MUST write PHASE:done to '${PHASE_FILE}' when finished — the orchestrator will time you out if you return to the prompt without signalling.
 
-Your role: spot patterns in infrastructure signals and file them as prediction issues.
+Your role: spot patterns across three signal categories and file them as prediction issues:
+1. Health signals — CI trends, agent status, resource pressure, stale issues
+2. Outcome signals — output freshness, capacity utilization, throughput
+3. External signals — dependency advisories, upstream changes, ecosystem shifts
+
 The planner (adult) will triage every prediction before acting.
 You MUST NOT emit feature work or implementation issues — only predictions
-about CI health, issue staleness, agent status, and system conditions.
+about health, outcomes, and external risks/opportunities.
+Use WebSearch for external signal scanning — be targeted (project dependencies
+and tools only, not general news). Limit to 5 web searches per run.
 
 ## Project context
 ${CONTEXT_BLOCK}