From b417747f8973d6fa460ed00486ff9d9a5cfece77 Mon Sep 17 00:00:00 2001 From: openhands Date: Sun, 22 Mar 2026 11:47:13 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20feat:=20predictor=20v2=20=E2=80=94=20out?= =?UTF-8?q?come=20measurement=20+=20external=20signal=20scanning=20(#547)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- formulas/run-predictor.toml | 101 +++++++++++++++++++++++++++++++++--- predictor/AGENTS.md | 28 ++++++---- predictor/predictor-run.sh | 10 +++- 3 files changed, 120 insertions(+), 19 deletions(-) diff --git a/formulas/run-predictor.toml b/formulas/run-predictor.toml index 63283b8..2508d72 100644 --- a/formulas/run-predictor.toml +++ b/formulas/run-predictor.toml @@ -6,15 +6,23 @@ # # Steps: preflight → collect-signals → analyze-and-predict # -# Disinto-specific signal sources: -# - CI pipeline trends (Woodpecker) -# - Stale issues (open issues with no recent activity) -# - Agent health (tmux sessions, recent logs) -# - Resource patterns (RAM, disk, load, containers) +# Signal sources (three categories): +# Health signals: +# - CI pipeline trends (Woodpecker) +# - Stale issues (open issues with no recent activity) +# - Agent health (tmux sessions, recent logs) +# - Resource patterns (RAM, disk, load, containers) +# Outcome signals: +# - Output freshness (formula evidence/artifacts) +# - Capacity utilization (idle agents vs dispatchable work) +# - Throughput (recently closed issues, merged PRs) +# External signals: +# - Dependency security advisories +# - Upstream breaking changes and deprecations name = "run-predictor" -description = "Evidence-based prediction: CI trends, stale issues, agent health, resource patterns" -version = 1 +description = "Evidence-based prediction: health, outcome measurement, external environment signals" +version = 2 model = "sonnet" [context] @@ -114,6 +122,59 @@ Also check prediction/backlog (watched but not yet actioned): 
"$CODEBERG_API/issues?state=open&type=issues&labels=prediction%2Fbacklog&limit=50" Record their titles so you can avoid duplicating them. + +### 6. Outcome measurement + +Check whether the factory is producing results, not just running: + +- Read RESOURCES.md for available formulas and capabilities +- Read $PROJECT_REPO_ROOT/formulas/*.toml for dispatchable work +- Check evidence/output directories for freshness: + find "$PROJECT_REPO_ROOT" -maxdepth 3 -name "*.log" -o -name "journal" -type d | \ + while read -r f; do + echo "=== $f ===" + find "$f" -maxdepth 1 -type f -printf '%T+ %p\n' 2>/dev/null | sort -r | head -5 + done +- Check recently closed issues — is work completing or just cycling? + curl -sf -H "Authorization: token $CODEBERG_TOKEN" \ + "$CODEBERG_API/issues?state=closed&type=issues&limit=20&sort=updated&direction=desc" +- Check recently merged PRs — what's the throughput? + curl -sf -H "Authorization: token $CODEBERG_TOKEN" \ + "$CODEBERG_API/pulls?state=closed&sort=updated&direction=desc&limit=20" | \ + jq '[.[] | select(.merged)]' +- Compare available capacity vs actual utilization: + tmux list-sessions 2>/dev/null | wc -l # active sessions + curl -sf -H "Authorization: token $CODEBERG_TOKEN" \ + "$CODEBERG_API/issues?state=open&type=issues&labels=backlog&limit=50" | jq 'length' + +Look for: +- Formulas that haven't produced output recently (stale journals/logs) +- Idle compute when dispatchable work exists (backlog items but no active sessions) +- High churn (issues opened and closed rapidly without merged PRs) +- Low throughput relative to available agents + +### 7. External environment scan + +Look outside the box for signals that could affect the project: + +- Identify key dependencies from the project (package.json, go.mod, Cargo.toml, + requirements.txt, or similar — whatever exists in $PROJECT_REPO_ROOT) +- Identify key tools (Claude CLI version, Woodpecker CI, Caddy, Docker, etc.) 
+- For each major dependency or tool, use web search to check for: + - Security advisories or CVEs + - Breaking changes in recent releases + - Deprecation notices + - Major version bumps that could require migration + +Use WebSearch to gather these signals. Be targeted — search for specific +dependencies and tools used by the project, not general news. +Limit to 5 web searches maximum to keep the run fast. + +Look for: +- CVEs or security advisories mentioning project dependencies +- Major version releases of key tools (could break CI, require migration) +- Deprecation notices for APIs or services in use +- Ecosystem shifts that could obsolete current approaches """ needs = ["preflight"] @@ -149,6 +210,28 @@ Analyze the collected signals for patterns and file up to 5 prediction issues. - Box idle (RAM > 3000MB, load < 1.0, few active sessions) → good time for expensive operations if any are pending +**Low throughput** — Factory running but not producing: +- No issues closed in 7+ days despite available backlog → pipeline may be stuck +- PRs merged but no issues closed → work not tracked properly +- Agent sessions active but no PRs created → agents may be spinning +- Formulas with no recent journal entries → agent may not be running + +**Idle capacity** — Dispatchable work not being picked up: +- Backlog items available but no in-progress issues → dev-poll may be stuck +- Multiple agents idle (few tmux sessions) with work queued → scheduling problem +- High churn: issues opened and closed quickly without PRs → busy but not productive + +**External risk** — Threats from outside the project: +- CVE or security advisory for a project dependency → patch urgently +- Major version release of a key tool → may require migration planning +- Deprecation notice for an API or service in use → plan transition +- Breaking change upstream that could affect CI or builds → investigate + +**External opportunity** — Beneficial changes in the ecosystem: +- New tool release that 
could accelerate work → consider adoption +- Upstream improvement that simplifies current workarounds → refactor opportunity +- Security patch available for a known vulnerability → apply proactively + ## Filing predictions For each prediction, create a Codeberg issue with the `prediction/unreviewed` label. @@ -177,10 +260,12 @@ For each prediction, create a Codeberg issue with the `prediction/unreviewed` la ## Rules - Max 5 predictions total -- Do NOT predict feature work — only infrastructure/health/metric observations +- Do NOT predict feature work — only health observations, outcome measurements, + and external risk/opportunity signals - Do NOT duplicate existing open predictions (checked in collect-signals) - Be specific: name the metric, the value, the threshold - Prefer high-confidence predictions backed by concrete data +- External signals must name the specific dependency/tool and the advisory/change - If no meaningful patterns found, file zero issues — that is a valid outcome """ diff --git a/predictor/AGENTS.md b/predictor/AGENTS.md index c7c34ea..0f8e738 100644 --- a/predictor/AGENTS.md +++ b/predictor/AGENTS.md @@ -1,14 +1,22 @@ # Predictor Agent -**Role**: Infrastructure pattern detection (the "goblin"). Runs a 3-step +**Role**: Risk oracle and opportunity spotter (the "goblin"). Runs a 3-step formula (preflight → collect-signals → analyze-and-predict) via interactive -tmux Claude session (sonnet). Collects disinto-specific signals: CI pipeline -trends (Woodpecker), stale issues, agent health (tmux sessions + logs), and -resource patterns (RAM, disk, load, containers). Files up to 5 -`prediction/unreviewed` issues for the Planner to triage. The predictor MUST -NOT emit feature work — only observations about CI health, issue staleness, -agent status, and system conditions. +tmux Claude session (sonnet). Collects three categories of signals: + +1. 
**Health signals** — CI pipeline trends (Woodpecker), stale issues, agent + health (tmux sessions + logs), resource patterns (RAM, disk, load, containers) +2. **Outcome signals** — output freshness (formula journals/artifacts), capacity + utilization (idle agents vs dispatchable backlog), throughput (closed issues, + merged PRs, churn detection) +3. **External signals** — dependency security advisories, upstream breaking + changes, deprecation notices, ecosystem shifts (via targeted web search) + +Files up to 5 `prediction/unreviewed` issues for the Planner to triage. +Predictions cover both "things going wrong" and "opportunities being missed". +The predictor MUST NOT emit feature work — only observations about health, +outcomes, and external risks/opportunities. **Trigger**: `predictor-run.sh` runs daily at 06:00 UTC via cron (1h before the planner at 07:00). Guarded by PID lock (`/tmp/predictor-run.lock`) and @@ -31,6 +39,8 @@ memory check (skips if available RAM < 2000 MB). **Lifecycle**: predictor-run.sh (daily 06:00 cron) → lock + memory guard → load formula + context → create tmux session → Claude collects signals -(CI trends, stale issues, agent health, resources) → dedup against existing -open predictions → file `prediction/unreviewed` issues → `PHASE:done`. +(health: CI trends, stale issues, agent health, resources; outcomes: output +freshness, capacity utilization, throughput; external: dependency advisories, +ecosystem changes via web search) → dedup against existing open predictions → +file `prediction/unreviewed` issues → `PHASE:done`. The planner's Phase 1 later triages these predictions. diff --git a/predictor/predictor-run.sh b/predictor/predictor-run.sh index 2c43a4f..d8b6556 100755 --- a/predictor/predictor-run.sh +++ b/predictor/predictor-run.sh @@ -57,10 +57,16 @@ build_prompt_footer # shellcheck disable=SC2034 # consumed by run_formula_and_monitor PROMPT="You are the prediction agent (goblin) for ${CODEBERG_REPO}. 
Work through the formula below. You MUST write PHASE:done to '${PHASE_FILE}' when finished — the orchestrator will time you out if you return to the prompt without signalling. -Your role: spot patterns in infrastructure signals and file them as prediction issues. +Your role: spot patterns across three signal categories and file them as prediction issues: +1. Health signals — CI trends, agent status, resource pressure, stale issues +2. Outcome signals — output freshness, capacity utilization, throughput +3. External signals — dependency advisories, upstream changes, ecosystem shifts + The planner (adult) will triage every prediction before acting. You MUST NOT emit feature work or implementation issues — only predictions -about CI health, issue staleness, agent status, and system conditions. +about health, outcomes, and external risks/opportunities. +Use WebSearch for external signal scanning — be targeted (project dependencies +and tools only, not general news). Limit to 5 web searches per run. ## Project context ${CONTEXT_BLOCK}