From 14e1c9ecde78f6d6d9e336357534f9a9cc2c10c0 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 23 Mar 2026 13:56:59 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20feat:=20predictor=20v3=20=E2=80=94=20abs?=
 =?UTF-8?q?tract=20adversary=20with=20explore/exploit=20and=20formula=20di?=
 =?UTF-8?q?spatch=20(#609)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 formulas/run-predictor.toml | 510 +++++++++++-------------------------
 predictor/AGENTS.md         |  53 ++--
 predictor/predictor-run.sh  |  14 +-
 3 files changed, 191 insertions(+), 386 deletions(-)

diff --git a/formulas/run-predictor.toml b/formulas/run-predictor.toml
index ac78e4a..2cb5d9e 100644
--- a/formulas/run-predictor.toml
+++ b/formulas/run-predictor.toml
@@ -1,385 +1,187 @@
-# formulas/run-predictor.toml — Predictor formula (disinto-specific signals)
+# formulas/run-predictor.toml — Predictor v3: abstract adversary
+#
+# Goal: find the project's biggest weakness. Explore when uncertain,
+# exploit when confident (dispatch a formula to prove the theory).
+#
+# Memory: previous predictions on Codeberg ARE the memory.
+# No separate memory file — the issue tracker is the source of truth.
 #
 # Executed by predictor/predictor-run.sh via cron — no action issues.
 # predictor-run.sh creates a tmux session with Claude (sonnet) and injects
 # this formula as context. Claude executes all steps autonomously.
 #
-# Steps: preflight → collect-signals → re-evaluate-backlog → analyze-and-predict
-#
-# Signal sources (three categories):
-#   Health signals:
-#     - CI pipeline trends (Woodpecker)
-#     - Stale issues (open issues with no recent activity)
-#     - Agent health (tmux sessions, recent logs)
-#     - Resource patterns (RAM, disk, load, containers)
-#   Outcome signals:
-#     - Output freshness (formula evidence/artifacts)
-#     - Capacity utilization (idle agents vs dispatchable work)
-#     - Throughput (recently closed issues, merged PRs)
-#   External signals:
-#     - Dependency security advisories
-#     - Upstream breaking changes and deprecations
+# Steps: preflight → find-weakness-and-act
 
 name        = "run-predictor"
-description = "Evidence-based prediction: health, outcome measurement, external environment signals"
-version     = 2
+description = "Abstract adversary: find weaknesses, challenge the planner, generate evidence"
+version     = 3
 model       = "sonnet"
 
 [context]
-files = ["AGENTS.md", "RESOURCES.md"]
+files = ["AGENTS.md", "RESOURCES.md", "VISION.md", "planner/prerequisite-tree.md"]
 
 [[steps]]
 id    = "preflight"
-title = "Pull latest code and gather environment"
+title = "Pull latest and gather history"
 description = """
-Set up the working environment for this prediction run.
+Set up the working environment and load your prediction history.
 
-1. Change to the project repository:
+1. Pull latest code:
      cd "$PROJECT_REPO_ROOT"
-
-2. Pull the latest code:
      git fetch origin "$PRIMARY_BRANCH" --quiet
      git checkout "$PRIMARY_BRANCH" --quiet
      git pull --ff-only origin "$PRIMARY_BRANCH" --quiet
-"""
 
-[[steps]]
-id    = "collect-signals"
-title = "Collect disinto-specific signals"
-description = """
-Gather raw signal data for pattern analysis. Collect each signal category
-and store the results for the analysis step.
-
-### 1. CI pipeline trends (Woodpecker)
-
-Fetch recent builds from Woodpecker CI:
-  curl -sf -H "Authorization: Bearer $WOODPECKER_TOKEN" \
-    "${WOODPECKER_SERVER}/api/repos/${WOODPECKER_REPO_ID}/pipelines?page=1&perPage=20"
-
-Look for:
-- Build failure rate over last 20 builds
-- Repeated failures on the same step
-- Builds stuck in running/pending state
-- Time since last successful build
-
-If WOODPECKER_TOKEN or WOODPECKER_SERVER are not set, skip CI signals and note
-"CI signals unavailable — WOODPECKER_TOKEN not configured".
-
-### 2. Stale issues
-
-Fetch all open issues:
-  curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-    "$CODEBERG_API/issues?state=open&type=issues&limit=50&sort=updated&direction=asc"
-
-Identify:
-- Issues with no update in 14+ days (stale)
-- Issues with no update in 30+ days (very stale)
-- Issues labeled 'action' or 'backlog' that are stale (work not progressing)
-- Blocked issues where the blocker may have been resolved
-
-### 3. Agent health
-
-Check active tmux sessions:
-  tmux list-sessions 2>/dev/null || echo "no sessions"
-
-Check recent agent logs (last 24h of activity):
-  for log in supervisor/supervisor.log planner/planner.log planner/prediction.log \
-             gardener/gardener.log dev/dev.log review/review.log; do
-    if [ -f "$PROJECT_REPO_ROOT/$log" ]; then
-      echo "=== $log (last 20 lines) ==="
-      tail -20 "$PROJECT_REPO_ROOT/$log"
-    fi
-  done
-
-Look for:
-- Agents that haven't run recently (missing log entries in last 24h)
-- Repeated errors or failures in logs
-- Sessions stuck or crashed (tmux sessions present but no recent activity)
-- Lock files that may be stale: /tmp/*-poll.lock, /tmp/*-run.lock
-
-### 4. Resource patterns
-
-Collect current resource state:
-  free -m                          # RAM
-  df -h /                          # Disk
-  cat /proc/loadavg                # Load average
-  docker ps --format '{{.Names}} {{.Status}}' 2>/dev/null || true  # Containers
-
-Look for:
-- Available RAM < 2000MB (agents will skip runs)
-- Disk usage > 80% (approaching danger zone)
-- Load average > 3.0 (box overloaded)
-- Containers in unhealthy or restarting state
-
-### 5. Already-open predictions (deduplication)
-
-Fetch existing open predictions to avoid duplicates:
-  curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-    "$CODEBERG_API/issues?state=open&type=issues&labels=prediction%2Funreviewed&limit=50"
-
-Also check prediction/backlog (watched but not yet actioned):
-  curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-    "$CODEBERG_API/issues?state=open&type=issues&labels=prediction%2Fbacklog&limit=50"
-
-Record their titles so you can avoid duplicating them.
-
-### 6. Outcome measurement
-
-Check whether the factory is producing results, not just running:
-
-- Read RESOURCES.md for available formulas and capabilities
-- Read $PROJECT_REPO_ROOT/formulas/*.toml for dispatchable work
-- Check evidence/output directories for freshness:
-    find "$PROJECT_REPO_ROOT" -maxdepth 3 -name "*.log" -o -name "journal" -type d | \
-      while read -r f; do
-        echo "=== $f ==="
-        find "$f" -maxdepth 1 -type f -printf '%T+ %p\n' 2>/dev/null | sort -r | head -5
-      done
-- Check recently closed issues — is work completing or just cycling?
-    curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-      "$CODEBERG_API/issues?state=closed&type=issues&limit=20&sort=updated&direction=desc"
-- Check recently merged PRs — what's the throughput?
-    curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-      "$CODEBERG_API/pulls?state=closed&sort=updated&direction=desc&limit=20" | \
-      jq '[.[] | select(.merged)]'
-- Compare available capacity vs actual utilization:
-    tmux list-sessions 2>/dev/null | wc -l  # active sessions
-    curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-      "$CODEBERG_API/issues?state=open&type=issues&labels=backlog&limit=50" | jq 'length'
-
-Look for:
-- Formulas that haven't produced output recently (stale journals/logs)
-- Idle compute when dispatchable work exists (backlog items but no active sessions)
-- High churn (issues opened and closed rapidly without merged PRs)
-- Low throughput relative to available agents
-
-### 7. External environment scan
-
-Look outside the box for signals that could affect the project:
-
-- Identify key dependencies from the project (package.json, go.mod, Cargo.toml,
-  requirements.txt, or similar — whatever exists in $PROJECT_REPO_ROOT)
-- Identify key tools (Claude CLI version, Woodpecker CI, Caddy, Docker, etc.)
-- For each major dependency or tool, use web search to check for:
-    - Security advisories or CVEs
-    - Breaking changes in recent releases
-    - Deprecation notices
-    - Major version bumps that could require migration
-
-Use WebSearch to gather these signals. Be targeted — search for specific
-dependencies and tools used by the project, not general news.
-Limit to 5 web searches maximum to keep the run fast.
-
-Look for:
-- CVEs or security advisories mentioning project dependencies
-- Major version releases of key tools (could break CI, require migration)
-- Deprecation notices for APIs or services in use
-- Ecosystem shifts that could obsolete current approaches
-"""
-needs = ["preflight"]
-
-[[steps]]
-id    = "re-evaluate-backlog"
-title = "Re-evaluate open prediction/backlog watches"
-description = """
-Re-check prediction/backlog issues to detect changed conditions or stale watches.
-The collect-signals step already fetched prediction/backlog issues (step 5).
-Now actively re-evaluate each one instead of just using them for dedup.
-
-For each open prediction/backlog issue:
-
-### 1. Read context
-
-Fetch the issue body and all comments:
-  curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-    "$CODEBERG_API/issues/<issue_number>"
-  curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-    "$CODEBERG_API/issues/<issue_number>/comments"
-
-Pay attention to:
-- The original prediction body (signal source, confidence, suggested action)
-- The planner's triage comment (the "Watching — ..." comment with reasoning)
-- Any subsequent comments with updated context
-- The issue's created_at and updated_at timestamps
-
-### 2. Extract conditions
-
-From the planner's triage comment and original prediction body, identify the
-specific assumptions that made this a "watch, don't act" decision. Examples:
-- "static site config, no FastCGI" (Caddy CVE watch)
-- "RAM stable above 3GB" (resource pressure watch)
-- "no reverse proxy configured" (security exposure watch)
-- "dependency not in use yet" (CVE watch for unused feature)
-
-### 3. Re-check conditions
-
-Verify each assumption still holds by checking current system state:
-- Config files: read relevant configs in $PROJECT_REPO_ROOT
-- Versions: check installed versions of referenced tools/dependencies
-- Infrastructure: re-run relevant resource/health checks from collect-signals
-- Code changes: check git log for changes to affected files since the issue was created:
-    git log --oneline --since="<issue_created_at>" -- <affected_files>
-
-### 4. Decide
-
-For each prediction/backlog issue, choose one action:
-
-**CONDITIONS_CHANGED** — one or more assumptions no longer hold:
-  a. Resolve the prediction/backlog and prediction/unreviewed label IDs:
-       curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-         "$CODEBERG_API/labels" | jq '.[] | select(.name == "prediction/unreviewed") | .id'
-       curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-         "$CODEBERG_API/labels" | jq '.[] | select(.name == "prediction/actioned") | .id'
-  b. File a NEW prediction/unreviewed issue with updated context:
-       curl -sf -X POST -H "Authorization: token $CODEBERG_TOKEN" \
-         -H "Content-Type: application/json" \
-         "$CODEBERG_API/issues" \
-         -d '{"title":"<original title> — CONDITIONS CHANGED",
-              "body":"Re-evaluation of #<old_number>: conditions have changed.\\n\\n<what changed and why risk level is different now>\\n\\nOriginal prediction: #<old_number>\\n\\n---\\n**Signal source:** re-evaluation of prediction/backlog #<old_number>\\n**Confidence:** <high|medium|low>\\n**Suggested action:** <concrete next step>",
-              "labels":[<unreviewed_label_id>]}'
-  c. Comment on the OLD issue explaining what changed:
-       curl -sf -X POST -H "Authorization: token $CODEBERG_TOKEN" \
-         -H "Content-Type: application/json" \
-         "$CODEBERG_API/issues/<old_number>/comments" \
-         -d '{"body":"Superseded by #<new_number> — conditions changed: <summary>"}'
-  d. Relabel old issue: remove prediction/backlog, add prediction/actioned:
-       curl -sf -X DELETE -H "Authorization: token $CODEBERG_TOKEN" \
-         "$CODEBERG_API/issues/<old_number>/labels/<backlog_label_id>"
-       curl -sf -X POST -H "Authorization: token $CODEBERG_TOKEN" \
-         -H "Content-Type: application/json" \
-         "$CODEBERG_API/issues/<old_number>/labels" \
-         -d '{"labels":[<actioned_label_id>]}'
-  e. Close the old issue:
-       curl -sf -X PATCH -H "Authorization: token $CODEBERG_TOKEN" \
-         -H "Content-Type: application/json" \
-         "$CODEBERG_API/issues/<old_number>" \
-         -d '{"state":"closed"}'
-
-**STALE** — 30+ days since last update AND conditions unchanged:
-  a. Comment explaining the closure:
-       curl -sf -X POST -H "Authorization: token $CODEBERG_TOKEN" \
-         -H "Content-Type: application/json" \
-         "$CODEBERG_API/issues/<issue_number>/comments" \
-         -d '{"body":"Closing stale watch — conditions stable for 30+ days. Will re-file if conditions change."}'
-  b. Relabel: remove prediction/backlog, add prediction/actioned:
-       curl -sf -X DELETE -H "Authorization: token $CODEBERG_TOKEN" \
-         "$CODEBERG_API/issues/<issue_number>/labels/<backlog_label_id>"
-       curl -sf -X POST -H "Authorization: token $CODEBERG_TOKEN" \
-         -H "Content-Type: application/json" \
-         "$CODEBERG_API/issues/<issue_number>/labels" \
-         -d '{"labels":[<actioned_label_id>]}'
-  c. Close the issue:
-       curl -sf -X PATCH -H "Authorization: token $CODEBERG_TOKEN" \
-         -H "Content-Type: application/json" \
-         "$CODEBERG_API/issues/<issue_number>" \
-         -d '{"state":"closed"}'
-
-**UNCHANGED_RECENT** — conditions unchanged AND last update < 30 days ago:
-  Skip — no action needed. This is the current behavior.
-
-## Rules
-- Process ALL open prediction/backlog issues (already fetched in collect-signals step 5)
-- New predictions filed here count toward the 5-prediction cap in analyze-and-predict
-- Track how many new predictions were filed so analyze-and-predict can adjust its cap
-- Be conservative: only mark CONDITIONS_CHANGED when you have concrete evidence
-- Use the updated_at timestamp from the issue API to determine staleness
-"""
-needs = ["collect-signals"]
-
-[[steps]]
-id    = "analyze-and-predict"
-title = "Analyze signals and file prediction issues"
-description = """
-Analyze the collected signals for patterns and file prediction issues.
-
-The re-evaluate-backlog step may have already filed new predictions from changed
-conditions. Subtract those from the 5-prediction cap: if re-evaluation filed N
-predictions, you may file at most (5 - N) new predictions in this step.
-
-## What to look for
-
-**CI regression** — Build failure rate increasing or repeated failures:
-- Failure rate > 30% over last 20 builds → high confidence
-- Same step failing 3+ times in a row → high confidence
-- No successful build in 24+ hours → medium confidence
-
-**Stale work** — Issues not progressing:
-- Action issues stale 7+ days → the action agent may be stuck
-- Backlog issues stale 14+ days → work not being picked up
-- Blocked issues whose blockers are now closed → can be unblocked
-
-**Agent health** — Agents not running or failing:
-- Agent log with no entries in 24+ hours → agent may be down
-- Repeated errors in agent logs → systemic problem
-- Stale lock files (process not running but lock exists)
-
-**Resource pressure** — System approaching limits:
-- RAM < 2000MB → agents will start skipping runs
-- Disk > 80% → approaching critical threshold
-- Load sustained > 3.0 → box is overloaded, queued work backing up
-
-**Opportunity** — Good conditions for expensive work:
-- Box idle (RAM > 3000MB, load < 1.0, few active sessions) → good time
-  for expensive operations if any are pending
-
-**Low throughput** — Factory running but not producing:
-- No issues closed in 7+ days despite available backlog → pipeline may be stuck
-- PRs merged but no issues closed → work not tracked properly
-- Agent sessions active but no PRs created → agents may be spinning
-- Formulas with no recent journal entries → agent may not be running
-
-**Idle capacity** — Dispatchable work not being picked up:
-- Backlog items available but no in-progress issues → dev-poll may be stuck
-- Multiple agents idle (few tmux sessions) with work queued → scheduling problem
-- High churn: issues opened and closed quickly without PRs → busy but not productive
-
-**External risk** — Threats or opportunities from outside:
-- CVE or security advisory for a project dependency → patch urgently
-- Major version release of a key tool → may require migration planning
-- Deprecation notice for an API or service in use → plan transition
-- Breaking change upstream that could affect CI or builds → investigate
-
-**External opportunity** — Beneficial changes in the ecosystem:
-- New tool release that could accelerate work → consider adoption
-- Upstream improvement that simplifies current workarounds → refactor opportunity
-- Security patch available for a known vulnerability → apply proactively
-
-## Filing predictions
-
-For each prediction, create a Codeberg issue with the `prediction/unreviewed` label.
-
-1. Look up the label ID:
+2. Fetch ALL your previous predictions (open + recently closed):
      curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
-       "$CODEBERG_API/labels" | jq '.[] | select(.name == "prediction/unreviewed") | .id'
+       "$CODEBERG_API/issues?state=open&type=issues&labels=prediction%2Funreviewed&limit=50"
+     curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
+       "$CODEBERG_API/issues?state=open&type=issues&labels=prediction%2Fbacklog&limit=50"
+     curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
+       "$CODEBERG_API/issues?state=closed&type=issues&labels=prediction%2Factioned&limit=50&sort=updated&direction=desc"
 
-2. For each prediction, create an issue:
+   For each prediction, note:
+   - What you predicted (title + body)
+   - What the planner decided (comments — look for triage reasoning)
+   - Outcome: actioned (planner valued it), dismissed (planner rejected it),
+     watching (planner deferred it), unreviewed (planner hasn't seen it yet)
+
+3. Read the prerequisite tree:
+     cat "$PROJECT_REPO_ROOT/planner/prerequisite-tree.md"
+
+4. Count evidence per claim area:
+     for dir in evidence/red-team evidence/holdout evidence/evolution evidence/user-test; do
+       echo "=== $dir === $(find "$PROJECT_REPO_ROOT/$dir" -name '*.json' 2>/dev/null | wc -l) files"
+       find "$PROJECT_REPO_ROOT/$dir" -name '*.json' -printf '%T+ %p\n' 2>/dev/null | sort -r | head -3
+     done
+
+5. Check current system state (lightweight — don't over-collect):
+     free -m | head -2
+     df -h / | tail -1
+     tmux list-sessions 2>/dev/null || echo "no sessions"
+"""
+
+[[steps]]
+id    = "find-weakness-and-act"
+title = "Find the biggest weakness and act on it"
+description = """
+You are an adversary. Your job is to find what's wrong, weak, or untested
+in this project. Not to help — to challenge.
+
+## Your track record
+
+Review your prediction history from the preflight step:
+- Which predictions did the planner action? Those are areas where your
+  instincts were right. The planner values those signals.
+- Which were dismissed? You were wrong or the planner disagreed. Don't
+  repeat the same theory without new evidence.
+- Which are watching (prediction/backlog)? Check if conditions changed.
+  If changed → file a new prediction superseding it (close the old one
+  as prediction/actioned with "superseded by #NNN").
+  If stale (30+ days, unchanged) → close it.
+  If recent and unchanged → leave it.
+
+## Finding weaknesses
+
+Look at EVERYTHING available to you:
+- The prerequisite tree — what does the planner claim is DONE? How much
+  evidence backs that claim? A DONE item with 2 data points is weak.
+- Evidence directories — which are empty? Which are stale?
+- VISION.md — what does "launched" require? Is the project on track?
+- RESOURCES.md — what capabilities exist? What's missing?
+- Open issues — are things stuck? Bouncing? Starved?
+- Agent logs — is the factory healthy?
+- External world — are there CVEs, breaking changes, or ecosystem shifts
+  affecting project dependencies? (Use web search — max 3 searches.)
+
+Don't scan everything every time. Use your history to focus:
+- If you've never looked at evidence gaps → explore there
+- If you found a crack last time → exploit it deeper
+- If the planner just marked something DONE → challenge it
+
+## Acting
+
+You have up to 5 actions per run (predictions + dispatches combined).
+
+For each weakness you identify, choose one:
+
+**EXPLORE** — low confidence, need more information:
+  File a prediction/unreviewed issue. The planner will triage it.
+
+  Body format:
+    <What you observed. Why it's a weakness. What could go wrong.>
+
+    ---
+    **Theory:** <your hypothesis>
+    **Confidence:** <low|medium>
+    **Evidence checked:** <what you looked at>
+    **Suggested action:** <what the planner should consider>
+
+**EXPLOIT** — high confidence, have a theory you can test:
+  File a prediction/unreviewed issue AND an action issue that dispatches
+  a formula to generate evidence.
+
+  The prediction explains the theory. The action generates the proof.
+  When the planner runs next, evidence is already there.
+
+  Action issue body format (label: action):
+    Dispatched by predictor to test theory in #<prediction_number>.
+
+    ## Task
+    Run <formula name> with focus on <specific test>.
+
+    ## Expected evidence
+    Results in evidence/<dir>/<date>-<name>.json
+
+    ## Acceptance criteria
+    - [ ] Formula ran to completion
+    - [ ] Evidence file written with structured results
+
+    ## Affected files
+    - evidence/<dir>/
+
+  Available formulas (check $PROJECT_REPO_ROOT/formulas/*.toml for current list):
+    cat "$PROJECT_REPO_ROOT/formulas/"*.toml | grep '^name' | head -10
+
+**SKIP** — nothing worth acting on:
+  Valid outcome. Not every run needs to produce a prediction.
+  But if you skip, write a brief note to your scratch file about why.
+
+## Filing
+
+1. Look up label IDs:
+     curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
+       "$CODEBERG_API/labels" | jq '[.[] | select(.name | startswith("prediction")) | {name, id}]'
+     curl -sf -H "Authorization: token $CODEBERG_TOKEN" \
+       "$CODEBERG_API/labels" | jq '.[] | select(.name == "action") | .id'
+
+2. File predictions:
      curl -sf -X POST -H "Authorization: token $CODEBERG_TOKEN" \
        -H "Content-Type: application/json" \
        "$CODEBERG_API/issues" \
-       -d '{"title":"<title>","body":"<body>","labels":[<label_id>]}'
+       -d '{"title":"<title>","body":"<body>","labels":[<prediction_unreviewed_id>]}'
 
-   Body format:
-     <2-4 sentence description of what was observed, why it matters,
-      what the planner should consider>
+3. File action dispatches (if exploiting):
+     curl -sf -X POST -H "Authorization: token $CODEBERG_TOKEN" \
+       -H "Content-Type: application/json" \
+       "$CODEBERG_API/issues" \
+       -d '{"title":"action: test prediction #NNN — <formula> <focus>","body":"<body>","labels":[<action_label_id>]}'
 
-     ---
-     **Signal source:** <which signal triggered this>
-     **Confidence:** <high|medium|low>
-     **Suggested action:** <concrete next step for the planner>
-
-3. Send a Matrix notification for each prediction created (optional):
-     Use matrix_send if available, or skip if MATRIX_TOKEN is not set.
+4. Do NOT duplicate existing open predictions. If your theory matches
+   an open prediction/unreviewed or prediction/backlog issue, skip it.
 
 ## Rules
-- Max 5 predictions total (including any filed during re-evaluate-backlog)
-- Do NOT predict feature work — only health observations, outcome measurements,
-  and external risk/opportunity signals
-- Do NOT duplicate existing open predictions (checked in collect-signals)
-- Do NOT duplicate predictions just filed by re-evaluate-backlog for changed conditions
-- Be specific: name the metric, the value, the threshold
-- Prefer high-confidence predictions backed by concrete data
-- External signals must name the specific dependency/tool and the advisory/change
-- If no meaningful patterns found, file zero issues — that is a valid outcome
 
+- Max 5 actions total (predictions + action dispatches combined)
+- Each exploit counts as 2 (prediction + action dispatch)
+- So: 5 explores, or 2 exploits + 1 explore, or 1 exploit + 3 explores
+- Never re-file a dismissed prediction without new evidence
+- When superseding a prediction/backlog issue, close the old one properly
+- Action issues must reference existing formulas — don't invent formulas
+- Be specific: name the file, the metric, the threshold, the formula
+- If no weaknesses found, file nothing — that's a strong signal the project is healthy
+
+After filing (or deciding to skip), write PHASE:done to the phase file:
+  echo "PHASE:done" > "$PHASE_FILE"
 """
-needs = ["re-evaluate-backlog"]
+needs = ["preflight"]
diff --git a/predictor/AGENTS.md b/predictor/AGENTS.md
index 0789b43..8b3bbea 100644
--- a/predictor/AGENTS.md
+++ b/predictor/AGENTS.md
@@ -1,22 +1,26 @@
 <!-- last-reviewed: eb7e24cb1df028c6061f47ddfdf9b4ebec33e1cf -->
 # Predictor Agent
 
-**Role**: Risk oracle and opportunity spotter (the "goblin"). Runs a 4-step
-formula (preflight → collect-signals → re-evaluate-backlog → analyze-and-predict)
-via interactive tmux Claude session (sonnet). Collects three categories of signals:
+**Role**: Abstract adversary (the "goblin"). Runs a 2-step formula
+(preflight → find-weakness-and-act) via interactive tmux Claude session
+(sonnet). Finds the project's biggest weakness, challenges planner claims,
+and generates evidence through explore/exploit decisions:
 
-1. **Health signals** — CI pipeline trends (Woodpecker), stale issues, agent
-   health (tmux sessions + logs), resource patterns (RAM, disk, load, containers)
-2. **Outcome signals** — output freshness (formula journals/artifacts), capacity
-   utilization (idle agents vs dispatchable backlog), throughput (closed issues,
-   merged PRs, churn detection)
-3. **External signals** — dependency security advisories, upstream breaking
-   changes, deprecation notices, ecosystem shifts (via targeted web search)
+- **Explore** (low confidence) — file a `prediction/unreviewed` issue for
+  the planner to triage
+- **Exploit** (high confidence) — file a prediction AND dispatch a formula
+  via an `action` issue to generate evidence before the planner even runs
 
-Files up to 5 `prediction/unreviewed` issues for the Planner to triage.
-Predictions cover both "things going wrong" and "opportunities being missed".
-The predictor MUST NOT emit feature work — only observations about health,
-outcomes, and external risks/opportunities.
+The predictor's own prediction history (open + closed issues) serves as its
+memory — it reviews what was actioned, dismissed, or deferred to decide where
+to focus next. No hardcoded signal categories; Claude decides where to look
+based on available data: prerequisite tree, evidence directories, VISION.md,
+RESOURCES.md, open issues, agent logs, and external signals (via web search).
+
+Takes up to 5 actions per run (predictions + dispatches combined). Each
+exploit counts as 2 (prediction + action dispatch). The predictor MUST NOT
+emit feature work — only observations challenging claims, exposing gaps,
+and surfacing risks.
 
 **Trigger**: `predictor-run.sh` runs daily at 06:00 UTC via cron (1h before
 the planner at 07:00). Guarded by PID lock (`/tmp/predictor-run.lock`) and
@@ -27,22 +31,21 @@ memory check (skips if available RAM < 2000 MB).
   sources disinto project config, builds prompt with formula + Codeberg API
   reference, creates tmux session (sonnet), monitors phase file, handles crash
   recovery via `run_formula_and_monitor`
-- `formulas/run-predictor.toml` — Execution spec: four steps (preflight,
-  collect-signals, re-evaluate-backlog, analyze-and-predict) with `needs`
-  dependencies. Claude collects signals, re-evaluates watched predictions,
-  and files prediction issues in a single interactive session
+- `formulas/run-predictor.toml` — Execution spec: two steps (preflight,
+  find-weakness-and-act) with `needs` dependencies. Claude reviews prediction
+  history, explores/exploits weaknesses, and files issues in a single
+  interactive session
 
 **Environment variables consumed**:
 - `CODEBERG_TOKEN`, `CODEBERG_REPO`, `CODEBERG_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`
 - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by predictor-run.sh)
-- `WOODPECKER_TOKEN`, `WOODPECKER_SERVER` — CI pipeline trend queries (optional; skipped if unset)
 - `MATRIX_TOKEN`, `MATRIX_ROOM_ID`, `MATRIX_HOMESERVER` — Notifications (optional)
 
 **Lifecycle**: predictor-run.sh (daily 06:00 cron) → lock + memory guard →
-load formula + context → create tmux session → Claude collects signals
-(health: CI trends, stale issues, agent health, resources; outcomes: output
-freshness, capacity utilization, throughput; external: dependency advisories,
-ecosystem changes via web search) → dedup against existing open predictions →
-re-evaluate prediction/backlog watches (close stale, supersede changed) →
-file `prediction/unreviewed` issues → `PHASE:done`.
+load formula + context (AGENTS.md, RESOURCES.md, VISION.md, prerequisite-tree.md)
+→ create tmux session → Claude fetches prediction history (open + closed) →
+reviews track record (actioned/dismissed/watching) → finds weaknesses
+(prerequisite tree gaps, thin evidence, stale watches, external risks) →
+dedup against existing open predictions → explore (file prediction) or exploit
+(file prediction + dispatch formula via action issue) → `PHASE:done`.
 The planner's Phase 1 later triages these predictions.
diff --git a/predictor/predictor-run.sh b/predictor/predictor-run.sh
index d8b6556..ce4badf 100755
--- a/predictor/predictor-run.sh
+++ b/predictor/predictor-run.sh
@@ -45,7 +45,7 @@ log "--- Predictor run start ---"
 
 # ── Load formula + context ───────────────────────────────────────────────
 load_formula "$FACTORY_ROOT/formulas/run-predictor.toml"
-build_context_block AGENTS.md RESOURCES.md
+build_context_block AGENTS.md RESOURCES.md VISION.md planner/prerequisite-tree.md
 
 # ── Read scratch file (compaction survival) ───────────────────────────────
 SCRATCH_CONTEXT=$(read_scratch_context "$SCRATCH_FILE")
@@ -57,16 +57,16 @@ build_prompt_footer
 # shellcheck disable=SC2034  # consumed by run_formula_and_monitor
 PROMPT="You are the prediction agent (goblin) for ${CODEBERG_REPO}. Work through the formula below. You MUST write PHASE:done to '${PHASE_FILE}' when finished — the orchestrator will time you out if you return to the prompt without signalling.
 
-Your role: spot patterns across three signal categories and file them as prediction issues:
-1. Health signals — CI trends, agent status, resource pressure, stale issues
-2. Outcome signals — output freshness, capacity utilization, throughput
-3. External signals — dependency advisories, upstream changes, ecosystem shifts
+Your role: abstract adversary. Find the project's biggest weakness, challenge
+planner claims, and generate evidence. Explore when uncertain (file a prediction),
+exploit when confident (file a prediction AND dispatch a formula via an action issue).
 
+Your prediction history IS your memory — review it to decide where to focus.
 The planner (adult) will triage every prediction before acting.
 You MUST NOT emit feature work or implementation issues — only predictions
-about health, outcomes, and external risks/opportunities.
+challenging claims, exposing gaps, and surfacing risks.
 Use WebSearch for external signal scanning — be targeted (project dependencies
-and tools only, not general news). Limit to 5 web searches per run.
+and tools only, not general news). Limit to 3 web searches per run.
 
 ## Project context
 ${CONTEXT_BLOCK}