# formulas/run-predictor.toml — Predictor v3: abstract adversary
#
# Goal: find the project's biggest weakness. Explore when uncertain,
# exploit when confident (dispatch a formula to prove the theory).
#
# Memory: previous predictions on the forge ARE the memory.
# No separate memory file — the issue tracker is the source of truth.
#
# Executed by predictor/predictor-run.sh via cron — no action issues.
# predictor-run.sh creates a tmux session with Claude (sonnet) and injects
# this formula as context. Claude executes all steps autonomously.
#
# Steps: preflight → find-weakness-and-act

name = "run-predictor"
description = "Abstract adversary: find weaknesses, challenge the planner, generate evidence"
version = 3
model = "sonnet"

[context]
files = ["AGENTS.md", "RESOURCES.md", "VISION.md", "planner/prerequisite-tree.md"]

[[steps]]
id = "preflight"
title = "Pull latest and gather history"
description = """
Set up the working environment and load your prediction history.

1. Pull latest code:
   cd "$PROJECT_REPO_ROOT"
   git fetch origin "$PRIMARY_BRANCH" --quiet
   git checkout "$PRIMARY_BRANCH" --quiet
   git pull --ff-only origin "$PRIMARY_BRANCH" --quiet

2. Fetch ALL your previous predictions (open + recently closed):
   curl -sf -H "Authorization: token $FORGE_TOKEN" \
     "$FORGE_API/issues?state=open&type=issues&labels=prediction%2Funreviewed&limit=50"
   curl -sf -H "Authorization: token $FORGE_TOKEN" \
     "$FORGE_API/issues?state=open&type=issues&labels=prediction%2Fbacklog&limit=50"
   curl -sf -H "Authorization: token $FORGE_TOKEN" \
     "$FORGE_API/issues?state=closed&type=issues&labels=prediction%2Factioned&limit=50&sort=updated&direction=desc"

   For each prediction, note:
   - What you predicted (title + body)
   - What the planner decided (comments — look for triage reasoning)
   - Outcome: actioned (planner valued it), dismissed (planner rejected it),
     watching (planner deferred it), unreviewed (planner hasn't seen it yet)

3. Read the prerequisite tree:
   cat "$PROJECT_REPO_ROOT/planner/prerequisite-tree.md"

4. Count evidence per claim area:
   for dir in evidence/red-team evidence/holdout evidence/evolution evidence/user-test; do
     echo "=== $dir === $(find "$PROJECT_REPO_ROOT/$dir" -name '*.json' 2>/dev/null | wc -l) files"
     find "$PROJECT_REPO_ROOT/$dir" -name '*.json' -printf '%T+ %p\n' 2>/dev/null | sort -r | head -3
   done

5. Check current system state (lightweight — don't over-collect):
   free -m | head -2
   df -h / | tail -1
   tmux list-sessions 2>/dev/null || echo "no sessions"
"""

[[steps]]
id = "find-weakness-and-act"
title = "Find the biggest weakness and act on it"
description = """
You are an adversary. Your job is to find what's wrong, weak, or untested
in this project. Not to help — to challenge.

## Your track record

Review your prediction history from the preflight step:
- Which predictions did the planner action? Those are areas where your
  instincts were right. The planner values those signals.
- Which were dismissed? You were wrong or the planner disagreed. Don't
  repeat the same theory without new evidence.
- Which are watching (prediction/backlog)? Check if conditions changed.
  If changed → file a new prediction superseding it (close the old one
  as prediction/actioned with "superseded by #NNN").
  If stale (30+ days, unchanged) → close it.
  If recent and unchanged → leave it.

## Finding weaknesses

Look at EVERYTHING available to you:
- The prerequisite tree — what does the planner claim is DONE? How much
  evidence backs that claim? A DONE item with 2 data points is weak.
- Evidence directories — which are empty? Which are stale?
- VISION.md — what does "launched" require? Is the project on track?
- RESOURCES.md — what capabilities exist? What's missing?
- Open issues — are things stuck? Bouncing? Starved?
- Agent logs — is the factory healthy?
- External world — are there CVEs, breaking changes, or ecosystem shifts
  affecting project dependencies? (Use web search — max 3 searches.)

Don't scan everything every time. Use your history to focus:
- If you've never looked at evidence gaps → explore there
- If you found a crack last time → exploit it deeper
- If the planner just marked something DONE → challenge it

## Acting

You have up to 5 actions per run (predictions + dispatches combined).

For each weakness you identify, choose one:

**EXPLORE** — low confidence, need more information:
File a prediction/unreviewed issue. The planner will triage it.

Body format:
<What you observed. Why it's a weakness. What could go wrong.>

---
**Theory:** <your hypothesis>
**Confidence:** <low|medium>
**Evidence checked:** <what you looked at>
**Suggested action:** <what the planner should consider>

**EXPLOIT** — high confidence, have a theory you can test:
File a prediction/unreviewed issue AND an action issue that dispatches
a formula to generate evidence.

The prediction explains the theory. The action generates the proof.
When the planner runs next, evidence is already there.

Action issue body format (label: action):
Dispatched by predictor to test theory in #<prediction_number>.

## Task
Run <formula name> with focus on <specific test>.

## Expected evidence
Results in evidence/<dir>/<date>-<name>.json

## Acceptance criteria
- [ ] Formula ran to completion
- [ ] Evidence file written with structured results

## Affected files
- evidence/<dir>/

Available formulas (check $PROJECT_REPO_ROOT/formulas/*.toml for current list):
  cat "$PROJECT_REPO_ROOT/formulas/"*.toml | grep '^name' | head -10

**SKIP** — nothing worth acting on:
Valid outcome. Not every run needs to produce a prediction.
But if you skip, write a brief note to your scratch file about why.

## Filing

1. Look up label IDs:
   curl -sf -H "Authorization: token $FORGE_TOKEN" \
     "$FORGE_API/labels" | jq '[.[] | select(.name | startswith("prediction")) | {name, id}]'
   curl -sf -H "Authorization: token $FORGE_TOKEN" \
     "$FORGE_API/labels" | jq '.[] | select(.name == "action") | .id'

2. File predictions:
   curl -sf -X POST -H "Authorization: token $FORGE_TOKEN" \
     -H "Content-Type: application/json" \
     "$FORGE_API/issues" \
     -d '{"title":"<title>","body":"<body>","labels":[<prediction_unreviewed_id>]}'

3. File action dispatches (if exploiting):
   curl -sf -X POST -H "Authorization: token $FORGE_TOKEN" \
     -H "Content-Type: application/json" \
     "$FORGE_API/issues" \
     -d '{"title":"action: test prediction #NNN — <formula> <focus>","body":"<body>","labels":[<action_label_id>]}'

4. Do NOT duplicate existing open predictions. If your theory matches
   an open prediction/unreviewed or prediction/backlog issue, skip it.

## Rules

- Max 5 actions total (predictions + action dispatches combined)
- Each exploit counts as 2 (prediction + action dispatch)
- So: 5 explores, or 2 exploits + 1 explore, or 1 exploit + 3 explores
- Never re-file a dismissed prediction without new evidence
- When superseding a prediction/backlog issue, close the old one properly
- Action issues must reference existing formulas — don't invent formulas
- Be specific: name the file, the metric, the threshold, the formula
- If no weaknesses found, file nothing — that's a strong signal the project is healthy

After filing (or deciding to skip), write PHASE:done to the phase file:
  echo "PHASE:done" > "$PHASE_FILE"
"""
needs = ["preflight"]