- Remove CLAUDE_TIMEOUT no-op override — inherit factory default (7200s) from env.sh - Use anchored grep -qxF "NO_PREDICTIONS" to avoid false early exits - Fetch closed PRs (state=closed, merged_at filter) instead of open — captures merged activity signals - Parse staleness age from filename date (YYYY-MM-DD.json) instead of file mtime - Log a warning when date -d falls back due to non-GNU date - Add comment explaining global lock serialisation trade-off Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
277 lines
12 KiB
Bash
Executable file
277 lines
12 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# =============================================================================
|
|
# prediction-agent.sh — Per-project LLM prediction agent
|
|
#
|
|
# Reads structured evidence from the project's evidence/ directory plus
|
|
# secondary Codeberg signals, then asks Claude to identify patterns and
|
|
# file up to 5 prediction/unreviewed issues for the planner to triage.
|
|
#
|
|
# The predictor is the goblin — it sees patterns and shouts about them.
|
|
# The planner is the adult — it triages every prediction before acting.
|
|
# The predictor MUST NOT emit feature work directly.
|
|
#
|
|
# Signal sources:
|
|
# evidence/red-team/ — attack results, floor status, vulnerability trends
|
|
# evidence/evolution/ — fitness scores, champion improvements
|
|
# evidence/user-test/ — persona journey completion, friction points
|
|
# evidence/holdout/ — scenario pass rates, quality gate history
|
|
# evidence/resources/ — CPU, RAM, disk, container utilization
|
|
# evidence/protocol/ — on-chain metrics from Ponder
|
|
#
|
|
# Secondary:
|
|
# Codeberg activity (new issues, merged PRs), system resource snapshot
|
|
#
|
|
# Usage: prediction-agent.sh [project-toml]
|
|
# Called by: prediction-poll.sh
|
|
# =============================================================================
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
|
|
|
|
export PROJECT_TOML="${1:-}"
|
|
# shellcheck source=../lib/env.sh
|
|
source "$FACTORY_ROOT/lib/env.sh"
|
|
|
|
LOG_FILE="$SCRIPT_DIR/prediction.log"
|
|
# env.sh already exports CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-7200}"; inherit that default
|
|
EVIDENCE_DIR="${PROJECT_REPO_ROOT}/evidence"
|
|
|
|
log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; }
|
|
|
|
log "--- prediction-agent start (project: ${PROJECT_NAME}) ---"
|
|
|
|
# ── Helpers ───────────────────────────────────────────────────────────────
|
|
|
|
# Find the most recent JSON file in a directory (files named YYYY-MM-DD.json
|
|
# sort correctly in alphabetical order).
|
|
latest_json() { find "$1" -maxdepth 1 -name '*.json' 2>/dev/null | sort | tail -1; }
|
|
prev_json() { find "$1" -maxdepth 1 -name '*.json' 2>/dev/null | sort | tail -2 | head -1; }
|
|
|
|
# ── Scan evidence/ directory ──────────────────────────────────────────────
|
|
EVIDENCE_SUMMARY=""
|
|
for subdir in red-team evolution user-test holdout resources protocol; do
|
|
subdir_path="${EVIDENCE_DIR}/${subdir}"
|
|
|
|
if [ ! -d "$subdir_path" ]; then
|
|
EVIDENCE_SUMMARY="${EVIDENCE_SUMMARY}
|
|
=== evidence/${subdir} ===
|
|
(no data — directory not yet created)"
|
|
continue
|
|
fi
|
|
|
|
latest=$(latest_json "$subdir_path")
|
|
if [ -z "$latest" ]; then
|
|
EVIDENCE_SUMMARY="${EVIDENCE_SUMMARY}
|
|
=== evidence/${subdir} ===
|
|
(no data — no JSON files found)"
|
|
continue
|
|
fi
|
|
|
|
latest_name=$(basename "$latest")
|
|
# Derive age from the date in the filename (YYYY-MM-DD.json) — more reliable
|
|
# than mtime, which changes when files are copied or synced.
|
|
file_date=$(basename "$latest" .json)
|
|
file_ts=$(date -d "$file_date" +%s 2>/dev/null || date -r "$latest" +%s)
|
|
now_ts=$(date +%s)
|
|
age_hours=$(( (now_ts - file_ts) / 3600 ))
|
|
content=$(head -c 3000 "$latest" 2>/dev/null || echo "{}")
|
|
|
|
prev=$(prev_json "$subdir_path")
|
|
prev_section=""
|
|
if [ -n "$prev" ] && [ "$prev" != "$latest" ]; then
|
|
prev_name=$(basename "$prev")
|
|
prev_content=$(head -c 2000 "$prev" 2>/dev/null || echo "{}")
|
|
prev_section="
|
|
previous: ${prev_name}
|
|
previous_content: ${prev_content}"
|
|
fi
|
|
|
|
EVIDENCE_SUMMARY="${EVIDENCE_SUMMARY}
|
|
=== evidence/${subdir} ===
|
|
latest: ${latest_name} (age: ${age_hours}h, path: ${latest})
|
|
content: ${content}${prev_section}"
|
|
done
|
|
|
|
# ── Secondary signals — Codeberg activity (last 24h) ─────────────────────
|
|
SINCE_ISO=$(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || true)
|
|
if [ -z "$SINCE_ISO" ]; then
|
|
log "WARN: date -d '24 hours ago' failed (non-GNU date?) — skipping Codeberg activity"
|
|
fi
|
|
RECENT_ISSUES=""
|
|
RECENT_PRS=""
|
|
if [ -n "$SINCE_ISO" ]; then
|
|
RECENT_ISSUES=$(codeberg_api GET "/issues?state=open&type=issues&limit=20&sort=newest" 2>/dev/null | \
|
|
jq -r --arg since "$SINCE_ISO" \
|
|
'.[] | select(.created_at >= $since) | " #\(.number) [\(.labels | map(.name) | join(","))] \(.title)"' \
|
|
2>/dev/null || true)
|
|
# Use state=closed to capture recently-merged PRs — merged activity is the
|
|
# key signal (e.g. new red-team PR merged since last evolution run).
|
|
RECENT_PRS=$(codeberg_api GET "/pulls?state=closed&limit=20&sort=newest" 2>/dev/null | \
|
|
jq -r --arg since "$SINCE_ISO" \
|
|
'.[] | select(.merged_at != null and .merged_at >= $since) | " #\(.number) \(.title) (merged \(.merged_at[:10]))"' \
|
|
2>/dev/null || true)
|
|
fi
|
|
|
|
# ── Already-open predictions (avoid duplicates) ───────────────────────────
|
|
OPEN_PREDICTIONS=$(codeberg_api GET "/issues?state=open&type=issues&labels=prediction%2Funreviewed&limit=50" 2>/dev/null | \
|
|
jq -r '.[] | " #\(.number) \(.title)"' 2>/dev/null || true)
|
|
|
|
# ── System resource snapshot ──────────────────────────────────────────────
|
|
AVAIL_MB=$(free -m | awk '/Mem:/{print $7}' 2>/dev/null || echo "unknown")
|
|
DISK_PCT=$(df -h / | awk 'NR==2{print $5}' | tr -d '%' 2>/dev/null || echo "unknown")
|
|
LOAD_AVG=$(cut -d' ' -f1-3 /proc/loadavg 2>/dev/null || echo "unknown")
|
|
ACTIVE_SESSIONS=$(tmux list-sessions 2>/dev/null | \
|
|
grep -cE "^(dev|action|gardener|review)-" || echo "0")
|
|
|
|
# ── Build prompt ──────────────────────────────────────────────────────────
|
|
PROMPT="You are the prediction agent (goblin) for ${CODEBERG_REPO}.
|
|
|
|
Your role: spot patterns in evidence and signal them as prediction issues.
|
|
The planner (adult) will triage every prediction before acting.
|
|
You MUST NOT emit feature work or implementation issues — only predictions
|
|
about evidence state, metric trends, and system conditions.
|
|
|
|
## Evidence from evidence/ directory
|
|
${EVIDENCE_SUMMARY}
|
|
|
|
## System resource snapshot (right now)
|
|
Available RAM: ${AVAIL_MB}MB
|
|
Disk used: ${DISK_PCT}%
|
|
Load avg (1/5/15 min): ${LOAD_AVG}
|
|
Active agent sessions (tmux): ${ACTIVE_SESSIONS}
|
|
|
|
## Recent Codeberg activity (last 24h)
|
|
New issues:
|
|
${RECENT_ISSUES:- (none)}
|
|
|
|
Recently merged PRs (last 24h):
|
|
${RECENT_PRS:- (none)}
|
|
|
|
## Already-open predictions (do NOT duplicate these)
|
|
${OPEN_PREDICTIONS:- (none)}
|
|
|
|
## What to look for
|
|
|
|
**Staleness** — Evidence older than its expected refresh interval:
|
|
- red-team: stale after 7 days
|
|
- evolution: stale after 7 days
|
|
- user-test: stale after 14 days
|
|
- holdout: stale after 7 days
|
|
- resources: stale after 1 day
|
|
- protocol: stale after 1 day
|
|
- any directory missing entirely: flag as critical gap
|
|
|
|
**Regression** — Metrics worse in latest vs previous run:
|
|
- Decreased: fitness score, pass rate, conversion, floor price
|
|
- Increased: error count, risk score, ETH extracted by attacker
|
|
- Only flag if change is meaningful (>5% relative, or clearly significant)
|
|
|
|
**Opportunity** — Conditions that make a process worth running now:
|
|
- Box is relatively idle (RAM>2000MB, load<2.0, no active agent sessions)
|
|
AND evidence is stale — good time to run evolution or red-team
|
|
- New attack vectors in red-team since last evolution run → evolution scores stale
|
|
|
|
**Risk** — Conditions that suggest deferring expensive work:
|
|
- RAM<1500MB or disk>85% or load>3.0 → defer evolution/red-team
|
|
- Active dev session in progress on related work
|
|
|
|
## Output format
|
|
|
|
For each prediction, output a JSON object on its own line (no array wrapper,
|
|
no markdown fences):
|
|
|
|
{\"title\": \"...\", \"signal_source\": \"...\", \"confidence\": \"high|medium|low\", \"suggested_action\": \"...\", \"body\": \"...\"}
|
|
|
|
Fields:
|
|
- title: Short declarative statement of what you observed. Not an action.
|
|
- signal_source: Which evidence file or signal triggered this
|
|
(e.g. \"evidence/evolution/2024-01-15.json\", \"system resources\",
|
|
\"evidence/red-team/ missing\")
|
|
- confidence: high (clear numerical evidence), medium (trend/pattern),
|
|
low (inferred or absent data but important to flag)
|
|
- suggested_action: Concrete next step for the planner —
|
|
\"run formula X\", \"file issue for Y\", \"escalate to human\",
|
|
\"monitor for N days\", \"run process X\"
|
|
- body: 2-4 sentences. What changed or is missing, why it matters,
|
|
what the planner should consider doing. Be specific: name the file,
|
|
metric, and value.
|
|
|
|
## Rules
|
|
- Max 5 predictions total
|
|
- Do NOT predict feature work — only evidence/metric/system observations
|
|
- Do NOT duplicate existing open predictions (listed above)
|
|
- Do NOT predict things you cannot support with the evidence provided
|
|
- Prefer high-confidence predictions; emit low-confidence only when the
|
|
signal is important (e.g. missing critical evidence)
|
|
- Be specific: name the file, the metric, the value
|
|
|
|
If you see no meaningful patterns, output exactly: NO_PREDICTIONS
|
|
|
|
Output ONLY the JSON lines (or NO_PREDICTIONS) — no preamble, no markdown."
|
|
|
|
# ── Invoke Claude (one-shot) ──────────────────────────────────────────────
|
|
log "invoking claude -p for ${PROJECT_NAME} predictions"
|
|
CLAUDE_OUTPUT=$(timeout "$CLAUDE_TIMEOUT" claude -p "$PROMPT" \
|
|
--model sonnet \
|
|
2>/dev/null) || {
|
|
EXIT_CODE=$?
|
|
log "ERROR: claude exited with code $EXIT_CODE"
|
|
exit 1
|
|
}
|
|
|
|
log "claude finished ($(printf '%s' "$CLAUDE_OUTPUT" | wc -c) bytes)"
|
|
|
|
if printf '%s' "$CLAUDE_OUTPUT" | grep -qxF "NO_PREDICTIONS"; then
|
|
log "no predictions — evidence looks healthy for ${PROJECT_NAME}"
|
|
log "--- prediction-agent done ---"
|
|
exit 0
|
|
fi
|
|
|
|
# ── Look up prediction/unreviewed label ───────────────────────────────────
|
|
PREDICTION_LABEL_ID=$(codeberg_api GET "/labels" 2>/dev/null | \
|
|
jq -r '.[] | select(.name == "prediction/unreviewed") | .id' 2>/dev/null || true)
|
|
if [ -z "$PREDICTION_LABEL_ID" ]; then
|
|
log "WARN: 'prediction/unreviewed' label not found — issues created without label (see #141)"
|
|
fi
|
|
|
|
# ── Create prediction issues ──────────────────────────────────────────────
|
|
CREATED=0
|
|
while IFS= read -r line; do
|
|
[ -z "$line" ] && continue
|
|
# Skip non-JSON lines
|
|
printf '%s' "$line" | jq -e . >/dev/null 2>&1 || continue
|
|
|
|
TITLE=$(printf '%s' "$line" | jq -r '.title')
|
|
SIGNAL=$(printf '%s' "$line" | jq -r '.signal_source // "unknown"')
|
|
CONFIDENCE=$(printf '%s' "$line" | jq -r '.confidence // "medium"')
|
|
ACTION=$(printf '%s' "$line" | jq -r '.suggested_action // ""')
|
|
BODY_TEXT=$(printf '%s' "$line" | jq -r '.body')
|
|
|
|
FULL_BODY="${BODY_TEXT}
|
|
|
|
---
|
|
**Signal source:** ${SIGNAL}
|
|
**Confidence:** ${CONFIDENCE}
|
|
**Suggested action:** ${ACTION}"
|
|
|
|
CREATE_PAYLOAD=$(jq -nc --arg t "$TITLE" --arg b "$FULL_BODY" \
|
|
'{title: $t, body: $b}')
|
|
|
|
if [ -n "$PREDICTION_LABEL_ID" ]; then
|
|
CREATE_PAYLOAD=$(printf '%s' "$CREATE_PAYLOAD" | \
|
|
jq --argjson lid "$PREDICTION_LABEL_ID" '.labels = [$lid]')
|
|
fi
|
|
|
|
RESULT=$(codeberg_api POST "/issues" -d "$CREATE_PAYLOAD" 2>/dev/null || true)
|
|
ISSUE_NUM=$(printf '%s' "$RESULT" | jq -r '.number // "?"' 2>/dev/null || echo "?")
|
|
|
|
log "Created prediction #${ISSUE_NUM} [${CONFIDENCE}]: ${TITLE}"
|
|
matrix_send "predictor" "🔮 Prediction #${ISSUE_NUM} [${CONFIDENCE}]: ${TITLE} — ${ACTION}" \
|
|
2>/dev/null || true
|
|
|
|
CREATED=$((CREATED + 1))
|
|
[ "$CREATED" -ge 5 ] && break
|
|
done <<< "$CLAUDE_OUTPUT"
|
|
|
|
log "--- prediction-agent done (created ${CREATED} predictions for ${PROJECT_NAME}) ---"
|