fix: feat: triage agent — deep root cause analysis for reproduced bugs (#258) #337

Merged
dev-bot merged 1 commit from fix/issue-258 into main 2026-04-07 08:19:02 +00:00
2 changed files with 181 additions and 0 deletions

View file

@ -578,6 +578,131 @@ dispatch_reproduce() {
log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}"
}
# -----------------------------------------------------------------------------
# Triage dispatch — launch sidecar for bug-report + in-triage issues
# -----------------------------------------------------------------------------
# Check if a triage run is already in-flight for a given issue.
_triage_lockfile() {
local issue="$1"
echo "/tmp/triage-inflight-${issue}.pid"
}
is_triage_running() {
local issue="$1"
local pidfile
pidfile=$(_triage_lockfile "$issue")
[ -f "$pidfile" ] || return 1
local pid
pid=$(cat "$pidfile" 2>/dev/null || echo "")
[ -n "$pid" ] && kill -0 "$pid" 2>/dev/null
}
# Fetch open issues labelled both bug-report and in-triage.
# Returns a newline-separated list of issue numbers.
fetch_triage_candidates() {
# Require FORGE_TOKEN, FORGE_URL, FORGE_REPO
[ -n "${FORGE_TOKEN:-}" ] || return 0
[ -n "${FORGE_URL:-}" ] || return 0
[ -n "${FORGE_REPO:-}" ] || return 0
local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
local issues_json
issues_json=$(curl -sf \
-H "Authorization: token ${FORGE_TOKEN}" \
"${api}/issues?type=issues&state=open&labels=bug-report&limit=20" 2>/dev/null) || return 0
# Filter to issues that carry BOTH bug-report AND in-triage labels.
local tmpjson
tmpjson=$(mktemp)
echo "$issues_json" > "$tmpjson"
python3 - "$tmpjson" <<'PYEOF'
import sys, json
data = json.load(open(sys.argv[1]))
for issue in data:
labels = {l["name"] for l in (issue.get("labels") or [])}
if "bug-report" in labels and "in-triage" in labels:
print(issue["number"])
PYEOF
rm -f "$tmpjson"
}
# Launch one triage container per candidate issue.
# Uses the same disinto-reproduce:latest image as the reproduce-agent,
# selecting the triage formula via DISINTO_FORMULA env var.
# Stack lock is held for the full run (no timeout).
dispatch_triage() {
local issue_number="$1"
if is_triage_running "$issue_number"; then
log "Triage already running for issue #${issue_number}, skipping"
return 0
fi
# Find first project TOML available (same convention as dev-poll)
local project_toml=""
for toml in "${FACTORY_ROOT}"/projects/*.toml; do
[ -f "$toml" ] && { project_toml="$toml"; break; }
done
if [ -z "$project_toml" ]; then
log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping triage for #${issue_number}"
return 0
fi
log "Dispatching triage-agent for issue #${issue_number} (project: ${project_toml})"
# Build docker run command using array (safe from injection)
local -a cmd=(docker run --rm
--name "disinto-triage-${issue_number}"
--network host
--security-opt apparmor=unconfined
-v /var/run/docker.sock:/var/run/docker.sock
-v agent-data:/home/agent/data
-v project-repos:/home/agent/repos
-e "FORGE_URL=${FORGE_URL}"
-e "FORGE_TOKEN=${FORGE_TOKEN}"
-e "FORGE_REPO=${FORGE_REPO}"
-e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
-e DISINTO_CONTAINER=1
-e DISINTO_FORMULA=triage
)
# Pass through ANTHROPIC_API_KEY if set
if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
fi
# Mount ~/.claude and ~/.ssh from the runtime user's home if available
local runtime_home="${HOME:-/home/debian}"
if [ -d "${runtime_home}/.claude" ]; then
cmd+=(-v "${runtime_home}/.claude:/home/agent/.claude")
fi
if [ -f "${runtime_home}/.claude.json" ]; then
cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
fi
if [ -d "${runtime_home}/.ssh" ]; then
cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
fi
# Mount claude CLI binary if present on host
if [ -f /usr/local/bin/claude ]; then
cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
fi
# Mount the project TOML into the container at a stable path
local container_toml="/home/agent/project.toml"
cmd+=(-v "${project_toml}:${container_toml}:ro")
cmd+=(disinto-reproduce:latest "$container_toml" "$issue_number")
# Launch in background; write pid-file so we don't double-launch
"${cmd[@]}" &
local bg_pid=$!
echo "$bg_pid" > "$(_triage_lockfile "$issue_number")"
log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}"
}
# -----------------------------------------------------------------------------
# Main dispatcher loop
# -----------------------------------------------------------------------------
@ -638,6 +763,16 @@ main() {
done <<< "$candidate_issues"
fi
# Triage dispatch: check for bug-report + in-triage issues needing deep analysis
local triage_issues
triage_issues=$(fetch_triage_candidates) || true
if [ -n "$triage_issues" ]; then
while IFS= read -r issue_num; do
[ -n "$issue_num" ] || continue
dispatch_triage "$issue_num" || true
done <<< "$triage_issues"
fi
# Wait before next poll
sleep 60
done

46
formulas/triage.toml Normal file
View file

@ -0,0 +1,46 @@
# formulas/triage.toml — Triage-agent formula
#
# Declares the triage-agent's runtime parameters.
# The dispatcher reads this to configure the sidecar container.
#
# Triggered by: bug-report + in-triage label combination.
# Set by the reproduce-agent when:
# - Bug was confirmed (reproduced)
# - Quick log analysis did not reveal an obvious root cause
# - Reproduce-agent documented all steps taken and logs examined
#
# What it does:
# 1. Reads reproduce-agent findings from issue comments (do not repeat work)
# 2. Deep-traces the data flow from symptom to source:
# UI component → composable → API/GraphQL → indexer → chain
# - Compare what the code expects vs what APIs actually return
# - Create a throwaway branch, add debug instrumentation (console.log, verbose logging)
# - Restart services, re-run reproduction, observe new output
# - Delete throwaway branch when done
# 3. Decomposes all root causes (may be 1 or multiple compounding):
# - For each root cause, create a separate backlog issue with:
# * Which cause it is (1 of N)
# * Specific code path and fix suggestion
# * Depends-on: #X if causes are layered
# 4. Updates original issue:
# - Posts summary: "Found N root causes, tracked as #X, #Y, #Z"
# - Replaces in-triage with in-progress
#
# No hard timeout — runs until Claude hits its turn limit.
# Stack lock held for full run (triage is rare; blocking CI is acceptable).
#
# stack_script: path (relative to PROJECT_REPO_ROOT) of the script used to
# restart/rebuild the project stack. Leave empty ("") to connect to an
# existing staging environment instead.
#
# tools: MCP servers to pass to claude via --mcp-server flags.
name = "triage"
description = "Deep root cause analysis: trace data flow, add debug instrumentation, decompose causes into backlog issues."
version = 1
# Set stack_script to the restart command for local stacks.
# Leave empty ("") to target an existing staging environment.
stack_script = ""
tools = ["playwright"]