# formulas/triage.toml — Triage-agent formula (generic template)
#
# This is the base template for triage investigations.
# Project-specific formulas (e.g. formulas/triage-harb.toml) extend this by
# overriding the fields in the [project] section and providing stack-specific
# step descriptions.
#
# Triggered by: bug-report + in-triage label combination.
# Set by the reproduce-agent when:
#   - Bug was confirmed (reproduced)
#   - Quick log analysis did not reveal an obvious root cause
#   - Reproduce-agent documented all steps taken and logs examined
#
# Steps:
#   1. read-findings    — parse issue comments for prior reproduce-agent evidence
#   2. trace-data-flow  — follow symptom through UI → API → backend → data store
#   3. instrumentation  — throwaway branch, add logging, restart, observe
#   4. decompose        — file backlog issues for each root cause
#   5. link-back        — update original issue, swap in-triage → in-progress
#   6. cleanup          — delete throwaway debug branch
#
# Best practices:
#   - Start from reproduce-agent findings; do not repeat their work
#   - Budget: 70% tracing data flow, 30% instrumented re-runs
#   - Multiple causes: check if layered (Depends-on) or independent (Related)
#   - Always delete the throwaway debug branch before finishing
#   - If inconclusive after full turn budget: leave in-triage, post what was
#     tried, do NOT relabel — supervisor handles stale triage sessions
#
# Project-specific formulas extend this template by defining:
#   - stack_script: how to start/stop the project stack
#   - [project].data_flow: layer names (e.g. "chain → indexer → GraphQL → UI")
#   - [project].api_endpoints: which APIs/services to inspect
#   - [project].stack_lock: stack lock configuration
#   - Per-step description overrides with project-specific commands
#
# No hard timeout — runs until Claude hits its turn limit.
# Stack lock held for full run (triage is rare; blocking CI is acceptable).
name = "triage"
description = "Deep root cause analysis: trace data flow, add debug instrumentation, decompose causes into backlog issues."
version = 2

# Set stack_script to the restart command for local stacks.
# Leave empty ("") to connect to an existing staging environment.
stack_script = ""

tools = ["playwright"]

# ---------------------------------------------------------------------------
# Project-specific extension fields.
# Override these in formulas/triage-<project>.toml.
# ---------------------------------------------------------------------------
[project]
# Human-readable layer names for the data-flow trace (generic default).
# Example project override: "chain → indexer → GraphQL → UI"
data_flow = "UI → API → backend → data store"

# Comma-separated list of API endpoints or services to inspect.
# Example: "GraphQL /graphql, REST /api/v1, RPC ws://localhost:8545"
api_endpoints = ""

# Stack lock configuration (leave empty for default behavior).
# Example: "full" to hold a full stack lock during triage.
stack_lock = ""

# ---------------------------------------------------------------------------
# Steps
#
# Each step names its prerequisite in `needs`; the chain below is linear:
# read-findings → trace-data-flow → instrumentation → decompose → link-back
# → cleanup.
#
# Step descriptions are multi-line basic strings, so backslashes are TOML
# escapes: `\\` decodes to one literal backslash, a lone `\` at the end of a
# line joins it with the next line, and any other `\x` sequence is a parse
# error. Text in <angle brackets> is a placeholder to fill in at run time.
# ---------------------------------------------------------------------------

[[steps]]
id = "read-findings"
title = "Read reproduce-agent findings"
description = """
Before doing anything else, parse all prior evidence from the issue comments.

1. Fetch the issue body and all comments:
   curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
     "${FORGE_API}/issues/${ISSUE_NUMBER}" | jq -r '.body'
   curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
     "${FORGE_API}/issues/${ISSUE_NUMBER}/comments" | jq -r '.[].body'

2. Identify the reproduce-agent comment (look for sections like
   "Reproduction steps", "Logs examined", "What was tried").

3. Extract and note:
   - The exact symptom (error message, unexpected value, visual regression)
   - Steps that reliably trigger the bug
   - Log lines or API responses already captured
   - Any hypotheses the reproduce-agent already ruled out

Do NOT repeat work the reproduce-agent already did. Your job starts where
theirs ended. If no reproduce-agent comment is found, note it and proceed
with fresh investigation using the issue body only.
"""

[[steps]]
id = "trace-data-flow"
title = "Trace data flow from symptom to source"
description = """
Systematically follow the symptom backwards through each layer of the stack.
Spend ~70% of your total turn budget here before moving to instrumentation.

Generic layer traversal (adapt to the project's actual stack):
  UI → API → backend → data store

For each layer boundary:
  1. What does the upstream layer send?
  2. What does the downstream layer expect?
  3. Is there a mismatch? If yes — is this the root cause or a symptom?

Tracing checklist:
  a. Start at the layer closest to the visible symptom.
  b. Read the relevant source files — do not guess data shapes.
  c. Cross-reference API contracts: compare what the code sends vs what it
     should send according to schemas, type definitions, or documentation.
  d. Check recent git history on suspicious files:
       git log --oneline -20 -- <suspicious-file>
  e. Search for related issues or TODOs in the code:
       grep -rE "TODO|FIXME|HACK" -- <relevant-directory>

Capture for each layer:
  - The data shape flowing in and out (field names, types, nullability)
  - Whether the layer's behavior matches its documented contract
  - Any discrepancy found

If a clear root cause becomes obvious during tracing, note it and continue
checking whether additional causes exist downstream.
"""
needs = ["read-findings"]

[[steps]]
id = "instrumentation"
title = "Add debug instrumentation on a throwaway branch"
description = """
Use ~30% of your total turn budget here. Only instrument after tracing has
identified the most likely failure points — do not instrument blindly.

1. Create a throwaway debug branch (NEVER commit this to main):
   cd "$PROJECT_REPO_ROOT"
   git checkout -b debug/triage-${ISSUE_NUMBER}

2. Add targeted logging at the layer boundaries identified during tracing:
   - Console.log / structured log statements around the suspicious code path
   - Log the actual values flowing through: inputs, outputs, intermediate state
   - Add verbose mode flags if the stack supports them
   - Keep instrumentation minimal — only what confirms or refutes the hypothesis

3. Restart the stack using the configured script (if set):
   ${stack_script:-"# No stack_script configured — restart manually or connect to staging"}

4. Re-run the reproduction steps from the reproduce-agent findings.

5. Observe and capture new output:
   - Paste relevant log lines into your working notes
   - Note whether the observed values match or contradict the hypothesis

6. If the first instrumentation pass is inconclusive, iterate:
   - Narrow the scope to the next most suspicious boundary
   - Re-instrument, restart, re-run
   - Maximum 2-3 instrumentation rounds before declaring inconclusive

Do NOT push the debug branch. It will be deleted in the cleanup step.
"""
needs = ["trace-data-flow"]

[[steps]]
id = "decompose"
title = "Decompose root causes into backlog issues"
description = """
After tracing and instrumentation, articulate each distinct root cause.

For each root cause found:

1. Determine the relationship to other causes:
   - Layered (one causes another) → use Depends-on in the issue body
   - Independent (separate code paths fail independently) → use Related

2. Create a backlog issue for each root cause:
   curl -sf -X POST "${FORGE_API}/issues" \\
     -H "Authorization: token ${FORGE_TOKEN}" \\
     -H "Content-Type: application/json" \\
     -d '{
       "title": "fix: <specific description of the root cause>",
       "body": "## Root cause\\n<exact code path and file:line>\\n\\n## Fix suggestion\\n<recommended approach>\\n\\n## Context\\nDecomposed from #${ISSUE_NUMBER} (cause N of M)\\n\\n## Dependencies\\n<#X if this depends on another cause being fixed first>",
       "labels": ["backlog"]
     }'

3. Note the newly created issue numbers.

If only one root cause is found, still create a single backlog issue with
the specific code location and fix suggestion.

If the investigation is inconclusive (no clear root cause found), skip this
step and proceed directly to link-back with the inconclusive outcome.
"""
needs = ["instrumentation"]

[[steps]]
id = "link-back"
title = "Update original issue and relabel"
description = """
Post a summary comment on the original issue and update its labels.

### If root causes were found (conclusive):

Post a comment:
  "## Triage findings

  Found N root cause(s):
  - #X — <one-line description of the cause> (cause 1 of N)
  - #Y — <one-line description of the cause> (cause 2 of N, depends on #X)

  Data flow traced: <layers examined and where the fault originates>
  Instrumentation: <key log output that confirmed the cause>

  Next step: backlog issues above will be implemented in dependency order."

Then swap labels:
  - Remove: in-triage
  - Add: in-progress

### If investigation was inconclusive (turn budget exhausted):

Post a comment:
  "## Triage — inconclusive

  Traced: <layers checked>
  Tried: <instrumentation attempts and what they showed>
  Hypothesis: <best guess at the cause, if any>

  No definitive root cause identified. Leaving in-triage for supervisor
  to handle as a stale triage session."

Do NOT relabel. Leave in-triage. The supervisor monitors stale triage
sessions and will escalate or reassign.
"""
needs = ["decompose"]

[[steps]]
id = "cleanup"
title = "Delete throwaway debug branch"
description = """
Always delete the debug branch, even if the investigation was inconclusive.

1. Switch back to the main branch:
   cd "$PROJECT_REPO_ROOT"
   git checkout "$PRIMARY_BRANCH"

2. Delete the local debug branch:
   git branch -D debug/triage-${ISSUE_NUMBER}

3. Confirm no remote was pushed (if accidentally pushed, delete it too):
   git push origin --delete debug/triage-${ISSUE_NUMBER} 2>/dev/null || true

4. Verify the worktree is clean:
   git status
   git worktree list

A clean repo is a prerequisite for the next dev-agent run. Never leave
debug branches behind — they accumulate and pollute the branch list.
"""
needs = ["link-back"]