fix: feat: reproduce agent re-verifies bug-report issues after all dependency fixes merge (#400)

2026-04-08 07:14:57 +00:00 · 2026-04-08 07:14:57 +00:00 · 083c734390
commit 083c734390
parent 4b4eb741e6
3 changed files with 496 additions and 2 deletions
--- a/docker/edge/dispatcher.sh
+++ b/docker/edge/dispatcher.sh
@ -708,6 +708,207 @@ dispatch_triage() {
  log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}"
 }

+# -----------------------------------------------------------------------------
+# Verification dispatch — launch sidecar for bug-report parents with all deps closed
+# -----------------------------------------------------------------------------
+
+# Print the path of the pid-file that marks an in-flight verification run.
+# Arguments: $1 - issue number
+# Outputs:   pid-file path on stdout
+_verify_lockfile() {
+  printf '/tmp/verify-inflight-%s.pid\n' "$1"
+}
+
+is_verify_running() {
+  local issue="$1"
+  local pidfile
+  pidfile=$(_verify_lockfile "$issue")
+  [ -f "$pidfile" ] || return 1
+  local pid
+  pid=$(cat "$pidfile" 2>/dev/null || echo "")
+  [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null
+}
+
+# Check if an issue is a parent with sub-issues (identified by sub-issues
+# whose body contains "Decomposed from #N" where N is the parent's number).
+# Arguments: $1 - parent issue number
+# Globals:   FORGE_URL, FORGE_REPO, FORGE_TOKEN (read)
+# Returns:   0 if at least one sub-issue is found; 1 if none is found or the
+#            API call / JSON parsing fails
+_is_parent_issue() {
+  local parent_num="$1"
+  local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+
+  # Fetch issues in any state so already-closed sub-issues are seen too.
+  # NOTE(review): only a single page of 50 issues is requested; sub-issues
+  # beyond that page will not be found — confirm whether pagination is needed.
+  local all_issues_json
+  all_issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api}/issues?type=issues&state=all&limit=50" 2>/dev/null) || return 1
+
+  # Match "Decomposed from #<parent_num>" only when no further digit follows,
+  # so that parent #4 does not also match "Decomposed from #40".
+  local sub_issues
+  sub_issues=$(python3 -c '
+import json, re, sys
+marker = re.compile(r"Decomposed from #" + re.escape(sys.argv[1]) + r"(?!\d)")
+data = json.load(sys.stdin)
+print(" ".join(str(i["number"]) for i in data if marker.search(i.get("body") or "")))
+' "$parent_num" <<< "$all_issues_json") || return 1
+
+  [ -n "$sub_issues" ]
+}
+
+# Check if all sub-issues of a parent are closed.
+# Returns: 0 if all closed, 1 if any still open
+_are_all_sub_issues_closed() {
+  local parent_num="$1"
+
+  # Fetch all issues (open and closed) to find sub-issues
+  local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+  local all_issues_json
+  all_issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api}/issues?type=issues&state=all&limit=50" 2>/dev/null) || return 1
+
+  # Find issues whose body contains "Decomposed from #<parent_num>"
+  local sub_issues
+  sub_issues=$(python3 -c '
+import sys, json
+parent_num = sys.argv[1]
+data = json.load(open("/dev/stdin"))
+sub_issues = []
+for issue in data:
+    body = issue.get("body") or ""
+    if f"Decomposed from #{parent_num}" in body:
+        sub_issues.append(str(issue["number"]))
+print(" ".join(sub_issues))
+' "$parent_num" < <(echo "$all_issues_json")) || return 1
+
+  [ -z "$sub_issues" ] && return 1
+
+  # Check if all sub-issues are closed
+  for sub_num in $sub_issues; do
+    local sub_state
+    sub_state=$(curl -sf \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${api}/issues/${sub_num}" 2>/dev/null | jq -r '.state // "unknown"') || return 1
+    if [ "$sub_state" != "closed" ]; then
+      return 1
+    fi
+  done
+  return 0
+}
+
+# Fetch open bug-report + in-progress issues whose sub-issues are all closed.
+# Returns a newline-separated list of issue numbers ready for verification.
+fetch_verification_candidates() {
+  # Require FORGE_TOKEN, FORGE_URL, FORGE_REPO
+  [ -n "${FORGE_TOKEN:-}" ] || return 0
+  [ -n "${FORGE_URL:-}" ]   || return 0
+  [ -n "${FORGE_REPO:-}" ]  || return 0
+
+  local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+
+  # Fetch open bug-report + in-progress issues
+  local issues_json
+  issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api}/issues?type=issues&state=open&labels=bug-report&limit=20" 2>/dev/null) || return 0
+
+  # Filter to issues that also have in-progress label and have all sub-issues closed
+  local tmpjson
+  tmpjson=$(mktemp)
+  echo "$issues_json" > "$tmpjson"
+  python3 - "$tmpjson" "$api" "${FORGE_TOKEN}" <<'PYEOF'
+import sys, json
+api_base = sys.argv[2]
+token = sys.argv[3]
+data = json.load(open(sys.argv[1]))
+
+for issue in data:
+    labels = {l["name"] for l in (issue.get("labels") or [])}
+    # Must have BOTH bug-report AND in-progress labels
+    if "bug-report" not in labels or "in-progress" not in labels:
+        continue
+    print(issue["number"])
+PYEOF
+  rm -f "$tmpjson"
+}
+
+# Launch one verification container per candidate issue.
+# Uses the same disinto-reproduce:latest image as the reproduce-agent,
+# selecting the verify formula via DISINTO_FORMULA env var.
+dispatch_verify() {
+  local issue_number="$1"
+
+  if is_verify_running "$issue_number"; then
+    log "Verification already running for issue #${issue_number}, skipping"
+    return 0
+  fi
+
+  # Find first project TOML available (same convention as dev-poll)
+  local project_toml=""
+  for toml in "${FACTORY_ROOT}"/projects/*.toml; do
+    [ -f "$toml" ] && { project_toml="$toml"; break; }
+  done
+
+  if [ -z "$project_toml" ]; then
+    log "WARNING: no project TOML found under ${FACTORY_ROOT}/projects/ — skipping verification for #${issue_number}"
+    return 0
+  fi
+
+  log "Dispatching verification-agent for issue #${issue_number} (project: ${project_toml})"
+
+  # Build docker run command using array (safe from injection)
+  local -a cmd=(docker run --rm
+    --name "disinto-verify-${issue_number}"
+    --network host
+    --security-opt apparmor=unconfined
+    -v /var/run/docker.sock:/var/run/docker.sock
+    -v agent-data:/home/agent/data
+    -v project-repos:/home/agent/repos
+    -e "FORGE_URL=${FORGE_URL}"
+    -e "FORGE_TOKEN=${FORGE_TOKEN}"
+    -e "FORGE_REPO=${FORGE_REPO}"
+    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
+    -e DISINTO_CONTAINER=1
+    -e DISINTO_FORMULA=verify
+  )
+
+  # Pass through ANTHROPIC_API_KEY if set
+  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
+    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
+  fi
+
+  # Mount ~/.claude and ~/.ssh from the runtime user's home if available
+  local runtime_home="${HOME:-/home/debian}"
+  if [ -d "${runtime_home}/.claude" ]; then
+    cmd+=(-v "${runtime_home}/.claude:/home/agent/.claude")
+  fi
+  if [ -f "${runtime_home}/.claude.json" ]; then
+    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
+  fi
+  if [ -d "${runtime_home}/.ssh" ]; then
+    cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
+  fi
+  # Mount claude CLI binary if present on host
+  if [ -f /usr/local/bin/claude ]; then
+    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
+  fi
+
+  # Mount the project TOML into the container at a stable path
+  local container_toml="/home/agent/project.toml"
+  cmd+=(-v "${project_toml}:${container_toml}:ro")
+
+  cmd+=(disinto-reproduce:latest "$container_toml" "$issue_number")
+
+  # Launch in background; write pid-file so we don't double-launch
+  "${cmd[@]}" &
+  local bg_pid=$!
+  echo "$bg_pid" > "$(_verify_lockfile "$issue_number")"
+  log "Verification container launched (pid ${bg_pid}) for issue #${issue_number}"
+}
+
 # -----------------------------------------------------------------------------
 # Main dispatcher loop
 # -----------------------------------------------------------------------------
@ -778,6 +979,22 @@ main() {
      done <<< "$triage_issues"
    fi

+    # Verification dispatch: check for bug-report + in-progress issues whose sub-issues are all closed
+    # These are parents whose fixes have merged and need verification
+    local verify_issues
+    verify_issues=$(fetch_verification_candidates) || true
+    if [ -n "$verify_issues" ]; then
+      while IFS= read -r issue_num; do
+        [ -n "$issue_num" ] || continue
+        # Double-check: this issue must have all sub-issues closed before dispatching
+        if _are_all_sub_issues_closed "$issue_num"; then
+          dispatch_verify "$issue_num" || true
+        else
+          log "Issue #${issue_num} has open sub-issues — skipping verification"
+        fi
+      done <<< "$verify_issues"
+    fi
+
    # Wait before next poll
    sleep 60
  done
--- a/docker/reproduce/entrypoint-reproduce.sh
+++ b/docker/reproduce/entrypoint-reproduce.sh
@ -29,6 +29,9 @@ case "${DISINTO_FORMULA:-reproduce}" in
  triage)
    ACTIVE_FORMULA="${DISINTO_DIR}/formulas/triage.toml"
    ;;
+  verify)
+    ACTIVE_FORMULA="${DISINTO_DIR}/formulas/reproduce.toml"
+    ;;
  *)
    ACTIVE_FORMULA="${DISINTO_DIR}/formulas/reproduce.toml"
    ;;
@ -43,6 +46,8 @@ SCREENSHOT_DIR="/home/agent/data/screenshots"
 # ---------------------------------------------------------------------------
 if [ "${DISINTO_FORMULA:-reproduce}" = "triage" ]; then
  AGENT_TYPE="triage"
+elif [ "${DISINTO_FORMULA:-reproduce}" = "verify" ]; then
+  AGENT_TYPE="verify"
 else
  AGENT_TYPE="reproduce"
 fi
@ -576,6 +581,193 @@ fi

 log "Outcome: ${OUTCOME}"

+# ---------------------------------------------------------------------------
+# Verification mode: check if this is a parent issue whose sub-issues are all closed
+# If so, re-run reproduction to verify the bug is fixed
+# ---------------------------------------------------------------------------
+if [ "$AGENT_TYPE" = "verify" ]; then
+  log "Verification mode: checking for sub-issues of parent issue #${ISSUE_NUMBER}"
+
+  # Check if this issue is a parent with sub-issues
+  if _is_parent_issue "$ISSUE_NUMBER"; then
+    log "Found ${#_PARENT_SUB_ISSUES[@]} sub-issue(s) for parent #${ISSUE_NUMBER}"
+
+    # Check if all sub-issues are closed
+    if _are_all_sub_issues_closed; then
+      log "All sub-issues are closed — triggering verification reproduction"
+
+      # Re-run the reproduction to check if bug is fixed
+      log "Running verification reproduction..."
+
+      # Build Claude prompt for verification mode
+      SUB_ISSUE_LIST=$(_get_sub_issue_list)
+      CLAUDE_PROMPT=$(cat <<PROMPT
+You are the reproduce-agent running in **verification mode**. Your task is to re-run the reproduction steps from the original bug report to verify that the bug has been fixed after all sub-issues were resolved.
+
+## Issue title
+${ISSUE_TITLE}
+
+## Issue body
+${ISSUE_BODY}
+
+## Context
+This issue was decomposed into the following sub-issues that have all been resolved:
+${SUB_ISSUE_LIST}
+
+All sub-issues have been closed, indicating that fixes have been merged. Your task is to verify that the original bug is now fixed.
+
+## Your task
+1. Follow the reproduction steps from the original issue body
+2. Execute each step carefully and observe the current behavior
+3. Take screenshots as evidence (save to: ${SCREENSHOT_PREFIX}-step-N.png)
+4. Determine if the bug is **fixed** (no longer reproduces) or **still present**
+
+## Output files
+
+1. **Findings report** — Write to: /tmp/reproduce-findings-${ISSUE_NUMBER}.md
+   Include:
+   - Steps you followed
+   - What you observed (screenshots referenced by path)
+   - OUTCOME line: OUTCOME=verified-fixed OR OUTCOME=still-reproduces
+
+2. **Outcome file** — Write to: /tmp/reproduce-outcome-${ISSUE_NUMBER}.txt
+   Write ONLY the outcome word: verified-fixed OR still-reproduces
+
+## Notes
+- The application is accessible at localhost (network_mode: host)
+- Take screenshots liberally — they are evidence
+- If the app is not running or not reachable, write outcome: still-reproduces with reason "stack not reachable"
+- Timeout: ${FORMULA_TIMEOUT_MINUTES} minutes total
+
+Begin now.
+PROMPT
+      )
+
+      # Run Claude for verification
+      log "Starting Claude verification session (timeout: ${FORMULA_TIMEOUT_MINUTES}m)..."
+
+      CLAUDE_EXIT=0
+      timeout "$(( FORMULA_TIMEOUT_MINUTES * 60 ))" \
+        claude -p "$CLAUDE_PROMPT" \
+          --mcp-server playwright \
+          --output-format text \
+          --max-turns 40 \
+        > "/tmp/reproduce-claude-output-${ISSUE_NUMBER}.txt" 2>&1 || CLAUDE_EXIT=$?
+
+      if [ $CLAUDE_EXIT -eq 124 ]; then
+        log "WARNING: Claude verification session timed out after ${FORMULA_TIMEOUT_MINUTES}m"
+      fi
+
+      # Read verification outcome
+      VERIFY_OUTCOME="still-reproduces"
+      if [ -f "/tmp/reproduce-outcome-${ISSUE_NUMBER}.txt" ]; then
+        _raw=$(tr -d '[:space:]' < "/tmp/reproduce-outcome-${ISSUE_NUMBER}.txt" | tr '[:upper:]' '[:lower:]')
+        case "$_raw" in
+          verified-fixed|still-reproduces)
+            VERIFY_OUTCOME="$_raw"
+            ;;
+          *)
+            log "WARNING: unexpected verification outcome '${_raw}' — defaulting to still-reproduces"
+            ;;
+        esac
+      fi
+
+      log "Verification outcome: ${VERIFY_OUTCOME}"
+
+      # Read findings
+      VERIFY_FINDINGS=""
+      if [ -f "/tmp/reproduce-findings-${ISSUE_NUMBER}.md" ]; then
+        VERIFY_FINDINGS=$(cat "/tmp/reproduce-findings-${ISSUE_NUMBER}.md")
+      else
+        VERIFY_FINDINGS="Verification-agent completed but did not write a findings report. Claude output:\n\`\`\`\n$(tail -100 "/tmp/reproduce-claude-output-${ISSUE_NUMBER}.txt" 2>/dev/null || echo '(no output)')\n\`\`\`"
+      fi
+
+      # Collect screenshot paths
+      VERIFY_SCREENSHOT_LIST=""
+      if find "$(dirname "${SCREENSHOT_PREFIX}")" -name "$(basename "${SCREENSHOT_PREFIX}")-*.png" -maxdepth 1 2>/dev/null | grep -q .; then
+        VERIFY_SCREENSHOT_LIST="\n\n**Screenshots taken:**\n"
+        for f in "${SCREENSHOT_PREFIX}"-*.png; do
+          VERIFY_SCREENSHOT_LIST="${VERIFY_SCREENSHOT_LIST}- \`$(basename "$f")\`\n"
+        done
+      fi
+
+      # Process verification result
+      if [ "$VERIFY_OUTCOME" = "verified-fixed" ]; then
+        # Bug is fixed — comment, remove in-progress, close the issue
+        AGENT_NAME="Verification-agent"
+        COMMENT_HEADER="## ${AGENT_NAME}: **Verified fixed** :white_check_mark: :tada:"
+
+        COMMENT_BODY="${COMMENT_HEADER}
+
+The bug described in this issue has been **verified as fixed** after all sub-issues were resolved.
+
+**Resolved sub-issues:**
+${SUB_ISSUE_LIST}
+
+**Verification result:** The reproduction steps no longer trigger the bug.
+
+${VERIFY_FINDINGS}${VERIFY_SCREENSHOT_LIST}
+
+---
+*${AGENT_NAME} verification run at $(date -u '+%Y-%m-%d %H:%M:%S UTC') — project: ${PROJECT_NAME}*
+
+Closing this issue as the bug has been verified fixed."
+
+        _post_comment "$ISSUE_NUMBER" "$COMMENT_BODY"
+        log "Posted verification comment to issue #${ISSUE_NUMBER}"
+
+        # Remove in-progress label
+        IN_PROGRESS_ID=$(_label_id "in-progress" "#1d76db")
+        _remove_label "$ISSUE_NUMBER" "$IN_PROGRESS_ID"
+
+        # Close the issue
+        curl -sf -X PATCH \
+          -H "Authorization: token ${FORGE_TOKEN}" \
+          -H "Content-Type: application/json" \
+          "${FORGE_API}/issues/${ISSUE_NUMBER}" \
+          -d '{"state":"closed"}' >/dev/null 2>&1 || true
+        log "Closed issue #${ISSUE_NUMBER} — bug verified fixed"
+      else
+        # Bug still reproduces — comment, keep in-progress, trigger new triage
+        AGENT_NAME="Verification-agent"
+        COMMENT_HEADER="## ${AGENT_NAME}: **Still reproduces after fixes** :x:"
+
+        COMMENT_BODY="${COMMENT_HEADER}
+
+The bug described in this issue **still reproduces** after all sub-issues were resolved.
+
+**Resolved sub-issues:**
+${SUB_ISSUE_LIST}
+
+**Verification result:** The reproduction steps still trigger the bug. Additional investigation needed.
+
+${VERIFY_FINDINGS}${VERIFY_SCREENSHOT_LIST}
+
+---
+*${AGENT_NAME} verification run at $(date -u '+%Y-%m-%d %H:%M:%S UTC') — project: ${PROJECT_NAME}*
+
+This issue will be re-entered into triage for further investigation."
+
+        _post_comment "$ISSUE_NUMBER" "$COMMENT_BODY"
+        log "Posted verification comment to issue #${ISSUE_NUMBER}"
+
+        # Re-trigger triage by adding in-triage label
+        IN_TRIAGE_ID=$(_label_id "in-triage" "#d93f0b")
+        _add_label "$ISSUE_NUMBER" "$IN_TRIAGE_ID"
+        log "Re-triggered triage for issue #${ISSUE_NUMBER} — bug still reproduces"
+      fi
+
+      # Exit after verification
+      log "Verification complete. Outcome: ${VERIFY_OUTCOME}"
+      exit 0
+    else
+      log "Not all sub-issues are closed yet — skipping verification"
+    fi
+  else
+    log "Issue #${ISSUE_NUMBER} is not a parent with tracked sub-issues — running standard reproduction"
+  fi
+fi
+
 # ---------------------------------------------------------------------------
 # Read findings
 # ---------------------------------------------------------------------------
@ -601,6 +793,85 @@ if find "$(dirname "${SCREENSHOT_PREFIX}")" -name "$(basename "${SCREENSHOT_PREF
  done
 fi

+# ---------------------------------------------------------------------------
+# Verification helpers — bug-report lifecycle
+# ---------------------------------------------------------------------------
+
+# Check if an issue is a parent with sub-issues (identified by sub-issues
+# whose body contains "Decomposed from #N" where N is the parent's number).
+# Arguments: $1 - parent issue number
+# Globals:   FORGE_API, FORGE_TOKEN (read)
+# Sets:      _PARENT_SUB_ISSUES (space-separated list of sub-issue numbers;
+#            reset to empty on entry)
+# Returns:   0 if parent with sub-issues found, 1 otherwise (including API or
+#            JSON parsing failure)
+_is_parent_issue() {
+  local parent_num="$1"
+  _PARENT_SUB_ISSUES=""
+
+  # Fetch issues in any state so already-closed sub-issues are seen too.
+  # NOTE(review): only a single page of 50 issues is requested; sub-issues
+  # beyond that page will not be found — confirm whether pagination is needed.
+  local all_issues_json
+  all_issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${FORGE_API}/issues?type=issues&state=all&limit=50" 2>/dev/null) || return 1
+
+  # Match "Decomposed from #<parent_num>" only when no further digit follows,
+  # so that parent #4 does not also match "Decomposed from #40".
+  local sub_issues
+  sub_issues=$(python3 -c '
+import json, re, sys
+marker = re.compile(r"Decomposed from #" + re.escape(sys.argv[1]) + r"(?!\d)")
+data = json.load(sys.stdin)
+print(" ".join(str(i["number"]) for i in data if marker.search(i.get("body") or "")))
+' "$parent_num" <<< "$all_issues_json") || return 1
+
+  if [ -n "$sub_issues" ]; then
+    _PARENT_SUB_ISSUES="$sub_issues"
+    return 0
+  fi
+  return 1
+}
+
+# Check if all sub-issues of a parent are closed.
+# Requires: _PARENT_SUB_ISSUES to be set
+# Returns: 0 if all closed, 1 if any still open
+_are_all_sub_issues_closed() {
+  if [ -z "${_PARENT_SUB_ISSUES:-}" ]; then
+    return 1
+  fi
+
+  for sub_num in $_PARENT_SUB_ISSUES; do
+    local sub_state
+    sub_state=$(curl -sf \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${FORGE_API}/issues/${sub_num}" 2>/dev/null | jq -r '.state // "unknown"') || {
+      log "WARNING: could not fetch state of sub-issue #${sub_num}"
+      return 1
+    }
+    if [ "$sub_state" != "closed" ]; then
+      log "Sub-issue #${sub_num} is not closed (state: ${sub_state})"
+      return 1
+    fi
+  done
+  return 0
+}
+
+# Get sub-issue details for comment
+# Returns: formatted list of sub-issues
+_get_sub_issue_list() {
+  local result=""
+  for sub_num in $_PARENT_SUB_ISSUES; do
+    local sub_title
+    sub_title=$(curl -sf \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${FORGE_API}/issues/${sub_num}" 2>/dev/null | jq -r '.title // "unknown"') || {
+      sub_title="unknown"
+    }
+    result="${result}- #${sub_num} ${sub_title}"$'\n'
+  done
+  printf '%s' "$result"
+}
+
 # ---------------------------------------------------------------------------
 # Label helpers
 # ---------------------------------------------------------------------------
--- a/formulas/reproduce.toml
+++ b/formulas/reproduce.toml
@ -11,13 +11,19 @@
 #
 # timeout_minutes: hard upper bound on the Claude session.
 #
-# Exit gate logic:
+# Exit gate logic (standard mode):
 #   1. Can I reproduce it? → NO → rejected/blocked → EXIT
 #                          → YES → continue
 #   2. Is the cause obvious? → YES → in-progress + backlog issue → EXIT
 #                            → NO → in-triage → EXIT
 #
-# Turn budget: 60% on step 1 (reproduction), 40% on step 2 (cause check).
+# Exit gate logic (verification mode):
+#   Triggered when all sub-issues of a parent bug-report are closed.
+#   1. Bug fixed → comment "verified fixed", remove in-progress, close issue
+#   2. Bug persists → comment "still reproduces", add in-triage, re-enter triage
+#
+# Turn budget (standard mode): 60% on step 1 (reproduction), 40% on step 2 (cause check).
+# Turn budget (verification mode): 100% on re-running reproduction steps.

 name            = "reproduce"
 description     = "Primary: reproduce the bug. Secondary: check if cause is obvious. Exit gates enforced."