fix: chore: remove dead tmux-based session code (agent-session.sh, phase-handler.sh) (#262)
- Delete lib/agent-session.sh (entirely dead file with no active callers)
- Delete dev/phase-handler.sh (entirely dead file with no active callers)
- Update lib/formula-session.sh to remove tmux-based functions:
- Removed: start_formula_session, run_formula_and_monitor, formula_phase_callback,
write_compact_context, remove_formula_worktree, cleanup_stale_crashed_worktrees
- Kept utility functions: acquire_cron_lock, check_memory, load_formula,
profile_write_journal, formula_prepare_profile_context, build_graph_section, etc.
- Update dev/phase-test.sh to inline read_phase() function (no longer sources agent-session.sh)
- Update documentation: AGENTS.md, lib/AGENTS.md, dev/AGENTS.md, .woodpecker/agent-smoke.sh,
docs/PHASE-PROTOCOL.md, lib/pr-lifecycle.sh
- All 38 phase tests pass
This commit is contained in:
parent
410a5ee948
commit
7ad1c63de3
10 changed files with 62 additions and 1498 deletions
|
|
@ -6,8 +6,6 @@
|
|||
# 2. Every custom function called by agent scripts is defined in lib/ or the script itself
|
||||
#
|
||||
# Fast (<10s): no network, no tmux, no Claude needed.
|
||||
# Would have caught: kill_tmux_session (renamed), create_agent_session (missing),
|
||||
# read_phase (missing from dev-agent.sh scope)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
|
|
@ -95,13 +93,12 @@ echo "=== 2/2 Function resolution ==="
|
|||
#
|
||||
# Included — these are inline-sourced by agent scripts:
|
||||
# lib/env.sh — sourced by every agent (log, forge_api, etc.)
|
||||
# lib/agent-session.sh — sourced by orchestrators (create_agent_session, monitor_phase_loop, etc.)
|
||||
# lib/agent-sdk.sh — sourced by SDK agents (agent_run, agent_recover_session)
|
||||
# lib/ci-helpers.sh — sourced by pollers and review (ci_passed, classify_pipeline_failure, etc.)
|
||||
# lib/load-project.sh — sourced by env.sh when PROJECT_TOML is set
|
||||
# lib/file-action-issue.sh — sourced by gardener-run.sh (file_action_issue)
|
||||
# lib/secret-scan.sh — sourced by file-action-issue.sh, phase-handler.sh (scan_for_secrets, redact_secrets)
|
||||
# lib/formula-session.sh — sourced by formula-driven agents (acquire_cron_lock, run_formula_and_monitor, etc.)
|
||||
# lib/secret-scan.sh — sourced by file-action-issue.sh (scan_for_secrets, redact_secrets)
|
||||
# lib/formula-session.sh — sourced by formula-driven agents (acquire_cron_lock, check_memory, etc.)
|
||||
# lib/mirrors.sh — sourced by merge sites (mirror_push)
|
||||
# lib/guard.sh — sourced by all cron entry points (check_active)
|
||||
# lib/issue-lifecycle.sh — sourced by agents for issue claim/release/block/deps
|
||||
|
|
@ -116,7 +113,7 @@ echo "=== 2/2 Function resolution ==="
|
|||
# If a new lib file is added and sourced by agents, add it to LIB_FUNS below
|
||||
# and add a check_script call for it in the lib files section further down.
|
||||
LIB_FUNS=$(
|
||||
for f in lib/agent-session.sh lib/agent-sdk.sh lib/env.sh lib/ci-helpers.sh lib/load-project.sh lib/secret-scan.sh lib/file-action-issue.sh lib/formula-session.sh lib/mirrors.sh lib/guard.sh lib/pr-lifecycle.sh lib/issue-lifecycle.sh lib/worktree.sh; do
|
||||
for f in lib/agent-sdk.sh lib/env.sh lib/ci-helpers.sh lib/load-project.sh lib/secret-scan.sh lib/file-action-issue.sh lib/formula-session.sh lib/mirrors.sh lib/guard.sh lib/pr-lifecycle.sh lib/issue-lifecycle.sh lib/worktree.sh; do
|
||||
if [ -f "$f" ]; then get_fns "$f"; fi
|
||||
done | sort -u
|
||||
)
|
||||
|
|
@ -180,13 +177,12 @@ check_script() {
|
|||
# These are already in LIB_FUNS (their definitions are available to agents),
|
||||
# but this verifies calls *within* each lib file are also resolvable.
|
||||
check_script lib/env.sh lib/mirrors.sh
|
||||
check_script lib/agent-session.sh
|
||||
check_script lib/agent-sdk.sh
|
||||
check_script lib/ci-helpers.sh
|
||||
check_script lib/secret-scan.sh
|
||||
check_script lib/file-action-issue.sh lib/secret-scan.sh
|
||||
check_script lib/tea-helpers.sh lib/secret-scan.sh
|
||||
check_script lib/formula-session.sh lib/agent-session.sh
|
||||
check_script lib/formula-session.sh
|
||||
check_script lib/load-project.sh
|
||||
check_script lib/mirrors.sh lib/env.sh
|
||||
check_script lib/guard.sh
|
||||
|
|
@ -199,15 +195,13 @@ check_script lib/ci-debug.sh
|
|||
check_script lib/parse-deps.sh
|
||||
|
||||
# Agent scripts — list cross-sourced files where function scope flows across files.
|
||||
# phase-handler.sh defines default callback stubs; sourcing agents may override.
|
||||
check_script dev/dev-agent.sh
|
||||
check_script dev/phase-handler.sh lib/secret-scan.sh
|
||||
check_script dev/dev-poll.sh
|
||||
check_script dev/phase-test.sh
|
||||
check_script gardener/gardener-run.sh
|
||||
check_script review/review-pr.sh lib/agent-sdk.sh
|
||||
check_script review/review-poll.sh
|
||||
check_script planner/planner-run.sh lib/agent-session.sh lib/formula-session.sh
|
||||
check_script planner/planner-run.sh lib/formula-session.sh
|
||||
check_script supervisor/supervisor-poll.sh
|
||||
check_script supervisor/update-prompt.sh
|
||||
check_script supervisor/supervisor-run.sh
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ See `README.md` for the full architecture and `disinto-factory/SKILL.md` for set
|
|||
|
||||
```
|
||||
disinto/ (code repo)
|
||||
├── dev/ dev-poll.sh, dev-agent.sh, phase-handler.sh — issue implementation
|
||||
├── dev/ dev-poll.sh, dev-agent.sh, phase-test.sh — issue implementation
|
||||
├── review/ review-poll.sh, review-pr.sh — PR review
|
||||
├── gardener/ gardener-run.sh — direct cron executor for run-gardener formula
|
||||
├── predictor/ predictor-run.sh — daily cron executor for run-predictor formula
|
||||
|
|
@ -31,7 +31,7 @@ disinto/ (code repo)
|
|||
│ supervisor-poll.sh — legacy bash orchestrator (superseded)
|
||||
├── architect/ architect-run.sh — strategic decomposition of vision into sprints
|
||||
├── vault/ vault-env.sh — shared env setup (vault redesign in progress, see #73-#77)
|
||||
├── lib/ env.sh, agent-session.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, profile.sh, build-graph.py
|
||||
├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, profile.sh, build-graph.py
|
||||
├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored)
|
||||
├── formulas/ Issue templates (TOML specs for multi-step agent tasks)
|
||||
└── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md)
|
||||
|
|
|
|||
|
|
@ -14,9 +14,8 @@ in-progress issues are also picked up. The direct-merge scan runs before the loc
|
|||
check so approved PRs get merged even while a dev-agent session is active.
|
||||
|
||||
**Key files**:
|
||||
- `dev/dev-poll.sh` — Cron scheduler: finds next ready issue, handles merge/rebase of approved PRs, tracks CI fix attempts. Formula guard skips issues labeled `formula`, `prediction/dismissed`, or `prediction/unreviewed`. **Race prevention**: checks issue assignee before claiming — skips if assigned to a different bot user. **Stale branch abandonment**: closes PRs and deletes branches that are behind `$PRIMARY_BRANCH` (restarts poll cycle for a fresh start). **Stale in-progress recovery**: on each poll cycle, scans for issues labeled `in-progress` with no active tmux session and no open PR — removes `in-progress`, adds `blocked` with a human-triage comment (requires maintainer review before re-queuing).
|
||||
- `dev/dev-poll.sh` — Cron scheduler: finds next ready issue, handles merge/rebase of approved PRs, tracks CI fix attempts. Formula guard skips issues labeled `formula`, `prediction/dismissed`, or `prediction/unreviewed`. **Race prevention**: checks issue assignee before claiming — skips if assigned to a different bot user. **Stale branch abandonment**: closes PRs and deletes branches that are behind `$PRIMARY_BRANCH` (restarts poll cycle for a fresh start). **Stale in-progress recovery**: on each poll cycle, scans for issues labeled `in-progress` with no open PR — removes `in-progress`, adds `blocked` with a human-triage comment (requires maintainer review before re-queuing).
|
||||
- `dev/dev-agent.sh` — Orchestrator: claims issue, creates worktree + tmux session with interactive `claude`, monitors phase file, injects CI results and review feedback, merges on approval
|
||||
- `dev/phase-handler.sh` — Phase callback functions: `post_refusal_comment()`, `_on_phase_change()`, `build_phase_protocol_prompt()`. `do_merge()` detects already-merged PRs on HTTP 405 (race with dev-poll's pre-lock scan) and returns success instead of escalating. Sources `lib/mirrors.sh` and calls `mirror_push()` after every successful merge.
|
||||
- `dev/phase-test.sh` — Integration test for the phase protocol
|
||||
|
||||
**Environment variables consumed** (via `lib/env.sh` + project TOML):
|
||||
|
|
@ -33,7 +32,7 @@ check so approved PRs get merged even while a dev-agent session is active.
|
|||
|
||||
**Crash recovery**: on `PHASE:crashed` or non-zero exit, the worktree is **preserved** (not destroyed) for debugging. Location logged. Supervisor housekeeping removes stale crashed worktrees older than 24h.
|
||||
|
||||
**Lifecycle**: dev-poll.sh (`check_active dev`) → dev-agent.sh → tmux `dev-{project}-{issue}` → phase file
|
||||
**Lifecycle**: dev-poll.sh (`check_active dev`) → dev-agent.sh → tmux session → phase file
|
||||
drives CI/review loop → merge + `mirror_push()` → close issue. On respawn after
|
||||
`PHASE:escalate`, the stale phase file is cleared first so the session starts
|
||||
clean; the reinject prompt tells Claude not to re-escalate for the same reason.
|
||||
|
|
|
|||
|
|
@ -1,820 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# dev/phase-handler.sh — Phase callback functions for dev-agent.sh
|
||||
#
|
||||
# Source this file from agent orchestrators after lib/agent-session.sh is loaded.
|
||||
# Defines: post_refusal_comment(), _on_phase_change(), build_phase_protocol_prompt()
|
||||
#
|
||||
# Required globals (set by calling agent before or after sourcing):
|
||||
# ISSUE, FORGE_TOKEN, API, FORGE_WEB, PROJECT_NAME, FACTORY_ROOT
|
||||
# BRANCH, PHASE_FILE, WORKTREE, IMPL_SUMMARY_FILE
|
||||
# PRIMARY_BRANCH, SESSION_NAME, LOGFILE, ISSUE_TITLE
|
||||
# WOODPECKER_REPO_ID, WOODPECKER_TOKEN, WOODPECKER_SERVER
|
||||
#
|
||||
# Globals with defaults (agents can override after sourcing):
|
||||
# PR_NUMBER, CI_POLL_TIMEOUT, MAX_CI_FIXES, MAX_REVIEW_ROUNDS,
|
||||
# REVIEW_POLL_TIMEOUT, CI_RETRY_COUNT, CI_FIX_COUNT, REVIEW_ROUND,
|
||||
# CLAIMED, PHASE_POLL_INTERVAL
|
||||
#
|
||||
# Calls back to agent-defined helpers:
|
||||
# cleanup_worktree(), cleanup_labels(), status(), log()
|
||||
#
|
||||
# shellcheck shell=bash
|
||||
# shellcheck disable=SC2154 # globals are set in dev-agent.sh before calling
|
||||
# shellcheck disable=SC2034 # CLAIMED is read by cleanup() in dev-agent.sh
|
||||
|
||||
# Load secret scanner for redacting tmux output before posting to issues
|
||||
# shellcheck source=../lib/secret-scan.sh
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/../lib/secret-scan.sh"
|
||||
|
||||
# Load shared CI helpers (is_infra_step, classify_pipeline_failure, etc.)
|
||||
# shellcheck source=../lib/ci-helpers.sh
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/../lib/ci-helpers.sh"
|
||||
|
||||
# Load mirror push helper
|
||||
# shellcheck source=../lib/mirrors.sh
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/../lib/mirrors.sh"
|
||||
|
||||
# --- Default callback stubs (agents can override after sourcing) ---
|
||||
# cleanup_worktree and cleanup_labels are called during phase transitions.
|
||||
# Provide no-op defaults so phase-handler.sh is self-contained; sourcing
|
||||
# agents override these with real implementations.
|
||||
if ! declare -f cleanup_worktree >/dev/null 2>&1; then
|
||||
cleanup_worktree() { :; }
|
||||
fi
|
||||
if ! declare -f cleanup_labels >/dev/null 2>&1; then
|
||||
cleanup_labels() { :; }
|
||||
fi
|
||||
|
||||
# --- Default globals (agents can override after sourcing) ---
|
||||
: "${CI_POLL_TIMEOUT:=1800}"
|
||||
: "${REVIEW_POLL_TIMEOUT:=10800}"
|
||||
: "${MAX_CI_FIXES:=3}"
|
||||
: "${MAX_REVIEW_ROUNDS:=5}"
|
||||
: "${CI_RETRY_COUNT:=0}"
|
||||
: "${CI_FIX_COUNT:=0}"
|
||||
: "${REVIEW_ROUND:=0}"
|
||||
: "${PR_NUMBER:=}"
|
||||
: "${CLAIMED:=false}"
|
||||
: "${PHASE_POLL_INTERVAL:=30}"
|
||||
|
||||
# --- Post diagnostic comment + label issue as blocked ---
|
||||
# Captures tmux pane output, posts a structured comment on the issue, removes
|
||||
# in-progress label, and adds the "blocked" label.
|
||||
#
|
||||
# Args: reason [session_name]
|
||||
# Uses globals: ISSUE, SESSION_NAME, PR_NUMBER, FORGE_TOKEN, API
|
||||
post_blocked_diagnostic() {
|
||||
local reason="$1"
|
||||
local session="${2:-${SESSION_NAME:-}}"
|
||||
|
||||
# Capture last 50 lines from tmux pane (before kill)
|
||||
local tmux_output=""
|
||||
if [ -n "$session" ] && tmux has-session -t "$session" 2>/dev/null; then
|
||||
tmux_output=$(tmux capture-pane -p -t "$session" -S -50 2>/dev/null || true)
|
||||
fi
|
||||
|
||||
# Redact any secrets from tmux output before posting to issue
|
||||
if [ -n "$tmux_output" ]; then
|
||||
tmux_output=$(redact_secrets "$tmux_output")
|
||||
fi
|
||||
|
||||
# Build diagnostic comment body
|
||||
local comment
|
||||
comment="### Session failure diagnostic
|
||||
|
||||
| Field | Value |
|
||||
|---|---|
|
||||
| Exit reason | \`${reason}\` |
|
||||
| Timestamp | \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\` |"
|
||||
[ -n "${PR_NUMBER:-}" ] && [ "${PR_NUMBER:-0}" != "0" ] && \
|
||||
comment="${comment}
|
||||
| PR | #${PR_NUMBER} |"
|
||||
|
||||
if [ -n "$tmux_output" ]; then
|
||||
comment="${comment}
|
||||
|
||||
<details><summary>Last 50 lines from tmux pane</summary>
|
||||
|
||||
\`\`\`
|
||||
${tmux_output}
|
||||
\`\`\`
|
||||
</details>"
|
||||
fi
|
||||
|
||||
# Post comment to issue
|
||||
curl -sf -X POST \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${API}/issues/${ISSUE}/comments" \
|
||||
-d "$(jq -nc --arg b "$comment" '{body:$b}')" >/dev/null 2>&1 || true
|
||||
|
||||
# Remove in-progress, add blocked
|
||||
cleanup_labels
|
||||
local blocked_id
|
||||
blocked_id=$(ensure_blocked_label_id)
|
||||
if [ -n "$blocked_id" ]; then
|
||||
curl -sf -X POST \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${API}/issues/${ISSUE}/labels" \
|
||||
-d "{\"labels\":[${blocked_id}]}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
CLAIMED=false
|
||||
_BLOCKED_POSTED=true
|
||||
}
|
||||
|
||||
# --- Build phase protocol prompt (shared across agents) ---
|
||||
# Generates the phase-signaling instructions for Claude prompts.
|
||||
# Args: phase_file summary_file branch [remote]
|
||||
# Output: The protocol text (stdout)
|
||||
build_phase_protocol_prompt() {
|
||||
local _pf="$1" _sf="$2" _br="$3" _remote="${4:-${FORGE_REMOTE:-origin}}"
|
||||
cat <<_PHASE_PROTOCOL_EOF_
|
||||
## Phase-Signaling Protocol (REQUIRED)
|
||||
|
||||
You are running in a persistent tmux session managed by an orchestrator.
|
||||
Communicate progress by writing to the phase file. The orchestrator watches
|
||||
this file and injects events (CI results, review feedback) back into this session.
|
||||
|
||||
### Key files
|
||||
\`\`\`
|
||||
PHASE_FILE="${_pf}"
|
||||
SUMMARY_FILE="${_sf}"
|
||||
\`\`\`
|
||||
|
||||
### Phase transitions — write these exactly:
|
||||
|
||||
**After committing and pushing your branch:**
|
||||
\`\`\`bash
|
||||
# Rebase on target branch before push to avoid merge conflicts
|
||||
git fetch ${_remote} ${PRIMARY_BRANCH} && git rebase ${_remote}/${PRIMARY_BRANCH}
|
||||
git push ${_remote} ${_br}
|
||||
# Write a short summary of what you implemented:
|
||||
printf '%s' "<your summary>" > "\${SUMMARY_FILE}"
|
||||
# Signal the orchestrator to create the PR and watch for CI:
|
||||
echo "PHASE:awaiting_ci" > "${_pf}"
|
||||
\`\`\`
|
||||
Then STOP and wait. The orchestrator will inject CI results.
|
||||
|
||||
**When you receive a "CI passed" injection:**
|
||||
\`\`\`bash
|
||||
echo "PHASE:awaiting_review" > "${_pf}"
|
||||
\`\`\`
|
||||
Then STOP and wait. The orchestrator will inject review feedback.
|
||||
|
||||
**When you receive a "CI failed:" injection:**
|
||||
Fix the CI issue, then rebase on target branch and push:
|
||||
\`\`\`bash
|
||||
git fetch ${_remote} ${PRIMARY_BRANCH} && git rebase ${_remote}/${PRIMARY_BRANCH}
|
||||
git push --force-with-lease ${_remote} ${_br}
|
||||
echo "PHASE:awaiting_ci" > "${_pf}"
|
||||
\`\`\`
|
||||
Then STOP and wait.
|
||||
|
||||
**When you receive a "Review: REQUEST_CHANGES" injection:**
|
||||
Address ALL review feedback, then rebase on target branch and push:
|
||||
\`\`\`bash
|
||||
git fetch ${_remote} ${PRIMARY_BRANCH} && git rebase ${_remote}/${PRIMARY_BRANCH}
|
||||
git push --force-with-lease ${_remote} ${_br}
|
||||
echo "PHASE:awaiting_ci" > "${_pf}"
|
||||
\`\`\`
|
||||
(CI runs again after each push — always write awaiting_ci, not awaiting_review)
|
||||
|
||||
**When you need human help (CI exhausted, merge blocked, stuck on a decision):**
|
||||
\`\`\`bash
|
||||
printf 'PHASE:escalate\nReason: %s\n' "describe what you need" > "${_pf}"
|
||||
\`\`\`
|
||||
Then STOP and wait. A human will review and respond via the forge.
|
||||
|
||||
**On unrecoverable failure:**
|
||||
\`\`\`bash
|
||||
printf 'PHASE:failed\nReason: %s\n' "describe what failed" > "${_pf}"
|
||||
\`\`\`
|
||||
_PHASE_PROTOCOL_EOF_
|
||||
}
|
||||
|
||||
# --- Merge helper ---
|
||||
# do_merge — attempt to merge PR via forge API.
|
||||
# Args: pr_num
|
||||
# Returns:
|
||||
# 0 = merged successfully
|
||||
# 1 = other failure (conflict, network error, etc.)
|
||||
# 2 = not enough approvals (HTTP 405) — PHASE:escalate already written
|
||||
do_merge() {
|
||||
local pr_num="$1"
|
||||
local merge_response merge_http_code merge_body
|
||||
merge_response=$(curl -s -w "\n%{http_code}" -X POST \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H 'Content-Type: application/json' \
|
||||
"${API}/pulls/${pr_num}/merge" \
|
||||
-d '{"Do":"merge","delete_branch_after_merge":true}') || true
|
||||
merge_http_code=$(echo "$merge_response" | tail -1)
|
||||
merge_body=$(echo "$merge_response" | sed '$d')
|
||||
|
||||
if [ "$merge_http_code" = "200" ] || [ "$merge_http_code" = "204" ]; then
|
||||
log "do_merge: PR #${pr_num} merged (HTTP ${merge_http_code})"
|
||||
return 0
|
||||
fi
|
||||
|
||||
# HTTP 405 — could be "merge requirements not met" OR "already merged" (race with dev-poll).
|
||||
# Before escalating, check whether the PR was already merged by another agent.
|
||||
if [ "$merge_http_code" = "405" ]; then
|
||||
local pr_state
|
||||
pr_state=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/pulls/${pr_num}" | jq -r '.merged // false') || pr_state="false"
|
||||
if [ "$pr_state" = "true" ]; then
|
||||
log "do_merge: PR #${pr_num} already merged (detected after HTTP 405) — treating as success"
|
||||
return 0
|
||||
fi
|
||||
log "do_merge: PR #${pr_num} blocked — merge requirements not met (HTTP 405): ${merge_body:0:200}"
|
||||
printf 'PHASE:escalate\nReason: %s\n' \
|
||||
"PR #${pr_num} merge blocked — merge requirements not met (HTTP 405): ${merge_body:0:200}" \
|
||||
> "$PHASE_FILE"
|
||||
return 2
|
||||
fi
|
||||
|
||||
log "do_merge: PR #${pr_num} merge failed (HTTP ${merge_http_code}): ${merge_body:0:200}"
|
||||
return 1
|
||||
}
|
||||
|
||||
# --- Refusal comment helper ---
|
||||
post_refusal_comment() {
|
||||
local emoji="$1" title="$2" body="$3"
|
||||
local last_has_title
|
||||
last_has_title=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/issues/${ISSUE}/comments?limit=5" | \
|
||||
jq -r --arg t "Dev-agent: ${title}" '[.[] | .body // ""] | any(contains($t)) | tostring') || true
|
||||
if [ "$last_has_title" = "true" ]; then
|
||||
log "skipping duplicate refusal comment: ${title}"
|
||||
return 0
|
||||
fi
|
||||
local comment
|
||||
comment="${emoji} **Dev-agent: ${title}**
|
||||
|
||||
${body}
|
||||
|
||||
---
|
||||
*Automated assessment by dev-agent · $(date -u '+%Y-%m-%d %H:%M UTC')*"
|
||||
printf '%s' "$comment" > "/tmp/refusal-comment.txt"
|
||||
jq -Rs '{body: .}' < "/tmp/refusal-comment.txt" > "/tmp/refusal-comment.json"
|
||||
curl -sf -o /dev/null -X POST \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${API}/issues/${ISSUE}/comments" \
|
||||
--data-binary @"/tmp/refusal-comment.json" 2>/dev/null || \
|
||||
log "WARNING: failed to post refusal comment"
|
||||
rm -f "/tmp/refusal-comment.txt" "/tmp/refusal-comment.json"
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE DISPATCH CALLBACK
|
||||
# =============================================================================
|
||||
|
||||
# _on_phase_change — Phase dispatch callback for monitor_phase_loop
|
||||
# Receives the current phase as $1.
|
||||
# Returns 0 to continue the loop, 1 to break (terminal phase reached).
|
||||
_on_phase_change() {
|
||||
local phase="$1"
|
||||
|
||||
# ── PHASE: awaiting_ci ──────────────────────────────────────────────────────
|
||||
if [ "$phase" = "PHASE:awaiting_ci" ]; then
|
||||
# Release session lock — Claude is idle during CI polling (#724)
|
||||
session_lock_release
|
||||
|
||||
# Create PR if not yet created
|
||||
if [ -z "${PR_NUMBER:-}" ]; then
|
||||
status "creating PR for issue #${ISSUE}"
|
||||
IMPL_SUMMARY=""
|
||||
if [ -f "$IMPL_SUMMARY_FILE" ]; then
|
||||
# Don't treat refusal JSON as a PR summary
|
||||
if ! jq -e '.status' < "$IMPL_SUMMARY_FILE" >/dev/null 2>&1; then
|
||||
IMPL_SUMMARY=$(head -c 4000 "$IMPL_SUMMARY_FILE")
|
||||
fi
|
||||
fi
|
||||
|
||||
printf 'Fixes #%s\n\n## Changes\n%s' "$ISSUE" "$IMPL_SUMMARY" > "/tmp/pr-body-${ISSUE}.txt"
|
||||
jq -n \
|
||||
--arg title "fix: ${ISSUE_TITLE} (#${ISSUE})" \
|
||||
--rawfile body "/tmp/pr-body-${ISSUE}.txt" \
|
||||
--arg head "$BRANCH" \
|
||||
--arg base "${PRIMARY_BRANCH}" \
|
||||
'{title: $title, body: $body, head: $head, base: $base}' > "/tmp/pr-request-${ISSUE}.json"
|
||||
|
||||
PR_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${API}/pulls" \
|
||||
--data-binary @"/tmp/pr-request-${ISSUE}.json")
|
||||
|
||||
PR_HTTP_CODE=$(echo "$PR_RESPONSE" | tail -1)
|
||||
PR_RESPONSE_BODY=$(echo "$PR_RESPONSE" | sed '$d')
|
||||
rm -f "/tmp/pr-body-${ISSUE}.txt" "/tmp/pr-request-${ISSUE}.json"
|
||||
|
||||
if [ "$PR_HTTP_CODE" = "201" ] || [ "$PR_HTTP_CODE" = "200" ]; then
|
||||
PR_NUMBER=$(echo "$PR_RESPONSE_BODY" | jq -r '.number')
|
||||
log "created PR #${PR_NUMBER}"
|
||||
elif [ "$PR_HTTP_CODE" = "409" ]; then
|
||||
# PR already exists (race condition) — find it
|
||||
FOUND_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/pulls?state=open&limit=20" | \
|
||||
jq -r --arg branch "$BRANCH" \
|
||||
'.[] | select(.head.ref == $branch) | .number' | head -1) || true
|
||||
if [ -n "$FOUND_PR" ]; then
|
||||
PR_NUMBER="$FOUND_PR"
|
||||
log "PR already exists: #${PR_NUMBER}"
|
||||
else
|
||||
log "ERROR: PR creation got 409 but no existing PR found"
|
||||
agent_inject_into_session "$SESSION_NAME" "ERROR: Could not create PR (HTTP 409, no existing PR found). Check the forge API. Retry by writing PHASE:awaiting_ci again after verifying the branch was pushed."
|
||||
return 0
|
||||
fi
|
||||
else
|
||||
log "ERROR: PR creation failed (HTTP ${PR_HTTP_CODE})"
|
||||
agent_inject_into_session "$SESSION_NAME" "ERROR: Could not create PR (HTTP ${PR_HTTP_CODE}). Check branch was pushed: git push ${FORGE_REMOTE:-origin} ${BRANCH}. Then write PHASE:awaiting_ci again."
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
# No CI configured? Treat as success immediately
|
||||
if [ "${WOODPECKER_REPO_ID:-2}" = "0" ]; then
|
||||
log "no CI configured — treating as passed"
|
||||
agent_inject_into_session "$SESSION_NAME" "CI passed on PR #${PR_NUMBER} (no CI configured for this project).
|
||||
Write PHASE:awaiting_review to the phase file, then stop and wait for review feedback."
|
||||
return 0
|
||||
fi
|
||||
|
||||
# Poll CI until done or timeout
|
||||
status "waiting for CI on PR #${PR_NUMBER}"
|
||||
CI_CURRENT_SHA=$(git -C "${WORKTREE}" rev-parse HEAD 2>/dev/null || \
|
||||
curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/pulls/${PR_NUMBER}" | jq -r '.head.sha')
|
||||
|
||||
CI_DONE=false
|
||||
CI_STATE="unknown"
|
||||
CI_POLL_ELAPSED=0
|
||||
while [ "$CI_POLL_ELAPSED" -lt "$CI_POLL_TIMEOUT" ]; do
|
||||
sleep 30
|
||||
CI_POLL_ELAPSED=$(( CI_POLL_ELAPSED + 30 ))
|
||||
|
||||
# Check session still alive during CI wait (exit_marker + tmux fallback)
|
||||
if [ -f "/tmp/claude-exited-${SESSION_NAME}.ts" ] || ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
|
||||
log "session died during CI wait"
|
||||
break
|
||||
fi
|
||||
|
||||
# Re-fetch HEAD — Claude may have pushed new commits since loop started
|
||||
CI_CURRENT_SHA=$(git -C "${WORKTREE}" rev-parse HEAD 2>/dev/null || echo "$CI_CURRENT_SHA")
|
||||
|
||||
CI_STATE=$(ci_commit_status "$CI_CURRENT_SHA")
|
||||
if [ "$CI_STATE" = "success" ] || [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then
|
||||
CI_DONE=true
|
||||
[ "$CI_STATE" = "success" ] && CI_FIX_COUNT=0
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if ! $CI_DONE; then
|
||||
log "TIMEOUT: CI didn't complete in ${CI_POLL_TIMEOUT}s"
|
||||
agent_inject_into_session "$SESSION_NAME" "CI TIMEOUT: CI did not complete within 30 minutes for PR #${PR_NUMBER} (SHA: ${CI_CURRENT_SHA:0:7}). This may be an infrastructure issue. Write PHASE:escalate if you cannot proceed."
|
||||
return 0
|
||||
fi
|
||||
|
||||
log "CI: ${CI_STATE}"
|
||||
|
||||
if [ "$CI_STATE" = "success" ]; then
|
||||
agent_inject_into_session "$SESSION_NAME" "CI passed on PR #${PR_NUMBER}.
|
||||
Write PHASE:awaiting_review to the phase file, then stop and wait for review feedback:
|
||||
echo \"PHASE:awaiting_review\" > \"${PHASE_FILE}\""
|
||||
else
|
||||
# Fetch CI error details
|
||||
PIPELINE_NUM=$(ci_pipeline_number "$CI_CURRENT_SHA")
|
||||
|
||||
FAILED_STEP=""
|
||||
FAILED_EXIT=""
|
||||
IS_INFRA=false
|
||||
if [ -n "$PIPELINE_NUM" ]; then
|
||||
FAILED_INFO=$(curl -sf \
|
||||
-H "Authorization: Bearer ${WOODPECKER_TOKEN}" \
|
||||
"${WOODPECKER_SERVER}/api/repos/${WOODPECKER_REPO_ID}/pipelines/${PIPELINE_NUM}" | \
|
||||
jq -r '.workflows[]?.children[]? | select(.state=="failure") | "\(.name)|\(.exit_code)"' | head -1 || true)
|
||||
FAILED_STEP=$(echo "$FAILED_INFO" | cut -d'|' -f1)
|
||||
FAILED_EXIT=$(echo "$FAILED_INFO" | cut -d'|' -f2)
|
||||
fi
|
||||
|
||||
log "CI failed: step=${FAILED_STEP:-unknown} exit=${FAILED_EXIT:-?}"
|
||||
|
||||
if [ -n "$FAILED_STEP" ] && is_infra_step "$FAILED_STEP" "${FAILED_EXIT:-0}" >/dev/null 2>&1; then
|
||||
IS_INFRA=true
|
||||
fi
|
||||
|
||||
if [ "$IS_INFRA" = true ] && [ "${CI_RETRY_COUNT:-0}" -lt 1 ]; then
|
||||
CI_RETRY_COUNT=$(( CI_RETRY_COUNT + 1 ))
|
||||
log "infra failure — retrigger CI (retry ${CI_RETRY_COUNT})"
|
||||
(cd "$WORKTREE" && git commit --allow-empty \
|
||||
-m "ci: retrigger after infra failure (#${ISSUE})" --no-verify 2>&1 | tail -1)
|
||||
# Rebase on target branch before push to avoid merge conflicts
|
||||
if ! (cd "$WORKTREE" && \
|
||||
git fetch "${FORGE_REMOTE:-origin}" "${PRIMARY_BRANCH}" 2>/dev/null && \
|
||||
git rebase "${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH}" 2>&1 | tail -5); then
|
||||
log "rebase conflict detected — aborting, agent must resolve"
|
||||
(cd "$WORKTREE" && git rebase --abort 2>/dev/null || git reset --hard HEAD 2>/dev/null) || true
|
||||
agent_inject_into_session "$SESSION_NAME" "REBASE CONFLICT: Cannot rebase onto ${PRIMARY_BRANCH} automatically.
|
||||
|
||||
Please resolve merge conflicts manually:
|
||||
1. Check conflict status: git status
|
||||
2. Resolve conflicts in the conflicted files
|
||||
3. Stage resolved files: git add <files>
|
||||
4. Continue rebase: git rebase --continue
|
||||
|
||||
If you cannot resolve conflicts, abort: git rebase --abort
|
||||
Then write PHASE:escalate with a reason."
|
||||
return 0
|
||||
fi
|
||||
# Rebase succeeded — push the result
|
||||
(cd "$WORKTREE" && git push --force-with-lease "${FORGE_REMOTE:-origin}" "$BRANCH" 2>&1 | tail -3)
|
||||
# Touch phase file so we recheck CI on the new SHA
|
||||
# Do NOT update LAST_PHASE_MTIME here — let the main loop detect the fresh mtime
|
||||
touch "$PHASE_FILE"
|
||||
CI_CURRENT_SHA=$(git -C "${WORKTREE}" rev-parse HEAD 2>/dev/null || true)
|
||||
return 0
|
||||
fi
|
||||
|
||||
CI_FIX_COUNT=$(( CI_FIX_COUNT + 1 ))
|
||||
_ci_pipeline_url="${WOODPECKER_SERVER}/repos/${WOODPECKER_REPO_ID}/pipeline/${PIPELINE_NUM:-0}"
|
||||
if [ "$CI_FIX_COUNT" -gt "$MAX_CI_FIXES" ]; then
|
||||
log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — escalating"
|
||||
printf 'PHASE:escalate\nReason: ci_exhausted after %d attempts (step: %s)\n' "$CI_FIX_COUNT" "${FAILED_STEP:-unknown}" > "$PHASE_FILE"
|
||||
# Do NOT update LAST_PHASE_MTIME here — let the main loop detect PHASE:escalate
|
||||
return 0
|
||||
fi
|
||||
|
||||
CI_ERROR_LOG=""
|
||||
if [ -n "$PIPELINE_NUM" ]; then
|
||||
CI_ERROR_LOG=$(bash "${FACTORY_ROOT}/lib/ci-debug.sh" failures "$PIPELINE_NUM" 2>/dev/null | tail -80 | head -c 8000 || echo "")
|
||||
fi
|
||||
|
||||
# Save CI result for crash recovery
|
||||
printf 'CI failed (attempt %d/%d)\nStep: %s\nExit: %s\n\n%s' \
|
||||
"$CI_FIX_COUNT" "$MAX_CI_FIXES" "${FAILED_STEP:-unknown}" "${FAILED_EXIT:-?}" "$CI_ERROR_LOG" \
|
||||
> "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt" 2>/dev/null || true
|
||||
|
||||
agent_inject_into_session "$SESSION_NAME" "CI failed on PR #${PR_NUMBER} (attempt ${CI_FIX_COUNT}/${MAX_CI_FIXES}).
|
||||
|
||||
Failed step: ${FAILED_STEP:-unknown} (exit code ${FAILED_EXIT:-?}, pipeline #${PIPELINE_NUM:-?})
|
||||
|
||||
CI debug tool:
|
||||
bash ${FACTORY_ROOT}/lib/ci-debug.sh failures ${PIPELINE_NUM:-0}
|
||||
bash ${FACTORY_ROOT}/lib/ci-debug.sh logs ${PIPELINE_NUM:-0} <step-name>
|
||||
|
||||
Error snippet:
|
||||
${CI_ERROR_LOG:-No logs available. Use ci-debug.sh to query the pipeline.}
|
||||
|
||||
Instructions:
|
||||
1. Run ci-debug.sh failures to get the full error output.
|
||||
2. Read the failing test file(s) — understand what the tests EXPECT.
|
||||
3. Fix the root cause — do NOT weaken tests.
|
||||
4. Rebase on target branch and push: git fetch ${FORGE_REMOTE:-origin} ${PRIMARY_BRANCH} && git rebase ${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH}
|
||||
git push --force-with-lease ${FORGE_REMOTE:-origin} ${BRANCH}
|
||||
5. Write: echo \"PHASE:awaiting_ci\" > \"${PHASE_FILE}\"
|
||||
6. Stop and wait."
|
||||
fi
|
||||
|
||||
# ── PHASE: awaiting_review ──────────────────────────────────────────────────
|
||||
elif [ "$phase" = "PHASE:awaiting_review" ]; then
|
||||
# Release session lock — Claude is idle during review wait (#724)
|
||||
session_lock_release
|
||||
status "waiting for review on PR #${PR_NUMBER:-?}"
|
||||
CI_FIX_COUNT=0 # Reset CI fix budget for this review cycle
|
||||
|
||||
if [ -z "${PR_NUMBER:-}" ]; then
|
||||
log "WARNING: awaiting_review but PR_NUMBER unknown — searching for PR"
|
||||
FOUND_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/pulls?state=open&limit=20" | \
|
||||
jq -r --arg branch "$BRANCH" \
|
||||
'.[] | select(.head.ref == $branch) | .number' | head -1) || true
|
||||
if [ -n "$FOUND_PR" ]; then
|
||||
PR_NUMBER="$FOUND_PR"
|
||||
log "found PR #${PR_NUMBER}"
|
||||
else
|
||||
agent_inject_into_session "$SESSION_NAME" "ERROR: Cannot find open PR for branch ${BRANCH}. Did you push? Verify with git status and git push ${FORGE_REMOTE:-origin} ${BRANCH}, then write PHASE:awaiting_ci."
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
REVIEW_POLL_ELAPSED=0
|
||||
REVIEW_FOUND=false
|
||||
while [ "$REVIEW_POLL_ELAPSED" -lt "$REVIEW_POLL_TIMEOUT" ]; do
|
||||
sleep 300 # 5 min between review checks
|
||||
REVIEW_POLL_ELAPSED=$(( REVIEW_POLL_ELAPSED + 300 ))
|
||||
|
||||
# Check session still alive (exit_marker + tmux fallback)
|
||||
if [ -f "/tmp/claude-exited-${SESSION_NAME}.ts" ] || ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
|
||||
log "session died during review wait"
|
||||
REVIEW_FOUND=false
|
||||
break
|
||||
fi
|
||||
|
||||
# Check if phase was updated while we wait (e.g., Claude reacted to something)
|
||||
NEW_MTIME=$(stat -c %Y "$PHASE_FILE" 2>/dev/null || echo 0)
|
||||
if [ "$NEW_MTIME" -gt "$LAST_PHASE_MTIME" ]; then
|
||||
log "phase file updated during review wait — re-entering main loop"
|
||||
# Do NOT update LAST_PHASE_MTIME here — leave it stale so the outer
|
||||
# loop detects the change on its next tick and dispatches the new phase.
|
||||
REVIEW_FOUND=true # Prevent timeout injection
|
||||
# Clean up review-poll sentinel if it exists (session already advanced)
|
||||
rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||
break
|
||||
fi
|
||||
|
||||
REVIEW_SHA=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/pulls/${PR_NUMBER}" | jq -r '.head.sha') || true
|
||||
REVIEW_COMMENT=$(forge_api_all "/issues/${PR_NUMBER}/comments" | \
|
||||
jq -r --arg sha "$REVIEW_SHA" \
|
||||
'[.[] | select(.body | contains("<!-- reviewed: " + $sha))] | last // empty') || true
|
||||
|
||||
if [ -n "$REVIEW_COMMENT" ] && [ "$REVIEW_COMMENT" != "null" ]; then
|
||||
REVIEW_TEXT=$(echo "$REVIEW_COMMENT" | jq -r '.body')
|
||||
|
||||
# Skip error reviews — they have no verdict
|
||||
if echo "$REVIEW_TEXT" | grep -q "review-error\|Review — Error"; then
|
||||
log "review was an error, waiting for re-review"
|
||||
continue
|
||||
fi
|
||||
|
||||
VERDICT=$(echo "$REVIEW_TEXT" | grep -oP '\*\*(APPROVE|REQUEST_CHANGES|DISCUSS)\*\*' | head -1 | tr -d '*' || true)
|
||||
log "review verdict: ${VERDICT:-unknown}"
|
||||
|
||||
# Also check formal forge reviews
|
||||
if [ -z "$VERDICT" ]; then
|
||||
VERDICT=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/pulls/${PR_NUMBER}/reviews" | \
|
||||
jq -r '[.[] | select(.stale == false)] | last | .state // empty' || true)
|
||||
if [ "$VERDICT" = "APPROVED" ]; then
|
||||
VERDICT="APPROVE"
|
||||
elif [ "$VERDICT" != "REQUEST_CHANGES" ]; then
|
||||
VERDICT=""
|
||||
fi
|
||||
[ -n "$VERDICT" ] && log "verdict from formal review: $VERDICT"
|
||||
fi
|
||||
|
||||
# Skip injection if review-poll.sh already injected (sentinel present).
|
||||
# Exception: APPROVE always falls through so do_merge() runs even when
|
||||
# review-poll injected first — prevents Claude writing PHASE:done on a
|
||||
# failed merge without the orchestrator detecting the error.
|
||||
REVIEW_SENTINEL="/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||
if [ -n "$VERDICT" ] && [ -f "$REVIEW_SENTINEL" ] && [ "$VERDICT" != "APPROVE" ]; then
|
||||
log "review already injected by review-poll (sentinel exists) — skipping"
|
||||
rm -f "$REVIEW_SENTINEL"
|
||||
REVIEW_FOUND=true
|
||||
break
|
||||
fi
|
||||
rm -f "$REVIEW_SENTINEL" # consume sentinel before APPROVE handling below
|
||||
|
||||
if [ "$VERDICT" = "APPROVE" ]; then
|
||||
REVIEW_FOUND=true
|
||||
_merge_rc=0; do_merge "$PR_NUMBER" || _merge_rc=$?
|
||||
if [ "$_merge_rc" -eq 0 ]; then
|
||||
# Merge succeeded — close issue and signal done
|
||||
curl -sf -X PATCH \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H 'Content-Type: application/json' \
|
||||
"${API}/issues/${ISSUE}" \
|
||||
-d '{"state":"closed"}' >/dev/null 2>&1 || true
|
||||
# Pull merged primary branch and push to mirrors
|
||||
git -C "$PROJECT_REPO_ROOT" fetch "${FORGE_REMOTE:-origin}" "$PRIMARY_BRANCH" 2>/dev/null || true
|
||||
git -C "$PROJECT_REPO_ROOT" checkout "$PRIMARY_BRANCH" 2>/dev/null || true
|
||||
git -C "$PROJECT_REPO_ROOT" pull --ff-only "${FORGE_REMOTE:-origin}" "$PRIMARY_BRANCH" 2>/dev/null || true
|
||||
mirror_push
|
||||
printf 'PHASE:done\n' > "$PHASE_FILE"
|
||||
elif [ "$_merge_rc" -ne 2 ]; then
|
||||
# Other merge failure (conflict, etc.) — delegate to Claude for rebase + retry
|
||||
agent_inject_into_session "$SESSION_NAME" "Approved! PR #${PR_NUMBER} has been approved, but the merge failed (likely conflicts).
|
||||
|
||||
Rebase onto ${PRIMARY_BRANCH} and push:
|
||||
git fetch ${FORGE_REMOTE:-origin} ${PRIMARY_BRANCH} && git rebase ${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH}
|
||||
git push --force-with-lease ${FORGE_REMOTE:-origin} ${BRANCH}
|
||||
echo \"PHASE:awaiting_ci\" > \"${PHASE_FILE}\"
|
||||
|
||||
Do NOT merge or close the issue — the orchestrator handles that after CI passes.
|
||||
If rebase repeatedly fails, write PHASE:escalate with a reason."
|
||||
fi
|
||||
# _merge_rc=2: PHASE:escalate already written by do_merge()
|
||||
break
|
||||
|
||||
elif [ "$VERDICT" = "REQUEST_CHANGES" ] || [ "$VERDICT" = "DISCUSS" ]; then
|
||||
REVIEW_ROUND=$(( REVIEW_ROUND + 1 ))
|
||||
if [ "$REVIEW_ROUND" -ge "$MAX_REVIEW_ROUNDS" ]; then
|
||||
log "hit max review rounds (${MAX_REVIEW_ROUNDS})"
|
||||
log "PR #${PR_NUMBER}: hit ${MAX_REVIEW_ROUNDS} review rounds, needs human attention"
|
||||
fi
|
||||
REVIEW_FOUND=true
|
||||
agent_inject_into_session "$SESSION_NAME" "Review feedback (round ${REVIEW_ROUND}) on PR #${PR_NUMBER}:
|
||||
|
||||
${REVIEW_TEXT}
|
||||
|
||||
Instructions:
|
||||
1. Address each piece of feedback carefully.
|
||||
2. Run lint and tests when done.
|
||||
3. Rebase on target branch and push: git fetch ${FORGE_REMOTE:-origin} ${PRIMARY_BRANCH} && git rebase ${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH}
|
||||
git push --force-with-lease ${FORGE_REMOTE:-origin} ${BRANCH}
|
||||
4. Write: echo \"PHASE:awaiting_ci\" > \"${PHASE_FILE}\"
|
||||
5. Stop and wait for the next CI result."
|
||||
log "review REQUEST_CHANGES received (round ${REVIEW_ROUND})"
|
||||
break
|
||||
|
||||
else
|
||||
# No verdict found in comment or formal review — keep waiting
|
||||
log "review comment found but no verdict, continuing to wait"
|
||||
continue
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check if PR was merged or closed externally
|
||||
PR_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/pulls/${PR_NUMBER}") || true
|
||||
PR_STATE=$(echo "$PR_JSON" | jq -r '.state // "unknown"')
|
||||
PR_MERGED=$(echo "$PR_JSON" | jq -r '.merged // false')
|
||||
if [ "$PR_STATE" != "open" ]; then
|
||||
if [ "$PR_MERGED" = "true" ]; then
|
||||
log "PR #${PR_NUMBER} was merged externally"
|
||||
curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${API}/issues/${ISSUE}" -d '{"state":"closed"}' >/dev/null 2>&1 || true
|
||||
cleanup_labels
|
||||
agent_kill_session "$SESSION_NAME"
|
||||
cleanup_worktree
|
||||
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}"
|
||||
exit 0
|
||||
else
|
||||
log "PR #${PR_NUMBER} was closed WITHOUT merge — NOT closing issue"
|
||||
cleanup_labels
|
||||
agent_kill_session "$SESSION_NAME"
|
||||
cleanup_worktree
|
||||
exit 0
|
||||
fi
|
||||
fi
|
||||
|
||||
log "waiting for review on PR #${PR_NUMBER} (${REVIEW_POLL_ELAPSED}s elapsed)"
|
||||
done
|
||||
|
||||
if ! $REVIEW_FOUND && [ "$REVIEW_POLL_ELAPSED" -ge "$REVIEW_POLL_TIMEOUT" ]; then
|
||||
log "TIMEOUT: no review after 3h"
|
||||
agent_inject_into_session "$SESSION_NAME" "TIMEOUT: No review received after 3 hours for PR #${PR_NUMBER}. Write PHASE:escalate to escalate to a human reviewer."
|
||||
fi
|
||||
|
||||
# ── PHASE: escalate ──────────────────────────────────────────────────────
|
||||
elif [ "$phase" = "PHASE:escalate" ]; then
|
||||
status "escalated — waiting for human input on issue #${ISSUE}"
|
||||
ESCALATE_REASON=$(sed -n '2p' "$PHASE_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
|
||||
log "phase: escalate — reason: ${ESCALATE_REASON:-none}"
|
||||
# Session stays alive — human input arrives via vault/forge
|
||||
|
||||
# ── PHASE: done ─────────────────────────────────────────────────────────────
|
||||
# PR merged and issue closed (by orchestrator or Claude). Just clean up local state.
|
||||
elif [ "$phase" = "PHASE:done" ]; then
|
||||
if [ -n "${PR_NUMBER:-}" ]; then
|
||||
status "phase done — PR #${PR_NUMBER} merged, cleaning up"
|
||||
else
|
||||
status "phase done — issue #${ISSUE} complete, cleaning up"
|
||||
fi
|
||||
|
||||
# Belt-and-suspenders: ensure in-progress label removed (idempotent)
|
||||
cleanup_labels
|
||||
|
||||
# Local cleanup
|
||||
agent_kill_session "$SESSION_NAME"
|
||||
cleanup_worktree
|
||||
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}" \
|
||||
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
||||
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||
CLAIMED=false # Don't unclaim again in cleanup()
|
||||
|
||||
# ── PHASE: failed ───────────────────────────────────────────────────────────
|
||||
elif [ "$phase" = "PHASE:failed" ]; then
|
||||
if [[ -f "$PHASE_FILE" ]]; then
|
||||
FAILURE_REASON=$(sed -n '2p' "$PHASE_FILE" | sed 's/^Reason: //')
|
||||
fi
|
||||
FAILURE_REASON="${FAILURE_REASON:-unspecified}"
|
||||
log "phase: failed — reason: ${FAILURE_REASON}"
|
||||
# Gitea labels API requires []int64 — look up the "backlog" label ID once
|
||||
BACKLOG_LABEL_ID=$(forge_api GET "/labels" 2>/dev/null \
|
||||
| jq -r '.[] | select(.name == "backlog") | .id' 2>/dev/null || true)
|
||||
BACKLOG_LABEL_ID="${BACKLOG_LABEL_ID:-1300815}"
|
||||
UNDERSPECIFIED_LABEL_ID=$(forge_api GET "/labels" 2>/dev/null \
|
||||
| jq -r '.[] | select(.name == "underspecified") | .id' 2>/dev/null || true)
|
||||
UNDERSPECIFIED_LABEL_ID="${UNDERSPECIFIED_LABEL_ID:-1300816}"
|
||||
|
||||
# Check if this is a refusal (Claude wrote refusal JSON to IMPL_SUMMARY_FILE)
|
||||
REFUSAL_JSON=""
|
||||
if [ -f "$IMPL_SUMMARY_FILE" ] && jq -e '.status' < "$IMPL_SUMMARY_FILE" >/dev/null 2>&1; then
|
||||
REFUSAL_JSON=$(cat "$IMPL_SUMMARY_FILE")
|
||||
fi
|
||||
|
||||
if [ -n "$REFUSAL_JSON" ] && [ "$FAILURE_REASON" = "refused" ]; then
|
||||
REFUSAL_STATUS=$(printf '%s' "$REFUSAL_JSON" | jq -r '.status')
|
||||
log "claude refused: ${REFUSAL_STATUS}"
|
||||
|
||||
# Write preflight result for dev-poll.sh
|
||||
printf '%s' "$REFUSAL_JSON" > "$PREFLIGHT_RESULT"
|
||||
|
||||
# Unclaim issue (restore backlog label, remove in-progress)
|
||||
cleanup_labels
|
||||
curl -sf -X POST \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${API}/issues/${ISSUE}/labels" \
|
||||
-d "{\"labels\":[${BACKLOG_LABEL_ID}]}" >/dev/null 2>&1 || true
|
||||
|
||||
case "$REFUSAL_STATUS" in
|
||||
unmet_dependency)
|
||||
BLOCKED_BY_MSG=$(printf '%s' "$REFUSAL_JSON" | jq -r '.blocked_by // "unknown"')
|
||||
SUGGESTION=$(printf '%s' "$REFUSAL_JSON" | jq -r '.suggestion // empty')
|
||||
COMMENT_BODY="### Blocked by unmet dependency
|
||||
|
||||
${BLOCKED_BY_MSG}"
|
||||
if [ -n "$SUGGESTION" ] && [ "$SUGGESTION" != "null" ]; then
|
||||
COMMENT_BODY="${COMMENT_BODY}
|
||||
|
||||
**Suggestion:** Work on #${SUGGESTION} first."
|
||||
fi
|
||||
post_refusal_comment "🚧" "Unmet dependency" "$COMMENT_BODY"
|
||||
;;
|
||||
too_large)
|
||||
REASON=$(printf '%s' "$REFUSAL_JSON" | jq -r '.reason // "unspecified"')
|
||||
post_refusal_comment "📏" "Too large for single session" "### Why this can't be implemented as-is
|
||||
|
||||
${REASON}
|
||||
|
||||
### Next steps
|
||||
A maintainer should split this issue or add more detail to the spec."
|
||||
curl -sf -X POST \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${API}/issues/${ISSUE}/labels" \
|
||||
-d "{\"labels\":[${UNDERSPECIFIED_LABEL_ID}]}" >/dev/null 2>&1 || true
|
||||
curl -sf -X DELETE \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
"${API}/issues/${ISSUE}/labels/${BACKLOG_LABEL_ID}" >/dev/null 2>&1 || true
|
||||
;;
|
||||
already_done)
|
||||
REASON=$(printf '%s' "$REFUSAL_JSON" | jq -r '.reason // "unspecified"')
|
||||
post_refusal_comment "✅" "Already implemented" "### Existing implementation
|
||||
|
||||
${REASON}
|
||||
|
||||
Closing as already implemented."
|
||||
curl -sf -X PATCH \
|
||||
-H "Authorization: token ${FORGE_TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
"${API}/issues/${ISSUE}" \
|
||||
-d '{"state":"closed"}' >/dev/null 2>&1 || true
|
||||
;;
|
||||
*)
|
||||
post_refusal_comment "❓" "Unable to proceed" "The dev-agent could not process this issue.
|
||||
|
||||
Raw response:
|
||||
\`\`\`json
|
||||
$(printf '%s' "$REFUSAL_JSON" | head -c 2000)
|
||||
\`\`\`"
|
||||
;;
|
||||
esac
|
||||
|
||||
CLAIMED=false # Don't unclaim again in cleanup()
|
||||
agent_kill_session "$SESSION_NAME"
|
||||
cleanup_worktree
|
||||
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}" \
|
||||
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
||||
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||
return 1
|
||||
|
||||
else
|
||||
# Genuine unrecoverable failure — label blocked with diagnostic
|
||||
log "session failed: ${FAILURE_REASON}"
|
||||
post_blocked_diagnostic "$FAILURE_REASON"
|
||||
|
||||
agent_kill_session "$SESSION_NAME"
|
||||
if [ -n "${PR_NUMBER:-}" ]; then
|
||||
log "keeping worktree (PR #${PR_NUMBER} still open)"
|
||||
else
|
||||
cleanup_worktree
|
||||
fi
|
||||
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}" \
|
||||
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
||||
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# ── PHASE: crashed ──────────────────────────────────────────────────────────
|
||||
# Session died unexpectedly (OOM kill, tmux crash, etc.). Label blocked with
|
||||
# diagnostic comment so humans can triage directly on the issue.
|
||||
elif [ "$phase" = "PHASE:crashed" ]; then
|
||||
log "session crashed for issue #${ISSUE}"
|
||||
post_blocked_diagnostic "crashed"
|
||||
log "PRESERVED crashed worktree for debugging: $WORKTREE"
|
||||
rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}" \
|
||||
"/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
|
||||
[ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
|
||||
|
||||
else
|
||||
log "WARNING: unknown phase value: ${phase}"
|
||||
fi
|
||||
}
|
||||
|
|
@ -8,8 +8,13 @@
|
|||
|
||||
set -euo pipefail
|
||||
|
||||
# Source canonical read_phase() from shared library
|
||||
source "$(dirname "$0")/../lib/agent-session.sh"
|
||||
# Inline read_phase() function (previously from lib/agent-session.sh)
|
||||
# Read the current phase from a phase file, stripped of whitespace.
|
||||
# Usage: read_phase [file] — defaults to $PHASE_FILE
|
||||
read_phase() {
|
||||
local file="${1:-${PHASE_FILE:-}}"
|
||||
{ cat "$file" 2>/dev/null || true; } | head -1 | tr -d '[:space:]'
|
||||
}
|
||||
|
||||
PROJECT="testproject"
|
||||
ISSUE="999"
|
||||
|
|
@ -84,7 +89,7 @@ else
|
|||
fail "PHASE:failed format: first='$first_line' second='$second_line'"
|
||||
fi
|
||||
|
||||
# ── Test 5: orchestrator read function (canonical read_phase from lib/agent-session.sh)
|
||||
# ── Test 5: orchestrator read function (inline read_phase)
|
||||
echo "PHASE:awaiting_ci" > "$PHASE_FILE"
|
||||
phase=$(read_phase "$PHASE_FILE")
|
||||
if [ "$phase" = "PHASE:awaiting_ci" ]; then
|
||||
|
|
|
|||
|
|
@ -92,10 +92,9 @@ PHASE:failed → label issue blocked, post diagnostic comment
|
|||
|
||||
### `idle_prompt` exit reason
|
||||
|
||||
`monitor_phase_loop` (in `lib/agent-session.sh`) can exit with
|
||||
`_MONITOR_LOOP_EXIT=idle_prompt`. This happens when Claude returns to the
|
||||
interactive prompt (`❯`) for **3 consecutive polls** without writing any phase
|
||||
signal to the phase file.
|
||||
The phase monitor can exit with `_MONITOR_LOOP_EXIT=idle_prompt`. This happens
|
||||
when Claude returns to the interactive prompt (`❯`) for **3 consecutive polls**
|
||||
without writing any phase signal to the phase file.
|
||||
|
||||
**Trigger conditions:**
|
||||
- The phase file is empty (no phase has ever been written), **and**
|
||||
|
|
@ -111,14 +110,13 @@ signal to the phase file.
|
|||
callback without the phase file actually containing that value.
|
||||
|
||||
**Agent requirements:**
|
||||
- **Callback (`_on_phase_change` / `formula_phase_callback`):** Must handle
|
||||
`PHASE:failed` defensively — the session is already dead, so any tmux
|
||||
send-keys or session-dependent logic must be skipped or guarded.
|
||||
- **Callback:** Must handle `PHASE:failed` defensively — the session is already
|
||||
dead, so any tmux send-keys or session-dependent logic must be skipped or
|
||||
guarded.
|
||||
- **Post-loop exit handler (`case $_MONITOR_LOOP_EXIT`):** Must include an
|
||||
`idle_prompt)` branch. Typical actions: log the event, clean up temp files,
|
||||
and (for agents that use escalation) write an escalation entry or notify via
|
||||
vault/forge. See `dev/dev-agent.sh` and
|
||||
`gardener/gardener-agent.sh` for reference implementations.
|
||||
vault/forge. See `dev/dev-agent.sh` for reference implementations.
|
||||
|
||||
## Crash Recovery
|
||||
|
||||
|
|
|
|||
|
|
@ -11,17 +11,16 @@ sourced as needed.
|
|||
| `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) |
|
||||
| `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `FORGE_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, mirror config, etc.). Also exports `FORGE_REPO_OWNER` (the owner component of `FORGE_REPO`, e.g. `disinto-admin` from `disinto-admin/disinto`). | env.sh (when `PROJECT_TOML` is set), supervisor-poll (per-project iteration) |
|
||||
| `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` / `blocked by #N` patterns. Inline scan skips fenced code blocks to prevent false positives from code examples in issue bodies. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll, supervisor-poll |
|
||||
| `lib/formula-session.sh` | `acquire_cron_lock()`, `check_memory()`, `load_formula()`, `build_context_block()`, `consume_escalation_reply()`, `start_formula_session()`, `formula_phase_callback()`, `build_prompt_footer()`, `build_graph_section()`, `run_formula_and_monitor(AGENT [TIMEOUT] [CALLBACK])` — shared helpers for formula-driven cron agents (lock, memory guard, formula loading, prompt assembly, tmux session, monitor loop, crash recovery). `build_graph_section()` generates the structural-analysis section (runs `lib/build-graph.py`, formats JSON output) — previously duplicated in planner-run.sh and predictor-run.sh, now shared here. `formula_phase_callback()` handles `PHASE:escalate` (unified escalation path — kills the session). `run_formula_and_monitor` accepts an optional CALLBACK (default: `formula_phase_callback`) so callers can install custom merge-through or escalation handlers. `cleanup_stale_crashed_worktrees()` — thin wrapper around `worktree_cleanup_stale()` from `lib/worktree.sh` (kept for backwards compatibility). | planner-run.sh, predictor-run.sh, gardener-run.sh, supervisor-run.sh, dev-agent.sh |
|
||||
| `lib/formula-session.sh` | `acquire_cron_lock()`, `check_memory()`, `load_formula()`, `load_formula_or_profile()`, `build_context_block()`, `ensure_ops_repo()`, `ops_commit_and_push()`, `build_prompt_footer()`, `build_sdk_prompt_footer()`, `formula_worktree_setup()`, `formula_prepare_profile_context()`, `formula_lessons_block()`, `profile_write_journal()`, `profile_load_lessons()`, `ensure_profile_repo()`, `_profile_has_repo()`, `_count_undigested_journals()`, `_profile_digest_journals()`, `_profile_commit_and_push()`, `resolve_agent_identity()`, `build_graph_section()`, `build_scratch_instruction()`, `read_scratch_context()`, `cleanup_stale_crashed_worktrees()` — shared helpers for formula-driven cron agents (lock, memory guard, formula loading, .profile repo management, prompt assembly, worktree setup). `build_graph_section()` generates the structural-analysis section (runs `lib/build-graph.py`, formats JSON output) — previously duplicated in planner-run.sh and predictor-run.sh, now shared here. `cleanup_stale_crashed_worktrees()` — thin wrapper around `worktree_cleanup_stale()` from `lib/worktree.sh` (kept for backwards compatibility). | planner-run.sh, predictor-run.sh, gardener-run.sh, supervisor-run.sh, dev-agent.sh |
|
||||
| `lib/guard.sh` | `check_active(agent_name)` — reads `$FACTORY_ROOT/state/.{agent_name}-active`; exits 0 (skip) if the file is absent. Factory is off by default — state files must be created to enable each agent. **Logs a message to stderr** when skipping (`[check_active] SKIP: state file not found`), so agent dropout is visible in cron logs. Sourced by dev-poll.sh, review-poll.sh, predictor-run.sh, supervisor-run.sh. | cron entry points |
|
||||
| `lib/mirrors.sh` | `mirror_push()` — pushes `$PRIMARY_BRANCH` + tags to all configured mirror remotes (fire-and-forget background pushes). Reads `MIRROR_NAMES` and `MIRROR_*` vars exported by `load-project.sh` from the `[mirrors]` TOML section. Failures are logged but never block the pipeline. Sourced by dev-poll.sh and dev/phase-handler.sh — called after every successful merge. | dev-poll.sh, phase-handler.sh |
|
||||
| `lib/mirrors.sh` | `mirror_push()` — pushes `$PRIMARY_BRANCH` + tags to all configured mirror remotes (fire-and-forget background pushes). Reads `MIRROR_NAMES` and `MIRROR_*` vars exported by `load-project.sh` from the `[mirrors]` TOML section. Failures are logged but never block the pipeline. Sourced by dev-poll.sh — called after every successful merge. | dev-poll.sh |
|
||||
| `lib/build-graph.py` | Python tool: parses VISION.md, prerequisites.md (from ops repo), AGENTS.md, formulas/*.toml, evidence/ (from ops repo), and forge issues/labels into a NetworkX DiGraph. Runs structural analyses (orphaned objectives, stale prerequisites, thin evidence, circular deps) and outputs a JSON report. Used by `review-pr.sh` (per-PR changed-file analysis) and `predictor-run.sh` (full-project analysis) to provide structural context to Claude. | review-pr.sh, predictor-run.sh |
|
||||
| `lib/secret-scan.sh` | `scan_for_secrets()` — detects potential secrets (API keys, bearer tokens, private keys, URLs with embedded credentials) in text; returns 1 if secrets found. `redact_secrets()` — replaces detected secret patterns with `[REDACTED]`. | file-action-issue.sh, phase-handler.sh |
|
||||
| `lib/secret-scan.sh` | `scan_for_secrets()` — detects potential secrets (API keys, bearer tokens, private keys, URLs with embedded credentials) in text; returns 1 if secrets found. `redact_secrets()` — replaces detected secret patterns with `[REDACTED]`. | file-action-issue.sh |
|
||||
| `lib/file-action-issue.sh` | `file_action_issue()` — dedup check, secret scan, label lookup, and issue creation for formula-driven cron wrappers. Sets `FILED_ISSUE_NUM` on success. Returns 4 if secrets detected in body. | (available for future use) |
|
||||
| `lib/tea-helpers.sh` | `tea_file_issue(title, body, labels...)` — create issue via tea CLI with secret scanning; sets `FILED_ISSUE_NUM`. `tea_relabel(issue_num, labels...)` — replace labels using tea's `edit` subcommand (not `label`). `tea_comment(issue_num, body)` — add comment with secret scanning. `tea_close(issue_num)` — close issue. All use `TEA_LOGIN` and `FORGE_REPO` from env.sh. Labels by name (no ID lookup). Tea binary download verified via sha256 checksum. Sourced by env.sh when `tea` binary is available. | env.sh (conditional) |
|
||||
| `lib/worktree.sh` | Reusable git worktree management: `worktree_create(path, branch, [base_ref])` — create worktree, checkout base, fetch submodules. `worktree_recover(path, branch, [remote])` — detect existing worktree, reuse if on correct branch (sets `_WORKTREE_REUSED`), otherwise clean and recreate. `worktree_cleanup(path)` — `git worktree remove --force`, clear Claude Code project cache (`~/.claude/projects/` matching path). `worktree_cleanup_stale([max_age_hours])` — scan `/tmp` for orphaned worktrees older than threshold, skip preserved and active tmux worktrees, prune. `worktree_preserve(path, reason)` — mark worktree as preserved for debugging (writes `.worktree-preserved` marker, skipped by stale cleanup). | dev-agent.sh, supervisor-run.sh, planner-run.sh, predictor-run.sh, gardener-run.sh |
|
||||
| `lib/pr-lifecycle.sh` | Reusable PR lifecycle library: `pr_create()`, `pr_find_by_branch()`, `pr_poll_ci()`, `pr_poll_review()`, `pr_merge()`, `pr_is_merged()`, `pr_walk_to_merge()`, `build_phase_protocol_prompt()`. Requires `lib/ci-helpers.sh`. | dev-agent.sh (future) |
|
||||
| `lib/issue-lifecycle.sh` | Reusable issue lifecycle library: `issue_claim()` (add in-progress, remove backlog), `issue_release()` (remove in-progress, add backlog), `issue_block()` (post diagnostic comment with secret redaction, add blocked label), `issue_close()`, `issue_check_deps()` (parse deps, check transitive closure; sets `_ISSUE_BLOCKED_BY`, `_ISSUE_SUGGESTION`), `issue_suggest_next()` (find next unblocked backlog issue; sets `_ISSUE_NEXT`), `issue_post_refusal()` (structured refusal comment with dedup). Label IDs cached in globals on first lookup. Sources `lib/secret-scan.sh`. | dev-agent.sh (future) |
|
||||
| `lib/agent-session.sh` | Shared tmux + Claude session helpers: `create_agent_session()`, `inject_formula()`, `agent_wait_for_claude_ready()`, `agent_inject_into_session()`, `agent_kill_session()`, `monitor_phase_loop()`, `read_phase()`, `write_compact_context()`. `create_agent_session(session, workdir, [phase_file])` optionally installs a PostToolUse hook (matcher `Bash\|Write`) that detects phase file writes in real-time — when Claude writes to the phase file, the hook writes a marker so `monitor_phase_loop` reacts on the next poll instead of waiting for mtime changes. Also installs a StopFailure hook (matcher `rate_limit\|server_error\|authentication_failed\|billing_error`) that writes `PHASE:failed` with an `api_error` reason to the phase file and touches the phase-changed marker, so the orchestrator discovers API errors within one poll cycle instead of waiting for idle timeout. Also installs a SessionStart hook (matcher `compact`) that re-injects phase protocol instructions after context compaction — callers write the context file via `write_compact_context(phase_file, content)`, and the hook (`on-compact-reinject.sh`) outputs the file content to stdout so Claude retains critical instructions. When `phase_file` is set, passes it to the idle stop hook (`on-idle-stop.sh`) so the hook can **nudge Claude** (up to 2 times) if Claude returns to the prompt without writing to the phase file — the hook injects a tmux reminder asking Claude to signal PHASE:done or PHASE:awaiting_ci. The PreToolUse guard hook (`on-pretooluse-guard.sh`) receives the session name as a third argument — formula agents (`gardener-*`, `planner-*`, `predictor-*`, `supervisor-*`) are identified this way and allowed to access `FACTORY_ROOT` from worktrees (they need env.sh, AGENTS.md, formulas/, lib/). **OAuth flock**: when `DISINTO_CONTAINER=1`, Claude CLI is wrapped in `flock -w 300 ~/.claude/session.lock` to queue concurrent token refresh attempts and prevent rotation races across agents sharing the same credentials. `monitor_phase_loop` sets `_MONITOR_LOOP_EXIT` to one of: `done`, `idle_timeout`, `idle_prompt` (Claude returned to `>` for 3 consecutive polls without writing any phase — callback invoked with `PHASE:failed`, session already dead), `crashed`, or `PHASE:escalate` / other `PHASE:*` string. **Unified escalation**: `PHASE:escalate` is the signal that a session needs human input (renamed from `PHASE:needs_human`). **Callers must handle `idle_prompt`** in both their callback and their post-loop exit handler — see [`docs/PHASE-PROTOCOL.md` idle_prompt](docs/PHASE-PROTOCOL.md#idle_prompt-exit-reason) for the full contract. | dev-agent.sh |
|
||||
| `lib/vault.sh` | **Vault PR helper** — create vault action PRs on ops repo via Forgejo API (works from containers without SSH). `vault_request <action_id> <toml_content>` validates TOML (using `validate_vault_action` from `vault/vault-env.sh`), creates branch `vault/<action-id>`, writes `vault/actions/<action-id>.toml`, creates PR targeting `main` with title `vault: <action-id>` and body from context field, returns PR number. Idempotent: if PR exists, returns existing number. Requires `FORGE_TOKEN`, `FORGE_URL`, `FORGE_REPO`, `FORGE_OPS_REPO`. Uses the calling agent's own token (saves/restores `FORGE_TOKEN` around sourcing `vault-env.sh`), so approval workflow respects individual agent identities. | dev-agent (vault actions), future vault dispatcher |
|
||||
| `lib/branch-protection.sh` | Branch protection helpers for Forgejo repos. `setup_vault_branch_protection()` — configures admin-only merge protection on main (require 1 approval, restrict merge to admin role, block direct pushes). `setup_profile_branch_protection()` — same protection for `.profile` repos. `verify_branch_protection()` — checks protection is correctly configured. `remove_branch_protection()` — removes protection (cleanup/testing). Handles race condition after initial push: retries with backoff if Forgejo hasn't processed the branch yet. Requires `FORGE_TOKEN`, `FORGE_URL`, `FORGE_OPS_REPO`. | bin/disinto (hire-an-agent) |
|
||||
| `lib/agent-sdk.sh` | `agent_run([--resume SESSION_ID] [--worktree DIR] PROMPT)` — one-shot `claude -p` invocation with session persistence. Saves session ID to `SID_FILE`, reads it back on resume. `agent_recover_session()` — restore previous session ID from `SID_FILE` on startup. **Nudge guard**: skips nudge injection if the worktree is clean and no push is expected, preventing spurious re-invocations. Callers must define `SID_FILE`, `LOGFILE`, and `log()` before sourcing. | formula-driven agents (dev-agent, planner-run, predictor-run, gardener-run) |
|
||||
|
|
|
|||
|
|
@ -1,486 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# agent-session.sh — Shared tmux + Claude interactive session helpers
|
||||
#
|
||||
# Source this into agent orchestrator scripts for reusable session management.
|
||||
#
|
||||
# Functions:
|
||||
# agent_wait_for_claude_ready SESSION_NAME [TIMEOUT_SECS]
|
||||
# agent_inject_into_session SESSION_NAME TEXT
|
||||
# agent_kill_session SESSION_NAME
|
||||
# monitor_phase_loop PHASE_FILE IDLE_TIMEOUT_SECS CALLBACK_FN [SESSION_NAME]
|
||||
# session_lock_acquire [TIMEOUT_SECS]
|
||||
# session_lock_release
|
||||
|
||||
# --- Cooperative session lock (fd-based) ---
|
||||
# File descriptor for the session lock. Set by create_agent_session().
|
||||
# Callers can release/re-acquire via session_lock_release/session_lock_acquire
|
||||
# to allow other Claude sessions during idle phases (awaiting_review/awaiting_ci).
|
||||
SESSION_LOCK_FD=""
|
||||
|
||||
# Release the session lock without closing the file descriptor.
|
||||
# The fd stays open so it can be re-acquired later.
|
||||
session_lock_release() {
|
||||
if [ -n "${SESSION_LOCK_FD:-}" ]; then
|
||||
flock -u "$SESSION_LOCK_FD"
|
||||
fi
|
||||
}
|
||||
|
||||
# Re-acquire the session lock. Blocks until available or timeout.
|
||||
# Opens the lock fd if not already open (for use by external callers).
|
||||
# Args: [timeout_secs] (default 300)
|
||||
# Returns 0 on success, 1 on timeout/error.
|
||||
# shellcheck disable=SC2120 # timeout arg is used by external callers
|
||||
session_lock_acquire() {
|
||||
local timeout="${1:-300}"
|
||||
if [ -z "${SESSION_LOCK_FD:-}" ]; then
|
||||
local lock_dir="${HOME}/.claude"
|
||||
mkdir -p "$lock_dir"
|
||||
exec {SESSION_LOCK_FD}>>"${lock_dir}/session.lock"
|
||||
fi
|
||||
flock -w "$timeout" "$SESSION_LOCK_FD"
|
||||
}
|
||||
|
||||
# Wait for the Claude ❯ ready prompt in a tmux pane.
|
||||
# Returns 0 if ready within TIMEOUT_SECS (default 120), 1 otherwise.
|
||||
agent_wait_for_claude_ready() {
|
||||
local session="$1"
|
||||
local timeout="${2:-120}"
|
||||
local elapsed=0
|
||||
while [ "$elapsed" -lt "$timeout" ]; do
|
||||
if tmux capture-pane -t "$session" -p 2>/dev/null | grep -q '❯'; then
|
||||
return 0
|
||||
fi
|
||||
sleep 2
|
||||
elapsed=$((elapsed + 2))
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
# Paste TEXT into SESSION (waits for Claude to be ready first), then press Enter.
|
||||
agent_inject_into_session() {
|
||||
local session="$1"
|
||||
local text="$2"
|
||||
local tmpfile
|
||||
# Re-acquire session lock before injecting — Claude will resume working
|
||||
# shellcheck disable=SC2119 # using default timeout
|
||||
session_lock_acquire || true
|
||||
agent_wait_for_claude_ready "$session" 120 || true
|
||||
# Clear idle marker — new work incoming
|
||||
rm -f "/tmp/claude-idle-${session}.ts"
|
||||
tmpfile=$(mktemp /tmp/agent-inject-XXXXXX)
|
||||
printf '%s' "$text" > "$tmpfile"
|
||||
tmux load-buffer -b "agent-inject-$$" "$tmpfile"
|
||||
tmux paste-buffer -t "$session" -b "agent-inject-$$"
|
||||
sleep 0.5
|
||||
tmux send-keys -t "$session" "" Enter
|
||||
tmux delete-buffer -b "agent-inject-$$" 2>/dev/null || true
|
||||
rm -f "$tmpfile"
|
||||
}
|
||||
|
||||
# Create a tmux session running Claude in the given workdir.
|
||||
# Installs a Stop hook for idle detection (see monitor_phase_loop).
|
||||
# Installs a PreToolUse hook to guard destructive Bash operations.
|
||||
# Optionally installs a PostToolUse hook for phase file write detection.
|
||||
# Optionally installs a StopFailure hook for immediate phase file update on API error.
|
||||
# Args: session workdir [phase_file]
|
||||
# Returns 0 if session is ready, 1 otherwise.
|
||||
create_agent_session() {
|
||||
local session="$1"
|
||||
local workdir="${2:-.}"
|
||||
local phase_file="${3:-}"
|
||||
|
||||
# Prepare settings directory for hooks
|
||||
mkdir -p "${workdir}/.claude"
|
||||
local settings="${workdir}/.claude/settings.json"
|
||||
|
||||
# Install Stop hook for idle detection: when Claude finishes a response,
|
||||
# the hook writes a timestamp to a marker file. monitor_phase_loop checks
|
||||
# this marker instead of fragile tmux pane scraping.
|
||||
local idle_marker="/tmp/claude-idle-${session}.ts"
|
||||
local hook_script="${FACTORY_ROOT}/lib/hooks/on-idle-stop.sh"
|
||||
if [ -x "$hook_script" ]; then
|
||||
local hook_cmd="${hook_script} ${idle_marker}"
|
||||
# When a phase file is available, pass it and the session name so the
|
||||
# hook can nudge Claude if it returns to the prompt without signalling.
|
||||
if [ -n "$phase_file" ]; then
|
||||
hook_cmd="${hook_script} ${idle_marker} ${phase_file} ${session}"
|
||||
fi
|
||||
if [ -f "$settings" ]; then
|
||||
# Append our Stop hook to existing project settings
|
||||
jq --arg cmd "$hook_cmd" '
|
||||
if (.hooks.Stop // [] | any(.[]; .hooks[]?.command == $cmd))
|
||||
then .
|
||||
else .hooks.Stop = (.hooks.Stop // []) + [{
|
||||
matcher: "",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
end
|
||||
' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
|
||||
else
|
||||
jq -n --arg cmd "$hook_cmd" '{
|
||||
hooks: {
|
||||
Stop: [{
|
||||
matcher: "",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
}
|
||||
}' > "$settings"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Install PostToolUse hook for phase file write detection: when Claude
|
||||
# writes to the phase file via Bash or Write, the hook writes a marker
|
||||
# so monitor_phase_loop can react immediately instead of waiting for
|
||||
# the next mtime-based poll cycle.
|
||||
if [ -n "$phase_file" ]; then
|
||||
local phase_marker="/tmp/phase-changed-${session}.marker"
|
||||
local phase_hook_script="${FACTORY_ROOT}/lib/hooks/on-phase-change.sh"
|
||||
if [ -x "$phase_hook_script" ]; then
|
||||
local phase_hook_cmd="${phase_hook_script} ${phase_file} ${phase_marker}"
|
||||
if [ -f "$settings" ]; then
|
||||
jq --arg cmd "$phase_hook_cmd" '
|
||||
if (.hooks.PostToolUse // [] | any(.[]; .hooks[]?.command == $cmd))
|
||||
then .
|
||||
else .hooks.PostToolUse = (.hooks.PostToolUse // []) + [{
|
||||
matcher: "Bash|Write",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
end
|
||||
' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
|
||||
else
|
||||
jq -n --arg cmd "$phase_hook_cmd" '{
|
||||
hooks: {
|
||||
PostToolUse: [{
|
||||
matcher: "Bash|Write",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
}
|
||||
}' > "$settings"
|
||||
fi
|
||||
rm -f "$phase_marker"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Install StopFailure hook for immediate phase file update on API error:
|
||||
# when Claude hits a rate limit, server error, billing error, or auth failure,
|
||||
# the hook writes PHASE:failed to the phase file and touches the phase-changed
|
||||
# marker so monitor_phase_loop picks it up within one poll cycle instead of
|
||||
# waiting for idle timeout (up to 2 hours).
|
||||
if [ -n "$phase_file" ]; then
|
||||
local stop_failure_hook_script="${FACTORY_ROOT}/lib/hooks/on-stop-failure.sh"
|
||||
if [ -x "$stop_failure_hook_script" ]; then
|
||||
# phase_marker is defined in the PostToolUse block above; redeclare so
|
||||
# this block is self-contained if that block is ever removed.
|
||||
local sf_phase_marker="/tmp/phase-changed-${session}.marker"
|
||||
local stop_failure_hook_cmd="${stop_failure_hook_script} ${phase_file} ${sf_phase_marker}"
|
||||
if [ -f "$settings" ]; then
|
||||
jq --arg cmd "$stop_failure_hook_cmd" '
|
||||
if (.hooks.StopFailure // [] | any(.[]; .hooks[]?.command == $cmd))
|
||||
then .
|
||||
else .hooks.StopFailure = (.hooks.StopFailure // []) + [{
|
||||
matcher: "rate_limit|server_error|authentication_failed|billing_error",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
end
|
||||
' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
|
||||
else
|
||||
jq -n --arg cmd "$stop_failure_hook_cmd" '{
|
||||
hooks: {
|
||||
StopFailure: [{
|
||||
matcher: "rate_limit|server_error|authentication_failed|billing_error",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
}
|
||||
}' > "$settings"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
# Install PreToolUse hook for destructive operation guard: blocks force push
|
||||
# to primary branch, rm -rf outside worktree, direct API merge calls, and
|
||||
# checkout/switch to primary branch. Claude sees the denial reason on exit 2
|
||||
# and can self-correct.
|
||||
local guard_hook_script="${FACTORY_ROOT}/lib/hooks/on-pretooluse-guard.sh"
|
||||
if [ -x "$guard_hook_script" ]; then
|
||||
local abs_workdir
|
||||
abs_workdir=$(cd "$workdir" 2>/dev/null && pwd) || abs_workdir="$workdir"
|
||||
local guard_hook_cmd="${guard_hook_script} ${PRIMARY_BRANCH:-main} ${abs_workdir} ${session}"
|
||||
if [ -f "$settings" ]; then
|
||||
jq --arg cmd "$guard_hook_cmd" '
|
||||
if (.hooks.PreToolUse // [] | any(.[]; .hooks[]?.command == $cmd))
|
||||
then .
|
||||
else .hooks.PreToolUse = (.hooks.PreToolUse // []) + [{
|
||||
matcher: "Bash",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
end
|
||||
' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
|
||||
else
|
||||
jq -n --arg cmd "$guard_hook_cmd" '{
|
||||
hooks: {
|
||||
PreToolUse: [{
|
||||
matcher: "Bash",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
}
|
||||
}' > "$settings"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Install SessionEnd hook for guaranteed cleanup: when the Claude session
|
||||
# exits (clean or crash), write a termination marker so monitor_phase_loop
|
||||
# detects the exit faster than tmux has-session polling alone.
|
||||
local exit_marker="/tmp/claude-exited-${session}.ts"
|
||||
local session_end_hook_script="${FACTORY_ROOT}/lib/hooks/on-session-end.sh"
|
||||
if [ -x "$session_end_hook_script" ]; then
|
||||
local session_end_hook_cmd="${session_end_hook_script} ${exit_marker}"
|
||||
if [ -f "$settings" ]; then
|
||||
jq --arg cmd "$session_end_hook_cmd" '
|
||||
if (.hooks.SessionEnd // [] | any(.[]; .hooks[]?.command == $cmd))
|
||||
then .
|
||||
else .hooks.SessionEnd = (.hooks.SessionEnd // []) + [{
|
||||
matcher: "",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
end
|
||||
' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
|
||||
else
|
||||
jq -n --arg cmd "$session_end_hook_cmd" '{
|
||||
hooks: {
|
||||
SessionEnd: [{
|
||||
matcher: "",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
}
|
||||
}' > "$settings"
|
||||
fi
|
||||
fi
|
||||
rm -f "$exit_marker"
|
||||
|
||||
# Install SessionStart hook for context re-injection after compaction:
|
||||
# when Claude Code compacts context during long sessions, the phase protocol
|
||||
# instructions are lost. This hook fires after each compaction and outputs
|
||||
# the content of a context file so Claude retains critical instructions.
|
||||
# The context file is written by callers via write_compact_context().
|
||||
if [ -n "$phase_file" ]; then
|
||||
local compact_hook_script="${FACTORY_ROOT}/lib/hooks/on-compact-reinject.sh"
|
||||
if [ -x "$compact_hook_script" ]; then
|
||||
local context_file="${phase_file%.phase}.context"
|
||||
local compact_hook_cmd="${compact_hook_script} ${context_file}"
|
||||
if [ -f "$settings" ]; then
|
||||
jq --arg cmd "$compact_hook_cmd" '
|
||||
if (.hooks.SessionStart // [] | any(.[]; .hooks[]?.command == $cmd))
|
||||
then .
|
||||
else .hooks.SessionStart = (.hooks.SessionStart // []) + [{
|
||||
matcher: "compact",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
end
|
||||
' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
|
||||
else
|
||||
jq -n --arg cmd "$compact_hook_cmd" '{
|
||||
hooks: {
|
||||
SessionStart: [{
|
||||
matcher: "compact",
|
||||
hooks: [{type: "command", command: $cmd}]
|
||||
}]
|
||||
}
|
||||
}' > "$settings"
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
rm -f "$idle_marker"
|
||||
local model_flag=""
|
||||
if [ -n "${CLAUDE_MODEL:-}" ]; then
|
||||
model_flag="--model ${CLAUDE_MODEL}"
|
||||
fi
|
||||
|
||||
# Acquire a session-level mutex via fd-based flock to prevent concurrent
|
||||
# Claude sessions from racing on OAuth token refresh. Unlike the previous
|
||||
# command-wrapper flock, the fd approach allows callers to release the lock
|
||||
# during idle phases (awaiting_review/awaiting_ci) and re-acquire before
|
||||
# injecting the next prompt. See #724.
|
||||
# Use ~/.claude/session.lock so the lock is shared across containers when
|
||||
# the host ~/.claude directory is bind-mounted.
|
||||
local lock_dir="${HOME}/.claude"
|
||||
mkdir -p "$lock_dir"
|
||||
local claude_lock="${lock_dir}/session.lock"
|
||||
if [ -z "${SESSION_LOCK_FD:-}" ]; then
|
||||
exec {SESSION_LOCK_FD}>>"${claude_lock}"
|
||||
fi
|
||||
if ! flock -w 300 "$SESSION_LOCK_FD"; then
|
||||
return 1
|
||||
fi
|
||||
local claude_cmd="claude --dangerously-skip-permissions ${model_flag}"
|
||||
|
||||
tmux new-session -d -s "$session" -c "$workdir" \
|
||||
"$claude_cmd" 2>/dev/null
|
||||
sleep 1
|
||||
tmux has-session -t "$session" 2>/dev/null || return 1
|
||||
agent_wait_for_claude_ready "$session" 120 || return 1
|
||||
return 0
|
||||
}
|
||||
|
||||
# Inject a prompt/formula into a session (alias for agent_inject_into_session).
|
||||
inject_formula() {
|
||||
agent_inject_into_session "$@"
|
||||
}
|
||||
|
||||
# Monitor a phase file, calling a callback on changes and handling idle timeout.
|
||||
# Sets _MONITOR_LOOP_EXIT to the exit reason (idle_timeout, idle_prompt, done, crashed, PHASE:failed, PHASE:escalate).
|
||||
# Sets _MONITOR_SESSION to the resolved session name (arg 4 or $SESSION_NAME).
|
||||
# Callbacks should reference _MONITOR_SESSION instead of $SESSION_NAME directly.
|
||||
# Args: phase_file idle_timeout_secs callback_fn [session_name]
|
||||
# session_name — tmux session to health-check; falls back to $SESSION_NAME global
|
||||
#
|
||||
# Idle detection: uses a Stop hook marker file (written by lib/hooks/on-idle-stop.sh)
|
||||
# to detect when Claude finishes responding without writing a phase signal.
|
||||
# If the marker exists for 3 consecutive polls with no phase written, the session
|
||||
# is killed and the callback invoked with "PHASE:failed".
|
||||
monitor_phase_loop() {
|
||||
local phase_file="$1"
|
||||
local idle_timeout="$2"
|
||||
local callback="$3"
|
||||
local _session="${4:-${SESSION_NAME:-}}"
|
||||
# Export resolved session name so callbacks can reference it regardless of
|
||||
# which session was passed to monitor_phase_loop (analogous to _MONITOR_LOOP_EXIT).
|
||||
export _MONITOR_SESSION="$_session"
|
||||
local poll_interval="${PHASE_POLL_INTERVAL:-10}"
|
||||
local last_mtime=0
|
||||
local idle_elapsed=0
|
||||
local idle_pane_count=0
|
||||
|
||||
while true; do
|
||||
sleep "$poll_interval"
|
||||
idle_elapsed=$(( idle_elapsed + poll_interval ))
|
||||
|
||||
# Session health check: SessionEnd hook marker provides fast detection,
|
||||
# tmux has-session is the fallback for unclean exits (e.g. tmux crash).
|
||||
local exit_marker="/tmp/claude-exited-${_session}.ts"
|
||||
if [ -f "$exit_marker" ] || ! tmux has-session -t "${_session}" 2>/dev/null; then
|
||||
local current_phase
|
||||
current_phase=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]' || true)
|
||||
case "$current_phase" in
|
||||
PHASE:done|PHASE:failed|PHASE:merged|PHASE:escalate)
|
||||
;; # terminal — fall through to phase handler
|
||||
*)
|
||||
# Call callback with "crashed" — let agent-specific code handle recovery
|
||||
if type "${callback}" &>/dev/null; then
|
||||
"$callback" "PHASE:crashed"
|
||||
fi
|
||||
# If callback didn't restart session, break
|
||||
if ! tmux has-session -t "${_session}" 2>/dev/null; then
|
||||
_MONITOR_LOOP_EXIT="crashed"
|
||||
return 1
|
||||
fi
|
||||
idle_elapsed=0
|
||||
idle_pane_count=0
|
||||
continue
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Check phase-changed marker from PostToolUse hook — if present, the hook
|
||||
# detected a phase file write so we reset last_mtime to force processing
|
||||
# this cycle instead of waiting for the next mtime change.
|
||||
local phase_marker="/tmp/phase-changed-${_session}.marker"
|
||||
if [ -f "$phase_marker" ]; then
|
||||
rm -f "$phase_marker"
|
||||
last_mtime=0
|
||||
fi
|
||||
|
||||
# Check phase file for changes
|
||||
local phase_mtime
|
||||
phase_mtime=$(stat -c %Y "$phase_file" 2>/dev/null || echo 0)
|
||||
local current_phase
|
||||
current_phase=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]' || true)
|
||||
|
||||
if [ -z "$current_phase" ] || [ "$phase_mtime" -le "$last_mtime" ]; then
|
||||
# No phase change — check idle timeout
|
||||
if [ "$idle_elapsed" -ge "$idle_timeout" ]; then
|
||||
_MONITOR_LOOP_EXIT="idle_timeout"
|
||||
agent_kill_session "${_session}"
|
||||
return 0
|
||||
fi
|
||||
# Idle detection via Stop hook: the on-idle-stop.sh hook writes a marker
|
||||
# file when Claude finishes a response. If the marker exists and no phase
|
||||
# has been written, Claude returned to the prompt without following the
|
||||
# phase protocol. 3 consecutive polls = confirmed idle (not mid-turn).
|
||||
local idle_marker="/tmp/claude-idle-${_session}.ts"
|
||||
if [ -z "$current_phase" ] && [ -f "$idle_marker" ]; then
|
||||
idle_pane_count=$(( idle_pane_count + 1 ))
|
||||
if [ "$idle_pane_count" -ge 3 ]; then
|
||||
_MONITOR_LOOP_EXIT="idle_prompt"
|
||||
# Session is killed before the callback is invoked.
|
||||
# Callbacks that handle PHASE:failed must not assume the session is alive.
|
||||
agent_kill_session "${_session}"
|
||||
if type "${callback}" &>/dev/null; then
|
||||
"$callback" "PHASE:failed"
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
else
|
||||
idle_pane_count=0
|
||||
fi
|
||||
continue
|
||||
fi
|
||||
|
||||
# Phase changed
|
||||
last_mtime="$phase_mtime"
|
||||
# shellcheck disable=SC2034 # read by phase-handler.sh callback
|
||||
LAST_PHASE_MTIME="$phase_mtime"
|
||||
idle_elapsed=0
|
||||
idle_pane_count=0
|
||||
|
||||
# Terminal phases
|
||||
case "$current_phase" in
|
||||
PHASE:done|PHASE:merged)
|
||||
_MONITOR_LOOP_EXIT="done"
|
||||
if type "${callback}" &>/dev/null; then
|
||||
"$callback" "$current_phase"
|
||||
fi
|
||||
return 0
|
||||
;;
|
||||
PHASE:failed|PHASE:escalate)
|
||||
_MONITOR_LOOP_EXIT="$current_phase"
|
||||
if type "${callback}" &>/dev/null; then
|
||||
"$callback" "$current_phase"
|
||||
fi
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
# Non-terminal phase — call callback
|
||||
if type "${callback}" &>/dev/null; then
|
||||
"$callback" "$current_phase"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# Write context to a file for re-injection after context compaction.
|
||||
# The SessionStart compact hook reads this file and outputs it to stdout.
|
||||
# Args: phase_file content
|
||||
write_compact_context() {
|
||||
local phase_file="$1"
|
||||
local content="$2"
|
||||
local context_file="${phase_file%.phase}.context"
|
||||
printf '%s\n' "$content" > "$context_file"
|
||||
}
|
||||
|
||||
# Kill a tmux session gracefully (no-op if not found).
|
||||
agent_kill_session() {
|
||||
local session="${1:-}"
|
||||
[ -n "$session" ] && tmux kill-session -t "$session" 2>/dev/null || true
|
||||
rm -f "/tmp/claude-idle-${session}.ts"
|
||||
rm -f "/tmp/phase-changed-${session}.marker"
|
||||
rm -f "/tmp/claude-exited-${session}.ts"
|
||||
rm -f "/tmp/claude-nudge-${session}.count"
|
||||
}
|
||||
|
||||
# Read the current phase from a phase file, stripped of whitespace.
|
||||
# Usage: read_phase [file] — defaults to $PHASE_FILE
|
||||
read_phase() {
|
||||
local file="${1:-${PHASE_FILE:-}}"
|
||||
{ cat "$file" 2>/dev/null || true; } | head -1 | tr -d '[:space:]'
|
||||
}
|
||||
|
|
@ -1,24 +1,35 @@
|
|||
#!/usr/bin/env bash
|
||||
# formula-session.sh — Shared helpers for formula-driven cron agents
|
||||
#
|
||||
# Provides reusable functions for the common cron-wrapper + tmux-session
|
||||
# pattern used by planner-run.sh, predictor-run.sh, gardener-run.sh, and supervisor-run.sh.
|
||||
# Provides reusable utility functions for the common cron-wrapper pattern
|
||||
# used by planner-run.sh, predictor-run.sh, gardener-run.sh, and supervisor-run.sh.
|
||||
#
|
||||
# Functions:
|
||||
# acquire_cron_lock LOCK_FILE — PID lock with stale cleanup
|
||||
# check_memory [MIN_MB] — skip if available RAM too low
|
||||
# load_formula FORMULA_FILE — sets FORMULA_CONTENT
|
||||
# build_context_block FILE [FILE ...] — sets CONTEXT_BLOCK
|
||||
# start_formula_session SESSION WORKDIR PHASE_FILE — create tmux + claude
|
||||
# build_prompt_footer [EXTRA_API] — sets PROMPT_FOOTER (API ref + env + phase)
|
||||
# run_formula_and_monitor AGENT [TIMEOUT] [CALLBACK] — session start, inject, monitor, log
|
||||
# formula_phase_callback PHASE — standard crash-recovery callback
|
||||
# build_prompt_footer [EXTRA_API_LINES] — sets PROMPT_FOOTER (API ref + env)
|
||||
# build_sdk_prompt_footer [EXTRA_API] — omits phase protocol (SDK mode)
|
||||
# formula_worktree_setup WORKTREE — isolated worktree for formula execution
|
||||
# formula_prepare_profile_context — load lessons from .profile repo (pre-session)
|
||||
# formula_lessons_block — return lessons block for prompt
|
||||
# profile_write_journal ISSUE_NUM TITLE OUTCOME [FILES] — post-session journal
|
||||
# profile_load_lessons — load lessons-learned.md into LESSONS_CONTEXT
|
||||
# ensure_profile_repo [AGENT_IDENTITY] — clone/pull .profile repo
|
||||
# _profile_has_repo — check if agent has .profile repo
|
||||
# _count_undigested_journals — count journal entries to digest
|
||||
# _profile_digest_journals — digest journals into lessons
|
||||
# _profile_commit_and_push MESSAGE [FILES] — commit/push to .profile repo
|
||||
# resolve_agent_identity — resolve agent user login from FORGE_TOKEN
|
||||
# build_graph_section — run build-graph.py and set GRAPH_SECTION
|
||||
# build_scratch_instruction SCRATCH_FILE — return context scratch instruction
|
||||
# read_scratch_context SCRATCH_FILE — return scratch file content block
|
||||
# ensure_ops_repo — clone/pull ops repo
|
||||
# ops_commit_and_push MESSAGE [FILES] — commit/push to ops repo
|
||||
# cleanup_stale_crashed_worktrees [HOURS] — thin wrapper around worktree_cleanup_stale
|
||||
#
|
||||
# Requires: lib/agent-session.sh sourced first (for create_agent_session,
|
||||
# agent_kill_session, agent_inject_into_session).
|
||||
# Globals used by formula_phase_callback: SESSION_NAME, PHASE_FILE,
|
||||
# PROJECT_REPO_ROOT, PROMPT (set by the calling script).
|
||||
# Requires: lib/env.sh, lib/worktree.sh sourced first for shared helpers.
|
||||
|
||||
# ── Cron guards ──────────────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -562,7 +573,7 @@ $(cat "$ctx_path")
|
|||
done
|
||||
}
|
||||
|
||||
# ── Ops repo helpers ─────────────────────────────────────────────────
|
||||
# ── Ops repo helpers ────────────────────────────────────────────────────
|
||||
|
||||
# ensure_ops_repo
|
||||
# Clones or pulls the ops repo so agents can read/write operational data.
|
||||
|
|
@ -625,90 +636,6 @@ ops_commit_and_push() {
|
|||
)
|
||||
}
|
||||
|
||||
# ── Session management ───────────────────────────────────────────────────
|
||||
|
||||
# start_formula_session SESSION WORKDIR PHASE_FILE
|
||||
# Kills stale session, resets phase file, creates a per-agent git worktree
|
||||
# for session isolation, and creates a new tmux + claude session in it.
|
||||
# Sets _FORMULA_SESSION_WORKDIR to the worktree path (or original workdir
|
||||
# on fallback). Callers must clean up via remove_formula_worktree after
|
||||
# the session ends.
|
||||
# Returns 0 on success, 1 on failure.
|
||||
start_formula_session() {
|
||||
local session="$1" workdir="$2" phase_file="$3"
|
||||
agent_kill_session "$session"
|
||||
rm -f "$phase_file"
|
||||
|
||||
# Create per-agent git worktree for session isolation.
|
||||
# Each agent gets its own CWD so Claude Code treats them as separate
|
||||
# projects — no resume collisions between sequential formula runs.
|
||||
_FORMULA_SESSION_WORKDIR="/tmp/disinto-${session}"
|
||||
# Clean up any stale worktree from a previous run
|
||||
git -C "$workdir" worktree remove "$_FORMULA_SESSION_WORKDIR" --force 2>/dev/null || true
|
||||
if git -C "$workdir" worktree add "$_FORMULA_SESSION_WORKDIR" HEAD --detach 2>/dev/null; then
|
||||
log "Created worktree: ${_FORMULA_SESSION_WORKDIR}"
|
||||
else
|
||||
log "WARNING: worktree creation failed — falling back to ${workdir}"
|
||||
_FORMULA_SESSION_WORKDIR="$workdir"
|
||||
fi
|
||||
|
||||
log "Creating tmux session: ${session}"
|
||||
if ! create_agent_session "$session" "$_FORMULA_SESSION_WORKDIR" "$phase_file"; then
|
||||
log "ERROR: failed to create tmux session ${session}"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
# remove_formula_worktree
|
||||
# Removes the worktree created by start_formula_session if it differs from
|
||||
# PROJECT_REPO_ROOT. Safe to call multiple times. No-op if no worktree was created.
|
||||
remove_formula_worktree() {
|
||||
if [ -n "${_FORMULA_SESSION_WORKDIR:-}" ] \
|
||||
&& [ "$_FORMULA_SESSION_WORKDIR" != "${PROJECT_REPO_ROOT:-}" ]; then
|
||||
git -C "$PROJECT_REPO_ROOT" worktree remove "$_FORMULA_SESSION_WORKDIR" --force 2>/dev/null || true
|
||||
log "Removed worktree: ${_FORMULA_SESSION_WORKDIR}"
|
||||
fi
|
||||
}
|
||||
|
||||
# formula_phase_callback PHASE
|
||||
# Standard crash-recovery phase callback for formula sessions.
|
||||
# Requires globals: SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT.
|
||||
# Uses _FORMULA_CRASH_COUNT (auto-initialized) for single-retry limit.
|
||||
# shellcheck disable=SC2154 # SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT set by caller
|
||||
formula_phase_callback() {
|
||||
local phase="$1"
|
||||
log "phase: ${phase}"
|
||||
case "$phase" in
|
||||
PHASE:crashed)
|
||||
if [ "${_FORMULA_CRASH_COUNT:-0}" -gt 0 ]; then
|
||||
log "ERROR: session crashed again after recovery — giving up"
|
||||
return 0
|
||||
fi
|
||||
_FORMULA_CRASH_COUNT=$(( ${_FORMULA_CRASH_COUNT:-0} + 1 ))
|
||||
log "WARNING: tmux session died unexpectedly — attempting recovery"
|
||||
if create_agent_session "${_MONITOR_SESSION:-$SESSION_NAME}" "${_FORMULA_SESSION_WORKDIR:-$PROJECT_REPO_ROOT}" "$PHASE_FILE" 2>/dev/null; then
|
||||
agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" "$PROMPT"
|
||||
log "Recovery session started"
|
||||
else
|
||||
log "ERROR: could not restart session after crash"
|
||||
fi
|
||||
;;
|
||||
PHASE:done|PHASE:failed|PHASE:escalate|PHASE:merged)
|
||||
agent_kill_session "${_MONITOR_SESSION:-$SESSION_NAME}"
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
# ── Stale crashed worktree cleanup ─────────────────────────────────────────
|
||||
|
||||
# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
|
||||
# Thin wrapper around worktree_cleanup_stale() from lib/worktree.sh.
|
||||
# Kept for backwards compatibility with existing callers.
|
||||
# Requires: lib/worktree.sh sourced.
|
||||
cleanup_stale_crashed_worktrees() {
|
||||
worktree_cleanup_stale "${1:-24}"
|
||||
}
|
||||
|
||||
# ── Scratch file helpers (compaction survival) ────────────────────────────
|
||||
|
||||
# build_scratch_instruction SCRATCH_FILE
|
||||
|
|
@ -795,14 +722,14 @@ formula_worktree_setup() {
|
|||
trap "worktree_cleanup '$worktree'" EXIT
|
||||
}
|
||||
|
||||
# ── Prompt + monitor helpers ──────────────────────────────────────────────
|
||||
# ── Prompt helpers ──────────────────────────────────────────────────────
|
||||
|
||||
# build_prompt_footer [EXTRA_API_LINES]
|
||||
# Assembles the common forge API reference + environment + phase protocol
|
||||
# block for formula prompts. Sets PROMPT_FOOTER.
|
||||
# Assembles the common forge API reference + environment block for formula prompts.
|
||||
# Sets PROMPT_FOOTER.
|
||||
# Pass additional API endpoint lines (pre-formatted, newline-prefixed) via $1.
|
||||
# Requires globals: FORGE_API, FACTORY_ROOT, PROJECT_REPO_ROOT,
|
||||
# PRIMARY_BRANCH, PHASE_FILE.
|
||||
# PRIMARY_BRANCH.
|
||||
build_prompt_footer() {
|
||||
local extra_api="${1:-}"
|
||||
# shellcheck disable=SC2034 # consumed by the calling script's PROMPT
|
||||
|
|
@ -818,66 +745,15 @@ NEVER echo or include the actual token value in output — always reference \${F
|
|||
FACTORY_ROOT=${FACTORY_ROOT}
|
||||
PROJECT_REPO_ROOT=${PROJECT_REPO_ROOT}
|
||||
OPS_REPO_ROOT=${OPS_REPO_ROOT}
|
||||
PRIMARY_BRANCH=${PRIMARY_BRANCH}
|
||||
PHASE_FILE=${PHASE_FILE}
|
||||
|
||||
## Phase protocol (REQUIRED)
|
||||
When all work is done:
|
||||
echo 'PHASE:done' > '${PHASE_FILE}'
|
||||
On unrecoverable error:
|
||||
printf 'PHASE:failed\nReason: %s\n' 'describe error' > '${PHASE_FILE}'"
|
||||
PRIMARY_BRANCH=${PRIMARY_BRANCH}"
|
||||
}
|
||||
|
||||
# run_formula_and_monitor AGENT_NAME [TIMEOUT]
|
||||
# Starts the formula session, injects PROMPT, monitors phase, and logs result.
|
||||
# Requires globals: SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT,
|
||||
# FORGE_REPO, CLAUDE_MODEL (exported).
|
||||
# shellcheck disable=SC2154 # SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT set by caller
|
||||
run_formula_and_monitor() {
|
||||
local agent_name="$1"
|
||||
local timeout="${2:-7200}"
|
||||
local callback="${3:-formula_phase_callback}"
|
||||
# ── Stale crashed worktree cleanup ────────────────────────────────────────
|
||||
|
||||
if ! start_formula_session "$SESSION_NAME" "$PROJECT_REPO_ROOT" "$PHASE_FILE"; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Write phase protocol to context file for compaction survival
|
||||
if [ -n "${PROMPT_FOOTER:-}" ]; then
|
||||
write_compact_context "$PHASE_FILE" "$PROMPT_FOOTER"
|
||||
fi
|
||||
|
||||
agent_inject_into_session "$SESSION_NAME" "$PROMPT"
|
||||
log "Prompt sent to tmux session"
|
||||
|
||||
log "Monitoring phase file: ${PHASE_FILE}"
|
||||
_FORMULA_CRASH_COUNT=0
|
||||
|
||||
monitor_phase_loop "$PHASE_FILE" "$timeout" "$callback"
|
||||
|
||||
FINAL_PHASE=$(read_phase "$PHASE_FILE")
|
||||
log "Final phase: ${FINAL_PHASE:-none}"
|
||||
|
||||
if [ "$FINAL_PHASE" != "PHASE:done" ]; then
|
||||
case "${_MONITOR_LOOP_EXIT:-}" in
|
||||
idle_prompt)
|
||||
log "${agent_name}: Claude returned to prompt without writing phase signal"
|
||||
;;
|
||||
idle_timeout)
|
||||
log "${agent_name}: timed out with no phase signal"
|
||||
;;
|
||||
*)
|
||||
log "${agent_name} finished without PHASE:done (phase: ${FINAL_PHASE:-none}, exit: ${_MONITOR_LOOP_EXIT:-})"
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Preserve worktree on crash for debugging; clean up on success
|
||||
if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then
|
||||
worktree_preserve "${_FORMULA_SESSION_WORKDIR:-}" "crashed (agent=${agent_name})"
|
||||
else
|
||||
remove_formula_worktree
|
||||
fi
|
||||
|
||||
log "--- ${agent_name^} run done ---"
|
||||
# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
|
||||
# Thin wrapper around worktree_cleanup_stale() from lib/worktree.sh.
|
||||
# Kept for backwards compatibility with existing callers.
|
||||
# Requires: lib/worktree.sh sourced.
|
||||
cleanup_stale_crashed_worktrees() {
|
||||
worktree_cleanup_stale "${1:-24}"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -524,8 +524,7 @@ Commit, rebase on ${PRIMARY_BRANCH}, and push:
|
|||
# build_phase_protocol_prompt — Generate push/commit instructions for Claude.
|
||||
#
|
||||
# For the synchronous agent_run architecture: tells Claude how to commit and
|
||||
# push (no phase files). For the tmux session architecture, use the
|
||||
# build_phase_protocol_prompt in dev/phase-handler.sh instead.
|
||||
# push (no phase files).
|
||||
#
|
||||
# Args: branch [remote]
|
||||
# Stdout: instruction text
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue