fix: Migrate action-agent.sh to SDK + shared libraries (#5)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude 2026-03-28 11:07:25 +00:00
parent 02dd03eaaf
commit 83ab2930e6

View file

@ -1,73 +1,72 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# action-agent.sh — Autonomous action agent: tmux + Claude + action formula # =============================================================================
# action-agent.sh — Synchronous action agent: SDK + shared libraries
#
# Synchronous bash loop using claude -p (one-shot invocation).
# No tmux sessions, no phase files — the bash script IS the state machine.
# #
# Usage: ./action-agent.sh <issue-number> [project.toml] # Usage: ./action-agent.sh <issue-number> [project.toml]
# #
# Lifecycle: # Flow:
# 1. Fetch issue body (action formula) + existing comments # 1. Preflight: issue_check_deps(), memory guard, concurrency lock
# 2. Create isolated git worktree: /tmp/action-{issue}-{timestamp} # 2. Parse model from YAML front matter in issue body (custom model selection)
# 3. Create tmux session: action-{project}-{issue_num} with interactive claude in worktree # 3. Worktree: worktree_create() for action isolation
# 4. Inject initial prompt: formula + comments + phase protocol instructions # 4. Load formula from issue body
# 5. Monitor phase file via monitor_phase_loop (shared with dev-agent) # 5. Build prompt: formula + prior non-bot comments (resume context)
# Path A (git output): Claude pushes → handler creates PR → CI poll → review # 6. agent_run(worktree, prompt) → Claude executes action, may push
# injection → merge → cleanup (same loop as dev-agent via phase-handler.sh) # 7. If pushed: pr_walk_to_merge() from lib/pr-lifecycle.sh
# Path B (no git output): Claude posts results → PHASE:done → cleanup # 8. Cleanup: worktree_cleanup(), issue_close()
# 6. For human input: Claude writes PHASE:escalate; human responds via vault/forge
# 7. Cleanup on terminal phase: kill children, destroy worktree, remove temp files
# #
# Key principle: The runtime creates and destroys. The formula preserves. # Action-specific (stays in runner):
# The formula must push results before signaling done — the worktree is nuked after. # - YAML front matter parsing (model selection)
# - Bot username filtering for prior comments
# - Lifetime watchdog (MAX_LIFETIME=8h wall-clock cap)
# - Child process cleanup (docker compose, background jobs)
# #
# Session: action-{project}-{issue_num} (tmux) # From shared libraries:
# Log: action/action-poll-{project}.log # - Issue lifecycle: lib/issue-lifecycle.sh
# - Worktree: lib/worktree.sh
# - PR lifecycle: lib/pr-lifecycle.sh
# - Agent SDK: lib/agent-sdk.sh
#
# Log: action/action-poll-{project}.log
# =============================================================================
set -euo pipefail set -euo pipefail
ISSUE="${1:?Usage: action-agent.sh <issue-number> [project.toml]}" ISSUE="${1:?Usage: action-agent.sh <issue-number> [project.toml]}"
export PROJECT_TOML="${2:-${PROJECT_TOML:-}}" export PROJECT_TOML="${2:-${PROJECT_TOML:-}}"
source "$(dirname "$0")/../lib/env.sh" SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
# shellcheck source=../lib/env.sh
source "$FACTORY_ROOT/lib/env.sh"
# Use action-bot's own Forgejo identity (#747) # Use action-bot's own Forgejo identity (#747)
FORGE_TOKEN="${FORGE_ACTION_TOKEN:-${FORGE_TOKEN}}" FORGE_TOKEN="${FORGE_ACTION_TOKEN:-${FORGE_TOKEN}}"
source "$(dirname "$0")/../lib/ci-helpers.sh" # shellcheck source=../lib/ci-helpers.sh
source "$(dirname "$0")/../lib/agent-session.sh" source "$FACTORY_ROOT/lib/ci-helpers.sh"
source "$(dirname "$0")/../lib/formula-session.sh" # shellcheck source=../lib/worktree.sh
source "$(dirname "$0")/../lib/worktree.sh" source "$FACTORY_ROOT/lib/worktree.sh"
# shellcheck source=../dev/phase-handler.sh # shellcheck source=../lib/issue-lifecycle.sh
source "$(dirname "$0")/../dev/phase-handler.sh" source "$FACTORY_ROOT/lib/issue-lifecycle.sh"
SESSION_NAME="action-${PROJECT_NAME}-${ISSUE}" # shellcheck source=../lib/agent-sdk.sh
source "$FACTORY_ROOT/lib/agent-sdk.sh"
# shellcheck source=../lib/pr-lifecycle.sh
source "$FACTORY_ROOT/lib/pr-lifecycle.sh"
BRANCH="action/issue-${ISSUE}"
WORKTREE="/tmp/action-${ISSUE}-$(date +%s)"
LOCKFILE="/tmp/action-agent-${ISSUE}.lock" LOCKFILE="/tmp/action-agent-${ISSUE}.lock"
LOGFILE="${DISINTO_LOG_DIR}/action/action-poll-${PROJECT_NAME:-default}.log" LOGFILE="${DISINTO_LOG_DIR}/action/action-poll-${PROJECT_NAME:-default}.log"
IDLE_TIMEOUT="${ACTION_IDLE_TIMEOUT:-14400}" # 4h default # shellcheck disable=SC2034 # consumed by agent-sdk.sh
MAX_LIFETIME="${ACTION_MAX_LIFETIME:-28800}" # 8h default wall-clock cap SID_FILE="/tmp/action-session-${PROJECT_NAME:-default}-${ISSUE}.sid"
MAX_LIFETIME="${ACTION_MAX_LIFETIME:-28800}" # 8h default wall-clock cap
SESSION_START_EPOCH=$(date +%s) SESSION_START_EPOCH=$(date +%s)
# --- Phase handler globals (agent-specific; defaults in phase-handler.sh) ---
# shellcheck disable=SC2034 # used by phase-handler.sh
API="${FORGE_API}"
BRANCH="action/issue-${ISSUE}"
# shellcheck disable=SC2034 # used by phase-handler.sh
WORKTREE="/tmp/action-${ISSUE}-$(date +%s)"
PHASE_FILE="/tmp/action-session-${PROJECT_NAME:-default}-${ISSUE}.phase"
IMPL_SUMMARY_FILE="/tmp/action-impl-summary-${PROJECT_NAME:-default}-${ISSUE}.txt"
PREFLIGHT_RESULT="/tmp/action-preflight-${ISSUE}.json"
SCRATCH_FILE="/tmp/action-${ISSUE}-scratch.md"
log() { log() {
printf '[%s] action#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$ISSUE" "$*" >> "$LOGFILE" printf '[%s] action#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$ISSUE" "$*" >> "$LOGFILE"
} }
status() {
log "$*"
}
# --- Action-specific helpers for phase-handler.sh ---
cleanup_worktree() {
worktree_cleanup "$WORKTREE"
log "destroyed worktree: ${WORKTREE}"
}
cleanup_labels() { :; } # action agent doesn't use in-progress labels
# --- Concurrency lock (per issue) --- # --- Concurrency lock (per issue) ---
if [ -f "$LOCKFILE" ]; then if [ -f "$LOCKFILE" ]; then
LOCK_PID=$(cat "$LOCKFILE" 2>/dev/null || echo "") LOCK_PID=$(cat "$LOCKFILE" 2>/dev/null || echo "")
@ -87,7 +86,6 @@ cleanup() {
wait "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true wait "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
fi fi
rm -f "$LOCKFILE" rm -f "$LOCKFILE"
agent_kill_session "$SESSION_NAME"
# Kill any remaining child processes spawned during the run # Kill any remaining child processes spawned during the run
local children local children
children=$(jobs -p 2>/dev/null) || true children=$(jobs -p 2>/dev/null) || true
@ -100,23 +98,17 @@ cleanup() {
# Best-effort docker cleanup for containers started during this action # Best-effort docker cleanup for containers started during this action
(cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true (cd "${WORKTREE}" 2>/dev/null && docker compose down 2>/dev/null) || true
# Preserve worktree on crash for debugging; clean up on success # Preserve worktree on crash for debugging; clean up on success
local final_phase="" if [ "$exit_code" -ne 0 ]; then
[ -f "$PHASE_FILE" ] && final_phase=$(head -1 "$PHASE_FILE" 2>/dev/null || true) worktree_preserve "$WORKTREE" "crashed (exit=$exit_code)"
if [ "${final_phase:-}" = "PHASE:crashed" ] || [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ] || [ "$exit_code" -ne 0 ]; then
worktree_preserve "$WORKTREE" "crashed (exit=$exit_code, phase=${final_phase:-unknown})"
else else
cleanup_worktree worktree_cleanup "$WORKTREE"
fi fi
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$PREFLIGHT_RESULT" rm -f "$SID_FILE"
} }
trap cleanup EXIT trap cleanup EXIT
# --- Memory guard --- # --- Memory guard ---
AVAIL_MB=$(awk '/MemAvailable/ {printf "%d", $2/1024}' /proc/meminfo) memory_guard 2000
if [ "$AVAIL_MB" -lt 2000 ]; then
log "SKIP: only ${AVAIL_MB}MB available (need 2000MB)"
exit 0
fi
# --- Fetch issue --- # --- Fetch issue ---
log "fetching issue #${ISSUE}" log "fetching issue #${ISSUE}"
@ -139,25 +131,10 @@ fi
log "Issue: ${ISSUE_TITLE}" log "Issue: ${ISSUE_TITLE}"
# --- Dependency check (skip before spawning Claude) --- # --- Dependency check (shared library) ---
DEPS=$(printf '%s' "$ISSUE_BODY" | bash "${FACTORY_ROOT}/lib/parse-deps.sh") if ! issue_check_deps "$ISSUE"; then
if [ -n "$DEPS" ]; then log "SKIP: issue #${ISSUE} blocked by: ${_ISSUE_BLOCKED_BY[*]}"
ALL_MET=true exit 0
while IFS= read -r dep; do
[ -z "$dep" ] && continue
DEP_STATE=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
"${FORGE_API}/issues/${dep}" | jq -r '.state // "open"') || DEP_STATE="open"
if [ "$DEP_STATE" != "closed" ]; then
log "SKIP: dependency #${dep} still open — not spawning session"
ALL_MET=false
break
fi
done <<< "$DEPS"
if [ "$ALL_MET" = false ]; then
rm -f "$LOCKFILE"
exit 0
fi
log "all dependencies met"
fi fi
# --- Extract model from YAML front matter (if present) --- # --- Extract model from YAML front matter (if present) ---
@ -191,28 +168,23 @@ if [ -n "$COMMENTS_JSON" ] && [ "$COMMENTS_JSON" != "null" ] && [ "$COMMENTS_JSO
"[\(.user.login) at \(.created_at[:19])]\n\(.body)\n---"' 2>/dev/null || true) "[\(.user.login) at \(.created_at[:19])]\n\(.body)\n---"' 2>/dev/null || true)
fi fi
# --- Create isolated worktree --- # --- Determine git remote ---
log "creating worktree: ${WORKTREE}"
cd "${PROJECT_REPO_ROOT}" cd "${PROJECT_REPO_ROOT}"
# Determine which git remote corresponds to FORGE_URL
_forge_host=$(echo "$FORGE_URL" | sed 's|https\?://||; s|/.*||') _forge_host=$(echo "$FORGE_URL" | sed 's|https\?://||; s|/.*||')
FORGE_REMOTE=$(git remote -v | awk -v host="$_forge_host" '$2 ~ host && /\(push\)/ {print $1; exit}') FORGE_REMOTE=$(git remote -v | awk -v host="$_forge_host" '$2 ~ host && /\(push\)/ {print $1; exit}')
FORGE_REMOTE="${FORGE_REMOTE:-origin}" FORGE_REMOTE="${FORGE_REMOTE:-origin}"
export FORGE_REMOTE export FORGE_REMOTE
# --- Create isolated worktree ---
log "creating worktree: ${WORKTREE}"
git fetch "${FORGE_REMOTE}" "${PRIMARY_BRANCH}" 2>/dev/null || true git fetch "${FORGE_REMOTE}" "${PRIMARY_BRANCH}" 2>/dev/null || true
if ! git worktree add "$WORKTREE" "${FORGE_REMOTE}/${PRIMARY_BRANCH}" 2>&1; then if ! worktree_create "$WORKTREE" "$BRANCH"; then
log "ERROR: worktree creation failed" log "ERROR: worktree creation failed"
exit 1 exit 1
fi fi
log "worktree ready: ${WORKTREE}" log "worktree ready: ${WORKTREE}"
# --- Read scratch file (compaction survival) --- # --- Build prompt ---
SCRATCH_CONTEXT=$(read_scratch_context "$SCRATCH_FILE")
SCRATCH_INSTRUCTION=$(build_scratch_instruction "$SCRATCH_FILE")
# --- Build initial prompt ---
PRIOR_SECTION="" PRIOR_SECTION=""
if [ -n "$PRIOR_COMMENTS" ]; then if [ -n "$PRIOR_COMMENTS" ]; then
PRIOR_SECTION="## Prior comments (resume context) PRIOR_SECTION="## Prior comments (resume context)
@ -222,19 +194,15 @@ ${PRIOR_COMMENTS}
" "
fi fi
# Build phase protocol from shared function (Path B covered in Instructions section above) GIT_INSTRUCTIONS=$(build_phase_protocol_prompt "$BRANCH" "$FORGE_REMOTE")
PHASE_PROTOCOL_INSTRUCTIONS="$(build_phase_protocol_prompt "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "$BRANCH")"
# Write phase protocol to context file for compaction survival PROMPT="You are an action agent. Your job is to execute the action formula
write_compact_context "$PHASE_FILE" "$PHASE_PROTOCOL_INSTRUCTIONS"
INITIAL_PROMPT="You are an action agent. Your job is to execute the action formula
in the issue below. in the issue below.
## Issue #${ISSUE}: ${ISSUE_TITLE} ## Issue #${ISSUE}: ${ISSUE_TITLE}
${ISSUE_BODY} ${ISSUE_BODY}
${SCRATCH_CONTEXT}
${PRIOR_SECTION}## Instructions ${PRIOR_SECTION}## Instructions
1. Read the action formula steps in the issue body carefully. 1. Read the action formula steps in the issue body carefully.
@ -248,29 +216,20 @@ ${PRIOR_SECTION}## Instructions
\"${FORGE_API}/issues/${ISSUE}/comments\" \\ \"${FORGE_API}/issues/${ISSUE}/comments\" \\
-d \"{\\\"body\\\": \\\"your comment here\\\"}\" -d \"{\\\"body\\\": \\\"your comment here\\\"}\"
4. If a step requires human input or approval, write PHASE:escalate with a reason. 4. If a step requires human input or approval, post a comment explaining what
A human will review and respond via the forge. is needed and stop — the orchestrator will block the issue.
### Path A: If this action produces code changes (e.g. config updates, baselines): ### Path A: If this action produces code changes (e.g. config updates, baselines):
- You are already in an isolated worktree at: ${WORKTREE} - You are already in an isolated worktree at: ${WORKTREE}
- Create and switch to branch: git checkout -b ${BRANCH} - You are on branch: ${BRANCH}
- Make your changes, commit, and push: git push ${FORGE_REMOTE} ${BRANCH} - Make your changes, commit, and push: git push ${FORGE_REMOTE} ${BRANCH}
- **IMPORTANT:** The worktree is destroyed after completion. Push all - **IMPORTANT:** The worktree is destroyed after completion. Push all
results before signaling done — unpushed work will be lost. results before finishing — unpushed work will be lost.
- Follow the phase protocol below — the orchestrator handles PR creation,
CI monitoring, and review injection.
### Path B: If this action produces no code changes (investigation, report): ### Path B: If this action produces no code changes (investigation, report):
- Post results as a comment on issue #${ISSUE}. - Post results as a comment on issue #${ISSUE}.
- **IMPORTANT:** The worktree is destroyed after completion. Copy any - **IMPORTANT:** The worktree is destroyed after completion. Copy any
files you need to persistent paths before signaling done. files you need to persistent paths before finishing.
- Close the issue:
curl -sf -X PATCH \\
-H \"Authorization: token \${FORGE_TOKEN}\" \\
-H 'Content-Type: application/json' \\
\"${FORGE_API}/issues/${ISSUE}\" \\
-d '{\"state\": \"closed\"}'
- Signal completion: echo \"PHASE:done\" > \"${PHASE_FILE}\"
5. Environment variables available in your bash sessions: 5. Environment variables available in your bash sessions:
FORGE_TOKEN, FORGE_API, FORGE_REPO, FORGE_WEB, PROJECT_NAME FORGE_TOKEN, FORGE_API, FORGE_REPO, FORGE_WEB, PROJECT_NAME
@ -286,73 +245,79 @@ ${PRIOR_SECTION}## Instructions
If the prior comments above show work already completed, resume from where it If the prior comments above show work already completed, resume from where it
left off. left off.
${SCRATCH_INSTRUCTION} ${GIT_INSTRUCTIONS}"
${PHASE_PROTOCOL_INSTRUCTIONS}"
# --- Create tmux session ---
log "creating tmux session: ${SESSION_NAME}"
if ! create_agent_session "${SESSION_NAME}" "${WORKTREE}" "${PHASE_FILE}"; then
log "ERROR: failed to create tmux session"
exit 1
fi
# --- Inject initial prompt ---
inject_formula "${SESSION_NAME}" "${INITIAL_PROMPT}"
log "initial prompt injected into session"
# --- Wall-clock lifetime watchdog (background) --- # --- Wall-clock lifetime watchdog (background) ---
# Caps total session time independently of idle timeout. When the cap is # Caps total run time independently of claude -p timeout. When the cap is
# hit the watchdog kills the tmux session, posts a summary comment on the # hit the watchdog kills the main process, which triggers cleanup via trap.
# issue, and writes PHASE:failed so monitor_phase_loop exits.
_lifetime_watchdog() { _lifetime_watchdog() {
local remaining=$(( MAX_LIFETIME - ($(date +%s) - SESSION_START_EPOCH) )) local remaining=$(( MAX_LIFETIME - ($(date +%s) - SESSION_START_EPOCH) ))
[ "$remaining" -le 0 ] && remaining=1 [ "$remaining" -le 0 ] && remaining=1
sleep "$remaining" sleep "$remaining"
local hours=$(( MAX_LIFETIME / 3600 )) local hours=$(( MAX_LIFETIME / 3600 ))
log "MAX_LIFETIME (${hours}h) reached — killing session" log "MAX_LIFETIME (${hours}h) reached — killing agent"
agent_kill_session "$SESSION_NAME"
# Post summary comment on issue # Post summary comment on issue
local body="Action session killed: wall-clock lifetime cap (${hours}h) reached." local body="Action agent killed: wall-clock lifetime cap (${hours}h) reached."
curl -sf -X POST \ curl -sf -X POST \
-H "Authorization: token ${FORGE_TOKEN}" \ -H "Authorization: token ${FORGE_TOKEN}" \
-H 'Content-Type: application/json' \ -H 'Content-Type: application/json' \
"${FORGE_API}/issues/${ISSUE}/comments" \ "${FORGE_API}/issues/${ISSUE}/comments" \
-d "{\"body\": \"${body}\"}" >/dev/null 2>&1 || true -d "{\"body\": \"${body}\"}" >/dev/null 2>&1 || true
printf 'PHASE:failed\nReason: max_lifetime (%sh) reached\n' "$hours" > "$PHASE_FILE" kill $$ 2>/dev/null || true
# Touch phase-changed marker so monitor_phase_loop picks up immediately
touch "/tmp/phase-changed-${SESSION_NAME}.marker"
} }
_lifetime_watchdog & _lifetime_watchdog &
LIFETIME_WATCHDOG_PID=$! LIFETIME_WATCHDOG_PID=$!
# --- Monitor phase loop (shared with dev-agent) --- # --- Run agent ---
status "monitoring phase: ${PHASE_FILE} (action agent)" log "running agent (worktree: ${WORKTREE})"
monitor_phase_loop "$PHASE_FILE" "$IDLE_TIMEOUT" _on_phase_change "$SESSION_NAME" agent_run --worktree "$WORKTREE" "$PROMPT"
log "agent_run complete"
# Handle exit reason from monitor_phase_loop # --- Detect if branch was pushed (Path A vs Path B) ---
case "${_MONITOR_LOOP_EXIT:-}" in PUSHED=false
idle_timeout) # Check if remote branch exists
# Post diagnostic comment + label blocked git fetch "${FORGE_REMOTE}" "$BRANCH" 2>/dev/null || true
post_blocked_diagnostic "idle_timeout" if git rev-parse --verify "${FORGE_REMOTE}/${BRANCH}" >/dev/null 2>&1; then
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$SCRATCH_FILE" PUSHED=true
;; fi
idle_prompt) # Fallback: check local commits ahead of base
# Notification + blocked label already handled by _on_phase_change(PHASE:failed) callback if [ "$PUSHED" = false ]; then
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$SCRATCH_FILE" if git -C "$WORKTREE" log "${FORGE_REMOTE}/${PRIMARY_BRANCH}..${BRANCH}" --oneline 2>/dev/null | grep -q .; then
;; PUSHED=true
PHASE:failed) fi
# Check if this was a max_lifetime kill (phase file contains the reason) fi
if grep -q 'max_lifetime' "$PHASE_FILE" 2>/dev/null; then
post_blocked_diagnostic "max_lifetime" if [ "$PUSHED" = true ]; then
fi # --- Path A: code changes pushed — create PR and walk to merge ---
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$SCRATCH_FILE" log "branch pushed — creating PR"
;; PR_NUMBER=""
done) PR_NUMBER=$(pr_create "$BRANCH" "action: ${ISSUE_TITLE}" \
# Belt-and-suspenders: callback handles primary cleanup, "Closes #${ISSUE}
# but ensure sentinel files are removed if callback was interrupted
rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$SCRATCH_FILE" Automated action execution by action-agent.") || true
;;
esac if [ -n "$PR_NUMBER" ]; then
log "walking PR #${PR_NUMBER} to merge"
pr_walk_to_merge "$PR_NUMBER" "$_AGENT_SESSION_ID" "$WORKTREE" || true
case "${_PR_WALK_EXIT_REASON:-}" in
merged)
log "PR #${PR_NUMBER} merged — closing issue"
issue_close "$ISSUE"
;;
*)
log "PR #${PR_NUMBER} not merged (reason: ${_PR_WALK_EXIT_REASON:-unknown})"
issue_block "$ISSUE" "pr_not_merged: ${_PR_WALK_EXIT_REASON:-unknown}"
;;
esac
else
log "ERROR: failed to create PR"
issue_block "$ISSUE" "pr_creation_failed"
fi
else
# --- Path B: no code changes — close issue directly ---
log "no branch pushed — closing issue (Path B)"
issue_close "$ISSUE"
fi
log "action-agent finished for issue #${ISSUE}" log "action-agent finished for issue #${ISSUE}"