From 5adf34e6956cca7520c80d8bae063731d3b34e90 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 28 Mar 2026 07:02:50 +0000 Subject: [PATCH] fix: Migrate gardener-run.sh to SDK + pr-lifecycle (#801) Replace tmux-based run_formula_and_monitor architecture with synchronous agent_run() from agent-sdk.sh. Replace custom CI/review/merge phase callbacks (~350 lines) with pr_walk_to_merge() from pr-lifecycle.sh. Key changes: - Source agent-sdk.sh + pr-lifecycle.sh instead of agent-session.sh - One-shot claude -p invocation replaces tmux session management - Bash script IS the state machine (no phase files needed) - Keep _gardener_execute_manifest() for post-merge manifest execution - Keep all guards, formula loading, context building unchanged Co-Authored-By: Claude Opus 4.6 (1M context) --- gardener/gardener-run.sh | 506 +++++++-------------------------------- 1 file changed, 88 insertions(+), 418 deletions(-) diff --git a/gardener/gardener-run.sh b/gardener/gardener-run.sh index 8b3e2ae..733583d 100755 --- a/gardener/gardener-run.sh +++ b/gardener/gardener-run.sh @@ -1,10 +1,18 @@ #!/usr/bin/env bash # ============================================================================= -# gardener-run.sh — Cron wrapper: gardener execution via Claude + formula +# gardener-run.sh — Cron wrapper: gardener execution via SDK + formula # -# Runs 4x/day (or on-demand). Guards against concurrent runs and low memory. -# Creates a tmux session with Claude (sonnet) reading formulas/run-gardener.toml. -# No action issues — the gardener is a nervous system component, not work (AD-001). +# Synchronous bash loop using claude -p (one-shot invocation). +# No tmux sessions, no phase files — the bash script IS the state machine. +# +# Flow: +# 1. Guards: cron lock, memory check +# 2. Load formula (formulas/run-gardener.toml) +# 3. Build context: AGENTS.md, scratch file, prompt footer +# 4. agent_run(worktree, prompt) → Claude does maintenance, pushes if needed +# 5. If pushed: pr_walk_to_merge() from lib/pr-lifecycle.sh +# 6. Post-merge: execute pending actions manifest (gardener/pending-actions.json) +# 7. Mirror push # # Usage: # gardener-run.sh [projects/disinto.toml] # project config (default: disinto) @@ -22,8 +30,6 @@ export PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}" source "$FACTORY_ROOT/lib/env.sh" # Use gardener-bot's own Forgejo identity (#747) FORGE_TOKEN="${FORGE_GARDENER_TOKEN:-${FORGE_TOKEN}}" -# shellcheck source=../lib/agent-session.sh -source "$FACTORY_ROOT/lib/agent-session.sh" # shellcheck source=../lib/formula-session.sh source "$FACTORY_ROOT/lib/formula-session.sh" # shellcheck source=../lib/worktree.sh @@ -34,26 +40,20 @@ source "$FACTORY_ROOT/lib/ci-helpers.sh" source "$FACTORY_ROOT/lib/mirrors.sh" # shellcheck source=../lib/guard.sh source "$FACTORY_ROOT/lib/guard.sh" +# shellcheck source=../lib/agent-sdk.sh +source "$FACTORY_ROOT/lib/agent-sdk.sh" +# shellcheck source=../lib/pr-lifecycle.sh +source "$FACTORY_ROOT/lib/pr-lifecycle.sh" LOG_FILE="$SCRIPT_DIR/gardener.log" -# shellcheck disable=SC2034 # consumed by run_formula_and_monitor -SESSION_NAME="gardener-${PROJECT_NAME}" -PHASE_FILE="/tmp/gardener-session-${PROJECT_NAME}.phase" - -# shellcheck disable=SC2034 # read by monitor_phase_loop in lib/agent-session.sh -PHASE_POLL_INTERVAL=15 - +# shellcheck disable=SC2034 # consumed by agent-sdk.sh +LOGFILE="$LOG_FILE" +# shellcheck disable=SC2034 # consumed by agent-sdk.sh +SID_FILE="/tmp/gardener-session-${PROJECT_NAME}.sid" SCRATCH_FILE="/tmp/gardener-${PROJECT_NAME}-scratch.md" RESULT_FILE="/tmp/gardener-result-${PROJECT_NAME}.txt" GARDENER_PR_FILE="/tmp/gardener-pr-${PROJECT_NAME}.txt" - -# Merge-through state (used by _gardener_on_phase_change callback) -_GARDENER_PR="" -_GARDENER_MERGE_START=0 -_GARDENER_MERGE_TIMEOUT=1800 # 30 min -_GARDENER_CI_FIX_COUNT=0 -_GARDENER_REVIEW_ROUND=0 -_GARDENER_CRASH_COUNT=0 +WORKTREE="/tmp/${PROJECT_NAME}-gardener-run" log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; } @@ -72,7 +72,7 @@ build_context_block AGENTS.md SCRATCH_CONTEXT=$(read_scratch_context "$SCRATCH_FILE") SCRATCH_INSTRUCTION=$(build_scratch_instruction "$SCRATCH_FILE") -# ── Build prompt (manifest format reference for deferred actions) ───────── +# ── Build prompt ───────────────────────────────────────────────────────── GARDENER_API_EXTRA=" ## Pending-actions manifest (REQUIRED) @@ -91,28 +91,28 @@ Supported actions: The commit-and-pr step converts JSONL to JSON array. The orchestrator executes actions after the PR merges. Do NOT call mutation APIs directly during the run." -build_prompt_footer "$GARDENER_API_EXTRA" -# Extend phase protocol with merge-through instructions for compaction survival -PROMPT_FOOTER="${PROMPT_FOOTER} +PROMPT_FOOTER="## Forge API reference +Base URL: ${FORGE_API} +Auth header: -H \"Authorization: token \${FORGE_TOKEN}\" + Read issue: curl -sf -H \"Authorization: token \${FORGE_TOKEN}\" '${FORGE_API}/issues/{number}' | jq '.body' + Create issue: curl -sf -X POST -H \"Authorization: token \${FORGE_TOKEN}\" -H 'Content-Type: application/json' '${FORGE_API}/issues' -d '{\"title\":\"...\",\"body\":\"...\",\"labels\":[LABEL_ID]}'${GARDENER_API_EXTRA} + List labels: curl -sf -H \"Authorization: token \${FORGE_TOKEN}\" '${FORGE_API}/labels' +NEVER echo or include the actual token value in output — always reference \${FORGE_TOKEN}. -## Merge-through protocol (commit-and-pr step) -After creating the PR, write the PR number and signal CI: +## Environment +FACTORY_ROOT=${FACTORY_ROOT} +PROJECT_REPO_ROOT=${PROJECT_REPO_ROOT} +OPS_REPO_ROOT=${OPS_REPO_ROOT} +PRIMARY_BRANCH=${PRIMARY_BRANCH} + +## Completion protocol (REQUIRED) +When the commit-and-pr step creates a PR, write the PR number and stop: echo \"\$PR_NUMBER\" > '${GARDENER_PR_FILE}' - echo 'PHASE:awaiting_ci' > '${PHASE_FILE}' -Then STOP and WAIT for CI results. -When 'CI passed' is injected: - echo 'PHASE:awaiting_review' > '${PHASE_FILE}' -Then STOP and WAIT. -When 'CI failed' is injected: - Fix, commit, push, then: echo 'PHASE:awaiting_ci' > '${PHASE_FILE}' -When review feedback is injected: - Address all feedback, commit, push, then: echo 'PHASE:awaiting_ci' > '${PHASE_FILE}' -If no file changes in commit-and-pr: - echo 'PHASE:done' > '${PHASE_FILE}'" +Then STOP. Do NOT write PHASE: signals — the orchestrator handles CI, review, and merge. +If no file changes exist (empty commit-and-pr), just stop — no PR needed." -# shellcheck disable=SC2034 # consumed by run_formula_and_monitor -PROMPT="You are the issue gardener for ${FORGE_REPO}. Work through the formula below. Follow the phase protocol: if the commit-and-pr step creates a PR, write PHASE:awaiting_ci and wait for orchestrator CI/review/merge handling. If no file changes, write PHASE:done. The orchestrator will time you out if you return to the prompt without signalling. +PROMPT="You are the issue gardener for ${FORGE_REPO}. Work through the formula below. You have full shell access and --dangerously-skip-permissions. Fix what you can. File vault items for what you cannot. Do NOT ask permission — act first, report after. @@ -130,14 +130,21 @@ ${FORMULA_CONTENT} ${SCRATCH_INSTRUCTION} ${PROMPT_FOOTER}" -# ── Phase callback for merge-through ───────────────────────────────────── -# Handles CI polling, review injection, merge, and cleanup after PR creation. -# Lighter than dev/phase-handler.sh — tailored for gardener doc-only PRs. +# ── Create worktree ────────────────────────────────────────────────────── +cd "$PROJECT_REPO_ROOT" +git fetch origin "$PRIMARY_BRANCH" 2>/dev/null || true +worktree_cleanup "$WORKTREE" +git worktree add "$WORKTREE" "origin/${PRIMARY_BRANCH}" --detach 2>/dev/null -# ── Post-merge manifest execution ───────────────────────────────────── +cleanup() { + worktree_cleanup "$WORKTREE" + rm -f "$GARDENER_PR_FILE" +} +trap cleanup EXIT + +# ── Post-merge manifest execution ──────────────────────────────────────── # Reads gardener/pending-actions.json and executes each action via API. # Failed actions are logged but do not block completion. -# shellcheck disable=SC2317 # called indirectly via _gardener_merge _gardener_execute_manifest() { local manifest_file="$PROJECT_REPO_ROOT/gardener/pending-actions.json" if [ ! -f "$manifest_file" ]; then @@ -295,387 +302,50 @@ _gardener_execute_manifest() { log "manifest: execution complete (${count} actions processed)" } -# shellcheck disable=SC2317 # called indirectly by monitor_phase_loop -_gardener_merge() { - local merge_response merge_http_code - merge_response=$(curl -s -w "\n%{http_code}" -X POST \ - -H "Authorization: token ${FORGE_TOKEN}" \ - -H 'Content-Type: application/json' \ - "${FORGE_API}/pulls/${_GARDENER_PR}/merge" \ - -d '{"Do":"merge","delete_branch_after_merge":true}') || true - merge_http_code=$(echo "$merge_response" | tail -1) +# ── Reset result file ──────────────────────────────────────────────────── +rm -f "$RESULT_FILE" "$GARDENER_PR_FILE" +touch "$RESULT_FILE" - if [ "$merge_http_code" = "200" ] || [ "$merge_http_code" = "204" ]; then - log "gardener PR #${_GARDENER_PR} merged" - # Pull merged primary branch and push to mirrors +# ── Run agent ───────────────────────────────────────────────────────────── +export CLAUDE_MODEL="sonnet" + +agent_run --worktree "$WORKTREE" "$PROMPT" +log "agent_run complete" + +# ── Detect PR ───────────────────────────────────────────────────────────── +PR_NUMBER="" +if [ -f "$GARDENER_PR_FILE" ]; then + PR_NUMBER=$(tr -d '[:space:]' < "$GARDENER_PR_FILE") +fi + +# Fallback: search for open gardener PRs +if [ -z "$PR_NUMBER" ]; then + PR_NUMBER=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ + "${FORGE_API}/pulls?state=open&limit=10" | \ + jq -r '[.[] | select(.head.ref | startswith("chore/gardener-"))] | .[0].number // empty') || true +fi + +# ── Walk PR to merge ────────────────────────────────────────────────────── +if [ -n "$PR_NUMBER" ]; then + log "walking PR #${PR_NUMBER} to merge" + pr_walk_to_merge "$PR_NUMBER" "$_AGENT_SESSION_ID" "$WORKTREE" || true + + if [ "$_PR_WALK_EXIT_REASON" = "merged" ]; then + # Post-merge: pull primary, mirror push, execute manifest git -C "$PROJECT_REPO_ROOT" fetch origin "$PRIMARY_BRANCH" 2>/dev/null || true git -C "$PROJECT_REPO_ROOT" checkout "$PRIMARY_BRANCH" 2>/dev/null || true git -C "$PROJECT_REPO_ROOT" pull --ff-only origin "$PRIMARY_BRANCH" 2>/dev/null || true mirror_push _gardener_execute_manifest - printf 'PHASE:done\n' > "$PHASE_FILE" - return 0 - fi - - # Already merged (race)? - if [ "$merge_http_code" = "405" ]; then - local pr_merged - pr_merged=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/pulls/${_GARDENER_PR}" | jq -r '.merged // false') || true - if [ "$pr_merged" = "true" ]; then - log "gardener PR #${_GARDENER_PR} already merged" - # Pull merged primary branch and push to mirrors - git -C "$PROJECT_REPO_ROOT" fetch origin "$PRIMARY_BRANCH" 2>/dev/null || true - git -C "$PROJECT_REPO_ROOT" checkout "$PRIMARY_BRANCH" 2>/dev/null || true - git -C "$PROJECT_REPO_ROOT" pull --ff-only origin "$PRIMARY_BRANCH" 2>/dev/null || true - mirror_push - _gardener_execute_manifest - printf 'PHASE:done\n' > "$PHASE_FILE" - return 0 - fi - log "gardener merge blocked (HTTP 405)" - printf 'PHASE:failed\nReason: gardener PR #%s merge blocked (HTTP 405)\n' \ - "$_GARDENER_PR" > "$PHASE_FILE" - return 0 - fi - - # Other failure (likely conflicts) — tell Claude to rebase - log "gardener merge failed (HTTP ${merge_http_code}) — requesting rebase" - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" \ - "Merge failed for PR #${_GARDENER_PR} (likely conflicts). Rebase and push: - git fetch origin ${PRIMARY_BRANCH} && git rebase origin/${PRIMARY_BRANCH} - git push --force-with-lease origin HEAD - echo \"PHASE:awaiting_ci\" > \"${PHASE_FILE}\" -If rebase fails, write PHASE:failed with a reason." -} - -# shellcheck disable=SC2317 # called indirectly by monitor_phase_loop -_gardener_timeout_cleanup() { - log "gardener merge-through timed out (${_GARDENER_MERGE_TIMEOUT}s) — closing PR" - if [ -n "$_GARDENER_PR" ]; then - curl -sf -X PATCH \ - -H "Authorization: token ${FORGE_TOKEN}" \ - -H 'Content-Type: application/json' \ - "${FORGE_API}/pulls/${_GARDENER_PR}" \ - -d '{"state":"closed"}' >/dev/null 2>&1 || true - fi - printf 'PHASE:failed\nReason: merge-through timeout (%ss)\n' \ - "$_GARDENER_MERGE_TIMEOUT" > "$PHASE_FILE" -} - -# shellcheck disable=SC2317 # called indirectly by monitor_phase_loop -_gardener_handle_ci() { - # Start merge-through timer on first CI phase - if [ "$_GARDENER_MERGE_START" -eq 0 ]; then - _GARDENER_MERGE_START=$(date +%s) - fi - - # Check merge-through timeout - local elapsed - elapsed=$(( $(date +%s) - _GARDENER_MERGE_START )) - if [ "$elapsed" -ge "$_GARDENER_MERGE_TIMEOUT" ]; then - _gardener_timeout_cleanup - return 0 - fi - - # Discover PR number if unknown - if [ -z "$_GARDENER_PR" ]; then - if [ -f "$GARDENER_PR_FILE" ]; then - _GARDENER_PR=$(tr -d '[:space:]' < "$GARDENER_PR_FILE") - fi - # Fallback: search for open gardener PRs - if [ -z "$_GARDENER_PR" ]; then - _GARDENER_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/pulls?state=open&limit=10" | \ - jq -r '[.[] | select(.head.ref | startswith("chore/gardener-"))] | .[0].number // empty') || true - fi - if [ -z "$_GARDENER_PR" ]; then - log "ERROR: cannot find gardener PR" - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" \ - "ERROR: Could not find the gardener PR. Verify branch was pushed and PR created. Write the PR number to ${GARDENER_PR_FILE}, then write PHASE:awaiting_ci again." - return 0 - fi - log "tracking gardener PR #${_GARDENER_PR}" - fi - - # Skip CI for doc-only PRs - if ! ci_required_for_pr "$_GARDENER_PR" 2>/dev/null; then - log "CI not required (doc-only) — treating as passed" - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" \ - "CI passed on PR #${_GARDENER_PR} (doc-only changes, CI not required). -Write PHASE:awaiting_review to the phase file, then stop and wait: - echo \"PHASE:awaiting_review\" > \"${PHASE_FILE}\"" - return 0 - fi - - # No CI configured? - if [ "${WOODPECKER_REPO_ID:-2}" = "0" ]; then - log "no CI configured — treating as passed" - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" \ - "CI passed on PR #${_GARDENER_PR} (no CI configured). -Write PHASE:awaiting_review to the phase file, then stop and wait: - echo \"PHASE:awaiting_review\" > \"${PHASE_FILE}\"" - return 0 - fi - - # Get HEAD SHA from PR - local head_sha - head_sha=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/pulls/${_GARDENER_PR}" | jq -r '.head.sha // empty') || true - - if [ -z "$head_sha" ]; then - log "WARNING: could not get HEAD SHA for PR #${_GARDENER_PR}" - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" \ - "WARNING: Could not read HEAD SHA for PR #${_GARDENER_PR}. Verify push succeeded. Then write PHASE:awaiting_ci again." - return 0 - fi - - # Poll CI (15 min max within this phase) - local ci_done=false ci_state="unknown" ci_elapsed=0 ci_timeout=900 - while [ "$ci_elapsed" -lt "$ci_timeout" ]; do - sleep 30 - ci_elapsed=$((ci_elapsed + 30)) - - # Session health check - if [ -f "/tmp/claude-exited-${_MONITOR_SESSION:-$SESSION_NAME}.ts" ] || \ - ! tmux has-session -t "${_MONITOR_SESSION:-$SESSION_NAME}" 2>/dev/null; then - log "session died during CI wait" - return 0 - fi - - # Merge-through timeout check - elapsed=$(( $(date +%s) - _GARDENER_MERGE_START )) - if [ "$elapsed" -ge "$_GARDENER_MERGE_TIMEOUT" ]; then - _gardener_timeout_cleanup - return 0 - fi - - # Re-fetch HEAD in case Claude pushed new commits - head_sha=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/pulls/${_GARDENER_PR}" | jq -r '.head.sha // empty') || true - - ci_state=$(ci_commit_status "$head_sha") || ci_state="unknown" - - case "$ci_state" in - success|failure|error) ci_done=true; break ;; - esac - done - - if ! $ci_done; then - log "CI timeout for PR #${_GARDENER_PR}" - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" \ - "CI TIMEOUT: CI did not complete within 15 minutes for PR #${_GARDENER_PR}. Write PHASE:failed with a reason if you cannot proceed." - return 0 - fi - - log "CI: ${ci_state} for PR #${_GARDENER_PR}" - - if [ "$ci_state" = "success" ]; then - _GARDENER_CI_FIX_COUNT=0 - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" \ - "CI passed on PR #${_GARDENER_PR}. -Write PHASE:awaiting_review to the phase file, then stop and wait: - echo \"PHASE:awaiting_review\" > \"${PHASE_FILE}\"" + rm -f "$SCRATCH_FILE" + log "gardener PR #${PR_NUMBER} merged — manifest executed" else - _GARDENER_CI_FIX_COUNT=$(( _GARDENER_CI_FIX_COUNT + 1 )) - if [ "$_GARDENER_CI_FIX_COUNT" -gt 3 ]; then - log "CI exhausted after ${_GARDENER_CI_FIX_COUNT} attempts" - printf 'PHASE:failed\nReason: gardener CI exhausted after %d attempts\n' \ - "$_GARDENER_CI_FIX_COUNT" > "$PHASE_FILE" - return 0 - fi - - # Get error details - local pipeline_num ci_error_log - pipeline_num=$(ci_pipeline_number "$head_sha") - - ci_error_log="" - if [ -n "$pipeline_num" ]; then - ci_error_log=$(bash "${FACTORY_ROOT}/lib/ci-debug.sh" failures "$pipeline_num" 2>/dev/null \ - | tail -80 | head -c 8000 || true) - fi - - agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" \ - "CI failed on PR #${_GARDENER_PR} (attempt ${_GARDENER_CI_FIX_COUNT}/3). -${ci_error_log:+Error output: -${ci_error_log} -}Fix the issue, commit, push, then write: - echo \"PHASE:awaiting_ci\" > \"${PHASE_FILE}\" -Then stop and wait." + log "PR #${PR_NUMBER} not merged (reason: ${_PR_WALK_EXIT_REASON:-unknown})" fi -} - -# shellcheck disable=SC2317 # called indirectly by monitor_phase_loop -_gardener_handle_review() { - log "waiting for review on PR #${_GARDENER_PR:-?}" - _GARDENER_CI_FIX_COUNT=0 # Reset CI fix budget for next review cycle - - local review_elapsed=0 review_timeout=1800 - while [ "$review_elapsed" -lt "$review_timeout" ]; do - sleep 60 # 1 min between review checks (gardener PRs are fast-tracked) - review_elapsed=$((review_elapsed + 60)) - - # Session health check - if [ -f "/tmp/claude-exited-${_MONITOR_SESSION:-$SESSION_NAME}.ts" ] || \ - ! tmux has-session -t "${_MONITOR_SESSION:-$SESSION_NAME}" 2>/dev/null; then - log "session died during review wait" - return 0 - fi - - # Merge-through timeout check - local elapsed - elapsed=$(( $(date +%s) - _GARDENER_MERGE_START )) - if [ "$elapsed" -ge "$_GARDENER_MERGE_TIMEOUT" ]; then - _gardener_timeout_cleanup - return 0 - fi - - # Check if phase changed while we wait (e.g. review-poll injected feedback) - local new_mtime - new_mtime=$(stat -c %Y "$PHASE_FILE" 2>/dev/null || echo 0) - if [ "$new_mtime" -gt "${LAST_PHASE_MTIME:-0}" ]; then - log "phase changed during review wait — returning to monitor loop" - return 0 - fi - - # Check for review on current HEAD - local review_sha review_comment - review_sha=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \ - "${FORGE_API}/pulls/${_GARDENER_PR}" | jq -r '.head.sha // empty') || true - - review_comment=$(forge_api_all "/issues/${_GARDENER_PR}/comments" 2>/dev/null | \ - jq -r --arg sha "${review_sha:-none}" \ - '[.[] | select(.body | contains("