From 42620a13416511646cab2d51f6483cf4ce0091b3 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 20 Mar 2026 23:51:46 +0000
Subject: [PATCH] fix: No combined wall-clock + idle cap for action-agent
 sessions (#334)

Add ACTION_MAX_LIFETIME env var (default 8h) that caps total session
wall-clock time independently of ACTION_IDLE_TIMEOUT.  A background
watchdog sleeps for the remaining lifetime and, when triggered, kills
the tmux session, posts a summary comment on the issue, writes
PHASE:failed with a max_lifetime reason, and escalates to the
supervisor.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 AGENTS.md              |  1 +
 action/action-agent.sh | 43 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/AGENTS.md b/AGENTS.md
index 1bc24d1..d518752 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -270,6 +270,7 @@ issues labeled `action` that have no active tmux session, then spawns
 - `CODEBERG_TOKEN`, `CODEBERG_REPO`, `CODEBERG_API`, `PROJECT_NAME`, `CODEBERG_WEB`
 - `MATRIX_TOKEN`, `MATRIX_ROOM_ID`, `MATRIX_HOMESERVER` — Matrix notifications + human input
 - `ACTION_IDLE_TIMEOUT` — Max seconds before killing idle session (default 14400 = 4h)
+- `ACTION_MAX_LIFETIME` — Max total session wall-clock seconds (default 28800 = 8h); caps session independently of idle timeout
 
 ---
 
diff --git a/action/action-agent.sh b/action/action-agent.sh
index f0920a4..d1a4fcb 100644
--- a/action/action-agent.sh
+++ b/action/action-agent.sh
@@ -32,6 +32,8 @@ LOCKFILE="/tmp/action-agent-${ISSUE}.lock"
 LOGFILE="${FACTORY_ROOT}/action/action-poll-${PROJECT_NAME:-harb}.log"
 THREAD_FILE="/tmp/action-thread-${ISSUE}"
 IDLE_TIMEOUT="${ACTION_IDLE_TIMEOUT:-14400}"  # 4h default
+MAX_LIFETIME="${ACTION_MAX_LIFETIME:-28800}" # 8h default wall-clock cap
+SESSION_START_EPOCH=$(date +%s)
 
 # --- Phase handler globals (agent-specific; defaults in phase-handler.sh) ---
 # shellcheck disable=SC2034  # used by phase-handler.sh
@@ -84,6 +86,11 @@ fi
 echo $$ > "$LOCKFILE"
 
 cleanup() {
+  # Kill lifetime watchdog if running
+  if [ -n "${LIFETIME_WATCHDOG_PID:-}" ] && kill -0 "$LIFETIME_WATCHDOG_PID" 2>/dev/null; then
+    kill "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
+    wait "$LIFETIME_WATCHDOG_PID" 2>/dev/null || true
+  fi
   rm -f "$LOCKFILE"
   agent_kill_session "$SESSION_NAME"
   # Best-effort docker cleanup for containers started during this action
@@ -262,6 +269,31 @@ log "initial prompt injected into session"
 matrix_send "action" "⚡ #${ISSUE}: session started — ${ISSUE_TITLE}" \
   "${THREAD_ID}" 2>/dev/null || true
 
+# --- Wall-clock lifetime watchdog (background) ---
+# Caps total session time independently of idle timeout.  When the cap is
+# hit the watchdog kills the tmux session, posts a summary comment on the
+# issue, and writes PHASE:failed so monitor_phase_loop exits.
+_lifetime_watchdog() {
+  local remaining=$(( MAX_LIFETIME - ($(date +%s) - SESSION_START_EPOCH) ))
+  [ "$remaining" -le 0 ] && remaining=1
+  sleep "$remaining"
+  local hours=$(( MAX_LIFETIME / 3600 ))
+  log "MAX_LIFETIME (${hours}h) reached — killing session"
+  agent_kill_session "$SESSION_NAME"
+  # Post summary comment on issue
+  local body="Action session killed: wall-clock lifetime cap (${hours}h) reached."
+  curl -sf -X POST \
+    -H "Authorization: token ${CODEBERG_TOKEN}" \
+    -H 'Content-Type: application/json' \
+    "${CODEBERG_API}/issues/${ISSUE}/comments" \
+    -d "{\"body\": \"${body}\"}" >/dev/null 2>&1 || true
+  printf 'PHASE:failed\nReason: max_lifetime (%sh) reached\n' "$hours" > "$PHASE_FILE"
+  # Touch phase-changed marker so monitor_phase_loop picks up immediately
+  touch "/tmp/phase-changed-${SESSION_NAME}.marker"
+}
+_lifetime_watchdog &
+LIFETIME_WATCHDOG_PID=$!
+
 # --- Monitor phase loop (shared with dev-agent) ---
 status "monitoring phase: ${PHASE_FILE} (action agent)"
 monitor_phase_loop "$PHASE_FILE" "$IDLE_TIMEOUT" _on_phase_change "$SESSION_NAME"
@@ -281,6 +313,17 @@ case "${_MONITOR_LOOP_EXIT:-}" in
     # Notification + escalation already handled by _on_phase_change(PHASE:failed) callback
     rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "$SCRATCH_FILE"
     ;;
+  PHASE:failed)
+    # Check if this was a max_lifetime kill (phase file contains the reason)
+    if grep -q 'max_lifetime' "$PHASE_FILE" 2>/dev/null; then
+      notify_ctx \
+        "session killed — wall-clock cap ($((MAX_LIFETIME / 3600))h) reached" \
+        "session killed — wall-clock cap ($((MAX_LIFETIME / 3600))h) reached"
+      echo "{\"issue\":${ISSUE},\"pr\":${PR_NUMBER:-0},\"reason\":\"max_lifetime\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" \
+        >> "${FACTORY_ROOT}/supervisor/escalations-${PROJECT_NAME}.jsonl"
+    fi
+    rm -f "$PHASE_FILE" "${PHASE_FILE%.phase}.context" "$IMPL_SUMMARY_FILE" "$THREAD_FILE" "$SCRATCH_FILE"
+    ;;
   done)
     # Belt-and-suspenders: callback handles primary cleanup,
     # but ensure sentinel files are removed if callback was interrupted