disinto/lib/agent-session.sh

#!/usr/bin/env bash
# agent-session.sh — Shared tmux + Claude interactive session helpers
#
# Source this into agent orchestrator scripts for reusable session management.
#
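# Functions:
#   agent_wait_for_claude_ready SESSION_NAME [TIMEOUT_SECS]
#   agent_inject_into_session   SESSION_NAME TEXT
#   agent_kill_session          SESSION_NAME
#   monitor_phase_loop          PHASE_FILE IDLE_TIMEOUT_SECS CALLBACK_FN [SESSION_NAME]
#   session_lock_acquire        [TIMEOUT_SECS]
#   session_lock_release
#   create_agent_session        SESSION_NAME [WORKDIR] [PHASE_FILE]
#   inject_formula              SESSION_NAME TEXT
#   write_compact_context       PHASE_FILE CONTENT
#   read_phase                  [PHASE_FILE]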

# --- Cooperative session lock (fd-based) ---
# File descriptor number for the session lock; empty until the lock file has
# been opened. It is opened by create_agent_session() or, if still empty at
# that point, by session_lock_acquire().
# Callers can release/re-acquire via session_lock_release/session_lock_acquire
# to allow other Claude sessions during idle phases (awaiting_review/awaiting_ci).
# NOTE: sourcing this file resets the variable to empty, so a descriptor opened
# before a re-source is no longer tracked here.
SESSION_LOCK_FD=""

# Drop the cooperative session lock while leaving its descriptor open, so the
# same descriptor can be locked again later by session_lock_acquire.
# Does nothing (and succeeds) when no lock descriptor has been opened yet;
# otherwise returns the status of `flock -u`.
session_lock_release() {
  [ -n "${SESSION_LOCK_FD:-}" ] || return 0
  flock -u "$SESSION_LOCK_FD"
}

# Re-acquire the session lock. Blocks until available or timeout.
# Opens the lock fd on ${HOME}/.claude/session.lock if it is not already open
# (for use by external callers that never went through create_agent_session).
# Globals: SESSION_LOCK_FD (read; written when the fd is opened here), HOME (read)
# Args: [timeout_secs] (default 300)
# Returns 0 on success, 1 on timeout/error (lock directory cannot be created,
# lock file cannot be opened, or flock fails).
# shellcheck disable=SC2120  # timeout arg is used by external callers
session_lock_acquire() {
  local timeout="${1:-300}"
  if [ -z "${SESSION_LOCK_FD:-}" ]; then
    local lock_dir="${HOME}/.claude"
    local fd=""
    mkdir -p "$lock_dir" || return 1
    # Open into a local first: a failed open must not leave the global
    # half-initialised, otherwise flock would be called with a bogus fd.
    exec {fd}>>"${lock_dir}/session.lock" || return 1
    SESSION_LOCK_FD="$fd"
  fi
  # Normalise every flock failure (timeout, bad fd, ...) to the documented 1.
  flock -w "$timeout" "$SESSION_LOCK_FD" || return 1
}

# Poll a tmux pane until the Claude ❯ prompt character shows up in it.
# The pane is captured every 2 seconds; the check runs before each sleep.
# Args: session_name [timeout_secs] (default 120)
# Returns 0 as soon as the prompt is seen, 1 once timeout_secs have elapsed.
agent_wait_for_claude_ready() {
  local session="$1"
  local timeout="${2:-120}"
  local poll_secs=2
  local waited=0
  while [ "$waited" -lt "$timeout" ]; do
    tmux capture-pane -t "$session" -p 2>/dev/null | grep -q '❯' && return 0
    sleep "$poll_secs"
    waited=$((waited + poll_secs))
  done
  return 1
}

# Send TEXT to the Claude prompt running in tmux SESSION and submit it.
# Steps, all best-effort (failures of the lock or readiness wait are ignored):
#   1. re-acquire the session lock (default timeout), since Claude resumes work
#   2. wait up to 120s for the Claude prompt
#   3. remove the session's idle marker, because new work is incoming
#   4. stage TEXT in a temp file, load it into a per-process tmux buffer,
#      paste it into the session and press Enter
#   5. delete the tmux buffer and the temp file
# Args: session_name text
agent_inject_into_session() {
  local session="$1"
  local text="$2"
  local buffer_name="agent-inject-$$"
  local payload_file

  # shellcheck disable=SC2119  # using default timeout
  session_lock_acquire || true
  agent_wait_for_claude_ready "$session" 120 || true
  rm -f "/tmp/claude-idle-${session}.ts"

  # Going through a file + tmux buffer pastes the text verbatim instead of
  # typing it key by key.
  payload_file=$(mktemp /tmp/agent-inject-XXXXXX)
  printf '%s' "$text" > "$payload_file"
  tmux load-buffer -b "$buffer_name" "$payload_file"
  tmux paste-buffer -t "$session" -b "$buffer_name"
  # Short pause between the paste and the Enter key press.
  sleep 0.5
  tmux send-keys -t "$session" "" Enter

  tmux delete-buffer -b "$buffer_name" 2>/dev/null || true
  rm -f "$payload_file"
}

# Register a "command" hook for one Claude Code hook event in a settings file.
# Creates the file when it is missing or empty; otherwise appends to the
# event's hook list unless an entry with the same command is already present.
# The result is written to "<settings_file>.tmp" and moved into place only when
# jq succeeded and produced output, so a failed run leaves the existing
# settings untouched and no stray temp file behind.
# Args: settings_file event matcher command
# Returns 0 on success, 1 (with a message on stderr) if nothing was written.
_agent_install_hook() {
  local settings="$1"
  local event="$2"
  local matcher="$3"
  local cmd="$4"
  local tmp="${settings}.tmp"
  local rc=0

  if [ -s "$settings" ]; then
    # Merge into existing project settings, keeping every other key intact.
    jq --arg event "$event" --arg matcher "$matcher" --arg cmd "$cmd" '
      if (.hooks[$event] // [] | any(.[]; .hooks[]?.command == $cmd))
      then .
      else .hooks[$event] = (.hooks[$event] // []) + [{
        matcher: $matcher,
        hooks: [{type: "command", command: $cmd}]
      }]
      end
    ' "$settings" > "$tmp" || rc=$?
  else
    jq -n --arg event "$event" --arg matcher "$matcher" --arg cmd "$cmd" '{
      hooks: {
        ($event): [{
          matcher: $matcher,
          hooks: [{type: "command", command: $cmd}]
        }]
      }
    }' > "$tmp" || rc=$?
  fi

  if [ "$rc" -eq 0 ] && [ -s "$tmp" ] && mv "$tmp" "$settings"; then
    return 0
  fi
  rm -f "$tmp"
  printf 'agent-session: could not install %s hook in %s\n' "$event" "$settings" >&2
  return 1
}

# Create a tmux session running Claude in the given workdir.
# Installs a Stop hook for idle detection (see monitor_phase_loop).
# Installs a PreToolUse hook to guard destructive Bash operations.
# Installs a SessionEnd hook that writes a termination marker.
# With a phase file, also installs a PostToolUse hook for phase file write
# detection, a StopFailure hook for immediate phase file update on API error,
# and a SessionStart hook for context re-injection after compaction.
# Each hook is installed only if its script under ${FACTORY_ROOT}/lib/hooks is
# executable. Hook installation is best-effort: a failure is reported on stderr
# and the session is still started.
# NOTE(review): hook commands are built by joining paths with single spaces, so
# paths containing whitespace are presumably split by whatever runs the hook.
# Globals: FACTORY_ROOT, PRIMARY_BRANCH (default "main"), CLAUDE_MODEL, HOME
#          (read); SESSION_LOCK_FD (read; written when the lock fd is opened).
# Args: session workdir [phase_file]
# Returns 0 if session is ready, 1 otherwise (lock not obtained within 300s,
# lock file could not be opened, tmux session missing after start, or Claude
# prompt not seen within 120s).
create_agent_session() {
  local session="$1"
  local workdir="${2:-.}"
  local phase_file="${3:-}"
  local hooks_dir="${FACTORY_ROOT}/lib/hooks"

  # Prepare settings directory for hooks
  mkdir -p "${workdir}/.claude"
  local settings="${workdir}/.claude/settings.json"

  # Marker files shared between the hooks and monitor_phase_loop.
  local idle_marker="/tmp/claude-idle-${session}.ts"
  local phase_marker="/tmp/phase-changed-${session}.marker"
  local exit_marker="/tmp/claude-exited-${session}.ts"

  local hook hook_cmd

  # Stop hook for idle detection: when Claude finishes a response, the hook
  # writes a timestamp to a marker file. monitor_phase_loop checks this marker
  # instead of fragile tmux pane scraping.
  hook="${hooks_dir}/on-idle-stop.sh"
  if [ -x "$hook" ]; then
    hook_cmd="${hook} ${idle_marker}"
    # When a phase file is available, pass it and the session name so the
    # hook can nudge Claude if it returns to the prompt without signalling.
    if [ -n "$phase_file" ]; then
      hook_cmd="${hook} ${idle_marker} ${phase_file} ${session}"
    fi
    _agent_install_hook "$settings" "Stop" "" "$hook_cmd" || true
  fi

  if [ -n "$phase_file" ]; then
    # PostToolUse hook for phase file write detection: when Claude writes to
    # the phase file via Bash or Write, the hook writes a marker so
    # monitor_phase_loop can react immediately instead of waiting for the
    # next mtime-based poll cycle.
    hook="${hooks_dir}/on-phase-change.sh"
    if [ -x "$hook" ]; then
      hook_cmd="${hook} ${phase_file} ${phase_marker}"
      _agent_install_hook "$settings" "PostToolUse" "Bash|Write" "$hook_cmd" || true
      rm -f "$phase_marker"
    fi

    # StopFailure hook for immediate phase file update on API error: when
    # Claude hits a rate limit, server error, billing error, or auth failure,
    # the hook writes PHASE:failed to the phase file and touches the
    # phase-changed marker so monitor_phase_loop picks it up within one poll
    # cycle instead of waiting for idle timeout (up to 2 hours).
    hook="${hooks_dir}/on-stop-failure.sh"
    if [ -x "$hook" ]; then
      hook_cmd="${hook} ${phase_file} ${phase_marker}"
      _agent_install_hook "$settings" "StopFailure" \
        "rate_limit|server_error|authentication_failed|billing_error" \
        "$hook_cmd" || true
    fi
  fi

  # PreToolUse hook for destructive operation guard: blocks force push to
  # primary branch, rm -rf outside worktree, direct API merge calls, and
  # checkout/switch to primary branch.  Claude sees the denial reason on
  # exit 2 and can self-correct.
  hook="${hooks_dir}/on-pretooluse-guard.sh"
  if [ -x "$hook" ]; then
    local abs_workdir
    # Fall back to the path as given if the directory cannot be entered.
    abs_workdir=$(cd "$workdir" 2>/dev/null && pwd) || abs_workdir="$workdir"
    hook_cmd="${hook} ${PRIMARY_BRANCH:-main} ${abs_workdir} ${session}"
    _agent_install_hook "$settings" "PreToolUse" "Bash" "$hook_cmd" || true
  fi

  # SessionEnd hook for guaranteed cleanup: when the Claude session exits
  # (clean or crash), write a termination marker so monitor_phase_loop detects
  # the exit faster than tmux has-session polling alone.
  hook="${hooks_dir}/on-session-end.sh"
  if [ -x "$hook" ]; then
    hook_cmd="${hook} ${exit_marker}"
    _agent_install_hook "$settings" "SessionEnd" "" "$hook_cmd" || true
  fi
  # A marker left over from an earlier session must not look like an exit.
  rm -f "$exit_marker"

  # SessionStart hook for context re-injection after compaction: when Claude
  # Code compacts context during long sessions, the phase protocol
  # instructions are lost. This hook fires after each compaction and outputs
  # the content of a context file so Claude retains critical instructions.
  # The context file is written by callers via write_compact_context().
  if [ -n "$phase_file" ]; then
    hook="${hooks_dir}/on-compact-reinject.sh"
    if [ -x "$hook" ]; then
      local context_file="${phase_file%.phase}.context"
      hook_cmd="${hook} ${context_file}"
      _agent_install_hook "$settings" "SessionStart" "compact" "$hook_cmd" || true
    fi
  fi

  rm -f "$idle_marker"
  local model_flag=""
  if [ -n "${CLAUDE_MODEL:-}" ]; then
    model_flag="--model ${CLAUDE_MODEL}"
  fi

  # Acquire a session-level mutex via fd-based flock to prevent concurrent
  # Claude sessions from racing on OAuth token refresh.  Unlike the previous
  # command-wrapper flock, the fd approach allows callers to release the lock
  # during idle phases (awaiting_review/awaiting_ci) and re-acquire before
  # injecting the next prompt.  See #724.
  # Use ~/.claude/session.lock so the lock is shared across containers when
  # the host ~/.claude directory is bind-mounted.
  local lock_dir="${HOME}/.claude"
  mkdir -p "$lock_dir"
  local claude_lock="${lock_dir}/session.lock"
  if [ -z "${SESSION_LOCK_FD:-}" ]; then
    local lock_fd=""
    # Open into a local first so a failed open never publishes a bogus fd.
    exec {lock_fd}>>"${claude_lock}" || return 1
    SESSION_LOCK_FD="$lock_fd"
  fi
  if ! flock -w 300 "$SESSION_LOCK_FD"; then
    return 1
  fi
  local claude_cmd="claude --dangerously-skip-permissions ${model_flag}"

  tmux new-session -d -s "$session" -c "$workdir" \
    "$claude_cmd" 2>/dev/null
  sleep 1
  if ! tmux has-session -t "$session" 2>/dev/null; then
    # No session means no Claude process to protect: hand the lock back so
    # other sessions are not blocked by a start that never happened.
    flock -u "$SESSION_LOCK_FD" || true
    return 1
  fi
  # On a readiness timeout the tmux session still exists, so the lock stays
  # held for it; the caller decides whether to kill the session.
  agent_wait_for_claude_ready "$session" 120 || return 1
  return 0
}

# Inject a prompt/formula into a session (alias for agent_inject_into_session).
# All arguments are forwarded unchanged, so the call shape is the same:
# Args: session_name text
# Returns the status of agent_inject_into_session.
inject_formula() {
  agent_inject_into_session "$@"
}

# Monitor a phase file, calling a callback on changes and handling idle timeout.
# Sets _MONITOR_LOOP_EXIT to the exit reason (idle_timeout, idle_prompt, done, crashed, PHASE:failed, PHASE:escalate).
#   "done" is used for both PHASE:done and PHASE:merged.
# Sets _MONITOR_SESSION to the resolved session name (arg 4 or $SESSION_NAME).
#   Callbacks should reference _MONITOR_SESSION instead of $SESSION_NAME directly.
# Sets LAST_PHASE_MTIME to the mtime of the phase file each time a change is processed.
# Args: phase_file idle_timeout_secs callback_fn [session_name]
#   session_name — tmux session to health-check; falls back to $SESSION_NAME global
# Environment: PHASE_POLL_INTERVAL — seconds between polls (default 10).
# Returns 1 when the session is gone with a non-terminal phase and the callback
# did not bring it back (exit reason "crashed"); returns 0 on every other exit.
#
# The callback is invoked with the phase string as its only argument, and only
# if a command of that name exists. It is NOT invoked on the idle_timeout exit.
# Only the first line of the phase file is read, with all whitespace removed.
#
# Idle detection: uses a Stop hook marker file (written by lib/hooks/on-idle-stop.sh)
# to detect when Claude finishes responding without writing a phase signal.
# If the marker exists for 3 consecutive polls with no phase written, the session
# is killed and the callback invoked with "PHASE:failed".
monitor_phase_loop() {
  local phase_file="$1"
  local idle_timeout="$2"
  local callback="$3"
  local _session="${4:-${SESSION_NAME:-}}"
  # Export resolved session name so callbacks can reference it regardless of
  # which session was passed to monitor_phase_loop (analogous to _MONITOR_LOOP_EXIT).
  export _MONITOR_SESSION="$_session"
  local poll_interval="${PHASE_POLL_INTERVAL:-10}"
  # mtime of the last phase write that was handed to the callback.
  local last_mtime=0
  # Seconds since the last processed phase change (or crash recovery).
  local idle_elapsed=0
  # Consecutive polls that saw the idle marker while no phase was written.
  local idle_pane_count=0

  while true; do
    sleep "$poll_interval"
    idle_elapsed=$(( idle_elapsed + poll_interval ))

    # Session health check: SessionEnd hook marker provides fast detection,
    # tmux has-session is the fallback for unclean exits (e.g. tmux crash).
    local exit_marker="/tmp/claude-exited-${_session}.ts"
    if [ -f "$exit_marker" ] || ! tmux has-session -t "${_session}" 2>/dev/null; then
      local current_phase
      current_phase=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]' || true)
      case "$current_phase" in
        PHASE:done|PHASE:failed|PHASE:merged|PHASE:escalate)
          ;; # terminal — fall through to phase handler
        *)
          # Call callback with "crashed" — let agent-specific code handle recovery
          if type "${callback}" &>/dev/null; then
            "$callback" "PHASE:crashed"
          fi
          # If callback didn't restart session, break
          if ! tmux has-session -t "${_session}" 2>/dev/null; then
            _MONITOR_LOOP_EXIT="crashed"
            return 1
          fi
          # Session is back: start the idle accounting from scratch.
          idle_elapsed=0
          idle_pane_count=0
          continue
          ;;
      esac
    fi

    # Check phase-changed marker from PostToolUse hook — if present, the hook
    # detected a phase file write so we reset last_mtime to force processing
    # this cycle instead of waiting for the next mtime change.
    local phase_marker="/tmp/phase-changed-${_session}.marker"
    if [ -f "$phase_marker" ]; then
      rm -f "$phase_marker"
      last_mtime=0
    fi

    # Check phase file for changes
    local phase_mtime
    # Falls back to 0 when stat fails (e.g. the phase file does not exist yet).
    phase_mtime=$(stat -c %Y "$phase_file" 2>/dev/null || echo 0)
    local current_phase
    current_phase=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]' || true)

    if [ -z "$current_phase" ] || [ "$phase_mtime" -le "$last_mtime" ]; then
      # No phase change — check idle timeout
      if [ "$idle_elapsed" -ge "$idle_timeout" ]; then
        _MONITOR_LOOP_EXIT="idle_timeout"
        # The callback is not invoked on this path.
        agent_kill_session "${_session}"
        return 0
      fi
      # Idle detection via Stop hook: the on-idle-stop.sh hook writes a marker
      # file when Claude finishes a response. If the marker exists and no phase
      # has been written, Claude returned to the prompt without following the
      # phase protocol. 3 consecutive polls = confirmed idle (not mid-turn).
      local idle_marker="/tmp/claude-idle-${_session}.ts"
      if [ -z "$current_phase" ] && [ -f "$idle_marker" ]; then
        idle_pane_count=$(( idle_pane_count + 1 ))
        if [ "$idle_pane_count" -ge 3 ]; then
          _MONITOR_LOOP_EXIT="idle_prompt"
          # Session is killed before the callback is invoked.
          # Callbacks that handle PHASE:failed must not assume the session is alive.
          agent_kill_session "${_session}"
          if type "${callback}" &>/dev/null; then
            "$callback" "PHASE:failed"
          fi
          return 0
        fi
      else
        # Marker gone or a phase is present: the streak is broken.
        idle_pane_count=0
      fi
      continue
    fi

    # Phase changed
    last_mtime="$phase_mtime"
    # shellcheck disable=SC2034  # read by phase-handler.sh callback
    LAST_PHASE_MTIME="$phase_mtime"
    idle_elapsed=0
    idle_pane_count=0

    # Terminal phases: notify the callback once, then stop monitoring.
    case "$current_phase" in
      PHASE:done|PHASE:merged)
        _MONITOR_LOOP_EXIT="done"
        if type "${callback}" &>/dev/null; then
          "$callback" "$current_phase"
        fi
        return 0
        ;;
      PHASE:failed|PHASE:escalate)
        _MONITOR_LOOP_EXIT="$current_phase"
        if type "${callback}" &>/dev/null; then
          "$callback" "$current_phase"
        fi
        return 0
        ;;
    esac

    # Non-terminal phase — call callback
    if type "${callback}" &>/dev/null; then
      "$callback" "$current_phase"
    fi
  done
}

# Store CONTENT (plus a trailing newline) in the context file that belongs to
# a phase file, for re-injection after context compaction. The SessionStart
# compact hook reads this file and outputs it to stdout.
# The context path is the phase file path with a trailing ".phase" replaced by
# ".context" (".context" is simply appended when there is no such suffix).
# Args: phase_file content
write_compact_context() {
  local phase_file="$1"
  local content="$2"
  printf '%s\n' "$content" > "${phase_file%.phase}.context"
}

# Tear down a tmux session and remove its marker files under /tmp (idle,
# phase-changed, exited, nudge counter).
# A missing session is not an error; with an empty name tmux is not called at
# all, but the marker paths built from the empty name are still removed.
# Args: session_name
agent_kill_session() {
  local session="${1:-}"
  local marker
  if [ -n "$session" ]; then
    tmux kill-session -t "$session" 2>/dev/null || true
  fi
  for marker in \
    "/tmp/claude-idle-${session}.ts" \
    "/tmp/phase-changed-${session}.marker" \
    "/tmp/claude-exited-${session}.ts" \
    "/tmp/claude-nudge-${session}.count"; do
    rm -f "$marker"
  done
}

# Print the first line of a phase file with every whitespace character removed
# (no trailing newline). Prints nothing, and still succeeds, when the file is
# missing or unreadable.
# Usage: read_phase [file]  — defaults to $PHASE_FILE
read_phase() {
  local file="${1:-${PHASE_FILE:-}}"
  # "|| true" keeps a missing file from failing the pipeline under pipefail.
  { head -n 1 "$file" 2>/dev/null || true; } | tr -d '[:space:]'
}