diff --git a/AGENTS.md b/AGENTS.md index e9f12e1..bc2182b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -174,7 +174,7 @@ Humans write these. Agents read and enforce them. | ID | Decision | Rationale | |---|---|---| | AD-001 | Nervous system runs from a polling loop (`docker/agents/entrypoint.sh`), not PR-based actions. | Planner, predictor, gardener, supervisor run directly via `*-run.sh`. They create work, they don't become work. (See PR #474 revert.) | -| AD-002 | **Concurrency is bounded per LLM backend, not per project.** One concurrent Claude session per OAuth credential pool; one concurrent session per llama-server instance. Containers with disjoint backends may run in parallel. | The single-thread invariant is about *backends*, not pipelines. **(a) Anthropic OAuth credentials race on token refresh** — two sessions sharing one mounted `~/.claude` will trip over each other during rotation and 401. All agents inside an OAuth-mounted container serialize on `flock session.lock`. **(b) llama-server has finite VRAM and one KV cache** — parallel inference thrashes the cache and risks OOM. All llama-backed agents serialize on the same lock. **(c) Disjoint backends are free to parallelize.** Today `disinto-agents` (Anthropic OAuth, runs `review,gardener`) runs concurrently with `disinto-agents-llama` (llama, runs `dev`) on the same project — they share neither OAuth state nor llama VRAM. **(d) Per-project work-conflict safety** (no duplicate dev work, no merge conflicts on the same branch) is enforced by `issue_claim` (assignee + `in-progress` label) and per-issue worktrees — that's a separate guard that does NOT depend on this AD. | +| AD-002 | **Concurrency is bounded per LLM backend, not per project.** One concurrent Claude session per OAuth credential pool; one concurrent session per llama-server instance. Containers with disjoint backends may run in parallel. | The single-thread invariant is about *backends*, not pipelines. **(a) Anthropic OAuth credentials race on token refresh** — each container uses a per-session `CLAUDE_CONFIG_DIR`, so Claude Code's native lockfile-based OAuth refresh handles contention automatically without external serialization. (Legacy: set `CLAUDE_EXTERNAL_LOCK=1` to re-enable the old `flock session.lock` wrapper for rollback.) **(b) llama-server has finite VRAM and one KV cache** — parallel inference thrashes the cache and risks OOM. All llama-backed agents serialize on the same lock. **(c) Disjoint backends are free to parallelize.** Today `disinto-agents` (Anthropic OAuth, runs `review,gardener`) runs concurrently with `disinto-agents-llama` (llama, runs `dev`) on the same project — they share neither OAuth state nor llama VRAM. **(d) Per-project work-conflict safety** (no duplicate dev work, no merge conflicts on the same branch) is enforced by `issue_claim` (assignee + `in-progress` label) and per-issue worktrees — that's a separate guard that does NOT depend on this AD. | | AD-003 | The runtime creates and destroys, the formula preserves. | Runtime manages worktrees/sessions/temp. Formulas commit knowledge to git before signaling done. | | AD-004 | Event-driven > polling > fixed delays. | Never `waitForTimeout` or hardcoded sleep. Use phase files, webhooks, or poll loops with backoff. | | AD-005 | Secrets via env var indirection, never in issue bodies. | Issue bodies become code. Agent secrets go in `.env.enc`, vault secrets in `.env.vault.enc` (SOPS-encrypted when available; plaintext `.env`/`.env.vault` fallback supported). Referenced as `$VAR_NAME`. Runner gets only vault secrets; agents get only agent secrets. | @@ -184,7 +184,7 @@ Humans write these. Agents read and enforce them. - **Gardener** checks open backlog issues against ADs during grooming; closes violations with a comment referencing the AD number. - **Planner** plans within the architecture; does not create issues that violate ADs. - **Dev-agent** reads AGENTS.md before implementing; refuses work that violates ADs. -- **AD-002 is a runtime invariant; nothing for the gardener to check at issue-groom time.** Concurrency is enforced by `flock session.lock` within each container and by `issue_claim` for per-issue work. A violation manifests as a 401 or VRAM OOM in agent logs, not as a malformed issue. +- **AD-002 is a runtime invariant; nothing for the gardener to check at issue-groom time.** OAuth concurrency is handled by per-session `CLAUDE_CONFIG_DIR` isolation (with `CLAUDE_EXTERNAL_LOCK` as a rollback flag). Per-issue work is enforced by `issue_claim`. A violation manifests as a 401 or VRAM OOM in agent logs, not as a malformed issue. --- diff --git a/docker/agents/entrypoint.sh b/docker/agents/entrypoint.sh index 171ceca..66b0b1b 100644 --- a/docker/agents/entrypoint.sh +++ b/docker/agents/entrypoint.sh @@ -354,7 +354,8 @@ while true; do # Fast agents (review-poll, dev-poll) run in background so they don't block # each other. Slow agents (gardener, architect, planner, predictor) also run # in background but are guarded by pgrep so only one instance runs at a time. - # The flock on session.lock already serializes claude -p calls. + # Per-session CLAUDE_CONFIG_DIR isolation handles OAuth concurrency natively. + # Set CLAUDE_EXTERNAL_LOCK=1 to re-enable the legacy flock serialization. for toml in "${DISINTO_DIR}"/projects/*.toml; do [ -f "$toml" ] || continue diff --git a/lib/agent-sdk.sh b/lib/agent-sdk.sh index ca38dca..8355f93 100644 --- a/lib/agent-sdk.sh +++ b/lib/agent-sdk.sh @@ -137,11 +137,16 @@ agent_run() { local run_dir="${worktree_dir:-$(pwd)}" local lock_file="${HOME}/.claude/session.lock" - mkdir -p "$(dirname "$lock_file")" local output rc log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})" - # Acquire lock separately (flock cannot exec bash functions) - output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude "${args[@]}" ) 9>"$lock_file" 2>>"$LOGFILE") && rc=0 || rc=$? + # External flock is redundant once CLAUDE_CONFIG_DIR rollout is verified (#647). + # Gate behind CLAUDE_EXTERNAL_LOCK for rollback safety; default off. + if [ -n "${CLAUDE_EXTERNAL_LOCK:-}" ]; then + mkdir -p "$(dirname "$lock_file")" + output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude "${args[@]}" ) 9>"$lock_file" 2>>"$LOGFILE") && rc=0 || rc=$? + else + output=$(cd "$run_dir" && claude_run_with_watchdog claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$? + fi if [ "$rc" -eq 124 ]; then log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)" elif [ "$rc" -ne 0 ]; then @@ -182,7 +187,11 @@ agent_run() { local nudge="You stopped but did not push any code. You have uncommitted changes. Commit them and push." log "agent_run: nudging (uncommitted changes)" local nudge_rc - output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} ) 9>"$lock_file" 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$? + if [ -n "${CLAUDE_EXTERNAL_LOCK:-}" ]; then + output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} ) 9>"$lock_file" 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$? + else + output=$(cd "$run_dir" && claude_run_with_watchdog claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$? + fi if [ "$nudge_rc" -eq 124 ]; then log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $nudge_rc)" elif [ "$nudge_rc" -ne 0 ]; then