Merge pull request 'chore: gardener housekeeping' (#740 ) from chore/gardener-20260412-0628 into main

chore: gardener housekeeping 2026-04-12
Merge pull request 'fix: tech-debt: close_vision_issue state=closed PATCH swallows errors — stuck-open vision issues after idempotency guard (#737 )' (#739 ) from fix/issue-737 into main
2026-04-13 10:27:47 +00:00 · 2026-04-12 06:28:02 +00:00 · 2026-04-12 06:12:47 +00:00 · 2026-04-12 06:06:25 +00:00 · 2026-04-12 05:37:08 +00:00 · 2026-04-12 05:19:57 +00:00
129 changed files with 16181 additions and 6275 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,20 @@
+# Secrets — prevent .env files from being baked into the image
+.env
+.env.enc
+.env.vault
+.env.vault.enc
+
+# Version control — .git is huge and not needed in image
+.git
+
+# Archives — not needed at runtime
+*.tar.gz
+
+# Prometheus data — large, ephemeral data
+prometheus-data/
+
+# Compose files — only needed at runtime via volume mount
+docker-compose.yml
+
+# Project TOML files — gitignored anyway, won't be in build context
+projects/*.toml
--- a/.env.example
+++ b/.env.example
@ -19,14 +19,32 @@ FORGE_URL=http://localhost:3000             # [CONFIG] local Forgejo instance
 # ── Auth tokens ───────────────────────────────────────────────────────────
 # Each agent has its own Forgejo account and API token (#747).
 # Per-agent tokens fall back to FORGE_TOKEN if not set.
+#
+# Tokens and passwords are auto-generated by `disinto init` and stored in .env.
+# Each bot user gets:
+#   - FORGE_TOKEN_<BOT> = API token for REST calls (user identity via /api/v1/user)
+#   - FORGE_PASS_<BOT>  = password for git HTTP push (#361, Forgejo 11.x limitation)
+#
+# Local-model agents (agents-llama) use FORGE_TOKEN_LLAMA / FORGE_PASS_LLAMA
+# with FORGE_BOT_USER_LLAMA=dev-qwen to ensure correct attribution (#563).
 FORGE_TOKEN=                               # [SECRET] dev-bot API token (default for all agents)
+FORGE_PASS=                                # [SECRET] dev-bot password for git HTTP push (#361)
+FORGE_TOKEN_LLAMA=                         # [SECRET] dev-qwen API token (for agents-llama)
+FORGE_PASS_LLAMA=                          # [SECRET] dev-qwen password for git HTTP push
 FORGE_REVIEW_TOKEN=                        # [SECRET] review-bot API token
+FORGE_REVIEW_PASS=                         # [SECRET] review-bot password for git HTTP push
 FORGE_PLANNER_TOKEN=                       # [SECRET] planner-bot API token
+FORGE_PLANNER_PASS=                        # [SECRET] planner-bot password for git HTTP push
 FORGE_GARDENER_TOKEN=                      # [SECRET] gardener-bot API token
+FORGE_GARDENER_PASS=                       # [SECRET] gardener-bot password for git HTTP push
 FORGE_VAULT_TOKEN=                         # [SECRET] vault-bot API token
+FORGE_VAULT_PASS=                          # [SECRET] vault-bot password for git HTTP push
 FORGE_SUPERVISOR_TOKEN=                    # [SECRET] supervisor-bot API token
+FORGE_SUPERVISOR_PASS=                     # [SECRET] supervisor-bot password for git HTTP push
 FORGE_PREDICTOR_TOKEN=                     # [SECRET] predictor-bot API token
+FORGE_PREDICTOR_PASS=                      # [SECRET] predictor-bot password for git HTTP push
 FORGE_ARCHITECT_TOKEN=                     # [SECRET] architect-bot API token
+FORGE_ARCHITECT_PASS=                      # [SECRET] architect-bot password for git HTTP push
 FORGE_BOT_USERNAMES=dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,supervisor-bot,predictor-bot,architect-bot

 # ── Backwards compatibility ───────────────────────────────────────────────
@ -34,6 +52,10 @@ FORGE_BOT_USERNAMES=dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,superv
 # CODEBERG_TOKEN automatically (same for REVIEW_BOT_TOKEN, CODEBERG_REPO,
 # CODEBERG_BOT_USERNAMES). No action needed for existing deployments.
 # Per-agent tokens default to FORGE_TOKEN when unset (single-token setups).
+#
+# Note: `disinto init` auto-generates all bot tokens/passwords when you
+# configure [agents.llama] in a project TOML. The credentials are stored
+# in .env.enc (encrypted) or .env (plaintext fallback).

 # ── Woodpecker CI ─────────────────────────────────────────────────────────
 WOODPECKER_TOKEN=                          # [SECRET] Woodpecker API token
@ -47,6 +69,12 @@ WOODPECKER_DB_USER=woodpecker              # [CONFIG] Postgres user
 WOODPECKER_DB_HOST=127.0.0.1              # [CONFIG] Postgres host
 WOODPECKER_DB_NAME=woodpecker              # [CONFIG] Postgres database name

+# ── Chat OAuth (#708) ────────────────────────────────────────────────────
+CHAT_OAUTH_CLIENT_ID=                     # [SECRET] Chat OAuth2 client ID (auto-generated by init)
+CHAT_OAUTH_CLIENT_SECRET=                 # [SECRET] Chat OAuth2 client secret (auto-generated by init)
+DISINTO_CHAT_ALLOWED_USERS=               # [CONFIG] CSV of allowed usernames (disinto-admin always allowed)
+FORWARD_AUTH_SECRET=                      # [SECRET] Shared secret for Caddy ↔ chat forward_auth (#709)
+
 # ── Vault-only secrets (DO NOT put these in .env) ────────────────────────
 # These tokens grant access to external systems (GitHub, ClawHub, deploy targets).
 # They live ONLY in .env.vault.enc and are injected into the ephemeral runner
@ -67,6 +95,15 @@ BASE_RPC_URL=                              # [SECRET] on-chain RPC endpoint
 # ── Tuning ────────────────────────────────────────────────────────────────
 CLAUDE_TIMEOUT=7200                        # [CONFIG] max seconds per Claude invocation

+# ── Claude Code shared OAuth state ─────────────────────────────────────────
+# Shared directory used by every factory container so Claude Code's internal
+# proper-lockfile-based OAuth refresh lock works across containers. Both
+# values must live outside $HOME (so docker bind mounts don't depend on UID
+# mapping) and must be the same absolute path on host and inside each
+# container. See docs/CLAUDE-AUTH-CONCURRENCY.md.
+CLAUDE_SHARED_DIR=/var/lib/disinto/claude-shared
+CLAUDE_CONFIG_DIR=${CLAUDE_SHARED_DIR}/config
+
 # ── Factory safety ────────────────────────────────────────────────────────
 # Disables Claude Code auto-updater, telemetry, error reporting, and bug
 # command. Factory sessions are production processes — they must never phone
--- a/.codeberg/ISSUE_TEMPLATE/bug.yaml
+++ b/.codeberg/ISSUE_TEMPLATE/bug.yaml
@ -1,7 +1,7 @@
 name: Bug Report
 about: Something is broken or behaving incorrectly
 labels:
-  - bug
+  - bug-report
 body:
  - type: textarea
    id: what
--- a/.codeberg/ISSUE_TEMPLATE/feature.yaml
+++ b/.codeberg/ISSUE_TEMPLATE/feature.yaml
--- a/.codeberg/ISSUE_TEMPLATE/refactor.yaml
+++ b/.codeberg/ISSUE_TEMPLATE/refactor.yaml
--- a/.gitignore
+++ b/.gitignore
@ -28,3 +28,12 @@ secrets/

 # Pre-built binaries for Docker builds (avoid network calls during build)
 docker/agents/bin/
+
+# Generated docker-compose.yml (run 'bin/disinto init' to regenerate)
+# Note: This file is now committed to track volume mount configuration
+# docker-compose.yml
+
+# Python bytecode
+__pycache__/
+*.pyc
+*.pyo
--- a/.woodpecker/agent-smoke.sh
+++ b/.woodpecker/agent-smoke.sh
@ -6,13 +6,16 @@
 #   2. Every custom function called by agent scripts is defined in lib/ or the script itself
 #
 # Fast (<10s): no network, no tmux, no Claude needed.
-# Would have caught: kill_tmux_session (renamed), create_agent_session (missing),
-#                    read_phase (missing from dev-agent.sh scope)

 set -euo pipefail

 cd "$(dirname "$0")/.."

+# CI-side filesystem snapshot: show lib/ state at smoke time (#600)
+echo "=== smoke environment snapshot ==="
+ls -la lib/ 2>&1 | head -50
+echo "=== "
+
 FAILED=0

 # ── helpers ─────────────────────────────────────────────────────────────────
@ -21,12 +24,16 @@ FAILED=0
 # Uses awk instead of grep -Eo for busybox/Alpine compatibility (#296).
 get_fns() {
  local f="$1"
-  # BRE mode (no -E).  Use [(][)] for literal parens — unambiguous across
-  # GNU grep and BusyBox grep (some BusyBox builds treat bare () as grouping
-  # even in BRE).  BRE one-or-more via [X][X]* instead of +.
-  grep '^[[:space:]]*[a-zA-Z_][a-zA-Z0-9_][a-zA-Z0-9_]*[[:space:]]*[(][)]' "$f" 2>/dev/null \
-    | sed 's/^[[:space:]]*//; s/[[:space:]]*[(][)].*$//' \
-    | sort -u || true
+  # Pure-awk implementation: avoids grep/sed cross-platform differences
+  # (BusyBox grep BRE quirks, sed ; separator issues on Alpine).
+  awk '
+    /^[[:space:]]*[a-zA-Z_][a-zA-Z0-9_][a-zA-Z0-9_]*[[:space:]]*[(][)]/ {
+      line = $0
+      gsub(/^[[:space:]]+/, "", line)
+      sub(/[[:space:]]*[(].*/, "", line)
+      print line
+    }
+  ' "$f" 2>/dev/null | sort -u || true
 }

 # Extract call-position identifiers that look like custom function calls:
@ -91,19 +98,37 @@ echo "syntax check done"

 echo "=== 2/2  Function resolution ==="

+# Required lib files for LIB_FUNS construction. Missing any of these means the
+# checkout is incomplete or the test is misconfigured — fail loudly, do NOT
+# silently produce a partial LIB_FUNS list (that masquerades as "undef" errors
+# in unrelated scripts; see #600).
+REQUIRED_LIBS=(
+  lib/agent-sdk.sh lib/env.sh lib/ci-helpers.sh lib/load-project.sh
+  lib/secret-scan.sh lib/formula-session.sh lib/mirrors.sh lib/guard.sh
+  lib/pr-lifecycle.sh lib/issue-lifecycle.sh lib/worktree.sh
+)
+
+for f in "${REQUIRED_LIBS[@]}"; do
+  if [ ! -f "$f" ]; then
+    printf 'FAIL [missing-lib] expected %s but it is not present at smoke time\n' "$f" >&2
+    printf '  pwd=%s\n' "$(pwd)" >&2
+    printf '  ls lib/=%s\n' "$(ls lib/ 2>&1 | tr '\n' ' ')" >&2
+    echo '=== SMOKE TEST FAILED (precondition) ===' >&2
+    exit 2
+  fi
+done
+
 # Functions provided by shared lib files (available to all agent scripts via source).
 #
 # Included — these are inline-sourced by agent scripts:
 #   lib/env.sh              — sourced by every agent (log, forge_api, etc.)
-#   lib/agent-session.sh    — sourced by orchestrators (create_agent_session, monitor_phase_loop, etc.)
 #   lib/agent-sdk.sh        — sourced by SDK agents (agent_run, agent_recover_session)
 #   lib/ci-helpers.sh       — sourced by pollers and review (ci_passed, classify_pipeline_failure, etc.)
 #   lib/load-project.sh     — sourced by env.sh when PROJECT_TOML is set
-#   lib/file-action-issue.sh — sourced by gardener-run.sh (file_action_issue)
-#   lib/secret-scan.sh      — sourced by file-action-issue.sh, phase-handler.sh (scan_for_secrets, redact_secrets)
-#   lib/formula-session.sh  — sourced by formula-driven agents (acquire_cron_lock, run_formula_and_monitor, etc.)
+#   lib/secret-scan.sh      — standalone CLI tool, run directly (not sourced)
+#   lib/formula-session.sh  — sourced by formula-driven agents (acquire_run_lock, check_memory, etc.)
 #   lib/mirrors.sh          — sourced by merge sites (mirror_push)
-#   lib/guard.sh            — sourced by all cron entry points (check_active)
+#   lib/guard.sh            — sourced by all polling-loop entry points (check_active)
 #   lib/issue-lifecycle.sh  — sourced by agents for issue claim/release/block/deps
 #   lib/worktree.sh         — sourced by agents for worktree create/recover/cleanup/preserve
 #
@ -116,9 +141,7 @@ echo "=== 2/2  Function resolution ==="
 # If a new lib file is added and sourced by agents, add it to LIB_FUNS below
 # and add a check_script call for it in the lib files section further down.
 LIB_FUNS=$(
-  for f in lib/agent-session.sh lib/agent-sdk.sh lib/env.sh lib/ci-helpers.sh lib/load-project.sh lib/secret-scan.sh lib/file-action-issue.sh lib/formula-session.sh lib/mirrors.sh lib/guard.sh lib/pr-lifecycle.sh lib/issue-lifecycle.sh lib/worktree.sh; do
-    if [ -f "$f" ]; then get_fns "$f"; fi
-  done | sort -u
+  for f in "${REQUIRED_LIBS[@]}"; do get_fns "$f"; done | sort -u
 )

 # Known external commands and shell builtins — never flag these
@ -171,6 +194,12 @@ check_script() {
    is_known_cmd "$fn" && continue
    if ! printf '%s\n' "$all_fns" | grep -qxF "$fn"; then
      printf 'FAIL [undef] %s: %s\n' "$script" "$fn"
+      # Diagnostic dump (#600): if the function is expected to be in a known lib,
+      # print what the actual all_fns set looks like so we can tell whether the
+      # function is genuinely missing or whether the resolution loop is broken.
+      printf '  all_fns count: %d\n' "$(printf '%s\n' "$all_fns" | wc -l)"
+      printf '  LIB_FUNS contains "%s": %s\n' "$fn" "$(printf '%s\n' "$LIB_FUNS" | grep -cxF "$fn")"
+      printf '  defining lib (if any): %s\n' "$(grep -l "^[[:space:]]*${fn}[[:space:]]*()" lib/*.sh 2>/dev/null | tr '\n' ' ')"
      FAILED=1
    fi
  done <<< "$candidates"
@ -180,13 +209,11 @@ check_script() {
 # These are already in LIB_FUNS (their definitions are available to agents),
 # but this verifies calls *within* each lib file are also resolvable.
 check_script lib/env.sh              lib/mirrors.sh
-check_script lib/agent-session.sh
 check_script lib/agent-sdk.sh
 check_script lib/ci-helpers.sh
 check_script lib/secret-scan.sh
-check_script lib/file-action-issue.sh   lib/secret-scan.sh
 check_script lib/tea-helpers.sh         lib/secret-scan.sh
-check_script lib/formula-session.sh     lib/agent-session.sh
+check_script lib/formula-session.sh     lib/ops-setup.sh
 check_script lib/load-project.sh
 check_script lib/mirrors.sh              lib/env.sh
 check_script lib/guard.sh
@ -199,18 +226,16 @@ check_script lib/ci-debug.sh
 check_script lib/parse-deps.sh

 # Agent scripts — list cross-sourced files where function scope flows across files.
-# phase-handler.sh defines default callback stubs; sourcing agents may override.
 check_script dev/dev-agent.sh
-check_script dev/phase-handler.sh      lib/secret-scan.sh
 check_script dev/dev-poll.sh
 check_script dev/phase-test.sh
-check_script gardener/gardener-run.sh
+check_script gardener/gardener-run.sh    lib/formula-session.sh
 check_script review/review-pr.sh         lib/agent-sdk.sh
 check_script review/review-poll.sh
-check_script planner/planner-run.sh      lib/agent-session.sh lib/formula-session.sh
+check_script planner/planner-run.sh      lib/formula-session.sh
 check_script supervisor/supervisor-poll.sh
 check_script supervisor/update-prompt.sh
-check_script supervisor/supervisor-run.sh
+check_script supervisor/supervisor-run.sh  lib/formula-session.sh
 check_script supervisor/preflight.sh
 check_script predictor/predictor-run.sh
 check_script architect/architect-run.sh
--- a/.woodpecker/ci.yml
+++ b/.woodpecker/ci.yml
@ -8,6 +8,19 @@
 when:
  event: [push, pull_request]

+# Override default clone to authenticate against Forgejo using FORGE_TOKEN.
+# Required because Forgejo is configured with REQUIRE_SIGN_IN, so anonymous
+# git clones fail with exit code 128. FORGE_TOKEN is injected globally via
+# WOODPECKER_ENVIRONMENT in docker-compose.yml (generated by lib/generators.sh).
+clone:
+  git:
+    image: alpine/git
+    commands:
+      - AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
+      - git clone --depth 1 "$AUTH_URL" .
+      - git fetch --depth 1 origin "$CI_COMMIT_REF"
+      - git checkout FETCH_HEAD
+
 steps:
  - name: shellcheck
    image: koalaman/shellcheck-alpine:stable
--- a/.woodpecker/detect-duplicates.py
+++ b/.woodpecker/detect-duplicates.py
@ -267,6 +267,31 @@ def main() -> int:
        "2653705045fdf65072cccfd16eb04900": "Standard prompt template (GRAPH_SECTION, SCRATCH_CONTEXT, FORMULA_CONTENT)",
        "93726a3c799b72ed2898a55552031921": "Standard prompt template continuation (SCRATCH_CONTEXT, FORMULA_CONTENT, SCRATCH_INSTRUCTION)",
        "c11eaaacab69c9a2d3c38c75215eca84": "Standard prompt template end (FORMULA_CONTENT, SCRATCH_INSTRUCTION)",
+        # Appears in stack_lock_acquire (lib/stack-lock.sh) and lib/pr-lifecycle.sh
+        "29d4f34b703f44699237713cc8d8065b": "Structural end-of-while-loop+case (return 1, esac, done, closing brace)",
+        # Forgejo org-creation API call pattern shared between forge-setup.sh and ops-setup.sh
+        # Extracted from bin/disinto (not a .sh file, excluded from prior scans) into lib/forge-setup.sh
+        "059b11945140c172465f9126b829ed7f": "Forgejo org-creation curl pattern (forge-setup.sh + ops-setup.sh)",
+        # Docker compose environment block for agents service (generators.sh + hire-agent.sh)
+        # Intentional duplicate - both generate the same docker-compose.yml template
+        "8066210169a462fe565f18b6a26a57e0": "Docker compose environment block (generators.sh + hire-agent.sh) - old",
+        "fd978fcd726696e0f280eba2c5198d50": "Docker compose environment block continuation (generators.sh + hire-agent.sh) - old",
+        "e2760ccc2d4b993a3685bd8991594eb2": "Docker compose env_file + depends_on block (generators.sh + hire-agent.sh) - old",
+        # The hash shown in output is 161a80f7 - need to match exactly what the script finds
+        "161a80f7296d6e9d45895607b7f5b9c9": "Docker compose env_file + depends_on block (generators.sh + hire-agent.sh) - old",
+        # New hash after explicit environment fix (#381)
+        "83fa229b86a7fdcb1d3591ab8e718f9d": "Docker compose explicit environment block (generators.sh + hire-agent.sh) - #381",
+        # Verification mode helper functions - intentionally duplicated in dispatcher and entrypoint
+        # These functions check if bug-report parent issues have all sub-issues closed
+        "b783d403276f78b49ad35840845126a1": "Verification helper: sub_issues variable declaration",
+        "4b19b9a1bdfbc62f003fc237ed270ed9": "Verification helper: python3 -c invocation",
+        "cc1d0a9f85dfe0cc32e9ef6361cb8c3a": "Verification helper: Python imports and args",
+        "768926748b811ebd30f215f57db5de40": "Verification helper: json.load from /dev/stdin",
+        "4c58586a30bcf6b009c02010ed8f6256": "Verification helper: sub_issues list initialization",
+        "53ea3d6359f51d622467bd77b079cc88": "Verification helper: iterate issues in data",
+        "21aec56a99d5252b23fb9a38b895e8e8": "Verification helper: check body for Decomposed from pattern",
+        "60ea98b3604557d539193b2a6624e232": "Verification helper: append sub-issue number",
+        "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern",
    }

    if not sh_files:
--- a/.woodpecker/smoke-init.yml
+++ b/.woodpecker/smoke-init.yml
@ -0,0 +1,19 @@
+when:
+  - event: pull_request
+    path:
+      - "bin/disinto"
+      - "lib/load-project.sh"
+      - "lib/env.sh"
+      - "lib/generators.sh"
+      - "tests/**"
+      - ".woodpecker/smoke-init.yml"
+
+steps:
+  - name: smoke-init
+    image: python:3-alpine
+    commands:
+      - apk add --no-cache bash curl jq git coreutils
+      - python3 tests/mock-forgejo.py & echo $! > /tmp/mock-forgejo.pid
+      - sleep 2
+      - bash tests/smoke-init.sh
+      - kill $(cat /tmp/mock-forgejo.pid) 2>/dev/null || true
--- a/AGENTS.md
+++ b/AGENTS.md
@ -1,13 +1,13 @@
-<!-- last-reviewed: f32707ba659de278a3af434e3549fb8a8dce9d3a -->
+<!-- last-reviewed: 4fcbca1bef23734d05a9fc97bb56cd0a6bbcd25f -->
 # Disinto — Agent Instructions

 ## What this repo is

-Disinto is an autonomous code factory. It manages seven agents (dev, review,
-gardener, supervisor, planner, predictor, architect) that pick up issues from
-forge, implement them, review PRs, plan from the vision, and keep the system
-healthy — all via cron and `claude -p`. The dispatcher executes formula-based
-operational tasks.
+Disinto is an autonomous code factory. It manages ten agents (dev, review,
+gardener, supervisor, planner, predictor, architect, reproduce, triage, edge
+dispatcher) that pick up issues from forge, implement them, review PRs, plan
+from the vision, and keep the system healthy — all via a polling loop (`docker/agents/entrypoint.sh`) and `claude -p`.
+The dispatcher executes formula-based operational tasks.

 Each agent has a `.profile` repository on Forgejo that stores lessons learned
 from prior sessions, providing continuous improvement across runs.
@ -21,27 +21,45 @@ See `README.md` for the full architecture and `disinto-factory/SKILL.md` for set

 ```
 disinto/                 (code repo)
-├── dev/           dev-poll.sh, dev-agent.sh, phase-handler.sh — issue implementation
+├── dev/           dev-poll.sh, dev-agent.sh, phase-test.sh — issue implementation
 ├── review/        review-poll.sh, review-pr.sh — PR review
-├── gardener/      gardener-run.sh — direct cron executor for run-gardener formula
-├── predictor/     predictor-run.sh — daily cron executor for run-predictor formula
-├── planner/       planner-run.sh — direct cron executor for run-planner formula
-├── supervisor/    supervisor-run.sh — formula-driven health monitoring (cron wrapper)
+├── gardener/      gardener-run.sh — polling-loop executor for run-gardener formula
+│                  best-practices.md — gardener best-practice reference
+│                  pending-actions.json — queued gardener actions
+├── predictor/     predictor-run.sh — polling-loop executor for run-predictor formula
+├── planner/       planner-run.sh — polling-loop executor for run-planner formula
+├── supervisor/    supervisor-run.sh — formula-driven health monitoring (polling-loop executor)
 │                  preflight.sh — pre-flight data collection for supervisor formula
-│                  supervisor-poll.sh — legacy bash orchestrator (superseded)
 ├── architect/     architect-run.sh — strategic decomposition of vision into sprints
 ├── vault/         vault-env.sh — shared env setup (vault redesign in progress, see #73-#77)
-├── lib/           env.sh, agent-session.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, profile.sh, build-graph.py
+│                  SCHEMA.md — vault item schema documentation
+│                  validate.sh — vault item validator
+│                  examples/ — example vault action TOMLs (promote, publish, release, webhook-call)
+├── lib/           env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, vault.sh, ci-log-reader.py, git-creds.sh
+│                  hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure)
 ├── projects/      *.toml.example — templates; *.toml — local per-box config (gitignored)
 ├── formulas/      Issue templates (TOML specs for multi-step agent tasks)
-└── docs/          Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md)
+├── docker/        Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/)
+├── tools/         Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh)
+├── docs/          Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md)
+├── site/          disinto.ai website content
+├── tests/         Test files (mock-forgejo.py, smoke-init.sh)
+├── templates/     Issue templates
+├── bin/           The `disinto` CLI script
+├── disinto-factory/  Setup documentation and skill
+├── state/         Runtime state
+├── .woodpecker/   Woodpecker CI pipeline configs
+├── VISION.md      High-level project vision
+└── CLAUDE.md      Claude Code project instructions

 disinto-ops/             (ops repo — {project}-ops)
 ├── vault/
+│   ├── actions/   where vault action TOMLs land (core of vault workflow)
 │   ├── pending/   vault items awaiting approval
 │   ├── approved/  approved vault items
 │   ├── fired/     executed vault items
 │   └── rejected/  rejected vault items
+├── sprints/       sprint planning artifacts
 ├── knowledge/     shared agent knowledge + best practices
 ├── evidence/      engagement data, experiment results
 ├── portfolio.md   addressables + observables
@ -49,39 +67,11 @@ disinto-ops/             (ops repo — {project}-ops)
 └── RESOURCES.md   accounts, tokens (refs), infra inventory
 ```

-> **Note:** Journal directories (`journal/planner/` and `journal/supervisor/`) have been removed from the ops repo. Agent journals are now stored in each agent's `.profile` repo on Forgejo.
-
 ## Agent .profile Model

-Each agent has a `.profile` repository on Forgejo that stores:
- `formula.toml` — agent-specific formula (optional, falls back to `formulas/<agent>.toml`)
- `knowledge/lessons-learned.md` — distilled lessons from journal entries
- `journal/` — session reflection entries (archived after digestion)
+Each agent has a `.profile` repository on Forgejo storing `knowledge/lessons-learned.md` (injected into each session prompt) and `journal/` reflection entries (digested into lessons). Pre-session: `formula_prepare_profile_context()` loads lessons. Post-session: `profile_write_journal` records reflections. See `lib/formula-session.sh`.

-### How it works
-
-1. **Pre-session:** The agent calls `formula_prepare_profile_context()` which:
-   - Resolves the agent's Forgejo identity from their token
-   - Clones/pulls the `.profile` repo to a local cache
-   - Loads `knowledge/lessons-learned.md` into `LESSONS_CONTEXT` for prompt injection
-   - Automatically digests journals if >10 undigested entries exist
-
-2. **Prompt injection:** Lessons are injected into the agent prompt:
-   ```
-   ## Lessons learned (from .profile/knowledge/lessons-learned.md)
-   <abstracted lessons from prior sessions>
-   ```
-
-3. **Post-session:** The agent calls `profile_write_journal` which:
-   - Generates a reflection entry about the session
-   - Writes it to `journal/issue-{N}.md`
-   - Commits and pushes to the `.profile` repo
-   - Journals are archived after being digested into lessons-learned.md
-
-> **Terminology note:** "Formulas" in this repo are TOML issue templates in `formulas/` that
-> orchestrate multi-step agent tasks (e.g., `run-gardener.toml`, `run-planner.toml`). This is
-> distinct from "processes" described in `docs/EVIDENCE-ARCHITECTURE.md`, which are measurement
-> and mutation pipelines that read external platforms and write structured evidence to git.
+> **Terminology note:** "Formulas" are TOML issue templates in `formulas/` that orchestrate multi-step agent tasks. Distinct from "processes" in `docs/EVIDENCE-ARCHITECTURE.md`.

 ## Tech stack

@ -124,6 +114,9 @@ bash dev/phase-test.sh
 | Planner | `planner/` | Strategic planning | [planner/AGENTS.md](planner/AGENTS.md) |
 | Predictor | `predictor/` | Infrastructure pattern detection | [predictor/AGENTS.md](predictor/AGENTS.md) |
 | Architect | `architect/` | Strategic decomposition | [architect/AGENTS.md](architect/AGENTS.md) |
+| Reproduce | `docker/reproduce/` | Bug reproduction using Playwright MCP | `formulas/reproduce.toml` |
+| Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` |
+| Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` |

 > **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77).
 > See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details.
@ -146,30 +139,24 @@ Issues flow: `backlog` → `in-progress` → PR → CI → review → merge →
 | `blocked` | Issue is stuck — agent session failed, crashed, timed out, or CI exhausted. Diagnostic comment on the issue has details. Also used for unmet dependencies. | dev-agent.sh, dev-poll.sh (on failure) |
 | `tech-debt` | Pre-existing issue flagged by AI reviewer, not introduced by a PR. | review-pr.sh (auto-created follow-ups) |
 | `underspecified` | Dev-agent refused the issue as too large or vague. | dev-poll.sh (on preflight `too_large`), dev-agent.sh (on mid-run `too_large` refusal) |
+| `bug-report` | Issue describes user-facing broken behavior with reproduction steps. Separate triage track for reproduction automation. | Gardener (bug-report detection in grooming) |
+| `in-triage` | Bug reproduced but root cause not obvious — triage agent investigates. Set alongside `bug-report`. | reproduce-agent (when reproduction succeeds but cause unclear) |
+| `rejected` | Issue formally rejected — cannot reproduce, out of scope, or invalid. | reproduce-agent, humans |
 | `vision` | Goal anchors — high-level objectives from VISION.md. | Planner, humans |
 | `prediction/unreviewed` | Unprocessed prediction filed by predictor. | predictor-run.sh |
 | `prediction/dismissed` | Prediction triaged as DISMISS — planner disagrees, closed with reason. | Planner (triage-predictions step) |
 | `prediction/actioned` | Prediction promoted or dismissed by planner. | Planner (triage-predictions step) |
+| `formula` | Issue is a formula-based operational task. Dev-poll skips these; dispatcher handles them. | Dispatcher (when dispatching formula tasks) |

 ### Dependency conventions

-Issues declare dependencies in their body using a `## Dependencies` or
-`## Depends on` section listing `#N` references. The dev-poll scheduler uses
-`lib/parse-deps.sh` to extract these and only picks issues whose dependencies
-are all closed.
-
-### Single-threaded pipeline
-
-Each project processes one issue at a time. Dev-poll will not start new work
-while an open PR is waiting for CI or review. This keeps context clear and
-prevents merge conflicts between concurrent changes.
+Issues declare dependencies via `## Dependencies` / `## Depends on` sections listing `#N` refs. `lib/parse-deps.sh` extracts these; dev-poll only picks issues whose deps are all closed. See AD-002 for concurrency bounds per LLM backend.

 ---

-## Addressables
+## Addressables and Observables

-Concrete artifacts the factory has produced or is building. The gardener
-maintains this table during grooming — see `formulas/run-gardener.toml`.
+Concrete artifacts the factory has produced or is building. Observables have measurement wired — the gardener promotes addressables once an evidence process is connected.

 | Artifact | Location | Observable? |
 |----------|----------|-------------|
@ -178,14 +165,6 @@ maintains this table during grooming — see `formulas/run-gardener.toml`.
 | Skill    | ClawHub (in progress) | No |
 | GitHub org | github.com/Disinto | No |

-## Observables
-
-Addressables with measurement wired — the factory can read structured
-feedback from these. The gardener promotes addressables here once an
-evidence process is connected.
-
-None yet.
-
 ---

 ## Architecture Decisions
@ -194,17 +173,18 @@ Humans write these. Agents read and enforce them.

 | ID | Decision | Rationale |
 |---|---|---|
-| AD-001 | Nervous system runs from cron, not PR-based actions. | Planner, predictor, gardener, supervisor run directly via `*-run.sh`. They create work, they don't become work. (See PR #474 revert.) |
-| AD-002 | Single-threaded pipeline per project. | One dev issue at a time. No new work while a PR awaits CI or review. Prevents merge conflicts and keeps context clear. |
+| AD-001 | Nervous system runs from a polling loop (`docker/agents/entrypoint.sh`), not PR-based actions. | Planner, predictor, gardener, supervisor run directly via `*-run.sh`. They create work, they don't become work. (See PR #474 revert.) |
+| AD-002 | **Concurrency is bounded per LLM backend, not per project.** One concurrent Claude session per OAuth credential pool; one concurrent session per llama-server instance. Containers with disjoint backends may run in parallel. | The single-thread invariant is about *backends*, not pipelines. **(a) Anthropic OAuth credentials race on token refresh** — each container uses a per-session `CLAUDE_CONFIG_DIR`, so Claude Code's native lockfile-based OAuth refresh handles contention automatically without external serialization. (Legacy: set `CLAUDE_EXTERNAL_LOCK=1` to re-enable the old `flock session.lock` wrapper for rollback.) **(b) llama-server has finite VRAM and one KV cache** — parallel inference thrashes the cache and risks OOM. All llama-backed agents serialize on the same lock. **(c) Disjoint backends are free to parallelize.** Today `disinto-agents` (Anthropic OAuth, runs `review,gardener`) runs concurrently with `disinto-agents-llama` (llama, runs `dev`) on the same project — they share neither OAuth state nor llama VRAM. **(d) Per-project work-conflict safety** (no duplicate dev work, no merge conflicts on the same branch) is enforced by `issue_claim` (assignee + `in-progress` label) and per-issue worktrees — that's a separate guard that does NOT depend on this AD. |
 | AD-003 | The runtime creates and destroys, the formula preserves. | Runtime manages worktrees/sessions/temp. Formulas commit knowledge to git before signaling done. |
 | AD-004 | Event-driven > polling > fixed delays. | Never `waitForTimeout` or hardcoded sleep. Use phase files, webhooks, or poll loops with backoff. |
-| AD-005 | Secrets via env var indirection, never in issue bodies. | Issue bodies become code. Agent secrets go in `.env.enc`, vault secrets in `.env.vault.enc` (both SOPS-encrypted). Referenced as `$VAR_NAME`. Runner gets only vault secrets; agents get only agent secrets. |
+| AD-005 | Secrets via env var indirection, never in issue bodies. | Issue bodies become code. Agent secrets go in `.env.enc`, vault secrets in `.env.vault.enc` (SOPS-encrypted when available; plaintext `.env`/`.env.vault` fallback supported). Referenced as `$VAR_NAME`. Runner gets only vault secrets; agents get only agent secrets. |
 | AD-006 | External actions go through vault dispatch, never direct. | Agents build addressables; only the vault exercises them (publishes, deploys, posts). Tokens for external systems (`GITHUB_TOKEN`, `CLAWHUB_TOKEN`, deploy keys) live only in `.env.vault.enc` and are injected into the ephemeral runner container. `lib/env.sh` unsets them so agents never hold them. PRs with direct external actions without vault dispatch get REQUEST_CHANGES. (Vault redesign in progress: PR-based approval on ops repo, see #73-#77) |

 **Who enforces what:**
 - **Gardener** checks open backlog issues against ADs during grooming; closes violations with a comment referencing the AD number.
 - **Planner** plans within the architecture; does not create issues that violate ADs.
 - **Dev-agent** reads AGENTS.md before implementing; refuses work that violates ADs.
+- **AD-002 is a runtime invariant; nothing for the gardener to check at issue-groom time.** OAuth concurrency is handled by per-session `CLAUDE_CONFIG_DIR` isolation (with `CLAUDE_EXTERNAL_LOCK` as a rollback flag). Per-issue work is enforced by `issue_claim`. A violation manifests as a 401 or VRAM OOM in agent logs, not as a malformed issue.

 ---

@ -217,5 +197,4 @@ at each phase boundary by writing to a phase file (e.g.
 Key phases: `PHASE:awaiting_ci` → `PHASE:awaiting_review` → `PHASE:done`.
 Also: `PHASE:escalate` (needs human input), `PHASE:failed`.

-See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec
-including the orchestrator reaction matrix, sequence diagram, and crash recovery.
+See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec, orchestrator reaction matrix, sequence diagram, and crash recovery.
--- a/README.md
+++ b/README.md
@ -21,22 +21,29 @@ Point it at a git repo with a Woodpecker CI pipeline and it will pick up issues,
 ## Architecture

 ```
-cron (*/10) ──→ supervisor-poll.sh    ← supervisor (bash checks, zero tokens)
-                 ├── all clear? → exit 0
-                 └── problem? → claude -p (diagnose, fix, or escalate)
-
-cron (*/10) ──→ dev-poll.sh        ← pulls ready issues, spawns dev-agent
-                 └── dev-agent.sh   ← claude -p: implement → PR → CI → review → merge
-
-cron (*/10) ──→ review-poll.sh     ← finds unreviewed PRs, spawns review
-                 └── review-pr.sh   ← claude -p: review → approve/request changes
-
-cron (daily) ──→ gardener-poll.sh  ← backlog grooming (duplicates, stale, tech-debt)
-                  └── claude -p: triage → promote/close/escalate
-
-cron (weekly) ──→ planner-poll.sh  ← gap-analyse VISION.md, create backlog issues
-                   └── claude -p: update AGENTS.md → create issues
+entrypoint.sh (while-true polling loop, 5 min base interval)
+ │
+ ├── every 5 min ──→ review-poll.sh   ← finds unreviewed PRs, spawns review
+ │                    └── review-pr.sh  ← claude -p: review → approve/request changes
+ │
+ ├── every 5 min ──→ dev-poll.sh      ← pulls ready issues, spawns dev-agent
+ │                    └── dev-agent.sh  ← claude -p: implement → PR → CI → review → merge
+ │
+ ├── every 6h ────→ gardener-run.sh   ← backlog grooming (duplicates, stale, tech-debt)
+ │                   └── claude -p: triage → promote/close/escalate
+ │
+ ├── every 6h ────→ architect-run.sh  ← strategic decomposition of vision into sprints
+ │
+ ├── every 12h ───→ planner-run.sh    ← gap-analyse VISION.md, create backlog issues
+ │                   └── claude -p: update AGENTS.md → create issues
+ │
+ └── every 24h ───→ predictor-run.sh  ← infrastructure pattern detection

+entrypoint-edge.sh (edge container)
+ ├── dispatcher.sh                    ← polls ops repo for vault actions
+ └── every 20 min → supervisor-run.sh ← health checks (bash checks, zero tokens)
+                     ├── all clear? → exit 0
+                     └── problem? → claude -p (diagnose, fix, or escalate)
 ```

 ## Prerequisites
@ -65,6 +72,8 @@ cd disinto
 disinto init https://github.com/yourorg/yourproject
 ```

+This will generate a `docker-compose.yml` file.
+
 Or configure manually — edit `.env` with your values:

 ```bash
@ -86,17 +95,11 @@ CLAUDE_TIMEOUT=7200         # max seconds per Claude invocation (default: 2h)
 ```

 ```bash
-# 3. Install cron (staggered to avoid overlap)
-crontab -e
-# Add:
-#   0,10,20,30,40,50 * * * * /path/to/disinto/supervisor/supervisor-poll.sh
-#   3,13,23,33,43,53 * * * * /path/to/disinto/review/review-poll.sh
-#   6,16,26,36,46,56 * * * * /path/to/disinto/dev/dev-poll.sh
-#   15 8 * * *                /path/to/disinto/gardener/gardener-poll.sh
-#   0 9 * * 1                 /path/to/disinto/planner/planner-poll.sh
+# 3. Start the agent and edge containers
+docker compose up -d

-# 4. Verify
-bash supervisor/supervisor-poll.sh   # should log "all clear"
+# 4. Verify the entrypoint loop is running
+docker exec disinto-agents tail -f /home/agent/data/agent-entrypoint.log
 ```

 ## Directory Structure
@ -109,16 +112,16 @@ disinto/
 │   ├── env.sh              # Shared: load .env, PATH, API helpers
 │   └── ci-debug.sh         # Woodpecker CI log/failure helper
 ├── dev/
-│   ├── dev-poll.sh       # Cron entry: find ready issues
+│   ├── dev-poll.sh       # Poll: find ready issues
 │   └── dev-agent.sh      # Implementation agent (claude -p)
 ├── review/
-│   ├── review-poll.sh    # Cron entry: find unreviewed PRs
+│   ├── review-poll.sh    # Poll: find unreviewed PRs
 │   └── review-pr.sh      # Review agent (claude -p)
 ├── gardener/
-│   ├── gardener-poll.sh  # Cron entry: backlog grooming
+│   ├── gardener-run.sh   # Executor: backlog grooming
 │   └── best-practices.md # Gardener knowledge base
 ├── planner/
-│   ├── planner-poll.sh   # Cron entry: weekly vision gap analysis
+│   ├── planner-run.sh    # Executor: vision gap analysis
 │   └── (formula-driven)  # run-planner.toml executed by dispatcher
 ├── vault/
 │   └── vault-env.sh      # Shared env setup (vault redesign in progress, see #73-#77)
@ -141,11 +144,11 @@ disinto/

 | Agent | Trigger | Job |
 |-------|---------|-----|
-| **Supervisor** | Every 10 min | Health checks (RAM, disk, CI, git). Calls Claude only when something is broken. Self-improving via `best-practices/`. |
-| **Dev** | Every 10 min | Picks up `backlog`-labeled issues, creates a branch, implements, opens a PR, monitors CI, responds to review, merges. |
-| **Review** | Every 10 min | Finds PRs without review, runs Claude-powered code review, approves or requests changes. |
-| **Gardener** | Daily | Grooms the issue backlog: detects duplicates, promotes `tech-debt` to `backlog`, closes stale issues, escalates ambiguous items. |
-| **Planner** | Weekly | Updates AGENTS.md documentation to reflect recent code changes, then gap-analyses VISION.md vs current state and creates up to 5 backlog issues for the highest-leverage gaps. |
+| **Supervisor** | Every 20 min | Health checks (RAM, disk, CI, git). Calls Claude only when something is broken. Self-improving via `best-practices/`. |
+| **Dev** | Every 5 min | Picks up `backlog`-labeled issues, creates a branch, implements, opens a PR, monitors CI, responds to review, merges. |
+| **Review** | Every 5 min | Finds PRs without review, runs Claude-powered code review, approves or requests changes. |
+| **Gardener** | Every 6h | Grooms the issue backlog: detects duplicates, promotes `tech-debt` to `backlog`, closes stale issues, escalates ambiguous items. |
+| **Planner** | Every 12h | Updates AGENTS.md documentation to reflect recent code changes, then gap-analyses VISION.md vs current state and creates up to 5 backlog issues for the highest-leverage gaps. |

 > **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77).
 > See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow and branch protection details.
--- a/architect/AGENTS.md
+++ b/architect/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: auto-generated -->
+<!-- last-reviewed: 4fcbca1bef23734d05a9fc97bb56cd0a6bbcd25f -->
 # Architect — Agent Instructions

 ## What this agent is
@ -11,7 +11,7 @@ converses with humans through PR comments.

 - **Input**: Vision issues from VISION.md, prerequisite tree from ops repo
 - **Output**: Sprint proposals as PRs on the ops repo, sub-issue files
- **Mechanism**: Formula-driven execution via `formulas/run-architect.toml`
+- **Mechanism**: Bash-driven orchestration in `architect-run.sh`, pitching formula via `formulas/run-architect.toml`
 - **Identity**: `architect-bot` on Forgejo

 ## Responsibilities
@ -29,28 +29,85 @@ converses with humans through PR comments.

 ## Formula

-The architect is driven by `formulas/run-architect.toml`. This formula defines
+The architect pitching is driven by `formulas/run-architect.toml`. This formula defines
 the steps for:
 - Research: analyzing vision items and prerequisite tree
- Design: identifying implementation approaches and forks
- Sprint proposal: creating structured sprint PRs
+- Pitch: creating structured sprint PRs
 - Sub-issue filing: creating concrete implementation issues

+## Bash-driven orchestration
+
+Bash in `architect-run.sh` handles state detection and orchestration:
+
+- **Deterministic state detection**: Bash reads the Forgejo reviews API to detect
+  ACCEPT/REJECT decisions — checks both formal APPROVED reviews and PR comments, not just comments (#718)
+- **Human guidance injection**: Review body text from ACCEPT reviews is injected
+  directly into the research prompt as context
+- **Response processing**: When ACCEPT/REJECT responses are detected, bash invokes
+  the agent with appropriate context (session resumed for questions phase)
+- **Pitch capture**: `pitch_output` is written to a temp file instead of captured via `$()` subshell, because `agent_run` writes to side-channels (`SID_FILE`, `LOGFILE`) that subshell capture would suppress (#716)
+- **PR URL construction**: existing-PR check uses `${FORGE_API}/pulls` directly (not `${FORGE_API}/repos/…`) — the base URL already includes the repos segment (#717)
+
+### State transitions
+
+```
+New vision issue → pitch PR (model generates pitch, bash creates PR)
+  ↓
+APPROVED review → start design questions (model posts Q1:, adds Design forks section)
+  ↓
+Answers received → continue Q&A (model processes answers, posts follow-ups)
+  ↓
+All forks resolved → sub-issue filing (model files implementation issues)
+  ↓
+REJECT review → close PR + journal (model processes rejection, bash merges PR)
+```
+
+### Vision issue lifecycle
+
+Vision issues decompose into sprint sub-issues tracked via "Decomposed from #N" in sub-issue bodies. The architect automatically closes vision issues when all sub-issues are closed:
+
+1. Before picking new vision issues, the architect checks each open vision issue
+2. For each, it queries merged sprint PRs — **only PRs whose title or body reference the specific vision issue** (matched via `#N` pattern, filtering out unrelated PRs that happen to close unrelated issues) (#735/#736)
+3. Extracts sub-issue numbers from those PRs, excluding the vision issue itself
+4. If all sub-issues are closed, posts a summary comment listing completed sub-issues (with an idempotency guard: checks both comment presence AND `.state == "closed"` — if the comment exists but the issue is still open, retries the close rather than returning early) (#737)
+5. The vision issue is then closed automatically
+
+This ensures vision issues transition from `open` → `closed` once their work is complete, without manual intervention. The #N-scoped matching prevents false positives where unrelated sub-issues would incorrectly trigger vision issue closure.
+
+### Session management
+
+The agent maintains a global session file at `/tmp/architect-session-{project}.sid`.
+When processing responses, bash checks if the PR is in the questions phase and
+resumes the session using `--resume session_id` to preserve codebase context.
+
 ## Execution

 Run via `architect/architect-run.sh`, which:
- Acquires a cron lock and checks available memory
+- Acquires a poll-loop lock (via `acquire_lock`) and checks available memory
+- Cleans up per-issue scratch files from previous runs (`/tmp/architect-{project}-scratch-*.md`)
 - Sources shared libraries (env.sh, formula-session.sh)
 - Uses FORGE_ARCHITECT_TOKEN for authentication
+- Processes existing architect PRs via bash-driven design phase
 - Loads the formula and builds context from VISION.md, AGENTS.md, and ops repo
- Executes the formula via `agent_run`
+- Bash orchestrates state management:
+  - Fetches open vision issues, open architect PRs, and merged sprint PRs from Forgejo API
+  - Filters out visions already with open PRs, in-progress label, sub-issues, or merged sprint PRs
+  - Selects up to `pitch_budget` (3 - open architect PRs) remaining vision issues
+  - For each selected issue, invokes stateless `claude -p` with issue body + context
+  - Creates PRs directly from pitch content (no scratch files)
+- Agent is invoked only for response processing (ACCEPT/REJECT handling)

-## Cron
+**Multi-sprint pitching**: The architect pitches up to 3 sprints per run. Bash handles all state management:
+- Fetches Forgejo API data (vision issues, open PRs, merged PRs)
+- Filters and deduplicates (no model-level dedup or journal-based memory)
+- For each selected vision issue, bash invokes stateless `claude -p` to generate pitch markdown
+- Bash creates the PR with pitch content and posts ACCEPT/REJECT footer comment
+- Branch names use issue number (architect/sprint-vision-{issue_number}) to avoid collisions

-Suggested cron entry (every 6 hours):
-```cron
-0 */6 * * * cd /path/to/disinto && bash architect/architect-run.sh
-```
+## Schedule
+
+The architect runs every 6 hours as part of the polling loop in
+`docker/agents/entrypoint.sh` (iteration math at line 196-208).

 ## State

@ -63,3 +120,4 @@ empty file not created, just document it).
 - #100: Architect formula — research + design fork identification
 - #101: Architect formula — sprint PR creation with questions
 - #102: Architect formula — answer parsing + sub-issue filing
+- #491: Refactor — bash-driven design phase with stateful session resumption
--- a/architect/architect-run.sh
+++ b/architect/architect-run.sh
--- a/bin/disinto
+++ b/bin/disinto
--- a/dev/AGENTS.md
+++ b/dev/AGENTS.md
@ -1,22 +1,40 @@
-<!-- last-reviewed: f32707ba659de278a3af434e3549fb8a8dce9d3a -->
+<!-- last-reviewed: 4fcbca1bef23734d05a9fc97bb56cd0a6bbcd25f -->
 # Dev Agent

 **Role**: Implement issues autonomously — write code, push branches, address
 CI failures and review feedback.

-**Trigger**: `dev-poll.sh` runs every 10 min via cron. Sources `lib/guard.sh` and
-calls `check_active dev` first — skips if `$FACTORY_ROOT/state/.dev-active` is
-absent. Then performs a direct-merge scan (approved + CI green PRs — including
-chore/gardener PRs without issue numbers), then checks the agent lock and scans
-for ready issues using a two-tier priority queue: (1) `priority`+`backlog` issues
-first (FIFO within tier), then (2) plain `backlog` issues (FIFO). Orphaned
-in-progress issues are also picked up. The direct-merge scan runs before the lock
-check so approved PRs get merged even while a dev-agent session is active.
+**Trigger**: `dev-poll.sh` is invoked by the polling loop in `docker/agents/entrypoint.sh`
+every 5 minutes (iteration math at line 171-175). Sources `lib/guard.sh` and calls
+`check_active dev` first — skips if `$FACTORY_ROOT/state/.dev-active` is absent. Then
+performs a direct-merge scan (approved + CI green PRs — including chore/gardener PRs
+without issue numbers), then checks the agent lock and scans for ready issues using a
+two-tier priority queue: (1) `priority`+`backlog` issues first (FIFO within tier), then
+(2) plain `backlog` issues (FIFO). Orphaned in-progress issues are also picked up. The
+direct-merge scan runs before the lock check so approved PRs get merged even while a
+dev-agent session is active.

 **Key files**:
- `dev/dev-poll.sh` — Cron scheduler: finds next ready issue, handles merge/rebase of approved PRs, tracks CI fix attempts. Formula guard skips issues labeled `formula`, `prediction/dismissed`, or `prediction/unreviewed` (replaced `prediction/backlog` — that label no longer exists)
- `dev/dev-agent.sh` — Orchestrator: claims issue, creates worktree + tmux session with interactive `claude`, monitors phase file, injects CI results and review feedback, merges on approval
- `dev/phase-handler.sh` — Phase callback functions: `post_refusal_comment()`, `_on_phase_change()`, `build_phase_protocol_prompt()`. `do_merge()` detects already-merged PRs on HTTP 405 (race with dev-poll's pre-lock scan) and returns success instead of escalating. Sources `lib/mirrors.sh` and calls `mirror_push()` after every successful merge.
+- `dev/dev-poll.sh` — Polling loop participant: finds next ready issue, handles merge/rebase
+of approved PRs, tracks CI fix attempts. Invoked by `docker/agents/entrypoint.sh` every 5
+minutes. `BOT_USER` is resolved once at startup via the Forge `/user` API and cached for
+all assignee checks. Formula guard skips issues labeled `formula`, `prediction/dismissed`,
+or `prediction/unreviewed`. **Race prevention**: checks issue assignee before claiming —
+skips if assigned to a different bot user. **Stale branch abandonment**: closes PRs and
+deletes branches that are behind `$PRIMARY_BRANCH` (restarts poll cycle for a fresh start).
+**Stale in-progress recovery**: on each poll cycle, scans for issues labeled `in-progress`.
+If the issue has a `vision` label, sets `BLOCKED_BY_INPROGRESS=true` and skips further
+stale checks (vision issues are managed by the architect). If the issue is assigned to
+`$BOT_USER` (this agent), checks for pending review feedback first — if an open PR has
+`REQUEST_CHANGES`, spawns the dev-agent to address it before setting `BLOCKED_BY_INPROGRESS=true`;
+otherwise just sets blocked. If assigned to another agent, logs and falls through (does not
+block). If no assignee, no open PR, and no agent lock file — removes `in-progress`, adds
+`blocked` with a human-triage comment. **Per-agent open-PR gate**: before starting new work,
+filters open waiting PRs to only those assigned to this agent (`$BOT_USER`). Other agents'
+PRs do not block this agent's pipeline (#358, #369). **Pre-lock merge scan own-PRs only**:
+the direct-merge scan only merges PRs whose linked issue is assigned to this agent — skips
+PRs owned by other bot users (#374).
+- `dev/dev-agent.sh` — Orchestrator: claims issue, creates worktree + tmux session with interactive `claude`, monitors phase file, injects CI results and review feedback, merges on approval. **Launched as a subshell** (`("${SCRIPT_DIR}/dev-agent.sh" ...) &`) — not via `nohup` — to avoid deadlocking the polling loop and review-poll when running in the same container (#693).
 - `dev/phase-test.sh` — Integration test for the phase protocol

 **Environment variables consumed** (via `lib/env.sh` + project TOML):
@ -33,9 +51,9 @@ check so approved PRs get merged even while a dev-agent session is active.

 **Crash recovery**: on `PHASE:crashed` or non-zero exit, the worktree is **preserved** (not destroyed) for debugging. Location logged. Supervisor housekeeping removes stale crashed worktrees older than 24h.

-**Lifecycle**: dev-poll.sh (`check_active dev`) → dev-agent.sh → tmux `dev-{project}-{issue}` → phase file
-drives CI/review loop → merge + `mirror_push()` → close issue. On respawn after
-`PHASE:escalate`, the stale phase file is cleared first so the session starts
-clean; the reinject prompt tells Claude not to re-escalate for the same reason.
-On respawn for any active PR, the prompt explicitly tells Claude the PR already
-exists and not to create a new one via API.
+**Lifecycle**: dev-poll.sh (invoked by polling loop, `check_active dev`) → dev-agent.sh →
+tmux session → phase file drives CI/review loop → merge + `mirror_push()` → close issue.
+On respawn after `PHASE:escalate`, the stale phase file is cleared first so the session
+starts clean; the reinject prompt tells Claude not to re-escalate for the same reason.
+On respawn for any active PR, the prompt explicitly tells Claude the PR already exists
+and not to create a new one via API.
--- a/dev/dev-agent.sh
+++ b/dev/dev-agent.sh
@ -41,7 +41,7 @@ REPO_ROOT="${PROJECT_REPO_ROOT}"

 LOCKFILE="/tmp/dev-agent-${PROJECT_NAME:-default}.lock"
 STATUSFILE="/tmp/dev-agent-status-${PROJECT_NAME:-default}"
-BRANCH="fix/issue-${ISSUE}"
+BRANCH="fix/issue-${ISSUE}"  # Default; will be updated after FORGE_REMOTE is known
 WORKTREE="/tmp/${PROJECT_NAME}-worktree-${ISSUE}"
 SID_FILE="/tmp/dev-session-${PROJECT_NAME}-${ISSUE}.sid"
 PREFLIGHT_RESULT="/tmp/dev-agent-preflight.json"
@ -263,6 +263,33 @@ FORGE_REMOTE="${FORGE_REMOTE:-origin}"
 export FORGE_REMOTE
 log "forge remote: ${FORGE_REMOTE}"

+# Generate unique branch name per attempt to avoid collision with failed attempts
+# Only apply when not in recovery mode (RECOVERY_MODE branch is already set from existing PR)
+# First attempt: fix/issue-N, subsequent: fix/issue-N-1, fix/issue-N-2, etc.
+if [ "$RECOVERY_MODE" = false ]; then
+  # Count only branches matching fix/issue-N, fix/issue-N-1, fix/issue-N-2, etc. (exact prefix match)
+  # Use explicit error handling to avoid silent failure from set -e + pipefail when git ls-remote fails.
+  if _lr1=$(git ls-remote --heads "$FORGE_REMOTE" "refs/heads/fix/issue-${ISSUE}" 2>&1); then
+    ATTEMPT=$(printf '%s\n' "$_lr1" | grep -c "refs/heads/fix/issue-${ISSUE}$" || true)
+  else
+    log "WARNING: git ls-remote failed for attempt counting: $_lr1"
+    ATTEMPT=0
+  fi
+  ATTEMPT="${ATTEMPT:-0}"
+
+  if _lr2=$(git ls-remote --heads "$FORGE_REMOTE" "refs/heads/fix/issue-${ISSUE}-*" 2>&1); then
+    # Guard on empty to avoid off-by-one: command substitution strips trailing newlines,
+    # so wc -l undercounts by 1 when output exists. Re-add newline only if non-empty.
+    ATTEMPT=$((ATTEMPT + $( [ -z "$_lr2" ] && echo 0 || printf '%s\n' "$_lr2" | wc -l )))
+  else
+    log "WARNING: git ls-remote failed for suffix counting: $_lr2"
+  fi
+  if [ "$ATTEMPT" -gt 0 ]; then
+    BRANCH="fix/issue-${ISSUE}-${ATTEMPT}"
+  fi
+fi
+log "using branch: ${BRANCH}"
+
 if [ "$RECOVERY_MODE" = true ]; then
  if ! worktree_recover "$WORKTREE" "$BRANCH" "$FORGE_REMOTE"; then
    log "ERROR: worktree recovery failed"
@ -575,11 +602,8 @@ else
  outcome="blocked_${_PR_WALK_EXIT_REASON:-agent_failed}"
  profile_write_journal "$ISSUE" "$ISSUE_TITLE" "$outcome" "$FILES_CHANGED" || true

-  # Cleanup on failure: close PR, delete remote branch, clean up worktree
-  if [ -n "$PR_NUMBER" ]; then
-    pr_close "$PR_NUMBER"
-  fi
-  git push "$FORGE_REMOTE" --delete "$BRANCH" 2>/dev/null || true
+  # Cleanup on failure: preserve remote branch and PR for debugging, clean up local worktree
+  # Remote state (PR and branch) stays open for inspection of CI logs and review comments
  worktree_cleanup "$WORKTREE"
  rm -f "$SID_FILE" "$IMPL_SUMMARY_FILE"
  CLAIMED=false
--- a/dev/dev-poll.sh
+++ b/dev/dev-poll.sh
@ -14,7 +14,7 @@
 #   3. Ready "backlog" issues without "priority" (FIFO within tier)
 #
 # Usage:
-#   cron every 10min
+#   Called by: entrypoint.sh polling loop (every 10 min)
 #   dev-poll.sh [projects/harb.toml]   # optional project config

 set -euo pipefail
@ -42,6 +42,11 @@ log() {
  printf '[%s] poll: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE"
 }

+# Resolve current agent identity once at startup — cache for all assignee checks
+BOT_USER=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  "${API%%/repos*}/user" | jq -r '.login') || BOT_USER=""
+log "running as agent: ${BOT_USER}"
+
 # =============================================================================
 # CI FIX TRACKER: per-PR counter to avoid infinite respawn loops (max 3)
 # =============================================================================
@ -94,6 +99,100 @@ is_blocked() {
    | jq -e '.[] | select(.name == "blocked")' >/dev/null 2>&1
 }

+# =============================================================================
+# STALENESS DETECTION FOR IN-PROGRESS ISSUES
+# =============================================================================
+
+# Check if in-progress label was added recently (within grace period).
+# Prevents race where a poller marks an issue as stale before the claiming
+# agent's assign + label sequence has fully propagated. See issue #471.
+# Args: issue_number [grace_seconds]
+# Returns: 0 if recently added (within grace period), 1 if not
+in_progress_recently_added() {
+  local issue="$1" grace="${2:-60}"
+  local now label_ts delta
+
+  now=$(date +%s)
+
+  # Query issue timeline for the most recent in-progress label event.
+  # Forgejo 11.x API returns type as string "label", not integer 7.
+  label_ts=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+    "${API}/issues/${issue}/timeline" | \
+    jq -r '[.[] | select(.type == "label") | select(.label.name == "in-progress")] | last | .created_at // empty') || true
+
+  if [ -z "$label_ts" ]; then
+    return 1  # no label event found — not recently added
+  fi
+
+  # Convert ISO timestamp to epoch and compare
+  local label_epoch
+  label_epoch=$(date -d "$label_ts" +%s 2>/dev/null || echo 0)
+  delta=$(( now - label_epoch ))
+
+  if [ "$delta" -lt "$grace" ]; then
+    return 0  # within grace period
+  fi
+  return 1
+}
+
+# Check if there's an open PR for a specific issue
+# Args: issue_number
+# Returns: 0 if open PR exists, 1 if not
+open_pr_exists() {
+  local issue="$1"
+  local branch="fix/issue-${issue}"
+  local pr_num
+
+  pr_num=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+    "${API}/pulls?state=open&limit=20" | \
+    jq -r --arg branch "$branch" \
+    '.[] | select(.head.ref == $branch) | .number' | head -1) || true
+
+  [ -n "$pr_num" ]
+}
+
+# Relabel a stale in-progress issue to blocked with diagnostic comment
+# Args: issue_number reason
+# Uses shared helpers from lib/issue-lifecycle.sh
+relabel_stale_issue() {
+  local issue="$1" reason="$2"
+
+  log "relabeling stale in-progress issue #${issue} to blocked: ${reason}"
+
+  # Remove in-progress label
+  local ip_id
+  ip_id=$(_ilc_in_progress_id)
+  if [ -n "$ip_id" ]; then
+    curl -sf -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \
+      "${API}/issues/${issue}/labels/${ip_id}" >/dev/null 2>&1 || true
+  fi
+
+  # Add blocked label
+  local bk_id
+  bk_id=$(_ilc_blocked_id)
+  if [ -n "$bk_id" ]; then
+    curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \
+      -H "Content-Type: application/json" \
+      "${API}/issues/${issue}/labels" \
+      -d "{\"labels\":[${bk_id}]}" >/dev/null 2>&1 || true
+  fi
+
+  # Post diagnostic comment using shared helper
+  local comment_body
+  comment_body=$(
+    printf '%s\n\n' '### Stale in-progress issue detected'
+    printf '%s\n' '| Field | Value |'
+    printf '%s\n' '|---|---|'
+    printf '| Detection reason | `%s` |\n' "$reason"
+    printf '| Timestamp | `%s` |\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+    printf '%s\n' '**Status:** This issue was labeled `in-progress` but has no assignee, no open PR, and no agent lock file.'
+    printf '%s\n' '**Action required:** A maintainer should triage this issue.'
+  )
+  _ilc_post_comment "$issue" "$comment_body"
+
+  _ilc_log "stale issue #${issue} relabeled to blocked: ${reason}"
+}
+
 # =============================================================================
 # HELPER: handle CI-exhaustion check/block (DRY for 3 call sites)
 # Sets CI_FIX_ATTEMPTS for caller use. Returns 0 if exhausted, 1 if not.
@ -278,6 +377,16 @@ for i in $(seq 0 $(($(echo "$PL_PRS" | jq 'length') - 1))); do
    jq -r '[.[] | select(.state == "APPROVED") | select(.stale == false)] | length') || true

  if [ "${PL_HAS_APPROVE:-0}" -gt 0 ]; then
+    # Check if issue is assigned to this agent — only merge own PRs
+    if [ "$PL_ISSUE" -gt 0 ]; then
+      PR_ISSUE_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+        "${API}/issues/${PL_ISSUE}") || true
+      PR_ISSUE_ASSIGNEE=$(echo "$PR_ISSUE_JSON" | jq -r '.assignee.login // ""') || true
+      if [ -n "$PR_ISSUE_ASSIGNEE" ] && [ "$PR_ISSUE_ASSIGNEE" != "$BOT_USER" ]; then
+        log "PR #${PL_PR_NUM} (issue #${PL_ISSUE}) assigned to ${PR_ISSUE_ASSIGNEE} — skipping merge (not mine)"
+        continue
+      fi
+    fi
    if try_direct_merge "$PL_PR_NUM" "$PL_ISSUE"; then
      PL_MERGED_ANY=true
    fi
@ -301,6 +410,9 @@ if [ -f "$LOCKFILE" ]; then
  rm -f "$LOCKFILE"
 fi

+# --- Fetch origin refs before any stale branch checks ---
+git fetch origin --prune 2>/dev/null || true
+
 # --- Memory guard ---
 memory_guard 2000

@ -309,109 +421,214 @@ memory_guard 2000
 # =============================================================================
 log "checking for in-progress issues"

-# Get current bot identity for assignee checks
-BOT_USER=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-  "${API%%/repos*}/user" | jq -r '.login') || BOT_USER=""
-
 ORPHANS_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
  "${API}/issues?state=open&labels=in-progress&limit=10&type=issues")

 ORPHAN_COUNT=$(echo "$ORPHANS_JSON" | jq 'length')
+BLOCKED_BY_INPROGRESS=false
+OTHER_AGENT_INPROGRESS=false
 if [ "$ORPHAN_COUNT" -gt 0 ]; then
  ISSUE_NUM=$(echo "$ORPHANS_JSON" | jq -r '.[0].number')

-  # Formula guard: formula-labeled issues should not be worked on by dev-agent.
-  # Remove in-progress label and skip to prevent infinite respawn cycle (#115).
-  ORPHAN_LABELS=$(echo "$ORPHANS_JSON" | jq -r '.[0].labels[].name' 2>/dev/null) || true
-  SKIP_LABEL=$(echo "$ORPHAN_LABELS" | grep -oE '^(formula|prediction/dismissed|prediction/unreviewed)$' | head -1) || true
-  if [ -n "$SKIP_LABEL" ]; then
-    log "issue #${ISSUE_NUM} has '${SKIP_LABEL}' label — removing in-progress, skipping"
-    IP_ID=$(_ilc_in_progress_id)
-    curl -sf -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \
-      "${API}/issues/${ISSUE_NUM}/labels/${IP_ID}" >/dev/null 2>&1 || true
-    exit 0
+  # Staleness check: if no assignee, no open PR, and no agent lock, the issue is stale
+  OPEN_PR=false
+  if curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+    "${API}/pulls?state=open&limit=20" | \
+    jq -e --arg branch "fix/issue-${ISSUE_NUM}" \
+    '.[] | select(.head.ref == $branch)' >/dev/null 2>&1; then
+    OPEN_PR=true
  fi

-  # Check if there's already an open PR for this issue
-  HAS_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-    "${API}/pulls?state=open&limit=20" | \
-    jq -r --arg branch "fix/issue-${ISSUE_NUM}" \
-    '.[] | select(.head.ref == $branch) | .number' | head -1) || true
+  # Skip issues owned by non-dev agents (bug-report, vision, prediction, etc.)
+  # See issue #608: dev-poll must only touch issues it could actually claim.
+  issue_labels=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+    "${API}/issues/${ISSUE_NUM}" | jq -r '[.labels[].name] | join(",")')
+  if ! issue_is_dev_claimable "$issue_labels"; then
+    log "issue #${ISSUE_NUM} has non-dev label(s) [${issue_labels}] — skipping (owned by another agent)"
+    BLOCKED_BY_INPROGRESS=false
+    OTHER_AGENT_INPROGRESS=true
+  fi

-  if [ -n "$HAS_PR" ]; then
-    PR_SHA=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-      "${API}/pulls/${HAS_PR}" | jq -r '.head.sha') || true
-    CI_STATE=$(ci_commit_status "$PR_SHA") || true
+  # Check if issue has an assignee — only block on issues assigned to this agent
+  assignee=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" "${API}/issues/${ISSUE_NUM}" | jq -r '.assignee.login // ""')
+  if [ -n "$assignee" ]; then
+    if [ "$assignee" = "$BOT_USER" ]; then
+      # Check if my PR has review feedback to address before exiting
+      HAS_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+        "${API}/pulls?state=open&limit=20" | \
+        jq -r --arg branch "fix/issue-${ISSUE_NUM}" \
+        '.[] | select(.head.ref == $branch) | .number' | head -1) || true

-    # Non-code PRs (docs, formulas, evidence) may have no CI — treat as passed
-    if ! ci_passed "$CI_STATE" && ! ci_required_for_pr "$HAS_PR"; then
-      CI_STATE="success"
-      log "PR #${HAS_PR} has no code files — treating CI as passed"
-    fi
+      if [ -n "$HAS_PR" ]; then
+        # Check for REQUEST_CHANGES review feedback
+        REVIEWS_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+          "${API}/pulls/${HAS_PR}/reviews") || true
+        HAS_CHANGES=$(echo "$REVIEWS_JSON" | \
+          jq -r '[.[] | select(.state == "REQUEST_CHANGES") | select(.stale == false)] | length') || true

-    # Check formal reviews (single fetch to avoid race window)
-    REVIEWS_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-      "${API}/pulls/${HAS_PR}/reviews") || true
-    HAS_APPROVE=$(echo "$REVIEWS_JSON" | \
-      jq -r '[.[] | select(.state == "APPROVED") | select(.stale == false)] | length') || true
-    HAS_CHANGES=$(echo "$REVIEWS_JSON" | \
-      jq -r '[.[] | select(.state == "REQUEST_CHANGES") | select(.stale == false)] | length') || true
-
-    if ci_passed "$CI_STATE" && [ "${HAS_APPROVE:-0}" -gt 0 ]; then
-      if try_direct_merge "$HAS_PR" "$ISSUE_NUM"; then
-        exit 0
-      fi
-      # Direct merge failed (conflicts?) — fall back to dev-agent
-      log "falling back to dev-agent for PR #${HAS_PR} merge"
-      nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
-      log "started dev-agent PID $! for issue #${ISSUE_NUM} (agent-merge)"
-      exit 0
-
-    # Do NOT gate REQUEST_CHANGES on ci_passed: act immediately even if CI is
-    # pending/unknown. Definitive CI failure is handled by the elif below.
-    elif [ "${HAS_CHANGES:-0}" -gt 0 ] && { ci_passed "$CI_STATE" || [ "$CI_STATE" = "pending" ] || [ "$CI_STATE" = "unknown" ] || [ -z "$CI_STATE" ]; }; then
-      log "issue #${ISSUE_NUM} PR #${HAS_PR} has REQUEST_CHANGES — spawning agent"
-      nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
-      log "started dev-agent PID $! for issue #${ISSUE_NUM} (review fix)"
-      exit 0
-
-    elif ci_failed "$CI_STATE"; then
-      if handle_ci_exhaustion "$HAS_PR" "$ISSUE_NUM" "check_only"; then
-        # Fall through to backlog scan instead of exit
-        :
-      else
-        # Increment at actual launch time (not on guard-hit paths)
-        if handle_ci_exhaustion "$HAS_PR" "$ISSUE_NUM"; then
-          exit 0  # exhausted between check and launch
+        if [ "${HAS_CHANGES:-0}" -gt 0 ]; then
+          log "issue #${ISSUE_NUM} has review feedback — spawning agent"
+          ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
+          log "started dev-agent PID $! for issue #${ISSUE_NUM} (review fix)"
+          BLOCKED_BY_INPROGRESS=true
+        else
+          log "issue #${ISSUE_NUM} assigned to me — my thread is busy"
+          BLOCKED_BY_INPROGRESS=true
        fi
-        log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)"
-        nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
-        log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
-        exit 0
+      else
+        log "issue #${ISSUE_NUM} assigned to me — my thread is busy"
+        BLOCKED_BY_INPROGRESS=true
      fi
-
    else
-      log "issue #${ISSUE_NUM} has open PR #${HAS_PR} (CI: ${CI_STATE}, waiting)"
-      exit 0
+      log "issue #${ISSUE_NUM} assigned to ${assignee} — their thread, not blocking"
+      OTHER_AGENT_INPROGRESS=true
+      # Issue assigned to another agent — skip stale checks but fall through to backlog
    fi
-  else
-    # Check assignee before adopting orphaned issue
-    ISSUE_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-      "${API}/issues/${ISSUE_NUM}") || true
-    ASSIGNEE=$(echo "$ISSUE_JSON" | jq -r '.assignee.login // ""') || true
+  fi

-    if [ -n "$ASSIGNEE" ] && [ "$ASSIGNEE" != "$BOT_USER" ]; then
-      log "issue #${ISSUE_NUM} assigned to ${ASSIGNEE} — skipping (not orphaned)"
-      # Remove in-progress label since this agent isn't working on it
-      IP_ID=$(_ilc_in_progress_id)
-      curl -sf -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \
-        "${API}/issues/${ISSUE_NUM}/labels/${IP_ID}" >/dev/null 2>&1 || true
-      exit 0
+  # Only proceed with in-progress checks if not blocked by this agent's own work
+  if [ "$BLOCKED_BY_INPROGRESS" = false ] && [ "$OTHER_AGENT_INPROGRESS" = false ]; then
+    # Check for dev-agent lock file (agent may be running in another container)
+    LOCK_FILE="/tmp/dev-impl-summary-${PROJECT_NAME}-${ISSUE_NUM}.txt"
+    if [ -f "$LOCK_FILE" ]; then
+      log "issue #${ISSUE_NUM} has agent lock file — trusting active work"
+      BLOCKED_BY_INPROGRESS=true
    fi

-    log "recovering orphaned issue #${ISSUE_NUM} (no PR found, assigned to ${BOT_USER:-unassigned})"
-    nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
-    log "started dev-agent PID $! for issue #${ISSUE_NUM} (recovery)"
+    if [ "$OPEN_PR" = false ] && [ "$BLOCKED_BY_INPROGRESS" = false ]; then
+      # Grace period: skip if in-progress label was added <60s ago (issue #471)
+      if in_progress_recently_added "$ISSUE_NUM" 60; then
+        log "issue #${ISSUE_NUM} in-progress label added <60s ago — skipping stale detection (grace period)"
+        BLOCKED_BY_INPROGRESS=true
+      else
+        log "issue #${ISSUE_NUM} is stale (no assignee, no open PR, no agent lock) — relabeling to blocked"
+        relabel_stale_issue "$ISSUE_NUM" "no_assignee_no_open_pr_no_lock"
+        BLOCKED_BY_INPROGRESS=true
+      fi
+    fi
+
+    # Check if there's already an open PR for this issue
+    if [ "$BLOCKED_BY_INPROGRESS" = false ]; then
+      HAS_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+        "${API}/pulls?state=open&limit=20" | \
+        jq -r --arg branch "fix/issue-${ISSUE_NUM}" \
+        '.[] | select(.head.ref == $branch) | .number' | head -1) || true
+
+      if [ -n "$HAS_PR" ]; then
+        # Check if branch is stale (behind primary branch)
+        BRANCH="fix/issue-${ISSUE_NUM}"
+        AHEAD=$(git rev-list --count "origin/${BRANCH}..origin/${PRIMARY_BRANCH}" 2>/dev/null || echo "0")
+        if [ "$AHEAD" -gt 0 ]; then
+          log "issue #${ISSUE_NUM} PR #${HAS_PR} is $AHEAD commits behind ${PRIMARY_BRANCH} — abandoning stale PR"
+          # Close the PR via API
+          curl -sf -X PATCH \
+            -H "Authorization: token ${FORGE_TOKEN}" \
+            -H "Content-Type: application/json" \
+            "${API}/pulls/${HAS_PR}" \
+            -d '{"state":"closed"}' >/dev/null 2>&1 || true
+          # Delete the branch via git push
+          git -C "${PROJECT_REPO_ROOT:-}" push origin --delete "${BRANCH}" 2>/dev/null || true
+          # Reset to fresh start on primary branch
+          git -C "${PROJECT_REPO_ROOT:-}" checkout "${PRIMARY_BRANCH}" 2>/dev/null || true
+          git -C "${PROJECT_REPO_ROOT:-}" pull --ff-only origin "${PRIMARY_BRANCH}" 2>/dev/null || true
+          BLOCKED_BY_INPROGRESS=true
+        fi
+
+        # Only process PR if not abandoned (stale branch check above)
+        if [ "$BLOCKED_BY_INPROGRESS" = false ]; then
+          PR_SHA=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+            "${API}/pulls/${HAS_PR}" | jq -r '.head.sha') || true
+          CI_STATE=$(ci_commit_status "$PR_SHA") || true
+
+          # Non-code PRs (docs, formulas, evidence) may have no CI — treat as passed
+          if ! ci_passed "$CI_STATE" && ! ci_required_for_pr "$HAS_PR"; then
+            CI_STATE="success"
+            log "PR #${HAS_PR} has no code files — treating CI as passed"
+          fi
+
+          # Check formal reviews (single fetch to avoid race window)
+          REVIEWS_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+            "${API}/pulls/${HAS_PR}/reviews") || true
+          HAS_APPROVE=$(echo "$REVIEWS_JSON" | \
+            jq -r '[.[] | select(.state == "APPROVED") | select(.stale == false)] | length') || true
+          HAS_CHANGES=$(echo "$REVIEWS_JSON" | \
+            jq -r '[.[] | select(.state == "REQUEST_CHANGES") | select(.stale == false)] | length') || true
+
+          if ci_passed "$CI_STATE" && [ "${HAS_APPROVE:-0}" -gt 0 ]; then
+            if try_direct_merge "$HAS_PR" "$ISSUE_NUM"; then
+              BLOCKED_BY_INPROGRESS=true
+            else
+              # Direct merge failed (conflicts?) — fall back to dev-agent
+              log "falling back to dev-agent for PR #${HAS_PR} merge"
+              ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
+              log "started dev-agent PID $! for issue #${ISSUE_NUM} (agent-merge)"
+              BLOCKED_BY_INPROGRESS=true
+            fi
+
+          # Do NOT gate REQUEST_CHANGES on ci_passed: act immediately even if CI is
+          # pending/unknown. Definitive CI failure is handled by the elif below.
+          elif [ "${HAS_CHANGES:-0}" -gt 0 ] && { ci_passed "$CI_STATE" || [ "$CI_STATE" = "pending" ] || [ "$CI_STATE" = "unknown" ] || [ -z "$CI_STATE" ]; }; then
+            # Check if issue is assigned to this agent — skip if assigned to another bot
+            ISSUE_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+              "${API}/issues/${ISSUE_NUM}") || true
+            assignee=$(echo "$ISSUE_JSON" | jq -r '.assignee.login // ""') || true
+            if [ -n "$assignee" ] && [ "$assignee" != "$BOT_USER" ]; then
+              log "issue #${ISSUE_NUM} PR #${HAS_PR} REQUEST_CHANGES but assigned to ${assignee} — skipping"
+              # Don't block — fall through to backlog
+              BLOCKED_BY_INPROGRESS=false
+            else
+              log "issue #${ISSUE_NUM} PR #${HAS_PR} has REQUEST_CHANGES — spawning agent"
+              ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
+              log "started dev-agent PID $! for issue #${ISSUE_NUM} (review fix)"
+              BLOCKED_BY_INPROGRESS=true
+            fi
+
+          elif ci_failed "$CI_STATE"; then
+            if handle_ci_exhaustion "$HAS_PR" "$ISSUE_NUM" "check_only"; then
+              # Fall through to backlog scan instead of exit
+              :
+            else
+              # Increment at actual launch time (not on guard-hit paths)
+              if handle_ci_exhaustion "$HAS_PR" "$ISSUE_NUM"; then
+                BLOCKED_BY_INPROGRESS=true  # exhausted between check and launch
+              else
+                log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)"
+                ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
+                log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
+                BLOCKED_BY_INPROGRESS=true
+              fi
+            fi
+
+          else
+            log "issue #${ISSUE_NUM} has open PR #${HAS_PR} (CI: ${CI_STATE}, waiting)"
+            BLOCKED_BY_INPROGRESS=true
+          fi
+        fi
+      else
+        # Check assignee before adopting orphaned issue
+        ISSUE_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+          "${API}/issues/${ISSUE_NUM}") || true
+        ASSIGNEE=$(echo "$ISSUE_JSON" | jq -r '.assignee.login // ""') || true
+
+        if [ -n "$ASSIGNEE" ] && [ "$ASSIGNEE" != "$BOT_USER" ]; then
+          log "issue #${ISSUE_NUM} assigned to ${ASSIGNEE} — skipping (not orphaned)"
+          # Remove in-progress label since this agent isn't working on it
+          IP_ID=$(_ilc_in_progress_id)
+          curl -sf -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \
+            "${API}/issues/${ISSUE_NUM}/labels/${IP_ID}" >/dev/null 2>&1 || true
+          # Don't block — fall through to backlog
+        else
+          log "recovering orphaned issue #${ISSUE_NUM} (no PR found, assigned to ${BOT_USER:-unassigned})"
+          ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
+          log "started dev-agent PID $! for issue #${ISSUE_NUM} (recovery)"
+          BLOCKED_BY_INPROGRESS=true
+        fi
+      fi
+    fi
+  fi
+
+  # If blocked by in-progress work, exit now
+  if [ "$BLOCKED_BY_INPROGRESS" = true ]; then
    exit 0
  fi
 fi
@ -469,7 +686,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
    fi
    # Direct merge failed (conflicts?) — fall back to dev-agent
    log "falling back to dev-agent for PR #${PR_NUM} merge"
-    nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
+    ("${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1) &
    log "started dev-agent PID $! for stuck PR #${PR_NUM} (agent-merge)"
    exit 0
  fi
@ -481,8 +698,16 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do

  # Stuck: REQUEST_CHANGES or CI failure -> spawn agent
  if [ "${HAS_CHANGES:-0}" -gt 0 ] && { ci_passed "$CI_STATE" || [ "$CI_STATE" = "pending" ] || [ "$CI_STATE" = "unknown" ] || [ -z "$CI_STATE" ]; }; then
+    # Check if issue is assigned to this agent — skip if assigned to another bot
+    ISSUE_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+      "${API}/issues/${STUCK_ISSUE}") || true
+    assignee=$(echo "$ISSUE_JSON" | jq -r '.assignee.login // ""') || true
+    if [ -n "$assignee" ] && [ "$assignee" != "$BOT_USER" ]; then
+      log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) REQUEST_CHANGES but assigned to ${assignee} — skipping"
+      continue  # skip this PR, check next stuck PR or fall through to backlog
+    fi
    log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) has REQUEST_CHANGES — fixing first"
-    nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
+    ("${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1) &
    log "started dev-agent PID $! for stuck PR #${PR_NUM}"
    exit 0
  elif ci_failed "$CI_STATE"; then
@ -494,7 +719,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
      continue  # exhausted between check and launch
    fi
    log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt ${CI_FIX_ATTEMPTS}/3)"
-    nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
+    ("${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1) &
    log "started dev-agent PID $! for stuck PR #${PR_NUM}"
    exit 0
  fi
@ -543,6 +768,15 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
  ISSUE_NUM=$(echo "$BACKLOG_JSON" | jq -r ".[$i].number")
  ISSUE_BODY=$(echo "$BACKLOG_JSON" | jq -r ".[$i].body // \"\"")

+  # Check assignee before claiming — skip if assigned to another bot
+  ISSUE_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+    "${API}/issues/${ISSUE_NUM}") || true
+  ASSIGNEE=$(echo "$ISSUE_JSON" | jq -r '.assignee.login // ""') || true
+  if [ -n "$ASSIGNEE" ] && [ "$ASSIGNEE" != "$BOT_USER" ]; then
+    log "  #${ISSUE_NUM} assigned to ${ASSIGNEE} — skipping"
+    continue
+  fi
+
  # Formula guard: formula-labeled issues must not be picked up by dev-agent.
  ISSUE_LABELS=$(echo "$BACKLOG_JSON" | jq -r ".[$i].labels[].name" 2>/dev/null) || true
  SKIP_LABEL=$(echo "$ISSUE_LABELS" | grep -oE '^(formula|prediction/dismissed|prediction/unreviewed)$' | head -1) || true
@ -562,6 +796,26 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
    '.[] | select((.head.ref == $branch) or (.title | contains($num))) | .number' | head -1) || true

  if [ -n "$EXISTING_PR" ]; then
+    # Check if branch is stale (behind primary branch)
+    BRANCH="fix/issue-${ISSUE_NUM}"
+    AHEAD=$(git rev-list --count "origin/${BRANCH}..origin/${PRIMARY_BRANCH}" 2>/dev/null || echo "0")
+    if [ "$AHEAD" -gt 0 ]; then
+      log "issue #${ISSUE_NUM} PR #${EXISTING_PR} is $AHEAD commits behind ${PRIMARY_BRANCH} — abandoning stale PR"
+      # Close the PR via API
+      curl -sf -X PATCH \
+        -H "Authorization: token ${FORGE_TOKEN}" \
+        -H "Content-Type: application/json" \
+        "${API}/pulls/${EXISTING_PR}" \
+        -d '{"state":"closed"}' >/dev/null 2>&1 || true
+      # Delete the branch via git push
+      git -C "${PROJECT_REPO_ROOT:-}" push origin --delete "${BRANCH}" 2>/dev/null || true
+      # Reset to fresh start on primary branch
+      git -C "${PROJECT_REPO_ROOT:-}" checkout "${PRIMARY_BRANCH}" 2>/dev/null || true
+      git -C "${PROJECT_REPO_ROOT:-}" pull --ff-only origin "${PRIMARY_BRANCH}" 2>/dev/null || true
+      # Continue to find another ready issue
+      continue
+    fi
+
    PR_SHA=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
      "${API}/pulls/${EXISTING_PR}" | jq -r '.head.sha') || true
    CI_STATE=$(ci_commit_status "$PR_SHA") || true
@ -586,7 +840,7 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
      fi
      # Direct merge failed (conflicts?) — fall back to dev-agent
      log "falling back to dev-agent for PR #${EXISTING_PR} merge"
-      nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
+      ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
      log "started dev-agent PID $! for issue #${ISSUE_NUM} (agent-merge)"
      exit 0

@ -619,9 +873,32 @@ done

 # Single-threaded per project: if any issue has an open PR waiting for review/CI,
 # don't start new work — let the pipeline drain first
+# But only block on PRs assigned to this agent (per-agent logic from #358)
 if [ -n "$READY_ISSUE" ] && [ -n "${WAITING_PRS:-}" ]; then
-  log "holding #${READY_ISSUE} — waiting for open PR(s) to land first: ${WAITING_PRS}"
-  exit 0
+  # Filter to only this agent's waiting PRs
+  MY_WAITING_PRS=""
+  for pr_num in $(echo "$WAITING_PRS" | tr ',' ' '); do
+    pr_num="${pr_num#\#}"  # Remove leading #
+    # Check if this PR's issue is assigned to this agent
+    pr_info=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+      "${API}/pulls/${pr_num}" 2>/dev/null) || true
+    pr_branch=$(echo "$pr_info" | jq -r '.head.ref') || true
+    issue_num=$(echo "$pr_branch" | grep -oP '(?<=fix/issue-)\d+' || true)
+    if [ -z "$issue_num" ]; then
+      continue
+    fi
+    issue_assignee=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+      "${API}/issues/${issue_num}" 2>/dev/null | jq -r '.assignee.login // ""') || true
+    if [ -n "$issue_assignee" ] && [ "$issue_assignee" = "$BOT_USER" ]; then
+      MY_WAITING_PRS="${MY_WAITING_PRS:-}${MY_WAITING_PRS:+, }#${pr_num}"
+    fi
+  done
+
+  if [ -n "$MY_WAITING_PRS" ]; then
+    log "holding #${READY_ISSUE} — waiting for my open PR(s) to land first: ${MY_WAITING_PRS}"
+    exit 0
+  fi
+  log "other agents' PRs waiting: ${WAITING_PRS} — proceeding with #${READY_ISSUE}"
 fi

 if [ -z "$READY_ISSUE" ]; then
@ -641,5 +918,5 @@ if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then
 fi

 log "launching dev-agent for #${READY_ISSUE}"
-nohup "${SCRIPT_DIR}/dev-agent.sh" "$READY_ISSUE" >> "$LOGFILE" 2>&1 &
+("${SCRIPT_DIR}/dev-agent.sh" "$READY_ISSUE" >> "$LOGFILE" 2>&1) &
 log "started dev-agent PID $! for issue #${READY_ISSUE}"
--- a/dev/phase-handler.sh
+++ b/dev/phase-handler.sh
@ -1,820 +0,0 @@
-#!/usr/bin/env bash
-# dev/phase-handler.sh — Phase callback functions for dev-agent.sh
-#
-# Source this file from agent orchestrators after lib/agent-session.sh is loaded.
-# Defines: post_refusal_comment(), _on_phase_change(), build_phase_protocol_prompt()
-#
-# Required globals (set by calling agent before or after sourcing):
-#   ISSUE, FORGE_TOKEN, API, FORGE_WEB, PROJECT_NAME, FACTORY_ROOT
-#   BRANCH, PHASE_FILE, WORKTREE, IMPL_SUMMARY_FILE
-#   PRIMARY_BRANCH, SESSION_NAME, LOGFILE, ISSUE_TITLE
-#   WOODPECKER_REPO_ID, WOODPECKER_TOKEN, WOODPECKER_SERVER
-#
-# Globals with defaults (agents can override after sourcing):
-#   PR_NUMBER, CI_POLL_TIMEOUT, MAX_CI_FIXES, MAX_REVIEW_ROUNDS,
-#   REVIEW_POLL_TIMEOUT, CI_RETRY_COUNT, CI_FIX_COUNT, REVIEW_ROUND,
-#   CLAIMED, PHASE_POLL_INTERVAL
-#
-# Calls back to agent-defined helpers:
-#   cleanup_worktree(), cleanup_labels(), status(), log()
-#
-# shellcheck shell=bash
-# shellcheck disable=SC2154  # globals are set in dev-agent.sh before calling
-# shellcheck disable=SC2034  # CLAIMED is read by cleanup() in dev-agent.sh
-
-# Load secret scanner for redacting tmux output before posting to issues
-# shellcheck source=../lib/secret-scan.sh
-source "$(dirname "${BASH_SOURCE[0]}")/../lib/secret-scan.sh"
-
-# Load shared CI helpers (is_infra_step, classify_pipeline_failure, etc.)
-# shellcheck source=../lib/ci-helpers.sh
-source "$(dirname "${BASH_SOURCE[0]}")/../lib/ci-helpers.sh"
-
-# Load mirror push helper
-# shellcheck source=../lib/mirrors.sh
-source "$(dirname "${BASH_SOURCE[0]}")/../lib/mirrors.sh"
-
-# --- Default callback stubs (agents can override after sourcing) ---
-# cleanup_worktree and cleanup_labels are called during phase transitions.
-# Provide no-op defaults so phase-handler.sh is self-contained; sourcing
-# agents override these with real implementations.
-if ! declare -f cleanup_worktree >/dev/null 2>&1; then
-  cleanup_worktree() { :; }
-fi
-if ! declare -f cleanup_labels >/dev/null 2>&1; then
-  cleanup_labels() { :; }
-fi
-
-# --- Default globals (agents can override after sourcing) ---
-: "${CI_POLL_TIMEOUT:=1800}"
-: "${REVIEW_POLL_TIMEOUT:=10800}"
-: "${MAX_CI_FIXES:=3}"
-: "${MAX_REVIEW_ROUNDS:=5}"
-: "${CI_RETRY_COUNT:=0}"
-: "${CI_FIX_COUNT:=0}"
-: "${REVIEW_ROUND:=0}"
-: "${PR_NUMBER:=}"
-: "${CLAIMED:=false}"
-: "${PHASE_POLL_INTERVAL:=30}"
-
-# --- Post diagnostic comment + label issue as blocked ---
-# Captures tmux pane output, posts a structured comment on the issue, removes
-# in-progress label, and adds the "blocked" label.
-#
-# Args: reason [session_name]
-# Uses globals: ISSUE, SESSION_NAME, PR_NUMBER, FORGE_TOKEN, API
-post_blocked_diagnostic() {
-  local reason="$1"
-  local session="${2:-${SESSION_NAME:-}}"
-
-  # Capture last 50 lines from tmux pane (before kill)
-  local tmux_output=""
-  if [ -n "$session" ] && tmux has-session -t "$session" 2>/dev/null; then
-    tmux_output=$(tmux capture-pane -p -t "$session" -S -50 2>/dev/null || true)
-  fi
-
-  # Redact any secrets from tmux output before posting to issue
-  if [ -n "$tmux_output" ]; then
-    tmux_output=$(redact_secrets "$tmux_output")
-  fi
-
-  # Build diagnostic comment body
-  local comment
-  comment="### Session failure diagnostic
-
-| Field | Value |
-|---|---|
-| Exit reason | \`${reason}\` |
-| Timestamp | \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\` |"
-  [ -n "${PR_NUMBER:-}" ] && [ "${PR_NUMBER:-0}" != "0" ] && \
-    comment="${comment}
-| PR | #${PR_NUMBER} |"
-
-  if [ -n "$tmux_output" ]; then
-    comment="${comment}
-
-<details><summary>Last 50 lines from tmux pane</summary>
-
-\`\`\`
-${tmux_output}
-\`\`\`
-</details>"
-  fi
-
-  # Post comment to issue
-  curl -sf -X POST \
-    -H "Authorization: token ${FORGE_TOKEN}" \
-    -H "Content-Type: application/json" \
-    "${API}/issues/${ISSUE}/comments" \
-    -d "$(jq -nc --arg b "$comment" '{body:$b}')" >/dev/null 2>&1 || true
-
-  # Remove in-progress, add blocked
-  cleanup_labels
-  local blocked_id
-  blocked_id=$(ensure_blocked_label_id)
-  if [ -n "$blocked_id" ]; then
-    curl -sf -X POST \
-      -H "Authorization: token ${FORGE_TOKEN}" \
-      -H "Content-Type: application/json" \
-      "${API}/issues/${ISSUE}/labels" \
-      -d "{\"labels\":[${blocked_id}]}" >/dev/null 2>&1 || true
-  fi
-  CLAIMED=false
-  _BLOCKED_POSTED=true
-}
-
-# --- Build phase protocol prompt (shared across agents) ---
-# Generates the phase-signaling instructions for Claude prompts.
-# Args: phase_file summary_file branch [remote]
-# Output: The protocol text (stdout)
-build_phase_protocol_prompt() {
-  local _pf="$1" _sf="$2" _br="$3" _remote="${4:-${FORGE_REMOTE:-origin}}"
-  cat <<_PHASE_PROTOCOL_EOF_
-## Phase-Signaling Protocol (REQUIRED)
-
-You are running in a persistent tmux session managed by an orchestrator.
-Communicate progress by writing to the phase file. The orchestrator watches
-this file and injects events (CI results, review feedback) back into this session.
-
-### Key files
-\`\`\`
-PHASE_FILE="${_pf}"
-SUMMARY_FILE="${_sf}"
-\`\`\`
-
-### Phase transitions — write these exactly:
-
-**After committing and pushing your branch:**
-\`\`\`bash
-# Rebase on target branch before push to avoid merge conflicts
-git fetch ${_remote} ${PRIMARY_BRANCH} && git rebase ${_remote}/${PRIMARY_BRANCH}
-git push ${_remote} ${_br}
-# Write a short summary of what you implemented:
-printf '%s' "<your summary>" > "\${SUMMARY_FILE}"
-# Signal the orchestrator to create the PR and watch for CI:
-echo "PHASE:awaiting_ci" > "${_pf}"
-\`\`\`
-Then STOP and wait. The orchestrator will inject CI results.
-
-**When you receive a "CI passed" injection:**
-\`\`\`bash
-echo "PHASE:awaiting_review" > "${_pf}"
-\`\`\`
-Then STOP and wait. The orchestrator will inject review feedback.
-
-**When you receive a "CI failed:" injection:**
-Fix the CI issue, then rebase on target branch and push:
-\`\`\`bash
-git fetch ${_remote} ${PRIMARY_BRANCH} && git rebase ${_remote}/${PRIMARY_BRANCH}
-git push --force-with-lease ${_remote} ${_br}
-echo "PHASE:awaiting_ci" > "${_pf}"
-\`\`\`
-Then STOP and wait.
-
-**When you receive a "Review: REQUEST_CHANGES" injection:**
-Address ALL review feedback, then rebase on target branch and push:
-\`\`\`bash
-git fetch ${_remote} ${PRIMARY_BRANCH} && git rebase ${_remote}/${PRIMARY_BRANCH}
-git push --force-with-lease ${_remote} ${_br}
-echo "PHASE:awaiting_ci" > "${_pf}"
-\`\`\`
-(CI runs again after each push — always write awaiting_ci, not awaiting_review)
-
-**When you need human help (CI exhausted, merge blocked, stuck on a decision):**
-\`\`\`bash
-printf 'PHASE:escalate\nReason: %s\n' "describe what you need" > "${_pf}"
-\`\`\`
-Then STOP and wait. A human will review and respond via the forge.
-
-**On unrecoverable failure:**
-\`\`\`bash
-printf 'PHASE:failed\nReason: %s\n' "describe what failed" > "${_pf}"
-\`\`\`
-_PHASE_PROTOCOL_EOF_
-}
-
-# --- Merge helper ---
-# do_merge — attempt to merge PR via forge API.
-# Args: pr_num
-# Returns:
-#   0 = merged successfully
-#   1 = other failure (conflict, network error, etc.)
-#   2 = not enough approvals (HTTP 405) — PHASE:escalate already written
-do_merge() {
-  local pr_num="$1"
-  local merge_response merge_http_code merge_body
-  merge_response=$(curl -s -w "\n%{http_code}" -X POST \
-    -H "Authorization: token ${FORGE_TOKEN}" \
-    -H 'Content-Type: application/json' \
-    "${API}/pulls/${pr_num}/merge" \
-    -d '{"Do":"merge","delete_branch_after_merge":true}') || true
-  merge_http_code=$(echo "$merge_response" | tail -1)
-  merge_body=$(echo "$merge_response" | sed '$d')
-
-  if [ "$merge_http_code" = "200" ] || [ "$merge_http_code" = "204" ]; then
-    log "do_merge: PR #${pr_num} merged (HTTP ${merge_http_code})"
-    return 0
-  fi
-
-  # HTTP 405 — could be "merge requirements not met" OR "already merged" (race with dev-poll).
-  # Before escalating, check whether the PR was already merged by another agent.
-  if [ "$merge_http_code" = "405" ]; then
-    local pr_state
-    pr_state=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-      "${API}/pulls/${pr_num}" | jq -r '.merged // false') || pr_state="false"
-    if [ "$pr_state" = "true" ]; then
-      log "do_merge: PR #${pr_num} already merged (detected after HTTP 405) — treating as success"
-      return 0
-    fi
-    log "do_merge: PR #${pr_num} blocked — merge requirements not met (HTTP 405): ${merge_body:0:200}"
-    printf 'PHASE:escalate\nReason: %s\n' \
-      "PR #${pr_num} merge blocked — merge requirements not met (HTTP 405): ${merge_body:0:200}" \
-      > "$PHASE_FILE"
-    return 2
-  fi
-
-  log "do_merge: PR #${pr_num} merge failed (HTTP ${merge_http_code}): ${merge_body:0:200}"
-  return 1
-}
-
-# --- Refusal comment helper ---
-post_refusal_comment() {
-  local emoji="$1" title="$2" body="$3"
-  local last_has_title
-  last_has_title=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-    "${API}/issues/${ISSUE}/comments?limit=5" | \
-    jq -r --arg t "Dev-agent: ${title}" '[.[] | .body // ""] | any(contains($t)) | tostring') || true
-  if [ "$last_has_title" = "true" ]; then
-    log "skipping duplicate refusal comment: ${title}"
-    return 0
-  fi
-  local comment
-  comment="${emoji} **Dev-agent: ${title}**
-
-${body}
-
---
-*Automated assessment by dev-agent · $(date -u '+%Y-%m-%d %H:%M UTC')*"
-  printf '%s' "$comment" > "/tmp/refusal-comment.txt"
-  jq -Rs '{body: .}' < "/tmp/refusal-comment.txt" > "/tmp/refusal-comment.json"
-  curl -sf -o /dev/null -X POST \
-    -H "Authorization: token ${FORGE_TOKEN}" \
-    -H "Content-Type: application/json" \
-    "${API}/issues/${ISSUE}/comments" \
-    --data-binary @"/tmp/refusal-comment.json" 2>/dev/null || \
-    log "WARNING: failed to post refusal comment"
-  rm -f "/tmp/refusal-comment.txt" "/tmp/refusal-comment.json"
-}
-
-# =============================================================================
-# PHASE DISPATCH CALLBACK
-# =============================================================================
-
-# _on_phase_change — Phase dispatch callback for monitor_phase_loop
-# Receives the current phase as $1.
-# Returns 0 to continue the loop, 1 to break (terminal phase reached).
-_on_phase_change() {
-  local phase="$1"
-
-  # ── PHASE: awaiting_ci ──────────────────────────────────────────────────────
-  if [ "$phase" = "PHASE:awaiting_ci" ]; then
-    # Release session lock — Claude is idle during CI polling (#724)
-    session_lock_release
-
-    # Create PR if not yet created
-    if [ -z "${PR_NUMBER:-}" ]; then
-      status "creating PR for issue #${ISSUE}"
-      IMPL_SUMMARY=""
-      if [ -f "$IMPL_SUMMARY_FILE" ]; then
-        # Don't treat refusal JSON as a PR summary
-        if ! jq -e '.status' < "$IMPL_SUMMARY_FILE" >/dev/null 2>&1; then
-          IMPL_SUMMARY=$(head -c 4000 "$IMPL_SUMMARY_FILE")
-        fi
-      fi
-
-      printf 'Fixes #%s\n\n## Changes\n%s' "$ISSUE" "$IMPL_SUMMARY" > "/tmp/pr-body-${ISSUE}.txt"
-      jq -n \
-        --arg title "fix: ${ISSUE_TITLE} (#${ISSUE})" \
-        --rawfile body "/tmp/pr-body-${ISSUE}.txt" \
-        --arg head "$BRANCH" \
-        --arg base "${PRIMARY_BRANCH}" \
-        '{title: $title, body: $body, head: $head, base: $base}' > "/tmp/pr-request-${ISSUE}.json"
-
-      PR_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST \
-        -H "Authorization: token ${FORGE_TOKEN}" \
-        -H "Content-Type: application/json" \
-        "${API}/pulls" \
-        --data-binary @"/tmp/pr-request-${ISSUE}.json")
-
-      PR_HTTP_CODE=$(echo "$PR_RESPONSE" | tail -1)
-      PR_RESPONSE_BODY=$(echo "$PR_RESPONSE" | sed '$d')
-      rm -f "/tmp/pr-body-${ISSUE}.txt" "/tmp/pr-request-${ISSUE}.json"
-
-      if [ "$PR_HTTP_CODE" = "201" ] || [ "$PR_HTTP_CODE" = "200" ]; then
-        PR_NUMBER=$(echo "$PR_RESPONSE_BODY" | jq -r '.number')
-        log "created PR #${PR_NUMBER}"
-      elif [ "$PR_HTTP_CODE" = "409" ]; then
-        # PR already exists (race condition) — find it
-        FOUND_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-          "${API}/pulls?state=open&limit=20" | \
-          jq -r --arg branch "$BRANCH" \
-          '.[] | select(.head.ref == $branch) | .number' | head -1) || true
-        if [ -n "$FOUND_PR" ]; then
-          PR_NUMBER="$FOUND_PR"
-          log "PR already exists: #${PR_NUMBER}"
-        else
-          log "ERROR: PR creation got 409 but no existing PR found"
-          agent_inject_into_session "$SESSION_NAME" "ERROR: Could not create PR (HTTP 409, no existing PR found). Check the forge API. Retry by writing PHASE:awaiting_ci again after verifying the branch was pushed."
-          return 0
-        fi
-      else
-        log "ERROR: PR creation failed (HTTP ${PR_HTTP_CODE})"
-        agent_inject_into_session "$SESSION_NAME" "ERROR: Could not create PR (HTTP ${PR_HTTP_CODE}). Check branch was pushed: git push ${FORGE_REMOTE:-origin} ${BRANCH}. Then write PHASE:awaiting_ci again."
-        return 0
-      fi
-    fi
-
-    # No CI configured? Treat as success immediately
-    if [ "${WOODPECKER_REPO_ID:-2}" = "0" ]; then
-      log "no CI configured — treating as passed"
-      agent_inject_into_session "$SESSION_NAME" "CI passed on PR #${PR_NUMBER} (no CI configured for this project).
-Write PHASE:awaiting_review to the phase file, then stop and wait for review feedback."
-      return 0
-    fi
-
-    # Poll CI until done or timeout
-    status "waiting for CI on PR #${PR_NUMBER}"
-    CI_CURRENT_SHA=$(git -C "${WORKTREE}" rev-parse HEAD 2>/dev/null || \
-      curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-        "${API}/pulls/${PR_NUMBER}" | jq -r '.head.sha')
-
-    CI_DONE=false
-    CI_STATE="unknown"
-    CI_POLL_ELAPSED=0
-    while [ "$CI_POLL_ELAPSED" -lt "$CI_POLL_TIMEOUT" ]; do
-      sleep 30
-      CI_POLL_ELAPSED=$(( CI_POLL_ELAPSED + 30 ))
-
-      # Check session still alive during CI wait (exit_marker + tmux fallback)
-      if [ -f "/tmp/claude-exited-${SESSION_NAME}.ts" ] || ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
-        log "session died during CI wait"
-        break
-      fi
-
-      # Re-fetch HEAD — Claude may have pushed new commits since loop started
-      CI_CURRENT_SHA=$(git -C "${WORKTREE}" rev-parse HEAD 2>/dev/null || echo "$CI_CURRENT_SHA")
-
-      CI_STATE=$(ci_commit_status "$CI_CURRENT_SHA")
-      if [ "$CI_STATE" = "success" ] || [ "$CI_STATE" = "failure" ] || [ "$CI_STATE" = "error" ]; then
-        CI_DONE=true
-        [ "$CI_STATE" = "success" ] && CI_FIX_COUNT=0
-        break
-      fi
-    done
-
-    if ! $CI_DONE; then
-      log "TIMEOUT: CI didn't complete in ${CI_POLL_TIMEOUT}s"
-      agent_inject_into_session "$SESSION_NAME" "CI TIMEOUT: CI did not complete within 30 minutes for PR #${PR_NUMBER} (SHA: ${CI_CURRENT_SHA:0:7}). This may be an infrastructure issue. Write PHASE:escalate if you cannot proceed."
-      return 0
-    fi
-
-    log "CI: ${CI_STATE}"
-
-    if [ "$CI_STATE" = "success" ]; then
-      agent_inject_into_session "$SESSION_NAME" "CI passed on PR #${PR_NUMBER}.
-Write PHASE:awaiting_review to the phase file, then stop and wait for review feedback:
-  echo \"PHASE:awaiting_review\" > \"${PHASE_FILE}\""
-    else
-      # Fetch CI error details
-      PIPELINE_NUM=$(ci_pipeline_number "$CI_CURRENT_SHA")
-
-      FAILED_STEP=""
-      FAILED_EXIT=""
-      IS_INFRA=false
-      if [ -n "$PIPELINE_NUM" ]; then
-        FAILED_INFO=$(curl -sf \
-          -H "Authorization: Bearer ${WOODPECKER_TOKEN}" \
-          "${WOODPECKER_SERVER}/api/repos/${WOODPECKER_REPO_ID}/pipelines/${PIPELINE_NUM}" | \
-          jq -r '.workflows[]?.children[]? | select(.state=="failure") | "\(.name)|\(.exit_code)"' | head -1 || true)
-        FAILED_STEP=$(echo "$FAILED_INFO" | cut -d'|' -f1)
-        FAILED_EXIT=$(echo "$FAILED_INFO" | cut -d'|' -f2)
-      fi
-
-      log "CI failed: step=${FAILED_STEP:-unknown} exit=${FAILED_EXIT:-?}"
-
-      if [ -n "$FAILED_STEP" ] && is_infra_step "$FAILED_STEP" "${FAILED_EXIT:-0}" >/dev/null 2>&1; then
-        IS_INFRA=true
-      fi
-
-      if [ "$IS_INFRA" = true ] && [ "${CI_RETRY_COUNT:-0}" -lt 1 ]; then
-        CI_RETRY_COUNT=$(( CI_RETRY_COUNT + 1 ))
-        log "infra failure — retrigger CI (retry ${CI_RETRY_COUNT})"
-        (cd "$WORKTREE" && git commit --allow-empty \
-          -m "ci: retrigger after infra failure (#${ISSUE})" --no-verify 2>&1 | tail -1)
-        # Rebase on target branch before push to avoid merge conflicts
-        if ! (cd "$WORKTREE" && \
-          git fetch "${FORGE_REMOTE:-origin}" "${PRIMARY_BRANCH}" 2>/dev/null && \
-          git rebase "${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH}" 2>&1 | tail -5); then
-          log "rebase conflict detected — aborting, agent must resolve"
-          (cd "$WORKTREE" && git rebase --abort 2>/dev/null || git reset --hard HEAD 2>/dev/null) || true
-          agent_inject_into_session "$SESSION_NAME" "REBASE CONFLICT: Cannot rebase onto ${PRIMARY_BRANCH} automatically.
-
-Please resolve merge conflicts manually:
-1. Check conflict status: git status
-2. Resolve conflicts in the conflicted files
-3. Stage resolved files: git add <files>
-4. Continue rebase: git rebase --continue
-
-If you cannot resolve conflicts, abort: git rebase --abort
-Then write PHASE:escalate with a reason."
-          return 0
-        fi
-        # Rebase succeeded — push the result
-        (cd "$WORKTREE" && git push --force-with-lease "${FORGE_REMOTE:-origin}" "$BRANCH" 2>&1 | tail -3)
-        # Touch phase file so we recheck CI on the new SHA
-        # Do NOT update LAST_PHASE_MTIME here — let the main loop detect the fresh mtime
-        touch "$PHASE_FILE"
-        CI_CURRENT_SHA=$(git -C "${WORKTREE}" rev-parse HEAD 2>/dev/null || true)
-        return 0
-      fi
-
-      CI_FIX_COUNT=$(( CI_FIX_COUNT + 1 ))
-      _ci_pipeline_url="${WOODPECKER_SERVER}/repos/${WOODPECKER_REPO_ID}/pipeline/${PIPELINE_NUM:-0}"
-      if [ "$CI_FIX_COUNT" -gt "$MAX_CI_FIXES" ]; then
-        log "CI failure not recoverable after ${CI_FIX_COUNT} fix attempts — escalating"
-        printf 'PHASE:escalate\nReason: ci_exhausted after %d attempts (step: %s)\n' "$CI_FIX_COUNT" "${FAILED_STEP:-unknown}" > "$PHASE_FILE"
-        # Do NOT update LAST_PHASE_MTIME here — let the main loop detect PHASE:escalate
-        return 0
-      fi
-
-      CI_ERROR_LOG=""
-      if [ -n "$PIPELINE_NUM" ]; then
-        CI_ERROR_LOG=$(bash "${FACTORY_ROOT}/lib/ci-debug.sh" failures "$PIPELINE_NUM" 2>/dev/null | tail -80 | head -c 8000 || echo "")
-      fi
-
-      # Save CI result for crash recovery
-      printf 'CI failed (attempt %d/%d)\nStep: %s\nExit: %s\n\n%s' \
-        "$CI_FIX_COUNT" "$MAX_CI_FIXES" "${FAILED_STEP:-unknown}" "${FAILED_EXIT:-?}" "$CI_ERROR_LOG" \
-        > "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt" 2>/dev/null || true
-
-      agent_inject_into_session "$SESSION_NAME" "CI failed on PR #${PR_NUMBER} (attempt ${CI_FIX_COUNT}/${MAX_CI_FIXES}).
-
-Failed step: ${FAILED_STEP:-unknown} (exit code ${FAILED_EXIT:-?}, pipeline #${PIPELINE_NUM:-?})
-
-CI debug tool:
-  bash ${FACTORY_ROOT}/lib/ci-debug.sh failures ${PIPELINE_NUM:-0}
-  bash ${FACTORY_ROOT}/lib/ci-debug.sh logs ${PIPELINE_NUM:-0} <step-name>
-
-Error snippet:
-${CI_ERROR_LOG:-No logs available. Use ci-debug.sh to query the pipeline.}
-
-Instructions:
-1. Run ci-debug.sh failures to get the full error output.
-2. Read the failing test file(s) — understand what the tests EXPECT.
-3. Fix the root cause — do NOT weaken tests.
-4. Rebase on target branch and push: git fetch ${FORGE_REMOTE:-origin} ${PRIMARY_BRANCH} && git rebase ${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH}
-  git push --force-with-lease ${FORGE_REMOTE:-origin} ${BRANCH}
-5. Write: echo \"PHASE:awaiting_ci\" > \"${PHASE_FILE}\"
-6. Stop and wait."
-    fi
-
-  # ── PHASE: awaiting_review ──────────────────────────────────────────────────
-  elif [ "$phase" = "PHASE:awaiting_review" ]; then
-    # Release session lock — Claude is idle during review wait (#724)
-    session_lock_release
-    status "waiting for review on PR #${PR_NUMBER:-?}"
-    CI_FIX_COUNT=0  # Reset CI fix budget for this review cycle
-
-    if [ -z "${PR_NUMBER:-}" ]; then
-      log "WARNING: awaiting_review but PR_NUMBER unknown — searching for PR"
-      FOUND_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-        "${API}/pulls?state=open&limit=20" | \
-        jq -r --arg branch "$BRANCH" \
-        '.[] | select(.head.ref == $branch) | .number' | head -1) || true
-      if [ -n "$FOUND_PR" ]; then
-        PR_NUMBER="$FOUND_PR"
-        log "found PR #${PR_NUMBER}"
-      else
-        agent_inject_into_session "$SESSION_NAME" "ERROR: Cannot find open PR for branch ${BRANCH}. Did you push? Verify with git status and git push ${FORGE_REMOTE:-origin} ${BRANCH}, then write PHASE:awaiting_ci."
-        return 0
-      fi
-    fi
-
-    REVIEW_POLL_ELAPSED=0
-    REVIEW_FOUND=false
-    while [ "$REVIEW_POLL_ELAPSED" -lt "$REVIEW_POLL_TIMEOUT" ]; do
-      sleep 300  # 5 min between review checks
-      REVIEW_POLL_ELAPSED=$(( REVIEW_POLL_ELAPSED + 300 ))
-
-      # Check session still alive (exit_marker + tmux fallback)
-      if [ -f "/tmp/claude-exited-${SESSION_NAME}.ts" ] || ! tmux has-session -t "${SESSION_NAME}" 2>/dev/null; then
-        log "session died during review wait"
-        REVIEW_FOUND=false
-        break
-      fi
-
-      # Check if phase was updated while we wait (e.g., Claude reacted to something)
-      NEW_MTIME=$(stat -c %Y "$PHASE_FILE" 2>/dev/null || echo 0)
-      if [ "$NEW_MTIME" -gt "$LAST_PHASE_MTIME" ]; then
-        log "phase file updated during review wait — re-entering main loop"
-        # Do NOT update LAST_PHASE_MTIME here — leave it stale so the outer
-        # loop detects the change on its next tick and dispatches the new phase.
-        REVIEW_FOUND=true  # Prevent timeout injection
-        # Clean up review-poll sentinel if it exists (session already advanced)
-        rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
-        break
-      fi
-
-      REVIEW_SHA=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-        "${API}/pulls/${PR_NUMBER}" | jq -r '.head.sha') || true
-      REVIEW_COMMENT=$(forge_api_all "/issues/${PR_NUMBER}/comments" | \
-        jq -r --arg sha "$REVIEW_SHA" \
-        '[.[] | select(.body | contains("<!-- reviewed: " + $sha))] | last // empty') || true
-
-      if [ -n "$REVIEW_COMMENT" ] && [ "$REVIEW_COMMENT" != "null" ]; then
-        REVIEW_TEXT=$(echo "$REVIEW_COMMENT" | jq -r '.body')
-
-        # Skip error reviews — they have no verdict
-        if echo "$REVIEW_TEXT" | grep -q "review-error\|Review — Error"; then
-          log "review was an error, waiting for re-review"
-          continue
-        fi
-
-        VERDICT=$(echo "$REVIEW_TEXT" | grep -oP '\*\*(APPROVE|REQUEST_CHANGES|DISCUSS)\*\*' | head -1 | tr -d '*' || true)
-        log "review verdict: ${VERDICT:-unknown}"
-
-        # Also check formal forge reviews
-        if [ -z "$VERDICT" ]; then
-          VERDICT=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-            "${API}/pulls/${PR_NUMBER}/reviews" | \
-            jq -r '[.[] | select(.stale == false)] | last | .state // empty' || true)
-          if [ "$VERDICT" = "APPROVED" ]; then
-            VERDICT="APPROVE"
-          elif [ "$VERDICT" != "REQUEST_CHANGES" ]; then
-            VERDICT=""
-          fi
-          [ -n "$VERDICT" ] && log "verdict from formal review: $VERDICT"
-        fi
-
-        # Skip injection if review-poll.sh already injected (sentinel present).
-        # Exception: APPROVE always falls through so do_merge() runs even when
-        # review-poll injected first — prevents Claude writing PHASE:done on a
-        # failed merge without the orchestrator detecting the error.
-        REVIEW_SENTINEL="/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
-        if [ -n "$VERDICT" ] && [ -f "$REVIEW_SENTINEL" ] && [ "$VERDICT" != "APPROVE" ]; then
-          log "review already injected by review-poll (sentinel exists) — skipping"
-          rm -f "$REVIEW_SENTINEL"
-          REVIEW_FOUND=true
-          break
-        fi
-        rm -f "$REVIEW_SENTINEL"  # consume sentinel before APPROVE handling below
-
-        if [ "$VERDICT" = "APPROVE" ]; then
-          REVIEW_FOUND=true
-          _merge_rc=0; do_merge "$PR_NUMBER" || _merge_rc=$?
-          if [ "$_merge_rc" -eq 0 ]; then
-            # Merge succeeded — close issue and signal done
-            curl -sf -X PATCH \
-              -H "Authorization: token ${FORGE_TOKEN}" \
-              -H 'Content-Type: application/json' \
-              "${API}/issues/${ISSUE}" \
-              -d '{"state":"closed"}' >/dev/null 2>&1 || true
-            # Pull merged primary branch and push to mirrors
-            git -C "$PROJECT_REPO_ROOT" fetch "${FORGE_REMOTE:-origin}" "$PRIMARY_BRANCH" 2>/dev/null || true
-            git -C "$PROJECT_REPO_ROOT" checkout "$PRIMARY_BRANCH" 2>/dev/null || true
-            git -C "$PROJECT_REPO_ROOT" pull --ff-only "${FORGE_REMOTE:-origin}" "$PRIMARY_BRANCH" 2>/dev/null || true
-            mirror_push
-            printf 'PHASE:done\n' > "$PHASE_FILE"
-          elif [ "$_merge_rc" -ne 2 ]; then
-            # Other merge failure (conflict, etc.) — delegate to Claude for rebase + retry
-            agent_inject_into_session "$SESSION_NAME" "Approved! PR #${PR_NUMBER} has been approved, but the merge failed (likely conflicts).
-
-Rebase onto ${PRIMARY_BRANCH} and push:
-  git fetch ${FORGE_REMOTE:-origin} ${PRIMARY_BRANCH} && git rebase ${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH}
-  git push --force-with-lease ${FORGE_REMOTE:-origin} ${BRANCH}
-  echo \"PHASE:awaiting_ci\" > \"${PHASE_FILE}\"
-
-Do NOT merge or close the issue — the orchestrator handles that after CI passes.
-If rebase repeatedly fails, write PHASE:escalate with a reason."
-          fi
-          # _merge_rc=2: PHASE:escalate already written by do_merge()
-          break
-
-        elif [ "$VERDICT" = "REQUEST_CHANGES" ] || [ "$VERDICT" = "DISCUSS" ]; then
-          REVIEW_ROUND=$(( REVIEW_ROUND + 1 ))
-          if [ "$REVIEW_ROUND" -ge "$MAX_REVIEW_ROUNDS" ]; then
-            log "hit max review rounds (${MAX_REVIEW_ROUNDS})"
-            log "PR #${PR_NUMBER}: hit ${MAX_REVIEW_ROUNDS} review rounds, needs human attention"
-          fi
-          REVIEW_FOUND=true
-          agent_inject_into_session "$SESSION_NAME" "Review feedback (round ${REVIEW_ROUND}) on PR #${PR_NUMBER}:
-
-${REVIEW_TEXT}
-
-Instructions:
-1. Address each piece of feedback carefully.
-2. Run lint and tests when done.
-3. Rebase on target branch and push: git fetch ${FORGE_REMOTE:-origin} ${PRIMARY_BRANCH} && git rebase ${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH}
-  git push --force-with-lease ${FORGE_REMOTE:-origin} ${BRANCH}
-4. Write: echo \"PHASE:awaiting_ci\" > \"${PHASE_FILE}\"
-5. Stop and wait for the next CI result."
-          log "review REQUEST_CHANGES received (round ${REVIEW_ROUND})"
-          break
-
-        else
-          # No verdict found in comment or formal review — keep waiting
-          log "review comment found but no verdict, continuing to wait"
-          continue
-        fi
-      fi
-
-      # Check if PR was merged or closed externally
-      PR_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-        "${API}/pulls/${PR_NUMBER}") || true
-      PR_STATE=$(echo "$PR_JSON" | jq -r '.state // "unknown"')
-      PR_MERGED=$(echo "$PR_JSON" | jq -r '.merged // false')
-      if [ "$PR_STATE" != "open" ]; then
-        if [ "$PR_MERGED" = "true" ]; then
-          log "PR #${PR_NUMBER} was merged externally"
-          curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \
-            -H "Content-Type: application/json" \
-            "${API}/issues/${ISSUE}" -d '{"state":"closed"}' >/dev/null 2>&1 || true
-          cleanup_labels
-          agent_kill_session "$SESSION_NAME"
-          cleanup_worktree
-          rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}"
-          exit 0
-        else
-          log "PR #${PR_NUMBER} was closed WITHOUT merge — NOT closing issue"
-          cleanup_labels
-          agent_kill_session "$SESSION_NAME"
-          cleanup_worktree
-          exit 0
-        fi
-      fi
-
-      log "waiting for review on PR #${PR_NUMBER} (${REVIEW_POLL_ELAPSED}s elapsed)"
-    done
-
-    if ! $REVIEW_FOUND && [ "$REVIEW_POLL_ELAPSED" -ge "$REVIEW_POLL_TIMEOUT" ]; then
-      log "TIMEOUT: no review after 3h"
-      agent_inject_into_session "$SESSION_NAME" "TIMEOUT: No review received after 3 hours for PR #${PR_NUMBER}. Write PHASE:escalate to escalate to a human reviewer."
-    fi
-
-  # ── PHASE: escalate ──────────────────────────────────────────────────────
-  elif [ "$phase" = "PHASE:escalate" ]; then
-    status "escalated — waiting for human input on issue #${ISSUE}"
-    ESCALATE_REASON=$(sed -n '2p' "$PHASE_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
-    log "phase: escalate — reason: ${ESCALATE_REASON:-none}"
-    # Session stays alive — human input arrives via vault/forge
-
-  # ── PHASE: done ─────────────────────────────────────────────────────────────
-  # PR merged and issue closed (by orchestrator or Claude). Just clean up local state.
-  elif [ "$phase" = "PHASE:done" ]; then
-    if [ -n "${PR_NUMBER:-}" ]; then
-      status "phase done — PR #${PR_NUMBER} merged, cleaning up"
-    else
-      status "phase done — issue #${ISSUE} complete, cleaning up"
-    fi
-
-    # Belt-and-suspenders: ensure in-progress label removed (idempotent)
-    cleanup_labels
-
-    # Local cleanup
-    agent_kill_session "$SESSION_NAME"
-    cleanup_worktree
-    rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}" \
-      "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
-    [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
-    CLAIMED=false  # Don't unclaim again in cleanup()
-
-  # ── PHASE: failed ───────────────────────────────────────────────────────────
-  elif [ "$phase" = "PHASE:failed" ]; then
-    if [[ -f "$PHASE_FILE" ]]; then
-      FAILURE_REASON=$(sed -n '2p' "$PHASE_FILE" | sed 's/^Reason: //')
-    fi
-    FAILURE_REASON="${FAILURE_REASON:-unspecified}"
-    log "phase: failed — reason: ${FAILURE_REASON}"
-    # Gitea labels API requires []int64 — look up the "backlog" label ID once
-    BACKLOG_LABEL_ID=$(forge_api GET "/labels" 2>/dev/null \
-      | jq -r '.[] | select(.name == "backlog") | .id' 2>/dev/null || true)
-    BACKLOG_LABEL_ID="${BACKLOG_LABEL_ID:-1300815}"
-    UNDERSPECIFIED_LABEL_ID=$(forge_api GET "/labels" 2>/dev/null \
-      | jq -r '.[] | select(.name == "underspecified") | .id' 2>/dev/null || true)
-    UNDERSPECIFIED_LABEL_ID="${UNDERSPECIFIED_LABEL_ID:-1300816}"
-
-    # Check if this is a refusal (Claude wrote refusal JSON to IMPL_SUMMARY_FILE)
-    REFUSAL_JSON=""
-    if [ -f "$IMPL_SUMMARY_FILE" ] && jq -e '.status' < "$IMPL_SUMMARY_FILE" >/dev/null 2>&1; then
-      REFUSAL_JSON=$(cat "$IMPL_SUMMARY_FILE")
-    fi
-
-    if [ -n "$REFUSAL_JSON" ] && [ "$FAILURE_REASON" = "refused" ]; then
-      REFUSAL_STATUS=$(printf '%s' "$REFUSAL_JSON" | jq -r '.status')
-      log "claude refused: ${REFUSAL_STATUS}"
-
-      # Write preflight result for dev-poll.sh
-      printf '%s' "$REFUSAL_JSON" > "$PREFLIGHT_RESULT"
-
-      # Unclaim issue (restore backlog label, remove in-progress)
-      cleanup_labels
-      curl -sf -X POST \
-        -H "Authorization: token ${FORGE_TOKEN}" \
-        -H "Content-Type: application/json" \
-        "${API}/issues/${ISSUE}/labels" \
-        -d "{\"labels\":[${BACKLOG_LABEL_ID}]}" >/dev/null 2>&1 || true
-
-      case "$REFUSAL_STATUS" in
-        unmet_dependency)
-          BLOCKED_BY_MSG=$(printf '%s' "$REFUSAL_JSON" | jq -r '.blocked_by // "unknown"')
-          SUGGESTION=$(printf '%s' "$REFUSAL_JSON" | jq -r '.suggestion // empty')
-          COMMENT_BODY="### Blocked by unmet dependency
-
-${BLOCKED_BY_MSG}"
-          if [ -n "$SUGGESTION" ] && [ "$SUGGESTION" != "null" ]; then
-            COMMENT_BODY="${COMMENT_BODY}
-
-**Suggestion:** Work on #${SUGGESTION} first."
-          fi
-          post_refusal_comment "🚧" "Unmet dependency" "$COMMENT_BODY"
-          ;;
-        too_large)
-          REASON=$(printf '%s' "$REFUSAL_JSON" | jq -r '.reason // "unspecified"')
-          post_refusal_comment "📏" "Too large for single session" "### Why this can't be implemented as-is
-
-${REASON}
-
-### Next steps
-A maintainer should split this issue or add more detail to the spec."
-          curl -sf -X POST \
-            -H "Authorization: token ${FORGE_TOKEN}" \
-            -H "Content-Type: application/json" \
-            "${API}/issues/${ISSUE}/labels" \
-            -d "{\"labels\":[${UNDERSPECIFIED_LABEL_ID}]}" >/dev/null 2>&1 || true
-          curl -sf -X DELETE \
-            -H "Authorization: token ${FORGE_TOKEN}" \
-            "${API}/issues/${ISSUE}/labels/${BACKLOG_LABEL_ID}" >/dev/null 2>&1 || true
-          ;;
-        already_done)
-          REASON=$(printf '%s' "$REFUSAL_JSON" | jq -r '.reason // "unspecified"')
-          post_refusal_comment "✅" "Already implemented" "### Existing implementation
-
-${REASON}
-
-Closing as already implemented."
-          curl -sf -X PATCH \
-            -H "Authorization: token ${FORGE_TOKEN}" \
-            -H "Content-Type: application/json" \
-            "${API}/issues/${ISSUE}" \
-            -d '{"state":"closed"}' >/dev/null 2>&1 || true
-          ;;
-        *)
-          post_refusal_comment "❓" "Unable to proceed" "The dev-agent could not process this issue.
-
-Raw response:
-\`\`\`json
-$(printf '%s' "$REFUSAL_JSON" | head -c 2000)
-\`\`\`"
-          ;;
-      esac
-
-      CLAIMED=false  # Don't unclaim again in cleanup()
-      agent_kill_session "$SESSION_NAME"
-      cleanup_worktree
-      rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}" \
-        "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
-      [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
-      return 1
-
-    else
-      # Genuine unrecoverable failure — label blocked with diagnostic
-      log "session failed: ${FAILURE_REASON}"
-      post_blocked_diagnostic "$FAILURE_REASON"
-
-      agent_kill_session "$SESSION_NAME"
-      if [ -n "${PR_NUMBER:-}" ]; then
-        log "keeping worktree (PR #${PR_NUMBER} still open)"
-      else
-        cleanup_worktree
-      fi
-      rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}" \
-        "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
-      [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
-      return 1
-    fi
-
-  # ── PHASE: crashed ──────────────────────────────────────────────────────────
-  # Session died unexpectedly (OOM kill, tmux crash, etc.). Label blocked with
-  # diagnostic comment so humans can triage directly on the issue.
-  elif [ "$phase" = "PHASE:crashed" ]; then
-    log "session crashed for issue #${ISSUE}"
-    post_blocked_diagnostic "crashed"
-    log "PRESERVED crashed worktree for debugging: $WORKTREE"
-    rm -f "$PHASE_FILE" "$IMPL_SUMMARY_FILE" "${SCRATCH_FILE:-}" \
-      "/tmp/ci-result-${PROJECT_NAME}-${ISSUE}.txt"
-    [ -n "${PR_NUMBER:-}" ] && rm -f "/tmp/review-injected-${PROJECT_NAME}-${PR_NUMBER}"
-
-  else
-    log "WARNING: unknown phase value: ${phase}"
-  fi
-}
--- a/dev/phase-test.sh
+++ b/dev/phase-test.sh
@ -8,8 +8,13 @@

 set -euo pipefail

-# Source canonical read_phase() from shared library
-source "$(dirname "$0")/../lib/agent-session.sh"
+# Inline read_phase() function (previously from lib/agent-session.sh)
+# Read the current phase from a phase file, stripped of whitespace.
+# Usage: read_phase [file]  — defaults to $PHASE_FILE
+read_phase() {
+  local file="${1:-${PHASE_FILE:-}}"
+  { cat "$file" 2>/dev/null || true; } | head -1 | tr -d '[:space:]'
+}

 PROJECT="testproject"
 ISSUE="999"
@ -84,7 +89,7 @@ else
  fail "PHASE:failed format: first='$first_line' second='$second_line'"
 fi

-# ── Test 5: orchestrator read function (canonical read_phase from lib/agent-session.sh)
+# ── Test 5: orchestrator read function (inline read_phase)
 echo "PHASE:awaiting_ci" > "$PHASE_FILE"
 phase=$(read_phase "$PHASE_FILE")
 if [ "$phase" = "PHASE:awaiting_ci" ]; then
--- a/disinto-factory/SKILL.md
+++ b/disinto-factory/SKILL.md
@ -1,268 +1,27 @@
 ---
 name: disinto-factory
-description: Set up and operate a disinto autonomous code factory. Use when bootstrapping a new factory instance, checking on agents and CI, managing the backlog, or troubleshooting the stack.
+description: Set up and operate a disinto autonomous code factory.
 ---

 # Disinto Factory

-You are helping the user set up and operate a **disinto autonomous code factory** — a system
-of bash scripts and Claude CLI that automates the full development lifecycle: picking up
-issues, implementing via Claude, creating PRs, running CI, reviewing, merging, and mirroring.
+You are helping the user set up and operate a **disinto autonomous code factory**.

-This guide shows how to set up the factory to develop an **external project** (e.g., `johba/harb`).
+## Guides

-## First-time setup
-
-Walk the user through these steps interactively. Ask questions where marked with [ASK].
-
-### 1. Environment
-
-[ASK] Where will the factory run? Options:
- **LXD container** (recommended for isolation) — need Debian 12, Docker, nesting enabled
- **Bare VM or server** — need Debian/Ubuntu with Docker
- **Existing container** — check prerequisites
-
-Verify prerequisites:
-```bash
-docker --version && git --version && jq --version && curl --version && tmux -V && python3 --version && claude --version
-```
-
-Any missing tool — help the user install it before continuing.
-
-### 2. Clone disinto and choose a target project
-
-Clone the disinto factory itself:
-```bash
-git clone https://codeberg.org/johba/disinto.git && cd disinto
-```
-
-[ASK] What repository should the factory develop? Provide the **remote repository URL** in one of these formats:
- Full URL: `https://github.com/johba/harb.git` or `https://codeberg.org/johba/harb.git`
- Short slug: `johba/harb` (uses local Forgejo as the primary remote)
-
-The factory will clone from the remote URL (if provided) or from your local Forgejo, then mirror to the remote.
-
-Then initialize the factory for that project:
-```bash
-bin/disinto init johba/harb --yes
-# or with full URL:
-bin/disinto init https://github.com/johba/harb.git --yes
-```
-
-The `init` command will:
- Create all bot users (dev-bot, review-bot, etc.) on the local Forgejo
- Generate and save `WOODPECKER_TOKEN`
- Start the stack containers
- Clone the target repo into the agent workspace
-
-> **Note:** The `--repo-root` flag is optional and only needed if you want to customize
-> where the cloned repo lives. By default, it goes under `/home/agent/repos/<name>`.
-
-### 3. Post-init verification
-
-Run this checklist — fix any failures before proceeding:
-
-```bash
-# Stack healthy?
-docker ps --format "table {{.Names}}\t{{.Status}}"
-# Expected: forgejo, woodpecker (healthy), woodpecker-agent (healthy), agents, edge, staging
-
-# Token generated?
-grep WOODPECKER_TOKEN .env | grep -v "^$" && echo "OK" || echo "MISSING — see references/troubleshooting.md"
-
-# Agent cron active?
-docker exec -u agent disinto-agents-1 crontab -l -u agent
-
-# Agent can reach Forgejo?
-docker exec disinto-agents-1 bash -c "source /home/agent/disinto/.env && curl -sf http://forgejo:3000/api/v1/version | jq .version"
-
-# Agent repo cloned?
-docker exec -u agent disinto-agents-1 ls /home/agent/repos/
-```
-
-If the agent repo is missing, clone it:
-```bash
-docker exec disinto-agents-1 chown -R agent:agent /home/agent/repos
-docker exec -u agent disinto-agents-1 bash -c "source /home/agent/disinto/.env && git clone http://dev-bot:\${FORGE_TOKEN}@forgejo:3000/<org>/<repo>.git /home/agent/repos/<name>"
-```
-
-### 4. Create the project configuration file
-
-The factory uses a TOML file to configure how it manages your project. Create
-`projects/<name>.toml` based on the template format:
-
-```toml
-# projects/harb.toml
-
-name            = "harb"
-repo            = "johba/harb"
-forge_url       = "http://localhost:3000"
-repo_root       = "/home/agent/repos/harb"
-primary_branch  = "master"
-
-[ci]
-woodpecker_repo_id = 0
-stale_minutes      = 60
-
-[services]
-containers = ["ponder"]
-
-[monitoring]
-check_prs            = true
-check_dev_agent      = true
-check_pipeline_stall = true
-
-# [mirrors]
-# github   = "git@github.com:johba/harb.git"
-# codeberg = "git@codeberg.org:johba/harb.git"
-```
-
-**Key fields:**
- `name`: Project identifier (used for file names, logs, etc.)
- `repo`: The source repo in `owner/name` format
- `forge_url`: URL of your local Forgejo instance
- `repo_root`: Where the agent clones the repo
- `primary_branch`: Default branch name (e.g., `main` or `master`)
- `woodpecker_repo_id`: Set to `0` initially; auto-populated on first CI run
- `containers`: List of Docker containers the factory should manage
- `mirrors`: Optional external forge URLs for backup/sync
-
-### 5. Mirrors (optional)
-
-[ASK] Should the factory mirror to external forges? If yes, which?
- GitHub: need repo URL and SSH key added to GitHub account
- Codeberg: need repo URL and SSH key added to Codeberg account
-
-Show the user their public key:
-```bash
-cat ~/.ssh/id_ed25519.pub
-```
-
-Test SSH access:
-```bash
-ssh -T git@github.com 2>&1; ssh -T git@codeberg.org 2>&1
-```
-
-If SSH host keys are missing: `ssh-keyscan github.com codeberg.org >> ~/.ssh/known_hosts 2>/dev/null`
-
-Edit `projects/<name>.toml` to uncomment and configure mirrors:
-```toml
-[mirrors]
-github   = "git@github.com:Org/repo.git"
-codeberg = "git@codeberg.org:user/repo.git"
-```
-
-Test with a manual push:
-```bash
-source .env && source lib/env.sh && export PROJECT_TOML=projects/<name>.toml && source lib/load-project.sh && source lib/mirrors.sh && mirror_push
-```
-
-### 6. Seed the backlog
-
-[ASK] What should the factory work on first? Brainstorm with the user.
-
-Help them create issues on the local Forgejo. Each issue needs:
- A clear title prefixed with `fix:`, `feat:`, or `chore:`
- A body describing what to change, which files, and any constraints
- The `backlog` label (so the dev-agent picks it up)
-
-```bash
-source .env
-BACKLOG_ID=$(curl -sf "http://localhost:3000/api/v1/repos/<org>/<repo>/labels" \
-  -H "Authorization: token $FORGE_TOKEN" | jq -r '.[] | select(.name=="backlog") | .id')
-
-curl -sf -X POST "http://localhost:3000/api/v1/repos/<org>/<repo>/issues" \
-  -H "Authorization: token $FORGE_TOKEN" \
-  -H "Content-Type: application/json" \
-  -d "{\"title\": \"<title>\", \"body\": \"<body>\", \"labels\": [$BACKLOG_ID]}"
-```
-
-For issues with dependencies, add `Depends-on: #N` in the body — the dev-agent checks
-these before starting.
-
-Use labels:
- `backlog` — ready for the dev-agent
- `blocked` — parked, not for the factory
- No label — tracked but not for autonomous work
-
-### 7. Watch it work
-
-The dev-agent polls every 5 minutes. Trigger manually to see it immediately:
-```bash
-source .env
-export PROJECT_TOML=projects/<name>.toml
-docker exec -u agent disinto-agents-1 bash -c "cd /home/agent/disinto && bash dev/dev-poll.sh projects/<name>.toml"
-```
-
-Then monitor:
-```bash
-# Watch the agent work
-docker exec disinto-agents-1 tail -f /home/agent/data/logs/dev/dev-agent.log
-
-# Check for Claude running
-docker exec disinto-agents-1 bash -c "for f in /proc/[0-9]*/cmdline; do cmd=\$(tr '\0' ' ' < \$f 2>/dev/null); echo \$cmd | grep -q 'claude.*-p' && echo 'Claude is running'; done"
-```
-
-## Ongoing operations
-
-### Check factory status
-
-```bash
-source .env
-
-# Issues
-curl -sf "http://localhost:3000/api/v1/repos/<org>/<repo>/issues?state=open" \
-  -H "Authorization: token $FORGE_TOKEN" \
-  | jq -r '.[] | "#\(.number) [\(.labels | map(.name) | join(","))] \(.title)"'
-
-# PRs
-curl -sf "http://localhost:3000/api/v1/repos/<org>/<repo>/pulls?state=open" \
-  -H "Authorization: token $FORGE_TOKEN" \
-  | jq -r '.[] | "PR #\(.number) [\(.head.ref)] \(.title)"'
-
-# Agent logs
-docker exec disinto-agents-1 tail -20 /home/agent/data/logs/dev/dev-agent.log
-```
-
-### Check CI
-
-```bash
-source .env
-WP_CSRF=$(curl -sf -b "user_sess=$WOODPECKER_TOKEN" http://localhost:8000/web-config.js \
-  | sed -n 's/.*WOODPECKER_CSRF = "\([^"]*\)".*/\1/p')
-curl -sf -b "user_sess=$WOODPECKER_TOKEN" -H "X-CSRF-Token: $WP_CSRF" \
-  "http://localhost:8000/api/repos/1/pipelines?page=1&per_page=5" \
-  | jq '.[] | {number, status, event}'
-```
-
-### Unstick a blocked issue
-
-When a dev-agent run fails (CI timeout, implementation error), the issue gets labeled `blocked`:
-
-1. Close stale PR and delete the branch
-2. `docker exec disinto-agents-1 rm -f /tmp/dev-agent-*.json /tmp/dev-agent-*.lock`
-3. Relabel the issue to `backlog`
-4. Update agent repo: `docker exec -u agent disinto-agents-1 bash -c "cd /home/agent/repos/<name> && git fetch origin && git reset --hard origin/main"`
-
-### Access Forgejo UI
-
-If running in an LXD container with reverse tunnel:
-```bash
-# From your machine:
-ssh -L 3000:localhost:13000 user@jump-host
-# Open http://localhost:3000
-```
-
-Reset admin password if needed:
-```bash
-docker exec disinto-forgejo-1 su -c "forgejo admin user change-password --username disinto-admin --password <new-pw> --must-change-password=false" git
-```
+- **[Setup guide](setup.md)** — First-time factory setup: environment, init, verification, backlog seeding
+- **[Operations guide](operations.md)** — Day-to-day: status checks, CI debugging, unsticking issues, Forgejo access

 ## Important context

 - Read `AGENTS.md` for per-agent architecture and file-level docs
 - Read `VISION.md` for project philosophy
 - The factory uses a single internal Forgejo as its forge, regardless of where mirrors go
- Dev-agent uses `claude -p --resume` for session continuity across CI/review cycles
- Mirror pushes happen automatically after every merge (fire-and-forget)
- Cron schedule: dev-poll every 5min, review-poll every 5min, gardener 4x/day
+- Dev-agent uses `claude -p` for one-shot implementation sessions
+- Mirror pushes happen automatically after every merge
+- Polling loop in `docker/agents/entrypoint.sh`: dev-poll/review-poll every 5m, gardener/architect every 6h, planner every 12h, predictor every 24h
+
+## References
+
+- [Troubleshooting](references/troubleshooting.md)
+- [Factory status script](scripts/factory-status.sh)
--- a/disinto-factory/operations.md
+++ b/disinto-factory/operations.md
@ -0,0 +1,54 @@
+# Ongoing operations
+
+### Check factory status
+
+```bash
+source .env
+
+# Issues
+curl -sf "http://localhost:3000/api/v1/repos/<org>/<repo>/issues?state=open" \
+  -H "Authorization: token $FORGE_TOKEN" \
+  | jq -r '.[] | "#\(.number) [\(.labels | map(.name) | join(","))] \(.title)"'
+
+# PRs
+curl -sf "http://localhost:3000/api/v1/repos/<org>/<repo>/pulls?state=open" \
+  -H "Authorization: token $FORGE_TOKEN" \
+  | jq -r '.[] | "PR #\(.number) [\(.head.ref)] \(.title)"'
+
+# Agent logs
+docker exec disinto-agents-1 tail -20 /home/agent/data/logs/dev/dev-agent.log
+```
+
+### Check CI
+
+```bash
+source .env
+WP_CSRF=$(curl -sf -b "user_sess=$WOODPECKER_TOKEN" http://localhost:8000/web-config.js \
+  | sed -n 's/.*WOODPECKER_CSRF = "\([^"]*\)".*/\1/p')
+curl -sf -b "user_sess=$WOODPECKER_TOKEN" -H "X-CSRF-Token: $WP_CSRF" \
+  "http://localhost:8000/api/repos/1/pipelines?page=1&per_page=5" \
+  | jq '.[] | {number, status, event}'
+```
+
+### Unstick a blocked issue
+
+When a dev-agent run fails (CI timeout, implementation error), the issue gets labeled `blocked`:
+
+1. Close stale PR and delete the branch
+2. `docker exec disinto-agents-1 rm -f /tmp/dev-agent-*.json /tmp/dev-agent-*.lock`
+3. Relabel the issue to `backlog`
+4. Update agent repo: `docker exec -u agent disinto-agents-1 bash -c "cd /home/agent/repos/<name> && git fetch origin && git reset --hard origin/main"`
+
+### Access Forgejo UI
+
+If running in an LXD container with reverse tunnel:
+```bash
+# From your machine:
+ssh -L 3000:localhost:13000 user@jump-host
+# Open http://localhost:3000
+```
+
+Reset admin password if needed:
+```bash
+docker exec disinto-forgejo-1 su -c "forgejo admin user change-password --username disinto-admin --password <new-pw> --must-change-password=false" git
+```
--- a/disinto-factory/setup.md
+++ b/disinto-factory/setup.md
@ -0,0 +1,191 @@
+# First-time setup
+
+Walk the user through these steps interactively. Ask questions where marked with [ASK].
+
+### 1. Environment
+
+[ASK] Where will the factory run? Options:
+- **LXD container** (recommended for isolation) — need Debian 12, Docker, nesting enabled
+- **Bare VM or server** — need Debian/Ubuntu with Docker
+- **Existing container** — check prerequisites
+
+Verify prerequisites:
+```bash
+docker --version && git --version && jq --version && curl --version && tmux -V && python3 --version && claude --version
+```
+
+Any missing tool — help the user install it before continuing.
+
+### 2. Clone disinto and choose a target project
+
+Clone the disinto factory itself:
+```bash
+git clone https://codeberg.org/johba/disinto.git && cd disinto
+```
+
+[ASK] What repository should the factory develop? Provide the **remote repository URL** in one of these formats:
+- Full URL: `https://github.com/johba/harb.git` or `https://codeberg.org/johba/harb.git`
+- Short slug: `johba/harb` (uses local Forgejo as the primary remote)
+
+The factory will clone from the remote URL (if provided) or from your local Forgejo, then mirror to the remote.
+
+Then initialize the factory for that project:
+```bash
+bin/disinto init johba/harb --yes
+# or with full URL:
+bin/disinto init https://github.com/johba/harb.git --yes
+```
+
+The `init` command will:
+- Create all bot users (dev-bot, review-bot, etc.) on the local Forgejo
+- Generate and save `WOODPECKER_TOKEN`
+- Start the stack containers
+- Clone the target repo into the agent workspace
+
+> **Note:** The `--repo-root` flag is optional and only needed if you want to customize
+> where the cloned repo lives. By default, it goes under `/home/agent/repos/<name>`.
+
+### 3. Post-init verification
+
+Run this checklist — fix any failures before proceeding:
+
+```bash
+# Stack healthy?
+docker ps --format "table {{.Names}}\t{{.Status}}"
+# Expected: forgejo, woodpecker (healthy), woodpecker-agent (healthy), agents, edge, staging
+
+# Token generated?
+grep WOODPECKER_TOKEN .env | grep -v "^$" && echo "OK" || echo "MISSING — see references/troubleshooting.md"
+
+# Agent entrypoint loop running?
+docker exec disinto-agents-1 tail -5 /home/agent/data/agent-entrypoint.log
+
+# Agent can reach Forgejo?
+docker exec disinto-agents-1 bash -c "source /home/agent/disinto/.env && curl -sf http://forgejo:3000/api/v1/version | jq .version"
+
+# Agent repo cloned?
+docker exec -u agent disinto-agents-1 ls /home/agent/repos/
+```
+
+If the agent repo is missing, clone it:
+```bash
+docker exec disinto-agents-1 chown -R agent:agent /home/agent/repos
+docker exec -u agent disinto-agents-1 bash -c "source /home/agent/disinto/.env && git clone http://dev-bot:\${FORGE_TOKEN}@forgejo:3000/<org>/<repo>.git /home/agent/repos/<name>"
+```
+
+### 4. Create the project configuration file
+
+The factory uses a TOML file to configure how it manages your project. Create
+`projects/<name>.toml` based on the template format:
+
+```toml
+# projects/harb.toml
+
+name            = "harb"
+repo            = "johba/harb"
+forge_url       = "http://localhost:3000"
+repo_root       = "/home/agent/repos/harb"
+primary_branch  = "master"
+
+[ci]
+woodpecker_repo_id = 0
+stale_minutes      = 60
+
+[services]
+containers = ["ponder"]
+
+[monitoring]
+check_prs            = true
+check_dev_agent      = true
+check_pipeline_stall = true
+
+# [mirrors]
+# github   = "git@github.com:johba/harb.git"
+# codeberg = "git@codeberg.org:johba/harb.git"
+```
+
+**Key fields:**
+- `name`: Project identifier (used for file names, logs, etc.)
+- `repo`: The source repo in `owner/name` format
+- `forge_url`: URL of your local Forgejo instance
+- `repo_root`: Where the agent clones the repo
+- `primary_branch`: Default branch name (e.g., `main` or `master`)
+- `woodpecker_repo_id`: Set to `0` initially; auto-populated on first CI run
+- `containers`: List of Docker containers the factory should manage
+- `mirrors`: Optional external forge URLs for backup/sync
+
+### 5. Mirrors (optional)
+
+[ASK] Should the factory mirror to external forges? If yes, which?
+- GitHub: need repo URL and SSH key added to GitHub account
+- Codeberg: need repo URL and SSH key added to Codeberg account
+
+Show the user their public key:
+```bash
+cat ~/.ssh/id_ed25519.pub
+```
+
+Test SSH access:
+```bash
+ssh -T git@github.com 2>&1; ssh -T git@codeberg.org 2>&1
+```
+
+If SSH host keys are missing: `ssh-keyscan github.com codeberg.org >> ~/.ssh/known_hosts 2>/dev/null`
+
+Edit `projects/<name>.toml` to uncomment and configure mirrors:
+```toml
+[mirrors]
+github   = "git@github.com:Org/repo.git"
+codeberg = "git@codeberg.org:user/repo.git"
+```
+
+Test with a manual push:
+```bash
+source .env && source lib/env.sh && export PROJECT_TOML=projects/<name>.toml && source lib/load-project.sh && source lib/mirrors.sh && mirror_push
+```
+
+### 6. Seed the backlog
+
+[ASK] What should the factory work on first? Brainstorm with the user.
+
+Help them create issues on the local Forgejo. Each issue needs:
+- A clear title prefixed with `fix:`, `feat:`, or `chore:`
+- A body describing what to change, which files, and any constraints
+- The `backlog` label (so the dev-agent picks it up)
+
+```bash
+source .env
+BACKLOG_ID=$(curl -sf "http://localhost:3000/api/v1/repos/<org>/<repo>/labels" \
+  -H "Authorization: token $FORGE_TOKEN" | jq -r '.[] | select(.name=="backlog") | .id')
+
+curl -sf -X POST "http://localhost:3000/api/v1/repos/<org>/<repo>/issues" \
+  -H "Authorization: token $FORGE_TOKEN" \
+  -H "Content-Type: application/json" \
+  -d "{\"title\": \"<title>\", \"body\": \"<body>\", \"labels\": [$BACKLOG_ID]}"
+```
+
+For issues with dependencies, add `Depends-on: #N` in the body — the dev-agent checks
+these before starting.
+
+Use labels:
+- `backlog` — ready for the dev-agent
+- `blocked` — parked, not for the factory
+- No label — tracked but not for autonomous work
+
+### 7. Watch it work
+
+The dev-agent runs every 5 minutes via the entrypoint polling loop. Trigger manually to see it immediately:
+```bash
+source .env
+export PROJECT_TOML=projects/<name>.toml
+docker exec -u agent disinto-agents-1 bash -c "cd /home/agent/disinto && bash dev/dev-poll.sh projects/<name>.toml"
+```
+
+Then monitor:
+```bash
+# Watch the agent work
+docker exec disinto-agents-1 tail -f /home/agent/data/logs/dev/dev-agent.log
+
+# Check for Claude running
+docker exec disinto-agents-1 bash -c "for f in /proc/[0-9]*/cmdline; do cmd=\$(tr '\0' ' ' < \$f 2>/dev/null); echo \$cmd | grep -q 'claude.*-p' && echo 'Claude is running'; done"
+```
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,188 @@
+version: "3.8"
+
+services:
+  agents:
+    build:
+      context: .
+      dockerfile: docker/agents/Dockerfile
+    image: disinto/agents:latest
+    container_name: disinto-agents
+    restart: unless-stopped
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - agent-data:/home/agent/data
+      - project-repos:/home/agent/repos
+      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
+      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
+      - CLAUDE_BIN_PLACEHOLDER:/usr/local/bin/claude:ro
+      - ${HOME}/.ssh:/home/agent/.ssh:ro
+      - ${HOME}/.config/sops/age:/home/agent/.config/sops/age:ro
+      - woodpecker-data:/woodpecker-data:ro
+    environment:
+      - FORGE_URL=http://forgejo:3000
+      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
+      - FORGE_TOKEN=${FORGE_TOKEN:-}
+      - FORGE_REVIEW_TOKEN=${FORGE_REVIEW_TOKEN:-}
+      - FORGE_PLANNER_TOKEN=${FORGE_PLANNER_TOKEN:-}
+      - FORGE_GARDENER_TOKEN=${FORGE_GARDENER_TOKEN:-}
+      - FORGE_VAULT_TOKEN=${FORGE_VAULT_TOKEN:-}
+      - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-}
+      - FORGE_PREDICTOR_TOKEN=${FORGE_PREDICTOR_TOKEN:-}
+      - FORGE_ARCHITECT_TOKEN=${FORGE_ARCHITECT_TOKEN:-}
+      - FORGE_BOT_USERNAMES=${FORGE_BOT_USERNAMES:-}
+      - WOODPECKER_TOKEN=${WOODPECKER_TOKEN:-}
+      - CLAUDE_TIMEOUT=${CLAUDE_TIMEOUT:-7200}
+      - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
+      - FORGE_PASS=${FORGE_PASS:-}
+      - FORGE_ADMIN_PASS=${FORGE_ADMIN_PASS:-}
+      - FACTORY_REPO=${FORGE_REPO:-disinto-admin/disinto}
+      - DISINTO_CONTAINER=1
+      - PROJECT_NAME=${PROJECT_NAME:-project}
+      - PROJECT_REPO_ROOT=/home/agent/repos/${PROJECT_NAME:-project}
+      - WOODPECKER_DATA_DIR=/woodpecker-data
+      - WOODPECKER_REPO_ID=${WOODPECKER_REPO_ID:-}
+      - CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
+      - POLL_INTERVAL=${POLL_INTERVAL:-300}
+      - GARDENER_INTERVAL=${GARDENER_INTERVAL:-21600}
+      - ARCHITECT_INTERVAL=${ARCHITECT_INTERVAL:-21600}
+      - PLANNER_INTERVAL=${PLANNER_INTERVAL:-43200}
+    depends_on:
+      - forgejo
+      - woodpecker
+    networks:
+      - disinto-net
+
+  agents-llama:
+    build:
+      context: .
+      dockerfile: docker/agents/Dockerfile
+    image: disinto/agents-llama:latest
+    container_name: disinto-agents-llama
+    restart: unless-stopped
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - agent-data:/home/agent/data
+      - project-repos:/home/agent/repos
+      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
+      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
+      - CLAUDE_BIN_PLACEHOLDER:/usr/local/bin/claude:ro
+      - ${HOME}/.ssh:/home/agent/.ssh:ro
+      - ${HOME}/.config/sops/age:/home/agent/.config/sops/age:ro
+      - woodpecker-data:/woodpecker-data:ro
+    environment:
+      - FORGE_URL=http://forgejo:3000
+      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
+      - FORGE_TOKEN=${FORGE_TOKEN_LLAMA:-}
+      - FORGE_PASS=${FORGE_PASS_LLAMA:-}
+      - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-}
+      - FORGE_PREDICTOR_TOKEN=${FORGE_PREDICTOR_TOKEN:-}
+      - FORGE_ARCHITECT_TOKEN=${FORGE_ARCHITECT_TOKEN:-}
+      - FORGE_VAULT_TOKEN=${FORGE_VAULT_TOKEN:-}
+      - FORGE_PLANNER_TOKEN=${FORGE_PLANNER_TOKEN:-}
+      - FORGE_BOT_USERNAMES=${FORGE_BOT_USERNAMES:-}
+      - WOODPECKER_TOKEN=${WOODPECKER_TOKEN:-}
+      - CLAUDE_TIMEOUT=${CLAUDE_TIMEOUT:-7200}
+      - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
+      - CLAUDE_AUTOCOMPACT_PCT_OVERRIDE=60
+      - CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS=1
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
+      - ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL:-}
+      - FORGE_ADMIN_PASS=${FORGE_ADMIN_PASS:-}
+      - DISINTO_CONTAINER=1
+      - PROJECT_TOML=projects/disinto.toml
+      - PROJECT_NAME=${PROJECT_NAME:-project}
+      - PROJECT_REPO_ROOT=/home/agent/repos/${PROJECT_NAME:-project}
+      - WOODPECKER_DATA_DIR=/woodpecker-data
+      - WOODPECKER_REPO_ID=${WOODPECKER_REPO_ID:-}
+      - CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
+      - POLL_INTERVAL=${POLL_INTERVAL:-300}
+      - AGENT_ROLES=dev
+    depends_on:
+      - forgejo
+      - woodpecker
+    networks:
+      - disinto-net
+
+  reproduce:
+    build:
+      context: .
+      dockerfile: docker/reproduce/Dockerfile
+    image: disinto-reproduce:latest
+    network_mode: host
+    profiles: ["reproduce"]
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - agent-data:/home/agent/data
+      - project-repos:/home/agent/repos
+      - ${HOME}/.claude:/home/agent/.claude
+      - /usr/local/bin/claude:/usr/local/bin/claude:ro
+      - ${HOME}/.ssh:/home/agent/.ssh:ro
+    env_file:
+      - .env
+
+  edge:
+    build:
+      context: docker/edge
+      dockerfile: Dockerfile
+    image: disinto/edge:latest
+    container_name: disinto-edge
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - /usr/local/bin/claude:/usr/local/bin/claude:ro
+      - ${HOME}/.claude.json:/root/.claude.json:ro
+      - ${HOME}/.claude:/root/.claude:ro
+      - disinto-logs:/opt/disinto-logs
+    environment:
+      - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-}
+      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
+      - CLAUDE_MODEL=claude-sonnet-4-6
+      - FORGE_TOKEN=${FORGE_TOKEN:-}
+      - FORGE_URL=http://forgejo:3000
+      - FORGE_REPO=disinto-admin/disinto
+      - FORGE_OPS_REPO=disinto-admin/disinto-ops
+      - PRIMARY_BRANCH=main
+      - DISINTO_CONTAINER=1
+      - FORGE_ADMIN_USERS=disinto-admin,vault-bot,admin
+    ports:
+      - "80:80"
+      - "443:443"
+    depends_on:
+      - forgejo
+    networks:
+      - disinto-net
+
+  forgejo:
+    image: codeberg.org/forgejo/forgejo:11.0
+    container_name: disinto-forgejo
+    restart: unless-stopped
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - forgejo-data:/data
+    environment:
+      - FORGEJO__database__DB_TYPE=sqlite3
+      - FORGEJO__server__ROOT_URL=http://forgejo:3000/
+      - FORGEJO__server__HTTP_PORT=3000
+      - FORGEJO__security__INSTALL_LOCK=true
+      - FORGEJO__service__DISABLE_REGISTRATION=true
+      - FORGEJO__webhook__ALLOWED_HOST_LIST=private
+    ports:
+      - "3000:3000"
+    networks:
+      - disinto-net
+
+volumes:
+  disinto-logs:
+  agent-data:
+  project-repos:
+  woodpecker-data:
+  forgejo-data:
+
+networks:
+  disinto-net:
+    driver: bridge
--- a/docker/agents/Dockerfile
+++ b/docker/agents/Dockerfile
@ -1,7 +1,7 @@
 FROM debian:bookworm-slim

 RUN apt-get update && apt-get install -y --no-install-recommends \
-    bash curl git jq tmux cron python3 python3-pip openssh-client ca-certificates age shellcheck \
+    bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \
    && pip3 install --break-system-packages networkx \
    && rm -rf /var/lib/apt/lists/*

@ -26,8 +26,8 @@ COPY . /home/agent/disinto
 COPY docker/agents/entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh

-# Entrypoint runs as root to start the cron daemon;
-# cron jobs execute as the agent user (crontab -u agent).
+# Entrypoint runs polling loop directly, dropping to agent user via gosu.
+# All scripts execute as the agent user (UID 1000) while preserving env vars.
 WORKDIR /home/agent/disinto

 ENTRYPOINT ["/entrypoint.sh"]
--- a/docker/agents/entrypoint-llama.sh
+++ b/docker/agents/entrypoint-llama.sh
@ -1,45 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-LOG_DIR="/home/agent/data/logs/dev"
-mkdir -p "$LOG_DIR" /home/agent/data
-chown -R agent:agent /home/agent/data 2>/dev/null || true
-
-log() {
-  printf "[%s] llama-loop: %s\n" "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" | tee -a "$LOG_DIR/llama-loop.log"
-}
-
-# Apply token override for named agent identity
-if [ -n "${FORGE_TOKEN_OVERRIDE:-}" ]; then
-  export FORGE_TOKEN="$FORGE_TOKEN_OVERRIDE"
-fi
-
-log "Starting llama dev-agent loop"
-log "Backend: ${ANTHROPIC_BASE_URL:-not set}"
-log "Claude CLI: $(claude --version 2>&1 || echo not found)"
-log "Agent identity: $(curl -sf -H "Authorization: token ${FORGE_TOKEN}" "${FORGE_URL:-http://forgejo:3000}/api/v1/user" 2>/dev/null | jq -r '.login // "unknown"')"
-
-# Clone repo if not present
-if [ ! -d "${PROJECT_REPO_ROOT}/.git" ]; then
-  log "Cloning repo..."
-  mkdir -p "$(dirname "$PROJECT_REPO_ROOT")"
-  chown -R agent:agent /home/agent/repos 2>/dev/null || true
-  su -s /bin/bash agent -c "git clone http://dev-bot:${FORGE_TOKEN}@forgejo:3000/${FORGE_REPO:-johba/disinto}.git ${PROJECT_REPO_ROOT}"
-  log "Repo cloned"
-fi
-
-log "Entering poll loop (interval: ${POLL_INTERVAL:-300}s)"
-
-while true; do
-  # Clear stale session IDs before each poll.
-  # Local llama does not support --resume (no server-side session storage).
-  # Stale .sid files cause agent_run to exit instantly on every retry.
-  rm -f /tmp/dev-session-*.sid 2>/dev/null || true
-
-  su -s /bin/bash agent -c "
-    export FORGE_TOKEN='${FORGE_TOKEN}'
-    cd /home/agent/disinto && \
-    bash dev/dev-poll.sh ${PROJECT_TOML:-projects/disinto.toml}
-  " >> "$LOG_DIR/llama-loop.log" 2>&1 || true
-  sleep "${POLL_INTERVAL:-300}"
-done
--- a/docker/agents/entrypoint.sh
+++ b/docker/agents/entrypoint.sh
@ -1,53 +1,117 @@
 #!/usr/bin/env bash
 set -euo pipefail

-# entrypoint.sh — Start agent container with cron in foreground
+# entrypoint.sh — Start agent container with polling loop
 #
-# Runs as root inside the container.  Installs crontab entries for the
-# agent user from project TOMLs, then starts cron in the foreground.
-# All cron jobs execute as the agent user (UID 1000).
+# Runs as root inside the container.  Drops to agent user via gosu for all
+# poll scripts.  All Docker Compose env vars are inherited (PATH, FORGE_TOKEN,
+# ANTHROPIC_API_KEY, etc.).
+#
+# AGENT_ROLES env var controls which scripts run: "review,dev,gardener,architect,planner,predictor"
+# (default: all six). Uses while-true loop with staggered intervals:
+#   - review-poll: every 5 minutes (offset by 0s)
+#   - dev-poll: every 5 minutes (offset by 2 minutes)
+#   - gardener: every GARDENER_INTERVAL seconds (default: 21600 = 6 hours)
+#   - architect: every ARCHITECT_INTERVAL seconds (default: 21600 = 6 hours)
+#   - planner: every PLANNER_INTERVAL seconds (default: 43200 = 12 hours)
+#   - predictor: every 24 hours (288 iterations * 5 min)

-DISINTO_DIR="/home/agent/disinto"
+DISINTO_BAKED="/home/agent/disinto"
+DISINTO_LIVE="/home/agent/repos/_factory"
+DISINTO_DIR="$DISINTO_BAKED"  # start with baked copy; switched to live checkout after bootstrap
 LOGFILE="/home/agent/data/agent-entrypoint.log"
-mkdir -p /home/agent/data
-chown agent:agent /home/agent/data
+
+# Create all expected log subdirectories and set ownership as root before dropping to agent.
+# This handles both fresh volumes and stale root-owned dirs from prior container runs.
+mkdir -p /home/agent/data/logs/{dev,action,review,supervisor,vault,site,metrics,gardener,planner,predictor,architect,dispatcher}
+chown -R agent:agent /home/agent/data

 log() {
  printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" | tee -a "$LOGFILE"
 }

-# Build crontab from project TOMLs and install for the agent user.
-install_project_crons() {
-  local cron_lines="DISINTO_CONTAINER=1
-USER=agent
-FORGE_URL=http://forgejo:3000
-PROJECT_REPO_ROOT=/home/agent/repos/${pname}"
-  for toml in "${DISINTO_DIR}"/projects/*.toml; do
-    [ -f "$toml" ] || continue
-    local pname
-    pname=$(python3 -c "
-import sys, tomllib
-with open(sys.argv[1], 'rb') as f:
-    print(tomllib.load(f)['name'])
-" "$toml" 2>/dev/null) || continue
-
-    cron_lines="${cron_lines}
-# disinto: ${pname}
-2,7,12,17,22,27,32,37,42,47,52,57 * * * * ${DISINTO_DIR}/review/review-poll.sh ${toml} >>/home/agent/data/logs/cron.log 2>&1
-4,9,14,19,24,29,34,39,44,49,54,59 * * * * ${DISINTO_DIR}/dev/dev-poll.sh ${toml} >>/home/agent/data/logs/cron.log 2>&1
-0 0,6,12,18 * * * cd ${DISINTO_DIR} && bash gardener/gardener-run.sh ${toml} >>/home/agent/data/logs/cron.log 2>&1"
+# Initialize state directory and files if they don't exist
+init_state_dir() {
+  local state_dir="${DISINTO_DIR}/state"
+  mkdir -p "$state_dir"
+  # Create empty state files so check_active guards work
+  for agent in dev reviewer gardener architect planner predictor; do
+    touch "$state_dir/.${agent}-active" 2>/dev/null || true
  done
+  chown -R agent:agent "$state_dir"
+  log "Initialized state directory"
+}

-  if [ -n "$cron_lines" ]; then
-    printf '%s\n' "$cron_lines" | crontab -u agent -
-    log "Installed crontab for agent user"
+# Source shared git credential helper library (#604).
+# shellcheck source=lib/git-creds.sh
+source "${DISINTO_BAKED}/lib/git-creds.sh"
+
+# Wrapper that calls the shared configure_git_creds with agent-specific paths,
+# then repairs any legacy baked-credential URLs in existing clones.
+_setup_git_creds() {
+  configure_git_creds "/home/agent" "gosu agent"
+  if [ -n "${FORGE_PASS:-}" ] && [ -n "${FORGE_URL:-}" ]; then
+    log "Git credential helper configured (password auth)"
+  fi
+
+  # Repair legacy clones with baked-in stale credentials (#604).
+  _GIT_CREDS_LOG_FN=log repair_baked_cred_urls --as "gosu agent" /home/agent/repos
+}
+
+# Configure git author identity for commits made by this container.
+# Derives identity from the resolved bot user (BOT_USER) to ensure commits
+# are visibly attributable to the correct bot in the forge timeline.
+configure_git_identity() {
+  # Resolve BOT_USER from FORGE_TOKEN if not already set
+  if [ -z "${BOT_USER:-}" ] && [ -n "${FORGE_TOKEN:-}" ]; then
+    BOT_USER=$(curl -sf --max-time 10 \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${FORGE_URL:-http://localhost:3000}/api/v1/user" 2>/dev/null | jq -r '.login // empty') || true
+  fi
+
+  # Default to dev-bot if resolution fails
+  BOT_USER="${BOT_USER:-dev-bot}"
+
+  # Configure git identity for all repositories
+  gosu agent git config --global user.name "${BOT_USER}"
+  gosu agent git config --global user.email "${BOT_USER}@disinto.local"
+
+  log "Git identity configured: ${BOT_USER} <${BOT_USER}@disinto.local>"
+}
+
+# Configure tea CLI login for forge operations (runs as agent user).
+# tea stores config in ~/.config/tea/ — persistent across container restarts
+# only if that directory is on a mounted volume.
+configure_tea_login() {
+  if command -v tea &>/dev/null && [ -n "${FORGE_TOKEN:-}" ] && [ -n "${FORGE_URL:-}" ]; then
+    local_tea_login="forgejo"
+    case "$FORGE_URL" in
+      *codeberg.org*) local_tea_login="codeberg" ;;
+    esac
+    gosu agent bash -c "tea login add \
+      --name '${local_tea_login}' \
+      --url '${FORGE_URL}' \
+      --token '${FORGE_TOKEN}' \
+      --no-version-check 2>/dev/null || true"
+    log "tea login configured: ${local_tea_login} → ${FORGE_URL}"
  else
-    log "No project TOMLs found — crontab empty"
+    log "tea login: skipped (tea not found or FORGE_TOKEN/FORGE_URL not set)"
  fi
 }

 log "Agent container starting"

+# Set USER and HOME for scripts that source lib/env.sh.
+# These are preconditions required by lib/env.sh's surface contract.
+# gosu agent inherits the parent's env, so exports here propagate to all children.
+export USER=agent
+export HOME=/home/agent
+
+# Source lib/env.sh to get DISINTO_LOG_DIR and other shared environment.
+# This must happen after USER/HOME are set (env.sh preconditions).
+# shellcheck source=lib/env.sh
+source "${DISINTO_BAKED}/lib/env.sh"
+
 # Verify Claude CLI is available (expected via volume mount from host).
 if ! command -v claude &>/dev/null; then
  log "FATAL: claude CLI not found in PATH."
@ -63,33 +127,332 @@ log "Claude CLI: $(claude --version 2>&1 || true)"
 # auth method is active so operators can debug 401s.
 if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
  log "Auth: ANTHROPIC_API_KEY is set — using API key (no OAuth rotation)"
-elif [ -f /home/agent/.claude/credentials.json ]; then
-  log "Auth: OAuth credentials mounted from host (~/.claude)"
+elif [ -f "${CLAUDE_CONFIG_DIR:-/home/agent/.claude}/.credentials.json" ]; then
+  log "Auth: OAuth credentials mounted from host (${CLAUDE_CONFIG_DIR:-~/.claude})"
 else
  log "WARNING: No ANTHROPIC_API_KEY and no OAuth credentials found."
  log "Run 'claude auth login' on the host, or set ANTHROPIC_API_KEY in .env"
 fi

-install_project_crons
+# Bootstrap ops repos for each project TOML (#586).
+# In compose mode the ops repo lives on a Docker named volume at
+# /home/agent/repos/<project>-ops.  If init ran migrate_ops_repo on the host
+# the container never saw those changes.  This function clones from forgejo
+# when the repo is missing, or configures the remote and pulls when it exists
+# but has no remote (orphaned local-only checkout).
+bootstrap_ops_repos() {
+  local repos_dir="/home/agent/repos"
+  mkdir -p "$repos_dir"
+  chown agent:agent "$repos_dir"

-# Configure tea CLI login for forge operations (runs as agent user).
-# tea stores config in ~/.config/tea/ — persistent across container restarts
-# only if that directory is on a mounted volume.
-if command -v tea &>/dev/null && [ -n "${FORGE_TOKEN:-}" ] && [ -n "${FORGE_URL:-}" ]; then
-  local_tea_login="forgejo"
-  case "$FORGE_URL" in
-    *codeberg.org*) local_tea_login="codeberg" ;;
-  esac
-  su -s /bin/bash agent -c "tea login add \
-    --name '${local_tea_login}' \
-    --url '${FORGE_URL}' \
-    --token '${FORGE_TOKEN}' \
-    --no-version-check 2>/dev/null || true"
-  log "tea login configured: ${local_tea_login} → ${FORGE_URL}"
-else
-  log "tea login: skipped (tea not found or FORGE_TOKEN/FORGE_URL not set)"
-fi
+  for toml in "${DISINTO_DIR}"/projects/*.toml; do
+    [ -f "$toml" ] || continue

-# Run cron in the foreground.  Cron jobs execute as the agent user.
-log "Starting cron daemon"
-exec cron -f
+    # Extract project name, ops repo slug, repo slug, and primary branch from TOML
+    local project_name ops_slug primary_branch
+    local _toml_vals
+    _toml_vals=$(python3 -c "
+import tomllib, sys
+with open(sys.argv[1], 'rb') as f:
+    cfg = tomllib.load(f)
+print(cfg.get('name', ''))
+print(cfg.get('ops_repo', ''))
+print(cfg.get('repo', ''))
+print(cfg.get('primary_branch', 'main'))
+" "$toml" 2>/dev/null || true)
+
+    project_name=$(sed -n '1p' <<< "$_toml_vals")
+    [ -n "$project_name" ] || continue
+    ops_slug=$(sed -n '2p' <<< "$_toml_vals")
+    local repo_slug
+    repo_slug=$(sed -n '3p' <<< "$_toml_vals")
+    primary_branch=$(sed -n '4p' <<< "$_toml_vals")
+    primary_branch="${primary_branch:-main}"
+
+    # Fall back to convention if ops_repo not in TOML
+    if [ -z "$ops_slug" ]; then
+      if [ -n "$repo_slug" ]; then
+        ops_slug="${repo_slug}-ops"
+      else
+        ops_slug="disinto-admin/${project_name}-ops"
+      fi
+    fi
+
+    local ops_root="${repos_dir}/${project_name}-ops"
+    local remote_url="${FORGE_URL}/${ops_slug}.git"
+
+    if [ ! -d "${ops_root}/.git" ]; then
+      # Clone ops repo from forgejo
+      log "Ops bootstrap: cloning ${ops_slug} -> ${ops_root}"
+      if gosu agent git clone --quiet "$remote_url" "$ops_root" 2>/dev/null; then
+        log "Ops bootstrap: ${ops_slug} cloned successfully"
+      else
+        # Remote may not exist yet (first run before init); create empty repo
+        log "Ops bootstrap: clone failed for ${ops_slug} — initializing empty repo"
+        gosu agent bash -c "
+          mkdir -p '${ops_root}' && \
+          git -C '${ops_root}' init --initial-branch='${primary_branch}' -q && \
+          git -C '${ops_root}' remote add origin '${remote_url}'
+        "
+      fi
+    else
+      # Repo exists — ensure remote is configured and pull latest
+      local current_remote
+      current_remote=$(git -C "$ops_root" remote get-url origin 2>/dev/null || true)
+      if [ -z "$current_remote" ]; then
+        log "Ops bootstrap: adding missing remote to ${ops_root}"
+        gosu agent git -C "$ops_root" remote add origin "$remote_url"
+      elif [ "$current_remote" != "$remote_url" ]; then
+        log "Ops bootstrap: fixing remote URL in ${ops_root}"
+        gosu agent git -C "$ops_root" remote set-url origin "$remote_url"
+      fi
+      # Pull latest from forgejo to pick up any host-side migrations
+      log "Ops bootstrap: pulling latest for ${project_name}-ops"
+      gosu agent bash -c "
+        cd '${ops_root}' && \
+        git fetch origin '${primary_branch}' --quiet 2>/dev/null && \
+        git reset --hard 'origin/${primary_branch}' --quiet 2>/dev/null
+      " || log "Ops bootstrap: pull failed for ${ops_slug} (remote may not exist yet)"
+    fi
+  done
+}
+
+# Bootstrap the factory (disinto) repo from Forgejo into the project-repos
+# volume so the entrypoint runs from a live git checkout that receives
+# updates via `git pull`, not the stale baked copy from `COPY .` (#593).
+bootstrap_factory_repo() {
+  local repo="${FACTORY_REPO:-}"
+  if [ -z "$repo" ]; then
+    log "Factory bootstrap: FACTORY_REPO not set — running from baked copy"
+    return 0
+  fi
+
+  local remote_url="${FORGE_URL}/${repo}.git"
+  local primary_branch="${PRIMARY_BRANCH:-main}"
+
+  if [ ! -d "${DISINTO_LIVE}/.git" ]; then
+    log "Factory bootstrap: cloning ${repo} -> ${DISINTO_LIVE}"
+    if gosu agent git clone --quiet --branch "$primary_branch" "$remote_url" "$DISINTO_LIVE" 2>&1; then
+      log "Factory bootstrap: cloned successfully"
+    else
+      log "Factory bootstrap: clone failed — running from baked copy"
+      return 0
+    fi
+  else
+    log "Factory bootstrap: pulling latest ${repo}"
+    gosu agent bash -c "
+      cd '${DISINTO_LIVE}' && \
+      git fetch origin '${primary_branch}' --quiet 2>/dev/null && \
+      git reset --hard 'origin/${primary_branch}' --quiet 2>/dev/null
+    " || log "Factory bootstrap: pull failed — using existing checkout"
+  fi
+
+  # Copy project TOMLs from baked dir — they are gitignored AND docker-ignored,
+  # so neither the image nor the clone normally contains them.  If the baked
+  # copy has any (e.g. operator manually placed them), propagate them.
+  if compgen -G "${DISINTO_BAKED}/projects/*.toml" >/dev/null 2>&1; then
+    mkdir -p "${DISINTO_LIVE}/projects"
+    cp "${DISINTO_BAKED}"/projects/*.toml "${DISINTO_LIVE}/projects/"
+    chown -R agent:agent "${DISINTO_LIVE}/projects"
+    log "Factory bootstrap: copied project TOMLs to live checkout"
+  fi
+
+  # Verify the live checkout has the expected structure
+  if [ -f "${DISINTO_LIVE}/lib/env.sh" ]; then
+    DISINTO_DIR="$DISINTO_LIVE"
+    log "Factory bootstrap: DISINTO_DIR switched to live checkout at ${DISINTO_LIVE}"
+  else
+    log "Factory bootstrap: live checkout missing expected files — falling back to baked copy"
+  fi
+}
+
+# Ensure the project repo is cloned on first run (#589).
+# The agents container uses a named volume (project-repos) at /home/agent/repos.
+# On first startup, if the project repo is missing, clone it from FORGE_URL/FORGE_REPO.
+# This makes the agents container self-healing and independent of init's host clone.
+ensure_project_clone() {
+  # shellcheck disable=SC2153
+  local repo_dir="/home/agent/repos/${PROJECT_NAME}"
+  if [ -d "${repo_dir}/.git" ]; then
+    log "Project repo present at ${repo_dir}"
+    return 0
+  fi
+  if [ -z "${FORGE_REPO:-}" ] || [ -z "${FORGE_URL:-}" ]; then
+    log "Cannot clone project repo: FORGE_REPO or FORGE_URL unset"
+    return 1
+  fi
+  log "Cloning ${FORGE_URL}/${FORGE_REPO}.git -> ${repo_dir} (first run)"
+  mkdir -p "$(dirname "$repo_dir")"
+  chown -R agent:agent "$(dirname "$repo_dir")"
+  if gosu agent git clone --quiet "${FORGE_URL}/${FORGE_REPO}.git" "$repo_dir"; then
+    log "Project repo cloned"
+  else
+    log "Project repo clone failed — agents may fail until manually fixed"
+    return 1
+  fi
+}
+
+# Pull latest factory code at the start of each poll iteration (#593).
+# Runs as the agent user; failures are non-fatal (stale code still works).
+pull_factory_repo() {
+  [ "$DISINTO_DIR" = "$DISINTO_LIVE" ] || return 0
+  local primary_branch="${PRIMARY_BRANCH:-main}"
+  gosu agent bash -c "
+    cd '${DISINTO_LIVE}' && \
+    git fetch origin '${primary_branch}' --quiet 2>/dev/null && \
+    git reset --hard 'origin/${primary_branch}' --quiet 2>/dev/null
+  " || log "Factory pull failed — continuing with current checkout"
+}
+
+# Configure git and tea once at startup (as root, then drop to agent)
+_setup_git_creds
+configure_git_identity
+configure_tea_login
+
+# Clone project repo on first run (makes agents self-healing, #589)
+ensure_project_clone
+
+# Bootstrap ops repos from forgejo into container volumes (#586)
+bootstrap_ops_repos
+
+# Bootstrap factory repo — switch DISINTO_DIR to live checkout (#593)
+bootstrap_factory_repo
+
+# Initialize state directory for check_active guards
+init_state_dir
+
+# Parse AGENT_ROLES env var (default: all agents)
+# Expected format: comma-separated list like "review,dev,gardener"
+AGENT_ROLES="${AGENT_ROLES:-review,dev,gardener,architect,planner,predictor}"
+log "Agent roles configured: ${AGENT_ROLES}"
+
+# Poll interval in seconds (5 minutes default)
+POLL_INTERVAL="${POLL_INTERVAL:-300}"
+
+# Gardener and architect intervals (default 6 hours = 21600 seconds)
+GARDENER_INTERVAL="${GARDENER_INTERVAL:-21600}"
+ARCHITECT_INTERVAL="${ARCHITECT_INTERVAL:-21600}"
+PLANNER_INTERVAL="${PLANNER_INTERVAL:-43200}"
+
+log "Entering polling loop (interval: ${POLL_INTERVAL}s, roles: ${AGENT_ROLES})"
+log "Gardener interval: ${GARDENER_INTERVAL}s, Architect interval: ${ARCHITECT_INTERVAL}s, Planner interval: ${PLANNER_INTERVAL}s"
+
+# Main polling loop using iteration counter for gardener scheduling
+iteration=0
+while true; do
+  iteration=$((iteration + 1))
+  now=$(date +%s)
+
+  # Pull latest factory code so poll scripts stay current (#593)
+  pull_factory_repo
+
+  # Stale .sid cleanup — needed for agents that don't support --resume
+  # Run this as the agent user
+  gosu agent bash -c "rm -f /tmp/dev-session-*.sid /tmp/review-session-*.sid 2>/dev/null || true"
+
+  # Poll each project TOML
+  # Fast agents (review-poll, dev-poll) run in background so they don't block
+  # each other.  Slow agents (gardener, architect, planner, predictor) also run
+  # in background but are guarded by pgrep so only one instance runs at a time.
+  # Per-session CLAUDE_CONFIG_DIR isolation handles OAuth concurrency natively.
+  # Set CLAUDE_EXTERNAL_LOCK=1 to re-enable the legacy flock serialization.
+  for toml in "${DISINTO_DIR}"/projects/*.toml; do
+    [ -f "$toml" ] || continue
+
+    # Parse project name and primary branch from TOML so env.sh preconditions
+    # are satisfied when agent scripts source it (#674).
+    _toml_vals=$(python3 -c "
+import tomllib, sys
+with open(sys.argv[1], 'rb') as f:
+    cfg = tomllib.load(f)
+print(cfg.get('name', ''))
+print(cfg.get('primary_branch', 'main'))
+" "$toml" 2>/dev/null || true)
+    _pname=$(sed -n '1p' <<< "$_toml_vals")
+    _pbranch=$(sed -n '2p' <<< "$_toml_vals")
+    [ -n "$_pname" ] || { log "WARNING: could not parse project name from ${toml} — skipping"; continue; }
+
+    export PROJECT_NAME="$_pname"
+    export PROJECT_REPO_ROOT="/home/agent/repos/${_pname}"
+    export OPS_REPO_ROOT="/home/agent/repos/${_pname}-ops"
+    export PRIMARY_BRANCH="${_pbranch:-main}"
+
+    log "Processing project TOML: ${toml}"
+
+    # --- Fast agents: run in background, wait before slow agents ---
+
+    # Review poll (every iteration)
+    if [[ ",${AGENT_ROLES}," == *",review,"* ]]; then
+      log "Running review-poll (iteration ${iteration}) for ${toml}"
+      gosu agent bash -c "cd ${DISINTO_DIR} && bash review/review-poll.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/review-poll.log" 2>&1 &
+    fi
+
+    sleep 2  # stagger fast polls
+
+    # Dev poll (every iteration)
+    if [[ ",${AGENT_ROLES}," == *",dev,"* ]]; then
+      log "Running dev-poll (iteration ${iteration}) for ${toml}"
+      gosu agent bash -c "cd ${DISINTO_DIR} && bash dev/dev-poll.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/dev-poll.log" 2>&1 &
+    fi
+
+    # Wait for fast polls to finish before launching slow agents
+    wait
+
+    # --- Slow agents: run in background with pgrep guard ---
+
+    # Gardener (interval configurable via GARDENER_INTERVAL env var)
+    if [[ ",${AGENT_ROLES}," == *",gardener,"* ]]; then
+      gardener_iteration=$((iteration * POLL_INTERVAL))
+      if [ $((gardener_iteration % GARDENER_INTERVAL)) -eq 0 ] && [ "$now" -ge "$gardener_iteration" ]; then
+        if ! pgrep -f "gardener-run.sh" >/dev/null; then
+          log "Running gardener (iteration ${iteration}, ${GARDENER_INTERVAL}s interval) for ${toml}"
+          gosu agent bash -c "cd ${DISINTO_DIR} && bash gardener/gardener-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/gardener.log" 2>&1 &
+        else
+          log "Skipping gardener — already running"
+        fi
+      fi
+    fi
+
+    # Architect (interval configurable via ARCHITECT_INTERVAL env var)
+    if [[ ",${AGENT_ROLES}," == *",architect,"* ]]; then
+      architect_iteration=$((iteration * POLL_INTERVAL))
+      if [ $((architect_iteration % ARCHITECT_INTERVAL)) -eq 0 ] && [ "$now" -ge "$architect_iteration" ]; then
+        if ! pgrep -f "architect-run.sh" >/dev/null; then
+          log "Running architect (iteration ${iteration}, ${ARCHITECT_INTERVAL}s interval) for ${toml}"
+          gosu agent bash -c "cd ${DISINTO_DIR} && bash architect/architect-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/architect.log" 2>&1 &
+        else
+          log "Skipping architect — already running"
+        fi
+      fi
+    fi
+
+    # Planner (interval configurable via PLANNER_INTERVAL env var)
+    if [[ ",${AGENT_ROLES}," == *",planner,"* ]]; then
+      planner_iteration=$((iteration * POLL_INTERVAL))
+      if [ $((planner_iteration % PLANNER_INTERVAL)) -eq 0 ] && [ "$now" -ge "$planner_iteration" ]; then
+        if ! pgrep -f "planner-run.sh" >/dev/null; then
+          log "Running planner (iteration ${iteration}, ${PLANNER_INTERVAL}s interval) for ${toml}"
+          gosu agent bash -c "cd ${DISINTO_DIR} && bash planner/planner-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/planner.log" 2>&1 &
+        else
+          log "Skipping planner — already running"
+        fi
+      fi
+    fi
+
+    # Predictor (every 24 hours = 288 iterations * 5 min = 86400 seconds)
+    if [[ ",${AGENT_ROLES}," == *",predictor,"* ]]; then
+      predictor_iteration=$((iteration * POLL_INTERVAL))
+      predictor_interval=$((24 * 60 * 60))  # 24 hours in seconds
+      if [ $((predictor_iteration % predictor_interval)) -eq 0 ] && [ "$now" -ge "$predictor_iteration" ]; then
+        if ! pgrep -f "predictor-run.sh" >/dev/null; then
+          log "Running predictor (iteration ${iteration}, 24-hour interval) for ${toml}"
+          gosu agent bash -c "cd ${DISINTO_DIR} && bash predictor/predictor-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/predictor.log" 2>&1 &
+        else
+          log "Skipping predictor — already running"
+        fi
+      fi
+    fi
+  done
+
+  sleep "${POLL_INTERVAL}"
+done
--- a/docker/chat/Dockerfile
+++ b/docker/chat/Dockerfile
@ -0,0 +1,35 @@
+# disinto-chat — minimal HTTP backend for Claude chat UI
+#
+# Small Debian slim base with Python runtime.
+# Chosen for simplicity and small image size (~100MB).
+#
+# Image size: ~100MB (well under the 200MB ceiling)
+#
+# The claude binary is mounted from the host at runtime via docker-compose,
+# not baked into the image — same pattern as the agents container.
+
+FROM debian:bookworm-slim
+
+# Install Python (no build-time network access needed)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Non-root user — fixed UID 10001 for sandbox hardening (#706)
+RUN useradd -m -u 10001 -s /bin/bash chat
+
+# Copy application files
+COPY server.py /usr/local/bin/server.py
+COPY entrypoint-chat.sh /entrypoint-chat.sh
+COPY ui/ /var/chat/ui/
+
+RUN chmod +x /entrypoint-chat.sh /usr/local/bin/server.py
+
+USER chat
+WORKDIR /var/chat
+
+EXPOSE 8080
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+  CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/')" || exit 1
+
+ENTRYPOINT ["/entrypoint-chat.sh"]
--- a/docker/chat/entrypoint-chat.sh
+++ b/docker/chat/entrypoint-chat.sh
@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# entrypoint-chat.sh — Start the disinto-chat backend server
+#
+# Exec-replace pattern: this script is the container entrypoint and runs
+# the server directly (no wrapper needed). Logs to stdout for docker logs.
+
+LOGFILE="/tmp/chat.log"
+
+log() {
+    printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" | tee -a "$LOGFILE"
+}
+
+# Sandbox sanity checks (#706) — fail fast if isolation is broken
+if [ -e /var/run/docker.sock ]; then
+    log "FATAL: /var/run/docker.sock is accessible — sandbox violation"
+    exit 1
+fi
+if [ "$(id -u)" = "0" ]; then
+    log "FATAL: running as root (uid 0) — sandbox violation"
+    exit 1
+fi
+
+# Verify Claude CLI is available (expected via volume mount from host).
+if ! command -v claude &>/dev/null; then
+    log "FATAL: claude CLI not found in PATH"
+    log "Mount the host binary into the container, e.g.:"
+    log "  volumes:"
+    log "    - /usr/local/bin/claude:/usr/local/bin/claude:ro"
+    exit 1
+fi
+log "Claude CLI: $(claude --version 2>&1 || true)"
+
+# Start the Python server (exec-replace so signals propagate correctly)
+log "Starting disinto-chat server on port 8080..."
+exec python3 /usr/local/bin/server.py
--- a/docker/chat/server.py
+++ b/docker/chat/server.py
@ -0,0 +1,949 @@
+#!/usr/bin/env python3
+"""
+disinto-chat server — minimal HTTP backend for Claude chat UI.
+
+Routes:
+    GET /chat/auth/verify    -> Caddy forward_auth callback (returns 200+X-Forwarded-User or 401)
+    GET /chat/login          -> 302 to Forgejo OAuth authorize
+    GET /chat/oauth/callback -> exchange code for token, validate user, set session
+    GET /chat/               -> serves index.html (session required)
+    GET /chat/static/*       -> serves static assets (session required)
+    POST /chat               -> spawns `claude --print` with user message (session required)
+    GET /ws                  -> reserved for future streaming upgrade (returns 501)
+
+OAuth flow:
+    1. User hits any /chat/* route without a valid session cookie -> 302 /chat/login
+    2. /chat/login redirects to Forgejo /login/oauth/authorize
+    3. Forgejo redirects back to /chat/oauth/callback with ?code=...&state=...
+    4. Server exchanges code for access token, fetches /api/v1/user
+    5. Asserts user is in allowlist, sets HttpOnly session cookie
+    6. Redirects to /chat/
+
+The claude binary is expected to be mounted from the host at /usr/local/bin/claude.
+"""
+
+import datetime
+import json
+import os
+import re
+import secrets
+import subprocess
+import sys
+import time
+from http.server import HTTPServer, BaseHTTPRequestHandler
+from urllib.parse import urlparse, parse_qs, urlencode
+
+# Configuration
+HOST = os.environ.get("CHAT_HOST", "0.0.0.0")
+PORT = int(os.environ.get("CHAT_PORT", 8080))
+UI_DIR = "/var/chat/ui"
+STATIC_DIR = os.path.join(UI_DIR, "static")
+CLAUDE_BIN = "/usr/local/bin/claude"
+
+# OAuth configuration
+FORGE_URL = os.environ.get("FORGE_URL", "http://localhost:3000")
+CHAT_OAUTH_CLIENT_ID = os.environ.get("CHAT_OAUTH_CLIENT_ID", "")
+CHAT_OAUTH_CLIENT_SECRET = os.environ.get("CHAT_OAUTH_CLIENT_SECRET", "")
+EDGE_TUNNEL_FQDN = os.environ.get("EDGE_TUNNEL_FQDN", "")
+
+# Shared secret for Caddy forward_auth verify endpoint (#709).
+# When set, only requests carrying this value in X-Forward-Auth-Secret are
+# allowed to call /chat/auth/verify.  When empty the endpoint is unrestricted
+# (acceptable during local dev; production MUST set this).
+FORWARD_AUTH_SECRET = os.environ.get("FORWARD_AUTH_SECRET", "")
+
+# Rate limiting / cost caps (#711)
+CHAT_MAX_REQUESTS_PER_HOUR = int(os.environ.get("CHAT_MAX_REQUESTS_PER_HOUR", 60))
+CHAT_MAX_REQUESTS_PER_DAY = int(os.environ.get("CHAT_MAX_REQUESTS_PER_DAY", 500))
+CHAT_MAX_TOKENS_PER_DAY = int(os.environ.get("CHAT_MAX_TOKENS_PER_DAY", 1000000))
+
+# Allowed users - disinto-admin always allowed; CSV allowlist extends it
+_allowed_csv = os.environ.get("DISINTO_CHAT_ALLOWED_USERS", "")
+ALLOWED_USERS = {"disinto-admin"}
+if _allowed_csv:
+    ALLOWED_USERS.update(u.strip() for u in _allowed_csv.split(",") if u.strip())
+
+# Session cookie name
+SESSION_COOKIE = "disinto_chat_session"
+
+# Session TTL: 24 hours
+SESSION_TTL = 24 * 60 * 60
+
+# Chat history directory (bind-mounted from host)
+CHAT_HISTORY_DIR = os.environ.get("CHAT_HISTORY_DIR", "/var/lib/chat/history")
+
+# Regex for valid conversation_id (12-char hex, no slashes)
+CONVERSATION_ID_PATTERN = re.compile(r"^[0-9a-f]{12}$")
+
+# In-memory session store: token -> {"user": str, "expires": float}
+_sessions = {}
+
+# Pending OAuth state tokens: state -> expires (float)
+_oauth_states = {}
+
+# Per-user rate limiting state (#711)
+# user -> list of request timestamps (for sliding-window hourly/daily caps)
+_request_log = {}
+# user -> {"tokens": int, "date": "YYYY-MM-DD"}
+_daily_tokens = {}
+
+# MIME types for static files
+MIME_TYPES = {
+    ".html": "text/html; charset=utf-8",
+    ".js": "application/javascript; charset=utf-8",
+    ".css": "text/css; charset=utf-8",
+    ".json": "application/json; charset=utf-8",
+    ".png": "image/png",
+    ".jpg": "image/jpeg",
+    ".svg": "image/svg+xml",
+    ".ico": "image/x-icon",
+}
+
+
+def _build_callback_uri():
+    """Build the OAuth callback URI based on tunnel configuration."""
+    if EDGE_TUNNEL_FQDN:
+        return f"https://{EDGE_TUNNEL_FQDN}/chat/oauth/callback"
+    return "http://localhost/chat/oauth/callback"
+
+
+def _session_cookie_flags():
+    """Return cookie flags appropriate for the deployment mode."""
+    flags = "HttpOnly; SameSite=Lax; Path=/chat"
+    if EDGE_TUNNEL_FQDN:
+        flags += "; Secure"
+    return flags
+
+
+def _validate_session(cookie_header):
+    """Check session cookie and return username if valid, else None."""
+    if not cookie_header:
+        return None
+    for part in cookie_header.split(";"):
+        part = part.strip()
+        if part.startswith(SESSION_COOKIE + "="):
+            token = part[len(SESSION_COOKIE) + 1:]
+            session = _sessions.get(token)
+            if session and session["expires"] > time.time():
+                return session["user"]
+            # Expired - clean up
+            _sessions.pop(token, None)
+            return None
+    return None
+
+
+def _gc_sessions():
+    """Remove expired sessions (called opportunistically)."""
+    now = time.time()
+    expired = [k for k, v in _sessions.items() if v["expires"] <= now]
+    for k in expired:
+        del _sessions[k]
+    expired_states = [k for k, v in _oauth_states.items() if v <= now]
+    for k in expired_states:
+        del _oauth_states[k]
+
+
+def _exchange_code_for_token(code):
+    """Exchange an authorization code for an access token via Forgejo."""
+    import urllib.request
+    import urllib.error
+
+    data = urlencode({
+        "grant_type": "authorization_code",
+        "code": code,
+        "client_id": CHAT_OAUTH_CLIENT_ID,
+        "client_secret": CHAT_OAUTH_CLIENT_SECRET,
+        "redirect_uri": _build_callback_uri(),
+    }).encode()
+
+    req = urllib.request.Request(
+        f"{FORGE_URL}/login/oauth/access_token",
+        data=data,
+        headers={"Accept": "application/json", "Content-Type": "application/x-www-form-urlencoded"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return json.loads(resp.read().decode())
+    except (urllib.error.URLError, json.JSONDecodeError, OSError) as e:
+        print(f"OAuth token exchange failed: {e}", file=sys.stderr)
+        return None
+
+
+def _fetch_user(access_token):
+    """Fetch the authenticated user from Forgejo API."""
+    import urllib.request
+    import urllib.error
+
+    req = urllib.request.Request(
+        f"{FORGE_URL}/api/v1/user",
+        headers={"Authorization": f"token {access_token}", "Accept": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return json.loads(resp.read().decode())
+    except (urllib.error.URLError, json.JSONDecodeError, OSError) as e:
+        print(f"User fetch failed: {e}", file=sys.stderr)
+        return None
+
+
+# =============================================================================
+# Rate Limiting Functions (#711)
+# =============================================================================
+
+def _check_rate_limit(user):
+    """Check per-user rate limits. Returns (allowed, retry_after, reason) (#711).
+
+    Checks hourly request cap, daily request cap, and daily token cap.
+    """
+    now = time.time()
+    one_hour_ago = now - 3600
+    today = datetime.date.today().isoformat()
+
+    # Prune old entries from request log
+    timestamps = _request_log.get(user, [])
+    timestamps = [t for t in timestamps if t > now - 86400]
+    _request_log[user] = timestamps
+
+    # Hourly request cap
+    hourly = [t for t in timestamps if t > one_hour_ago]
+    if len(hourly) >= CHAT_MAX_REQUESTS_PER_HOUR:
+        oldest_in_window = min(hourly)
+        retry_after = int(oldest_in_window + 3600 - now) + 1
+        return False, max(retry_after, 1), "hourly request limit"
+
+    # Daily request cap
+    start_of_day = time.mktime(datetime.date.today().timetuple())
+    daily = [t for t in timestamps if t >= start_of_day]
+    if len(daily) >= CHAT_MAX_REQUESTS_PER_DAY:
+        next_day = start_of_day + 86400
+        retry_after = int(next_day - now) + 1
+        return False, max(retry_after, 1), "daily request limit"
+
+    # Daily token cap
+    token_info = _daily_tokens.get(user, {"tokens": 0, "date": today})
+    if token_info["date"] != today:
+        token_info = {"tokens": 0, "date": today}
+        _daily_tokens[user] = token_info
+    if token_info["tokens"] >= CHAT_MAX_TOKENS_PER_DAY:
+        next_day = start_of_day + 86400
+        retry_after = int(next_day - now) + 1
+        return False, max(retry_after, 1), "daily token limit"
+
+    return True, 0, ""
+
+
+def _record_request(user):
+    """Record a request timestamp for the user (#711)."""
+    _request_log.setdefault(user, []).append(time.time())
+
+
+def _record_tokens(user, tokens):
+    """Record token usage for the user (#711)."""
+    today = datetime.date.today().isoformat()
+    token_info = _daily_tokens.get(user, {"tokens": 0, "date": today})
+    if token_info["date"] != today:
+        token_info = {"tokens": 0, "date": today}
+    token_info["tokens"] += tokens
+    _daily_tokens[user] = token_info
+
+
+def _parse_stream_json(output):
+    """Parse stream-json output from claude --print (#711).
+
+    Returns (text_content, total_tokens).  Falls back gracefully if the
+    usage event is absent or malformed.
+    """
+    text_parts = []
+    total_tokens = 0
+
+    for line in output.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+
+        etype = event.get("type", "")
+
+        # Collect assistant text
+        if etype == "content_block_delta":
+            delta = event.get("delta", {})
+            if delta.get("type") == "text_delta":
+                text_parts.append(delta.get("text", ""))
+        elif etype == "assistant":
+            # Full assistant message (non-streaming)
+            content = event.get("content", "")
+            if isinstance(content, str) and content:
+                text_parts.append(content)
+            elif isinstance(content, list):
+                for block in content:
+                    if isinstance(block, dict) and block.get("text"):
+                        text_parts.append(block["text"])
+
+        # Parse usage from result event
+        if etype == "result":
+            usage = event.get("usage", {})
+            total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
+        elif "usage" in event:
+            usage = event["usage"]
+            if isinstance(usage, dict):
+                total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
+
+    return "".join(text_parts), total_tokens
+
+
+# =============================================================================
+# Conversation History Functions (#710)
+# =============================================================================
+
+def _generate_conversation_id():
+    """Generate a new conversation ID (12-char hex string)."""
+    return secrets.token_hex(6)
+
+
+def _validate_conversation_id(conv_id):
+    """Validate that conversation_id matches the required format."""
+    return bool(CONVERSATION_ID_PATTERN.match(conv_id))
+
+
+def _get_user_history_dir(user):
+    """Get the history directory path for a user."""
+    return os.path.join(CHAT_HISTORY_DIR, user)
+
+
+def _get_conversation_path(user, conv_id):
+    """Get the full path to a conversation file."""
+    user_dir = _get_user_history_dir(user)
+    return os.path.join(user_dir, f"{conv_id}.ndjson")
+
+
+def _ensure_user_dir(user):
+    """Ensure the user's history directory exists."""
+    user_dir = _get_user_history_dir(user)
+    os.makedirs(user_dir, exist_ok=True)
+    return user_dir
+
+
+def _write_message(user, conv_id, role, content):
+    """Append a message to a conversation file in NDJSON format."""
+    conv_path = _get_conversation_path(user, conv_id)
+    _ensure_user_dir(user)
+
+    record = {
+        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "user": user,
+        "role": role,
+        "content": content,
+    }
+
+    with open(conv_path, "a", encoding="utf-8") as f:
+        f.write(json.dumps(record, ensure_ascii=False) + "\n")
+
+
+def _read_conversation(user, conv_id):
+    """Read all messages from a conversation file."""
+    conv_path = _get_conversation_path(user, conv_id)
+    messages = []
+
+    if not os.path.exists(conv_path):
+        return None
+
+    try:
+        with open(conv_path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line:
+                    try:
+                        messages.append(json.loads(line))
+                    except json.JSONDecodeError:
+                        # Skip malformed lines
+                        continue
+    except IOError:
+        return None
+
+    return messages
+
+
+def _list_user_conversations(user):
+    """List all conversation files for a user with first message preview."""
+    user_dir = _get_user_history_dir(user)
+    conversations = []
+
+    if not os.path.exists(user_dir):
+        return conversations
+
+    try:
+        for filename in os.listdir(user_dir):
+            if not filename.endswith(".ndjson"):
+                continue
+
+            conv_id = filename[:-7]  # Remove .ndjson extension
+            if not _validate_conversation_id(conv_id):
+                continue
+
+            conv_path = os.path.join(user_dir, filename)
+            messages = _read_conversation(user, conv_id)
+
+            if messages:
+                first_msg = messages[0]
+                preview = first_msg.get("content", "")[:50]
+                if len(first_msg.get("content", "")) > 50:
+                    preview += "..."
+                conversations.append({
+                    "id": conv_id,
+                    "created_at": first_msg.get("ts", ""),
+                    "preview": preview,
+                    "message_count": len(messages),
+                })
+            else:
+                # Empty conversation file
+                conversations.append({
+                    "id": conv_id,
+                    "created_at": "",
+                    "preview": "(empty)",
+                    "message_count": 0,
+                })
+    except OSError:
+        pass
+
+    # Sort by created_at descending
+    conversations.sort(key=lambda x: x["created_at"] or "", reverse=True)
+    return conversations
+
+
+def _delete_conversation(user, conv_id):
+    """Delete a conversation file."""
+    conv_path = _get_conversation_path(user, conv_id)
+    if os.path.exists(conv_path):
+        os.remove(conv_path)
+        return True
+    return False
+
+
+class ChatHandler(BaseHTTPRequestHandler):
+    """HTTP request handler for disinto-chat with Forgejo OAuth."""
+
+    def log_message(self, format, *args):
+        """Log to stderr."""
+        print(f"[{self.log_date_time_string()}] {format % args}", file=sys.stderr)
+
+    def send_error_page(self, code, message=None):
+        """Custom error response."""
+        self.send_response(code)
+        self.send_header("Content-Type", "text/plain; charset=utf-8")
+        self.end_headers()
+        if message:
+            self.wfile.write(message.encode("utf-8"))
+
+    def _require_session(self):
+        """Check session; redirect to /chat/login if missing. Returns username or None."""
+        user = _validate_session(self.headers.get("Cookie"))
+        if user:
+            return user
+        self.send_response(302)
+        self.send_header("Location", "/chat/login")
+        self.end_headers()
+        return None
+
+    def _check_forwarded_user(self, session_user):
+        """Defense-in-depth: verify X-Forwarded-User matches session user (#709).
+
+        Returns True if the request may proceed, False if a 403 was sent.
+        When X-Forwarded-User is absent (forward_auth removed from Caddy),
+        the request is rejected - fail-closed by design.
+        """
+        forwarded = self.headers.get("X-Forwarded-User")
+        if not forwarded:
+            rid = self.headers.get("X-Request-Id", "-")
+            print(
+                f"WARN: missing X-Forwarded-User for session_user={session_user} "
+                f"req_id={rid} - fail-closed (#709)",
+                file=sys.stderr,
+            )
+            self.send_error_page(403, "Forbidden: missing forwarded-user header")
+            return False
+        if forwarded != session_user:
+            rid = self.headers.get("X-Request-Id", "-")
+            print(
+                f"WARN: X-Forwarded-User mismatch: header={forwarded} "
+                f"session={session_user} req_id={rid} (#709)",
+                file=sys.stderr,
+            )
+            self.send_error_page(403, "Forbidden: user identity mismatch")
+            return False
+        return True
+
+    def do_GET(self):
+        """Handle GET requests."""
+        parsed = urlparse(self.path)
+        path = parsed.path
+
+        # Verify endpoint for Caddy forward_auth (#709)
+        if path == "/chat/auth/verify":
+            self.handle_auth_verify()
+            return
+
+        # OAuth routes (no session required)
+        if path == "/chat/login":
+            self.handle_login()
+            return
+
+        if path == "/chat/oauth/callback":
+            self.handle_oauth_callback(parsed.query)
+            return
+
+        # Conversation list endpoint: GET /chat/history
+        if path == "/chat/history":
+            user = self._require_session()
+            if not user:
+                return
+            if not self._check_forwarded_user(user):
+                return
+            self.handle_conversation_list(user)
+            return
+
+        # Single conversation endpoint: GET /chat/history/<id>
+        if path.startswith("/chat/history/"):
+            user = self._require_session()
+            if not user:
+                return
+            if not self._check_forwarded_user(user):
+                return
+            conv_id = path[len("/chat/history/"):]
+            self.handle_conversation_get(user, conv_id)
+            return
+
+        # Serve index.html at root
+        if path in ("/", "/chat", "/chat/"):
+            user = self._require_session()
+            if not user:
+                return
+            if not self._check_forwarded_user(user):
+                return
+            self.serve_index()
+            return
+
+        # Serve static files
+        if path.startswith("/chat/static/") or path.startswith("/static/"):
+            user = self._require_session()
+            if not user:
+                return
+            if not self._check_forwarded_user(user):
+                return
+            self.serve_static(path)
+            return
+
+        # Reserved WebSocket endpoint (future use)
+        if path == "/ws" or path.startswith("/ws"):
+            self.send_error_page(501, "WebSocket upgrade not yet implemented")
+            return
+
+        # 404 for unknown paths
+        self.send_error_page(404, "Not found")
+
+    def do_POST(self):
+        """Handle POST requests."""
+        parsed = urlparse(self.path)
+        path = parsed.path
+
+        # New conversation endpoint (session required)
+        if path == "/chat/new":
+            user = self._require_session()
+            if not user:
+                return
+            if not self._check_forwarded_user(user):
+                return
+            self.handle_new_conversation(user)
+            return
+
+        # Chat endpoint (session required)
+        if path in ("/chat", "/chat/"):
+            user = self._require_session()
+            if not user:
+                return
+            if not self._check_forwarded_user(user):
+                return
+            self.handle_chat(user)
+            return
+
+        # 404 for unknown paths
+        self.send_error_page(404, "Not found")
+
+    def handle_auth_verify(self):
+        """Caddy forward_auth callback - validate session and return X-Forwarded-User (#709).
+
+        Caddy calls this endpoint for every /chat/* request.  If the session
+        cookie is valid the endpoint returns 200 with the X-Forwarded-User
+        header set to the session username.  Otherwise it returns 401 so Caddy
+        knows the request is unauthenticated.
+
+        Access control: when FORWARD_AUTH_SECRET is configured, the request must
+        carry a matching X-Forward-Auth-Secret header (shared secret between
+        Caddy and the chat backend).
+        """
+        # Shared-secret gate
+        if FORWARD_AUTH_SECRET:
+            provided = self.headers.get("X-Forward-Auth-Secret", "")
+            if not secrets.compare_digest(provided, FORWARD_AUTH_SECRET):
+                self.send_error_page(403, "Forbidden: invalid forward-auth secret")
+                return
+
+        user = _validate_session(self.headers.get("Cookie"))
+        if not user:
+            self.send_error_page(401, "Unauthorized: no valid session")
+            return
+
+        self.send_response(200)
+        self.send_header("X-Forwarded-User", user)
+        self.send_header("Content-Type", "text/plain; charset=utf-8")
+        self.end_headers()
+        self.wfile.write(b"ok")
+
+    def handle_login(self):
+        """Redirect to Forgejo OAuth authorize endpoint."""
+        _gc_sessions()
+
+        if not CHAT_OAUTH_CLIENT_ID:
+            self.send_error_page(500, "Chat OAuth not configured (CHAT_OAUTH_CLIENT_ID missing)")
+            return
+
+        state = secrets.token_urlsafe(32)
+        _oauth_states[state] = time.time() + 600  # 10 min validity
+
+        params = urlencode({
+            "client_id": CHAT_OAUTH_CLIENT_ID,
+            "redirect_uri": _build_callback_uri(),
+            "response_type": "code",
+            "state": state,
+        })
+        self.send_response(302)
+        self.send_header("Location", f"{FORGE_URL}/login/oauth/authorize?{params}")
+        self.end_headers()
+
+    def handle_oauth_callback(self, query_string):
+        """Exchange authorization code for token, validate user, set session."""
+        params = parse_qs(query_string)
+        code = params.get("code", [""])[0]
+        state = params.get("state", [""])[0]
+
+        # Validate state
+        expected_expiry = _oauth_states.pop(state, None) if state else None
+        if not expected_expiry or expected_expiry < time.time():
+            self.send_error_page(400, "Invalid or expired OAuth state")
+            return
+
+        if not code:
+            self.send_error_page(400, "Missing authorization code")
+            return
+
+        # Exchange code for access token
+        token_resp = _exchange_code_for_token(code)
+        if not token_resp or "access_token" not in token_resp:
+            self.send_error_page(502, "Failed to obtain access token from Forgejo")
+            return
+
+        access_token = token_resp["access_token"]
+
+        # Fetch user info
+        user_info = _fetch_user(access_token)
+        if not user_info or "login" not in user_info:
+            self.send_error_page(502, "Failed to fetch user info from Forgejo")
+            return
+
+        username = user_info["login"]
+
+        # Check allowlist
+        if username not in ALLOWED_USERS:
+            self.send_response(403)
+            self.send_header("Content-Type", "text/plain; charset=utf-8")
+            self.end_headers()
+            self.wfile.write(
+                f"Not authorised: user '{username}' is not in the allowed users list.\n".encode()
+            )
+            return
+
+        # Create session
+        session_token = secrets.token_urlsafe(48)
+        _sessions[session_token] = {
+            "user": username,
+            "expires": time.time() + SESSION_TTL,
+        }
+
+        cookie_flags = _session_cookie_flags()
+        self.send_response(302)
+        self.send_header("Set-Cookie", f"{SESSION_COOKIE}={session_token}; {cookie_flags}")
+        self.send_header("Location", "/chat/")
+        self.end_headers()
+
+    def serve_index(self):
+        """Serve the main index.html file."""
+        index_path = os.path.join(UI_DIR, "index.html")
+        if not os.path.exists(index_path):
+            self.send_error_page(500, "UI not found")
+            return
+
+        try:
+            with open(index_path, "r", encoding="utf-8") as f:
+                content = f.read()
+            self.send_response(200)
+            self.send_header("Content-Type", MIME_TYPES[".html"])
+            self.send_header("Content-Length", len(content.encode("utf-8")))
+            self.end_headers()
+            self.wfile.write(content.encode("utf-8"))
+        except IOError as e:
+            self.send_error_page(500, f"Error reading index.html: {e}")
+
+    def serve_static(self, path):
+        """Serve static files from the static directory."""
+        # Strip /chat/static/ or /static/ prefix
+        if path.startswith("/chat/static/"):
+            relative_path = path[len("/chat/static/"):]
+        else:
+            relative_path = path[len("/static/"):]
+
+        if ".." in relative_path or relative_path.startswith("/"):
+            self.send_error_page(403, "Forbidden")
+            return
+
+        file_path = os.path.join(STATIC_DIR, relative_path)
+        if not os.path.exists(file_path):
+            self.send_error_page(404, "Not found")
+            return
+
+        # Determine MIME type
+        _, ext = os.path.splitext(file_path)
+        content_type = MIME_TYPES.get(ext.lower(), "application/octet-stream")
+
+        try:
+            with open(file_path, "rb") as f:
+                content = f.read()
+            self.send_response(200)
+            self.send_header("Content-Type", content_type)
+            self.send_header("Content-Length", len(content))
+            self.end_headers()
+            self.wfile.write(content)
+        except IOError as e:
+            self.send_error_page(500, f"Error reading file: {e}")
+
+    def _send_rate_limit_response(self, retry_after, reason):
+        """Send a 429 response with Retry-After header and HTMX fragment (#711)."""
+        body = (
+            f'<div class="rate-limit-error">'
+            f"Rate limit exceeded: {reason}. "
+            f"Please try again in {retry_after} seconds."
+            f"</div>"
+        )
+        self.send_response(429)
+        self.send_header("Retry-After", str(retry_after))
+        self.send_header("Content-Type", "text/html; charset=utf-8")
+        self.send_header("Content-Length", str(len(body.encode("utf-8"))))
+        self.end_headers()
+        self.wfile.write(body.encode("utf-8"))
+
+    def handle_chat(self, user):
+        """
+        Handle chat requests by spawning `claude --print` with the user message.
+        Enforces per-user rate limits and tracks token usage (#711).
+        """
+
+        # Check rate limits before processing (#711)
+        allowed, retry_after, reason = _check_rate_limit(user)
+        if not allowed:
+            self._send_rate_limit_response(retry_after, reason)
+            return
+
+        # Read request body
+        content_length = int(self.headers.get("Content-Length", 0))
+        if content_length == 0:
+            self.send_error_page(400, "No message provided")
+            return
+
+        body = self.rfile.read(content_length)
+        try:
+            # Parse form-encoded body
+            body_str = body.decode("utf-8")
+            params = parse_qs(body_str)
+            message = params.get("message", [""])[0]
+            conv_id = params.get("conversation_id", [None])[0]
+        except (UnicodeDecodeError, KeyError):
+            self.send_error_page(400, "Invalid message format")
+            return
+
+        if not message:
+            self.send_error_page(400, "Empty message")
+            return
+
+        # Get user from session
+        user = _validate_session(self.headers.get("Cookie"))
+        if not user:
+            self.send_error_page(401, "Unauthorized")
+            return
+
+        # Validate Claude binary exists
+        if not os.path.exists(CLAUDE_BIN):
+            self.send_error_page(500, "Claude CLI not found")
+            return
+
+        # Generate new conversation ID if not provided
+        if not conv_id or not _validate_conversation_id(conv_id):
+            conv_id = _generate_conversation_id()
+
+        # Record request for rate limiting (#711)
+        _record_request(user)
+
+        try:
+            # Save user message to history
+            _write_message(user, conv_id, "user", message)
+
+            # Spawn claude --print with stream-json for token tracking (#711)
+            proc = subprocess.Popen(
+                [CLAUDE_BIN, "--print", "--output-format", "stream-json", message],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+            )
+
+            raw_output = proc.stdout.read()
+
+            error_output = proc.stderr.read()
+            if error_output:
+                print(f"Claude stderr: {error_output}", file=sys.stderr)
+
+            proc.wait()
+
+            if proc.returncode != 0:
+                self.send_error_page(500, f"Claude CLI failed with exit code {proc.returncode}")
+                return
+
+            # Parse stream-json for text and token usage (#711)
+            response, total_tokens = _parse_stream_json(raw_output)
+
+            # Track token usage - does not block *this* request (#711)
+            if total_tokens > 0:
+                _record_tokens(user, total_tokens)
+                print(
+                    f"Token usage: user={user} tokens={total_tokens}",
+                    file=sys.stderr,
+                )
+
+            # Fall back to raw output if stream-json parsing yielded no text
+            if not response:
+                response = raw_output
+
+            # Save assistant response to history
+            _write_message(user, conv_id, "assistant", response)
+
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json; charset=utf-8")
+            self.end_headers()
+            self.wfile.write(json.dumps({
+                "response": response,
+                "conversation_id": conv_id,
+            }, ensure_ascii=False).encode("utf-8"))
+
+        except FileNotFoundError:
+            self.send_error_page(500, "Claude CLI not found")
+        except Exception as e:
+            self.send_error_page(500, f"Error: {e}")
+
+    # =======================================================================
+    # Conversation History Handlers
+    # =======================================================================
+
+    def handle_conversation_list(self, user):
+        """List all conversations for the logged-in user."""
+        conversations = _list_user_conversations(user)
+
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json; charset=utf-8")
+        self.end_headers()
+        self.wfile.write(json.dumps(conversations, ensure_ascii=False).encode("utf-8"))
+
+    def handle_conversation_get(self, user, conv_id):
+        """Get a specific conversation for the logged-in user."""
+        # Validate conversation_id format
+        if not _validate_conversation_id(conv_id):
+            self.send_error_page(400, "Invalid conversation ID")
+            return
+
+        messages = _read_conversation(user, conv_id)
+
+        if messages is None:
+            self.send_error_page(404, "Conversation not found")
+            return
+
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json; charset=utf-8")
+        self.end_headers()
+        self.wfile.write(json.dumps(messages, ensure_ascii=False).encode("utf-8"))
+
+    def handle_conversation_delete(self, user, conv_id):
+        """Delete a specific conversation for the logged-in user."""
+        # Validate conversation_id format
+        if not _validate_conversation_id(conv_id):
+            self.send_error_page(400, "Invalid conversation ID")
+            return
+
+        if _delete_conversation(user, conv_id):
+            self.send_response(204)  # No Content
+            self.end_headers()
+        else:
+            self.send_error_page(404, "Conversation not found")
+
+    def handle_new_conversation(self, user):
+        """Create a new conversation and return its ID."""
+        conv_id = _generate_conversation_id()
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json; charset=utf-8")
+        self.end_headers()
+        self.wfile.write(json.dumps({"conversation_id": conv_id}, ensure_ascii=False).encode("utf-8"))
+
+    def do_DELETE(self):
+        """Handle DELETE requests."""
+        parsed = urlparse(self.path)
+        path = parsed.path
+
+        # Delete conversation endpoint
+        if path.startswith("/chat/history/"):
+            user = self._require_session()
+            if not user:
+                return
+            if not self._check_forwarded_user(user):
+                return
+            conv_id = path[len("/chat/history/"):]
+            self.handle_conversation_delete(user, conv_id)
+            return
+
+        # 404 for unknown paths
+        self.send_error_page(404, "Not found")
+
+
+def main():
+    """Start the HTTP server."""
+    server_address = (HOST, PORT)
+    httpd = HTTPServer(server_address, ChatHandler)
+    print(f"Starting disinto-chat server on {HOST}:{PORT}", file=sys.stderr)
+    print(f"UI available at http://localhost:{PORT}/chat/", file=sys.stderr)
+    if CHAT_OAUTH_CLIENT_ID:
+        print(f"OAuth enabled (client_id={CHAT_OAUTH_CLIENT_ID[:8]}...)", file=sys.stderr)
+        print(f"Allowed users: {', '.join(sorted(ALLOWED_USERS))}", file=sys.stderr)
+    else:
+        print("WARNING: CHAT_OAUTH_CLIENT_ID not set - OAuth disabled", file=sys.stderr)
+    if FORWARD_AUTH_SECRET:
+        print("forward_auth secret configured (#709)", file=sys.stderr)
+    else:
+        print("WARNING: FORWARD_AUTH_SECRET not set - verify endpoint unrestricted", file=sys.stderr)
+    print(
+        f"Rate limits (#711): {CHAT_MAX_REQUESTS_PER_HOUR}/hr, "
+        f"{CHAT_MAX_REQUESTS_PER_DAY}/day, "
+        f"{CHAT_MAX_TOKENS_PER_DAY} tokens/day",
+        file=sys.stderr,
+    )
+    httpd.serve_forever()
+
+
+if __name__ == "__main__":
+    main()
--- a/docker/chat/ui/index.html
+++ b/docker/chat/ui/index.html
@ -0,0 +1,521 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>disinto-chat</title>
+    <script src="/static/htmx.min.js"></script>
+    <style>
+        * {
+            box-sizing: border-box;
+            margin: 0;
+            padding: 0;
+        }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, sans-serif;
+            background: #1a1a2e;
+            color: #eaeaea;
+            min-height: 100vh;
+            display: flex;
+        }
+        /* Sidebar styles */
+        .sidebar {
+            width: 280px;
+            background: #16213e;
+            border-right: 1px solid #0f3460;
+            display: flex;
+            flex-direction: column;
+            height: 100vh;
+            position: fixed;
+            left: 0;
+            top: 0;
+            z-index: 100;
+        }
+        .sidebar-header {
+            padding: 1rem;
+            border-bottom: 1px solid #0f3460;
+        }
+        .sidebar-header h1 {
+            font-size: 1.25rem;
+            font-weight: 600;
+            margin-bottom: 0.5rem;
+        }
+        .new-chat-btn {
+            width: 100%;
+            background: #e94560;
+            color: white;
+            border: none;
+            border-radius: 6px;
+            padding: 0.75rem 1rem;
+            font-size: 0.9rem;
+            font-weight: 600;
+            cursor: pointer;
+            transition: background 0.2s;
+        }
+        .new-chat-btn:hover {
+            background: #d63447;
+        }
+        .new-chat-btn:disabled {
+            background: #555;
+            cursor: not-allowed;
+        }
+        .conversations-list {
+            flex: 1;
+            overflow-y: auto;
+            padding: 0.5rem;
+        }
+        .conversation-item {
+            padding: 0.75rem 1rem;
+            border-radius: 6px;
+            cursor: pointer;
+            margin-bottom: 0.25rem;
+            transition: background 0.2s;
+            border: 1px solid transparent;
+        }
+        .conversation-item:hover {
+            background: #1a1a2e;
+        }
+        .conversation-item.active {
+            background: #0f3460;
+            border-color: #e94560;
+        }
+        .conversation-item .preview {
+            font-size: 0.875rem;
+            white-space: nowrap;
+            overflow: hidden;
+            text-overflow: ellipsis;
+            opacity: 0.9;
+        }
+        .conversation-item .meta {
+            font-size: 0.75rem;
+            opacity: 0.6;
+            margin-top: 0.25rem;
+        }
+        .conversation-item .message-count {
+            float: right;
+            font-size: 0.7rem;
+            background: #0f3460;
+            padding: 0.125rem 0.5rem;
+            border-radius: 10px;
+        }
+        .main-content {
+            margin-left: 280px;
+            display: flex;
+            flex-direction: column;
+            width: 100%;
+            height: 100vh;
+        }
+        header {
+            background: #16213e;
+            padding: 1rem 2rem;
+            border-bottom: 1px solid #0f3460;
+        }
+        header h1 {
+            font-size: 1.25rem;
+            font-weight: 600;
+        }
+        main {
+            flex: 1;
+            display: flex;
+            flex-direction: column;
+            max-width: 900px;
+            margin: 0 auto;
+            width: 100%;
+            padding: 1rem;
+        }
+        #messages {
+            flex: 1;
+            overflow-y: auto;
+            padding: 1rem;
+            background: #16213e;
+            border-radius: 8px;
+            margin-bottom: 1rem;
+        }
+        .message {
+            margin-bottom: 1rem;
+            padding: 0.75rem 1rem;
+            border-radius: 8px;
+            line-height: 1.5;
+        }
+        .message.user {
+            background: #0f3460;
+            margin-left: 2rem;
+        }
+        .message.assistant {
+            background: #1a1a2e;
+            margin-right: 2rem;
+        }
+        .message.system {
+            background: #1a1a2e;
+            font-style: italic;
+            color: #888;
+            text-align: center;
+        }
+        .message .role {
+            font-weight: 600;
+            font-size: 0.875rem;
+            margin-bottom: 0.25rem;
+            opacity: 0.8;
+        }
+        .message .content {
+            white-space: pre-wrap;
+            word-wrap: break-word;
+        }
+        .input-area {
+            display: flex;
+            gap: 0.5rem;
+            padding: 1rem;
+            background: #16213e;
+            border-radius: 8px;
+        }
+        textarea {
+            flex: 1;
+            background: #1a1a2e;
+            border: 1px solid #0f3460;
+            border-radius: 6px;
+            padding: 0.75rem;
+            color: #eaeaea;
+            font-family: inherit;
+            font-size: 1rem;
+            resize: none;
+            min-height: 80px;
+        }
+        textarea:focus {
+            outline: none;
+            border-color: #e94560;
+        }
+        button {
+            background: #e94560;
+            color: white;
+            border: none;
+            border-radius: 6px;
+            padding: 0.75rem 1.5rem;
+            font-size: 1rem;
+            font-weight: 600;
+            cursor: pointer;
+            transition: background 0.2s;
+        }
+        button:hover {
+            background: #d63447;
+        }
+        button:disabled {
+            background: #555;
+            cursor: not-allowed;
+        }
+        .loading {
+            opacity: 0.6;
+        }
+        .empty-state {
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            justify-content: center;
+            height: 100%;
+            color: #888;
+            text-align: center;
+        }
+        .empty-state p {
+            margin-top: 1rem;
+        }
+        /* Responsive sidebar toggle */
+        .sidebar-toggle {
+            display: none;
+            position: fixed;
+            top: 1rem;
+            left: 1rem;
+            z-index: 200;
+            background: #e94560;
+            color: white;
+            border: none;
+            border-radius: 6px;
+            padding: 0.5rem;
+            cursor: pointer;
+        }
+        @media (max-width: 768px) {
+            .sidebar {
+                transform: translateX(-100%);
+                transition: transform 0.3s;
+            }
+            .sidebar.open {
+                transform: translateX(0);
+            }
+            .sidebar-toggle {
+                display: block;
+            }
+            .main-content {
+                margin-left: 0;
+            }
+        }
+    </style>
+</head>
+<body>
+    <button class="sidebar-toggle" id="sidebar-toggle">☰</button>
+    <aside class="sidebar" id="sidebar">
+        <div class="sidebar-header">
+            <h1>disinto-chat</h1>
+            <button class="new-chat-btn" id="new-chat-btn">+ New Chat</button>
+        </div>
+        <div class="conversations-list" id="conversations-list">
+            <!-- Conversations will be loaded here -->
+        </div>
+    </aside>
+    <div class="main-content">
+        <header>
+            <h1>disinto-chat</h1>
+        </header>
+        <main>
+            <div id="messages">
+                <div class="message system">
+                    <div class="role">system</div>
+                    <div class="content">Welcome to disinto-chat. Type a message to start chatting with Claude.</div>
+                </div>
+            </div>
+            <form class="input-area" id="chat-form">
+                <textarea name="message" placeholder="Type your message..." required></textarea>
+                <button type="submit" id="send-btn">Send</button>
+            </form>
+        </main>
+    </div>
+
+    <script>
+        // State
+        let currentConversationId = null;
+        let conversations = [];
+
+        // DOM elements
+        const messagesDiv = document.getElementById('messages');
+        const sendBtn = document.getElementById('send-btn');
+        const textarea = document.querySelector('textarea');
+        const conversationsList = document.getElementById('conversations-list');
+        const newChatBtn = document.getElementById('new-chat-btn');
+        const sidebar = document.getElementById('sidebar');
+        const sidebarToggle = document.getElementById('sidebar-toggle');
+
+        // Load conversations list
+        async function loadConversations() {
+            try {
+                const response = await fetch('/chat/history');
+                if (response.ok) {
+                    conversations = await response.json();
+                    renderConversationsList();
+                }
+            } catch (error) {
+                console.error('Failed to load conversations:', error);
+            }
+        }
+
+        // Render conversations list
+        function renderConversationsList() {
+            conversationsList.innerHTML = '';
+
+            if (conversations.length === 0) {
+                conversationsList.innerHTML = '<div style="padding: 1rem; color: #888; text-align: center; font-size: 0.875rem;">No conversations yet</div>';
+                return;
+            }
+
+            conversations.forEach(conv => {
+                const item = document.createElement('div');
+                item.className = 'conversation-item';
+                if (conv.id === currentConversationId) {
+                    item.classList.add('active');
+                }
+                item.dataset.conversationId = conv.id;
+
+                const previewDiv = document.createElement('div');
+                previewDiv.className = 'preview';
+                previewDiv.textContent = conv.preview || '(empty)';
+
+                const metaDiv = document.createElement('div');
+                metaDiv.className = 'meta';
+                const date = conv.created_at ? new Date(conv.created_at).toLocaleDateString() : '';
+                metaDiv.innerHTML = `${date} <span class="message-count">${conv.message_count || 0} msg${conv.message_count !== 1 ? 's' : ''}</span>`;
+
+                item.appendChild(previewDiv);
+                item.appendChild(metaDiv);
+
+                item.addEventListener('click', () => loadConversation(conv.id));
+                conversationsList.appendChild(item);
+            });
+        }
+
+        // Load a specific conversation
+        async function loadConversation(convId) {
+            // Early-return if already showing this conversation
+            if (convId === currentConversationId) {
+                return;
+            }
+
+            // Clear messages
+            messagesDiv.innerHTML = '';
+
+            // Update active state in sidebar
+            document.querySelectorAll('.conversation-item').forEach(item => {
+                item.classList.remove('active');
+            });
+            document.querySelector(`[data-conversation-id="${convId}"]`)?.classList.add('active');
+
+            currentConversationId = convId;
+
+            try {
+                const response = await fetch(`/chat/history/${convId}`);
+                if (response.ok) {
+                    const messages = await response.json();
+                    if (messages && messages.length > 0) {
+                        messages.forEach(msg => {
+                            addMessage(msg.role, msg.content);
+                        });
+                    } else {
+                        addSystemMessage('This conversation is empty');
+                    }
+                } else {
+                    addSystemMessage('Failed to load conversation');
+                }
+            } catch (error) {
+                console.error('Failed to load conversation:', error);
+                addSystemMessage('Error loading conversation');
+            }
+
+            // Close sidebar on mobile
+            if (window.innerWidth <= 768) {
+                sidebar.classList.remove('open');
+            }
+        }
+
+        // Create a new conversation
+        async function createNewConversation() {
+            try {
+                const response = await fetch('/chat/new', { method: 'POST' });
+                if (response.ok) {
+                    const data = await response.json();
+                    currentConversationId = data.conversation_id;
+                    messagesDiv.innerHTML = '';
+                    addSystemMessage('New conversation started');
+                    await loadConversations();
+                } else {
+                    addSystemMessage('Failed to create new conversation');
+                }
+            } catch (error) {
+                console.error('Failed to create new conversation:', error);
+                addSystemMessage('Error creating new conversation');
+            }
+        }
+
+        // Add message to display
+        function addMessage(role, content, streaming = false) {
+            const msgDiv = document.createElement('div');
+            msgDiv.className = `message ${role}`;
+            msgDiv.innerHTML = `
+                <div class="role">${role}</div>
+                <div class="content${streaming ? ' streaming' : ''}">${escapeHtml(content)}</div>
+            `;
+            messagesDiv.appendChild(msgDiv);
+            messagesDiv.scrollTop = messagesDiv.scrollHeight;
+            return msgDiv.querySelector('.content');
+        }
+
+        function addSystemMessage(content) {
+            const msgDiv = document.createElement('div');
+            msgDiv.className = 'message system';
+            msgDiv.innerHTML = `
+                <div class="role">system</div>
+                <div class="content">${escapeHtml(content)}</div>
+            `;
+            messagesDiv.appendChild(msgDiv);
+            messagesDiv.scrollTop = messagesDiv.scrollHeight;
+        }
+
+        function escapeHtml(text) {
+            const div = document.createElement('div');
+            div.textContent = text;
+            return div.innerHTML.replace(/\n/g, '<br>');
+        }
+
+        // Send message handler
+        async function sendMessage() {
+            const message = textarea.value.trim();
+            if (!message) return;
+
+            // Disable input
+            textarea.disabled = true;
+            sendBtn.disabled = true;
+            sendBtn.textContent = 'Sending...';
+
+            // Add user message
+            addMessage('user', message);
+            textarea.value = '';
+
+            // If no conversation ID, create one
+            if (!currentConversationId) {
+                await createNewConversation();
+            }
+
+            try {
+                // Use fetch with URLSearchParams for application/x-www-form-urlencoded
+                const params = new URLSearchParams();
+                params.append('message', message);
+                params.append('conversation_id', currentConversationId);
+
+                const response = await fetch('/chat', {
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/x-www-form-urlencoded'
+                    },
+                    body: params
+                });
+
+                if (!response.ok) {
+                    throw new Error(`HTTP ${response.status}`);
+                }
+
+                // Read the response as JSON (now returns JSON with response and conversation_id)
+                const data = await response.json();
+                addMessage('assistant', data.response);
+
+            } catch (error) {
+                addSystemMessage(`Error: ${error.message}`);
+            } finally {
+                textarea.disabled = false;
+                sendBtn.disabled = false;
+                sendBtn.textContent = 'Send';
+                textarea.focus();
+                messagesDiv.scrollTop = messagesDiv.scrollHeight;
+
+                // Refresh conversations list
+                await loadConversations();
+            }
+        }
+
+        // Event listeners
+        sendBtn.addEventListener('click', sendMessage);
+
+        newChatBtn.addEventListener('click', createNewConversation);
+
+        textarea.addEventListener('keydown', (e) => {
+            if (e.key === 'Enter' && !e.shiftKey) {
+                e.preventDefault();
+                sendMessage();
+            }
+        });
+
+        // Sidebar toggle for mobile
+        sidebarToggle.addEventListener('click', () => {
+            sidebar.classList.toggle('open');
+        });
+
+        // Close sidebar when clicking outside on mobile
+        document.addEventListener('click', (e) => {
+            if (window.innerWidth <= 768) {
+                if (!sidebar.contains(e.target) && !sidebarToggle.contains(e.target)) {
+                    sidebar.classList.remove('open');
+                }
+            }
+        });
+
+        // Initial focus
+        textarea.focus();
+
+        // Load conversations on page load
+        loadConversations();
+    </script>
+</body>
+</html>
--- a/docker/chat/ui/static/htmx.min.js
+++ b/docker/chat/ui/static/htmx.min.js
--- a/docker/edge/Dockerfile
+++ b/docker/edge/Dockerfile
@ -1,4 +1,4 @@
-FROM caddy:alpine
-RUN apk add --no-cache bash jq curl git docker-cli
+FROM caddy:latest
+RUN apk add --no-cache bash jq curl git docker-cli python3 openssh-client autossh
 COPY entrypoint-edge.sh /usr/local/bin/entrypoint-edge.sh
 ENTRYPOINT ["bash", "/usr/local/bin/entrypoint-edge.sh"]
--- a/docker/edge/dispatcher.sh
+++ b/docker/edge/dispatcher.sh
@ -9,7 +9,7 @@
 # 3. Verify TOML arrived via merged PR with admin merger (Forgejo API)
 # 4. Validate TOML using vault-env.sh validator
 # 5. Decrypt .env.vault.enc and extract only declared secrets
-# 6. Launch: docker compose run --rm runner <formula> <action-id>
+# 6. Launch: docker run --rm disinto/agents:latest <action-id>
 # 7. Write <action-id>.result.json with exit code, timestamp, logs summary
 #
 # Part of #76.
@ -22,6 +22,11 @@ SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 # Source shared environment
 source "${SCRIPT_ROOT}/../lib/env.sh"

+# Project TOML location: prefer mounted path, fall back to cloned path
+# Edge container mounts ./projects to /opt/disinto-projects;
+# the shallow clone only has .toml.example files.
+PROJECTS_DIR="${PROJECTS_DIR:-${FACTORY_ROOT:-/opt/disinto}-projects}"
+
 # Load vault secrets after env.sh (env.sh unsets them for agent security)
 # Vault secrets must be available to the dispatcher
 if [ -f "$FACTORY_ROOT/.env.vault.enc" ] && command -v sops &>/dev/null; then
@ -47,9 +52,14 @@ VAULT_ENV="${SCRIPT_ROOT}/../vault/vault-env.sh"
 # Comma-separated list of Forgejo usernames with admin role
 ADMIN_USERS="${FORGE_ADMIN_USERS:-vault-bot,admin}"

-# Log function
+# Persistent log file for dispatcher
+DISPATCHER_LOG_FILE="${DISINTO_LOG_DIR:-/tmp}/dispatcher/dispatcher.log"
+mkdir -p "$(dirname "$DISPATCHER_LOG_FILE")"
+
+# Log function with standardized format
 log() {
-  printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*"
+  local agent="${LOG_AGENT:-dispatcher}"
+  printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" >> "$DISPATCHER_LOG_FILE"
 }

 # -----------------------------------------------------------------------------
@ -63,8 +73,12 @@ is_user_admin() {
  local username="$1"
  local user_json

+  # Use admin token for API check (Forgejo only exposes is_admin: true
+  # when the requesting user is also a site admin)
+  local admin_token="${FORGE_ADMIN_TOKEN:-${FORGE_TOKEN}}"
+
  # Fetch user info from Forgejo API
-  user_json=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  user_json=$(curl -sf -H "Authorization: token ${admin_token}" \
    "${FORGE_URL}/api/v1/users/${username}" 2>/dev/null) || return 1

  # Forgejo uses .is_admin for site-wide admin users
@ -109,33 +123,34 @@ get_pr_for_file() {
  local file_name
  file_name=$(basename "$file_path")

-  # Get recent commits that added this specific file
-  local commits
-  commits=$(git -C "$OPS_REPO_ROOT" log --oneline --diff-filter=A -- "vault/actions/${file_name}" 2>/dev/null | head -20) || true
+  # Step 1: find the commit that added the file
+  local add_commit
+  add_commit=$(git -C "$OPS_REPO_ROOT" log --diff-filter=A --format="%H" \
+    -- "vault/actions/${file_name}" 2>/dev/null | head -1)

-  if [ -z "$commits" ]; then
+  if [ -z "$add_commit" ]; then
    return 1
  fi

-  # For each commit, check if it's a merge commit from a PR
-  while IFS= read -r commit; do
-    local commit_sha commit_msg
+  # Step 2: find the merge commit that contains it via ancestry path
+  local merge_line
+  # Use --reverse to get the oldest (direct PR merge) first, not the newest
+  merge_line=$(git -C "$OPS_REPO_ROOT" log --merges --ancestry-path \
+    --reverse "${add_commit}..HEAD" --oneline 2>/dev/null | head -1)

-    commit_sha=$(echo "$commit" | awk '{print $1}')
-    commit_msg=$(git -C "$OPS_REPO_ROOT" log -1 --format="%B" "$commit_sha" 2>/dev/null) || continue
+  if [ -z "$merge_line" ]; then
+    return 1
+  fi

-    # Check if this is a merge commit (has "Merge pull request" in message)
-    if [[ "$commit_msg" =~ "Merge pull request" ]]; then
-      # Extract PR number from merge message (e.g., "Merge pull request #123")
-      local pr_num
-      pr_num=$(echo "$commit_msg" | grep -oP '#\d+' | head -1 | tr -d '#') || true
+  # Step 3: extract PR number from merge commit message
+  # Forgejo format: "Merge pull request 'title' (#N) from branch into main"
+  local pr_num
+  pr_num=$(echo "$merge_line" | grep -oE '#[0-9]+' | head -1 | tr -d '#')

-      if [ -n "$pr_num" ]; then
-        echo "$pr_num"
-        return 0
-      fi
-    fi
-  done <<< "$commits"
+  if [ -n "$pr_num" ]; then
+    echo "$pr_num"
+    return 0
+  fi

  return 1
 }
@ -146,17 +161,90 @@ get_pr_for_file() {
 get_pr_merger() {
  local pr_number="$1"

+  # Use ops repo API URL for PR lookups (not disinto repo)
+  local ops_api="${FORGE_URL}/api/v1/repos/${FORGE_OPS_REPO}"
+
  curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-    "${FORGE_API}/pulls/${pr_number}" 2>/dev/null | jq -r '{
+    "${ops_api}/pulls/${pr_number}" 2>/dev/null | jq -r '{
      username: .merge_user?.login // .user?.login,
      merged: .merged,
      merged_at: .merged_at // empty
-    }' || true
+    }'
+}
+
+# Get PR reviews
+# Usage: get_pr_reviews <pr_number>
+# Returns: JSON array of reviews with reviewer login and state
+get_pr_reviews() {
+  local pr_number="$1"
+
+  # Use ops repo API URL for PR lookups (not disinto repo)
+  local ops_api="${FORGE_URL}/api/v1/repos/${FORGE_OPS_REPO}"
+
+  curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+    "${ops_api}/pulls/${pr_number}/reviews" 2>/dev/null
+}
+
+# Verify vault action was approved by an admin via PR review
+# Usage: verify_admin_approver <pr_number> <action_id>
+# Returns: 0=verified, 1=not verified
+verify_admin_approver() {
+  local pr_number="$1"
+  local action_id="$2"
+
+  # Fetch reviews for this PR
+  local reviews_json
+  reviews_json=$(get_pr_reviews "$pr_number") || {
+    log "WARNING: Could not fetch reviews for PR #${pr_number} — skipping"
+    return 1
+  }
+
+  # Check if there are any reviews
+  local review_count
+  review_count=$(echo "$reviews_json" | jq 'length // 0')
+  if [ "$review_count" -eq 0 ]; then
+    log "WARNING: No reviews found for PR #${pr_number} — rejecting"
+    return 1
+  fi
+
+  # Check each review for admin approval
+  local review
+  while IFS= read -r review; do
+    local reviewer state
+    reviewer=$(echo "$review" | jq -r '.user?.login // empty')
+    state=$(echo "$review" | jq -r '.state // empty')
+
+    # Skip non-APPROVED reviews
+    if [ "$state" != "APPROVED" ]; then
+      continue
+    fi
+
+    # Skip if no reviewer
+    if [ -z "$reviewer" ]; then
+      continue
+    fi
+
+    # Check if reviewer is admin
+    if is_allowed_admin "$reviewer"; then
+      log "Verified: PR #${pr_number} approved by admin '${reviewer}'"
+      return 0
+    fi
+  done < <(echo "$reviews_json" | jq -c '.[]')
+
+  log "WARNING: No admin approval found for PR #${pr_number} — rejecting"
+  return 1
 }

 # Verify vault action arrived via admin-merged PR
 # Usage: verify_admin_merged <toml_file>
 # Returns: 0=verified, 1=not verified
+#
+# Verification order (for auto-merge workflow):
+# 1. Check PR reviews for admin APPROVED state (primary check for auto-merge)
+# 2. Fallback: Check if merger is admin (backwards compat for manual merges)
+#
+# This handles the case where auto-merge is performed by a bot (dev-bot)
+# but the actual approval came from an admin reviewer.
 verify_admin_merged() {
  local toml_file="$1"
  local action_id
@ -171,7 +259,12 @@ verify_admin_merged() {

  log "Action ${action_id} arrived via PR #${pr_num}"

-  # Get PR merger info
+  # First, try admin approver check (for auto-merge workflow)
+  if verify_admin_approver "$pr_num" "$action_id"; then
+    return 0
+  fi
+
+  # Fallback: Check merger (backwards compatibility for manual merges)
  local merger_json
  merger_json=$(get_pr_merger "$pr_num") || {
    log "WARNING: Could not fetch PR #${pr_num} details — skipping"
@ -199,7 +292,7 @@ verify_admin_merged() {
    return 1
  fi

-  log "Verified: PR #${pr_num} merged by admin '${merger_username}'"
+  log "Verified: PR #${pr_num} merged by admin '${merger_username}' (fallback check)"
  return 0
 }

@ -215,7 +308,7 @@ is_action_completed() {

 # Validate a vault action TOML file
 # Usage: validate_action <toml_file>
-# Sets: VAULT_ACTION_ID, VAULT_ACTION_FORMULA, VAULT_ACTION_CONTEXT, VAULT_ACTION_SECRETS
+# Sets: VAULT_ACTION_ID, VAULT_ACTION_FORMULA, VAULT_ACTION_CONTEXT, VAULT_ACTION_SECRETS, VAULT_DISPATCH_MODE
 validate_action() {
  local toml_file="$1"

@ -237,6 +330,26 @@ validate_action() {
  return 0
 }

+# Extract dispatch_mode from TOML file
+# Usage: get_dispatch_mode <toml_file>
+# Returns: "direct" for direct-commit, "pr" for PR-merged, or empty if not specified
+get_dispatch_mode() {
+  local toml_file="$1"
+  local toml_content dispatch_mode
+
+  toml_content=$(cat "$toml_file")
+
+  # Extract dispatch_mode field if present
+  dispatch_mode=$(echo "$toml_content" | grep -E '^dispatch_mode\s*=' | sed -E 's/^dispatch_mode\s*=\s*"(.*)"/\1/' | tr -d '\r')
+
+  if [ -n "$dispatch_mode" ]; then
+    echo "$dispatch_mode"
+  else
+    # Default to "pr" for backward compatibility (PR-based workflow)
+    echo "pr"
+  fi
+}
+
 # Write result file for an action
 # Usage: write_result <action_id> <exit_code> <logs>
 write_result() {
@ -279,64 +392,120 @@ launch_runner() {
    return 1
  fi

-  # Verify admin merge
-  if ! verify_admin_merged "$toml_file"; then
-    log "ERROR: Admin merge verification failed for ${action_id}"
-    write_result "$action_id" 1 "Admin merge verification failed: see logs above"
-    return 1
+  # Check dispatch mode to determine if admin verification is needed
+  local dispatch_mode
+  dispatch_mode=$(get_dispatch_mode "$toml_file")
+
+  if [ "$dispatch_mode" = "direct" ]; then
+    log "Action ${action_id}: tier=${VAULT_TIER:-unknown}, dispatch_mode=${dispatch_mode} — skipping admin merge verification (direct commit)"
+  else
+    # Verify admin merge for PR-based actions
+    log "Action ${action_id}: tier=${VAULT_TIER:-unknown}, dispatch_mode=${dispatch_mode} — verifying admin merge"
+    if ! verify_admin_merged "$toml_file"; then
+      log "ERROR: Admin merge verification failed for ${action_id}"
+      write_result "$action_id" 1 "Admin merge verification failed: see logs above"
+      return 1
+    fi
+    log "Action ${action_id}: admin merge verified"
  fi

  # Extract secrets from validated action
  local secrets_array
  secrets_array="${VAULT_ACTION_SECRETS:-}"

-  if [ -z "$secrets_array" ]; then
-    log "ERROR: Action ${action_id} has no secrets declared"
-    write_result "$action_id" 1 "No secrets declared in TOML"
-    return 1
+  # Build docker run command (self-contained, no compose context needed).
+  # The edge container has the Docker socket but not the host's compose project,
+  # so docker compose run would fail with exit 125. docker run is self-contained:
+  # the dispatcher knows the image, network, env vars, and entrypoint.
+  local -a cmd=(docker run --rm
+    --name "vault-runner-${action_id}"
+    --network host
+    --entrypoint bash
+    -e DISINTO_CONTAINER=1
+    -e "FORGE_URL=${FORGE_URL}"
+    -e "FORGE_TOKEN=${FORGE_TOKEN}"
+    -e "FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}"
+    -e "FORGE_OPS_REPO=${FORGE_OPS_REPO:-}"
+    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
+  )
+
+  # Pass through optional env vars if set
+  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
+    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
+  fi
+  if [ -n "${CLAUDE_MODEL:-}" ]; then
+    cmd+=(-e "CLAUDE_MODEL=${CLAUDE_MODEL}")
  fi

-  # Build command array (safe from shell injection)
-  local -a cmd=(docker compose run --rm runner)
+  # Mount docker socket, claude binary, and claude config
+  cmd+=(-v /var/run/docker.sock:/var/run/docker.sock)
+  if [ -f /usr/local/bin/claude ]; then
+    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
+  fi
+  local runtime_home="${HOME:-/home/debian}"
+  if [ -d "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}" ]; then
+    cmd+=(-v "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}")
+    cmd+=(-e "CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}")
+  fi
+  if [ -f "${runtime_home}/.claude.json" ]; then
+    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
+  fi

-  # Add environment variables for secrets
-  for secret in $secrets_array; do
-    secret=$(echo "$secret" | xargs)
-    if [ -n "$secret" ]; then
-      # Verify secret exists in vault
-      if [ -z "${!secret:-}" ]; then
-        log "ERROR: Secret '${secret}' not found in vault for action ${action_id}"
-        write_result "$action_id" 1 "Secret not found in vault: ${secret}"
-        return 1
+  # Add environment variables for secrets (if any declared)
+  if [ -n "$secrets_array" ]; then
+    for secret in $secrets_array; do
+      secret=$(echo "$secret" | xargs)
+      if [ -n "$secret" ]; then
+        # Verify secret exists in vault
+        if [ -z "${!secret:-}" ]; then
+          log "ERROR: Secret '${secret}' not found in vault for action ${action_id}"
+          write_result "$action_id" 1 "Secret not found in vault: ${secret}"
+          return 1
+        fi
+        cmd+=(-e "${secret}=${!secret}")
      fi
-      cmd+=(-e "$secret")
-    fi
-  done
+    done
+  else
+    log "Action ${action_id} has no secrets declared — runner will execute without extra env vars"
+  fi

-  # Add formula and action id as arguments (after service name)
-  local formula="${VAULT_ACTION_FORMULA:-}"
-  cmd+=("$formula" "$action_id")
+  # Add volume mounts for file-based credentials (if any declared)
+  local mounts_array
+  mounts_array="${VAULT_ACTION_MOUNTS:-}"
+  if [ -n "$mounts_array" ]; then
+    for mount_alias in $mounts_array; do
+      mount_alias=$(echo "$mount_alias" | xargs)
+      [ -n "$mount_alias" ] || continue
+      case "$mount_alias" in
+        ssh)
+          cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
+          ;;
+        gpg)
+          cmd+=(-v "${runtime_home}/.gnupg:/home/agent/.gnupg:ro")
+          ;;
+        sops)
+          cmd+=(-v "${runtime_home}/.config/sops/age:/home/agent/.config/sops/age:ro")
+          ;;
+        *)
+          log "ERROR: Unknown mount alias '${mount_alias}' for action ${action_id}"
+          write_result "$action_id" 1 "Unknown mount alias: ${mount_alias}"
+          return 1
+          ;;
+      esac
+    done
+  fi

-  # Log command skeleton (hide all -e flags for security)
-  local -a log_cmd=()
-  local skip_next=0
-  for arg in "${cmd[@]}"; do
-    if [[ $skip_next -eq 1 ]]; then
-      skip_next=0
-      continue
-    fi
-    if [[ "$arg" == "-e" ]]; then
-      log_cmd+=("$arg" "<redacted>")
-      skip_next=1
-    else
-      log_cmd+=("$arg")
-    fi
-  done
-  log "Running: ${log_cmd[*]}"
+  # Mount the ops repo so the runner entrypoint can read the action TOML
+  cmd+=(-v "${OPS_REPO_ROOT}:/home/agent/ops:ro")
+
+  # Image and entrypoint arguments: runner entrypoint + action-id
+  cmd+=(disinto/agents:latest /home/agent/disinto/docker/runner/entrypoint-runner.sh "$action_id")
+
+  log "Running: docker run --rm vault-runner-${action_id} (secrets: ${secrets_array:-none}, mounts: ${mounts_array:-none})"

  # Create temp file for logs
  local log_file
-  log_file=$(mktemp /tmp/dispatcher-logs-XXXXXX.txt)
+  log_file=$(mktemp /tmp/dispatcher-logs-XXXXXX)
  trap 'rm -f "$log_file"' RETURN

  # Execute with array expansion (safe from shell injection)
@ -360,6 +529,462 @@ launch_runner() {
  return $exit_code
 }

+# -----------------------------------------------------------------------------
+# Reproduce dispatch — launch sidecar for bug-report issues
+# -----------------------------------------------------------------------------
+
+# Check if a reproduce run is already in-flight for a given issue.
+# Uses a simple pid-file in /tmp so we don't double-launch per dispatcher cycle.
+_reproduce_lockfile() {
+  local issue="$1"
+  echo "/tmp/reproduce-inflight-${issue}.pid"
+}
+
+is_reproduce_running() {
+  local issue="$1"
+  local pidfile
+  pidfile=$(_reproduce_lockfile "$issue")
+  [ -f "$pidfile" ] || return 1
+  local pid
+  pid=$(cat "$pidfile" 2>/dev/null || echo "")
+  [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null
+}
+
+# Fetch open issues labelled bug-report that have no outcome label yet.
+# Returns a newline-separated list of "issue_number:project_toml" pairs.
+fetch_reproduce_candidates() {
+  # Require FORGE_TOKEN, FORGE_URL, FORGE_REPO
+  [ -n "${FORGE_TOKEN:-}" ] || return 0
+  [ -n "${FORGE_URL:-}" ]   || return 0
+  [ -n "${FORGE_REPO:-}" ]  || return 0
+
+  local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+
+  local issues_json
+  issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api}/issues?type=issues&state=open&labels=bug-report&limit=20" 2>/dev/null) || return 0
+
+  # Filter out issues that already carry an outcome label.
+  # Write JSON to a temp file so python3 can read from stdin (heredoc) and
+  # still receive the JSON as an argument (avoids SC2259: pipe vs heredoc).
+  local tmpjson
+  tmpjson=$(mktemp)
+  echo "$issues_json" > "$tmpjson"
+  python3 - "$tmpjson" <<'PYEOF'
+import sys, json
+data = json.load(open(sys.argv[1]))
+skip = {"in-progress", "in-triage", "rejected", "blocked"}
+for issue in data:
+    labels = {l["name"] for l in (issue.get("labels") or [])}
+    if labels & skip:
+        continue
+    print(issue["number"])
+PYEOF
+  rm -f "$tmpjson"
+}
+
+# Launch one reproduce container per candidate issue.
+# project_toml is resolved from FACTORY_ROOT/projects/*.toml (first match).
+dispatch_reproduce() {
+  local issue_number="$1"
+
+  if is_reproduce_running "$issue_number"; then
+    log "Reproduce already running for issue #${issue_number}, skipping"
+    return 0
+  fi
+
+  # Find first project TOML available (same convention as dev-poll)
+  local project_toml=""
+  for toml in "$PROJECTS_DIR"/*.toml; do
+    [ -f "$toml" ] && { project_toml="$toml"; break; }
+  done
+
+  if [ -z "$project_toml" ]; then
+    log "WARNING: no project TOML found under ${PROJECTS_DIR}/ — skipping reproduce for #${issue_number}"
+    return 0
+  fi
+
+  log "Dispatching reproduce-agent for issue #${issue_number} (project: ${project_toml})"
+
+  # Build docker run command using array (safe from injection)
+  local -a cmd=(docker run --rm
+    --name "disinto-reproduce-${issue_number}"
+    --network host
+    --security-opt apparmor=unconfined
+    -v /var/run/docker.sock:/var/run/docker.sock
+    -v agent-data:/home/agent/data
+    -v project-repos:/home/agent/repos
+    -e "FORGE_URL=${FORGE_URL}"
+    -e "FORGE_TOKEN=${FORGE_TOKEN}"
+    -e "FORGE_REPO=${FORGE_REPO}"
+    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
+    -e DISINTO_CONTAINER=1
+  )
+
+  # Pass through ANTHROPIC_API_KEY if set
+  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
+    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
+  fi
+
+  # Mount shared Claude config dir and ~/.ssh from the runtime user's home if available
+  local runtime_home="${HOME:-/home/debian}"
+  if [ -d "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}" ]; then
+    cmd+=(-v "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}")
+    cmd+=(-e "CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}")
+  fi
+  if [ -f "${runtime_home}/.claude.json" ]; then
+    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
+  fi
+  if [ -d "${runtime_home}/.ssh" ]; then
+    cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
+  fi
+  # Mount claude CLI binary if present on host
+  if [ -f /usr/local/bin/claude ]; then
+    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
+  fi
+
+  # Mount the project TOML into the container at a stable path
+  local container_toml="/home/agent/project.toml"
+  cmd+=(-v "${project_toml}:${container_toml}:ro")
+
+  cmd+=(disinto-reproduce:latest "$container_toml" "$issue_number")
+
+  # Launch in background; write pid-file so we don't double-launch
+  "${cmd[@]}" &
+  local bg_pid=$!
+  echo "$bg_pid" > "$(_reproduce_lockfile "$issue_number")"
+  log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}"
+}
+
+# -----------------------------------------------------------------------------
+# Triage dispatch — launch sidecar for bug-report + in-triage issues
+# -----------------------------------------------------------------------------
+
+# Check if a triage run is already in-flight for a given issue.
+_triage_lockfile() {
+  local issue="$1"
+  echo "/tmp/triage-inflight-${issue}.pid"
+}
+
+is_triage_running() {
+  local issue="$1"
+  local pidfile
+  pidfile=$(_triage_lockfile "$issue")
+  [ -f "$pidfile" ] || return 1
+  local pid
+  pid=$(cat "$pidfile" 2>/dev/null || echo "")
+  [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null
+}
+
+# Fetch open issues labelled both bug-report and in-triage.
+# Returns a newline-separated list of issue numbers.
+fetch_triage_candidates() {
+  # Require FORGE_TOKEN, FORGE_URL, FORGE_REPO
+  [ -n "${FORGE_TOKEN:-}" ] || return 0
+  [ -n "${FORGE_URL:-}" ]   || return 0
+  [ -n "${FORGE_REPO:-}" ]  || return 0
+
+  local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+
+  local issues_json
+  issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api}/issues?type=issues&state=open&labels=bug-report&limit=20" 2>/dev/null) || return 0
+
+  # Filter to issues that carry BOTH bug-report AND in-triage labels.
+  local tmpjson
+  tmpjson=$(mktemp)
+  echo "$issues_json" > "$tmpjson"
+  python3 - "$tmpjson" <<'PYEOF'
+import sys, json
+data = json.load(open(sys.argv[1]))
+for issue in data:
+    labels = {l["name"] for l in (issue.get("labels") or [])}
+    if "bug-report" in labels and "in-triage" in labels:
+        print(issue["number"])
+PYEOF
+  rm -f "$tmpjson"
+}
+
+# Launch one triage container per candidate issue.
+# Uses the same disinto-reproduce:latest image as the reproduce-agent,
+# selecting the triage formula via DISINTO_FORMULA env var.
+# Stack lock is held for the full run (no timeout).
+dispatch_triage() {
+  local issue_number="$1"
+
+  if is_triage_running "$issue_number"; then
+    log "Triage already running for issue #${issue_number}, skipping"
+    return 0
+  fi
+
+  # Find first project TOML available (same convention as dev-poll)
+  local project_toml=""
+  for toml in "$PROJECTS_DIR"/*.toml; do
+    [ -f "$toml" ] && { project_toml="$toml"; break; }
+  done
+
+  if [ -z "$project_toml" ]; then
+    log "WARNING: no project TOML found under ${PROJECTS_DIR}/ — skipping triage for #${issue_number}"
+    return 0
+  fi
+
+  log "Dispatching triage-agent for issue #${issue_number} (project: ${project_toml})"
+
+  # Build docker run command using array (safe from injection)
+  local -a cmd=(docker run --rm
+    --name "disinto-triage-${issue_number}"
+    --network host
+    --security-opt apparmor=unconfined
+    -v /var/run/docker.sock:/var/run/docker.sock
+    -v agent-data:/home/agent/data
+    -v project-repos:/home/agent/repos
+    -e "FORGE_URL=${FORGE_URL}"
+    -e "FORGE_TOKEN=${FORGE_TOKEN}"
+    -e "FORGE_REPO=${FORGE_REPO}"
+    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
+    -e DISINTO_CONTAINER=1
+    -e DISINTO_FORMULA=triage
+  )
+
+  # Pass through ANTHROPIC_API_KEY if set
+  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
+    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
+  fi
+
+  # Mount shared Claude config dir and ~/.ssh from the runtime user's home if available
+  local runtime_home="${HOME:-/home/debian}"
+  if [ -d "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}" ]; then
+    cmd+=(-v "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}")
+    cmd+=(-e "CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}")
+  fi
+  if [ -f "${runtime_home}/.claude.json" ]; then
+    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
+  fi
+  if [ -d "${runtime_home}/.ssh" ]; then
+    cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
+  fi
+  # Mount claude CLI binary if present on host
+  if [ -f /usr/local/bin/claude ]; then
+    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
+  fi
+
+  # Mount the project TOML into the container at a stable path
+  local container_toml="/home/agent/project.toml"
+  cmd+=(-v "${project_toml}:${container_toml}:ro")
+
+  cmd+=(disinto-reproduce:latest "$container_toml" "$issue_number")
+
+  # Launch in background; write pid-file so we don't double-launch
+  "${cmd[@]}" &
+  local bg_pid=$!
+  echo "$bg_pid" > "$(_triage_lockfile "$issue_number")"
+  log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}"
+}
+
+# -----------------------------------------------------------------------------
+# Verification dispatch — launch sidecar for bug-report parents with all deps closed
+# -----------------------------------------------------------------------------
+
+# Check if a verification run is already in-flight for a given issue.
+_verify_lockfile() {
+  local issue="$1"
+  echo "/tmp/verify-inflight-${issue}.pid"
+}
+
+is_verify_running() {
+  local issue="$1"
+  local pidfile
+  pidfile=$(_verify_lockfile "$issue")
+  [ -f "$pidfile" ] || return 1
+  local pid
+  pid=$(cat "$pidfile" 2>/dev/null || echo "")
+  [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null
+}
+
+# Check if an issue is a parent with sub-issues (identified by sub-issues
+# whose body contains "Decomposed from #N" where N is the parent's number).
+# Returns: 0 if parent with sub-issues found, 1 otherwise
+_is_parent_issue() {
+  local parent_num="$1"
+
+  # Fetch all issues (open and closed) to find sub-issues
+  local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+  local all_issues_json
+  all_issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api}/issues?type=issues&state=all&limit=50" 2>/dev/null) || return 1
+
+  # Find issues whose body contains "Decomposed from #<parent_num>"
+  local sub_issues
+  sub_issues=$(python3 -c '
+import sys, json
+parent_num = sys.argv[1]
+data = json.load(open("/dev/stdin"))
+sub_issues = []
+for issue in data:
+    body = issue.get("body") or ""
+    if f"Decomposed from #{parent_num}" in body:
+        sub_issues.append(str(issue["number"]))
+print(" ".join(sub_issues))
+' "$parent_num" < <(echo "$all_issues_json")) || return 1
+
+  [ -n "$sub_issues" ]
+}
+
+# Check if all sub-issues of a parent are closed.
+# Returns: 0 if all closed, 1 if any still open
+_are_all_sub_issues_closed() {
+  local parent_num="$1"
+
+  # Fetch all issues (open and closed) to find sub-issues
+  local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+  local all_issues_json
+  all_issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api}/issues?type=issues&state=all&limit=50" 2>/dev/null) || return 1
+
+  # Find issues whose body contains "Decomposed from #<parent_num>"
+  local sub_issues
+  sub_issues=$(python3 -c '
+import sys, json
+parent_num = sys.argv[1]
+data = json.load(open("/dev/stdin"))
+sub_issues = []
+for issue in data:
+    body = issue.get("body") or ""
+    if f"Decomposed from #{parent_num}" in body:
+        sub_issues.append(str(issue["number"]))
+print(" ".join(sub_issues))
+' "$parent_num" < <(echo "$all_issues_json")) || return 1
+
+  [ -z "$sub_issues" ] && return 1
+
+  # Check if all sub-issues are closed
+  for sub_num in $sub_issues; do
+    local sub_state
+    sub_state=$(curl -sf \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${api}/issues/${sub_num}" 2>/dev/null | jq -r '.state // "unknown"') || return 1
+    if [ "$sub_state" != "closed" ]; then
+      return 1
+    fi
+  done
+  return 0
+}
+
+# Fetch open bug-report + in-progress issues whose sub-issues are all closed.
+# Returns a newline-separated list of issue numbers ready for verification.
+fetch_verification_candidates() {
+  # Require FORGE_TOKEN, FORGE_URL, FORGE_REPO
+  [ -n "${FORGE_TOKEN:-}" ] || return 0
+  [ -n "${FORGE_URL:-}" ]   || return 0
+  [ -n "${FORGE_REPO:-}" ]  || return 0
+
+  local api="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+
+  # Fetch open bug-report + in-progress issues
+  local issues_json
+  issues_json=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api}/issues?type=issues&state=open&labels=bug-report&limit=20" 2>/dev/null) || return 0
+
+  # Filter to issues that also have in-progress label and have all sub-issues closed
+  local tmpjson
+  tmpjson=$(mktemp)
+  echo "$issues_json" > "$tmpjson"
+  python3 - "$tmpjson" "$api" "${FORGE_TOKEN}" <<'PYEOF'
+import sys, json
+api_base = sys.argv[2]
+token = sys.argv[3]
+data = json.load(open(sys.argv[1]))
+
+for issue in data:
+    labels = {l["name"] for l in (issue.get("labels") or [])}
+    # Must have BOTH bug-report AND in-progress labels
+    if "bug-report" not in labels or "in-progress" not in labels:
+        continue
+    print(issue["number"])
+PYEOF
+  rm -f "$tmpjson"
+}
+
+# Launch one verification container per candidate issue.
+# Uses the same disinto-reproduce:latest image as the reproduce-agent,
+# selecting the verify formula via DISINTO_FORMULA env var.
+dispatch_verify() {
+  local issue_number="$1"
+
+  if is_verify_running "$issue_number"; then
+    log "Verification already running for issue #${issue_number}, skipping"
+    return 0
+  fi
+
+  # Find first project TOML available (same convention as dev-poll)
+  local project_toml=""
+  for toml in "$PROJECTS_DIR"/*.toml; do
+    [ -f "$toml" ] && { project_toml="$toml"; break; }
+  done
+
+  if [ -z "$project_toml" ]; then
+    log "WARNING: no project TOML found under ${PROJECTS_DIR}/ — skipping verification for #${issue_number}"
+    return 0
+  fi
+
+  log "Dispatching verification-agent for issue #${issue_number} (project: ${project_toml})"
+
+  # Build docker run command using array (safe from injection)
+  local -a cmd=(docker run --rm
+    --name "disinto-verify-${issue_number}"
+    --network host
+    --security-opt apparmor=unconfined
+    -v /var/run/docker.sock:/var/run/docker.sock
+    -v agent-data:/home/agent/data
+    -v project-repos:/home/agent/repos
+    -e "FORGE_URL=${FORGE_URL}"
+    -e "FORGE_TOKEN=${FORGE_TOKEN}"
+    -e "FORGE_REPO=${FORGE_REPO}"
+    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
+    -e DISINTO_CONTAINER=1
+    -e DISINTO_FORMULA=verify
+  )
+
+  # Pass through ANTHROPIC_API_KEY if set
+  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
+    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
+  fi
+
+  # Mount shared Claude config dir and ~/.ssh from the runtime user's home if available
+  local runtime_home="${HOME:-/home/debian}"
+  if [ -d "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}" ]; then
+    cmd+=(-v "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}")
+    cmd+=(-e "CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}")
+  fi
+  if [ -f "${runtime_home}/.claude.json" ]; then
+    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
+  fi
+  if [ -d "${runtime_home}/.ssh" ]; then
+    cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
+  fi
+  # Mount claude CLI binary if present on host
+  if [ -f /usr/local/bin/claude ]; then
+    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
+  fi
+
+  # Mount the project TOML into the container at a stable path
+  local container_toml="/home/agent/project.toml"
+  cmd+=(-v "${project_toml}:${container_toml}:ro")
+
+  cmd+=(disinto-reproduce:latest "$container_toml" "$issue_number")
+
+  # Launch in background; write pid-file so we don't double-launch
+  "${cmd[@]}" &
+  local bg_pid=$!
+  echo "$bg_pid" > "$(_verify_lockfile "$issue_number")"
+  log "Verification container launched (pid ${bg_pid}) for issue #${issue_number}"
+}
+
 # -----------------------------------------------------------------------------
 # Main dispatcher loop
 # -----------------------------------------------------------------------------
@ -410,6 +1035,42 @@ main() {
      launch_runner "$toml_file" || true
    done

+    # Reproduce dispatch: check for bug-report issues needing reproduction
+    local candidate_issues
+    candidate_issues=$(fetch_reproduce_candidates) || true
+    if [ -n "$candidate_issues" ]; then
+      while IFS= read -r issue_num; do
+        [ -n "$issue_num" ] || continue
+        dispatch_reproduce "$issue_num" || true
+      done <<< "$candidate_issues"
+    fi
+
+    # Triage dispatch: check for bug-report + in-triage issues needing deep analysis
+    local triage_issues
+    triage_issues=$(fetch_triage_candidates) || true
+    if [ -n "$triage_issues" ]; then
+      while IFS= read -r issue_num; do
+        [ -n "$issue_num" ] || continue
+        dispatch_triage "$issue_num" || true
+      done <<< "$triage_issues"
+    fi
+
+    # Verification dispatch: check for bug-report + in-progress issues whose sub-issues are all closed
+    # These are parents whose fixes have merged and need verification
+    local verify_issues
+    verify_issues=$(fetch_verification_candidates) || true
+    if [ -n "$verify_issues" ]; then
+      while IFS= read -r issue_num; do
+        [ -n "$issue_num" ] || continue
+        # Double-check: this issue must have all sub-issues closed before dispatching
+        if _are_all_sub_issues_closed "$issue_num"; then
+          dispatch_verify "$issue_num" || true
+        else
+          log "Issue #${issue_num} has open sub-issues — skipping verification"
+        fi
+      done <<< "$verify_issues"
+    fi
+
    # Wait before next poll
    sleep 60
  done
--- a/docker/edge/entrypoint-edge.sh
+++ b/docker/edge/entrypoint-edge.sh
@ -1,16 +1,182 @@
 #!/usr/bin/env bash
 set -euo pipefail

-DISINTO_VERSION="${DISINTO_VERSION:-main}"
-DISINTO_REPO="${FORGE_URL:-http://forgejo:3000}/johba/disinto.git"
+# Set USER and HOME before sourcing env.sh — preconditions for lib/env.sh (#674).
+export USER="${USER:-agent}"
+export HOME="${HOME:-/home/agent}"

-# Shallow clone at the pinned version
-if [ ! -d /opt/disinto/.git ]; then
-  git clone --depth 1 --branch "$DISINTO_VERSION" "$DISINTO_REPO" /opt/disinto
+FORGE_URL="${FORGE_URL:-http://forgejo:3000}"
+
+# Derive FORGE_REPO from PROJECT_TOML if available, otherwise require explicit env var
+if [ -z "${FORGE_REPO:-}" ]; then
+  # Try to find a project TOML to derive FORGE_REPO from
+  _project_toml="${PROJECT_TOML:-}"
+  if [ -z "$_project_toml" ] && [ -d "${FACTORY_ROOT:-/opt/disinto}/projects" ]; then
+    for toml in "${FACTORY_ROOT:-/opt/disinto}"/projects/*.toml; do
+      if [ -f "$toml" ]; then
+        _project_toml="$toml"
+        break
+      fi
+    done
+  fi
+
+  if [ -n "$_project_toml" ] && [ -f "$_project_toml" ]; then
+    # Parse FORGE_REPO from project TOML using load-project.sh
+    if source "${FACTORY_ROOT:-/opt/disinto}/lib/load-project.sh" "$_project_toml" 2>/dev/null; then
+      if [ -n "${FORGE_REPO:-}" ]; then
+        echo "Derived FORGE_REPO from PROJECT_TOML: $_project_toml" >&2
+      fi
+    fi
+  fi
+
+  # If still not set, fail fast with a clear error message
+  if [ -z "${FORGE_REPO:-}" ]; then
+    echo "FATAL: FORGE_REPO environment variable not set" >&2
+    echo "Set FORGE_REPO=<owner>/<repo> in .env (e.g. FORGE_REPO=disinto-admin/disinto)" >&2
+    exit 1
+  fi
 fi

+# Detect bind-mount of a non-git directory before attempting clone
+if [ -d /opt/disinto ] && [ ! -d /opt/disinto/.git ] && [ -n "$(ls -A /opt/disinto 2>/dev/null)" ]; then
+  echo "FATAL: /opt/disinto contains files but no .git directory." >&2
+  echo "If you bind-mounted a directory at /opt/disinto, ensure it is a git working tree." >&2
+  echo "Sleeping 60s before exit to throttle the restart loop..." >&2
+  sleep 60
+  exit 1
+fi
+
+# Set HOME early so credential helper and git config land in the right place.
+export HOME=/home/agent
+mkdir -p "$HOME"
+
+# Configure git credential helper before cloning (#604).
+# /opt/disinto does not exist yet so we cannot source lib/git-creds.sh;
+# inline a minimal credential-helper setup here.
+if [ -n "${FORGE_PASS:-}" ] && [ -n "${FORGE_URL:-}" ]; then
+  _forge_host=$(printf '%s' "$FORGE_URL" | sed 's|https\?://||; s|/.*||')
+  _forge_proto=$(printf '%s' "$FORGE_URL" | sed 's|://.*||')
+  _bot_user=""
+  if [ -n "${FORGE_TOKEN:-}" ]; then
+    _bot_user=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+      "${FORGE_URL}/api/v1/user" 2>/dev/null | jq -r '.login // empty') || _bot_user=""
+  fi
+  _bot_user="${_bot_user:-dev-bot}"
+
+  cat > "${HOME}/.git-credentials-helper" <<CREDEOF
+#!/bin/sh
+# Reads \$FORGE_PASS from env at runtime — file is safe to read on disk.
+[ "\$1" = "get" ] || exit 0
+cat >/dev/null
+echo "protocol=${_forge_proto}"
+echo "host=${_forge_host}"
+echo "username=${_bot_user}"
+echo "password=\$FORGE_PASS"
+CREDEOF
+  chmod 755 "${HOME}/.git-credentials-helper"
+  git config --global credential.helper "${HOME}/.git-credentials-helper"
+  git config --global --add safe.directory '*'
+fi
+
+# Shallow clone at the pinned version — use clean URL, credential helper
+# supplies auth (#604).
+# Retry with exponential backoff — forgejo may still be starting (#665).
+if [ ! -d /opt/disinto/.git ]; then
+  echo "edge: cloning ${FORGE_URL}/${FORGE_REPO} (branch ${DISINTO_VERSION:-main})..." >&2
+  _clone_ok=false
+  _backoff=2
+  _max_backoff=30
+  _max_attempts=10
+  for _attempt in $(seq 1 "$_max_attempts"); do
+    if git clone --depth 1 --branch "${DISINTO_VERSION:-main}" "${FORGE_URL}/${FORGE_REPO}.git" /opt/disinto 2>&1; then
+      _clone_ok=true
+      break
+    fi
+    rm -rf /opt/disinto  # clean up partial clone before retry
+    if [ "$_attempt" -lt "$_max_attempts" ]; then
+      echo "edge: clone attempt ${_attempt}/${_max_attempts} failed, retrying in ${_backoff}s..." >&2
+      sleep "$_backoff"
+      _backoff=$(( _backoff * 2 ))
+      if [ "$_backoff" -gt "$_max_backoff" ]; then _backoff=$_max_backoff; fi
+    fi
+  done
+  if [ "$_clone_ok" != "true" ]; then
+    echo >&2
+    echo "FATAL: failed to clone ${FORGE_URL}/${FORGE_REPO}.git (branch ${DISINTO_VERSION:-main}) after ${_max_attempts} attempts" >&2
+    echo "Likely causes:" >&2
+    echo "  - Forgejo at ${FORGE_URL} is unreachable from the edge container" >&2
+    echo "  - Repository '${FORGE_REPO}' does not exist on this forge" >&2
+    echo "  - FORGE_TOKEN/FORGE_PASS is invalid or has no read access to '${FORGE_REPO}'" >&2
+    echo "  - Branch '${DISINTO_VERSION:-main}' does not exist in '${FORGE_REPO}'" >&2
+    echo "Workaround: bind-mount a local git checkout into /opt/disinto." >&2
+    echo "Sleeping 60s before exit to throttle the restart loop..." >&2
+    sleep 60
+    exit 1
+  fi
+fi
+
+# Repair any legacy baked-credential URLs in /opt/disinto (#604).
+# Now that /opt/disinto exists, source the shared lib.
+if [ -f /opt/disinto/lib/git-creds.sh ]; then
+  # shellcheck source=/opt/disinto/lib/git-creds.sh
+  source /opt/disinto/lib/git-creds.sh
+  _GIT_CREDS_LOG_FN="echo" repair_baked_cred_urls /opt/disinto
+fi
+
+# Ensure log directory exists
+mkdir -p /opt/disinto-logs
+
+# ── Reverse tunnel (optional) ──────────────────────────────────────────
+# When EDGE_TUNNEL_HOST is set, open a single reverse-SSH forward so the
+# DO edge box can reach this container's Caddy on the project's assigned port.
+# Guarded: if EDGE_TUNNEL_HOST is empty/unset the block is skipped entirely,
+# keeping local-only dev working without errors.
+if [ -n "${EDGE_TUNNEL_HOST:-}" ]; then
+  _tunnel_key="/run/secrets/tunnel_key"
+  if [ ! -f "$_tunnel_key" ]; then
+    echo "WARN: EDGE_TUNNEL_HOST is set but ${_tunnel_key} is missing — skipping tunnel" >&2
+  else
+    # Ensure correct permissions (bind-mount may arrive as 644)
+    chmod 0400 "$_tunnel_key" 2>/dev/null || true
+
+    : "${EDGE_TUNNEL_USER:=tunnel}"
+    : "${EDGE_TUNNEL_PORT:?EDGE_TUNNEL_PORT must be set when EDGE_TUNNEL_HOST is set}"
+
+    export AUTOSSH_GATETIME=0   # don't exit if the first attempt fails quickly
+
+    autossh -M 0 -N -f \
+      -o StrictHostKeyChecking=accept-new \
+      -o ServerAliveInterval=30 \
+      -o ServerAliveCountMax=3 \
+      -o ExitOnForwardFailure=yes \
+      -i "$_tunnel_key" \
+      -R "127.0.0.1:${EDGE_TUNNEL_PORT}:localhost:80" \
+      "${EDGE_TUNNEL_USER}@${EDGE_TUNNEL_HOST}"
+
+    echo "edge: reverse tunnel → ${EDGE_TUNNEL_HOST}:${EDGE_TUNNEL_PORT}" >&2
+  fi
+fi
+
+# Set project context vars for scripts that source lib/env.sh (#674).
+# These satisfy env.sh's preconditions for edge-container scripts.
+export PROJECT_REPO_ROOT="${PROJECT_REPO_ROOT:-/opt/disinto}"
+export PRIMARY_BRANCH="${PRIMARY_BRANCH:-main}"
+export OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/agent/repos/${PROJECT_NAME:-disinto}-ops}"
+
 # Start dispatcher in background
 bash /opt/disinto/docker/edge/dispatcher.sh &

-# Caddy as main process
-exec caddy run --config /etc/caddy/Caddyfile --adapter caddyfile
+# Start supervisor loop in background
+PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}"
+(while true; do
+  bash /opt/disinto/supervisor/supervisor-run.sh "/opt/disinto/${PROJECT_TOML}" 2>&1 | tee -a /opt/disinto-logs/supervisor.log || true
+  sleep 1200  # 20 minutes
+done) &
+
+# Caddy as main process — run in foreground via wait so background jobs survive
+# (exec replaces the shell, which can orphan backgrounded subshells)
+caddy run --config /etc/caddy/Caddyfile --adapter caddyfile &
+
+# Exit when any child dies (caddy crash → container restart via docker compose)
+wait -n
+exit 1
--- a/docker/reproduce/Dockerfile
+++ b/docker/reproduce/Dockerfile
@ -0,0 +1,11 @@
+FROM debian:bookworm-slim
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    bash curl git jq docker.io docker-compose-plugin \
+    nodejs npm chromium \
+    && npm install -g @anthropic-ai/mcp-playwright \
+    && rm -rf /var/lib/apt/lists/*
+RUN useradd -m -u 1000 -s /bin/bash agent
+COPY docker/reproduce/entrypoint-reproduce.sh /entrypoint-reproduce.sh
+RUN chmod +x /entrypoint-reproduce.sh
+WORKDIR /home/agent
+ENTRYPOINT ["/entrypoint-reproduce.sh"]
--- a/docker/reproduce/entrypoint-reproduce.sh
+++ b/docker/reproduce/entrypoint-reproduce.sh
--- a/docker/runner/entrypoint-runner.sh
+++ b/docker/runner/entrypoint-runner.sh
@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+# entrypoint-runner.sh — Vault runner entrypoint
+#
+# Receives an action-id, reads the vault action TOML to get the formula name,
+# then dispatches to the appropriate executor:
+#   - formulas/<name>.sh  → bash (mechanical operations like release)
+#   - formulas/<name>.toml → claude -p (reasoning tasks like triage, architect)
+#
+# Usage: entrypoint-runner.sh <action-id>
+#
+# Expects:
+#   OPS_REPO_ROOT  — path to the ops repo (mounted by compose)
+#   FACTORY_ROOT   — path to disinto code (default: /home/agent/disinto)
+#
+# Part of #516.
+
+set -euo pipefail
+
+FACTORY_ROOT="${FACTORY_ROOT:-/home/agent/disinto}"
+OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/agent/ops}"
+
+log() {
+  printf '[%s] runner: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*"
+}
+
+# Configure git credential helper so formulas can clone/push without
+# needing tokens embedded in remote URLs (#604).
+if [ -f "${FACTORY_ROOT}/lib/git-creds.sh" ]; then
+  # shellcheck source=lib/git-creds.sh
+  source "${FACTORY_ROOT}/lib/git-creds.sh"
+  # shellcheck disable=SC2119  # no args intended — uses defaults
+  configure_git_creds
+fi
+
+# ── Argument parsing ─────────────────────────────────────────────────────
+
+action_id="${1:-}"
+if [ -z "$action_id" ]; then
+  log "ERROR: action-id argument required"
+  echo "Usage: entrypoint-runner.sh <action-id>" >&2
+  exit 1
+fi
+
+# ── Read vault action TOML ───────────────────────────────────────────────
+
+action_toml="${OPS_REPO_ROOT}/vault/actions/${action_id}.toml"
+if [ ! -f "$action_toml" ]; then
+  log "ERROR: vault action TOML not found: ${action_toml}"
+  exit 1
+fi
+
+# Extract formula name from TOML
+formula=$(grep -E '^formula\s*=' "$action_toml" \
+  | sed -E 's/^formula\s*=\s*"(.*)"/\1/' | tr -d '\r')
+
+if [ -z "$formula" ]; then
+  log "ERROR: no 'formula' field found in ${action_toml}"
+  exit 1
+fi
+
+# Extract context for logging
+context=$(grep -E '^context\s*=' "$action_toml" \
+  | sed -E 's/^context\s*=\s*"(.*)"/\1/' | tr -d '\r')
+
+log "Action: ${action_id}, formula: ${formula}, context: ${context:-<none>}"
+
+# Export action TOML path so formula scripts can use it directly
+export VAULT_ACTION_TOML="$action_toml"
+
+# ── Dispatch: .sh (mechanical) vs .toml (Claude reasoning) ──────────────
+
+formula_sh="${FACTORY_ROOT}/formulas/${formula}.sh"
+formula_toml="${FACTORY_ROOT}/formulas/${formula}.toml"
+
+if [ -f "$formula_sh" ]; then
+  # Mechanical operation — run directly
+  log "Dispatching to shell script: ${formula_sh}"
+  exec bash "$formula_sh" "$action_id"
+
+elif [ -f "$formula_toml" ]; then
+  # Reasoning task — launch Claude with the formula as prompt
+  log "Dispatching to Claude with formula: ${formula_toml}"
+
+  formula_content=$(cat "$formula_toml")
+  action_context=$(cat "$action_toml")
+
+  prompt="You are a vault runner executing a formula-based operational task.
+
+## Vault action
+\`\`\`toml
+${action_context}
+\`\`\`
+
+## Formula
+\`\`\`toml
+${formula_content}
+\`\`\`
+
+## Instructions
+Execute the steps defined in the formula above. The vault action context provides
+the specific parameters for this run. Execute each step in order, verifying
+success before proceeding to the next.
+
+FACTORY_ROOT=${FACTORY_ROOT}
+OPS_REPO_ROOT=${OPS_REPO_ROOT}
+"
+
+  exec claude -p "$prompt" \
+    --dangerously-skip-permissions \
+    ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"}
+
+else
+  log "ERROR: no formula found for '${formula}' — checked ${formula_sh} and ${formula_toml}"
+  exit 1
+fi
--- a/docs/BLAST-RADIUS.md
+++ b/docs/BLAST-RADIUS.md
@ -0,0 +1,25 @@
+# Vault blast-radius tiers
+
+## Tiers
+
+| Tier | Meaning | Dispatch path |
+|------|---------|---------------|
+| low | Revertable, no external side effects | Direct commit to ops main; no human gate |
+| medium | Significant but reversible | PR on ops repo; blocks calling agent until merged |
+| high | Irreversible or high-blast-radius | PR on ops repo; hard blocks |
+
+## Which agents are affected
+
+Vault-blocking applies to: predictor, planner, architect, deploy pipelines, releases, shipping.
+It does NOT apply to dev-agent — dev-agent work is always committed to a feature branch and
+revertable via git revert. Dev-agent never needs a vault gate.
+
+## Default tier
+
+Unknown formulas default to `high`. When adding a new formula, add it to
+`vault/policy.toml` (in ops repo, seeded during disinto init from disinto repo template).
+
+## Per-action override
+
+A vault action TOML may include `blast_radius = "low"` to override the policy tier
+for that specific invocation. Use sparingly — policy.toml is the authoritative source.
--- a/docs/CLAUDE-AUTH-CONCURRENCY.md
+++ b/docs/CLAUDE-AUTH-CONCURRENCY.md
@ -0,0 +1,138 @@
+# Claude Code OAuth Concurrency Model
+
+## Problem statement
+
+The factory runs multiple concurrent Claude Code processes across
+containers. OAuth access tokens are short-lived; refresh tokens rotate
+on each use. If two processes POST the same refresh token to Anthropic's
+token endpoint simultaneously, only one wins — the other gets
+`invalid_grant` and the operator is forced to re-login.
+
+Claude Code already serializes OAuth refreshes internally using
+`proper-lockfile` (`src/utils/auth.ts:1485-1491`):
+
+```typescript
+release = await lockfile.lock(claudeDir)
+```
+
+`proper-lockfile` creates a lockfile via an atomic `mkdir(${path}.lock)`
+call — a cross-process primitive that works across any number of
+processes on the same filesystem. The problem was never the lock
+implementation; it was that our old per-container bind-mount layout
+(`~/.claude` mounted but `/home/agent/` container-local) caused each
+container to compute a different lockfile path, so the locks never
+coordinated.
+
+## The fix: shared `CLAUDE_CONFIG_DIR`
+
+`CLAUDE_CONFIG_DIR` is an officially supported env var in Claude Code
+(`src/utils/envUtils.ts`). It controls where Claude resolves its config
+directory instead of the default `~/.claude`.
+
+By setting `CLAUDE_CONFIG_DIR` to a path on a shared bind mount, every
+container computes the **same** lockfile location. `proper-lockfile`'s
+atomic `mkdir(${CLAUDE_CONFIG_DIR}.lock)` then gives free cross-container
+serialization — no external wrapper needed.
+
+## Current layout
+
+```
+Host filesystem:
+  /var/lib/disinto/claude-shared/          ← CLAUDE_SHARED_DIR
+  └── config/                              ← CLAUDE_CONFIG_DIR
+      ├── .credentials.json
+      ├── settings.json
+      └── ...
+
+Inside every container:
+  Same absolute path: /var/lib/disinto/claude-shared/config
+  Env: CLAUDE_CONFIG_DIR=/var/lib/disinto/claude-shared/config
+```
+
+The shared directory is mounted at the **same absolute path** inside
+every container, so `proper-lockfile` resolves an identical lock path
+everywhere.
+
+### Where these values are defined
+
+| What | Where |
+|------|-------|
+| Defaults for `CLAUDE_SHARED_DIR`, `CLAUDE_CONFIG_DIR` | `lib/env.sh:138-140` |
+| `.env` documentation | `.env.example:92-99` |
+| Container mounts + env passthrough (edge dispatcher) | `docker/edge/dispatcher.sh:446-448` (and analogous blocks for reproduce, triage, verify) |
+| Auth detection using `CLAUDE_CONFIG_DIR` | `docker/agents/entrypoint.sh:101-102` |
+| Bootstrap / migration during `disinto init` | `lib/claude-config.sh:setup_claude_config_dir()`, `bin/disinto:952-962` |
+
+## Migration for existing dev boxes
+
+For operators upgrading from the old `~/.claude` bind-mount layout,
+`disinto init` handles the migration interactively (or with `--yes`).
+The manual equivalent is:
+
+```bash
+# 1. Stop the factory
+disinto down
+
+# 2. Create the shared directory
+mkdir -p /var/lib/disinto/claude-shared
+
+# 3. Move existing config
+mv "$HOME/.claude" /var/lib/disinto/claude-shared/config
+
+# 4. Create a back-compat symlink so host-side claude still works
+ln -sfn /var/lib/disinto/claude-shared/config "$HOME/.claude"
+
+# 5. Export the env var (add to shell rc for persistence)
+export CLAUDE_CONFIG_DIR=/var/lib/disinto/claude-shared/config
+
+# 6. Start the factory
+disinto up
+```
+
+## Verification
+
+Watch for these analytics events during concurrent agent runs:
+
+| Event | Meaning |
+|-------|---------|
+| `tengu_oauth_token_refresh_lock_acquiring` | A process is attempting to acquire the refresh lock |
+| `tengu_oauth_token_refresh_lock_acquired` | Lock acquired; refresh proceeding |
+| `tengu_oauth_token_refresh_lock_retry` | Lock is held by another process; retrying |
+| `tengu_oauth_token_refresh_lock_race_resolved` | Contention detected and resolved normally |
+| `tengu_oauth_token_refresh_lock_retry_limit_reached` | Lock acquisition failed after all retries |
+
+**Healthy:** `_race_resolved` appearing during contention windows — this
+means multiple processes tried to refresh simultaneously and the lock
+correctly serialized them.
+
+**Bad:** `_lock_retry_limit_reached` — indicates the lock is stuck or
+the shared mount is not working. Verify that `CLAUDE_CONFIG_DIR` resolves
+to the same path in all containers and that the filesystem supports
+`mkdir` atomicity (any POSIX filesystem does).
+
+## The deferred external `flock` wrapper
+
+`lib/agent-sdk.sh:139,144` still wraps every `claude` invocation in an
+external `flock` on `${HOME}/.claude/session.lock`:
+
+```bash
+local lock_file="${HOME}/.claude/session.lock"
+...
+output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1;
+  claude_run_with_watchdog claude "${args[@]}" ) 9>"$lock_file" ...)
+```
+
+With the `CLAUDE_CONFIG_DIR` fix in place, this external lock is
+**redundant but harmless** — `proper-lockfile` serializes the refresh
+internally, and `flock` serializes the entire invocation externally.
+The external flock remains as a defense-in-depth measure; removal is
+tracked as a separate vision-tier issue.
+
+## See also
+
+- `lib/env.sh:138-140` — `CLAUDE_SHARED_DIR` / `CLAUDE_CONFIG_DIR` defaults
+- `lib/claude-config.sh` — migration helper used by `disinto init`
+- `lib/agent-sdk.sh:139,144` — the external `flock` wrapper (deferred removal)
+- `docker/agents/entrypoint.sh:101-102` — `CLAUDE_CONFIG_DIR` auth detection
+- `.env.example:92-99` — operator-facing documentation of the env vars
+- Issue #623 — chat container auth strategy
--- a/docs/EVAL-MCP-SERVER.md
+++ b/docs/EVAL-MCP-SERVER.md
@ -39,9 +39,11 @@ programmatically instead of parsing SKILL.md instructions.
   (`mcp` package). This adds a build step, runtime dependency, and
   language that no current contributor or agent maintains.

-2. **Persistent process.** The factory is cron-driven — no long-running
-   daemons. An MCP server must stay up, be monitored, and be restarted on
-   failure. This contradicts the factory's event-driven architecture (AD-004).
+2. **Persistent process.** The factory already runs a long-lived polling loop
+   (`docker/agents/entrypoint.sh`), so an MCP server is not architecturally
+   alien — the loop could keep an MCP client alive across iterations. However,
+   adding a second long-running process increases the monitoring surface and
+   restart complexity.

 3. **Thin wrapper over existing APIs.** Every proposed MCP tool maps directly
   to a forge API call or a skill script invocation. The MCP server would be
--- a/docs/PHASE-PROTOCOL.md
+++ b/docs/PHASE-PROTOCOL.md
@ -92,10 +92,9 @@ PHASE:failed          → label issue blocked, post diagnostic comment

 ### `idle_prompt` exit reason

-`monitor_phase_loop` (in `lib/agent-session.sh`) can exit with
-`_MONITOR_LOOP_EXIT=idle_prompt`. This happens when Claude returns to the
-interactive prompt (`❯`) for **3 consecutive polls** without writing any phase
-signal to the phase file.
+The phase monitor can exit with `_MONITOR_LOOP_EXIT=idle_prompt`. This happens
+when Claude returns to the interactive prompt (`❯`) for **3 consecutive polls**
+without writing any phase signal to the phase file.

 **Trigger conditions:**
 - The phase file is empty (no phase has ever been written), **and**
@ -111,14 +110,13 @@ signal to the phase file.
   callback without the phase file actually containing that value.

 **Agent requirements:**
- **Callback (`_on_phase_change` / `formula_phase_callback`):** Must handle
-  `PHASE:failed` defensively — the session is already dead, so any tmux
-  send-keys or session-dependent logic must be skipped or guarded.
+- **Callback:** Must handle `PHASE:failed` defensively — the session is already
+  dead, so any tmux send-keys or session-dependent logic must be skipped or
+  guarded.
 - **Post-loop exit handler (`case $_MONITOR_LOOP_EXIT`):** Must include an
  `idle_prompt)` branch. Typical actions: log the event, clean up temp files,
  and (for agents that use escalation) write an escalation entry or notify via
-  vault/forge. See `dev/dev-agent.sh` and
-  `gardener/gardener-agent.sh` for reference implementations.
+  vault/forge. See `dev/dev-agent.sh` for reference implementations.

 ## Crash Recovery

--- a/docs/VAULT.md
+++ b/docs/VAULT.md
@ -33,9 +33,11 @@ The `main` branch on the ops repo (`johba/disinto-ops`) is protected via Forgejo
   - Title: `vault: <action-id>`
   - Labels: `vault`, `pending-approval`
   - File: `vault/actions/<action-id>.toml`
+   - **Auto-merge enabled** — Forgejo will auto-merge after approval
 4. **Approval** — Admin user reviews and approves the PR
-5. **Execution** — Dispatcher (issue #76) polls for approved vault PRs and executes them
-6. **Cleanup** — Executed vault items are moved to `fired/` (via PR)
+5. **Auto-merge** — Forgejo automatically merges the PR once required approvals are met
+6. **Execution** — Dispatcher (issue #76) polls for merged vault PRs and executes them
+7. **Cleanup** — Executed vault items are moved to `fired/` (via PR)

 ## Bot Account Behavior

@ -43,6 +45,7 @@ Bot accounts (dev-bot, review-bot, vault-bot, etc.) **cannot merge vault PRs** e

 - Only human admins can approve sensitive vault actions
 - Bot accounts can only create vault PRs, not execute them
+- Bot accounts cannot self-approve vault PRs (Forgejo prevents this automatically)
 - Manual admin review is always required for privileged operations

 ## Setup
--- a/docs/edge-routing-fallback.md
+++ b/docs/edge-routing-fallback.md
@ -0,0 +1,149 @@
+# Edge Routing Fallback: Per-Project Subdomains
+
+> **Status:** Contingency plan. Only implement if subpath routing (#704 / #708)
+> proves unworkable.
+
+## Context
+
+The primary approach routes services under subpaths of `<project>.disinto.ai`:
+
+| Service    | Primary (subpath)                          |
+|------------|--------------------------------------------|
+| Forgejo    | `<project>.disinto.ai/forge/`              |
+| Woodpecker | `<project>.disinto.ai/ci/`                 |
+| Chat       | `<project>.disinto.ai/chat/`               |
+| Staging    | `<project>.disinto.ai/staging/`            |
+
+The fallback uses per-service subdomains instead:
+
+| Service    | Fallback (subdomain)                       |
+|------------|--------------------------------------------|
+| Forgejo    | `forge.<project>.disinto.ai/`              |
+| Woodpecker | `ci.<project>.disinto.ai/`                 |
+| Chat       | `chat.<project>.disinto.ai/`               |
+| Staging    | `<project>.disinto.ai/`  (root)            |
+
+The wildcard cert from #621 already covers `*.<project>.disinto.ai` — no new
+DNS records or certs are needed for sub-subdomains because `*.disinto.ai`
+matches one level deep. For sub-subdomains like `forge.<project>.disinto.ai`
+we would need to add a second wildcard (`*.*.disinto.ai`) or explicit DNS
+records per project. Both are straightforward with the existing Gandi DNS-01
+setup.
+
+## Pivot Decision Criteria
+
+**Pivot if:**
+
+- Forgejo `ROOT_URL` under a subpath (`/forge/`) causes redirect loops that
+  cannot be fixed with `X-Forwarded-Prefix` or Caddy `uri strip_prefix`.
+- Woodpecker's `WOODPECKER_HOST` does not honour subpath prefixes, causing
+  OAuth callback mismatches that persist after adjusting redirect URIs.
+- Forward-auth on `/chat/*` conflicts with Forgejo's own OAuth flow when both
+  share the same origin (cookie collision, CSRF token mismatch).
+
+**Do NOT pivot if:**
+
+- Forgejo login redirects to `/` instead of `/forge/` — fixable with Caddy
+  `handle_path` + `uri prefix` rewrite.
+- Woodpecker UI assets 404 under `/ci/` — fixable with asset prefix config
+  (`WOODPECKER_ROOT_PATH`).
+- A single OAuth app needs a second redirect URI — Forgejo supports multiple
+  `redirect_uris` in the same app.
+
+## Fallback Topology
+
+### Caddyfile
+
+Replace the single `:80` block with four host blocks:
+
+```caddy
+# Main project domain — staging / landing
+<project>.disinto.ai {
+    reverse_proxy staging:80
+}
+
+# Forgejo — root path, no subpath rewrite needed
+forge.<project>.disinto.ai {
+    reverse_proxy forgejo:3000
+}
+
+# Woodpecker CI — root path
+ci.<project>.disinto.ai {
+    reverse_proxy woodpecker:8000
+}
+
+# Chat — with forward_auth (same as #709, but on its own host)
+chat.<project>.disinto.ai {
+    handle /login {
+        reverse_proxy chat:8080
+    }
+    handle /oauth/callback {
+        reverse_proxy chat:8080
+    }
+    handle /* {
+        forward_auth chat:8080 {
+            uri /auth/verify
+            copy_headers X-Forwarded-User
+            header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}
+        }
+        reverse_proxy chat:8080
+    }
+}
+```
+
+**Current file:** `docker/Caddyfile` (generated by `lib/generators.sh:_generate_caddyfile_impl`, line ~596).
+
+### Service Configuration Changes
+
+| Variable / Setting         | Current (subpath)                              | Fallback (subdomain)                            | File                        |
+|----------------------------|------------------------------------------------|-------------------------------------------------|-----------------------------|
+| Forgejo `ROOT_URL`         | `https://<project>.disinto.ai/forge/`          | `https://forge.<project>.disinto.ai/`           | forgejo `app.ini`           |
+| `WOODPECKER_HOST`          | `http://localhost:8000` (subpath via proxy)     | `https://ci.<project>.disinto.ai`               | `lib/ci-setup.sh` line ~164 |
+| Woodpecker OAuth redirect  | `https://<project>.disinto.ai/ci/authorize`    | `https://ci.<project>.disinto.ai/authorize`     | `lib/ci-setup.sh` line ~153 |
+| Chat OAuth redirect        | `https://<project>.disinto.ai/chat/oauth/callback` | `https://chat.<project>.disinto.ai/oauth/callback` | `lib/ci-setup.sh` line ~188 |
+| `EDGE_TUNNEL_FQDN`         | `<project>.disinto.ai`                         | unchanged (main domain)                         | `lib/generators.sh` line ~432 |
+
+### New Environment Variables (pivot only)
+
+These would be added to `lib/generators.sh` `_generate_compose_impl()` in the
+edge service environment block (currently line ~415):
+
+| Variable                     | Value                                  |
+|------------------------------|----------------------------------------|
+| `EDGE_TUNNEL_FQDN_FORGE`    | `forge.<project>.disinto.ai`           |
+| `EDGE_TUNNEL_FQDN_CI`       | `ci.<project>.disinto.ai`              |
+| `EDGE_TUNNEL_FQDN_CHAT`     | `chat.<project>.disinto.ai`            |
+
+### DNS
+
+No new records needed if the registrar supports `*.*.disinto.ai` wildcards.
+Otherwise, add explicit A/CNAME records per project:
+
+```
+forge.<project>.disinto.ai  → edge server IP
+ci.<project>.disinto.ai     → edge server IP
+chat.<project>.disinto.ai   → edge server IP
+```
+
+The edge server already handles TLS via Caddy's automatic HTTPS with the
+existing ACME / DNS-01 challenge.
+
+### Edge Control (`tools/edge-control/register.sh`)
+
+Currently `do_register()` creates a single route for `<project>.disinto.ai`.
+The fallback would need to register four routes (or accept a `--subdomain`
+parameter). See the TODO in `register.sh`.
+
+## Files to Change on Pivot
+
+| File                              | What changes                                                    |
+|-----------------------------------|-----------------------------------------------------------------|
+| `docker/Caddyfile`               | Replace single host block → four host blocks (see above)        |
+| `lib/generators.sh`              | Add `EDGE_TUNNEL_FQDN_{FORGE,CI,CHAT}` env vars to compose     |
+| `lib/ci-setup.sh` ~line 153      | Woodpecker OAuth redirect URI → `ci.<project>` subdomain        |
+| `lib/ci-setup.sh` ~line 188      | Chat OAuth redirect URI → `chat.<project>` subdomain            |
+| `tools/edge-control/register.sh` | Register four routes per project instead of one                 |
+| `tools/edge-control/lib/caddy.sh`| `add_route()` gains subdomain support                           |
+| forgejo `app.ini`                 | `ROOT_URL` → `https://forge.<project>.disinto.ai/`             |
+
+Estimated effort for a full pivot: **under one day** given this plan.
--- a/docs/investigation-685-reviewer-approved-destructive-compose.md
+++ b/docs/investigation-685-reviewer-approved-destructive-compose.md
@ -0,0 +1,123 @@
+# Investigation: Reviewer approved destructive compose rewrite in PR #683
+
+**Issue**: #685
+**Date**: 2026-04-11
+**PR under investigation**: #683 (fix: config: gardener=1h, architect=9m, planner=11m)
+
+## Summary
+
+The reviewer agent approved PR #683 in ~1 minute without flagging that it
+contained a destructive rewrite of `docker-compose.yml` — dropping named
+volumes, bind mounts, env vars, restart policy, and security options. Six
+structural gaps in the review pipeline allowed this to pass.
+
+## Root causes
+
+### 1. No infrastructure-file-specific review checklist
+
+The review formula (`formulas/review-pr.toml`) has a generic review checklist
+(bugs, security, imports, architecture, bash specifics, dead code). It has
+**no special handling for infrastructure files** — `docker-compose.yml`,
+`Dockerfile`, CI configs, or `entrypoint.sh` are reviewed with the same
+checklist as application code.
+
+Infrastructure files have a different failure mode: a single dropped line
+(a volume mount, an env var, a restart policy) can break a running deployment
+without any syntax error or linting failure. The generic checklist doesn't
+prompt the reviewer to check for these regressions.
+
+**Fix applied**: Added step 3c "Infrastructure file review" to
+`formulas/review-pr.toml` with a compose-specific checklist covering named
+volumes, bind mounts, env vars, restart policy, and security options.
+
+### 2. No scope discipline
+
+Issue #682 asked for ~3 env var changes + `PLANNER_INTERVAL` plumbing — roughly
+10-15 lines across 3-4 files. PR #683's diff rewrote the entire compose service
+block (~50+ lines changed in `docker-compose.yml` alone).
+
+The review formula **does not instruct the reviewer to compare diff size against
+issue scope**. A scope-aware reviewer would flag: "this PR changes more lines
+than the issue scope warrants — request justification for out-of-scope changes."
+
+**Fix applied**: Added step 3d "Scope discipline" to `formulas/review-pr.toml`
+requiring the reviewer to compare actual changes against stated issue scope and
+flag out-of-scope modifications to infrastructure files.
+
+### 3. Lessons-learned bias toward approval
+
+The reviewer's `.profile/knowledge/lessons-learned.md` contains multiple entries
+that systematically bias toward approval:
+
+- "Approval means 'ready to ship,' not 'perfect.'"
+- "'Different from how I'd write it' is not a blocker."
+- "Reserve request_changes for genuinely blocking concerns."
+
+These lessons are well-intentioned (they prevent nit-picking and false blocks)
+but they create a blind spot: the reviewer suppresses its instinct to flag
+suspicious-looking changes because the lessons tell it not to block on
+"taste-based" concerns. A compose service block rewrite *looks* like a style
+preference ("the dev reorganized the file") but is actually a correctness
+regression.
+
+**Recommendation**: The lessons-learned are not wrong — they should stay. But
+the review formula now explicitly carves out infrastructure files from the
+"bias toward APPROVE" guidance, making it clear that dropped infra
+configuration is a blocking concern, not a style preference.
+
+### 4. No ground-truth for infrastructure files
+
+The reviewer only sees the diff. It has no way to compare against the running
+container's actual volume/env config. When dev-qwen rewrote a 30-line service
+block from scratch, the reviewer saw a 30-line addition and a 30-line deletion
+with no reference point.
+
+**Recommendation (future work)**: Maintain a `docker/expected-compose-config.yml`
+or have the reviewer fetch `docker compose config` output as ground truth when
+reviewing compose changes. This would let the reviewer diff the proposed config
+against the known-good config.
+
+### 5. Structural analysis blind spot
+
+`lib/build-graph.py` tracks changes to files in `formulas/`, agent directories
+(`dev/`, `review/`, etc.), and `evidence/`. It does **not track infrastructure
+files** (`docker-compose.yml`, `docker/`, `.woodpecker/`). Changes to these
+files produce no alerts in the graph report — the reviewer gets no
+"affected objectives" signal for infrastructure changes.
+
+**Recommendation (future work)**: Add infrastructure file tracking to
+`build-graph.py` so that compose/Dockerfile/CI changes surface in the
+structural analysis.
+
+### 6. Model and time budget
+
+Reviews use Sonnet (`CLAUDE_MODEL="sonnet"` at `review-pr.sh:229`) with a
+15-minute timeout. The PR #683 review completed in ~1 minute. Sonnet is
+optimized for speed, which is appropriate for most code reviews, but
+infrastructure changes benefit from the deeper reasoning of a more capable
+model.
+
+**Recommendation (future work)**: Consider escalating to a more capable model
+when the diff includes infrastructure files (compose, Dockerfiles, CI configs).
+
+## Changes made
+
+1. **`formulas/review-pr.toml`** — Added two new review steps:
+   - **Step 3c: Infrastructure file review** — When the diff touches
+     `docker-compose.yml`, `Dockerfile*`, `.woodpecker/`, or `docker/`,
+     requires checking for dropped volumes, bind mounts, env vars, restart
+     policy, security options, and network config. Instructs the reviewer to
+     read the full file (not just the diff) and compare against the base branch.
+   - **Step 3d: Scope discipline** — Requires comparing the actual diff
+     footprint against the stated issue scope. Flags out-of-scope rewrites of
+     infrastructure files as blocking concerns.
+
+## What would have caught this
+
+With the changes above, the reviewer would have:
+
+1. Seen step 3c trigger for `docker-compose.yml` changes
+2. Read the full compose file and compared against the base branch
+3. Noticed the dropped named volumes, bind mounts, env vars, restart policy
+4. Seen step 3d flag that a 3-env-var issue produced a 50+ line compose rewrite
+5. Issued REQUEST_CHANGES citing specific dropped configuration
--- a/docs/updating-factory.md
+++ b/docs/updating-factory.md
@ -0,0 +1,175 @@
+# Updating the Disinto Factory
+
+How to update the disinto factory code on a deployment box (e.g. harb-dev-box)
+after a new version lands on the upstream Forgejo.
+
+## Prerequisites
+
+- SSH access to the deployment box
+- The upstream remote (`devbox`) pointing to the disinto-dev-box Forgejo
+
+## Step 1: Pull the latest code
+
+```bash
+cd ~/disinto
+git fetch devbox main
+git log --oneline devbox/main -5   # review what changed
+git stash                           # save any local fixes
+git merge devbox/main
+```
+
+## Note: docker-compose.yml is generator-only
+
+The `docker-compose.yml` file is now generated exclusively by `bin/disinto init`.
+The tracked file has been removed. If you have a local `docker-compose.yml` from
+before this change, it is now "yours" and won't be touched by future updates.
+To pick up generator improvements, delete the existing file and run `bin/disinto init`.
+
+## Step 2: Preserve local config
+
+These files are not in git but are needed at runtime. Back them up before
+any compose regeneration:
+
+```bash
+cp .env .env.backup
+cp projects/harb.toml projects/harb.toml.backup
+cp docker-compose.override.yml docker-compose.override.yml.backup 2>/dev/null
+```
+
+## Step 3: Regenerate docker-compose.yml
+
+If `generate_compose()` changed or you need a fresh compose file:
+
+```bash
+rm docker-compose.yml
+source .env
+bin/disinto init https://codeberg.org/johba/harb --branch master --yes
+```
+
+This will regenerate the compose but may fail partway through (token collisions,
+existing users). The compose file is written early — check it exists even if
+init errors out.
+
+### Known post-regeneration fixes (until #429 lands)
+
+Most generator issues have been fixed. The following items no longer apply:
+
+- **AppArmor (#492)** — Fixed: all services now have `apparmor=unconfined`
+- **Forgejo image tag (#493)** — Fixed: generator uses `forgejo:11.0`
+- **Agent credential mounts (#495)** — Fixed: `.claude`, `.claude.json`, `.ssh`, and `project-repos` volumes are auto-generated
+- **Repo path (#494)** — Not applicable: `projects/*.toml` files are gitignored and preserved
+
+If you need to add custom volumes, edit the generated `docker-compose.yml` directly.
+It will not be overwritten by future `init` runs (the generator skips existing files).
+
+## Step 4: Rebuild and restart
+
+```bash
+# Rebuild agents image (code is baked in via COPY)
+docker compose build agents
+
+# Restart all disinto services
+docker compose up -d
+
+# If edge fails to build (caddy:alpine has no apt-get), skip it:
+docker compose up -d forgejo woodpecker woodpecker-agent agents staging
+```
+
+## Step 5: Verify
+
+```bash
+# All containers running?
+docker ps --format 'table {{.Names}}\t{{.Status}}' | grep disinto
+
+# Forgejo responding?
+curl -sf -o /dev/null -w 'HTTP %{http_code}' http://localhost:3000/
+
+# Claude auth works?
+docker exec -u agent disinto-agents bash -c 'claude -p "say ok" 2>&1'
+
+# Agent polling loop running?
+docker exec disinto-agents pgrep -f entrypoint.sh
+# If no process: check that entrypoint.sh is the container CMD and projects TOML is mounted.
+
+# Agent repo cloned?
+docker exec disinto-agents ls /home/agent/repos/harb/.git && echo ok
+# If missing:
+docker exec disinto-agents chown -R agent:agent /home/agent/repos
+source .env
+docker exec -u agent disinto-agents bash -c \
+  "git clone http://dev-bot:${FORGE_TOKEN}@forgejo:3000/johba/harb.git /home/agent/repos/harb"
+
+# Git safe.directory (needed after volume recreation)
+docker exec -u agent disinto-agents git config --global --add safe.directory /home/agent/repos/harb
+```
+
+## Step 6: Verify harb stack coexistence
+
+```bash
+# Harb stack still running?
+cd ~/harb && docker compose ps --format 'table {{.Name}}\t{{.Status}}'
+
+# No port conflicts?
+# Forgejo: 3000, Woodpecker: 8000, harb caddy: 8081, umami: 3001
+ss -tlnp | grep -E '3000|3001|8000|8081'
+```
+
+## Step 7: Docker disk hygiene
+
+The reproduce image is ~1.3GB. Dangling images accumulate fast.
+
+```bash
+# Check disk
+df -h /
+
+# Prune dangling images (safe — only removes unused)
+docker image prune -f
+
+# Nuclear option (removes ALL unused images, volumes, networks):
+docker system prune -af
+# WARNING: this removes cached layers, requiring full rebuilds
+```
+
+## Troubleshooting
+
+### Forgejo at 170%+ CPU, not responding
+AppArmor issue. Add `security_opt: [apparmor=unconfined]` and recreate:
+```bash
+docker compose up -d forgejo
+```
+
+### "Not logged in" / OAuth expired
+Re-auth on the host:
+```bash
+claude auth login
+```
+Credentials are bind-mounted into containers automatically.
+Multiple containers sharing OAuth can cause frequent expiry — consider
+using `ANTHROPIC_API_KEY` in `.env` instead.
+
+### Agent loop not running after restart
+The entrypoint reads `projects/*.toml` to determine which agents to run.
+If the TOML isn't mounted or the disinto directory is read-only,
+the polling loop won't start agents. Check:
+```bash
+docker exec disinto-agents ls /home/agent/disinto/projects/harb.toml
+docker logs disinto-agents --tail 20  # look for "Entering polling loop"
+```
+
+### "fatal: not a git repository"
+After image rebuilds, the baked-in `/home/agent/disinto` has no `.git`.
+This breaks review-pr.sh (#408). Workaround:
+```bash
+docker exec -u agent disinto-agents git config --global --add safe.directory '*'
+```
+
+### Dev-agent stuck on closed issue
+The dev-poll latches onto in-progress issues. If the issue was closed
+externally, the agent skips it every cycle but never moves on. Check:
+```bash
+docker exec disinto-agents tail -5 /home/agent/data/logs/dev/dev-agent.log
+```
+Fix: clean the worktree and let it re-scan:
+```bash
+docker exec disinto-agents rm -rf /tmp/harb-worktree-*
+```
--- a/formulas/release.sh
+++ b/formulas/release.sh
@ -0,0 +1,187 @@
+#!/usr/bin/env bash
+# formulas/release.sh — Mechanical release script
+#
+# Implements the release workflow without Claude:
+#   1. Validate prerequisites
+#   2. Tag Forgejo main via API
+#   3. Push tag to mirrors (Codeberg, GitHub) via token auth
+#   4. Build and tag the agents Docker image
+#   5. Restart agent containers
+#
+# Usage: release.sh <action-id>
+#
+# Expects env vars:
+#   FORGE_URL, FORGE_TOKEN, FORGE_REPO, PRIMARY_BRANCH
+#   GITHUB_TOKEN    — for pushing tags to GitHub mirror
+#   CODEBERG_TOKEN  — for pushing tags to Codeberg mirror
+#
+# The action TOML context field must contain the version, e.g.:
+#   context = "Release v1.2.0"
+#
+# Part of #516.
+
+set -euo pipefail
+
+FACTORY_ROOT="${FACTORY_ROOT:-/home/agent/disinto}"
+OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/agent/ops}"
+
+log() {
+  printf '[%s] release: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*"
+}
+
+# ── Argument parsing ─────────────────────────────────────────────────────
+# VAULT_ACTION_TOML is exported by the runner entrypoint (entrypoint-runner.sh)
+
+action_id="${1:-}"
+if [ -z "$action_id" ]; then
+  log "ERROR: action-id argument required"
+  exit 1
+fi
+
+action_toml="${VAULT_ACTION_TOML:-${OPS_REPO_ROOT}/vault/actions/${action_id}.toml}"
+if [ ! -f "$action_toml" ]; then
+  log "ERROR: vault action TOML not found: ${action_toml}"
+  exit 1
+fi
+
+# Extract version from context field (e.g. "Release v1.2.0" → "v1.2.0")
+context=$(grep -E '^context\s*=' "$action_toml" \
+  | sed -E 's/^context\s*=\s*"(.*)"/\1/' | tr -d '\r')
+RELEASE_VERSION=$(echo "$context" | grep -oE 'v[0-9]+\.[0-9]+\.[0-9]+') || true
+
+if [ -z "${RELEASE_VERSION:-}" ]; then
+  log "ERROR: could not extract version from context: '${context}'"
+  log "Context must contain a version like v1.2.0"
+  exit 1
+fi
+
+log "Starting release ${RELEASE_VERSION} (action: ${action_id})"
+
+# ── Step 1: Preflight ────────────────────────────────────────────────────
+
+log "Step 1/6: Preflight checks"
+
+# Validate version format
+if ! echo "$RELEASE_VERSION" | grep -qE '^v[0-9]+\.[0-9]+\.[0-9]+$'; then
+  log "ERROR: invalid version format: ${RELEASE_VERSION}"
+  exit 1
+fi
+
+# Required env vars
+for var in FORGE_URL FORGE_TOKEN FORGE_REPO PRIMARY_BRANCH; do
+  if [ -z "${!var:-}" ]; then
+    log "ERROR: required env var not set: ${var}"
+    exit 1
+  fi
+done
+
+# Check Docker access
+if ! docker info >/dev/null 2>&1; then
+  log "ERROR: Docker not accessible"
+  exit 1
+fi
+
+# Check tag doesn't already exist on Forgejo
+if curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  "${FORGE_URL}/api/v1/repos/${FORGE_REPO}/tags/${RELEASE_VERSION}" >/dev/null 2>&1; then
+  log "ERROR: tag ${RELEASE_VERSION} already exists on Forgejo"
+  exit 1
+fi
+
+log "Preflight passed"
+
+# ── Step 2: Tag main via Forgejo API ─────────────────────────────────────
+
+log "Step 2/6: Creating tag ${RELEASE_VERSION} on Forgejo"
+
+# Get HEAD SHA of primary branch
+head_sha=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  "${FORGE_URL}/api/v1/repos/${FORGE_REPO}/branches/${PRIMARY_BRANCH}" \
+  | jq -r '.commit.id // empty')
+
+if [ -z "$head_sha" ]; then
+  log "ERROR: could not get HEAD SHA for ${PRIMARY_BRANCH}"
+  exit 1
+fi
+
+# Create tag via API
+curl -sf -X POST \
+  -H "Authorization: token ${FORGE_TOKEN}" \
+  -H "Content-Type: application/json" \
+  "${FORGE_URL}/api/v1/repos/${FORGE_REPO}/tags" \
+  -d "{\"tag_name\":\"${RELEASE_VERSION}\",\"target\":\"${head_sha}\",\"message\":\"Release ${RELEASE_VERSION}\"}" \
+  >/dev/null
+
+log "Tag ${RELEASE_VERSION} created (SHA: ${head_sha})"
+
+# ── Step 3: Push tag to mirrors ──────────────────────────────────────────
+
+log "Step 3/6: Pushing tag to mirrors"
+
+# Extract org/repo from FORGE_REPO (e.g. "disinto-admin/disinto" → "disinto")
+project_name="${FORGE_REPO##*/}"
+
+# Push to GitHub mirror (if GITHUB_TOKEN is available)
+if [ -n "${GITHUB_TOKEN:-}" ]; then
+  log "Pushing tag to GitHub mirror"
+  # Create tag on GitHub via API
+  if curl -sf -X POST \
+    -H "Authorization: token ${GITHUB_TOKEN}" \
+    -H "Accept: application/vnd.github+json" \
+    "https://api.github.com/repos/Disinto/${project_name}/git/refs" \
+    -d "{\"ref\":\"refs/tags/${RELEASE_VERSION}\",\"sha\":\"${head_sha}\"}" \
+    >/dev/null 2>&1; then
+    log "GitHub: tag pushed"
+  else
+    log "WARNING: GitHub tag push failed (may already exist)"
+  fi
+else
+  log "WARNING: GITHUB_TOKEN not set — skipping GitHub mirror"
+fi
+
+# Push to Codeberg mirror (if CODEBERG_TOKEN is available)
+if [ -n "${CODEBERG_TOKEN:-}" ]; then
+  log "Pushing tag to Codeberg mirror"
+  # Codeberg uses Gitea-compatible API
+  # Extract owner from FORGE_REPO for Codeberg (use same owner)
+  codeberg_owner="${FORGE_REPO%%/*}"
+  if curl -sf -X POST \
+    -H "Authorization: token ${CODEBERG_TOKEN}" \
+    -H "Content-Type: application/json" \
+    "https://codeberg.org/api/v1/repos/${codeberg_owner}/${project_name}/tags" \
+    -d "{\"tag_name\":\"${RELEASE_VERSION}\",\"target\":\"${head_sha}\",\"message\":\"Release ${RELEASE_VERSION}\"}" \
+    >/dev/null 2>&1; then
+    log "Codeberg: tag pushed"
+  else
+    log "WARNING: Codeberg tag push failed (may already exist)"
+  fi
+else
+  log "WARNING: CODEBERG_TOKEN not set — skipping Codeberg mirror"
+fi
+
+# ── Step 4: Build agents Docker image ────────────────────────────────────
+
+log "Step 4/6: Building agents Docker image"
+
+cd "$FACTORY_ROOT" || exit 1
+docker compose build --no-cache agents 2>&1 | tail -5
+log "Image built"
+
+# ── Step 5: Tag image with version ───────────────────────────────────────
+
+log "Step 5/6: Tagging image"
+
+docker tag disinto/agents:latest "disinto/agents:${RELEASE_VERSION}"
+log "Tagged disinto/agents:${RELEASE_VERSION}"
+
+# ── Step 6: Restart agent containers ─────────────────────────────────────
+
+log "Step 6/6: Restarting agent containers"
+
+docker compose stop agents agents-llama 2>/dev/null || true
+docker compose up -d agents agents-llama
+log "Agent containers restarted"
+
+# ── Done ─────────────────────────────────────────────────────────────────
+
+log "Release ${RELEASE_VERSION} completed successfully"
--- a/formulas/release.toml
+++ b/formulas/release.toml
@ -58,7 +58,7 @@ Validate release prerequisites before proceeding.

 7. Check if tag already exists on Forgejo:
   - curl -sf -H "Authorization: token $FORGE_TOKEN" \
-   -   "$FORGE_URL/api/v1/repos/johba/disinto/git/tags/$RELEASE_VERSION"
+   -   "$FORGE_URL/api/v1/repos/$FORGE_REPO/git/tags/$RELEASE_VERSION"
   - If exists, exit with error

 8. Export RELEASE_VERSION for subsequent steps:
@ -77,14 +77,14 @@ Create the release tag on Forgejo main via the Forgejo API.

 1. Get current HEAD SHA of main:
   - curl -sf -H "Authorization: token $FORGE_TOKEN" \
-   -   "$FORGE_URL/api/v1/repos/johba/disinto/branches/$PRIMARY_BRANCH"
+   -   "$FORGE_URL/api/v1/repos/$FORGE_REPO/branches/$PRIMARY_BRANCH"
   - Parse sha field from response

 2. Create tag via Forgejo API:
   - curl -sf -X POST \
   -   -H "Authorization: token $FORGE_TOKEN" \
   -   -H "Content-Type: application/json" \
-   -   "$FORGE_URL/api/v1/repos/johba/disinto/tags" \
+   -   "$FORGE_URL/api/v1/repos/$FORGE_REPO/tags" \
   -   -d "{\"tag\":\"$RELEASE_VERSION\",\"target\":\"$HEAD_SHA\",\"message\":\"Release $RELEASE_VERSION\"}"
   - Parse response for success

@ -106,8 +106,8 @@ description = """
 Push the newly created tag to all configured mirrors.

 1. Add mirror remotes if not already present:
-   - Codeberg: git remote add codeberg git@codeberg.org:johba/disinto.git
-   - GitHub: git remote add github git@github.com:disinto/disinto.git
+   - Codeberg: git remote add codeberg git@codeberg.org:${FORGE_REPO_OWNER}/${PROJECT_NAME}.git
+   - GitHub: git remote add github git@github.com:disinto/${PROJECT_NAME}.git
   - Check with: git remote -v

 2. Push tag to Codeberg:
@ -120,9 +120,9 @@ Push the newly created tag to all configured mirrors.

 4. Verify tags exist on mirrors:
   - curl -sf -H "Authorization: token $GITHUB_TOKEN" \
-   -   "https://api.github.com/repos/disinto/disinto/tags/$RELEASE_VERSION"
+   -   "https://api.github.com/repos/disinto/${PROJECT_NAME}/tags/$RELEASE_VERSION"
   - curl -sf -H "Authorization: token $FORGE_TOKEN" \
-   -   "$FORGE_URL/api/v1/repos/johba/disinto/git/tags/$RELEASE_VERSION"
+   -   "$FORGE_URL/api/v1/repos/$FORGE_REPO/git/tags/$RELEASE_VERSION"

 5. Log success:
   - echo "Tag $RELEASE_VERSION pushed to mirrors"
@ -227,7 +227,7 @@ Write the release result to a file for tracking.
   - {
   -   "version": "$RELEASE_VERSION",
   -   "image_id": "$IMAGE_ID",
-   -   "forgejo_tag_url": "$FORGE_URL/johba/disinto/src/$RELEASE_VERSION",
+   -   "forgejo_tag_url": "$FORGE_URL/$FORGE_REPO/src/$RELEASE_VERSION",
   -   "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
   -   "status": "success"
   - }
--- a/formulas/reproduce.toml
+++ b/formulas/reproduce.toml
@ -0,0 +1,37 @@
+# formulas/reproduce.toml — Reproduce-agent formula
+#
+# Declares the reproduce-agent's runtime parameters.
+# The dispatcher reads this to configure the sidecar container.
+#
+# stack_script: path (relative to PROJECT_REPO_ROOT) of the script used to
+# restart/rebuild the project stack before reproduction.  Omit (or leave
+# blank) to connect to an existing staging environment instead.
+#
+# tools: MCP servers to pass to claude via --mcp-server flags.
+#
+# timeout_minutes: hard upper bound on the Claude session.
+#
+# Exit gate logic (standard mode):
+#   1. Can I reproduce it? → NO → rejected/blocked → EXIT
+#                          → YES → continue
+#   2. Is the cause obvious? → YES → in-progress + backlog issue → EXIT
+#                            → NO → in-triage → EXIT
+#
+# Exit gate logic (verification mode):
+#   Triggered when all sub-issues of a parent bug-report are closed.
+#   1. Bug fixed → comment "verified fixed", remove in-progress, close issue
+#   2. Bug persists → comment "still reproduces", add in-triage, re-enter triage
+#
+# Turn budget (standard mode): 60% on step 1 (reproduction), 40% on step 2 (cause check).
+# Turn budget (verification mode): 100% on re-running reproduction steps.
+
+name            = "reproduce"
+description     = "Primary: reproduce the bug. Secondary: check if cause is obvious. Exit gates enforced."
+version         = 1
+
+# Set stack_script to the restart command for local stacks.
+# Leave empty ("") to target an existing staging environment.
+stack_script    = ""
+
+tools           = ["playwright"]
+timeout_minutes = 15
--- a/formulas/review-pr.toml
+++ b/formulas/review-pr.toml
@ -61,6 +61,83 @@ Do NOT flag:
 - Things that look wrong but actually work — verify by reading the code first
 - Files that were truncated from the diff (the orchestrator notes truncation)

+## 3b. Architecture and documentation consistency
+
+For each BEHAVIORAL change in the diff (not pure bug fixes or formatting):
+
+1. Identify what behavior changed (e.g., scheduling mechanism, auth flow,
+   container lifecycle, secret handling)
+2. Search AGENTS.md for claims about that behavior:
+     grep -n '<keyword>' AGENTS.md
+   Also check docs/ and any per-directory AGENTS.md files.
+3. Search for Architecture Decision references (AD-001 through AD-006):
+     grep -n 'AD-0' AGENTS.md
+   Read each AD and check if the PR's changes contradict it.
+4. If the PR changes behavior described in AGENTS.md or contradicts an AD
+   but does NOT update the documentation in the same PR:
+   REQUEST_CHANGES — require the documentation update in the same PR.
+
+This check is SKIPPED for pure bug fixes where the intended behavior is
+unchanged (the code was wrong, not the documentation).
+
+## 3c. Infrastructure file review (conditional)
+
+If the diff touches ANY of these files, apply this additional checklist:
+- `docker-compose.yml` or `docker-compose.*.yml`
+- `Dockerfile` or `docker/*`
+- `.woodpecker/` CI configs
+- `docker/agents/entrypoint.sh`
+
+Infrastructure files have a different failure mode from application code:
+a single dropped line (a volume mount, an env var, a restart policy) can
+break a running deployment with no syntax error. Treat dropped
+infrastructure configuration as a **blocking defect**, not a style choice.
+
+### For docker-compose.yml changes:
+
+1. **Read the full file** in the PR branch — do not rely only on the diff.
+2. Run `git diff <base>..HEAD -- docker-compose.yml` to see the complete
+   change, not just the truncated diff.
+3. Check that NONE of the following were dropped without explicit
+   justification in the PR description:
+   - Named volumes (e.g. `agent-data`, `project-repos`)
+   - Bind mounts (especially for config, secrets, SSH keys, shared dirs)
+   - Environment variables (compare the full `environment:` block against
+     the base branch)
+   - `restart:` policy (should be `unless-stopped` for production services)
+   - `security_opt:` settings
+   - Network configuration
+   - Resource limits / deploy constraints
+4. If ANY production configuration was dropped and the PR description does
+   not explain why, **REQUEST_CHANGES**. List each dropped item explicitly.
+
+### For Dockerfile / entrypoint changes:
+
+1. Check that base image, installed packages, and runtime deps are preserved.
+2. Verify that entrypoint/CMD changes don't break the container startup.
+
+### For CI config changes:
+
+1. Check that pipeline steps aren't silently removed.
+2. Verify that secret references still match available secrets.
+
+## 3d. Scope discipline
+
+Compare the actual diff footprint against the stated issue scope:
+
+1. Read the PR title and description to identify what the issue asked for.
+2. Estimate the expected diff size (e.g., "add 3 env vars" = ~5-10 lines
+   in compose + ~5 lines in scripts).
+3. If the actual diff in ANY single file exceeds 3x the expected scope,
+   flag it: "this file changed N lines but the issue scope suggests ~M."
+
+For infrastructure files (compose, Dockerfiles, CI), scope violations are
+**blocking**: REQUEST_CHANGES and ask the author to split out-of-scope
+changes into a separate PR or justify them in the description.
+
+For non-infrastructure files, scope violations are advisory: leave a
+non-blocking COMMENT noting the scope creep.
+
 ## 4. Vault item quality (conditional)

 If the PR adds or modifies vault item files (`vault/pending/*.md` in the ops repo), apply these
@ -177,8 +254,16 @@ tech-debt issues via API so they are tracked separately:
    -H "Content-Type: application/json" "$FORGE_API/issues" \
    -d '{"title":"...","body":"Flagged by AI reviewer in PR #NNN.\n\n## Problem\n...\n\n---\n*Auto-created from AI review*","labels":[TECH_DEBT_ID]}'

-Only create follow-ups for clear, actionable tech debt. Do not create
-issues for minor style nits or speculative improvements.
+File a tech-debt issue for every finding rated **medium** or higher that
+is pre-existing (not introduced by this PR). Also file for **low** findings
+that represent correctness risks (dead code that masks bugs, misleading
+documentation, unguarded variables under set -u).
+
+Do NOT file for: style preferences, naming opinions, missing comments,
+or speculative improvements with no concrete failure mode.
+
+When in doubt, file. A closed-as-wontfix tech-debt issue costs nothing;
+an unfiled bug costs a future debugging session.

 ## 8. Verdict

@ -191,6 +276,13 @@ Bias toward APPROVE for small, correct changes. Use REQUEST_CHANGES only
 for actual problems (bugs, security issues, broken functionality, missing
 required behavior). Use DISCUSS sparingly.

+Note: The bias toward APPROVE applies to code correctness and style decisions.
+It does NOT apply to documentation consistency (step 3b), infrastructure file
+findings (step 3c), or tech-debt filing (step 7) — those are separate concerns
+that should be handled regardless of the change's correctness. In particular,
+dropped production configuration (volumes, bind mounts, env vars, restart
+policy) is a blocking defect, not a style preference.
+
 ## 9. Output

 Write a single JSON object to the file path from REVIEW_OUTPUT_FILE.
--- a/formulas/run-architect.toml
+++ b/formulas/run-architect.toml
@ -1,19 +1,34 @@
 # formulas/run-architect.toml — Architect formula
 #
-# Executed by architect-run.sh via cron — strategic decomposition of vision
+# Executed by architect-run.sh via polling loop — strategic decomposition of vision
 # issues into development sprints.
 #
 # This formula orchestrates the architect agent's workflow:
-#   Step 1: Preflight — validate prerequisites and identify target issue
-#   Step 2: Research + pitch — analyze codebase and write sprint pitch
-#   Step 3: Sprint PR creation with questions (issue #101)
+#   Step 1: Preflight — bash handles state management:
+#            - Fetch open vision issues from Forgejo API
+#            - Fetch open architect PRs on ops repo
+#            - Fetch merged architect PRs (already pitched visions)
+#            - Filter: remove visions with open PRs, merged sprints, or sub-issues
+#            - Select up to 3 remaining vision issues for pitching
+#   Step 2: Stateless pitch generation — for each selected issue:
+#            - Invoke claude -p with: vision issue body + codebase context
+#            - Model NEVER calls Forgejo API — only generates pitch markdown
+#            - Bash creates the ops PR with pitch content
+#            - Bash posts the ACCEPT/REJECT footer comment
+#   Step 3: Sprint PR creation with questions (issue #101) (one PR per pitch)
 #   Step 4: Answer parsing + sub-issue filing (issue #102)
 #
+# Architecture:
+# - Bash script (architect-run.sh) handles ALL state management
+# - Model calls are stateless — no Forgejo API access, no memory between calls
+# - Dedup is automatic via bash filters (no journal-based memory needed)
+# - Max 3 open architect PRs at any time
+#
 # AGENTS.md maintenance is handled by the gardener (#246).

 name        = "run-architect"
 description = "Architect: strategic decomposition of vision into sprints"
-version     = 1
+version     = 2
 model       = "opus"

 [context]
@ -23,126 +38,90 @@ files = ["VISION.md", "AGENTS.md"]

 [[steps]]
 id    = "preflight"
-title = "Preflight: validate prerequisites and identify target vision issue"
+title = "Preflight: bash-driven state management and issue selection"
 description = """
-This step performs preflight checks and identifies the most unblocking vision issue.
+This step performs preflight checks and selects up to 3 vision issues for pitching.
+IMPORTANT: All state management is handled by bash (architect-run.sh), NOT the model.

-Actions:
-1. Pull latest code from both disinto repo and ops repo
-2. Read prerequisite tree from $OPS_REPO_ROOT/prerequisites.md
-3. Fetch open issues labeled 'vision' from Forgejo API
-4. Check for open architect PRs on ops repo (handled by #101/#102)
-5. If open architect PRs exist, handle accept/reject responses (see Capability B below)
-6. If no vision issues, signal PHASE:done
+Architecture Decision: Bash-driven orchestration with stateless model calls
+- The model NEVER calls Forgejo API during pitching
+- Bash fetches all data from Forgejo API (vision issues, open PRs, merged PRs)
+- Bash filters and deduplicates (no model-level dedup or journal-based memory)
+- For each selected issue, bash invokes stateless claude -p (model only generates pitch)
+- Bash creates PRs and posts footer comments (no model API access)
+
+Bash Actions (in architect-run.sh):
+1. Fetch open vision issues from Forgejo API: GET /repos/{owner}/{repo}/issues?labels=vision&state=open
+2. Fetch open architect PRs from ops repo: GET /repos/{owner}/{repo}/pulls?state=open
+3. Fetch merged sprint PRs: GET /repos/{owner}/{repo}/pulls?state=closed (filter merged=true)
+4. Filter out visions that:
+   - Already have open architect PRs (check PR body for issue number reference)
+   - Have in-progress label
+   - Have open sub-issues (check for 'Decomposed from #N' pattern)
+   - Have merged sprint PRs (decomposition already done)
+5. Select up to (3 - open_architect_pr_count) remaining vision issues
+6. If no issues remain AND no responses to process, signal PHASE:done
+
+If open architect PRs exist, handle accept/reject responses FIRST (see Capability B below).
+After handling existing PRs, count remaining open architect PRs and calculate pitch_budget.
+
+## Multi-pitch selection (up to 3 per run)
+
+After handling existing PRs, determine how many new pitches can be created:
+
+  pitch_budget = 3 - <number of open architect PRs remaining after handling>
+
+For each available pitch slot:
+1. From the vision issues list, skip any issue that already has an open architect PR
+2. Skip any issue that already has the `in-progress` label
+3. Check for existing sub-issues filed from this vision issue
+4. Check for merged sprint PRs referencing this vision issue
+5. From remaining candidates, pick the most unblocking issue first
+6. Add to ARCHITECT_TARGET_ISSUES array

 Skip conditions:
 - If no vision issues are found, signal PHASE:done
+- If pitch_budget <= 0 (already 3 open architect PRs), skip pitching
+- If all vision issues already have open architect PRs, signal PHASE:done
+- If all vision issues have open sub-issues, skip pitching
+- If all vision issues have merged sprint PRs, skip pitching

 Output:
- Sets ARCHITECT_TARGET_ISSUE to the issue number of the selected vision issue
- Exports VISION_ISSUES as a JSON array of issue objects
-
-## Capability B: Handle accept/reject on existing pitch PRs
-
-When open architect PRs exist on the ops repo:
-
-1. Fetch comments on each open architect PR via Forgejo API
-2. Look for human response:
-
-   **ACCEPT** (case insensitive): Human wants to proceed
-   - Architect does deep research for design forks (same as #100 research but now identifying decision points)
-   - Formulates multiple-choice questions (Q1, Q2, Q3...)
-   - Updates the sprint spec file on the PR branch:
-     - Adds `## Design forks` section with fork options
-     - Adds `## Proposed sub-issues` section with concrete issues per fork path
-   - Comments on the PR with the questions formatted as multiple choice
-   - Signal PHASE:done (answer processing is #102)
-
-   **REJECT: <reason>** (case insensitive, reason after colon):
-   - Journal the rejection reason via profile_write_journal (if .profile exists)
-     — the architect learns what pitches fail
-   - Close the PR via Forgejo API (do not merge — rejected pitches do not persist in sprints/)
-   - Remove the branch via Forgejo API
-   - Signal PHASE:done
-
-   **No response yet**: skip silently, signal PHASE:done
-
-All git operations use the Forgejo API (create branch, write/update file, create PR,
-close PR, delete branch). No SSH.
+- Sets ARCHITECT_TARGET_ISSUES as a JSON array of issue numbers to pitch (up to 3)
 """

 [[steps]]
 id    = "research_pitch"
-title = "Research + pitch: analyze codebase and write sprint pitch"
+title = "Stateless pitch generation: model generates content, bash creates PRs"
 description = """
-This step performs deep codebase research and writes a sprint pitch for the
-selected vision issue.
+IMPORTANT: This step is executed by bash (architect-run.sh) via stateless claude -p calls.
+The model NEVER calls Forgejo API — it only reads context and generates pitch markdown.

-Actions:
+Architecture:
+- Bash orchestrates the loop over ARCHITECT_TARGET_ISSUES
+- For each issue: bash fetches issue body from Forgejo API, then invokes stateless claude -p
+- Model receives: vision issue body + codebase context (VISION.md, AGENTS.md, prerequisites.md)
+- Model outputs: sprint pitch markdown ONLY (no API calls, no side effects)
+- Bash creates the PR and posts the ACCEPT/REJECT footer comment

-1. Read the codebase deeply:
-   - Read all files mentioned in the issue body
-   - Search for existing interfaces that could be reused
-   - Check what infrastructure already exists
+For each issue in ARCHITECT_TARGET_ISSUES, bash performs:

-2. Assess complexity and cost:
-   - How many files/subsystems are touched?
-   - What new infrastructure would need to be maintained after this sprint?
-   - What are the risks (breaking changes, security implications, integration complexity)?
-   - Is this mostly gluecode or greenfield?
+1. Fetch vision issue details from Forgejo API:
+   - GET /repos/{owner}/{repo}/issues/{issue_number}
+   - Extract: title, body

-3. Write sprint pitch to scratch file for PR creation step (#101):
+2. Invoke stateless claude -p with prompt:
+   "Write a sprint pitch for this vision issue. Output only the pitch markdown."
+   Context provided:
+   - Vision issue #N: <title>
+   - Vision issue body
+   - Project context (VISION.md, AGENTS.md)
+   - Codebase context (prerequisites.md, graph section)
+   - Formula content

-# Sprint pitch: <name>
+3. Model generates pitch markdown (NO API CALLS):

-## Vision issues
- #N — <title>
-
-## What this enables
-<what the project can do after this sprint that it can't do now>
-
-## What exists today
-<current state — infrastructure, interfaces, code that can be reused>
-
-## Complexity
-<number of files, subsystems, estimated sub-issues>
-<gluecode vs greenfield ratio>
-
-## Risks
-<what could go wrong, what breaks if this is done badly>
-
-## Cost — new infra to maintain
-<what ongoing maintenance burden does this sprint add>
-<new services, cron jobs, formulas, agent roles>
-
-## Recommendation
-<architect's assessment: worth it / defer / alternative approach>
-
-IMPORTANT: Do NOT include design forks or questions yet. The pitch is a go/no-go
-decision for the human. Questions come only after acceptance.
-
-Output:
- Writes sprint pitch to $SCRATCH_FILE (/tmp/architect-{project}-scratch.md)
- The pitch serves as input for sprint PR creation step (#101)
-"""
-
-[[steps]]
-id    = "sprint_pr_creation"
-title = "Sprint PR creation with questions (issue #101)"
-description = """
-This step creates a PR on the ops repo with the sprint proposal when no PR exists yet.
-
-## Capability A: Create pitch PR (from research output)
-
-If step 2 (research/pitch) produced a pitch and no PR exists yet:
-
-1. Create branch `architect/<sprint-slug>` on ops repo via Forgejo API
-   - Sprint slug: lowercase, hyphenated version of sprint name
-   - Use Forgejo API: POST /repos/{owner}/{repo}/git/branches
-
-2. Write sprint spec file to sprints/<sprint-slug>.md on the new branch:
-
-# Sprint: <name>
+# Sprint: <sprint-name>

 ## Vision issues
 - #N — <title>
@ -162,19 +141,104 @@ If step 2 (research/pitch) produced a pitch and no PR exists yet:

 ## Cost — new infra to maintain
 <what ongoing maintenance burden does this sprint add>
-<new services, cron jobs, formulas, agent roles>
+<new services, scheduled tasks, formulas, agent roles>

 ## Recommendation
 <architect's assessment: worth it / defer / alternative approach>

-3. Create PR on ops repo via Forgejo API:
-   - Title: `architect: <sprint summary>`
-   - Body: pitch content (what it enables, complexity, risks, cost)
-   - Base branch: primary branch (main/master)
-   - Head branch: architect/<sprint-slug>
-   - Footer: "Reply `ACCEPT` to proceed with design questions, or `REJECT: <reason>` to decline."
+IMPORTANT: Do NOT include design forks or questions yet. The pitch is a go/no-go
+decision for the human. Questions come only after acceptance.

-4. Signal PHASE:done
+4. Bash creates PR:
+   - Create branch: architect/sprint-{pitch-number}
+   - Write sprint spec to sprints/{sprint-slug}.md
+   - Create PR with pitch content as body
+   - Post footer comment: "Reply ACCEPT to proceed with design questions, or REJECT: <reason> to decline."
+   - Add in-progress label to vision issue
+
+Output:
+- One PR per vision issue (up to 3 per run)
+- Each PR contains the pitch markdown
+- If ARCHITECT_TARGET_ISSUES is empty, skip this step
+"""
+
+[[steps]]
+id    = "sprint_pr_creation"
+title = "Sprint PR creation with questions (issue #101) — handled by bash"
+description = """
+IMPORTANT: PR creation is handled by bash (architect-run.sh) during the pitch step.
+This step is for documentation only — the actual PR creation happens in research_pitch.
+
+## Approved PR → Initial design questions (issue #570)
+
+When a sprint pitch PR receives an APPROVED review but has no `## Design forks`
+section and no Q1:, Q2: comments yet, the architect enters a new state:
+
+1. detect_approved_pending_questions() identifies this state
+2. A fresh agent session starts with a special prompt
+3. The agent reads the approved pitch, posts initial design questions (Q1:, Q2:, etc.)
+4. The agent adds a `## Design forks` section to the PR body
+5. The PR transitions into the questions phase, where the existing Q&A loop takes over
+
+This ensures approved PRs don't sit indefinitely without design conversation.
+
+Architecture:
+- Bash creates PRs during stateless pitch generation (step 2)
+- Model has no role in PR creation — no Forgejo API access
+- This step describes the PR format for reference
+
+PR Format (created by bash):
+
+1. Branch: architect/sprint-{pitch-number}
+
+2. Sprint spec file: sprints/{sprint-slug}.md
+   Contains the pitch markdown from the model.
+
+3. PR via Forgejo API:
+   - Title: architect: <sprint summary>
+   - Body: plain markdown text from model output
+   - Base: main (or PRIMARY_BRANCH)
+   - Head: architect/sprint-{pitch-number}
+   - Footer comment: "Reply ACCEPT to proceed with design questions, or REJECT: <reason> to decline."
+
+4. Add in-progress label to vision issue:
+   - Look up label ID: GET /repos/{owner}/{repo}/labels
+   - Add label: POST /repos/{owner}/{repo}/issues/{issue_number}/labels
+
+After creating all PRs, signal PHASE:done.
+
+## Forgejo API Reference
+
+All operations use the Forgejo API with Authorization: token ${FORGE_TOKEN} header.
+
+### Create branch
+```
+POST /repos/{owner}/{repo}/branches
+Body: {"new_branch_name": "architect/<sprint-slug>", "old_branch_name": "main"}
+```
+
+### Create/update file
+```
+PUT /repos/{owner}/{repo}/contents/<path>
+Body: {"message": "sprint: add <sprint-slug>.md", "content": "<base64-encoded-content>", "branch": "architect/<sprint-slug>"}
+```
+
+### Create PR
+```
+POST /repos/{owner}/{repo}/pulls
+Body: {"title": "architect: <sprint summary>", "body": "<markdown-text>", "head": "architect/<sprint-slug>", "base": "main"}
+```
+
+**Important: PR body format**
+- The body field must contain plain markdown text (the raw content from the model)
+- Do NOT JSON-encode or escape the body — pass it as a JSON string value
+- Newlines and markdown formatting (headings, lists, etc.) must be preserved as-is
+
+### Add label to issue
+```
+POST /repos/{owner}/{repo}/issues/{index}/labels
+Body: {"labels": [<label-id>]}
+```

 ## Forgejo API Reference

@ -195,9 +259,14 @@ Body: {"message": "sprint: add <sprint-slug>.md", "content": "<base64-encoded-co
 ### Create PR
 ```
 POST /repos/{owner}/{repo}/pulls
-Body: {"title": "architect: <sprint summary>", "body": "<pitch-content>", "head": "architect/<sprint-slug>", "base": "main"}
+Body: {"title": "architect: <sprint summary>", "body": "<markdown-text>", "head": "architect/<sprint-slug>", "base": "main"}
 ```

+**Important: PR body format**
+- The `body` field must contain **plain markdown text** (the raw content from the scratch file)
+- Do NOT JSON-encode or escape the body — pass it as a JSON string value
+- Newlines and markdown formatting (headings, lists, etc.) must be preserved as-is
+
 ### Close PR
 ```
 PATCH /repos/{owner}/{repo}/pulls/{index}
@ -208,97 +277,20 @@ Body: {"state": "closed"}
 ```
 DELETE /repos/{owner}/{repo}/git/branches/<branch-name>
 ```
-"""

-[[steps]]
-id    = "answer_parsing"
-title = "Answer parsing + sub-issue filing (issue #102)"
-description = """
-This step processes human answers to design questions and files sub-issues.
-
-## Preflight: Detect PRs in question phase
-
-An architect PR is in the question phase if ALL of the following are true:
- PR is open
- PR body or sprint spec file contains a `## Design forks` section (added by #101 after ACCEPT)
- PR has question comments (Q1, Q2, Q3... format)
-
-## Answer parsing
-
-Human comments on the PR use this format:
-```
-Q1: A
-Q2: B
-Q3: A
-```
-
-Parser matches lines starting with `Q` + digit(s) + `:` + space + letter A-D (case insensitive).
-Ignore other content in the comment.
-
-## Processing paths
-
-### All questions answered (every `### Q` heading has a matching `Q<N>: <letter>` comment)
-
-1. Parse each answer (e.g. `Q1: A`, `Q2: C`)
-2. Read the sprint spec from the PR branch
-3. Generate final sub-issues based on answers:
-   - Each sub-issue uses the appropriate issue template (bug/feature/refactor from `.codeberg/ISSUE_TEMPLATE/`)
-   - Fill all template fields:
-     - Problem/motivation (feature) or What's broken (bug/refactor)
-     - Proposed solution (feature) or Approach (refactor) or Steps to reproduce (bug)
-     - Affected files (max 3)
-     - Acceptance criteria (max 5)
-     - Dependencies
-   - File via Forgejo API on the **disinto repo** (not ops repo)
-   - Label as `backlog`
-4. Comment on PR: "Sprint filed: #N, #N, #N"
-5. Merge the PR (sprint spec with answers persists in `ops/sprints/`)
-
-### Some questions answered, not all
-
-1. Acknowledge answers received
-2. Comment listing remaining unanswered questions
-3. Signal PHASE:done (check again next poll)
-
-### No answers yet (questions posted but human hasn't responded)
-
-1. Skip — signal PHASE:done
-
-## Forgejo API for filing issues on disinto repo
-
-All operations use the Forgejo API with `Authorization: token ${FORGE_TOKEN}` header.
-
-### Create issue
-```
-POST /repos/{owner}/{repo}/issues
-Body: {
-  "title": "<issue title>",
-  "body": "<issue body with template fields>",
-  "labels": [123],  // backlog label ID
-  "assignees": ["architect-bot"]
-}
-```
-
-### Close PR
-```
-PATCH /repos/{owner}/{repo}/pulls/{index}
-Body: {"state": "closed"}
-```
-
-### Merge PR
-```
-POST /repos/{owner}/{repo}/pulls/{index}/merge
-Body: {"Do": "merge"}
-```
-
-### Post comment on PR (via issues endpoint)
-```
-POST /repos/{owner}/{repo}/issues/{index}/comments
-Body: {"body": "<comment text>"}
-```
-
-### Get label ID
+### Get labels (look up label IDs by name)
 ```
 GET /repos/{owner}/{repo}/labels
 ```
+
+### Add label to issue (for in-progress on vision issue)
+```
+POST /repos/{owner}/{repo}/issues/{index}/labels
+Body: {"labels": [<label-id>]}
+```
+
+### Remove label from issue (for in-progress removal on REJECT)
+```
+DELETE /repos/{owner}/{repo}/issues/{index}/labels/{label-id}
+```
 """
--- a/formulas/run-gardener.toml
+++ b/formulas/run-gardener.toml
@ -1,16 +1,15 @@
 # formulas/run-gardener.toml — Gardener housekeeping formula
 #
 # Defines the gardener's complete run: grooming (Claude session via
-# gardener-run.sh) + blocked-review + AGENTS.md maintenance + final
-# commit-and-pr.
+# gardener-run.sh) + AGENTS.md maintenance + final commit-and-pr.
 #
-# No memory, no journal. The gardener does mechanical housekeeping
-# based on current state — it doesn't need to remember past runs.
+# Gardener has journaling via .profile (issue #97), so it learns from
+# past runs and improves over time.
 #
-# Steps: preflight → grooming → dust-bundling → blocked-review → stale-pr-recycle → agents-update → commit-and-pr
+# Steps: preflight -> grooming -> dust-bundling -> agents-update -> commit-and-pr

 name        = "run-gardener"
-description = "Mechanical housekeeping: grooming, blocked review, docs update"
+description = "Mechanical housekeeping: grooming, dust bundling, docs update"
 version     = 1

 [context]
@ -77,6 +76,63 @@ Pre-checks (bash, zero tokens — detect problems before invoking Claude):
 6. Tech-debt promotion: list all tech-debt labeled issues — goal is to
   process them all (promote to backlog or classify as dust).

+7. Bug-report detection: for each open unlabeled issue (no backlog, no
+   bug-report, no in-progress, no blocked, no underspecified, no vision,
+   no tech-debt), check whether it describes a user-facing bug with
+   reproduction steps. Criteria — ALL must be true:
+   a. Body describes broken behavior (something that should work but
+      doesn't), NOT a feature request or enhancement
+   b. Body contains steps to reproduce (numbered list, "steps to
+      reproduce" heading, or clear sequence of actions that trigger the bug)
+   c. Issue is not already labeled
+
+   If all criteria match, enrich the issue body and write the manifest actions:
+
+   Body enrichment (CRITICAL — turns raw reports into actionable investigation briefs):
+   Before writing the add_label action, construct an enriched body by appending
+   these sections to the original issue body:
+
+   a. ``## What was reported``
+      One or two sentence summary of the user's claim. Distill the broken
+      behavior concisely — what the user expected vs. what actually happened.
+
+   b. ``## Known context``
+      What can be inferred from the codebase without running anything:
+      - Which contracts/components/files are involved (use AGENTS.md layout
+        and file paths mentioned in the issue or body)
+      - What the expected behavior should be (from VISION.md, docs, code)
+      - Any recent changes to involved components:
+          git log --oneline -5 -- <paths>
+      - Related issues or prior fixes (cross-reference by number if known)
+
+   c. ``## Reproduction plan``
+      Concrete steps for a reproduce-agent or human. Be specific:
+      - Which environment to use (e.g. "start fresh stack with
+        \`./scripts/dev.sh restart --full\`")
+      - Which transactions or actions to execute (with \`cast\` commands,
+        API calls, or UI navigation steps where applicable)
+      - What state to check after each step (contract reads, API queries,
+        UI observations, log output)
+
+   d. ``## What needs verification``
+      Checkboxes distinguishing known facts from unknowns:
+      - ``- [ ]`` Does the reported behavior actually occur? (reproduce)
+      - ``- [ ]`` Is <component X> behaving as expected? (check state)
+      - ``- [ ]`` Is the data flow correct from <A> to <B>? (trace)
+      Tailor these to the specific bug — three to five items covering the
+      key unknowns a reproduce-agent must resolve.
+
+   e. Construct full new body = original body text + appended sections.
+      Write an edit_body action BEFORE the add_label action:
+        echo '{"action":"edit_body","issue":NNN,"body":"<full new body>"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
+
+   f. Write the add_label action:
+        echo '{"action":"add_label","issue":NNN,"label":"bug-report"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
+        echo "ACTION: labeled #NNN as bug-report — <reason>" >> "$RESULT_FILE"
+
+   Do NOT also add the backlog label — bug-report is a separate triage
+   track that feeds into reproduction automation.
+
 For each issue, choose ONE action and write to result file:

 ACTION (substantial — promote, close duplicate, add acceptance criteria):
@ -120,15 +176,17 @@ DUST (trivial — single-line edit, rename, comment, style, whitespace):
  of 3+ into one backlog issue.

 VAULT (needs human decision or external resource):
-  File a vault procurement item at $OPS_REPO_ROOT/vault/pending/<id>.md:
-    # <What decision or resource is needed>
-    ## What
-    <description>
-    ## Why
-    <which issue this unblocks>
-    ## Unblocks
-    - #NNN — <title>
-  Log: echo "VAULT: filed $OPS_REPO_ROOT/vault/pending/<id>.md for #NNN — <reason>" >> "$RESULT_FILE"
+  File a vault procurement item using vault_request():
+    source "$(dirname "$0")/../lib/vault.sh"
+    TOML_CONTENT="# Vault action: <action_id>
+context = \"<description of what decision/resource is needed>\"
+unblocks = [\"#NNN\"]
+
+[execution]
+# Commands to run after approval
+"
+    PR_NUM=$(vault_request "<action_id>" "$TOML_CONTENT")
+    echo "VAULT: filed PR #${PR_NUM} for #NNN — <reason>" >> "$RESULT_FILE"

 CLEAN (only if truly nothing to do):
  echo 'CLEAN' >> "$RESULT_FILE"
@ -142,25 +200,7 @@ Sibling dependency rule (CRITICAL):
  NEVER add bidirectional ## Dependencies between siblings (creates deadlocks).
  Use ## Related for cross-references: "## Related\n- #NNN (sibling)"

-7. Architecture decision alignment check (AD check):
-   For each open issue labeled 'backlog', check whether the issue
-   contradicts any architecture decision listed in the
-   ## Architecture Decisions section of AGENTS.md.
-   Read AGENTS.md and extract the AD table. For each backlog issue,
-   compare the issue title and body against each AD. If an issue
-   clearly violates an AD:
-   a. Write a comment action to the manifest:
-        echo '{"action":"comment","issue":NNN,"body":"Closing: violates AD-NNN (<decision summary>). See AGENTS.md § Architecture Decisions."}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-   b. Write a close action to the manifest:
-        echo '{"action":"close","issue":NNN,"reason":"violates AD-NNN"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-   c. Log to the result file:
-        echo "ACTION: closed #NNN — violates AD-NNN" >> "$RESULT_FILE"
-
-   Only close for clear, unambiguous violations. If the issue is
-   borderline or could be interpreted as compatible, leave it open
-   and file a VAULT item for human decision instead.
-
-8. Quality gate — backlog label enforcement:
+6. Quality gate — backlog label enforcement:
   For each open issue labeled 'backlog', verify it has the required
   sections for dev-agent pickup:
   a. Acceptance criteria — body must contain at least one checkbox
@ -181,28 +221,65 @@ Sibling dependency rule (CRITICAL):
   Well-structured issues (both sections present) are left untouched —
   they are ready for dev-agent pickup.

-9. Portfolio lifecycle — maintain ## Addressables and ## Observables in AGENTS.md:
-   Read the current Addressables and Observables tables from AGENTS.md.
+8. Bug-report lifecycle — auto-close resolved parent issues:
+   For each open issue, check whether it is a parent that was decomposed
+   into sub-issues. A parent is identified by having OTHER issues whose
+   body contains "Decomposed from #N" where N is the parent's number.

-   a. ADD: if a recently closed issue shipped a new deployment, listing,
-      package, or external presence not yet in the table, add a row.
-   b. PROMOTE: if an addressable now has measurement wired (an evidence
-      process reads from it), move it to the Observables section.
-   c. REMOVE: if an addressable was decommissioned (vision change
-      invalidated it, service shut down), remove the row and log why.
-   d. FLAG: if an addressable has been live > 2 weeks with Observable? = No
-      and no evidence process is planned, add a comment to the result file:
-        echo "ACTION: flagged addressable '<name>' — live >2 weeks, no observation path" >> "$RESULT_FILE"
+   Algorithm:
+   a. From the open issues fetched in step 1, collect all issue numbers.
+   b. For each open issue number N, search ALL issues (open AND closed)
+      for bodies containing "Decomposed from #N":
+        curl -sf -H "Authorization: token $FORGE_TOKEN" \
+          "$FORGE_API/issues?state=all&type=issues&limit=50" \
+        | jq -r --argjson n N \
+          '[.[] | select(.body != null) | select(.body | test("Decomposed from #" + ($n | tostring) + "\\b"))] | length'
+      If zero sub-issues found, skip — this is not a decomposed parent.

-   Stage AGENTS.md if changed — the commit-and-pr step handles the actual commit.
+   c. If sub-issues exist, check whether ALL of them are closed:
+        curl -sf -H "Authorization: token $FORGE_TOKEN" \
+          "$FORGE_API/issues?state=all&type=issues&limit=50" \
+        | jq -r --argjson n N \
+          '[.[] | select(.body != null) | select(.body | test("Decomposed from #" + ($n | tostring) + "\\b"))]
+           | {total: length, closed: [.[] | select(.state == "closed")] | length}
+           | .total == .closed'
+      If the result is "false", some sub-issues are still open — skip.
+
+   d. If ALL sub-issues are closed, collect sub-issue numbers and titles:
+        SUB_ISSUES=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
+          "$FORGE_API/issues?state=all&type=issues&limit=50" \
+        | jq -r --argjson n N \
+          '[.[] | select(.body != null) | select(.body | test("Decomposed from #" + ($n | tostring) + "\\b"))]
+           | .[] | "- #\(.number) \(.title)"')
+
+   e. Write a comment action listing the resolved sub-issues.
+      Use jq to build valid JSON (sub-issue titles may contain quotes/backslashes,
+      and SUB_ISSUES is multiline — raw interpolation would break JSONL):
+        COMMENT_BODY=$(printf 'All sub-issues have been resolved:\n%s\n\nClosing this parent issue as all decomposed work is complete.' "$SUB_ISSUES")
+        jq -n --argjson issue N --arg body "$COMMENT_BODY" \
+          '{action:"comment", issue: $issue, body: $body}' \
+          >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
+
+   f. Write a close action:
+        jq -n --argjson issue N \
+          '{action:"close", issue: $issue, reason: "all sub-issues resolved"}' \
+          >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
+
+   g. Log the action:
+        echo "ACTION: closed #N — all sub-issues resolved" >> "$RESULT_FILE"
+
+   Edge cases:
+   - Already closed parent: skipped (only open issues are processed)
+   - No sub-issues found: skipped (not a decomposed issue)
+   - Multi-cause bugs: stays open until ALL sub-issues are closed

 Processing order:
  1. Handle PRIORITY_blockers_starving_factory first — promote or resolve
-  2. AD alignment check — close backlog issues that violate architecture decisions
-  3. Quality gate — strip backlog from issues missing acceptance criteria or affected files
-  4. Process tech-debt issues by score (impact/effort)
-  5. Classify remaining items as dust or route to vault
-  6. Portfolio lifecycle — update addressables/observables tables
+  2. Quality gate — strip backlog from issues missing acceptance criteria or affected files
+  3. Bug-report detection — label qualifying issues before other classification
+  4. Bug-report lifecycle — close parents whose sub-issues are all resolved
+  5. Process tech-debt issues by score (impact/effort)
+  6. Classify remaining items as dust or route to vault

 Do NOT bundle dust yourself — the dust-bundling step handles accumulation,
 dedup, TTL expiry, and bundling into backlog issues.
@ -257,137 +334,22 @@ session, so changes there would be lost.

 5. If no DUST items were emitted and no groups are ripe, skip this step.

-CRITICAL: If this step fails, log the failure and move on to blocked-review.
+CRITICAL: If this step fails, log the failure and move on.
 """
 needs = ["grooming"]

 # ─────────────────────────────────────────────────────────────────────
-# Step 4: blocked-review — triage blocked issues
-# ─────────────────────────────────────────────────────────────────────
-
-[[steps]]
-id    = "blocked-review"
-title = "Review issues labeled blocked"
-description = """
-Review all issues labeled 'blocked' and decide their fate.
-(See issue #352 for the blocked label convention.)
-
-1. Fetch all blocked issues:
-     curl -sf -H "Authorization: token $FORGE_TOKEN" \
-       "$FORGE_API/issues?state=open&type=issues&labels=blocked&limit=50"
-
-2. For each blocked issue, read the full body and comments:
-     curl -sf -H "Authorization: token $FORGE_TOKEN" \
-       "$FORGE_API/issues/<number>"
-     curl -sf -H "Authorization: token $FORGE_TOKEN" \
-       "$FORGE_API/issues/<number>/comments"
-
-3. Check dependencies — extract issue numbers from ## Dependencies /
-   ## Depends on / ## Blocked by sections. For each dependency:
-     curl -sf -H "Authorization: token $FORGE_TOKEN" \
-       "$FORGE_API/issues/<dep_number>"
-   Check if the dependency is now closed.
-
-4. For each blocked issue, choose ONE action:
-
-   UNBLOCK — all dependencies are now closed or the blocking condition resolved:
-   a. Write a remove_label action to the manifest:
-        echo '{"action":"remove_label","issue":NNN,"label":"blocked"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-   b. Write a comment action to the manifest:
-        echo '{"action":"comment","issue":NNN,"body":"Unblocked: <explanation of what resolved the blocker>"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-
-   NEEDS HUMAN — blocking condition is ambiguous, requires architectural
-   decision, or involves external factors:
-   a. Write a comment action to the manifest:
-        echo '{"action":"comment","issue":NNN,"body":"<diagnostic: what you found and what decision is needed>"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-   b. Leave the 'blocked' label in place
-
-   CLOSE — issue is stale (blocked 30+ days with no progress on blocker),
-   the blocker is wontfix, or the issue is no longer relevant:
-   a. Write a comment action to the manifest:
-        echo '{"action":"comment","issue":NNN,"body":"Closing: <reason — stale blocker, no longer relevant, etc.>"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-   b. Write a close action to the manifest:
-        echo '{"action":"close","issue":NNN,"reason":"<stale blocker / no longer relevant / etc.>"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-
-CRITICAL: If this step fails, log the failure and move on.
-"""
-needs = ["dust-bundling"]
-
-# ─────────────────────────────────────────────────────────────────────
-# Step 5: stale-pr-recycle — recycle stale failed PRs back to backlog
-# ─────────────────────────────────────────────────────────────────────
-
-[[steps]]
-id    = "stale-pr-recycle"
-title = "Recycle stale failed PRs back to backlog"
-description = """
-Detect open PRs where CI has failed and no work has happened in 24+ hours.
-These represent abandoned dev-agent attempts — recycle them so the pipeline
-can retry with a fresh session.
-
-1. Fetch all open PRs:
-     curl -sf -H "Authorization: token $FORGE_TOKEN" \
-       "$FORGE_API/pulls?state=open&limit=50"
-
-2. For each PR, check all four conditions before recycling:
-
-   a. CI failed — get the HEAD SHA from the PR's head.sha field, then:
-        curl -sf -H "Authorization: token $FORGE_TOKEN" \
-          "$FORGE_API/commits/<head_sha>/status"
-      Only proceed if the combined state is "failure" or "error".
-      Skip PRs with "success", "pending", or no CI status.
-
-   b. Last push > 24 hours ago — get the commit details:
-        curl -sf -H "Authorization: token $FORGE_TOKEN" \
-          "$FORGE_API/git/commits/<head_sha>"
-      Parse the committer.date field. Only proceed if it is older than:
-        $(date -u -d '24 hours ago' +%Y-%m-%dT%H:%M:%SZ)
-
-   c. Linked issue exists — extract the issue number from the PR body.
-      Look for "Fixes #NNN" or "ixes #NNN" patterns (case-insensitive).
-      If no linked issue found, skip this PR (cannot reset labels).
-
-   d. No active tmux session — check:
-        tmux has-session -t "dev-${PROJECT_NAME}-<issue_number>" 2>/dev/null
-      If a session exists, someone may still be working — skip this PR.
-
-3. For each PR that passes all checks (failed CI, 24+ hours stale,
-   linked issue found, no active session):
-
-   a. Write a comment on the PR explaining the recycle:
-        echo '{"action":"comment","issue":<pr_number>,"body":"Recycling stale CI failure for fresh attempt. Previous PR: #<pr_number>"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-
-   b. Write a close_pr action:
-        echo '{"action":"close_pr","pr":<pr_number>}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-
-   c. Remove the in-progress label from the linked issue:
-        echo '{"action":"remove_label","issue":<issue_number>,"label":"in-progress"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-
-   d. Add the backlog label to the linked issue:
-        echo '{"action":"add_label","issue":<issue_number>,"label":"backlog"}' >> "$PROJECT_REPO_ROOT/gardener/pending-actions.jsonl"
-
-   e. Log to result file:
-        echo "ACTION: recycled PR #<pr_number> (linked issue #<issue_number>) — stale CI failure" >> "$RESULT_FILE"
-
-4. If no stale failed PRs found, skip this step.
-
-CRITICAL: If this step fails, log the failure and move on to agents-update.
-"""
-needs = ["blocked-review"]
-
-# ─────────────────────────────────────────────────────────────────────
-# Step 6: agents-update — AGENTS.md watermark staleness + size enforcement
+# Step 4: agents-update — AGENTS.md watermark staleness + size enforcement
 # ─────────────────────────────────────────────────────────────────────

 [[steps]]
 id    = "agents-update"
-title = "Check AGENTS.md watermarks, update stale files, enforce size limit"
+title = "Check AGENTS.md watermarks, discover structural changes, update stale files"
 description = """
-Check all AGENTS.md files for staleness, update any that are outdated, and
-enforce the ~200-line size limit via progressive disclosure splitting.
-This keeps documentation fresh — runs 2x/day so drift stays small.
+Maintain all AGENTS.md files by detecting structural drift since the last
+review. Uses git history as the source of truth — not vibes.

-## Part A: Watermark staleness check and update
+## Part A: Discover what changed

 1. Read the HEAD SHA from preflight:
     HEAD_SHA=$(cat /tmp/gardener-head-sha)
@ -397,110 +359,80 @@ This keeps documentation fresh — runs 2x/day so drift stays small.

 3. For each file, read the watermark from line 1:
     <!-- last-reviewed: <sha> -->
+   If no watermark exists, treat the file as fully stale (review everything).

 4. Check for changes since the watermark:
     git log --oneline <watermark>..HEAD -- <directory>
   If zero changes, the file is current — skip it.

-5. For stale files:
-   - Read the AGENTS.md and the source files in that directory
-   - Update the documentation to reflect code changes since the watermark
-   - Set the watermark to the HEAD SHA from the preflight step
-   - Conventions: architecture and WHY not implementation details
+5. For each stale file, run a STRUCTURAL DIFF — this is the core of the step:

-## Part B: Size limit enforcement (progressive disclosure split)
+   a. FILE INVENTORY: list files at watermark vs HEAD for this directory:
+        git ls-tree -r --name-only <watermark> -- <directory>
+        git ls-tree -r --name-only HEAD -- <directory>
+      Diff the two lists. Categorize:
+        - NEW files: in HEAD but not in watermark
+        - DELETED files: in watermark but not in HEAD
+        - Check AGENTS.md layout section: does it list each current file?
+          Files present in the directory but absent from the layout = GAPS.
+          Files listed in the layout but missing from the directory = LIES.

-After all updates are done, count lines in the root AGENTS.md:
+   b. REFERENCE VALIDATION: extract every file path, function name, and
+      shell variable referenced in the AGENTS.md. For each:
+        - File paths: verify the file exists (ls or git ls-tree HEAD)
+        - Function names: grep for the definition in the codebase
+        - Script names: verify they exist where claimed
+      Any reference that fails validation is a LIE — flag it for correction.
+
+   c. SEMANTIC CHANGES: for files that existed at both watermark and HEAD,
+      check if they changed meaningfully:
+        git diff <watermark>..HEAD -- <directory>/*.sh <directory>/*.py <directory>/*.toml
+      Look for: new exported functions, removed functions, renamed files,
+      changed CLI flags, new environment variables, new configuration.
+      Ignore: internal refactors, comment changes, formatting.
+
+6. For each stale file, apply corrections:
+   - Add NEW files to the layout section
+   - Remove DELETED files from the layout section
+   - Fix every LIE found in reference validation
+   - Add notes about significant SEMANTIC CHANGES
+   - Set the watermark to HEAD_SHA
+   - Conventions: document architecture and WHY, not implementation details
+
+## Part B: Size limit enforcement
+
+After all updates, count lines in the root AGENTS.md:
     wc -l < "$PROJECT_REPO_ROOT/AGENTS.md"

-If the root AGENTS.md exceeds 200 lines, perform a progressive disclosure
-split. The principle: agent reads the map, drills into detail only when
-needed. You wouldn't dump a 500-page wiki on a new hire's first morning.
+If it exceeds 200 lines, split verbose sections into per-directory files
+using progressive disclosure:

-6. Identify per-directory sections to extract. Each agent section under
-   "## Agents" (e.g. "### Dev (`dev/`)", "### Review (`review/`)") and
-   each helper section (e.g. "### Shared helpers (`lib/`)") is a candidate.
-   Also extract verbose subsections like "## Issue lifecycle and label
-   conventions" and "## Phase-Signaling Protocol" into docs/ or the
-   relevant directory.
+7. Identify sections that can be extracted to per-directory files.
+   Keep the root AGENTS.md as a table of contents — brief overview,
+   directory layout, summary tables with links to detail files.

-7. For each section to extract, create a `{dir}/AGENTS.md` file with:
+8. For each extracted section, create a `{dir}/AGENTS.md` with:
   - Line 1: watermark <!-- last-reviewed: <HEAD_SHA> -->
-   - The full section content (role, trigger, key files, env vars, lifecycle)
-   - Keep the same markdown structure and detail level
+   - The full section content, preserving structure and detail

-   Example for dev/:
-   ```
-   <!-- last-reviewed: abc123 -->
-   # Dev Agent
+9. Replace extracted sections in root with concise summaries + links.

-   **Role**: Implement issues autonomously ...
-   **Trigger**: dev-poll.sh runs every 10 min ...
-   **Key files**: ...
-   **Environment variables consumed**: ...
-   **Lifecycle**: ...
-   ```
-
-8. Replace extracted sections in the root AGENTS.md with a concise
-   directory map table. The root file keeps ONLY:
-   - Watermark (line 1)
-   - ## What this repo is (brief overview)
-   - ## Directory layout (existing tree)
-   - ## Tech stack
-   - ## Coding conventions
-   - ## How to lint and test
-   - ## Agents — replaced with a summary table pointing to per-dir files:
-
-     ## Agents
-
-     | Agent | Directory | Role | Guide |
-     |-------|-----------|------|-------|
-     | Dev | dev/ | Issue implementation | [dev/AGENTS.md](dev/AGENTS.md) |
-     | Review | review/ | PR review | [review/AGENTS.md](review/AGENTS.md) |
-     | Gardener | gardener/ | Backlog grooming | [gardener/AGENTS.md](gardener/AGENTS.md) |
-     | ... | ... | ... | ... |
-
-   - ## Shared helpers — replaced with a brief pointer:
-     "See [lib/AGENTS.md](lib/AGENTS.md) for the full helper reference."
-     Keep the summary table if it fits, or move it to lib/AGENTS.md.
-
-   - ## Issue lifecycle and label conventions — keep a brief summary
-     (labels table + dependency convention) or move verbose parts to
-     docs/PHASE-PROTOCOL.md
-
-   - ## Architecture Decisions — keep in root (humans write, agents enforce)
-
-   - ## Phase-Signaling Protocol — keep a brief summary with pointer:
-     "See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the full spec."
-
-9. Verify the root AGENTS.md is now under 200 lines:
-     LINE_COUNT=$(wc -l < "$PROJECT_REPO_ROOT/AGENTS.md")
-     if [ "$LINE_COUNT" -gt 200 ]; then
-       echo "WARNING: root AGENTS.md still $LINE_COUNT lines after split"
-     fi
-   If still over 200, trim further — move more detail into per-directory
-   files. The root should read like a table of contents, not an encyclopedia.
-
-10. Each new per-directory AGENTS.md must have a watermark on line 1.
-    The gardener maintains freshness for ALL AGENTS.md files — root and
-    per-directory — using the same watermark mechanism from Part A.
+10. Verify root is under 200 lines. If still over, extract more.

 ## Staging

-11. Stage ALL AGENTS.md files you created or changed — do NOT commit yet.
-    All git writes happen in the commit-and-pr step at the end:
+11. Stage all AGENTS.md files created or changed:
      find . -name "AGENTS.md" -not -path "./.git/*" -exec git add {} +

-12. If no AGENTS.md files need updating AND root is under 200 lines,
-    skip this step entirely.
+12. If no files need updating AND root is under 200 lines, skip entirely.

 CRITICAL: If this step fails for any reason, log the failure and move on.
 Do NOT let an AGENTS.md failure prevent the commit-and-pr step.
 """
-needs = ["stale-pr-recycle"]
+needs = ["dust-bundling"]

 # ─────────────────────────────────────────────────────────────────────
-# Step 7: commit-and-pr — single commit with all file changes
+# Step 5: commit-and-pr — single commit with all file changes
 # ─────────────────────────────────────────────────────────────────────

 [[steps]]
@ -554,16 +486,14 @@ executes them after the PR merges.
        PR_NUMBER=$(echo "$PR_RESPONSE" | jq -r '.number')
   h. Save PR number for orchestrator tracking:
        echo "$PR_NUMBER" > /tmp/gardener-pr-${PROJECT_NAME}.txt
-   i. Signal the orchestrator to monitor CI:
-        echo "PHASE:awaiting_ci" > "$PHASE_FILE"
-   j. STOP and WAIT. Do NOT return to the primary branch.
-      The orchestrator polls CI, injects results and review feedback.
-      When you receive injected CI or review feedback, follow its
-      instructions, then write PHASE:awaiting_ci and wait again.
+   i. The orchestrator handles CI/review via pr_walk_to_merge.
+      The gardener stays alive to inject CI results and review feedback
+      as they come in, then executes the pending-actions manifest after merge.

 4. If no file changes existed (step 2 found nothing):
-     echo "PHASE:done" > "$PHASE_FILE"
+     # Nothing to commit — the gardener has no work to do this run.
+     exit 0

-5. If PR creation fails, log the error and write PHASE:failed.
+5. If PR creation fails, log the error and exit.
 """
 needs = ["agents-update"]
--- a/formulas/run-planner.toml
+++ b/formulas/run-planner.toml
@ -1,6 +1,6 @@
 # formulas/run-planner.toml — Strategic planning formula (v4: graph-driven)
 #
-# Executed directly by planner-run.sh via cron — no action issues.
+# Executed directly by planner-run.sh via polling loop — no action issues.
 # planner-run.sh creates a tmux session with Claude (opus) and injects
 # this formula as context, plus the graph report from build-graph.py.
 #
--- a/formulas/run-predictor.toml
+++ b/formulas/run-predictor.toml
@ -6,7 +6,7 @@
 # Memory: previous predictions on the forge ARE the memory.
 # No separate memory file — the issue tracker is the source of truth.
 #
-# Executed by predictor/predictor-run.sh via cron — no action issues.
+# Executed by predictor/predictor-run.sh via polling loop — no action issues.
 # predictor-run.sh creates a tmux session with Claude (sonnet) and injects
 # this formula as context. Claude executes all steps autonomously.
 #
@ -119,27 +119,24 @@ For each weakness you identify, choose one:
    **Suggested action:** <what the planner should consider>

 **EXPLOIT** — high confidence, have a theory you can test:
-  File a prediction/unreviewed issue AND an action issue that dispatches
-  a formula to generate evidence.
+  File a prediction/unreviewed issue AND a vault PR that dispatches
+  a formula to generate evidence (AD-006: external actions go through vault).

-  The prediction explains the theory. The action generates the proof.
-  When the planner runs next, evidence is already there.
+  The prediction explains the theory. The vault PR triggers the proof
+  after human approval. When the planner runs next, evidence is already there.

-  Action issue body format (label: action):
-    Dispatched by predictor to test theory in #<prediction_number>.
+  Vault dispatch (requires lib/vault.sh):
+    source "$PROJECT_REPO_ROOT/lib/vault.sh"

-    ## Task
-    Run <formula name> with focus on <specific test>.
-
-    ## Expected evidence
-    Results in evidence/<dir>/<date>-<name>.json
-
-    ## Acceptance criteria
-    - [ ] Formula ran to completion
-    - [ ] Evidence file written with structured results
-
-    ## Affected files
-    - evidence/<dir>/
+    TOML_CONTENT="id = \"predict-<prediction_number>-<formula>\"
+context = \"Test prediction #<prediction_number>: <theory summary> — focus: <specific test>\"
+formula = \"<formula-name>\"
+secrets = []
+# Unblocks: #<prediction_number>
+# Expected evidence: evidence/<dir>/<date>-<name>.json
+"
+    PR_NUM=$(vault_request "predict-<prediction_number>-<formula>" "$TOML_CONTENT")
+    echo "Vault PR #${PR_NUM} filed to test prediction #<prediction_number>"

  Available formulas (check $PROJECT_REPO_ROOT/formulas/*.toml for current list):
    cat "$PROJECT_REPO_ROOT/formulas/"*.toml | grep '^name' | head -10
@ -156,10 +153,10 @@ tea is pre-configured with login "$TEA_LOGIN" and repo "$FORGE_REPO".
     tea issues create --login "$TEA_LOGIN" --repo "$FORGE_REPO" \
       --title "<title>" --body "<body>" --labels "prediction/unreviewed"

-2. File action dispatches (if exploiting):
-     tea issues create --login "$TEA_LOGIN" --repo "$FORGE_REPO" \
-       --title "action: test prediction #NNN — <formula> <focus>" \
-       --body "<body>" --labels "action"
+2. Dispatch formula via vault (if exploiting):
+     source "$PROJECT_REPO_ROOT/lib/vault.sh"
+     PR_NUM=$(vault_request "predict-NNN-<formula>" "$TOML_CONTENT")
+     # See EXPLOIT section above for TOML_CONTENT format

 3. Close superseded predictions:
     tea issues close <number> --login "$TEA_LOGIN" --repo "$FORGE_REPO"
@ -173,11 +170,11 @@ tea is pre-configured with login "$TEA_LOGIN" and repo "$FORGE_REPO".

 ## Rules

- Max 5 actions total (predictions + action dispatches combined)
- Each exploit counts as 2 (prediction + action dispatch)
+- Max 5 actions total (predictions + vault dispatches combined)
+- Each exploit counts as 2 (prediction + vault dispatch)
 - So: 5 explores, or 2 exploits + 1 explore, or 1 exploit + 3 explores
 - Never re-file a dismissed prediction without new evidence
- Action issues must reference existing formulas — don't invent formulas
+- Vault dispatches must reference existing formulas — don't invent formulas
 - Be specific: name the file, the metric, the threshold, the formula
 - If no weaknesses found, file nothing — that's a strong signal the project is healthy

--- a/formulas/run-publish-site.toml
+++ b/formulas/run-publish-site.toml
@ -216,7 +216,7 @@ Check 3 — engagement evidence has been collected at least once:
    jq -r '"  visitors=\(.unique_visitors) pages=\(.page_views) referrals=\(.referred_visitors)"' "$LATEST" 2>/dev/null || true
  else
    echo "NOTE: No engagement reports yet — run: bash site/collect-engagement.sh"
-    echo "The first report will appear after the cron job runs (daily at 23:55 UTC)."
+    echo "The first report will appear after the scheduled collection runs (daily at 23:55 UTC)."
  fi

 Summary:
--- a/formulas/run-supervisor.toml
+++ b/formulas/run-supervisor.toml
@ -1,7 +1,7 @@
 # formulas/run-supervisor.toml — Supervisor formula (health monitoring + remediation)
 #
-# Executed by supervisor/supervisor-run.sh via cron (every 20 minutes).
-# supervisor-run.sh creates a tmux session with Claude (sonnet) and injects
+# Executed by supervisor/supervisor-run.sh via polling loop (every 20 minutes).
+# supervisor-run.sh runs claude -p via agent-sdk.sh and injects
 # this formula with pre-collected metrics as context.
 #
 # Steps: preflight → health-assessment → decide-actions → report → journal
@ -34,13 +34,15 @@ and injected into your prompt above. Review them now.
   (24h grace period). Check the "Stale Phase Cleanup" section for any
   files cleaned or in grace period this run.

-2. Check vault state: read $OPS_REPO_ROOT/vault/pending/*.md for any procurement items
+2. Check vault state: read ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/*.md for any procurement items
   the planner has filed. Note items relevant to the health assessment
   (e.g. a blocked resource that explains why the pipeline is stalled).
+   Note: In degraded mode, vault items are stored locally.

 3. Read the supervisor journal for recent history:
-     JOURNAL_FILE="$OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md"
+     JOURNAL_FILE="${OPS_JOURNAL_ROOT:-$OPS_REPO_ROOT/journal/supervisor}/$(date -u +%Y-%m-%d).md"
     if [ -f "$JOURNAL_FILE" ]; then cat "$JOURNAL_FILE"; fi
+   Note: In degraded mode, the journal is stored locally and not committed to git.

 4. Note any values that cross these thresholds:
   - RAM available < 500MB or swap > 3GB → P0 (memory crisis)
@ -105,8 +107,13 @@ For each finding from the health assessment, decide and execute an action.
  sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true

 **P1 Disk pressure:**
-  # Docker cleanup
+  # First pass: dangling only (cheap, safe)
  sudo docker system prune -f >/dev/null 2>&1 || true
+  # If still > 80%, escalate to all unused images (more aggressive but necessary)
+  _pct=$(df -h / | awk 'NR==2{print $5}' | tr -d '%')
+  if [ "${_pct:-0}" -gt 80 ]; then
+    sudo docker system prune -a -f >/dev/null 2>&1 || true
+  fi
  # Truncate logs > 10MB
  for f in "$FACTORY_ROOT"/{dev,review,supervisor,gardener,planner,predictor}/*.log; do
    [ -f "$f" ] && [ "$(du -k "$f" | cut -f1)" -gt 10240 ] && truncate -s 0 "$f"
@ -137,21 +144,22 @@ For each finding from the health assessment, decide and execute an action.

 **P3 Stale PRs (CI done >20min, no push since):**
  Do NOT read dev-poll.sh, push branches, attempt merges, or investigate pipeline code.
-  Instead, nudge the dev-agent via tmux injection if a session is alive:
-    # Find the dev session for this issue
-    SESSION=$(tmux list-sessions -F '#{session_name}' 2>/dev/null | grep "dev-.*-${ISSUE_NUM}" | head -1)
-    if [ -n "$SESSION" ]; then
-      # Inject a nudge into the dev-agent session
-      tmux send-keys -t "$SESSION" "# [supervisor] PR stale >20min — CI finished, please push or update" Enter
-    fi
-  If no active tmux session exists, note it in the journal for the next dev-poll cycle.
+  Instead, file a vault item for the dev-agent to pick up:
+    Write ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/stale-pr-${ISSUE_NUM}.md:
+      # Stale PR: ${PR_TITLE}
+      ## What
+      CI finished >20min ago but no git push has been made to the PR branch.
+      ## Why
+      P3 — Factory degraded: PRs should be pushed within 20min of CI completion.
+      ## Unblocks
+      - Factory health: dev-agent will push the branch and continue the workflow
  Do NOT file vault items for stale PRs unless they remain stale for >3 consecutive runs.

 ### Cannot auto-fix → file vault item

 For P0-P2 issues that persist after auto-fix attempts, or issues requiring
 human judgment, file a vault procurement item:
-  Write $OPS_REPO_ROOT/vault/pending/supervisor-<issue-slug>.md:
+  Write ${OPS_VAULT_ROOT:-$OPS_REPO_ROOT/vault/pending}/supervisor-<issue-slug>.md:
    # <What is needed>
    ## What
    <description of the problem and why the supervisor cannot fix it>
@ -160,13 +168,23 @@ human judgment, file a vault procurement item:
    ## Unblocks
    - Factory health: <what this resolves>
  Vault PR filed on ops repo — human approves via PR review.
+  Note: In degraded mode (no ops repo), vault items are written locally to ${OPS_VAULT_ROOT:-local path}.

-Read the relevant best-practices file before taking action:
-  cat "$OPS_REPO_ROOT/knowledge/memory.md"    # P0
-  cat "$OPS_REPO_ROOT/knowledge/disk.md"      # P1
-  cat "$OPS_REPO_ROOT/knowledge/ci.md"        # P2 CI
-  cat "$OPS_REPO_ROOT/knowledge/dev-agent.md" # P2 agent
-  cat "$OPS_REPO_ROOT/knowledge/git.md"       # P2 git
+### Reading best-practices files
+
+Read the relevant best-practices file before taking action. In degraded mode,
+use the bundled knowledge files from ${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}:
+
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/memory.md"    # P0
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/disk.md"      # P1
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/ci.md"        # P2 CI
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/dev-agent.md" # P2 agent
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/git.md"       # P2 git
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/review-agent.md" # P2 review
+  cat "${OPS_KNOWLEDGE_ROOT:-$OPS_REPO_ROOT/knowledge}/forge.md"     # P2 forge
+
+Note: If OPS_REPO_ROOT is not available (degraded mode), the bundled knowledge
+files in ${OPS_KNOWLEDGE_ROOT:-<unset>} provide fallback guidance.

 Track what you fixed and what vault items you filed for the report step.
 """
@ -208,7 +226,7 @@ description = """
 Append a timestamped entry to the supervisor journal.

 File path:
-  $OPS_REPO_ROOT/journal/supervisor/$(date -u +%Y-%m-%d).md
+  ${OPS_JOURNAL_ROOT:-$OPS_REPO_ROOT/journal/supervisor}/$(date -u +%Y-%m-%d).md

 If the file already exists (multiple runs per day), append a new section.
 If it does not exist, create it.
@ -241,17 +259,24 @@ run-to-run context so future supervisor runs can detect trends
 IMPORTANT: Do NOT commit or push the journal — it is a local working file.
 The journal directory is committed to git periodically by other agents.

+Note: In degraded mode (no ops repo), the journal is written locally to
+${OPS_JOURNAL_ROOT:-<unset>} and is NOT automatically committed to any repo.
+
 ## Learning

-If you discover something new during this run, append it to the relevant
-knowledge file in the ops repo:
-  echo "### Lesson title
-  Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/<file>.md"
+If you discover something new during this run:
+
+- In full mode (ops repo available): append to the relevant knowledge file:
+    echo "### Lesson title
+    Description of what you learned." >> "${OPS_REPO_ROOT}/knowledge/<file>.md"
+
+- In degraded mode: write to the local knowledge directory for reference:
+    echo "### Lesson title
+    Description of what you learned." >> "${OPS_KNOWLEDGE_ROOT:-<unset>}/<file>.md"

 Knowledge files: memory.md, disk.md, ci.md, forge.md, dev-agent.md,
 review-agent.md, git.md.

-After writing the journal, write the phase signal:
-  echo 'PHASE:done' > "$PHASE_FILE"
+After writing the journal, the agent session completes automatically.
 """
 needs = ["report"]
--- a/formulas/triage.toml
+++ b/formulas/triage.toml
@ -0,0 +1,267 @@
+# formulas/triage.toml — Triage-agent formula (generic template)
+#
+# This is the base template for triage investigations.
+# Project-specific formulas (e.g. formulas/triage-harb.toml) extend this by
+# overriding the fields in the [project] section and providing stack-specific
+# step descriptions.
+#
+# Triggered by: bug-report + in-triage label combination.
+# Set by the reproduce-agent when:
+#   - Bug was confirmed (reproduced)
+#   - Quick log analysis did not reveal an obvious root cause
+#   - Reproduce-agent documented all steps taken and logs examined
+#
+# Steps:
+#   1. read-findings   — parse issue comments for prior reproduce-agent evidence
+#   2. trace-data-flow — follow symptom through UI → API → backend → data store
+#   3. instrumentation — throwaway branch, add logging, restart, observe
+#   4. decompose       — file backlog issues for each root cause
+#   5. link-back       — update original issue, swap in-triage → in-progress
+#   6. cleanup         — delete throwaway debug branch
+#
+# Best practices:
+#   - Start from reproduce-agent findings; do not repeat their work
+#   - Budget: 70% tracing data flow, 30% instrumented re-runs
+#   - Multiple causes: check if layered (Depends-on) or independent (Related)
+#   - Always delete the throwaway debug branch before finishing
+#   - If inconclusive after full turn budget: leave in-triage, post what was
+#     tried, do NOT relabel — supervisor handles stale triage sessions
+#
+# Project-specific formulas extend this template by defining:
+#   - stack_script: how to start/stop the project stack
+#   - [project].data_flow: layer names (e.g. "chain → indexer → GraphQL → UI")
+#   - [project].api_endpoints: which APIs/services to inspect
+#   - [project].stack_lock: stack lock configuration
+#   - Per-step description overrides with project-specific commands
+#
+# No hard timeout — runs until Claude hits its turn limit.
+# Stack lock held for full run (triage is rare; blocking CI is acceptable).
+
+name            = "triage"
+description     = "Deep root cause analysis: trace data flow, add debug instrumentation, decompose causes into backlog issues."
+version         = 2
+
+# Set stack_script to the restart command for local stacks.
+# Leave empty ("") to connect to an existing staging environment.
+stack_script    = ""
+
+tools           = ["playwright"]
+
+# ---------------------------------------------------------------------------
+# Project-specific extension fields.
+# Override these in formulas/triage-<project>.toml.
+# ---------------------------------------------------------------------------
+[project]
+# Human-readable layer names for the data-flow trace (generic default).
+# Example project override: "chain → indexer → GraphQL → UI"
+data_flow       = "UI → API → backend → data store"
+
+# Comma-separated list of API endpoints or services to inspect.
+# Example: "GraphQL /graphql, REST /api/v1, RPC ws://localhost:8545"
+api_endpoints   = ""
+
+# Stack lock configuration (leave empty for default behavior).
+# Example: "full" to hold a full stack lock during triage.
+stack_lock      = ""
+
+# ---------------------------------------------------------------------------
+# Steps
+# ---------------------------------------------------------------------------
+
+[[steps]]
+id    = "read-findings"
+title = "Read reproduce-agent findings"
+description = """
+Before doing anything else, parse all prior evidence from the issue comments.
+
+1. Fetch the issue body and all comments:
+     curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+       "${FORGE_API}/issues/${ISSUE_NUMBER}" | jq -r '.body'
+     curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+       "${FORGE_API}/issues/${ISSUE_NUMBER}/comments" | jq -r '.[].body'
+
+2. Identify the reproduce-agent comment (look for sections like
+   "Reproduction steps", "Logs examined", "What was tried").
+
+3. Extract and note:
+   - The exact symptom (error message, unexpected value, visual regression)
+   - Steps that reliably trigger the bug
+   - Log lines or API responses already captured
+   - Any hypotheses the reproduce-agent already ruled out
+
+Do NOT repeat work the reproduce-agent already did. Your job starts where
+theirs ended. If no reproduce-agent comment is found, note it and proceed
+with fresh investigation using the issue body only.
+"""
+
+[[steps]]
+id    = "trace-data-flow"
+title = "Trace data flow from symptom to source"
+description = """
+Systematically follow the symptom backwards through each layer of the stack.
+Spend ~70% of your total turn budget here before moving to instrumentation.
+
+Generic layer traversal (adapt to the project's actual stack):
+  UI → API → backend → data store
+
+For each layer boundary:
+  1. What does the upstream layer send?
+  2. What does the downstream layer expect?
+  3. Is there a mismatch? If yes — is this the root cause or a symptom?
+
+Tracing checklist:
+  a. Start at the layer closest to the visible symptom.
+  b. Read the relevant source files — do not guess data shapes.
+  c. Cross-reference API contracts: compare what the code sends vs what it
+     should send according to schemas, type definitions, or documentation.
+  d. Check recent git history on suspicious files:
+       git log --oneline -20 -- <file>
+  e. Search for related issues or TODOs in the code:
+       grep -r "TODO\|FIXME\|HACK" -- <relevant directory>
+
+Capture for each layer:
+  - The data shape flowing in and out (field names, types, nullability)
+  - Whether the layer's behavior matches its documented contract
+  - Any discrepancy found
+
+If a clear root cause becomes obvious during tracing, note it and continue
+checking whether additional causes exist downstream.
+"""
+needs = ["read-findings"]
+
+[[steps]]
+id    = "instrumentation"
+title = "Add debug instrumentation on a throwaway branch"
+description = """
+Use ~30% of your total turn budget here. Only instrument after tracing has
+identified the most likely failure points — do not instrument blindly.
+
+1. Create a throwaway debug branch (NEVER commit this to main):
+     cd "$PROJECT_REPO_ROOT"
+     git checkout -b debug/triage-${ISSUE_NUMBER}
+
+2. Add targeted logging at the layer boundaries identified during tracing:
+   - Console.log / structured log statements around the suspicious code path
+   - Log the actual values flowing through: inputs, outputs, intermediate state
+   - Add verbose mode flags if the stack supports them
+   - Keep instrumentation minimal — only what confirms or refutes the hypothesis
+
+3. Restart the stack using the configured script (if set):
+     ${stack_script:-"# No stack_script configured — restart manually or connect to staging"}
+
+4. Re-run the reproduction steps from the reproduce-agent findings.
+
+5. Observe and capture new output:
+   - Paste relevant log lines into your working notes
+   - Note whether the observed values match or contradict the hypothesis
+
+6. If the first instrumentation pass is inconclusive, iterate:
+   - Narrow the scope to the next most suspicious boundary
+   - Re-instrument, restart, re-run
+   - Maximum 2-3 instrumentation rounds before declaring inconclusive
+
+Do NOT push the debug branch. It will be deleted in the cleanup step.
+"""
+needs = ["trace-data-flow"]
+
+[[steps]]
+id    = "decompose"
+title = "Decompose root causes into backlog issues"
+description = """
+After tracing and instrumentation, articulate each distinct root cause.
+
+For each root cause found:
+
+1. Determine the relationship to other causes:
+   - Layered (one causes another) → use Depends-on in the issue body
+   - Independent (separate code paths fail independently) → use Related
+
+2. Create a backlog issue for each root cause:
+     curl -sf -X POST "${FORGE_API}/issues" \\
+       -H "Authorization: token ${FORGE_TOKEN}" \\
+       -H "Content-Type: application/json" \\
+       -d '{
+         "title": "fix: <specific description of root cause N>",
+         "body": "## Root cause\\n<exact code path, file:line>\\n\\n## Fix suggestion\\n<recommended approach>\\n\\n## Context\\nDecomposed from #${ISSUE_NUMBER} (cause N of M)\\n\\n## Dependencies\\n<#X if this depends on another cause being fixed first>",
+         "labels": [{"name": "backlog"}]
+       }'
+
+3. Note the newly created issue numbers.
+
+If only one root cause is found, still create a single backlog issue with
+the specific code location and fix suggestion.
+
+If the investigation is inconclusive (no clear root cause found), skip this
+step and proceed directly to link-back with the inconclusive outcome.
+"""
+needs = ["instrumentation"]
+
+[[steps]]
+id    = "link-back"
+title = "Update original issue and relabel"
+description = """
+Post a summary comment on the original issue and update its labels.
+
+### If root causes were found (conclusive):
+
+Post a comment:
+  "## Triage findings
+
+  Found N root cause(s):
+  - #X — <one-line description> (cause 1 of N)
+  - #Y — <one-line description> (cause 2 of N, depends on #X)
+
+  Data flow traced: <layer where the bug originates>
+  Instrumentation: <key log output that confirmed the cause>
+
+  Next step: backlog issues above will be implemented in dependency order."
+
+Then swap labels:
+  - Remove: in-triage
+  - Add: in-progress
+
+### If investigation was inconclusive (turn budget exhausted):
+
+Post a comment:
+  "## Triage — inconclusive
+
+  Traced: <layers checked>
+  Tried: <instrumentation attempts and what they showed>
+  Hypothesis: <best guess at cause, if any>
+
+  No definitive root cause identified. Leaving in-triage for supervisor
+  to handle as a stale triage session."
+
+Do NOT relabel. Leave in-triage. The supervisor monitors stale triage
+sessions and will escalate or reassign.
+
+**CRITICAL: Write outcome file** — Always write the outcome to the outcome file:
+  - If root causes found (conclusive): echo "reproduced" > /tmp/triage-outcome-${ISSUE_NUMBER}.txt
+  - If inconclusive: echo "needs-triage" > /tmp/triage-outcome-${ISSUE_NUMBER}.txt
+"""
+needs = ["decompose"]
+
+[[steps]]
+id    = "cleanup"
+title = "Delete throwaway debug branch"
+description = """
+Always delete the debug branch, even if the investigation was inconclusive.
+
+1. Switch back to the main branch:
+     cd "$PROJECT_REPO_ROOT"
+     git checkout "$PRIMARY_BRANCH"
+
+2. Delete the local debug branch:
+     git branch -D debug/triage-${ISSUE_NUMBER}
+
+3. Confirm no remote was pushed (if accidentally pushed, delete it too):
+     git push origin --delete debug/triage-${ISSUE_NUMBER} 2>/dev/null || true
+
+4. Verify the worktree is clean:
+     git status
+     git worktree list
+
+A clean repo is a prerequisite for the next dev-agent run. Never leave
+debug branches behind — they accumulate and pollute the branch list.
+"""
+needs = ["link-back"]
--- a/gardener/AGENTS.md
+++ b/gardener/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: f32707ba659de278a3af434e3549fb8a8dce9d3a -->
+<!-- last-reviewed: 4fcbca1bef23734d05a9fc97bb56cd0a6bbcd25f -->
 # Gardener Agent

 **Role**: Backlog grooming — detect duplicate issues, missing acceptance
@ -7,22 +7,26 @@ the quality gate: strips the `backlog` label from issues that lack acceptance
 criteria checkboxes (`- [ ]`) or an `## Affected files` section. Invokes
 Claude to fix what it can; files vault items for what it cannot.

-**Trigger**: `gardener-run.sh` runs 4x/day via cron. Sources `lib/guard.sh` and
-calls `check_active gardener` first — skips if `$FACTORY_ROOT/state/.gardener-active`
-is absent. Then creates a tmux session with `claude --model sonnet`, injects
-`formulas/run-gardener.toml` as context, monitors the phase file, and cleans up
-on completion or timeout (2h max session). No action issues — the gardener runs
-directly from cron like the planner, predictor, and supervisor.
+**Trigger**: `gardener-run.sh` is invoked by the polling loop in `docker/agents/entrypoint.sh`
+every 6 hours (iteration math at line 182-194). Sources `lib/guard.sh` and calls
+`check_active gardener` first — skips if `$FACTORY_ROOT/state/.gardener-active` is absent.
+**Early-exit optimization**: if no issues, PRs, or repo files have changed since the last
+run (checked via Forgejo API and `git diff`), the model is not invoked — the run exits
+immediately (no tmux session, no tokens consumed). Otherwise, creates a tmux session with
+`claude --model sonnet`, injects `formulas/run-gardener.toml` as context, monitors the
+phase file, and cleans up on completion or timeout (2h max session). No action issues —
+the gardener runs as part of the polling loop alongside the planner, predictor, and supervisor.

 **Key files**:
- `gardener/gardener-run.sh` — Cron wrapper + orchestrator: lock, memory guard,
+- `gardener/gardener-run.sh` — Polling loop participant + orchestrator: lock, memory guard,
  sources disinto project config, creates tmux session, injects formula prompt,
  monitors phase file via custom `_gardener_on_phase_change` callback (passed to
  `run_formula_and_monitor`). Stays alive through CI/review/merge cycle after
  `PHASE:awaiting_ci` — injects CI results and review feedback, re-signals
  `PHASE:awaiting_ci` after fixes, signals `PHASE:awaiting_review` on CI pass.
  Executes pending-actions manifest after PR merge.
- `formulas/run-gardener.toml` — Execution spec: preflight, grooming, dust-bundling, blocked-review, agents-update, commit-and-pr
+- `formulas/run-gardener.toml` — Execution spec: preflight, grooming, dust-bundling,
+  agents-update, commit-and-pr
 - `gardener/pending-actions.json` — Manifest of deferred repo actions (label changes,
  closures, comments, issue creation). Written during grooming steps, committed to the
  PR, reviewed alongside AGENTS.md changes, executed by gardener-run.sh after merge.
@ -31,10 +35,10 @@ directly from cron like the planner, predictor, and supervisor.
 - `FORGE_TOKEN`, `FORGE_GARDENER_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`
 - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by gardener-run.sh)

-**Lifecycle**: gardener-run.sh (cron 0,6,12,18) → `check_active gardener` → lock + memory guard →
-load formula + context → create tmux session →
+**Lifecycle**: gardener-run.sh (invoked by polling loop every 6h, `check_active gardener`) →
+lock + memory guard → load formula + context → create tmux session →
 Claude grooms backlog (writes proposed actions to manifest), bundles dust,
-reviews blocked issues, updates AGENTS.md, commits manifest + docs to PR →
+updates AGENTS.md, commits manifest + docs to PR →
 `PHASE:awaiting_ci` (stays alive) → CI pass → `PHASE:awaiting_review` →
 review feedback → address + re-signal → merge → gardener-run.sh executes
 manifest actions via API → `PHASE:done`. When blocked on external resources
--- a/gardener/best-practices.md
+++ b/gardener/best-practices.md
@ -51,3 +51,4 @@ Compact, decision-ready. Human should be able to reply "1a 2c 3b" and be done.
 - Dev-agent doesn't understand the product — clear acceptance criteria save 2-3 CI cycles
 - Feature issues MUST list affected e2e test files
 - Issue templates from ISSUE-TEMPLATES.md propagate via triage gate
+- **AD-002 is a runtime invariant; nothing for the gardener to check at issue-groom time.** Concurrency is enforced by `flock session.lock` within each container and by `issue_claim` for per-issue work. A violation manifests as a 401 or VRAM OOM in agent logs, not as a malformed issue.
--- a/gardener/gardener-run.sh
+++ b/gardener/gardener-run.sh
@ -1,12 +1,12 @@
 #!/usr/bin/env bash
 # =============================================================================
-# gardener-run.sh — Cron wrapper: gardener execution via SDK + formula
+# gardener-run.sh — Polling-loop wrapper: gardener execution via SDK + formula
 #
 # Synchronous bash loop using claude -p (one-shot invocation).
 # No tmux sessions, no phase files — the bash script IS the state machine.
 #
 # Flow:
-#   1. Guards: cron lock, memory check
+#   1. Guards: run lock, memory check
 #   2. Load formula (formulas/run-gardener.toml)
 #   3. Build context: AGENTS.md, scratch file, prompt footer
 #   4. agent_run(worktree, prompt) → Claude does maintenance, pushes if needed
@ -17,7 +17,7 @@
 # Usage:
 #   gardener-run.sh [projects/disinto.toml]   # project config (default: disinto)
 #
-# Cron: 0 0,6,12,18 * * * cd /home/debian/dark-factory && bash gardener/gardener-run.sh projects/disinto.toml
+# Called by: entrypoint.sh polling loop (every 6 hours)
 # =============================================================================
 set -euo pipefail

@ -45,7 +45,7 @@ source "$FACTORY_ROOT/lib/agent-sdk.sh"
 # shellcheck source=../lib/pr-lifecycle.sh
 source "$FACTORY_ROOT/lib/pr-lifecycle.sh"

-LOG_FILE="$SCRIPT_DIR/gardener.log"
+LOG_FILE="${DISINTO_LOG_DIR}/gardener/gardener.log"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh
 LOGFILE="$LOG_FILE"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh
@ -54,22 +54,46 @@ SCRATCH_FILE="/tmp/gardener-${PROJECT_NAME}-scratch.md"
 RESULT_FILE="/tmp/gardener-result-${PROJECT_NAME}.txt"
 GARDENER_PR_FILE="/tmp/gardener-pr-${PROJECT_NAME}.txt"
 WORKTREE="/tmp/${PROJECT_NAME}-gardener-run"
+LAST_SHA_FILE="${DISINTO_DATA_DIR}/gardener-last-sha.txt"

-log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; }
+# Override LOG_AGENT for consistent agent identification
+# shellcheck disable=SC2034  # consumed by agent-sdk.sh and env.sh log()
+LOG_AGENT="gardener"

 # ── Guards ────────────────────────────────────────────────────────────────
 check_active gardener
-acquire_cron_lock "/tmp/gardener-run.lock"
-check_memory 2000
+acquire_run_lock "/tmp/gardener-run.lock"
+memory_guard 2000

 log "--- Gardener run start ---"

-# ── Resolve agent identity for .profile repo ────────────────────────────
-if [ -z "${AGENT_IDENTITY:-}" ] && [ -n "${FORGE_GARDENER_TOKEN:-}" ]; then
-  AGENT_IDENTITY=$(curl -sf -H "Authorization: token ${FORGE_GARDENER_TOKEN}" \
-    "${FORGE_URL:-http://localhost:3000}/api/v1/user" 2>/dev/null | jq -r '.login // empty' 2>/dev/null || true)
+# ── Resolve forge remote for git operations ─────────────────────────────
+# Run git operations from the project checkout, not the baked code dir
+cd "$PROJECT_REPO_ROOT"
+
+resolve_forge_remote
+
+# ── Precondition checks: skip if nothing to do ────────────────────────────
+# Check for new commits since last run
+CURRENT_SHA=$(git -C "$FACTORY_ROOT" rev-parse HEAD 2>/dev/null || echo "")
+LAST_SHA=$(cat "$LAST_SHA_FILE" 2>/dev/null || echo "")
+
+# Check for open issues needing grooming
+backlog_count=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  "${FORGE_API}/issues?labels=backlog&state=open&limit=1" 2>/dev/null | jq length) || backlog_count=0
+tech_debt_count=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  "${FORGE_API}/issues?labels=tech-debt&state=open&limit=1" 2>/dev/null | jq length) || tech_debt_count=0
+
+if [ "$CURRENT_SHA" = "$LAST_SHA" ] && [ "${backlog_count:-0}" -eq 0 ] && [ "${tech_debt_count:-0}" -eq 0 ]; then
+  log "no new commits and no issues to groom — skipping"
+  exit 0
 fi

+log "current sha: ${CURRENT_SHA:0:8}..., backlog issues: ${backlog_count}, tech-debt issues: ${tech_debt_count}"
+
+# ── Resolve agent identity for .profile repo ────────────────────────────
+resolve_agent_identity || true
+
 # ── Load formula + context ───────────────────────────────────────────────
 load_formula_or_profile "gardener" "$FACTORY_ROOT/formulas/run-gardener.toml" || exit 1
 build_context_block AGENTS.md
@ -127,16 +151,7 @@ ${SCRATCH_INSTRUCTION}
 ${PROMPT_FOOTER}"

 # ── Create worktree ──────────────────────────────────────────────────────
-cd "$PROJECT_REPO_ROOT"
-git fetch origin "$PRIMARY_BRANCH" 2>/dev/null || true
-worktree_cleanup "$WORKTREE"
-git worktree add "$WORKTREE" "origin/${PRIMARY_BRANCH}" --detach 2>/dev/null
-
-cleanup() {
-  worktree_cleanup "$WORKTREE"
-  rm -f "$GARDENER_PR_FILE"
-}
-trap cleanup EXIT
+formula_worktree_setup "$WORKTREE"

 # ── Post-merge manifest execution ────────────────────────────────────────
 # Reads gardener/pending-actions.json and executes each action via API.
@ -165,19 +180,21 @@ _gardener_execute_manifest() {

    case "$action" in
      add_label)
-        local label label_id
+        local label label_id http_code resp
        label=$(jq -r ".[$i].label" "$manifest_file")
        label_id=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
          "${FORGE_API}/labels" | jq -r --arg n "$label" \
          '.[] | select(.name == $n) | .id') || true
        if [ -n "$label_id" ]; then
-          if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \
+          resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \
               -H 'Content-Type: application/json' \
               "${FORGE_API}/issues/${issue}/labels" \
-               -d "{\"labels\":[${label_id}]}" >/dev/null 2>&1; then
+               -d "{\"labels\":[${label_id}]}" 2>/dev/null) || true
+          http_code=$(echo "$resp" | tail -1)
+          if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
            log "manifest: add_label '${label}' to #${issue}"
          else
-            log "manifest: FAILED add_label '${label}' to #${issue}"
+            log "manifest: FAILED add_label '${label}' to #${issue}: HTTP ${http_code}"
          fi
        else
          log "manifest: FAILED add_label — label '${label}' not found"
@ -185,17 +202,19 @@ _gardener_execute_manifest() {
        ;;

      remove_label)
-        local label label_id
+        local label label_id http_code resp
        label=$(jq -r ".[$i].label" "$manifest_file")
        label_id=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
          "${FORGE_API}/labels" | jq -r --arg n "$label" \
          '.[] | select(.name == $n) | .id') || true
        if [ -n "$label_id" ]; then
-          if curl -sf -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \
-               "${FORGE_API}/issues/${issue}/labels/${label_id}" >/dev/null 2>&1; then
+          resp=$(curl -sf -w "\n%{http_code}" -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \
+               "${FORGE_API}/issues/${issue}/labels/${label_id}" 2>/dev/null) || true
+          http_code=$(echo "$resp" | tail -1)
+          if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
            log "manifest: remove_label '${label}' from #${issue}"
          else
-            log "manifest: FAILED remove_label '${label}' from #${issue}"
+            log "manifest: FAILED remove_label '${label}' from #${issue}: HTTP ${http_code}"
          fi
        else
          log "manifest: FAILED remove_label — label '${label}' not found"
@ -203,34 +222,38 @@ _gardener_execute_manifest() {
        ;;

      close)
-        local reason
+        local reason http_code resp
        reason=$(jq -r ".[$i].reason // empty" "$manifest_file")
-        if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \
+        resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \
             -H 'Content-Type: application/json' \
             "${FORGE_API}/issues/${issue}" \
-             -d '{"state":"closed"}' >/dev/null 2>&1; then
+             -d '{"state":"closed"}' 2>/dev/null) || true
+        http_code=$(echo "$resp" | tail -1)
+        if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
          log "manifest: closed #${issue} (${reason})"
        else
-          log "manifest: FAILED close #${issue}"
+          log "manifest: FAILED close #${issue}: HTTP ${http_code}"
        fi
        ;;

      comment)
-        local body escaped_body
+        local body escaped_body http_code resp
        body=$(jq -r ".[$i].body" "$manifest_file")
        escaped_body=$(printf '%s' "$body" | jq -Rs '.')
-        if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \
+        resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \
             -H 'Content-Type: application/json' \
             "${FORGE_API}/issues/${issue}/comments" \
-             -d "{\"body\":${escaped_body}}" >/dev/null 2>&1; then
+             -d "{\"body\":${escaped_body}}" 2>/dev/null) || true
+        http_code=$(echo "$resp" | tail -1)
+        if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
          log "manifest: commented on #${issue}"
        else
-          log "manifest: FAILED comment on #${issue}"
+          log "manifest: FAILED comment on #${issue}: HTTP ${http_code}"
        fi
        ;;

      create_issue)
-        local title body labels escaped_title escaped_body label_ids
+        local title body labels escaped_title escaped_body label_ids http_code resp
        title=$(jq -r ".[$i].title" "$manifest_file")
        body=$(jq -r ".[$i].body" "$manifest_file")
        labels=$(jq -r ".[$i].labels // [] | .[]" "$manifest_file")
@ -250,40 +273,46 @@ _gardener_execute_manifest() {
          done <<< "$labels"
          [ -n "$ids_json" ] && label_ids="[${ids_json}]"
        fi
-        if curl -sf -X POST -H "Authorization: token ${FORGE_TOKEN}" \
+        resp=$(curl -sf -w "\n%{http_code}" -X POST -H "Authorization: token ${FORGE_TOKEN}" \
             -H 'Content-Type: application/json' \
             "${FORGE_API}/issues" \
-             -d "{\"title\":${escaped_title},\"body\":${escaped_body},\"labels\":${label_ids}}" >/dev/null 2>&1; then
+             -d "{\"title\":${escaped_title},\"body\":${escaped_body},\"labels\":${label_ids}}" 2>/dev/null) || true
+        http_code=$(echo "$resp" | tail -1)
+        if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
          log "manifest: created issue '${title}'"
        else
-          log "manifest: FAILED create_issue '${title}'"
+          log "manifest: FAILED create_issue '${title}': HTTP ${http_code}"
        fi
        ;;

      edit_body)
-        local body escaped_body
+        local body escaped_body http_code resp
        body=$(jq -r ".[$i].body" "$manifest_file")
        escaped_body=$(printf '%s' "$body" | jq -Rs '.')
-        if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \
+        resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \
             -H 'Content-Type: application/json' \
             "${FORGE_API}/issues/${issue}" \
-             -d "{\"body\":${escaped_body}}" >/dev/null 2>&1; then
+             -d "{\"body\":${escaped_body}}" 2>/dev/null) || true
+        http_code=$(echo "$resp" | tail -1)
+        if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
          log "manifest: edited body of #${issue}"
        else
-          log "manifest: FAILED edit_body #${issue}"
+          log "manifest: FAILED edit_body #${issue}: HTTP ${http_code}"
        fi
        ;;

      close_pr)
-        local pr
+        local pr http_code resp
        pr=$(jq -r ".[$i].pr" "$manifest_file")
-        if curl -sf -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \
+        resp=$(curl -sf -w "\n%{http_code}" -X PATCH -H "Authorization: token ${FORGE_TOKEN}" \
             -H 'Content-Type: application/json' \
             "${FORGE_API}/pulls/${pr}" \
-             -d '{"state":"closed"}' >/dev/null 2>&1; then
+             -d '{"state":"closed"}' 2>/dev/null) || true
+        http_code=$(echo "$resp" | tail -1)
+        if [ "$http_code" = "200" ] || [ "$http_code" = "204" ]; then
          log "manifest: closed PR #${pr}"
        else
-          log "manifest: FAILED close_pr #${pr}"
+          log "manifest: FAILED close_pr #${pr}: HTTP ${http_code}"
        fi
        ;;

@ -328,9 +357,9 @@ if [ -n "$PR_NUMBER" ]; then

  if [ "$_PR_WALK_EXIT_REASON" = "merged" ]; then
    # Post-merge: pull primary, mirror push, execute manifest
-    git -C "$PROJECT_REPO_ROOT" fetch origin "$PRIMARY_BRANCH" 2>/dev/null || true
+    git -C "$PROJECT_REPO_ROOT" fetch "${FORGE_REMOTE}" "$PRIMARY_BRANCH" 2>/dev/null || true
    git -C "$PROJECT_REPO_ROOT" checkout "$PRIMARY_BRANCH" 2>/dev/null || true
-    git -C "$PROJECT_REPO_ROOT" pull --ff-only origin "$PRIMARY_BRANCH" 2>/dev/null || true
+    git -C "$PROJECT_REPO_ROOT" pull --ff-only "${FORGE_REMOTE}" "$PRIMARY_BRANCH" 2>/dev/null || true
    mirror_push
    _gardener_execute_manifest
    rm -f "$SCRATCH_FILE"
@ -347,4 +376,8 @@ fi
 profile_write_journal "gardener-run" "Gardener run $(date -u +%Y-%m-%d)" "complete" "" || true

 rm -f "$GARDENER_PR_FILE"
+
+# Persist last-seen SHA for next run comparison
+echo "$CURRENT_SHA" > "$LAST_SHA_FILE"
+
 log "--- Gardener run done ---"
--- a/gardener/pending-actions.json
+++ b/gardener/pending-actions.json
@ -1,32 +1,32 @@
 [
  {
    "action": "edit_body",
-    "issue": 765,
-    "body": "Depends on: none\n\n## Goal\n\nThe disinto website becomes a versioned artifact: built by CI, published to Codeberg's generic package registry, deployed to staging automatically. Version visible in footer.\n\n## Files to add/change\n\n### `site/VERSION`\n```\n0.1.0\n```\n\n### `site/build.sh`\n```bash\n#!/bin/bash\nVERSION=$(cat VERSION)\nmkdir -p dist\ncp *.html *.jpg *.webp *.png *.ico *.xml robots.txt dist/\nsed -i \"s|Built from scrap, powered by a single battery.|v${VERSION} · Built from scrap, powered by a single battery.|\" dist/index.html\necho \"$VERSION\" > dist/VERSION\n```\n\n### `site/index.html`\nNo template placeholder needed — `build.sh` does the sed replacement on the existing footer text.\n\n### `.woodpecker/site.yml`\n```yaml\nwhen:\n  path: \"site/**\"\n  event: push\n  branch: main\n\nsteps:\n  - name: build\n    image: alpine\n    commands:\n      - cd site && sh build.sh\n      - VERSION=$(cat site/VERSION)\n      - tar czf site-${VERSION}.tar.gz -C site/dist .\n\n  - name: publish\n    image: alpine\n    commands:\n      - apk add curl\n      - VERSION=$(cat site/VERSION)\n      - >-\n        curl -sf --user \"johba:$$FORGE_TOKEN\"\n        --upload-file site-${VERSION}.tar.gz\n        \"https://codeberg.org/api/packages/johba/generic/disinto-site/${VERSION}/site-${VERSION}.tar.gz\"\n    environment:\n      FORGE_TOKEN:\n        from_secret: forge_token\n\n  - name: deploy-staging\n    image: alpine\n    commands:\n      - apk add curl\n      - VERSION=$(cat site/VERSION)\n      - >-\n        curl -sf --user \"johba:$$FORGE_TOKEN\"\n        \"https://codeberg.org/api/packages/johba/generic/disinto-site/${VERSION}/site-${VERSION}.tar.gz\"\n        -o site.tar.gz\n      - rm -rf /srv/staging/*\n      - tar xzf site.tar.gz -C /srv/staging/\n    environment:\n      FORGE_TOKEN:\n        from_secret: forge_token\n    volumes:\n      - /home/debian/staging-site:/srv/staging\n```\n\n## Infra setup (manual, before first run)\n- `mkdir -p /home/debian/staging-site`\n- Add to Caddyfile: `staging.disinto.ai { root * /home/debian/staging-site; file_server }`\n- DNS: `staging.disinto.ai` A record → same IP as `disinto.ai`\n- Reload Caddy: `sudo systemctl reload caddy`\n- Add `forge_token` as Woodpecker repo secret for johba/disinto (if not already set)\n- Add `/home/debian/staging-site` to `WOODPECKER_BACKEND_DOCKER_VOLUMES`\n\n## Verification\n- [ ] Merge PR that touches `site/` → CI runs site pipeline\n- [ ] Package appears at `codeberg.org/johba/-/packages/generic/disinto-site/0.1.0`\n- [ ] `staging.disinto.ai` serves the site with `v0.1.0` in footer\n- [ ] `disinto.ai` (production) unchanged\n\n## Related\n- #764 — docker stack edge proxy + staging (future: this moves inside the stack)\n- #755 — vault-gated production promotion (production deploy comes later)\n\n## Affected files\n- `site/VERSION` — new, holds current version string\n- `site/build.sh` — new, builds dist/ with version injected into footer\n- `.woodpecker/site.yml` — new, CI pipeline for build/publish/deploy-staging"
+    "issue": 712,
+    "body": "## Goal\n\nLet `disinto-chat` perform scoped write actions against the factory — specifically: trigger a Woodpecker CI run, create a Forgejo issue, create a Forgejo PR — via explicit backend endpoints. The UI surfaces these as buttons the user clicks from a chat turn that proposes an action. The model never holds API tokens directly.\n\n## Why\n\n- #623 lists these escalations as the difference between \"chat that talks about the project\" and \"chat that moves the project forward\".\n- Routing through explicit backend endpoints (instead of giving the sandboxed claude process API tokens) keeps the trust model tight: the *user* authorises each action, not the model.\n\n## Scope\n\n### Files to touch\n\n- `docker/chat/server.{py,go}` — new authenticated endpoints (reuse #708 / #709 session check):\n  - `POST /chat/action/ci-run` — body `{repo, branch}` → calls Woodpecker API with `WOODPECKER_TOKEN` (already in `.env` from existing factory setup) to trigger a pipeline.\n  - `POST /chat/action/issue-create` — body `{title, body, labels}` → calls Forgejo API `/repos/<owner>/<repo>/issues` with `FORGE_TOKEN`.\n  - `POST /chat/action/pr-create` — body `{head, base, title, body}` → calls `/repos/<owner>/<repo>/pulls`.\n  - All actions record to #710's NDJSON history as `{role: \"action\", ...}` lines.\n- `docker/chat/ui/index.html` — small HTMX pattern: when claude's response contains a marker like `<action type=\"issue-create\">{...}</action>`, render a clickable button below the message; clicking POSTs to `/chat/action/<type>` with the payload.\n- `lib/generators.sh` chat env: pass `WOODPECKER_TOKEN`, `FORGE_TOKEN`, `FORGE_URL`, `FORGE_OWNER`, `FORGE_REPO`.\n\n### Out of scope\n\n- Destructive actions (branch delete, force push, secret rotation) — deliberately excluded.\n- Multi-step workflows / approval chains.\n- Arbitrary code execution in the chat container (that is what the agents exist for).\n\n## Acceptance\n\n- [ ] A chat turn that emits an `<action type=\"issue-create\">{...}</action>` block renders a button; clicking it creates an issue on Forgejo, visible via the API.\n- [ ] CI-trigger action creates a Woodpecker pipeline that can be seen in the CI UI.\n- [ ] PR-create action produces a Forgejo PR with the specified head / base.\n- [ ] All three actions are logged into the #710 history file with role `action` and the response from the API call.\n- [ ] Unauthenticated requests to `/chat/action/*` return 401 (inherits #708 gate).\n\n## Depends on\n\n- #708 (OAuth gate — actions are authorised by the logged-in user).\n- #710 (history — actions need to be logged alongside chat turns).\n\n## Notes\n\n- Forgejo API auth: the factory's `FORGE_TOKEN` is a long-lived admin token. For MVP, reuse it; a follow-up issue can scope it down to per-user Forgejo tokens derived from the OAuth flow.\n- Woodpecker API is at `http://woodpecker:8000/api/...`, reachable via the compose network — no need to go through the edge container.\n- The `<action>` marker is deliberately simple markup the model can emit in its response text. Do not implement tool-calling protocol; do not spin up an MCP server.\n\n## Boundaries for dev-agent\n\n- Do not give the claude subprocess direct API tokens. The chat backend holds them; the model only emits action markers the user clicks.\n- Do not add destructive actions (delete, force-push). Additive only.\n- Do not invent a new markup format beyond `<action type=\"...\">{JSON}</action>`.\n- Parent vision: #623.\n\n## Affected files\n\n- `docker/chat/server.py` (or `server.go`) — new `/chat/action/ci-run`, `/chat/action/issue-create`, `/chat/action/pr-create` endpoints\n- `docker/chat/ui/index.html` — action button rendering from `<action type=\"...\">{...}</action>` markers\n- `lib/generators.sh` — chat service env block: pass `WOODPECKER_TOKEN`, `FORGE_TOKEN`, `FORGE_URL`, `FORGE_OWNER`, `FORGE_REPO`\n"
  },
  {
-    "action": "edit_body",
-    "issue": 764,
-    "body": "Depends on: none (builds on existing docker-compose generation in `bin/disinto`)\n\n## Design\n\n`disinto init` + `disinto up` starts two additional containers as base factory infrastructure:\n\n### Edge proxy (Caddy)\n- Reverse proxies to Forgejo and Woodpecker\n- Serves staging site\n- Runs on ports 80/443\n- At bootstrap: IP-only, self-signed TLS or HTTP\n- Domain + Let's Encrypt added later via vault resource request\n\n### Staging container (Caddy)\n- Static file server for the project's staging artifacts\n- Starts with a default \"Nothing shipped yet\" page\n- CI pipelines write to a shared volume to update staging content\n- No vault approval needed — staging is the factory's sandbox\n\n### docker-compose addition\n```yaml\nservices:\n  edge:\n    image: caddy:alpine\n    ports:\n      - \"80:80\"\n      - \"443:443\"\n    volumes:\n      - ./Caddyfile:/etc/caddy/Caddyfile\n      - caddy_data:/data\n    depends_on:\n      - forgejo\n      - woodpecker-server\n      - staging\n\n  staging:\n    image: caddy:alpine\n    volumes:\n      - staging-site:/srv/site\n    # Not exposed directly — edge proxies to it\n\nvolumes:\n  caddy_data:\n  staging-site:\n```\n\n### Caddyfile (generated by `disinto init`)\n```\n# IP-only at bootstrap, domain added later\n:80 {\n    handle /forgejo/* {\n        reverse_proxy forgejo:3000\n    }\n    handle /ci/* {\n        reverse_proxy woodpecker-server:8000\n    }\n    handle {\n        reverse_proxy staging:80\n    }\n}\n```\n\n### Staging update flow\n1. CI builds artifact (site tarball, etc.)\n2. CI step writes to `staging-site` volume\n3. Staging container serves updated content immediately\n4. No restart needed — Caddy serves files directly\n\n### Domain lifecycle\n- Bootstrap: no domain, edge serves on IP\n- Later: factory files vault resource request for domain\n- Human buys domain, sets DNS\n- Caddyfile updated with domain, Let's Encrypt auto-provisions TLS\n\n## Affected files\n- `bin/disinto` — `generate_compose()` adds edge + staging services\n- New: default staging page (\"Nothing shipped yet\")\n- New: Caddyfile template in `docker/`\n\n## Related\n- #755 — vault-gated deployment promotion (production comes later)\n- #757 — ops repo (domain is a resource requested through vault)\n\n## Acceptance criteria\n- [ ] `disinto init` generates a `docker-compose.yml` that includes `edge` (Caddy) and `staging` containers\n- [ ] Edge proxy routes `/forgejo/*` → Forgejo, `/ci/*` → Woodpecker, default → staging container\n- [ ] Staging container serves a default \"Nothing shipped yet\" page on first boot\n- [ ] `docker/` directory contains a Caddyfile template generated by `disinto init`\n- [ ] `disinto up` starts all containers including edge and staging without manual steps"
-  },
-  {
-    "action": "edit_body",
-    "issue": 761,
-    "body": "Depends on: #747\n\n## Design\n\nEach agent account on the bundled Forgejo gets a `.profile` repo. This repo holds the agent's formula (copied from disinto at creation time) and its journal.\n\n### Structure\n```\n{agent-bot}/.profile/\n├── formula.toml        # snapshot of the formula at agent creation time\n├── journal/            # daily logs of what the agent did\n│   ├── 2026-03-26.md\n│   └── ...\n└── knowledge/          # learned patterns, best-practices (optional, agent can evolve)\n```\n\n### Lifecycle\n1. **Create agent** — `disinto init` or `disinto spawn-agent` creates Forgejo account + `.profile` repo\n2. **Copy formula** — current `formulas/{role}.toml` from disinto repo is copied to `.profile/formula.toml`\n3. **Agent reads its own formula** — at session start, agent reads from its `.profile`, not from the disinto repo\n4. **Agent writes journal** — daily entries pushed to `.profile/journal/`\n5. **Agent can evolve knowledge** — best-practices, heuristics, patterns written to `.profile/knowledge/`\n\n### What this enables\n\n**A/B testing formulas:** Create two agents from different formula versions, run both against the same backlog, compare results (cycle time, CI pass rate, review rejection rate).\n\n**Rollback:** New formula worse? Kill agent, spawn from older formula version.\n\n**Audit:** What formula was this agent running when it produced that PR? Check its `.profile` at that git commit.\n\n**Drift tracking:** Diff what an agent learned (`.profile/knowledge/`) vs what it started with. Measure formula evolution over time.\n\n**Portability:** Move agent to different box — `git clone` its `.profile`.\n\n### Disinto repo becomes the template\n\n```\ndisinto repo:\n  formulas/dev-agent.toml       ← canonical template, evolves\n  formulas/review-agent.toml\n  formulas/planner.toml\n  ...\n\nRunning agents:\n  dev-bot-v2/.profile/formula.toml   ← snapshot from formulas/dev-agent.toml@v2\n  dev-bot-v3/.profile/formula.toml   ← snapshot from formulas/dev-agent.toml@v3\n  review-bot/.profile/formula.toml   ← snapshot from formulas/review-agent.toml\n```\n\nThe formula in the disinto repo is the template. The `.profile` copy is the instance. They can diverge — that's a feature, not a bug.\n\n## Affected files\n- `bin/disinto` — agent creation copies formula to .profile\n- Agent session scripts — read formula from .profile instead of local formulas/ dir\n- Planner/supervisor — can read other agents' journals from their .profile repos\n\n## Related\n- #747 — per-agent Forgejo accounts (prerequisite)\n- #757 — ops repo (shared concerns stay there: vault, portfolio, resources)\n\n## Acceptance criteria\n- [ ] `disinto spawn-agent` (or `disinto init`) creates a Forgejo account + `.profile` repo for each agent bot\n- [ ] Current `formulas/{role}.toml` is copied to `.profile/formula.toml` at agent creation time\n- [ ] Agent session script reads its formula from `.profile/formula.toml`, not from the repo's `formulas/` directory\n- [ ] Agent writes daily journal entries to `.profile/journal/YYYY-MM-DD.md`"
-  },
-  {
-    "action": "edit_body",
-    "issue": 742,
-    "body": "## Problem\n\n`gardener/recipes/*.toml` (4 files: cascade-rebase, chicken-egg-ci, flaky-test, shellcheck-violations) are an older pattern predating `formulas/*.toml`. Two systems for the same thing.\n\n## Fix\n\nMigrate any unique content from recipes to the gardener formula or to new formulas. Delete the recipes directory.\n\n## Affected files\n- `gardener/recipes/*.toml` — delete after migration\n- `formulas/run-gardener.toml` — absorb relevant content\n- Gardener scripts that reference recipes/\n\n## Acceptance criteria\n- [ ] Contents of `gardener/recipes/*.toml` are diff'd against `formulas/run-gardener.toml` — any unique content is migrated\n- [ ] `gardener/recipes/` directory is deleted\n- [ ] No scripts in `gardener/` reference the `recipes/` path after migration\n- [ ] ShellCheck passes on all modified scripts"
+    "action": "remove_label",
+    "issue": 712,
+    "label": "blocked"
  },
  {
    "action": "add_label",
-    "issue": 742,
+    "issue": 712,
    "label": "backlog"
  },
+  {
+    "action": "edit_body",
+    "issue": 707,
+    "body": "## Goal\n\nGive `disinto-chat` its own Claude identity mount so its OAuth refresh races cannot corrupt the factory agents' shared `~/.claude` credentials. Default to a separate `~/.claude-chat/` on the host; support `ANTHROPIC_API_KEY` as a fallback that skips OAuth entirely.\n\n## Why\n\n- #623 root-caused this: Claude Code's internal refresh lock in `~/.claude.lock` operates outside bind-mounted directories, so two containers sharing `~/.claude` can race during token refresh and invalidate each other. The factory has already had OAuth expiry incidents traced to multiple agents sharing credentials.\n- Scoping chat to its own identity dir means chat can be logged in as a different Anthropic account, or pinned to an API key, without touching agent credentials.\n\n## Scope\n\n### Files to touch\n\n- `lib/generators.sh` chat service block (from #705):\n  - Replace the throwaway named volume with `${CHAT_CLAUDE_DIR:-${HOME}/.claude-chat}:/home/chat/.claude-chat`.\n  - Env: `CLAUDE_CONFIG_DIR=/home/chat/.claude-chat/config`, `CLAUDE_CREDENTIALS_DIR=/home/chat/.claude-chat/config/credentials`.\n  - Conditional: if `ANTHROPIC_API_KEY` is set in `.env`, pass it through and **do not** mount `~/.claude-chat` at all (no credentials on disk in that mode).\n- `bin/disinto disinto_init()` — after #620's admin password prompt, add an optional prompt: `Use separate Anthropic identity for chat? (y/N)`. On yes, create `~/.claude-chat/` and invoke `claude login` in a subshell with `CLAUDE_CONFIG_DIR=~/.claude-chat/config`.\n- `lib/claude-config.sh` — factor out the existing `~/.claude` setup logic so a non-default `CLAUDE_CONFIG_DIR` is a first-class parameter. If it is already parameterised, just document it; if not, extract a helper `setup_claude_dir <dir>` and have the existing path call it with the default dir.\n- `docker/chat/Dockerfile` — declare `VOLUME /home/chat/.claude-chat`, set owner to the non-root chat user introduced in #706.\n\n### Out of scope\n\n- Cross-session lock coherence for multiple concurrent chat containers (single-chat-container assumption is fine for MVP).\n- Anthropic team / workspace support — single identity is enough.\n\n## Acceptance\n\n- [ ] Fresh `disinto init` with \"use separate chat identity\" answered yes creates `~/.claude-chat/` and logs in successfully.\n- [ ] With `ANTHROPIC_API_KEY=sk-ant-...` set in `.env`, chat starts without any `~/.claude-chat` mount (verified via `docker inspect disinto-chat`) and successfully completes a test prompt.\n- [ ] Running the factory agents AND chat simultaneously for 24h does not produce any OAuth refresh failures on either side (manual soak test — document result in PR).\n- [ ] `CLAUDE_CONFIG_DIR` and `CLAUDE_CREDENTIALS_DIR` inside the chat container resolve to `/home/chat/.claude-chat/config*`, not the shared factory path.\n\n## Depends on\n\n- #705 (chat scaffold).\n- #620 (admin password prompt — same init flow this adds a step to).\n\n## Notes\n\n- The factory's existing shared mount is `/var/lib/disinto/claude-shared` (see `lib/generators.sh:113,327,381,426`). Chat must NOT use this path.\n- `flock(\"${HOME}/.claude/session.lock\")` logic mentioned in #623 is load-bearing, not redundant — do not \"simplify\" it.\n- Prefer the API-key path for anyone running the factory on shared hardware; call this out in README updates.\n\n## Boundaries for dev-agent\n\n- Do not try to make chat share `~/.claude` with the agents \"just for convenience\". The whole point of this chunk is the opposite.\n- Do not add a third claude config dir. One for agents, one for chat, done.\n- Do not refactor `lib/claude-config.sh` beyond extracting a parameterised helper if needed.\n- Parent vision: #623.\n\n## Affected files\n\n- `lib/generators.sh` — chat service block: replace throwaway named volume with `${CHAT_CLAUDE_DIR:-${HOME}/.claude-chat}` bind mount; add `CLAUDE_CONFIG_DIR`/`CLAUDE_CREDENTIALS_DIR` env vars; skip mount when `ANTHROPIC_API_KEY` is set\n- `bin/disinto` — `disinto_init()`: add optional prompt for separate Anthropic identity for chat\n- `lib/claude-config.sh` — extract parameterized `setup_claude_dir <dir>` helper\n- `docker/chat/Dockerfile` — declare `VOLUME /home/chat/.claude-chat`, set owner to non-root chat user\n"
+  },
+  {
+    "action": "remove_label",
+    "issue": 707,
+    "label": "blocked"
+  },
  {
    "action": "add_label",
-    "issue": 741,
+    "issue": 707,
    "label": "backlog"
  }
 ]
--- a/knowledge/ci.md
+++ b/knowledge/ci.md
@ -0,0 +1,28 @@
+# CI/CD — Best Practices
+
+## CI Pipeline Issues (P2)
+
+When CI pipelines are stuck running >20min or pending >30min:
+
+### Investigation Steps
+1. Check pipeline status via Forgejo API:
+   ```bash
+   curl -sf -H "Authorization: token $FORGE_TOKEN" \
+     "$FORGE_API/pipelines?limit=50" | jq '.[] | {number, status, created}'
+   ```
+
+2. Check Woodpecker CI if configured:
+   ```bash
+   curl -sf -H "Authorization: Bearer $WOODPECKER_TOKEN" \
+     "$WOODPECKER_SERVER/api/repos/${WOODPECKER_REPO_ID}/pipelines?limit=10"
+   ```
+
+### Common Fixes
+- **Stuck pipeline**: Cancel via Forgejo API, retrigger
+- **Pending pipeline**: Check queue depth, scale CI runners
+- **Failed pipeline**: Review logs, fix failing test/step
+
+### Prevention
+- Set timeout limits on CI pipelines
+- Monitor runner capacity and scale as needed
+- Use caching for dependencies to reduce build time
--- a/knowledge/dev-agent.md
+++ b/knowledge/dev-agent.md
@ -0,0 +1,28 @@
+# Dev Agent — Best Practices
+
+## Dev Agent Issues (P2)
+
+When dev-agent is stuck, blocked, or in bad state:
+
+### Dead Lock File
+```bash
+# Check if process still exists
+ps -p $(cat /path/to/lock.file) 2>/dev/null || rm -f /path/to/lock.file
+```
+
+### Stale Worktree Cleanup
+```bash
+cd "$PROJECT_REPO_ROOT"
+git worktree remove --force /tmp/stale-worktree 2>/dev/null || true
+git worktree prune 2>/dev/null || true
+```
+
+### Blocked Pipeline
+- Check if PR is awaiting review or CI
+- Verify no other agent is actively working on same issue
+- Check for unmet dependencies (issues with `Depends on` refs)
+
+### Prevention
+- Concurrency bounded per LLM backend (AD-002)
+- Clear lock files in EXIT traps
+- Use phase files to track agent state
--- a/knowledge/disk.md
+++ b/knowledge/disk.md
@ -0,0 +1,35 @@
+# Disk Management — Best Practices
+
+## Disk Pressure Response (P1)
+
+When disk usage exceeds 80%, take these actions in order:
+
+### Immediate Actions
+1. **Docker cleanup** (safe, low impact):
+   ```bash
+   sudo docker system prune -f
+   ```
+
+2. **Aggressive Docker cleanup** (if still >80%):
+   ```bash
+   sudo docker system prune -a -f
+   ```
+   This removes unused images in addition to containers/volumes.
+
+3. **Log rotation**:
+   ```bash
+   for f in "$FACTORY_ROOT"/{dev,review,supervisor,gardener,planner,predictor}/*.log; do
+     [ -f "$f" ] && [ "$(du -k "$f" | cut -f1)" -gt 10240 ] && truncate -s 0 "$f"
+   done
+   ```
+
+### Prevention
+- Monitor disk with alerts at 70% (warning) and 80% (critical)
+- Set up automatic log rotation for agent logs
+- Clean up old Docker images regularly
+- Consider using separate partitions for `/var/lib/docker`
+
+### When to Escalate
+- Disk stays >80% after cleanup (indicates legitimate growth)
+- No unused Docker images to clean
+- Critical data filling disk (check /home, /var/log)
--- a/knowledge/forge.md
+++ b/knowledge/forge.md
@ -0,0 +1,25 @@
+# Forgejo Operations — Best Practices
+
+## Forgejo Issues
+
+When Forgejo operations encounter issues:
+
+### API Rate Limits
+- Monitor rate limit headers in API responses
+- Implement exponential backoff on 429 responses
+- Use agent-specific tokens (#747) to increase limits
+
+### Authentication Issues
+- Verify FORGE_TOKEN is valid and not expired
+- Check agent identity matches token (#747)
+- Use FORGE_<AGENT>_TOKEN for agent-specific identities
+
+### Repository Access
+- Verify FORGE_REMOTE matches actual git remote
+- Check token has appropriate permissions (repo, write)
+- Use `resolve_forge_remote()` to auto-detect remote
+
+### Prevention
+- Set up monitoring for API failures
+- Rotate tokens before expiry
+- Document required permissions per agent
--- a/knowledge/git.md
+++ b/knowledge/git.md
@ -0,0 +1,28 @@
+# Git State Recovery — Best Practices
+
+## Git State Issues (P2)
+
+When git repo is on wrong branch or in broken rebase state:
+
+### Wrong Branch Recovery
+```bash
+cd "$PROJECT_REPO_ROOT"
+git checkout "$PRIMARY_BRANCH" 2>/dev/null || git checkout master 2>/dev/null
+```
+
+### Broken Rebase Recovery
+```bash
+cd "$PROJECT_REPO_ROOT"
+git rebase --abort 2>/dev/null || true
+git checkout "$PRIMARY_BRANCH" 2>/dev/null || git checkout master 2>/dev/null
+```
+
+### Stale Lock File Cleanup
+```bash
+rm -f /path/to/stale.lock
+```
+
+### Prevention
+- Always checkout primary branch after rebase conflicts
+- Remove lock files after agent sessions complete
+- Use `git status` to verify repo state before operations
--- a/knowledge/memory.md
+++ b/knowledge/memory.md
@ -0,0 +1,27 @@
+# Memory Management — Best Practices
+
+## Memory Crisis Response (P0)
+
+When RAM available drops below 500MB or swap usage exceeds 3GB, take these actions:
+
+### Immediate Actions
+1. **Kill stale claude processes** (>3 hours old):
+   ```bash
+   pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
+   ```
+
+2. **Drop filesystem caches**:
+   ```bash
+   sync && echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null 2>&1 || true
+   ```
+
+### Prevention
+- Set memory_guard to 2000MB minimum (default in env.sh)
+- Configure swap usage alerts at 2GB
+- Monitor for memory leaks in long-running processes
+- Use cgroups for process memory limits
+
+### When to Escalate
+- RAM stays <500MB after cache drop
+- Swap continues growing after process kills
+- System becomes unresponsive (OOM killer active)
--- a/knowledge/review-agent.md
+++ b/knowledge/review-agent.md
@ -0,0 +1,23 @@
+# Review Agent — Best Practices
+
+## Review Agent Issues
+
+When review agent encounters issues with PRs:
+
+### Stale PR Handling
+- PRs stale >20min (CI done, no push since) → file vault item for dev-agent
+- Do NOT push branches or attempt merges directly
+- File vault item with:
+  - What: Stale PR requiring push
+  - Why: Factory degraded
+  - Unblocks: dev-agent will push the branch
+
+### Circular Dependencies
+- Check backlog for issues with circular `Depends on` refs
+- Use `lib/parse-deps.sh` to analyze dependency graph
+- Report to planner for resolution
+
+### Prevention
+- Review agent only reads PRs, never modifies
+- Use vault items for actions requiring dev-agent
+- Monitor for PRs stuck in review state
--- a/lib/AGENTS.md
+++ b/lib/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: f32707ba659de278a3af434e3549fb8a8dce9d3a -->
+<!-- last-reviewed: 4fcbca1bef23734d05a9fc97bb56cd0a6bbcd25f -->
 # Shared Helpers (`lib/`)

 All agents source `lib/env.sh` as their first action. Additional helpers are
@ -6,20 +6,30 @@ sourced as needed.

 | File | What it provides | Sourced by |
 |---|---|---|
-| `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`FORGE_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `forge_api()`, `forge_api_all()` (accepts optional second TOKEN parameter, defaults to `$FORGE_TOKEN`), `woodpecker_api()`, `wpdb()`, `memory_guard()` (skips agent if RAM < threshold). Auto-loads project TOML if `PROJECT_TOML` is set. Exports per-agent tokens (`FORGE_PLANNER_TOKEN`, `FORGE_GARDENER_TOKEN`, `FORGE_VAULT_TOKEN`, `FORGE_SUPERVISOR_TOKEN`, `FORGE_PREDICTOR_TOKEN`) — each falls back to `$FORGE_TOKEN` if not set. **Vault-only token guard (AD-006)**: `unset GITHUB_TOKEN CLAWHUB_TOKEN` so agents never hold external-action tokens — only the runner container receives them. **Container note**: when `DISINTO_CONTAINER=1`, `.env` is NOT re-sourced — compose already injects env vars (including `FORGE_URL=http://forgejo:3000`) and re-sourcing would clobber them. | Every agent |
-| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \<reason>" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status <sha>` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number <sha>` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote <repo_id> <pipeline_num> <environment>` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). | dev-poll, review-poll, review-pr, supervisor-poll |
+| `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`FORGE_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `forge_api()`, `forge_api_all()` (paginates all pages; accepts optional second TOKEN parameter, defaults to `$FORGE_TOKEN`; handles invalid/empty JSON responses gracefully — returns empty on parse error instead of crashing), `woodpecker_api()`, `wpdb()`, `memory_guard()` (skips agent if RAM < threshold). Auto-loads project TOML if `PROJECT_TOML` is set. Exports per-agent tokens (`FORGE_PLANNER_TOKEN`, `FORGE_GARDENER_TOKEN`, `FORGE_VAULT_TOKEN`, `FORGE_SUPERVISOR_TOKEN`, `FORGE_PREDICTOR_TOKEN`) — each falls back to `$FORGE_TOKEN` if not set. **Vault-only token guard (AD-006)**: `unset GITHUB_TOKEN CLAWHUB_TOKEN` so agents never hold external-action tokens — only the runner container receives them. **Container note**: when `DISINTO_CONTAINER=1`, `.env` is NOT re-sourced — compose already injects env vars (including `FORGE_URL=http://forgejo:3000`) and re-sourcing would clobber them. **Save/restore scope (#364)**: only `FORGE_URL` is preserved across `.env` re-sourcing (compose injects `http://forgejo:3000`, `.env` has `http://localhost:3000`). `FORGE_TOKEN` is NOT preserved so refreshed tokens in `.env` take effect immediately. **Required env var**: `FORGE_PASS` — bot password for git HTTP push (Forgejo 11.x rejects API tokens for `git push`, #361). **Hard preconditions (#674)**: `USER` and `HOME` must be exported by the entrypoint before sourcing. When `PROJECT_TOML` is set, `PROJECT_REPO_ROOT`, `PRIMARY_BRANCH`, and `OPS_REPO_ROOT` must also be set (by entrypoint or TOML). | Every agent |
+| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \<reason>" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status <sha>` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number <sha>` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote <repo_id> <pipeline_num> <environment>` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). `ci_get_logs <pipeline_number> [--step <name>]` — reads CI logs from Woodpecker SQLite database via `lib/ci-log-reader.py`; outputs last 200 lines to stdout. Requires mounted woodpecker-data volume at /woodpecker-data. | dev-poll, review-poll, review-pr |
 | `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) |
-| `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `FORGE_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, mirror config, etc.). | env.sh (when `PROJECT_TOML` is set), supervisor-poll (per-project iteration) |
-| `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` / `blocked by #N` patterns. Inline scan skips fenced code blocks to prevent false positives from code examples in issue bodies. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll, supervisor-poll |
-| `lib/formula-session.sh` | `acquire_cron_lock()`, `check_memory()`, `load_formula()`, `build_context_block()`, `consume_escalation_reply()`, `start_formula_session()`, `formula_phase_callback()`, `build_prompt_footer()`, `build_graph_section()`, `run_formula_and_monitor(AGENT [TIMEOUT] [CALLBACK])` — shared helpers for formula-driven cron agents (lock, memory guard, formula loading, prompt assembly, tmux session, monitor loop, crash recovery). `build_graph_section()` generates the structural-analysis section (runs `lib/build-graph.py`, formats JSON output) — previously duplicated in planner-run.sh and predictor-run.sh, now shared here. `formula_phase_callback()` handles `PHASE:escalate` (unified escalation path — kills the session). `run_formula_and_monitor` accepts an optional CALLBACK (default: `formula_phase_callback`) so callers can install custom merge-through or escalation handlers. `cleanup_stale_crashed_worktrees()` — thin wrapper around `worktree_cleanup_stale()` from `lib/worktree.sh` (kept for backwards compatibility). | planner-run.sh, predictor-run.sh, gardener-run.sh, supervisor-run.sh, dev-agent.sh |
-| `lib/guard.sh` | `check_active(agent_name)` — reads `$FACTORY_ROOT/state/.{agent_name}-active`; exits 0 (skip) if the file is absent. Factory is off by default — state files must be created to enable each agent. **Logs a message to stderr** when skipping (`[check_active] SKIP: state file not found`), so agent dropout is visible in cron logs. Sourced by dev-poll.sh, review-poll.sh, predictor-run.sh, supervisor-run.sh. | cron entry points |
-| `lib/mirrors.sh` | `mirror_push()` — pushes `$PRIMARY_BRANCH` + tags to all configured mirror remotes (fire-and-forget background pushes). Reads `MIRROR_NAMES` and `MIRROR_*` vars exported by `load-project.sh` from the `[mirrors]` TOML section. Failures are logged but never block the pipeline. Sourced by dev-poll.sh and dev/phase-handler.sh — called after every successful merge. | dev-poll.sh, phase-handler.sh |
+| `lib/ci-log-reader.py` | Python tool: reads CI logs from Woodpecker SQLite database. `<pipeline_number> [--step <name>]` — returns last 200 lines from failed steps (or specified step). Used by `ci_get_logs()` in ci-helpers.sh. Requires `WOODPECKER_DATA_DIR` (default: /woodpecker-data). | ci-helpers.sh |
+| `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `FORGE_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, mirror config, etc.). Also exports `FORGE_REPO_OWNER` (the owner component of `FORGE_REPO`, e.g. `disinto-admin` from `disinto-admin/disinto`). Reads `repo_root` and `ops_repo_root` from the TOML for host-CLI callers. **Container path handling (#674)**: no longer derives `PROJECT_REPO_ROOT` or `OPS_REPO_ROOT` inside the script — container entrypoints export the correct paths before agent scripts source `env.sh`, and the `DISINTO_CONTAINER` guard (line 90) skips TOML overrides when those vars are already set. | env.sh (when `PROJECT_TOML` is set) |
+| `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` / `blocked by #N` patterns. Inline scan skips fenced code blocks to prevent false positives from code examples in issue bodies. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll |
+| `lib/formula-session.sh` | `acquire_run_lock()`, `load_formula()`, `load_formula_or_profile()`, `build_context_block()`, `ensure_ops_repo()`, `ops_commit_and_push()`, `build_prompt_footer()`, `build_sdk_prompt_footer()`, `formula_worktree_setup()`, `formula_prepare_profile_context()`, `formula_lessons_block()`, `profile_write_journal()`, `profile_load_lessons()`, `ensure_profile_repo()`, `_profile_has_repo()`, `_count_undigested_journals()`, `_profile_digest_journals()`, `_profile_restore_lessons()`, `_profile_commit_and_push()`, `resolve_agent_identity()`, `build_graph_section()`, `build_scratch_instruction()`, `read_scratch_context()`, `cleanup_stale_crashed_worktrees()` — shared helpers for formula-driven polling-loop agents (lock, .profile repo management, prompt assembly, worktree setup). Memory guard is provided by `memory_guard()` in `lib/env.sh` (not duplicated here). `resolve_agent_identity()` — sets `FORGE_TOKEN`, `AGENT_IDENTITY`, `FORGE_REMOTE` from per-agent token env vars and FORGE_URL remote detection. `build_graph_section()` generates the structural-analysis section (runs `lib/build-graph.py`, formats JSON output) — previously duplicated in planner-run.sh and predictor-run.sh, now shared here. `cleanup_stale_crashed_worktrees()` — thin wrapper around `worktree_cleanup_stale()` from `lib/worktree.sh` (kept for backwards compatibility). **Journal digestion guards (#702)**: `_profile_digest_journals()` respects `PROFILE_DIGEST_TIMEOUT` (default 300s) and `PROFILE_DIGEST_MAX_BATCH` (default 5 journals per run); `_profile_restore_lessons()` restores the previous lessons-learned.md on digest failure. | planner-run.sh, predictor-run.sh, gardener-run.sh, supervisor-run.sh, dev-agent.sh |
+| `lib/guard.sh` | `check_active(agent_name)` — reads `$FACTORY_ROOT/state/.{agent_name}-active`; exits 0 (skip) if the file is absent. Factory is off by default — state files must be created to enable each agent. **Logs a message to stderr** when skipping (`[check_active] SKIP: state file not found`), so agent dropout is visible in loop logs. Sourced by dev-poll.sh, review-poll.sh, predictor-run.sh, supervisor-run.sh. | polling-loop entry points |
+| `lib/mirrors.sh` | `mirror_push()` — pushes `$PRIMARY_BRANCH` + tags to all configured mirror remotes (fire-and-forget background pushes). Reads `MIRROR_NAMES` and `MIRROR_*` vars exported by `load-project.sh` from the `[mirrors]` TOML section. Failures are logged but never block the pipeline. Sourced by dev-poll.sh — called after every successful merge. | dev-poll.sh |
 | `lib/build-graph.py` | Python tool: parses VISION.md, prerequisites.md (from ops repo), AGENTS.md, formulas/*.toml, evidence/ (from ops repo), and forge issues/labels into a NetworkX DiGraph. Runs structural analyses (orphaned objectives, stale prerequisites, thin evidence, circular deps) and outputs a JSON report. Used by `review-pr.sh` (per-PR changed-file analysis) and `predictor-run.sh` (full-project analysis) to provide structural context to Claude. | review-pr.sh, predictor-run.sh |
-| `lib/secret-scan.sh` | `scan_for_secrets()` — detects potential secrets (API keys, bearer tokens, private keys, URLs with embedded credentials) in text; returns 1 if secrets found. `redact_secrets()` — replaces detected secret patterns with `[REDACTED]`. | file-action-issue.sh, phase-handler.sh |
-| `lib/file-action-issue.sh` | `file_action_issue()` — dedup check, secret scan, label lookup, and issue creation for formula-driven cron wrappers. Sets `FILED_ISSUE_NUM` on success. Returns 4 if secrets detected in body. | (available for future use) |
+| `lib/secret-scan.sh` | `scan_for_secrets()` — detects potential secrets (API keys, bearer tokens, private keys, URLs with embedded credentials) in text; returns 1 if secrets found. `redact_secrets()` — replaces detected secret patterns with `[REDACTED]`. | issue-lifecycle.sh |
+| `lib/stack-lock.sh` | File-based lock protocol for singleton project stack access. `stack_lock_acquire(holder, project)` — polls until free, breaks stale heartbeats (>10 min old), claims lock. `stack_lock_release(project)` — deletes lock file. `stack_lock_check(project)` — inspect current lock state. `stack_lock_heartbeat(project)` — update heartbeat timestamp (callers must call every 2 min while holding). Lock files at `~/data/locks/<project>-stack.lock`. | docker/edge/dispatcher.sh, reproduce formula |
 | `lib/tea-helpers.sh` | `tea_file_issue(title, body, labels...)` — create issue via tea CLI with secret scanning; sets `FILED_ISSUE_NUM`. `tea_relabel(issue_num, labels...)` — replace labels using tea's `edit` subcommand (not `label`). `tea_comment(issue_num, body)` — add comment with secret scanning. `tea_close(issue_num)` — close issue. All use `TEA_LOGIN` and `FORGE_REPO` from env.sh. Labels by name (no ID lookup). Tea binary download verified via sha256 checksum. Sourced by env.sh when `tea` binary is available. | env.sh (conditional) |
 | `lib/worktree.sh` | Reusable git worktree management: `worktree_create(path, branch, [base_ref])` — create worktree, checkout base, fetch submodules. `worktree_recover(path, branch, [remote])` — detect existing worktree, reuse if on correct branch (sets `_WORKTREE_REUSED`), otherwise clean and recreate. `worktree_cleanup(path)` — `git worktree remove --force`, clear Claude Code project cache (`~/.claude/projects/` matching path). `worktree_cleanup_stale([max_age_hours])` — scan `/tmp` for orphaned worktrees older than threshold, skip preserved and active tmux worktrees, prune. `worktree_preserve(path, reason)` — mark worktree as preserved for debugging (writes `.worktree-preserved` marker, skipped by stale cleanup). | dev-agent.sh, supervisor-run.sh, planner-run.sh, predictor-run.sh, gardener-run.sh |
 | `lib/pr-lifecycle.sh` | Reusable PR lifecycle library: `pr_create()`, `pr_find_by_branch()`, `pr_poll_ci()`, `pr_poll_review()`, `pr_merge()`, `pr_is_merged()`, `pr_walk_to_merge()`, `build_phase_protocol_prompt()`. Requires `lib/ci-helpers.sh`. | dev-agent.sh (future) |
 | `lib/issue-lifecycle.sh` | Reusable issue lifecycle library: `issue_claim()` (add in-progress, remove backlog), `issue_release()` (remove in-progress, add backlog), `issue_block()` (post diagnostic comment with secret redaction, add blocked label), `issue_close()`, `issue_check_deps()` (parse deps, check transitive closure; sets `_ISSUE_BLOCKED_BY`, `_ISSUE_SUGGESTION`), `issue_suggest_next()` (find next unblocked backlog issue; sets `_ISSUE_NEXT`), `issue_post_refusal()` (structured refusal comment with dedup). Label IDs cached in globals on first lookup. Sources `lib/secret-scan.sh`. | dev-agent.sh (future) |
-| `lib/agent-session.sh` | Shared tmux + Claude session helpers: `create_agent_session()`, `inject_formula()`, `agent_wait_for_claude_ready()`, `agent_inject_into_session()`, `agent_kill_session()`, `monitor_phase_loop()`, `read_phase()`, `write_compact_context()`. `create_agent_session(session, workdir, [phase_file])` optionally installs a PostToolUse hook (matcher `Bash\|Write`) that detects phase file writes in real-time — when Claude writes to the phase file, the hook writes a marker so `monitor_phase_loop` reacts on the next poll instead of waiting for mtime changes. Also installs a StopFailure hook (matcher `rate_limit\|server_error\|authentication_failed\|billing_error`) that writes `PHASE:failed` with an `api_error` reason to the phase file and touches the phase-changed marker, so the orchestrator discovers API errors within one poll cycle instead of waiting for idle timeout. Also installs a SessionStart hook (matcher `compact`) that re-injects phase protocol instructions after context compaction — callers write the context file via `write_compact_context(phase_file, content)`, and the hook (`on-compact-reinject.sh`) outputs the file content to stdout so Claude retains critical instructions. When `phase_file` is set, passes it to the idle stop hook (`on-idle-stop.sh`) so the hook can **nudge Claude** (up to 2 times) if Claude returns to the prompt without writing to the phase file — the hook injects a tmux reminder asking Claude to signal PHASE:done or PHASE:awaiting_ci. The PreToolUse guard hook (`on-pretooluse-guard.sh`) receives the session name as a third argument — formula agents (`gardener-*`, `planner-*`, `predictor-*`, `supervisor-*`) are identified this way and allowed to access `FACTORY_ROOT` from worktrees (they need env.sh, AGENTS.md, formulas/, lib/). **OAuth flock**: when `DISINTO_CONTAINER=1`, Claude CLI is wrapped in `flock -w 300 ~/.claude/session.lock` to queue concurrent token refresh attempts and prevent rotation races across agents sharing the same credentials. `monitor_phase_loop` sets `_MONITOR_LOOP_EXIT` to one of: `done`, `idle_timeout`, `idle_prompt` (Claude returned to `>` for 3 consecutive polls without writing any phase — callback invoked with `PHASE:failed`, session already dead), `crashed`, or `PHASE:escalate` / other `PHASE:*` string. **Unified escalation**: `PHASE:escalate` is the signal that a session needs human input (renamed from `PHASE:needs_human`). **Callers must handle `idle_prompt`** in both their callback and their post-loop exit handler — see [`docs/PHASE-PROTOCOL.md` idle_prompt](docs/PHASE-PROTOCOL.md#idle_prompt-exit-reason) for the full contract. | dev-agent.sh |
-| `lib/vault.sh` | **Vault PR helper** — create vault action PRs on ops repo via Forgejo API (works from containers without SSH). `vault_request <action_id> <toml_content>` validates TOML (using `validate_vault_action` from `vault/vault-env.sh`), creates branch `vault/<action-id>`, writes `vault/actions/<action-id>.toml`, creates PR targeting `main` with title `vault: <action-id>` and body from context field, returns PR number. Idempotent: if PR exists, returns existing number. Requires `FORGE_TOKEN`, `FORGE_URL`, `FORGE_REPO`, `FORGE_OPS_REPO`. Uses the calling agent's own token (saves/restores `FORGE_TOKEN` around sourcing `vault-env.sh`), so approval workflow respects individual agent identities. | dev-agent (vault actions), future vault dispatcher |
+| `lib/vault.sh` | **Vault PR helper** — create vault action PRs on ops repo via Forgejo API (works from containers without SSH). `vault_request <action_id> <toml_content>` validates TOML (using `validate_vault_action` from `vault/vault-env.sh`), creates branch `vault/<action-id>`, writes `vault/actions/<action-id>.toml`, creates PR targeting `main` with title `vault: <action-id>` and body from context field, returns PR number. Idempotent: if PR exists, returns existing number. **Low-tier bypass**: if the action's `blast_radius` classifies as `low` (via `vault/classify.sh`), `vault_request` calls `_vault_commit_direct()` which commits directly to ops `main` using `FORGE_ADMIN_TOKEN` — no PR, no approval wait. Returns `0` (not a PR number) for direct commits. Requires `FORGE_TOKEN`, `FORGE_ADMIN_TOKEN` (low-tier only), `FORGE_URL`, `FORGE_REPO`, `FORGE_OPS_REPO`. Uses the calling agent's own token (saves/restores `FORGE_TOKEN` around sourcing `vault-env.sh`), so approval workflow respects individual agent identities. | dev-agent (vault actions), future vault dispatcher |
+| `lib/branch-protection.sh` | Branch protection helpers for Forgejo repos. `setup_vault_branch_protection()` — configures admin-only merge protection on main (require 1 approval, restrict merge to admin role, block direct pushes). `setup_profile_branch_protection()` — same protection for `.profile` repos. `verify_branch_protection()` — checks protection is correctly configured. `remove_branch_protection()` — removes protection (cleanup/testing). Handles race condition after initial push: retries with backoff if Forgejo hasn't processed the branch yet. Requires `FORGE_TOKEN`, `FORGE_URL`, `FORGE_OPS_REPO`. | bin/disinto (hire-an-agent) |
+| `lib/agent-sdk.sh` | `agent_run([--resume SESSION_ID] [--worktree DIR] PROMPT)` — one-shot `claude -p` invocation with session persistence. Saves session ID to `SID_FILE`, reads it back on resume. `agent_recover_session()` — restore previous session ID from `SID_FILE` on startup. **Nudge guard**: skips nudge injection if the worktree is clean and no push is expected, preventing spurious re-invocations. Callers must define `SID_FILE`, `LOGFILE`, and `log()` before sourcing. **Concurrency**: external `flock` on `session.lock` is gated behind `CLAUDE_EXTERNAL_LOCK=1` (default off). When unset, each container's per-session `CLAUDE_CONFIG_DIR` isolation lets Claude Code's native lockfile handle OAuth refresh — no external serialization needed. Set `CLAUDE_EXTERNAL_LOCK=1` to re-enable the old flock wrapper as a rollback mechanism. See [`docs/CLAUDE-AUTH-CONCURRENCY.md`](../docs/CLAUDE-AUTH-CONCURRENCY.md) and AD-002 (#647). | formula-driven agents (dev-agent, planner-run, predictor-run, gardener-run) |
+| `lib/forge-setup.sh` | `setup_forge()` — Forgejo instance provisioning: creates admin user, bot accounts, org, repos (code + ops), configures webhooks, sets repo topics. Extracted from `bin/disinto`. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`. **Password storage (#361)**: after creating each bot account, stores its password in `.env` as `FORGE_<BOT>_PASS` (e.g. `FORGE_PASS`, `FORGE_REVIEW_PASS`, etc.) for use by `forge-push.sh`. | bin/disinto (init) |
+| `lib/forge-push.sh` | `push_to_forge()` — pushes a local clone to the Forgejo remote and verifies the push. `_assert_forge_push_globals()` validates required env vars before use. Requires `FORGE_URL`, `FORGE_PASS`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. **Auth**: uses `FORGE_PASS` (bot password) for git HTTP push — Forgejo 11.x rejects API tokens for `git push` (#361). | bin/disinto (init) |
+| `lib/git-creds.sh` | Shared git credential helper configuration. `configure_git_creds([HOME_DIR] [RUN_AS_CMD])` — writes a static credential helper script and configures git globally to use password-based HTTP auth (Forgejo 11.x rejects API tokens for `git push`, #361). `repair_baked_cred_urls([--as RUN_AS_CMD] DIR ...)` — rewrites any git remote URLs that have credentials baked in to use clean URLs instead; uses `safe.directory` bypass for root-owned repos (#671). Requires `FORGE_PASS`, `FORGE_URL`, `FORGE_TOKEN`. | entrypoints (agents, edge) |
+| `lib/ops-setup.sh` | `setup_ops_repo()` — creates ops repo on Forgejo if it doesn't exist, configures bot collaborators, clones/initializes ops repo locally, seeds directory structure (vault, knowledge, evidence, sprints). Evidence subdirectories seeded: engagement/, red-team/, holdout/, evolution/, user-test/. Also seeds sprints/ for architect output. Exports `_ACTUAL_OPS_SLUG`. `migrate_ops_repo(ops_root, [primary_branch])` — idempotent migration helper that seeds missing directories and .gitkeep files on existing ops repos (pre-#407 deployments). | bin/disinto (init) |
+| `lib/ci-setup.sh` | `_install_cron_impl()` — installs crontab entries for bare-metal deployments (compose mode uses polling loop instead). `_create_forgejo_oauth_app()` — generic helper to create an OAuth2 app on Forgejo (shared by Woodpecker and chat). `_create_woodpecker_oauth_impl()` — creates Woodpecker OAuth2 app (thin wrapper). `_create_chat_oauth_impl()` — creates disinto-chat OAuth2 app, writes `CHAT_OAUTH_CLIENT_ID`/`CHAT_OAUTH_CLIENT_SECRET` to `.env` (#708). `_generate_woodpecker_token_impl()` — auto-generates WOODPECKER_TOKEN via OAuth2 flow. `_activate_woodpecker_repo_impl()` — activates repo in Woodpecker. All gated by `_load_ci_context()` which validates required env vars. | bin/disinto (init) |
+| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
+| `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) |
+| `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) |
--- a/lib/agent-sdk.sh
+++ b/lib/agent-sdk.sh
@ -27,6 +27,96 @@ agent_recover_session() {
  fi
 }

+# claude_run_with_watchdog — run claude with idle-after-final-message watchdog
+#
+# Mitigates upstream Claude Code hang (#591) by detecting when the final
+# assistant message has been written and terminating the process after a
+# short grace period instead of waiting for CLAUDE_TIMEOUT.
+#
+# The watchdog:
+#   1. Streams claude stdout to a temp file
+#   2. Polls for the final result marker ("type":"result" for stream-json
+#      or closing } for regular json output)
+#   3. After detecting the final marker, starts a CLAUDE_IDLE_GRACE countdown
+#   4. SIGTERM claude if it hasn't exited cleanly within the grace period
+#   5. Falls back to CLAUDE_TIMEOUT as the absolute hard ceiling
+#
+# Usage: claude_run_with_watchdog claude [args...]
+# Expects: LOGFILE, CLAUDE_TIMEOUT, CLAUDE_IDLE_GRACE (default 30)
+# Returns: exit code from claude or timeout
+claude_run_with_watchdog() {
+  local -a cmd=("$@")
+  local out_file pid grace_pid rc
+
+  # Create temp file for stdout capture
+  out_file=$(mktemp) || return 1
+  trap 'rm -f "$out_file"' RETURN
+
+  # Start claude in background, capturing stdout to temp file
+  "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
+  pid=$!
+
+  # Background watchdog: poll for final result marker
+  (
+    local grace="${CLAUDE_IDLE_GRACE:-30}"
+    local detected=0
+
+    while kill -0 "$pid" 2>/dev/null; do
+      # Check for stream-json result marker first (more reliable)
+      if grep -q '"type":"result"' "$out_file" 2>/dev/null; then
+        detected=1
+        break
+      fi
+      # Fallback: check for closing brace of top-level result object
+      if tail -c 100 "$out_file" 2>/dev/null | grep -q '}[[:space:]]*$'; then
+        # Verify it looks like a JSON result (has session_id or result key)
+        if grep -qE '"(session_id|result)":' "$out_file" 2>/dev/null; then
+          detected=1
+          break
+        fi
+      fi
+      sleep 2
+    done
+
+    # If we detected a final message, wait grace period then kill if still running
+    if [ "$detected" -eq 1 ] && kill -0 "$pid" 2>/dev/null; then
+      log "watchdog: final result detected, ${grace}s grace period before SIGTERM"
+      sleep "$grace"
+      if kill -0 "$pid" 2>/dev/null; then
+        log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
+        kill -TERM "$pid" 2>/dev/null || true
+        # Give it a moment to clean up
+        sleep 5
+        if kill -0 "$pid" 2>/dev/null; then
+          log "watchdog: force kill after SIGTERM timeout"
+          kill -KILL "$pid" 2>/dev/null || true
+        fi
+      fi
+    fi
+  ) &
+  grace_pid=$!
+
+  # Hard ceiling timeout (existing behavior) — use tail --pid to wait for process
+  timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null
+  rc=$?
+
+  # Clean up the watchdog
+  kill "$grace_pid" 2>/dev/null || true
+  wait "$grace_pid" 2>/dev/null || true
+
+  # When timeout fires (rc=124), explicitly kill the orphaned claude process
+  # tail --pid is a passive waiter, not a supervisor
+  if [ "$rc" -eq 124 ]; then
+    kill "$pid" 2>/dev/null || true
+    sleep 1
+    kill -KILL "$pid" 2>/dev/null || true
+  fi
+
+  # Output the captured stdout
+  cat "$out_file"
+  return "$rc"
+}
+
 # agent_run — synchronous Claude invocation (one-shot claude -p)
 # Usage: agent_run [--resume SESSION_ID] [--worktree DIR] PROMPT
 # Sets: _AGENT_SESSION_ID (updated each call, persisted to SID_FILE)
@ -41,14 +131,36 @@ agent_run() {
  done
  local prompt="${1:-}"

+  _AGENT_LAST_OUTPUT=""
+
  local -a args=(-p "$prompt" --output-format json --dangerously-skip-permissions --max-turns 200)
  [ -n "$resume_id" ] && args+=(--resume "$resume_id")
  [ -n "${CLAUDE_MODEL:-}" ] && args+=(--model "$CLAUDE_MODEL")

  local run_dir="${worktree_dir:-$(pwd)}"
-  local output
+  local lock_file="${HOME}/.claude/session.lock"
+  local output rc
  log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})"
-  output=$(cd "$run_dir" && timeout "${CLAUDE_TIMEOUT:-7200}" claude "${args[@]}" 2>>"$LOGFILE") || true
+  # External flock is redundant once CLAUDE_CONFIG_DIR rollout is verified (#647).
+  # Gate behind CLAUDE_EXTERNAL_LOCK for rollback safety; default off.
+  if [ -n "${CLAUDE_EXTERNAL_LOCK:-}" ]; then
+    mkdir -p "$(dirname "$lock_file")"
+    output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude "${args[@]}" ) 9>"$lock_file" 2>>"$LOGFILE") && rc=0 || rc=$?
+  else
+    output=$(cd "$run_dir" && claude_run_with_watchdog claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$?
+  fi
+  if [ "$rc" -eq 124 ]; then
+    log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)"
+  elif [ "$rc" -ne 0 ]; then
+    log "agent_run: claude exited with code $rc"
+    # Log last 3 lines of output for diagnostics
+    if [ -n "$output" ]; then
+      log "agent_run: last output lines: $(echo "$output" | tail -3)"
+    fi
+  fi
+  if [ -z "$output" ]; then
+    log "agent_run: empty output (claude may have crashed or failed, exit code: $rc)"
+  fi

  # Extract and persist session_id
  local new_sid
@ -61,32 +173,48 @@ agent_run() {

  # Save output for diagnostics (no_push, crashes)
  _AGENT_LAST_OUTPUT="$output"
-  local diag_file="${DISINTO_LOG_DIR:-/tmp}/dev/agent-run-last.json"
+  local diag_dir="${DISINTO_LOG_DIR:-/tmp}/${LOG_AGENT:-dev}"
+  mkdir -p "$diag_dir" 2>/dev/null || true
+  local diag_file="${diag_dir}/agent-run-last.json"
  printf '%s' "$output" > "$diag_file" 2>/dev/null || true

  # Nudge: if the model stopped without pushing, resume with encouragement.
  # Some models emit end_turn prematurely when confused. A nudge often unsticks them.
-  if [ -n "$_AGENT_SESSION_ID" ]; then
+  if [ -n "$_AGENT_SESSION_ID" ] && [ -n "$output" ]; then
    local has_changes
    has_changes=$(cd "$run_dir" && git status --porcelain 2>/dev/null | head -1) || true
    local has_pushed
    has_pushed=$(cd "$run_dir" && git log --oneline "${FORGE_REMOTE:-origin}/${PRIMARY_BRANCH:-main}..HEAD" 2>/dev/null | head -1) || true
    if [ -z "$has_pushed" ]; then
-      local nudge="You stopped but did not push any code. "
      if [ -n "$has_changes" ]; then
-        nudge+="You have uncommitted changes. Commit them and push."
+        # Nudge: there are uncommitted changes
+        local nudge="You stopped but did not push any code. You have uncommitted changes. Commit them and push."
+        log "agent_run: nudging (uncommitted changes)"
+        local nudge_rc
+        if [ -n "${CLAUDE_EXTERNAL_LOCK:-}" ]; then
+          output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} ) 9>"$lock_file" 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
+        else
+          output=$(cd "$run_dir" && claude_run_with_watchdog claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
+        fi
+        if [ "$nudge_rc" -eq 124 ]; then
+          log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $nudge_rc)"
+        elif [ "$nudge_rc" -ne 0 ]; then
+          log "agent_run: nudge claude exited with code $nudge_rc"
+          # Log last 3 lines of output for diagnostics
+          if [ -n "$output" ]; then
+            log "agent_run: nudge last output lines: $(echo "$output" | tail -3)"
+          fi
+        fi
+        new_sid=$(printf '%s' "$output" | jq -r '.session_id // empty' 2>/dev/null) || true
+        if [ -n "$new_sid" ]; then
+          _AGENT_SESSION_ID="$new_sid"
+          printf '%s' "$new_sid" > "$SID_FILE"
+        fi
+        printf '%s' "$output" > "$diag_file" 2>/dev/null || true
+        _AGENT_LAST_OUTPUT="$output"
      else
-        nudge+="Complete the implementation, commit, and push your branch."
+        log "agent_run: no push and no changes — skipping nudge"
      fi
-      log "agent_run: nudging (no push detected)"
-      output=$(cd "$run_dir" && timeout "${CLAUDE_TIMEOUT:-7200}" claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") || true
-      new_sid=$(printf '%s' "$output" | jq -r '.session_id // empty' 2>/dev/null) || true
-      if [ -n "$new_sid" ]; then
-        _AGENT_SESSION_ID="$new_sid"
-        printf '%s' "$new_sid" > "$SID_FILE"
-      fi
-      printf '%s' "$output" > "$diag_file" 2>/dev/null || true
-      _AGENT_LAST_OUTPUT="$output"
    fi
  fi
 }
--- a/lib/agent-session.sh
+++ b/lib/agent-session.sh
@ -1,486 +0,0 @@
-#!/usr/bin/env bash
-# agent-session.sh — Shared tmux + Claude interactive session helpers
-#
-# Source this into agent orchestrator scripts for reusable session management.
-#
-# Functions:
-#   agent_wait_for_claude_ready SESSION_NAME [TIMEOUT_SECS]
-#   agent_inject_into_session   SESSION_NAME TEXT
-#   agent_kill_session          SESSION_NAME
-#   monitor_phase_loop          PHASE_FILE IDLE_TIMEOUT_SECS CALLBACK_FN [SESSION_NAME]
-#   session_lock_acquire        [TIMEOUT_SECS]
-#   session_lock_release
-
-# --- Cooperative session lock (fd-based) ---
-# File descriptor for the session lock. Set by create_agent_session().
-# Callers can release/re-acquire via session_lock_release/session_lock_acquire
-# to allow other Claude sessions during idle phases (awaiting_review/awaiting_ci).
-SESSION_LOCK_FD=""
-
-# Release the session lock without closing the file descriptor.
-# The fd stays open so it can be re-acquired later.
-session_lock_release() {
-  if [ -n "${SESSION_LOCK_FD:-}" ]; then
-    flock -u "$SESSION_LOCK_FD"
-  fi
-}
-
-# Re-acquire the session lock. Blocks until available or timeout.
-# Opens the lock fd if not already open (for use by external callers).
-# Args: [timeout_secs] (default 300)
-# Returns 0 on success, 1 on timeout/error.
-# shellcheck disable=SC2120  # timeout arg is used by external callers
-session_lock_acquire() {
-  local timeout="${1:-300}"
-  if [ -z "${SESSION_LOCK_FD:-}" ]; then
-    local lock_dir="${HOME}/.claude"
-    mkdir -p "$lock_dir"
-    exec {SESSION_LOCK_FD}>>"${lock_dir}/session.lock"
-  fi
-  flock -w "$timeout" "$SESSION_LOCK_FD"
-}
-
-# Wait for the Claude ❯ ready prompt in a tmux pane.
-# Returns 0 if ready within TIMEOUT_SECS (default 120), 1 otherwise.
-agent_wait_for_claude_ready() {
-  local session="$1"
-  local timeout="${2:-120}"
-  local elapsed=0
-  while [ "$elapsed" -lt "$timeout" ]; do
-    if tmux capture-pane -t "$session" -p 2>/dev/null | grep -q '❯'; then
-      return 0
-    fi
-    sleep 2
-    elapsed=$((elapsed + 2))
-  done
-  return 1
-}
-
-# Paste TEXT into SESSION (waits for Claude to be ready first), then press Enter.
-agent_inject_into_session() {
-  local session="$1"
-  local text="$2"
-  local tmpfile
-  # Re-acquire session lock before injecting — Claude will resume working
-  # shellcheck disable=SC2119  # using default timeout
-  session_lock_acquire || true
-  agent_wait_for_claude_ready "$session" 120 || true
-  # Clear idle marker — new work incoming
-  rm -f "/tmp/claude-idle-${session}.ts"
-  tmpfile=$(mktemp /tmp/agent-inject-XXXXXX)
-  printf '%s' "$text" > "$tmpfile"
-  tmux load-buffer -b "agent-inject-$$" "$tmpfile"
-  tmux paste-buffer -t "$session" -b "agent-inject-$$"
-  sleep 0.5
-  tmux send-keys -t "$session" "" Enter
-  tmux delete-buffer -b "agent-inject-$$" 2>/dev/null || true
-  rm -f "$tmpfile"
-}
-
-# Create a tmux session running Claude in the given workdir.
-# Installs a Stop hook for idle detection (see monitor_phase_loop).
-# Installs a PreToolUse hook to guard destructive Bash operations.
-# Optionally installs a PostToolUse hook for phase file write detection.
-# Optionally installs a StopFailure hook for immediate phase file update on API error.
-# Args: session workdir [phase_file]
-# Returns 0 if session is ready, 1 otherwise.
-create_agent_session() {
-  local session="$1"
-  local workdir="${2:-.}"
-  local phase_file="${3:-}"
-
-  # Prepare settings directory for hooks
-  mkdir -p "${workdir}/.claude"
-  local settings="${workdir}/.claude/settings.json"
-
-  # Install Stop hook for idle detection: when Claude finishes a response,
-  # the hook writes a timestamp to a marker file. monitor_phase_loop checks
-  # this marker instead of fragile tmux pane scraping.
-  local idle_marker="/tmp/claude-idle-${session}.ts"
-  local hook_script="${FACTORY_ROOT}/lib/hooks/on-idle-stop.sh"
-  if [ -x "$hook_script" ]; then
-    local hook_cmd="${hook_script} ${idle_marker}"
-    # When a phase file is available, pass it and the session name so the
-    # hook can nudge Claude if it returns to the prompt without signalling.
-    if [ -n "$phase_file" ]; then
-      hook_cmd="${hook_script} ${idle_marker} ${phase_file} ${session}"
-    fi
-    if [ -f "$settings" ]; then
-      # Append our Stop hook to existing project settings
-      jq --arg cmd "$hook_cmd" '
-        if (.hooks.Stop // [] | any(.[]; .hooks[]?.command == $cmd))
-        then .
-        else .hooks.Stop = (.hooks.Stop // []) + [{
-          matcher: "",
-          hooks: [{type: "command", command: $cmd}]
-        }]
-        end
-      ' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
-    else
-      jq -n --arg cmd "$hook_cmd" '{
-        hooks: {
-          Stop: [{
-            matcher: "",
-            hooks: [{type: "command", command: $cmd}]
-          }]
-        }
-      }' > "$settings"
-    fi
-  fi
-
-  # Install PostToolUse hook for phase file write detection: when Claude
-  # writes to the phase file via Bash or Write, the hook writes a marker
-  # so monitor_phase_loop can react immediately instead of waiting for
-  # the next mtime-based poll cycle.
-  if [ -n "$phase_file" ]; then
-    local phase_marker="/tmp/phase-changed-${session}.marker"
-    local phase_hook_script="${FACTORY_ROOT}/lib/hooks/on-phase-change.sh"
-    if [ -x "$phase_hook_script" ]; then
-      local phase_hook_cmd="${phase_hook_script} ${phase_file} ${phase_marker}"
-      if [ -f "$settings" ]; then
-        jq --arg cmd "$phase_hook_cmd" '
-          if (.hooks.PostToolUse // [] | any(.[]; .hooks[]?.command == $cmd))
-          then .
-          else .hooks.PostToolUse = (.hooks.PostToolUse // []) + [{
-            matcher: "Bash|Write",
-            hooks: [{type: "command", command: $cmd}]
-          }]
-          end
-        ' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
-      else
-        jq -n --arg cmd "$phase_hook_cmd" '{
-          hooks: {
-            PostToolUse: [{
-              matcher: "Bash|Write",
-              hooks: [{type: "command", command: $cmd}]
-            }]
-          }
-        }' > "$settings"
-      fi
-      rm -f "$phase_marker"
-    fi
-  fi
-
-  # Install StopFailure hook for immediate phase file update on API error:
-  # when Claude hits a rate limit, server error, billing error, or auth failure,
-  # the hook writes PHASE:failed to the phase file and touches the phase-changed
-  # marker so monitor_phase_loop picks it up within one poll cycle instead of
-  # waiting for idle timeout (up to 2 hours).
-  if [ -n "$phase_file" ]; then
-    local stop_failure_hook_script="${FACTORY_ROOT}/lib/hooks/on-stop-failure.sh"
-    if [ -x "$stop_failure_hook_script" ]; then
-      # phase_marker is defined in the PostToolUse block above; redeclare so
-      # this block is self-contained if that block is ever removed.
-      local sf_phase_marker="/tmp/phase-changed-${session}.marker"
-      local stop_failure_hook_cmd="${stop_failure_hook_script} ${phase_file} ${sf_phase_marker}"
-      if [ -f "$settings" ]; then
-        jq --arg cmd "$stop_failure_hook_cmd" '
-          if (.hooks.StopFailure // [] | any(.[]; .hooks[]?.command == $cmd))
-          then .
-          else .hooks.StopFailure = (.hooks.StopFailure // []) + [{
-            matcher: "rate_limit|server_error|authentication_failed|billing_error",
-            hooks: [{type: "command", command: $cmd}]
-          }]
-          end
-        ' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
-      else
-        jq -n --arg cmd "$stop_failure_hook_cmd" '{
-          hooks: {
-            StopFailure: [{
-              matcher: "rate_limit|server_error|authentication_failed|billing_error",
-              hooks: [{type: "command", command: $cmd}]
-            }]
-          }
-        }' > "$settings"
-      fi
-    fi
-  fi
-
-  # Install PreToolUse hook for destructive operation guard: blocks force push
-  # to primary branch, rm -rf outside worktree, direct API merge calls, and
-  # checkout/switch to primary branch.  Claude sees the denial reason on exit 2
-  # and can self-correct.
-  local guard_hook_script="${FACTORY_ROOT}/lib/hooks/on-pretooluse-guard.sh"
-  if [ -x "$guard_hook_script" ]; then
-    local abs_workdir
-    abs_workdir=$(cd "$workdir" 2>/dev/null && pwd) || abs_workdir="$workdir"
-    local guard_hook_cmd="${guard_hook_script} ${PRIMARY_BRANCH:-main} ${abs_workdir} ${session}"
-    if [ -f "$settings" ]; then
-      jq --arg cmd "$guard_hook_cmd" '
-        if (.hooks.PreToolUse // [] | any(.[]; .hooks[]?.command == $cmd))
-        then .
-        else .hooks.PreToolUse = (.hooks.PreToolUse // []) + [{
-          matcher: "Bash",
-          hooks: [{type: "command", command: $cmd}]
-        }]
-        end
-      ' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
-    else
-      jq -n --arg cmd "$guard_hook_cmd" '{
-        hooks: {
-          PreToolUse: [{
-            matcher: "Bash",
-            hooks: [{type: "command", command: $cmd}]
-          }]
-        }
-      }' > "$settings"
-    fi
-  fi
-
-  # Install SessionEnd hook for guaranteed cleanup: when the Claude session
-  # exits (clean or crash), write a termination marker so monitor_phase_loop
-  # detects the exit faster than tmux has-session polling alone.
-  local exit_marker="/tmp/claude-exited-${session}.ts"
-  local session_end_hook_script="${FACTORY_ROOT}/lib/hooks/on-session-end.sh"
-  if [ -x "$session_end_hook_script" ]; then
-    local session_end_hook_cmd="${session_end_hook_script} ${exit_marker}"
-    if [ -f "$settings" ]; then
-      jq --arg cmd "$session_end_hook_cmd" '
-        if (.hooks.SessionEnd // [] | any(.[]; .hooks[]?.command == $cmd))
-        then .
-        else .hooks.SessionEnd = (.hooks.SessionEnd // []) + [{
-          matcher: "",
-          hooks: [{type: "command", command: $cmd}]
-        }]
-        end
-      ' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
-    else
-      jq -n --arg cmd "$session_end_hook_cmd" '{
-        hooks: {
-          SessionEnd: [{
-            matcher: "",
-            hooks: [{type: "command", command: $cmd}]
-          }]
-        }
-      }' > "$settings"
-    fi
-  fi
-  rm -f "$exit_marker"
-
-  # Install SessionStart hook for context re-injection after compaction:
-  # when Claude Code compacts context during long sessions, the phase protocol
-  # instructions are lost. This hook fires after each compaction and outputs
-  # the content of a context file so Claude retains critical instructions.
-  # The context file is written by callers via write_compact_context().
-  if [ -n "$phase_file" ]; then
-    local compact_hook_script="${FACTORY_ROOT}/lib/hooks/on-compact-reinject.sh"
-    if [ -x "$compact_hook_script" ]; then
-      local context_file="${phase_file%.phase}.context"
-      local compact_hook_cmd="${compact_hook_script} ${context_file}"
-      if [ -f "$settings" ]; then
-        jq --arg cmd "$compact_hook_cmd" '
-          if (.hooks.SessionStart // [] | any(.[]; .hooks[]?.command == $cmd))
-          then .
-          else .hooks.SessionStart = (.hooks.SessionStart // []) + [{
-            matcher: "compact",
-            hooks: [{type: "command", command: $cmd}]
-          }]
-          end
-        ' "$settings" > "${settings}.tmp" && mv "${settings}.tmp" "$settings"
-      else
-        jq -n --arg cmd "$compact_hook_cmd" '{
-          hooks: {
-            SessionStart: [{
-              matcher: "compact",
-              hooks: [{type: "command", command: $cmd}]
-            }]
-          }
-        }' > "$settings"
-      fi
-    fi
-  fi
-
-  rm -f "$idle_marker"
-  local model_flag=""
-  if [ -n "${CLAUDE_MODEL:-}" ]; then
-    model_flag="--model ${CLAUDE_MODEL}"
-  fi
-
-  # Acquire a session-level mutex via fd-based flock to prevent concurrent
-  # Claude sessions from racing on OAuth token refresh.  Unlike the previous
-  # command-wrapper flock, the fd approach allows callers to release the lock
-  # during idle phases (awaiting_review/awaiting_ci) and re-acquire before
-  # injecting the next prompt.  See #724.
-  # Use ~/.claude/session.lock so the lock is shared across containers when
-  # the host ~/.claude directory is bind-mounted.
-  local lock_dir="${HOME}/.claude"
-  mkdir -p "$lock_dir"
-  local claude_lock="${lock_dir}/session.lock"
-  if [ -z "${SESSION_LOCK_FD:-}" ]; then
-    exec {SESSION_LOCK_FD}>>"${claude_lock}"
-  fi
-  if ! flock -w 300 "$SESSION_LOCK_FD"; then
-    return 1
-  fi
-  local claude_cmd="claude --dangerously-skip-permissions ${model_flag}"
-
-  tmux new-session -d -s "$session" -c "$workdir" \
-    "$claude_cmd" 2>/dev/null
-  sleep 1
-  tmux has-session -t "$session" 2>/dev/null || return 1
-  agent_wait_for_claude_ready "$session" 120 || return 1
-  return 0
-}
-
-# Inject a prompt/formula into a session (alias for agent_inject_into_session).
-inject_formula() {
-  agent_inject_into_session "$@"
-}
-
-# Monitor a phase file, calling a callback on changes and handling idle timeout.
-# Sets _MONITOR_LOOP_EXIT to the exit reason (idle_timeout, idle_prompt, done, crashed, PHASE:failed, PHASE:escalate).
-# Sets _MONITOR_SESSION to the resolved session name (arg 4 or $SESSION_NAME).
-#   Callbacks should reference _MONITOR_SESSION instead of $SESSION_NAME directly.
-# Args: phase_file idle_timeout_secs callback_fn [session_name]
-#   session_name — tmux session to health-check; falls back to $SESSION_NAME global
-#
-# Idle detection: uses a Stop hook marker file (written by lib/hooks/on-idle-stop.sh)
-# to detect when Claude finishes responding without writing a phase signal.
-# If the marker exists for 3 consecutive polls with no phase written, the session
-# is killed and the callback invoked with "PHASE:failed".
-monitor_phase_loop() {
-  local phase_file="$1"
-  local idle_timeout="$2"
-  local callback="$3"
-  local _session="${4:-${SESSION_NAME:-}}"
-  # Export resolved session name so callbacks can reference it regardless of
-  # which session was passed to monitor_phase_loop (analogous to _MONITOR_LOOP_EXIT).
-  export _MONITOR_SESSION="$_session"
-  local poll_interval="${PHASE_POLL_INTERVAL:-10}"
-  local last_mtime=0
-  local idle_elapsed=0
-  local idle_pane_count=0
-
-  while true; do
-    sleep "$poll_interval"
-    idle_elapsed=$(( idle_elapsed + poll_interval ))
-
-    # Session health check: SessionEnd hook marker provides fast detection,
-    # tmux has-session is the fallback for unclean exits (e.g. tmux crash).
-    local exit_marker="/tmp/claude-exited-${_session}.ts"
-    if [ -f "$exit_marker" ] || ! tmux has-session -t "${_session}" 2>/dev/null; then
-      local current_phase
-      current_phase=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]' || true)
-      case "$current_phase" in
-        PHASE:done|PHASE:failed|PHASE:merged|PHASE:escalate)
-          ;; # terminal — fall through to phase handler
-        *)
-          # Call callback with "crashed" — let agent-specific code handle recovery
-          if type "${callback}" &>/dev/null; then
-            "$callback" "PHASE:crashed"
-          fi
-          # If callback didn't restart session, break
-          if ! tmux has-session -t "${_session}" 2>/dev/null; then
-            _MONITOR_LOOP_EXIT="crashed"
-            return 1
-          fi
-          idle_elapsed=0
-          idle_pane_count=0
-          continue
-          ;;
-      esac
-    fi
-
-    # Check phase-changed marker from PostToolUse hook — if present, the hook
-    # detected a phase file write so we reset last_mtime to force processing
-    # this cycle instead of waiting for the next mtime change.
-    local phase_marker="/tmp/phase-changed-${_session}.marker"
-    if [ -f "$phase_marker" ]; then
-      rm -f "$phase_marker"
-      last_mtime=0
-    fi
-
-    # Check phase file for changes
-    local phase_mtime
-    phase_mtime=$(stat -c %Y "$phase_file" 2>/dev/null || echo 0)
-    local current_phase
-    current_phase=$(head -1 "$phase_file" 2>/dev/null | tr -d '[:space:]' || true)
-
-    if [ -z "$current_phase" ] || [ "$phase_mtime" -le "$last_mtime" ]; then
-      # No phase change — check idle timeout
-      if [ "$idle_elapsed" -ge "$idle_timeout" ]; then
-        _MONITOR_LOOP_EXIT="idle_timeout"
-        agent_kill_session "${_session}"
-        return 0
-      fi
-      # Idle detection via Stop hook: the on-idle-stop.sh hook writes a marker
-      # file when Claude finishes a response. If the marker exists and no phase
-      # has been written, Claude returned to the prompt without following the
-      # phase protocol. 3 consecutive polls = confirmed idle (not mid-turn).
-      local idle_marker="/tmp/claude-idle-${_session}.ts"
-      if [ -z "$current_phase" ] && [ -f "$idle_marker" ]; then
-        idle_pane_count=$(( idle_pane_count + 1 ))
-        if [ "$idle_pane_count" -ge 3 ]; then
-          _MONITOR_LOOP_EXIT="idle_prompt"
-          # Session is killed before the callback is invoked.
-          # Callbacks that handle PHASE:failed must not assume the session is alive.
-          agent_kill_session "${_session}"
-          if type "${callback}" &>/dev/null; then
-            "$callback" "PHASE:failed"
-          fi
-          return 0
-        fi
-      else
-        idle_pane_count=0
-      fi
-      continue
-    fi
-
-    # Phase changed
-    last_mtime="$phase_mtime"
-    # shellcheck disable=SC2034  # read by phase-handler.sh callback
-    LAST_PHASE_MTIME="$phase_mtime"
-    idle_elapsed=0
-    idle_pane_count=0
-
-    # Terminal phases
-    case "$current_phase" in
-      PHASE:done|PHASE:merged)
-        _MONITOR_LOOP_EXIT="done"
-        if type "${callback}" &>/dev/null; then
-          "$callback" "$current_phase"
-        fi
-        return 0
-        ;;
-      PHASE:failed|PHASE:escalate)
-        _MONITOR_LOOP_EXIT="$current_phase"
-        if type "${callback}" &>/dev/null; then
-          "$callback" "$current_phase"
-        fi
-        return 0
-        ;;
-    esac
-
-    # Non-terminal phase — call callback
-    if type "${callback}" &>/dev/null; then
-      "$callback" "$current_phase"
-    fi
-  done
-}
-
-# Write context to a file for re-injection after context compaction.
-# The SessionStart compact hook reads this file and outputs it to stdout.
-# Args: phase_file content
-write_compact_context() {
-  local phase_file="$1"
-  local content="$2"
-  local context_file="${phase_file%.phase}.context"
-  printf '%s\n' "$content" > "$context_file"
-}
-
-# Kill a tmux session gracefully (no-op if not found).
-agent_kill_session() {
-  local session="${1:-}"
-  [ -n "$session" ] && tmux kill-session -t "$session" 2>/dev/null || true
-  rm -f "/tmp/claude-idle-${session}.ts"
-  rm -f "/tmp/phase-changed-${session}.marker"
-  rm -f "/tmp/claude-exited-${session}.ts"
-  rm -f "/tmp/claude-nudge-${session}.count"
-}
-
-# Read the current phase from a phase file, stripped of whitespace.
-# Usage: read_phase [file]  — defaults to $PHASE_FILE
-read_phase() {
-  local file="${1:-${PHASE_FILE:-}}"
-  { cat "$file" 2>/dev/null || true; } | head -1 | tr -d '[:space:]'
-}
--- a/lib/branch-protection.sh
+++ b/lib/branch-protection.sh
@ -34,6 +34,55 @@ _ops_api() {
  printf '%s' "${FORGE_URL}/api/v1/repos/${FORGE_OPS_REPO}"
 }

+# -----------------------------------------------------------------------------
+# _bp_wait_for_branch — Wait for Forgejo to index a branch with exponential backoff
+#
+# Forgejo's branch indexer can take 5–15s to register a newly-pushed branch.
+# This helper retries up to 10 times with exponential backoff (2s, 4s, 6s, …)
+# capped at 10s per wait, for a worst-case total of ~70s.
+#
+# Args:
+#   $1 - Full API URL for the repo (e.g. https://forge.example/api/v1/repos/owner/repo)
+#   $2 - Branch name
+#   $3 - Human-readable repo identifier for log messages
+#
+# Returns: 0 if branch found, 1 if not found after all retries
+# -----------------------------------------------------------------------------
+_bp_wait_for_branch() {
+  local api_url="$1"
+  local branch="$2"
+  local repo_label="$3"
+
+  local max_retries=10
+  local base_wait=2
+  local attempt=1
+  local branch_status="0"
+
+  while [ "$attempt" -le "$max_retries" ]; do
+    branch_status=$(curl -s -o /dev/null -w "%{http_code}" \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${api_url}/git/branches/${branch}" 2>/dev/null || echo "0")
+
+    if [ "$branch_status" = "200" ]; then
+      _bp_log "Branch ${branch} exists on ${repo_label}"
+      return 0
+    fi
+
+    if [ "$attempt" -lt "$max_retries" ]; then
+      local wait_time=$(( base_wait * attempt ))
+      if [ "$wait_time" -gt 10 ]; then
+        wait_time=10
+      fi
+      _bp_log "Branch ${branch} not indexed yet (attempt ${attempt}/${max_retries}), waiting ${wait_time}s..."
+      sleep "$wait_time"
+    fi
+    attempt=$((attempt + 1))
+  done
+
+  _bp_log "ERROR: Branch ${branch} does not exist on ${repo_label} after ${max_retries} attempts"
+  return 1
+}
+
 # -----------------------------------------------------------------------------
 # setup_vault_branch_protection — Set up admin-only branch protection for main
 #
@ -51,14 +100,8 @@ setup_vault_branch_protection() {

  _bp_log "Setting up branch protection for ${branch} on ${FORGE_OPS_REPO}"

-  # Check if branch exists
-  local branch_exists
-  branch_exists=$(curl -s -o /dev/null -w "%{http_code}" \
-    -H "Authorization: token ${FORGE_TOKEN}" \
-    "${api_url}/git/branches/${branch}" 2>/dev/null || echo "0")
-
-  if [ "$branch_exists" != "200" ]; then
-    _bp_log "ERROR: Branch ${branch} does not exist"
+  # Wait for Forgejo to index the branch (may take 5–15s after push)
+  if ! _bp_wait_for_branch "$api_url" "$branch" "$FORGE_OPS_REPO"; then
    return 1
  fi

@ -228,14 +271,8 @@ setup_profile_branch_protection() {
  local api_url
  api_url="${FORGE_URL}/api/v1/repos/${repo}"

-  # Check if branch exists
-  local branch_exists
-  branch_exists=$(curl -s -o /dev/null -w "%{http_code}" \
-    -H "Authorization: token ${FORGE_TOKEN}" \
-    "${api_url}/git/branches/${branch}" 2>/dev/null || echo "0")
-
-  if [ "$branch_exists" != "200" ]; then
-    _bp_log "ERROR: Branch ${branch} does not exist on ${repo}"
+  # Wait for Forgejo to index the branch (may take 5–15s after push)
+  if ! _bp_wait_for_branch "$api_url" "$branch" "$repo"; then
    return 1
  fi

@ -369,6 +406,109 @@ remove_branch_protection() {
  return 0
 }

+# -----------------------------------------------------------------------------
+# setup_project_branch_protection — Set up branch protection for project repos
+#
+# Configures the following protection rules:
+# - Block direct pushes to main (all changes must go through PR)
+# - Require 1 approval before merge
+# - Allow merge only via dev-bot (for auto-merge after review+CI)
+# - Allow review-bot to approve PRs
+#
+# Args:
+#   $1 - Repo path in format 'owner/repo' (e.g., 'disinto-admin/disinto')
+#   $2 - Branch to protect (default: main)
+#
+# Returns: 0 on success, 1 on failure
+# -----------------------------------------------------------------------------
+setup_project_branch_protection() {
+  local repo="${1:-}"
+  local branch="${2:-main}"
+
+  if [ -z "$repo" ]; then
+    _bp_log "ERROR: repo path required (format: owner/repo)"
+    return 1
+  fi
+
+  _bp_log "Setting up branch protection for ${branch} on ${repo}"
+
+  local api_url
+  api_url="${FORGE_URL}/api/v1/repos/${repo}"
+
+  # Wait for Forgejo to index the branch (may take 5–15s after push)
+  if ! _bp_wait_for_branch "$api_url" "$branch" "$repo"; then
+    return 1
+  fi
+
+  # Check if protection already exists
+  local protection_exists
+  protection_exists=$(curl -s -o /dev/null -w "%{http_code}" \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${api_url}/branches/${branch}/protection" 2>/dev/null || echo "0")
+
+  if [ "$protection_exists" = "200" ]; then
+    _bp_log "Branch protection already exists for ${branch}"
+    _bp_log "Updating existing protection rules"
+  fi
+
+  # Create/update branch protection
+  # Forgejo API for branch protection (factory mode):
+  # - enable_push: false (block direct pushes)
+  # - enable_merge_whitelist: true (only whitelisted users can merge)
+  # - merge_whitelist_usernames: ["dev-bot"] (dev-bot merges after CI)
+  # - required_approvals: 1 (review-bot must approve)
+  local protection_json
+  protection_json=$(cat <<EOF
+{
+  "enable_push": false,
+  "enable_force_push": false,
+  "enable_merge_commit": true,
+  "enable_rebase": true,
+  "enable_rebase_merge": true,
+  "required_approvals": 1,
+  "required_signatures": false,
+  "enable_merge_whitelist": true,
+  "merge_whitelist_usernames": ["dev-bot"],
+  "required_status_checks": false,
+  "required_linear_history": false
+}
+EOF
+)
+
+  local http_code
+  if [ "$protection_exists" = "200" ]; then
+    # Update existing protection
+    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
+      -X PUT \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      -H "Content-Type: application/json" \
+      "${api_url}/branches/${branch}/protection" \
+      -d "$protection_json" || echo "0")
+  else
+    # Create new protection
+    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
+      -X POST \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      -H "Content-Type: application/json" \
+      "${api_url}/branches/${branch}/protection" \
+      -d "$protection_json" || echo "0")
+  fi
+
+  if [ "$http_code" != "200" ] && [ "$http_code" != "201" ]; then
+    _bp_log "ERROR: Failed to set up branch protection (HTTP ${http_code})"
+    return 1
+  fi
+
+  _bp_log "Branch protection configured successfully for ${branch}"
+  _bp_log "  - Pushes blocked: true"
+  _bp_log "  - Force pushes blocked: true"
+  _bp_log "  - Required approvals: 1"
+  _bp_log "  - Merge whitelist: dev-bot only"
+  _bp_log "  - review-bot can approve: yes"
+
+  return 0
+}
+
 # -----------------------------------------------------------------------------
 # Test mode — run when executed directly
 # -----------------------------------------------------------------------------
@ -401,6 +541,13 @@ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
      fi
      setup_profile_branch_protection "${2}" "${3:-main}"
      ;;
+    setup-project)
+      if [ -z "${2:-}" ]; then
+        echo "ERROR: repo path required (format: owner/repo)" >&2
+        exit 1
+      fi
+      setup_project_branch_protection "${2}" "${3:-main}"
+      ;;
    verify)
      verify_branch_protection "${2:-main}"
      ;;
@ -408,18 +555,19 @@ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
      remove_branch_protection "${2:-main}"
      ;;
    help|*)
-      echo "Usage: $0 {setup|setup-profile|verify|remove} [args...]"
+      echo "Usage: $0 {setup|setup-profile|setup-project|verify|remove} [args...]"
      echo ""
      echo "Commands:"
      echo "  setup [branch]              Set up branch protection on ops repo (default: main)"
      echo "  setup-profile <repo> [branch] Set up branch protection on .profile repo"
+      echo "  setup-project <repo> [branch] Set up branch protection on project repo"
      echo "  verify [branch]             Verify branch protection is configured correctly"
      echo "  remove [branch]             Remove branch protection (for cleanup/testing)"
      echo ""
      echo "Required environment variables:"
      echo "  FORGE_TOKEN     Forgejo API token (admin user recommended)"
      echo "  FORGE_URL       Forgejo instance URL (e.g., https://codeberg.org)"
-      echo "  FORGE_OPS_REPO  Ops repo in format owner/repo (e.g., johba/disinto-ops)"
+      echo "  FORGE_OPS_REPO  Ops repo in format owner/repo (e.g., disinto-admin/disinto-ops)"
      exit 0
      ;;
  esac
--- a/lib/ci-helpers.sh
+++ b/lib/ci-helpers.sh
@ -7,27 +7,6 @@ set -euo pipefail
 # ci_commit_status() / ci_pipeline_number() require: woodpecker_api(), forge_api() (from env.sh)
 # classify_pipeline_failure() requires: woodpecker_api() (defined in env.sh)

-# ensure_blocked_label_id — look up (or create) the "blocked" label, print its ID.
-# Caches the result in _BLOCKED_LABEL_ID to avoid repeated API calls.
-# Requires: FORGE_TOKEN, FORGE_API (from env.sh), forge_api()
-ensure_blocked_label_id() {
-  if [ -n "${_BLOCKED_LABEL_ID:-}" ]; then
-    printf '%s' "$_BLOCKED_LABEL_ID"
-    return 0
-  fi
-  _BLOCKED_LABEL_ID=$(forge_api GET "/labels" 2>/dev/null \
-    | jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || true)
-  if [ -z "$_BLOCKED_LABEL_ID" ]; then
-    _BLOCKED_LABEL_ID=$(curl -sf -X POST \
-      -H "Authorization: token ${FORGE_TOKEN}" \
-      -H "Content-Type: application/json" \
-      "${FORGE_API}/labels" \
-      -d '{"name":"blocked","color":"#e11d48"}' 2>/dev/null \
-      | jq -r '.id // empty' 2>/dev/null || true)
-  fi
-  printf '%s' "$_BLOCKED_LABEL_ID"
-}
-
 # ensure_priority_label — look up (or create) the "priority" label, print its ID.
 # Caches the result in _PRIORITY_LABEL_ID to avoid repeated API calls.
 # Requires: FORGE_TOKEN, FORGE_API (from env.sh), forge_api()
@ -267,3 +246,42 @@ ci_promote() {

  echo "$new_num"
 }
+
+# ci_get_logs <pipeline_number> [--step <step_name>]
+# Reads CI logs from the Woodpecker SQLite database.
+# Requires: WOODPECKER_DATA_DIR env var or mounted volume at /woodpecker-data
+# Returns: 0 on success, 1 on failure. Outputs log text to stdout.
+#
+# Usage:
+#   ci_get_logs 346                  # Get all failed step logs
+#   ci_get_logs 346 --step smoke-init # Get logs for specific step
+ci_get_logs() {
+  local pipeline_number="$1"
+  shift || true
+
+  local step_name=""
+  while [ $# -gt 0 ]; do
+    case "$1" in
+      --step|-s)
+        step_name="$2"
+        shift 2
+        ;;
+      *)
+        echo "Unknown option: $1" >&2
+        return 1
+        ;;
+    esac
+  done
+
+  local log_reader="${FACTORY_ROOT:-/home/agent/disinto}/lib/ci-log-reader.py"
+  if [ -f "$log_reader" ]; then
+    if [ -n "$step_name" ]; then
+      python3 "$log_reader" "$pipeline_number" --step "$step_name"
+    else
+      python3 "$log_reader" "$pipeline_number"
+    fi
+  else
+    echo "ERROR: ci-log-reader.py not found at $log_reader" >&2
+    return 1
+  fi
+}
--- a/lib/ci-log-reader.py
+++ b/lib/ci-log-reader.py
@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+ci-log-reader.py — Read CI logs from Woodpecker SQLite database.
+
+Usage:
+    ci-log-reader.py <pipeline_number> [--step <step_name>]
+
+Reads log entries from the Woodpecker SQLite database and outputs them to stdout.
+If --step is specified, filters to that step only. Otherwise returns logs from
+all failed steps, truncated to the last 200 lines to avoid context bloat.
+
+Environment:
+    WOODPECKER_DATA_DIR - Path to Woodpecker data directory (default: /woodpecker-data)
+
+The SQLite database is located at: $WOODPECKER_DATA_DIR/woodpecker.sqlite
+"""
+
+import argparse
+import sqlite3
+import sys
+import os
+
+DEFAULT_DB_PATH = "/woodpecker-data/woodpecker.sqlite"
+DEFAULT_WOODPECKER_DATA_DIR = "/woodpecker-data"
+MAX_OUTPUT_LINES = 200
+
+
+def get_db_path():
+    """Determine the path to the Woodpecker SQLite database."""
+    env_dir = os.environ.get("WOODPECKER_DATA_DIR", DEFAULT_WOODPECKER_DATA_DIR)
+    return os.path.join(env_dir, "woodpecker.sqlite")
+
+
+def query_logs(pipeline_number: int, step_name: str | None = None) -> list[str]:
+    """
+    Query log entries from the Woodpecker database.
+
+    Args:
+        pipeline_number: The pipeline number to query
+        step_name: Optional step name to filter by
+
+    Returns:
+        List of log data strings
+    """
+    db_path = get_db_path()
+
+    if not os.path.exists(db_path):
+        print(f"ERROR: Woodpecker database not found at {db_path}", file=sys.stderr)
+        print(f"Set WOODPECKER_DATA_DIR or mount volume to {DEFAULT_WOODPECKER_DATA_DIR}", file=sys.stderr)
+        sys.exit(1)
+
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    cursor = conn.cursor()
+
+    if step_name:
+        # Query logs for a specific step
+        query = """
+            SELECT le.data
+            FROM log_entries le
+            JOIN steps s ON le.step_id = s.id
+            JOIN pipelines p ON s.pipeline_id = p.id
+            WHERE p.number = ? AND s.name = ?
+            ORDER BY le.id
+        """
+        cursor.execute(query, (pipeline_number, step_name))
+    else:
+        # Query logs for all failed steps in the pipeline
+        query = """
+            SELECT le.data
+            FROM log_entries le
+            JOIN steps s ON le.step_id = s.id
+            JOIN pipelines p ON s.pipeline_id = p.id
+            WHERE p.number = ? AND s.state IN ('failure', 'error', 'killed')
+            ORDER BY le.id
+        """
+        cursor.execute(query, (pipeline_number,))
+
+    logs = [row["data"] for row in cursor.fetchall()]
+    conn.close()
+    return logs
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Read CI logs from Woodpecker SQLite database"
+    )
+    parser.add_argument(
+        "pipeline_number",
+        type=int,
+        help="Pipeline number to query"
+    )
+    parser.add_argument(
+        "--step", "-s",
+        dest="step_name",
+        default=None,
+        help="Filter to a specific step name"
+    )
+
+    args = parser.parse_args()
+
+    logs = query_logs(args.pipeline_number, args.step_name)
+
+    if not logs:
+        if args.step_name:
+            print(f"No logs found for pipeline #{args.pipeline_number}, step '{args.step_name}'", file=sys.stderr)
+        else:
+            print(f"No failed steps found in pipeline #{args.pipeline_number}", file=sys.stderr)
+        sys.exit(0)
+
+    # Join all log data and output
+    full_output = "\n".join(logs)
+
+    # Truncate to last N lines to avoid context bloat
+    lines = full_output.split("\n")
+    if len(lines) > MAX_OUTPUT_LINES:
+        # Keep last N lines
+        truncated = lines[-MAX_OUTPUT_LINES:]
+        print("\n".join(truncated))
+    else:
+        print(full_output)
+
+
+if __name__ == "__main__":
+    main()
--- a/lib/ci-setup.sh
+++ b/lib/ci-setup.sh
@ -0,0 +1,504 @@
+#!/usr/bin/env bash
+# =============================================================================
+# ci-setup.sh — CI setup functions for Woodpecker and scheduling configuration
+#
+# Internal functions (called via _load_ci_context + _*_impl):
+#   _install_cron_impl()              - Install crontab entries (bare-metal only; compose uses polling loop)
+#   _create_forgejo_oauth_app()       - Generic: create an OAuth2 app on Forgejo (shared helper)
+#   _create_woodpecker_oauth_impl()   - Create OAuth2 app on Forgejo for Woodpecker
+#   _create_chat_oauth_impl()         - Create OAuth2 app on Forgejo for disinto-chat
+#   _generate_woodpecker_token_impl() - Auto-generate WOODPECKER_TOKEN via OAuth2 flow
+#   _activate_woodpecker_repo_impl()  - Activate repo in Woodpecker
+#
+# Globals expected (asserted by _load_ci_context):
+#   FORGE_URL    - Forge instance URL (e.g. http://localhost:3000)
+#   FORGE_TOKEN  - Forge API token
+#   FACTORY_ROOT - Root of the disinto factory
+#
+# Usage:
+#   source "${FACTORY_ROOT}/lib/ci-setup.sh"
+# =============================================================================
+set -euo pipefail
+
+# Assert required globals are set before using this module.
+_load_ci_context() {
+  local missing=()
+  [ -z "${FORGE_URL:-}" ]    && missing+=("FORGE_URL")
+  [ -z "${FORGE_TOKEN:-}" ]  && missing+=("FORGE_TOKEN")
+  [ -z "${FACTORY_ROOT:-}" ] && missing+=("FACTORY_ROOT")
+  if [ "${#missing[@]}" -gt 0 ]; then
+    echo "Error: ci-setup.sh requires these globals to be set: ${missing[*]}" >&2
+    exit 1
+  fi
+}
+
+# Generate and optionally install cron entries for bare-metal deployments.
+# In compose mode, the agents container uses a polling loop (entrypoint.sh) instead.
+# Usage: install_cron <name> <toml_path> <auto_yes> <bare>
+_install_cron_impl() {
+  local name="$1" toml="$2" auto_yes="$3" bare="${4:-false}"
+
+  # In compose mode, skip host cron — the agents container uses a polling loop
+  if [ "$bare" = false ]; then
+    echo ""
+    echo "Cron:    skipped (agents container handles scheduling in compose mode)"
+    return
+  fi
+
+  # Bare mode: crontab is required on the host
+  if ! command -v crontab &>/dev/null; then
+    echo "Warning: crontab not found (required for bare-metal scheduling)" >&2
+    echo "  Install: apt install cron  /  brew install cron" >&2
+    return 1
+  fi
+
+  # Use absolute path for the TOML in cron entries
+  local abs_toml
+  abs_toml="$(cd "$(dirname "$toml")" && pwd)/$(basename "$toml")"
+
+  local cron_block
+  cron_block="# disinto: ${name}
+2,7,12,17,22,27,32,37,42,47,52,57 * * * * ${FACTORY_ROOT}/review/review-poll.sh ${abs_toml} >/dev/null 2>&1
+4,9,14,19,24,29,34,39,44,49,54,59 * * * * ${FACTORY_ROOT}/dev/dev-poll.sh ${abs_toml} >/dev/null 2>&1
+0 0,6,12,18 * * * cd ${FACTORY_ROOT} && bash gardener/gardener-run.sh ${abs_toml} >/dev/null 2>&1"
+
+  echo ""
+  echo "Cron entries to install:"
+  echo "$cron_block"
+  echo ""
+
+  # Check if cron entries already exist
+  local current_crontab
+  current_crontab=$(crontab -l 2>/dev/null || true)
+  if echo "$current_crontab" | grep -q "# disinto: ${name}"; then
+    echo "Cron:    skipped (entries for ${name} already installed)"
+    return
+  fi
+
+  if [ "$auto_yes" = false ] && [ -t 0 ]; then
+    read -rp "Install these cron entries? [y/N] " confirm
+    if [[ ! "$confirm" =~ ^[Yy] ]]; then
+      echo "Skipped cron install. Add manually with: crontab -e"
+      return
+    fi
+  fi
+
+  # Append to existing crontab
+  if { crontab -l 2>/dev/null || true; printf '%s\n' "$cron_block"; } | crontab -; then
+    echo "Cron entries installed for ${name}"
+  else
+    echo "Error: failed to install cron entries" >&2
+    return 1
+  fi
+}
+
+# Create an OAuth2 application on Forgejo.
+# Generic helper used by both Woodpecker and chat OAuth setup.
+# Sets _OAUTH_CLIENT_ID and _OAUTH_CLIENT_SECRET on success.
+# Usage: _create_forgejo_oauth_app <app_name> <redirect_uri>
+_create_forgejo_oauth_app() {
+  local oauth2_name="$1"
+  local redirect_uri="$2"
+  local forge_url="${FORGE_URL}"
+
+  _OAUTH_CLIENT_ID=""
+  _OAUTH_CLIENT_SECRET=""
+
+  local existing_app
+  existing_app=$(curl -sf \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${forge_url}/api/v1/user/applications/oauth2" 2>/dev/null \
+    | jq -r --arg name "$oauth2_name" '.[] | select(.name == $name) | .client_id // empty' 2>/dev/null) || true
+
+  if [ -n "$existing_app" ]; then
+    echo "OAuth2:  ${oauth2_name} (already exists, client_id=${existing_app})"
+    _OAUTH_CLIENT_ID="$existing_app"
+    return 0
+  fi
+
+  local oauth2_resp
+  oauth2_resp=$(curl -sf -X POST \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    -H "Content-Type: application/json" \
+    "${forge_url}/api/v1/user/applications/oauth2" \
+    -d "{\"name\":\"${oauth2_name}\",\"redirect_uris\":[\"${redirect_uri}\"],\"confidential_client\":true}" \
+    2>/dev/null) || oauth2_resp=""
+
+  if [ -z "$oauth2_resp" ]; then
+    echo "Warning: failed to create OAuth2 app '${oauth2_name}' on Forgejo" >&2
+    return 1
+  fi
+
+  _OAUTH_CLIENT_ID=$(printf '%s' "$oauth2_resp" | jq -r '.client_id // empty')
+  _OAUTH_CLIENT_SECRET=$(printf '%s' "$oauth2_resp" | jq -r '.client_secret // empty')
+
+  if [ -z "$_OAUTH_CLIENT_ID" ]; then
+    echo "Warning: OAuth2 app creation returned no client_id" >&2
+    return 1
+  fi
+
+  echo "OAuth2:  ${oauth2_name} created (client_id=${_OAUTH_CLIENT_ID})"
+}
+
+# Set up Woodpecker CI to use Forgejo as its forge backend.
+# Creates an OAuth2 app on Forgejo for Woodpecker, activates the repo.
+# Usage: create_woodpecker_oauth <forge_url> <repo_slug>
+_create_woodpecker_oauth_impl() {
+  local forge_url="$1"
+  local _repo_slug="$2" # unused but required for signature compatibility
+
+  echo ""
+  echo "── Woodpecker OAuth2 setup ────────────────────────────"
+
+  _create_forgejo_oauth_app "woodpecker-ci" "http://localhost:8000/authorize" || return 0
+  local client_id="${_OAUTH_CLIENT_ID}"
+  local client_secret="${_OAUTH_CLIENT_SECRET}"
+
+  # Store Woodpecker forge config in .env
+  # WP_FORGEJO_CLIENT/SECRET match the docker-compose.yml variable references
+  # WOODPECKER_HOST must be host-accessible URL to match OAuth2 redirect_uri
+  local env_file="${FACTORY_ROOT}/.env"
+  local wp_vars=(
+    "WOODPECKER_FORGEJO=true"
+    "WOODPECKER_FORGEJO_URL=${forge_url}"
+    "WOODPECKER_HOST=http://localhost:8000"
+  )
+  if [ -n "${client_id:-}" ]; then
+    wp_vars+=("WP_FORGEJO_CLIENT=${client_id}")
+  fi
+  if [ -n "${client_secret:-}" ]; then
+    wp_vars+=("WP_FORGEJO_SECRET=${client_secret}")
+  fi
+
+  for var_line in "${wp_vars[@]}"; do
+    local var_name="${var_line%%=*}"
+    if grep -q "^${var_name}=" "$env_file" 2>/dev/null; then
+      sed -i "s|^${var_name}=.*|${var_line}|" "$env_file"
+    else
+      printf '%s\n' "$var_line" >> "$env_file"
+    fi
+  done
+  echo "Config:  Woodpecker forge vars written to .env"
+}
+
+# Create OAuth2 app on Forgejo for disinto-chat.
+# Writes CHAT_OAUTH_CLIENT_ID / CHAT_OAUTH_CLIENT_SECRET to .env.
+# Usage: _create_chat_oauth_impl <redirect_uri>
+_create_chat_oauth_impl() {
+  local redirect_uri="$1"
+
+  echo ""
+  echo "── Chat OAuth2 setup ──────────────────────────────────"
+
+  _create_forgejo_oauth_app "disinto-chat" "$redirect_uri" || return 0
+  local client_id="${_OAUTH_CLIENT_ID}"
+  local client_secret="${_OAUTH_CLIENT_SECRET}"
+
+  local env_file="${FACTORY_ROOT}/.env"
+  local chat_vars=()
+  if [ -n "${client_id:-}" ]; then
+    chat_vars+=("CHAT_OAUTH_CLIENT_ID=${client_id}")
+  fi
+  if [ -n "${client_secret:-}" ]; then
+    chat_vars+=("CHAT_OAUTH_CLIENT_SECRET=${client_secret}")
+  fi
+
+  for var_line in "${chat_vars[@]}"; do
+    local var_name="${var_line%%=*}"
+    if grep -q "^${var_name}=" "$env_file" 2>/dev/null; then
+      sed -i "s|^${var_name}=.*|${var_line}|" "$env_file"
+    else
+      printf '%s\n' "$var_line" >> "$env_file"
+    fi
+  done
+  echo "Config:  Chat OAuth vars written to .env"
+}
+
+# Auto-generate WOODPECKER_TOKEN by driving the Forgejo OAuth2 login flow.
+# Requires _FORGE_ADMIN_PASS (set by setup_forge when admin user was just created).
+# Called after compose stack is up, before activate_woodpecker_repo.
+# Usage: generate_woodpecker_token <forge_url>
+_generate_woodpecker_token_impl() {
+  local forge_url="$1"
+  local wp_server="${WOODPECKER_SERVER:-http://localhost:8000}"
+  local env_file="${FACTORY_ROOT}/.env"
+  local admin_user="disinto-admin"
+  local admin_pass="${_FORGE_ADMIN_PASS:-}"
+
+  # Skip if already set
+  if grep -q '^WOODPECKER_TOKEN=' "$env_file" 2>/dev/null; then
+    echo "Config:  WOODPECKER_TOKEN already set in .env"
+    return 0
+  fi
+
+  echo ""
+  echo "── Woodpecker token generation ────────────────────────"
+
+  if [ -z "$admin_pass" ]; then
+    echo "Warning: Forgejo admin password not available — cannot generate WOODPECKER_TOKEN" >&2
+    echo "  Log into Woodpecker at ${wp_server} and create a token manually" >&2
+    return 1
+  fi
+
+  # Wait for Woodpecker to become ready
+  echo -n "Waiting for Woodpecker"
+  local retries=0
+  while ! curl -sf --max-time 3 "${wp_server}/api/version" >/dev/null 2>&1; do
+    retries=$((retries + 1))
+    if [ "$retries" -gt 30 ]; then
+      echo ""
+      echo "Warning: Woodpecker not ready at ${wp_server} — skipping token generation" >&2
+      return 1
+    fi
+    echo -n "."
+    sleep 2
+  done
+  echo " ready"
+
+  # Flow: Forgejo web login → OAuth2 authorize → Woodpecker callback → token
+  local cookie_jar auth_body_file
+  cookie_jar=$(mktemp /tmp/wp-auth-XXXXXX)
+  auth_body_file=$(mktemp /tmp/wp-body-XXXXXX)
+
+  # Step 1: Log into Forgejo web UI (session cookie needed for OAuth consent)
+  local csrf
+  csrf=$(curl -sf -c "$cookie_jar" "${forge_url}/user/login" 2>/dev/null \
+    | grep -o 'name="_csrf"[^>]*' | head -1 \
+    | grep -oE '(content|value)="[^"]*"' | head -1 \
+    | cut -d'"' -f2) || csrf=""
+
+  if [ -z "$csrf" ]; then
+    echo "Warning: could not get Forgejo CSRF token — skipping token generation" >&2
+    rm -f "$cookie_jar" "$auth_body_file"
+    return 1
+  fi
+
+  curl -sf -b "$cookie_jar" -c "$cookie_jar" -X POST \
+    -o /dev/null \
+    "${forge_url}/user/login" \
+    --data-urlencode "_csrf=${csrf}" \
+    --data-urlencode "user_name=${admin_user}" \
+    --data-urlencode "password=${admin_pass}" \
+    2>/dev/null || true
+
+  # Step 2: Start Woodpecker OAuth2 flow (captures authorize URL with state param)
+  local wp_redir
+  wp_redir=$(curl -sf -o /dev/null -w '%{redirect_url}' \
+    "${wp_server}/authorize" 2>/dev/null) || wp_redir=""
+
+  if [ -z "$wp_redir" ]; then
+    echo "Warning: Woodpecker did not provide OAuth redirect — skipping token generation" >&2
+    rm -f "$cookie_jar" "$auth_body_file"
+    return 1
+  fi
+
+  # Rewrite internal Docker network URLs to host-accessible URLs.
+  # Handle both plain and URL-encoded forms of the internal hostnames.
+  local forge_url_enc wp_server_enc
+  forge_url_enc=$(printf '%s' "$forge_url" | sed 's|:|%3A|g; s|/|%2F|g')
+  wp_server_enc=$(printf '%s' "$wp_server" | sed 's|:|%3A|g; s|/|%2F|g')
+  wp_redir=$(printf '%s' "$wp_redir" \
+    | sed "s|http://forgejo:3000|${forge_url}|g" \
+    | sed "s|http%3A%2F%2Fforgejo%3A3000|${forge_url_enc}|g" \
+    | sed "s|http://woodpecker:8000|${wp_server}|g" \
+    | sed "s|http%3A%2F%2Fwoodpecker%3A8000|${wp_server_enc}|g")
+
+  # Step 3: Hit Forgejo OAuth authorize endpoint with session
+  # First time: shows consent page. Already approved: redirects with code.
+  local auth_headers redirect_loc auth_code
+  auth_headers=$(curl -sf -b "$cookie_jar" -c "$cookie_jar" \
+    -D - -o "$auth_body_file" \
+    "$wp_redir" 2>/dev/null) || auth_headers=""
+
+  redirect_loc=$(printf '%s' "$auth_headers" \
+    | grep -i '^location:' | head -1 | tr -d '\r' | awk '{print $2}')
+
+  if printf '%s' "${redirect_loc:-}" | grep -q 'code='; then
+    # Auto-approved: extract code from redirect
+    auth_code=$(printf '%s' "$redirect_loc" | sed 's/.*code=\([^&]*\).*/\1/')
+  else
+    # Consent page: extract CSRF and all form fields, POST grant approval
+    local consent_csrf form_client_id form_state form_redirect_uri
+    consent_csrf=$(grep -o 'name="_csrf"[^>]*' "$auth_body_file" 2>/dev/null \
+      | head -1 | grep -oE '(content|value)="[^"]*"' | head -1 \
+      | cut -d'"' -f2) || consent_csrf=""
+    form_client_id=$(grep 'name="client_id"' "$auth_body_file" 2>/dev/null \
+      | grep -oE 'value="[^"]*"' | cut -d'"' -f2) || form_client_id=""
+    form_state=$(grep 'name="state"' "$auth_body_file" 2>/dev/null \
+      | grep -oE 'value="[^"]*"' | cut -d'"' -f2) || form_state=""
+    form_redirect_uri=$(grep 'name="redirect_uri"' "$auth_body_file" 2>/dev/null \
+      | grep -oE 'value="[^"]*"' | cut -d'"' -f2) || form_redirect_uri=""
+
+    if [ -n "$consent_csrf" ]; then
+      local grant_headers
+      grant_headers=$(curl -sf -b "$cookie_jar" -c "$cookie_jar" \
+        -D - -o /dev/null -X POST \
+        "${forge_url}/login/oauth/grant" \
+        --data-urlencode "_csrf=${consent_csrf}" \
+        --data-urlencode "client_id=${form_client_id}" \
+        --data-urlencode "state=${form_state}" \
+        --data-urlencode "scope=" \
+        --data-urlencode "nonce=" \
+        --data-urlencode "redirect_uri=${form_redirect_uri}" \
+        --data-urlencode "granted=true" \
+        2>/dev/null) || grant_headers=""
+
+      redirect_loc=$(printf '%s' "$grant_headers" \
+        | grep -i '^location:' | head -1 | tr -d '\r' | awk '{print $2}')
+
+      if printf '%s' "${redirect_loc:-}" | grep -q 'code='; then
+        auth_code=$(printf '%s' "$redirect_loc" | sed 's/.*code=\([^&]*\).*/\1/')
+      fi
+    fi
+  fi
+
+  rm -f "$auth_body_file"
+
+  if [ -z "${auth_code:-}" ]; then
+    echo "Warning: could not obtain OAuth2 authorization code — skipping token generation" >&2
+    rm -f "$cookie_jar"
+    return 1
+  fi
+
+  # Step 4: Complete Woodpecker OAuth callback (exchanges code for session)
+  local state
+  state=$(printf '%s' "$wp_redir" | sed -n 's/.*[&?]state=\([^&]*\).*/\1/p')
+
+  local wp_headers wp_token
+  wp_headers=$(curl -sf -c "$cookie_jar" \
+    -D - -o /dev/null \
+    "${wp_server}/authorize?code=${auth_code}&state=${state:-}" \
+    2>/dev/null) || wp_headers=""
+
+  # Extract token from redirect URL (Woodpecker returns ?access_token=...)
+  redirect_loc=$(printf '%s' "$wp_headers" \
+    | grep -i '^location:' | head -1 | tr -d '\r' | awk '{print $2}')
+
+  wp_token=""
+  if printf '%s' "${redirect_loc:-}" | grep -q 'access_token='; then
+    wp_token=$(printf '%s' "$redirect_loc" | sed 's/.*access_token=\([^&]*\).*/\1/')
+  fi
+
+  # Fallback: check for user_sess cookie
+  if [ -z "$wp_token" ]; then
+    wp_token=$(awk '/user_sess/{print $NF}' "$cookie_jar" 2>/dev/null) || wp_token=""
+  fi
+
+  rm -f "$cookie_jar"
+
+  if [ -z "$wp_token" ]; then
+    echo "Warning: could not obtain Woodpecker token — skipping token generation" >&2
+    return 1
+  fi
+
+  # Step 5: Create persistent personal access token via Woodpecker API
+  # WP v3 requires CSRF header for POST operations with session tokens.
+  local wp_csrf
+  wp_csrf=$(curl -sf -b "user_sess=${wp_token}" \
+    "${wp_server}/web-config.js" 2>/dev/null \
+    | sed -n 's/.*WOODPECKER_CSRF = "\([^"]*\)".*/\1/p') || wp_csrf=""
+
+  local pat_resp final_token
+  pat_resp=$(curl -sf -X POST \
+    -b "user_sess=${wp_token}" \
+    ${wp_csrf:+-H "X-CSRF-Token: ${wp_csrf}"} \
+    "${wp_server}/api/user/token" \
+    2>/dev/null) || pat_resp=""
+
+  final_token=""
+  if [ -n "$pat_resp" ]; then
+    final_token=$(printf '%s' "$pat_resp" \
+      | jq -r 'if .token then .token elif .access_token then .access_token else empty end' \
+      2>/dev/null) || final_token=""
+  fi
+
+  # Use persistent token if available, otherwise use session token
+  final_token="${final_token:-$wp_token}"
+
+  # Save to .env
+  if grep -q '^WOODPECKER_TOKEN=' "$env_file" 2>/dev/null; then
+    sed -i "s|^WOODPECKER_TOKEN=.*|WOODPECKER_TOKEN=${final_token}|" "$env_file"
+  else
+    printf 'WOODPECKER_TOKEN=%s\n' "$final_token" >> "$env_file"
+  fi
+  export WOODPECKER_TOKEN="$final_token"
+  echo "Config:  WOODPECKER_TOKEN generated and saved to .env"
+}
+
+# Activate a repo in Woodpecker CI.
+# Usage: activate_woodpecker_repo <forge_repo>
+_activate_woodpecker_repo_impl() {
+  local forge_repo="$1"
+  local wp_server="${WOODPECKER_SERVER:-http://localhost:8000}"
+
+  # Wait for Woodpecker to become ready after stack start
+  local retries=0
+  while [ $retries -lt 10 ]; do
+    if curl -sf --max-time 3 "${wp_server}/api/version" >/dev/null 2>&1; then
+      break
+    fi
+    retries=$((retries + 1))
+    sleep 2
+  done
+
+  if ! curl -sf --max-time 5 "${wp_server}/api/version" >/dev/null 2>&1; then
+    echo "Woodpecker: not reachable at ${wp_server} after stack start, skipping repo activation" >&2
+    return
+  fi
+
+  echo ""
+  echo "── Woodpecker repo activation ─────────────────────────"
+
+  local wp_token="${WOODPECKER_TOKEN:-}"
+  if [ -z "$wp_token" ]; then
+    echo "Warning: WOODPECKER_TOKEN not set — cannot activate repo" >&2
+    echo "  Activate manually: woodpecker-cli repo add ${forge_repo}" >&2
+    return
+  fi
+
+  local wp_repo_id
+  wp_repo_id=$(curl -sf \
+    -H "Authorization: Bearer ${wp_token}" \
+    "${wp_server}/api/repos/lookup/${forge_repo}" 2>/dev/null \
+    | jq -r '.id // empty' 2>/dev/null) || true
+
+  if [ -n "$wp_repo_id" ] && [ "$wp_repo_id" != "0" ]; then
+    echo "Repo:    ${forge_repo} already active in Woodpecker (id=${wp_repo_id})"
+  else
+    # Get Forgejo repo numeric ID for WP activation
+    local forge_repo_id
+    forge_repo_id=$(curl -sf \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${FORGE_URL:-http://localhost:3000}/api/v1/repos/${forge_repo}" 2>/dev/null \
+      | jq -r '.id // empty' 2>/dev/null) || forge_repo_id=""
+
+    local activate_resp
+    activate_resp=$(curl -sf -X POST \
+      -H "Authorization: Bearer ${wp_token}" \
+      "${wp_server}/api/repos?forge_remote_id=${forge_repo_id:-0}" \
+      2>/dev/null) || activate_resp=""
+
+    wp_repo_id=$(printf '%s' "$activate_resp" | jq -r '.id // empty' 2>/dev/null) || true
+
+    if [ -n "$wp_repo_id" ] && [ "$wp_repo_id" != "0" ]; then
+      echo "Repo:    ${forge_repo} activated in Woodpecker (id=${wp_repo_id})"
+
+      # Set pipeline timeout to 5 minutes (default is 60)
+      if curl -sf -X PATCH \
+        -H "Authorization: Bearer ${wp_token}" \
+        -H "Content-Type: application/json" \
+        "${wp_server}/api/repos/${wp_repo_id}" \
+        -d '{"timeout": 5}' >/dev/null 2>&1; then
+        echo "Config:  pipeline timeout set to 5 minutes"
+      fi
+    else
+      echo "Warning: could not activate repo in Woodpecker" >&2
+      echo "  Activate manually: woodpecker-cli repo add ${forge_repo}" >&2
+    fi
+  fi
+
+  # Store repo ID for later TOML generation
+  if [ -n "$wp_repo_id" ] && [ "$wp_repo_id" != "0" ]; then
+    _WP_REPO_ID="$wp_repo_id"
+  fi
+}
--- a/lib/claude-config.sh
+++ b/lib/claude-config.sh
@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# lib/claude-config.sh — Shared Claude config directory helpers (#641)
+#
+# Provides setup_claude_config_dir() for creating/migrating CLAUDE_CONFIG_DIR
+# and _env_set_idempotent() for writing env vars to .env files.
+#
+# Requires: CLAUDE_CONFIG_DIR, CLAUDE_SHARED_DIR (set by lib/env.sh)
+
+# Idempotent .env writer.
+# Usage: _env_set_idempotent KEY VALUE FILE
+_env_set_idempotent() {
+  local key="$1" value="$2" file="$3"
+  if grep -q "^${key}=" "$file" 2>/dev/null; then
+    local existing
+    existing=$(grep "^${key}=" "$file" | head -1 | cut -d= -f2-)
+    if [ "$existing" != "$value" ]; then
+      sed -i "s|^${key}=.*|${key}=${value}|" "$file"
+    fi
+  else
+    printf '%s=%s\n' "$key" "$value" >> "$file"
+  fi
+}
+
+# Create the shared CLAUDE_CONFIG_DIR, optionally migrating ~/.claude.
+# Usage: setup_claude_config_dir [auto_yes]
+setup_claude_config_dir() {
+  local auto_yes="${1:-false}"
+  local home_claude="${HOME}/.claude"
+
+  # Create the shared config directory (idempotent)
+  install -d -m 0700 -o "$USER" "$CLAUDE_CONFIG_DIR"
+  echo "Claude:  ${CLAUDE_CONFIG_DIR} (ready)"
+
+  # If ~/.claude is already a symlink to CLAUDE_CONFIG_DIR, nothing to do
+  if [ -L "$home_claude" ]; then
+    local link_target
+    link_target=$(readlink -f "$home_claude")
+    local config_real
+    config_real=$(readlink -f "$CLAUDE_CONFIG_DIR")
+    if [ "$link_target" = "$config_real" ]; then
+      echo "Claude:  ${home_claude} -> ${CLAUDE_CONFIG_DIR} (symlink OK)"
+      return 0
+    fi
+  fi
+
+  local home_exists=false home_nonempty=false
+  local config_nonempty=false
+
+  # Check ~/.claude (skip if it's a symlink — already handled above)
+  if [ -d "$home_claude" ] && [ ! -L "$home_claude" ]; then
+    home_exists=true
+    if [ -n "$(ls -A "$home_claude" 2>/dev/null)" ]; then
+      home_nonempty=true
+    fi
+  fi
+
+  # Check CLAUDE_CONFIG_DIR contents
+  if [ -n "$(ls -A "$CLAUDE_CONFIG_DIR" 2>/dev/null)" ]; then
+    config_nonempty=true
+  fi
+
+  # Case: both non-empty — abort, operator must reconcile
+  if [ "$home_nonempty" = true ] && [ "$config_nonempty" = true ]; then
+    echo "ERROR: both ${home_claude} and ${CLAUDE_CONFIG_DIR} exist and are non-empty" >&2
+    echo "  Reconcile manually: merge or remove one, then re-run disinto init" >&2
+    return 1
+  fi
+
+  # Case: ~/.claude exists and CLAUDE_CONFIG_DIR is empty — offer migration
+  if [ "$home_nonempty" = true ] && [ "$config_nonempty" = false ]; then
+    local do_migrate=false
+    if [ "$auto_yes" = true ]; then
+      do_migrate=true
+    elif [ -t 0 ]; then
+      read -rp "Migrate ${home_claude} to ${CLAUDE_CONFIG_DIR}? [Y/n] " confirm
+      if [[ ! "$confirm" =~ ^[Nn] ]]; then
+        do_migrate=true
+      fi
+    else
+      echo "Warning: ${home_claude} exists but cannot prompt for migration (no TTY)" >&2
+      echo "  Re-run with --yes to auto-migrate, or move files manually" >&2
+      return 0
+    fi
+
+    if [ "$do_migrate" = true ]; then
+      # Move contents (not the dir itself) to preserve CLAUDE_CONFIG_DIR ownership
+      cp -a "$home_claude/." "$CLAUDE_CONFIG_DIR/"
+      rm -rf "$home_claude"
+      ln -sfn "$CLAUDE_CONFIG_DIR" "$home_claude"
+      echo "Claude:  migrated ${home_claude} -> ${CLAUDE_CONFIG_DIR}"
+      return 0
+    fi
+  fi
+
+  # Case: ~/.claude exists but is empty, or doesn't exist — create symlink
+  if [ "$home_exists" = true ] && [ "$home_nonempty" = false ]; then
+    rmdir "$home_claude" 2>/dev/null || true
+  fi
+  if [ ! -e "$home_claude" ]; then
+    ln -sfn "$CLAUDE_CONFIG_DIR" "$home_claude"
+    echo "Claude:  ${home_claude} -> ${CLAUDE_CONFIG_DIR} (symlink created)"
+  fi
+}
--- a/lib/env.sh
+++ b/lib/env.sh
@ -1,92 +1,117 @@
 #!/usr/bin/env bash
+# =============================================================================
 # env.sh — Load environment and shared utilities
 # Source this at the top of every script: source "$(dirname "$0")/lib/env.sh"
+#
+# SURFACE CONTRACT
+#
+# Required preconditions — the entrypoint (or caller) MUST set these before
+# sourcing this file:
+#   USER              — OS user name (e.g. "agent", "johba")
+#   HOME              — home directory (e.g. "/home/agent")
+#
+# Required when PROJECT_TOML is set (i.e. agent scripts loading a project):
+#   PROJECT_REPO_ROOT — absolute path to the project git clone
+#   PRIMARY_BRANCH    — default branch name (e.g. "main")
+#   OPS_REPO_ROOT     — absolute path to the ops repo clone
+#   (these are normally populated by load-project.sh from the TOML)
+#
+# What this file sets / exports:
+#   FACTORY_ROOT, DISINTO_LOG_DIR
+#   .env / .env.enc secrets (FORGE_TOKEN, etc.)
+#   FORGE_API, FORGE_WEB, TEA_LOGIN, FORGE_OPS_REPO (derived from FORGE_URL/FORGE_REPO)
+#   Per-agent tokens (FORGE_REVIEW_TOKEN, FORGE_GARDENER_TOKEN, …)
+#   CLAUDE_SHARED_DIR, CLAUDE_CONFIG_DIR
+#   Helper functions: log(), validate_url(), forge_api(), forge_api_all(),
+#     woodpecker_api(), wpdb(), memory_guard()
+# =============================================================================

 set -euo pipefail

 # Resolve script root (parent of lib/)
 FACTORY_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

+# ── Precondition assertions ──────────────────────────────────────────────────
+# These must be set by the entrypoint before sourcing this file.
+: "${USER:?must be set by entrypoint before sourcing lib/env.sh}"
+: "${HOME:?must be set by entrypoint before sourcing lib/env.sh}"
+
 # Container detection: when running inside the agent container, DISINTO_CONTAINER
 # is set by docker-compose.yml.  Adjust paths so phase files, logs, and thread
 # maps land on the persistent volume instead of /tmp (which is ephemeral).
 if [ "${DISINTO_CONTAINER:-}" = "1" ]; then
  DISINTO_DATA_DIR="${HOME}/data"
  DISINTO_LOG_DIR="${DISINTO_DATA_DIR}/logs"
-  mkdir -p "${DISINTO_DATA_DIR}" "${DISINTO_LOG_DIR}"/{dev,action,review,supervisor,vault,site,metrics}
+  mkdir -p "${DISINTO_DATA_DIR}" "${DISINTO_LOG_DIR}"/{dev,action,review,supervisor,vault,site,metrics,gardener,planner,predictor,architect,dispatcher}
 else
  DISINTO_LOG_DIR="${FACTORY_ROOT}"
 fi
 export DISINTO_LOG_DIR

 # Load secrets: prefer .env.enc (SOPS-encrypted), fall back to plaintext .env.
-# Always source .env — cron jobs inside the container do NOT inherit compose
-# env vars (FORGE_TOKEN, etc.). Compose-injected vars (like FORGE_URL) are
-# already set and won't be clobbered since env.sh uses ${VAR:-default} patterns
-# for derived values. FORGE_URL from .env (localhost:3000) is overridden below
-# by the compose-injected value when running via docker exec.
-if [ -f "$FACTORY_ROOT/.env.enc" ] && command -v sops &>/dev/null; then
-  set -a
-  _saved_forge_url="${FORGE_URL:-}"
-  _saved_forge_token="${FORGE_TOKEN:-}"
-  # Use temp file + validate dotenv format before sourcing (avoids eval injection)
-  # SOPS -d automatically verifies MAC/GCM authentication tag during decryption
-  _tmpenv=$(mktemp) || { echo "Error: failed to create temp file for .env.enc" >&2; exit 1; }
-  if ! sops -d --output-type dotenv "$FACTORY_ROOT/.env.enc" > "$_tmpenv" 2>/dev/null; then
-    echo "Error: failed to decrypt .env.enc — decryption failed, possible corruption" >&2
+# Inside containers (DISINTO_CONTAINER=1), compose environment is the source of truth.
+# On bare metal, .env/.env.enc is sourced to provide default values.
+if [ "${DISINTO_CONTAINER:-}" != "1" ]; then
+  if [ -f "$FACTORY_ROOT/.env.enc" ] && command -v sops &>/dev/null; then
+    set -a
+    _saved_forge_url="${FORGE_URL:-}"
+    # Use temp file + validate dotenv format before sourcing (avoids eval injection)
+    # SOPS -d automatically verifies MAC/GCM authentication tag during decryption
+    _tmpenv=$(mktemp) || { echo "Error: failed to create temp file for .env.enc" >&2; exit 1; }
+    if ! sops -d --output-type dotenv "$FACTORY_ROOT/.env.enc" > "$_tmpenv" 2>/dev/null; then
+      echo "Error: failed to decrypt .env.enc — decryption failed, possible corruption" >&2
+      rm -f "$_tmpenv"
+      exit 1
+    fi
+    # Validate: non-empty, non-comment lines must match KEY=value pattern
+    # Filter out blank lines and comments before validation
+    _validated=$(grep -E '^[A-Za-z_][A-Za-z0-9_]*=' "$_tmpenv" 2>/dev/null || true)
+    if [ -n "$_validated" ]; then
+      # Write validated content to a second temp file and source it
+      _validated_env=$(mktemp)
+      printf '%s\n' "$_validated" > "$_validated_env"
+      # shellcheck source=/dev/null
+      source "$_validated_env"
+      rm -f "$_validated_env"
+    else
+      echo "Error: .env.enc decryption output failed format validation" >&2
+      rm -f "$_tmpenv"
+      exit 1
+    fi
    rm -f "$_tmpenv"
-    exit 1
-  fi
-  # Validate: non-empty, non-comment lines must match KEY=value pattern
-  # Filter out blank lines and comments before validation
-  _validated=$(grep -E '^[A-Za-z_][A-Za-z0-9_]*=' "$_tmpenv" 2>/dev/null || true)
-  if [ -n "$_validated" ]; then
-    # Write validated content to a second temp file and source it
-    _validated_env=$(mktemp)
-    printf '%s\n' "$_validated" > "$_validated_env"
+    set +a
+    [ -n "$_saved_forge_url" ] && export FORGE_URL="$_saved_forge_url"
+  elif [ -f "$FACTORY_ROOT/.env" ]; then
+    # Preserve compose-injected FORGE_URL (localhost in .env != forgejo in Docker)
+    _saved_forge_url="${FORGE_URL:-}"
+    set -a
    # shellcheck source=/dev/null
-    source "$_validated_env"
-    rm -f "$_validated_env"
-  else
-    echo "Error: .env.enc decryption output failed format validation" >&2
-    rm -f "$_tmpenv"
-    exit 1
+    source "$FACTORY_ROOT/.env"
+    set +a
+    [ -n "$_saved_forge_url" ] && export FORGE_URL="$_saved_forge_url"
  fi
-  rm -f "$_tmpenv"
-  set +a
-  [ -n "$_saved_forge_url" ] && export FORGE_URL="$_saved_forge_url"
-  [ -n "$_saved_forge_token" ] && export FORGE_TOKEN="$_saved_forge_token"
-elif [ -f "$FACTORY_ROOT/.env" ]; then
-  # Preserve compose-injected FORGE_URL (localhost in .env != forgejo in Docker)
-  _saved_forge_url="${FORGE_URL:-}"
-  _saved_forge_token="${FORGE_TOKEN:-}"
-  set -a
-  # shellcheck source=/dev/null
-  source "$FACTORY_ROOT/.env"
-  set +a
-  [ -n "$_saved_forge_url" ] && export FORGE_URL="$_saved_forge_url"
-  [ -n "$_saved_forge_token" ] && export FORGE_TOKEN="$_saved_forge_token"
+fi
+
+# Allow per-container token override (#375): .env sets the default FORGE_TOKEN
+# (dev-bot), then FORGE_TOKEN_OVERRIDE replaces it for containers that need a
+# different Forgejo identity (e.g. dev-qwen).
+if [ -n "${FORGE_TOKEN_OVERRIDE:-}" ]; then
+  export FORGE_TOKEN="$FORGE_TOKEN_OVERRIDE"
 fi

 # PATH: foundry, node, system
 export PATH="${HOME}/.local/bin:${HOME}/.foundry/bin:${HOME}/.nvm/versions/node/v22.20.0/bin:/usr/local/bin:/usr/bin:/bin:${PATH}"
-export HOME="${HOME:-/home/debian}"

 # Load project TOML if PROJECT_TOML is set (by poll scripts that accept project arg)
 if [ -n "${PROJECT_TOML:-}" ] && [ -f "$PROJECT_TOML" ]; then
  source "${FACTORY_ROOT}/lib/load-project.sh" "$PROJECT_TOML"
 fi

-# Forge token: new FORGE_TOKEN > legacy CODEBERG_TOKEN
-if [ -z "${FORGE_TOKEN:-}" ]; then
-  FORGE_TOKEN="${CODEBERG_TOKEN:-}"
-fi
-export FORGE_TOKEN
-export CODEBERG_TOKEN="${FORGE_TOKEN}"  # backwards compat
+# Forge token
+export FORGE_TOKEN="${FORGE_TOKEN:-}"

-# Review bot token: FORGE_REVIEW_TOKEN > legacy REVIEW_BOT_TOKEN
+# Review bot token
 export FORGE_REVIEW_TOKEN="${FORGE_REVIEW_TOKEN:-${REVIEW_BOT_TOKEN:-}}"
-export REVIEW_BOT_TOKEN="${FORGE_REVIEW_TOKEN}"  # backwards compat

 # Per-agent tokens (#747): each agent gets its own Forgejo identity.
 # Falls back to FORGE_TOKEN for backwards compat with single-token setups.
@ -97,18 +122,15 @@ export FORGE_SUPERVISOR_TOKEN="${FORGE_SUPERVISOR_TOKEN:-${FORGE_TOKEN}}"
 export FORGE_PREDICTOR_TOKEN="${FORGE_PREDICTOR_TOKEN:-${FORGE_TOKEN}}"
 export FORGE_ARCHITECT_TOKEN="${FORGE_ARCHITECT_TOKEN:-${FORGE_TOKEN}}"

-# Bot usernames filter: FORGE_BOT_USERNAMES > legacy CODEBERG_BOT_USERNAMES
-export FORGE_BOT_USERNAMES="${FORGE_BOT_USERNAMES:-${CODEBERG_BOT_USERNAMES:-dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,supervisor-bot,predictor-bot,architect-bot}}"
-export CODEBERG_BOT_USERNAMES="${FORGE_BOT_USERNAMES}"  # backwards compat
+# Bot usernames filter
+export FORGE_BOT_USERNAMES="${FORGE_BOT_USERNAMES:-dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,supervisor-bot,predictor-bot,architect-bot}"

-# Project config (FORGE_* preferred, CODEBERG_* fallback)
-export FORGE_REPO="${FORGE_REPO:-${CODEBERG_REPO:-}}"
-export CODEBERG_REPO="${FORGE_REPO}"  # backwards compat
+# Project config
+export FORGE_REPO="${FORGE_REPO:-}"
 export FORGE_URL="${FORGE_URL:-http://localhost:3000}"
-export FORGE_API="${FORGE_API:-${FORGE_URL}/api/v1/repos/${FORGE_REPO}}"
+export FORGE_API_BASE="${FORGE_API_BASE:-${FORGE_URL}/api/v1}"
+export FORGE_API="${FORGE_API:-${FORGE_API_BASE}/repos/${FORGE_REPO}}"
 export FORGE_WEB="${FORGE_WEB:-${FORGE_URL}/${FORGE_REPO}}"
-export CODEBERG_API="${FORGE_API}"  # backwards compat
-export CODEBERG_WEB="${FORGE_WEB}"  # backwards compat
 # tea CLI login name: derived from FORGE_URL (codeberg vs local forgejo)
 if [ -z "${TEA_LOGIN:-}" ]; then
  case "${FORGE_URL}" in
@ -119,12 +141,14 @@ fi
 export TEA_LOGIN

 export PROJECT_NAME="${PROJECT_NAME:-${FORGE_REPO##*/}}"
-export PROJECT_REPO_ROOT="${PROJECT_REPO_ROOT:-/home/${USER}/${PROJECT_NAME}}"
-export PRIMARY_BRANCH="${PRIMARY_BRANCH:-master}"

-# Ops repo: operational data (vault items, journals, evidence, prerequisites).
-# Default convention: sibling directory named {project}-ops.
-export OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/${USER}/${PROJECT_NAME}-ops}"
+# Project-specific paths: no guessing from USER/HOME — must be set by
+# the entrypoint or loaded from PROJECT_TOML (via load-project.sh above).
+if [ -n "${PROJECT_TOML:-}" ]; then
+  : "${PROJECT_REPO_ROOT:?must be set by entrypoint or PROJECT_TOML before sourcing lib/env.sh}"
+  : "${PRIMARY_BRANCH:?must be set by entrypoint or PROJECT_TOML before sourcing lib/env.sh}"
+  : "${OPS_REPO_ROOT:?must be set by entrypoint or PROJECT_TOML before sourcing lib/env.sh}"
+fi

 # Forge repo slug for the ops repo (used by agents that commit to ops).
 export FORGE_OPS_REPO="${FORGE_OPS_REPO:-${FORGE_REPO:+${FORGE_REPO}-ops}}"
@ -139,13 +163,24 @@ export CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-7200}"
 unset GITHUB_TOKEN 2>/dev/null || true
 unset CLAWHUB_TOKEN 2>/dev/null || true

+# Shared Claude config directory for cross-container OAuth lock coherence (#641).
+# All containers and the host resolve to the same CLAUDE_CONFIG_DIR on a shared
+# bind-mounted filesystem, so proper-lockfile's atomic mkdir works across them.
+: "${CLAUDE_SHARED_DIR:=/var/lib/disinto/claude-shared}"
+: "${CLAUDE_CONFIG_DIR:=${CLAUDE_SHARED_DIR}/config}"
+export CLAUDE_SHARED_DIR CLAUDE_CONFIG_DIR
+
 # Disable Claude Code auto-updater, telemetry, error reporting in factory sessions.
 # Factory processes must never phone home or auto-update mid-session (#725).
 export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1

 # Shared log helper
+# Usage: log "message"
+# Output: [2026-04-03T14:00:00Z] agent: message
+# Where agent is set via LOG_AGENT variable (defaults to caller's context)
 log() {
-  printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*"
+  local agent="${LOG_AGENT:-agent}"
+  printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*"
 }

 # =============================================================================
@ -209,8 +244,6 @@ forge_api() {
    -H "Content-Type: application/json" \
    "${FORGE_API}${path}" "$@"
 }
-# Backwards-compat alias
-codeberg_api() { forge_api "$@"; }

 # Paginate a Forge API GET endpoint and return all items as a merged JSON array.
 # Usage: forge_api_all /path             (no existing query params)
@ -227,7 +260,8 @@ forge_api_all() {
  page=1
  while true; do
    page_items=$(forge_api GET "${path_prefix}${sep}limit=50&page=${page}")
-    count=$(printf '%s' "$page_items" | jq 'length')
+    count=$(printf '%s' "$page_items" | jq 'length' 2>/dev/null) || count=0
+    [ -z "$count" ] && count=0
    [ "$count" -eq 0 ] && break
    all_items=$(printf '%s\n%s' "$all_items" "$page_items" | jq -s 'add')
    [ "$count" -lt 50 ] && break
@ -253,13 +287,13 @@ woodpecker_api() {
  fi

  curl -sfL \
-    -H "Authorization: Bearer ${WOODPECKER_TOKEN}" \
-    "${WOODPECKER_SERVER}/api${path}" "$@"
+    -H "Authorization: Bearer ${WOODPECKER_TOKEN:-}" \
+    "${WOODPECKER_SERVER:-}/api${path}" "$@"
 }

 # Woodpecker DB query helper
 wpdb() {
-  PGPASSWORD="${WOODPECKER_DB_PASSWORD}" psql \
+  PGPASSWORD="${WOODPECKER_DB_PASSWORD:-}" psql \
    -U "${WOODPECKER_DB_USER:-woodpecker}" \
    -h "${WOODPECKER_DB_HOST:-127.0.0.1}" \
    -d "${WOODPECKER_DB_NAME:-woodpecker}" \
--- a/lib/file-action-issue.sh
+++ b/lib/file-action-issue.sh
@ -1,59 +0,0 @@
-#!/usr/bin/env bash
-# file-action-issue.sh — File an action issue for a formula run
-#
-# Usage: source this file, then call file_action_issue.
-# Requires: forge_api() from lib/env.sh, jq, lib/secret-scan.sh
-#
-# file_action_issue <formula_name> <title> <body>
-#   Sets FILED_ISSUE_NUM on success.
-#   Returns: 0=created, 1=duplicate exists, 2=label not found, 3=API error, 4=secrets detected
-
-# Load secret scanner
-# shellcheck source=secret-scan.sh
-source "$(dirname "${BASH_SOURCE[0]}")/secret-scan.sh"
-
-file_action_issue() {
-  local formula_name="$1" title="$2" body="$3"
-  FILED_ISSUE_NUM=""
-
-  # Secret scan: reject issue bodies containing embedded secrets
-  if ! scan_for_secrets "$body"; then
-    echo "file-action-issue: BLOCKED — issue body for '${formula_name}' contains potential secrets. Use env var references instead." >&2
-    return 4
-  fi
-
-  # Dedup: skip if an open action issue for this formula already exists
-  local open_actions
-  open_actions=$(forge_api_all "/issues?state=open&type=issues&labels=action" 2>/dev/null || true)
-  if [ -n "$open_actions" ] && [ "$open_actions" != "null" ]; then
-    local existing
-    existing=$(printf '%s' "$open_actions" | \
-      jq --arg f "$formula_name" '[.[] | select(.title | test($f))] | length' 2>/dev/null || echo 0)
-    if [ "${existing:-0}" -gt 0 ]; then
-      return 1
-    fi
-  fi
-
-  # Fetch 'action' label ID
-  local action_label_id
-  action_label_id=$(forge_api GET "/labels" 2>/dev/null | \
-    jq -r '.[] | select(.name == "action") | .id' 2>/dev/null || true)
-  if [ -z "$action_label_id" ]; then
-    return 2
-  fi
-
-  # Create the issue
-  local payload result
-  payload=$(jq -nc \
-    --arg title "$title" \
-    --arg body "$body" \
-    --argjson labels "[$action_label_id]" \
-    '{title: $title, body: $body, labels: $labels}')
-
-  result=$(forge_api POST "/issues" -d "$payload" 2>/dev/null || true)
-  FILED_ISSUE_NUM=$(printf '%s' "$result" | jq -r '.number // empty' 2>/dev/null || true)
-
-  if [ -z "$FILED_ISSUE_NUM" ]; then
-    return 3
-  fi
-}
--- a/lib/forge-push.sh
+++ b/lib/forge-push.sh
@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# =============================================================================
+# forge-push.sh — push_to_forge() function
+#
+# Handles pushing a local clone to the Forgejo remote and verifying the push.
+#
+# Globals expected:
+#   FORGE_URL    - Forge instance URL (e.g. http://localhost:3000)
+#   FORGE_TOKEN  - API token for Forge operations (used for API verification)
+#   FACTORY_ROOT - Root of the disinto factory
+#   PRIMARY_BRANCH - Primary branch name (e.g. main)
+#
+# Usage:
+#   source "${FACTORY_ROOT}/lib/forge-push.sh"
+#   push_to_forge <repo_root> <forge_url> <repo_slug>
+# =============================================================================
+set -euo pipefail
+
+# Assert required globals are set before using this module.
+_assert_forge_push_globals() {
+  local missing=()
+  [ -z "${FORGE_URL:-}" ]      && missing+=("FORGE_URL")
+  [ -z "${FORGE_TOKEN:-}" ]    && missing+=("FORGE_TOKEN")
+  [ -z "${FACTORY_ROOT:-}" ]   && missing+=("FACTORY_ROOT")
+  [ -z "${PRIMARY_BRANCH:-}" ] && missing+=("PRIMARY_BRANCH")
+  if [ "${#missing[@]}" -gt 0 ]; then
+    echo "Error: forge-push.sh requires these globals to be set: ${missing[*]}" >&2
+    exit 1
+  fi
+}
+
+# Push local clone to the Forgejo remote.
+push_to_forge() {
+  local repo_root="$1" forge_url="$2" repo_slug="$3"
+
+  # Use clean URL — credential helper supplies auth (#604).
+  # Forgejo 11.x rejects API tokens for git HTTP push (#361); password auth works
+  # via the credential helper configured in configure_git_creds().
+  local remote_url="${forge_url}/${repo_slug}.git"
+  local display_url="$remote_url"
+
+  # Always set the remote URL to ensure credentials are current
+  if git -C "$repo_root" remote get-url forgejo >/dev/null 2>&1; then
+    git -C "$repo_root" remote set-url forgejo "$remote_url"
+  else
+    git -C "$repo_root" remote add forgejo "$remote_url"
+  fi
+  echo "Remote:  forgejo -> ${display_url}"
+
+  # Skip push if local repo has no commits (e.g. cloned from empty Forgejo repo)
+  if ! git -C "$repo_root" rev-parse HEAD >/dev/null 2>&1; then
+    echo "Push:    skipped (local repo has no commits)"
+    return 0
+  fi
+
+  # Push all branches and tags
+  echo "Pushing: branches to forgejo"
+  if ! git -C "$repo_root" push forgejo --all 2>&1; then
+    echo "Error: failed to push branches to Forgejo" >&2
+    return 1
+  fi
+  echo "Pushing: tags to forgejo"
+  if ! git -C "$repo_root" push forgejo --tags 2>&1; then
+    echo "Error: failed to push tags to Forgejo" >&2
+    return 1
+  fi
+
+  # Verify the repo is no longer empty (Forgejo may need a moment to index pushed refs)
+  local is_empty="true"
+  local verify_attempt
+  for verify_attempt in $(seq 1 5); do
+    local repo_info
+    repo_info=$(curl -sf --max-time 10 \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${forge_url}/api/v1/repos/${repo_slug}" 2>/dev/null) || repo_info=""
+    if [ -z "$repo_info" ]; then
+      is_empty="skipped"
+      break  # API unreachable, skip verification
+    fi
+    is_empty=$(printf '%s' "$repo_info" | jq -r '.empty // "unknown"')
+    if [ "$is_empty" != "true" ]; then
+      echo "Verify:  repo is not empty (push confirmed)"
+      break
+    fi
+    if [ "$verify_attempt" -lt 5 ]; then
+      sleep 2
+    fi
+  done
+  if [ "$is_empty" = "true" ]; then
+    echo "Warning: Forgejo repo still reports empty after push" >&2
+    return 1
+  fi
+}
--- a/lib/forge-setup.sh
+++ b/lib/forge-setup.sh
@ -0,0 +1,772 @@
+#!/usr/bin/env bash
+# =============================================================================
+# forge-setup.sh — setup_forge() and helpers for Forgejo provisioning
+#
+# Handles admin user creation, bot user creation, token generation,
+# password resets, repo creation, and collaborator setup.
+#
+# Globals expected (asserted by _load_init_context):
+#   FORGE_URL    - Forge instance URL (e.g. http://localhost:3000)
+#   FACTORY_ROOT - Root of the disinto factory
+#   PRIMARY_BRANCH - Primary branch name (e.g. main)
+#
+# Usage:
+#   source "${FACTORY_ROOT}/lib/forge-setup.sh"
+#   setup_forge <forge_url> <repo_slug>
+# =============================================================================
+set -euo pipefail
+
+# Assert required globals are set before using this module.
+_load_init_context() {
+  local missing=()
+  [ -z "${FORGE_URL:-}" ]    && missing+=("FORGE_URL")
+  [ -z "${FACTORY_ROOT:-}" ] && missing+=("FACTORY_ROOT")
+  [ -z "${PRIMARY_BRANCH:-}" ] && missing+=("PRIMARY_BRANCH")
+  if [ "${#missing[@]}" -gt 0 ]; then
+    echo "Error: forge-setup.sh requires these globals to be set: ${missing[*]}" >&2
+    exit 1
+  fi
+}
+
+# Execute a command in the Forgejo container (for admin operations)
+_forgejo_exec() {
+  local use_bare="${DISINTO_BARE:-false}"
+  if [ "$use_bare" = true ]; then
+    docker exec -u git disinto-forgejo "$@"
+  else
+    docker compose -f "${FACTORY_ROOT}/docker-compose.yml" exec -T -u git forgejo "$@"
+  fi
+}
+
+# Check if a token already exists in .env (for idempotency)
+# Returns 0 if token exists, 1 if it doesn't
+_token_exists_in_env() {
+  local token_var="$1"
+  local env_file="$2"
+  grep -q "^${token_var}=" "$env_file" 2>/dev/null
+}
+
+# Check if a password already exists in .env (for idempotency)
+# Returns 0 if password exists, 1 if it doesn't
+_pass_exists_in_env() {
+  local pass_var="$1"
+  local env_file="$2"
+  grep -q "^${pass_var}=" "$env_file" 2>/dev/null
+}
+
+# Provision or connect to a local Forgejo instance.
+# Creates admin + bot users, generates API tokens, stores in .env.
+# When $DISINTO_BARE is set, uses standalone docker run; otherwise uses compose.
+# Usage: setup_forge [--rotate-tokens] <forge_url> <repo_slug>
+setup_forge() {
+  local rotate_tokens=false
+  # Parse optional --rotate-tokens flag
+  if [ "$1" = "--rotate-tokens" ]; then
+    rotate_tokens=true
+    shift
+  fi
+  local forge_url="$1"
+  local repo_slug="$2"
+  local use_bare="${DISINTO_BARE:-false}"
+
+  echo ""
+  echo "── Forge setup ────────────────────────────────────────"
+
+  # Check if Forgejo is already running
+  if curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/version" >/dev/null 2>&1; then
+    echo "Forgejo:  ${forge_url} (already running)"
+  else
+    echo "Forgejo not reachable at ${forge_url}"
+    echo "Starting Forgejo via Docker..."
+
+    if ! command -v docker &>/dev/null; then
+      echo "Error: docker not found — needed to provision Forgejo" >&2
+      echo "  Install Docker or start Forgejo manually at ${forge_url}" >&2
+      exit 1
+    fi
+
+    # Extract port from forge_url
+    local forge_port
+    forge_port=$(printf '%s' "$forge_url" | sed -E 's|.*:([0-9]+)/?$|\1|')
+    forge_port="${forge_port:-3000}"
+
+    if [ "$use_bare" = true ]; then
+      # Bare-metal mode: standalone docker run
+      mkdir -p "${FORGEJO_DATA_DIR}"
+
+      if docker ps -a --format '{{.Names}}' | grep -q '^disinto-forgejo$'; then
+        docker start disinto-forgejo >/dev/null 2>&1 || true
+      else
+        docker run -d \
+          --name disinto-forgejo \
+          --restart unless-stopped \
+          -p "${forge_port}:3000" \
+          -p 2222:22 \
+          -v "${FORGEJO_DATA_DIR}:/data" \
+          -e "FORGEJO__database__DB_TYPE=sqlite3" \
+          -e "FORGEJO__server__ROOT_URL=${forge_url}/" \
+          -e "FORGEJO__server__HTTP_PORT=3000" \
+          -e "FORGEJO__service__DISABLE_REGISTRATION=true" \
+          codeberg.org/forgejo/forgejo:11.0
+      fi
+    else
+      # Compose mode: start Forgejo via docker compose
+      docker compose -f "${FACTORY_ROOT}/docker-compose.yml" up -d forgejo
+    fi
+
+    # Wait for Forgejo to become healthy
+    echo -n "Waiting for Forgejo to start"
+    local retries=0
+    while ! curl -sf --max-time 3 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/version" >/dev/null 2>&1; do
+      retries=$((retries + 1))
+      if [ "$retries" -gt 60 ]; then
+        echo ""
+        echo "Error: Forgejo did not become ready within 60s" >&2
+        exit 1
+      fi
+      echo -n "."
+      sleep 1
+    done
+    echo " ready"
+  fi
+
+  # Wait for Forgejo database to accept writes (API may be ready before DB is)
+  echo -n "Waiting for Forgejo database"
+  local db_ready=false
+  for _i in $(seq 1 30); do
+    if _forgejo_exec forgejo admin user list >/dev/null 2>&1; then
+      db_ready=true
+      break
+    fi
+    echo -n "."
+    sleep 1
+  done
+  echo ""
+  if [ "$db_ready" != true ]; then
+    echo "Error: Forgejo database not ready after 30s" >&2
+    exit 1
+  fi
+
+  # Create admin user if it doesn't exist
+  local admin_user="disinto-admin"
+  local admin_pass
+  local env_file="${FACTORY_ROOT}/.env"
+
+  # Re-read persisted admin password if available (#158)
+  if grep -q '^FORGE_ADMIN_PASS=' "$env_file" 2>/dev/null; then
+    admin_pass=$(grep '^FORGE_ADMIN_PASS=' "$env_file" | head -1 | cut -d= -f2-)
+  fi
+  # Generate a fresh password only when none was persisted
+  if [ -z "${admin_pass:-}" ]; then
+    admin_pass="admin-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+  fi
+
+  if ! curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/users/${admin_user}" >/dev/null 2>&1; then
+    echo "Creating admin user: ${admin_user}"
+    local create_output
+    if ! create_output=$(_forgejo_exec forgejo admin user create \
+      --admin \
+      --username "${admin_user}" \
+      --password "${admin_pass}" \
+      --email "admin@disinto.local" \
+      --must-change-password=false 2>&1); then
+      echo "Error: failed to create admin user '${admin_user}':" >&2
+      echo "  ${create_output}" >&2
+      exit 1
+    fi
+    # Forgejo 11.x ignores --must-change-password=false on create;
+    # explicitly clear the flag so basic-auth token creation works.
+    _forgejo_exec forgejo admin user change-password \
+      --username "${admin_user}" \
+      --password "${admin_pass}" \
+      --must-change-password=false
+
+    # Verify admin user was actually created
+    if ! curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/users/${admin_user}" >/dev/null 2>&1; then
+      echo "Error: admin user '${admin_user}' not found after creation" >&2
+      exit 1
+    fi
+
+    # Persist admin password to .env for idempotent re-runs (#158)
+    if grep -q '^FORGE_ADMIN_PASS=' "$env_file" 2>/dev/null; then
+      sed -i "s|^FORGE_ADMIN_PASS=.*|FORGE_ADMIN_PASS=${admin_pass}|" "$env_file"
+    else
+      printf 'FORGE_ADMIN_PASS=%s\n' "$admin_pass" >> "$env_file"
+    fi
+  else
+    echo "Admin user: ${admin_user} (already exists)"
+    # Only reset password if basic auth fails (#158, #267)
+    # Forgejo 11.x may ignore --must-change-password=false, blocking token creation
+    if ! curl -sf --max-time 5 -u "${admin_user}:${admin_pass}" \
+        "${forge_url}/api/v1/user" >/dev/null 2>&1; then
+      _forgejo_exec forgejo admin user change-password \
+        --username "${admin_user}" \
+        --password "${admin_pass}" \
+        --must-change-password=false
+    fi
+  fi
+  # Preserve password for Woodpecker OAuth2 token generation (#779)
+  _FORGE_ADMIN_PASS="$admin_pass"
+
+  # Create human user (disinto-admin) as site admin if it doesn't exist
+  local human_user="disinto-admin"
+  local human_pass
+  human_pass="admin-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+
+  if ! curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/users/${human_user}" >/dev/null 2>&1; then
+    echo "Creating human user: ${human_user}"
+    local create_output
+    if ! create_output=$(_forgejo_exec forgejo admin user create \
+      --admin \
+      --username "${human_user}" \
+      --password "${human_pass}" \
+      --email "admin@disinto.local" \
+      --must-change-password=false 2>&1); then
+      echo "Error: failed to create human user '${human_user}':" >&2
+      echo "  ${create_output}" >&2
+      exit 1
+    fi
+    # Forgejo 11.x ignores --must-change-password=false on create;
+    # explicitly clear the flag so basic-auth token creation works.
+    _forgejo_exec forgejo admin user change-password \
+      --username "${human_user}" \
+      --password "${human_pass}" \
+      --must-change-password=false
+
+    # Verify human user was actually created
+    if ! curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/users/${human_user}" >/dev/null 2>&1; then
+      echo "Error: human user '${human_user}' not found after creation" >&2
+      exit 1
+    fi
+    echo "  Human user '${human_user}' created as site admin"
+  else
+    echo "Human user: ${human_user} (already exists)"
+  fi
+
+  # Delete existing admin token if present (token sha1 is only returned at creation time)
+  local existing_token_id
+  existing_token_id=$(curl -sf \
+    -u "${admin_user}:${admin_pass}" \
+    "${forge_url}/api/v1/users/${admin_user}/tokens" 2>/dev/null \
+    | jq -r '.[] | select(.name == "disinto-admin-token") | .id') || existing_token_id=""
+  if [ -n "$existing_token_id" ]; then
+    curl -sf -X DELETE \
+      -u "${admin_user}:${admin_pass}" \
+      "${forge_url}/api/v1/users/${admin_user}/tokens/${existing_token_id}" >/dev/null 2>&1 || true
+  fi
+
+  # Create admin token (fresh, so sha1 is returned)
+  local admin_token
+  admin_token=$(curl -sf -X POST \
+    -u "${admin_user}:${admin_pass}" \
+    -H "Content-Type: application/json" \
+    "${forge_url}/api/v1/users/${admin_user}/tokens" \
+    -d '{"name":"disinto-admin-token","scopes":["all"]}' 2>/dev/null \
+    | jq -r '.sha1 // empty') || admin_token=""
+
+  if [ -z "$admin_token" ]; then
+    echo "Error: failed to obtain admin API token" >&2
+    exit 1
+  fi
+
+  # Get or create human user token
+  local human_token=""
+  # Delete existing human token if present (token sha1 is only returned at creation time)
+  local existing_human_token_id
+  existing_human_token_id=$(curl -sf \
+    -u "${human_user}:${human_pass}" \
+    "${forge_url}/api/v1/users/${human_user}/tokens" 2>/dev/null \
+    | jq -r '.[] | select(.name == "disinto-human-token") | .id') || existing_human_token_id=""
+  if [ -n "$existing_human_token_id" ]; then
+    curl -sf -X DELETE \
+      -u "${human_user}:${human_pass}" \
+      "${forge_url}/api/v1/users/${human_user}/tokens/${existing_human_token_id}" >/dev/null 2>&1 || true
+  fi
+
+  # Create human token (fresh, so sha1 is returned)
+  human_token=$(curl -sf -X POST \
+    -u "${human_user}:${human_pass}" \
+    -H "Content-Type: application/json" \
+    "${forge_url}/api/v1/users/${human_user}/tokens" \
+    -d '{"name":"disinto-human-token","scopes":["all"]}' 2>/dev/null \
+    | jq -r '.sha1 // empty') || human_token=""
+
+  if [ -n "$human_token" ]; then
+    # Store human token in .env
+    if grep -q '^HUMAN_TOKEN=' "$env_file" 2>/dev/null; then
+      sed -i "s|^HUMAN_TOKEN=.*|HUMAN_TOKEN=${human_token}|" "$env_file"
+    else
+      printf 'HUMAN_TOKEN=%s\n' "$human_token" >> "$env_file"
+    fi
+    export HUMAN_TOKEN="$human_token"
+    echo "  Human token saved (HUMAN_TOKEN)"
+  fi
+
+  # Create bot users and tokens
+  # Each agent gets its own Forgejo account for identity and audit trail (#747).
+  # Map: bot-username -> env-var-name for the token
+  local -A bot_token_vars=(
+    [dev-bot]="FORGE_TOKEN"
+    [review-bot]="FORGE_REVIEW_TOKEN"
+    [planner-bot]="FORGE_PLANNER_TOKEN"
+    [gardener-bot]="FORGE_GARDENER_TOKEN"
+    [vault-bot]="FORGE_VAULT_TOKEN"
+    [supervisor-bot]="FORGE_SUPERVISOR_TOKEN"
+    [predictor-bot]="FORGE_PREDICTOR_TOKEN"
+    [architect-bot]="FORGE_ARCHITECT_TOKEN"
+  )
+  # Map: bot-username -> env-var-name for the password
+  # Forgejo 11.x API tokens don't work for git HTTP push (#361).
+  # Store passwords so agents can use password auth for git operations.
+  local -A bot_pass_vars=(
+    [dev-bot]="FORGE_PASS"
+    [review-bot]="FORGE_REVIEW_PASS"
+    [planner-bot]="FORGE_PLANNER_PASS"
+    [gardener-bot]="FORGE_GARDENER_PASS"
+    [vault-bot]="FORGE_VAULT_PASS"
+    [supervisor-bot]="FORGE_SUPERVISOR_PASS"
+    [predictor-bot]="FORGE_PREDICTOR_PASS"
+    [architect-bot]="FORGE_ARCHITECT_PASS"
+  )
+  # Llama bot users (local-model agents) — separate from main agents
+  # Each llama agent gets its own Forgejo user, token, and password
+  local -A llama_token_vars=(
+    [dev-qwen]="FORGE_TOKEN_LLAMA"
+    [dev-qwen-nightly]="FORGE_TOKEN_LLAMA_NIGHTLY"
+  )
+  local -A llama_pass_vars=(
+    [dev-qwen]="FORGE_PASS_LLAMA"
+    [dev-qwen-nightly]="FORGE_PASS_LLAMA_NIGHTLY"
+  )
+
+  local bot_user bot_pass token token_var pass_var
+
+  for bot_user in dev-bot review-bot planner-bot gardener-bot vault-bot supervisor-bot predictor-bot architect-bot; do
+    token_var="${bot_token_vars[$bot_user]}"
+    pass_var="${bot_pass_vars[$bot_user]}"
+
+    # Check if token already exists in .env
+    local token_exists=false
+    if _token_exists_in_env "$token_var" "$env_file"; then
+      token_exists=true
+    fi
+
+    # Check if password already exists in .env
+    local pass_exists=false
+    if _pass_exists_in_env "$pass_var" "$env_file"; then
+      pass_exists=true
+    fi
+
+    # Check if bot user exists on Forgejo
+    local user_exists=false
+    if curl -sf --max-time 5 \
+      -H "Authorization: token ${admin_token}" \
+      "${forge_url}/api/v1/users/${bot_user}" >/dev/null 2>&1; then
+      user_exists=true
+    fi
+
+    # Skip token/password regeneration if both exist in .env and not forcing rotation
+    if [ "$token_exists" = true ] && [ "$pass_exists" = true ] && [ "$rotate_tokens" = false ]; then
+      echo "  ${bot_user} token and password preserved (use --rotate-tokens to force)"
+      # Still export the existing token for use within this run
+      local existing_token existing_pass
+      existing_token=$(grep "^${token_var}=" "$env_file" | head -1 | cut -d= -f2-)
+      existing_pass=$(grep "^${pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
+      export "${token_var}=${existing_token}"
+      export "${pass_var}=${existing_pass}"
+      continue
+    fi
+
+    # Generate new credentials if:
+    # - Token doesn't exist (first run)
+    # - Password doesn't exist (first run)
+    # - --rotate-tokens flag is set (explicit rotation)
+    if [ "$user_exists" = false ]; then
+      # User doesn't exist - create it
+      bot_pass="bot-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+      echo "Creating bot user: ${bot_user}"
+      local create_output
+      if ! create_output=$(_forgejo_exec forgejo admin user create \
+        --username "${bot_user}" \
+        --password "${bot_pass}" \
+        --email "${bot_user}@disinto.local" \
+        --must-change-password=false 2>&1); then
+        echo "Error: failed to create bot user '${bot_user}':" >&2
+        echo "  ${create_output}" >&2
+        exit 1
+      fi
+      # Forgejo 11.x ignores --must-change-password=false on create;
+      # explicitly clear the flag so basic-auth token creation works.
+      _forgejo_exec forgejo admin user change-password \
+        --username "${bot_user}" \
+        --password "${bot_pass}" \
+        --must-change-password=false
+
+      # Verify bot user was actually created
+      if ! curl -sf --max-time 5 \
+        -H "Authorization: token ${admin_token}" \
+        "${forge_url}/api/v1/users/${bot_user}" >/dev/null 2>&1; then
+        echo "Error: bot user '${bot_user}' not found after creation" >&2
+        exit 1
+      fi
+      echo "  ${bot_user} user created"
+    else
+      # User exists - reset password if needed
+      echo "  ${bot_user} user exists"
+      if [ "$rotate_tokens" = true ] || [ "$pass_exists" = false ]; then
+        bot_pass="bot-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+        _forgejo_exec forgejo admin user change-password \
+          --username "${bot_user}" \
+          --password "${bot_pass}" \
+          --must-change-password=false || {
+          echo "Error: failed to reset password for existing bot user '${bot_user}'" >&2
+          exit 1
+        }
+        echo "  ${bot_user} password reset for token generation"
+      else
+        # Password exists, get it from .env
+        bot_pass=$(grep "^${pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
+      fi
+    fi
+
+    # Generate token via API (basic auth as the bot user — Forgejo requires
+    # basic auth on POST /users/{username}/tokens, token auth is rejected)
+    # First, try to delete existing tokens to avoid name collision
+    # Use bot user's own Basic Auth (we just set the password above)
+    local existing_token_ids
+    existing_token_ids=$(curl -sf \
+      -u "${bot_user}:${bot_pass}" \
+      "${forge_url}/api/v1/users/${bot_user}/tokens" 2>/dev/null \
+      | jq -r '.[].id // empty' 2>/dev/null) || existing_token_ids=""
+
+    # Delete any existing tokens for this user
+    if [ -n "$existing_token_ids" ]; then
+      while IFS= read -r tid; do
+        [ -n "$tid" ] && curl -sf -X DELETE \
+          -u "${bot_user}:${bot_pass}" \
+          "${forge_url}/api/v1/users/${bot_user}/tokens/${tid}" >/dev/null 2>&1 || true
+      done <<< "$existing_token_ids"
+    fi
+
+    token=$(curl -sf -X POST \
+      -u "${bot_user}:${bot_pass}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/users/${bot_user}/tokens" \
+      -d "{\"name\":\"disinto-${bot_user}-token\",\"scopes\":[\"all\"]}" 2>/dev/null \
+      | jq -r '.sha1 // empty') || token=""
+
+    if [ -z "$token" ]; then
+      echo "Error: failed to create API token for '${bot_user}'" >&2
+      exit 1
+    fi
+
+    # Store token in .env under the per-agent variable name
+    if grep -q "^${token_var}=" "$env_file" 2>/dev/null; then
+      sed -i "s|^${token_var}=.*|${token_var}=${token}|" "$env_file"
+    else
+      printf '%s=%s\n' "$token_var" "$token" >> "$env_file"
+    fi
+    export "${token_var}=${token}"
+    echo "  ${bot_user} token generated and saved (${token_var})"
+
+    # Store password in .env for git HTTP push (#361)
+    # Forgejo 11.x API tokens don't work for git push; password auth does.
+    if grep -q "^${pass_var}=" "$env_file" 2>/dev/null; then
+      sed -i "s|^${pass_var}=.*|${pass_var}=${bot_pass}|" "$env_file"
+    else
+      printf '%s=%s\n' "$pass_var" "$bot_pass" >> "$env_file"
+    fi
+    export "${pass_var}=${bot_pass}"
+    echo "  ${bot_user} password saved (${pass_var})"
+
+    # Backwards-compat aliases for dev-bot and review-bot
+    if [ "$bot_user" = "dev-bot" ]; then
+      export CODEBERG_TOKEN="$token"
+    elif [ "$bot_user" = "review-bot" ]; then
+      export REVIEW_BOT_TOKEN="$token"
+    fi
+  done
+
+  # Create llama bot users and tokens (local-model agents)
+  # These are separate from the main agents and get their own credentials
+  echo ""
+  echo "── Setting up llama bot users ────────────────────────────"
+
+  local llama_user llama_pass llama_token llama_token_var llama_pass_var
+  for llama_user in "${!llama_token_vars[@]}"; do
+    llama_token_var="${llama_token_vars[$llama_user]}"
+    llama_pass_var="${llama_pass_vars[$llama_user]}"
+
+    # Check if token already exists in .env
+    local token_exists=false
+    if _token_exists_in_env "$llama_token_var" "$env_file"; then
+      token_exists=true
+    fi
+
+    # Check if password already exists in .env
+    local pass_exists=false
+    if _pass_exists_in_env "$llama_pass_var" "$env_file"; then
+      pass_exists=true
+    fi
+
+    # Check if llama bot user exists on Forgejo
+    local llama_user_exists=false
+    if curl -sf --max-time 5 \
+      -H "Authorization: token ${admin_token}" \
+      "${forge_url}/api/v1/users/${llama_user}" >/dev/null 2>&1; then
+      llama_user_exists=true
+    fi
+
+    # Skip token/password regeneration if both exist in .env and not forcing rotation
+    if [ "$token_exists" = true ] && [ "$pass_exists" = true ] && [ "$rotate_tokens" = false ]; then
+      echo "  ${llama_user} token and password preserved (use --rotate-tokens to force)"
+      # Still export the existing token for use within this run
+      local existing_token existing_pass
+      existing_token=$(grep "^${llama_token_var}=" "$env_file" | head -1 | cut -d= -f2-)
+      existing_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
+      export "${llama_token_var}=${existing_token}"
+      export "${llama_pass_var}=${existing_pass}"
+      continue
+    fi
+
+    # Generate new credentials if:
+    # - Token doesn't exist (first run)
+    # - Password doesn't exist (first run)
+    # - --rotate-tokens flag is set (explicit rotation)
+    if [ "$llama_user_exists" = false ]; then
+      # User doesn't exist - create it
+      llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+      echo "Creating llama bot user: ${llama_user}"
+      local create_output
+      if ! create_output=$(_forgejo_exec forgejo admin user create \
+        --username "${llama_user}" \
+        --password "${llama_pass}" \
+        --email "${llama_user}@disinto.local" \
+        --must-change-password=false 2>&1); then
+        echo "Error: failed to create llama bot user '${llama_user}':" >&2
+        echo "  ${create_output}" >&2
+        exit 1
+      fi
+      # Forgejo 11.x ignores --must-change-password=false on create;
+      # explicitly clear the flag so basic-auth token creation works.
+      _forgejo_exec forgejo admin user change-password \
+        --username "${llama_user}" \
+        --password "${llama_pass}" \
+        --must-change-password=false
+
+      # Verify llama bot user was actually created
+      if ! curl -sf --max-time 5 \
+        -H "Authorization: token ${admin_token}" \
+        "${forge_url}/api/v1/users/${llama_user}" >/dev/null 2>&1; then
+        echo "Error: llama bot user '${llama_user}' not found after creation" >&2
+        exit 1
+      fi
+      echo "  ${llama_user} user created"
+    else
+      # User exists - reset password if needed
+      echo "  ${llama_user} user exists"
+      if [ "$rotate_tokens" = true ] || [ "$pass_exists" = false ]; then
+        llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+        _forgejo_exec forgejo admin user change-password \
+          --username "${llama_user}" \
+          --password "${llama_pass}" \
+          --must-change-password=false || {
+          echo "Error: failed to reset password for existing llama bot user '${llama_user}'" >&2
+          exit 1
+        }
+        echo "  ${llama_user} password reset for token generation"
+      else
+        # Password exists, get it from .env
+        llama_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
+      fi
+    fi
+
+    # Generate token via API (basic auth as the llama user)
+    # First, delete any existing tokens to avoid name collision
+    local existing_llama_token_ids
+    existing_llama_token_ids=$(curl -sf \
+      -u "${llama_user}:${llama_pass}" \
+      "${forge_url}/api/v1/users/${llama_user}/tokens" 2>/dev/null \
+      | jq -r '.[].id // empty' 2>/dev/null) || existing_llama_token_ids=""
+
+    # Delete any existing tokens for this user
+    if [ -n "$existing_llama_token_ids" ]; then
+      while IFS= read -r tid; do
+        [ -n "$tid" ] && curl -sf -X DELETE \
+          -u "${llama_user}:${llama_pass}" \
+          "${forge_url}/api/v1/users/${llama_user}/tokens/${tid}" >/dev/null 2>&1 || true
+      done <<< "$existing_llama_token_ids"
+    fi
+
+    llama_token=$(curl -sf -X POST \
+      -u "${llama_user}:${llama_pass}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/users/${llama_user}/tokens" \
+      -d "{\"name\":\"disinto-${llama_user}-token\",\"scopes\":[\"all\"]}" 2>/dev/null \
+      | jq -r '.sha1 // empty') || llama_token=""
+
+    if [ -z "$llama_token" ]; then
+      echo "Error: failed to create API token for '${llama_user}'" >&2
+      exit 1
+    fi
+
+    # Store token in .env under the llama-specific variable name
+    if grep -q "^${llama_token_var}=" "$env_file" 2>/dev/null; then
+      sed -i "s|^${llama_token_var}=.*|${llama_token_var}=${llama_token}|" "$env_file"
+    else
+      printf '%s=%s\n' "$llama_token_var" "$llama_token" >> "$env_file"
+    fi
+    export "${llama_token_var}=${llama_token}"
+    echo "  ${llama_user} token generated and saved (${llama_token_var})"
+
+    # Store password in .env for git HTTP push (#361)
+    # Forgejo 11.x API tokens don't work for git push; password auth does.
+    if grep -q "^${llama_pass_var}=" "$env_file" 2>/dev/null; then
+      sed -i "s|^${llama_pass_var}=.*|${llama_pass_var}=${llama_pass}|" "$env_file"
+    else
+      printf '%s=%s\n' "$llama_pass_var" "$llama_pass" >> "$env_file"
+    fi
+    export "${llama_pass_var}=${llama_pass}"
+    echo "  ${llama_user} password saved (${llama_pass_var})"
+  done
+
+  # Create .profile repos for all bot users (if they don't already exist)
+  # This runs the same logic as hire-an-agent Step 2-3 for idempotent setup
+  echo ""
+  echo "── Setting up .profile repos ────────────────────────────"
+
+  local -a bot_users=(dev-bot review-bot planner-bot gardener-bot vault-bot supervisor-bot predictor-bot architect-bot)
+  # Add llama bot users to .profile repo creation
+  for llama_user in "${!llama_token_vars[@]}"; do
+    bot_users+=("$llama_user")
+  done
+  local bot_user
+
+  for bot_user in "${bot_users[@]}"; do
+    # Check if .profile repo already exists
+    if curl -sf --max-time 5 -H "Authorization: token ${admin_token}" "${forge_url}/api/v1/repos/${bot_user}/.profile" >/dev/null 2>&1; then
+      echo "  ${bot_user}/.profile already exists"
+      continue
+    fi
+
+    echo "Creating ${bot_user}/.profile repo..."
+
+    # Create the repo using the admin API to ensure it's created in the bot user's namespace
+    local create_output
+    create_output=$(curl -sf -X POST \
+      -u "${admin_user}:${admin_pass}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/admin/users/${bot_user}/repos" \
+      -d "{\"name\":\".profile\",\"description\":\"${bot_user}'s .profile repo\",\"private\":true,\"auto_init\":false}" 2>&1) || true
+
+    if echo "$create_output" | grep -q '"id":\|[0-9]'; then
+      echo "  Created ${bot_user}/.profile (via admin API)"
+    else
+      echo "  Warning: failed to create ${bot_user}/.profile: ${create_output}" >&2
+    fi
+  done
+
+  # Store FORGE_URL in .env if not already present
+  if ! grep -q '^FORGE_URL=' "$env_file" 2>/dev/null; then
+    printf 'FORGE_URL=%s\n' "$forge_url" >> "$env_file"
+  fi
+
+  # Create the repo on Forgejo if it doesn't exist
+  local org_name="${repo_slug%%/*}"
+  local repo_name="${repo_slug##*/}"
+
+  # Check if repo already exists
+  if ! curl -sf --max-time 5 \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    "${forge_url}/api/v1/repos/${repo_slug}" >/dev/null 2>&1; then
+
+    # Try creating org first (ignore if exists)
+    curl -sf -X POST \
+      -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/orgs" \
+      -d "{\"username\":\"${org_name}\",\"visibility\":\"public\"}" >/dev/null 2>&1 || true
+
+    # Create repo under org
+    if ! curl -sf -X POST \
+      -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/orgs/${org_name}/repos" \
+      -d "{\"name\":\"${repo_name}\",\"auto_init\":false,\"default_branch\":\"main\"}" >/dev/null 2>&1; then
+      # Fallback: create under the human user namespace using admin endpoint
+      if [ -n "${admin_token:-}" ]; then
+        if ! curl -sf -X POST \
+          -H "Authorization: token ${admin_token}" \
+          -H "Content-Type: application/json" \
+          "${forge_url}/api/v1/admin/users/${org_name}/repos" \
+          -d "{\"name\":\"${repo_name}\",\"auto_init\":false,\"default_branch\":\"main\"}" >/dev/null 2>&1; then
+          echo "Error: failed to create repo '${repo_slug}' on Forgejo (admin endpoint)" >&2
+          exit 1
+        fi
+      elif [ -n "${HUMAN_TOKEN:-}" ]; then
+        if ! curl -sf -X POST \
+          -H "Authorization: token ${HUMAN_TOKEN}" \
+          -H "Content-Type: application/json" \
+          "${forge_url}/api/v1/user/repos" \
+          -d "{\"name\":\"${repo_name}\",\"auto_init\":false,\"default_branch\":\"main\"}" >/dev/null 2>&1; then
+          echo "Error: failed to create repo '${repo_slug}' on Forgejo (user endpoint)" >&2
+          exit 1
+        fi
+      else
+        echo "Error: failed to create repo '${repo_slug}' — no admin or human token available" >&2
+        exit 1
+      fi
+    fi
+
+    # Add all bot users as collaborators with appropriate permissions
+    # dev-bot: write (PR creation via lib/vault.sh)
+    # review-bot: read (PR review)
+    # planner-bot: write (prerequisites.md, memory)
+    # gardener-bot: write (backlog grooming)
+    # vault-bot: write (vault items)
+    # supervisor-bot: read (health monitoring)
+    # predictor-bot: read (pattern detection)
+    # architect-bot: write (sprint PRs)
+    local bot_perm
+    declare -A bot_permissions=(
+      [dev-bot]="write"
+      [review-bot]="read"
+      [planner-bot]="write"
+      [gardener-bot]="write"
+      [vault-bot]="write"
+      [supervisor-bot]="read"
+      [predictor-bot]="read"
+      [architect-bot]="write"
+    )
+    for bot_user in "${!bot_permissions[@]}"; do
+      bot_perm="${bot_permissions[$bot_user]}"
+      curl -sf -X PUT \
+        -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+        -H "Content-Type: application/json" \
+        "${forge_url}/api/v1/repos/${repo_slug}/collaborators/${bot_user}" \
+        -d "{\"permission\":\"${bot_perm}\"}" >/dev/null 2>&1 || true
+    done
+
+    # Add llama bot users as write collaborators for local-model agents
+    for llama_user in "${!llama_token_vars[@]}"; do
+      curl -sf -X PUT \
+        -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+        -H "Content-Type: application/json" \
+        "${forge_url}/api/v1/repos/${repo_slug}/collaborators/${llama_user}" \
+        -d '{"permission":"write"}' >/dev/null 2>&1 || true
+    done
+
+    # Add disinto-admin as admin collaborator
+    curl -sf -X PUT \
+      -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/repos/${repo_slug}/collaborators/disinto-admin" \
+      -d '{"permission":"admin"}' >/dev/null 2>&1 || true
+
+    echo "Repo:    ${repo_slug} created on Forgejo"
+  else
+    echo "Repo:    ${repo_slug} (already exists on Forgejo)"
+  fi
+
+  echo "Forge:   ${forge_url} (ready)"
+}
--- a/lib/formula-session.sh
+++ b/lib/formula-session.sh
@ -1,55 +1,60 @@
 #!/usr/bin/env bash
-# formula-session.sh — Shared helpers for formula-driven cron agents
+# formula-session.sh — Shared helpers for formula-driven polling-loop agents
 #
-# Provides reusable functions for the common cron-wrapper + tmux-session
-# pattern used by planner-run.sh, predictor-run.sh, gardener-run.sh, and supervisor-run.sh.
+# Provides reusable utility functions for the common polling-loop wrapper pattern
+# used by planner-run.sh, predictor-run.sh, gardener-run.sh, and supervisor-run.sh.
 #
 # Functions:
-#   acquire_cron_lock   LOCK_FILE          — PID lock with stale cleanup
-#   check_memory        [MIN_MB]           — skip if available RAM too low
+#   acquire_run_lock    LOCK_FILE          — PID lock with stale cleanup
 #   load_formula        FORMULA_FILE       — sets FORMULA_CONTENT
 #   build_context_block FILE [FILE ...]    — sets CONTEXT_BLOCK
-#   start_formula_session SESSION WORKDIR PHASE_FILE — create tmux + claude
-#   build_prompt_footer    [EXTRA_API]      — sets PROMPT_FOOTER (API ref + env + phase)
-#   run_formula_and_monitor AGENT [TIMEOUT] [CALLBACK] — session start, inject, monitor, log
-#   formula_phase_callback PHASE           — standard crash-recovery callback
+#   build_prompt_footer [EXTRA_API_LINES]  — sets PROMPT_FOOTER (API ref + env)
+#   build_sdk_prompt_footer [EXTRA_API]    — omits phase protocol (SDK mode)
+#   formula_worktree_setup WORKTREE        — isolated worktree for formula execution
 #   formula_prepare_profile_context        — load lessons from .profile repo (pre-session)
+#   formula_lessons_block                  — return lessons block for prompt
+#   profile_write_journal ISSUE_NUM TITLE OUTCOME [FILES] — post-session journal
+#   profile_load_lessons                   — load lessons-learned.md into LESSONS_CONTEXT
+#   ensure_profile_repo [AGENT_IDENTITY]   — clone/pull .profile repo
+#   _profile_has_repo                      — check if agent has .profile repo
+#   _count_undigested_journals             — count journal entries to digest
+#   _profile_digest_journals               — digest journals into lessons (timeout + batch cap)
+#   _profile_restore_lessons FILE BACKUP   — restore lessons on digest failure
+#   _profile_commit_and_push MESSAGE [FILES] — commit/push to .profile repo
+#   resolve_agent_identity                 — resolve agent user login from FORGE_TOKEN
+#   build_graph_section                    — run build-graph.py and set GRAPH_SECTION
+#   build_scratch_instruction SCRATCH_FILE — return context scratch instruction
+#   read_scratch_context SCRATCH_FILE      — return scratch file content block
+#   ensure_ops_repo                        — clone/pull ops repo
+#   ops_commit_and_push MESSAGE [FILES]    — commit/push to ops repo
+#   cleanup_stale_crashed_worktrees [HOURS] — thin wrapper around worktree_cleanup_stale
 #
-# Requires: lib/agent-session.sh sourced first (for create_agent_session,
-# agent_kill_session, agent_inject_into_session).
-# Globals used by formula_phase_callback: SESSION_NAME, PHASE_FILE,
-# PROJECT_REPO_ROOT, PROMPT (set by the calling script).
+# Requires: lib/env.sh, lib/worktree.sh, lib/agent-sdk.sh sourced first for shared helpers.

-# ── Cron guards ──────────────────────────────────────────────────────────
+# Source agent-sdk for claude_run_with_watchdog watchdog helper
+source "$(dirname "${BASH_SOURCE[0]}")/agent-sdk.sh"

-# acquire_cron_lock LOCK_FILE
+# Source ops-setup for migrate_ops_repo (used by ensure_ops_repo)
+source "$(dirname "${BASH_SOURCE[0]}")/ops-setup.sh"
+
+# ── Run guards ───────────────────────────────────────────────────────────
+
+# acquire_run_lock LOCK_FILE
 # Acquires a PID lock. Exits 0 if another instance is running.
 # Sets an EXIT trap to clean up the lock file.
-acquire_cron_lock() {
-  _CRON_LOCK_FILE="$1"
-  if [ -f "$_CRON_LOCK_FILE" ]; then
+acquire_run_lock() {
+  _RUN_LOCK_FILE="$1"
+  if [ -f "$_RUN_LOCK_FILE" ]; then
    local lock_pid
-    lock_pid=$(cat "$_CRON_LOCK_FILE" 2>/dev/null || true)
+    lock_pid=$(cat "$_RUN_LOCK_FILE" 2>/dev/null || true)
    if [ -n "$lock_pid" ] && kill -0 "$lock_pid" 2>/dev/null; then
      log "run: already running (PID $lock_pid)"
      exit 0
    fi
-    rm -f "$_CRON_LOCK_FILE"
-  fi
-  echo $$ > "$_CRON_LOCK_FILE"
-  trap 'rm -f "$_CRON_LOCK_FILE"' EXIT
-}
-
-# check_memory [MIN_MB]
-# Exits 0 (skip) if available memory is below MIN_MB (default 2000).
-check_memory() {
-  local min_mb="${1:-2000}"
-  local avail_mb
-  avail_mb=$(free -m | awk '/Mem:/{print $7}')
-  if [ "${avail_mb:-0}" -lt "$min_mb" ]; then
-    log "run: skipping — only ${avail_mb}MB available (need ${min_mb})"
-    exit 0
+    rm -f "$_RUN_LOCK_FILE"
  fi
+  echo $$ > "$_RUN_LOCK_FILE"
+  trap 'rm -f "$_RUN_LOCK_FILE"' EXIT
 }

 # ── Agent identity resolution ────────────────────────────────────────────
@ -75,6 +80,24 @@ resolve_agent_identity() {
  return 0
 }

+# ── Forge remote resolution ──────────────────────────────────────────────
+
+# resolve_forge_remote
+# Resolves FORGE_REMOTE by matching FORGE_URL hostname against git remotes.
+# Falls back to "origin" if no match found.
+# Requires: FORGE_URL, git repo with remotes configured.
+# Exports: FORGE_REMOTE (always set).
+resolve_forge_remote() {
+  # Extract hostname from FORGE_URL (e.g., https://codeberg.org/user/repo -> codeberg.org)
+  _forge_host=$(printf '%s' "$FORGE_URL" | sed 's|https\?://||; s|/.*||; s|:.*||')
+  # Find git remote whose push URL matches the forge host
+  FORGE_REMOTE=$(git remote -v | awk -v host="$_forge_host" '$2 ~ host && /\(push\)/ {print $1; exit}')
+  # Fallback to origin if no match found
+  FORGE_REMOTE="${FORGE_REMOTE:-origin}"
+  export FORGE_REMOTE
+  log "forge remote: ${FORGE_REMOTE}"
+}
+
 # ── .profile repo management ──────────────────────────────────────────────

 # ensure_profile_repo [AGENT_IDENTITY]
@ -97,15 +120,16 @@ ensure_profile_repo() {
  # Define cache directory: /home/agent/data/.profile/{agent-name}
  PROFILE_REPO_PATH="${HOME:-/home/agent}/data/.profile/${agent_identity}"

-  # Build clone URL from FORGE_URL and agent identity
+  # Build clone URL from FORGE_URL — credential helper supplies auth (#604)
  local forge_url="${FORGE_URL:-http://localhost:3000}"
-  local auth_url
-  auth_url=$(printf '%s' "$forge_url" | sed "s|://|://$(whoami):${FORGE_TOKEN}@|")
-  local clone_url="${auth_url}/${agent_identity}/.profile.git"
+  local clone_url="${forge_url}/${agent_identity}/.profile.git"

  # Check if already cached and up-to-date
  if [ -d "${PROFILE_REPO_PATH}/.git" ]; then
    log "Pulling .profile repo: ${agent_identity}/.profile"
+    # Always refresh the remote URL to ensure it's clean (no baked credentials)
+    # This fixes auth issues when old URLs contained the wrong username (#652)
+    git -C "$PROFILE_REPO_PATH" remote set-url origin "$clone_url" 2>/dev/null || true
    if git -C "$PROFILE_REPO_PATH" fetch origin --quiet 2>/dev/null; then
      git -C "$PROFILE_REPO_PATH" checkout main --quiet 2>/dev/null || \
      git -C "$PROFILE_REPO_PATH" checkout master --quiet 2>/dev/null || true
@ -134,7 +158,7 @@ ensure_profile_repo() {
 # Checks if the agent has a .profile repo by querying Forgejo API.
 # Returns 0 if repo exists, 1 otherwise.
 _profile_has_repo() {
-  local agent_identity="${1:-${AGENT_IDENTITY:-}}"
+  local agent_identity="${AGENT_IDENTITY:-}"

  if [ -z "$agent_identity" ]; then
    if ! resolve_agent_identity; then
@ -168,10 +192,14 @@ _count_undigested_journals() {

 # _profile_digest_journals
 # Runs a claude -p one-shot to digest undigested journals into lessons-learned.md
+# Respects PROFILE_DIGEST_TIMEOUT (default 300s) and PROFILE_DIGEST_MAX_BATCH (default 5).
+# On failure/timeout, preserves the previous lessons-learned.md and does not archive journals.
 # Returns 0 on success, 1 on failure.
 _profile_digest_journals() {
-  local agent_identity="${1:-${AGENT_IDENTITY:-}}"
-  local model="${2:-${CLAUDE_MODEL:-opus}}"
+  local agent_identity="${AGENT_IDENTITY:-}"
+  local model="${CLAUDE_MODEL:-opus}"
+  local digest_timeout="${PROFILE_DIGEST_TIMEOUT:-300}"
+  local max_batch="${PROFILE_DIGEST_MAX_BATCH:-5}"

  if [ -z "$agent_identity" ]; then
    if ! resolve_agent_identity; then
@ -184,19 +212,27 @@ _profile_digest_journals() {
  local knowledge_dir="${PROFILE_REPO_PATH}/knowledge"
  local lessons_file="${knowledge_dir}/lessons-learned.md"

-  # Collect undigested journal entries
+  # Collect undigested journal entries (capped at max_batch)
  local journal_entries=""
+  local batch_count=0
+  local -a batchfiles=()
  if [ -d "$journal_dir" ]; then
    for jf in "$journal_dir"/*.md; do
      [ -f "$jf" ] || continue
      # Skip archived entries
      [[ "$jf" == */archive/* ]] && continue
+      if [ "$batch_count" -ge "$max_batch" ]; then
+        log "profile: capping digest batch at ${max_batch} journals (remaining will be digested in future runs)"
+        break
+      fi
      local basename
      basename=$(basename "$jf")
      journal_entries="${journal_entries}
 ### ${basename}
 $(cat "$jf")
 "
+      batchfiles+=("$jf")
+      batch_count=$((batch_count + 1))
    done
  fi

@ -205,65 +241,104 @@ $(cat "$jf")
    return 0
  fi

-  # Read existing lessons if available
-  local existing_lessons=""
+  log "profile: digesting ${batch_count} journals (timeout ${digest_timeout}s)"
+
+  # Ensure knowledge directory exists
+  mkdir -p "$knowledge_dir"
+
+  # Back up existing lessons-learned.md so we can restore on failure
+  local lessons_backup=""
  if [ -f "$lessons_file" ]; then
-    existing_lessons=$(cat "$lessons_file")
+    lessons_backup=$(mktemp)
+    cp "$lessons_file" "$lessons_backup"
  fi

+  # Capture mtime so we can detect a Write-tool write afterwards
+  local mtime_before=0
+  [ -f "$lessons_file" ] && mtime_before=$(stat -c %Y "$lessons_file")
+
  # Build prompt for digestion
  local digest_prompt="You are digesting journal entries from a developer agent's work sessions.

 ## Task
-Condense these journal entries into abstract, transferable lessons. Rewrite lessons-learned.md entirely.
+Update the lessons-learned file at this exact absolute path:
+
+  ${lessons_file}
+
+1. Read ${lessons_file} (it may not exist yet — that's fine, treat as empty).
+2. Digest the journal entries below into abstract, transferable patterns and heuristics.
+3. Merge with the existing lessons: preserve anything still useful, refine, drop stale or redundant entries, add new ones.
+4. Write the merged result back to ${lessons_file} using the Write tool.

 ## Constraints
 - Hard cap: 2KB maximum
 - Abstract: patterns and heuristics, not specific issues or file paths
 - Transferable: must help with future unseen work, not just recall past work
- Drop the least transferable lessons if over limit
-
-## Existing lessons-learned.md (if any)
-${existing_lessons:-<none>}
+- Drop the least transferable lessons if over the cap

 ## Journal entries to digest
-${journal_entries}
+${journal_entries}"

-## Output
-Write the complete, rewritten lessons-learned.md content below. No preamble, no explanation — just the file content."
-
-  # Run claude -p one-shot with same model as agent
-  local output
-  output=$(claude -p "$digest_prompt" \
+  # Run claude -p one-shot with digest-specific timeout
+  local output digest_rc
+  local saved_timeout="${CLAUDE_TIMEOUT:-7200}"
+  CLAUDE_TIMEOUT="$digest_timeout"
+  output=$(claude_run_with_watchdog claude -p "$digest_prompt" \
    --output-format json \
    --dangerously-skip-permissions \
-    --max-tokens 1000 \
    ${model:+--model "$model"} \
-    2>>"$LOGFILE" || echo '{"result":"error"}')
+    2>>"$LOGFILE") && digest_rc=0 || digest_rc=$?
+  CLAUDE_TIMEOUT="$saved_timeout"

-  # Extract content from JSON response
-  local lessons_content
-  lessons_content=$(printf '%s' "$output" | jq -r '.result // empty' 2>/dev/null || echo "")
-
-  if [ -z "$lessons_content" ]; then
-    log "profile: failed to digest journals"
+  if [ "$digest_rc" -eq 124 ]; then
+    log "profile: digest timed out after ${digest_timeout}s — preserving previous lessons, skipping archive"
+    _profile_restore_lessons "$lessons_file" "$lessons_backup"
    return 1
  fi

-  # Ensure knowledge directory exists
-  mkdir -p "$knowledge_dir"
+  if [ "$digest_rc" -ne 0 ]; then
+    log "profile: digest failed (exit code ${digest_rc}) — preserving previous lessons, skipping archive"
+    _profile_restore_lessons "$lessons_file" "$lessons_backup"
+    return 1
+  fi

-  # Write the lessons file (full rewrite)
-  printf '%s\n' "$lessons_content" > "$lessons_file"
-  log "profile: wrote lessons-learned.md (${#lessons_content} bytes)"
+  local mtime_after=0
+  [ -f "$lessons_file" ] && mtime_after=$(stat -c %Y "$lessons_file")

-  # Move digested journals to archive (if any were processed)
-  if [ -d "$journal_dir" ]; then
+  if [ "$mtime_after" -gt "$mtime_before" ] && [ -s "$lessons_file" ]; then
+    local file_size
+    file_size=$(wc -c < "$lessons_file")
+    # Treat tiny files (<=16 bytes) as failed digestion (e.g. "null", "{}", empty)
+    if [ "$file_size" -le 16 ]; then
+      log "profile: digest produced suspiciously small file (${file_size} bytes) — preserving previous lessons, skipping archive"
+      _profile_restore_lessons "$lessons_file" "$lessons_backup"
+      return 1
+    fi
+    log "profile: lessons-learned.md written by model via Write tool (${file_size} bytes)"
+  else
+    # Fallback: model didn't use Write tool — capture .result and strip any markdown code fence
+    local lessons_content
+    lessons_content=$(printf '%s' "$output" | jq -r '.result // empty' 2>/dev/null || echo "")
+    lessons_content=$(printf '%s' "$lessons_content" | sed -E '1{/^```(markdown|md)?[[:space:]]*$/d;};${/^```[[:space:]]*$/d;}')
+
+    if [ -z "$lessons_content" ] || [ "${#lessons_content}" -le 16 ]; then
+      log "profile: failed to digest journals (no Write tool call, empty or tiny .result) — preserving previous lessons, skipping archive"
+      _profile_restore_lessons "$lessons_file" "$lessons_backup"
+      return 1
+    fi
+
+    printf '%s\n' "$lessons_content" > "$lessons_file"
+    log "profile: lessons-learned.md written from .result fallback (${#lessons_content} bytes)"
+  fi
+
+  # Clean up backup on success
+  [ -n "$lessons_backup" ] && rm -f "$lessons_backup"
+
+  # Move only the digested journals to archive (not all — only the batch we processed)
+  if [ ${#batchfiles[@]} -gt 0 ]; then
    mkdir -p "${journal_dir}/archive"
    local archived=0
-    for jf in "$journal_dir"/*.md; do
-      [ -f "$jf" ] || continue
-      [[ "$jf" == */archive/* ]] && continue
+    for jf in "${batchfiles[@]}"; do
      local basename
      basename=$(basename "$jf")
      mv "$jf" "${journal_dir}/archive/${basename}" 2>/dev/null && archived=$((archived + 1))
@ -273,9 +348,27 @@ Write the complete, rewritten lessons-learned.md content below. No preamble, no
    fi
  fi

+  # Commit and push the digest results
+  _profile_commit_and_push \
+    "profile: digest ${archived:-0} journals → knowledge/lessons-learned.md" \
+    knowledge/lessons-learned.md \
+    journal/
+
  return 0
 }

+# _profile_restore_lessons LESSONS_FILE BACKUP_FILE
+# Restores previous lessons-learned.md from backup on digest failure.
+_profile_restore_lessons() {
+  local lessons_file="$1"
+  local backup="$2"
+  if [ -n "$backup" ] && [ -f "$backup" ]; then
+    cp "$backup" "$lessons_file"
+    rm -f "$backup"
+    log "profile: restored previous lessons-learned.md"
+  fi
+}
+
 # _profile_commit_and_push MESSAGE [FILE ...]
 # Commits and pushes changes to .profile repo.
 _profile_commit_and_push() {
@ -290,6 +383,15 @@ _profile_commit_and_push() {
  (
    cd "$PROFILE_REPO_PATH" || return 1

+    # Refresh the remote URL to ensure credentials are current (#652)
+    # This ensures we use the correct bot identity and fresh credentials
+    local forge_url="${FORGE_URL:-http://localhost:3000}"
+    local agent_identity="${AGENT_IDENTITY:-}"
+    if [ -n "$agent_identity" ]; then
+      local remote_url="${forge_url}/${agent_identity}/.profile.git"
+      git remote set-url origin "$remote_url" 2>/dev/null || true
+    fi
+
    if [ ${#files[@]} -gt 0 ]; then
      git add "${files[@]}"
    else
@ -298,7 +400,7 @@ _profile_commit_and_push() {

    if ! git diff --cached --quiet 2>/dev/null; then
      git config user.name "${AGENT_IDENTITY}" || true
-      git config user.email "${AGENT_IDENTITY}@users.noreply.codeberg.org" || true
+      git config user.email "${AGENT_IDENTITY}@disinto.local" || true
      git commit -m "$msg" --no-verify 2>/dev/null || true
      git push origin main --quiet 2>/dev/null || git push origin master --quiet 2>/dev/null || true
    fi
@ -307,7 +409,8 @@ _profile_commit_and_push() {

 # profile_load_lessons
 # Pre-session: loads lessons-learned.md into LESSONS_CONTEXT for prompt injection.
-# Lazy digestion: if >10 undigested journals exist, runs claude -p to digest them.
+# Lazy digestion: if undigested journals exceed PROFILE_DIGEST_THRESHOLD (default 10),
+# runs claude -p to digest them (bounded by PROFILE_DIGEST_MAX_BATCH and PROFILE_DIGEST_TIMEOUT).
 # Returns 0 on success, 1 if agent has no .profile repo (silent no-op).
 # Requires: ensure_profile_repo() called, AGENT_IDENTITY, FORGE_TOKEN, FORGE_URL, CLAUDE_MODEL.
 # Exports: LESSONS_CONTEXT (the lessons file content, hard-capped at 2KB).
@ -323,13 +426,14 @@ profile_load_lessons() {
  fi

  # Check journal count for lazy digestion trigger
-  local journal_count
+  local journal_count digest_threshold
  journal_count=$(_count_undigested_journals)
+  digest_threshold="${PROFILE_DIGEST_THRESHOLD:-10}"

-  if [ "${journal_count:-0}" -gt 10 ]; then
-    log "profile: digesting ${journal_count} undigested journals"
+  if [ "${journal_count:-0}" -gt "$digest_threshold" ]; then
+    log "profile: ${journal_count} undigested journals (threshold ${digest_threshold})"
    if ! _profile_digest_journals; then
-      log "profile: warning — journal digestion failed"
+      log "profile: warning — journal digestion failed, continuing with existing lessons"
    fi
  fi

@ -429,10 +533,9 @@ Write the journal entry below. Use markdown format."

  # Run claude -p one-shot with same model as agent
  local output
-  output=$(claude -p "$reflection_prompt" \
+  output=$(claude_run_with_watchdog claude -p "$reflection_prompt" \
    --output-format json \
    --dangerously-skip-permissions \
-    --max-tokens 500 \
    ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} \
    2>>"$LOGFILE" || echo '{"result":"error"}')

@ -449,16 +552,15 @@ Write the journal entry below. Use markdown format."
  local journal_dir="${PROFILE_REPO_PATH}/journal"
  mkdir -p "$journal_dir"

-  # Write journal entry (append if exists)
-  local journal_file="${journal_dir}/issue-${issue_num}.md"
-  if [ -f "$journal_file" ]; then
-    printf '\n---\n\n' >> "$journal_file"
-  fi
+  # Write journal entry with timestamped filename for accumulation
+  local ts
+  ts=$(date -u +%Y%m%d-%H%M%S)
+  local journal_file="${journal_dir}/issue-${issue_num}-${ts}.md"
  printf '%s\n' "$journal_content" >> "$journal_file"
-  log "profile: wrote journal entry for issue #${issue_num}"
+  log "profile: wrote journal entry for issue #${issue_num} (${ts})"

  # Commit and push to .profile repo
-  _profile_commit_and_push "journal: issue #${issue_num} reflection" "journal/issue-${issue_num}.md"
+  _profile_commit_and_push "journal: issue #${issue_num} reflection (${ts})" "journal/issue-${issue_num}-${ts}.md"

  return 0
 }
@ -557,7 +659,7 @@ $(cat "$ctx_path")
  done
 }

-# ── Ops repo helpers ─────────────────────────────────────────────────
+# ── Ops repo helpers ────────────────────────────────────────────────────

 # ensure_ops_repo
 # Clones or pulls the ops repo so agents can read/write operational data.
@ -572,6 +674,7 @@ ensure_ops_repo() {
    git -C "$ops_root" fetch origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
    git -C "$ops_root" checkout "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
    git -C "$ops_root" pull --ff-only origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
+    migrate_ops_repo "$ops_root" "${PRIMARY_BRANCH}"
    return 0
  fi

@ -579,14 +682,8 @@ ensure_ops_repo() {
  local ops_repo="${FORGE_OPS_REPO:-}"
  [ -n "$ops_repo" ] || return 0
  local forge_url="${FORGE_URL:-http://localhost:3000}"
-  local clone_url
-  if [ -n "${FORGE_TOKEN:-}" ]; then
-    local auth_url
-    auth_url=$(printf '%s' "$forge_url" | sed "s|://|://$(whoami):${FORGE_TOKEN}@|")
-    clone_url="${auth_url}/${ops_repo}.git"
-  else
-    clone_url="${forge_url}/${ops_repo}.git"
-  fi
+  # Use clean URL — credential helper supplies auth (#604)
+  local clone_url="${forge_url}/${ops_repo}.git"

  log "Cloning ops repo: ${ops_repo} -> ${ops_root}"
  if git clone --quiet "$clone_url" "$ops_root" 2>/dev/null; then
@ -620,90 +717,6 @@ ops_commit_and_push() {
  )
 }

-# ── Session management ───────────────────────────────────────────────────
-
-# start_formula_session SESSION WORKDIR PHASE_FILE
-# Kills stale session, resets phase file, creates a per-agent git worktree
-# for session isolation, and creates a new tmux + claude session in it.
-# Sets _FORMULA_SESSION_WORKDIR to the worktree path (or original workdir
-# on fallback). Callers must clean up via remove_formula_worktree after
-# the session ends.
-# Returns 0 on success, 1 on failure.
-start_formula_session() {
-  local session="$1" workdir="$2" phase_file="$3"
-  agent_kill_session "$session"
-  rm -f "$phase_file"
-
-  # Create per-agent git worktree for session isolation.
-  # Each agent gets its own CWD so Claude Code treats them as separate
-  # projects — no resume collisions between sequential formula runs.
-  _FORMULA_SESSION_WORKDIR="/tmp/disinto-${session}"
-  # Clean up any stale worktree from a previous run
-  git -C "$workdir" worktree remove "$_FORMULA_SESSION_WORKDIR" --force 2>/dev/null || true
-  if git -C "$workdir" worktree add "$_FORMULA_SESSION_WORKDIR" HEAD --detach 2>/dev/null; then
-    log "Created worktree: ${_FORMULA_SESSION_WORKDIR}"
-  else
-    log "WARNING: worktree creation failed — falling back to ${workdir}"
-    _FORMULA_SESSION_WORKDIR="$workdir"
-  fi
-
-  log "Creating tmux session: ${session}"
-  if ! create_agent_session "$session" "$_FORMULA_SESSION_WORKDIR" "$phase_file"; then
-    log "ERROR: failed to create tmux session ${session}"
-    return 1
-  fi
-}
-
-# remove_formula_worktree
-# Removes the worktree created by start_formula_session if it differs from
-# PROJECT_REPO_ROOT. Safe to call multiple times. No-op if no worktree was created.
-remove_formula_worktree() {
-  if [ -n "${_FORMULA_SESSION_WORKDIR:-}" ] \
-     && [ "$_FORMULA_SESSION_WORKDIR" != "${PROJECT_REPO_ROOT:-}" ]; then
-    git -C "$PROJECT_REPO_ROOT" worktree remove "$_FORMULA_SESSION_WORKDIR" --force 2>/dev/null || true
-    log "Removed worktree: ${_FORMULA_SESSION_WORKDIR}"
-  fi
-}
-
-# formula_phase_callback PHASE
-# Standard crash-recovery phase callback for formula sessions.
-# Requires globals: SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT.
-# Uses _FORMULA_CRASH_COUNT (auto-initialized) for single-retry limit.
-# shellcheck disable=SC2154  # SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT set by caller
-formula_phase_callback() {
-  local phase="$1"
-  log "phase: ${phase}"
-  case "$phase" in
-    PHASE:crashed)
-      if [ "${_FORMULA_CRASH_COUNT:-0}" -gt 0 ]; then
-        log "ERROR: session crashed again after recovery — giving up"
-        return 0
-      fi
-      _FORMULA_CRASH_COUNT=$(( ${_FORMULA_CRASH_COUNT:-0} + 1 ))
-      log "WARNING: tmux session died unexpectedly — attempting recovery"
-      if create_agent_session "${_MONITOR_SESSION:-$SESSION_NAME}" "${_FORMULA_SESSION_WORKDIR:-$PROJECT_REPO_ROOT}" "$PHASE_FILE" 2>/dev/null; then
-        agent_inject_into_session "${_MONITOR_SESSION:-$SESSION_NAME}" "$PROMPT"
-        log "Recovery session started"
-      else
-        log "ERROR: could not restart session after crash"
-      fi
-      ;;
-    PHASE:done|PHASE:failed|PHASE:escalate|PHASE:merged)
-      agent_kill_session "${_MONITOR_SESSION:-$SESSION_NAME}"
-      ;;
-  esac
-}
-
-# ── Stale crashed worktree cleanup ─────────────────────────────────────────
-
-# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
-# Thin wrapper around worktree_cleanup_stale() from lib/worktree.sh.
-# Kept for backwards compatibility with existing callers.
-# Requires: lib/worktree.sh sourced.
-cleanup_stale_crashed_worktrees() {
-  worktree_cleanup_stale "${1:-24}"
-}
-
 # ── Scratch file helpers (compaction survival) ────────────────────────────

 # build_scratch_instruction SCRATCH_FILE
@ -779,25 +792,26 @@ build_sdk_prompt_footer() {
 # Creates an isolated worktree for synchronous formula execution.
 # Fetches primary branch, cleans stale worktree, creates new one, and
 # sets an EXIT trap for cleanup.
-# Requires globals: PROJECT_REPO_ROOT, PRIMARY_BRANCH.
+# Requires globals: PROJECT_REPO_ROOT, PRIMARY_BRANCH, FORGE_REMOTE.
+# Ensure resolve_forge_remote() is called before this function.
 formula_worktree_setup() {
  local worktree="$1"
  cd "$PROJECT_REPO_ROOT" || return
-  git fetch origin "$PRIMARY_BRANCH" 2>/dev/null || true
+  git fetch "${FORGE_REMOTE}" "$PRIMARY_BRANCH" 2>/dev/null || true
  worktree_cleanup "$worktree"
-  git worktree add "$worktree" "origin/${PRIMARY_BRANCH}" --detach 2>/dev/null
+  git worktree add "$worktree" "${FORGE_REMOTE}/${PRIMARY_BRANCH}" --detach 2>/dev/null
  # shellcheck disable=SC2064  # expand worktree now, not at trap time
  trap "worktree_cleanup '$worktree'" EXIT
 }

-# ── Prompt + monitor helpers ──────────────────────────────────────────────
+# ── Prompt helpers ──────────────────────────────────────────────────────

 # build_prompt_footer [EXTRA_API_LINES]
-# Assembles the common forge API reference + environment + phase protocol
-# block for formula prompts.  Sets PROMPT_FOOTER.
+# Assembles the common forge API reference + environment block for formula prompts.
+# Sets PROMPT_FOOTER.
 # Pass additional API endpoint lines (pre-formatted, newline-prefixed) via $1.
 # Requires globals: FORGE_API, FACTORY_ROOT, PROJECT_REPO_ROOT,
-#                   PRIMARY_BRANCH, PHASE_FILE.
+#                   PRIMARY_BRANCH.
 build_prompt_footer() {
  local extra_api="${1:-}"
  # shellcheck disable=SC2034  # consumed by the calling script's PROMPT
@ -813,66 +827,15 @@ NEVER echo or include the actual token value in output — always reference \${F
 FACTORY_ROOT=${FACTORY_ROOT}
 PROJECT_REPO_ROOT=${PROJECT_REPO_ROOT}
 OPS_REPO_ROOT=${OPS_REPO_ROOT}
-PRIMARY_BRANCH=${PRIMARY_BRANCH}
-PHASE_FILE=${PHASE_FILE}
-
-## Phase protocol (REQUIRED)
-When all work is done:
-  echo 'PHASE:done' > '${PHASE_FILE}'
-On unrecoverable error:
-  printf 'PHASE:failed\nReason: %s\n' 'describe error' > '${PHASE_FILE}'"
+PRIMARY_BRANCH=${PRIMARY_BRANCH}"
 }

-# run_formula_and_monitor AGENT_NAME [TIMEOUT]
-# Starts the formula session, injects PROMPT, monitors phase, and logs result.
-# Requires globals: SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT,
-#                   FORGE_REPO, CLAUDE_MODEL (exported).
-# shellcheck disable=SC2154  # SESSION_NAME, PHASE_FILE, PROJECT_REPO_ROOT, PROMPT set by caller
-run_formula_and_monitor() {
-  local agent_name="$1"
-  local timeout="${2:-7200}"
-  local callback="${3:-formula_phase_callback}"
+# ── Stale crashed worktree cleanup ────────────────────────────────────────

-  if ! start_formula_session "$SESSION_NAME" "$PROJECT_REPO_ROOT" "$PHASE_FILE"; then
-    exit 1
-  fi
-
-  # Write phase protocol to context file for compaction survival
-  if [ -n "${PROMPT_FOOTER:-}" ]; then
-    write_compact_context "$PHASE_FILE" "$PROMPT_FOOTER"
-  fi
-
-  agent_inject_into_session "$SESSION_NAME" "$PROMPT"
-  log "Prompt sent to tmux session"
-
-  log "Monitoring phase file: ${PHASE_FILE}"
-  _FORMULA_CRASH_COUNT=0
-
-  monitor_phase_loop "$PHASE_FILE" "$timeout" "$callback"
-
-  FINAL_PHASE=$(read_phase "$PHASE_FILE")
-  log "Final phase: ${FINAL_PHASE:-none}"
-
-  if [ "$FINAL_PHASE" != "PHASE:done" ]; then
-    case "${_MONITOR_LOOP_EXIT:-}" in
-      idle_prompt)
-        log "${agent_name}: Claude returned to prompt without writing phase signal"
-        ;;
-      idle_timeout)
-        log "${agent_name}: timed out with no phase signal"
-        ;;
-      *)
-        log "${agent_name} finished without PHASE:done (phase: ${FINAL_PHASE:-none}, exit: ${_MONITOR_LOOP_EXIT:-})"
-        ;;
-    esac
-  fi
-
-  # Preserve worktree on crash for debugging; clean up on success
-  if [ "${_MONITOR_LOOP_EXIT:-}" = "crashed" ]; then
-    worktree_preserve "${_FORMULA_SESSION_WORKDIR:-}" "crashed (agent=${agent_name})"
-  else
-    remove_formula_worktree
-  fi
-
-  log "--- ${agent_name^} run done ---"
+# cleanup_stale_crashed_worktrees [MAX_AGE_HOURS]
+# Thin wrapper around worktree_cleanup_stale() from lib/worktree.sh.
+# Kept for backwards compatibility with existing callers.
+# Requires: lib/worktree.sh sourced.
+cleanup_stale_crashed_worktrees() {
+  worktree_cleanup_stale "${1:-24}"
 }
--- a/lib/generators.sh
+++ b/lib/generators.sh
@ -0,0 +1,783 @@
+#!/usr/bin/env bash
+# =============================================================================
+# generators — template generation functions for disinto init
+#
+# Generates docker-compose.yml, Dockerfile, Caddyfile, staging index, and
+# deployment pipeline configs.
+#
+# Globals expected (must be set before sourcing):
+#   FACTORY_ROOT   - Root of the disinto factory
+#   PROJECT_NAME   - Project name for the project repo (defaults to 'project')
+#   PRIMARY_BRANCH - Primary branch name (defaults to 'main')
+#
+# Usage:
+#   source "${FACTORY_ROOT}/lib/generators.sh"
+#   generate_compose "$forge_port"
+#   generate_caddyfile
+#   generate_staging_index
+#   generate_deploy_pipelines "$repo_root" "$project_name"
+# =============================================================================
+set -euo pipefail
+
+# Assert required globals are set
+: "${FACTORY_ROOT:?FACTORY_ROOT must be set}"
+# PROJECT_NAME defaults to 'project' if not set (env.sh may have set it from FORGE_REPO)
+PROJECT_NAME="${PROJECT_NAME:-project}"
+# PRIMARY_BRANCH defaults to main (env.sh may have set it to 'master')
+PRIMARY_BRANCH="${PRIMARY_BRANCH:-main}"
+
+# Helper: extract woodpecker_repo_id from a project TOML file
+# Returns empty string if not found or file doesn't exist
+_get_woodpecker_repo_id() {
+  local toml_file="$1"
+  if [ -f "$toml_file" ]; then
+    python3 -c "
+import sys, tomllib
+try:
+    with open(sys.argv[1], 'rb') as f:
+        cfg = tomllib.load(f)
+    ci = cfg.get('ci', {})
+    wp_id = ci.get('woodpecker_repo_id', '0')
+    print(wp_id)
+except Exception:
+    print('0')
+" "$toml_file" 2>/dev/null || echo "0"
+  else
+    echo "0"
+  fi
+}
+
+# Find all project TOML files and extract the highest woodpecker_repo_id
+# (used for the main agents service which doesn't have a per-project TOML)
+_get_primary_woodpecker_repo_id() {
+  local projects_dir="${FACTORY_ROOT}/projects"
+  local max_id="0"
+  for toml in "${projects_dir}"/*.toml; do
+    [ -f "$toml" ] || continue
+    local repo_id
+    repo_id=$(_get_woodpecker_repo_id "$toml")
+    if [ -n "$repo_id" ] && [ "$repo_id" != "0" ]; then
+      # Use the first non-zero repo_id found (or highest if multiple)
+      if [ "$repo_id" -gt "$max_id" ] 2>/dev/null; then
+        max_id="$repo_id"
+      fi
+    fi
+  done
+  echo "$max_id"
+}
+
+# Parse project TOML for local-model agents and emit compose services.
+# Writes service definitions to stdout; caller handles insertion into compose file.
+_generate_local_model_services() {
+  local compose_file="$1"
+  local projects_dir="${FACTORY_ROOT}/projects"
+  local temp_file
+  temp_file=$(mktemp)
+  local has_services=false
+  local all_vols=""
+
+  # Find all project TOML files and extract [agents.*] sections
+  for toml in "${projects_dir}"/*.toml; do
+    [ -f "$toml" ] || continue
+
+    # Get woodpecker_repo_id for this project
+    local wp_repo_id
+    wp_repo_id=$(_get_woodpecker_repo_id "$toml")
+
+    # Parse [agents.*] sections using Python - output YAML-compatible format
+    while IFS='=' read -r key value; do
+      case "$key" in
+        NAME) service_name="$value" ;;
+        BASE_URL) base_url="$value" ;;
+        MODEL) model="$value" ;;
+        ROLES) roles="$value" ;;
+        API_KEY) api_key="$value" ;;
+        FORGE_USER) forge_user="$value" ;;
+        COMPACT_PCT) compact_pct="$value" ;;
+        POLL_INTERVAL) poll_interval_val="$value" ;;
+        ---)
+          if [ -n "$service_name" ] && [ -n "$base_url" ]; then
+            cat >> "$temp_file" <<EOF
+
+  agents-${service_name}:
+    build:
+      context: .
+      dockerfile: docker/agents/Dockerfile
+    container_name: disinto-agents-${service_name}
+    restart: unless-stopped
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - agents-${service_name}-data:/home/agent/data
+      - project-repos:/home/agent/repos
+      - \${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:\${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
+      - \${HOME}/.claude.json:/home/agent/.claude.json:ro
+      - CLAUDE_BIN_PLACEHOLDER:/usr/local/bin/claude:ro
+      - \${HOME}/.ssh:/home/agent/.ssh:ro
+    environment:
+      FORGE_URL: http://forgejo:3000
+      FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
+      # Use llama-specific credentials if available, otherwise fall back to main FORGE_TOKEN
+      FORGE_TOKEN: \${FORGE_TOKEN_LLAMA:-\${FORGE_TOKEN:-}}
+      FORGE_PASS: \${FORGE_PASS_LLAMA:-\${FORGE_PASS:-}}
+      FORGE_REVIEW_TOKEN: \${FORGE_REVIEW_TOKEN:-}
+      FORGE_BOT_USERNAMES: \${FORGE_BOT_USERNAMES:-}
+      AGENT_ROLES: "${roles}"
+      CLAUDE_TIMEOUT: \${CLAUDE_TIMEOUT:-7200}
+      ANTHROPIC_BASE_URL: "${base_url}"
+      ANTHROPIC_API_KEY: "${api_key}"
+      CLAUDE_MODEL: "${model}"
+      CLAUDE_CONFIG_DIR: \${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
+      CLAUDE_CREDENTIALS_DIR: \${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}/credentials
+      CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "${compact_pct}"
+      CLAUDE_CODE_ATTRIBUTION_HEADER: "0"
+      CLAUDE_CODE_ENABLE_TELEMETRY: "0"
+      DISINTO_CONTAINER: "1"
+      PROJECT_NAME: ${PROJECT_NAME:-project}
+      PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
+      WOODPECKER_DATA_DIR: /woodpecker-data
+      WOODPECKER_REPO_ID: "${wp_repo_id}"
+      FORGE_BOT_USER_${service_name^^}: "${forge_user}"
+      POLL_INTERVAL: "${poll_interval_val}"
+      GARDENER_INTERVAL: "${GARDENER_INTERVAL:-21600}"
+      ARCHITECT_INTERVAL: "${ARCHITECT_INTERVAL:-21600}"
+      PLANNER_INTERVAL: "${PLANNER_INTERVAL:-43200}"
+    depends_on:
+      forgejo:
+        condition: service_healthy
+      woodpecker:
+        condition: service_started
+    networks:
+      - disinto-net
+    profiles: ["agents-${service_name}"]
+
+EOF
+            has_services=true
+          fi
+          # Collect volume name for later
+          local vol_name="  agents-${service_name}-data:"
+          if [ -n "$all_vols" ]; then
+            all_vols="${all_vols}
+${vol_name}"
+          else
+            all_vols="${vol_name}"
+          fi
+          service_name="" base_url="" model="" roles="" api_key="" forge_user="" compact_pct="" poll_interval_val=""
+          ;;
+      esac
+    done < <(python3 -c '
+import sys, tomllib, json, re
+
+with open(sys.argv[1], "rb") as f:
+    cfg = tomllib.load(f)
+
+agents = cfg.get("agents", {})
+for name, config in agents.items():
+    if not isinstance(config, dict):
+        continue
+
+    base_url = config.get("base_url", "")
+    model = config.get("model", "")
+    if not base_url or not model:
+        continue
+
+    roles = config.get("roles", ["dev"])
+    roles_str = " ".join(roles) if isinstance(roles, list) else roles
+    api_key = config.get("api_key", "sk-no-key-required")
+    forge_user = config.get("forge_user", f"{name}-bot")
+    compact_pct = config.get("compact_pct", 60)
+    poll_interval = config.get("poll_interval", 60)
+
+    safe_name = name.lower()
+    safe_name = re.sub(r"[^a-z0-9]", "-", safe_name)
+
+    # Output as simple key=value lines
+    print(f"NAME={safe_name}")
+    print(f"BASE_URL={base_url}")
+    print(f"MODEL={model}")
+    print(f"ROLES={roles_str}")
+    print(f"API_KEY={api_key}")
+    print(f"FORGE_USER={forge_user}")
+    print(f"COMPACT_PCT={compact_pct}")
+    print(f"POLL_INTERVAL={poll_interval}")
+    print("---")
+' "$toml" 2>/dev/null)
+  done
+
+  if [ "$has_services" = true ]; then
+    # Insert the services before the volumes section
+    local temp_compose
+    temp_compose=$(mktemp)
+    # Get everything before volumes:
+    sed -n '1,/^volumes:/p' "$compose_file" | sed '$d' > "$temp_compose"
+    # Add the services
+    cat "$temp_file" >> "$temp_compose"
+    # Add the volumes section and everything after
+    sed -n '/^volumes:/,$p' "$compose_file" >> "$temp_compose"
+
+    # Add local-model volumes to the volumes section
+    if [ -n "$all_vols" ]; then
+      # Find the volumes section and add the new volumes
+      sed -i "/^volumes:/{n;:a;n;/^[a-z]/!{s/$/\n$all_vols/;b};ba}" "$temp_compose"
+    fi
+
+    mv "$temp_compose" "$compose_file"
+  fi
+
+  rm -f "$temp_file"
+}
+
+# Generate docker-compose.yml in the factory root.
+# **CANONICAL SOURCE**: This generator is the single source of truth for docker-compose.yml.
+# The tracked docker-compose.yml file has been removed. Operators must run 'bin/disinto init'
+# to materialize a working stack on a fresh checkout.
+_generate_compose_impl() {
+  local forge_port="${1:-3000}"
+  local compose_file="${FACTORY_ROOT}/docker-compose.yml"
+
+  # Check if compose file already exists
+  if [ -f "$compose_file" ]; then
+    echo "Compose: ${compose_file} (already exists, skipping)"
+    return 0
+  fi
+
+  # Extract primary woodpecker_repo_id from project TOML files
+  local wp_repo_id
+  wp_repo_id=$(_get_primary_woodpecker_repo_id)
+
+  cat > "$compose_file" <<'COMPOSEEOF'
+# docker-compose.yml — generated by disinto init
+# Brings up Forgejo, Woodpecker, and the agent runtime.
+
+services:
+  forgejo:
+    image: codeberg.org/forgejo/forgejo:11.0
+    container_name: disinto-forgejo
+    restart: unless-stopped
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - forgejo-data:/data
+    environment:
+      FORGEJO__database__DB_TYPE: sqlite3
+      FORGEJO__server__ROOT_URL: ${FORGEJO_ROOT_URL:-http://forgejo:3000/}
+      FORGEJO__server__HTTP_PORT: "3000"
+      FORGEJO__security__INSTALL_LOCK: "true"
+      FORGEJO__service__DISABLE_REGISTRATION: "true"
+      FORGEJO__webhook__ALLOWED_HOST_LIST: "private"
+    healthcheck:
+      test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/api/v1/version"]
+      interval: 5s
+      timeout: 3s
+      retries: 30
+      start_period: 30s
+    networks:
+      - disinto-net
+
+  woodpecker:
+    image: woodpeckerci/woodpecker-server:v3
+    container_name: disinto-woodpecker
+    restart: unless-stopped
+    security_opt:
+      - apparmor=unconfined
+    ports:
+      - "8000:8000"
+      - "9000:9000"
+    volumes:
+      - woodpecker-data:/var/lib/woodpecker
+    environment:
+      WOODPECKER_FORGEJO: "true"
+      WOODPECKER_FORGEJO_URL: http://forgejo:3000
+      WOODPECKER_FORGEJO_CLIENT: ${WP_FORGEJO_CLIENT:-}
+      WOODPECKER_FORGEJO_SECRET: ${WP_FORGEJO_SECRET:-}
+      WOODPECKER_HOST: ${WOODPECKER_HOST:-http://woodpecker:8000}
+      WOODPECKER_SERVER: http://woodpecker:9000
+      WOODPECKER_OPEN: "true"
+      WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-}
+      WOODPECKER_DATABASE_DRIVER: sqlite3
+      WOODPECKER_DATABASE_DATASOURCE: /var/lib/woodpecker/woodpecker.sqlite
+      WOODPECKER_ENVIRONMENT: "FORGE_TOKEN:${FORGE_TOKEN}"
+    depends_on:
+      forgejo:
+        condition: service_healthy
+    networks:
+      - disinto-net
+
+  woodpecker-agent:
+    image: woodpeckerci/woodpecker-agent:v3
+    container_name: disinto-woodpecker-agent
+    restart: unless-stopped
+    network_mode: host
+    privileged: true
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    environment:
+      WOODPECKER_SERVER: localhost:9000
+      WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-}
+      WOODPECKER_GRPC_SECURE: "false"
+      WOODPECKER_HEALTHCHECK_ADDR: ":3333"
+      WOODPECKER_BACKEND_DOCKER_NETWORK: disinto_disinto-net
+      WOODPECKER_MAX_WORKFLOWS: 1
+    depends_on:
+      - woodpecker
+
+  agents:
+    build:
+      context: .
+      dockerfile: docker/agents/Dockerfile
+    container_name: disinto-agents
+    restart: unless-stopped
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - agent-data:/home/agent/data
+      - project-repos:/home/agent/repos
+      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
+      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
+      - CLAUDE_BIN_PLACEHOLDER:/usr/local/bin/claude:ro
+      - ${HOME}/.ssh:/home/agent/.ssh:ro
+      - ${HOME}/.config/sops/age:/home/agent/.config/sops/age:ro
+      - woodpecker-data:/woodpecker-data:ro
+    environment:
+      FORGE_URL: http://forgejo:3000
+      FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
+      FORGE_TOKEN: ${FORGE_TOKEN:-}
+      FORGE_REVIEW_TOKEN: ${FORGE_REVIEW_TOKEN:-}
+      FORGE_PLANNER_TOKEN: ${FORGE_PLANNER_TOKEN:-}
+      FORGE_GARDENER_TOKEN: ${FORGE_GARDENER_TOKEN:-}
+      FORGE_VAULT_TOKEN: ${FORGE_VAULT_TOKEN:-}
+      FORGE_SUPERVISOR_TOKEN: ${FORGE_SUPERVISOR_TOKEN:-}
+      FORGE_PREDICTOR_TOKEN: ${FORGE_PREDICTOR_TOKEN:-}
+      FORGE_ARCHITECT_TOKEN: ${FORGE_ARCHITECT_TOKEN:-}
+      FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-}
+      WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-}
+      CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
+      CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
+      ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
+      FORGE_PASS: ${FORGE_PASS:-}
+      FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
+      FACTORY_REPO: ${FORGE_REPO:-disinto-admin/disinto}
+      DISINTO_CONTAINER: "1"
+      PROJECT_NAME: ${PROJECT_NAME:-project}
+      PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
+      WOODPECKER_DATA_DIR: /woodpecker-data
+      WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
+      CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
+      POLL_INTERVAL: ${POLL_INTERVAL:-300}
+      GARDENER_INTERVAL: ${GARDENER_INTERVAL:-21600}
+      ARCHITECT_INTERVAL: ${ARCHITECT_INTERVAL:-21600}
+      PLANNER_INTERVAL: ${PLANNER_INTERVAL:-43200}
+    # IMPORTANT: agents get explicit environment variables (forge tokens, CI tokens, config).
+    # Vault-only secrets (GITHUB_TOKEN, CLAWHUB_TOKEN, deploy keys) live in
+    # .env.vault.enc and are NEVER injected here — only the runner
+    # container receives them at fire time (AD-006, #745).
+    depends_on:
+      forgejo:
+        condition: service_healthy
+      woodpecker:
+        condition: service_started
+    networks:
+      - disinto-net
+
+  runner:
+    build:
+      context: .
+      dockerfile: docker/agents/Dockerfile
+    profiles: ["vault"]
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - agent-data:/home/agent/data
+      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
+      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
+    environment:
+      FORGE_URL: http://forgejo:3000
+      DISINTO_CONTAINER: "1"
+      PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
+      CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
+    # Vault redesign in progress (PR-based approval, see #73-#77)
+    # This container is being replaced — entrypoint will be updated in follow-up
+    networks:
+      - disinto-net
+
+  # Edge proxy — reverse proxy to Forgejo, Woodpecker, and staging
+  # Serves on ports 80/443, routes based on path
+  edge:
+    build: ./docker/edge
+    container_name: disinto-edge
+    security_opt:
+      - apparmor=unconfined
+    ports:
+      - "80:80"
+      - "443:443"
+    environment:
+      - DISINTO_VERSION=${DISINTO_VERSION:-main}
+      - FORGE_URL=http://forgejo:3000
+      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
+      - FORGE_OPS_REPO=${FORGE_OPS_REPO:-disinto-admin/disinto-ops}
+      - FORGE_TOKEN=${FORGE_TOKEN:-}
+      - FORGE_PASS=${FORGE_PASS:-}
+      - FORGE_ADMIN_USERS=${FORGE_ADMIN_USERS:-disinto-admin}
+      - FORGE_ADMIN_TOKEN=${FORGE_ADMIN_TOKEN:-}
+      - OPS_REPO_ROOT=/opt/disinto-ops
+      - PROJECT_REPO_ROOT=/opt/disinto
+      - PRIMARY_BRANCH=main
+      - CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
+      # Reverse tunnel (optional — set by `disinto edge register`, see #622)
+      - EDGE_TUNNEL_HOST=${EDGE_TUNNEL_HOST:-}
+      - EDGE_TUNNEL_USER=${EDGE_TUNNEL_USER:-tunnel}
+      - EDGE_TUNNEL_PORT=${EDGE_TUNNEL_PORT:-}
+      - EDGE_TUNNEL_FQDN=${EDGE_TUNNEL_FQDN:-}
+      # Subdomain fallback (#713): if subpath routing (#704/#708) fails, add:
+      #   EDGE_TUNNEL_FQDN_FORGE, EDGE_TUNNEL_FQDN_CI, EDGE_TUNNEL_FQDN_CHAT
+      # See docs/edge-routing-fallback.md for the full pivot plan.
+      # Shared secret for Caddy ↔ chat forward_auth (#709)
+      - FORWARD_AUTH_SECRET=${FORWARD_AUTH_SECRET:-}
+    volumes:
+      - ./docker/Caddyfile:/etc/caddy/Caddyfile
+      - caddy_data:/data
+      - /var/run/docker.sock:/var/run/docker.sock
+      - ./secrets/tunnel_key:/run/secrets/tunnel_key:ro
+      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
+      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
+    depends_on:
+      forgejo:
+        condition: service_healthy
+      woodpecker:
+        condition: service_started
+      staging:
+        condition: service_started
+    networks:
+      - disinto-net
+
+  # Staging container — static file server for staging artifacts
+  # Edge proxy routes to this container for default requests
+  staging:
+    image: caddy:alpine
+    command: ["caddy", "file-server", "--root", "/srv/site"]
+    security_opt:
+      - apparmor=unconfined
+    volumes:
+      - ./docker:/srv/site:ro
+    networks:
+      - disinto-net
+
+  # Staging deployment slot — activated by Woodpecker staging pipeline (#755).
+  # Profile-gated: only starts when explicitly targeted by deploy commands.
+  # Customize image/ports/volumes for your project after init.
+  staging-deploy:
+    image: alpine:3
+    profiles: ["staging"]
+    security_opt:
+      - apparmor=unconfined
+    environment:
+      DEPLOY_ENV: staging
+    networks:
+      - disinto-net
+    command: ["echo", "staging slot — replace with project image"]
+
+  # Chat container — Claude chat UI backend (#705)
+  # Internal service only; edge proxy routes to chat:8080
+  # Sandbox hardened per #706 — no docker.sock, read-only rootfs, minimal caps
+  chat:
+    build:
+      context: ./docker/chat
+      dockerfile: Dockerfile
+    container_name: disinto-chat
+    restart: unless-stopped
+    read_only: true
+    tmpfs:
+      - /tmp:size=64m
+    security_opt:
+      - no-new-privileges:true
+    cap_drop:
+      - ALL
+    pids_limit: 128
+    mem_limit: 512m
+    memswap_limit: 512m
+    volumes:
+      # Mount claude binary from host (same as agents)
+      - CLAUDE_BIN_PLACEHOLDER:/usr/local/bin/claude:ro
+      # Throwaway named volume for chat config (isolated from host ~/.claude)
+      - chat-config:/var/chat/config
+      # Chat history persistence: per-user NDJSON files on bind-mounted host volume
+      - ${CHAT_HISTORY_DIR:-./state/chat-history}:/var/lib/chat/history
+    environment:
+      CHAT_HOST: "0.0.0.0"
+      CHAT_PORT: "8080"
+      FORGE_URL: http://forgejo:3000
+      CHAT_OAUTH_CLIENT_ID: ${CHAT_OAUTH_CLIENT_ID:-}
+      CHAT_OAUTH_CLIENT_SECRET: ${CHAT_OAUTH_CLIENT_SECRET:-}
+      EDGE_TUNNEL_FQDN: ${EDGE_TUNNEL_FQDN:-}
+      DISINTO_CHAT_ALLOWED_USERS: ${DISINTO_CHAT_ALLOWED_USERS:-}
+      # Shared secret for Caddy forward_auth verify endpoint (#709)
+      FORWARD_AUTH_SECRET: ${FORWARD_AUTH_SECRET:-}
+      # Cost caps / rate limiting (#711)
+      CHAT_MAX_REQUESTS_PER_HOUR: ${CHAT_MAX_REQUESTS_PER_HOUR:-60}
+      CHAT_MAX_REQUESTS_PER_DAY: ${CHAT_MAX_REQUESTS_PER_DAY:-500}
+      CHAT_MAX_TOKENS_PER_DAY: ${CHAT_MAX_TOKENS_PER_DAY:-1000000}
+    networks:
+      - disinto-net
+
+volumes:
+  forgejo-data:
+  woodpecker-data:
+  agent-data:
+  project-repos:
+  caddy_data:
+  chat-config:
+
+networks:
+  disinto-net:
+    driver: bridge
+COMPOSEEOF
+
+  # Patch PROJECT_REPO_ROOT — interpolate PROJECT_NAME at generation time
+  # (Docker Compose cannot resolve it; it's a shell variable, not a .env var)
+  sed -i "s|\${PROJECT_NAME:-project}|${PROJECT_NAME}|g" "$compose_file"
+
+  # Patch WOODPECKER_REPO_ID — interpolate at generation time
+  # (Docker Compose cannot resolve it; it's a shell variable, not a .env var)
+  if [ -n "$wp_repo_id" ] && [ "$wp_repo_id" != "0" ]; then
+    sed -i "s|PLACEHOLDER_WP_REPO_ID|${wp_repo_id}|g" "$compose_file"
+  else
+    # Default to empty if no repo_id found (agents will handle gracefully)
+    sed -i "s|PLACEHOLDER_WP_REPO_ID||g" "$compose_file"
+  fi
+
+  # Patch the forgejo port mapping into the file if non-default
+  if [ "$forge_port" != "3000" ]; then
+    # Add port mapping to forgejo service so it's reachable from host during init
+    sed -i "/image: codeberg\.org\/forgejo\/forgejo:11\.0/a\\    ports:\\n      - \"${forge_port}:3000\"" "$compose_file"
+  else
+    sed -i "/image: codeberg\.org\/forgejo\/forgejo:11\.0/a\\    ports:\\n      - \"3000:3000\"" "$compose_file"
+  fi
+
+  # Append local-model agent services if any are configured
+  # (must run before CLAUDE_BIN_PLACEHOLDER substitution so the placeholder
+  # in local-model services is also resolved)
+  _generate_local_model_services "$compose_file"
+
+  # Patch the Claude CLI binary path — resolve from host PATH at init time.
+  local claude_bin
+  claude_bin="$(command -v claude 2>/dev/null || true)"
+  if [ -n "$claude_bin" ]; then
+    # Resolve symlinks to get the real binary path
+    claude_bin="$(readlink -f "$claude_bin")"
+    sed -i "s|CLAUDE_BIN_PLACEHOLDER|${claude_bin}|g" "$compose_file"
+  else
+    echo "Warning: claude CLI not found in PATH — update docker-compose.yml volumes manually" >&2
+    sed -i "s|CLAUDE_BIN_PLACEHOLDER|/usr/local/bin/claude|g" "$compose_file"
+  fi
+
+  echo "Created: ${compose_file}"
+}
+
+# Generate docker/agents/ files if they don't already exist.
+_generate_agent_docker_impl() {
+  local docker_dir="${FACTORY_ROOT}/docker/agents"
+  mkdir -p "$docker_dir"
+
+  if [ ! -f "${docker_dir}/Dockerfile" ]; then
+    echo "Warning: docker/agents/Dockerfile not found — expected in repo" >&2
+  fi
+  if [ ! -f "${docker_dir}/entrypoint.sh" ]; then
+    echo "Warning: docker/agents/entrypoint.sh not found — expected in repo" >&2
+  fi
+}
+
+# Generate docker/Caddyfile template for edge proxy.
+_generate_caddyfile_impl() {
+  local docker_dir="${FACTORY_ROOT}/docker"
+  local caddyfile="${docker_dir}/Caddyfile"
+
+  if [ -f "$caddyfile" ]; then
+    echo "Caddyfile:  ${caddyfile} (already exists, skipping)"
+    return
+  fi
+
+  cat > "$caddyfile" <<'CADDYFILEEOF'
+# Caddyfile — edge proxy configuration
+# IP-only binding at bootstrap; domain + TLS added later via vault resource request
+
+:80 {
+    # Redirect root to Forgejo
+    handle / {
+        redir /forge/ 302
+    }
+
+    # Reverse proxy to Forgejo
+    handle /forge/* {
+        reverse_proxy forgejo:3000
+    }
+
+    # Reverse proxy to Woodpecker CI
+    handle /ci/* {
+        reverse_proxy woodpecker:8000
+    }
+
+    # Reverse proxy to staging
+    handle /staging/* {
+        reverse_proxy staging:80
+    }
+
+    # Chat service — reverse proxy to disinto-chat backend (#705)
+    # OAuth routes bypass forward_auth — unauthenticated users need these (#709)
+    handle /chat/login {
+        reverse_proxy chat:8080
+    }
+    handle /chat/oauth/callback {
+        reverse_proxy chat:8080
+    }
+    # Defense-in-depth: forward_auth stamps X-Forwarded-User from session (#709)
+    handle /chat/* {
+        forward_auth chat:8080 {
+            uri /chat/auth/verify
+            copy_headers X-Forwarded-User
+            header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}
+        }
+        reverse_proxy chat:8080
+    }
+}
+CADDYFILEEOF
+
+  echo "Created: ${caddyfile}"
+}
+
+# Generate docker/index.html default page.
+_generate_staging_index_impl() {
+  local docker_dir="${FACTORY_ROOT}/docker"
+  local index_file="${docker_dir}/index.html"
+
+  if [ -f "$index_file" ]; then
+    echo "Staging:  ${index_file} (already exists, skipping)"
+    return
+  fi
+
+  cat > "$index_file" <<'INDEXEOF'
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Nothing shipped yet</title>
+    <style>
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            min-height: 100vh;
+            margin: 0;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+        }
+        .container {
+            text-align: center;
+            padding: 2rem;
+        }
+        h1 {
+            font-size: 3rem;
+            margin: 0 0 1rem 0;
+        }
+        p {
+            font-size: 1.25rem;
+            opacity: 0.9;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Nothing shipped yet</h1>
+        <p>CI pipelines will update this page with your staging artifacts.</p>
+    </div>
+</body>
+</html>
+INDEXEOF
+
+  echo "Created: ${index_file}"
+}
+
+# Generate template .woodpecker/ deployment pipeline configs in a project repo.
+# Creates staging.yml and production.yml alongside the project's existing CI config.
+# These pipelines trigger on Woodpecker's deployment event with environment filters.
+_generate_deploy_pipelines_impl() {
+  local repo_root="$1"
+  local project_name="$2"
+  : "${project_name// /}"  # Silence SC2034 - variable used in heredoc
+  local wp_dir="${repo_root}/.woodpecker"
+
+  mkdir -p "$wp_dir"
+
+  # Skip if deploy pipelines already exist
+  if [ -f "${wp_dir}/staging.yml" ] && [ -f "${wp_dir}/production.yml" ]; then
+    echo "Deploy:  .woodpecker/{staging,production}.yml (already exist)"
+    return
+  fi
+
+  if [ ! -f "${wp_dir}/staging.yml" ]; then
+    cat > "${wp_dir}/staging.yml" <<'STAGINGEOF'
+# .woodpecker/staging.yml — Staging deployment pipeline
+# Triggered by runner via Woodpecker promote API.
+# Human approves promotion in vault → runner calls promote → this runs.
+
+when:
+  event: deployment
+  environment: staging
+
+steps:
+  - name: deploy-staging
+    image: docker:27
+    commands:
+      - echo "Deploying to staging environment..."
+      - echo "Pipeline ${CI_PIPELINE_NUMBER} promoted from CI #${CI_PIPELINE_PARENT}"
+      # Pull the image built by CI and deploy to staging
+      # Customize these commands for your project:
+      # - docker compose -f docker-compose.yml --profile staging up -d
+      - echo "Staging deployment complete"
+
+  - name: verify-staging
+    image: alpine:3
+    commands:
+      - echo "Verifying staging deployment..."
+      # Add health checks, smoke tests, or integration tests here:
+      # - curl -sf http://staging:8080/health || exit 1
+      - echo "Staging verification complete"
+STAGINGEOF
+    echo "Created: ${wp_dir}/staging.yml"
+  fi
+
+  if [ ! -f "${wp_dir}/production.yml" ]; then
+    cat > "${wp_dir}/production.yml" <<'PRODUCTIONEOF'
+# .woodpecker/production.yml — Production deployment pipeline
+# Triggered by runner via Woodpecker promote API.
+# Human approves promotion in vault → runner calls promote → this runs.
+
+when:
+  event: deployment
+  environment: production
+
+steps:
+  - name: deploy-production
+    image: docker:27
+    commands:
+      - echo "Deploying to production environment..."
+      - echo "Pipeline ${CI_PIPELINE_NUMBER} promoted from staging"
+      # Pull the verified image and deploy to production
+      # Customize these commands for your project:
+      # - docker compose -f docker-compose.yml up -d
+      - echo "Production deployment complete"
+
+  - name: verify-production
+    image: alpine:3
+    commands:
+      - echo "Verifying production deployment..."
+      # Add production health checks here:
+      # - curl -sf http://production:8080/health || exit 1
+      - echo "Production verification complete"
+PRODUCTIONEOF
+    echo "Created: ${wp_dir}/production.yml"
+  fi
+}
--- a/lib/git-creds.sh
+++ b/lib/git-creds.sh
@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# git-creds.sh — Shared git credential helper configuration
+#
+# Configures a static credential helper for Forgejo password-based HTTP auth.
+# Forgejo 11.x rejects API tokens for git push (#361); password auth works.
+# This ensures all git operations (clone, fetch, push) use password auth
+# without needing tokens embedded in remote URLs (#604).
+#
+# Usage:
+#   source "${FACTORY_ROOT}/lib/git-creds.sh"
+#   configure_git_creds [HOME_DIR] [RUN_AS_CMD]
+#   repair_baked_cred_urls [--as RUN_AS_CMD] DIR [DIR ...]
+#
+# Globals expected:
+#   FORGE_PASS  — bot password for git HTTP auth
+#   FORGE_URL   — Forge instance URL (e.g. http://forgejo:3000)
+#   FORGE_TOKEN — API token (used to resolve bot username)
+
+set -euo pipefail
+
+# configure_git_creds [HOME_DIR] [RUN_AS_CMD]
+#   HOME_DIR    — home directory for the git user (default: $HOME or /home/agent)
+#   RUN_AS_CMD  — command prefix to run as another user (e.g. "gosu agent")
+#
+# Writes a credential helper script and configures git to use it globally.
+configure_git_creds() {
+  local home_dir="${1:-${HOME:-/home/agent}}"
+  local run_as="${2:-}"
+
+  if [ -z "${FORGE_PASS:-}" ] || [ -z "${FORGE_URL:-}" ]; then
+    return 0
+  fi
+
+  local forge_host forge_proto
+  forge_host=$(printf '%s' "$FORGE_URL" | sed 's|https\?://||; s|/.*||')
+  forge_proto=$(printf '%s' "$FORGE_URL" | sed 's|://.*||')
+
+  # Determine the bot username from FORGE_TOKEN identity (or default to dev-bot)
+  local bot_user=""
+  if [ -n "${FORGE_TOKEN:-}" ]; then
+    bot_user=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+      "${FORGE_URL}/api/v1/user" 2>/dev/null | jq -r '.login // empty') || bot_user=""
+  fi
+  bot_user="${bot_user:-dev-bot}"
+
+  local helper_path="${home_dir}/.git-credentials-helper"
+
+  # Write a static credential helper script (git credential protocol)
+  cat > "$helper_path" <<CREDEOF
+#!/bin/sh
+# Auto-generated git credential helper for Forgejo password auth (#361, #604)
+# Reads \$FORGE_PASS from env at runtime — file is safe to read on disk.
+# Only respond to "get" action; ignore "store" and "erase".
+[ "\$1" = "get" ] || exit 0
+# Read and discard stdin (git sends protocol/host info)
+cat >/dev/null
+echo "protocol=${forge_proto}"
+echo "host=${forge_host}"
+echo "username=${bot_user}"
+echo "password=\$FORGE_PASS"
+CREDEOF
+  chmod 755 "$helper_path"
+
+  # Set ownership and configure git if running as a different user
+  if [ -n "$run_as" ]; then
+    local target_user
+    target_user=$(echo "$run_as" | awk '{print $NF}')
+    chown "${target_user}:${target_user}" "$helper_path" 2>/dev/null || true
+    $run_as bash -c "git config --global credential.helper '${helper_path}'"
+  else
+    git config --global credential.helper "$helper_path"
+  fi
+
+  # Set safe.directory to work around dubious ownership after container restart
+  if [ -n "$run_as" ]; then
+    $run_as bash -c "git config --global --add safe.directory '*'"
+  else
+    git config --global --add safe.directory '*'
+  fi
+}
+
+# repair_baked_cred_urls [--as RUN_AS_CMD] DIR [DIR ...]
+#   Scans git repos under each DIR and rewrites remote URLs that contain
+#   embedded credentials (user:pass@host) to clean URLs.
+#   Logs each repair so operators can see the migration happened.
+#
+#   Optional --as flag runs git operations under the specified user wrapper
+#   (e.g. "gosu agent") to avoid dubious-ownership issues on user-owned repos.
+#
+# Set _GIT_CREDS_LOG_FN to a custom log function name (default: echo).
+repair_baked_cred_urls() {
+  local log_fn="${_GIT_CREDS_LOG_FN:-echo}"
+  local run_as=""
+  local -a dirs=()
+  while [ $# -gt 0 ]; do
+    case "$1" in
+      --as) shift; run_as="$1"; shift ;;
+      *) dirs+=("$1"); shift ;;
+    esac
+  done
+
+  for dir in "${dirs[@]}"; do
+    [ -d "$dir" ] || continue
+
+    # Find git repos: either dir itself or immediate subdirectories
+    local -a repos=()
+    if [ -d "${dir}/.git" ]; then
+      repos+=("$dir")
+    else
+      local sub
+      for sub in "$dir"/*/; do
+        [ -d "${sub}.git" ] && repos+=("${sub%/}")
+      done
+    fi
+
+    local repo
+    for repo in "${repos[@]}"; do
+      local url
+      if [ -n "$run_as" ]; then
+        url=$($run_as git -C "$repo" config --get remote.origin.url 2>/dev/null || true)
+      else
+        url=$(git -C "$repo" config --get remote.origin.url 2>/dev/null || true)
+      fi
+      [ -n "$url" ] || continue
+
+      # Check if URL contains embedded credentials: http(s)://user:pass@host
+      if printf '%s' "$url" | grep -qE '^https?://[^/]+@'; then
+        # Strip credentials: http(s)://user:pass@host/path -> http(s)://host/path
+        local clean_url
+        clean_url=$(printf '%s' "$url" | sed -E 's|(https?://)[^@]+@|\1|')
+        if [ -n "$run_as" ]; then
+          $run_as git -C "$repo" remote set-url origin "$clean_url"
+        else
+          git -C "$repo" remote set-url origin "$clean_url"
+        fi
+        $log_fn "Repaired baked credentials in ${repo} (remote origin -> ${clean_url})"
+      fi
+    done
+  done
+}
--- a/lib/guard.sh
+++ b/lib/guard.sh
@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# guard.sh — Active-state guard for cron entry points
+# guard.sh — Active-state guard for polling-loop entry points
 #
 # Each agent checks for a state file before running. If the file
 # doesn't exist, the agent logs a skip and exits cleanly.
--- a/lib/hire-agent.sh
+++ b/lib/hire-agent.sh
@ -0,0 +1,503 @@
+#!/usr/bin/env bash
+# =============================================================================
+# hire-agent — disinto_hire_an_agent() function
+#
+# Handles user creation, .profile repo setup, formula copying, branch protection,
+# and state marker creation for hiring a new agent.
+#
+# Globals expected:
+#   FORGE_URL    - Forge instance URL
+#   FORGE_TOKEN  - Admin token for Forge operations
+#   FACTORY_ROOT - Root of the disinto factory
+#   PROJECT_NAME - Project name for email/domain generation
+#
+# Usage:
+#   source "${FACTORY_ROOT}/lib/hire-agent.sh"
+#   disinto_hire_an_agent <agent-name> <role> [--formula <path>] [--local-model <url>] [--model <name>] [--poll-interval <seconds>]
+# =============================================================================
+set -euo pipefail
+
+disinto_hire_an_agent() {
+  local agent_name="${1:-}"
+  local role="${2:-}"
+  local formula_path=""
+  local local_model=""
+  local model_name=""
+  local poll_interval=""
+
+  if [ -z "$agent_name" ] || [ -z "$role" ]; then
+    echo "Error: agent-name and role required" >&2
+    echo "Usage: disinto hire-an-agent <agent-name> <role> [--formula <path>] [--local-model <url>] [--model <name>] [--poll-interval <seconds>]" >&2
+    exit 1
+  fi
+  shift 2
+
+  # Parse flags
+  while [ $# -gt 0 ]; do
+    case "$1" in
+      --formula)
+        formula_path="$2"
+        shift 2
+        ;;
+      --local-model)
+        local_model="$2"
+        shift 2
+        ;;
+      --model)
+        model_name="$2"
+        shift 2
+        ;;
+      --poll-interval)
+        poll_interval="$2"
+        shift 2
+        ;;
+      *)
+        echo "Unknown option: $1" >&2
+        exit 1
+        ;;
+    esac
+  done
+
+  # Default formula path — try both naming conventions
+  if [ -z "$formula_path" ]; then
+    formula_path="${FACTORY_ROOT}/formulas/${role}.toml"
+    if [ ! -f "$formula_path" ]; then
+      formula_path="${FACTORY_ROOT}/formulas/run-${role}.toml"
+    fi
+  fi
+
+  # Validate formula exists
+  if [ ! -f "$formula_path" ]; then
+    echo "Error: formula not found at ${formula_path}" >&2
+    exit 1
+  fi
+
+  echo "── Hiring agent: ${agent_name} (${role}) ───────────────────────"
+  echo "Formula:   ${formula_path}"
+  if [ -n "$local_model" ]; then
+    echo "Local model: ${local_model}"
+    echo "Model name:  ${model_name:-local-model}"
+    echo "Poll interval: ${poll_interval:-60}s"
+  fi
+
+  # Ensure FORGE_TOKEN is set
+  if [ -z "${FORGE_TOKEN:-}" ]; then
+    echo "Error: FORGE_TOKEN not set" >&2
+    exit 1
+  fi
+
+  # Get Forge URL
+  local forge_url="${FORGE_URL:-http://localhost:3000}"
+  echo "Forge:     ${forge_url}"
+
+  # Step 1: Create user via API (skip if exists)
+  echo ""
+  echo "Step 1: Creating user '${agent_name}' (if not exists)..."
+
+  local user_pass=""
+  local admin_pass=""
+
+  # Read admin password from .env for standalone runs (#184)
+  local env_file="${FACTORY_ROOT}/.env"
+  if [ -f "$env_file" ] && grep -q '^FORGE_ADMIN_PASS=' "$env_file" 2>/dev/null; then
+    admin_pass=$(grep '^FORGE_ADMIN_PASS=' "$env_file" | head -1 | cut -d= -f2-)
+  fi
+
+  # Get admin token early (needed for both user creation and password reset)
+  local admin_user="disinto-admin"
+  admin_pass="${admin_pass:-admin}"
+  local admin_token=""
+  local admin_token_name
+  admin_token_name="temp-token-$(date +%s)"
+  admin_token=$(curl -sf -X POST \
+    -u "${admin_user}:${admin_pass}" \
+    -H "Content-Type: application/json" \
+    "${forge_url}/api/v1/users/${admin_user}/tokens" \
+    -d "{\"name\":\"${admin_token_name}\",\"scopes\":[\"all\"]}" 2>/dev/null \
+    | jq -r '.sha1 // empty') || admin_token=""
+  if [ -z "$admin_token" ]; then
+    # Token might already exist — try listing
+    admin_token=$(curl -sf \
+      -u "${admin_user}:${admin_pass}" \
+      "${forge_url}/api/v1/users/${admin_user}/tokens" 2>/dev/null \
+      | jq -r '.[0].sha1 // empty') || admin_token=""
+  fi
+  if [ -z "$admin_token" ]; then
+    echo "Error: failed to obtain admin API token" >&2
+    echo "  Cannot proceed without admin privileges" >&2
+    exit 1
+  fi
+
+  if curl -sf --max-time 5 "${forge_url}/api/v1/users/${agent_name}" >/dev/null 2>&1; then
+    echo "  User '${agent_name}' already exists"
+    # Reset user password so we can get a token (#184)
+    user_pass="agent-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+    # Use Forgejo CLI to reset password (API PATCH ignores must_change_password in Forgejo 11.x)
+    if _forgejo_exec forgejo admin user change-password \
+      --username "${agent_name}" \
+      --password "${user_pass}" \
+      --must-change-password=false >/dev/null 2>&1; then
+      echo "  Reset password for existing user '${agent_name}'"
+    else
+      echo "  Warning: could not reset password for existing user" >&2
+    fi
+  else
+    # Create user using basic auth (admin token fallback would poison subsequent calls)
+    # Create the user
+    user_pass="agent-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+    if curl -sf -X POST \
+      -u "${admin_user}:${admin_pass}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/admin/users" \
+      -d "{\"username\":\"${agent_name}\",\"password\":\"${user_pass}\",\"email\":\"${agent_name}@${PROJECT_NAME:-disinto}.local\",\"full_name\":\"${agent_name}\",\"active\":true,\"admin\":false,\"must_change_password\":false}" >/dev/null 2>&1; then
+      echo "  Created user '${agent_name}'"
+    else
+      echo "  Warning: failed to create user via admin API" >&2
+      # Try alternative: user might already exist
+      if curl -sf --max-time 5 "${forge_url}/api/v1/users/${agent_name}" >/dev/null 2>&1; then
+        echo "  User '${agent_name}' exists (confirmed)"
+      else
+        echo "  Error: failed to create user '${agent_name}'" >&2
+        exit 1
+      fi
+    fi
+  fi
+
+  # Step 1.5: Generate Forge token for the new/existing user
+  echo ""
+  echo "Step 1.5: Generating Forge token for '${agent_name}'..."
+
+  # Convert role to uppercase token variable name (e.g., architect -> FORGE_ARCHITECT_TOKEN)
+  local role_upper
+  role_upper=$(echo "$role" | tr '[:lower:]' '[:upper:]')
+  local token_var="FORGE_${role_upper}_TOKEN"
+
+  # Generate token using the user's password (basic auth)
+  local agent_token=""
+  agent_token=$(curl -sf -X POST \
+    -u "${agent_name}:${user_pass}" \
+    -H "Content-Type: application/json" \
+    "${forge_url}/api/v1/users/${agent_name}/tokens" \
+    -d "{\"name\":\"disinto-${agent_name}-token\",\"scopes\":[\"all\"]}" 2>/dev/null \
+    | jq -r '.sha1 // empty') || agent_token=""
+
+  if [ -z "$agent_token" ]; then
+    # Token name collision — create with timestamp suffix
+    agent_token=$(curl -sf -X POST \
+      -u "${agent_name}:${user_pass}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/users/${agent_name}/tokens" \
+      -d "{\"name\":\"disinto-${agent_name}-$(date +%s)\",\"scopes\":[\"all\"]}" 2>/dev/null \
+      | jq -r '.sha1 // empty') || agent_token=""
+  fi
+
+  if [ -z "$agent_token" ]; then
+    echo "  Warning: failed to create API token for '${agent_name}'" >&2
+  else
+    # Store token in .env under the role-specific variable name
+    if grep -q "^${token_var}=" "$env_file" 2>/dev/null; then
+      # Use sed with alternative delimiter and proper escaping for special chars in token
+      local escaped_token
+      escaped_token=$(printf '%s\n' "$agent_token" | sed 's/[&/\]/\\&/g')
+      sed -i "s|^${token_var}=.*|${token_var}=${escaped_token}|" "$env_file"
+      echo "  ${agent_name} token updated (${token_var})"
+    else
+      printf '%s=%s\n' "$token_var" "$agent_token" >> "$env_file"
+      echo "  ${agent_name} token saved (${token_var})"
+    fi
+    export "${token_var}=${agent_token}"
+  fi
+
+  # Step 2: Create .profile repo on Forgejo
+  echo ""
+  echo "Step 2: Creating '${agent_name}/.profile' repo (if not exists)..."
+
+  if curl -sf --max-time 5 "${forge_url}/api/v1/repos/${agent_name}/.profile" >/dev/null 2>&1; then
+    echo "  Repo '${agent_name}/.profile' already exists"
+  else
+    # Create the repo using the admin API to ensure it's created in the agent's namespace.
+    # Using POST /api/v1/user/repos with a user token would create the repo under the
+    # authenticated user, which could be wrong if the token belongs to a different user.
+    # The admin API POST /api/v1/admin/users/{username}/repos explicitly creates in the
+    # specified user's namespace.
+    local create_output
+    create_output=$(curl -sf -X POST \
+      -u "${admin_user}:${admin_pass}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/admin/users/${agent_name}/repos" \
+      -d "{\"name\":\".profile\",\"description\":\"${agent_name}'s .profile repo\",\"private\":true,\"auto_init\":false}" 2>&1) || true
+
+    if echo "$create_output" | grep -q '"id":\|[0-9]'; then
+      echo "  Created repo '${agent_name}/.profile' (via admin API)"
+    else
+      echo "  Error: failed to create repo '${agent_name}/.profile'" >&2
+      echo "  Response: ${create_output}" >&2
+      exit 1
+    fi
+  fi
+
+  # Step 3: Clone repo and create initial commit
+  echo ""
+  echo "Step 3: Cloning repo and creating initial commit..."
+
+  local clone_dir="/tmp/.profile-clone-${agent_name}"
+  rm -rf "$clone_dir"
+  mkdir -p "$clone_dir"
+
+  # Build authenticated clone URL using basic auth (user_pass is always set in Step 1)
+  if [ -z "${user_pass:-}" ]; then
+    echo "  Error: no user password available for cloning" >&2
+    exit 1
+  fi
+
+  local auth_url
+  auth_url=$(printf '%s' "$forge_url" | sed "s|://|://${agent_name}:${user_pass}@|")
+  auth_url="${auth_url}/${agent_name}/.profile.git"
+
+  # Display unauthenticated URL (auth token only in actual git clone command)
+  echo "  Cloning: ${forge_url}/${agent_name}/.profile.git"
+
+  # Try authenticated clone first (required for private repos)
+  if ! git clone --quiet "$auth_url" "$clone_dir" 2>/dev/null; then
+    echo "  Error: failed to clone repo with authentication" >&2
+    echo "  Note: Ensure the user has a valid API token with repository access" >&2
+    rm -rf "$clone_dir"
+    exit 1
+  fi
+
+  # Configure git
+  git -C "$clone_dir" config user.name "disinto-admin"
+  git -C "$clone_dir" config user.email "disinto-admin@localhost"
+
+  # Create directory structure
+  echo "  Creating directory structure..."
+  mkdir -p "${clone_dir}/journal"
+  mkdir -p "${clone_dir}/knowledge"
+  touch "${clone_dir}/journal/.gitkeep"
+  touch "${clone_dir}/knowledge/.gitkeep"
+
+  # Copy formula
+  echo "  Copying formula..."
+  cp "$formula_path" "${clone_dir}/formula.toml"
+
+  # Create README
+  if [ ! -f "${clone_dir}/README.md" ]; then
+    cat > "${clone_dir}/README.md" <<EOF
+# ${agent_name}'s .profile
+
+Agent profile repository for ${agent_name}.
+
+## Structure
+
+\`\`\`
+${agent_name}/.profile/
+├── formula.toml    # Agent's role formula
+├── journal/        # Issue-by-issue log files (journal branch)
+│   └── .gitkeep
+├── knowledge/      # Shared knowledge and best practices
+│   └── .gitkeep
+└── README.md
+\`\`\`
+
+## Branches
+
+- \`main\` — Admin-only merge for formula changes (requires 1 approval)
+- \`journal\` — Agent branch for direct journal entries
+  - Agent can push directly to this branch
+  - Formula changes must go through PR to \`main\`
+
+## Branch protection
+
+- \`main\`: Protected — requires 1 admin approval for merges
+- \`journal\`: Unprotected — agent can push directly
+EOF
+  fi
+
+  # Commit and push
+  echo "  Committing and pushing..."
+  git -C "$clone_dir" add -A
+  if ! git -C "$clone_dir" diff --cached --quiet 2>/dev/null; then
+    git -C "$clone_dir" commit -m "chore: initial .profile setup" -q
+    git -C "$clone_dir" push origin main >/dev/null 2>&1 || \
+      git -C "$clone_dir" push origin master >/dev/null 2>&1 || true
+    echo "  Committed: initial .profile setup"
+  else
+    echo "  No changes to commit"
+  fi
+
+  rm -rf "$clone_dir"
+
+  # Step 4: Set up branch protection
+  echo ""
+  echo "Step 4: Setting up branch protection..."
+
+  # Source branch-protection.sh helper
+  local bp_script="${FACTORY_ROOT}/lib/branch-protection.sh"
+  if [ -f "$bp_script" ]; then
+    # Source required environment
+    if [ -f "${FACTORY_ROOT}/lib/env.sh" ]; then
+      source "${FACTORY_ROOT}/lib/env.sh"
+    fi
+
+    # Set up branch protection for .profile repo
+    if source "$bp_script" 2>/dev/null && setup_profile_branch_protection "${agent_name}/.profile" "main"; then
+      echo "  Branch protection configured for main branch"
+      echo "  - Requires 1 approval before merge"
+      echo "  - Admin-only merge enforcement"
+      echo "  - Journal branch created for direct agent pushes"
+    else
+      echo "  Warning: could not configure branch protection (Forgejo API may not be available)"
+      echo "  Note: Branch protection can be set up manually later"
+    fi
+  else
+    echo "  Warning: branch-protection.sh not found at ${bp_script}"
+  fi
+
+  # Step 5: Create state marker
+  echo ""
+  echo "Step 5: Creating state marker..."
+
+  local state_dir="${FACTORY_ROOT}/state"
+  mkdir -p "$state_dir"
+  local state_file="${state_dir}/.${role}-active"
+
+  if [ ! -f "$state_file" ]; then
+    touch "$state_file"
+    echo "  Created: ${state_file}"
+  else
+    echo "  State marker already exists: ${state_file}"
+  fi
+
+  # Step 6: Set up local model agent (if --local-model specified)
+  if [ -n "$local_model" ]; then
+    echo ""
+    echo "Step 6: Configuring local model agent..."
+
+    # Validate model endpoint is reachable
+    echo "  Validating model endpoint: ${local_model}"
+    if ! curl -sf --max-time 10 "${local_model}/health" >/dev/null 2>&1; then
+      # Try /v1/chat/completions as fallback endpoint check
+      if ! curl -sf --max-time 10 "${local_model}/v1/chat/completions" >/dev/null 2>&1; then
+        echo "  Warning: model endpoint may not be reachable at ${local_model}"
+        echo "  Continuing with configuration..."
+      fi
+    else
+      echo "  Model endpoint is reachable"
+    fi
+
+    # Find project TOML
+    local project_name="${PROJECT_NAME:-}"
+    local toml_file=""
+    if [ -n "$project_name" ]; then
+      toml_file="${FACTORY_ROOT}/projects/${project_name}.toml"
+    fi
+    # Fallback: find the first .toml in projects/
+    if [ -z "$toml_file" ] || [ ! -f "$toml_file" ]; then
+      for f in "${FACTORY_ROOT}/projects/"*.toml; do
+        if [ -f "$f" ]; then
+          toml_file="$f"
+          break
+        fi
+      done
+    fi
+
+    if [ -z "$toml_file" ] || [ ! -f "$toml_file" ]; then
+      echo "  Error: no project TOML found in ${FACTORY_ROOT}/projects/" >&2
+      echo "  Run 'disinto init' first to create a project config" >&2
+      exit 1
+    fi
+
+    echo "  Project TOML: ${toml_file}"
+
+    # Derive a safe section name from the agent name (lowercase, alphanumeric+hyphens)
+    local section_name
+    section_name=$(echo "$agent_name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9]/-/g')
+
+    # Default model name if not provided
+    local model="${model_name:-local-model}"
+
+    # Write [agents.<name>] section to the project TOML
+    local interval="${poll_interval:-60}"
+    echo "  Writing [agents.${section_name}] to ${toml_file}..."
+    python3 -c '
+import sys, re, pathlib
+
+toml_path = sys.argv[1]
+section_name = sys.argv[2]
+base_url = sys.argv[3]
+model = sys.argv[4]
+agent_name = sys.argv[5]
+role = sys.argv[6]
+poll_interval = sys.argv[7]
+
+p = pathlib.Path(toml_path)
+text = p.read_text()
+
+# Build the new section
+new_section = f"""
+[agents.{section_name}]
+base_url = "{base_url}"
+model = "{model}"
+api_key = "sk-no-key-required"
+roles = ["{role}"]
+forge_user = "{agent_name}"
+compact_pct = 60
+poll_interval = {poll_interval}
+"""
+
+# Check if section already exists and replace it
+pattern = rf"\[agents\.{re.escape(section_name)}\][^\[]*"
+if re.search(pattern, text):
+    text = re.sub(pattern, new_section.strip() + "\n", text)
+else:
+    # Remove commented-out example [agents.llama] block if present
+    text = re.sub(
+        r"\n# Local-model agents \(optional\).*?(?=\n# \[mirrors\]|\n\[mirrors\]|\Z)",
+        "",
+        text,
+        flags=re.DOTALL,
+    )
+    # Append before [mirrors] if it exists, otherwise at end
+    mirrors_match = re.search(r"\n(# )?\[mirrors\]", text)
+    if mirrors_match:
+        text = text[:mirrors_match.start()] + "\n" + new_section + text[mirrors_match.start():]
+    else:
+        text = text.rstrip() + "\n" + new_section
+
+p.write_text(text)
+' "$toml_file" "$section_name" "$local_model" "$model" "$agent_name" "$role" "$interval"
+
+    echo "  Agent config written to TOML"
+
+    # Regenerate docker-compose.yml to include the new agent container
+    local compose_file="${FACTORY_ROOT}/docker-compose.yml"
+    if [ -f "$compose_file" ]; then
+      echo "  Regenerating docker-compose.yml..."
+      rm -f "$compose_file"
+      # generate_compose is defined in the calling script (bin/disinto) via generators.sh
+      # Use _generate_compose_impl directly since generators.sh is already sourced
+      local forge_port="3000"
+      if [ -n "${FORGE_URL:-}" ]; then
+        forge_port=$(printf '%s' "$FORGE_URL" | sed -E 's|.*:([0-9]+)/?$|\1|')
+        forge_port="${forge_port:-3000}"
+      fi
+      _generate_compose_impl "$forge_port"
+      echo "  Compose regenerated with agents-${section_name} service"
+    fi
+
+    local service_name="agents-${section_name}"
+    echo ""
+    echo "  Service name: ${service_name}"
+    echo "  Model endpoint: ${local_model}"
+    echo "  Model: ${model}"
+    echo ""
+    echo "  To start the agent, run:"
+    echo "    docker compose --profile ${service_name} up -d ${service_name}"
+  fi
+
+  echo ""
+  echo "Done! Agent '${agent_name}' hired for role '${role}'."
+  echo "  User:    ${forge_url}/${agent_name}"
+  echo "  Repo:    ${forge_url}/${agent_name}/.profile"
+  echo "  Formula: ${role}.toml"
+}
--- a/lib/issue-lifecycle.sh
+++ b/lib/issue-lifecycle.sh
@ -43,7 +43,6 @@ _ilc_log() {

 # ---------------------------------------------------------------------------
 # Label ID caching — lookup once per name, cache in globals.
-# Pattern follows ci-helpers.sh (ensure_blocked_label_id).
 # ---------------------------------------------------------------------------
 declare -A _ILC_LABEL_IDS
 _ILC_LABEL_IDS["backlog"]=""
@ -80,6 +79,27 @@ _ilc_backlog_id()      { _ilc_ensure_label_id "backlog"     "#0075ca"; }
 _ilc_in_progress_id()  { _ilc_ensure_label_id "in-progress" "#1d76db"; }
 _ilc_blocked_id()      { _ilc_ensure_label_id "blocked"     "#e11d48"; }

+# ---------------------------------------------------------------------------
+# Labels that indicate an issue belongs to a non-dev agent workflow.
+# Any issue carrying one of these should NOT be touched by dev-poll's
+# stale-detection or orphan-recovery logic.  See issue #608.
+# ---------------------------------------------------------------------------
+_ILC_NON_DEV_LABELS="bug-report vision in-triage prediction/unreviewed prediction/dismissed action formula"
+
+# issue_is_dev_claimable COMMA_SEPARATED_LABELS
+# Returns 0 if the issue's labels are compatible with dev-agent ownership,
+# 1 if any non-dev label is present (meaning another agent owns this issue).
+issue_is_dev_claimable() {
+  local labels="$1"
+  local lbl
+  for lbl in $_ILC_NON_DEV_LABELS; do
+    if echo ",$labels," | grep -qF ",$lbl,"; then
+      return 1
+    fi
+  done
+  return 0
+}
+
 # ---------------------------------------------------------------------------
 # issue_claim — assign issue to bot, add "in-progress" label, remove "backlog".
 # Args: issue_number
@ -103,7 +123,9 @@ issue_claim() {
    return 1
  fi

-  # Assign to self (Forgejo rejects if already assigned differently)
+  # Assign to self BEFORE adding in-progress label (issue #471).
+  # This ordering ensures the assignee is set by the time other pollers
+  # see the in-progress label, reducing the stale-detection race window.
  curl -sf -X PATCH \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
@ -161,6 +183,27 @@ issue_release() {
  _ilc_log "released issue #${issue}"
 }

+# ---------------------------------------------------------------------------
+# _ilc_post_comment — Post a comment to an issue (internal helper)
+# Args: issue_number body_text
+# Uses a temp file to avoid large inline strings.
+# ---------------------------------------------------------------------------
+_ilc_post_comment() {
+  local issue="$1" body="$2"
+
+  local tmpfile tmpjson
+  tmpfile=$(mktemp /tmp/ilc-comment-XXXXXX.md)
+  tmpjson="${tmpfile}.json"
+  printf '%s' "$body" > "$tmpfile"
+  jq -Rs '{body:.}' < "$tmpfile" > "$tmpjson"
+  curl -sf -o /dev/null -X POST \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    -H "Content-Type: application/json" \
+    "${FORGE_API}/issues/${issue}/comments" \
+    --data-binary @"$tmpjson" 2>/dev/null || true
+  rm -f "$tmpfile" "$tmpjson"
+}
+
 # ---------------------------------------------------------------------------
 # issue_block — add "blocked" label, post diagnostic comment, remove in-progress.
 # Args: issue_number reason [result_text]
@ -187,14 +230,9 @@ issue_block() {
    fi
  } > "$tmpfile"

-  # Post comment
-  jq -Rs '{body:.}' < "$tmpfile" > "${tmpfile}.json"
-  curl -sf -o /dev/null -X POST \
-    -H "Authorization: token ${FORGE_TOKEN}" \
-    -H "Content-Type: application/json" \
-    "${FORGE_API}/issues/${issue}/comments" \
-    --data-binary @"${tmpfile}.json" 2>/dev/null || true
-  rm -f "$tmpfile" "${tmpfile}.json"
+  # Post comment using shared helper
+  _ilc_post_comment "$issue" "$(cat "$tmpfile")"
+  rm -f "$tmpfile"

  # Remove in-progress, add blocked
  local ip_id bk_id
--- a/lib/load-project.sh
+++ b/lib/load-project.sh
@ -10,7 +10,6 @@
 #   PROJECT_CONTAINERS, CHECK_PRS, CHECK_DEV_AGENT,
 #   CHECK_PIPELINE_STALL, CI_STALE_MINUTES,
 #   MIRROR_NAMES, MIRROR_URLS, MIRROR_<NAME> (per configured mirror)
-#   (plus backwards-compat aliases: CODEBERG_REPO, CODEBERG_API, CODEBERG_WEB)
 #
 # If no argument given, does nothing (allows poll scripts to work with
 # plain .env fallback for backwards compatibility).
@ -83,7 +82,7 @@ if mirrors:
 # Export parsed variables.
 # Inside the agents container (DISINTO_CONTAINER=1), compose already sets the
 # correct FORGE_URL (http://forgejo:3000) and path vars for the container
-# environment.  The TOML carries host-perspective values (localhost, /home/johba/…)
+# environment.  The TOML carries host-perspective values (localhost, /home/admin/…)
 # that would break container API calls and path resolution.  Skip overriding
 # any env var that is already set when running inside the container.
 while IFS='=' read -r _key _val; do
@ -98,27 +97,61 @@ done <<< "$_PROJECT_VARS"
 # FORGE_URL: TOML forge_url > existing FORGE_URL > default
 export FORGE_URL="${FORGE_URL:-http://localhost:3000}"
 if [ -n "$FORGE_REPO" ]; then
-  export FORGE_API="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+  export FORGE_API_BASE="${FORGE_URL}/api/v1"
+  export FORGE_API="${FORGE_API_BASE}/repos/${FORGE_REPO}"
  export FORGE_WEB="${FORGE_URL}/${FORGE_REPO}"
-fi
-# Backwards-compat aliases
-export CODEBERG_REPO="${FORGE_REPO}"
-export CODEBERG_API="${FORGE_API:-}"
-export CODEBERG_WEB="${FORGE_WEB:-}"
-
-# Derive PROJECT_REPO_ROOT if not explicitly set
-if [ -z "${PROJECT_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" ]; then
-  export PROJECT_REPO_ROOT="/home/${USER}/${PROJECT_NAME}"
+  # Extract repo owner (first path segment of owner/repo)
+  export FORGE_REPO_OWNER="${FORGE_REPO%%/*}"
 fi

-# Derive OPS_REPO_ROOT if not explicitly set
-if [ -z "${OPS_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" ]; then
-  export OPS_REPO_ROOT="/home/${USER}/${PROJECT_NAME}-ops"
-fi
+# PROJECT_REPO_ROOT and OPS_REPO_ROOT: no fallback derivation from USER/HOME.
+# These must be set by the entrypoint (container) or the TOML (host CLI).
+# Inside the container, the entrypoint exports the correct paths before agent
+# scripts source env.sh; the TOML's host-perspective paths are skipped by the
+# DISINTO_CONTAINER guard above.

 # Derive FORGE_OPS_REPO if not explicitly set
 if [ -z "${FORGE_OPS_REPO:-}" ] && [ -n "${FORGE_REPO:-}" ]; then
  export FORGE_OPS_REPO="${FORGE_REPO}-ops"
 fi

+# Parse [agents.*] sections for local-model agents
+# Exports AGENT_<NAME>_BASE_URL, AGENT_<NAME>_MODEL, AGENT_<NAME>_API_KEY,
+# AGENT_<NAME>_ROLES, AGENT_<NAME>_FORGE_USER, AGENT_<NAME>_COMPACT_PCT
+if command -v python3 &>/dev/null; then
+  _AGENT_VARS=$(python3 -c "
+import sys, tomllib
+
+with open(sys.argv[1], 'rb') as f:
+    cfg = tomllib.load(f)
+
+agents = cfg.get('agents', {})
+for name, config in agents.items():
+    if not isinstance(config, dict):
+        continue
+    # Emit variables in uppercase with the agent name
+    if 'base_url' in config:
+        print(f'AGENT_{name.upper()}_BASE_URL={config[\"base_url\"]}')
+    if 'model' in config:
+        print(f'AGENT_{name.upper()}_MODEL={config[\"model\"]}')
+    if 'api_key' in config:
+        print(f'AGENT_{name.upper()}_API_KEY={config[\"api_key\"]}')
+    if 'roles' in config:
+        roles = ' '.join(config['roles']) if isinstance(config['roles'], list) else config['roles']
+        print(f'AGENT_{name.upper()}_ROLES={roles}')
+    if 'forge_user' in config:
+        print(f'AGENT_{name.upper()}_FORGE_USER={config[\"forge_user\"]}')
+    if 'compact_pct' in config:
+        print(f'AGENT_{name.upper()}_COMPACT_PCT={config[\"compact_pct\"]}')
+" "$_PROJECT_TOML" 2>/dev/null) || true
+
+  if [ -n "$_AGENT_VARS" ]; then
+    while IFS='=' read -r _key _val; do
+      [ -z "$_key" ] && continue
+      export "$_key=$_val"
+    done <<< "$_AGENT_VARS"
+  fi
+  unset _AGENT_VARS
+fi
+
 unset _PROJECT_TOML _PROJECT_VARS _key _val
--- a/lib/ops-setup.sh
+++ b/lib/ops-setup.sh
@ -0,0 +1,409 @@
+#!/usr/bin/env bash
+# ops-setup.sh — Setup ops repository (disinto-ops)
+#
+# Source from bin/disinto:
+#   source "$(dirname "$0")/../lib/ops-setup.sh"
+#
+# Required globals: FORGE_URL, FORGE_TOKEN, FACTORY_ROOT
+# Optional: HUMAN_TOKEN (falls back to FORGE_TOKEN for admin operations)
+#
+# Functions:
+#   setup_ops_repo <forge_url> <ops_slug> <ops_root> [primary_branch] [admin_token]
+#     - Create ops repo on Forgejo if it doesn't exist
+#     - Configure bot collaborators with appropriate permissions
+#     - Clone or initialize ops repo locally
+#     - Seed directory structure (vault, knowledge, evidence)
+#     - Export _ACTUAL_OPS_SLUG for caller to use
+#   migrate_ops_repo <ops_root> [primary_branch]
+#     - Seed missing directories/files on existing ops repos (idempotent)
+#     - Creates .gitkeep files and template content for canonical structure
+#
+# Globals modified:
+#   _ACTUAL_OPS_SLUG - resolved ops repo slug after setup_ops_repo completes
+
+set -euo pipefail
+
+setup_ops_repo() {
+
+  local forge_url="$1" ops_slug="$2" ops_root="$3" primary_branch="${4:-main}"
+  local admin_token="${5:-${HUMAN_TOKEN:-${FORGE_TOKEN}}}"
+  local org_name="${ops_slug%%/*}"
+  local ops_name="${ops_slug##*/}"
+
+  echo ""
+  echo "── Ops repo setup ─────────────────────────────────────"
+
+  # Determine the actual ops repo location by searching across possible namespaces
+  # This handles cases where the repo was created under a different namespace
+  # due to past bugs (e.g., dev-bot/disinto-ops instead of disinto-admin/disinto-ops)
+  local actual_ops_slug=""
+  local -a possible_namespaces=( "$org_name" "dev-bot" "disinto-admin" )
+  local http_code
+
+  for ns in "${possible_namespaces[@]}"; do
+    slug="${ns}/${ops_name}"
+    if curl -sf --max-time 5 \
+      -H "Authorization: token ${FORGE_TOKEN}" \
+      "${forge_url}/api/v1/repos/${slug}" >/dev/null 2>&1; then
+      actual_ops_slug="$slug"
+      echo "Ops repo: ${slug} (found at ${slug})"
+      break
+    fi
+  done
+
+  # If not found, try to create it in the configured namespace
+  if [ -z "$actual_ops_slug" ]; then
+    echo "Creating ops repo in namespace: ${org_name}"
+
+    # Determine if target namespace is a user or an org
+    local ns_type=""
+    if curl -sf -H "Authorization: token ${admin_token}" \
+      "${forge_url}/api/v1/users/${org_name}" >/dev/null 2>&1; then
+      # User endpoint exists - check if it's an org
+      if curl -sf -H "Authorization: token ${admin_token}" \
+        "${forge_url}/api/v1/users/${org_name}" | grep -q '"is_org":true'; then
+        ns_type="org"
+      else
+        ns_type="user"
+      fi
+    elif curl -sf -H "Authorization: token ${admin_token}" \
+      "${forge_url}/api/v1/orgs/${org_name}" >/dev/null 2>&1; then
+      # Org endpoint exists
+      ns_type="org"
+    fi
+
+    local create_endpoint="" via_msg=""
+    if [ "$ns_type" = "org" ]; then
+      # Org namespace — use org API
+      create_endpoint="/api/v1/orgs/${org_name}/repos"
+      # Create org if it doesn't exist
+      curl -sf -X POST \
+        -H "Authorization: token ${admin_token}" \
+        -H "Content-Type: application/json" \
+        "${forge_url}/api/v1/orgs" \
+        -d "{\"username\":\"${org_name}\",\"visibility\":\"public\"}" >/dev/null 2>&1 || true
+    else
+      # User namespace — use admin API (requires admin token)
+      create_endpoint="/api/v1/admin/users/${org_name}/repos"
+      via_msg=" (via admin API)"
+    fi
+
+    if curl -sf -X POST \
+      -H "Authorization: token ${admin_token}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}${create_endpoint}" \
+      -d "{\"name\":\"${ops_name}\",\"auto_init\":true,\"default_branch\":\"${primary_branch}\",\"description\":\"Operational data for ${org_name}/${ops_name%-ops}\"}" >/dev/null 2>&1; then
+      actual_ops_slug="${org_name}/${ops_name}"
+      echo "Ops repo: ${actual_ops_slug} created on Forgejo${via_msg}"
+    else
+      http_code=$(curl -s -o /dev/null -w "%{http_code}" \
+        -X POST \
+        -H "Authorization: token ${admin_token}" \
+        -H "Content-Type: application/json" \
+        "${forge_url}${create_endpoint}" \
+        -d "{\"name\":\"${ops_name}\",\"auto_init\":true,\"default_branch\":\"${primary_branch}\",\"description\":\"Operational data for ${org_name}/${ops_name%-ops}\"}" 2>/dev/null || echo "0")
+      if [ "$http_code" = "201" ]; then
+        actual_ops_slug="${org_name}/${ops_name}"
+        echo "Ops repo: ${actual_ops_slug} created on Forgejo${via_msg}"
+      else
+        echo "Error: failed to create ops repo '${org_name}/${ops_name}' (HTTP ${http_code})" >&2
+        return 1
+      fi
+    fi
+  fi
+
+  # Configure collaborators on the ops repo
+  local bot_user bot_perm
+  declare -A bot_permissions=(
+    [dev-bot]="write"
+    [review-bot]="read"
+    [planner-bot]="write"
+    [gardener-bot]="write"
+    [vault-bot]="write"
+    [supervisor-bot]="read"
+    [predictor-bot]="read"
+    [architect-bot]="write"
+  )
+
+  # Add all bot users as collaborators with appropriate permissions
+  # vault branch protection (#77) requires:
+  # - Admin-only merge to main (enforced by admin_enforced: true)
+  # - Bots can push branches and create PRs, but cannot merge
+  for bot_user in "${!bot_permissions[@]}"; do
+    bot_perm="${bot_permissions[$bot_user]}"
+    if curl -sf -X PUT \
+      -H "Authorization: token ${admin_token}" \
+      -H "Content-Type: application/json" \
+      "${forge_url}/api/v1/repos/${actual_ops_slug}/collaborators/${bot_user}" \
+      -d "{\"permission\":\"${bot_perm}\"}" >/dev/null 2>&1; then
+      echo "  + ${bot_user} = ${bot_perm} collaborator"
+    else
+      echo "  ! ${bot_user} = ${bot_perm} (already set or failed)"
+    fi
+  done
+
+  # Add disinto-admin as admin collaborator
+  if curl -sf -X PUT \
+    -H "Authorization: token ${admin_token}" \
+    -H "Content-Type: application/json" \
+    "${forge_url}/api/v1/repos/${actual_ops_slug}/collaborators/disinto-admin" \
+    -d '{"permission":"admin"}' >/dev/null 2>&1; then
+    echo "  + disinto-admin = admin collaborator"
+  else
+    echo "  ! disinto-admin = admin (already set or failed)"
+  fi
+
+  # Clone ops repo locally if not present — use clean URL, credential helper
+  # supplies auth (#604).
+  if [ ! -d "${ops_root}/.git" ]; then
+    local clone_url="${forge_url}/${actual_ops_slug}.git"
+    echo "Cloning: ops repo -> ${ops_root}"
+    if git clone --quiet "$clone_url" "$ops_root" 2>/dev/null; then
+      echo "Ops repo: ${actual_ops_slug} cloned successfully"
+    else
+      echo "Initializing: ops repo at ${ops_root}"
+      mkdir -p "$ops_root"
+      git -C "$ops_root" init --initial-branch="${primary_branch}" -q
+      # Set remote to the actual ops repo location
+      git -C "$ops_root" remote add origin "${forge_url}/${actual_ops_slug}.git"
+      echo "Ops repo: ${actual_ops_slug} initialized locally"
+    fi
+  else
+    echo "Ops repo: ${ops_root} (already exists locally)"
+    # Verify remote is correct
+    local current_remote
+    current_remote=$(git -C "$ops_root" remote get-url origin 2>/dev/null || true)
+    local expected_remote="${forge_url}/${actual_ops_slug}.git"
+    if [ -n "$current_remote" ] && [ "$current_remote" != "$expected_remote" ]; then
+      echo "  Fixing: remote URL from ${current_remote} to ${expected_remote}"
+      git -C "$ops_root" remote set-url origin "$expected_remote"
+    fi
+  fi
+
+  # Seed directory structure
+  local seeded=false
+  mkdir -p "${ops_root}/vault/pending"
+  mkdir -p "${ops_root}/vault/approved"
+  mkdir -p "${ops_root}/vault/fired"
+  mkdir -p "${ops_root}/vault/rejected"
+  mkdir -p "${ops_root}/knowledge"
+  mkdir -p "${ops_root}/evidence/engagement"
+  mkdir -p "${ops_root}/evidence/red-team"
+  mkdir -p "${ops_root}/evidence/holdout"
+  mkdir -p "${ops_root}/evidence/evolution"
+  mkdir -p "${ops_root}/evidence/user-test"
+  mkdir -p "${ops_root}/sprints"
+  [ -f "${ops_root}/sprints/.gitkeep" ] || { touch "${ops_root}/sprints/.gitkeep"; seeded=true; }
+  [ -f "${ops_root}/evidence/red-team/.gitkeep" ] || { touch "${ops_root}/evidence/red-team/.gitkeep"; seeded=true; }
+  [ -f "${ops_root}/evidence/holdout/.gitkeep" ] || { touch "${ops_root}/evidence/holdout/.gitkeep"; seeded=true; }
+  [ -f "${ops_root}/evidence/evolution/.gitkeep" ] || { touch "${ops_root}/evidence/evolution/.gitkeep"; seeded=true; }
+  [ -f "${ops_root}/evidence/user-test/.gitkeep" ] || { touch "${ops_root}/evidence/user-test/.gitkeep"; seeded=true; }
+
+  if [ ! -f "${ops_root}/README.md" ]; then
+    cat > "${ops_root}/README.md" <<OPSEOF
+# ${ops_name}
+
+Operational data for the ${ops_name%-ops} project.
+
+## Structure
+
+\`\`\`
+${ops_name}/
+├── vault/
+│   ├── pending/          # vault items awaiting approval
+│   ├── approved/         # approved vault items
+│   ├── fired/            # executed vault items
+│   └── rejected/         # rejected vault items
+├── sprints/              # sprint specs written by architect agent
+├── knowledge/            # shared agent knowledge and best practices
+├── evidence/             # engagement data, experiment results
+├── portfolio.md          # addressables + observables
+├── prerequisites.md      # dependency graph
+└── RESOURCES.md          # accounts, tokens (refs), infra inventory
+\`\`\`
+
+> **Note:** Journal directories (journal/planner/ and journal/supervisor/) have been removed from the ops repo. Agent journals are now stored in each agent's .profile repo on Forgejo.
+
+## Branch protection
+
+- \`main\`: 2 reviewers required for vault items
+- Journal/evidence commits may use lighter rules
+OPSEOF
+    seeded=true
+  fi
+
+  # Copy vault policy.toml template if not already present
+  if [ ! -f "${ops_root}/vault/policy.toml" ]; then
+    local policy_src="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/vault/policy.toml"
+    if [ -f "$policy_src" ]; then
+      cp "$policy_src" "${ops_root}/vault/policy.toml"
+      echo "  + Copied vault/policy.toml template"
+      seeded=true
+    fi
+  fi
+
+  # Create stub files if they don't exist
+  [ -f "${ops_root}/portfolio.md" ] || { echo "# Portfolio" > "${ops_root}/portfolio.md"; seeded=true; }
+  [ -f "${ops_root}/prerequisites.md" ] || { echo "# Prerequisite Tree" > "${ops_root}/prerequisites.md"; seeded=true; }
+  [ -f "${ops_root}/RESOURCES.md" ] || { echo "# Resources" > "${ops_root}/RESOURCES.md"; seeded=true; }
+
+  # Commit and push seed content
+  if [ "$seeded" = true ] && [ -d "${ops_root}/.git" ]; then
+    # Auto-configure repo-local git identity if missing (#778)
+    if [ -z "$(git -C "$ops_root" config user.name 2>/dev/null)" ]; then
+      git -C "$ops_root" config user.name "disinto-admin"
+    fi
+    if [ -z "$(git -C "$ops_root" config user.email 2>/dev/null)" ]; then
+      git -C "$ops_root" config user.email "disinto-admin@localhost"
+    fi
+
+    git -C "$ops_root" add -A
+    if ! git -C "$ops_root" diff --cached --quiet 2>/dev/null; then
+      git -C "$ops_root" commit -m "chore: seed ops repo structure" -q
+      # Push if remote exists
+      if git -C "$ops_root" remote get-url origin >/dev/null 2>&1; then
+        if git -C "$ops_root" push origin "${primary_branch}" -q 2>/dev/null; then
+          echo "Seeded:  ops repo with initial structure"
+        else
+          echo "Warning: failed to push seed content to ops repo" >&2
+        fi
+      fi
+    fi
+  fi
+
+  # Export resolved slug for the caller to write back to the project TOML
+  _ACTUAL_OPS_SLUG="${actual_ops_slug}"
+}
+
+# migrate_ops_repo — Seed missing ops repo directories and files on existing deployments
+#
+# This function is idempotent — safe to run on every container start.
+# It checks for missing directories/files and creates them with .gitkeep files
+# or template content as appropriate.
+#
+# Called from entrypoint.sh after setup_ops_repo() to bring pre-#407 deployments
+# up to date with the canonical ops repo structure.
+migrate_ops_repo() {
+  local ops_root="${1:-}"
+  local primary_branch="${2:-main}"
+
+  # Validate ops_root argument
+  if [ -z "$ops_root" ]; then
+    # Try to determine ops_root from environment or project config
+    if [ -n "${OPS_REPO_ROOT:-}" ]; then
+      ops_root="${OPS_REPO_ROOT}"
+    elif [ -n "${PROJECT_TOML:-}" ] && [ -f "$PROJECT_TOML" ]; then
+      source "$(dirname "$0")/load-project.sh" "$PROJECT_TOML"
+      ops_root="${OPS_REPO_ROOT:-}"
+    fi
+  fi
+
+  # Skip if we still don't have an ops root
+  if [ -z "$ops_root" ]; then
+    echo "migrate_ops_repo: skipping — no ops repo root determined"
+    return 0
+  fi
+
+  # Verify it's a git repo
+  if [ ! -d "${ops_root}/.git" ]; then
+    echo "migrate_ops_repo: skipping — ${ops_root} is not a git repo"
+    return 0
+  fi
+
+  echo ""
+  echo "── Ops repo migration ───────────────────────────────────"
+  echo "Checking ${ops_root} for missing directories and files..."
+
+  # Change to ops_root directory to ensure all git operations use the correct repo
+  # This prevents "fatal: not in a git directory" errors from stray git commands
+  local orig_dir
+  orig_dir=$(pwd)
+  cd "$ops_root" || {
+    echo "Error: failed to change to ${ops_root}" >&2
+    return 1
+  }
+
+  local migrated=false
+
+  # Canonical ops repo structure (post #407)
+  # Directories to ensure exist with .gitkeep files
+  local -a dir_keepfiles=(
+    "vault/pending/.gitkeep"
+    "vault/approved/.gitkeep"
+    "vault/fired/.gitkeep"
+    "vault/rejected/.gitkeep"
+    "knowledge/.gitkeep"
+    "evidence/engagement/.gitkeep"
+    "evidence/red-team/.gitkeep"
+    "evidence/holdout/.gitkeep"
+    "evidence/evolution/.gitkeep"
+    "evidence/user-test/.gitkeep"
+    "sprints/.gitkeep"
+  )
+
+  # Create missing directories and .gitkeep files
+  for keepfile in "${dir_keepfiles[@]}"; do
+    if [ ! -f "$keepfile" ]; then
+      mkdir -p "$(dirname "$keepfile")"
+      touch "$keepfile"
+      echo "  + Created: ${keepfile}"
+      migrated=true
+    fi
+  done
+
+  # Template files to create if missing (starter content)
+  local -a template_files=(
+    "portfolio.md"
+    "prerequisites.md"
+    "RESOURCES.md"
+  )
+
+  for tfile in "${template_files[@]}"; do
+    if [ ! -f "$tfile" ]; then
+      local title
+      title=$(basename "$tfile" | sed 's/\.md$//; s/_/ /g' | sed 's/\b\(.\)/\u\1/g')
+      {
+        echo "# ${title}"
+        echo ""
+        echo "## Overview"
+        echo ""
+        echo "<!-- Add content here -->"
+      } > "$tfile"
+      echo "  + Created: ${tfile}"
+      migrated=true
+    fi
+  done
+
+  # Commit and push changes if any were made
+  if [ "$migrated" = true ]; then
+    # Auto-configure repo-local git identity if missing
+    if [ -z "$(git config user.name 2>/dev/null)" ]; then
+      git config user.name "disinto-admin"
+    fi
+    if [ -z "$(git config user.email 2>/dev/null)" ]; then
+      git config user.email "disinto-admin@localhost"
+    fi
+
+    git add -A
+    if ! git diff --cached --quiet 2>/dev/null; then
+      if ! git commit -m "chore: migrate ops repo structure to canonical layout" -q; then
+        echo "Error: failed to commit migration changes" >&2
+        cd "$orig_dir"
+        return 1
+      fi
+      # Push if remote exists
+      if git remote get-url origin >/dev/null 2>&1; then
+        if ! git push origin "${primary_branch}" -q 2>/dev/null; then
+          echo "Warning: failed to push migration to ops repo" >&2
+        else
+          echo "Migrated:  ops repo structure updated and pushed"
+        fi
+      fi
+    fi
+  else
+    echo "  (all directories and files already present)"
+  fi
+
+  # Return to original directory
+  cd "$orig_dir"
+}
--- a/lib/pr-lifecycle.sh
+++ b/lib/pr-lifecycle.sh
@ -357,11 +357,18 @@ pr_close() {
  local pr_num="$1"

  _prl_log "closing PR #${pr_num}"
-  curl -sf -X PATCH \
+  local resp http_code
+  resp=$(curl -sf -w "\n%{http_code}" -X PATCH \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
    "${FORGE_API}/pulls/${pr_num}" \
-    -d '{"state":"closed"}' >/dev/null 2>&1 || true
+    -d '{"state":"closed"}' 2>/dev/null) || true
+  http_code=$(printf '%s\n' "$resp" | tail -1)
+  if [ "$http_code" != "200" ] && [ "$http_code" != "204" ]; then
+    _prl_log "pr_close FAILED: HTTP ${http_code} for PR #${pr_num}"
+    return 1
+  fi
+  _prl_log "PR #${pr_num} closed"
 }

 # ---------------------------------------------------------------------------
@ -398,11 +405,18 @@ pr_walk_to_merge() {
      if [ "${_PR_CI_FAILURE_TYPE:-}" = "infra" ] && [ "$ci_retry_count" -lt 1 ]; then
        ci_retry_count=$((ci_retry_count + 1))
        _prl_log "infra failure — retriggering CI (retry ${ci_retry_count})"
+        local rebase_output rebase_rc
        ( cd "$worktree" && \
          git commit --allow-empty -m "ci: retrigger after infra failure" --no-verify && \
          git fetch "$remote" "${PRIMARY_BRANCH}" 2>/dev/null && \
          git rebase "${remote}/${PRIMARY_BRANCH}" && \
-          git push --force-with-lease "$remote" HEAD ) 2>&1 | tail -5 || true
+          git push --force-with-lease "$remote" HEAD ) > /tmp/rebase-output-$$ 2>&1
+        rebase_rc=$?
+        rebase_output=$(cat /tmp/rebase-output-$$)
+        rm -f /tmp/rebase-output-$$
+        if [ "$rebase_rc" -ne 0 ]; then
+          _prl_log "rebase/push failed (exit code $rebase_rc): $(echo "$rebase_output" | tail -5)"
+        fi
        continue
      fi

@ -414,6 +428,23 @@ pr_walk_to_merge() {
      fi

      _prl_log "CI failed — invoking agent (attempt ${ci_fix_count}/${max_ci_fixes})"
+
+      # Get CI logs from SQLite database if available
+      local ci_logs=""
+      if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then
+        ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs=""
+      fi
+
+      local logs_section=""
+      if [ -n "$ci_logs" ]; then
+        logs_section="
+CI Log Output (last 50 lines):
+\`\`\`
+${ci_logs}
+\`\`\`
+"
+      fi
+
      agent_run --resume "$session_id" --worktree "$worktree" \
        "CI failed on PR #${pr_num} (attempt ${ci_fix_count}/${max_ci_fixes}).

@ -421,7 +452,7 @@ Pipeline: #${_PR_CI_PIPELINE:-?}
 Failure type: ${_PR_CI_FAILURE_TYPE:-unknown}

 Error log:
-${_PR_CI_ERROR_LOG:-No logs available.}
+${_PR_CI_ERROR_LOG:-No logs available.}${logs_section}

 Fix the issue, run tests, commit, rebase on ${PRIMARY_BRANCH}, and push:
  git fetch ${remote} ${PRIMARY_BRANCH} && git rebase ${remote}/${PRIMARY_BRANCH}
@ -457,11 +488,7 @@ Fix the issue, run tests, commit, rebase on ${PRIMARY_BRANCH}, and push:
          _PR_WALK_EXIT_REASON="merged"
          return 0
        fi
-        if [ "$rc" -eq 2 ]; then
-          _PR_WALK_EXIT_REASON="merge_blocked"
-          return 1
-        fi
-        # Merge failed (conflict) — ask agent to rebase
+        # Merge failed (conflict or HTTP 405) — ask agent to rebase
        _prl_log "merge failed — invoking agent to rebase"
        agent_run --resume "$session_id" --worktree "$worktree" \
          "PR #${pr_num} approved but merge failed: ${_PR_MERGE_ERROR:-unknown}
@ -507,8 +534,7 @@ Commit, rebase on ${PRIMARY_BRANCH}, and push:
 # build_phase_protocol_prompt — Generate push/commit instructions for Claude.
 #
 # For the synchronous agent_run architecture: tells Claude how to commit and
-# push (no phase files). For the tmux session architecture, use the
-# build_phase_protocol_prompt in dev/phase-handler.sh instead.
+# push (no phase files).
 #
 # Args: branch [remote]
 # Stdout: instruction text
--- a/lib/profile.sh
+++ b/lib/profile.sh
@ -1,210 +0,0 @@
-#!/usr/bin/env bash
-# profile.sh — Helpers for agent .profile repo management
-#
-# Source after lib/env.sh and lib/formula-session.sh:
-#   source "$(dirname "$0")/../lib/env.sh"
-#   source "$(dirname "$0")/lib/formula-session.sh"
-#   source "$(dirname "$0")/lib/profile.sh"
-#
-# Required globals: FORGE_TOKEN, FORGE_URL, AGENT_IDENTITY, PROFILE_REPO_PATH
-#
-# Functions:
-#   profile_propose_formula   NEW_FORMULA CONTENT REASON — create PR to update formula.toml
-
-set -euo pipefail
-
-# Internal log helper
-_profile_log() {
-  if declare -f log >/dev/null 2>&1; then
-    log "profile: $*"
-  else
-    printf '[%s] profile: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >&2
-  fi
-}
-
-# -----------------------------------------------------------------------------
-# profile_propose_formula — Propose a formula change via PR
-#
-# Creates a branch, writes updated formula.toml, opens a PR, and returns PR number.
-# Branch is protected (requires admin approval per #87).
-#
-# Args:
-#   $1 - NEW_FORMULA_CONTENT: The complete new formula.toml content
-#   $2 - REASON: Human-readable explanation of what changed and why
-#
-# Returns:
-#   0 on success, prints PR number to stdout
-#   1 on failure
-#
-# Example:
-#   source "$(dirname "$0")/../lib/env.sh"
-#   source "$(dirname "$0")/lib/formula-session.sh"
-#   source "$(dirname "$0")/lib/profile.sh"
-#   AGENT_IDENTITY="dev-bot"
-#   ensure_profile_repo "$AGENT_IDENTITY"
-#   profile_propose_formula "$new_formula" "Added new prompt pattern for code review"
-# -----------------------------------------------------------------------------
-profile_propose_formula() {
-  local new_formula="$1"
-  local reason="$2"
-
-  if [ -z "${AGENT_IDENTITY:-}" ]; then
-    _profile_log "ERROR: AGENT_IDENTITY not set"
-    return 1
-  fi
-
-  if [ -z "${PROFILE_REPO_PATH:-}" ]; then
-    _profile_log "ERROR: PROFILE_REPO_PATH not set — ensure_profile_repo not called"
-    return 1
-  fi
-
-  if [ -z "${FORGE_TOKEN:-}" ]; then
-    _profile_log "ERROR: FORGE_TOKEN not set"
-    return 1
-  fi
-
-  if [ -z "${FORGE_URL:-}" ]; then
-    _profile_log "ERROR: FORGE_URL not set"
-    return 1
-  fi
-
-  # Generate short description from reason for branch name
-  local short_desc
-  short_desc=$(printf '%s' "$reason" | \
-    tr '[:upper:]' '[:lower:]' | \
-    sed 's/[^a-z0-9 ]//g' | \
-    sed 's/  */ /g' | \
-    sed 's/^ *//;s/ *$//' | \
-    cut -c1-40 | \
-    tr ' ' '-')
-
-  if [ -z "$short_desc" ]; then
-    short_desc="formula-update"
-  fi
-
-  local branch_name="formula/${short_desc}"
-  local formula_path="${PROFILE_REPO_PATH}/formula.toml"
-
-  _profile_log "Proposing formula change: ${branch_name}"
-  _profile_log "Reason: ${reason}"
-
-  # Ensure we're on main branch and up-to-date
-  _profile_log "Fetching .profile repo"
-  (
-    cd "$PROFILE_REPO_PATH" || return 1
-
-    git fetch origin main --quiet 2>/dev/null || \
-    git fetch origin master --quiet 2>/dev/null || true
-
-    # Reset to main/master
-    if git checkout main --quiet 2>/dev/null; then
-      git pull --ff-only origin main --quiet 2>/dev/null || true
-    elif git checkout master --quiet 2>/dev/null; then
-      git pull --ff-only origin master --quiet 2>/dev/null || true
-    else
-      _profile_log "ERROR: Failed to checkout main/master branch"
-      return 1
-    fi
-
-    # Create and checkout new branch
-    git checkout -b "$branch_name" 2>/dev/null || {
-      _profile_log "Branch ${branch_name} may already exist"
-      git checkout "$branch_name" 2>/dev/null || return 1
-    }
-
-    # Write formula.toml
-    printf '%s' "$new_formula" > "$formula_path"
-
-    # Commit the change
-    git config user.name "${AGENT_IDENTITY}" || true
-    git config user.email "${AGENT_IDENTITY}@users.noreply.codeberg.org" || true
-
-    git add "$formula_path"
-    git commit -m "formula: ${reason}" --no-verify || {
-      _profile_log "No changes to commit (formula unchanged)"
-      # Check if branch has any commits
-      if git rev-parse HEAD >/dev/null 2>&1; then
-        : # branch has commits, continue
-      else
-        _profile_log "ERROR: Failed to create commit"
-        return 1
-      fi
-    }
-
-    # Push branch
-    local remote="${FORGE_REMOTE:-origin}"
-    git push --set-upstream "$remote" "$branch_name" --quiet 2>/dev/null || {
-      _profile_log "ERROR: Failed to push branch"
-      return 1
-    }
-
-    _profile_log "Branch pushed: ${branch_name}"
-
-    # Create PR
-    local forge_url="${FORGE_URL%/}"
-    local api_url="${forge_url}/api/v1/repos/${AGENT_IDENTITY}/.profile"
-    local primary_branch="main"
-
-    # Check if main or master is the primary branch
-    if ! curl -sf -o /dev/null -w "%{http_code}" \
-      -H "Authorization: token ${FORGE_TOKEN}" \
-      "${api_url}/git/branches/main" 2>/dev/null | grep -q "200"; then
-      primary_branch="master"
-    fi
-
-    local pr_title="formula: ${reason}"
-    local pr_body="# Formula Update
-
-**Reason:** ${reason}
-
---
-*This PR was auto-generated by ${AGENT_IDENTITY}.*
-"
-
-    local pr_response http_code
-    local pr_json
-    pr_json=$(jq -n \
-      --arg t "$pr_title" \
-      --arg b "$pr_body" \
-      --arg h "$branch_name" \
-      --arg base "$primary_branch" \
-      '{title:$t, body:$b, head:$h, base:$base}') || {
-      _profile_log "ERROR: Failed to build PR JSON"
-      return 1
-    }
-
-    pr_response=$(curl -s -w "\n%{http_code}" -X POST \
-      -H "Authorization: token ${FORGE_TOKEN}" \
-      -H "Content-Type: application/json" \
-      "${api_url}/pulls" \
-      -d "$pr_json" || true)
-
-    http_code=$(printf '%s\n' "$pr_response" | tail -1)
-    pr_response=$(printf '%s\n' "$pr_response" | sed '$d')
-
-    if [ "$http_code" = "201" ] || [ "$http_code" = "200" ]; then
-      local pr_num
-      pr_num=$(printf '%s' "$pr_response" | jq -r '.number')
-      _profile_log "PR created: #${pr_num}"
-      printf '%s' "$pr_num"
-      return 0
-    else
-      # Check if PR already exists (409 conflict)
-      if [ "$http_code" = "409" ]; then
-        local existing_pr
-        existing_pr=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-          "${api_url}/pulls?state=open&head=${AGENT_IDENTITY}:formula/${short_desc}" 2>/dev/null | \
-          jq -r '.[0].number // empty') || true
-        if [ -n "$existing_pr" ]; then
-          _profile_log "PR already exists: #${existing_pr}"
-          printf '%s' "$existing_pr"
-          return 0
-        fi
-      fi
-      _profile_log "ERROR: Failed to create PR (HTTP ${http_code})"
-      return 1
-    fi
-  )
-
-  return $?
-}
--- a/lib/release.sh
+++ b/lib/release.sh
@ -0,0 +1,179 @@
+#!/usr/bin/env bash
+# =============================================================================
+# release.sh — disinto_release() function
+#
+# Handles vault TOML creation, branch setup on ops repo, PR creation,
+# and auto-merge request for a versioned release.
+#
+# Globals expected:
+#   FORGE_URL      - Forge instance URL (e.g. http://localhost:3000)
+#   FORGE_TOKEN    - API token for Forge operations
+#   FORGE_OPS_REPO - Ops repo slug (e.g. disinto-admin/myproject-ops)
+#   FACTORY_ROOT   - Root of the disinto factory
+#   PRIMARY_BRANCH - Primary branch name (e.g. main)
+#
+# Usage:
+#   source "${FACTORY_ROOT}/lib/release.sh"
+#   disinto_release <version>
+# =============================================================================
+set -euo pipefail
+
+# Source vault.sh for _vault_log helper
+source "${FACTORY_ROOT}/lib/vault.sh"
+
+# Assert required globals are set before using this module.
+_assert_release_globals() {
+  local missing=()
+  [ -z "${FORGE_URL:-}" ]      && missing+=("FORGE_URL")
+  [ -z "${FORGE_TOKEN:-}" ]    && missing+=("FORGE_TOKEN")
+  [ -z "${FORGE_OPS_REPO:-}" ] && missing+=("FORGE_OPS_REPO")
+  [ -z "${FACTORY_ROOT:-}" ]   && missing+=("FACTORY_ROOT")
+  [ -z "${PRIMARY_BRANCH:-}" ] && missing+=("PRIMARY_BRANCH")
+  if [ "${#missing[@]}" -gt 0 ]; then
+    echo "Error: release.sh requires these globals to be set: ${missing[*]}" >&2
+    exit 1
+  fi
+}
+
+disinto_release() {
+  _assert_release_globals
+
+  local version="${1:-}"
+  local formula_path="${FACTORY_ROOT}/formulas/release.toml"
+
+  if [ -z "$version" ]; then
+    echo "Error: version required" >&2
+    echo "Usage: disinto release <version>" >&2
+    echo "Example: disinto release v1.2.0" >&2
+    exit 1
+  fi
+
+  # Validate version format (must start with 'v' followed by semver)
+  if ! echo "$version" | grep -qE '^v[0-9]+\.[0-9]+\.[0-9]+$'; then
+    echo "Error: version must be in format v1.2.3 (semver with 'v' prefix)" >&2
+    exit 1
+  fi
+
+  # Load project config to get FORGE_OPS_REPO
+  if [ -z "${PROJECT_NAME:-}" ]; then
+    # PROJECT_NAME is unset - detect project TOML from projects/ directory
+    local found_toml
+    found_toml=$(find "${FACTORY_ROOT}/projects" -maxdepth 1 -name "*.toml" ! -name "*.example" 2>/dev/null | head -1)
+    if [ -n "$found_toml" ]; then
+      source "${FACTORY_ROOT}/lib/load-project.sh" "$found_toml"
+    fi
+  else
+    local project_toml="${FACTORY_ROOT}/projects/${PROJECT_NAME}.toml"
+    if [ -f "$project_toml" ]; then
+      source "${FACTORY_ROOT}/lib/load-project.sh" "$project_toml"
+    fi
+  fi
+
+  # Check formula exists
+  if [ ! -f "$formula_path" ]; then
+    echo "Error: release formula not found at ${formula_path}" >&2
+    exit 1
+  fi
+
+  # Get the ops repo root
+  local ops_root="${FACTORY_ROOT}/../disinto-ops"
+  if [ ! -d "${ops_root}/.git" ]; then
+    echo "Error: ops repo not found at ${ops_root}" >&2
+    echo "  Run 'disinto init' to set up the ops repo first" >&2
+    exit 1
+  fi
+
+  # Generate a unique ID for the vault item
+  local id="release-${version//./}"
+  local vault_toml="${ops_root}/vault/actions/${id}.toml"
+
+  # Create vault TOML with the specific version
+  cat > "$vault_toml" <<EOF
+# vault/actions/${id}.toml
+# Release vault item for ${version}
+# Auto-generated by disinto release
+
+id = "${id}"
+formula = "release"
+context = "Release ${version}"
+secrets = ["GITHUB_TOKEN", "CODEBERG_TOKEN"]
+mounts = ["ssh"]
+EOF
+
+  echo "Created vault item: ${vault_toml}"
+
+  # Create a PR to submit the vault item to the ops repo
+  local branch_name="release/${version//./}"
+  local pr_title="release: ${version}"
+  local pr_body="Release ${version}
+
+This PR creates a vault item for the release of version ${version}.
+
+## Changes
+- Added vault item: ${id}.toml
+
+## Next Steps
+1. Review this PR
+2. Approve and merge
+3. The vault runner will execute the release formula
+"
+
+  # Create branch from clean primary branch
+  (
+    cd "$ops_root"
+    git checkout "$PRIMARY_BRANCH"
+    git pull origin "$PRIMARY_BRANCH"
+    git checkout -B "$branch_name" "$PRIMARY_BRANCH"
+
+    # Add and commit only the vault TOML file
+    git add "vault/actions/${id}.toml"
+    git commit -m "$pr_title" -m "$pr_body" 2>/dev/null || true
+
+    # Push branch
+    git push -u origin "$branch_name" 2>/dev/null || {
+      echo "Error: failed to push branch" >&2
+      exit 1
+    }
+  )
+
+  # Create PR
+  local pr_response
+  pr_response=$(curl -sf -X POST \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    -H "Content-Type: application/json" \
+    "${FORGE_URL}/api/v1/repos/${FORGE_OPS_REPO}/pulls" \
+    -d "{\"title\":\"${pr_title}\",\"head\":\"${branch_name}\",\"base\":\"${PRIMARY_BRANCH}\",\"body\":\"$(echo "$pr_body" | sed ':a;N;$!ba;s/\n/\\n/g')\"}" 2>/dev/null) || {
+    echo "Error: failed to create PR" >&2
+    echo "Response: ${pr_response}" >&2
+    exit 1
+  }
+
+  local pr_number
+  pr_number=$(echo "$pr_response" | jq -r '.number')
+
+  local pr_url="${FORGE_URL}/${FORGE_OPS_REPO}/pulls/${pr_number}"
+
+  # Enable auto-merge on the PR — Forgejo will auto-merge after approval
+  _vault_log "Enabling auto-merge for PR #${pr_number}"
+  curl -sf -X POST \
+    -H "Authorization: token ${FORGE_TOKEN}" \
+    -H "Content-Type: application/json" \
+    "${FORGE_URL}/api/v1/repos/${FORGE_OPS_REPO}/pulls/${pr_number}/merge" \
+    -d '{"Do":"merge","merge_when_checks_succeed":true}' >/dev/null 2>&1 || {
+    echo "Warning: failed to enable auto-merge (may already be enabled or not supported)" >&2
+  }
+
+  echo ""
+  echo "Release PR created: ${pr_url}"
+  echo ""
+  echo "Next steps:"
+  echo "  1. Review the PR"
+  echo "  2. Approve the PR (auto-merge will trigger after approval)"
+  echo "  3. The vault runner will execute the release formula"
+  echo ""
+  echo "After merge, the release will:"
+  echo "  1. Tag Forgejo main with ${version}"
+  echo "  2. Push tag to mirrors (Codeberg, GitHub)"
+  echo "  3. Build and tag the agents Docker image"
+  echo "  4. Restart agent containers"
+}
--- a/lib/stack-lock.sh
+++ b/lib/stack-lock.sh
@ -0,0 +1,197 @@
+#!/usr/bin/env bash
+# stack-lock.sh — File-based lock protocol for singleton project stack access
+#
+# Prevents CI pipelines and the reproduce-agent from stepping on each other
+# when sharing a single project stack (e.g. harb docker compose).
+#
+# Lock file: /home/agent/data/locks/<project>-stack.lock
+# Contents:  {"holder": "reproduce-agent-42", "since": "...", "heartbeat": "..."}
+#
+# Protocol:
+#   1. stack_lock_check   — inspect current lock state
+#   2. stack_lock_acquire — wait until lock is free, then claim it
+#   3. stack_lock_release — delete lock file when done
+#
+# Heartbeat: callers must update the heartbeat every 2 minutes while holding
+# the lock by calling stack_lock_heartbeat. A heartbeat older than 10 minutes
+# is considered stale — the next acquire will break it.
+#
+# Usage:
+#   source "$(dirname "$0")/../lib/stack-lock.sh"
+#   stack_lock_acquire "ci-pipeline-$BUILD_NUMBER" "myproject"
+#   trap 'stack_lock_release "myproject"' EXIT
+#   # ... do work ...
+#   stack_lock_release "myproject"
+
+set -euo pipefail
+
+STACK_LOCK_DIR="${HOME}/data/locks"
+STACK_LOCK_POLL_INTERVAL=30   # seconds between retry polls
+STACK_LOCK_STALE_SECONDS=600  # 10 minutes — heartbeat older than this = stale
+STACK_LOCK_MAX_WAIT=3600      # 1 hour — give up after this many seconds
+
+# _stack_lock_path <project>
+#   Print the path of the lock file for the given project.
+_stack_lock_path() {
+  local project="$1"
+  echo "${STACK_LOCK_DIR}/${project}-stack.lock"
+}
+
+# _stack_lock_now
+#   Print current UTC timestamp in ISO-8601 format.
+_stack_lock_now() {
+  date -u +"%Y-%m-%dT%H:%M:%SZ"
+}
+
+# _stack_lock_epoch <iso_timestamp>
+#   Convert an ISO-8601 UTC timestamp to a Unix epoch integer.
+_stack_lock_epoch() {
+  local ts="$1"
+  # Strip trailing Z, replace T with space for `date -d`
+  date -u -d "${ts%Z}" +%s 2>/dev/null || date -u -j -f "%Y-%m-%dT%H:%M:%S" "${ts%Z}" +%s 2>/dev/null
+}
+
+# stack_lock_check <project>
+#   Print lock status to stdout: "free", "held:<holder>", or "stale:<holder>".
+#   Returns 0 in all cases (status is in stdout).
+stack_lock_check() {
+  local project="$1"
+  local lock_file
+  lock_file="$(_stack_lock_path "$project")"
+
+  if [ ! -f "$lock_file" ]; then
+    echo "free"
+    return 0
+  fi
+
+  local holder heartbeat
+  holder=$(python3 -c 'import sys,json; d=json.load(open(sys.argv[1])); print(d.get("holder","unknown"))' "$lock_file" 2>/dev/null || echo "unknown")
+  heartbeat=$(python3 -c 'import sys,json; d=json.load(open(sys.argv[1])); print(d.get("heartbeat",""))' "$lock_file" 2>/dev/null || echo "")
+
+  if [ -z "$heartbeat" ]; then
+    echo "stale:${holder}"
+    return 0
+  fi
+
+  local hb_epoch now_epoch age
+  hb_epoch=$(_stack_lock_epoch "$heartbeat" 2>/dev/null || echo "0")
+  now_epoch=$(date -u +%s)
+  age=$(( now_epoch - hb_epoch ))
+
+  if [ "$age" -gt "$STACK_LOCK_STALE_SECONDS" ]; then
+    echo "stale:${holder}"
+  else
+    echo "held:${holder}"
+  fi
+}
+
+# stack_lock_acquire <holder_id> <project> [max_wait_seconds]
+#   Acquire the lock for <project> on behalf of <holder_id>.
+#   Polls every STACK_LOCK_POLL_INTERVAL seconds.
+#   Breaks stale locks automatically.
+#   Exits non-zero if the lock cannot be acquired within max_wait_seconds.
+stack_lock_acquire() {
+  local holder="$1"
+  local project="$2"
+  local max_wait="${3:-$STACK_LOCK_MAX_WAIT}"
+  local lock_file
+  lock_file="$(_stack_lock_path "$project")"
+  local deadline
+  deadline=$(( $(date -u +%s) + max_wait ))
+
+  mkdir -p "$STACK_LOCK_DIR"
+
+  while true; do
+    local status
+    status=$(stack_lock_check "$project")
+
+    case "$status" in
+      free)
+        # Write to temp file then rename to avoid partial reads by other processes
+        local tmp_lock
+        tmp_lock=$(mktemp "${STACK_LOCK_DIR}/.lock-tmp-XXXXXX")
+        local now
+        now=$(_stack_lock_now)
+        printf '{"holder": "%s", "since": "%s", "heartbeat": "%s"}\n' \
+          "$holder" "$now" "$now" > "$tmp_lock"
+        mv "$tmp_lock" "$lock_file"
+        echo "[stack-lock] acquired lock for ${project} as ${holder}" >&2
+        return 0
+        ;;
+      stale:*)
+        local stale_holder="${status#stale:}"
+        echo "[stack-lock] breaking stale lock held by ${stale_holder} for ${project}" >&2
+        rm -f "$lock_file"
+        # Loop back immediately to re-check and claim
+        ;;
+      held:*)
+        local cur_holder="${status#held:}"
+        local remaining
+        remaining=$(( deadline - $(date -u +%s) ))
+        if [ "$remaining" -le 0 ]; then
+          echo "[stack-lock] timed out waiting for lock on ${project} (held by ${cur_holder})" >&2
+          return 1
+        fi
+        echo "[stack-lock] ${project} locked by ${cur_holder}, waiting ${STACK_LOCK_POLL_INTERVAL}s (${remaining}s left)..." >&2
+        sleep "$STACK_LOCK_POLL_INTERVAL"
+        ;;
+      *)
+        echo "[stack-lock] unexpected status '${status}' for ${project}" >&2
+        return 1
+        ;;
+    esac
+  done
+}
+
+# stack_lock_heartbeat <holder_id> <project>
+#   Update the heartbeat timestamp in the lock file.
+#   Should be called every 2 minutes while holding the lock.
+#   No-op if the lock file is absent or held by a different holder.
+stack_lock_heartbeat() {
+  local holder="$1"
+  local project="$2"
+  local lock_file
+  lock_file="$(_stack_lock_path "$project")"
+
+  [ -f "$lock_file" ] || return 0
+
+  local current_holder
+  current_holder=$(python3 -c 'import sys,json; d=json.load(open(sys.argv[1])); print(d.get("holder",""))' "$lock_file" 2>/dev/null || echo "")
+  [ "$current_holder" = "$holder" ] || return 0
+
+  local since
+  since=$(python3 -c 'import sys,json; d=json.load(open(sys.argv[1])); print(d.get("since",""))' "$lock_file" 2>/dev/null || echo "")
+  local now
+  now=$(_stack_lock_now)
+
+  local tmp_lock
+  tmp_lock=$(mktemp "${STACK_LOCK_DIR}/.lock-tmp-XXXXXX")
+  printf '{"holder": "%s", "since": "%s", "heartbeat": "%s"}\n' \
+    "$holder" "$since" "$now" > "$tmp_lock"
+  mv "$tmp_lock" "$lock_file"
+}
+
+# stack_lock_release <project> [holder_id]
+#   Release the lock for <project>.
+#   If holder_id is provided, only releases if the lock is held by that holder
+#   (prevents accidentally releasing someone else's lock).
+stack_lock_release() {
+  local project="$1"
+  local holder="${2:-}"
+  local lock_file
+  lock_file="$(_stack_lock_path "$project")"
+
+  [ -f "$lock_file" ] || return 0
+
+  if [ -n "$holder" ]; then
+    local current_holder
+    current_holder=$(python3 -c 'import sys,json; d=json.load(open(sys.argv[1])); print(d.get("holder",""))' "$lock_file" 2>/dev/null || echo "")
+    if [ "$current_holder" != "$holder" ]; then
+      echo "[stack-lock] refusing to release: lock held by '${current_holder}', not '${holder}'" >&2
+      return 1
+    fi
+  fi
+
+  rm -f "$lock_file"
+  echo "[stack-lock] released lock for ${project}" >&2
+}
--- a/lib/vault.sh
+++ b/lib/vault.sh
@ -39,6 +39,60 @@ _vault_ops_api() {
  printf '%s' "${FORGE_URL}/api/v1/repos/${FORGE_OPS_REPO}"
 }

+# -----------------------------------------------------------------------------
+# _vault_commit_direct — Commit low-tier action directly to ops main
+# Args: ops_api tmp_toml_file action_id
+# Uses FORGE_ADMIN_TOKEN to bypass PR workflow
+# -----------------------------------------------------------------------------
+_vault_commit_direct() {
+  local ops_api="$1"
+  local tmp_toml="$2"
+  local action_id="$3"
+  local file_path="vault/actions/${action_id}.toml"
+
+  # Use FORGE_ADMIN_TOKEN for direct commit (vault-bot identity)
+  local admin_token="${FORGE_ADMIN_TOKEN:-${FORGE_TOKEN}}"
+  if [ -z "$admin_token" ]; then
+    echo "ERROR: FORGE_ADMIN_TOKEN is required for low-tier commits" >&2
+    return 1
+  fi
+
+  # Get main branch SHA
+  local main_sha
+  main_sha=$(curl -sf -H "Authorization: token ${admin_token}" \
+    "${ops_api}/git/branches/${PRIMARY_BRANCH:-main}" 2>/dev/null | \
+    jq -r '.commit.id // empty' || true)
+
+  if [ -z "$main_sha" ]; then
+    main_sha=$(curl -sf -H "Authorization: token ${admin_token}" \
+      "${ops_api}/git/refs/heads/${PRIMARY_BRANCH:-main}" 2>/dev/null | \
+      jq -r '.object.sha // empty' || true)
+  fi
+
+  if [ -z "$main_sha" ]; then
+    echo "ERROR: could not get main branch SHA" >&2
+    return 1
+  fi
+
+  _vault_log "Committing ${file_path} directly to ${PRIMARY_BRANCH:-main}"
+
+  # Encode TOML content as base64
+  local encoded_content
+  encoded_content=$(base64 -w 0 < "$tmp_toml")
+
+  # Commit directly to main branch using Forgejo content API
+  if ! curl -sf -X PUT \
+    -H "Authorization: token ${admin_token}" \
+    -H "Content-Type: application/json" \
+    "${ops_api}/contents/${file_path}" \
+    -d "{\"message\":\"vault: add ${action_id} (low-tier)\",\"branch\":\"${PRIMARY_BRANCH:-main}\",\"content\":\"${encoded_content}\",\"committer\":{\"name\":\"vault-bot\",\"email\":\"vault-bot@${FORGE_REPO}\"},\"overwrite\":true}" >/dev/null 2>&1; then
+    echo "ERROR: failed to write ${file_path} to ${PRIMARY_BRANCH:-main}" >&2
+    return 1
+  fi
+
+  _vault_log "Direct commit successful for ${action_id}"
+}
+
 # -----------------------------------------------------------------------------
 # vault_request — Create a vault PR or return existing one
 # Args: action_id toml_content
@ -59,6 +113,9 @@ vault_request() {
    return 1
  fi

+  # Get admin token for API calls (FORGE_ADMIN_TOKEN for low-tier, FORGE_TOKEN otherwise)
+  local admin_token="${FORGE_ADMIN_TOKEN:-${FORGE_TOKEN}}"
+
  # Check if PR already exists for this action
  local existing_pr
  existing_pr=$(pr_find_by_branch "vault/${action_id}" "$(_vault_ops_api)") || true
@ -99,7 +156,34 @@ vault_request() {
    return 1
  fi

-  # Extract values for PR creation
+  # Get ops repo API URL
+  local ops_api
+  ops_api="$(_vault_ops_api)"
+
+  # Classify the action to determine if PR bypass is allowed
+  local classify_script="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/vault/classify.sh"
+  local vault_tier
+  vault_tier=$("$classify_script" "${VAULT_ACTION_FORMULA:-}" "${VAULT_BLAST_RADIUS_OVERRIDE:-}") || {
+    # Classification failed, default to high tier (require PR)
+    vault_tier="high"
+    _vault_log "Warning: classification failed, defaulting to high tier"
+  }
+  export VAULT_TIER="${vault_tier}"
+
+  # For low-tier actions, commit directly to ops main using FORGE_ADMIN_TOKEN
+  if [ "$vault_tier" = "low" ]; then
+    _vault_log "low-tier — committed directly to ops main"
+    # Add dispatch_mode field to indicate direct commit (no PR)
+    local direct_toml
+    direct_toml=$(mktemp /tmp/vault-direct-XXXXXX.toml)
+    trap 'rm -f "$tmp_toml" "$direct_toml"' RETURN
+    # Prepend dispatch_mode = "direct" to the TOML
+    printf 'dispatch_mode = "direct"\n%s\n' "$toml_content" > "$direct_toml"
+    _vault_commit_direct "$ops_api" "$direct_toml" "${action_id}"
+    return 0
+  fi
+
+  # Extract values for PR creation (medium/high tier)
  local pr_title pr_body
  pr_title="vault: ${action_id}"
  pr_body="Vault action: ${action_id}
@ -113,16 +197,12 @@ Secrets: ${VAULT_ACTION_SECRETS:-}
 This vault action has been created by an agent and requires admin approval
 before execution. See the TOML file for details."

-  # Get ops repo API URL
-  local ops_api
-  ops_api="$(_vault_ops_api)"
-
  # Create branch
  local branch="vault/${action_id}"
  local branch_exists

  branch_exists=$(curl -s -o /dev/null -w "%{http_code}" \
-    -H "Authorization: token ${FORGE_TOKEN}" \
+    -H "Authorization: token ${admin_token}" \
    "${ops_api}/git/branches/${branch}" 2>/dev/null || echo "0")

  if [ "$branch_exists" != "200" ]; then
@ -131,13 +211,13 @@ before execution. See the TOML file for details."

    # Get the commit SHA of main branch
    local main_sha
-    main_sha=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+    main_sha=$(curl -sf -H "Authorization: token ${admin_token}" \
      "${ops_api}/git/branches/${PRIMARY_BRANCH:-main}" 2>/dev/null | \
      jq -r '.commit.id // empty' || true)

    if [ -z "$main_sha" ]; then
      # Fallback: get from refs
-      main_sha=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+      main_sha=$(curl -sf -H "Authorization: token ${admin_token}" \
        "${ops_api}/git/refs/heads/${PRIMARY_BRANCH:-main}" 2>/dev/null | \
        jq -r '.object.sha // empty' || true)
    fi
@ -149,7 +229,7 @@ before execution. See the TOML file for details."

    # Create the branch
    if ! curl -sf -X POST \
-      -H "Authorization: token ${FORGE_TOKEN}" \
+      -H "Authorization: token ${admin_token}" \
      -H "Content-Type: application/json" \
      "${ops_api}/git/branches" \
      -d "{\"ref\":\"${branch}\",\"sha\":\"${main_sha}\"}" >/dev/null 2>&1; then
@ -170,7 +250,7 @@ before execution. See the TOML file for details."

  # Upload file using Forgejo content API
  if ! curl -sf -X PUT \
-    -H "Authorization: token ${FORGE_TOKEN}" \
+    -H "Authorization: token ${admin_token}" \
    -H "Content-Type: application/json" \
    "${ops_api}/contents/${file_path}" \
    -d "{\"message\":\"vault: add ${action_id}\",\"branch\":\"${branch}\",\"content\":\"${encoded_content}\",\"committer\":{\"name\":\"vault-bot\",\"email\":\"vault-bot@${FORGE_REPO}\"},\"overwrite\":true}" >/dev/null 2>&1; then
@ -187,23 +267,33 @@ before execution. See the TOML file for details."
    return 1
  }

+  # Enable auto-merge on the PR — Forgejo will auto-merge after approval
+  _vault_log "Enabling auto-merge for PR #${pr_num}"
+  curl -sf -X POST \
+    -H "Authorization: token ${admin_token}" \
+    -H "Content-Type: application/json" \
+    "${ops_api}/pulls/${pr_num}/merge" \
+    -d '{"Do":"merge","merge_when_checks_succeed":true}' >/dev/null 2>&1 || {
+    _vault_log "Warning: failed to enable auto-merge (may already be enabled or not supported)"
+  }
+
  # Add labels to PR (vault, pending-approval)
  _vault_log "PR #${pr_num} created, adding labels"

  # Get label IDs
  local vault_label_id pending_label_id
-  vault_label_id=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  vault_label_id=$(curl -sf -H "Authorization: token ${admin_token}" \
    "${ops_api}/labels" 2>/dev/null | \
    jq -r --arg n "vault" '.[] | select(.name == $n) | .id // empty' || true)

-  pending_label_id=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  pending_label_id=$(curl -sf -H "Authorization: token ${admin_token}" \
    "${ops_api}/labels" 2>/dev/null | \
    jq -r --arg n "pending-approval" '.[] | select(.name == $n) | .id // empty' || true)

  # Add labels if they exist
  if [ -n "$vault_label_id" ]; then
    curl -sf -X POST \
-      -H "Authorization: token ${FORGE_TOKEN}" \
+      -H "Authorization: token ${admin_token}" \
      -H "Content-Type: application/json" \
      "${ops_api}/issues/${pr_num}/labels" \
      -d "[{\"id\":${vault_label_id}}]" >/dev/null 2>&1 || true
@ -211,7 +301,7 @@ before execution. See the TOML file for details."

  if [ -n "$pending_label_id" ]; then
    curl -sf -X POST \
-      -H "Authorization: token ${FORGE_TOKEN}" \
+      -H "Authorization: token ${admin_token}" \
      -H "Content-Type: application/json" \
      "${ops_api}/issues/${pr_num}/labels" \
      -d "[{\"id\":${pending_label_id}}]" >/dev/null 2>&1 || true
--- a/planner/AGENTS.md
+++ b/planner/AGENTS.md
@ -1,8 +1,8 @@
-<!-- last-reviewed: f32707ba659de278a3af434e3549fb8a8dce9d3a -->
+<!-- last-reviewed: 4fcbca1bef23734d05a9fc97bb56cd0a6bbcd25f -->
 # Planner Agent

 **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints),
-executed directly from cron via tmux + Claude.
+invoked by the polling loop in `docker/agents/entrypoint.sh` every 12 hours (iteration math at line 210-222) via tmux + Claude.
 Phase 0 (preflight): pull latest code, load persistent memory and prerequisite
 tree from `$OPS_REPO_ROOT/knowledge/planner-memory.md` and `$OPS_REPO_ROOT/prerequisites.md`. Also reads
 all available formulas: factory formulas (`$FACTORY_ROOT/formulas/*.toml`) and
@ -41,16 +41,16 @@ AGENTS.md maintenance is handled by the Gardener.
 prerequisite tree, memory, vault state) live under `$OPS_REPO_ROOT/`.
 Each project manages its own planner state in a separate ops repo.

-**Trigger**: `planner-run.sh` runs daily via cron (accepts an optional project
-TOML argument, defaults to `projects/disinto.toml`). Sources `lib/guard.sh` and
-calls `check_active planner` first — skips if `$FACTORY_ROOT/state/.planner-active`
-is absent. Then creates a tmux session with `claude --model opus`, injects
-`formulas/run-planner.toml` as context, monitors the phase file, and cleans up
-on completion or timeout. No action issues — the planner is a nervous system
-component, not work.
+**Trigger**: `planner-run.sh` is invoked by the polling loop in `docker/agents/entrypoint.sh`
+every 12 hours (iteration math at line 210-222). Accepts an optional project TOML argument,
+defaults to `projects/disinto.toml`. Sources `lib/guard.sh` and calls `check_active planner`
+first — skips if `$FACTORY_ROOT/state/.planner-active` is absent. Then creates a tmux session
+with `claude --model opus`, injects `formulas/run-planner.toml` as context, monitors the
+phase file, and cleans up on completion or timeout. No action issues — the planner is a
+nervous system component, not work.

 **Key files**:
- `planner/planner-run.sh` — Cron wrapper + orchestrator: lock, memory guard,
+- `planner/planner-run.sh` — Polling loop participant + orchestrator: lock, memory guard,
  sources disinto project config, builds structural analysis via `lib/formula-session.sh:build_graph_section()`,
  creates tmux session, injects formula prompt, monitors phase file, handles crash recovery, cleans up
 - `formulas/run-planner.toml` — Execution spec: six steps (preflight,
@ -65,7 +65,7 @@ component, not work.
  tree, humans steer by editing VISION.md. Tree grows organically as the
  planner discovers new prerequisites during runs
 - `$OPS_REPO_ROOT/knowledge/planner-memory.md` — Persistent memory across runs (in ops repo)
- `$OPS_REPO_ROOT/journal/planner/*.md` — Daily raw logs from each planner run (in ops repo)
+

 **Constraint focus**: The planner uses Theory of Constraints to avoid premature
 issue filing. Only the top 3 unresolved prerequisites that block the most
--- a/planner/planner-run.sh
+++ b/planner/planner-run.sh
@ -1,12 +1,12 @@
 #!/usr/bin/env bash
 # =============================================================================
-# planner-run.sh — Cron wrapper: planner execution via SDK + formula
+# planner-run.sh — Polling-loop wrapper: planner execution via SDK + formula
 #
 # Synchronous bash loop using claude -p (one-shot invocation).
 # No tmux sessions, no phase files — the bash script IS the state machine.
 #
 # Flow:
-#   1. Guards: cron lock, memory check
+#   1. Guards: run lock, memory check
 #   2. Load formula (formulas/run-planner.toml)
 #   3. Context: VISION.md, AGENTS.md, ops:RESOURCES.md, structural graph,
 #      planner memory, journal entries
@ -35,7 +35,7 @@ source "$FACTORY_ROOT/lib/guard.sh"
 # shellcheck source=../lib/agent-sdk.sh
 source "$FACTORY_ROOT/lib/agent-sdk.sh"

-LOG_FILE="$SCRIPT_DIR/planner.log"
+LOG_FILE="${DISINTO_LOG_DIR}/planner/planner.log"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh
 LOGFILE="$LOG_FILE"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh
@ -43,21 +43,60 @@ SID_FILE="/tmp/planner-session-${PROJECT_NAME}.sid"
 SCRATCH_FILE="/tmp/planner-${PROJECT_NAME}-scratch.md"
 WORKTREE="/tmp/${PROJECT_NAME}-planner-run"

-log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; }
+# Override LOG_AGENT for consistent agent identification
+# shellcheck disable=SC2034  # consumed by agent-sdk.sh and env.sh log()
+LOG_AGENT="planner"
+
+# Override log() to append to planner-specific log file
+# shellcheck disable=SC2034
+log() {
+  local agent="${LOG_AGENT:-planner}"
+  printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" >> "$LOG_FILE"
+}

 # ── Guards ────────────────────────────────────────────────────────────────
 check_active planner
-acquire_cron_lock "/tmp/planner-run.lock"
-check_memory 2000
+acquire_run_lock "/tmp/planner-run.lock"
+memory_guard 2000

 log "--- Planner run start ---"

-# ── Resolve agent identity for .profile repo ────────────────────────────
-if [ -z "${AGENT_IDENTITY:-}" ] && [ -n "${FORGE_PLANNER_TOKEN:-}" ]; then
-  AGENT_IDENTITY=$(curl -sf -H "Authorization: token ${FORGE_PLANNER_TOKEN}" \
-    "${FORGE_URL:-http://localhost:3000}/api/v1/user" 2>/dev/null | jq -r '.login // empty' 2>/dev/null || true)
+# ── Precondition checks: skip if nothing to plan ──────────────────────────
+LAST_SHA_FILE="$FACTORY_ROOT/state/planner-last-sha"
+LAST_OPS_SHA_FILE="$FACTORY_ROOT/state/planner-last-ops-sha"
+
+CURRENT_SHA=$(git -C "$FACTORY_ROOT" rev-parse HEAD 2>/dev/null || echo "")
+LAST_SHA=$(cat "$LAST_SHA_FILE" 2>/dev/null || echo "")
+
+# ops repo is required for planner — pull before checking sha
+ensure_ops_repo
+CURRENT_OPS_SHA=$(git -C "$OPS_REPO_ROOT" rev-parse HEAD 2>/dev/null || echo "")
+LAST_OPS_SHA=$(cat "$LAST_OPS_SHA_FILE" 2>/dev/null || echo "")
+
+unreviewed_count=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  "${FORGE_API}/issues?labels=prediction/unreviewed&state=open&limit=1" 2>/dev/null | jq length) || unreviewed_count=0
+vision_open=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+  "${FORGE_API}/issues?labels=vision&state=open&limit=1" 2>/dev/null | jq length) || vision_open=0
+
+if [ "$CURRENT_SHA" = "$LAST_SHA" ] \
+   && [ "$CURRENT_OPS_SHA" = "$LAST_OPS_SHA" ] \
+   && [ "${unreviewed_count:-0}" -eq 0 ] \
+   && [ "${vision_open:-0}" -eq 0 ]; then
+  log "no new commits, no ops changes, no unreviewed predictions, no open vision — skipping"
+  exit 0
 fi

+log "sha=${CURRENT_SHA:0:8} ops=${CURRENT_OPS_SHA:0:8} unreviewed=${unreviewed_count} vision=${vision_open}"
+
+# ── Resolve forge remote for git operations ─────────────────────────────
+# Run git operations from the project checkout, not the baked code dir
+cd "$PROJECT_REPO_ROOT"
+
+resolve_forge_remote
+
+# ── Resolve agent identity for .profile repo ────────────────────────────
+resolve_agent_identity || true
+
 # ── Load formula + context ───────────────────────────────────────────────
 load_formula_or_profile "planner" "$FACTORY_ROOT/formulas/run-planner.toml" || exit 1
 build_context_block VISION.md AGENTS.md ops:RESOURCES.md ops:prerequisites.md
@ -65,9 +104,6 @@ build_context_block VISION.md AGENTS.md ops:RESOURCES.md ops:prerequisites.md
 # ── Build structural analysis graph ──────────────────────────────────────
 build_graph_section

-# ── Ensure ops repo is available ───────────────────────────────────────
-ensure_ops_repo
-
 # ── Read planner memory ─────────────────────────────────────────────────
 MEMORY_BLOCK=""
 MEMORY_FILE="$OPS_REPO_ROOT/knowledge/planner-memory.md"
@ -115,6 +151,11 @@ export CLAUDE_MODEL="opus"
 agent_run --worktree "$WORKTREE" "$PROMPT"
 log "agent_run complete"

+# Persist watermarks so next run can skip if nothing changed
+mkdir -p "$FACTORY_ROOT/state"
+echo "$CURRENT_SHA" > "$LAST_SHA_FILE"
+echo "$CURRENT_OPS_SHA" > "$LAST_OPS_SHA_FILE"
+
 # Write journal entry post-session
 profile_write_journal "planner-run" "Planner run $(date -u +%Y-%m-%d)" "complete" "" || true

--- a/predictor/AGENTS.md
+++ b/predictor/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: f32707ba659de278a3af434e3549fb8a8dce9d3a -->
+<!-- last-reviewed: 4fcbca1bef23734d05a9fc97bb56cd0a6bbcd25f -->
 # Predictor Agent

 **Role**: Abstract adversary (the "goblin"). Runs a 2-step formula
@ -22,14 +22,15 @@ exploit counts as 2 (prediction + action dispatch). The predictor MUST NOT
 emit feature work — only observations challenging claims, exposing gaps,
 and surfacing risks.

-**Trigger**: `predictor-run.sh` runs daily at 06:00 UTC via cron (1h before
-the planner at 07:00). Sources `lib/guard.sh` and calls `check_active predictor`
-first — skips if `$FACTORY_ROOT/state/.predictor-active` is absent. Also guarded
-by PID lock (`/tmp/predictor-run.lock`) and memory check (skips if available
-RAM < 2000 MB).
+**Trigger**: `predictor-run.sh` is invoked by the polling loop in `docker/agents/entrypoint.sh`
+every 24 hours (iteration math at line 224-236). Sources `lib/guard.sh` and calls
+`check_active predictor` first — skips if `$FACTORY_ROOT/state/.predictor-active` is absent.
+Also guarded by PID lock (`/tmp/predictor-run.lock`) and memory check (skips if available
+RAM < 2000 MB). Note: the 24h cadence is iteration-based, not anchored to 06:00 UTC —
+drifts on container restart.

 **Key files**:
- `predictor/predictor-run.sh` — Cron wrapper + orchestrator: active-state guard,
+- `predictor/predictor-run.sh` — Polling loop participant + orchestrator: active-state guard,
  lock, memory guard, sources disinto project config, builds structural analysis
  via `lib/formula-session.sh:build_graph_section()` (full-project scan — results
  included in prompt as `## Structural analysis`; failures non-fatal), builds
@ -44,7 +45,7 @@ RAM < 2000 MB).
 - `FORGE_TOKEN`, `FORGE_PREDICTOR_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`, `OPS_REPO_ROOT`
 - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by predictor-run.sh)

-**Lifecycle**: predictor-run.sh (daily 06:00 cron) → lock + memory guard →
+**Lifecycle**: predictor-run.sh (invoked by polling loop every 24h) → lock + memory guard →
 load formula + context (AGENTS.md, VISION.md from code repo; RESOURCES.md, prerequisites.md from ops repo)
 → create tmux session → Claude fetches prediction history (open + closed) →
 reviews track record (actioned/dismissed/watching) → finds weaknesses
--- a/predictor/predictor-run.sh
+++ b/predictor/predictor-run.sh
@ -1,12 +1,12 @@
 #!/usr/bin/env bash
 # =============================================================================
-# predictor-run.sh — Cron wrapper: predictor execution via SDK + formula
+# predictor-run.sh — Polling-loop wrapper: predictor execution via SDK + formula
 #
 # Synchronous bash loop using claude -p (one-shot invocation).
 # No tmux sessions, no phase files — the bash script IS the state machine.
 #
 # Flow:
-#   1. Guards: cron lock, memory check
+#   1. Guards: run lock, memory check
 #   2. Load formula (formulas/run-predictor.toml)
 #   3. Context: AGENTS.md, ops:RESOURCES.md, VISION.md, structural graph
 #   4. agent_run(worktree, prompt) → Claude analyzes, writes to ops repo
@ -14,7 +14,7 @@
 # Usage:
 #   predictor-run.sh [projects/disinto.toml]   # project config (default: disinto)
 #
-# Cron: 0 6 * * * cd /path/to/dark-factory && bash predictor/predictor-run.sh
+# Called by: entrypoint.sh polling loop (daily)
 # =============================================================================
 set -euo pipefail

@ -36,7 +36,7 @@ source "$FACTORY_ROOT/lib/guard.sh"
 # shellcheck source=../lib/agent-sdk.sh
 source "$FACTORY_ROOT/lib/agent-sdk.sh"

-LOG_FILE="$SCRIPT_DIR/predictor.log"
+LOG_FILE="${DISINTO_LOG_DIR}/predictor/predictor.log"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh
 LOGFILE="$LOG_FILE"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh
@ -44,20 +44,32 @@ SID_FILE="/tmp/predictor-session-${PROJECT_NAME}.sid"
 SCRATCH_FILE="/tmp/predictor-${PROJECT_NAME}-scratch.md"
 WORKTREE="/tmp/${PROJECT_NAME}-predictor-run"

-log() { echo "[$(date -u +%Y-%m-%dT%H:%M:%S)Z] $*" >> "$LOG_FILE"; }
+# Override LOG_AGENT for consistent agent identification
+# shellcheck disable=SC2034  # consumed by agent-sdk.sh and env.sh log()
+LOG_AGENT="predictor"
+
+# Override log() to append to predictor-specific log file
+# shellcheck disable=SC2034
+log() {
+  local agent="${LOG_AGENT:-predictor}"
+  printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" >> "$LOG_FILE"
+}

 # ── Guards ────────────────────────────────────────────────────────────────
 check_active predictor
-acquire_cron_lock "/tmp/predictor-run.lock"
-check_memory 2000
+acquire_run_lock "/tmp/predictor-run.lock"
+memory_guard 2000

 log "--- Predictor run start ---"

+# ── Resolve forge remote for git operations ─────────────────────────────
+# Run git operations from the project checkout, not the baked code dir
+cd "$PROJECT_REPO_ROOT"
+
+resolve_forge_remote
+
 # ── Resolve agent identity for .profile repo ────────────────────────────
-if [ -z "${AGENT_IDENTITY:-}" ] && [ -n "${FORGE_PREDICTOR_TOKEN:-}" ]; then
-  AGENT_IDENTITY=$(curl -sf -H "Authorization: token ${FORGE_PREDICTOR_TOKEN}" \
-    "${FORGE_URL:-http://localhost:3000}/api/v1/user" 2>/dev/null | jq -r '.login // empty' 2>/dev/null || true)
-fi
+resolve_agent_identity || true

 # ── Load formula + context ───────────────────────────────────────────────
 load_formula_or_profile "predictor" "$FACTORY_ROOT/formulas/run-predictor.toml" || exit 1
--- a/projects/disinto.toml.example
+++ b/projects/disinto.toml.example
@ -5,7 +5,7 @@

 name            = "disinto"
 repo            = "johba/disinto"
-ops_repo        = "johba/disinto-ops"
+ops_repo        = "disinto-admin/disinto-ops"
 forge_url       = "http://localhost:3000"
 repo_root       = "/home/YOU/dark-factory"
 ops_repo_root   = "/home/YOU/disinto-ops"
@ -23,6 +23,42 @@ check_prs            = true
 check_dev_agent      = true
 check_pipeline_stall = false

+# Agent scheduling configuration
+#
+# These values are passed to the agents container as environment variables.
+# The default values (6 hours each) work well for stable production projects.
+# For active development on the disinto factory itself, you may want to
+# configure shorter intervals:
+#
+#   GARDENER_INTERVAL=3600    # 1 hour (default: 21600 = 6 hours)
+#   ARCHITECT_INTERVAL=540    # 9 minutes (default: 21600 = 6 hours)
+#   PLANNER_INTERVAL=660      # 11 minutes (default: 43200 = 12 hours)
+#
+# These can be set in docker-compose.yml environment section or in a .env file.
+#
+# [agents.schedule]
+#   gardener_interval = 21600  # seconds (default: 21600 = 6 hours)
+#   architect_interval  = 21600  # seconds (default: 21600 = 6 hours)
+#   planner_interval    = 43200  # seconds (default: 43200 = 12 hours)
+
+# Local-model agents (optional) — configure to use llama-server or similar
+# for local LLM inference. Each agent gets its own container with isolated
+# credentials and configuration.
+#
+# When enabled, `disinto init` automatically:
+#   1. Creates a Forgejo bot user matching agents.llama.forge_user
+#   2. Generates FORGE_TOKEN_<BOT> and FORGE_PASS_<BOT> (stored in .env.enc)
+#   3. Adds the bot user as a write collaborator on the project repo
+#
+# [agents.llama]
+#   base_url = "http://10.10.10.1:8081"
+#   model = "unsloth/Qwen3.5-35B-A3B"
+#   api_key = "sk-no-key-required"
+#   roles = ["dev"]
+#   forge_user = "dev-qwen"
+#   compact_pct = 60
+#   poll_interval = 60
+
 # [mirrors]
 # github   = "git@github.com:johba/disinto.git"
 # codeberg = "git@codeberg.org:johba/disinto.git"
--- a/review/AGENTS.md
+++ b/review/AGENTS.md
@ -1,16 +1,29 @@
-<!-- last-reviewed: f32707ba659de278a3af434e3549fb8a8dce9d3a -->
+<!-- last-reviewed: 4fcbca1bef23734d05a9fc97bb56cd0a6bbcd25f -->
 # Review Agent

 **Role**: AI-powered PR review — post structured findings and formal
 approve/request-changes verdicts to forge.

-**Trigger**: `review-poll.sh` runs every 10 min via cron. It scans open PRs
-whose CI has passed and that lack a review for the current HEAD SHA, then
-spawns `review-pr.sh <pr-number>`.
+**Trigger**: `review-poll.sh` is invoked by the polling loop in `docker/agents/entrypoint.sh`
+every 5 minutes (iteration math at line 163-167). It scans open PRs whose CI has passed and
+that lack a review for the current HEAD SHA, then spawns `review-pr.sh <pr-number>`.

 **Key files**:
- `review/review-poll.sh` — Cron scheduler: finds unreviewed PRs with passing CI. Sources `lib/guard.sh` and calls `check_active reviewer` — skips if `$FACTORY_ROOT/state/.reviewer-active` is absent.
- `review/review-pr.sh` — Creates/reuses a tmux session (`review-{project}-{pr}`), injects PR diff, waits for Claude to write structured JSON output, posts markdown review + formal forge review, auto-creates follow-up issues for pre-existing tech debt. Before starting the session, runs `lib/build-graph.py --changed-files <PR files>` and appends the JSON structural analysis (affected objectives, orphaned prerequisites, thin evidence) to the review prompt. Graph failures are non-fatal — review proceeds without it.
+- `review/review-poll.sh` — Polling loop participant: finds unreviewed PRs with passing CI.
+Invoked by `docker/agents/entrypoint.sh` every 5 minutes. Sources `lib/guard.sh` and calls
+`check_active reviewer` — skips if `$FACTORY_ROOT/state/.reviewer-active` is absent.
+**Circuit breaker**: counts existing `<!-- review-error: <sha> -->` comments; skips a PR
+if ≥3 consecutive errors for the same HEAD SHA (prevents flooding on repeated review failures).
+- `review/review-pr.sh` — Polling loop participant: Creates/reuses a tmux session
+(`review-{project}-{pr}`), injects PR diff, waits for Claude to write structured JSON output,
+posts markdown review + formal forge review, auto-creates follow-up issues for pre-existing
+tech debt. **cd at startup**: changes to `$PROJECT_REPO_ROOT` early in the script — before
+any git commands — because the factory root is not a git repo after image rebuild (#408).
+Calls `resolve_forge_remote()` at startup to determine the correct git remote name (avoids
+hardcoded 'origin'). Before starting the session, runs `lib/build-graph.py --changed-files
+<PR files>` and appends the JSON structural analysis (affected objectives, orphaned
+prerequisites, thin evidence) to the review prompt. Graph failures are non-fatal — review
+proceeds without it.

 **Environment variables consumed**:
 - `FORGE_TOKEN` — Dev-agent token (must not be the same account as FORGE_REVIEW_TOKEN)
--- a/review/review-poll.sh
+++ b/review/review-poll.sh
@ -23,8 +23,15 @@ LOGFILE="${DISINTO_LOG_DIR}/review/review-poll.log"
 MAX_REVIEWS=3
 REVIEW_IDLE_TIMEOUT=14400  # 4h: kill review session if idle

+# Override LOG_AGENT for consistent agent identification
+# shellcheck disable=SC2034  # consumed by agent-sdk.sh and env.sh log()
+LOG_AGENT="review"
+
+# Override log() to append to review-specific log file
+# shellcheck disable=SC2034
 log() {
-  printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" >> "$LOGFILE"
+  local agent="${LOG_AGENT:-review}"
+  printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$agent" "$*" >> "$LOGFILE"
 }

 # Log rotation
@ -126,10 +133,11 @@ if [ -n "$REVIEW_SIDS" ]; then

    log "  #${pr_num} re-review: new commits (${reviewed_sha:0:7}→${current_sha:0:7})"

-    if "${SCRIPT_DIR}/review-pr.sh" "$pr_num" 2>&1; then
+    review_output=$("${SCRIPT_DIR}/review-pr.sh" "$pr_num" 2>&1) && review_rc=0 || review_rc=$?
+    if [ "$review_rc" -eq 0 ]; then
      REVIEWED=$((REVIEWED + 1))
    else
-      log "  #${pr_num} re-review failed"
+      log "  #${pr_num} re-review failed (exit code $review_rc): $(echo "$review_output" | tail -3)"
    fi

    [ "$REVIEWED" -lt "$MAX_REVIEWS" ] || break
@ -166,10 +174,25 @@ while IFS= read -r line; do

  log "  #${PR_NUM} needs review (CI=success, SHA=${PR_SHA:0:7})"

-  if "${SCRIPT_DIR}/review-pr.sh" "$PR_NUM" 2>&1; then
+  # Circuit breaker: count existing review-error comments for this SHA
+  ERROR_COMMENTS=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
+    "${API_BASE}/issues/${PR_NUM}/comments" | \
+    jq --arg sha "$PR_SHA" \
+    '[.[] | select(.body | contains("<!-- review-error: " + $sha + " -->"))] | length')
+
+  if [ "${ERROR_COMMENTS:-0}" -ge 3 ]; then
+    log "  #${PR_NUM} blocked: ${ERROR_COMMENTS} consecutive error comments for ${PR_SHA:0:7}, skipping"
+    SKIPPED=$((SKIPPED + 1))
+    continue
+  fi
+
+  log "  #${PR_NUM} error check: ${ERROR_COMMENTS:-0} prior error(s) for ${PR_SHA:0:7}"
+
+  review_output=$("${SCRIPT_DIR}/review-pr.sh" "$PR_NUM" 2>&1) && review_rc=0 || review_rc=$?
+  if [ "$review_rc" -eq 0 ]; then
    REVIEWED=$((REVIEWED + 1))
  else
-    log "  #${PR_NUM} review failed"
+    log "  #${PR_NUM} review failed (exit code $review_rc): $(echo "$review_output" | tail -3)"
  fi

  if [ "$REVIEWED" -ge "$MAX_REVIEWS" ]; then
--- a/Show more
+++ b/Show more