Merge pull request 'fix: [nomad-step-0] S0.3 — install vault + systemd auto-unseal + vault-init.sh (dev-persisted seal) (#823 )' (#828 ) from fix/issue-823 into main

fix: [nomad-step-0] S0.3 — install vault + systemd auto-unseal + vault-init.sh (dev-persisted seal) (#823 )
Adds the Vault half of the factory-dev-box bringup, landed but not started (per the install-but-don't-start pattern used for nomad in #822): - lib/init/nomad/install.sh — now also installs vault from the shared HashiCorp apt repo. VAULT_VERSION pinned (1.18.5). Fast-path skips apt entirely when both binaries are at their pins; partial upgrades only touch the package that drifted. - nomad/vault.hcl — single-node config: file storage backend at /var/lib/vault/data, localhost listener on :8200, ui on, mlock kept on. No TLS / HA / audit yet; those land in later steps. - lib/init/nomad/systemd-vault.sh — writes /etc/systemd/system/vault.service (Type=notify, ExecStartPost auto-unseals from /etc/vault.d/unseal.key, CAP_IPC_LOCK granted for mlock), deploys nomad/vault.hcl to /etc/vault.d/, creates /var/lib/vault/data (0700 root), enables the unit without starting it. Idempotent via content-compare. - lib/init/nomad/vault-init.sh — first-run init: spawns a temporary `vault server` if not already reachable, runs operator-init with key-shares=1/threshold=1, persists unseal.key + root.token (0400 root), unseals once in-process, shuts down the temp server. Re-run detects initialized + unseal.key present → no-op. Initialized but key missing is a hard failure (can't recover). lib/hvault.sh already defaults VAULT_TOKEN to /etc/vault.d/root.token when the env var is absent, so no change needed there. Seal model: the single unseal key lives on disk; seal-key theft equals vault theft. Factory-dev-box-acceptable tradeoff — avoids running a second Vault to auto-unseal the first. Blocks S0.4 (#824). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 07:04:57 +00:00 · 2026-04-16 06:53:27 +00:00 · 2026-04-16 06:15:32 +00:00 · 2026-04-16 06:04:02 +00:00 · 2026-04-16 05:54:22 +00:00 · 2026-04-16 05:43:35 +00:00
110 changed files with 10575 additions and 1258 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,8 +1,7 @@
-# Secrets — prevent .env files from being baked into the image
+# Secrets — prevent .env files and encrypted secrets from being baked into the image
 .env
 .env.enc
-.env.vault
+secrets/
 .env.vault.enc
 # Version control — .git is huge and not needed in image
 .git
--- a/.env.example
+++ b/.env.example
@ -45,7 +45,9 @@ FORGE_PREDICTOR_TOKEN=                     # [SECRET] predictor-bot API token
 FORGE_PREDICTOR_PASS=                      # [SECRET] predictor-bot password for git HTTP push
 FORGE_ARCHITECT_TOKEN=                     # [SECRET] architect-bot API token
 FORGE_ARCHITECT_PASS=                      # [SECRET] architect-bot password for git HTTP push
-FORGE_BOT_USERNAMES=dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,supervisor-bot,predictor-bot,architect-bot
+FORGE_FILER_TOKEN=                         # [SECRET] filer-bot API token (issues:write on project repo only)
 FORGE_FILER_PASS=                          # [SECRET] filer-bot password for git HTTP push
 FORGE_BOT_USERNAMES=dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,supervisor-bot,predictor-bot,architect-bot,filer-bot
 # ── Backwards compatibility ───────────────────────────────────────────────
 # If CODEBERG_TOKEN is set but FORGE_TOKEN is not, env.sh falls back to
@ -61,6 +63,10 @@ FORGE_BOT_USERNAMES=dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,superv
 WOODPECKER_TOKEN=                          # [SECRET] Woodpecker API token
 WOODPECKER_SERVER=http://localhost:8000     # [CONFIG] Woodpecker server URL
 WOODPECKER_AGENT_SECRET=                   # [SECRET] shared secret for server↔agent auth (auto-generated)
 # Woodpecker privileged-plugin allowlist — comma-separated image names
 # Add plugins/docker (and others) here to allow privileged execution
 WOODPECKER_PLUGINS_PRIVILEGED=plugins/docker
 # WOODPECKER_REPO_ID — now per-project, set in projects/*.toml [ci] section
 # Woodpecker Postgres (for direct DB queries)
@ -69,26 +75,59 @@ WOODPECKER_DB_USER=woodpecker              # [CONFIG] Postgres user
 WOODPECKER_DB_HOST=127.0.0.1              # [CONFIG] Postgres host
 WOODPECKER_DB_NAME=woodpecker              # [CONFIG] Postgres database name
 # ── Chat OAuth (#708) ────────────────────────────────────────────────────
 CHAT_OAUTH_CLIENT_ID=                     # [SECRET] Chat OAuth2 client ID (auto-generated by init)
 CHAT_OAUTH_CLIENT_SECRET=                 # [SECRET] Chat OAuth2 client secret (auto-generated by init)
 DISINTO_CHAT_ALLOWED_USERS=               # [CONFIG] CSV of allowed usernames (disinto-admin always allowed)
 FORWARD_AUTH_SECRET=                      # [SECRET] Shared secret for Caddy ↔ chat forward_auth (#709)
 # ── Vault-only secrets (DO NOT put these in .env) ────────────────────────
 # These tokens grant access to external systems (GitHub, ClawHub, deploy targets).
-# They live ONLY in .env.vault.enc and are injected into the ephemeral runner
+# They live ONLY in secrets/<NAME>.enc (age-encrypted, one file per key) and are
-# container at fire time (#745). lib/env.sh explicitly unsets them so agents
+# decrypted into the ephemeral runner container at fire time (#745, #777).
-# can never hold them directly — all external actions go through vault dispatch.
+# lib/env.sh explicitly unsets them so agents can never hold them directly —
 # all external actions go through vault dispatch.
 #
 #   GITHUB_TOKEN          — GitHub API access (publish, deploy, post)
 #   CLAWHUB_TOKEN         — ClawHub registry credentials (publish)
 #   CADDY_SSH_KEY         — SSH key for Caddy log collection
 #   (deploy keys)         — SSH keys for deployment targets
 #
-# To manage vault secrets: disinto secrets edit-vault
+# To manage secrets: disinto secrets add/show/remove/list
 # (vault redesign in progress: PR-based approval, see #73-#77)
 # ── Project-specific secrets ──────────────────────────────────────────────
 # Store all project secrets here so formulas reference env vars, never hardcode.
 BASE_RPC_URL=                              # [SECRET] on-chain RPC endpoint
 # ── Local Qwen dev agent (optional) ──────────────────────────────────────
 # Set ENABLE_LLAMA_AGENT=1 to emit agents-llama in docker-compose.yml.
 # Requires a running llama-server reachable at ANTHROPIC_BASE_URL.
 # See docs/agents-llama.md for details.
 ENABLE_LLAMA_AGENT=0                       # [CONFIG] 1 = enable agents-llama service
 ANTHROPIC_BASE_URL=                        # [CONFIG] e.g. http://host.docker.internal:8081
 # ── Tuning ────────────────────────────────────────────────────────────────
 CLAUDE_TIMEOUT=7200                        # [CONFIG] max seconds per Claude invocation
 # ── Host paths (Nomad-portable) ────────────────────────────────────────────
 # These env vars externalize host-side bind-mount paths from docker-compose.yml.
 # At cutover, Nomad jobspecs reference the same vars — no path translation.
 # Defaults point at current paths so an empty .env override still works.
 CLAUDE_BIN_DIR=/usr/local/bin/claude          # [CONFIG] host path to claude CLI binary (resolved by `disinto init`)
 CLAUDE_CONFIG_FILE=${HOME}/.claude.json       # [CONFIG] host path to claude config JSON file
 CLAUDE_DIR=${HOME}/.claude                    # [CONFIG] host path to .claude directory (reproduce/edge)
 AGENT_SSH_DIR=${HOME}/.ssh                    # [CONFIG] host path to SSH keys directory
 SOPS_AGE_DIR=${HOME}/.config/sops/age         # [CONFIG] host path to SOPS age key directory
 # ── Claude Code shared OAuth state ─────────────────────────────────────────
 # Shared directory used by every factory container so Claude Code's internal
 # proper-lockfile-based OAuth refresh lock works across containers. Both
 # values must live outside $HOME (so docker bind mounts don't depend on UID
 # mapping) and must be the same absolute path on host and inside each
 # container. See docs/CLAUDE-AUTH-CONCURRENCY.md.
 CLAUDE_SHARED_DIR=/var/lib/disinto/claude-shared
 CLAUDE_CONFIG_DIR=${CLAUDE_SHARED_DIR}/config
 # ── Factory safety ────────────────────────────────────────────────────────
 # Disables Claude Code auto-updater, telemetry, error reporting, and bug
 # command. Factory sessions are production processes — they must never phone
--- a/.codeberg/ISSUE_TEMPLATE/bug.yaml
+++ b/.codeberg/ISSUE_TEMPLATE/bug.yaml
@ -1,7 +1,7 @@
 name: Bug Report
 about: Something is broken or behaving incorrectly
 labels:
-  - bug
+  - bug-report
 body:
  - type: textarea
    id: what
--- a/.codeberg/ISSUE_TEMPLATE/feature.yaml
+++ b/.codeberg/ISSUE_TEMPLATE/feature.yaml
--- a/.codeberg/ISSUE_TEMPLATE/refactor.yaml
+++ b/.codeberg/ISSUE_TEMPLATE/refactor.yaml
--- a/.gitignore
+++ b/.gitignore
@ -3,7 +3,6 @@
 # Encrypted secrets — safe to commit (SOPS-encrypted with age)
 !.env.enc
 !.env.vault.enc
 !.sops.yaml
 # Per-box project config (generated by disinto init)
@ -28,3 +27,15 @@ secrets/
 # Pre-built binaries for Docker builds (avoid network calls during build)
 docker/agents/bin/
 # Generated docker-compose.yml (run 'bin/disinto init' to regenerate)
 # Note: This file is now committed to track volume mount configuration
 # docker-compose.yml
 # Generated Caddyfile — single source of truth is generate_caddyfile in lib/generators.sh
 docker/Caddyfile
 # Python bytecode
 __pycache__/
 *.pyc
 *.pyo
--- a/.woodpecker/agent-smoke.sh
+++ b/.woodpecker/agent-smoke.sh
@ -11,6 +11,11 @@ set -euo pipefail
 cd "$(dirname "$0")/.."
 # CI-side filesystem snapshot: show lib/ state at smoke time (#600)
 echo "=== smoke environment snapshot ==="
 ls -la lib/ 2>&1 | head -50
 echo "=== "
 FAILED=0
 # ── helpers ─────────────────────────────────────────────────────────────────
@ -93,32 +98,37 @@ echo "syntax check done"
 echo "=== 2/2  Function resolution ==="
-# Functions provided by shared lib files (available to all agent scripts via source).
+# Enumerate ALL lib/*.sh files in stable lexicographic order (#742).
 # Previous approach used a hand-maintained REQUIRED_LIBS list, which silently
 # became incomplete as new libs were added, producing partial LIB_FUNS that
 # caused non-deterministic "undef" failures.
 #
-# Included — these are inline-sourced by agent scripts:
+# Excluded from LIB_FUNS (not sourced inline by agents):
 #   lib/env.sh              — sourced by every agent (log, forge_api, etc.)
 #   lib/agent-sdk.sh        — sourced by SDK agents (agent_run, agent_recover_session)
 #   lib/ci-helpers.sh       — sourced by pollers and review (ci_passed, classify_pipeline_failure, etc.)
 #   lib/load-project.sh     — sourced by env.sh when PROJECT_TOML is set
 #   lib/file-action-issue.sh — sourced by gardener-run.sh (file_action_issue)
 #   lib/secret-scan.sh      — sourced by file-action-issue.sh (scan_for_secrets, redact_secrets)
 #   lib/formula-session.sh  — sourced by formula-driven agents (acquire_run_lock, check_memory, etc.)
 #   lib/mirrors.sh          — sourced by merge sites (mirror_push)
 #   lib/guard.sh            — sourced by all polling-loop entry points (check_active)
 #   lib/issue-lifecycle.sh  — sourced by agents for issue claim/release/block/deps
 #   lib/worktree.sh         — sourced by agents for worktree create/recover/cleanup/preserve
 #
 # Excluded — not sourced inline by agents:
 #   lib/tea-helpers.sh      — sourced conditionally by env.sh (tea_file_issue, etc.); checked standalone below
 #   lib/ci-debug.sh         — standalone CLI tool, run directly (not sourced)
 #   lib/parse-deps.sh       — executed via `bash lib/parse-deps.sh` (not sourced)
 #   lib/hooks/*.sh          — Claude Code hook scripts, executed by the harness (not sourced)
-#
+EXCLUDED_LIBS="lib/ci-debug.sh lib/parse-deps.sh"
-# If a new lib file is added and sourced by agents, add it to LIB_FUNS below
+
-# and add a check_script call for it in the lib files section further down.
+# Build the list of lib files in deterministic order (LC_ALL=C sort).
 # Fail loudly if no lib files are found — checkout is broken.
 mapfile -t ALL_LIBS < <(LC_ALL=C find lib -maxdepth 1 -name '*.sh' -print | LC_ALL=C sort)
 if [ "${#ALL_LIBS[@]}" -eq 0 ]; then
  echo 'FAIL [no-libs] no lib/*.sh files found at smoke time' >&2
  printf '  pwd=%s\n' "$(pwd)" >&2
  echo '=== SMOKE TEST FAILED (precondition) ===' >&2
  exit 2
 fi
 # Build LIB_FUNS from all non-excluded lib files.
 # Use set -e inside the subshell so a failed get_fns aborts loudly
 # instead of silently shrinking the function list.
 LIB_FUNS=$(
-  for f in lib/agent-sdk.sh lib/env.sh lib/ci-helpers.sh lib/load-project.sh lib/secret-scan.sh lib/file-action-issue.sh lib/formula-session.sh lib/mirrors.sh lib/guard.sh lib/pr-lifecycle.sh lib/issue-lifecycle.sh lib/worktree.sh; do
+  set -e
-    if [ -f "$f" ]; then get_fns "$f"; fi
+  for f in "${ALL_LIBS[@]}"; do
    # shellcheck disable=SC2086
    skip=0; for ex in $EXCLUDED_LIBS; do [ "$f" = "$ex" ] && skip=1; done
    [ "$skip" -eq 1 ] && continue
    get_fns "$f"
  done | sort -u
 )
@ -170,8 +180,15 @@ check_script() {
  while IFS= read -r fn; do
    [ -z "$fn" ] && continue
    is_known_cmd "$fn" && continue
-    if ! printf '%s\n' "$all_fns" | grep -qxF "$fn"; then
+    # Use here-string (<<<) instead of pipe to avoid SIGPIPE race (#742):
    # with pipefail, `printf | grep -q` can fail when grep closes the pipe
    # early after finding a match, causing printf to get SIGPIPE (exit 141).
    # This produced non-deterministic false "undef" failures.
    if ! grep -qxF "$fn" <<< "$all_fns"; then
      printf 'FAIL [undef] %s: %s\n' "$script" "$fn"
      printf '  all_fns count: %d\n' "$(grep -c . <<< "$all_fns")"
      printf '  LIB_FUNS contains "%s": %s\n' "$fn" "$(grep -cxF "$fn" <<< "$LIB_FUNS")"
      printf '  defining lib (if any): %s\n' "$(grep -l "^[[:space:]]*${fn}[[:space:]]*()" lib/*.sh 2>/dev/null | tr '\n' ' ')"
      FAILED=1
    fi
  done <<< "$candidates"
@ -184,9 +201,8 @@ check_script lib/env.sh              lib/mirrors.sh
 check_script lib/agent-sdk.sh
 check_script lib/ci-helpers.sh
 check_script lib/secret-scan.sh
 check_script lib/file-action-issue.sh   lib/secret-scan.sh
 check_script lib/tea-helpers.sh         lib/secret-scan.sh
-check_script lib/formula-session.sh
+check_script lib/formula-session.sh     lib/ops-setup.sh
 check_script lib/load-project.sh
 check_script lib/mirrors.sh              lib/env.sh
 check_script lib/guard.sh
@ -197,12 +213,13 @@ check_script lib/issue-lifecycle.sh   lib/secret-scan.sh
 # Still checked for function resolution against LIB_FUNS + own definitions.
 check_script lib/ci-debug.sh
 check_script lib/parse-deps.sh
 check_script lib/sprint-filer.sh
 # Agent scripts — list cross-sourced files where function scope flows across files.
 check_script dev/dev-agent.sh
 check_script dev/dev-poll.sh
 check_script dev/phase-test.sh
-check_script gardener/gardener-run.sh
+check_script gardener/gardener-run.sh    lib/formula-session.sh
 check_script review/review-pr.sh         lib/agent-sdk.sh
 check_script review/review-poll.sh
 check_script planner/planner-run.sh      lib/formula-session.sh
--- a/.woodpecker/detect-duplicates.py
+++ b/.woodpecker/detect-duplicates.py
@ -292,6 +292,8 @@ def main() -> int:
        "21aec56a99d5252b23fb9a38b895e8e8": "Verification helper: check body for Decomposed from pattern",
        "60ea98b3604557d539193b2a6624e232": "Verification helper: append sub-issue number",
        "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern",
        # Standard lib source block shared across formula-driven agent run scripts
        "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)",
    }
    if not sh_files:
--- a/.woodpecker/publish-images.yml
+++ b/.woodpecker/publish-images.yml
@ -0,0 +1,64 @@
 # .woodpecker/publish-images.yml — Build and push versioned container images
 # Triggered on tag pushes (e.g. v1.2.3). Builds and pushes:
 #   - ghcr.io/disinto/agents:<tag>
 #   - ghcr.io/disinto/reproduce:<tag>
 #   - ghcr.io/disinto/edge:<tag>
 #
 # Requires GHCR_TOKEN secret configured in Woodpecker with push access
 # to ghcr.io/disinto.
 when:
  event: tag
  ref: refs/tags/v*
 clone:
  git:
    image: alpine/git
    commands:
      - AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
      - git clone --depth 1 "$AUTH_URL" .
      - git fetch --depth 1 origin "$CI_COMMIT_REF"
      - git checkout FETCH_HEAD
 steps:
  - name: build-and-push-agents
    image: plugins/docker
    settings:
      repo: ghcr.io/disinto/agents
      registry: ghcr.io
      dockerfile: docker/agents/Dockerfile
      context: .
      tags:
        - ${CI_COMMIT_TAG}
        - latest
      username: disinto
      password:
        from_secret: GHCR_TOKEN
  - name: build-and-push-reproduce
    image: plugins/docker
    settings:
      repo: ghcr.io/disinto/reproduce
      registry: ghcr.io
      dockerfile: docker/reproduce/Dockerfile
      context: .
      tags:
        - ${CI_COMMIT_TAG}
        - latest
      username: disinto
      password:
        from_secret: GHCR_TOKEN
  - name: build-and-push-edge
    image: plugins/docker
    settings:
      repo: ghcr.io/disinto/edge
      registry: ghcr.io
      dockerfile: docker/edge/Dockerfile
      context: docker/edge
      tags:
        - ${CI_COMMIT_TAG}
        - latest
      username: disinto
      password:
        from_secret: GHCR_TOKEN
--- a/.woodpecker/run-secret-scan.sh
+++ b/.woodpecker/run-secret-scan.sh
@ -0,0 +1,68 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # run-secret-scan.sh — CI wrapper for lib/secret-scan.sh
 #
 # Scans files changed in this PR for plaintext secrets.
 # Exits non-zero if any secret is detected.
 # shellcheck source=../lib/secret-scan.sh
 source lib/secret-scan.sh
 # Path patterns considered secret-adjacent
 SECRET_PATH_PATTERNS=(
  '\.env'
  'tools/vault-.*\.sh'
  'nomad/'
  'vault/'
  'action-vault/'
  'lib/hvault\.sh'
  'lib/action-vault\.sh'
 )
 # Build a single regex from patterns
 path_regex=$(printf '%s|' "${SECRET_PATH_PATTERNS[@]}")
 path_regex="${path_regex%|}"
 # Get files changed in this PR vs target branch.
 # Note: shallow clone (depth 50) may lack the merge base for very large PRs,
 # causing git diff to fail — || true means the gate skips rather than blocks.
 changed_files=$(git diff --name-only --diff-filter=ACMR "origin/${CI_COMMIT_TARGET_BRANCH}...HEAD" || true)
 if [ -z "$changed_files" ]; then
  echo "secret-scan: no changed files found, skipping"
  exit 0
 fi
 # Filter to secret-adjacent paths only
 target_files=$(printf '%s\n' "$changed_files" | grep -E "$path_regex" || true)
 if [ -z "$target_files" ]; then
  echo "secret-scan: no secret-adjacent files changed, skipping"
  exit 0
 fi
 echo "secret-scan: scanning $(printf '%s\n' "$target_files" | wc -l) file(s):"
 printf '  %s\n' "$target_files"
 failures=0
 while IFS= read -r file; do
  # Skip deleted files / non-existent
  [ -f "$file" ] || continue
  # Skip binary files
  file -b --mime-encoding "$file" 2>/dev/null | grep -q binary && continue
  content=$(cat "$file")
  if ! scan_for_secrets "$content"; then
    echo "FAIL: secret detected in $file"
    failures=$((failures + 1))
  fi
 done <<< "$target_files"
 if [ "$failures" -gt 0 ]; then
  echo ""
  echo "secret-scan: $failures file(s) contain potential secrets — merge blocked"
  echo "If these are false positives, verify patterns in lib/secret-scan.sh"
  exit 1
 fi
 echo "secret-scan: all files clean"
--- a/.woodpecker/secret-scan.yml
+++ b/.woodpecker/secret-scan.yml
@ -0,0 +1,32 @@
 # .woodpecker/secret-scan.yml — Block PRs that leak plaintext secrets
 #
 # Triggers on pull requests touching secret-adjacent paths.
 # Sources lib/secret-scan.sh and scans each changed file's content.
 # Exits non-zero if any potential secret is detected.
 when:
  - event: pull_request
    path:
      - ".env*"
      - "tools/vault-*.sh"
      - "nomad/**/*"
      - "vault/**/*"
      - "action-vault/**/*"
      - "lib/hvault.sh"
      - "lib/action-vault.sh"
 clone:
  git:
    image: alpine/git
    commands:
      - AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
      - git clone --depth 50 "$AUTH_URL" .
      - git fetch --depth 50 origin "$CI_COMMIT_REF" "$CI_COMMIT_TARGET_BRANCH"
      - git checkout FETCH_HEAD
 steps:
  - name: secret-scan
    image: alpine:3
    commands:
      - apk add --no-cache bash git grep file
      - bash .woodpecker/run-secret-scan.sh
--- a/AGENTS.md
+++ b/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Disinto — Agent Instructions
 ## What this repo is
@ -31,19 +31,19 @@ disinto/                 (code repo)
 ├── supervisor/    supervisor-run.sh — formula-driven health monitoring (polling-loop executor)
 │                  preflight.sh — pre-flight data collection for supervisor formula
 ├── architect/     architect-run.sh — strategic decomposition of vision into sprints
-├── vault/         vault-env.sh — shared env setup (vault redesign in progress, see #73-#77)
+├── action-vault/  vault-env.sh — shared env setup (vault redesign in progress, see #73-#77)
 │                  SCHEMA.md — vault item schema documentation
 │                  validate.sh — vault item validator
 │                  examples/ — example vault action TOMLs (promote, publish, release, webhook-call)
-├── lib/           env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py,
+├── lib/           env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh
 │                  branch-protection.sh, secret-scan.sh, tea-helpers.sh, vault.sh, ci-log-reader.py
 │                  hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure)
 ├── projects/      *.toml.example — templates; *.toml — local per-box config (gitignored)
 ├── formulas/      Issue templates (TOML specs for multi-step agent tasks)
-├── docker/        Dockerfiles and entrypoints for reproduce, triage, and edge dispatcher agents
+├── docker/        Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/)
 ├── tools/         Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh)
 ├── docs/          Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md)
 ├── site/          disinto.ai website content
-├── tests/         Test files (mock-forgejo.py, smoke-init.sh)
+├── tests/         Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats)
 ├── templates/     Issue templates
 ├── bin/           The `disinto` CLI script
 ├── disinto-factory/  Setup documentation and skill
@ -86,7 +86,7 @@ Each agent has a `.profile` repository on Forgejo storing `knowledge/lessons-lea
 - All scripts start with `#!/usr/bin/env bash` and `set -euo pipefail`
 - Source shared environment: `source "$(dirname "$0")/../lib/env.sh"`
 - Log to `$LOGFILE` using the `log()` function from env.sh or defined locally
- Never hardcode secrets — agent secrets come from `.env.enc`, vault secrets from `.env.vault.enc` (or `.env`/`.env.vault` fallback)
+- Never hardcode secrets — agent secrets come from `.env.enc`, vault secrets from `secrets/<NAME>.enc` (age-encrypted, one file per key)
 - Never embed secrets in issue bodies, PR descriptions, or comments — use env var references (e.g. `$BASE_RPC_URL`)
 - ShellCheck must pass (CI runs `shellcheck` on all `.sh` files)
 - Avoid duplicate code — shared helpers go in `lib/`
@ -113,10 +113,13 @@ bash dev/phase-test.sh
 | Supervisor | `supervisor/` | Health monitoring | [supervisor/AGENTS.md](supervisor/AGENTS.md) |
 | Planner | `planner/` | Strategic planning | [planner/AGENTS.md](planner/AGENTS.md) |
 | Predictor | `predictor/` | Infrastructure pattern detection | [predictor/AGENTS.md](predictor/AGENTS.md) |
-| Architect | `architect/` | Strategic decomposition | [architect/AGENTS.md](architect/AGENTS.md) |
+| Architect | `architect/` | Strategic decomposition (read-only on project repo) | [architect/AGENTS.md](architect/AGENTS.md) |
 | Filer | `lib/sprint-filer.sh` | Sub-issue filing from merged sprint PRs | ops repo pipeline (deferred, see #779) |
 | Reproduce | `docker/reproduce/` | Bug reproduction using Playwright MCP | `formulas/reproduce.toml` |
 | Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` |
 | Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` |
 | agents-llama | `docker/agents/` (same image) | Local-Qwen dev agent (`AGENT_ROLES=dev`), gated on `ENABLE_LLAMA_AGENT=1` | [docs/agents-llama.md](docs/agents-llama.md) |
 | agents-llama-all | `docker/agents/` (same image) | Local-Qwen all-roles agent (all 7 roles), profile `agents-llama-all` | [docs/agents-llama.md](docs/agents-llama.md) |
 > **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77).
 > See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details.
@ -135,7 +138,7 @@ Issues flow: `backlog` → `in-progress` → PR → CI → review → merge →
 |---|---|---|
 | `backlog` | Issue is queued for implementation. Dev-poll picks the first ready one. | Planner, gardener, humans |
 | `priority` | Queue tier above plain backlog. Issues with both `priority` and `backlog` are picked before plain `backlog` issues. FIFO within each tier. | Planner, humans |
-| `in-progress` | Dev-agent is actively working on this issue. Only one issue per project is in-progress at a time. | dev-agent.sh (claims issue) |
+| `in-progress` | Dev-agent is actively working on this issue. Only one issue per project is in-progress at a time. Also set on vision issues by filer-bot when sub-issues are filed (#764). | dev-agent.sh (claims issue), filer-bot (vision issues) |
 | `blocked` | Issue is stuck — agent session failed, crashed, timed out, or CI exhausted. Diagnostic comment on the issue has details. Also used for unmet dependencies. | dev-agent.sh, dev-poll.sh (on failure) |
 | `tech-debt` | Pre-existing issue flagged by AI reviewer, not introduced by a PR. | review-pr.sh (auto-created follow-ups) |
 | `underspecified` | Dev-agent refused the issue as too large or vague. | dev-poll.sh (on preflight `too_large`), dev-agent.sh (on mid-run `too_large` refusal) |
@ -174,19 +177,17 @@ Humans write these. Agents read and enforce them.
 | ID | Decision | Rationale |
 |---|---|---|
 | AD-001 | Nervous system runs from a polling loop (`docker/agents/entrypoint.sh`), not PR-based actions. | Planner, predictor, gardener, supervisor run directly via `*-run.sh`. They create work, they don't become work. (See PR #474 revert.) |
-| AD-002 | **Concurrency is bounded per LLM backend, not per project.** One concurrent Claude session per OAuth credential pool; one concurrent session per llama-server instance. Containers with disjoint backends may run in parallel. | The single-thread invariant is about *backends*, not pipelines. **(a) Anthropic OAuth credentials race on token refresh** — two sessions sharing one mounted `~/.claude` will trip over each other during rotation and 401. All agents inside an OAuth-mounted container serialize on `flock session.lock`. **(b) llama-server has finite VRAM and one KV cache** — parallel inference thrashes the cache and risks OOM. All llama-backed agents serialize on the same lock. **(c) Disjoint backends are free to parallelize.** Today `disinto-agents` (Anthropic OAuth, runs `review,gardener`) runs concurrently with `disinto-agents-llama` (llama, runs `dev`) on the same project — they share neither OAuth state nor llama VRAM. **(d) Per-project work-conflict safety** (no duplicate dev work, no merge conflicts on the same branch) is enforced by `issue_claim` (assignee + `in-progress` label) and per-issue worktrees — that's a separate guard that does NOT depend on this AD. |
+| AD-002 | **Concurrency is bounded per LLM backend, not per project.** One concurrent Claude session per OAuth credential pool; one concurrent session per llama-server instance. Containers with disjoint backends may run in parallel. | The single-thread invariant is about *backends*, not pipelines. **(a) Anthropic OAuth credentials race on token refresh** — each container uses a per-session `CLAUDE_CONFIG_DIR`, so Claude Code's native lockfile-based OAuth refresh handles contention automatically without external serialization. (Legacy: set `CLAUDE_EXTERNAL_LOCK=1` to re-enable the old `flock session.lock` wrapper for rollback.) **(b) llama-server has finite VRAM and one KV cache** — parallel inference thrashes the cache and risks OOM. All llama-backed agents serialize on the same lock. **(c) Disjoint backends are free to parallelize.** Today `disinto-agents` (Anthropic OAuth, runs `review,gardener`) runs concurrently with `disinto-agents-llama` (llama, runs `dev`) on the same project — they share neither OAuth state nor llama VRAM. **(d) Per-project work-conflict safety** (no duplicate dev work, no merge conflicts on the same branch) is enforced by `issue_claim` (assignee + `in-progress` label) and per-issue worktrees — that's a separate guard that does NOT depend on this AD. |
 | AD-003 | The runtime creates and destroys, the formula preserves. | Runtime manages worktrees/sessions/temp. Formulas commit knowledge to git before signaling done. |
 | AD-004 | Event-driven > polling > fixed delays. | Never `waitForTimeout` or hardcoded sleep. Use phase files, webhooks, or poll loops with backoff. |
-| AD-005 | Secrets via env var indirection, never in issue bodies. | Issue bodies become code. Agent secrets go in `.env.enc`, vault secrets in `.env.vault.enc` (SOPS-encrypted when available; plaintext `.env`/`.env.vault` fallback supported). Referenced as `$VAR_NAME`. Runner gets only vault secrets; agents get only agent secrets. |
+| AD-005 | Secrets via env var indirection, never in issue bodies. | Issue bodies become code. Agent secrets go in `.env.enc` (SOPS-encrypted), vault secrets in `secrets/<NAME>.enc` (age-encrypted, one file per key). Referenced as `$VAR_NAME`. Runner gets only vault secrets; agents get only agent secrets. |
-| AD-006 | External actions go through vault dispatch, never direct. | Agents build addressables; only the vault exercises them (publishes, deploys, posts). Tokens for external systems (`GITHUB_TOKEN`, `CLAWHUB_TOKEN`, deploy keys) live only in `.env.vault.enc` and are injected into the ephemeral runner container. `lib/env.sh` unsets them so agents never hold them. PRs with direct external actions without vault dispatch get REQUEST_CHANGES. (Vault redesign in progress: PR-based approval on ops repo, see #73-#77) |
+| AD-006 | External actions go through vault dispatch, never direct. | Agents build addressables; only the vault exercises them (publishes, deploys, posts). Tokens for external systems (`GITHUB_TOKEN`, `CLAWHUB_TOKEN`, deploy keys) live only in `secrets/<NAME>.enc` and are decrypted into the ephemeral runner container. `lib/env.sh` unsets them so agents never hold them. PRs with direct external actions without vault dispatch get REQUEST_CHANGES. (Vault redesign in progress: PR-based approval on ops repo, see #73-#77) |
 **Who enforces what:**
 - **Gardener** checks open backlog issues against ADs during grooming; closes violations with a comment referencing the AD number.
 - **Planner** plans within the architecture; does not create issues that violate ADs.
 - **Dev-agent** reads AGENTS.md before implementing; refuses work that violates ADs.
- **AD-002 is a runtime invariant; nothing for the gardener to check at issue-groom time.** Concurrency is enforced by `flock session.lock` within each container and by `issue_claim` for per-issue work. A violation manifests as a 401 or VRAM OOM in agent logs, not as a malformed issue.
+- **AD-002 is a runtime invariant; nothing for the gardener to check at issue-groom time.** OAuth concurrency is handled by per-session `CLAUDE_CONFIG_DIR` isolation (with `CLAUDE_EXTERNAL_LOCK` as a rollback flag). Per-issue work is enforced by `issue_claim`. A violation manifests as a 401 or VRAM OOM in agent logs, not as a malformed issue.
 ---
 ## Phase-Signaling Protocol
@ -196,6 +197,4 @@ at each phase boundary by writing to a phase file (e.g.
 Key phases: `PHASE:awaiting_ci` → `PHASE:awaiting_review` → `PHASE:done`.
 Also: `PHASE:escalate` (needs human input), `PHASE:failed`.
-
+See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec, orchestrator reaction matrix, sequence diagram, and crash recovery.
 See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec
 including the orchestrator reaction matrix, sequence diagram, and crash recovery.
--- a/README.md
+++ b/README.md
@ -72,6 +72,8 @@ cd disinto
 disinto init https://github.com/yourorg/yourproject
 ```
 This will generate a `docker-compose.yml` file.
 Or configure manually — edit `.env` with your values:
 ```bash
@ -97,7 +99,7 @@ CLAUDE_TIMEOUT=7200         # max seconds per Claude invocation (default: 2h)
 docker compose up -d
 # 4. Verify the entrypoint loop is running
-docker exec disinto-agents-1 tail -f /home/agent/data/agent-entrypoint.log
+docker exec disinto-agents tail -f /home/agent/data/agent-entrypoint.log
 ```
 ## Directory Structure
--- a/action-vault/SCHEMA.md
+++ b/action-vault/SCHEMA.md
@ -50,7 +50,7 @@ blast_radius = "low"       # optional: overrides policy.toml tier ("low"|"medium
 ## Secret Names
-Secret names must be defined in `.env.vault.enc` on the ops repo. The vault validates that requested secrets exist in the allowlist before execution.
+Secret names must have a corresponding `secrets/<NAME>.enc` file (age-encrypted). The vault validates that requested secrets exist in the allowlist before execution.
 Common secret names:
 - `CLAWHUB_TOKEN` - Token for ClawHub skill publishing
--- a/action-vault/classify.sh
+++ b/action-vault/classify.sh
--- a/action-vault/examples/promote.toml
+++ b/action-vault/examples/promote.toml
--- a/action-vault/examples/publish.toml
+++ b/action-vault/examples/publish.toml
--- a/action-vault/examples/release.toml
+++ b/action-vault/examples/release.toml
--- a/action-vault/examples/webhook-call.toml
+++ b/action-vault/examples/webhook-call.toml
--- a/action-vault/policy.toml
+++ b/action-vault/policy.toml
--- a/action-vault/validate.sh
+++ b/action-vault/validate.sh
--- a/action-vault/vault-env.sh
+++ b/action-vault/vault-env.sh
@ -28,7 +28,7 @@ fi
 # VAULT ACTION VALIDATION
 # =============================================================================
-# Allowed secret names - must match keys in .env.vault.enc
+# Allowed secret names - must match files in secrets/<NAME>.enc
 VAULT_ALLOWED_SECRETS="CLAWHUB_TOKEN GITHUB_TOKEN CODEBERG_TOKEN DEPLOY_KEY NPM_TOKEN DOCKER_HUB_TOKEN"
 # Allowed mount aliases — well-known file-based credential directories
--- a/architect/AGENTS.md
+++ b/architect/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Architect — Agent Instructions
 ## What this agent is
@ -10,9 +10,9 @@ converses with humans through PR comments.
 ## Role
 - **Input**: Vision issues from VISION.md, prerequisite tree from ops repo
- **Output**: Sprint proposals as PRs on the ops repo, sub-issue files
+- **Output**: Sprint proposals as PRs on the ops repo (with embedded `## Sub-issues` blocks)
 - **Mechanism**: Bash-driven orchestration in `architect-run.sh`, pitching formula via `formulas/run-architect.toml`
- **Identity**: `architect-bot` on Forgejo
+- **Identity**: `architect-bot` on Forgejo (READ-ONLY on project repo, write on ops repo only — #764)
 ## Responsibilities
@ -24,40 +24,66 @@ converses with humans through PR comments.
   acceptance criteria and dependencies
 4. **Human conversation**: Respond to PR comments, refine sprint proposals based
   on human feedback
-5. **Sub-issue filing**: After design forks are resolved, file concrete sub-issues
+5. **Sub-issue definition**: Define concrete sub-issues in the `## Sub-issues`
-   for implementation
+   block of the sprint spec. Filing is handled by `filer-bot` after sprint PR
   merge (#764)
 ## Formula
 The architect pitching is driven by `formulas/run-architect.toml`. This formula defines
 the steps for:
 - Research: analyzing vision items and prerequisite tree
- Pitch: creating structured sprint PRs
+- Pitch: creating structured sprint PRs with embedded `## Sub-issues` blocks
- Sub-issue filing: creating concrete implementation issues
+- Design Q&A: refining the sprint via PR comments after human ACCEPT
 ## Bash-driven orchestration
 Bash in `architect-run.sh` handles state detection and orchestration:
 - **Deterministic state detection**: Bash reads the Forgejo reviews API to detect
-  ACCEPT/REJECT decisions — no model-dependent API parsing
+  ACCEPT/REJECT decisions — checks both formal APPROVED reviews and PR comments, not just comments (#718)
 - **Human guidance injection**: Review body text from ACCEPT reviews is injected
  directly into the research prompt as context
 - **Response processing**: When ACCEPT/REJECT responses are detected, bash invokes
  the agent with appropriate context (session resumed for questions phase)
 - **Pitch capture**: `pitch_output` is written to a temp file instead of captured via `$()` subshell, because `agent_run` writes to side-channels (`SID_FILE`, `LOGFILE`) that subshell capture would suppress (#716)
 - **PR URL construction**: existing-PR check uses `${FORGE_API}/pulls` directly (not `${FORGE_API}/repos/…`) — the base URL already includes the repos segment (#717)
 ### State transitions
 ```
 New vision issue → pitch PR (model generates pitch, bash creates PR)
  ↓
-ACCEPT review → research + questions (model, session saved to $SID_FILE)
+APPROVED review → start design questions (model posts Q1:, adds Design forks section)
  ↓
-Answers received → sub-issue filing (model, session resumed via --resume)
+Answers received → continue Q&A (model processes answers, posts follow-ups)
  ↓
 All forks resolved → finalize ## Sub-issues section in sprint spec
  ↓
 Sprint PR merged → filer-bot files sub-issues on project repo (#764)
  ↓
 REJECT review → close PR + journal (model processes rejection, bash merges PR)
 ```
 ### Vision issue lifecycle
 Vision issues decompose into sprint sub-issues. Sub-issues are defined in the
 `## Sub-issues` block of the sprint spec (between `<!-- filer:begin -->` and
 `<!-- filer:end -->` markers) and filed by `filer-bot` after the sprint PR merges
 on the ops repo (#764).
 Each filer-created sub-issue carries a `<!-- decomposed-from: #<vision>, sprint: <slug>, id: <id> -->`
 marker in its body for idempotency and traceability.
 The filer-bot (via `lib/sprint-filer.sh`) handles vision lifecycle:
 1. After filing sub-issues, adds `in-progress` label to the vision issue
 2. On each run, checks if all sub-issues for a vision are closed
 3. If all closed, posts a summary comment and closes the vision issue
 The architect no longer writes to the project repo — it is read-only (#764).
 All project-repo writes (issue filing, label management, vision closure) are
 handled by filer-bot with its narrowly-scoped `FORGE_FILER_TOKEN`.
 ### Session management
 The agent maintains a global session file at `/tmp/architect-session-{project}.sid`.
@ -70,6 +96,7 @@ Run via `architect/architect-run.sh`, which:
 - Acquires a poll-loop lock (via `acquire_lock`) and checks available memory
 - Cleans up per-issue scratch files from previous runs (`/tmp/architect-{project}-scratch-*.md`)
 - Sources shared libraries (env.sh, formula-session.sh)
 - Exports `FORGE_TOKEN_OVERRIDE="${FORGE_ARCHITECT_TOKEN}"` BEFORE sourcing env.sh, ensuring architect-bot identity survives re-sourcing (#762)
 - Uses FORGE_ARCHITECT_TOKEN for authentication
 - Processes existing architect PRs via bash-driven design phase
 - Loads the formula and builds context from VISION.md, AGENTS.md, and ops repo
@ -79,7 +106,9 @@ Run via `architect/architect-run.sh`, which:
  - Selects up to `pitch_budget` (3 - open architect PRs) remaining vision issues
  - For each selected issue, invokes stateless `claude -p` with issue body + context
  - Creates PRs directly from pitch content (no scratch files)
- Agent is invoked only for response processing (ACCEPT/REJECT handling)
+- Agent is invoked for stateless pitch generation and response processing (ACCEPT/REJECT handling)
 - NOTE: architect-bot is read-only on the project repo (#764) — sub-issue filing
  and in-progress label management are handled by filer-bot after sprint PR merge
 **Multi-sprint pitching**: The architect pitches up to 3 sprints per run. Bash handles all state management:
 - Fetches Forgejo API data (vision issues, open PRs, merged PRs)
@ -104,4 +133,5 @@ empty file not created, just document it).
 - #100: Architect formula — research + design fork identification
 - #101: Architect formula — sprint PR creation with questions
 - #102: Architect formula — answer parsing + sub-issue filing
 - #764: Permission scoping — architect read-only on project repo, filer-bot files sub-issues
 - #491: Refactor — bash-driven design phase with stateful session resumption
--- a/architect/architect-run.sh
+++ b/architect/architect-run.sh
@ -34,10 +34,11 @@ FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
 # Accept project config from argument; default to disinto
 export PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}"
 # Set override BEFORE sourcing env.sh so it survives any later re-source of
 # env.sh from nested shells / claude -p tools (#762, #747)
 export FORGE_TOKEN_OVERRIDE="${FORGE_ARCHITECT_TOKEN:-}"
 # shellcheck source=../lib/env.sh
 source "$FACTORY_ROOT/lib/env.sh"
 # Override FORGE_TOKEN with architect-bot's token (#747)
 FORGE_TOKEN="${FORGE_ARCHITECT_TOKEN:-${FORGE_TOKEN}}"
 # shellcheck source=../lib/formula-session.sh
 source "$FACTORY_ROOT/lib/formula-session.sh"
 # shellcheck source=../lib/worktree.sh
@ -79,7 +80,7 @@ log "--- Architect run start ---"
 # ── Resolve forge remote for git operations ─────────────────────────────
 # Run git operations from the project checkout, not the baked code dir
-cd "$PROJECT_REPO_ROOT" || exit 1
+cd "$PROJECT_REPO_ROOT"
 resolve_forge_remote
@ -116,8 +117,8 @@ build_architect_prompt() {
 You are the architect agent for ${FORGE_REPO}. Work through the formula below.
 Your role: strategic decomposition of vision issues into development sprints.
-Propose sprints via PRs on the ops repo, converse with humans through PR comments,
+Propose sprints via PRs on the ops repo, converse with humans through PR comments.
-and file sub-issues after design forks are resolved.
+You are READ-ONLY on the project repo — sub-issues are filed by filer-bot after sprint PR merge (#764).
 ## Project context
 ${CONTEXT_BLOCK}
@ -132,7 +133,88 @@ ${PROMPT_FOOTER}
 _PROMPT_EOF_
 }
-PROMPT=$(build_architect_prompt)
+# ── Build prompt for specific session mode ───────────────────────────────
 # Args: session_mode (pitch / questions_phase / start_questions)
 # Returns: prompt text via stdout
 build_architect_prompt_for_mode() {
  local session_mode="$1"
  case "$session_mode" in
    "start_questions")
      cat <<_PROMPT_EOF_
 You are the architect agent for ${FORGE_REPO}. Work through the formula below.
 Your role: strategic decomposition of vision issues into development sprints.
 Propose sprints via PRs on the ops repo, converse with humans through PR comments.
 You are READ-ONLY on the project repo — sub-issues are filed by filer-bot after sprint PR merge (#764).
 ## CURRENT STATE: Approved PR awaiting initial design questions
 A sprint pitch PR has been approved by the human (via APPROVED review), but the
 design conversation has not yet started. Your task is to:
 1. Read the approved sprint pitch from the PR body
 2. Identify the key design decisions that need human input
 3. Post initial design questions (Q1:, Q2:, etc.) as comments on the PR
 4. Add a `## Design forks` section to the PR body documenting the design decisions
 5. Update the ## Sub-issues section in the sprint spec if design decisions affect decomposition
 This is NOT a pitch phase — the pitch is already approved. This is the START
 of the design Q&A phase. Sub-issues are filed by filer-bot after sprint PR merge (#764).
 ## Project context
 ${CONTEXT_BLOCK}
 ${GRAPH_SECTION}
 ${SCRATCH_CONTEXT}
 $(formula_lessons_block)
 ## Formula
 ${FORMULA_CONTENT}
 ${SCRATCH_INSTRUCTION}
 ${PROMPT_FOOTER}
 _PROMPT_EOF_
      ;;
    "questions_phase")
      cat <<_PROMPT_EOF_
 You are the architect agent for ${FORGE_REPO}. Work through the formula below.
 Your role: strategic decomposition of vision issues into development sprints.
 Propose sprints via PRs on the ops repo, converse with humans through PR comments.
 You are READ-ONLY on the project repo — sub-issues are filed by filer-bot after sprint PR merge (#764).
 ## CURRENT STATE: Design Q&A in progress
 A sprint pitch PR is in the questions phase:
 - The PR has a `## Design forks` section
 - Initial questions (Q1:, Q2:, etc.) have been posted
 - Humans may have posted answers or follow-up questions
 Your task is to:
 1. Read the existing questions and the PR body
 2. Read human answers from PR comments
 3. Parse the answers and determine next steps
 4. Post follow-up questions if needed (Q3:, Q4:, etc.)
 5. If all design forks are resolved, finalize the ## Sub-issues section in the sprint spec
 6. Update the `## Design forks` section as you progress
 ## Project context
 ${CONTEXT_BLOCK}
 ${GRAPH_SECTION}
 ${SCRATCH_CONTEXT}
 $(formula_lessons_block)
 ## Formula
 ${FORMULA_CONTENT}
 ${SCRATCH_INSTRUCTION}
 ${PROMPT_FOOTER}
 _PROMPT_EOF_
      ;;
    "pitch"|*)
      # Default: pitch new sprints (original behavior)
      build_architect_prompt
      ;;
  esac
 }
 # ── Create worktree ──────────────────────────────────────────────────────
 formula_worktree_setup "$WORKTREE"
@ -154,7 +236,7 @@ detect_questions_phase() {
  # Use Forgejo API to find open architect PRs
  local response
  response=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls?state=open" 2>/dev/null) || return 1
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls?state=open" 2>/dev/null) || return 1
  # Check each open PR for architect markers
  pr_number=$(printf '%s' "$response" | jq -r '.[] | select(.title | contains("architect:")) | .number' 2>/dev/null | head -1) || return 1
@ -165,7 +247,7 @@ detect_questions_phase() {
  # Fetch PR body
  pr_body=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls/${pr_number}" 2>/dev/null | jq -r '.body // empty') || return 1
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls/${pr_number}" 2>/dev/null | jq -r '.body // empty') || return 1
  # Check for `## Design forks` section (added by #101 after ACCEPT)
  if ! printf '%s' "$pr_body" | grep -q "## Design forks"; then
@ -176,7 +258,7 @@ detect_questions_phase() {
  # Use jq to extract body text before grepping (handles JSON escaping properly)
  local comments
  comments=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/issues/${pr_number}/comments" 2>/dev/null) || return 1
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/issues/${pr_number}/comments" 2>/dev/null) || return 1
  if ! printf '%s' "$comments" | jq -r '.[].body // empty' | grep -qE 'Q[0-9]+:'; then
    return 1
@ -187,6 +269,71 @@ detect_questions_phase() {
  return 0
 }
 # ── Detect if PR is approved and awaiting initial design questions ────────
 # A PR is in this state when:
 # - It's an open architect PR on ops repo
 # - It has an APPROVED review (from human acceptance)
 # - It has NO `## Design forks` section yet
 # - It has NO Q1:, Q2:, etc. comments yet
 # This means the human accepted the pitch and we need to start the design
 # conversation by posting initial questions and adding the Design forks section.
 detect_approved_pending_questions() {
  local pr_number=""
  local pr_body=""
  # Get open architect PRs on ops repo
  local ops_repo="${OPS_REPO_ROOT:-/home/agent/data/ops}"
  if [ ! -d "${ops_repo}/.git" ]; then
    return 1
  fi
  # Use Forgejo API to find open architect PRs
  local response
  response=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls?state=open" 2>/dev/null) || return 1
  # Check each open PR for architect markers
  pr_number=$(printf '%s' "$response" | jq -r '.[] | select(.title | contains("architect:")) | .number' 2>/dev/null | head -1) || return 1
  if [ -z "$pr_number" ]; then
    return 1
  fi
  # Fetch PR body
  pr_body=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls/${pr_number}" 2>/dev/null | jq -r '.body // empty') || return 1
  # Check for APPROVED review
  local reviews
  reviews=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls/${pr_number}/reviews" 2>/dev/null) || return 1
  if ! printf '%s' "$reviews" | jq -e '.[] | select(.state == "APPROVED")' >/dev/null 2>&1; then
    return 1
  fi
  # Check that PR does NOT have `## Design forks` section yet
  # (we're in the "start questions" phase, not "process answers" phase)
  if printf '%s' "$pr_body" | grep -q "## Design forks"; then
    # Has design forks section — this is either in questions phase or past it
    return 1
  fi
  # Check that PR has NO question comments yet (Q1:, Q2:, etc.)
  local comments
  comments=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/issues/${pr_number}/comments" 2>/dev/null) || return 1
  if printf '%s' "$comments" | jq -r '.[].body // empty' | grep -qE 'Q[0-9]+:'; then
    # Has question comments — this is either in questions phase or past it
    return 1
  fi
  # PR is approved and awaiting initial design questions
  log "Detected PR #${pr_number} approved and awaiting initial design questions"
  return 0
 }
 # ── Sub-issue existence check ────────────────────────────────────────────
 # Check if a vision issue already has sub-issues filed from it.
 # Returns 0 if sub-issues exist and are open, 1 otherwise.
@ -225,7 +372,7 @@ has_merged_sprint_pr() {
  # Get closed PRs from ops repo
  local prs_json
  prs_json=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls?state=closed&limit=100" 2>/dev/null) || return 1
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls?state=closed&limit=100" 2>/dev/null) || return 1
  # Check each closed PR for architect markers and vision issue reference
  local pr_numbers
@ -238,7 +385,7 @@ has_merged_sprint_pr() {
    # Get PR details including merged status
    local pr_details
    pr_details=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-      "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls/${pr_num}" 2>/dev/null) || continue
+      "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls/${pr_num}" 2>/dev/null) || continue
    # Check if PR is actually merged (not just closed)
    local is_merged
@ -271,11 +418,16 @@ fetch_vision_issues() {
    "${FORGE_API}/issues?labels=vision&state=open&limit=100" 2>/dev/null || echo '[]'
 }
 # NOTE: get_vision_subissues, all_subissues_closed, close_vision_issue,
 # check_and_close_completed_visions removed (#764) — architect-bot is read-only
 # on the project repo. Vision lifecycle (closing completed visions, adding
 # in-progress labels) is now handled by filer-bot via lib/sprint-filer.sh.
 # ── Helper: Fetch open architect PRs from ops repo Forgejo API ───────────
 # Returns: JSON array of architect PR objects
 fetch_open_architect_prs() {
  curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=100" 2>/dev/null || echo '[]'
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=100" 2>/dev/null || echo '[]'
 }
 # ── Helper: Get vision issue body by number ──────────────────────────────
@ -361,7 +513,23 @@ Instructions:
 ## Recommendation
 <architect's assessment: worth it / defer / alternative approach>
 ## Sub-issues
 <!-- filer:begin -->
 - id: <kebab-case-id>
  title: \"vision(#${issue_num}): <concise sub-issue title>\"
  labels: [backlog]
  depends_on: []
  body: |
    ## Goal
    <what this sub-issue accomplishes>
    ## Acceptance criteria
    - [ ] <criterion>
 <!-- filer:end -->
 IMPORTANT: Do NOT include design forks or questions. This is a go/no-go pitch.
 The ## Sub-issues block is parsed by the filer-bot pipeline after sprint PR merge.
 Each sub-issue between filer:begin/end markers becomes a Forgejo issue.
 ---
@ -369,12 +537,11 @@ ${pitch_context}
 "
  # Execute stateless claude -p call
-  local pitch_output
+  agent_run "$pitch_prompt" 2>>"$LOGFILE" || true
  pitch_output=$(agent_run -p "$pitch_prompt" --output-format json --dangerously-skip-permissions --max-turns 200 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") || true
  # Extract pitch content from JSON response
  local pitch
-  pitch=$(printf '%s' "$pitch_output" | jq -r '.content // empty' 2>/dev/null) || pitch=""
+  pitch=$(printf '%s' "$_AGENT_LAST_OUTPUT" | jq -r '.result // empty' 2>/dev/null) || pitch=""
  if [ -z "$pitch" ]; then
    log "WARNING: empty pitch generated for vision issue #${issue_num}"
@ -397,7 +564,7 @@ create_sprint_pr() {
  if ! curl -sf -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/branches" \
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/branches" \
    -d "{\"new_branch_name\": \"${branch_name}\", \"old_branch_name\": \"${PRIMARY_BRANCH:-main}\"}" >/dev/null 2>&1; then
    log "WARNING: failed to create branch ${branch_name}"
    return 1
@ -422,7 +589,7 @@ ${sprint_body}
  if ! curl -sf -X PUT \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/contents/sprints/${sprint_slug}.md" \
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/contents/sprints/${sprint_slug}.md" \
    -d "{\"message\": \"sprint: add ${sprint_slug}.md\", \"content\": \"${sprint_spec_b64}\", \"branch\": \"${branch_name}\"}" >/dev/null 2>&1; then
    log "WARNING: failed to write sprint spec file"
    return 1
@ -441,7 +608,7 @@ ${sprint_body}
  pr_response=$(curl -sf -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls" \
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls" \
    -d "$pr_payload" 2>/dev/null) || return 1
  # Extract PR number
@ -461,7 +628,7 @@ post_pr_footer() {
  if curl -sf -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/issues/${pr_number}/comments" \
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/issues/${pr_number}/comments" \
    -d "{\"body\": \"${footer}\"}" >/dev/null 2>&1; then
    log "Posted footer comment on PR #${pr_number}"
    return 0
@ -471,37 +638,8 @@ post_pr_footer() {
  fi
 }
-# ── Helper: Add in-progress label to vision issue ────────────────────────
+# NOTE: add_inprogress_label removed (#764) — architect-bot is read-only on
-# Args: vision_issue_number
+# project repo. in-progress label is now added by filer-bot via sprint-filer.sh.
 add_inprogress_label() {
  local issue_num="$1"
  # Get label ID for 'in-progress'
  local labels_json
  labels_json=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
    "${FORGE_API}/labels" 2>/dev/null) || return 1
  local inprogress_label_id
  inprogress_label_id=$(printf '%s' "$labels_json" | jq -r --arg label "in-progress" '.[] | select(.name == $label) | .id' 2>/dev/null) || true
  if [ -z "$inprogress_label_id" ]; then
    log "WARNING: in-progress label not found"
    return 1
  fi
  # Add label to issue
  if curl -sf -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
    "${FORGE_API}/repos/${FORGE_REPO}/issues/${issue_num}/labels" \
    -d "{\"labels\": [${inprogress_label_id}]}" >/dev/null 2>&1; then
    log "Added in-progress label to vision issue #${issue_num}"
    return 0
  else
    log "WARNING: failed to add in-progress label to vision issue #${issue_num}"
    return 1
  fi
 }
 # ── Precondition checks in bash before invoking the model ─────────────────
@ -511,7 +649,7 @@ vision_count=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
 if [ "${vision_count:-0}" -eq 0 ]; then
  # Check for open architect PRs that need handling (ACCEPT/REJECT responses)
  open_arch_prs=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=10" 2>/dev/null | jq '[.[] | select(.title | startswith("architect:"))] | length') || open_arch_prs=0
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=10" 2>/dev/null | jq '[.[] | select(.title | startswith("architect:"))] | length') || open_arch_prs=0
  if [ "${open_arch_prs:-0}" -eq 0 ]; then
    log "no vision issues and no open architect PRs — skipping"
    exit 0
@ -522,10 +660,18 @@ fi
 # This ensures responses are processed regardless of open_arch_prs count
 has_responses_to_process=false
 pr_numbers=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
-  "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=100" 2>/dev/null | jq -r '.[] | select(.title | startswith("architect:")) | .number') || pr_numbers=""
+  "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=100" 2>/dev/null | jq -r '.[] | select(.title | startswith("architect:")) | .number') || pr_numbers=""
 for pr_num in $pr_numbers; do
  # Check formal reviews first (Forgejo green check via review API)
  reviews=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls/${pr_num}/reviews" 2>/dev/null) || reviews="[]"
  if printf '%s' "$reviews" | jq -e '.[] | select(.state == "APPROVED" or .state == "REQUEST_CHANGES")' >/dev/null 2>&1; then
    has_responses_to_process=true
    break
  fi
  # Then check ACCEPT/REJECT in comments (legacy / human-typed)
  comments=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/issues/${pr_num}/comments" 2>/dev/null) || continue
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/issues/${pr_num}/comments" 2>/dev/null) || continue
  if printf '%s' "$comments" | jq -r '.[].body // empty' | grep -qE '(ACCEPT|REJECT):'; then
    has_responses_to_process=true
    break
@ -534,7 +680,7 @@ done
 # Check 2 (continued): Skip if already at max open pitches (3), unless there are responses to process
 open_arch_prs=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
-  "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=100" 2>/dev/null | jq '[.[] | select(.title | startswith("architect:"))] | length') || open_arch_prs=0
+  "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=100" 2>/dev/null | jq '[.[] | select(.title | startswith("architect:"))] | length') || open_arch_prs=0
 if [ "${open_arch_prs:-0}" -ge 3 ]; then
  if [ "$has_responses_to_process" = false ]; then
    log "already 3 open architect PRs with no responses to process — skipping"
@ -543,6 +689,8 @@ if [ "${open_arch_prs:-0}" -ge 3 ]; then
  log "3 open architect PRs found but responses detected — processing"
 fi
 # NOTE: Vision lifecycle check (close completed visions) moved to filer-bot (#764)
 # ── Bash-driven state management: Select vision issues for pitching ───────
 # This logic is also documented in formulas/run-architect.toml preflight step
@ -555,7 +703,7 @@ declare -A _arch_vision_issues_with_open_prs
 while IFS= read -r pr_num; do
  [ -z "$pr_num" ] && continue
  pr_body=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls/${pr_num}" 2>/dev/null | jq -r '.body // ""') || continue
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls/${pr_num}" 2>/dev/null | jq -r '.body // ""') || continue
  # Extract vision issue numbers referenced in PR body (e.g., "refs #419" or "#419")
  while IFS= read -r ref_issue; do
    [ -z "$ref_issue" ] && continue
@ -677,8 +825,7 @@ for vision_issue in "${ARCHITECT_TARGET_ISSUES[@]}"; do
  # Post footer comment
  post_pr_footer "$pr_number"
-  # Add in-progress label to vision issue
+  # NOTE: in-progress label is added by filer-bot after sprint PR merge (#764)
  add_inprogress_label "$vision_issue"
  pitch_count=$((pitch_count + 1))
  log "Completed pitch for vision issue #${vision_issue} — PR #${pr_number}"
@ -694,16 +841,16 @@ if [ "${has_responses_to_process:-false}" = "true" ]; then
  # Check if any PRs have responses that need agent handling
  needs_agent=false
  pr_numbers=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
-    "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=100" 2>/dev/null | jq -r '.[] | select(.title | startswith("architect:")) | .number') || pr_numbers=""
+    "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls?state=open&limit=100" 2>/dev/null | jq -r '.[] | select(.title | startswith("architect:")) | .number') || pr_numbers=""
  for pr_num in $pr_numbers; do
    # Check for ACCEPT/REJECT in comments
    comments=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
-      "${FORGE_API}/repos/${FORGE_OPS_REPO}/issues/${pr_num}/comments" 2>/dev/null) || continue
+      "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/issues/${pr_num}/comments" 2>/dev/null) || continue
    # Check for review decisions (higher precedence)
    reviews=$(curl -sf -H "Authorization: token $FORGE_TOKEN" \
-      "${FORGE_API}/repos/${FORGE_OPS_REPO}/pulls/${pr_num}/reviews" 2>/dev/null) || reviews=""
+      "${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}/pulls/${pr_num}/reviews" 2>/dev/null) || reviews=""
    # Check for ACCEPT (APPROVED review or ACCEPT comment)
    if printf '%s' "$reviews" | jq -e '.[] | select(.state == "APPROVED")' >/dev/null 2>&1; then
@ -720,19 +867,32 @@ if [ "${has_responses_to_process:-false}" = "true" ]; then
  # Run agent only if there are responses to process
  if [ "$needs_agent" = "true" ]; then
-    # Determine whether to resume session
+    # Determine session handling based on PR state
    RESUME_ARGS=()
-    if detect_questions_phase && [ -f "$SID_FILE" ]; then
+    SESSION_MODE="fresh"
-      RESUME_SESSION=$(cat "$SID_FILE")
+
-      RESUME_ARGS=(--resume "$RESUME_SESSION")
+    if detect_questions_phase; then
-      log "Resuming session from questions phase run: ${RESUME_SESSION:0:12}..."
+      # PR is in questions-awaiting-answers phase — resume from that session
-    elif ! detect_questions_phase; then
+      if [ -f "$SID_FILE" ]; then
        RESUME_SESSION=$(cat "$SID_FILE")
        RESUME_ARGS=(--resume "$RESUME_SESSION")
        SESSION_MODE="questions_phase"
        log "PR in questions-awaiting-answers phase — resuming session: ${RESUME_SESSION:0:12}..."
      else
        log "PR in questions phase but no session file — starting fresh session"
      fi
    elif detect_approved_pending_questions; then
      # PR is approved but awaiting initial design questions — start fresh with special prompt
      SESSION_MODE="start_questions"
      log "PR approved and awaiting initial design questions — starting fresh session"
    else
      log "PR not in questions phase — starting fresh session"
    elif [ ! -f "$SID_FILE" ]; then
      log "No session ID found for questions phase — starting fresh session"
    fi
-    agent_run "${RESUME_ARGS[@]}" --worktree "$WORKTREE" "$PROMPT"
+    # Build prompt with appropriate mode
    PROMPT_FOR_MODE=$(build_architect_prompt_for_mode "$SESSION_MODE")
    agent_run "${RESUME_ARGS[@]}" --worktree "$WORKTREE" "$PROMPT_FOR_MODE"
    log "agent_run complete"
  fi
 fi
--- a/bin/disinto
+++ b/bin/disinto
--- a/dev/AGENTS.md
+++ b/dev/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Dev Agent
 **Role**: Implement issues autonomously — write code, push branches, address
@ -29,12 +29,16 @@ stale checks (vision issues are managed by the architect). If the issue is assig
 `REQUEST_CHANGES`, spawns the dev-agent to address it before setting `BLOCKED_BY_INPROGRESS=true`;
 otherwise just sets blocked. If assigned to another agent, logs and falls through (does not
 block). If no assignee, no open PR, and no agent lock file — removes `in-progress`, adds
-`blocked` with a human-triage comment. **Per-agent open-PR gate**: before starting new work,
+`blocked` with a human-triage comment. **Post-crash self-assigned recovery (#749)**: when the
 issue is self-assigned (this bot) but there is no open PR, dev-poll now checks for a lock
 file (`/tmp/dev-impl-summary-$PROJECT_NAME-$ISSUE_NUM.txt`) AND a remote branch
 (`fix/issue-$ISSUE_NUM`) before declaring "my thread is busy". If neither exists after a cold
 boot, it spawns a fresh dev-agent for recovery instead of looping forever. **Per-agent open-PR gate**: before starting new work,
 filters open waiting PRs to only those assigned to this agent (`$BOT_USER`). Other agents'
 PRs do not block this agent's pipeline (#358, #369). **Pre-lock merge scan own-PRs only**:
 the direct-merge scan only merges PRs whose linked issue is assigned to this agent — skips
 PRs owned by other bot users (#374).
- `dev/dev-agent.sh` — Orchestrator: claims issue, creates worktree + tmux session with interactive `claude`, monitors phase file, injects CI results and review feedback, merges on approval
+- `dev/dev-agent.sh` — Orchestrator: claims issue, creates worktree + tmux session with interactive `claude`, monitors phase file, injects CI results and review feedback, merges on approval. **Launched as a subshell** (`("${SCRIPT_DIR}/dev-agent.sh" ...) &`) — not via `nohup` — to avoid deadlocking the polling loop and review-poll when running in the same container (#693).
 - `dev/phase-test.sh` — Integration test for the phase protocol
 **Environment variables consumed** (via `lib/env.sh` + project TOML):
@ -51,6 +55,12 @@ PRs owned by other bot users (#374).
 **Crash recovery**: on `PHASE:crashed` or non-zero exit, the worktree is **preserved** (not destroyed) for debugging. Location logged. Supervisor housekeeping removes stale crashed worktrees older than 24h.
 **Polling loop isolation (#753)**: `docker/agents/entrypoint.sh` now tracks fast-poll PIDs
 (`FAST_PIDS`) and calls `wait "${FAST_PIDS[@]}"` instead of `wait` (no-args). This means
 long-running dev-agent sessions no longer block the loop from launching the next iteration's
 fast polls — the loop only waits for review-poll and dev-poll (the fast agents), never for
 the dev-agent subprocess itself.
 **Lifecycle**: dev-poll.sh (invoked by polling loop, `check_active dev`) → dev-agent.sh →
 tmux session → phase file drives CI/review loop → merge + `mirror_push()` → close issue.
 On respawn after `PHASE:escalate`, the stale phase file is cleared first so the session
--- a/dev/dev-agent.sh
+++ b/dev/dev-agent.sh
@ -268,8 +268,22 @@ log "forge remote: ${FORGE_REMOTE}"
 # First attempt: fix/issue-N, subsequent: fix/issue-N-1, fix/issue-N-2, etc.
 if [ "$RECOVERY_MODE" = false ]; then
  # Count only branches matching fix/issue-N, fix/issue-N-1, fix/issue-N-2, etc. (exact prefix match)
-  ATTEMPT=$(git ls-remote --heads "$FORGE_REMOTE" "refs/heads/fix/issue-${ISSUE}" 2>/dev/null | grep -c "refs/heads/fix/issue-${ISSUE}$" || echo 0)
+  # Use explicit error handling to avoid silent failure from set -e + pipefail when git ls-remote fails.
-  ATTEMPT=$((ATTEMPT + $(git ls-remote --heads "$FORGE_REMOTE" "refs/heads/fix/issue-${ISSUE}-*" 2>/dev/null | wc -l)))
+  if _lr1=$(git ls-remote --heads "$FORGE_REMOTE" "refs/heads/fix/issue-${ISSUE}" 2>&1); then
    ATTEMPT=$(printf '%s\n' "$_lr1" | grep -c "refs/heads/fix/issue-${ISSUE}$" || true)
  else
    log "WARNING: git ls-remote failed for attempt counting: $_lr1"
    ATTEMPT=0
  fi
  ATTEMPT="${ATTEMPT:-0}"
  if _lr2=$(git ls-remote --heads "$FORGE_REMOTE" "refs/heads/fix/issue-${ISSUE}-*" 2>&1); then
    # Guard on empty to avoid off-by-one: command substitution strips trailing newlines,
    # so wc -l undercounts by 1 when output exists. Re-add newline only if non-empty.
    ATTEMPT=$((ATTEMPT + $( [ -z "$_lr2" ] && echo 0 || printf '%s\n' "$_lr2" | wc -l )))
  else
    log "WARNING: git ls-remote failed for suffix counting: $_lr2"
  fi
  if [ "$ATTEMPT" -gt 0 ]; then
    BRANCH="fix/issue-${ISSUE}-${ATTEMPT}"
  fi
--- a/dev/dev-poll.sh
+++ b/dev/dev-poll.sh
@ -426,6 +426,7 @@ ORPHANS_JSON=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
 ORPHAN_COUNT=$(echo "$ORPHANS_JSON" | jq 'length')
 BLOCKED_BY_INPROGRESS=false
 OTHER_AGENT_INPROGRESS=false
 if [ "$ORPHAN_COUNT" -gt 0 ]; then
  ISSUE_NUM=$(echo "$ORPHANS_JSON" | jq -r '.[0].number')
@ -438,12 +439,14 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
    OPEN_PR=true
  fi
-  # Skip vision-labeled issues — they are managed by architect agent, not dev-poll
+  # Skip issues owned by non-dev agents (bug-report, vision, prediction, etc.)
  # See issue #608: dev-poll must only touch issues it could actually claim.
  issue_labels=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
    "${API}/issues/${ISSUE_NUM}" | jq -r '[.labels[].name] | join(",")')
-  if echo "$issue_labels" | grep -q "vision"; then
+  if ! issue_is_dev_claimable "$issue_labels"; then
-    log "issue #${ISSUE_NUM} has 'vision' label — skipping stale detection (managed by architect)"
+    log "issue #${ISSUE_NUM} has non-dev label(s) [${issue_labels}] — skipping (owned by another agent)"
-    BLOCKED_BY_INPROGRESS=true
+    BLOCKED_BY_INPROGRESS=false
    OTHER_AGENT_INPROGRESS=true
  fi
  # Check if issue has an assignee — only block on issues assigned to this agent
@ -465,7 +468,7 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
        if [ "${HAS_CHANGES:-0}" -gt 0 ]; then
          log "issue #${ISSUE_NUM} has review feedback — spawning agent"
-          nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
+          ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
          log "started dev-agent PID $! for issue #${ISSUE_NUM} (review fix)"
          BLOCKED_BY_INPROGRESS=true
        else
@ -473,18 +476,29 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
          BLOCKED_BY_INPROGRESS=true
        fi
      else
-        log "issue #${ISSUE_NUM} assigned to me — my thread is busy"
+        # No open PR — check if a thread is actually alive (lock file or remote branch)
-        BLOCKED_BY_INPROGRESS=true
+        LOCK_FILE="/tmp/dev-impl-summary-${PROJECT_NAME}-${ISSUE_NUM}.txt"
        REMOTE_BRANCH_EXISTS=$(git ls-remote --exit-code origin "fix/issue-${ISSUE_NUM}" >/dev/null 2>&1 && echo yes || echo no)
        if [ -f "$LOCK_FILE" ] || [ "$REMOTE_BRANCH_EXISTS" = "yes" ]; then
          log "issue #${ISSUE_NUM} assigned to me — my thread is busy (lock=$([ -f "$LOCK_FILE" ] && echo y || echo n) remote_branch=$REMOTE_BRANCH_EXISTS)"
          BLOCKED_BY_INPROGRESS=true
        else
          log "issue #${ISSUE_NUM} self-assigned but orphaned (no lock, no branch, no PR) — recovering"
          nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
          log "started dev-agent PID $! for issue #${ISSUE_NUM} (post-crash recovery)"
          BLOCKED_BY_INPROGRESS=true
        fi
      fi
    else
      log "issue #${ISSUE_NUM} assigned to ${assignee} — their thread, not blocking"
-      BLOCKED_BY_INPROGRESS=true
+      OTHER_AGENT_INPROGRESS=true
-      # Issue assigned to another agent — don't block, fall through to backlog
+      # Issue assigned to another agent — skip stale checks but fall through to backlog
    fi
  fi
-  # Only proceed with in-progress checks if not blocked by another agent
+  # Only proceed with in-progress checks if not blocked by this agent's own work
-  if [ "$BLOCKED_BY_INPROGRESS" = false ]; then
+  if [ "$BLOCKED_BY_INPROGRESS" = false ] && [ "$OTHER_AGENT_INPROGRESS" = false ]; then
    # Check for dev-agent lock file (agent may be running in another container)
    LOCK_FILE="/tmp/dev-impl-summary-${PROJECT_NAME}-${ISSUE_NUM}.txt"
    if [ -f "$LOCK_FILE" ]; then
@ -504,20 +518,6 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
      fi
    fi
    # Formula guard: formula-labeled issues should not be worked on by dev-agent.
    # Remove in-progress label and skip to prevent infinite respawn cycle (#115).
    if [ "$BLOCKED_BY_INPROGRESS" = false ]; then
      ORPHAN_LABELS=$(echo "$ORPHANS_JSON" | jq -r '.[0].labels[].name' 2>/dev/null) || true
      SKIP_LABEL=$(echo "$ORPHAN_LABELS" | grep -oE '^(formula|prediction/dismissed|prediction/unreviewed)$' | head -1) || true
      if [ -n "$SKIP_LABEL" ]; then
        log "issue #${ISSUE_NUM} has '${SKIP_LABEL}' label — removing in-progress, skipping"
        IP_ID=$(_ilc_in_progress_id)
        curl -sf -X DELETE -H "Authorization: token ${FORGE_TOKEN}" \
          "${API}/issues/${ISSUE_NUM}/labels/${IP_ID}" >/dev/null 2>&1 || true
        BLOCKED_BY_INPROGRESS=true
      fi
    fi
    # Check if there's already an open PR for this issue
    if [ "$BLOCKED_BY_INPROGRESS" = false ]; then
      HAS_PR=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
@ -571,7 +571,7 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
            else
              # Direct merge failed (conflicts?) — fall back to dev-agent
              log "falling back to dev-agent for PR #${HAS_PR} merge"
-              nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
+              ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
              log "started dev-agent PID $! for issue #${ISSUE_NUM} (agent-merge)"
              BLOCKED_BY_INPROGRESS=true
            fi
@ -589,7 +589,7 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
              BLOCKED_BY_INPROGRESS=false
            else
              log "issue #${ISSUE_NUM} PR #${HAS_PR} has REQUEST_CHANGES — spawning agent"
-              nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
+              ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
              log "started dev-agent PID $! for issue #${ISSUE_NUM} (review fix)"
              BLOCKED_BY_INPROGRESS=true
            fi
@ -604,7 +604,7 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
                BLOCKED_BY_INPROGRESS=true  # exhausted between check and launch
              else
                log "issue #${ISSUE_NUM} PR #${HAS_PR} CI failed — spawning agent to fix (attempt ${CI_FIX_ATTEMPTS}/3)"
-                nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
+                ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
                log "started dev-agent PID $! for issue #${ISSUE_NUM} (CI fix)"
                BLOCKED_BY_INPROGRESS=true
              fi
@ -630,7 +630,7 @@ if [ "$ORPHAN_COUNT" -gt 0 ]; then
          # Don't block — fall through to backlog
        else
          log "recovering orphaned issue #${ISSUE_NUM} (no PR found, assigned to ${BOT_USER:-unassigned})"
-          nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
+          ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
          log "started dev-agent PID $! for issue #${ISSUE_NUM} (recovery)"
          BLOCKED_BY_INPROGRESS=true
        fi
@ -697,7 +697,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
    fi
    # Direct merge failed (conflicts?) — fall back to dev-agent
    log "falling back to dev-agent for PR #${PR_NUM} merge"
-    nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
+    ("${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1) &
    log "started dev-agent PID $! for stuck PR #${PR_NUM} (agent-merge)"
    exit 0
  fi
@ -718,7 +718,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
      continue  # skip this PR, check next stuck PR or fall through to backlog
    fi
    log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) has REQUEST_CHANGES — fixing first"
-    nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
+    ("${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1) &
    log "started dev-agent PID $! for stuck PR #${PR_NUM}"
    exit 0
  elif ci_failed "$CI_STATE"; then
@ -730,7 +730,7 @@ for i in $(seq 0 $(($(echo "$OPEN_PRS" | jq 'length') - 1))); do
      continue  # exhausted between check and launch
    fi
    log "PR #${PR_NUM} (issue #${STUCK_ISSUE}) CI failed — fixing (attempt ${CI_FIX_ATTEMPTS}/3)"
-    nohup "${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1 &
+    ("${SCRIPT_DIR}/dev-agent.sh" "$STUCK_ISSUE" >> "$LOGFILE" 2>&1) &
    log "started dev-agent PID $! for stuck PR #${PR_NUM}"
    exit 0
  fi
@ -851,7 +851,7 @@ for i in $(seq 0 $((BACKLOG_COUNT - 1))); do
      fi
      # Direct merge failed (conflicts?) — fall back to dev-agent
      log "falling back to dev-agent for PR #${EXISTING_PR} merge"
-      nohup "${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1 &
+      ("${SCRIPT_DIR}/dev-agent.sh" "$ISSUE_NUM" >> "$LOGFILE" 2>&1) &
      log "started dev-agent PID $! for issue #${ISSUE_NUM} (agent-merge)"
      exit 0
@ -929,5 +929,5 @@ if [ -n "${READY_PR_FOR_INCREMENT:-}" ]; then
 fi
 log "launching dev-agent for #${READY_ISSUE}"
-nohup "${SCRIPT_DIR}/dev-agent.sh" "$READY_ISSUE" >> "$LOGFILE" 2>&1 &
+("${SCRIPT_DIR}/dev-agent.sh" "$READY_ISSUE" >> "$LOGFILE" 2>&1) &
 log "started dev-agent PID $! for issue #${READY_ISSUE}"
--- a/disinto-factory/SKILL.md
+++ b/disinto-factory/SKILL.md
@ -11,7 +11,6 @@ You are helping the user set up and operate a **disinto autonomous code factory*
 - **[Setup guide](setup.md)** — First-time factory setup: environment, init, verification, backlog seeding
 - **[Operations guide](operations.md)** — Day-to-day: status checks, CI debugging, unsticking issues, Forgejo access
 - **[Lessons learned](lessons-learned.md)** — Patterns for writing issues, debugging CI, retrying failures, vault operations, breaking down features
 ## Important context
--- a/disinto-factory/lessons-learned.md
+++ b/disinto-factory/lessons-learned.md
@ -1,35 +0,0 @@
 # Lessons learned
 ## Remediation & deployment
 **Escalate gradually.** Cheapest fix first, re-measure, escalate only if it persists. Single-shot fixes are either too weak or cause collateral damage.
 **Parameterize deployment boundaries.** Entrypoint references to a specific project name are config values waiting to escape. `${VAR:-default}` preserves compat and unlocks reuse.
 **Fail loudly over silent defaults.** A fatal error with a clear message beats a wrong default that appears to work.
 **Audit the whole file when fixing one value.** Hardcoded assumptions cluster. Fixing one while leaving siblings produces multi-commit churn.
 ## Documentation
 **Per-context rewrites, not batch replacement.** Each doc mention sits in a different narrative. Blanket substitution produces awkward text.
 **Search for implicit references too.** After keyword matches, check for instructions that assume the old mechanism without naming it.
 ## Code review
 **Approval means "safe to ship," not "how I'd write it."** Distinguish "wrong" from "different" — only the former blocks.
 **Scale scrutiny to blast radius.** A targeted fix warrants less ceremony than a cross-cutting refactor.
 **Be specific; separate blockers from preferences.** Concrete observations invite fixes; vague concerns invite debate.
 **Read diffs top-down: intent, behavior, edge cases.** Verify the change matches its stated goal before examining lines.
 ## Issue authoring & retry
 **Self-contained issue bodies.** The agent reads the body, not comments. On retry, update the body with exact error and fix guidance.
 **Clean stale branches before retry.** Old branches trigger recovery on stale code. Close PR, delete branch, relabel.
 **Diagnose CI failures externally.** The agent sees pass/fail, not logs. After repeated failures, read logs yourself and put findings in the issue.
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -7,29 +7,62 @@ services:
      dockerfile: docker/agents/Dockerfile
    image: disinto/agents:latest
    container_name: disinto-agents
    restart: unless-stopped
    security_opt:
      - apparmor=unconfined
    volumes:
-      - ./data/agents:/home/agent/data
+      - agent-data:/home/agent/data
-      - ./disinto:/home/agent/disinto:ro
+      - project-repos:/home/agent/repos
-      - /usr/local/bin/claude:/usr/local/bin/claude:ro
+      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
      - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
      - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
      - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
      - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
      - woodpecker-data:/woodpecker-data:ro
    environment:
      - FORGE_URL=http://forgejo:3000
      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
      - FORGE_TOKEN=${FORGE_TOKEN:-}
      - FORGE_REVIEW_TOKEN=${FORGE_REVIEW_TOKEN:-}
      - FORGE_PLANNER_TOKEN=${FORGE_PLANNER_TOKEN:-}
      - FORGE_GARDENER_TOKEN=${FORGE_GARDENER_TOKEN:-}
      - FORGE_VAULT_TOKEN=${FORGE_VAULT_TOKEN:-}
      - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-}
      - FORGE_PREDICTOR_TOKEN=${FORGE_PREDICTOR_TOKEN:-}
      - FORGE_ARCHITECT_TOKEN=${FORGE_ARCHITECT_TOKEN:-}
-      - FORGE_VAULT_TOKEN=${FORGE_VAULT_TOKEN:-}
+      - FORGE_FILER_TOKEN=${FORGE_FILER_TOKEN:-}
      - FORGE_PLANNER_TOKEN=${FORGE_PLANNER_TOKEN:-}
      - FORGE_BOT_USERNAMES=${FORGE_BOT_USERNAMES:-}
      - WOODPECKER_TOKEN=${WOODPECKER_TOKEN:-}
      - CLAUDE_TIMEOUT=${CLAUDE_TIMEOUT:-7200}
      - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - FORGE_PASS=${FORGE_PASS:-}
      - FORGE_ADMIN_PASS=${FORGE_ADMIN_PASS:-}
-      - DISINTO_AGENTS=review,gardener
+      - FACTORY_REPO=${FORGE_REPO:-disinto-admin/disinto}
      - DISINTO_CONTAINER=1
      - PROJECT_NAME=${PROJECT_NAME:-project}
      - PROJECT_REPO_ROOT=/home/agent/repos/${PROJECT_NAME:-project}
      - WOODPECKER_DATA_DIR=/woodpecker-data
      - WOODPECKER_REPO_ID=${WOODPECKER_REPO_ID:-}
      - CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
      - POLL_INTERVAL=${POLL_INTERVAL:-300}
      - GARDENER_INTERVAL=${GARDENER_INTERVAL:-21600}
      - ARCHITECT_INTERVAL=${ARCHITECT_INTERVAL:-21600}
      - PLANNER_INTERVAL=${PLANNER_INTERVAL:-43200}
      - SUPERVISOR_INTERVAL=${SUPERVISOR_INTERVAL:-1200}
    healthcheck:
      test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s
    depends_on:
-      - forgejo
+      forgejo:
        condition: service_healthy
      woodpecker:
        condition: service_started
    networks:
      - disinto-net
  agents-llama:
    build:
@ -37,12 +70,21 @@ services:
      dockerfile: docker/agents/Dockerfile
    image: disinto/agents-llama:latest
    container_name: disinto-agents-llama
    restart: unless-stopped
    security_opt:
      - apparmor=unconfined
    volumes:
-      - ./data/llama:/home/agent/data
+      - agent-data:/home/agent/data
-      - ./disinto:/home/agent/disinto:ro
+      - project-repos:/home/agent/repos
-      - /usr/local/bin/claude:/usr/local/bin/claude:ro
+      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
      - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
      - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
      - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
      - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
      - woodpecker-data:/woodpecker-data:ro
    environment:
      - FORGE_URL=http://forgejo:3000
      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
      - FORGE_TOKEN=${FORGE_TOKEN_LLAMA:-}
      - FORGE_PASS=${FORGE_PASS_LLAMA:-}
      - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-}
@ -59,34 +101,96 @@ services:
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL:-}
      - FORGE_ADMIN_PASS=${FORGE_ADMIN_PASS:-}
-      - DISINTO_AGENTS=dev
+      - DISINTO_CONTAINER=1
      - PROJECT_TOML=projects/disinto.toml
-      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
+      - PROJECT_NAME=${PROJECT_NAME:-project}
      - PROJECT_REPO_ROOT=/home/agent/repos/${PROJECT_NAME:-project}
      - WOODPECKER_DATA_DIR=/woodpecker-data
      - WOODPECKER_REPO_ID=${WOODPECKER_REPO_ID:-}
      - CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
      - POLL_INTERVAL=${POLL_INTERVAL:-300}
      - AGENT_ROLES=dev
    healthcheck:
      test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s
    depends_on:
-      - forgejo
+      forgejo:
-
+        condition: service_healthy
-  runner:
+      woodpecker:
-    image: disinto/agents:latest
+        condition: service_started
    profiles: ["runner"]
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
      - /usr/local/bin/claude:/usr/local/bin/claude:ro
      - ${HOME}/.claude:/home/agent/.claude
      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
    entrypoint: ["bash", "/home/agent/disinto/docker/runner/entrypoint-runner.sh"]
    environment:
      - DISINTO_CONTAINER=1
      - FORGE_URL=${FORGE_URL:-}
      - FORGE_TOKEN=${FORGE_TOKEN:-}
      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
      - FORGE_OPS_REPO=${FORGE_OPS_REPO:-}
      - PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - CLAUDE_MODEL=${CLAUDE_MODEL:-}
    networks:
-      - default
+      - disinto-net
  agents-llama-all:
    build:
      context: .
      dockerfile: docker/agents/Dockerfile
    image: disinto/agents-llama:latest
    container_name: disinto-agents-llama-all
    restart: unless-stopped
    profiles: ["agents-llama-all"]
    security_opt:
      - apparmor=unconfined
    volumes:
      - agent-data:/home/agent/data
      - project-repos:/home/agent/repos
      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
      - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
      - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
      - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
      - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
      - woodpecker-data:/woodpecker-data:ro
    environment:
      - FORGE_URL=http://forgejo:3000
      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
      - FORGE_TOKEN=${FORGE_TOKEN_LLAMA:-}
      - FORGE_PASS=${FORGE_PASS_LLAMA:-}
      - FORGE_REVIEW_TOKEN=${FORGE_REVIEW_TOKEN:-}
      - FORGE_PLANNER_TOKEN=${FORGE_PLANNER_TOKEN:-}
      - FORGE_GARDENER_TOKEN=${FORGE_GARDENER_TOKEN:-}
      - FORGE_VAULT_TOKEN=${FORGE_VAULT_TOKEN:-}
      - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-}
      - FORGE_PREDICTOR_TOKEN=${FORGE_PREDICTOR_TOKEN:-}
      - FORGE_ARCHITECT_TOKEN=${FORGE_ARCHITECT_TOKEN:-}
      - FORGE_FILER_TOKEN=${FORGE_FILER_TOKEN:-}
      - FORGE_BOT_USERNAMES=${FORGE_BOT_USERNAMES:-}
      - WOODPECKER_TOKEN=${WOODPECKER_TOKEN:-}
      - CLAUDE_TIMEOUT=${CLAUDE_TIMEOUT:-7200}
      - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
      - CLAUDE_AUTOCOMPACT_PCT_OVERRIDE=60
      - CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS=1
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL:-}
      - FORGE_ADMIN_PASS=${FORGE_ADMIN_PASS:-}
      - DISINTO_CONTAINER=1
      - PROJECT_TOML=projects/disinto.toml
      - PROJECT_NAME=${PROJECT_NAME:-project}
      - PROJECT_REPO_ROOT=/home/agent/repos/${PROJECT_NAME:-project}
      - WOODPECKER_DATA_DIR=/woodpecker-data
      - WOODPECKER_REPO_ID=${WOODPECKER_REPO_ID:-}
      - CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
      - POLL_INTERVAL=${POLL_INTERVAL:-300}
      - GARDENER_INTERVAL=${GARDENER_INTERVAL:-21600}
      - ARCHITECT_INTERVAL=${ARCHITECT_INTERVAL:-21600}
      - PLANNER_INTERVAL=${PLANNER_INTERVAL:-43200}
      - SUPERVISOR_INTERVAL=${SUPERVISOR_INTERVAL:-1200}
      - AGENT_ROLES=review,dev,gardener,architect,planner,predictor,supervisor
    healthcheck:
      test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s
    depends_on:
      forgejo:
        condition: service_healthy
      woodpecker:
        condition: service_started
    networks:
      - disinto-net
  reproduce:
    build:
@ -99,9 +203,9 @@ services:
      - /var/run/docker.sock:/var/run/docker.sock
      - agent-data:/home/agent/data
      - project-repos:/home/agent/repos
-      - ${HOME}/.claude:/home/agent/.claude
+      - ${CLAUDE_DIR:-${HOME}/.claude}:/home/agent/.claude
-      - /usr/local/bin/claude:/usr/local/bin/claude:ro
+      - ${CLAUDE_BIN_DIR:-/usr/local/bin/claude}:/usr/local/bin/claude:ro
-      - ${HOME}/.ssh:/home/agent/.ssh:ro
+      - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
    env_file:
      - .env
@ -111,42 +215,72 @@ services:
      dockerfile: Dockerfile
    image: disinto/edge:latest
    container_name: disinto-edge
    security_opt:
      - apparmor=unconfined
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
-      - /usr/local/bin/claude:/usr/local/bin/claude:ro
+      - ${CLAUDE_BIN_DIR:-/usr/local/bin/claude}:/usr/local/bin/claude:ro
-      - ${HOME}/.claude:/home/agent/.claude
+      - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/root/.claude.json:ro
-      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
+      - ${CLAUDE_DIR:-${HOME}/.claude}:/root/.claude:ro
      - disinto-logs:/opt/disinto-logs
      - ./docker-compose.yml:/opt/docker-compose.yml:ro
      - ./projects:/opt/disinto-projects:ro
    environment:
      - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-}
      - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
      - CLAUDE_MODEL=claude-sonnet-4-6
      - FORGE_TOKEN=${FORGE_TOKEN:-}
      - FORGE_URL=http://forgejo:3000
      - FORGE_REPO=disinto-admin/disinto
      - FORGE_OPS_REPO=disinto-admin/disinto-ops
      - PRIMARY_BRANCH=main
      - DISINTO_CONTAINER=1
-      - HOST_PROJECT_DIR=${HOST_PROJECT_DIR:-.}
+      - FORGE_ADMIN_USERS=disinto-admin,vault-bot,admin
      - PROJECTS_DIR=/opt/disinto-projects
    ports:
      - "80:80"
      - "443:443"
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://localhost:2019/config/"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 15s
    depends_on:
      - forgejo
    networks:
      - disinto-net
  forgejo:
-    image: codeberg.org/forgejo/forgejo:1
+    image: codeberg.org/forgejo/forgejo:11.0
    container_name: disinto-forgejo
    restart: unless-stopped
    security_opt:
      - apparmor=unconfined
    volumes:
-      - ./data/forgejo:/data
+      - forgejo-data:/data
    environment:
      - FORGEJO__database__DB_TYPE=sqlite3
-      - FORGEJO__service__REGISTER_EMAIL_CONFIRMATION=false
+      - FORGEJO__server__ROOT_URL=http://forgejo:3000/
-      - FORGEJO__service__ENABLE_NOTIFY_MAIL=false
+      - FORGEJO__server__HTTP_PORT=3000
      - FORGEJO__security__INSTALL_LOCK=true
      - FORGEJO__service__DISABLE_REGISTRATION=true
-      - FORGEJO__service__REQUIRE_SIGNIN_VIEW=true
+      - FORGEJO__webhook__ALLOWED_HOST_LIST=private
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:3000/api/v1/version"]
      interval: 5s
      timeout: 3s
      retries: 30
      start_period: 30s
    ports:
      - "3000:3000"
    networks:
      - disinto-net
 volumes:
  disinto-logs:
  agent-data:
  project-repos:
  woodpecker-data:
  forgejo-data:
 networks:
  disinto-net:
    driver: bridge
--- a/docker/agents/Dockerfile
+++ b/docker/agents/Dockerfile
@ -28,6 +28,9 @@ RUN chmod +x /entrypoint.sh
 # Entrypoint runs polling loop directly, dropping to agent user via gosu.
 # All scripts execute as the agent user (UID 1000) while preserving env vars.
 VOLUME /home/agent/data
 VOLUME /home/agent/repos
 WORKDIR /home/agent/disinto
 ENTRYPOINT ["/entrypoint.sh"]
--- a/docker/agents/entrypoint.sh
+++ b/docker/agents/entrypoint.sh
@ -7,18 +7,24 @@ set -euo pipefail
 # poll scripts.  All Docker Compose env vars are inherited (PATH, FORGE_TOKEN,
 # ANTHROPIC_API_KEY, etc.).
 #
-# AGENT_ROLES env var controls which scripts run: "review,dev,gardener,architect,planner,predictor"
+# AGENT_ROLES env var controls which scripts run: "review,dev,gardener,architect,planner,predictor,supervisor"
-# (default: all six). Uses while-true loop with staggered intervals:
+# (default: all seven). Uses while-true loop with staggered intervals:
 #   - review-poll: every 5 minutes (offset by 0s)
 #   - dev-poll: every 5 minutes (offset by 2 minutes)
-#   - gardener: every 6 hours (72 iterations * 5 min)
+#   - gardener: every GARDENER_INTERVAL seconds (default: 21600 = 6 hours)
-#   - architect: every 6 hours (same as gardener)
+#   - architect: every ARCHITECT_INTERVAL seconds (default: 21600 = 6 hours)
-#   - planner: every 12 hours (144 iterations * 5 min)
+#   - planner: every PLANNER_INTERVAL seconds (default: 43200 = 12 hours)
 #   - predictor: every 24 hours (288 iterations * 5 min)
 #   - supervisor: every SUPERVISOR_INTERVAL seconds (default: 1200 = 20 min)
-DISINTO_DIR="/home/agent/disinto"
+DISINTO_BAKED="/home/agent/disinto"
 DISINTO_LIVE="/home/agent/repos/_factory"
 DISINTO_DIR="$DISINTO_BAKED"  # start with baked copy; switched to live checkout after bootstrap
 LOGFILE="/home/agent/data/agent-entrypoint.log"
-mkdir -p /home/agent/data/logs
+
 # Create all expected log subdirectories and set ownership as root before dropping to agent.
 # This handles both fresh volumes and stale root-owned dirs from prior container runs.
 mkdir -p /home/agent/data/logs/{dev,action,review,supervisor,vault,site,metrics,gardener,planner,predictor,architect,dispatcher}
 chown -R agent:agent /home/agent/data
 log() {
@ -37,42 +43,46 @@ init_state_dir() {
  log "Initialized state directory"
 }
-# Configure git credential helper for password-based HTTP auth.
+# Source shared git credential helper library (#604).
-# Forgejo 11.x rejects API tokens for git push (#361); password auth works.
+# shellcheck source=lib/git-creds.sh
-# This ensures all git operations (clone, fetch, push) from worktrees use
+source "${DISINTO_BAKED}/lib/git-creds.sh"
-# password auth without needing tokens embedded in remote URLs.
+
-configure_git_creds() {
+# Wrapper that calls the shared configure_git_creds with agent-specific paths,
 # then repairs any legacy baked-credential URLs in existing clones.
 _setup_git_creds() {
  _GIT_CREDS_LOG_FN=log configure_git_creds "/home/agent" "gosu agent"
  if [ -n "${FORGE_PASS:-}" ] && [ -n "${FORGE_URL:-}" ]; then
-    _forge_host=$(printf '%s' "$FORGE_URL" | sed 's|https\?://||; s|/.*||')
+    log "Git credential helper configured (password auth)"
    _forge_proto=$(printf '%s' "$FORGE_URL" | sed 's|://.*||')
    # Determine the bot username from FORGE_TOKEN identity (or default to dev-bot)
    _bot_user=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
      "${FORGE_URL}/api/v1/user" 2>/dev/null | jq -r '.login // empty') || _bot_user=""
    _bot_user="${_bot_user:-dev-bot}"
    # Write a static credential helper script (git credential protocol)
    cat > /home/agent/.git-credentials-helper <<CREDEOF
 #!/bin/sh
 # Auto-generated git credential helper for Forgejo password auth (#361)
 # Only respond to "get" action; ignore "store" and "erase".
 [ "\$1" = "get" ] || exit 0
 # Read and discard stdin (git sends protocol/host info)
 cat >/dev/null
 echo "protocol=${_forge_proto}"
 echo "host=${_forge_host}"
 echo "username=${_bot_user}"
 echo "password=${FORGE_PASS}"
 CREDEOF
    chmod 755 /home/agent/.git-credentials-helper
    chown agent:agent /home/agent/.git-credentials-helper
    gosu agent bash -c "git config --global credential.helper '/home/agent/.git-credentials-helper'"
    log "Git credential helper configured for ${_bot_user}@${_forge_host} (password auth)"
  fi
-  # Set safe.directory to work around dubious ownership after container restart
+  # Repair legacy clones with baked-in stale credentials (#604).
-  # (https://github.com/disinto-admin/disinto/issues/517)
+  _GIT_CREDS_LOG_FN=log repair_baked_cred_urls --as "gosu agent" /home/agent/repos
-  gosu agent bash -c "git config --global --add safe.directory '*'"
+}
 # Configure git author identity for commits made by this container.
 # Derives identity from the resolved bot user (BOT_USER) to ensure commits
 # are visibly attributable to the correct bot in the forge timeline.
 # BOT_USER is normally set by configure_git_creds() (#741); this function
 # only falls back to its own API call if BOT_USER was not already resolved.
 configure_git_identity() {
  # Resolve BOT_USER from FORGE_TOKEN if not already set (configure_git_creds
  # exports BOT_USER on success, so this is a fallback for edge cases only).
  if [ -z "${BOT_USER:-}" ] && [ -n "${FORGE_TOKEN:-}" ]; then
    BOT_USER=$(curl -sf --max-time 10 \
      -H "Authorization: token ${FORGE_TOKEN}" \
      "${FORGE_URL:-http://localhost:3000}/api/v1/user" 2>/dev/null | jq -r '.login // empty') || true
  fi
  if [ -z "${BOT_USER:-}" ]; then
    log "WARNING: Could not resolve bot username for git identity — commits will use fallback"
    BOT_USER="agent"
  fi
  # Configure git identity for all repositories
  gosu agent git config --global user.name "${BOT_USER}"
  gosu agent git config --global user.email "${BOT_USER}@disinto.local"
  log "Git identity configured: ${BOT_USER} <${BOT_USER}@disinto.local>"
 }
 # Configure tea CLI login for forge operations (runs as agent user).
@ -97,8 +107,16 @@ configure_tea_login() {
 log "Agent container starting"
-# Set USER for scripts that source lib/env.sh (e.g., OPS_REPO_ROOT default)
+# Set USER and HOME for scripts that source lib/env.sh.
 # These are preconditions required by lib/env.sh's surface contract.
 # gosu agent inherits the parent's env, so exports here propagate to all children.
 export USER=agent
 export HOME=/home/agent
 # Source lib/env.sh to get DISINTO_LOG_DIR and other shared environment.
 # This must happen after USER/HOME are set (env.sh preconditions).
 # shellcheck source=lib/env.sh
 source "${DISINTO_BAKED}/lib/env.sh"
 # Verify Claude CLI is available (expected via volume mount from host).
 if ! command -v claude &>/dev/null; then
@ -115,29 +133,216 @@ log "Claude CLI: $(claude --version 2>&1 || true)"
 # auth method is active so operators can debug 401s.
 if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
  log "Auth: ANTHROPIC_API_KEY is set — using API key (no OAuth rotation)"
-elif [ -f /home/agent/.claude/credentials.json ]; then
+elif [ -f "${CLAUDE_CONFIG_DIR:-/home/agent/.claude}/.credentials.json" ]; then
-  log "Auth: OAuth credentials mounted from host (~/.claude)"
+  log "Auth: OAuth credentials mounted from host (${CLAUDE_CONFIG_DIR:-~/.claude})"
 else
  log "WARNING: No ANTHROPIC_API_KEY and no OAuth credentials found."
  log "Run 'claude auth login' on the host, or set ANTHROPIC_API_KEY in .env"
 fi
 # Bootstrap ops repos for each project TOML (#586).
 # In compose mode the ops repo lives on a Docker named volume at
 # /home/agent/repos/<project>-ops.  If init ran migrate_ops_repo on the host
 # the container never saw those changes.  This function clones from forgejo
 # when the repo is missing, or configures the remote and pulls when it exists
 # but has no remote (orphaned local-only checkout).
 bootstrap_ops_repos() {
  local repos_dir="/home/agent/repos"
  mkdir -p "$repos_dir"
  chown agent:agent "$repos_dir"
  for toml in "${DISINTO_DIR}"/projects/*.toml; do
    [ -f "$toml" ] || continue
    # Extract project name, ops repo slug, repo slug, and primary branch from TOML
    local project_name ops_slug primary_branch
    local _toml_vals
    _toml_vals=$(python3 -c "
 import tomllib, sys
 with open(sys.argv[1], 'rb') as f:
    cfg = tomllib.load(f)
 print(cfg.get('name', ''))
 print(cfg.get('ops_repo', ''))
 print(cfg.get('repo', ''))
 print(cfg.get('primary_branch', 'main'))
 " "$toml" 2>/dev/null || true)
    project_name=$(sed -n '1p' <<< "$_toml_vals")
    [ -n "$project_name" ] || continue
    ops_slug=$(sed -n '2p' <<< "$_toml_vals")
    local repo_slug
    repo_slug=$(sed -n '3p' <<< "$_toml_vals")
    primary_branch=$(sed -n '4p' <<< "$_toml_vals")
    primary_branch="${primary_branch:-main}"
    # Fall back to convention if ops_repo not in TOML
    if [ -z "$ops_slug" ]; then
      if [ -n "$repo_slug" ]; then
        ops_slug="${repo_slug}-ops"
      else
        ops_slug="disinto-admin/${project_name}-ops"
      fi
    fi
    local ops_root="${repos_dir}/${project_name}-ops"
    local remote_url="${FORGE_URL}/${ops_slug}.git"
    if [ ! -d "${ops_root}/.git" ]; then
      # Clone ops repo from forgejo
      log "Ops bootstrap: cloning ${ops_slug} -> ${ops_root}"
      if gosu agent git clone --quiet "$remote_url" "$ops_root" 2>/dev/null; then
        log "Ops bootstrap: ${ops_slug} cloned successfully"
      else
        # Remote may not exist yet (first run before init); create empty repo
        log "Ops bootstrap: clone failed for ${ops_slug} — initializing empty repo"
        gosu agent bash -c "
          mkdir -p '${ops_root}' && \
          git -C '${ops_root}' init --initial-branch='${primary_branch}' -q && \
          git -C '${ops_root}' remote add origin '${remote_url}'
        "
      fi
    else
      # Repo exists — ensure remote is configured and pull latest
      local current_remote
      current_remote=$(git -C "$ops_root" remote get-url origin 2>/dev/null || true)
      if [ -z "$current_remote" ]; then
        log "Ops bootstrap: adding missing remote to ${ops_root}"
        gosu agent git -C "$ops_root" remote add origin "$remote_url"
      elif [ "$current_remote" != "$remote_url" ]; then
        log "Ops bootstrap: fixing remote URL in ${ops_root}"
        gosu agent git -C "$ops_root" remote set-url origin "$remote_url"
      fi
      # Pull latest from forgejo to pick up any host-side migrations
      log "Ops bootstrap: pulling latest for ${project_name}-ops"
      gosu agent bash -c "
        cd '${ops_root}' && \
        git fetch origin '${primary_branch}' --quiet 2>/dev/null && \
        git reset --hard 'origin/${primary_branch}' --quiet 2>/dev/null
      " || log "Ops bootstrap: pull failed for ${ops_slug} (remote may not exist yet)"
    fi
  done
 }
 # Bootstrap the factory (disinto) repo from Forgejo into the project-repos
 # volume so the entrypoint runs from a live git checkout that receives
 # updates via `git pull`, not the stale baked copy from `COPY .` (#593).
 bootstrap_factory_repo() {
  local repo="${FACTORY_REPO:-}"
  if [ -z "$repo" ]; then
    log "Factory bootstrap: FACTORY_REPO not set — running from baked copy"
    return 0
  fi
  local remote_url="${FORGE_URL}/${repo}.git"
  local primary_branch="${PRIMARY_BRANCH:-main}"
  if [ ! -d "${DISINTO_LIVE}/.git" ]; then
    log "Factory bootstrap: cloning ${repo} -> ${DISINTO_LIVE}"
    if gosu agent git clone --quiet --branch "$primary_branch" "$remote_url" "$DISINTO_LIVE" 2>&1; then
      log "Factory bootstrap: cloned successfully"
    else
      log "Factory bootstrap: clone failed — running from baked copy"
      return 0
    fi
  else
    log "Factory bootstrap: pulling latest ${repo}"
    gosu agent bash -c "
      cd '${DISINTO_LIVE}' && \
      git fetch origin '${primary_branch}' --quiet 2>/dev/null && \
      git reset --hard 'origin/${primary_branch}' --quiet 2>/dev/null
    " || log "Factory bootstrap: pull failed — using existing checkout"
  fi
  # Copy project TOMLs from baked dir — they are gitignored AND docker-ignored,
  # so neither the image nor the clone normally contains them.  If the baked
  # copy has any (e.g. operator manually placed them), propagate them.
  if compgen -G "${DISINTO_BAKED}/projects/*.toml" >/dev/null 2>&1; then
    mkdir -p "${DISINTO_LIVE}/projects"
    cp "${DISINTO_BAKED}"/projects/*.toml "${DISINTO_LIVE}/projects/"
    chown -R agent:agent "${DISINTO_LIVE}/projects"
    log "Factory bootstrap: copied project TOMLs to live checkout"
  fi
  # Verify the live checkout has the expected structure
  if [ -f "${DISINTO_LIVE}/lib/env.sh" ]; then
    DISINTO_DIR="$DISINTO_LIVE"
    log "Factory bootstrap: DISINTO_DIR switched to live checkout at ${DISINTO_LIVE}"
  else
    log "Factory bootstrap: live checkout missing expected files — falling back to baked copy"
  fi
 }
 # Ensure the project repo is cloned on first run (#589).
 # The agents container uses a named volume (project-repos) at /home/agent/repos.
 # On first startup, if the project repo is missing, clone it from FORGE_URL/FORGE_REPO.
 # This makes the agents container self-healing and independent of init's host clone.
 ensure_project_clone() {
  # shellcheck disable=SC2153
  local repo_dir="/home/agent/repos/${PROJECT_NAME}"
  if [ -d "${repo_dir}/.git" ]; then
    log "Project repo present at ${repo_dir}"
    return 0
  fi
  if [ -z "${FORGE_REPO:-}" ] || [ -z "${FORGE_URL:-}" ]; then
    log "Cannot clone project repo: FORGE_REPO or FORGE_URL unset"
    return 1
  fi
  log "Cloning ${FORGE_URL}/${FORGE_REPO}.git -> ${repo_dir} (first run)"
  mkdir -p "$(dirname "$repo_dir")"
  chown -R agent:agent "$(dirname "$repo_dir")"
  if gosu agent git clone --quiet "${FORGE_URL}/${FORGE_REPO}.git" "$repo_dir"; then
    log "Project repo cloned"
  else
    log "Project repo clone failed — agents may fail until manually fixed"
    return 1
  fi
 }
 # Pull latest factory code at the start of each poll iteration (#593).
 # Runs as the agent user; failures are non-fatal (stale code still works).
 pull_factory_repo() {
  [ "$DISINTO_DIR" = "$DISINTO_LIVE" ] || return 0
  local primary_branch="${PRIMARY_BRANCH:-main}"
  gosu agent bash -c "
    cd '${DISINTO_LIVE}' && \
    git fetch origin '${primary_branch}' --quiet 2>/dev/null && \
    git reset --hard 'origin/${primary_branch}' --quiet 2>/dev/null
  " || log "Factory pull failed — continuing with current checkout"
 }
 # Configure git and tea once at startup (as root, then drop to agent)
-configure_git_creds
+_setup_git_creds
 configure_git_identity
 configure_tea_login
 # Clone project repo on first run (makes agents self-healing, #589)
 ensure_project_clone
 # Bootstrap ops repos from forgejo into container volumes (#586)
 bootstrap_ops_repos
 # Bootstrap factory repo — switch DISINTO_DIR to live checkout (#593)
 bootstrap_factory_repo
 # Initialize state directory for check_active guards
 init_state_dir
 # Parse AGENT_ROLES env var (default: all agents)
 # Expected format: comma-separated list like "review,dev,gardener"
-AGENT_ROLES="${AGENT_ROLES:-review,dev,gardener,architect,planner,predictor}"
+AGENT_ROLES="${AGENT_ROLES:-review,dev,gardener,architect,planner,predictor,supervisor}"
 log "Agent roles configured: ${AGENT_ROLES}"
 # Poll interval in seconds (5 minutes default)
 POLL_INTERVAL="${POLL_INTERVAL:-300}"
 # Gardener and architect intervals (default 6 hours = 21600 seconds)
 GARDENER_INTERVAL="${GARDENER_INTERVAL:-21600}"
 ARCHITECT_INTERVAL="${ARCHITECT_INTERVAL:-21600}"
 PLANNER_INTERVAL="${PLANNER_INTERVAL:-43200}"
 SUPERVISOR_INTERVAL="${SUPERVISOR_INTERVAL:-1200}"
 log "Entering polling loop (interval: ${POLL_INTERVAL}s, roles: ${AGENT_ROLES})"
 log "Gardener interval: ${GARDENER_INTERVAL}s, Architect interval: ${ARCHITECT_INTERVAL}s, Planner interval: ${PLANNER_INTERVAL}s, Supervisor interval: ${SUPERVISOR_INTERVAL}s"
 # Main polling loop using iteration counter for gardener scheduling
 iteration=0
@ -145,6 +350,9 @@ while true; do
  iteration=$((iteration + 1))
  now=$(date +%s)
  # Pull latest factory code so poll scripts stay current (#593)
  pull_factory_repo
  # Stale .sid cleanup — needed for agents that don't support --resume
  # Run this as the agent user
  gosu agent bash -c "rm -f /tmp/dev-session-*.sid /tmp/review-session-*.sid 2>/dev/null || true"
@ -153,17 +361,39 @@ while true; do
  # Fast agents (review-poll, dev-poll) run in background so they don't block
  # each other.  Slow agents (gardener, architect, planner, predictor) also run
  # in background but are guarded by pgrep so only one instance runs at a time.
-  # The flock on session.lock already serializes claude -p calls.
+  # Per-session CLAUDE_CONFIG_DIR isolation handles OAuth concurrency natively.
  # Set CLAUDE_EXTERNAL_LOCK=1 to re-enable the legacy flock serialization.
  for toml in "${DISINTO_DIR}"/projects/*.toml; do
    [ -f "$toml" ] || continue
    # Parse project name and primary branch from TOML so env.sh preconditions
    # are satisfied when agent scripts source it (#674).
    _toml_vals=$(python3 -c "
 import tomllib, sys
 with open(sys.argv[1], 'rb') as f:
    cfg = tomllib.load(f)
 print(cfg.get('name', ''))
 print(cfg.get('primary_branch', 'main'))
 " "$toml" 2>/dev/null || true)
    _pname=$(sed -n '1p' <<< "$_toml_vals")
    _pbranch=$(sed -n '2p' <<< "$_toml_vals")
    [ -n "$_pname" ] || { log "WARNING: could not parse project name from ${toml} — skipping"; continue; }
    export PROJECT_NAME="$_pname"
    export PROJECT_REPO_ROOT="/home/agent/repos/${_pname}"
    export OPS_REPO_ROOT="/home/agent/repos/${_pname}-ops"
    export PRIMARY_BRANCH="${_pbranch:-main}"
    log "Processing project TOML: ${toml}"
    # --- Fast agents: run in background, wait before slow agents ---
    FAST_PIDS=()
    # Review poll (every iteration)
    if [[ ",${AGENT_ROLES}," == *",review,"* ]]; then
      log "Running review-poll (iteration ${iteration}) for ${toml}"
-      gosu agent bash -c "cd ${DISINTO_DIR} && bash review/review-poll.sh \"${toml}\"" >> "${DISINTO_DIR}/../data/logs/review-poll.log" 2>&1 &
+      gosu agent bash -c "cd ${DISINTO_DIR} && bash review/review-poll.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/review-poll.log" 2>&1 &
      FAST_PIDS+=($!)
    fi
    sleep 2  # stagger fast polls
@ -171,50 +401,51 @@ while true; do
    # Dev poll (every iteration)
    if [[ ",${AGENT_ROLES}," == *",dev,"* ]]; then
      log "Running dev-poll (iteration ${iteration}) for ${toml}"
-      gosu agent bash -c "cd ${DISINTO_DIR} && bash dev/dev-poll.sh \"${toml}\"" >> "${DISINTO_DIR}/../data/logs/dev-poll.log" 2>&1 &
+      gosu agent bash -c "cd ${DISINTO_DIR} && bash dev/dev-poll.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/dev-poll.log" 2>&1 &
      FAST_PIDS+=($!)
    fi
-    # Wait for fast polls to finish before launching slow agents
+    # Wait only for THIS iteration's fast polls — long-running gardener/dev-agent
-    wait
+    # from prior iterations must not block us.
    if [ ${#FAST_PIDS[@]} -gt 0 ]; then
      wait "${FAST_PIDS[@]}"
    fi
    # --- Slow agents: run in background with pgrep guard ---
-    # Gardener (every 6 hours = 72 iterations * 5 min = 21600 seconds)
+    # Gardener (interval configurable via GARDENER_INTERVAL env var)
    if [[ ",${AGENT_ROLES}," == *",gardener,"* ]]; then
      gardener_iteration=$((iteration * POLL_INTERVAL))
-      gardener_interval=$((6 * 60 * 60))  # 6 hours in seconds
+      if [ $((gardener_iteration % GARDENER_INTERVAL)) -eq 0 ] && [ "$now" -ge "$gardener_iteration" ]; then
      if [ $((gardener_iteration % gardener_interval)) -eq 0 ] && [ "$now" -ge "$gardener_iteration" ]; then
        if ! pgrep -f "gardener-run.sh" >/dev/null; then
-          log "Running gardener (iteration ${iteration}, 6-hour interval) for ${toml}"
+          log "Running gardener (iteration ${iteration}, ${GARDENER_INTERVAL}s interval) for ${toml}"
-          gosu agent bash -c "cd ${DISINTO_DIR} && bash gardener/gardener-run.sh \"${toml}\"" >> "${DISINTO_DIR}/../data/logs/gardener.log" 2>&1 &
+          gosu agent bash -c "cd ${DISINTO_DIR} && bash gardener/gardener-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/gardener.log" 2>&1 &
        else
          log "Skipping gardener — already running"
        fi
      fi
    fi
-    # Architect (every 6 hours, same schedule as gardener)
+    # Architect (interval configurable via ARCHITECT_INTERVAL env var)
    if [[ ",${AGENT_ROLES}," == *",architect,"* ]]; then
      architect_iteration=$((iteration * POLL_INTERVAL))
-      architect_interval=$((6 * 60 * 60))  # 6 hours in seconds
+      if [ $((architect_iteration % ARCHITECT_INTERVAL)) -eq 0 ] && [ "$now" -ge "$architect_iteration" ]; then
      if [ $((architect_iteration % architect_interval)) -eq 0 ] && [ "$now" -ge "$architect_iteration" ]; then
        if ! pgrep -f "architect-run.sh" >/dev/null; then
-          log "Running architect (iteration ${iteration}, 6-hour interval) for ${toml}"
+          log "Running architect (iteration ${iteration}, ${ARCHITECT_INTERVAL}s interval) for ${toml}"
-          gosu agent bash -c "cd ${DISINTO_DIR} && bash architect/architect-run.sh \"${toml}\"" >> "${DISINTO_DIR}/../data/logs/architect.log" 2>&1 &
+          gosu agent bash -c "cd ${DISINTO_DIR} && bash architect/architect-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/architect.log" 2>&1 &
        else
          log "Skipping architect — already running"
        fi
      fi
    fi
-    # Planner (every 12 hours = 144 iterations * 5 min = 43200 seconds)
+    # Planner (interval configurable via PLANNER_INTERVAL env var)
    if [[ ",${AGENT_ROLES}," == *",planner,"* ]]; then
      planner_iteration=$((iteration * POLL_INTERVAL))
-      planner_interval=$((12 * 60 * 60))  # 12 hours in seconds
+      if [ $((planner_iteration % PLANNER_INTERVAL)) -eq 0 ] && [ "$now" -ge "$planner_iteration" ]; then
      if [ $((planner_iteration % planner_interval)) -eq 0 ] && [ "$now" -ge "$planner_iteration" ]; then
        if ! pgrep -f "planner-run.sh" >/dev/null; then
-          log "Running planner (iteration ${iteration}, 12-hour interval) for ${toml}"
+          log "Running planner (iteration ${iteration}, ${PLANNER_INTERVAL}s interval) for ${toml}"
-          gosu agent bash -c "cd ${DISINTO_DIR} && bash planner/planner-run.sh \"${toml}\"" >> "${DISINTO_DIR}/../data/logs/planner.log" 2>&1 &
+          gosu agent bash -c "cd ${DISINTO_DIR} && bash planner/planner-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/planner.log" 2>&1 &
        else
          log "Skipping planner — already running"
        fi
@ -228,12 +459,25 @@ while true; do
      if [ $((predictor_iteration % predictor_interval)) -eq 0 ] && [ "$now" -ge "$predictor_iteration" ]; then
        if ! pgrep -f "predictor-run.sh" >/dev/null; then
          log "Running predictor (iteration ${iteration}, 24-hour interval) for ${toml}"
-          gosu agent bash -c "cd ${DISINTO_DIR} && bash predictor/predictor-run.sh \"${toml}\"" >> "${DISINTO_DIR}/../data/logs/predictor.log" 2>&1 &
+          gosu agent bash -c "cd ${DISINTO_DIR} && bash predictor/predictor-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/predictor.log" 2>&1 &
        else
          log "Skipping predictor — already running"
        fi
      fi
    fi
    # Supervisor (interval configurable via SUPERVISOR_INTERVAL env var, default 20 min)
    if [[ ",${AGENT_ROLES}," == *",supervisor,"* ]]; then
      supervisor_iteration=$((iteration * POLL_INTERVAL))
      if [ $((supervisor_iteration % SUPERVISOR_INTERVAL)) -eq 0 ] && [ "$now" -ge "$supervisor_iteration" ]; then
        if ! pgrep -f "supervisor-run.sh" >/dev/null; then
          log "Running supervisor (iteration ${iteration}, ${SUPERVISOR_INTERVAL}s interval) for ${toml}"
          gosu agent bash -c "cd ${DISINTO_DIR} && bash supervisor/supervisor-run.sh \"${toml}\"" >> "${DISINTO_LOG_DIR}/supervisor.log" 2>&1 &
        else
          log "Skipping supervisor — already running"
        fi
      fi
    fi
  done
  sleep "${POLL_INTERVAL}"
--- a/docker/chat/Dockerfile
+++ b/docker/chat/Dockerfile
@ -0,0 +1,35 @@
 # disinto-chat — minimal HTTP backend for Claude chat UI
 #
 # Small Debian slim base with Python runtime.
 # Chosen for simplicity and small image size (~100MB).
 #
 # Image size: ~100MB (well under the 200MB ceiling)
 #
 # The claude binary is mounted from the host at runtime via docker-compose,
 # not baked into the image — same pattern as the agents container.
 FROM debian:bookworm-slim
 # Install Python (no build-time network access needed)
 RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    && rm -rf /var/lib/apt/lists/*
 # Non-root user — fixed UID 10001 for sandbox hardening (#706)
 RUN useradd -m -u 10001 -s /bin/bash chat
 # Copy application files
 COPY server.py /usr/local/bin/server.py
 COPY entrypoint-chat.sh /entrypoint-chat.sh
 COPY ui/ /var/chat/ui/
 RUN chmod +x /entrypoint-chat.sh /usr/local/bin/server.py
 USER chat
 WORKDIR /var/chat
 EXPOSE 8080
 HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
  CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')" || exit 1
 ENTRYPOINT ["/entrypoint-chat.sh"]
--- a/docker/chat/entrypoint-chat.sh
+++ b/docker/chat/entrypoint-chat.sh
@ -0,0 +1,37 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # entrypoint-chat.sh — Start the disinto-chat backend server
 #
 # Exec-replace pattern: this script is the container entrypoint and runs
 # the server directly (no wrapper needed). Logs to stdout for docker logs.
 LOGFILE="/tmp/chat.log"
 log() {
    printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" | tee -a "$LOGFILE"
 }
 # Sandbox sanity checks (#706) — fail fast if isolation is broken
 if [ -e /var/run/docker.sock ]; then
    log "FATAL: /var/run/docker.sock is accessible — sandbox violation"
    exit 1
 fi
 if [ "$(id -u)" = "0" ]; then
    log "FATAL: running as root (uid 0) — sandbox violation"
    exit 1
 fi
 # Verify Claude CLI is available (expected via volume mount from host).
 if ! command -v claude &>/dev/null; then
    log "FATAL: claude CLI not found in PATH"
    log "Mount the host binary into the container, e.g.:"
    log "  volumes:"
    log "    - /usr/local/bin/claude:/usr/local/bin/claude:ro"
    exit 1
 fi
 log "Claude CLI: $(claude --version 2>&1 || true)"
 # Start the Python server (exec-replace so signals propagate correctly)
 log "Starting disinto-chat server on port 8080..."
 exec python3 /usr/local/bin/server.py
--- a/docker/chat/server.py
+++ b/docker/chat/server.py
@ -0,0 +1,957 @@
 #!/usr/bin/env python3
 """
 disinto-chat server — minimal HTTP backend for Claude chat UI.
 Routes:
    GET /chat/auth/verify    -> Caddy forward_auth callback (returns 200+X-Forwarded-User or 401)
    GET /chat/login          -> 302 to Forgejo OAuth authorize
    GET /chat/oauth/callback -> exchange code for token, validate user, set session
    GET /chat/               -> serves index.html (session required)
    GET /chat/static/*       -> serves static assets (session required)
    POST /chat               -> spawns `claude --print` with user message (session required)
    GET /ws                  -> reserved for future streaming upgrade (returns 501)
 OAuth flow:
    1. User hits any /chat/* route without a valid session cookie -> 302 /chat/login
    2. /chat/login redirects to Forgejo /login/oauth/authorize
    3. Forgejo redirects back to /chat/oauth/callback with ?code=...&state=...
    4. Server exchanges code for access token, fetches /api/v1/user
    5. Asserts user is in allowlist, sets HttpOnly session cookie
    6. Redirects to /chat/
 The claude binary is expected to be mounted from the host at /usr/local/bin/claude.
 """
 import datetime
 import json
 import os
 import re
 import secrets
 import subprocess
 import sys
 import time
 from http.server import HTTPServer, BaseHTTPRequestHandler
 from urllib.parse import urlparse, parse_qs, urlencode
 # Configuration
 HOST = os.environ.get("CHAT_HOST", "0.0.0.0")
 PORT = int(os.environ.get("CHAT_PORT", 8080))
 UI_DIR = "/var/chat/ui"
 STATIC_DIR = os.path.join(UI_DIR, "static")
 CLAUDE_BIN = "/usr/local/bin/claude"
 # OAuth configuration
 FORGE_URL = os.environ.get("FORGE_URL", "http://localhost:3000")
 CHAT_OAUTH_CLIENT_ID = os.environ.get("CHAT_OAUTH_CLIENT_ID", "")
 CHAT_OAUTH_CLIENT_SECRET = os.environ.get("CHAT_OAUTH_CLIENT_SECRET", "")
 EDGE_TUNNEL_FQDN = os.environ.get("EDGE_TUNNEL_FQDN", "")
 # Shared secret for Caddy forward_auth verify endpoint (#709).
 # When set, only requests carrying this value in X-Forward-Auth-Secret are
 # allowed to call /chat/auth/verify.  When empty the endpoint is unrestricted
 # (acceptable during local dev; production MUST set this).
 FORWARD_AUTH_SECRET = os.environ.get("FORWARD_AUTH_SECRET", "")
 # Rate limiting / cost caps (#711)
 CHAT_MAX_REQUESTS_PER_HOUR = int(os.environ.get("CHAT_MAX_REQUESTS_PER_HOUR", 60))
 CHAT_MAX_REQUESTS_PER_DAY = int(os.environ.get("CHAT_MAX_REQUESTS_PER_DAY", 500))
 CHAT_MAX_TOKENS_PER_DAY = int(os.environ.get("CHAT_MAX_TOKENS_PER_DAY", 1000000))
 # Allowed users - disinto-admin always allowed; CSV allowlist extends it
 _allowed_csv = os.environ.get("DISINTO_CHAT_ALLOWED_USERS", "")
 ALLOWED_USERS = {"disinto-admin"}
 if _allowed_csv:
    ALLOWED_USERS.update(u.strip() for u in _allowed_csv.split(",") if u.strip())
 # Session cookie name
 SESSION_COOKIE = "disinto_chat_session"
 # Session TTL: 24 hours
 SESSION_TTL = 24 * 60 * 60
 # Chat history directory (bind-mounted from host)
 CHAT_HISTORY_DIR = os.environ.get("CHAT_HISTORY_DIR", "/var/lib/chat/history")
 # Regex for valid conversation_id (12-char hex, no slashes)
 CONVERSATION_ID_PATTERN = re.compile(r"^[0-9a-f]{12}$")
 # In-memory session store: token -> {"user": str, "expires": float}
 _sessions = {}
 # Pending OAuth state tokens: state -> expires (float)
 _oauth_states = {}
 # Per-user rate limiting state (#711)
 # user -> list of request timestamps (for sliding-window hourly/daily caps)
 _request_log = {}
 # user -> {"tokens": int, "date": "YYYY-MM-DD"}
 _daily_tokens = {}
 # MIME types for static files
 MIME_TYPES = {
    ".html": "text/html; charset=utf-8",
    ".js": "application/javascript; charset=utf-8",
    ".css": "text/css; charset=utf-8",
    ".json": "application/json; charset=utf-8",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".svg": "image/svg+xml",
    ".ico": "image/x-icon",
 }
 def _build_callback_uri():
    """Build the OAuth callback URI based on tunnel configuration."""
    if EDGE_TUNNEL_FQDN:
        return f"https://{EDGE_TUNNEL_FQDN}/chat/oauth/callback"
    return "http://localhost/chat/oauth/callback"
 def _session_cookie_flags():
    """Return cookie flags appropriate for the deployment mode."""
    flags = "HttpOnly; SameSite=Lax; Path=/chat"
    if EDGE_TUNNEL_FQDN:
        flags += "; Secure"
    return flags
 def _validate_session(cookie_header):
    """Check session cookie and return username if valid, else None."""
    if not cookie_header:
        return None
    for part in cookie_header.split(";"):
        part = part.strip()
        if part.startswith(SESSION_COOKIE + "="):
            token = part[len(SESSION_COOKIE) + 1:]
            session = _sessions.get(token)
            if session and session["expires"] > time.time():
                return session["user"]
            # Expired - clean up
            _sessions.pop(token, None)
            return None
    return None
 def _gc_sessions():
    """Remove expired sessions (called opportunistically)."""
    now = time.time()
    expired = [k for k, v in _sessions.items() if v["expires"] <= now]
    for k in expired:
        del _sessions[k]
    expired_states = [k for k, v in _oauth_states.items() if v <= now]
    for k in expired_states:
        del _oauth_states[k]
 def _exchange_code_for_token(code):
    """Exchange an authorization code for an access token via Forgejo."""
    import urllib.request
    import urllib.error
    data = urlencode({
        "grant_type": "authorization_code",
        "code": code,
        "client_id": CHAT_OAUTH_CLIENT_ID,
        "client_secret": CHAT_OAUTH_CLIENT_SECRET,
        "redirect_uri": _build_callback_uri(),
    }).encode()
    req = urllib.request.Request(
        f"{FORGE_URL}/login/oauth/access_token",
        data=data,
        headers={"Accept": "application/json", "Content-Type": "application/x-www-form-urlencoded"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            return json.loads(resp.read().decode())
    except (urllib.error.URLError, json.JSONDecodeError, OSError) as e:
        print(f"OAuth token exchange failed: {e}", file=sys.stderr)
        return None
 def _fetch_user(access_token):
    """Fetch the authenticated user from Forgejo API."""
    import urllib.request
    import urllib.error
    req = urllib.request.Request(
        f"{FORGE_URL}/api/v1/user",
        headers={"Authorization": f"token {access_token}", "Accept": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            return json.loads(resp.read().decode())
    except (urllib.error.URLError, json.JSONDecodeError, OSError) as e:
        print(f"User fetch failed: {e}", file=sys.stderr)
        return None
 # =============================================================================
 # Rate Limiting Functions (#711)
 # =============================================================================
 def _check_rate_limit(user):
    """Check per-user rate limits. Returns (allowed, retry_after, reason) (#711).
    Checks hourly request cap, daily request cap, and daily token cap.
    """
    now = time.time()
    one_hour_ago = now - 3600
    today = datetime.date.today().isoformat()
    # Prune old entries from request log
    timestamps = _request_log.get(user, [])
    timestamps = [t for t in timestamps if t > now - 86400]
    _request_log[user] = timestamps
    # Hourly request cap
    hourly = [t for t in timestamps if t > one_hour_ago]
    if len(hourly) >= CHAT_MAX_REQUESTS_PER_HOUR:
        oldest_in_window = min(hourly)
        retry_after = int(oldest_in_window + 3600 - now) + 1
        return False, max(retry_after, 1), "hourly request limit"
    # Daily request cap
    start_of_day = time.mktime(datetime.date.today().timetuple())
    daily = [t for t in timestamps if t >= start_of_day]
    if len(daily) >= CHAT_MAX_REQUESTS_PER_DAY:
        next_day = start_of_day + 86400
        retry_after = int(next_day - now) + 1
        return False, max(retry_after, 1), "daily request limit"
    # Daily token cap
    token_info = _daily_tokens.get(user, {"tokens": 0, "date": today})
    if token_info["date"] != today:
        token_info = {"tokens": 0, "date": today}
        _daily_tokens[user] = token_info
    if token_info["tokens"] >= CHAT_MAX_TOKENS_PER_DAY:
        next_day = start_of_day + 86400
        retry_after = int(next_day - now) + 1
        return False, max(retry_after, 1), "daily token limit"
    return True, 0, ""
 def _record_request(user):
    """Record a request timestamp for the user (#711)."""
    _request_log.setdefault(user, []).append(time.time())
 def _record_tokens(user, tokens):
    """Record token usage for the user (#711)."""
    today = datetime.date.today().isoformat()
    token_info = _daily_tokens.get(user, {"tokens": 0, "date": today})
    if token_info["date"] != today:
        token_info = {"tokens": 0, "date": today}
    token_info["tokens"] += tokens
    _daily_tokens[user] = token_info
 def _parse_stream_json(output):
    """Parse stream-json output from claude --print (#711).
    Returns (text_content, total_tokens).  Falls back gracefully if the
    usage event is absent or malformed.
    """
    text_parts = []
    total_tokens = 0
    for line in output.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            event = json.loads(line)
        except json.JSONDecodeError:
            continue
        etype = event.get("type", "")
        # Collect assistant text
        if etype == "content_block_delta":
            delta = event.get("delta", {})
            if delta.get("type") == "text_delta":
                text_parts.append(delta.get("text", ""))
        elif etype == "assistant":
            # Full assistant message (non-streaming)
            content = event.get("content", "")
            if isinstance(content, str) and content:
                text_parts.append(content)
            elif isinstance(content, list):
                for block in content:
                    if isinstance(block, dict) and block.get("text"):
                        text_parts.append(block["text"])
        # Parse usage from result event
        if etype == "result":
            usage = event.get("usage", {})
            total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
        elif "usage" in event:
            usage = event["usage"]
            if isinstance(usage, dict):
                total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
    return "".join(text_parts), total_tokens
 # =============================================================================
 # Conversation History Functions (#710)
 # =============================================================================
 def _generate_conversation_id():
    """Generate a new conversation ID (12-char hex string)."""
    return secrets.token_hex(6)
 def _validate_conversation_id(conv_id):
    """Validate that conversation_id matches the required format."""
    return bool(CONVERSATION_ID_PATTERN.match(conv_id))
 def _get_user_history_dir(user):
    """Get the history directory path for a user."""
    return os.path.join(CHAT_HISTORY_DIR, user)
 def _get_conversation_path(user, conv_id):
    """Get the full path to a conversation file."""
    user_dir = _get_user_history_dir(user)
    return os.path.join(user_dir, f"{conv_id}.ndjson")
 def _ensure_user_dir(user):
    """Ensure the user's history directory exists."""
    user_dir = _get_user_history_dir(user)
    os.makedirs(user_dir, exist_ok=True)
    return user_dir
 def _write_message(user, conv_id, role, content):
    """Append a message to a conversation file in NDJSON format."""
    conv_path = _get_conversation_path(user, conv_id)
    _ensure_user_dir(user)
    record = {
        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "user": user,
        "role": role,
        "content": content,
    }
    with open(conv_path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
 def _read_conversation(user, conv_id):
    """Read all messages from a conversation file."""
    conv_path = _get_conversation_path(user, conv_id)
    messages = []
    if not os.path.exists(conv_path):
        return None
    try:
        with open(conv_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    try:
                        messages.append(json.loads(line))
                    except json.JSONDecodeError:
                        # Skip malformed lines
                        continue
    except IOError:
        return None
    return messages
 def _list_user_conversations(user):
    """List all conversation files for a user with first message preview."""
    user_dir = _get_user_history_dir(user)
    conversations = []
    if not os.path.exists(user_dir):
        return conversations
    try:
        for filename in os.listdir(user_dir):
            if not filename.endswith(".ndjson"):
                continue
            conv_id = filename[:-7]  # Remove .ndjson extension
            if not _validate_conversation_id(conv_id):
                continue
            conv_path = os.path.join(user_dir, filename)
            messages = _read_conversation(user, conv_id)
            if messages:
                first_msg = messages[0]
                preview = first_msg.get("content", "")[:50]
                if len(first_msg.get("content", "")) > 50:
                    preview += "..."
                conversations.append({
                    "id": conv_id,
                    "created_at": first_msg.get("ts", ""),
                    "preview": preview,
                    "message_count": len(messages),
                })
            else:
                # Empty conversation file
                conversations.append({
                    "id": conv_id,
                    "created_at": "",
                    "preview": "(empty)",
                    "message_count": 0,
                })
    except OSError:
        pass
    # Sort by created_at descending
    conversations.sort(key=lambda x: x["created_at"] or "", reverse=True)
    return conversations
 def _delete_conversation(user, conv_id):
    """Delete a conversation file."""
    conv_path = _get_conversation_path(user, conv_id)
    if os.path.exists(conv_path):
        os.remove(conv_path)
        return True
    return False
 class ChatHandler(BaseHTTPRequestHandler):
    """HTTP request handler for disinto-chat with Forgejo OAuth."""
    def log_message(self, format, *args):
        """Log to stderr."""
        print(f"[{self.log_date_time_string()}] {format % args}", file=sys.stderr)
    def send_error_page(self, code, message=None):
        """Custom error response."""
        self.send_response(code)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.end_headers()
        if message:
            self.wfile.write(message.encode("utf-8"))
    def _require_session(self):
        """Check session; redirect to /chat/login if missing. Returns username or None."""
        user = _validate_session(self.headers.get("Cookie"))
        if user:
            return user
        self.send_response(302)
        self.send_header("Location", "/chat/login")
        self.end_headers()
        return None
    def _check_forwarded_user(self, session_user):
        """Defense-in-depth: verify X-Forwarded-User matches session user (#709).
        Returns True if the request may proceed, False if a 403 was sent.
        When X-Forwarded-User is absent (forward_auth removed from Caddy),
        the request is rejected - fail-closed by design.
        """
        forwarded = self.headers.get("X-Forwarded-User")
        if not forwarded:
            rid = self.headers.get("X-Request-Id", "-")
            print(
                f"WARN: missing X-Forwarded-User for session_user={session_user} "
                f"req_id={rid} - fail-closed (#709)",
                file=sys.stderr,
            )
            self.send_error_page(403, "Forbidden: missing forwarded-user header")
            return False
        if forwarded != session_user:
            rid = self.headers.get("X-Request-Id", "-")
            print(
                f"WARN: X-Forwarded-User mismatch: header={forwarded} "
                f"session={session_user} req_id={rid} (#709)",
                file=sys.stderr,
            )
            self.send_error_page(403, "Forbidden: user identity mismatch")
            return False
        return True
    def do_GET(self):
        """Handle GET requests."""
        parsed = urlparse(self.path)
        path = parsed.path
        # Health endpoint (no auth required) — used by Docker healthcheck
        if path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.end_headers()
            self.wfile.write(b"ok\n")
            return
        # Verify endpoint for Caddy forward_auth (#709)
        if path == "/chat/auth/verify":
            self.handle_auth_verify()
            return
        # OAuth routes (no session required)
        if path == "/chat/login":
            self.handle_login()
            return
        if path == "/chat/oauth/callback":
            self.handle_oauth_callback(parsed.query)
            return
        # Conversation list endpoint: GET /chat/history
        if path == "/chat/history":
            user = self._require_session()
            if not user:
                return
            if not self._check_forwarded_user(user):
                return
            self.handle_conversation_list(user)
            return
        # Single conversation endpoint: GET /chat/history/<id>
        if path.startswith("/chat/history/"):
            user = self._require_session()
            if not user:
                return
            if not self._check_forwarded_user(user):
                return
            conv_id = path[len("/chat/history/"):]
            self.handle_conversation_get(user, conv_id)
            return
        # Serve index.html at root
        if path in ("/", "/chat", "/chat/"):
            user = self._require_session()
            if not user:
                return
            if not self._check_forwarded_user(user):
                return
            self.serve_index()
            return
        # Serve static files
        if path.startswith("/chat/static/") or path.startswith("/static/"):
            user = self._require_session()
            if not user:
                return
            if not self._check_forwarded_user(user):
                return
            self.serve_static(path)
            return
        # Reserved WebSocket endpoint (future use)
        if path == "/ws" or path.startswith("/ws"):
            self.send_error_page(501, "WebSocket upgrade not yet implemented")
            return
        # 404 for unknown paths
        self.send_error_page(404, "Not found")
    def do_POST(self):
        """Handle POST requests."""
        parsed = urlparse(self.path)
        path = parsed.path
        # New conversation endpoint (session required)
        if path == "/chat/new":
            user = self._require_session()
            if not user:
                return
            if not self._check_forwarded_user(user):
                return
            self.handle_new_conversation(user)
            return
        # Chat endpoint (session required)
        if path in ("/chat", "/chat/"):
            user = self._require_session()
            if not user:
                return
            if not self._check_forwarded_user(user):
                return
            self.handle_chat(user)
            return
        # 404 for unknown paths
        self.send_error_page(404, "Not found")
    def handle_auth_verify(self):
        """Caddy forward_auth callback - validate session and return X-Forwarded-User (#709).
        Caddy calls this endpoint for every /chat/* request.  If the session
        cookie is valid the endpoint returns 200 with the X-Forwarded-User
        header set to the session username.  Otherwise it returns 401 so Caddy
        knows the request is unauthenticated.
        Access control: when FORWARD_AUTH_SECRET is configured, the request must
        carry a matching X-Forward-Auth-Secret header (shared secret between
        Caddy and the chat backend).
        """
        # Shared-secret gate
        if FORWARD_AUTH_SECRET:
            provided = self.headers.get("X-Forward-Auth-Secret", "")
            if not secrets.compare_digest(provided, FORWARD_AUTH_SECRET):
                self.send_error_page(403, "Forbidden: invalid forward-auth secret")
                return
        user = _validate_session(self.headers.get("Cookie"))
        if not user:
            self.send_error_page(401, "Unauthorized: no valid session")
            return
        self.send_response(200)
        self.send_header("X-Forwarded-User", user)
        self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.end_headers()
        self.wfile.write(b"ok")
    def handle_login(self):
        """Redirect to Forgejo OAuth authorize endpoint."""
        _gc_sessions()
        if not CHAT_OAUTH_CLIENT_ID:
            self.send_error_page(500, "Chat OAuth not configured (CHAT_OAUTH_CLIENT_ID missing)")
            return
        state = secrets.token_urlsafe(32)
        _oauth_states[state] = time.time() + 600  # 10 min validity
        params = urlencode({
            "client_id": CHAT_OAUTH_CLIENT_ID,
            "redirect_uri": _build_callback_uri(),
            "response_type": "code",
            "state": state,
        })
        self.send_response(302)
        self.send_header("Location", f"{FORGE_URL}/login/oauth/authorize?{params}")
        self.end_headers()
    def handle_oauth_callback(self, query_string):
        """Exchange authorization code for token, validate user, set session."""
        params = parse_qs(query_string)
        code = params.get("code", [""])[0]
        state = params.get("state", [""])[0]
        # Validate state
        expected_expiry = _oauth_states.pop(state, None) if state else None
        if not expected_expiry or expected_expiry < time.time():
            self.send_error_page(400, "Invalid or expired OAuth state")
            return
        if not code:
            self.send_error_page(400, "Missing authorization code")
            return
        # Exchange code for access token
        token_resp = _exchange_code_for_token(code)
        if not token_resp or "access_token" not in token_resp:
            self.send_error_page(502, "Failed to obtain access token from Forgejo")
            return
        access_token = token_resp["access_token"]
        # Fetch user info
        user_info = _fetch_user(access_token)
        if not user_info or "login" not in user_info:
            self.send_error_page(502, "Failed to fetch user info from Forgejo")
            return
        username = user_info["login"]
        # Check allowlist
        if username not in ALLOWED_USERS:
            self.send_response(403)
            self.send_header("Content-Type", "text/plain; charset=utf-8")
            self.end_headers()
            self.wfile.write(
                f"Not authorised: user '{username}' is not in the allowed users list.\n".encode()
            )
            return
        # Create session
        session_token = secrets.token_urlsafe(48)
        _sessions[session_token] = {
            "user": username,
            "expires": time.time() + SESSION_TTL,
        }
        cookie_flags = _session_cookie_flags()
        self.send_response(302)
        self.send_header("Set-Cookie", f"{SESSION_COOKIE}={session_token}; {cookie_flags}")
        self.send_header("Location", "/chat/")
        self.end_headers()
    def serve_index(self):
        """Serve the main index.html file."""
        index_path = os.path.join(UI_DIR, "index.html")
        if not os.path.exists(index_path):
            self.send_error_page(500, "UI not found")
            return
        try:
            with open(index_path, "r", encoding="utf-8") as f:
                content = f.read()
            self.send_response(200)
            self.send_header("Content-Type", MIME_TYPES[".html"])
            self.send_header("Content-Length", len(content.encode("utf-8")))
            self.end_headers()
            self.wfile.write(content.encode("utf-8"))
        except IOError as e:
            self.send_error_page(500, f"Error reading index.html: {e}")
    def serve_static(self, path):
        """Serve static files from the static directory."""
        # Strip /chat/static/ or /static/ prefix
        if path.startswith("/chat/static/"):
            relative_path = path[len("/chat/static/"):]
        else:
            relative_path = path[len("/static/"):]
        if ".." in relative_path or relative_path.startswith("/"):
            self.send_error_page(403, "Forbidden")
            return
        file_path = os.path.join(STATIC_DIR, relative_path)
        if not os.path.exists(file_path):
            self.send_error_page(404, "Not found")
            return
        # Determine MIME type
        _, ext = os.path.splitext(file_path)
        content_type = MIME_TYPES.get(ext.lower(), "application/octet-stream")
        try:
            with open(file_path, "rb") as f:
                content = f.read()
            self.send_response(200)
            self.send_header("Content-Type", content_type)
            self.send_header("Content-Length", len(content))
            self.end_headers()
            self.wfile.write(content)
        except IOError as e:
            self.send_error_page(500, f"Error reading file: {e}")
    def _send_rate_limit_response(self, retry_after, reason):
        """Send a 429 response with Retry-After header and HTMX fragment (#711)."""
        body = (
            f'<div class="rate-limit-error">'
            f"Rate limit exceeded: {reason}. "
            f"Please try again in {retry_after} seconds."
            f"</div>"
        )
        self.send_response(429)
        self.send_header("Retry-After", str(retry_after))
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.send_header("Content-Length", str(len(body.encode("utf-8"))))
        self.end_headers()
        self.wfile.write(body.encode("utf-8"))
    def handle_chat(self, user):
        """
        Handle chat requests by spawning `claude --print` with the user message.
        Enforces per-user rate limits and tracks token usage (#711).
        """
        # Check rate limits before processing (#711)
        allowed, retry_after, reason = _check_rate_limit(user)
        if not allowed:
            self._send_rate_limit_response(retry_after, reason)
            return
        # Read request body
        content_length = int(self.headers.get("Content-Length", 0))
        if content_length == 0:
            self.send_error_page(400, "No message provided")
            return
        body = self.rfile.read(content_length)
        try:
            # Parse form-encoded body
            body_str = body.decode("utf-8")
            params = parse_qs(body_str)
            message = params.get("message", [""])[0]
            conv_id = params.get("conversation_id", [None])[0]
        except (UnicodeDecodeError, KeyError):
            self.send_error_page(400, "Invalid message format")
            return
        if not message:
            self.send_error_page(400, "Empty message")
            return
        # Get user from session
        user = _validate_session(self.headers.get("Cookie"))
        if not user:
            self.send_error_page(401, "Unauthorized")
            return
        # Validate Claude binary exists
        if not os.path.exists(CLAUDE_BIN):
            self.send_error_page(500, "Claude CLI not found")
            return
        # Generate new conversation ID if not provided
        if not conv_id or not _validate_conversation_id(conv_id):
            conv_id = _generate_conversation_id()
        # Record request for rate limiting (#711)
        _record_request(user)
        try:
            # Save user message to history
            _write_message(user, conv_id, "user", message)
            # Spawn claude --print with stream-json for token tracking (#711)
            proc = subprocess.Popen(
                [CLAUDE_BIN, "--print", "--output-format", "stream-json", message],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
            raw_output = proc.stdout.read()
            error_output = proc.stderr.read()
            if error_output:
                print(f"Claude stderr: {error_output}", file=sys.stderr)
            proc.wait()
            if proc.returncode != 0:
                self.send_error_page(500, f"Claude CLI failed with exit code {proc.returncode}")
                return
            # Parse stream-json for text and token usage (#711)
            response, total_tokens = _parse_stream_json(raw_output)
            # Track token usage - does not block *this* request (#711)
            if total_tokens > 0:
                _record_tokens(user, total_tokens)
                print(
                    f"Token usage: user={user} tokens={total_tokens}",
                    file=sys.stderr,
                )
            # Fall back to raw output if stream-json parsing yielded no text
            if not response:
                response = raw_output
            # Save assistant response to history
            _write_message(user, conv_id, "assistant", response)
            self.send_response(200)
            self.send_header("Content-Type", "application/json; charset=utf-8")
            self.end_headers()
            self.wfile.write(json.dumps({
                "response": response,
                "conversation_id": conv_id,
            }, ensure_ascii=False).encode("utf-8"))
        except FileNotFoundError:
            self.send_error_page(500, "Claude CLI not found")
        except Exception as e:
            self.send_error_page(500, f"Error: {e}")
    # =======================================================================
    # Conversation History Handlers
    # =======================================================================
    def handle_conversation_list(self, user):
        """List all conversations for the logged-in user."""
        conversations = _list_user_conversations(user)
        self.send_response(200)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.end_headers()
        self.wfile.write(json.dumps(conversations, ensure_ascii=False).encode("utf-8"))
    def handle_conversation_get(self, user, conv_id):
        """Get a specific conversation for the logged-in user."""
        # Validate conversation_id format
        if not _validate_conversation_id(conv_id):
            self.send_error_page(400, "Invalid conversation ID")
            return
        messages = _read_conversation(user, conv_id)
        if messages is None:
            self.send_error_page(404, "Conversation not found")
            return
        self.send_response(200)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.end_headers()
        self.wfile.write(json.dumps(messages, ensure_ascii=False).encode("utf-8"))
    def handle_conversation_delete(self, user, conv_id):
        """Delete a specific conversation for the logged-in user."""
        # Validate conversation_id format
        if not _validate_conversation_id(conv_id):
            self.send_error_page(400, "Invalid conversation ID")
            return
        if _delete_conversation(user, conv_id):
            self.send_response(204)  # No Content
            self.end_headers()
        else:
            self.send_error_page(404, "Conversation not found")
    def handle_new_conversation(self, user):
        """Create a new conversation and return its ID."""
        conv_id = _generate_conversation_id()
        self.send_response(200)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.end_headers()
        self.wfile.write(json.dumps({"conversation_id": conv_id}, ensure_ascii=False).encode("utf-8"))
    def do_DELETE(self):
        """Handle DELETE requests."""
        parsed = urlparse(self.path)
        path = parsed.path
        # Delete conversation endpoint
        if path.startswith("/chat/history/"):
            user = self._require_session()
            if not user:
                return
            if not self._check_forwarded_user(user):
                return
            conv_id = path[len("/chat/history/"):]
            self.handle_conversation_delete(user, conv_id)
            return
        # 404 for unknown paths
        self.send_error_page(404, "Not found")
 def main():
    """Start the HTTP server."""
    server_address = (HOST, PORT)
    httpd = HTTPServer(server_address, ChatHandler)
    print(f"Starting disinto-chat server on {HOST}:{PORT}", file=sys.stderr)
    print(f"UI available at http://localhost:{PORT}/chat/", file=sys.stderr)
    if CHAT_OAUTH_CLIENT_ID:
        print(f"OAuth enabled (client_id={CHAT_OAUTH_CLIENT_ID[:8]}...)", file=sys.stderr)
        print(f"Allowed users: {', '.join(sorted(ALLOWED_USERS))}", file=sys.stderr)
    else:
        print("WARNING: CHAT_OAUTH_CLIENT_ID not set - OAuth disabled", file=sys.stderr)
    if FORWARD_AUTH_SECRET:
        print("forward_auth secret configured (#709)", file=sys.stderr)
    else:
        print("WARNING: FORWARD_AUTH_SECRET not set - verify endpoint unrestricted", file=sys.stderr)
    print(
        f"Rate limits (#711): {CHAT_MAX_REQUESTS_PER_HOUR}/hr, "
        f"{CHAT_MAX_REQUESTS_PER_DAY}/day, "
        f"{CHAT_MAX_TOKENS_PER_DAY} tokens/day",
        file=sys.stderr,
    )
    httpd.serve_forever()
 if __name__ == "__main__":
    main()
--- a/docker/chat/ui/index.html
+++ b/docker/chat/ui/index.html
@ -0,0 +1,521 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>disinto-chat</title>
    <script src="/static/htmx.min.js"></script>
    <style>
        * {
            box-sizing: border-box;
            margin: 0;
            padding: 0;
        }
        body {
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, sans-serif;
            background: #1a1a2e;
            color: #eaeaea;
            min-height: 100vh;
            display: flex;
        }
        /* Sidebar styles */
        .sidebar {
            width: 280px;
            background: #16213e;
            border-right: 1px solid #0f3460;
            display: flex;
            flex-direction: column;
            height: 100vh;
            position: fixed;
            left: 0;
            top: 0;
            z-index: 100;
        }
        .sidebar-header {
            padding: 1rem;
            border-bottom: 1px solid #0f3460;
        }
        .sidebar-header h1 {
            font-size: 1.25rem;
            font-weight: 600;
            margin-bottom: 0.5rem;
        }
        .new-chat-btn {
            width: 100%;
            background: #e94560;
            color: white;
            border: none;
            border-radius: 6px;
            padding: 0.75rem 1rem;
            font-size: 0.9rem;
            font-weight: 600;
            cursor: pointer;
            transition: background 0.2s;
        }
        .new-chat-btn:hover {
            background: #d63447;
        }
        .new-chat-btn:disabled {
            background: #555;
            cursor: not-allowed;
        }
        .conversations-list {
            flex: 1;
            overflow-y: auto;
            padding: 0.5rem;
        }
        .conversation-item {
            padding: 0.75rem 1rem;
            border-radius: 6px;
            cursor: pointer;
            margin-bottom: 0.25rem;
            transition: background 0.2s;
            border: 1px solid transparent;
        }
        .conversation-item:hover {
            background: #1a1a2e;
        }
        .conversation-item.active {
            background: #0f3460;
            border-color: #e94560;
        }
        .conversation-item .preview {
            font-size: 0.875rem;
            white-space: nowrap;
            overflow: hidden;
            text-overflow: ellipsis;
            opacity: 0.9;
        }
        .conversation-item .meta {
            font-size: 0.75rem;
            opacity: 0.6;
            margin-top: 0.25rem;
        }
        .conversation-item .message-count {
            float: right;
            font-size: 0.7rem;
            background: #0f3460;
            padding: 0.125rem 0.5rem;
            border-radius: 10px;
        }
        .main-content {
            margin-left: 280px;
            display: flex;
            flex-direction: column;
            width: 100%;
            height: 100vh;
        }
        header {
            background: #16213e;
            padding: 1rem 2rem;
            border-bottom: 1px solid #0f3460;
        }
        header h1 {
            font-size: 1.25rem;
            font-weight: 600;
        }
        main {
            flex: 1;
            display: flex;
            flex-direction: column;
            max-width: 900px;
            margin: 0 auto;
            width: 100%;
            padding: 1rem;
        }
        #messages {
            flex: 1;
            overflow-y: auto;
            padding: 1rem;
            background: #16213e;
            border-radius: 8px;
            margin-bottom: 1rem;
        }
        .message {
            margin-bottom: 1rem;
            padding: 0.75rem 1rem;
            border-radius: 8px;
            line-height: 1.5;
        }
        .message.user {
            background: #0f3460;
            margin-left: 2rem;
        }
        .message.assistant {
            background: #1a1a2e;
            margin-right: 2rem;
        }
        .message.system {
            background: #1a1a2e;
            font-style: italic;
            color: #888;
            text-align: center;
        }
        .message .role {
            font-weight: 600;
            font-size: 0.875rem;
            margin-bottom: 0.25rem;
            opacity: 0.8;
        }
        .message .content {
            white-space: pre-wrap;
            word-wrap: break-word;
        }
        .input-area {
            display: flex;
            gap: 0.5rem;
            padding: 1rem;
            background: #16213e;
            border-radius: 8px;
        }
        textarea {
            flex: 1;
            background: #1a1a2e;
            border: 1px solid #0f3460;
            border-radius: 6px;
            padding: 0.75rem;
            color: #eaeaea;
            font-family: inherit;
            font-size: 1rem;
            resize: none;
            min-height: 80px;
        }
        textarea:focus {
            outline: none;
            border-color: #e94560;
        }
        button {
            background: #e94560;
            color: white;
            border: none;
            border-radius: 6px;
            padding: 0.75rem 1.5rem;
            font-size: 1rem;
            font-weight: 600;
            cursor: pointer;
            transition: background 0.2s;
        }
        button:hover {
            background: #d63447;
        }
        button:disabled {
            background: #555;
            cursor: not-allowed;
        }
        .loading {
            opacity: 0.6;
        }
        .empty-state {
            display: flex;
            flex-direction: column;
            align-items: center;
            justify-content: center;
            height: 100%;
            color: #888;
            text-align: center;
        }
        .empty-state p {
            margin-top: 1rem;
        }
        /* Responsive sidebar toggle */
        .sidebar-toggle {
            display: none;
            position: fixed;
            top: 1rem;
            left: 1rem;
            z-index: 200;
            background: #e94560;
            color: white;
            border: none;
            border-radius: 6px;
            padding: 0.5rem;
            cursor: pointer;
        }
        @media (max-width: 768px) {
            .sidebar {
                transform: translateX(-100%);
                transition: transform 0.3s;
            }
            .sidebar.open {
                transform: translateX(0);
            }
            .sidebar-toggle {
                display: block;
            }
            .main-content {
                margin-left: 0;
            }
        }
    </style>
 </head>
 <body>
    <button class="sidebar-toggle" id="sidebar-toggle">☰</button>
    <aside class="sidebar" id="sidebar">
        <div class="sidebar-header">
            <h1>disinto-chat</h1>
            <button class="new-chat-btn" id="new-chat-btn">+ New Chat</button>
        </div>
        <div class="conversations-list" id="conversations-list">
            <!-- Conversations will be loaded here -->
        </div>
    </aside>
    <div class="main-content">
        <header>
            <h1>disinto-chat</h1>
        </header>
        <main>
            <div id="messages">
                <div class="message system">
                    <div class="role">system</div>
                    <div class="content">Welcome to disinto-chat. Type a message to start chatting with Claude.</div>
                </div>
            </div>
            <form class="input-area" id="chat-form">
                <textarea name="message" placeholder="Type your message..." required></textarea>
                <button type="submit" id="send-btn">Send</button>
            </form>
        </main>
    </div>
    <script>
        // State
        let currentConversationId = null;
        let conversations = [];
        // DOM elements
        const messagesDiv = document.getElementById('messages');
        const sendBtn = document.getElementById('send-btn');
        const textarea = document.querySelector('textarea');
        const conversationsList = document.getElementById('conversations-list');
        const newChatBtn = document.getElementById('new-chat-btn');
        const sidebar = document.getElementById('sidebar');
        const sidebarToggle = document.getElementById('sidebar-toggle');
        // Load conversations list
        async function loadConversations() {
            try {
                const response = await fetch('/chat/history');
                if (response.ok) {
                    conversations = await response.json();
                    renderConversationsList();
                }
            } catch (error) {
                console.error('Failed to load conversations:', error);
            }
        }
        // Render conversations list
        function renderConversationsList() {
            conversationsList.innerHTML = '';
            if (conversations.length === 0) {
                conversationsList.innerHTML = '<div style="padding: 1rem; color: #888; text-align: center; font-size: 0.875rem;">No conversations yet</div>';
                return;
            }
            conversations.forEach(conv => {
                const item = document.createElement('div');
                item.className = 'conversation-item';
                if (conv.id === currentConversationId) {
                    item.classList.add('active');
                }
                item.dataset.conversationId = conv.id;
                const previewDiv = document.createElement('div');
                previewDiv.className = 'preview';
                previewDiv.textContent = conv.preview || '(empty)';
                const metaDiv = document.createElement('div');
                metaDiv.className = 'meta';
                const date = conv.created_at ? new Date(conv.created_at).toLocaleDateString() : '';
                metaDiv.innerHTML = `${date} <span class="message-count">${conv.message_count || 0} msg${conv.message_count !== 1 ? 's' : ''}</span>`;
                item.appendChild(previewDiv);
                item.appendChild(metaDiv);
                item.addEventListener('click', () => loadConversation(conv.id));
                conversationsList.appendChild(item);
            });
        }
        // Load a specific conversation
        async function loadConversation(convId) {
            // Early-return if already showing this conversation
            if (convId === currentConversationId) {
                return;
            }
            // Clear messages
            messagesDiv.innerHTML = '';
            // Update active state in sidebar
            document.querySelectorAll('.conversation-item').forEach(item => {
                item.classList.remove('active');
            });
            document.querySelector(`[data-conversation-id="${convId}"]`)?.classList.add('active');
            currentConversationId = convId;
            try {
                const response = await fetch(`/chat/history/${convId}`);
                if (response.ok) {
                    const messages = await response.json();
                    if (messages && messages.length > 0) {
                        messages.forEach(msg => {
                            addMessage(msg.role, msg.content);
                        });
                    } else {
                        addSystemMessage('This conversation is empty');
                    }
                } else {
                    addSystemMessage('Failed to load conversation');
                }
            } catch (error) {
                console.error('Failed to load conversation:', error);
                addSystemMessage('Error loading conversation');
            }
            // Close sidebar on mobile
            if (window.innerWidth <= 768) {
                sidebar.classList.remove('open');
            }
        }
        // Create a new conversation
        async function createNewConversation() {
            try {
                const response = await fetch('/chat/new', { method: 'POST' });
                if (response.ok) {
                    const data = await response.json();
                    currentConversationId = data.conversation_id;
                    messagesDiv.innerHTML = '';
                    addSystemMessage('New conversation started');
                    await loadConversations();
                } else {
                    addSystemMessage('Failed to create new conversation');
                }
            } catch (error) {
                console.error('Failed to create new conversation:', error);
                addSystemMessage('Error creating new conversation');
            }
        }
        // Add message to display
        function addMessage(role, content, streaming = false) {
            const msgDiv = document.createElement('div');
            msgDiv.className = `message ${role}`;
            msgDiv.innerHTML = `
                <div class="role">${role}</div>
                <div class="content${streaming ? ' streaming' : ''}">${escapeHtml(content)}</div>
            `;
            messagesDiv.appendChild(msgDiv);
            messagesDiv.scrollTop = messagesDiv.scrollHeight;
            return msgDiv.querySelector('.content');
        }
        function addSystemMessage(content) {
            const msgDiv = document.createElement('div');
            msgDiv.className = 'message system';
            msgDiv.innerHTML = `
                <div class="role">system</div>
                <div class="content">${escapeHtml(content)}</div>
            `;
            messagesDiv.appendChild(msgDiv);
            messagesDiv.scrollTop = messagesDiv.scrollHeight;
        }
        function escapeHtml(text) {
            const div = document.createElement('div');
            div.textContent = text;
            return div.innerHTML.replace(/\n/g, '<br>');
        }
        // Send message handler
        async function sendMessage() {
            const message = textarea.value.trim();
            if (!message) return;
            // Disable input
            textarea.disabled = true;
            sendBtn.disabled = true;
            sendBtn.textContent = 'Sending...';
            // Add user message
            addMessage('user', message);
            textarea.value = '';
            // If no conversation ID, create one
            if (!currentConversationId) {
                await createNewConversation();
            }
            try {
                // Use fetch with URLSearchParams for application/x-www-form-urlencoded
                const params = new URLSearchParams();
                params.append('message', message);
                params.append('conversation_id', currentConversationId);
                const response = await fetch('/chat', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/x-www-form-urlencoded'
                    },
                    body: params
                });
                if (!response.ok) {
                    throw new Error(`HTTP ${response.status}`);
                }
                // Read the response as JSON (now returns JSON with response and conversation_id)
                const data = await response.json();
                addMessage('assistant', data.response);
            } catch (error) {
                addSystemMessage(`Error: ${error.message}`);
            } finally {
                textarea.disabled = false;
                sendBtn.disabled = false;
                sendBtn.textContent = 'Send';
                textarea.focus();
                messagesDiv.scrollTop = messagesDiv.scrollHeight;
                // Refresh conversations list
                await loadConversations();
            }
        }
        // Event listeners
        sendBtn.addEventListener('click', sendMessage);
        newChatBtn.addEventListener('click', createNewConversation);
        textarea.addEventListener('keydown', (e) => {
            if (e.key === 'Enter' && !e.shiftKey) {
                e.preventDefault();
                sendMessage();
            }
        });
        // Sidebar toggle for mobile
        sidebarToggle.addEventListener('click', () => {
            sidebar.classList.toggle('open');
        });
        // Close sidebar when clicking outside on mobile
        document.addEventListener('click', (e) => {
            if (window.innerWidth <= 768) {
                if (!sidebar.contains(e.target) && !sidebarToggle.contains(e.target)) {
                    sidebar.classList.remove('open');
                }
            }
        });
        // Initial focus
        textarea.focus();
        // Load conversations on page load
        loadConversations();
    </script>
 </body>
 </html>
--- a/docker/chat/ui/static/htmx.min.js
+++ b/docker/chat/ui/static/htmx.min.js
--- a/docker/edge/Dockerfile
+++ b/docker/edge/Dockerfile
@ -1,4 +1,7 @@
 FROM caddy:latest
-RUN apk add --no-cache bash jq curl git docker-cli python3
+RUN apk add --no-cache bash jq curl git docker-cli python3 openssh-client autossh
 COPY entrypoint-edge.sh /usr/local/bin/entrypoint-edge.sh
 VOLUME /data
 ENTRYPOINT ["bash", "/usr/local/bin/entrypoint-edge.sh"]
--- a/docker/edge/dispatcher.sh
+++ b/docker/edge/dispatcher.sh
@ -8,8 +8,8 @@
 # 2. Scan vault/actions/ for TOML files without .result.json
 # 3. Verify TOML arrived via merged PR with admin merger (Forgejo API)
 # 4. Validate TOML using vault-env.sh validator
-# 5. Decrypt .env.vault.enc and extract only declared secrets
+# 5. Decrypt declared secrets via load_secret (lib/env.sh)
-# 6. Launch: docker run --rm disinto/agents:latest <action-id>
+# 6. Launch: delegate to _launch_runner_{docker,nomad} backend
 # 7. Write <action-id>.result.json with exit code, timestamp, logs summary
 #
 # Part of #76.
@ -19,7 +19,7 @@ set -euo pipefail
 # Resolve script root (parent of lib/)
 SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-# Source shared environment
+# Source shared environment (provides load_secret, log helpers, etc.)
 source "${SCRIPT_ROOT}/../lib/env.sh"
 # Project TOML location: prefer mounted path, fall back to cloned path
@ -27,26 +27,18 @@ source "${SCRIPT_ROOT}/../lib/env.sh"
 # the shallow clone only has .toml.example files.
 PROJECTS_DIR="${PROJECTS_DIR:-${FACTORY_ROOT:-/opt/disinto}-projects}"
-# Load vault secrets after env.sh (env.sh unsets them for agent security)
+# -----------------------------------------------------------------------------
-# Vault secrets must be available to the dispatcher
+# Backend selection: DISPATCHER_BACKEND={docker,nomad}
-if [ -f "$FACTORY_ROOT/.env.vault.enc" ] && command -v sops &>/dev/null; then
+# Default: docker.  nomad lands as a pure addition during migration Step 5.
-  set -a
+# -----------------------------------------------------------------------------
-  eval "$(sops -d --output-type dotenv "$FACTORY_ROOT/.env.vault.enc" 2>/dev/null)" \
+DISPATCHER_BACKEND="${DISPATCHER_BACKEND:-docker}"
    || echo "Warning: failed to decrypt .env.vault.enc — vault secrets not loaded" >&2
  set +a
 elif [ -f "$FACTORY_ROOT/.env.vault" ]; then
  set -a
  # shellcheck source=/dev/null
  source "$FACTORY_ROOT/.env.vault"
  set +a
 fi
 # Ops repo location (vault/actions directory)
 OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/debian/disinto-ops}"
 VAULT_ACTIONS_DIR="${OPS_REPO_ROOT}/vault/actions"
 # Vault action validation
-VAULT_ENV="${SCRIPT_ROOT}/../vault/vault-env.sh"
+VAULT_ENV="${SCRIPT_ROOT}/../action-vault/vault-env.sh"
 # Admin users who can merge vault PRs (from issue #77)
 # Comma-separated list of Forgejo usernames with admin role
@ -350,33 +342,231 @@ get_dispatch_mode() {
  fi
 }
-# Write result file for an action
+# Commit result.json to the ops repo via git push (portable, no bind-mount).
-# Usage: write_result <action_id> <exit_code> <logs>
+#
-write_result() {
+# Clones the ops repo into a scratch directory, writes the result file,
 # commits as vault-bot, and pushes to the primary branch.
 # Idempotent: skips if result.json already exists upstream.
 # Retries on push conflict with rebase-and-push (handles concurrent merges).
 #
 # Usage: commit_result_via_git <action_id> <exit_code> <logs>
 commit_result_via_git() {
  local action_id="$1"
  local exit_code="$2"
  local logs="$3"
-  local result_file="${VAULT_ACTIONS_DIR}/${action_id}.result.json"
+  local result_relpath="vault/actions/${action_id}.result.json"
  local ops_clone_url="${FORGE_URL}/${FORGE_OPS_REPO}.git"
  local branch="${PRIMARY_BRANCH:-main}"
  local scratch_dir
  scratch_dir=$(mktemp -d /tmp/dispatcher-result-XXXXXX)
  # shellcheck disable=SC2064
  trap "rm -rf '${scratch_dir}'" RETURN
  # Shallow clone of the ops repo — only the primary branch
  if ! git clone --depth 1 --branch "$branch" \
    "$ops_clone_url" "$scratch_dir" 2>/dev/null; then
    log "ERROR: Failed to clone ops repo for result commit (action ${action_id})"
    return 1
  fi
  # Idempotency: skip if result.json already exists upstream
  if [ -f "${scratch_dir}/${result_relpath}" ]; then
    log "Result already exists upstream for ${action_id} — skipping commit"
    return 0
  fi
  # Configure git identity as vault-bot
  git -C "$scratch_dir" config user.name "vault-bot"
  git -C "$scratch_dir" config user.email "vault-bot@disinto.local"
  # Truncate logs if too long (keep last 1000 chars)
  if [ ${#logs} -gt 1000 ]; then
    logs="${logs: -1000}"
  fi
-  # Write result JSON
+  # Write result JSON via jq (never string-interpolate into JSON)
  mkdir -p "$(dirname "${scratch_dir}/${result_relpath}")"
  jq -n \
    --arg id "$action_id" \
    --argjson exit_code "$exit_code" \
    --arg timestamp "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" \
    --arg logs "$logs" \
    '{id: $id, exit_code: $exit_code, timestamp: $timestamp, logs: $logs}' \
-    > "$result_file"
+    > "${scratch_dir}/${result_relpath}"
-  log "Result written: ${result_file}"
+  git -C "$scratch_dir" add "$result_relpath"
  git -C "$scratch_dir" commit -q -m "vault: result for ${action_id}"
  # Push with retry on conflict (rebase-and-push pattern).
  # Common case: admin merges another action PR between our clone and push.
  local attempt
  for attempt in 1 2 3; do
    if git -C "$scratch_dir" push origin "$branch" 2>/dev/null; then
      log "Result committed and pushed for ${action_id} (attempt ${attempt})"
      return 0
    fi
    log "Push conflict for ${action_id} (attempt ${attempt}/3) — rebasing"
    if ! git -C "$scratch_dir" pull --rebase origin "$branch" 2>/dev/null; then
      # Rebase conflict — check if result was pushed by another process
      git -C "$scratch_dir" rebase --abort 2>/dev/null || true
      if git -C "$scratch_dir" fetch origin "$branch" 2>/dev/null && \
         git -C "$scratch_dir" show "origin/${branch}:${result_relpath}" >/dev/null 2>&1; then
        log "Result already exists upstream for ${action_id} (pushed by another process)"
        return 0
      fi
    fi
  done
  log "ERROR: Failed to push result for ${action_id} after 3 attempts"
  return 1
 }
-# Launch runner for the given action
+# Write result file for an action via git push to the ops repo.
 # Usage: write_result <action_id> <exit_code> <logs>
 write_result() {
  local action_id="$1"
  local exit_code="$2"
  local logs="$3"
  commit_result_via_git "$action_id" "$exit_code" "$logs"
 }
 # -----------------------------------------------------------------------------
 # Pluggable launcher backends
 # -----------------------------------------------------------------------------
 # _launch_runner_docker ACTION_ID SECRETS_CSV MOUNTS_CSV
 #
 # Builds and executes a `docker run` command for the vault runner.
 # Secrets are resolved via load_secret (lib/env.sh).
 # Returns: exit code of the docker run.  Stdout/stderr are captured to a temp
 #          log file whose path is printed to stdout (caller reads it).
 _launch_runner_docker() {
  local action_id="$1"
  local secrets_csv="$2"
  local mounts_csv="$3"
  local -a cmd=(docker run --rm
    --name "vault-runner-${action_id}"
    --network host
    --entrypoint bash
    -e DISINTO_CONTAINER=1
    -e "FORGE_URL=${FORGE_URL}"
    -e "FORGE_TOKEN=${FORGE_TOKEN}"
    -e "FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}"
    -e "FORGE_OPS_REPO=${FORGE_OPS_REPO:-}"
    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
  )
  # Pass through optional env vars if set
  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
  fi
  if [ -n "${CLAUDE_MODEL:-}" ]; then
    cmd+=(-e "CLAUDE_MODEL=${CLAUDE_MODEL}")
  fi
  # Mount docker socket, claude binary, and claude config
  cmd+=(-v /var/run/docker.sock:/var/run/docker.sock)
  if [ -f /usr/local/bin/claude ]; then
    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
  fi
  local runtime_home="${HOME:-/home/debian}"
  if [ -d "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}" ]; then
    cmd+=(-v "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}")
    cmd+=(-e "CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}")
  fi
  if [ -f "${runtime_home}/.claude.json" ]; then
    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
  fi
  # Add environment variables for secrets (resolved via load_secret)
  if [ -n "$secrets_csv" ]; then
    local secret
    for secret in $(echo "$secrets_csv" | tr ',' ' '); do
      secret=$(echo "$secret" | xargs)
      [ -n "$secret" ] || continue
      local secret_val
      secret_val=$(load_secret "$secret") || true
      if [ -z "$secret_val" ]; then
        log "ERROR: Secret '${secret}' could not be resolved for action ${action_id}"
        write_result "$action_id" 1 "Secret not found: ${secret}"
        return 1
      fi
      cmd+=(-e "${secret}=${secret_val}")
    done
  fi
  # Add volume mounts for file-based credentials
  if [ -n "$mounts_csv" ]; then
    local mount_alias
    for mount_alias in $(echo "$mounts_csv" | tr ',' ' '); do
      mount_alias=$(echo "$mount_alias" | xargs)
      [ -n "$mount_alias" ] || continue
      case "$mount_alias" in
        ssh)
          cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
          ;;
        gpg)
          cmd+=(-v "${runtime_home}/.gnupg:/home/agent/.gnupg:ro")
          ;;
        sops)
          cmd+=(-v "${runtime_home}/.config/sops/age:/home/agent/.config/sops/age:ro")
          ;;
        *)
          log "ERROR: Unknown mount alias '${mount_alias}' for action ${action_id}"
          write_result "$action_id" 1 "Unknown mount alias: ${mount_alias}"
          return 1
          ;;
      esac
    done
  fi
  # Mount the ops repo so the runner entrypoint can read the action TOML
  cmd+=(-v "${OPS_REPO_ROOT}:/home/agent/ops:ro")
  # Image and entrypoint arguments: runner entrypoint + action-id
  cmd+=(disinto/agents:latest /home/agent/disinto/docker/runner/entrypoint-runner.sh "$action_id")
  log "Running: docker run --rm vault-runner-${action_id} (secrets: ${secrets_csv:-none}, mounts: ${mounts_csv:-none})"
  # Create temp file for logs
  local log_file
  log_file=$(mktemp /tmp/dispatcher-logs-XXXXXX)
  trap 'rm -f "$log_file"' RETURN
  # Execute with array expansion (safe from shell injection)
  "${cmd[@]}" > "$log_file" 2>&1
  local exit_code=$?
  # Read logs summary
  local logs
  logs=$(cat "$log_file")
  # Write result file
  write_result "$action_id" "$exit_code" "$logs"
  if [ $exit_code -eq 0 ]; then
    log "Runner completed successfully for action: ${action_id}"
  else
    log "Runner failed for action: ${action_id} (exit code: ${exit_code})"
  fi
  return $exit_code
 }
 # _launch_runner_nomad ACTION_ID SECRETS_CSV MOUNTS_CSV
 #
 # Nomad backend stub — will be implemented in migration Step 5.
 _launch_runner_nomad() {
  echo "nomad backend not yet implemented" >&2
  return 1
 }
 # Launch runner for the given action (backend-agnostic orchestrator)
 # Usage: launch_runner <toml_file>
 launch_runner() {
  local toml_file="$1"
@ -409,123 +599,94 @@ launch_runner() {
    log "Action ${action_id}: admin merge verified"
  fi
-  # Extract secrets from validated action
+  # Build CSV lists from validated action metadata
-  local secrets_array
+  local secrets_csv=""
-  secrets_array="${VAULT_ACTION_SECRETS:-}"
+  if [ -n "${VAULT_ACTION_SECRETS:-}" ]; then
    # Convert space-separated to comma-separated
    secrets_csv=$(echo "${VAULT_ACTION_SECRETS}" | xargs | tr ' ' ',')
  fi
  local mounts_csv=""
  if [ -n "${VAULT_ACTION_MOUNTS:-}" ]; then
    mounts_csv=$(echo "${VAULT_ACTION_MOUNTS}" | xargs | tr ' ' ',')
  fi
  # Delegate to the selected backend
  "_launch_runner_${DISPATCHER_BACKEND}" "$action_id" "$secrets_csv" "$mounts_csv"
 }
 # -----------------------------------------------------------------------------
 # Pluggable sidecar launcher (reproduce / triage / verify)
 # -----------------------------------------------------------------------------
 # _dispatch_sidecar_docker CONTAINER_NAME ISSUE_NUM PROJECT_TOML IMAGE [FORMULA]
 #
 # Launches a sidecar container via docker run (background, pid-tracked).
 # Prints the background PID to stdout.
 _dispatch_sidecar_docker() {
  local container_name="$1"
  local issue_number="$2"
  local project_toml="$3"
  local image="$4"
  local formula="${5:-}"
  # Build docker run command (self-contained, no compose context needed).
  # The edge container has the Docker socket but not the host's compose project,
  # so docker compose run would fail with exit 125. docker run is self-contained:
  # the dispatcher knows the image, network, env vars, and entrypoint.
  local -a cmd=(docker run --rm
-    --name "vault-runner-${action_id}"
+    --name "${container_name}"
    --network host
-    --entrypoint bash
+    --security-opt apparmor=unconfined
-    -e DISINTO_CONTAINER=1
+    -v /var/run/docker.sock:/var/run/docker.sock
    -v agent-data:/home/agent/data
    -v project-repos:/home/agent/repos
    -e "FORGE_URL=${FORGE_URL}"
    -e "FORGE_TOKEN=${FORGE_TOKEN}"
-    -e "FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}"
+    -e "FORGE_REPO=${FORGE_REPO}"
    -e "FORGE_OPS_REPO=${FORGE_OPS_REPO:-}"
    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
    -e DISINTO_CONTAINER=1
  )
-  # Pass through optional env vars if set
+  # Set formula if provided
  if [ -n "$formula" ]; then
    cmd+=(-e "DISINTO_FORMULA=${formula}")
  fi
  # Pass through ANTHROPIC_API_KEY if set
  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
  fi
  if [ -n "${CLAUDE_MODEL:-}" ]; then
    cmd+=(-e "CLAUDE_MODEL=${CLAUDE_MODEL}")
  fi
-  # Mount docker socket, claude binary, and claude config
+  # Mount shared Claude config dir and ~/.ssh from the runtime user's home
  cmd+=(-v /var/run/docker.sock:/var/run/docker.sock)
  if [ -f /usr/local/bin/claude ]; then
    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
  fi
  local runtime_home="${HOME:-/home/debian}"
-  if [ -d "${runtime_home}/.claude" ]; then
+  if [ -d "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}" ]; then
-    cmd+=(-v "${runtime_home}/.claude:/home/agent/.claude")
+    cmd+=(-v "${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}")
    cmd+=(-e "CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}")
  fi
  if [ -f "${runtime_home}/.claude.json" ]; then
    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
  fi
-
+  if [ -d "${runtime_home}/.ssh" ]; then
-  # Add environment variables for secrets (if any declared)
+    cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
-  if [ -n "$secrets_array" ]; then
+  fi
-    for secret in $secrets_array; do
+  if [ -f /usr/local/bin/claude ]; then
-      secret=$(echo "$secret" | xargs)
+    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
      if [ -n "$secret" ]; then
        # Verify secret exists in vault
        if [ -z "${!secret:-}" ]; then
          log "ERROR: Secret '${secret}' not found in vault for action ${action_id}"
          write_result "$action_id" 1 "Secret not found in vault: ${secret}"
          return 1
        fi
        cmd+=(-e "${secret}=${!secret}")
      fi
    done
  else
    log "Action ${action_id} has no secrets declared — runner will execute without extra env vars"
  fi
-  # Add volume mounts for file-based credentials (if any declared)
+  # Mount the project TOML into the container at a stable path
-  local mounts_array
+  local container_toml="/home/agent/project.toml"
-  mounts_array="${VAULT_ACTION_MOUNTS:-}"
+  cmd+=(-v "${project_toml}:${container_toml}:ro")
  if [ -n "$mounts_array" ]; then
    for mount_alias in $mounts_array; do
      mount_alias=$(echo "$mount_alias" | xargs)
      [ -n "$mount_alias" ] || continue
      case "$mount_alias" in
        ssh)
          cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
          ;;
        gpg)
          cmd+=(-v "${runtime_home}/.gnupg:/home/agent/.gnupg:ro")
          ;;
        sops)
          cmd+=(-v "${runtime_home}/.config/sops/age:/home/agent/.config/sops/age:ro")
          ;;
        *)
          log "ERROR: Unknown mount alias '${mount_alias}' for action ${action_id}"
          write_result "$action_id" 1 "Unknown mount alias: ${mount_alias}"
          return 1
          ;;
      esac
    done
  fi
-  # Mount the ops repo so the runner entrypoint can read the action TOML
+  cmd+=("${image}" "$container_toml" "$issue_number")
  cmd+=(-v "${OPS_REPO_ROOT}:/home/agent/ops:ro")
-  # Image and entrypoint arguments: runner entrypoint + action-id
+  # Launch in background
-  cmd+=(disinto/agents:latest /home/agent/disinto/docker/runner/entrypoint-runner.sh "$action_id")
+  "${cmd[@]}" &
  echo $!
 }
-  log "Running: docker run --rm vault-runner-${action_id} (secrets: ${secrets_array:-none}, mounts: ${mounts_array:-none})"
+# _dispatch_sidecar_nomad CONTAINER_NAME ISSUE_NUM PROJECT_TOML IMAGE [FORMULA]
-
+#
-  # Create temp file for logs
+# Nomad sidecar backend stub — will be implemented in migration Step 5.
-  local log_file
+_dispatch_sidecar_nomad() {
-  log_file=$(mktemp /tmp/dispatcher-logs-XXXXXX)
+  echo "nomad backend not yet implemented" >&2
-  trap 'rm -f "$log_file"' RETURN
+  return 1
  # Execute with array expansion (safe from shell injection)
  # Capture stdout and stderr to log file
  "${cmd[@]}" > "$log_file" 2>&1
  local exit_code=$?
  # Read logs summary
  local logs
  logs=$(cat "$log_file")
  # Write result file
  write_result "$action_id" "$exit_code" "$logs"
  if [ $exit_code -eq 0 ]; then
    log "Runner completed successfully for action: ${action_id}"
  else
    log "Runner failed for action: ${action_id} (exit code: ${exit_code})"
  fi
  return $exit_code
 }
 # -----------------------------------------------------------------------------
@ -606,51 +767,13 @@ dispatch_reproduce() {
  log "Dispatching reproduce-agent for issue #${issue_number} (project: ${project_toml})"
-  # Build docker run command using array (safe from injection)
+  local bg_pid
-  local -a cmd=(docker run --rm
+  bg_pid=$("_dispatch_sidecar_${DISPATCHER_BACKEND}" \
-    --name "disinto-reproduce-${issue_number}"
+    "disinto-reproduce-${issue_number}" \
-    --network host
+    "$issue_number" \
-    --security-opt apparmor=unconfined
+    "$project_toml" \
-    -v /var/run/docker.sock:/var/run/docker.sock
+    "disinto-reproduce:latest")
    -v agent-data:/home/agent/data
    -v project-repos:/home/agent/repos
    -e "FORGE_URL=${FORGE_URL}"
    -e "FORGE_TOKEN=${FORGE_TOKEN}"
    -e "FORGE_REPO=${FORGE_REPO}"
    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
    -e DISINTO_CONTAINER=1
  )
  # Pass through ANTHROPIC_API_KEY if set
  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
  fi
  # Mount ~/.claude and ~/.ssh from the runtime user's home if available
  local runtime_home="${HOME:-/home/debian}"
  if [ -d "${runtime_home}/.claude" ]; then
    cmd+=(-v "${runtime_home}/.claude:/home/agent/.claude")
  fi
  if [ -f "${runtime_home}/.claude.json" ]; then
    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
  fi
  if [ -d "${runtime_home}/.ssh" ]; then
    cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
  fi
  # Mount claude CLI binary if present on host
  if [ -f /usr/local/bin/claude ]; then
    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
  fi
  # Mount the project TOML into the container at a stable path
  local container_toml="/home/agent/project.toml"
  cmd+=(-v "${project_toml}:${container_toml}:ro")
  cmd+=(disinto-reproduce:latest "$container_toml" "$issue_number")
  # Launch in background; write pid-file so we don't double-launch
  "${cmd[@]}" &
  local bg_pid=$!
  echo "$bg_pid" > "$(_reproduce_lockfile "$issue_number")"
  log "Reproduce container launched (pid ${bg_pid}) for issue #${issue_number}"
 }
@ -730,52 +853,14 @@ dispatch_triage() {
  log "Dispatching triage-agent for issue #${issue_number} (project: ${project_toml})"
-  # Build docker run command using array (safe from injection)
+  local bg_pid
-  local -a cmd=(docker run --rm
+  bg_pid=$("_dispatch_sidecar_${DISPATCHER_BACKEND}" \
-    --name "disinto-triage-${issue_number}"
+    "disinto-triage-${issue_number}" \
-    --network host
+    "$issue_number" \
-    --security-opt apparmor=unconfined
+    "$project_toml" \
-    -v /var/run/docker.sock:/var/run/docker.sock
+    "disinto-reproduce:latest" \
-    -v agent-data:/home/agent/data
+    "triage")
    -v project-repos:/home/agent/repos
    -e "FORGE_URL=${FORGE_URL}"
    -e "FORGE_TOKEN=${FORGE_TOKEN}"
    -e "FORGE_REPO=${FORGE_REPO}"
    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
    -e DISINTO_CONTAINER=1
    -e DISINTO_FORMULA=triage
  )
  # Pass through ANTHROPIC_API_KEY if set
  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
  fi
  # Mount ~/.claude and ~/.ssh from the runtime user's home if available
  local runtime_home="${HOME:-/home/debian}"
  if [ -d "${runtime_home}/.claude" ]; then
    cmd+=(-v "${runtime_home}/.claude:/home/agent/.claude")
  fi
  if [ -f "${runtime_home}/.claude.json" ]; then
    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
  fi
  if [ -d "${runtime_home}/.ssh" ]; then
    cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
  fi
  # Mount claude CLI binary if present on host
  if [ -f /usr/local/bin/claude ]; then
    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
  fi
  # Mount the project TOML into the container at a stable path
  local container_toml="/home/agent/project.toml"
  cmd+=(-v "${project_toml}:${container_toml}:ro")
  cmd+=(disinto-reproduce:latest "$container_toml" "$issue_number")
  # Launch in background; write pid-file so we don't double-launch
  "${cmd[@]}" &
  local bg_pid=$!
  echo "$bg_pid" > "$(_triage_lockfile "$issue_number")"
  log "Triage container launched (pid ${bg_pid}) for issue #${issue_number}"
 }
@ -931,52 +1016,14 @@ dispatch_verify() {
  log "Dispatching verification-agent for issue #${issue_number} (project: ${project_toml})"
-  # Build docker run command using array (safe from injection)
+  local bg_pid
-  local -a cmd=(docker run --rm
+  bg_pid=$("_dispatch_sidecar_${DISPATCHER_BACKEND}" \
-    --name "disinto-verify-${issue_number}"
+    "disinto-verify-${issue_number}" \
-    --network host
+    "$issue_number" \
-    --security-opt apparmor=unconfined
+    "$project_toml" \
-    -v /var/run/docker.sock:/var/run/docker.sock
+    "disinto-reproduce:latest" \
-    -v agent-data:/home/agent/data
+    "verify")
    -v project-repos:/home/agent/repos
    -e "FORGE_URL=${FORGE_URL}"
    -e "FORGE_TOKEN=${FORGE_TOKEN}"
    -e "FORGE_REPO=${FORGE_REPO}"
    -e "PRIMARY_BRANCH=${PRIMARY_BRANCH:-main}"
    -e DISINTO_CONTAINER=1
    -e DISINTO_FORMULA=verify
  )
  # Pass through ANTHROPIC_API_KEY if set
  if [ -n "${ANTHROPIC_API_KEY:-}" ]; then
    cmd+=(-e "ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}")
  fi
  # Mount ~/.claude and ~/.ssh from the runtime user's home if available
  local runtime_home="${HOME:-/home/debian}"
  if [ -d "${runtime_home}/.claude" ]; then
    cmd+=(-v "${runtime_home}/.claude:/home/agent/.claude")
  fi
  if [ -f "${runtime_home}/.claude.json" ]; then
    cmd+=(-v "${runtime_home}/.claude.json:/home/agent/.claude.json:ro")
  fi
  if [ -d "${runtime_home}/.ssh" ]; then
    cmd+=(-v "${runtime_home}/.ssh:/home/agent/.ssh:ro")
  fi
  # Mount claude CLI binary if present on host
  if [ -f /usr/local/bin/claude ]; then
    cmd+=(-v /usr/local/bin/claude:/usr/local/bin/claude:ro)
  fi
  # Mount the project TOML into the container at a stable path
  local container_toml="/home/agent/project.toml"
  cmd+=(-v "${project_toml}:${container_toml}:ro")
  cmd+=(disinto-reproduce:latest "$container_toml" "$issue_number")
  # Launch in background; write pid-file so we don't double-launch
  "${cmd[@]}" &
  local bg_pid=$!
  echo "$bg_pid" > "$(_verify_lockfile "$issue_number")"
  log "Verification container launched (pid ${bg_pid}) for issue #${issue_number}"
 }
@ -998,10 +1045,25 @@ ensure_ops_repo() {
 # Main dispatcher loop
 main() {
-  log "Starting dispatcher..."
+  log "Starting dispatcher (backend=${DISPATCHER_BACKEND})..."
  log "Polling ops repo: ${VAULT_ACTIONS_DIR}"
  log "Admin users: ${ADMIN_USERS}"
  # Validate backend selection at startup
  case "$DISPATCHER_BACKEND" in
    docker) ;;
    nomad)
      log "ERROR: nomad backend not yet implemented"
      echo "nomad backend not yet implemented" >&2
      exit 1
      ;;
    *)
      log "ERROR: unknown DISPATCHER_BACKEND=${DISPATCHER_BACKEND}"
      echo "unknown DISPATCHER_BACKEND=${DISPATCHER_BACKEND} (expected: docker, nomad)" >&2
      exit 1
      ;;
  esac
  while true; do
    # Refresh ops repo at the start of each poll cycle
    ensure_ops_repo
--- a/docker/edge/entrypoint-edge.sh
+++ b/docker/edge/entrypoint-edge.sh
@ -1,8 +1,9 @@
 #!/usr/bin/env bash
 set -euo pipefail
-# Set USER before sourcing env.sh (Alpine doesn't set USER)
+# Set USER and HOME before sourcing env.sh — preconditions for lib/env.sh (#674).
-export USER="${USER:-root}"
+export USER="${USER:-agent}"
 export HOME="${HOME:-/home/agent}"
 FORGE_URL="${FORGE_URL:-http://forgejo:3000}"
@ -36,21 +37,132 @@ if [ -z "${FORGE_REPO:-}" ]; then
  fi
 fi
-# Shallow clone at the pinned version (inject token to support auth-required Forgejo)
+# Detect bind-mount of a non-git directory before attempting clone
-if [ ! -d /opt/disinto/.git ]; then
+if [ -d /opt/disinto ] && [ ! -d /opt/disinto/.git ] && [ -n "$(ls -A /opt/disinto 2>/dev/null)" ]; then
-  _auth_url=$(printf '%s' "$FORGE_URL" | sed "s|://|://token:${FORGE_TOKEN}@|")
+  echo "FATAL: /opt/disinto contains files but no .git directory." >&2
-  git clone --depth 1 --branch "${DISINTO_VERSION:-main}" "${_auth_url}/${FORGE_REPO}.git" /opt/disinto
+  echo "If you bind-mounted a directory at /opt/disinto, ensure it is a git working tree." >&2
  echo "Sleeping 60s before exit to throttle the restart loop..." >&2
  sleep 60
  exit 1
 fi
-# Set HOME so that claude OAuth credentials and session.lock are found at the
+# Set HOME early so credential helper and git config land in the right place.
 # same in-container path as in disinto-agents (/home/agent/.claude), which makes
 # flock cross-serialize across containers on the same host inode.
 export HOME=/home/agent
 mkdir -p "$HOME"
 # Configure git credential helper before cloning (#604).
 # /opt/disinto does not exist yet so we cannot source lib/git-creds.sh;
 # inline a minimal credential-helper setup here.
 if [ -n "${FORGE_PASS:-}" ] && [ -n "${FORGE_URL:-}" ]; then
  _forge_host=$(printf '%s' "$FORGE_URL" | sed 's|https\?://||; s|/.*||')
  _forge_proto=$(printf '%s' "$FORGE_URL" | sed 's|://.*||')
  _bot_user=""
  if [ -n "${FORGE_TOKEN:-}" ]; then
    _bot_user=$(curl -sf -H "Authorization: token ${FORGE_TOKEN}" \
      "${FORGE_URL}/api/v1/user" 2>/dev/null | jq -r '.login // empty') || _bot_user=""
  fi
  _bot_user="${_bot_user:-dev-bot}"
  cat > "${HOME}/.git-credentials-helper" <<CREDEOF
 #!/bin/sh
 # Reads \$FORGE_PASS from env at runtime — file is safe to read on disk.
 [ "\$1" = "get" ] || exit 0
 cat >/dev/null
 echo "protocol=${_forge_proto}"
 echo "host=${_forge_host}"
 echo "username=${_bot_user}"
 echo "password=\$FORGE_PASS"
 CREDEOF
  chmod 755 "${HOME}/.git-credentials-helper"
  git config --global credential.helper "${HOME}/.git-credentials-helper"
  git config --global --add safe.directory '*'
 fi
 # Shallow clone at the pinned version — use clean URL, credential helper
 # supplies auth (#604).
 # Retry with exponential backoff — forgejo may still be starting (#665).
 if [ ! -d /opt/disinto/.git ]; then
  echo "edge: cloning ${FORGE_URL}/${FORGE_REPO} (branch ${DISINTO_VERSION:-main})..." >&2
  _clone_ok=false
  _backoff=2
  _max_backoff=30
  _max_attempts=10
  for _attempt in $(seq 1 "$_max_attempts"); do
    if git clone --depth 1 --branch "${DISINTO_VERSION:-main}" "${FORGE_URL}/${FORGE_REPO}.git" /opt/disinto 2>&1; then
      _clone_ok=true
      break
    fi
    rm -rf /opt/disinto  # clean up partial clone before retry
    if [ "$_attempt" -lt "$_max_attempts" ]; then
      echo "edge: clone attempt ${_attempt}/${_max_attempts} failed, retrying in ${_backoff}s..." >&2
      sleep "$_backoff"
      _backoff=$(( _backoff * 2 ))
      if [ "$_backoff" -gt "$_max_backoff" ]; then _backoff=$_max_backoff; fi
    fi
  done
  if [ "$_clone_ok" != "true" ]; then
    echo >&2
    echo "FATAL: failed to clone ${FORGE_URL}/${FORGE_REPO}.git (branch ${DISINTO_VERSION:-main}) after ${_max_attempts} attempts" >&2
    echo "Likely causes:" >&2
    echo "  - Forgejo at ${FORGE_URL} is unreachable from the edge container" >&2
    echo "  - Repository '${FORGE_REPO}' does not exist on this forge" >&2
    echo "  - FORGE_TOKEN/FORGE_PASS is invalid or has no read access to '${FORGE_REPO}'" >&2
    echo "  - Branch '${DISINTO_VERSION:-main}' does not exist in '${FORGE_REPO}'" >&2
    echo "Workaround: bind-mount a local git checkout into /opt/disinto." >&2
    echo "Sleeping 60s before exit to throttle the restart loop..." >&2
    sleep 60
    exit 1
  fi
 fi
 # Repair any legacy baked-credential URLs in /opt/disinto (#604).
 # Now that /opt/disinto exists, source the shared lib.
 if [ -f /opt/disinto/lib/git-creds.sh ]; then
  # shellcheck source=/opt/disinto/lib/git-creds.sh
  source /opt/disinto/lib/git-creds.sh
  _GIT_CREDS_LOG_FN="echo" repair_baked_cred_urls /opt/disinto
 fi
 # Ensure log directory exists
 mkdir -p /opt/disinto-logs
 # ── Reverse tunnel (optional) ──────────────────────────────────────────
 # When EDGE_TUNNEL_HOST is set, open a single reverse-SSH forward so the
 # DO edge box can reach this container's Caddy on the project's assigned port.
 # Guarded: if EDGE_TUNNEL_HOST is empty/unset the block is skipped entirely,
 # keeping local-only dev working without errors.
 if [ -n "${EDGE_TUNNEL_HOST:-}" ]; then
  _tunnel_key="/run/secrets/tunnel_key"
  if [ ! -f "$_tunnel_key" ]; then
    echo "WARN: EDGE_TUNNEL_HOST is set but ${_tunnel_key} is missing — skipping tunnel" >&2
  else
    # Ensure correct permissions (bind-mount may arrive as 644)
    chmod 0400 "$_tunnel_key" 2>/dev/null || true
    : "${EDGE_TUNNEL_USER:=tunnel}"
    : "${EDGE_TUNNEL_PORT:?EDGE_TUNNEL_PORT must be set when EDGE_TUNNEL_HOST is set}"
    export AUTOSSH_GATETIME=0   # don't exit if the first attempt fails quickly
    autossh -M 0 -N -f \
      -o StrictHostKeyChecking=accept-new \
      -o ServerAliveInterval=30 \
      -o ServerAliveCountMax=3 \
      -o ExitOnForwardFailure=yes \
      -i "$_tunnel_key" \
      -R "127.0.0.1:${EDGE_TUNNEL_PORT}:localhost:80" \
      "${EDGE_TUNNEL_USER}@${EDGE_TUNNEL_HOST}"
    echo "edge: reverse tunnel → ${EDGE_TUNNEL_HOST}:${EDGE_TUNNEL_PORT}" >&2
  fi
 fi
 # Set project context vars for scripts that source lib/env.sh (#674).
 # These satisfy env.sh's preconditions for edge-container scripts.
 export PROJECT_REPO_ROOT="${PROJECT_REPO_ROOT:-/opt/disinto}"
 export PRIMARY_BRANCH="${PRIMARY_BRANCH:-main}"
 export OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/agent/repos/${PROJECT_NAME:-disinto}-ops}"
 # Start dispatcher in background
 bash /opt/disinto/docker/edge/dispatcher.sh &
@ -61,6 +173,67 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}"
  sleep 1200  # 20 minutes
 done) &
 # ── Load required secrets from secrets/*.enc (#777) ────────────────────
 # Edge container declares its required secrets; missing ones cause a hard fail.
 _AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt"
 _SECRETS_DIR="/opt/disinto/secrets"
 EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG"
 _edge_decrypt_secret() {
  local enc_path="${_SECRETS_DIR}/${1}.enc"
  [ -f "$enc_path" ] || return 1
  age -d -i "$_AGE_KEY_FILE" "$enc_path" 2>/dev/null
 }
 if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then
  _missing=""
  for _secret_name in $EDGE_REQUIRED_SECRETS; do
    _val=$(_edge_decrypt_secret "$_secret_name") || { _missing="${_missing} ${_secret_name}"; continue; }
    export "$_secret_name=$_val"
  done
  if [ -n "$_missing" ]; then
    echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2
    echo "  Run 'disinto secrets add <NAME>' for each missing secret." >&2
    echo "  If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2
    exit 1
  fi
  echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2
 else
  echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2
  echo "  Ensure age is installed and secrets/*.enc files are present." >&2
  exit 1
 fi
 # Start daily engagement collection cron loop in background (#745)
 # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that
 # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777).
 (while true; do
  # Calculate seconds until next 23:50 UTC
  _now=$(date -u +%s)
  _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0)
  if [ "$_target" -le "$_now" ]; then
    _target=$(( _target + 86400 ))
  fi
  _sleep_secs=$(( _target - _now ))
  echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2
  sleep "$_sleep_secs"
  _fetch_log="/tmp/caddy-access-log-fetch.log"
  _ssh_key_file=$(mktemp)
  printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file"
  chmod 0600 "$_ssh_key_file"
  scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
    "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \
    "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true
  rm -f "$_ssh_key_file"
  if [ -s "$_fetch_log" ]; then
    CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \
      | tee -a /opt/disinto-logs/collect-engagement.log || true
  else
    echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2
  fi
  rm -f "$_fetch_log"
 done) &
 # Caddy as main process — run in foreground via wait so background jobs survive
 # (exec replaces the shell, which can orphan backgrounded subshells)
 caddy run --config /etc/caddy/Caddyfile --adapter caddyfile &
--- a/docker/reproduce/Dockerfile
+++ b/docker/reproduce/Dockerfile
@ -7,5 +7,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 RUN useradd -m -u 1000 -s /bin/bash agent
 COPY docker/reproduce/entrypoint-reproduce.sh /entrypoint-reproduce.sh
 RUN chmod +x /entrypoint-reproduce.sh
 VOLUME /home/agent/data
 VOLUME /home/agent/repos
 WORKDIR /home/agent
 ENTRYPOINT ["/entrypoint-reproduce.sh"]
--- a/docker/reproduce/entrypoint-reproduce.sh
+++ b/docker/reproduce/entrypoint-reproduce.sh
@ -15,7 +15,7 @@
 # Volumes expected:
 #   /home/agent/data          — agent-data volume (stack-lock files go here)
 #   /home/agent/repos         — project-repos volume
-#   /home/agent/.claude       — host ~/.claude (OAuth credentials)
+#   $CLAUDE_CONFIG_DIR        — shared Claude config dir (OAuth credentials)
 #   /home/agent/.ssh          — host ~/.ssh (read-only)
 #   /usr/local/bin/claude     — host claude CLI binary (read-only)
 #   /var/run/docker.sock      — host docker socket
@ -84,6 +84,19 @@ export DISINTO_CONTAINER=1
 export HOME="${HOME:-/home/agent}"
 export USER="${USER:-agent}"
 # Set project context vars for lib/env.sh surface contract (#674).
 # PROJECT_NAME and PROJECT_REPO_ROOT are set below after TOML parsing.
 export PRIMARY_BRANCH="${PRIMARY_BRANCH:-main}"
 # Configure git credential helper so reproduce/triage agents can clone/push
 # without needing tokens embedded in remote URLs (#604).
 if [ -f "${DISINTO_DIR}/lib/git-creds.sh" ]; then
  # shellcheck source=lib/git-creds.sh
  source "${DISINTO_DIR}/lib/git-creds.sh"
  # shellcheck disable=SC2119  # no args intended — uses defaults
  configure_git_creds
 fi
 FORGE_API="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
 # Load project name from TOML
@ -98,6 +111,8 @@ with open(sys.argv[1], 'rb') as f:
 export PROJECT_NAME
 PROJECT_REPO_ROOT="/home/agent/repos/${PROJECT_NAME}"
 export PROJECT_REPO_ROOT
 export OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/agent/repos/${PROJECT_NAME}-ops}"
 if [ "$AGENT_TYPE" = "triage" ]; then
  log "Starting triage-agent for issue #${ISSUE_NUMBER} (project: ${PROJECT_NAME})"
--- a/docker/runner/entrypoint-runner.sh
+++ b/docker/runner/entrypoint-runner.sh
@ -23,6 +23,15 @@ log() {
  printf '[%s] runner: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*"
 }
 # Configure git credential helper so formulas can clone/push without
 # needing tokens embedded in remote URLs (#604).
 if [ -f "${FACTORY_ROOT}/lib/git-creds.sh" ]; then
  # shellcheck source=lib/git-creds.sh
  source "${FACTORY_ROOT}/lib/git-creds.sh"
  # shellcheck disable=SC2119  # no args intended — uses defaults
  configure_git_creds
 fi
 # ── Argument parsing ─────────────────────────────────────────────────────
 action_id="${1:-}"
--- a/docs/CLAUDE-AUTH-CONCURRENCY.md
+++ b/docs/CLAUDE-AUTH-CONCURRENCY.md
@ -0,0 +1,138 @@
 # Claude Code OAuth Concurrency Model
 ## Problem statement
 The factory runs multiple concurrent Claude Code processes across
 containers. OAuth access tokens are short-lived; refresh tokens rotate
 on each use. If two processes POST the same refresh token to Anthropic's
 token endpoint simultaneously, only one wins — the other gets
 `invalid_grant` and the operator is forced to re-login.
 Claude Code already serializes OAuth refreshes internally using
 `proper-lockfile` (`src/utils/auth.ts:1485-1491`):
 ```typescript
 release = await lockfile.lock(claudeDir)
 ```
 `proper-lockfile` creates a lockfile via an atomic `mkdir(${path}.lock)`
 call — a cross-process primitive that works across any number of
 processes on the same filesystem. The problem was never the lock
 implementation; it was that our old per-container bind-mount layout
 (`~/.claude` mounted but `/home/agent/` container-local) caused each
 container to compute a different lockfile path, so the locks never
 coordinated.
 ## The fix: shared `CLAUDE_CONFIG_DIR`
 `CLAUDE_CONFIG_DIR` is an officially supported env var in Claude Code
 (`src/utils/envUtils.ts`). It controls where Claude resolves its config
 directory instead of the default `~/.claude`.
 By setting `CLAUDE_CONFIG_DIR` to a path on a shared bind mount, every
 container computes the **same** lockfile location. `proper-lockfile`'s
 atomic `mkdir(${CLAUDE_CONFIG_DIR}.lock)` then gives free cross-container
 serialization — no external wrapper needed.
 ## Current layout
 ```
 Host filesystem:
  /var/lib/disinto/claude-shared/          ← CLAUDE_SHARED_DIR
  └── config/                              ← CLAUDE_CONFIG_DIR
      ├── .credentials.json
      ├── settings.json
      └── ...
 Inside every container:
  Same absolute path: /var/lib/disinto/claude-shared/config
  Env: CLAUDE_CONFIG_DIR=/var/lib/disinto/claude-shared/config
 ```
 The shared directory is mounted at the **same absolute path** inside
 every container, so `proper-lockfile` resolves an identical lock path
 everywhere.
 ### Where these values are defined
 | What | Where |
 |------|-------|
 | Defaults for `CLAUDE_SHARED_DIR`, `CLAUDE_CONFIG_DIR` | `lib/env.sh:138-140` |
 | `.env` documentation | `.env.example:92-99` |
 | Container mounts + env passthrough (edge dispatcher) | `docker/edge/dispatcher.sh:446-448` (and analogous blocks for reproduce, triage, verify) |
 | Auth detection using `CLAUDE_CONFIG_DIR` | `docker/agents/entrypoint.sh:101-102` |
 | Bootstrap / migration during `disinto init` | `lib/claude-config.sh:setup_claude_config_dir()`, `bin/disinto:952-962` |
 ## Migration for existing dev boxes
 For operators upgrading from the old `~/.claude` bind-mount layout,
 `disinto init` handles the migration interactively (or with `--yes`).
 The manual equivalent is:
 ```bash
 # 1. Stop the factory
 disinto down
 # 2. Create the shared directory
 mkdir -p /var/lib/disinto/claude-shared
 # 3. Move existing config
 mv "$HOME/.claude" /var/lib/disinto/claude-shared/config
 # 4. Create a back-compat symlink so host-side claude still works
 ln -sfn /var/lib/disinto/claude-shared/config "$HOME/.claude"
 # 5. Export the env var (add to shell rc for persistence)
 export CLAUDE_CONFIG_DIR=/var/lib/disinto/claude-shared/config
 # 6. Start the factory
 disinto up
 ```
 ## Verification
 Watch for these analytics events during concurrent agent runs:
 | Event | Meaning |
 |-------|---------|
 | `tengu_oauth_token_refresh_lock_acquiring` | A process is attempting to acquire the refresh lock |
 | `tengu_oauth_token_refresh_lock_acquired` | Lock acquired; refresh proceeding |
 | `tengu_oauth_token_refresh_lock_retry` | Lock is held by another process; retrying |
 | `tengu_oauth_token_refresh_lock_race_resolved` | Contention detected and resolved normally |
 | `tengu_oauth_token_refresh_lock_retry_limit_reached` | Lock acquisition failed after all retries |
 **Healthy:** `_race_resolved` appearing during contention windows — this
 means multiple processes tried to refresh simultaneously and the lock
 correctly serialized them.
 **Bad:** `_lock_retry_limit_reached` — indicates the lock is stuck or
 the shared mount is not working. Verify that `CLAUDE_CONFIG_DIR` resolves
 to the same path in all containers and that the filesystem supports
 `mkdir` atomicity (any POSIX filesystem does).
 ## The deferred external `flock` wrapper
 `lib/agent-sdk.sh:139,144` still wraps every `claude` invocation in an
 external `flock` on `${HOME}/.claude/session.lock`:
 ```bash
 local lock_file="${HOME}/.claude/session.lock"
 ...
 output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1;
  claude_run_with_watchdog claude "${args[@]}" ) 9>"$lock_file" ...)
 ```
 With the `CLAUDE_CONFIG_DIR` fix in place, this external lock is
 **redundant but harmless** — `proper-lockfile` serializes the refresh
 internally, and `flock` serializes the entire invocation externally.
 The external flock remains as a defense-in-depth measure; removal is
 tracked as a separate vision-tier issue.
 ## See also
 - `lib/env.sh:138-140` — `CLAUDE_SHARED_DIR` / `CLAUDE_CONFIG_DIR` defaults
 - `lib/claude-config.sh` — migration helper used by `disinto init`
 - `lib/agent-sdk.sh:139,144` — the external `flock` wrapper (deferred removal)
 - `docker/agents/entrypoint.sh:101-102` — `CLAUDE_CONFIG_DIR` auth detection
 - `.env.example:92-99` — operator-facing documentation of the env vars
 - Issue #623 — chat container auth strategy
--- a/docs/VAULT.md
+++ b/docs/VAULT.md
@ -26,8 +26,8 @@ The `main` branch on the ops repo (`johba/disinto-ops`) is protected via Forgejo
 ## Vault PR Lifecycle
-1. **Request** — Agent calls `lib/vault.sh:vault_request()` with action TOML content
+1. **Request** — Agent calls `lib/action-vault.sh:vault_request()` with action TOML content
-2. **Validation** — TOML is validated against the schema in `vault/vault-env.sh`
+2. **Validation** — TOML is validated against the schema in `action-vault/vault-env.sh`
 3. **PR Creation** — A PR is created on `disinto-ops` with:
   - Branch: `vault/<action-id>`
   - Title: `vault: <action-id>`
@ -90,12 +90,12 @@ To verify the protection is working:
 - #73 — Vault redesign proposal
 - #74 — Vault action TOML schema
- #75 — Vault PR creation helper (`lib/vault.sh`)
+- #75 — Vault PR creation helper (`lib/action-vault.sh`)
 - #76 — Dispatcher rewrite (poll for merged vault PRs)
 - #77 — Branch protection on ops repo (this issue)
 ## See Also
- [`lib/vault.sh`](../lib/vault.sh) — Vault PR creation helper
+- [`lib/action-vault.sh`](../lib/action-vault.sh) — Vault PR creation helper
- [`vault/vault-env.sh`](../vault/vault-env.sh) — TOML validation
+- [`action-vault/vault-env.sh`](../action-vault/vault-env.sh) — TOML validation
 - [`lib/branch-protection.sh`](../lib/branch-protection.sh) — Branch protection helper
--- a/docs/agents-llama.md
+++ b/docs/agents-llama.md
@ -0,0 +1,59 @@
 # agents-llama — Local-Qwen Agents
 The `agents-llama` service is an optional compose service that runs agents
 backed by a local llama-server instance (e.g. Qwen) instead of the Anthropic
 API. It uses the same Docker image as the main `agents` service but connects to
 a local inference endpoint via `ANTHROPIC_BASE_URL`.
 Two profiles are available:
 | Profile | Service | Roles | Use case |
 |---------|---------|-------|----------|
 | _(default)_ | `agents-llama` | `dev` only | Conservative: single-role soak test |
 | `agents-llama-all` | `agents-llama-all` | all 7 (review, dev, gardener, architect, planner, predictor, supervisor) | Pre-migration: validate every role on llama before Nomad cutover |
 ## Enabling
 Set `ENABLE_LLAMA_AGENT=1` in `.env` (or `.env.enc`) and provide the required
 credentials:
 ```env
 ENABLE_LLAMA_AGENT=1
 FORGE_TOKEN_LLAMA=<dev-qwen API token>
 FORGE_PASS_LLAMA=<dev-qwen password>
 ANTHROPIC_BASE_URL=http://host.docker.internal:8081   # llama-server endpoint
 ```
 Then regenerate the compose file (`disinto init ...`) and bring the stack up.
 ### Running all 7 roles (agents-llama-all)
 ```bash
 docker compose --profile agents-llama-all up -d
 ```
 This starts the `agents-llama-all` container with all 7 bot roles against the
 local llama endpoint. The per-role forge tokens (`FORGE_REVIEW_TOKEN`,
 `FORGE_GARDENER_TOKEN`, etc.) must be set in `.env` — they are the same tokens
 used by the Claude-backed `agents` container.
 ## Prerequisites
 - **llama-server** (or compatible OpenAI-API endpoint) running on the host,
  reachable from inside Docker at the URL set in `ANTHROPIC_BASE_URL`.
 - A Forgejo bot user (e.g. `dev-qwen`) with its own API token and password,
  stored as `FORGE_TOKEN_LLAMA` / `FORGE_PASS_LLAMA`.
 ## Behaviour
 - `agents-llama`: `AGENT_ROLES=dev` — only picks up dev work.
 - `agents-llama-all`: `AGENT_ROLES=review,dev,gardener,architect,planner,predictor,supervisor` — runs all 7 roles.
 - `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE=60` — more aggressive compaction for smaller
  context windows.
 - Serialises on the llama-server's single KV cache (AD-002).
 ## Disabling
 Set `ENABLE_LLAMA_AGENT=0` (or leave it unset) and regenerate. The service
 block is omitted entirely from `docker-compose.yml`; the stack starts cleanly
 without it.
--- a/docs/edge-routing-fallback.md
+++ b/docs/edge-routing-fallback.md
@ -0,0 +1,149 @@
 # Edge Routing Fallback: Per-Project Subdomains
 > **Status:** Contingency plan. Only implement if subpath routing (#704 / #708)
 > proves unworkable.
 ## Context
 The primary approach routes services under subpaths of `<project>.disinto.ai`:
 | Service    | Primary (subpath)                          |
 |------------|--------------------------------------------|
 | Forgejo    | `<project>.disinto.ai/forge/`              |
 | Woodpecker | `<project>.disinto.ai/ci/`                 |
 | Chat       | `<project>.disinto.ai/chat/`               |
 | Staging    | `<project>.disinto.ai/staging/`            |
 The fallback uses per-service subdomains instead:
 | Service    | Fallback (subdomain)                       |
 |------------|--------------------------------------------|
 | Forgejo    | `forge.<project>.disinto.ai/`              |
 | Woodpecker | `ci.<project>.disinto.ai/`                 |
 | Chat       | `chat.<project>.disinto.ai/`               |
 | Staging    | `<project>.disinto.ai/`  (root)            |
 The wildcard cert from #621 already covers `*.<project>.disinto.ai` — no new
 DNS records or certs are needed for sub-subdomains because `*.disinto.ai`
 matches one level deep. For sub-subdomains like `forge.<project>.disinto.ai`
 we would need to add a second wildcard (`*.*.disinto.ai`) or explicit DNS
 records per project. Both are straightforward with the existing Gandi DNS-01
 setup.
 ## Pivot Decision Criteria
 **Pivot if:**
 - Forgejo `ROOT_URL` under a subpath (`/forge/`) causes redirect loops that
  cannot be fixed with `X-Forwarded-Prefix` or Caddy `uri strip_prefix`.
 - Woodpecker's `WOODPECKER_HOST` does not honour subpath prefixes, causing
  OAuth callback mismatches that persist after adjusting redirect URIs.
 - Forward-auth on `/chat/*` conflicts with Forgejo's own OAuth flow when both
  share the same origin (cookie collision, CSRF token mismatch).
 **Do NOT pivot if:**
 - Forgejo login redirects to `/` instead of `/forge/` — fixable with Caddy
  `handle_path` + `uri prefix` rewrite.
 - Woodpecker UI assets 404 under `/ci/` — fixable with asset prefix config
  (`WOODPECKER_ROOT_PATH`).
 - A single OAuth app needs a second redirect URI — Forgejo supports multiple
  `redirect_uris` in the same app.
 ## Fallback Topology
 ### Caddyfile
 Replace the single `:80` block with four host blocks:
 ```caddy
 # Main project domain — staging / landing
 <project>.disinto.ai {
    reverse_proxy staging:80
 }
 # Forgejo — root path, no subpath rewrite needed
 forge.<project>.disinto.ai {
    reverse_proxy forgejo:3000
 }
 # Woodpecker CI — root path
 ci.<project>.disinto.ai {
    reverse_proxy woodpecker:8000
 }
 # Chat — with forward_auth (same as #709, but on its own host)
 chat.<project>.disinto.ai {
    handle /login {
        reverse_proxy chat:8080
    }
    handle /oauth/callback {
        reverse_proxy chat:8080
    }
    handle /* {
        forward_auth chat:8080 {
            uri /auth/verify
            copy_headers X-Forwarded-User
            header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}
        }
        reverse_proxy chat:8080
    }
 }
 ```
 **Current file:** `docker/Caddyfile` (generated by `lib/generators.sh:_generate_caddyfile_impl`, line ~596).
 ### Service Configuration Changes
 | Variable / Setting         | Current (subpath)                              | Fallback (subdomain)                            | File                        |
 |----------------------------|------------------------------------------------|-------------------------------------------------|-----------------------------|
 | Forgejo `ROOT_URL`         | `https://<project>.disinto.ai/forge/`          | `https://forge.<project>.disinto.ai/`           | forgejo `app.ini`           |
 | `WOODPECKER_HOST`          | `http://localhost:8000` (subpath via proxy)     | `https://ci.<project>.disinto.ai`               | `lib/ci-setup.sh` line ~164 |
 | Woodpecker OAuth redirect  | `https://<project>.disinto.ai/ci/authorize`    | `https://ci.<project>.disinto.ai/authorize`     | `lib/ci-setup.sh` line ~153 |
 | Chat OAuth redirect        | `https://<project>.disinto.ai/chat/oauth/callback` | `https://chat.<project>.disinto.ai/oauth/callback` | `lib/ci-setup.sh` line ~188 |
 | `EDGE_TUNNEL_FQDN`         | `<project>.disinto.ai`                         | unchanged (main domain)                         | `lib/generators.sh` line ~432 |
 ### New Environment Variables (pivot only)
 These would be added to `lib/generators.sh` `_generate_compose_impl()` in the
 edge service environment block (currently line ~415):
 | Variable                     | Value                                  |
 |------------------------------|----------------------------------------|
 | `EDGE_TUNNEL_FQDN_FORGE`    | `forge.<project>.disinto.ai`           |
 | `EDGE_TUNNEL_FQDN_CI`       | `ci.<project>.disinto.ai`              |
 | `EDGE_TUNNEL_FQDN_CHAT`     | `chat.<project>.disinto.ai`            |
 ### DNS
 No new records needed if the registrar supports `*.*.disinto.ai` wildcards.
 Otherwise, add explicit A/CNAME records per project:
 ```
 forge.<project>.disinto.ai  → edge server IP
 ci.<project>.disinto.ai     → edge server IP
 chat.<project>.disinto.ai   → edge server IP
 ```
 The edge server already handles TLS via Caddy's automatic HTTPS with the
 existing ACME / DNS-01 challenge.
 ### Edge Control (`tools/edge-control/register.sh`)
 Currently `do_register()` creates a single route for `<project>.disinto.ai`.
 The fallback would need to register four routes (or accept a `--subdomain`
 parameter). See the TODO in `register.sh`.
 ## Files to Change on Pivot
 | File                              | What changes                                                    |
 |-----------------------------------|-----------------------------------------------------------------|
 | `docker/Caddyfile`               | Replace single host block → four host blocks (see above)        |
 | `lib/generators.sh`              | Add `EDGE_TUNNEL_FQDN_{FORGE,CI,CHAT}` env vars to compose     |
 | `lib/ci-setup.sh` ~line 153      | Woodpecker OAuth redirect URI → `ci.<project>` subdomain        |
 | `lib/ci-setup.sh` ~line 188      | Chat OAuth redirect URI → `chat.<project>` subdomain            |
 | `tools/edge-control/register.sh` | Register four routes per project instead of one                 |
 | `tools/edge-control/lib/caddy.sh`| `add_route()` gains subdomain support                           |
 | forgejo `app.ini`                 | `ROOT_URL` → `https://forge.<project>.disinto.ai/`             |
 Estimated effort for a full pivot: **under one day** given this plan.
--- a/docs/investigation-685-reviewer-approved-destructive-compose.md
+++ b/docs/investigation-685-reviewer-approved-destructive-compose.md
@ -0,0 +1,123 @@
 # Investigation: Reviewer approved destructive compose rewrite in PR #683
 **Issue**: #685
 **Date**: 2026-04-11
 **PR under investigation**: #683 (fix: config: gardener=1h, architect=9m, planner=11m)
 ## Summary
 The reviewer agent approved PR #683 in ~1 minute without flagging that it
 contained a destructive rewrite of `docker-compose.yml` — dropping named
 volumes, bind mounts, env vars, restart policy, and security options. Six
 structural gaps in the review pipeline allowed this to pass.
 ## Root causes
 ### 1. No infrastructure-file-specific review checklist
 The review formula (`formulas/review-pr.toml`) has a generic review checklist
 (bugs, security, imports, architecture, bash specifics, dead code). It has
 **no special handling for infrastructure files** — `docker-compose.yml`,
 `Dockerfile`, CI configs, or `entrypoint.sh` are reviewed with the same
 checklist as application code.
 Infrastructure files have a different failure mode: a single dropped line
 (a volume mount, an env var, a restart policy) can break a running deployment
 without any syntax error or linting failure. The generic checklist doesn't
 prompt the reviewer to check for these regressions.
 **Fix applied**: Added step 3c "Infrastructure file review" to
 `formulas/review-pr.toml` with a compose-specific checklist covering named
 volumes, bind mounts, env vars, restart policy, and security options.
 ### 2. No scope discipline
 Issue #682 asked for ~3 env var changes + `PLANNER_INTERVAL` plumbing — roughly
 10-15 lines across 3-4 files. PR #683's diff rewrote the entire compose service
 block (~50+ lines changed in `docker-compose.yml` alone).
 The review formula **does not instruct the reviewer to compare diff size against
 issue scope**. A scope-aware reviewer would flag: "this PR changes more lines
 than the issue scope warrants — request justification for out-of-scope changes."
 **Fix applied**: Added step 3d "Scope discipline" to `formulas/review-pr.toml`
 requiring the reviewer to compare actual changes against stated issue scope and
 flag out-of-scope modifications to infrastructure files.
 ### 3. Lessons-learned bias toward approval
 The reviewer's `.profile/knowledge/lessons-learned.md` contains multiple entries
 that systematically bias toward approval:
 - "Approval means 'ready to ship,' not 'perfect.'"
 - "'Different from how I'd write it' is not a blocker."
 - "Reserve request_changes for genuinely blocking concerns."
 These lessons are well-intentioned (they prevent nit-picking and false blocks)
 but they create a blind spot: the reviewer suppresses its instinct to flag
 suspicious-looking changes because the lessons tell it not to block on
 "taste-based" concerns. A compose service block rewrite *looks* like a style
 preference ("the dev reorganized the file") but is actually a correctness
 regression.
 **Recommendation**: The lessons-learned are not wrong — they should stay. But
 the review formula now explicitly carves out infrastructure files from the
 "bias toward APPROVE" guidance, making it clear that dropped infra
 configuration is a blocking concern, not a style preference.
 ### 4. No ground-truth for infrastructure files
 The reviewer only sees the diff. It has no way to compare against the running
 container's actual volume/env config. When dev-qwen rewrote a 30-line service
 block from scratch, the reviewer saw a 30-line addition and a 30-line deletion
 with no reference point.
 **Recommendation (future work)**: Maintain a `docker/expected-compose-config.yml`
 or have the reviewer fetch `docker compose config` output as ground truth when
 reviewing compose changes. This would let the reviewer diff the proposed config
 against the known-good config.
 ### 5. Structural analysis blind spot
 `lib/build-graph.py` tracks changes to files in `formulas/`, agent directories
 (`dev/`, `review/`, etc.), and `evidence/`. It does **not track infrastructure
 files** (`docker-compose.yml`, `docker/`, `.woodpecker/`). Changes to these
 files produce no alerts in the graph report — the reviewer gets no
 "affected objectives" signal for infrastructure changes.
 **Recommendation (future work)**: Add infrastructure file tracking to
 `build-graph.py` so that compose/Dockerfile/CI changes surface in the
 structural analysis.
 ### 6. Model and time budget
 Reviews use Sonnet (`CLAUDE_MODEL="sonnet"` at `review-pr.sh:229`) with a
 15-minute timeout. The PR #683 review completed in ~1 minute. Sonnet is
 optimized for speed, which is appropriate for most code reviews, but
 infrastructure changes benefit from the deeper reasoning of a more capable
 model.
 **Recommendation (future work)**: Consider escalating to a more capable model
 when the diff includes infrastructure files (compose, Dockerfiles, CI configs).
 ## Changes made
 1. **`formulas/review-pr.toml`** — Added two new review steps:
   - **Step 3c: Infrastructure file review** — When the diff touches
     `docker-compose.yml`, `Dockerfile*`, `.woodpecker/`, or `docker/`,
     requires checking for dropped volumes, bind mounts, env vars, restart
     policy, security options, and network config. Instructs the reviewer to
     read the full file (not just the diff) and compare against the base branch.
   - **Step 3d: Scope discipline** — Requires comparing the actual diff
     footprint against the stated issue scope. Flags out-of-scope rewrites of
     infrastructure files as blocking concerns.
 ## What would have caught this
 With the changes above, the reviewer would have:
 1. Seen step 3c trigger for `docker-compose.yml` changes
 2. Read the full compose file and compared against the base branch
 3. Noticed the dropped named volumes, bind mounts, env vars, restart policy
 4. Seen step 3d flag that a 3-env-var issue produced a 50+ line compose rewrite
 5. Issued REQUEST_CHANGES citing specific dropped configuration
--- a/docs/mirror-bootstrap.md
+++ b/docs/mirror-bootstrap.md
@ -0,0 +1,59 @@
 # Mirror Bootstrap — Pull-Mirror Cutover Path
 How to populate an empty Forgejo repo from an external source using
 `lib/mirrors.sh`'s `mirror_pull_register()`.
 ## Prerequisites
 | Variable | Example | Purpose |
 |---|---|---|
 | `FORGE_URL` | `http://forgejo:3000` | Forgejo instance base URL |
 | `FORGE_API_BASE` | `${FORGE_URL}/api/v1` | Global API base (set by `lib/env.sh`) |
 | `FORGE_TOKEN` | (admin or org-owner token) | Must have `repo:create` scope |
 The target org/user must already exist on the Forgejo instance.
 ## Command
 ```bash
 source lib/env.sh
 source lib/mirrors.sh
 # Register a pull mirror — creates the repo and starts the first sync.
 mirror_pull_register \
  "https://codeberg.org/johba/disinto.git" \   # source URL
  "disinto-admin" \                             # target owner
  "disinto" \                                   # target repo name
  "8h0m0s"                                      # sync interval (optional, default 8h)
 ```
 The function calls `POST /api/v1/repos/migrate` with `mirror: true`.
 Forgejo creates the repo and immediately queues the first sync.
 ## Verifying the sync
 ```bash
 # Check mirror status via API
 forge_api GET "/repos/disinto-admin/disinto" | jq '.mirror, .mirror_interval'
 # Confirm content arrived — should list branches
 forge_api GET "/repos/disinto-admin/disinto/branches" | jq '.[].name'
 ```
 The first sync typically completes within a few seconds for small-to-medium
 repos.  For large repos, poll the branches endpoint until content appears.
 ## Cutover scenario (Nomad migration)
 At cutover to the Nomad box:
 1. Stand up fresh Forgejo on the Nomad cluster (empty instance).
 2. Create the `disinto-admin` org via `disinto init` or API.
 3. Run `mirror_pull_register` pointing at the Codeberg source.
 4. Wait for sync to complete (check branches endpoint).
 5. Once content is confirmed, proceed with `disinto init` against the
   now-populated repo — all subsequent `mirror_push` calls will push
   to any additional mirrors configured in `projects/*.toml`.
 No manual `git clone` + `git push` step is needed.  The Forgejo pull-mirror
 handles the entire transfer.
--- a/docs/updating-factory.md
+++ b/docs/updating-factory.md
@ -18,7 +18,12 @@ git stash                           # save any local fixes
 git merge devbox/main
 ```
-If merge conflicts on `docker-compose.yml`: delete it and regenerate in step 3.
+## Note: docker-compose.yml is generator-only
 The `docker-compose.yml` file is now generated exclusively by `bin/disinto init`.
 The tracked file has been removed. If you have a local `docker-compose.yml` from
 before this change, it is now "yours" and won't be touched by future updates.
 To pick up generator improvements, delete the existing file and run `bin/disinto init`.
 ## Step 2: Preserve local config
@ -31,9 +36,9 @@ cp projects/harb.toml projects/harb.toml.backup
 cp docker-compose.override.yml docker-compose.override.yml.backup 2>/dev/null
 ```
-## Step 3: Regenerate docker-compose.yml (if needed)
+## Step 3: Regenerate docker-compose.yml
-Only needed if `generate_compose()` changed or the compose was deleted.
+If `generate_compose()` changed or you need a fresh compose file:
 ```bash
 rm docker-compose.yml
@ -47,41 +52,15 @@ init errors out.
 ### Known post-regeneration fixes (until #429 lands)
-The generated compose has several issues on LXD deployments:
+Most generator issues have been fixed. The following items no longer apply:
-**1. AppArmor (#492)** — Add to ALL services:
+- **AppArmor (#492)** — Fixed: all services now have `apparmor=unconfined`
-```bash
+- **Forgejo image tag (#493)** — Fixed: generator uses `forgejo:11.0`
-sed -i '/^  forgejo:/a\    security_opt:\n      - apparmor=unconfined' docker-compose.yml
+- **Agent credential mounts (#495)** — Fixed: `.claude`, `.claude.json`, `.ssh`, and `project-repos` volumes are auto-generated
-sed -i '/^  agents:/a\    security_opt:\n      - apparmor=unconfined' docker-compose.yml
+- **Repo path (#494)** — Not applicable: `projects/*.toml` files are gitignored and preserved
 # repeat for: agents-llama, edge, woodpecker, woodpecker-agent, staging, reproduce
 ```
-**2. Forgejo image tag (#493)**:
+If you need to add custom volumes, edit the generated `docker-compose.yml` directly.
-```bash
+It will not be overwritten by future `init` runs (the generator skips existing files).
 sed -i 's|forgejo/forgejo:.*|forgejo/forgejo:11.0|' docker-compose.yml
 ```
 **3. Agent credential mounts (#495)** — Add to agents volumes:
 ```yaml
 - ${HOME}/.claude:/home/agent/.claude
 - ${HOME}/.claude.json:/home/agent/.claude.json:ro
 - ${HOME}/.ssh:/home/agent/.ssh:ro
 - project-repos:/home/agent/repos
 ```
 **4. Repo path (#494)** — Fix `projects/harb.toml` if init overwrote it:
 ```bash
 sed -i 's|repo_root.*=.*"/home/johba/harb"|repo_root       = "/home/agent/repos/harb"|' projects/harb.toml
 sed -i 's|ops_repo_root.*=.*"/home/johba/harb-ops"|ops_repo_root   = "/home/agent/repos/harb-ops"|' projects/harb.toml
 ```
 **5. Add missing volumes** to the `volumes:` section at the bottom:
 ```yaml
 volumes:
  project-repos:
  project-repos-llama:
  disinto-logs:
 ```
 ## Step 4: Rebuild and restart
--- a/formulas/collect-engagement.toml
+++ b/formulas/collect-engagement.toml
@ -0,0 +1,172 @@
 # formulas/collect-engagement.toml — Collect website engagement data
 #
 # Daily formula: SSH into Caddy host, fetch access log, parse locally,
 # commit evidence JSON to ops repo via Forgejo API.
 #
 # Triggered by cron in the edge container entrypoint (daily at 23:50 UTC).
 # Design choices from #426: Q1=A (fetch raw log, process locally),
 # Q2=A (direct cron in edge container), Q3=B (dedicated purpose-limited SSH key).
 #
 # Steps: fetch-log → parse-engagement → commit-evidence
 name        = "collect-engagement"
 description = "SSH-fetch Caddy access log, parse engagement metrics, commit evidence"
 version     = 1
 [context]
 files = ["AGENTS.md"]
 [vars.caddy_host]
 description = "SSH host for the Caddy server"
 required    = false
 default     = "${CADDY_SSH_HOST:-disinto.ai}"
 [vars.caddy_user]
 description = "SSH user on the Caddy host"
 required    = false
 default     = "${CADDY_SSH_USER:-debian}"
 [vars.caddy_log_path]
 description = "Path to Caddy access log on the remote host"
 required    = false
 default     = "${CADDY_ACCESS_LOG:-/var/log/caddy/access.log}"
 [vars.local_log_path]
 description = "Local path to store fetched access log"
 required    = false
 default     = "/tmp/caddy-access-log-fetch.log"
 [vars.evidence_dir]
 description = "Evidence output directory in the ops repo"
 required    = false
 default     = "evidence/engagement"
 # ── Step 1: SSH fetch ────────────────────────────────────────────────
 [[steps]]
 id          = "fetch-log"
 title       = "Fetch Caddy access log from remote host via SSH"
 description = """
 Fetch today's Caddy access log segment from the remote host using SCP.
 The SSH key is read from the environment (CADDY_SSH_KEY), which is
 decrypted from secrets/CADDY_SSH_KEY.enc by the edge entrypoint. It is NEVER hardcoded.
 1. Write the SSH key to a temporary file with restricted permissions:
     _ssh_key_file=$(mktemp)
     trap 'rm -f "$_ssh_key_file"' EXIT
     printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file"
     chmod 0600 "$_ssh_key_file"
 2. Verify connectivity:
     ssh -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new \
       -o ConnectTimeout=10 -o BatchMode=yes \
       {{caddy_user}}@{{caddy_host}} 'echo ok'
 3. Fetch the access log via scp:
     scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new \
       -o ConnectTimeout=10 -o BatchMode=yes \
       "{{caddy_user}}@{{caddy_host}}:{{caddy_log_path}}" \
       "{{local_log_path}}"
 4. Verify the fetched file is non-empty:
     if [ ! -s "{{local_log_path}}" ]; then
       echo "WARNING: fetched access log is empty — site may have no traffic"
     else
       echo "Fetched $(wc -l < "{{local_log_path}}") lines from {{caddy_host}}"
     fi
 5. Clean up the temporary key file:
     rm -f "$_ssh_key_file"
 """
 # ── Step 2: Parse engagement ─────────────────────────────────────────
 [[steps]]
 id          = "parse-engagement"
 title       = "Run collect-engagement.sh against the local log copy"
 description = """
 Run the engagement parser against the locally fetched access log.
 1. Set CADDY_ACCESS_LOG to point at the local copy so collect-engagement.sh
   reads from it instead of the default path:
     export CADDY_ACCESS_LOG="{{local_log_path}}"
 2. Run the parser:
     bash "$FACTORY_ROOT/site/collect-engagement.sh"
 3. Verify the evidence JSON was written:
     REPORT_DATE=$(date -u +%Y-%m-%d)
     EVIDENCE_FILE="${OPS_REPO_ROOT}/{{evidence_dir}}/${REPORT_DATE}.json"
     if [ -f "$EVIDENCE_FILE" ]; then
       echo "Evidence written: $EVIDENCE_FILE"
       jq . "$EVIDENCE_FILE"
     else
       echo "ERROR: evidence file not found at $EVIDENCE_FILE"
       exit 1
     fi
 4. Clean up the fetched log:
     rm -f "{{local_log_path}}"
 """
 needs       = ["fetch-log"]
 # ── Step 3: Commit evidence ──────────────────────────────────────────
 [[steps]]
 id          = "commit-evidence"
 title       = "Commit evidence JSON to ops repo via Forgejo API"
 description = """
 Commit the dated evidence JSON to the ops repo so the planner can
 consume it during gap analysis.
 1. Read the evidence file:
     REPORT_DATE=$(date -u +%Y-%m-%d)
     EVIDENCE_FILE="${OPS_REPO_ROOT}/{{evidence_dir}}/${REPORT_DATE}.json"
     CONTENT=$(base64 < "$EVIDENCE_FILE")
 2. Check if the file already exists in the ops repo (update vs create):
     OPS_OWNER="${OPS_FORGE_OWNER:-${FORGE_REPO%%/*}}"
     OPS_REPO="${OPS_FORGE_REPO:-${PROJECT_NAME:-disinto}-ops}"
     FILE_PATH="{{evidence_dir}}/${REPORT_DATE}.json"
     EXISTING=$(curl -sf \
       -H "Authorization: token ${FORGE_TOKEN}" \
       "${FORGE_URL}/api/v1/repos/${OPS_OWNER}/${OPS_REPO}/contents/${FILE_PATH}" \
       2>/dev/null || echo "")
 3. Create or update the file via Forgejo API:
     if [ -n "$EXISTING" ] && printf '%s' "$EXISTING" | jq -e '.sha' >/dev/null 2>&1; then
       # Update existing file
       SHA=$(printf '%s' "$EXISTING" | jq -r '.sha')
       curl -sf -X PUT \
         -H "Authorization: token ${FORGE_TOKEN}" \
         -H "Content-Type: application/json" \
         "${FORGE_URL}/api/v1/repos/${OPS_OWNER}/${OPS_REPO}/contents/${FILE_PATH}" \
         -d "$(jq -nc --arg content "$CONTENT" --arg sha "$SHA" --arg msg "evidence: engagement ${REPORT_DATE}" \
           '{message: $msg, content: $content, sha: $sha}')"
       echo "Updated existing evidence file in ops repo"
     else
       # Create new file
       curl -sf -X POST \
         -H "Authorization: token ${FORGE_TOKEN}" \
         -H "Content-Type: application/json" \
         "${FORGE_URL}/api/v1/repos/${OPS_OWNER}/${OPS_REPO}/contents/${FILE_PATH}" \
         -d "$(jq -nc --arg content "$CONTENT" --arg msg "evidence: engagement ${REPORT_DATE}" \
           '{message: $msg, content: $content}')"
       echo "Created evidence file in ops repo"
     fi
 4. Verify the commit landed:
     VERIFY=$(curl -sf \
       -H "Authorization: token ${FORGE_TOKEN}" \
       "${FORGE_URL}/api/v1/repos/${OPS_OWNER}/${OPS_REPO}/contents/${FILE_PATH}" \
       | jq -r '.name // empty')
     if [ "$VERIFY" = "${REPORT_DATE}.json" ]; then
       echo "Evidence committed: ${FILE_PATH}"
     else
       echo "ERROR: could not verify evidence commit"
       exit 1
     fi
 """
 needs       = ["parse-engagement"]
--- a/formulas/rent-a-human-caddy-ssh.toml
+++ b/formulas/rent-a-human-caddy-ssh.toml
@ -0,0 +1,161 @@
 # formulas/rent-a-human-caddy-ssh.toml — Provision SSH key for Caddy log collection
 #
 # "Rent a Human" — walk the operator through provisioning a purpose-limited
 # SSH keypair so collect-engagement.sh can fetch Caddy access logs remotely.
 #
 # The key uses a `command=` restriction so it can ONLY cat the access log.
 # No interactive shell, no port forwarding, no agent forwarding.
 #
 # Parent vision issue: #426
 # Sprint: website-observability-wire-up (ops PR #10)
 # Consumed by: site/collect-engagement.sh (issue #745)
 name        = "rent-a-human-caddy-ssh"
 description = "Provision a purpose-limited SSH keypair for remote Caddy log collection"
 version     = 1
 # ── Step 1: Generate keypair ─────────────────────────────────────────────────
 [[steps]]
 id    = "generate-keypair"
 title = "Generate a dedicated ed25519 keypair"
 description = """
 Generate a purpose-limited SSH keypair for Caddy log collection.
 Run on your local machine (NOT the Caddy host):
 ```
 ssh-keygen -t ed25519 -f caddy-collect -N '' -C 'disinto-collect-engagement'
 ```
 This produces two files:
  - caddy-collect      (private key — goes into the vault)
  - caddy-collect.pub  (public key — goes onto the Caddy host)
 Do NOT set a passphrase (-N '') — the factory runs unattended.
 """
 # ── Step 2: Install public key on Caddy host ─────────────────────────────────
 [[steps]]
 id    = "install-public-key"
 title = "Install the public key on the Caddy host with command= restriction"
 needs = ["generate-keypair"]
 description = """
 Install the public key on the Caddy host with a strict command= restriction
 so this key can ONLY read the access log.
 1. SSH into the Caddy host as the user who owns /var/log/caddy/access.log.
 2. Open (or create) ~/.ssh/authorized_keys:
     mkdir -p ~/.ssh && chmod 700 ~/.ssh
     nano ~/.ssh/authorized_keys
 3. Add this line (all on ONE line — do not wrap):
     command="cat /var/log/caddy/access.log",no-port-forwarding,no-X11-forwarding,no-agent-forwarding ssh-ed25519 AAAA... disinto-collect-engagement
   Replace "AAAA..." with the contents of caddy-collect.pub.
   To build the line automatically:
     echo "command=\"cat /var/log/caddy/access.log\",no-port-forwarding,no-X11-forwarding,no-agent-forwarding $(cat caddy-collect.pub)"
 4. Set permissions:
     chmod 600 ~/.ssh/authorized_keys
 What the restrictions do:
  - command="cat /var/log/caddy/access.log"
      Forces this key to only execute `cat /var/log/caddy/access.log`,
      regardless of what the client requests.
  - no-port-forwarding    — blocks SSH tunnels
  - no-X11-forwarding     — blocks X11
  - no-agent-forwarding   — blocks agent forwarding
 If the access log is at a different path, update the command= restriction
 AND set CADDY_ACCESS_LOG in the factory environment to match.
 """
 # ── Step 3: Add private key to vault secrets ─────────────────────────────────
 [[steps]]
 id    = "store-private-key"
 title = "Add the private key as CADDY_SSH_KEY secret"
 needs = ["generate-keypair"]
 description = """
 Store the private key in the factory's encrypted secrets store.
 1. Add the private key using `disinto secrets add`:
     cat caddy-collect | disinto secrets add CADDY_SSH_KEY
   This encrypts the key with age and stores it as secrets/CADDY_SSH_KEY.enc.
 2. IMPORTANT: After storing, securely delete the local private key file:
     shred -u caddy-collect 2>/dev/null || rm -f caddy-collect
     rm -f caddy-collect.pub
   The public key is already installed on the Caddy host; the private key
   now lives only in secrets/CADDY_SSH_KEY.enc.
 Never commit the private key to any git repository.
 """
 # ── Step 4: Configure Caddy host address ─────────────────────────────────────
 [[steps]]
 id    = "store-caddy-host"
 title = "Add the Caddy host details as secrets"
 needs = ["install-public-key"]
 description = """
 Store the Caddy connection details so collect-engagement.sh knows
 where to SSH.
 1. Add each value using `disinto secrets add`:
     echo 'disinto.ai' | disinto secrets add CADDY_SSH_HOST
     echo 'debian' | disinto secrets add CADDY_SSH_USER
     echo '/var/log/caddy/access.log' | disinto secrets add CADDY_ACCESS_LOG
   Replace values with the actual SSH host, user, and log path for your setup.
 """
 # ── Step 5: Test the connection ──────────────────────────────────────────────
 [[steps]]
 id    = "test-connection"
 title = "Verify the SSH key works and returns the access log"
 needs = ["install-public-key", "store-private-key", "store-caddy-host"]
 description = """
 Test the end-to-end connection before the factory tries to use it.
 1. From the factory host (or anywhere with the private key), run:
     ssh -i caddy-collect -o StrictHostKeyChecking=accept-new user@caddy-host
   Expected behavior:
     - Outputs the contents of /var/log/caddy/access.log
     - Disconnects immediately (command= restriction forces this)
   If you already shredded the local key, decode it from the vault:
     echo "$CADDY_SSH_KEY" | base64 -d > /tmp/caddy-collect-test
     chmod 600 /tmp/caddy-collect-test
     ssh -i /tmp/caddy-collect-test -o StrictHostKeyChecking=accept-new user@caddy-host
     rm -f /tmp/caddy-collect-test
 2. Verify the output is Caddy structured JSON (one JSON object per line):
     ssh -i /tmp/caddy-collect-test user@caddy-host | head -1 | jq .
   You should see fields like: ts, request, status, duration.
 3. If the connection fails:
     - Permission denied → check authorized_keys format (must be one line)
     - Connection refused → check sshd is running on the Caddy host
     - Empty output → check /var/log/caddy/access.log exists and is readable
       by the SSH user
     - "jq: error" → Caddy may be using Combined Log Format instead of
       structured JSON; check Caddy's log configuration
 4. Once verified, the factory's collect-engagement.sh can use this key
   to fetch logs remotely via:
     ssh -i <decoded-key-path> $CADDY_HOST
 """
--- a/formulas/review-pr.toml
+++ b/formulas/review-pr.toml
@ -80,6 +80,64 @@ For each BEHAVIORAL change in the diff (not pure bug fixes or formatting):
 This check is SKIPPED for pure bug fixes where the intended behavior is
 unchanged (the code was wrong, not the documentation).
 ## 3c. Infrastructure file review (conditional)
 If the diff touches ANY of these files, apply this additional checklist:
 - `docker-compose.yml` or `docker-compose.*.yml`
 - `Dockerfile` or `docker/*`
 - `.woodpecker/` CI configs
 - `docker/agents/entrypoint.sh`
 Infrastructure files have a different failure mode from application code:
 a single dropped line (a volume mount, an env var, a restart policy) can
 break a running deployment with no syntax error. Treat dropped
 infrastructure configuration as a **blocking defect**, not a style choice.
 ### For docker-compose.yml changes:
 1. **Read the full file** in the PR branch — do not rely only on the diff.
 2. Run `git diff <base>..HEAD -- docker-compose.yml` to see the complete
   change, not just the truncated diff.
 3. Check that NONE of the following were dropped without explicit
   justification in the PR description:
   - Named volumes (e.g. `agent-data`, `project-repos`)
   - Bind mounts (especially for config, secrets, SSH keys, shared dirs)
   - Environment variables (compare the full `environment:` block against
     the base branch)
   - `restart:` policy (should be `unless-stopped` for production services)
   - `security_opt:` settings
   - Network configuration
   - Resource limits / deploy constraints
 4. If ANY production configuration was dropped and the PR description does
   not explain why, **REQUEST_CHANGES**. List each dropped item explicitly.
 ### For Dockerfile / entrypoint changes:
 1. Check that base image, installed packages, and runtime deps are preserved.
 2. Verify that entrypoint/CMD changes don't break the container startup.
 ### For CI config changes:
 1. Check that pipeline steps aren't silently removed.
 2. Verify that secret references still match available secrets.
 ## 3d. Scope discipline
 Compare the actual diff footprint against the stated issue scope:
 1. Read the PR title and description to identify what the issue asked for.
 2. Estimate the expected diff size (e.g., "add 3 env vars" = ~5-10 lines
   in compose + ~5 lines in scripts).
 3. If the actual diff in ANY single file exceeds 3x the expected scope,
   flag it: "this file changed N lines but the issue scope suggests ~M."
 For infrastructure files (compose, Dockerfiles, CI), scope violations are
 **blocking**: REQUEST_CHANGES and ask the author to split out-of-scope
 changes into a separate PR or justify them in the description.
 For non-infrastructure files, scope violations are advisory: leave a
 non-blocking COMMENT noting the scope creep.
 ## 4. Vault item quality (conditional)
 If the PR adds or modifies vault item files (`vault/pending/*.md` in the ops repo), apply these
@ -155,7 +213,7 @@ should file a vault item instead of executing directly.
 **Exceptions** (do NOT flag these):
 - Code inside `vault/` — the vault system itself is allowed to handle secrets
 - References in comments or documentation explaining the architecture
- `bin/disinto` setup commands that manage `.env.vault.enc` and the `run` subcommand
+- `bin/disinto` setup commands that manage `secrets/*.enc` and the `run` subcommand
 - Local operations (git push to forge, forge API calls with `FORGE_TOKEN`)
 ## 6. Re-review (if previous review is provided)
@ -219,9 +277,11 @@ for actual problems (bugs, security issues, broken functionality, missing
 required behavior). Use DISCUSS sparingly.
 Note: The bias toward APPROVE applies to code correctness and style decisions.
-It does NOT apply to documentation consistency (step 3b) or tech-debt filing
+It does NOT apply to documentation consistency (step 3b), infrastructure file
-(step 7) — those are separate concerns that should be handled regardless of
+findings (step 3c), or tech-debt filing (step 7) — those are separate concerns
-the change's correctness.
+that should be handled regardless of the change's correctness. In particular,
 dropped production configuration (volumes, bind mounts, env vars, restart
 policy) is a blocking defect, not a style preference.
 ## 9. Output
--- a/formulas/run-architect.toml
+++ b/formulas/run-architect.toml
@ -16,7 +16,14 @@
 #            - Bash creates the ops PR with pitch content
 #            - Bash posts the ACCEPT/REJECT footer comment
 #   Step 3: Sprint PR creation with questions (issue #101) (one PR per pitch)
-#   Step 4: Answer parsing + sub-issue filing (issue #102)
+#   Step 4: Post-merge sub-issue filing via filer-bot (#764)
 #
 # Permission model (#764):
 #   architect-bot: READ-ONLY on project repo (GET issues/PRs/labels for context).
 #     Cannot POST/PUT/PATCH/DELETE any project-repo resource.
 #     Write access ONLY on ops repo (branches, PRs, comments).
 #   filer-bot: issues:write on project repo. Files sub-issues from merged sprint
 #     PRs via ops-filer pipeline. Adds in-progress label to vision issues.
 #
 # Architecture:
 # - Bash script (architect-run.sh) handles ALL state management
@ -146,15 +153,32 @@ For each issue in ARCHITECT_TARGET_ISSUES, bash performs:
 ## Recommendation
 <architect's assessment: worth it / defer / alternative approach>
 ## Sub-issues
 <!-- filer:begin -->
 - id: <kebab-case-id>
  title: "vision(#N): <concise sub-issue title>"
  labels: [backlog]
  depends_on: []
  body: |
    ## Goal
    <what this sub-issue accomplishes>
    ## Acceptance criteria
    - [ ] <criterion>
 <!-- filer:end -->
 IMPORTANT: Do NOT include design forks or questions yet. The pitch is a go/no-go
 decision for the human. Questions come only after acceptance.
 The ## Sub-issues block is parsed by the filer-bot pipeline after sprint PR merge.
 Each sub-issue between filer:begin/end markers becomes a Forgejo issue on the
 project repo. The filer appends a decomposed-from marker to each body automatically.
 4. Bash creates PR:
   - Create branch: architect/sprint-{pitch-number}
   - Write sprint spec to sprints/{sprint-slug}.md
   - Create PR with pitch content as body
   - Post footer comment: "Reply ACCEPT to proceed with design questions, or REJECT: <reason> to decline."
-   - Add in-progress label to vision issue
+   - NOTE: in-progress label is added by filer-bot after sprint PR merge (#764)
 Output:
 - One PR per vision issue (up to 3 per run)
@ -169,9 +193,25 @@ description = """
 IMPORTANT: PR creation is handled by bash (architect-run.sh) during the pitch step.
 This step is for documentation only — the actual PR creation happens in research_pitch.
 ## Approved PR → Initial design questions (issue #570)
 When a sprint pitch PR receives an APPROVED review but has no `## Design forks`
 section and no Q1:, Q2: comments yet, the architect enters a new state:
 1. detect_approved_pending_questions() identifies this state
 2. A fresh agent session starts with a special prompt
 3. The agent reads the approved pitch, posts initial design questions (Q1:, Q2:, etc.)
 4. The agent adds a `## Design forks` section to the PR body
 5. The PR transitions into the questions phase, where the existing Q&A loop takes over
 This ensures approved PRs don't sit indefinitely without design conversation.
 Architecture:
 - Bash creates PRs during stateless pitch generation (step 2)
 - Model has no role in PR creation — no Forgejo API access
 - architect-bot is READ-ONLY on the project repo (#764) — all project-repo
  writes (sub-issue filing, in-progress label) are handled by filer-bot
  via the ops-filer pipeline after sprint PR merge
 - This step describes the PR format for reference
 PR Format (created by bash):
@ -188,64 +228,29 @@ PR Format (created by bash):
   - Head: architect/sprint-{pitch-number}
   - Footer comment: "Reply ACCEPT to proceed with design questions, or REJECT: <reason> to decline."
 4. Add in-progress label to vision issue:
   - Look up label ID: GET /repos/{owner}/{repo}/labels
   - Add label: POST /repos/{owner}/{repo}/issues/{issue_number}/labels
 After creating all PRs, signal PHASE:done.
 NOTE: in-progress label on the vision issue is added by filer-bot after sprint PR merge (#764).
-## Forgejo API Reference
+## Forgejo API Reference (ops repo only)
-All operations use the Forgejo API with Authorization: token ${FORGE_TOKEN} header.
+All operations use the ops repo Forgejo API with `Authorization: token ${FORGE_TOKEN}` header.
 architect-bot is READ-ONLY on the project repo — cannot POST/PUT/PATCH/DELETE project-repo resources (#764).
-### Create branch
+### Create branch (ops repo)
 ```
-POST /repos/{owner}/{repo}/branches
+POST /repos/{owner}/{repo-ops}/branches
 Body: {"new_branch_name": "architect/<sprint-slug>", "old_branch_name": "main"}
 ```
-### Create/update file
+### Create/update file (ops repo)
 ```
-PUT /repos/{owner}/{repo}/contents/<path>
+PUT /repos/{owner}/{repo-ops}/contents/<path>
 Body: {"message": "sprint: add <sprint-slug>.md", "content": "<base64-encoded-content>", "branch": "architect/<sprint-slug>"}
 ```
-### Create PR
+### Create PR (ops repo)
 ```
-POST /repos/{owner}/{repo}/pulls
+POST /repos/{owner}/{repo-ops}/pulls
 Body: {"title": "architect: <sprint summary>", "body": "<markdown-text>", "head": "architect/<sprint-slug>", "base": "main"}
 ```
 **Important: PR body format**
 - The body field must contain plain markdown text (the raw content from the model)
 - Do NOT JSON-encode or escape the body — pass it as a JSON string value
 - Newlines and markdown formatting (headings, lists, etc.) must be preserved as-is
 ### Add label to issue
 ```
 POST /repos/{owner}/{repo}/issues/{index}/labels
 Body: {"labels": [<label-id>]}
 ```
 ## Forgejo API Reference
 All operations use the Forgejo API with `Authorization: token ${FORGE_TOKEN}` header.
 ### Create branch
 ```
 POST /repos/{owner}/{repo}/branches
 Body: {"new_branch_name": "architect/<sprint-slug>", "old_branch_name": "main"}
 ```
 ### Create/update file
 ```
 PUT /repos/{owner}/{repo}/contents/<path>
 Body: {"message": "sprint: add <sprint-slug>.md", "content": "<base64-encoded-content>", "branch": "architect/<sprint-slug>"}
 ```
 ### Create PR
 ```
 POST /repos/{owner}/{repo}/pulls
 Body: {"title": "architect: <sprint summary>", "body": "<markdown-text>", "head": "architect/<sprint-slug>", "base": "main"}
 ```
@ -254,30 +259,22 @@ Body: {"title": "architect: <sprint summary>", "body": "<markdown-text>", "head"
 - Do NOT JSON-encode or escape the body — pass it as a JSON string value
 - Newlines and markdown formatting (headings, lists, etc.) must be preserved as-is
-### Close PR
+### Close PR (ops repo)
 ```
-PATCH /repos/{owner}/{repo}/pulls/{index}
+PATCH /repos/{owner}/{repo-ops}/pulls/{index}
 Body: {"state": "closed"}
 ```
-### Delete branch
+### Delete branch (ops repo)
 ```
-DELETE /repos/{owner}/{repo}/git/branches/<branch-name>
+DELETE /repos/{owner}/{repo-ops}/git/branches/<branch-name>
 ```
-### Get labels (look up label IDs by name)
+### Read-only on project repo (context gathering)
 ```
-GET /repos/{owner}/{repo}/labels
+GET /repos/{owner}/{repo}/issues          — list issues
-```
+GET /repos/{owner}/{repo}/issues/{number} — read issue details
-
+GET /repos/{owner}/{repo}/labels          — list labels
-### Add label to issue (for in-progress on vision issue)
+GET /repos/{owner}/{repo}/pulls           — list PRs
 ```
 POST /repos/{owner}/{repo}/issues/{index}/labels
 Body: {"labels": [<label-id>]}
 ```
 ### Remove label from issue (for in-progress removal on REJECT)
 ```
 DELETE /repos/{owner}/{repo}/issues/{index}/labels/{label-id}
 ```
 """
--- a/formulas/run-gardener.toml
+++ b/formulas/run-gardener.toml
@ -177,7 +177,7 @@ DUST (trivial — single-line edit, rename, comment, style, whitespace):
 VAULT (needs human decision or external resource):
  File a vault procurement item using vault_request():
-    source "$(dirname "$0")/../lib/vault.sh"
+    source "$(dirname "$0")/../lib/action-vault.sh"
    TOML_CONTENT="# Vault action: <action_id>
 context = \"<description of what decision/resource is needed>\"
 unblocks = [\"#NNN\"]
--- a/formulas/run-planner.toml
+++ b/formulas/run-planner.toml
@ -243,7 +243,7 @@ needs = ["preflight"]
 [[steps]]
 id    = "commit-ops-changes"
-title = "Write tree, memory, and journal; commit and push"
+title = "Write tree, memory, and journal; commit and push branch"
 description = """
 ### 1. Write prerequisite tree
 Write to: $OPS_REPO_ROOT/prerequisites.md
@ -256,14 +256,16 @@ If (count - N) >= 5 or planner-memory.md missing, write to:
 Include: run counter marker, date, constraint focus, patterns, direction.
 Keep under 100 lines. Replace entire file.
-### 3. Commit ops repo changes
+### 3. Commit ops repo changes to the planner branch
-Commit the ops repo changes (prerequisites, memory, vault items):
+Commit the ops repo changes (prerequisites, memory, vault items) and push the
 branch. Do NOT push directly to $PRIMARY_BRANCH — planner-run.sh will create a
 PR and walk it to merge via review-bot.
  cd "$OPS_REPO_ROOT"
  git add prerequisites.md knowledge/planner-memory.md vault/pending/
  git add -u
  if ! git diff --cached --quiet; then
    git commit -m "chore: planner run $(date -u +%Y-%m-%d)"
-    git push origin "$PRIMARY_BRANCH"
+    git push origin HEAD
  fi
  cd "$PROJECT_REPO_ROOT"
--- a/formulas/run-predictor.toml
+++ b/formulas/run-predictor.toml
@ -125,8 +125,8 @@ For each weakness you identify, choose one:
  The prediction explains the theory. The vault PR triggers the proof
  after human approval. When the planner runs next, evidence is already there.
-  Vault dispatch (requires lib/vault.sh):
+  Vault dispatch (requires lib/action-vault.sh):
-    source "$PROJECT_REPO_ROOT/lib/vault.sh"
+    source "$PROJECT_REPO_ROOT/lib/action-vault.sh"
    TOML_CONTENT="id = \"predict-<prediction_number>-<formula>\"
 context = \"Test prediction #<prediction_number>: <theory summary> — focus: <specific test>\"
@ -154,7 +154,7 @@ tea is pre-configured with login "$TEA_LOGIN" and repo "$FORGE_REPO".
       --title "<title>" --body "<body>" --labels "prediction/unreviewed"
 2. Dispatch formula via vault (if exploiting):
-     source "$PROJECT_REPO_ROOT/lib/vault.sh"
+     source "$PROJECT_REPO_ROOT/lib/action-vault.sh"
     PR_NUM=$(vault_request "predict-NNN-<formula>" "$TOML_CONTENT")
     # See EXPLOIT section above for TOML_CONTENT format
--- a/gardener/AGENTS.md
+++ b/gardener/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Gardener Agent
 **Role**: Backlog grooming — detect duplicate issues, missing acceptance
@ -32,7 +32,7 @@ the gardener runs as part of the polling loop alongside the planner, predictor,
  PR, reviewed alongside AGENTS.md changes, executed by gardener-run.sh after merge.
 **Environment variables consumed**:
- `FORGE_TOKEN`, `FORGE_GARDENER_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`
+- `FORGE_TOKEN`, `FORGE_GARDENER_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`. `FORGE_TOKEN_OVERRIDE` is exported to `$FORGE_GARDENER_TOKEN` before sourcing env.sh so the gardener-bot identity survives re-sourcing (#762).
 - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by gardener-run.sh)
 **Lifecycle**: gardener-run.sh (invoked by polling loop every 6h, `check_active gardener`) →
--- a/gardener/gardener-run.sh
+++ b/gardener/gardener-run.sh
@ -26,10 +26,11 @@ FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
 # Accept project config from argument; default to disinto
 export PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}"
 # Set override BEFORE sourcing env.sh so it survives any later re-source of
 # env.sh from nested shells / claude -p tools (#762, #747)
 export FORGE_TOKEN_OVERRIDE="${FORGE_GARDENER_TOKEN:-}"
 # shellcheck source=../lib/env.sh
 source "$FACTORY_ROOT/lib/env.sh"
 # Use gardener-bot's own Forgejo identity (#747)
 FORGE_TOKEN="${FORGE_GARDENER_TOKEN:-${FORGE_TOKEN}}"
 # shellcheck source=../lib/formula-session.sh
 source "$FACTORY_ROOT/lib/formula-session.sh"
 # shellcheck source=../lib/worktree.sh
@ -69,7 +70,7 @@ log "--- Gardener run start ---"
 # ── Resolve forge remote for git operations ─────────────────────────────
 # Run git operations from the project checkout, not the baked code dir
-cd "$PROJECT_REPO_ROOT" || exit 1
+cd "$PROJECT_REPO_ROOT"
 resolve_forge_remote
--- a/gardener/pending-actions.json
+++ b/gardener/pending-actions.json
@ -1,47 +1,12 @@
 [
  {
-    "action": "close",
+    "action": "comment",
-    "issue": 419,
+    "issue": 623,
-    "reason": "Vision goal complete — all sub-issues #437-#454 closed, vault blast-radius redesign delivered"
+    "body": "**Dependency check:** All blocking dependencies are now closed:\n- #620 ✓ closed\n- #621 ✓ closed  \n- #622 ✓ closed\n\nPer the issue description: *\"Once #620/#621/#622 are green, this issue should fork into at least three backlog children: subpath routing + Forgejo ROOT_URL / Woodpecker HOST, disinto-chat container scaffold with OAuth gate, and Claude Code sandbox envelope + working-dir scoping.\"*\n\nThis vision issue is ready for the planner to decompose into backlog children."
  },
  {
-    "action": "close",
+    "action": "comment",
-    "issue": 494,
+    "issue": 758,
-    "reason": "Resolved by PRs #502 and #503 (both merged) — repo_root workaround removed, container paths derived at runtime"
+    "body": "**Gardener flag:** This issue requires human admin action on Forgejo to resolve — changing branch protection settings on the ops repo. No automated formula can fix Forgejo admin settings.\n\nProposed options (from issue body):\n1. Add `planner-bot` to the merge whitelist in ops repo branch protection\n2. Remove branch protection from the ops repo (agents are primary writers)\n3. Create an admin-level service token for agents\n\nThis is blocking all ops repo writes (planner knowledge, sprint artifacts, vault items)."
  },
  {
    "action": "close",
    "issue": 477,
    "reason": "Obsolete — #379 (while-true loop) was deployed on 2026-04-08; env.sh container guard is now correct behavior, no revert needed"
  },
  {
    "action": "edit_body",
    "issue": 498,
    "body": "Flagged by AI reviewer in PR #496.\n\n## Problem\n\n`has_responses_to_process` is only set to `true` inside the `open_arch_prs >= 3` gate in `architect/architect-run.sh` (line 543). When fewer than 3 architect PRs are open, ACCEPT/REJECT responses on existing PRs are never processed — the response-processing block at line 687 defaults to `false` and is skipped entirely.\n\nThis means that if a user ACCEPTs or REJECTs a pitch while the open PR count is below 3, the architect agent will never handle the response.\n\n## Fix\n\nSet `has_responses_to_process` (or an equivalent guard) unconditionally by scanning open PRs for ACCEPT/REJECT responses, not only when the 3-PR cap is hit.\n\n---\n*Auto-created from AI review*\n\n## Acceptance criteria\n\n- [ ] `has_responses_to_process` is computed by scanning open architect PRs for ACCEPT/REJECT responses regardless of `open_arch_prs` count\n- [ ] When a user posts ACCEPT or REJECT on an architect PR and open PR count < 3, the response is processed in the same run\n- [ ] Existing behavior when `open_arch_prs >= 3` is unchanged\n- [ ] ShellCheck passes on modified files\n\n## Affected files\n\n- `architect/architect-run.sh` (lines ~543 and ~687 — response-processing gate)"
  },
  {
    "action": "add_label",
    "issue": 498,
    "label": "backlog"
  },
  {
    "action": "edit_body",
    "issue": 499,
    "body": "Flagged by AI reviewer in PR #496.\n\n## Problem\n\nIn `architect/architect-run.sh` line 203, the `has_open_subissues` function compares `.number` (a JSON integer) against `$vid` (a bash string via `--arg`). In jq, `42 != \"42\"` evaluates to true (different types are never equal), so the self-exclusion filter never fires. In practice this is low-risk since vision issues don't contain 'Decomposed from #N' in their own bodies, but the self-exclusion logic is silently broken.\n\n## Fix\n\nCast the string to a number in jq: `select(.number != ($vid | tonumber))`\n\n---\n*Auto-created from AI review*\n\n## Acceptance criteria\n\n- [ ] `has_open_subissues` self-exclusion filter correctly excludes the vision issue itself using `($vid | tonumber)` cast\n- [ ] A vision issue does not appear in its own subissue list\n- [ ] ShellCheck passes on modified files\n\n## Affected files\n\n- `architect/architect-run.sh` (line ~203 — `has_open_subissues` jq filter)"
  },
  {
    "action": "add_label",
    "issue": 499,
    "label": "backlog"
  },
  {
    "action": "edit_body",
    "issue": 471,
    "body": "## Bug description\n\nWhen dev-bot picks a backlog issue and launches dev-agent.sh, a second dev-poll instance (dev-qwen) can race ahead and mark the issue as stale/blocked before dev-agent.sh finishes claiming it.\n\n## Reproduction\n\nObserved on issues #443 and #445 (2026-04-08):\n\n**#443 timeline:**\n- `20:39:03` — dev-bot removes `backlog`, adds `in-progress` (via dev-poll backlog pickup)\n- `20:39:04` — dev-qwen removes `in-progress`, adds `blocked` with reason `no_assignee_no_open_pr_no_lock`\n- `20:40:11` — dev-bot pushes commit (dev-agent was actually working the whole time)\n- `20:44:02` — PR merged, issue closed\n\n**#445 timeline:**\n- `20:54:03` — dev-bot adds `in-progress`\n- `20:54:06` — dev-qwen marks `blocked` (3 seconds later)\n- `20:55:13` — dev-bot pushes commit\n- `21:09:03` — PR merged, issue closed\n\nIn both cases, the work completed successfully despite being labeled blocked.\n\n## Root cause\n\n`issue_claim()` in `lib/issue-lifecycle.sh` performs three sequential API calls:\n1. PATCH assignee\n2. POST in-progress label\n3. DELETE backlog label\n\nMeanwhile, dev-poll on another agent (dev-qwen) runs its orphan scan, sees the issue labeled `in-progress` but with no assignee set yet (assign PATCH hasn't landed or was read stale), no open PR, and no lock file. It concludes the issue is stale and relabels to `blocked`.\n\nThe race window is ~1-3 seconds between in-progress being set and the assignee being visible to other pollers.\n\n## Impact\n\n- Issues get spuriously labeled `blocked` with a misleading stale diagnostic comment\n- dev-agent continues working anyway (it already has the issue number), so the blocked label is just noise\n- But it could confuse the gardener or humans reading the issue timeline\n- If another dev-poll instance picks up the blocked issue for recovery before the original agent finishes, it could cause duplicate work\n\n## Possible fixes\n\n1. **Assign before labeling**: In `issue_claim()`, set the assignee first, then add in-progress. This way, by the time in-progress is visible, the assignee is already set.\n2. **Grace period in stale detection**: Skip issues whose in-progress label was added less than N seconds ago (check label event timestamp via timeline API).\n3. **Lock file before label**: Write the agent lock file (`/tmp/dev-impl-summary-...`) at the start of dev-agent.sh before calling `issue_claim()`, so the stale detector sees the lock.\n4. **Atomic claim check**: dev-poll should re-check assignee after a short delay before declaring stale, to allow for API propagation.\n\n## Acceptance criteria\n\n- [ ] Stale detection in dev-poll does not mark an issue as blocked within the first 60 seconds of the in-progress label being applied\n- [ ] `issue_claim()` assigns the issue before adding the in-progress label (or equivalent fix is implemented)\n- [ ] No spurious `blocked` labels appear on issues that are actively being worked (verified by log inspection or integration test)\n- [ ] ShellCheck passes on modified files\n\n## Affected files\n\n- `lib/issue-lifecycle.sh` — `issue_claim()` function (assignee + label ordering)\n- `dev/dev-poll.sh` — orphan/stale detection logic"
  },
  {
    "action": "add_label",
    "issue": 471,
    "label": "backlog"
  }
 ]
--- a/lib/AGENTS.md
+++ b/lib/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Shared Helpers (`lib/`)
 All agents source `lib/env.sh` as their first action. Additional helpers are
@ -6,15 +6,15 @@ sourced as needed.
 | File | What it provides | Sourced by |
 |---|---|---|
-| `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`FORGE_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `forge_api()`, `forge_api_all()` (paginates all pages; accepts optional second TOKEN parameter, defaults to `$FORGE_TOKEN`; handles invalid/empty JSON responses gracefully — returns empty on parse error instead of crashing), `woodpecker_api()`, `wpdb()`, `memory_guard()` (skips agent if RAM < threshold). Auto-loads project TOML if `PROJECT_TOML` is set. Exports per-agent tokens (`FORGE_PLANNER_TOKEN`, `FORGE_GARDENER_TOKEN`, `FORGE_VAULT_TOKEN`, `FORGE_SUPERVISOR_TOKEN`, `FORGE_PREDICTOR_TOKEN`) — each falls back to `$FORGE_TOKEN` if not set. **Vault-only token guard (AD-006)**: `unset GITHUB_TOKEN CLAWHUB_TOKEN` so agents never hold external-action tokens — only the runner container receives them. **Container note**: when `DISINTO_CONTAINER=1`, `.env` is NOT re-sourced — compose already injects env vars (including `FORGE_URL=http://forgejo:3000`) and re-sourcing would clobber them. **Save/restore scope (#364)**: only `FORGE_URL` is preserved across `.env` re-sourcing (compose injects `http://forgejo:3000`, `.env` has `http://localhost:3000`). `FORGE_TOKEN` is NOT preserved so refreshed tokens in `.env` take effect immediately. **Required env var**: `FORGE_PASS` — bot password for git HTTP push (Forgejo 11.x rejects API tokens for `git push`, #361). | Every agent |
+| `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`FORGE_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `forge_api()`, `forge_api_all()` (paginates all pages; accepts optional second TOKEN parameter, defaults to `$FORGE_TOKEN`; handles invalid/empty JSON responses gracefully — returns empty on parse error instead of crashing), `woodpecker_api()`, `wpdb()`, `memory_guard()` (skips agent if RAM < threshold), `load_secret()` (secret-source abstraction — see below). Auto-loads project TOML if `PROJECT_TOML` is set. Exports per-agent tokens (`FORGE_PLANNER_TOKEN`, `FORGE_GARDENER_TOKEN`, `FORGE_VAULT_TOKEN`, `FORGE_SUPERVISOR_TOKEN`, `FORGE_PREDICTOR_TOKEN`) — each falls back to `$FORGE_TOKEN` if not set. **Vault-only token guard (AD-006)**: `unset GITHUB_TOKEN CLAWHUB_TOKEN` so agents never hold external-action tokens — only the runner container receives them. **Container note**: when `DISINTO_CONTAINER=1`, `.env` is NOT re-sourced — compose already injects env vars (including `FORGE_URL=http://forgejo:3000`) and re-sourcing would clobber them. **Save/restore scope (#364)**: only `FORGE_URL` is preserved across `.env` re-sourcing (compose injects `http://forgejo:3000`, `.env` has `http://localhost:3000`). `FORGE_TOKEN` is NOT preserved so refreshed tokens in `.env` take effect immediately. **Per-agent token override (#762)**: agent run scripts export `FORGE_TOKEN_OVERRIDE=<agent-specific-token>` BEFORE sourcing `env.sh`; `env.sh` applies this override at lines 98-100, ensuring the correct identity survives any re-sourcing of `env.sh` by nested shells or `claude -p` invocations. **Required env var**: `FORGE_PASS` — bot password for git HTTP push (Forgejo 11.x rejects API tokens for `git push`, #361). **Hard preconditions (#674)**: `USER` and `HOME` must be exported by the entrypoint before sourcing. When `PROJECT_TOML` is set, `PROJECT_REPO_ROOT`, `PRIMARY_BRANCH`, and `OPS_REPO_ROOT` must also be set (by entrypoint or TOML). **`load_secret NAME [DEFAULT]` (#793)**: backend-agnostic secret resolution. Precedence: (1) `/secrets/<NAME>.env` — Nomad-rendered template, (2) current environment — already set by `.env.enc` / compose, (3) `secrets/<NAME>.enc` — age-encrypted per-key file (decrypted on demand, cached in process env), (4) DEFAULT or empty. Consumers call `$(load_secret GITHUB_TOKEN)` instead of `${GITHUB_TOKEN}` — identical behavior whether secrets come from Docker compose injection or Nomad Vault templates. | Every agent |
 | `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \<reason>" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status <sha>` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number <sha>` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote <repo_id> <pipeline_num> <environment>` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). `ci_get_logs <pipeline_number> [--step <name>]` — reads CI logs from Woodpecker SQLite database via `lib/ci-log-reader.py`; outputs last 200 lines to stdout. Requires mounted woodpecker-data volume at /woodpecker-data. | dev-poll, review-poll, review-pr |
 | `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) |
 | `lib/ci-log-reader.py` | Python tool: reads CI logs from Woodpecker SQLite database. `<pipeline_number> [--step <name>]` — returns last 200 lines from failed steps (or specified step). Used by `ci_get_logs()` in ci-helpers.sh. Requires `WOODPECKER_DATA_DIR` (default: /woodpecker-data). | ci-helpers.sh |
-| `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `FORGE_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, mirror config, etc.). Also exports `FORGE_REPO_OWNER` (the owner component of `FORGE_REPO`, e.g. `disinto-admin` from `disinto-admin/disinto`). **Container path derivation**: `PROJECT_REPO_ROOT` and `OPS_REPO_ROOT` are derived at runtime when `DISINTO_CONTAINER=1` — hardcoded to `/home/agent/repos/$PROJECT_NAME` and `/home/agent/repos/$PROJECT_NAME-ops` respectively — not read from the TOML. This ensures correct paths inside containers where host paths in the TOML would be wrong. | env.sh (when `PROJECT_TOML` is set) |
+| `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `FORGE_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, mirror config, etc.). Also exports `FORGE_REPO_OWNER` (the owner component of `FORGE_REPO`, e.g. `disinto-admin` from `disinto-admin/disinto`). Reads `repo_root` and `ops_repo_root` from the TOML for host-CLI callers. **Container path handling (#674)**: no longer derives `PROJECT_REPO_ROOT` or `OPS_REPO_ROOT` inside the script — container entrypoints export the correct paths before agent scripts source `env.sh`, and the `DISINTO_CONTAINER` guard (line 90) skips TOML overrides when those vars are already set. | env.sh (when `PROJECT_TOML` is set) |
 | `lib/parse-deps.sh` | Extracts dependency issue numbers from an issue body (stdin → stdout, one number per line). Matches `## Dependencies` / `## Depends on` / `## Blocked by` sections and inline `depends on #N` / `blocked by #N` patterns. Inline scan skips fenced code blocks to prevent false positives from code examples in issue bodies. Not sourced — executed via `bash lib/parse-deps.sh`. | dev-poll |
-| `lib/formula-session.sh` | `acquire_run_lock()`, `load_formula()`, `load_formula_or_profile()`, `build_context_block()`, `ensure_ops_repo()`, `ops_commit_and_push()`, `build_prompt_footer()`, `build_sdk_prompt_footer()`, `formula_worktree_setup()`, `formula_prepare_profile_context()`, `formula_lessons_block()`, `profile_write_journal()`, `profile_load_lessons()`, `ensure_profile_repo()`, `_profile_has_repo()`, `_count_undigested_journals()`, `_profile_digest_journals()`, `_profile_commit_and_push()`, `resolve_agent_identity()`, `build_graph_section()`, `build_scratch_instruction()`, `read_scratch_context()`, `cleanup_stale_crashed_worktrees()` — shared helpers for formula-driven polling-loop agents (lock, .profile repo management, prompt assembly, worktree setup). Memory guard is provided by `memory_guard()` in `lib/env.sh` (not duplicated here). `resolve_agent_identity()` — sets `FORGE_TOKEN`, `AGENT_IDENTITY`, `FORGE_REMOTE` from per-agent token env vars and FORGE_URL remote detection. `build_graph_section()` generates the structural-analysis section (runs `lib/build-graph.py`, formats JSON output) — previously duplicated in planner-run.sh and predictor-run.sh, now shared here. `cleanup_stale_crashed_worktrees()` — thin wrapper around `worktree_cleanup_stale()` from `lib/worktree.sh` (kept for backwards compatibility). | planner-run.sh, predictor-run.sh, gardener-run.sh, supervisor-run.sh, dev-agent.sh |
+| `lib/formula-session.sh` | `acquire_run_lock()`, `load_formula()`, `load_formula_or_profile()`, `build_context_block()`, `ensure_ops_repo()`, `ops_commit_and_push()`, `build_prompt_footer()`, `build_sdk_prompt_footer()`, `formula_worktree_setup()`, `formula_prepare_profile_context()`, `formula_lessons_block()`, `profile_write_journal()`, `profile_load_lessons()`, `ensure_profile_repo()`, `_profile_has_repo()`, `_count_undigested_journals()`, `_profile_digest_journals()`, `_profile_restore_lessons()`, `_profile_commit_and_push()`, `resolve_agent_identity()`, `build_graph_section()`, `build_scratch_instruction()`, `read_scratch_context()`, `cleanup_stale_crashed_worktrees()` — shared helpers for formula-driven polling-loop agents (lock, .profile repo management, prompt assembly, worktree setup). Memory guard is provided by `memory_guard()` in `lib/env.sh` (not duplicated here). `resolve_agent_identity()` — sets `FORGE_TOKEN`, `AGENT_IDENTITY`, `FORGE_REMOTE` from per-agent token env vars and FORGE_URL remote detection. `build_graph_section()` generates the structural-analysis section (runs `lib/build-graph.py`, formats JSON output) — previously duplicated in planner-run.sh and predictor-run.sh, now shared here. `cleanup_stale_crashed_worktrees()` — thin wrapper around `worktree_cleanup_stale()` from `lib/worktree.sh` (kept for backwards compatibility). **Journal digestion guards (#702)**: `_profile_digest_journals()` respects `PROFILE_DIGEST_TIMEOUT` (default 300s) and `PROFILE_DIGEST_MAX_BATCH` (default 5 journals per run); `_profile_restore_lessons()` restores the previous lessons-learned.md on digest failure. | planner-run.sh, predictor-run.sh, gardener-run.sh, supervisor-run.sh, dev-agent.sh |
 | `lib/guard.sh` | `check_active(agent_name)` — reads `$FACTORY_ROOT/state/.{agent_name}-active`; exits 0 (skip) if the file is absent. Factory is off by default — state files must be created to enable each agent. **Logs a message to stderr** when skipping (`[check_active] SKIP: state file not found`), so agent dropout is visible in loop logs. Sourced by dev-poll.sh, review-poll.sh, predictor-run.sh, supervisor-run.sh. | polling-loop entry points |
-| `lib/mirrors.sh` | `mirror_push()` — pushes `$PRIMARY_BRANCH` + tags to all configured mirror remotes (fire-and-forget background pushes). Reads `MIRROR_NAMES` and `MIRROR_*` vars exported by `load-project.sh` from the `[mirrors]` TOML section. Failures are logged but never block the pipeline. Sourced by dev-poll.sh — called after every successful merge. | dev-poll.sh |
+| `lib/mirrors.sh` | `mirror_push()` — pushes `$PRIMARY_BRANCH` + tags to all configured mirror remotes (fire-and-forget background pushes). Reads `MIRROR_NAMES` and `MIRROR_*` vars exported by `load-project.sh` from the `[mirrors]` TOML section. Failures are logged but never block the pipeline. `mirror_pull_register(clone_url, owner, repo_name, [interval])` — registers a Forgejo pull mirror via `POST /repos/migrate` with `mirror: true`. Creates the target repo and queues the first sync automatically. Works against empty Forgejo instances — no pre-existing content required. Used for Nomad migration cutover: point at Codeberg source, wait for sync, then proceed with `disinto init`. See [docs/mirror-bootstrap.md](../docs/mirror-bootstrap.md) for the full cutover path. Sourced by dev-poll.sh — called after every successful merge. | dev-poll.sh |
 | `lib/build-graph.py` | Python tool: parses VISION.md, prerequisites.md (from ops repo), AGENTS.md, formulas/*.toml, evidence/ (from ops repo), and forge issues/labels into a NetworkX DiGraph. Runs structural analyses (orphaned objectives, stale prerequisites, thin evidence, circular deps) and outputs a JSON report. Used by `review-pr.sh` (per-PR changed-file analysis) and `predictor-run.sh` (full-project analysis) to provide structural context to Claude. | review-pr.sh, predictor-run.sh |
 | `lib/secret-scan.sh` | `scan_for_secrets()` — detects potential secrets (API keys, bearer tokens, private keys, URLs with embedded credentials) in text; returns 1 if secrets found. `redact_secrets()` — replaces detected secret patterns with `[REDACTED]`. | issue-lifecycle.sh |
 | `lib/stack-lock.sh` | File-based lock protocol for singleton project stack access. `stack_lock_acquire(holder, project)` — polls until free, breaks stale heartbeats (>10 min old), claims lock. `stack_lock_release(project)` — deletes lock file. `stack_lock_check(project)` — inspect current lock state. `stack_lock_heartbeat(project)` — update heartbeat timestamp (callers must call every 2 min while holding). Lock files at `~/data/locks/<project>-stack.lock`. | docker/edge/dispatcher.sh, reproduce formula |
@ -22,13 +22,16 @@ sourced as needed.
 | `lib/worktree.sh` | Reusable git worktree management: `worktree_create(path, branch, [base_ref])` — create worktree, checkout base, fetch submodules. `worktree_recover(path, branch, [remote])` — detect existing worktree, reuse if on correct branch (sets `_WORKTREE_REUSED`), otherwise clean and recreate. `worktree_cleanup(path)` — `git worktree remove --force`, clear Claude Code project cache (`~/.claude/projects/` matching path). `worktree_cleanup_stale([max_age_hours])` — scan `/tmp` for orphaned worktrees older than threshold, skip preserved and active tmux worktrees, prune. `worktree_preserve(path, reason)` — mark worktree as preserved for debugging (writes `.worktree-preserved` marker, skipped by stale cleanup). | dev-agent.sh, supervisor-run.sh, planner-run.sh, predictor-run.sh, gardener-run.sh |
 | `lib/pr-lifecycle.sh` | Reusable PR lifecycle library: `pr_create()`, `pr_find_by_branch()`, `pr_poll_ci()`, `pr_poll_review()`, `pr_merge()`, `pr_is_merged()`, `pr_walk_to_merge()`, `build_phase_protocol_prompt()`. Requires `lib/ci-helpers.sh`. | dev-agent.sh (future) |
 | `lib/issue-lifecycle.sh` | Reusable issue lifecycle library: `issue_claim()` (add in-progress, remove backlog), `issue_release()` (remove in-progress, add backlog), `issue_block()` (post diagnostic comment with secret redaction, add blocked label), `issue_close()`, `issue_check_deps()` (parse deps, check transitive closure; sets `_ISSUE_BLOCKED_BY`, `_ISSUE_SUGGESTION`), `issue_suggest_next()` (find next unblocked backlog issue; sets `_ISSUE_NEXT`), `issue_post_refusal()` (structured refusal comment with dedup). Label IDs cached in globals on first lookup. Sources `lib/secret-scan.sh`. | dev-agent.sh (future) |
-| `lib/vault.sh` | **Vault PR helper** — create vault action PRs on ops repo via Forgejo API (works from containers without SSH). `vault_request <action_id> <toml_content>` validates TOML (using `validate_vault_action` from `vault/vault-env.sh`), creates branch `vault/<action-id>`, writes `vault/actions/<action-id>.toml`, creates PR targeting `main` with title `vault: <action-id>` and body from context field, returns PR number. Idempotent: if PR exists, returns existing number. **Low-tier bypass**: if the action's `blast_radius` classifies as `low` (via `vault/classify.sh`), `vault_request` calls `_vault_commit_direct()` which commits directly to ops `main` using `FORGE_ADMIN_TOKEN` — no PR, no approval wait. Returns `0` (not a PR number) for direct commits. Requires `FORGE_TOKEN`, `FORGE_ADMIN_TOKEN` (low-tier only), `FORGE_URL`, `FORGE_REPO`, `FORGE_OPS_REPO`. Uses the calling agent's own token (saves/restores `FORGE_TOKEN` around sourcing `vault-env.sh`), so approval workflow respects individual agent identities. | dev-agent (vault actions), future vault dispatcher |
+| `lib/action-vault.sh` | **Vault PR helper** — create vault action PRs on ops repo via Forgejo API (works from containers without SSH). `vault_request <action_id> <toml_content>` validates TOML (using `validate_vault_action` from `action-vault/vault-env.sh`), creates branch `vault/<action-id>`, writes `vault/actions/<action-id>.toml`, creates PR targeting `main` with title `vault: <action-id>` and body from context field, returns PR number. Idempotent: if PR exists, returns existing number. **Low-tier bypass**: if the action's `blast_radius` classifies as `low` (via `action-vault/classify.sh`), `vault_request` calls `_vault_commit_direct()` which commits directly to ops `main` using `FORGE_ADMIN_TOKEN` — no PR, no approval wait. Returns `0` (not a PR number) for direct commits. Requires `FORGE_TOKEN`, `FORGE_ADMIN_TOKEN` (low-tier only), `FORGE_URL`, `FORGE_REPO`, `FORGE_OPS_REPO`. Uses the calling agent's own token (saves/restores `FORGE_TOKEN` around sourcing `vault-env.sh`), so approval workflow respects individual agent identities. | dev-agent (vault actions), future vault dispatcher |
 | `lib/branch-protection.sh` | Branch protection helpers for Forgejo repos. `setup_vault_branch_protection()` — configures admin-only merge protection on main (require 1 approval, restrict merge to admin role, block direct pushes). `setup_profile_branch_protection()` — same protection for `.profile` repos. `verify_branch_protection()` — checks protection is correctly configured. `remove_branch_protection()` — removes protection (cleanup/testing). Handles race condition after initial push: retries with backoff if Forgejo hasn't processed the branch yet. Requires `FORGE_TOKEN`, `FORGE_URL`, `FORGE_OPS_REPO`. | bin/disinto (hire-an-agent) |
-| `lib/agent-sdk.sh` | `agent_run([--resume SESSION_ID] [--worktree DIR] PROMPT)` — one-shot `claude -p` invocation with session persistence. Saves session ID to `SID_FILE`, reads it back on resume. `agent_recover_session()` — restore previous session ID from `SID_FILE` on startup. **Nudge guard**: skips nudge injection if the worktree is clean and no push is expected, preventing spurious re-invocations. Callers must define `SID_FILE`, `LOGFILE`, and `log()` before sourcing. | formula-driven agents (dev-agent, planner-run, predictor-run, gardener-run) |
+| `lib/agent-sdk.sh` | `agent_run([--resume SESSION_ID] [--worktree DIR] PROMPT)` — one-shot `claude -p` invocation with session persistence. Saves session ID to `SID_FILE`, reads it back on resume. `agent_recover_session()` — restore previous session ID from `SID_FILE` on startup. **Nudge guard**: skips nudge injection if the worktree is clean and no push is expected, preventing spurious re-invocations. Callers must define `SID_FILE`, `LOGFILE`, and `log()` before sourcing. **Concurrency**: external `flock` on `session.lock` is gated behind `CLAUDE_EXTERNAL_LOCK=1` (default off). When unset, each container's per-session `CLAUDE_CONFIG_DIR` isolation lets Claude Code's native lockfile handle OAuth refresh — no external serialization needed. Set `CLAUDE_EXTERNAL_LOCK=1` to re-enable the old flock wrapper as a rollback mechanism. See [`docs/CLAUDE-AUTH-CONCURRENCY.md`](../docs/CLAUDE-AUTH-CONCURRENCY.md) and AD-002 (#647). | formula-driven agents (dev-agent, planner-run, predictor-run, gardener-run) |
 | `lib/forge-setup.sh` | `setup_forge()` — Forgejo instance provisioning: creates admin user, bot accounts, org, repos (code + ops), configures webhooks, sets repo topics. Extracted from `bin/disinto`. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`. **Password storage (#361)**: after creating each bot account, stores its password in `.env` as `FORGE_<BOT>_PASS` (e.g. `FORGE_PASS`, `FORGE_REVIEW_PASS`, etc.) for use by `forge-push.sh`. | bin/disinto (init) |
 | `lib/forge-push.sh` | `push_to_forge()` — pushes a local clone to the Forgejo remote and verifies the push. `_assert_forge_push_globals()` validates required env vars before use. Requires `FORGE_URL`, `FORGE_PASS`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. **Auth**: uses `FORGE_PASS` (bot password) for git HTTP push — Forgejo 11.x rejects API tokens for `git push` (#361). | bin/disinto (init) |
 | `lib/git-creds.sh` | Shared git credential helper configuration. `configure_git_creds([HOME_DIR] [RUN_AS_CMD])` — writes a static credential helper script and configures git globally to use password-based HTTP auth (Forgejo 11.x rejects API tokens for `git push`, #361). **Retry on cold boot (#741)**: resolves bot username from `FORGE_TOKEN` with 5 retries (exponential backoff 1-5s); fails loudly and returns 1 if Forgejo is unreachable — never falls back to a wrong hardcoded default (exports `BOT_USER` on success). `repair_baked_cred_urls([--as RUN_AS_CMD] DIR ...)` — rewrites any git remote URLs that have credentials baked in to use clean URLs instead; uses `safe.directory` bypass for root-owned repos (#671). Requires `FORGE_PASS`, `FORGE_URL`, `FORGE_TOKEN`. | entrypoints (agents, edge) |
 | `lib/ops-setup.sh` | `setup_ops_repo()` — creates ops repo on Forgejo if it doesn't exist, configures bot collaborators, clones/initializes ops repo locally, seeds directory structure (vault, knowledge, evidence, sprints). Evidence subdirectories seeded: engagement/, red-team/, holdout/, evolution/, user-test/. Also seeds sprints/ for architect output. Exports `_ACTUAL_OPS_SLUG`. `migrate_ops_repo(ops_root, [primary_branch])` — idempotent migration helper that seeds missing directories and .gitkeep files on existing ops repos (pre-#407 deployments). | bin/disinto (init) |
-| `lib/ci-setup.sh` | `_install_cron_impl()` — installs crontab entries for bare-metal deployments (compose mode uses polling loop instead). `_create_woodpecker_oauth_impl()` — creates OAuth2 app on Forgejo for Woodpecker. `_generate_woodpecker_token_impl()` — auto-generates WOODPECKER_TOKEN via OAuth2 flow. `_activate_woodpecker_repo_impl()` — activates repo in Woodpecker. All gated by `_load_ci_context()` which validates required env vars. | bin/disinto (init) |
+| `lib/ci-setup.sh` | `_install_cron_impl()` — installs crontab entries for bare-metal deployments (compose mode uses polling loop instead). `_create_forgejo_oauth_app()` — generic helper to create an OAuth2 app on Forgejo (shared by Woodpecker and chat). `_create_woodpecker_oauth_impl()` — creates Woodpecker OAuth2 app (thin wrapper). `_create_chat_oauth_impl()` — creates disinto-chat OAuth2 app, writes `CHAT_OAUTH_CLIENT_ID`/`CHAT_OAUTH_CLIENT_SECRET` to `.env` (#708). `_generate_woodpecker_token_impl()` — auto-generates WOODPECKER_TOKEN via OAuth2 flow. `_activate_woodpecker_repo_impl()` — activates repo in Woodpecker. All gated by `_load_ci_context()` which validates required env vars. | bin/disinto (init) |
-| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility), `generate_caddyfile()` — Caddyfile, `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
+| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
 | `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses `<!-- filer:begin --> ... <!-- filer:end -->` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `<!-- decomposed-from: #<vision>, sprint: <slug>, id: <id> -->` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) |
 | `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) |
 | `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) |
 | `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | Not sourced at runtime yet — pure scaffolding for Nomad+Vault migration (#799) |
--- a/lib/action-vault.sh
+++ b/lib/action-vault.sh
@ -1,9 +1,9 @@
 #!/usr/bin/env bash
-# vault.sh — Helper for agents to create vault PRs on ops repo
+# action-vault.sh — Helper for agents to create vault PRs on ops repo
 #
 # Source after lib/env.sh:
 #   source "$(dirname "$0")/../lib/env.sh"
-#   source "$(dirname "$0")/lib/vault.sh"
+#   source "$(dirname "$0")/lib/action-vault.sh"
 #
 # Required globals: FORGE_TOKEN, FORGE_URL, FORGE_REPO, FORGE_OPS_REPO
 # Optional: OPS_REPO_ROOT (local path for ops repo)
@ -12,7 +12,7 @@
 #   vault_request <action_id> <toml_content>  — Create vault PR, return PR number
 #
 # The function:
-# 1. Validates TOML content using validate_vault_action() from vault/vault-env.sh
+# 1. Validates TOML content using validate_vault_action() from action-vault/vault-env.sh
 # 2. Creates a branch on the ops repo: vault/<action-id>
 # 3. Writes TOML to vault/actions/<action-id>.toml on that branch
 # 4. Creates PR targeting main with title "vault: <action-id>"
@ -133,7 +133,7 @@ vault_request() {
  printf '%s' "$toml_content" > "$tmp_toml"
  # Source vault-env.sh for validate_vault_action
-  local vault_env="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/vault/vault-env.sh"
+  local vault_env="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/action-vault/vault-env.sh"
  if [ ! -f "$vault_env" ]; then
    echo "ERROR: vault-env.sh not found at $vault_env" >&2
    return 1
@ -161,7 +161,7 @@ vault_request() {
  ops_api="$(_vault_ops_api)"
  # Classify the action to determine if PR bypass is allowed
-  local classify_script="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/vault/classify.sh"
+  local classify_script="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/action-vault/classify.sh"
  local vault_tier
  vault_tier=$("$classify_script" "${VAULT_ACTION_FORMULA:-}" "${VAULT_BLAST_RADIUS_OVERRIDE:-}") || {
    # Classification failed, default to high tier (require PR)
--- a/lib/agent-sdk.sh
+++ b/lib/agent-sdk.sh
@ -27,6 +27,96 @@ agent_recover_session() {
  fi
 }
 # claude_run_with_watchdog — run claude with idle-after-final-message watchdog
 #
 # Mitigates upstream Claude Code hang (#591) by detecting when the final
 # assistant message has been written and terminating the process after a
 # short grace period instead of waiting for CLAUDE_TIMEOUT.
 #
 # The watchdog:
 #   1. Streams claude stdout to a temp file
 #   2. Polls for the final result marker ("type":"result" for stream-json
 #      or closing } for regular json output)
 #   3. After detecting the final marker, starts a CLAUDE_IDLE_GRACE countdown
 #   4. SIGTERM claude if it hasn't exited cleanly within the grace period
 #   5. Falls back to CLAUDE_TIMEOUT as the absolute hard ceiling
 #
 # Usage: claude_run_with_watchdog claude [args...]
 # Expects: LOGFILE, CLAUDE_TIMEOUT, CLAUDE_IDLE_GRACE (default 30)
 # Returns: exit code from claude or timeout
 claude_run_with_watchdog() {
  local -a cmd=("$@")
  local out_file pid grace_pid rc
  # Create temp file for stdout capture
  out_file=$(mktemp) || return 1
  trap 'rm -f "$out_file"' RETURN
  # Start claude in background, capturing stdout to temp file
  "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
  pid=$!
  # Background watchdog: poll for final result marker
  (
    local grace="${CLAUDE_IDLE_GRACE:-30}"
    local detected=0
    while kill -0 "$pid" 2>/dev/null; do
      # Check for stream-json result marker first (more reliable)
      if grep -q '"type":"result"' "$out_file" 2>/dev/null; then
        detected=1
        break
      fi
      # Fallback: check for closing brace of top-level result object
      if tail -c 100 "$out_file" 2>/dev/null | grep -q '}[[:space:]]*$'; then
        # Verify it looks like a JSON result (has session_id or result key)
        if grep -qE '"(session_id|result)":' "$out_file" 2>/dev/null; then
          detected=1
          break
        fi
      fi
      sleep 2
    done
    # If we detected a final message, wait grace period then kill if still running
    if [ "$detected" -eq 1 ] && kill -0 "$pid" 2>/dev/null; then
      log "watchdog: final result detected, ${grace}s grace period before SIGTERM"
      sleep "$grace"
      if kill -0 "$pid" 2>/dev/null; then
        log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
        kill -TERM "$pid" 2>/dev/null || true
        # Give it a moment to clean up
        sleep 5
        if kill -0 "$pid" 2>/dev/null; then
          log "watchdog: force kill after SIGTERM timeout"
          kill -KILL "$pid" 2>/dev/null || true
        fi
      fi
    fi
  ) &
  grace_pid=$!
  # Hard ceiling timeout (existing behavior) — use tail --pid to wait for process
  timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null
  rc=$?
  # Clean up the watchdog
  kill "$grace_pid" 2>/dev/null || true
  wait "$grace_pid" 2>/dev/null || true
  # When timeout fires (rc=124), explicitly kill the orphaned claude process
  # tail --pid is a passive waiter, not a supervisor
  if [ "$rc" -eq 124 ]; then
    kill "$pid" 2>/dev/null || true
    sleep 1
    kill -KILL "$pid" 2>/dev/null || true
  fi
  # Output the captured stdout
  cat "$out_file"
  return "$rc"
 }
 # agent_run — synchronous Claude invocation (one-shot claude -p)
 # Usage: agent_run [--resume SESSION_ID] [--worktree DIR] PROMPT
 # Sets: _AGENT_SESSION_ID (updated each call, persisted to SID_FILE)
@ -41,16 +131,24 @@ agent_run() {
  done
  local prompt="${1:-}"
  _AGENT_LAST_OUTPUT=""
  local -a args=(-p "$prompt" --output-format json --dangerously-skip-permissions --max-turns 200)
  [ -n "$resume_id" ] && args+=(--resume "$resume_id")
  [ -n "${CLAUDE_MODEL:-}" ] && args+=(--model "$CLAUDE_MODEL")
  local run_dir="${worktree_dir:-$(pwd)}"
  local lock_file="${HOME}/.claude/session.lock"
  mkdir -p "$(dirname "$lock_file")"
  local output rc
  log "agent_run: starting (resume=${resume_id:-(new)}, dir=${run_dir})"
-  output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$?
+  # External flock is redundant once CLAUDE_CONFIG_DIR rollout is verified (#647).
  # Gate behind CLAUDE_EXTERNAL_LOCK for rollback safety; default off.
  if [ -n "${CLAUDE_EXTERNAL_LOCK:-}" ]; then
    mkdir -p "$(dirname "$lock_file")"
    output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude "${args[@]}" ) 9>"$lock_file" 2>>"$LOGFILE") && rc=0 || rc=$?
  else
    output=$(cd "$run_dir" && claude_run_with_watchdog claude "${args[@]}" 2>>"$LOGFILE") && rc=0 || rc=$?
  fi
  if [ "$rc" -eq 124 ]; then
    log "agent_run: timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $rc)"
  elif [ "$rc" -ne 0 ]; then
@ -75,7 +173,9 @@ agent_run() {
  # Save output for diagnostics (no_push, crashes)
  _AGENT_LAST_OUTPUT="$output"
-  local diag_file="${DISINTO_LOG_DIR:-/tmp}/dev/agent-run-last.json"
+  local diag_dir="${DISINTO_LOG_DIR:-/tmp}/${LOG_AGENT:-dev}"
  mkdir -p "$diag_dir" 2>/dev/null || true
  local diag_file="${diag_dir}/agent-run-last.json"
  printf '%s' "$output" > "$diag_file" 2>/dev/null || true
  # Nudge: if the model stopped without pushing, resume with encouragement.
@ -91,7 +191,11 @@ agent_run() {
        local nudge="You stopped but did not push any code. You have uncommitted changes. Commit them and push."
        log "agent_run: nudging (uncommitted changes)"
        local nudge_rc
-        output=$(cd "$run_dir" && flock -w 600 "$lock_file" timeout "${CLAUDE_TIMEOUT:-7200}" claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
+        if [ -n "${CLAUDE_EXTERNAL_LOCK:-}" ]; then
          output=$(cd "$run_dir" && ( flock -w 600 9 || exit 1; claude_run_with_watchdog claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} ) 9>"$lock_file" 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
        else
          output=$(cd "$run_dir" && claude_run_with_watchdog claude -p "$nudge" --resume "$_AGENT_SESSION_ID" --output-format json --dangerously-skip-permissions --max-turns 50 ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} 2>>"$LOGFILE") && nudge_rc=0 || nudge_rc=$?
        fi
        if [ "$nudge_rc" -eq 124 ]; then
          log "agent_run: nudge timeout after ${CLAUDE_TIMEOUT:-7200}s (exit code $nudge_rc)"
        elif [ "$nudge_rc" -ne 0 ]; then
--- a/lib/branch-protection.sh
+++ b/lib/branch-protection.sh
@ -34,6 +34,55 @@ _ops_api() {
  printf '%s' "${FORGE_URL}/api/v1/repos/${FORGE_OPS_REPO}"
 }
 # -----------------------------------------------------------------------------
 # _bp_wait_for_branch — Wait for Forgejo to index a branch with exponential backoff
 #
 # Forgejo's branch indexer can take 5–15s to register a newly-pushed branch.
 # This helper retries up to 10 times with exponential backoff (2s, 4s, 6s, …)
 # capped at 10s per wait, for a worst-case total of ~70s.
 #
 # Args:
 #   $1 - Full API URL for the repo (e.g. https://forge.example/api/v1/repos/owner/repo)
 #   $2 - Branch name
 #   $3 - Human-readable repo identifier for log messages
 #
 # Returns: 0 if branch found, 1 if not found after all retries
 # -----------------------------------------------------------------------------
 _bp_wait_for_branch() {
  local api_url="$1"
  local branch="$2"
  local repo_label="$3"
  local max_retries=10
  local base_wait=2
  local attempt=1
  local branch_status="0"
  while [ "$attempt" -le "$max_retries" ]; do
    branch_status=$(curl -s -o /dev/null -w "%{http_code}" \
      -H "Authorization: token ${FORGE_TOKEN}" \
      "${api_url}/git/branches/${branch}" 2>/dev/null || echo "0")
    if [ "$branch_status" = "200" ]; then
      _bp_log "Branch ${branch} exists on ${repo_label}"
      return 0
    fi
    if [ "$attempt" -lt "$max_retries" ]; then
      local wait_time=$(( base_wait * attempt ))
      if [ "$wait_time" -gt 10 ]; then
        wait_time=10
      fi
      _bp_log "Branch ${branch} not indexed yet (attempt ${attempt}/${max_retries}), waiting ${wait_time}s..."
      sleep "$wait_time"
    fi
    attempt=$((attempt + 1))
  done
  _bp_log "ERROR: Branch ${branch} does not exist on ${repo_label} after ${max_retries} attempts"
  return 1
 }
 # -----------------------------------------------------------------------------
 # setup_vault_branch_protection — Set up admin-only branch protection for main
 #
@ -51,30 +100,8 @@ setup_vault_branch_protection() {
  _bp_log "Setting up branch protection for ${branch} on ${FORGE_OPS_REPO}"
-  # Check if branch exists with retry loop (handles race condition after initial push)
+  # Wait for Forgejo to index the branch (may take 5–15s after push)
-  local branch_exists="0"
+  if ! _bp_wait_for_branch "$api_url" "$branch" "$FORGE_OPS_REPO"; then
  local max_attempts=3
  local attempt=1
  while [ "$attempt" -le "$max_attempts" ]; do
    branch_exists=$(curl -s -o /dev/null -w "%{http_code}" \
      -H "Authorization: token ${FORGE_TOKEN}" \
      "${api_url}/git/branches/${branch}" 2>/dev/null || echo "0")
    if [ "$branch_exists" = "200" ]; then
      _bp_log "Branch ${branch} exists on ${FORGE_OPS_REPO}"
      break
    fi
    if [ "$attempt" -lt "$max_attempts" ]; then
      _bp_log "Branch ${branch} not indexed yet (attempt ${attempt}/${max_attempts}), waiting 2s..."
      sleep 2
    fi
    attempt=$((attempt + 1))
  done
  if [ "$branch_exists" != "200" ]; then
    _bp_log "ERROR: Branch ${branch} does not exist on ${FORGE_OPS_REPO} after ${max_attempts} attempts"
    return 1
  fi
@ -244,30 +271,8 @@ setup_profile_branch_protection() {
  local api_url
  api_url="${FORGE_URL}/api/v1/repos/${repo}"
-  # Check if branch exists with retry loop (handles race condition after initial push)
+  # Wait for Forgejo to index the branch (may take 5–15s after push)
-  local branch_exists="0"
+  if ! _bp_wait_for_branch "$api_url" "$branch" "$repo"; then
  local max_attempts=3
  local attempt=1
  while [ "$attempt" -le "$max_attempts" ]; do
    branch_exists=$(curl -s -o /dev/null -w "%{http_code}" \
      -H "Authorization: token ${FORGE_TOKEN}" \
      "${api_url}/git/branches/${branch}" 2>/dev/null || echo "0")
    if [ "$branch_exists" = "200" ]; then
      _bp_log "Branch ${branch} exists on ${repo}"
      break
    fi
    if [ "$attempt" -lt "$max_attempts" ]; then
      _bp_log "Branch ${branch} not indexed yet (attempt ${attempt}/${max_attempts}), waiting 2s..."
      sleep 2
    fi
    attempt=$((attempt + 1))
  done
  if [ "$branch_exists" != "200" ]; then
    _bp_log "ERROR: Branch ${branch} does not exist on ${repo} after ${max_attempts} attempts"
    return 1
  fi
@ -430,30 +435,8 @@ setup_project_branch_protection() {
  local api_url
  api_url="${FORGE_URL}/api/v1/repos/${repo}"
-  # Check if branch exists with retry loop (handles race condition after initial push)
+  # Wait for Forgejo to index the branch (may take 5–15s after push)
-  local branch_exists="0"
+  if ! _bp_wait_for_branch "$api_url" "$branch" "$repo"; then
  local max_attempts=3
  local attempt=1
  while [ "$attempt" -le "$max_attempts" ]; do
    branch_exists=$(curl -s -o /dev/null -w "%{http_code}" \
      -H "Authorization: token ${FORGE_TOKEN}" \
      "${api_url}/git/branches/${branch}" 2>/dev/null || echo "0")
    if [ "$branch_exists" = "200" ]; then
      _bp_log "Branch ${branch} exists on ${repo}"
      break
    fi
    if [ "$attempt" -lt "$max_attempts" ]; then
      _bp_log "Branch ${branch} not indexed yet (attempt ${attempt}/${max_attempts}), waiting 2s..."
      sleep 2
    fi
    attempt=$((attempt + 1))
  done
  if [ "$branch_exists" != "200" ]; then
    _bp_log "ERROR: Branch ${branch} does not exist on ${repo} after ${max_attempts} attempts"
    return 1
  fi
--- a/lib/ci-setup.sh
+++ b/lib/ci-setup.sh
@ -4,7 +4,9 @@
 #
 # Internal functions (called via _load_ci_context + _*_impl):
 #   _install_cron_impl()              - Install crontab entries (bare-metal only; compose uses polling loop)
 #   _create_forgejo_oauth_app()       - Generic: create an OAuth2 app on Forgejo (shared helper)
 #   _create_woodpecker_oauth_impl()   - Create OAuth2 app on Forgejo for Woodpecker
 #   _create_chat_oauth_impl()         - Create OAuth2 app on Forgejo for disinto-chat
 #   _generate_woodpecker_token_impl() - Auto-generate WOODPECKER_TOKEN via OAuth2 flow
 #   _activate_woodpecker_repo_impl()  - Activate repo in Woodpecker
 #
@ -45,9 +47,9 @@ _install_cron_impl() {
  # Bare mode: crontab is required on the host
  if ! command -v crontab &>/dev/null; then
-    echo "Error: crontab not found (required for bare-metal mode)" >&2
+    echo "Warning: crontab not found (required for bare-metal scheduling)" >&2
    echo "  Install: apt install cron  /  brew install cron" >&2
-    exit 1
+    return 1
  fi
  # Use absolute path for the TOML in cron entries
@ -90,6 +92,54 @@ _install_cron_impl() {
  fi
 }
 # Create an OAuth2 application on Forgejo.
 # Generic helper used by both Woodpecker and chat OAuth setup.
 # Sets _OAUTH_CLIENT_ID and _OAUTH_CLIENT_SECRET on success.
 # Usage: _create_forgejo_oauth_app <app_name> <redirect_uri>
 _create_forgejo_oauth_app() {
  local oauth2_name="$1"
  local redirect_uri="$2"
  local forge_url="${FORGE_URL}"
  _OAUTH_CLIENT_ID=""
  _OAUTH_CLIENT_SECRET=""
  local existing_app
  existing_app=$(curl -sf \
    -H "Authorization: token ${FORGE_TOKEN}" \
    "${forge_url}/api/v1/user/applications/oauth2" 2>/dev/null \
    | jq -r --arg name "$oauth2_name" '.[] | select(.name == $name) | .client_id // empty' 2>/dev/null) || true
  if [ -n "$existing_app" ]; then
    echo "OAuth2:  ${oauth2_name} (already exists, client_id=${existing_app})"
    _OAUTH_CLIENT_ID="$existing_app"
    return 0
  fi
  local oauth2_resp
  oauth2_resp=$(curl -sf -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
    "${forge_url}/api/v1/user/applications/oauth2" \
    -d "{\"name\":\"${oauth2_name}\",\"redirect_uris\":[\"${redirect_uri}\"],\"confidential_client\":true}" \
    2>/dev/null) || oauth2_resp=""
  if [ -z "$oauth2_resp" ]; then
    echo "Warning: failed to create OAuth2 app '${oauth2_name}' on Forgejo" >&2
    return 1
  fi
  _OAUTH_CLIENT_ID=$(printf '%s' "$oauth2_resp" | jq -r '.client_id // empty')
  _OAUTH_CLIENT_SECRET=$(printf '%s' "$oauth2_resp" | jq -r '.client_secret // empty')
  if [ -z "$_OAUTH_CLIENT_ID" ]; then
    echo "Warning: OAuth2 app creation returned no client_id" >&2
    return 1
  fi
  echo "OAuth2:  ${oauth2_name} created (client_id=${_OAUTH_CLIENT_ID})"
 }
 # Set up Woodpecker CI to use Forgejo as its forge backend.
 # Creates an OAuth2 app on Forgejo for Woodpecker, activates the repo.
 # Usage: create_woodpecker_oauth <forge_url> <repo_slug>
@ -100,44 +150,9 @@ _create_woodpecker_oauth_impl() {
  echo ""
  echo "── Woodpecker OAuth2 setup ────────────────────────────"
-  # Create OAuth2 application on Forgejo for Woodpecker
+  _create_forgejo_oauth_app "woodpecker-ci" "http://localhost:8000/authorize" || return 0
-  local oauth2_name="woodpecker-ci"
+  local client_id="${_OAUTH_CLIENT_ID}"
-  local redirect_uri="http://localhost:8000/authorize"
+  local client_secret="${_OAUTH_CLIENT_SECRET}"
  local existing_app client_id client_secret
  # Check if OAuth2 app already exists
  existing_app=$(curl -sf \
    -H "Authorization: token ${FORGE_TOKEN}" \
    "${forge_url}/api/v1/user/applications/oauth2" 2>/dev/null \
    | jq -r --arg name "$oauth2_name" '.[] | select(.name == $name) | .client_id // empty' 2>/dev/null) || true
  if [ -n "$existing_app" ]; then
    echo "OAuth2:  ${oauth2_name} (already exists, client_id=${existing_app})"
    client_id="$existing_app"
  else
    local oauth2_resp
    oauth2_resp=$(curl -sf -X POST \
      -H "Authorization: token ${FORGE_TOKEN}" \
      -H "Content-Type: application/json" \
      "${forge_url}/api/v1/user/applications/oauth2" \
      -d "{\"name\":\"${oauth2_name}\",\"redirect_uris\":[\"${redirect_uri}\"],\"confidential_client\":true}" \
      2>/dev/null) || oauth2_resp=""
    if [ -z "$oauth2_resp" ]; then
      echo "Warning: failed to create OAuth2 app on Forgejo" >&2
      return
    fi
    client_id=$(printf '%s' "$oauth2_resp" | jq -r '.client_id // empty')
    client_secret=$(printf '%s' "$oauth2_resp" | jq -r '.client_secret // empty')
    if [ -z "$client_id" ]; then
      echo "Warning: OAuth2 app creation returned no client_id" >&2
      return
    fi
    echo "OAuth2:  ${oauth2_name} created (client_id=${client_id})"
  fi
  # Store Woodpecker forge config in .env
  # WP_FORGEJO_CLIENT/SECRET match the docker-compose.yml variable references
@ -166,6 +181,39 @@ _create_woodpecker_oauth_impl() {
  echo "Config:  Woodpecker forge vars written to .env"
 }
 # Create OAuth2 app on Forgejo for disinto-chat.
 # Writes CHAT_OAUTH_CLIENT_ID / CHAT_OAUTH_CLIENT_SECRET to .env.
 # Usage: _create_chat_oauth_impl <redirect_uri>
 _create_chat_oauth_impl() {
  local redirect_uri="$1"
  echo ""
  echo "── Chat OAuth2 setup ──────────────────────────────────"
  _create_forgejo_oauth_app "disinto-chat" "$redirect_uri" || return 0
  local client_id="${_OAUTH_CLIENT_ID}"
  local client_secret="${_OAUTH_CLIENT_SECRET}"
  local env_file="${FACTORY_ROOT}/.env"
  local chat_vars=()
  if [ -n "${client_id:-}" ]; then
    chat_vars+=("CHAT_OAUTH_CLIENT_ID=${client_id}")
  fi
  if [ -n "${client_secret:-}" ]; then
    chat_vars+=("CHAT_OAUTH_CLIENT_SECRET=${client_secret}")
  fi
  for var_line in "${chat_vars[@]}"; do
    local var_name="${var_line%%=*}"
    if grep -q "^${var_name}=" "$env_file" 2>/dev/null; then
      sed -i "s|^${var_name}=.*|${var_line}|" "$env_file"
    else
      printf '%s\n' "$var_line" >> "$env_file"
    fi
  done
  echo "Config:  Chat OAuth vars written to .env"
 }
 # Auto-generate WOODPECKER_TOKEN by driving the Forgejo OAuth2 login flow.
 # Requires _FORGE_ADMIN_PASS (set by setup_forge when admin user was just created).
 # Called after compose stack is up, before activate_woodpecker_repo.
--- a/lib/claude-config.sh
+++ b/lib/claude-config.sh
@ -0,0 +1,103 @@
 #!/usr/bin/env bash
 # lib/claude-config.sh — Shared Claude config directory helpers (#641)
 #
 # Provides setup_claude_config_dir() for creating/migrating CLAUDE_CONFIG_DIR
 # and _env_set_idempotent() for writing env vars to .env files.
 #
 # Requires: CLAUDE_CONFIG_DIR, CLAUDE_SHARED_DIR (set by lib/env.sh)
 # Idempotent .env writer.
 # Usage: _env_set_idempotent KEY VALUE FILE
 _env_set_idempotent() {
  local key="$1" value="$2" file="$3"
  if grep -q "^${key}=" "$file" 2>/dev/null; then
    local existing
    existing=$(grep "^${key}=" "$file" | head -1 | cut -d= -f2-)
    if [ "$existing" != "$value" ]; then
      sed -i "s|^${key}=.*|${key}=${value}|" "$file"
    fi
  else
    printf '%s=%s\n' "$key" "$value" >> "$file"
  fi
 }
 # Create the shared CLAUDE_CONFIG_DIR, optionally migrating ~/.claude.
 # Usage: setup_claude_config_dir [auto_yes]
 setup_claude_config_dir() {
  local auto_yes="${1:-false}"
  local home_claude="${HOME}/.claude"
  # Create the shared config directory (idempotent)
  install -d -m 0700 -o "$USER" "$CLAUDE_CONFIG_DIR"
  echo "Claude:  ${CLAUDE_CONFIG_DIR} (ready)"
  # If ~/.claude is already a symlink to CLAUDE_CONFIG_DIR, nothing to do
  if [ -L "$home_claude" ]; then
    local link_target
    link_target=$(readlink -f "$home_claude")
    local config_real
    config_real=$(readlink -f "$CLAUDE_CONFIG_DIR")
    if [ "$link_target" = "$config_real" ]; then
      echo "Claude:  ${home_claude} -> ${CLAUDE_CONFIG_DIR} (symlink OK)"
      return 0
    fi
  fi
  local home_exists=false home_nonempty=false
  local config_nonempty=false
  # Check ~/.claude (skip if it's a symlink — already handled above)
  if [ -d "$home_claude" ] && [ ! -L "$home_claude" ]; then
    home_exists=true
    if [ -n "$(ls -A "$home_claude" 2>/dev/null)" ]; then
      home_nonempty=true
    fi
  fi
  # Check CLAUDE_CONFIG_DIR contents
  if [ -n "$(ls -A "$CLAUDE_CONFIG_DIR" 2>/dev/null)" ]; then
    config_nonempty=true
  fi
  # Case: both non-empty — abort, operator must reconcile
  if [ "$home_nonempty" = true ] && [ "$config_nonempty" = true ]; then
    echo "ERROR: both ${home_claude} and ${CLAUDE_CONFIG_DIR} exist and are non-empty" >&2
    echo "  Reconcile manually: merge or remove one, then re-run disinto init" >&2
    return 1
  fi
  # Case: ~/.claude exists and CLAUDE_CONFIG_DIR is empty — offer migration
  if [ "$home_nonempty" = true ] && [ "$config_nonempty" = false ]; then
    local do_migrate=false
    if [ "$auto_yes" = true ]; then
      do_migrate=true
    elif [ -t 0 ]; then
      read -rp "Migrate ${home_claude} to ${CLAUDE_CONFIG_DIR}? [Y/n] " confirm
      if [[ ! "$confirm" =~ ^[Nn] ]]; then
        do_migrate=true
      fi
    else
      echo "Warning: ${home_claude} exists but cannot prompt for migration (no TTY)" >&2
      echo "  Re-run with --yes to auto-migrate, or move files manually" >&2
      return 0
    fi
    if [ "$do_migrate" = true ]; then
      # Move contents (not the dir itself) to preserve CLAUDE_CONFIG_DIR ownership
      cp -a "$home_claude/." "$CLAUDE_CONFIG_DIR/"
      rm -rf "$home_claude"
      ln -sfn "$CLAUDE_CONFIG_DIR" "$home_claude"
      echo "Claude:  migrated ${home_claude} -> ${CLAUDE_CONFIG_DIR}"
      return 0
    fi
  fi
  # Case: ~/.claude exists but is empty, or doesn't exist — create symlink
  if [ "$home_exists" = true ] && [ "$home_nonempty" = false ]; then
    rmdir "$home_claude" 2>/dev/null || true
  fi
  if [ ! -e "$home_claude" ]; then
    ln -sfn "$CLAUDE_CONFIG_DIR" "$home_claude"
    echo "Claude:  ${home_claude} -> ${CLAUDE_CONFIG_DIR} (symlink created)"
  fi
 }
--- a/lib/env.sh
+++ b/lib/env.sh
@ -1,12 +1,41 @@
 #!/usr/bin/env bash
 # =============================================================================
 # env.sh — Load environment and shared utilities
 # Source this at the top of every script: source "$(dirname "$0")/lib/env.sh"
 #
 # SURFACE CONTRACT
 #
 # Required preconditions — the entrypoint (or caller) MUST set these before
 # sourcing this file:
 #   USER              — OS user name (e.g. "agent", "johba")
 #   HOME              — home directory (e.g. "/home/agent")
 #
 # Required when PROJECT_TOML is set (i.e. agent scripts loading a project):
 #   PROJECT_REPO_ROOT — absolute path to the project git clone
 #   PRIMARY_BRANCH    — default branch name (e.g. "main")
 #   OPS_REPO_ROOT     — absolute path to the ops repo clone
 #   (these are normally populated by load-project.sh from the TOML)
 #
 # What this file sets / exports:
 #   FACTORY_ROOT, DISINTO_LOG_DIR
 #   .env / .env.enc secrets (FORGE_TOKEN, etc.)
 #   FORGE_API, FORGE_WEB, TEA_LOGIN, FORGE_OPS_REPO (derived from FORGE_URL/FORGE_REPO)
 #   Per-agent tokens (FORGE_REVIEW_TOKEN, FORGE_GARDENER_TOKEN, …)
 #   CLAUDE_SHARED_DIR, CLAUDE_CONFIG_DIR
 #   Helper functions: log(), validate_url(), forge_api(), forge_api_all(),
 #     woodpecker_api(), wpdb(), memory_guard()
 # =============================================================================
 set -euo pipefail
 # Resolve script root (parent of lib/)
 FACTORY_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 # ── Precondition assertions ──────────────────────────────────────────────────
 # These must be set by the entrypoint before sourcing this file.
 : "${USER:?must be set by entrypoint before sourcing lib/env.sh}"
 : "${HOME:?must be set by entrypoint before sourcing lib/env.sh}"
 # Container detection: when running inside the agent container, DISINTO_CONTAINER
 # is set by docker-compose.yml.  Adjust paths so phase files, logs, and thread
 # maps land on the persistent volume instead of /tmp (which is ephemeral).
@ -72,7 +101,6 @@ fi
 # PATH: foundry, node, system
 export PATH="${HOME}/.local/bin:${HOME}/.foundry/bin:${HOME}/.nvm/versions/node/v22.20.0/bin:/usr/local/bin:/usr/bin:/bin:${PATH}"
 export HOME="${HOME:-/home/debian}"
 # Load project TOML if PROJECT_TOML is set (by poll scripts that accept project arg)
 if [ -n "${PROJECT_TOML:-}" ] && [ -f "$PROJECT_TOML" ]; then
@ -93,14 +121,16 @@ export FORGE_VAULT_TOKEN="${FORGE_VAULT_TOKEN:-${FORGE_TOKEN}}"
 export FORGE_SUPERVISOR_TOKEN="${FORGE_SUPERVISOR_TOKEN:-${FORGE_TOKEN}}"
 export FORGE_PREDICTOR_TOKEN="${FORGE_PREDICTOR_TOKEN:-${FORGE_TOKEN}}"
 export FORGE_ARCHITECT_TOKEN="${FORGE_ARCHITECT_TOKEN:-${FORGE_TOKEN}}"
 export FORGE_FILER_TOKEN="${FORGE_FILER_TOKEN:-${FORGE_TOKEN}}"
 # Bot usernames filter
-export FORGE_BOT_USERNAMES="${FORGE_BOT_USERNAMES:-dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,supervisor-bot,predictor-bot,architect-bot}"
+export FORGE_BOT_USERNAMES="${FORGE_BOT_USERNAMES:-dev-bot,review-bot,planner-bot,gardener-bot,vault-bot,supervisor-bot,predictor-bot,architect-bot,filer-bot}"
 # Project config
 export FORGE_REPO="${FORGE_REPO:-}"
 export FORGE_URL="${FORGE_URL:-http://localhost:3000}"
-export FORGE_API="${FORGE_API:-${FORGE_URL}/api/v1/repos/${FORGE_REPO}}"
+export FORGE_API_BASE="${FORGE_API_BASE:-${FORGE_URL}/api/v1}"
 export FORGE_API="${FORGE_API:-${FORGE_API_BASE}/repos/${FORGE_REPO}}"
 export FORGE_WEB="${FORGE_WEB:-${FORGE_URL}/${FORGE_REPO}}"
 # tea CLI login name: derived from FORGE_URL (codeberg vs local forgejo)
 if [ -z "${TEA_LOGIN:-}" ]; then
@ -112,12 +142,14 @@ fi
 export TEA_LOGIN
 export PROJECT_NAME="${PROJECT_NAME:-${FORGE_REPO##*/}}"
 export PROJECT_REPO_ROOT="${PROJECT_REPO_ROOT:-/home/${USER}/${PROJECT_NAME}}"
 export PRIMARY_BRANCH="${PRIMARY_BRANCH:-master}"
-# Ops repo: operational data (vault items, journals, evidence, prerequisites).
+# Project-specific paths: no guessing from USER/HOME — must be set by
-# Default convention: sibling directory named {project}-ops.
+# the entrypoint or loaded from PROJECT_TOML (via load-project.sh above).
-export OPS_REPO_ROOT="${OPS_REPO_ROOT:-/home/${USER}/${PROJECT_NAME}-ops}"
+if [ -n "${PROJECT_TOML:-}" ]; then
  : "${PROJECT_REPO_ROOT:?must be set by entrypoint or PROJECT_TOML before sourcing lib/env.sh}"
  : "${PRIMARY_BRANCH:?must be set by entrypoint or PROJECT_TOML before sourcing lib/env.sh}"
  : "${OPS_REPO_ROOT:?must be set by entrypoint or PROJECT_TOML before sourcing lib/env.sh}"
 fi
 # Forge repo slug for the ops repo (used by agents that commit to ops).
 export FORGE_OPS_REPO="${FORGE_OPS_REPO:-${FORGE_REPO:+${FORGE_REPO}-ops}}"
@ -126,12 +158,19 @@ export WOODPECKER_SERVER="${WOODPECKER_SERVER:-http://localhost:8000}"
 export CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-7200}"
 # Vault-only token guard (#745): external-action tokens (GITHUB_TOKEN, CLAWHUB_TOKEN)
-# must NEVER be available to agents. They live in .env.vault.enc and are injected
+# must NEVER be available to agents. They live in secrets/*.enc and are decrypted
-# only into the ephemeral runner container at fire time. Unset them here so
+# only into the ephemeral runner container at fire time (#777). Unset them here so
 # even an accidental .env inclusion cannot leak them into agent sessions.
 unset GITHUB_TOKEN 2>/dev/null || true
 unset CLAWHUB_TOKEN 2>/dev/null || true
 # Shared Claude config directory for cross-container OAuth lock coherence (#641).
 # All containers and the host resolve to the same CLAUDE_CONFIG_DIR on a shared
 # bind-mounted filesystem, so proper-lockfile's atomic mkdir works across them.
 : "${CLAUDE_SHARED_DIR:=/var/lib/disinto/claude-shared}"
 : "${CLAUDE_CONFIG_DIR:=${CLAUDE_SHARED_DIR}/config}"
 export CLAUDE_SHARED_DIR CLAUDE_CONFIG_DIR
 # Disable Claude Code auto-updater, telemetry, error reporting in factory sessions.
 # Factory processes must never phone home or auto-update mid-session (#725).
 export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1
@ -274,6 +313,68 @@ memory_guard() {
  fi
 }
 # =============================================================================
 # SECRET LOADING ABSTRACTION
 # =============================================================================
 # load_secret NAME [DEFAULT]
 #
 # Resolves a secret value using the following precedence:
 #   1. /secrets/<NAME>.env  — Nomad-rendered template (future)
 #   2. Current environment  — already set by .env.enc, compose, etc.
 #   3. secrets/<NAME>.enc   — age-encrypted per-key file (decrypted on demand)
 #   4. DEFAULT (or empty)
 #
 # Prints the resolved value to stdout.  Caches age-decrypted values in the
 # process environment so subsequent calls are free.
 # =============================================================================
 load_secret() {
  local name="$1"
  local default="${2:-}"
  # 1. Nomad-rendered template (future: Nomad writes /secrets/<NAME>.env)
  local nomad_path="/secrets/${name}.env"
  if [ -f "$nomad_path" ]; then
    # Source into a subshell to extract just the value
    local _nomad_val
    _nomad_val=$(
      set -a
      # shellcheck source=/dev/null
      source "$nomad_path"
      set +a
      printf '%s' "${!name:-}"
    )
    if [ -n "$_nomad_val" ]; then
      export "$name=$_nomad_val"
      printf '%s' "$_nomad_val"
      return 0
    fi
  fi
  # 2. Already in environment (set by .env.enc, compose injection, etc.)
  if [ -n "${!name:-}" ]; then
    printf '%s' "${!name}"
    return 0
  fi
  # 3. Age-encrypted per-key file: secrets/<NAME>.enc (#777)
  local _age_key="${HOME}/.config/sops/age/keys.txt"
  local _enc_path="${FACTORY_ROOT}/secrets/${name}.enc"
  if [ -f "$_enc_path" ] && [ -f "$_age_key" ] && command -v age &>/dev/null; then
    local _dec_val
    if _dec_val=$(age -d -i "$_age_key" "$_enc_path" 2>/dev/null) && [ -n "$_dec_val" ]; then
      export "$name=$_dec_val"
      printf '%s' "$_dec_val"
      return 0
    fi
  fi
  # 4. Default (or empty)
  if [ -n "$default" ]; then
    printf '%s' "$default"
  fi
  return 0
 }
 # Source tea helpers (available when tea binary is installed)
 if command -v tea &>/dev/null; then
  # shellcheck source=tea-helpers.sh
--- a/lib/forge-push.sh
+++ b/lib/forge-push.sh
@ -7,7 +7,6 @@
 # Globals expected:
 #   FORGE_URL    - Forge instance URL (e.g. http://localhost:3000)
 #   FORGE_TOKEN  - API token for Forge operations (used for API verification)
 #   FORGE_PASS   - Bot password for git HTTP push (#361: tokens rejected by Forgejo 11.x)
 #   FACTORY_ROOT - Root of the disinto factory
 #   PRIMARY_BRANCH - Primary branch name (e.g. main)
 #
@ -21,7 +20,6 @@ set -euo pipefail
 _assert_forge_push_globals() {
  local missing=()
  [ -z "${FORGE_URL:-}" ]      && missing+=("FORGE_URL")
  [ -z "${FORGE_PASS:-}" ]     && missing+=("FORGE_PASS")
  [ -z "${FORGE_TOKEN:-}" ]    && missing+=("FORGE_TOKEN")
  [ -z "${FACTORY_ROOT:-}" ]   && missing+=("FACTORY_ROOT")
  [ -z "${PRIMARY_BRANCH:-}" ] && missing+=("PRIMARY_BRANCH")
@ -35,17 +33,11 @@ _assert_forge_push_globals() {
 push_to_forge() {
  local repo_root="$1" forge_url="$2" repo_slug="$3"
-  # Build authenticated remote URL: http://dev-bot:<password>@host:port/org/repo.git
+  # Use clean URL — credential helper supplies auth (#604).
-  # Forgejo 11.x rejects API tokens for git HTTP push (#361); password auth works.
+  # Forgejo 11.x rejects API tokens for git HTTP push (#361); password auth works
-  if [ -z "${FORGE_PASS:-}" ]; then
+  # via the credential helper configured in configure_git_creds().
-    echo "Error: FORGE_PASS not set — cannot push to Forgejo (see #361)" >&2
+  local remote_url="${forge_url}/${repo_slug}.git"
-    return 1
+  local display_url="$remote_url"
  fi
  local auth_url
  auth_url=$(printf '%s' "$forge_url" | sed "s|://|://dev-bot:${FORGE_PASS}@|")
  local remote_url="${auth_url}/${repo_slug}.git"
  # Display URL without token
  local display_url="${forge_url}/${repo_slug}.git"
  # Always set the remote URL to ensure credentials are current
  if git -C "$repo_root" remote get-url forgejo >/dev/null 2>&1; then
--- a/lib/forge-setup.sh
+++ b/lib/forge-setup.sh
@ -31,17 +31,41 @@ _load_init_context() {
 # Execute a command in the Forgejo container (for admin operations)
 _forgejo_exec() {
  local use_bare="${DISINTO_BARE:-false}"
  local cname="${FORGEJO_CONTAINER_NAME:-disinto-forgejo}"
  if [ "$use_bare" = true ]; then
-    docker exec -u git disinto-forgejo "$@"
+    docker exec -u git "$cname" "$@"
  else
    docker compose -f "${FACTORY_ROOT}/docker-compose.yml" exec -T -u git forgejo "$@"
  fi
 }
 # Check if a token already exists in .env (for idempotency)
 # Returns 0 if token exists, 1 if it doesn't
 _token_exists_in_env() {
  local token_var="$1"
  local env_file="$2"
  grep -q "^${token_var}=" "$env_file" 2>/dev/null
 }
 # Check if a password already exists in .env (for idempotency)
 # Returns 0 if password exists, 1 if it doesn't
 _pass_exists_in_env() {
  local pass_var="$1"
  local env_file="$2"
  grep -q "^${pass_var}=" "$env_file" 2>/dev/null
 }
 # Provision or connect to a local Forgejo instance.
 # Creates admin + bot users, generates API tokens, stores in .env.
 # When $DISINTO_BARE is set, uses standalone docker run; otherwise uses compose.
 # Usage: setup_forge [--rotate-tokens] <forge_url> <repo_slug>
 setup_forge() {
  local rotate_tokens=false
  # Parse optional --rotate-tokens flag
  if [ "$1" = "--rotate-tokens" ]; then
    rotate_tokens=true
    shift
  fi
  local forge_url="$1"
  local repo_slug="$2"
  local use_bare="${DISINTO_BARE:-false}"
@ -50,7 +74,7 @@ setup_forge() {
  echo "── Forge setup ────────────────────────────────────────"
  # Check if Forgejo is already running
-  if curl -sf --max-time 5 "${forge_url}/api/v1/version" >/dev/null 2>&1; then
+  if curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/version" >/dev/null 2>&1; then
    echo "Forgejo:  ${forge_url} (already running)"
  else
    echo "Forgejo not reachable at ${forge_url}"
@ -71,11 +95,12 @@ setup_forge() {
      # Bare-metal mode: standalone docker run
      mkdir -p "${FORGEJO_DATA_DIR}"
-      if docker ps -a --format '{{.Names}}' | grep -q '^disinto-forgejo$'; then
+      local cname="${FORGEJO_CONTAINER_NAME:-disinto-forgejo}"
-        docker start disinto-forgejo >/dev/null 2>&1 || true
+      if docker ps -a --format '{{.Names}}' | grep -q "^${cname}$"; then
        docker start "$cname" >/dev/null 2>&1 || true
      else
        docker run -d \
-          --name disinto-forgejo \
+          --name "$cname" \
          --restart unless-stopped \
          -p "${forge_port}:3000" \
          -p 2222:22 \
@ -94,7 +119,7 @@ setup_forge() {
    # Wait for Forgejo to become healthy
    echo -n "Waiting for Forgejo to start"
    local retries=0
-    while ! curl -sf --max-time 3 "${forge_url}/api/v1/version" >/dev/null 2>&1; do
+    while ! curl -sf --max-time 3 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/version" >/dev/null 2>&1; do
      retries=$((retries + 1))
      if [ "$retries" -gt 60 ]; then
        echo ""
@ -138,7 +163,7 @@ setup_forge() {
    admin_pass="admin-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
  fi
-  if ! curl -sf --max-time 5 "${forge_url}/api/v1/users/${admin_user}" >/dev/null 2>&1; then
+  if ! curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/users/${admin_user}" >/dev/null 2>&1; then
    echo "Creating admin user: ${admin_user}"
    local create_output
    if ! create_output=$(_forgejo_exec forgejo admin user create \
@ -159,7 +184,7 @@ setup_forge() {
      --must-change-password=false
    # Verify admin user was actually created
-    if ! curl -sf --max-time 5 "${forge_url}/api/v1/users/${admin_user}" >/dev/null 2>&1; then
+    if ! curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/users/${admin_user}" >/dev/null 2>&1; then
      echo "Error: admin user '${admin_user}' not found after creation" >&2
      exit 1
    fi
@ -187,10 +212,10 @@ setup_forge() {
  # Create human user (disinto-admin) as site admin if it doesn't exist
  local human_user="disinto-admin"
-  local human_pass
+  # human_user == admin_user; reuse admin_pass for basic-auth operations
-  human_pass="admin-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+  local human_pass="$admin_pass"
-  if ! curl -sf --max-time 5 "${forge_url}/api/v1/users/${human_user}" >/dev/null 2>&1; then
+  if ! curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/users/${human_user}" >/dev/null 2>&1; then
    echo "Creating human user: ${human_user}"
    local create_output
    if ! create_output=$(_forgejo_exec forgejo admin user create \
@ -211,7 +236,7 @@ setup_forge() {
      --must-change-password=false
    # Verify human user was actually created
-    if ! curl -sf --max-time 5 "${forge_url}/api/v1/users/${human_user}" >/dev/null 2>&1; then
+    if ! curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN:-}" "${forge_url}/api/v1/users/${human_user}" >/dev/null 2>&1; then
      echo "Error: human user '${human_user}' not found after creation" >&2
      exit 1
    fi
@ -220,50 +245,74 @@ setup_forge() {
    echo "Human user: ${human_user} (already exists)"
  fi
-  # Delete existing admin token if present (token sha1 is only returned at creation time)
+  # Preserve admin token if already stored in .env (idempotent re-run)
-  local existing_token_id
+  local admin_token=""
-  existing_token_id=$(curl -sf \
+  if _token_exists_in_env "FORGE_ADMIN_TOKEN" "$env_file" && [ "$rotate_tokens" = false ]; then
-    -u "${admin_user}:${admin_pass}" \
+    admin_token=$(grep '^FORGE_ADMIN_TOKEN=' "$env_file" | head -1 | cut -d= -f2-)
-    "${forge_url}/api/v1/users/${admin_user}/tokens" 2>/dev/null \
+    [ -n "$admin_token" ] && echo "Admin token: preserved (use --rotate-tokens to force)"
    | jq -r '.[] | select(.name == "disinto-admin-token") | .id') || existing_token_id=""
  if [ -n "$existing_token_id" ]; then
    curl -sf -X DELETE \
      -u "${admin_user}:${admin_pass}" \
      "${forge_url}/api/v1/users/${admin_user}/tokens/${existing_token_id}" >/dev/null 2>&1 || true
  fi
  # Create admin token (fresh, so sha1 is returned)
  local admin_token
  admin_token=$(curl -sf -X POST \
    -u "${admin_user}:${admin_pass}" \
    -H "Content-Type: application/json" \
    "${forge_url}/api/v1/users/${admin_user}/tokens" \
    -d '{"name":"disinto-admin-token","scopes":["all"]}' 2>/dev/null \
    | jq -r '.sha1 // empty') || admin_token=""
  if [ -z "$admin_token" ]; then
-    echo "Error: failed to obtain admin API token" >&2
+    # Delete existing admin token if present (token sha1 is only returned at creation time)
-    exit 1
+    local existing_token_id
    existing_token_id=$(curl -sf \
      -u "${admin_user}:${admin_pass}" \
      "${forge_url}/api/v1/users/${admin_user}/tokens" 2>/dev/null \
      | jq -r '.[] | select(.name == "disinto-admin-token") | .id') || existing_token_id=""
    if [ -n "$existing_token_id" ]; then
      curl -sf -X DELETE \
        -u "${admin_user}:${admin_pass}" \
        "${forge_url}/api/v1/users/${admin_user}/tokens/${existing_token_id}" >/dev/null 2>&1 || true
    fi
    # Create admin token (fresh, so sha1 is returned)
    admin_token=$(curl -sf -X POST \
      -u "${admin_user}:${admin_pass}" \
      -H "Content-Type: application/json" \
      "${forge_url}/api/v1/users/${admin_user}/tokens" \
      -d '{"name":"disinto-admin-token","scopes":["all"]}' 2>/dev/null \
      | jq -r '.sha1 // empty') || admin_token=""
    if [ -z "$admin_token" ]; then
      echo "Error: failed to obtain admin API token" >&2
      exit 1
    fi
    # Store admin token for idempotent re-runs
    if grep -q '^FORGE_ADMIN_TOKEN=' "$env_file" 2>/dev/null; then
      sed -i "s|^FORGE_ADMIN_TOKEN=.*|FORGE_ADMIN_TOKEN=${admin_token}|" "$env_file"
    else
      printf 'FORGE_ADMIN_TOKEN=%s\n' "$admin_token" >> "$env_file"
    fi
    echo "Admin token: generated and saved (FORGE_ADMIN_TOKEN)"
  fi
-  # Get or create human user token
+  # Get or create human user token (human_user == admin_user; use admin_pass)
-  local human_token
+  local human_token=""
-  if curl -sf --max-time 5 "${forge_url}/api/v1/users/${human_user}" >/dev/null 2>&1; then
+  if _token_exists_in_env "HUMAN_TOKEN" "$env_file" && [ "$rotate_tokens" = false ]; then
    human_token=$(grep '^HUMAN_TOKEN=' "$env_file" | head -1 | cut -d= -f2-)
    if [ -n "$human_token" ]; then
      export HUMAN_TOKEN="$human_token"
      echo "  Human token preserved (use --rotate-tokens to force)"
    fi
  fi
  if [ -z "$human_token" ]; then
    # Delete existing human token if present (token sha1 is only returned at creation time)
    local existing_human_token_id
    existing_human_token_id=$(curl -sf \
-      -u "${human_user}:${human_pass}" \
+      -u "${admin_user}:${admin_pass}" \
      "${forge_url}/api/v1/users/${human_user}/tokens" 2>/dev/null \
      | jq -r '.[] | select(.name == "disinto-human-token") | .id') || existing_human_token_id=""
    if [ -n "$existing_human_token_id" ]; then
      curl -sf -X DELETE \
-        -u "${human_user}:${human_pass}" \
+        -u "${admin_user}:${admin_pass}" \
        "${forge_url}/api/v1/users/${human_user}/tokens/${existing_human_token_id}" >/dev/null 2>&1 || true
    fi
-    # Create human token (fresh, so sha1 is returned)
+    # Create human token (use admin_pass since human_user == admin_user)
    human_token=$(curl -sf -X POST \
-      -u "${human_user}:${human_pass}" \
+      -u "${admin_user}:${admin_pass}" \
      -H "Content-Type: application/json" \
      "${forge_url}/api/v1/users/${human_user}/tokens" \
      -d '{"name":"disinto-human-token","scopes":["all"]}' 2>/dev/null \
@ -277,7 +326,7 @@ setup_forge() {
        printf 'HUMAN_TOKEN=%s\n' "$human_token" >> "$env_file"
      fi
      export HUMAN_TOKEN="$human_token"
-      echo "  Human token saved (HUMAN_TOKEN)"
+      echo "  Human token generated and saved (HUMAN_TOKEN)"
    fi
  fi
@ -321,10 +370,22 @@ setup_forge() {
  local bot_user bot_pass token token_var pass_var
  for bot_user in dev-bot review-bot planner-bot gardener-bot vault-bot supervisor-bot predictor-bot architect-bot; do
    bot_pass="bot-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
    token_var="${bot_token_vars[$bot_user]}"
    pass_var="${bot_pass_vars[$bot_user]}"
-    # Check if bot user exists
+    # Check if token already exists in .env
    local token_exists=false
    if _token_exists_in_env "$token_var" "$env_file"; then
      token_exists=true
    fi
    # Check if password already exists in .env
    local pass_exists=false
    if _pass_exists_in_env "$pass_var" "$env_file"; then
      pass_exists=true
    fi
    # Check if bot user exists on Forgejo
    local user_exists=false
    if curl -sf --max-time 5 \
      -H "Authorization: token ${admin_token}" \
@ -332,7 +393,25 @@ setup_forge() {
      user_exists=true
    fi
    # Skip token/password regeneration if both exist in .env and not forcing rotation
    if [ "$token_exists" = true ] && [ "$pass_exists" = true ] && [ "$rotate_tokens" = false ]; then
      echo "  ${bot_user} token and password preserved (use --rotate-tokens to force)"
      # Still export the existing token for use within this run
      local existing_token existing_pass
      existing_token=$(grep "^${token_var}=" "$env_file" | head -1 | cut -d= -f2-)
      existing_pass=$(grep "^${pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
      export "${token_var}=${existing_token}"
      export "${pass_var}=${existing_pass}"
      continue
    fi
    # Generate new credentials if:
    # - Token doesn't exist (first run)
    # - Password doesn't exist (first run)
    # - --rotate-tokens flag is set (explicit rotation)
    if [ "$user_exists" = false ]; then
      # User doesn't exist - create it
      bot_pass="bot-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
      echo "Creating bot user: ${bot_user}"
      local create_output
      if ! create_output=$(_forgejo_exec forgejo admin user create \
@ -360,16 +439,22 @@ setup_forge() {
      fi
      echo "  ${bot_user} user created"
    else
-      echo "  ${bot_user} user exists (resetting password for token generation)"
+      # User exists - reset password if needed
-      # User exists but may not have a known password.
+      echo "  ${bot_user} user exists"
-      # Use admin API to reset the password so we can generate a new token.
+      if [ "$rotate_tokens" = true ] || [ "$pass_exists" = false ]; then
-      _forgejo_exec forgejo admin user change-password \
+        bot_pass="bot-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
-        --username "${bot_user}" \
+        _forgejo_exec forgejo admin user change-password \
-        --password "${bot_pass}" \
+          --username "${bot_user}" \
-        --must-change-password=false || {
+          --password "${bot_pass}" \
-        echo "Error: failed to reset password for existing bot user '${bot_user}'" >&2
+          --must-change-password=false || {
-        exit 1
+          echo "Error: failed to reset password for existing bot user '${bot_user}'" >&2
-      }
+          exit 1
        }
        echo "  ${bot_user} password reset for token generation"
      else
        # Password exists, get it from .env
        bot_pass=$(grep "^${pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
      fi
    fi
    # Generate token via API (basic auth as the bot user — Forgejo requires
@ -414,7 +499,6 @@ setup_forge() {
    # Store password in .env for git HTTP push (#361)
    # Forgejo 11.x API tokens don't work for git push; password auth does.
    pass_var="${bot_pass_vars[$bot_user]}"
    if grep -q "^${pass_var}=" "$env_file" 2>/dev/null; then
      sed -i "s|^${pass_var}=.*|${pass_var}=${bot_pass}|" "$env_file"
    else
@ -441,7 +525,19 @@ setup_forge() {
    llama_token_var="${llama_token_vars[$llama_user]}"
    llama_pass_var="${llama_pass_vars[$llama_user]}"
-    # Check if llama bot user exists
+    # Check if token already exists in .env
    local token_exists=false
    if _token_exists_in_env "$llama_token_var" "$env_file"; then
      token_exists=true
    fi
    # Check if password already exists in .env
    local pass_exists=false
    if _pass_exists_in_env "$llama_pass_var" "$env_file"; then
      pass_exists=true
    fi
    # Check if llama bot user exists on Forgejo
    local llama_user_exists=false
    if curl -sf --max-time 5 \
      -H "Authorization: token ${admin_token}" \
@ -449,10 +545,26 @@ setup_forge() {
      llama_user_exists=true
    fi
    # Skip token/password regeneration if both exist in .env and not forcing rotation
    if [ "$token_exists" = true ] && [ "$pass_exists" = true ] && [ "$rotate_tokens" = false ]; then
      echo "  ${llama_user} token and password preserved (use --rotate-tokens to force)"
      # Still export the existing token for use within this run
      local existing_token existing_pass
      existing_token=$(grep "^${llama_token_var}=" "$env_file" | head -1 | cut -d= -f2-)
      existing_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
      export "${llama_token_var}=${existing_token}"
      export "${llama_pass_var}=${existing_pass}"
      continue
    fi
    # Generate new credentials if:
    # - Token doesn't exist (first run)
    # - Password doesn't exist (first run)
    # - --rotate-tokens flag is set (explicit rotation)
    if [ "$llama_user_exists" = false ]; then
-      echo "Creating llama bot user: ${llama_user}"
+      # User doesn't exist - create it
      # Generate a unique password for this user
      llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
      echo "Creating llama bot user: ${llama_user}"
      local create_output
      if ! create_output=$(_forgejo_exec forgejo admin user create \
        --username "${llama_user}" \
@ -479,17 +591,22 @@ setup_forge() {
      fi
      echo "  ${llama_user} user created"
    else
-      echo "  ${llama_user} user exists (resetting password for token generation)"
+      # User exists - reset password if needed
-      # User exists but may not have a known password.
+      echo "  ${llama_user} user exists"
-      # Use admin API to reset the password so we can generate a new token.
+      if [ "$rotate_tokens" = true ] || [ "$pass_exists" = false ]; then
-      llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
+        llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
-      _forgejo_exec forgejo admin user change-password \
+        _forgejo_exec forgejo admin user change-password \
-        --username "${llama_user}" \
+          --username "${llama_user}" \
-        --password "${llama_pass}" \
+          --password "${llama_pass}" \
-        --must-change-password=false || {
+          --must-change-password=false || {
-        echo "Error: failed to reset password for existing llama bot user '${llama_user}'" >&2
+          echo "Error: failed to reset password for existing llama bot user '${llama_user}'" >&2
-        exit 1
+          exit 1
-      }
+        }
        echo "  ${llama_user} password reset for token generation"
      else
        # Password exists, get it from .env
        llama_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
      fi
    fi
    # Generate token via API (basic auth as the llama user)
@ -555,7 +672,7 @@ setup_forge() {
  for bot_user in "${bot_users[@]}"; do
    # Check if .profile repo already exists
-    if curl -sf --max-time 5 "${forge_url}/api/v1/repos/${bot_user}/.profile" >/dev/null 2>&1; then
+    if curl -sf --max-time 5 -H "Authorization: token ${admin_token}" "${forge_url}/api/v1/repos/${bot_user}/.profile" >/dev/null 2>&1; then
      echo "  ${bot_user}/.profile already exists"
      continue
    fi
@ -630,7 +747,7 @@ setup_forge() {
    fi
    # Add all bot users as collaborators with appropriate permissions
-    # dev-bot: write (PR creation via lib/vault.sh)
+    # dev-bot: write (PR creation via lib/action-vault.sh)
    # review-bot: read (PR review)
    # planner-bot: write (prerequisites.md, memory)
    # gardener-bot: write (backlog grooming)
--- a/lib/formula-session.sh
+++ b/lib/formula-session.sh
@ -18,7 +18,8 @@
 #   ensure_profile_repo [AGENT_IDENTITY]   — clone/pull .profile repo
 #   _profile_has_repo                      — check if agent has .profile repo
 #   _count_undigested_journals             — count journal entries to digest
-#   _profile_digest_journals               — digest journals into lessons
+#   _profile_digest_journals               — digest journals into lessons (timeout + batch cap)
 #   _profile_restore_lessons FILE BACKUP   — restore lessons on digest failure
 #   _profile_commit_and_push MESSAGE [FILES] — commit/push to .profile repo
 #   resolve_agent_identity                 — resolve agent user login from FORGE_TOKEN
 #   build_graph_section                    — run build-graph.py and set GRAPH_SECTION
@ -28,7 +29,13 @@
 #   ops_commit_and_push MESSAGE [FILES]    — commit/push to ops repo
 #   cleanup_stale_crashed_worktrees [HOURS] — thin wrapper around worktree_cleanup_stale
 #
-# Requires: lib/env.sh, lib/worktree.sh sourced first for shared helpers.
+# Requires: lib/env.sh, lib/worktree.sh, lib/agent-sdk.sh sourced first for shared helpers.
 # Source agent-sdk for claude_run_with_watchdog watchdog helper
 source "$(dirname "${BASH_SOURCE[0]}")/agent-sdk.sh"
 # Source ops-setup for migrate_ops_repo (used by ensure_ops_repo)
 source "$(dirname "${BASH_SOURCE[0]}")/ops-setup.sh"
 # ── Run guards ───────────────────────────────────────────────────────────
@ -113,15 +120,16 @@ ensure_profile_repo() {
  # Define cache directory: /home/agent/data/.profile/{agent-name}
  PROFILE_REPO_PATH="${HOME:-/home/agent}/data/.profile/${agent_identity}"
-  # Build clone URL from FORGE_URL and agent identity
+  # Build clone URL from FORGE_URL — credential helper supplies auth (#604)
  local forge_url="${FORGE_URL:-http://localhost:3000}"
-  local auth_url
+  local clone_url="${forge_url}/${agent_identity}/.profile.git"
  auth_url=$(printf '%s' "$forge_url" | sed "s|://|://$(whoami):${FORGE_TOKEN}@|")
  local clone_url="${auth_url}/${agent_identity}/.profile.git"
  # Check if already cached and up-to-date
  if [ -d "${PROFILE_REPO_PATH}/.git" ]; then
    log "Pulling .profile repo: ${agent_identity}/.profile"
    # Always refresh the remote URL to ensure it's clean (no baked credentials)
    # This fixes auth issues when old URLs contained the wrong username (#652)
    git -C "$PROFILE_REPO_PATH" remote set-url origin "$clone_url" 2>/dev/null || true
    if git -C "$PROFILE_REPO_PATH" fetch origin --quiet 2>/dev/null; then
      git -C "$PROFILE_REPO_PATH" checkout main --quiet 2>/dev/null || \
      git -C "$PROFILE_REPO_PATH" checkout master --quiet 2>/dev/null || true
@ -184,10 +192,14 @@ _count_undigested_journals() {
 # _profile_digest_journals
 # Runs a claude -p one-shot to digest undigested journals into lessons-learned.md
 # Respects PROFILE_DIGEST_TIMEOUT (default 300s) and PROFILE_DIGEST_MAX_BATCH (default 5).
 # On failure/timeout, preserves the previous lessons-learned.md and does not archive journals.
 # Returns 0 on success, 1 on failure.
 _profile_digest_journals() {
  local agent_identity="${AGENT_IDENTITY:-}"
  local model="${CLAUDE_MODEL:-opus}"
  local digest_timeout="${PROFILE_DIGEST_TIMEOUT:-300}"
  local max_batch="${PROFILE_DIGEST_MAX_BATCH:-5}"
  if [ -z "$agent_identity" ]; then
    if ! resolve_agent_identity; then
@ -200,19 +212,27 @@ _profile_digest_journals() {
  local knowledge_dir="${PROFILE_REPO_PATH}/knowledge"
  local lessons_file="${knowledge_dir}/lessons-learned.md"
-  # Collect undigested journal entries
+  # Collect undigested journal entries (capped at max_batch)
  local journal_entries=""
  local batch_count=0
  local -a batchfiles=()
  if [ -d "$journal_dir" ]; then
    for jf in "$journal_dir"/*.md; do
      [ -f "$jf" ] || continue
      # Skip archived entries
      [[ "$jf" == */archive/* ]] && continue
      if [ "$batch_count" -ge "$max_batch" ]; then
        log "profile: capping digest batch at ${max_batch} journals (remaining will be digested in future runs)"
        break
      fi
      local basename
      basename=$(basename "$jf")
      journal_entries="${journal_entries}
 ### ${basename}
 $(cat "$jf")
 "
      batchfiles+=("$jf")
      batch_count=$((batch_count + 1))
    done
  fi
@ -221,64 +241,104 @@ $(cat "$jf")
    return 0
  fi
-  # Read existing lessons if available
+  log "profile: digesting ${batch_count} journals (timeout ${digest_timeout}s)"
-  local existing_lessons=""
+
  # Ensure knowledge directory exists
  mkdir -p "$knowledge_dir"
  # Back up existing lessons-learned.md so we can restore on failure
  local lessons_backup=""
  if [ -f "$lessons_file" ]; then
-    existing_lessons=$(cat "$lessons_file")
+    lessons_backup=$(mktemp)
    cp "$lessons_file" "$lessons_backup"
  fi
  # Capture mtime so we can detect a Write-tool write afterwards
  local mtime_before=0
  [ -f "$lessons_file" ] && mtime_before=$(stat -c %Y "$lessons_file")
  # Build prompt for digestion
  local digest_prompt="You are digesting journal entries from a developer agent's work sessions.
 ## Task
-Condense these journal entries into abstract, transferable lessons. Rewrite lessons-learned.md entirely.
+Update the lessons-learned file at this exact absolute path:
  ${lessons_file}
 1. Read ${lessons_file} (it may not exist yet — that's fine, treat as empty).
 2. Digest the journal entries below into abstract, transferable patterns and heuristics.
 3. Merge with the existing lessons: preserve anything still useful, refine, drop stale or redundant entries, add new ones.
 4. Write the merged result back to ${lessons_file} using the Write tool.
 ## Constraints
 - Hard cap: 2KB maximum
 - Abstract: patterns and heuristics, not specific issues or file paths
 - Transferable: must help with future unseen work, not just recall past work
- Drop the least transferable lessons if over limit
+- Drop the least transferable lessons if over the cap
 ## Existing lessons-learned.md (if any)
 ${existing_lessons:-<none>}
 ## Journal entries to digest
-${journal_entries}
+${journal_entries}"
-## Output
+  # Run claude -p one-shot with digest-specific timeout
-Write the complete, rewritten lessons-learned.md content below. No preamble, no explanation — just the file content."
+  local output digest_rc
-
+  local saved_timeout="${CLAUDE_TIMEOUT:-7200}"
-  # Run claude -p one-shot with same model as agent
+  CLAUDE_TIMEOUT="$digest_timeout"
-  local output
+  output=$(claude_run_with_watchdog claude -p "$digest_prompt" \
  output=$(claude -p "$digest_prompt" \
    --output-format json \
    --dangerously-skip-permissions \
    ${model:+--model "$model"} \
-    2>>"$LOGFILE" || echo '{"result":"error"}')
+    2>>"$LOGFILE") && digest_rc=0 || digest_rc=$?
  CLAUDE_TIMEOUT="$saved_timeout"
-  # Extract content from JSON response
+  if [ "$digest_rc" -eq 124 ]; then
-  local lessons_content
+    log "profile: digest timed out after ${digest_timeout}s — preserving previous lessons, skipping archive"
-  lessons_content=$(printf '%s' "$output" | jq -r '.result // empty' 2>/dev/null || echo "")
+    _profile_restore_lessons "$lessons_file" "$lessons_backup"
  if [ -z "$lessons_content" ]; then
    log "profile: failed to digest journals"
    return 1
  fi
-  # Ensure knowledge directory exists
+  if [ "$digest_rc" -ne 0 ]; then
-  mkdir -p "$knowledge_dir"
+    log "profile: digest failed (exit code ${digest_rc}) — preserving previous lessons, skipping archive"
    _profile_restore_lessons "$lessons_file" "$lessons_backup"
    return 1
  fi
-  # Write the lessons file (full rewrite)
+  local mtime_after=0
-  printf '%s\n' "$lessons_content" > "$lessons_file"
+  [ -f "$lessons_file" ] && mtime_after=$(stat -c %Y "$lessons_file")
  log "profile: wrote lessons-learned.md (${#lessons_content} bytes)"
-  # Move digested journals to archive (if any were processed)
+  if [ "$mtime_after" -gt "$mtime_before" ] && [ -s "$lessons_file" ]; then
-  if [ -d "$journal_dir" ]; then
+    local file_size
    file_size=$(wc -c < "$lessons_file")
    # Treat tiny files (<=16 bytes) as failed digestion (e.g. "null", "{}", empty)
    if [ "$file_size" -le 16 ]; then
      log "profile: digest produced suspiciously small file (${file_size} bytes) — preserving previous lessons, skipping archive"
      _profile_restore_lessons "$lessons_file" "$lessons_backup"
      return 1
    fi
    log "profile: lessons-learned.md written by model via Write tool (${file_size} bytes)"
  else
    # Fallback: model didn't use Write tool — capture .result and strip any markdown code fence
    local lessons_content
    lessons_content=$(printf '%s' "$output" | jq -r '.result // empty' 2>/dev/null || echo "")
    lessons_content=$(printf '%s' "$lessons_content" | sed -E '1{/^```(markdown|md)?[[:space:]]*$/d;};${/^```[[:space:]]*$/d;}')
    if [ -z "$lessons_content" ] || [ "${#lessons_content}" -le 16 ]; then
      log "profile: failed to digest journals (no Write tool call, empty or tiny .result) — preserving previous lessons, skipping archive"
      _profile_restore_lessons "$lessons_file" "$lessons_backup"
      return 1
    fi
    printf '%s\n' "$lessons_content" > "$lessons_file"
    log "profile: lessons-learned.md written from .result fallback (${#lessons_content} bytes)"
  fi
  # Clean up backup on success
  [ -n "$lessons_backup" ] && rm -f "$lessons_backup"
  # Move only the digested journals to archive (not all — only the batch we processed)
  if [ ${#batchfiles[@]} -gt 0 ]; then
    mkdir -p "${journal_dir}/archive"
    local archived=0
-    for jf in "$journal_dir"/*.md; do
+    for jf in "${batchfiles[@]}"; do
      [ -f "$jf" ] || continue
      [[ "$jf" == */archive/* ]] && continue
      local basename
      basename=$(basename "$jf")
      mv "$jf" "${journal_dir}/archive/${basename}" 2>/dev/null && archived=$((archived + 1))
@ -288,9 +348,27 @@ Write the complete, rewritten lessons-learned.md content below. No preamble, no
    fi
  fi
  # Commit and push the digest results
  _profile_commit_and_push \
    "profile: digest ${archived:-0} journals → knowledge/lessons-learned.md" \
    knowledge/lessons-learned.md \
    journal/
  return 0
 }
 # _profile_restore_lessons LESSONS_FILE BACKUP_FILE
 # Restores previous lessons-learned.md from backup on digest failure.
 _profile_restore_lessons() {
  local lessons_file="$1"
  local backup="$2"
  if [ -n "$backup" ] && [ -f "$backup" ]; then
    cp "$backup" "$lessons_file"
    rm -f "$backup"
    log "profile: restored previous lessons-learned.md"
  fi
 }
 # _profile_commit_and_push MESSAGE [FILE ...]
 # Commits and pushes changes to .profile repo.
 _profile_commit_and_push() {
@ -305,6 +383,15 @@ _profile_commit_and_push() {
  (
    cd "$PROFILE_REPO_PATH" || return 1
    # Refresh the remote URL to ensure credentials are current (#652)
    # This ensures we use the correct bot identity and fresh credentials
    local forge_url="${FORGE_URL:-http://localhost:3000}"
    local agent_identity="${AGENT_IDENTITY:-}"
    if [ -n "$agent_identity" ]; then
      local remote_url="${forge_url}/${agent_identity}/.profile.git"
      git remote set-url origin "$remote_url" 2>/dev/null || true
    fi
    if [ ${#files[@]} -gt 0 ]; then
      git add "${files[@]}"
    else
@ -313,7 +400,7 @@ _profile_commit_and_push() {
    if ! git diff --cached --quiet 2>/dev/null; then
      git config user.name "${AGENT_IDENTITY}" || true
-      git config user.email "${AGENT_IDENTITY}@users.noreply.codeberg.org" || true
+      git config user.email "${AGENT_IDENTITY}@disinto.local" || true
      git commit -m "$msg" --no-verify 2>/dev/null || true
      git push origin main --quiet 2>/dev/null || git push origin master --quiet 2>/dev/null || true
    fi
@ -322,7 +409,8 @@ _profile_commit_and_push() {
 # profile_load_lessons
 # Pre-session: loads lessons-learned.md into LESSONS_CONTEXT for prompt injection.
-# Lazy digestion: if >10 undigested journals exist, runs claude -p to digest them.
+# Lazy digestion: if undigested journals exceed PROFILE_DIGEST_THRESHOLD (default 10),
 # runs claude -p to digest them (bounded by PROFILE_DIGEST_MAX_BATCH and PROFILE_DIGEST_TIMEOUT).
 # Returns 0 on success, 1 if agent has no .profile repo (silent no-op).
 # Requires: ensure_profile_repo() called, AGENT_IDENTITY, FORGE_TOKEN, FORGE_URL, CLAUDE_MODEL.
 # Exports: LESSONS_CONTEXT (the lessons file content, hard-capped at 2KB).
@ -338,13 +426,14 @@ profile_load_lessons() {
  fi
  # Check journal count for lazy digestion trigger
-  local journal_count
+  local journal_count digest_threshold
  journal_count=$(_count_undigested_journals)
  digest_threshold="${PROFILE_DIGEST_THRESHOLD:-10}"
-  if [ "${journal_count:-0}" -gt 10 ]; then
+  if [ "${journal_count:-0}" -gt "$digest_threshold" ]; then
-    log "profile: digesting ${journal_count} undigested journals"
+    log "profile: ${journal_count} undigested journals (threshold ${digest_threshold})"
    if ! _profile_digest_journals; then
-      log "profile: warning — journal digestion failed"
+      log "profile: warning — journal digestion failed, continuing with existing lessons"
    fi
  fi
@ -444,7 +533,7 @@ Write the journal entry below. Use markdown format."
  # Run claude -p one-shot with same model as agent
  local output
-  output=$(claude -p "$reflection_prompt" \
+  output=$(claude_run_with_watchdog claude -p "$reflection_prompt" \
    --output-format json \
    --dangerously-skip-permissions \
    ${CLAUDE_MODEL:+--model "$CLAUDE_MODEL"} \
@ -585,6 +674,7 @@ ensure_ops_repo() {
    git -C "$ops_root" fetch origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
    git -C "$ops_root" checkout "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
    git -C "$ops_root" pull --ff-only origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
    migrate_ops_repo "$ops_root" "${PRIMARY_BRANCH}"
    return 0
  fi
@ -592,14 +682,8 @@ ensure_ops_repo() {
  local ops_repo="${FORGE_OPS_REPO:-}"
  [ -n "$ops_repo" ] || return 0
  local forge_url="${FORGE_URL:-http://localhost:3000}"
-  local clone_url
+  # Use clean URL — credential helper supplies auth (#604)
-  if [ -n "${FORGE_TOKEN:-}" ]; then
+  local clone_url="${forge_url}/${ops_repo}.git"
    local auth_url
    auth_url=$(printf '%s' "$forge_url" | sed "s|://|://$(whoami):${FORGE_TOKEN}@|")
    clone_url="${auth_url}/${ops_repo}.git"
  else
    clone_url="${forge_url}/${ops_repo}.git"
  fi
  log "Cloning ops repo: ${ops_repo} -> ${ops_root}"
  if git clone --quiet "$clone_url" "$ops_root" 2>/dev/null; then
@ -735,8 +819,7 @@ build_prompt_footer() {
 Base URL: ${FORGE_API}
 Auth header: -H \"Authorization: token \${FORGE_TOKEN}\"
  Read issue:  curl -sf -H \"Authorization: token \${FORGE_TOKEN}\" '${FORGE_API}/issues/{number}' | jq '.body'
-  Create issue: curl -sf -X POST -H \"Authorization: token \${FORGE_TOKEN}\" -H 'Content-Type: application/json' '${FORGE_API}/issues' -d '{\"title\":\"...\",\"body\":\"...\",\"labels\":[LABEL_ID]}'${extra_api}
+  List labels: curl -sf -H \"Authorization: token \${FORGE_TOKEN}\" '${FORGE_API}/labels'${extra_api}
  List labels: curl -sf -H \"Authorization: token \${FORGE_TOKEN}\" '${FORGE_API}/labels'
 NEVER echo or include the actual token value in output — always reference \${FORGE_TOKEN}.
 ## Environment
--- a/lib/generators.sh
+++ b/lib/generators.sh
@ -26,6 +26,46 @@ PROJECT_NAME="${PROJECT_NAME:-project}"
 # PRIMARY_BRANCH defaults to main (env.sh may have set it to 'master')
 PRIMARY_BRANCH="${PRIMARY_BRANCH:-main}"
 # Helper: extract woodpecker_repo_id from a project TOML file
 # Returns empty string if not found or file doesn't exist
 _get_woodpecker_repo_id() {
  local toml_file="$1"
  if [ -f "$toml_file" ]; then
    python3 -c "
 import sys, tomllib
 try:
    with open(sys.argv[1], 'rb') as f:
        cfg = tomllib.load(f)
    ci = cfg.get('ci', {})
    wp_id = ci.get('woodpecker_repo_id', '0')
    print(wp_id)
 except Exception:
    print('0')
 " "$toml_file" 2>/dev/null || echo "0"
  else
    echo "0"
  fi
 }
 # Find all project TOML files and extract the highest woodpecker_repo_id
 # (used for the main agents service which doesn't have a per-project TOML)
 _get_primary_woodpecker_repo_id() {
  local projects_dir="${FACTORY_ROOT}/projects"
  local max_id="0"
  for toml in "${projects_dir}"/*.toml; do
    [ -f "$toml" ] || continue
    local repo_id
    repo_id=$(_get_woodpecker_repo_id "$toml")
    if [ -n "$repo_id" ] && [ "$repo_id" != "0" ]; then
      # Use the first non-zero repo_id found (or highest if multiple)
      if [ "$repo_id" -gt "$max_id" ] 2>/dev/null; then
        max_id="$repo_id"
      fi
    fi
  done
  echo "$max_id"
 }
 # Parse project TOML for local-model agents and emit compose services.
 # Writes service definitions to stdout; caller handles insertion into compose file.
 _generate_local_model_services() {
@ -40,6 +80,10 @@ _generate_local_model_services() {
  for toml in "${projects_dir}"/*.toml; do
    [ -f "$toml" ] || continue
    # Get woodpecker_repo_id for this project
    local wp_repo_id
    wp_repo_id=$(_get_woodpecker_repo_id "$toml")
    # Parse [agents.*] sections using Python - output YAML-compatible format
    while IFS='=' read -r key value; do
      case "$key" in
@ -56,9 +100,7 @@ _generate_local_model_services() {
            cat >> "$temp_file" <<EOF
  agents-${service_name}:
-    build:
+    image: ghcr.io/disinto/agents:\${DISINTO_IMAGE_TAG:-latest}
      context: .
      dockerfile: docker/agents/Dockerfile
    container_name: disinto-agents-${service_name}
    restart: unless-stopped
    security_opt:
@ -66,12 +108,13 @@ _generate_local_model_services() {
    volumes:
      - agents-${service_name}-data:/home/agent/data
      - project-repos:/home/agent/repos
-      - \${HOME}/.claude:/home/agent/.claude
+      - \${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:\${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
-      - \${HOME}/.claude.json:/home/agent/.claude.json:ro
+      - \${CLAUDE_CONFIG_FILE:-\${HOME}/.claude.json}:/home/agent/.claude.json:ro
-      - CLAUDE_BIN_PLACEHOLDER:/usr/local/bin/claude:ro
+      - \${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
-      - \${HOME}/.ssh:/home/agent/.ssh:ro
+      - \${AGENT_SSH_DIR:-\${HOME}/.ssh}:/home/agent/.ssh:ro
    environment:
      FORGE_URL: http://forgejo:3000
      FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
      # Use llama-specific credentials if available, otherwise fall back to main FORGE_TOKEN
      FORGE_TOKEN: \${FORGE_TOKEN_LLAMA:-\${FORGE_TOKEN:-}}
      FORGE_PASS: \${FORGE_PASS_LLAMA:-\${FORGE_PASS:-}}
@ -82,19 +125,27 @@ _generate_local_model_services() {
      ANTHROPIC_BASE_URL: "${base_url}"
      ANTHROPIC_API_KEY: "${api_key}"
      CLAUDE_MODEL: "${model}"
-      CLAUDE_CONFIG_DIR: /home/agent/.claude-${service_name}
+      CLAUDE_CONFIG_DIR: \${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
-      CLAUDE_CREDENTIALS_DIR: /home/agent/.claude-${service_name}/credentials
+      CLAUDE_CREDENTIALS_DIR: \${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}/credentials
      CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "${compact_pct}"
      CLAUDE_CODE_ATTRIBUTION_HEADER: "0"
      CLAUDE_CODE_ENABLE_TELEMETRY: "0"
      DISINTO_CONTAINER: "1"
      PROJECT_NAME: ${PROJECT_NAME:-project}
      PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
      WOODPECKER_DATA_DIR: /woodpecker-data
      WOODPECKER_REPO_ID: "${wp_repo_id}"
      FORGE_BOT_USER_${service_name^^}: "${forge_user}"
      POLL_INTERVAL: "${poll_interval_val}"
      GARDENER_INTERVAL: "${GARDENER_INTERVAL:-21600}"
      ARCHITECT_INTERVAL: "${ARCHITECT_INTERVAL:-21600}"
      PLANNER_INTERVAL: "${PLANNER_INTERVAL:-43200}"
      SUPERVISOR_INTERVAL: "${SUPERVISOR_INTERVAL:-1200}"
    depends_on:
-      - forgejo
+      forgejo:
-      - woodpecker
+        condition: service_healthy
      woodpecker:
        condition: service_started
    networks:
      - disinto-net
    profiles: ["agents-${service_name}"]
@ -176,8 +227,12 @@ for name, config in agents.items():
 }
 # Generate docker-compose.yml in the factory root.
 # **CANONICAL SOURCE**: This generator is the single source of truth for docker-compose.yml.
 # The tracked docker-compose.yml file has been removed. Operators must run 'bin/disinto init'
 # to materialize a working stack on a fresh checkout.
 _generate_compose_impl() {
  local forge_port="${1:-3000}"
  local use_build="${2:-false}"
  local compose_file="${FACTORY_ROOT}/docker-compose.yml"
  # Check if compose file already exists
@ -186,6 +241,10 @@ _generate_compose_impl() {
    return 0
  fi
  # Extract primary woodpecker_repo_id from project TOML files
  local wp_repo_id
  wp_repo_id=$(_get_primary_woodpecker_repo_id)
  cat > "$compose_file" <<'COMPOSEEOF'
 # docker-compose.yml — generated by disinto init
 # Brings up Forgejo, Woodpecker, and the agent runtime.
@ -201,11 +260,17 @@ services:
      - forgejo-data:/data
    environment:
      FORGEJO__database__DB_TYPE: sqlite3
-      FORGEJO__server__ROOT_URL: http://forgejo:3000/
+      FORGEJO__server__ROOT_URL: ${FORGEJO_ROOT_URL:-http://forgejo:3000/}
      FORGEJO__server__HTTP_PORT: "3000"
      FORGEJO__security__INSTALL_LOCK: "true"
      FORGEJO__service__DISABLE_REGISTRATION: "true"
      FORGEJO__webhook__ALLOWED_HOST_LIST: "private"
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/api/v1/version"]
      interval: 5s
      timeout: 3s
      retries: 30
      start_period: 30s
    networks:
      - disinto-net
@ -226,13 +291,16 @@ services:
      WOODPECKER_FORGEJO_CLIENT: ${WP_FORGEJO_CLIENT:-}
      WOODPECKER_FORGEJO_SECRET: ${WP_FORGEJO_SECRET:-}
      WOODPECKER_HOST: ${WOODPECKER_HOST:-http://woodpecker:8000}
      WOODPECKER_SERVER: http://woodpecker:9000
      WOODPECKER_OPEN: "true"
      WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-}
      WOODPECKER_DATABASE_DRIVER: sqlite3
      WOODPECKER_DATABASE_DATASOURCE: /var/lib/woodpecker/woodpecker.sqlite
      WOODPECKER_PLUGINS_PRIVILEGED: ${WOODPECKER_PLUGINS_PRIVILEGED:-plugins/docker}
      WOODPECKER_ENVIRONMENT: "FORGE_TOKEN:${FORGE_TOKEN}"
    depends_on:
-      - forgejo
+      forgejo:
        condition: service_healthy
    networks:
      - disinto-net
@ -251,15 +319,19 @@ services:
      WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-}
      WOODPECKER_GRPC_SECURE: "false"
      WOODPECKER_HEALTHCHECK_ADDR: ":3333"
-      WOODPECKER_BACKEND_DOCKER_NETWORK: disinto_disinto-net
+      WOODPECKER_BACKEND_DOCKER_NETWORK: ${WOODPECKER_CI_NETWORK:-disinto_disinto-net}
      WOODPECKER_MAX_WORKFLOWS: 1
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:3333/healthz"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 15s
    depends_on:
      - woodpecker
  agents:
-    build:
+    image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}
      context: .
      dockerfile: docker/agents/Dockerfile
    container_name: disinto-agents
    restart: unless-stopped
    security_opt:
@ -267,14 +339,18 @@ services:
    volumes:
      - agent-data:/home/agent/data
      - project-repos:/home/agent/repos
-      - ${HOME}/.claude:/home/agent/.claude
+      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
-      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
+      - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
-      - CLAUDE_BIN_PLACEHOLDER:/usr/local/bin/claude:ro
+      - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
-      - ${HOME}/.ssh:/home/agent/.ssh:ro
+      - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
-      - ${HOME}/.config/sops/age:/home/agent/.config/sops/age:ro
+      - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
      - woodpecker-data:/woodpecker-data:ro
      - ./projects:/home/agent/disinto/projects:ro
      - ./.env:/home/agent/disinto/.env:ro
      - ./state:/home/agent/disinto/state
    environment:
      FORGE_URL: http://forgejo:3000
      FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
      FORGE_TOKEN: ${FORGE_TOKEN:-}
      FORGE_REVIEW_TOKEN: ${FORGE_REVIEW_TOKEN:-}
      FORGE_PLANNER_TOKEN: ${FORGE_PLANNER_TOKEN:-}
@ -288,33 +364,180 @@ services:
      CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
      CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
      ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
      FORGE_PASS: ${FORGE_PASS:-}
      FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
      FACTORY_REPO: ${FORGE_REPO:-disinto-admin/disinto}
      DISINTO_CONTAINER: "1"
      PROJECT_NAME: ${PROJECT_NAME:-project}
      PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
      WOODPECKER_DATA_DIR: /woodpecker-data
      WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
      CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
      POLL_INTERVAL: ${POLL_INTERVAL:-300}
      GARDENER_INTERVAL: ${GARDENER_INTERVAL:-21600}
      ARCHITECT_INTERVAL: ${ARCHITECT_INTERVAL:-21600}
      PLANNER_INTERVAL: ${PLANNER_INTERVAL:-43200}
    # IMPORTANT: agents get explicit environment variables (forge tokens, CI tokens, config).
    # Vault-only secrets (GITHUB_TOKEN, CLAWHUB_TOKEN, deploy keys) live in
-    # .env.vault.enc and are NEVER injected here — only the runner
+    # secrets/*.enc and are NEVER injected here — only the runner
-    # container receives them at fire time (AD-006, #745).
+    # container receives them at fire time (AD-006, #745, #777).
    healthcheck:
      test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s
    depends_on:
-      - forgejo
+      forgejo:
-      - woodpecker
+        condition: service_healthy
      woodpecker:
        condition: service_started
    networks:
      - disinto-net
-  runner:
+COMPOSEEOF
  # ── Conditional agents-llama block (ENABLE_LLAMA_AGENT=1) ──────────────
  # Local-Qwen dev agent — gated on ENABLE_LLAMA_AGENT so factories without
  # a local llama endpoint don't try to start it.  See docs/agents-llama.md.
  if [ "${ENABLE_LLAMA_AGENT:-0}" = "1" ]; then
    cat >> "$compose_file" <<'LLAMAEOF'
  agents-llama:
    build:
      context: .
      dockerfile: docker/agents/Dockerfile
    container_name: disinto-agents-llama
    restart: unless-stopped
    security_opt:
      - apparmor=unconfined
    volumes:
      - agent-data:/home/agent/data
      - project-repos:/home/agent/repos
      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
      - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
      - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
      - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
      - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
      - woodpecker-data:/woodpecker-data:ro
    environment:
      FORGE_URL: http://forgejo:3000
      FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
      FORGE_TOKEN: ${FORGE_TOKEN_LLAMA:-}
      FORGE_PASS: ${FORGE_PASS_LLAMA:-}
      FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-}
      WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-}
      CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
      CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
      CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "60"
      ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
      ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-}
      FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
      DISINTO_CONTAINER: "1"
      PROJECT_NAME: ${PROJECT_NAME:-project}
      PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
      WOODPECKER_DATA_DIR: /woodpecker-data
      WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
      CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
      POLL_INTERVAL: ${POLL_INTERVAL:-300}
      AGENT_ROLES: dev
    healthcheck:
      test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s
    depends_on:
      forgejo:
        condition: service_healthy
    networks:
      - disinto-net
  agents-llama-all:
    build:
      context: .
      dockerfile: docker/agents/Dockerfile
    container_name: disinto-agents-llama-all
    restart: unless-stopped
    profiles: ["agents-llama-all"]
    security_opt:
      - apparmor=unconfined
    volumes:
      - agent-data:/home/agent/data
      - project-repos:/home/agent/repos
      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
      - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
      - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
      - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
      - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
      - woodpecker-data:/woodpecker-data:ro
    environment:
      FORGE_URL: http://forgejo:3000
      FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
      FORGE_TOKEN: ${FORGE_TOKEN_LLAMA:-}
      FORGE_PASS: ${FORGE_PASS_LLAMA:-}
      FORGE_REVIEW_TOKEN: ${FORGE_REVIEW_TOKEN:-}
      FORGE_PLANNER_TOKEN: ${FORGE_PLANNER_TOKEN:-}
      FORGE_GARDENER_TOKEN: ${FORGE_GARDENER_TOKEN:-}
      FORGE_VAULT_TOKEN: ${FORGE_VAULT_TOKEN:-}
      FORGE_SUPERVISOR_TOKEN: ${FORGE_SUPERVISOR_TOKEN:-}
      FORGE_PREDICTOR_TOKEN: ${FORGE_PREDICTOR_TOKEN:-}
      FORGE_ARCHITECT_TOKEN: ${FORGE_ARCHITECT_TOKEN:-}
      FORGE_FILER_TOKEN: ${FORGE_FILER_TOKEN:-}
      FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-}
      WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-}
      CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
      CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
      CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "60"
      CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: "1"
      ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
      ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-}
      FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
      DISINTO_CONTAINER: "1"
      PROJECT_NAME: ${PROJECT_NAME:-project}
      PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
      WOODPECKER_DATA_DIR: /woodpecker-data
      WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
      CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
      POLL_INTERVAL: ${POLL_INTERVAL:-300}
      GARDENER_INTERVAL: ${GARDENER_INTERVAL:-21600}
      ARCHITECT_INTERVAL: ${ARCHITECT_INTERVAL:-21600}
      PLANNER_INTERVAL: ${PLANNER_INTERVAL:-43200}
      SUPERVISOR_INTERVAL: ${SUPERVISOR_INTERVAL:-1200}
      AGENT_ROLES: review,dev,gardener,architect,planner,predictor,supervisor
    healthcheck:
      test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
      interval: 60s
      timeout: 5s
      retries: 3
      start_period: 30s
    depends_on:
      forgejo:
        condition: service_healthy
      woodpecker:
        condition: service_started
    networks:
      - disinto-net
 LLAMAEOF
  fi
  # Resume the rest of the compose file (runner onward)
  cat >> "$compose_file" <<'COMPOSEEOF'
  runner:
    image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}
    profiles: ["vault"]
    security_opt:
      - apparmor=unconfined
    volumes:
      - agent-data:/home/agent/data
      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
      - ${HOME}/.claude.json:/home/agent/.claude.json:ro
    environment:
      FORGE_URL: http://forgejo:3000
      DISINTO_CONTAINER: "1"
      PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
      CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
    # Vault redesign in progress (PR-based approval, see #73-#77)
    # This container is being replaced — entrypoint will be updated in follow-up
    networks:
@ -323,8 +546,9 @@ services:
  # Edge proxy — reverse proxy to Forgejo, Woodpecker, and staging
  # Serves on ports 80/443, routes based on path
  edge:
-    build: ./docker/edge
+    image: ghcr.io/disinto/edge:${DISINTO_IMAGE_TAG:-latest}
    container_name: disinto-edge
    restart: unless-stopped
    security_opt:
      - apparmor=unconfined
    ports:
@ -336,19 +560,43 @@ services:
      - FORGE_REPO=${FORGE_REPO:-disinto-admin/disinto}
      - FORGE_OPS_REPO=${FORGE_OPS_REPO:-disinto-admin/disinto-ops}
      - FORGE_TOKEN=${FORGE_TOKEN:-}
      - FORGE_PASS=${FORGE_PASS:-}
      - FORGE_ADMIN_USERS=${FORGE_ADMIN_USERS:-disinto-admin}
      - FORGE_ADMIN_TOKEN=${FORGE_ADMIN_TOKEN:-}
      - OPS_REPO_ROOT=/opt/disinto-ops
      - PROJECT_REPO_ROOT=/opt/disinto
      - PRIMARY_BRANCH=main
      - CLAUDE_CONFIG_DIR=${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
      # Reverse tunnel (optional — set by `disinto edge register`, see #622)
      - EDGE_TUNNEL_HOST=${EDGE_TUNNEL_HOST:-}
      - EDGE_TUNNEL_USER=${EDGE_TUNNEL_USER:-tunnel}
      - EDGE_TUNNEL_PORT=${EDGE_TUNNEL_PORT:-}
      - EDGE_TUNNEL_FQDN=${EDGE_TUNNEL_FQDN:-}
      # Subdomain fallback (#713): if subpath routing (#704/#708) fails, add:
      #   EDGE_TUNNEL_FQDN_FORGE, EDGE_TUNNEL_FQDN_CI, EDGE_TUNNEL_FQDN_CHAT
      # See docs/edge-routing-fallback.md for the full pivot plan.
      # Shared secret for Caddy ↔ chat forward_auth (#709)
      - FORWARD_AUTH_SECRET=${FORWARD_AUTH_SECRET:-}
    volumes:
      - ./docker/Caddyfile:/etc/caddy/Caddyfile
      - caddy_data:/data
      - /var/run/docker.sock:/var/run/docker.sock
      - ./secrets/tunnel_key:/run/secrets/tunnel_key:ro
      - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
      - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://localhost:2019/config/"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 15s
    depends_on:
-      - forgejo
+      forgejo:
-      - woodpecker
+        condition: service_healthy
-      - staging
+      woodpecker:
        condition: service_started
      staging:
        condition: service_started
    networks:
      - disinto-net
@ -359,6 +607,12 @@ services:
    command: ["caddy", "file-server", "--root", "/srv/site"]
    security_opt:
      - apparmor=unconfined
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:2019/config/"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    volumes:
      - ./docker:/srv/site:ro
    networks:
@ -378,12 +632,62 @@ services:
      - disinto-net
    command: ["echo", "staging slot — replace with project image"]
  # Chat container — Claude chat UI backend (#705)
  # Internal service only; edge proxy routes to chat:8080
  # Sandbox hardened per #706 — no docker.sock, read-only rootfs, minimal caps
  chat:
    build:
      context: ./docker/chat
      dockerfile: Dockerfile
    container_name: disinto-chat
    restart: unless-stopped
    read_only: true
    tmpfs:
      - /tmp:size=64m
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    pids_limit: 128
    mem_limit: 512m
    memswap_limit: 512m
    volumes:
      # Mount claude binary from host (same as agents)
      - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
      # Throwaway named volume for chat config (isolated from host ~/.claude)
      - chat-config:/var/chat/config
      # Chat history persistence: per-user NDJSON files on bind-mounted host volume
      - ${CHAT_HISTORY_DIR:-./state/chat-history}:/var/lib/chat/history
    environment:
      CHAT_HOST: "0.0.0.0"
      CHAT_PORT: "8080"
      FORGE_URL: http://forgejo:3000
      CHAT_OAUTH_CLIENT_ID: ${CHAT_OAUTH_CLIENT_ID:-}
      CHAT_OAUTH_CLIENT_SECRET: ${CHAT_OAUTH_CLIENT_SECRET:-}
      EDGE_TUNNEL_FQDN: ${EDGE_TUNNEL_FQDN:-}
      DISINTO_CHAT_ALLOWED_USERS: ${DISINTO_CHAT_ALLOWED_USERS:-}
      # Shared secret for Caddy forward_auth verify endpoint (#709)
      FORWARD_AUTH_SECRET: ${FORWARD_AUTH_SECRET:-}
      # Cost caps / rate limiting (#711)
      CHAT_MAX_REQUESTS_PER_HOUR: ${CHAT_MAX_REQUESTS_PER_HOUR:-60}
      CHAT_MAX_REQUESTS_PER_DAY: ${CHAT_MAX_REQUESTS_PER_DAY:-500}
      CHAT_MAX_TOKENS_PER_DAY: ${CHAT_MAX_TOKENS_PER_DAY:-1000000}
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 10s
    networks:
      - disinto-net
 volumes:
  forgejo-data:
  woodpecker-data:
  agent-data:
  project-repos:
  caddy_data:
  chat-config:
 networks:
  disinto-net:
@ -394,6 +698,15 @@ COMPOSEEOF
  # (Docker Compose cannot resolve it; it's a shell variable, not a .env var)
  sed -i "s|\${PROJECT_NAME:-project}|${PROJECT_NAME}|g" "$compose_file"
  # Patch WOODPECKER_REPO_ID — interpolate at generation time
  # (Docker Compose cannot resolve it; it's a shell variable, not a .env var)
  if [ -n "$wp_repo_id" ] && [ "$wp_repo_id" != "0" ]; then
    sed -i "s|PLACEHOLDER_WP_REPO_ID|${wp_repo_id}|g" "$compose_file"
  else
    # Default to empty if no repo_id found (agents will handle gracefully)
    sed -i "s|PLACEHOLDER_WP_REPO_ID||g" "$compose_file"
  fi
  # Patch the forgejo port mapping into the file if non-default
  if [ "$forge_port" != "3000" ]; then
    # Add port mapping to forgejo service so it's reachable from host during init
@ -403,20 +716,35 @@ COMPOSEEOF
  fi
  # Append local-model agent services if any are configured
  # (must run before CLAUDE_BIN_PLACEHOLDER substitution so the placeholder
  # in local-model services is also resolved)
  _generate_local_model_services "$compose_file"
-  # Patch the Claude CLI binary path — resolve from host PATH at init time.
+  # Resolve the Claude CLI binary path and persist as CLAUDE_BIN_DIR in .env.
  # docker-compose.yml references ${CLAUDE_BIN_DIR} so the value must be set.
  local claude_bin
  claude_bin="$(command -v claude 2>/dev/null || true)"
  if [ -n "$claude_bin" ]; then
    # Resolve symlinks to get the real binary path
    claude_bin="$(readlink -f "$claude_bin")"
    sed -i "s|CLAUDE_BIN_PLACEHOLDER|${claude_bin}|g" "$compose_file"
  else
-    echo "Warning: claude CLI not found in PATH — update docker-compose.yml volumes manually" >&2
+    echo "Warning: claude CLI not found in PATH — set CLAUDE_BIN_DIR in .env manually" >&2
-    sed -i "s|CLAUDE_BIN_PLACEHOLDER|/usr/local/bin/claude|g" "$compose_file"
+    claude_bin="/usr/local/bin/claude"
  fi
  # Persist CLAUDE_BIN_DIR into .env so docker-compose can resolve it.
  local env_file="${FACTORY_ROOT}/.env"
  if [ -f "$env_file" ]; then
    if grep -q "^CLAUDE_BIN_DIR=" "$env_file" 2>/dev/null; then
      sed -i "s|^CLAUDE_BIN_DIR=.*|CLAUDE_BIN_DIR=${claude_bin}|" "$env_file"
    else
      printf 'CLAUDE_BIN_DIR=%s\n' "$claude_bin" >> "$env_file"
    fi
  else
    printf 'CLAUDE_BIN_DIR=%s\n' "$claude_bin" > "$env_file"
  fi
  # In build mode, replace image: with build: for locally-built images
  if [ "$use_build" = true ]; then
    sed -i 's|^\(  agents:\)|\1|' "$compose_file"
    sed -i '/^    image: ghcr\.io\/disinto\/agents:/{s|image: ghcr\.io/disinto/agents:.*|build:\n      context: .\n      dockerfile: docker/agents/Dockerfile|}' "$compose_file"
    sed -i '/^    image: ghcr\.io\/disinto\/edge:/{s|image: ghcr\.io/disinto/edge:.*|build: ./docker/edge|}' "$compose_file"
  fi
  echo "Created: ${compose_file}"
@ -435,7 +763,11 @@ _generate_agent_docker_impl() {
  fi
 }
-# Generate docker/Caddyfile template for edge proxy.
+# Generate docker/Caddyfile for the edge proxy.
 # **CANONICAL SOURCE**: This generator is the single source of truth for the Caddyfile.
 # Output path: ${FACTORY_ROOT}/docker/Caddyfile (gitignored — generated artifact).
 # The edge compose service mounts this path as /etc/caddy/Caddyfile.
 # On a fresh clone, `disinto init` calls generate_caddyfile before first `disinto up`.
 _generate_caddyfile_impl() {
  local docker_dir="${FACTORY_ROOT}/docker"
  local caddyfile="${docker_dir}/Caddyfile"
@ -450,8 +782,13 @@ _generate_caddyfile_impl() {
 # IP-only binding at bootstrap; domain + TLS added later via vault resource request
 :80 {
    # Redirect root to Forgejo
    handle / {
        redir /forge/ 302
    }
    # Reverse proxy to Forgejo
-    handle /forgejo/* {
+    handle /forge/* {
        reverse_proxy forgejo:3000
    }
@ -460,10 +797,28 @@ _generate_caddyfile_impl() {
        reverse_proxy woodpecker:8000
    }
-    # Default: proxy to staging container
+    # Reverse proxy to staging
-    handle {
+    handle /staging/* {
        reverse_proxy staging:80
    }
    # Chat service — reverse proxy to disinto-chat backend (#705)
    # OAuth routes bypass forward_auth — unauthenticated users need these (#709)
    handle /chat/login {
        reverse_proxy chat:8080
    }
    handle /chat/oauth/callback {
        reverse_proxy chat:8080
    }
    # Defense-in-depth: forward_auth stamps X-Forwarded-User from session (#709)
    handle /chat/* {
        forward_auth chat:8080 {
            uri /chat/auth/verify
            copy_headers X-Forwarded-User
            header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}
        }
        reverse_proxy chat:8080
    }
 }
 CADDYFILEEOF
--- a/lib/git-creds.sh
+++ b/lib/git-creds.sh
@ -0,0 +1,173 @@
 #!/usr/bin/env bash
 # git-creds.sh — Shared git credential helper configuration
 #
 # Configures a static credential helper for Forgejo password-based HTTP auth.
 # Forgejo 11.x rejects API tokens for git push (#361); password auth works.
 # This ensures all git operations (clone, fetch, push) use password auth
 # without needing tokens embedded in remote URLs (#604).
 #
 # Usage:
 #   source "${FACTORY_ROOT}/lib/git-creds.sh"
 #   configure_git_creds [HOME_DIR] [RUN_AS_CMD]
 #   repair_baked_cred_urls [--as RUN_AS_CMD] DIR [DIR ...]
 #
 # Globals expected:
 #   FORGE_PASS  — bot password for git HTTP auth
 #   FORGE_URL   — Forge instance URL (e.g. http://forgejo:3000)
 #   FORGE_TOKEN — API token (used to resolve bot username)
 set -euo pipefail
 # configure_git_creds [HOME_DIR] [RUN_AS_CMD]
 #   HOME_DIR    — home directory for the git user (default: $HOME or /home/agent)
 #   RUN_AS_CMD  — command prefix to run as another user (e.g. "gosu agent")
 #
 # Writes a credential helper script and configures git to use it globally.
 configure_git_creds() {
  local home_dir="${1:-${HOME:-/home/agent}}"
  local run_as="${2:-}"
  if [ -z "${FORGE_PASS:-}" ] || [ -z "${FORGE_URL:-}" ]; then
    return 0
  fi
  local forge_host forge_proto
  forge_host=$(printf '%s' "$FORGE_URL" | sed 's|https\?://||; s|/.*||')
  forge_proto=$(printf '%s' "$FORGE_URL" | sed 's|://.*||')
  local log_fn="${_GIT_CREDS_LOG_FN:-echo}"
  # Determine the bot username from FORGE_TOKEN identity with retry/backoff.
  # Never fall back to a hardcoded default — a wrong username paired with the
  # real password produces a cryptic 401 that's much harder to diagnose than
  # a missing credential helper (#741).
  local bot_user=""
  if [ -n "${FORGE_TOKEN:-}" ]; then
    local attempt
    for attempt in 1 2 3 4 5; do
      bot_user=$(curl -sf --max-time 5 -H "Authorization: token ${FORGE_TOKEN}" \
        "${FORGE_URL}/api/v1/user" 2>/dev/null | jq -r '.login // empty') || bot_user=""
      if [ -n "$bot_user" ]; then
        break
      fi
      $log_fn "WARNING: Forgejo not reachable (attempt ${attempt}/5) — retrying in ${attempt}s"
      sleep "$attempt"
    done
  fi
  if [ -z "$bot_user" ]; then
    $log_fn "ERROR: Could not determine bot username from FORGE_TOKEN after 5 attempts — credential helper NOT configured"
    $log_fn "ERROR: git push will fail until this is resolved. Restart the container after Forgejo is healthy."
    return 1
  fi
  # Export BOT_USER so downstream functions (e.g. configure_git_identity) can
  # reuse the resolved value without a redundant API call.
  export BOT_USER="$bot_user"
  local helper_path="${home_dir}/.git-credentials-helper"
  # Write a static credential helper script (git credential protocol)
  cat > "$helper_path" <<CREDEOF
 #!/bin/sh
 # Auto-generated git credential helper for Forgejo password auth (#361, #604)
 # Reads \$FORGE_PASS from env at runtime — file is safe to read on disk.
 # Only respond to "get" action; ignore "store" and "erase".
 [ "\$1" = "get" ] || exit 0
 # Read and discard stdin (git sends protocol/host info)
 cat >/dev/null
 echo "protocol=${forge_proto}"
 echo "host=${forge_host}"
 echo "username=${bot_user}"
 echo "password=\$FORGE_PASS"
 CREDEOF
  chmod 755 "$helper_path"
  # Set ownership and configure git if running as a different user
  if [ -n "$run_as" ]; then
    local target_user
    target_user=$(echo "$run_as" | awk '{print $NF}')
    chown "${target_user}:${target_user}" "$helper_path" 2>/dev/null || true
    $run_as bash -c "git config --global credential.helper '${helper_path}'"
  else
    git config --global credential.helper "$helper_path"
  fi
  # Set safe.directory to work around dubious ownership after container restart
  if [ -n "$run_as" ]; then
    $run_as bash -c "git config --global --add safe.directory '*'"
  else
    git config --global --add safe.directory '*'
  fi
  # Verify the credential helper actually authenticates (#741).
  # A helper that was written with a valid username but a mismatched password
  # would silently 401 on every push — catch it now.
  if ! curl -sf --max-time 5 -u "${bot_user}:${FORGE_PASS}" \
    "${FORGE_URL}/api/v1/user" >/dev/null 2>&1; then
    $log_fn "ERROR: credential helper verification failed — ${bot_user}:FORGE_PASS rejected by Forgejo"
    rm -f "$helper_path"
    return 1
  fi
  $log_fn "Git credential helper verified: ${bot_user}@${forge_host}"
 }
 # repair_baked_cred_urls [--as RUN_AS_CMD] DIR [DIR ...]
 #   Scans git repos under each DIR and rewrites remote URLs that contain
 #   embedded credentials (user:pass@host) to clean URLs.
 #   Logs each repair so operators can see the migration happened.
 #
 #   Optional --as flag runs git operations under the specified user wrapper
 #   (e.g. "gosu agent") to avoid dubious-ownership issues on user-owned repos.
 #
 # Set _GIT_CREDS_LOG_FN to a custom log function name (default: echo).
 repair_baked_cred_urls() {
  local log_fn="${_GIT_CREDS_LOG_FN:-echo}"
  local run_as=""
  local -a dirs=()
  while [ $# -gt 0 ]; do
    case "$1" in
      --as) shift; run_as="$1"; shift ;;
      *) dirs+=("$1"); shift ;;
    esac
  done
  for dir in "${dirs[@]}"; do
    [ -d "$dir" ] || continue
    # Find git repos: either dir itself or immediate subdirectories
    local -a repos=()
    if [ -d "${dir}/.git" ]; then
      repos+=("$dir")
    else
      local sub
      for sub in "$dir"/*/; do
        [ -d "${sub}.git" ] && repos+=("${sub%/}")
      done
    fi
    local repo
    for repo in "${repos[@]}"; do
      local url
      if [ -n "$run_as" ]; then
        url=$($run_as git -C "$repo" config --get remote.origin.url 2>/dev/null || true)
      else
        url=$(git -C "$repo" config --get remote.origin.url 2>/dev/null || true)
      fi
      [ -n "$url" ] || continue
      # Check if URL contains embedded credentials: http(s)://user:pass@host
      if printf '%s' "$url" | grep -qE '^https?://[^/]+@'; then
        # Strip credentials: http(s)://user:pass@host/path -> http(s)://host/path
        local clean_url
        clean_url=$(printf '%s' "$url" | sed -E 's|(https?://)[^@]+@|\1|')
        if [ -n "$run_as" ]; then
          $run_as git -C "$repo" remote set-url origin "$clean_url"
        else
          git -C "$repo" remote set-url origin "$clean_url"
        fi
        $log_fn "Repaired baked credentials in ${repo} (remote origin -> ${clean_url})"
      fi
    done
  done
 }
--- a/lib/hvault.sh
+++ b/lib/hvault.sh
@ -0,0 +1,279 @@
 #!/usr/bin/env bash
 # hvault.sh — HashiCorp Vault helper module
 #
 # Typed, audited helpers for Vault KV v2 access so no script re-implements
 # `curl -H "X-Vault-Token: ..."` ad-hoc.
 #
 # Usage: source this file, then call any hvault_* function.
 #
 # Environment:
 #   VAULT_ADDR  — Vault server address (required, no default)
 #   VAULT_TOKEN — auth token (precedence: env > /etc/vault.d/root.token)
 #
 # All functions emit structured JSON errors to stderr on failure.
 set -euo pipefail
 # ── Internal helpers ─────────────────────────────────────────────────────────
 # _hvault_err — emit structured JSON error to stderr
 # Args: func_name, message, [detail]
 _hvault_err() {
  local func="$1" msg="$2" detail="${3:-}"
  jq -n --arg func "$func" --arg msg "$msg" --arg detail "$detail" \
    '{error:true,function:$func,message:$msg,detail:$detail}' >&2
 }
 # _hvault_resolve_token — resolve VAULT_TOKEN from env or token file
 _hvault_resolve_token() {
  if [ -n "${VAULT_TOKEN:-}" ]; then
    return 0
  fi
  local token_file="/etc/vault.d/root.token"
  if [ -f "$token_file" ]; then
    VAULT_TOKEN="$(cat "$token_file")"
    export VAULT_TOKEN
    return 0
  fi
  return 1
 }
 # _hvault_check_prereqs — validate VAULT_ADDR and VAULT_TOKEN are set
 # Args: caller function name
 _hvault_check_prereqs() {
  local caller="$1"
  if [ -z "${VAULT_ADDR:-}" ]; then
    _hvault_err "$caller" "VAULT_ADDR is not set" "export VAULT_ADDR before calling $caller"
    return 1
  fi
  if ! _hvault_resolve_token; then
    _hvault_err "$caller" "VAULT_TOKEN is not set and /etc/vault.d/root.token not found" \
      "export VAULT_TOKEN or write token to /etc/vault.d/root.token"
    return 1
  fi
 }
 # _hvault_request — execute a Vault API request
 # Args: method, path, [data]
 # Outputs: response body to stdout
 # Returns: 0 on 2xx, 1 otherwise (error JSON to stderr)
 _hvault_request() {
  local method="$1" path="$2" data="${3:-}"
  local url="${VAULT_ADDR}/v1/${path}"
  local http_code body
  local tmpfile
  tmpfile="$(mktemp)"
  local curl_args=(
    -s
    -w '%{http_code}'
    -H "X-Vault-Token: ${VAULT_TOKEN}"
    -H "Content-Type: application/json"
    -X "$method"
    -o "$tmpfile"
  )
  if [ -n "$data" ]; then
    curl_args+=(-d "$data")
  fi
  http_code="$(curl "${curl_args[@]}" "$url")" || {
    _hvault_err "_hvault_request" "curl failed" "url=$url"
    rm -f "$tmpfile"
    return 1
  }
  body="$(cat "$tmpfile")"
  rm -f "$tmpfile"
  # Check HTTP status — 2xx is success
  case "$http_code" in
    2[0-9][0-9])
      printf '%s' "$body"
      return 0
      ;;
    *)
      _hvault_err "_hvault_request" "HTTP $http_code" "$body"
      return 1
      ;;
  esac
 }
 # ── Public API ───────────────────────────────────────────────────────────────
 # hvault_kv_get PATH [KEY]
 #   Read a KV v2 secret at PATH, optionally extract a single KEY.
 #   Outputs: JSON value (full data object, or single key value)
 hvault_kv_get() {
  local path="${1:-}"
  local key="${2:-}"
  if [ -z "$path" ]; then
    _hvault_err "hvault_kv_get" "PATH is required" "usage: hvault_kv_get PATH [KEY]"
    return 1
  fi
  _hvault_check_prereqs "hvault_kv_get" || return 1
  local response
  response="$(_hvault_request GET "secret/data/${path}")" || return 1
  if [ -n "$key" ]; then
    printf '%s' "$response" | jq -e -r --arg key "$key" '.data.data[$key]' 2>/dev/null || {
      _hvault_err "hvault_kv_get" "key not found" "key=$key path=$path"
      return 1
    }
  else
    printf '%s' "$response" | jq -e '.data.data' 2>/dev/null || {
      _hvault_err "hvault_kv_get" "failed to parse response" "path=$path"
      return 1
    }
  fi
 }
 # hvault_kv_put PATH KEY=VAL [KEY=VAL ...]
 #   Write a KV v2 secret at PATH. Accepts one or more KEY=VAL pairs.
 hvault_kv_put() {
  local path="${1:-}"
  shift || true
  if [ -z "$path" ] || [ $# -eq 0 ]; then
    _hvault_err "hvault_kv_put" "PATH and at least one KEY=VAL required" \
      "usage: hvault_kv_put PATH KEY=VAL [KEY=VAL ...]"
    return 1
  fi
  _hvault_check_prereqs "hvault_kv_put" || return 1
  # Build JSON payload from KEY=VAL pairs entirely via jq
  local payload='{"data":{}}'
  for kv in "$@"; do
    local k="${kv%%=*}"
    local v="${kv#*=}"
    if [ "$k" = "$kv" ]; then
      _hvault_err "hvault_kv_put" "invalid KEY=VAL pair" "got: $kv"
      return 1
    fi
    payload="$(printf '%s' "$payload" | jq --arg k "$k" --arg v "$v" '.data[$k] = $v')"
  done
  _hvault_request POST "secret/data/${path}" "$payload" >/dev/null
 }
 # hvault_kv_list PATH
 #   List keys at a KV v2 path.
 #   Outputs: JSON array of key names
 hvault_kv_list() {
  local path="${1:-}"
  if [ -z "$path" ]; then
    _hvault_err "hvault_kv_list" "PATH is required" "usage: hvault_kv_list PATH"
    return 1
  fi
  _hvault_check_prereqs "hvault_kv_list" || return 1
  local response
  response="$(_hvault_request LIST "secret/metadata/${path}")" || return 1
  printf '%s' "$response" | jq -e '.data.keys' 2>/dev/null || {
    _hvault_err "hvault_kv_list" "failed to parse response" "path=$path"
    return 1
  }
 }
 # hvault_policy_apply NAME FILE
 #   Idempotent policy upsert — create or update a Vault policy.
 hvault_policy_apply() {
  local name="${1:-}"
  local file="${2:-}"
  if [ -z "$name" ] || [ -z "$file" ]; then
    _hvault_err "hvault_policy_apply" "NAME and FILE are required" \
      "usage: hvault_policy_apply NAME FILE"
    return 1
  fi
  if [ ! -f "$file" ]; then
    _hvault_err "hvault_policy_apply" "policy file not found" "file=$file"
    return 1
  fi
  _hvault_check_prereqs "hvault_policy_apply" || return 1
  local policy_content
  policy_content="$(cat "$file")"
  local payload
  payload="$(jq -n --arg policy "$policy_content" '{"policy": $policy}')"
  _hvault_request PUT "sys/policies/acl/${name}" "$payload" >/dev/null
 }
 # hvault_jwt_login ROLE JWT
 #   Exchange a JWT for a short-lived Vault token.
 #   Outputs: client token string
 hvault_jwt_login() {
  local role="${1:-}"
  local jwt="${2:-}"
  if [ -z "$role" ] || [ -z "$jwt" ]; then
    _hvault_err "hvault_jwt_login" "ROLE and JWT are required" \
      "usage: hvault_jwt_login ROLE JWT"
    return 1
  fi
  # Only need VAULT_ADDR, not VAULT_TOKEN (we're obtaining a token)
  if [ -z "${VAULT_ADDR:-}" ]; then
    _hvault_err "hvault_jwt_login" "VAULT_ADDR is not set"
    return 1
  fi
  local payload
  payload="$(jq -n --arg role "$role" --arg jwt "$jwt" \
    '{"role": $role, "jwt": $jwt}')"
  local response
  # JWT login does not require an existing token — use curl directly
  local tmpfile http_code
  tmpfile="$(mktemp)"
  http_code="$(curl -s -w '%{http_code}' \
    -H "Content-Type: application/json" \
    -X POST \
    -d "$payload" \
    -o "$tmpfile" \
    "${VAULT_ADDR}/v1/auth/jwt/login")" || {
    _hvault_err "hvault_jwt_login" "curl failed"
    rm -f "$tmpfile"
    return 1
  }
  local body
  body="$(cat "$tmpfile")"
  rm -f "$tmpfile"
  case "$http_code" in
    2[0-9][0-9])
      printf '%s' "$body" | jq -e -r '.auth.client_token' 2>/dev/null || {
        _hvault_err "hvault_jwt_login" "failed to extract client_token" "$body"
        return 1
      }
      ;;
    *)
      _hvault_err "hvault_jwt_login" "HTTP $http_code" "$body"
      return 1
      ;;
  esac
 }
 # hvault_token_lookup
 #   Returns TTL, policies, and accessor for the current token.
 #   Outputs: JSON object with ttl, policies, accessor fields
 hvault_token_lookup() {
  _hvault_check_prereqs "hvault_token_lookup" || return 1
  local response
  response="$(_hvault_request GET "auth/token/lookup-self")" || return 1
  printf '%s' "$response" | jq -e '{
    ttl: .data.ttl,
    policies: .data.policies,
    accessor: .data.accessor,
    display_name: .data.display_name
  }' 2>/dev/null || {
    _hvault_err "hvault_token_lookup" "failed to parse token info"
    return 1
  }
 }
--- a/lib/init/nomad/install.sh
+++ b/lib/init/nomad/install.sh
@ -0,0 +1,143 @@
 #!/usr/bin/env bash
 # =============================================================================
 # lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad + Vault
 #
 # Part of the Nomad+Vault migration. Installs both the `nomad` binary (S0.2,
 # issue #822) and the `vault` binary (S0.3, issue #823) from the same
 # HashiCorp apt repository. Does NOT configure, start, or enable any systemd
 # unit — lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh
 # own that. Does NOT wire this script into `disinto init` — S0.4 owns that.
 #
 # Idempotency contract:
 #   - Running twice back-to-back is a no-op once both target versions are
 #     installed and the apt source is in place.
 #   - Adds the HashiCorp apt keyring only if it is absent.
 #   - Adds the HashiCorp apt sources list only if it is absent.
 #   - Skips `apt-get install` for any package whose installed version already
 #     matches the pin. If both are at pin, exits before touching apt.
 #
 # Configuration:
 #   NOMAD_VERSION  — pinned Nomad version (default: see below). Apt package
 #                    name is versioned as "nomad=<version>-1".
 #   VAULT_VERSION  — pinned Vault version (default: see below). Apt package
 #                    name is versioned as "vault=<version>-1".
 #
 # Usage:
 #   sudo lib/init/nomad/install.sh
 #   sudo NOMAD_VERSION=1.9.5 VAULT_VERSION=1.18.5 lib/init/nomad/install.sh
 #
 # Exit codes:
 #   0  success (installed or already present)
 #   1  precondition failure (not Debian/Ubuntu, missing tools, not root)
 # =============================================================================
 set -euo pipefail
 # Pin to specific 1.x releases. Bump here, not at call sites.
 NOMAD_VERSION="${NOMAD_VERSION:-1.9.5}"
 VAULT_VERSION="${VAULT_VERSION:-1.18.5}"
 HASHICORP_KEYRING="/usr/share/keyrings/hashicorp-archive-keyring.gpg"
 HASHICORP_SOURCES="/etc/apt/sources.list.d/hashicorp.list"
 HASHICORP_GPG_URL="https://apt.releases.hashicorp.com/gpg"
 HASHICORP_REPO_URL="https://apt.releases.hashicorp.com"
 log() { printf '[install] %s\n' "$*"; }
 die() { printf '[install] ERROR: %s\n' "$*" >&2; exit 1; }
 # _installed_version BINARY
 #   Echoes the installed semver for `nomad` or `vault` (e.g. "1.9.5").
 #   Both tools print their version on the first line of `<bin> version` as
 #   "<Name> v<semver>..." — the shared awk extracts $2 with the leading "v"
 #   stripped. Empty string when the binary is absent or output is unexpected.
 _installed_version() {
  local bin="$1"
  command -v "$bin" >/dev/null 2>&1 || { printf ''; return 0; }
  "$bin" version 2>/dev/null \
    | awk 'NR==1 {sub(/^v/, "", $2); print $2; exit}'
 }
 # ── Preconditions ────────────────────────────────────────────────────────────
 if [ "$(id -u)" -ne 0 ]; then
  die "must run as root (needs apt-get + /usr/share/keyrings write access)"
 fi
 for bin in apt-get gpg curl lsb_release; do
  command -v "$bin" >/dev/null 2>&1 \
    || die "required binary not found: ${bin}"
 done
 CODENAME="$(lsb_release -cs)"
 [ -n "$CODENAME" ] || die "lsb_release returned empty codename"
 # ── Fast-path: are both already at desired versions? ─────────────────────────
 nomad_installed="$(_installed_version nomad)"
 vault_installed="$(_installed_version vault)"
 need_pkgs=()
 if [ "$nomad_installed" = "$NOMAD_VERSION" ]; then
  log "nomad ${NOMAD_VERSION} already installed"
 else
  need_pkgs+=("nomad=${NOMAD_VERSION}-1")
 fi
 if [ "$vault_installed" = "$VAULT_VERSION" ]; then
  log "vault ${VAULT_VERSION} already installed"
 else
  need_pkgs+=("vault=${VAULT_VERSION}-1")
 fi
 if [ "${#need_pkgs[@]}" -eq 0 ]; then
  log "nothing to do"
  exit 0
 fi
 # ── Ensure HashiCorp apt keyring ─────────────────────────────────────────────
 if [ ! -f "$HASHICORP_KEYRING" ]; then
  log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}"
  tmpkey="$(mktemp)"
  trap 'rm -f "$tmpkey"' EXIT
  curl -fsSL "$HASHICORP_GPG_URL" -o "$tmpkey" \
    || die "failed to fetch HashiCorp GPG key from ${HASHICORP_GPG_URL}"
  gpg --dearmor -o "$HASHICORP_KEYRING" < "$tmpkey" \
    || die "failed to dearmor HashiCorp GPG key"
  chmod 0644 "$HASHICORP_KEYRING"
  rm -f "$tmpkey"
  trap - EXIT
 else
  log "HashiCorp apt keyring already present"
 fi
 # ── Ensure HashiCorp apt sources list ────────────────────────────────────────
 desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main"
 if [ ! -f "$HASHICORP_SOURCES" ] \
   || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then
  log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}"
  printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES"
  apt_update_needed=1
 else
  log "HashiCorp apt sources list already present"
  apt_update_needed=0
 fi
 # ── Install the pinned versions ──────────────────────────────────────────────
 if [ "$apt_update_needed" -eq 1 ]; then
  log "running apt-get update"
  DEBIAN_FRONTEND=noninteractive apt-get update -qq \
    || die "apt-get update failed"
 fi
 log "installing ${need_pkgs[*]}"
 DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
  "${need_pkgs[@]}" \
  || die "apt-get install ${need_pkgs[*]} failed"
 # ── Verify ───────────────────────────────────────────────────────────────────
 final_nomad="$(_installed_version nomad)"
 if [ "$final_nomad" != "$NOMAD_VERSION" ]; then
  die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'"
 fi
 final_vault="$(_installed_version vault)"
 if [ "$final_vault" != "$VAULT_VERSION" ]; then
  die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'"
 fi
 log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} installed successfully"
--- a/lib/init/nomad/lib-systemd.sh
+++ b/lib/init/nomad/lib-systemd.sh
@ -0,0 +1,77 @@
 #!/usr/bin/env bash
 # =============================================================================
 # lib/init/nomad/lib-systemd.sh — Shared idempotent systemd-unit installer
 #
 # Sourced by lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh
 # (and any future sibling) to collapse the "write unit if content differs,
 # daemon-reload, enable (never start)" boilerplate.
 #
 # Install-but-don't-start is the invariant this helper enforces — mid-migration
 # installers land files and enable units; the orchestrator (S0.4) starts them.
 #
 # Public API (sourced into caller scope):
 #
 #   systemd_require_preconditions UNIT_PATH
 #     Asserts the caller is uid 0 and `systemctl` is on $PATH. Calls the
 #     caller's die() with a UNIT_PATH-scoped message on failure.
 #
 #   systemd_install_unit UNIT_PATH UNIT_NAME UNIT_CONTENT
 #     Writes UNIT_CONTENT to UNIT_PATH (0644 root:root) only if on-disk
 #     content differs. If written, runs `systemctl daemon-reload`. Then
 #     enables UNIT_NAME (no-op if already enabled). Never starts the unit.
 #
 # Caller contract:
 #   - Callers MUST define `log()` and `die()` before sourcing this file (we
 #     call log() for status chatter and rely on the caller's error-handling
 #     stance; `set -e` propagates install/cmp/systemctl failures).
 # =============================================================================
 # systemd_require_preconditions UNIT_PATH
 systemd_require_preconditions() {
  local unit_path="$1"
  if [ "$(id -u)" -ne 0 ]; then
    die "must run as root (needs write access to ${unit_path})"
  fi
  command -v systemctl >/dev/null 2>&1 \
    || die "systemctl not found (systemd is required)"
 }
 # systemd_install_unit UNIT_PATH UNIT_NAME UNIT_CONTENT
 systemd_install_unit() {
  local unit_path="$1"
  local unit_name="$2"
  local unit_content="$3"
  local needs_reload=0
  if [ ! -f "$unit_path" ] \
     || ! printf '%s\n' "$unit_content" | cmp -s - "$unit_path"; then
    log "writing unit → ${unit_path}"
    # Subshell-scoped EXIT trap guarantees the temp file is removed on
    # both success AND set-e-induced failure of `install`. A function-
    # scoped RETURN trap does NOT fire on errexit-abort (bash only runs
    # RETURN on normal function exit), so the subshell is the reliable
    # cleanup boundary. It's also isolated from the caller's EXIT trap.
    (
      local tmp
      tmp="$(mktemp)"
      trap 'rm -f "$tmp"' EXIT
      printf '%s\n' "$unit_content" > "$tmp"
      install -m 0644 -o root -g root "$tmp" "$unit_path"
    )
    needs_reload=1
  else
    log "unit file already up to date"
  fi
  if [ "$needs_reload" -eq 1 ]; then
    log "systemctl daemon-reload"
    systemctl daemon-reload
  fi
  if systemctl is-enabled --quiet "$unit_name" 2>/dev/null; then
    log "${unit_name} already enabled"
  else
    log "systemctl enable ${unit_name}"
    systemctl enable "$unit_name" >/dev/null
  fi
 }
--- a/lib/init/nomad/systemd-nomad.sh
+++ b/lib/init/nomad/systemd-nomad.sh
@ -0,0 +1,102 @@
 #!/usr/bin/env bash
 # =============================================================================
 # lib/init/nomad/systemd-nomad.sh — Idempotent systemd unit installer for Nomad
 #
 # Part of the Nomad+Vault migration (S0.2, issue #822). Writes
 # /etc/systemd/system/nomad.service pointing at /etc/nomad.d/ and runs
 # `systemctl enable nomad` WITHOUT starting the service — we don't launch
 # the cluster until S0.4 wires everything together.
 #
 # Idempotency contract:
 #   - Existing unit file is NOT rewritten when on-disk content already
 #     matches the desired content (avoids spurious `daemon-reload`).
 #   - `systemctl enable` on an already-enabled unit is a no-op.
 #   - This script is safe to run unconditionally before every factory boot.
 #
 # Preconditions:
 #   - nomad binary installed (see lib/init/nomad/install.sh)
 #   - /etc/nomad.d/ will hold server.hcl / client.hcl (placed by S0.4)
 #
 # Usage:
 #   sudo lib/init/nomad/systemd-nomad.sh
 #
 # Exit codes:
 #   0  success (unit installed + enabled, or already so)
 #   1  precondition failure (not root, no systemctl, no nomad binary)
 # =============================================================================
 set -euo pipefail
 UNIT_PATH="/etc/systemd/system/nomad.service"
 NOMAD_CONFIG_DIR="/etc/nomad.d"
 NOMAD_DATA_DIR="/var/lib/nomad"
 log() { printf '[systemd-nomad] %s\n' "$*"; }
 die() { printf '[systemd-nomad] ERROR: %s\n' "$*" >&2; exit 1; }
 # shellcheck source=lib-systemd.sh
 . "$(dirname "${BASH_SOURCE[0]}")/lib-systemd.sh"
 # ── Preconditions ────────────────────────────────────────────────────────────
 systemd_require_preconditions "$UNIT_PATH"
 NOMAD_BIN="$(command -v nomad 2>/dev/null || true)"
 [ -n "$NOMAD_BIN" ] \
  || die "nomad binary not found — run lib/init/nomad/install.sh first"
 # ── Desired unit content ─────────────────────────────────────────────────────
 # Upstream-recommended baseline (https://developer.hashicorp.com/nomad/docs/install/production/deployment-guide)
 # trimmed for a single-node combined server+client dev box.
 #   - Wants=/After= network-online: nomad must have networking up.
 #   - User/Group=root: the Docker driver needs root to talk to dockerd.
 #   - LimitNOFILE/LimitNPROC=infinity: avoid Nomad's startup warning.
 #   - KillSignal=SIGINT: triggers Nomad's graceful shutdown path.
 #   - Restart=on-failure with a bounded burst to avoid crash-loops eating the
 #     journal when /etc/nomad.d/ is mis-configured.
 read -r -d '' DESIRED_UNIT <<EOF || true
 [Unit]
 Description=Nomad
 Documentation=https://developer.hashicorp.com/nomad/docs
 Wants=network-online.target
 After=network-online.target
 # When Docker is present, ensure dockerd is up before nomad starts — the
 # Docker task driver needs the daemon socket available at startup.
 Wants=docker.service
 After=docker.service
 [Service]
 Type=notify
 User=root
 Group=root
 ExecReload=/bin/kill -HUP \$MAINPID
 ExecStart=${NOMAD_BIN} agent -config=${NOMAD_CONFIG_DIR}
 KillMode=process
 KillSignal=SIGINT
 LimitNOFILE=infinity
 LimitNPROC=infinity
 Restart=on-failure
 RestartSec=2
 StartLimitBurst=3
 StartLimitIntervalSec=10
 TasksMax=infinity
 OOMScoreAdjust=-1000
 [Install]
 WantedBy=multi-user.target
 EOF
 # ── Ensure config + data dirs exist ──────────────────────────────────────────
 # We do not populate /etc/nomad.d/ here (that's S0.4). We do create the
 # directory so `nomad agent -config=/etc/nomad.d` doesn't error if the unit
 # is started before hcl files are dropped in.
 for d in "$NOMAD_CONFIG_DIR" "$NOMAD_DATA_DIR"; do
  if [ ! -d "$d" ]; then
    log "creating ${d}"
    install -d -m 0755 "$d"
  fi
 done
 # ── Install + reload + enable (shared with systemd-vault.sh via lib-systemd) ─
 systemd_install_unit "$UNIT_PATH" "nomad.service" "$DESIRED_UNIT"
 log "done — unit installed and enabled (NOT started; S0.4 brings the cluster up)"
--- a/lib/init/nomad/systemd-vault.sh
+++ b/lib/init/nomad/systemd-vault.sh
@ -0,0 +1,151 @@
 #!/usr/bin/env bash
 # =============================================================================
 # lib/init/nomad/systemd-vault.sh — Idempotent systemd unit installer for Vault
 #
 # Part of the Nomad+Vault migration (S0.3, issue #823). Lands three things:
 #   1. /etc/vault.d/               (0755 root:root)
 #   2. /etc/vault.d/vault.hcl      (copy of nomad/vault.hcl, 0644 root:root)
 #   3. /var/lib/vault/data/        (0700 root:root, Vault file-storage backend)
 #   4. /etc/systemd/system/vault.service  (0644 root:root)
 #
 # Then `systemctl enable vault` WITHOUT starting the service. Bootstrap
 # order is:
 #   lib/init/nomad/install.sh         (nomad + vault binaries)
 #   lib/init/nomad/systemd-vault.sh   (this script — unit + config + dirs)
 #   lib/init/nomad/vault-init.sh      (init + write unseal.key + unseal once)
 #   systemctl start vault             (ExecStartPost auto-unseals from file)
 #
 # The systemd unit's ExecStartPost reads /etc/vault.d/unseal.key and calls
 # `vault operator unseal`. That file is written by vault-init.sh on first
 # run; until it exists, `systemctl start vault` will leave Vault sealed
 # (ExecStartPost fails, unit goes into failed state — intentional, visible).
 #
 # Seal model:
 #   The single unseal key lives at /etc/vault.d/unseal.key (0400 root).
 #   Seal-key theft == vault theft. Factory-dev-box-acceptable tradeoff —
 #   we avoid running a second Vault to auto-unseal the first.
 #
 # Idempotency contract:
 #   - Unit file NOT rewritten when on-disk content already matches desired.
 #   - vault.hcl NOT rewritten when on-disk content matches the repo copy.
 #   - `systemctl enable` on an already-enabled unit is a no-op.
 #   - Safe to run unconditionally before every factory boot.
 #
 # Preconditions:
 #   - vault binary installed (lib/init/nomad/install.sh)
 #   - nomad/vault.hcl present in the repo (relative to this script)
 #
 # Usage:
 #   sudo lib/init/nomad/systemd-vault.sh
 #
 # Exit codes:
 #   0  success (unit+config installed + enabled, or already so)
 #   1  precondition failure (not root, no systemctl, no vault binary,
 #      missing source config)
 # =============================================================================
 set -euo pipefail
 UNIT_PATH="/etc/systemd/system/vault.service"
 VAULT_CONFIG_DIR="/etc/vault.d"
 VAULT_CONFIG_FILE="${VAULT_CONFIG_DIR}/vault.hcl"
 VAULT_DATA_DIR="/var/lib/vault/data"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
 VAULT_HCL_SRC="${REPO_ROOT}/nomad/vault.hcl"
 log() { printf '[systemd-vault] %s\n' "$*"; }
 die() { printf '[systemd-vault] ERROR: %s\n' "$*" >&2; exit 1; }
 # shellcheck source=lib-systemd.sh
 . "${SCRIPT_DIR}/lib-systemd.sh"
 # ── Preconditions ────────────────────────────────────────────────────────────
 systemd_require_preconditions "$UNIT_PATH"
 VAULT_BIN="$(command -v vault 2>/dev/null || true)"
 [ -n "$VAULT_BIN" ] \
  || die "vault binary not found — run lib/init/nomad/install.sh first"
 [ -f "$VAULT_HCL_SRC" ] \
  || die "source config not found: ${VAULT_HCL_SRC}"
 # ── Desired unit content ─────────────────────────────────────────────────────
 # Adapted from HashiCorp's recommended vault.service template
 # (https://developer.hashicorp.com/vault/tutorials/getting-started-deploy/deploy)
 # for a single-node factory dev box:
 #   - User=root keeps the seal-key read path simple (unseal.key is 0400 root).
 #   - CAP_IPC_LOCK lets mlock() succeed so disable_mlock=false is honoured.
 #     Harmless when running as root; required if this is ever flipped to a
 #     dedicated `vault` user.
 #   - ExecStartPost auto-unseals on every boot using the persisted key.
 #     This is the dev-persisted-seal tradeoff — seal-key theft == vault
 #     theft, but no second Vault to babysit.
 #   - ConditionFileNotEmpty guards against starting without config — makes
 #     a missing vault.hcl visible in systemctl status, not a crash loop.
 #   - Type=notify so systemd waits for Vault's listener-ready notification
 #     before running ExecStartPost (ExecStartPost also has `sleep 2` as a
 #     belt-and-braces guard against Type=notify edge cases).
 #   - \$MAINPID is escaped so bash doesn't expand it inside this heredoc.
 #   - \$(cat ...) is escaped so the subshell runs at unit-execution time
 #     (inside bash -c), not at heredoc-expansion time here.
 read -r -d '' DESIRED_UNIT <<EOF || true
 [Unit]
 Description=HashiCorp Vault
 Documentation=https://developer.hashicorp.com/vault/docs
 Requires=network-online.target
 After=network-online.target
 ConditionFileNotEmpty=${VAULT_CONFIG_FILE}
 StartLimitIntervalSec=60
 StartLimitBurst=3
 [Service]
 Type=notify
 User=root
 Group=root
 Environment=VAULT_ADDR=http://127.0.0.1:8200
 SecureBits=keep-caps
 CapabilityBoundingSet=CAP_IPC_LOCK
 AmbientCapabilities=CAP_IPC_LOCK
 ExecStart=${VAULT_BIN} server -config=${VAULT_CONFIG_FILE}
 ExecStartPost=/bin/bash -c 'sleep 2 && ${VAULT_BIN} operator unseal \$(cat ${VAULT_CONFIG_DIR}/unseal.key)'
 ExecReload=/bin/kill --signal HUP \$MAINPID
 KillMode=process
 KillSignal=SIGINT
 Restart=on-failure
 RestartSec=5
 TimeoutStopSec=30
 LimitNOFILE=65536
 LimitMEMLOCK=infinity
 [Install]
 WantedBy=multi-user.target
 EOF
 # ── Ensure config + data dirs exist ──────────────────────────────────────────
 # /etc/vault.d is 0755 — vault.hcl is world-readable (no secrets in it);
 # the real secrets (unseal.key, root.token) get their own 0400 mode.
 # /var/lib/vault/data is 0700 — vault's on-disk state (encrypted-at-rest
 # by Vault itself, but an extra layer of "don't rely on that").
 if [ ! -d "$VAULT_CONFIG_DIR" ]; then
  log "creating ${VAULT_CONFIG_DIR}"
  install -d -m 0755 -o root -g root "$VAULT_CONFIG_DIR"
 fi
 if [ ! -d "$VAULT_DATA_DIR" ]; then
  log "creating ${VAULT_DATA_DIR}"
  install -d -m 0700 -o root -g root "$VAULT_DATA_DIR"
 fi
 # ── Install vault.hcl only if content differs ────────────────────────────────
 if [ ! -f "$VAULT_CONFIG_FILE" ] \
   || ! cmp -s "$VAULT_HCL_SRC" "$VAULT_CONFIG_FILE"; then
  log "writing config → ${VAULT_CONFIG_FILE}"
  install -m 0644 -o root -g root "$VAULT_HCL_SRC" "$VAULT_CONFIG_FILE"
 else
  log "config already up to date"
 fi
 # ── Install + reload + enable (shared with systemd-nomad.sh via lib-systemd) ─
 systemd_install_unit "$UNIT_PATH" "vault.service" "$DESIRED_UNIT"
 log "done — unit+config installed and enabled (NOT started; vault-init.sh next)"
--- a/lib/init/nomad/vault-init.sh
+++ b/lib/init/nomad/vault-init.sh
@ -0,0 +1,206 @@
 #!/usr/bin/env bash
 # =============================================================================
 # lib/init/nomad/vault-init.sh — Idempotent Vault first-run initializer
 #
 # Part of the Nomad+Vault migration (S0.3, issue #823). Initializes Vault
 # in dev-persisted-seal mode (single unseal key on disk) and unseals once.
 # On re-run, becomes a no-op — never re-initializes or rotates the key.
 #
 # What it does (first run):
 #   1. Ensures Vault is reachable at ${VAULT_ADDR} — spawns a temporary
 #      `vault server -config=/etc/vault.d/vault.hcl` if not already up.
 #   2. Runs `vault operator init -key-shares=1 -key-threshold=1` and
 #      captures the resulting unseal key + root token.
 #   3. Writes /etc/vault.d/unseal.key   (0400 root, no trailing newline).
 #   4. Writes /etc/vault.d/root.token   (0400 root, no trailing newline).
 #   5. Unseals Vault once in the current process.
 #   6. Shuts down the temporary server if we started one (so a subsequent
 #      `systemctl start vault` doesn't conflict on port 8200).
 #
 # Idempotency contract:
 #   - /etc/vault.d/unseal.key exists AND `vault status` reports
 #     initialized=true → exit 0, no mutation, no re-init.
 #   - Initialized-but-unseal.key-missing is a hard failure (can't recover
 #     the key without the existing storage; user must restore from backup).
 #
 # Bootstrap order:
 #   lib/init/nomad/install.sh          (installs vault binary)
 #   lib/init/nomad/systemd-vault.sh    (lands unit + config + dirs; enables)
 #   lib/init/nomad/vault-init.sh       (this script — init + unseal once)
 #   systemctl start vault              (ExecStartPost auto-unseals henceforth)
 #
 # Seal model:
 #   Single unseal key persisted on disk at /etc/vault.d/unseal.key. Seal-key
 #   theft == vault theft. Factory-dev-box-acceptable tradeoff — we avoid
 #   running a second Vault to auto-unseal the first.
 #
 # Environment:
 #   VAULT_ADDR  — Vault API address (default: http://127.0.0.1:8200).
 #
 # Usage:
 #   sudo lib/init/nomad/vault-init.sh
 #
 # Exit codes:
 #   0  success (initialized + unsealed + keys persisted; or already done)
 #   1  precondition / operational failure
 # =============================================================================
 set -euo pipefail
 VAULT_CONFIG_FILE="/etc/vault.d/vault.hcl"
 UNSEAL_KEY_FILE="/etc/vault.d/unseal.key"
 ROOT_TOKEN_FILE="/etc/vault.d/root.token"
 VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}"
 export VAULT_ADDR
 # Track whether we spawned a temporary vault (for cleanup).
 spawned_pid=""
 spawned_log=""
 log() { printf '[vault-init] %s\n' "$*"; }
 die() { printf '[vault-init] ERROR: %s\n' "$*" >&2; exit 1; }
 # ── Cleanup: stop the temporary server (if we started one) on any exit ───────
 # EXIT trap fires on success AND failure AND signals — so we never leak a
 # background vault process holding port 8200 after this script returns.
 cleanup() {
  if [ -n "$spawned_pid" ] && kill -0 "$spawned_pid" 2>/dev/null; then
    log "stopping temporary vault (pid=${spawned_pid})"
    kill "$spawned_pid" 2>/dev/null || true
    wait "$spawned_pid" 2>/dev/null || true
  fi
  if [ -n "$spawned_log" ] && [ -f "$spawned_log" ]; then
    rm -f "$spawned_log"
  fi
 }
 trap cleanup EXIT
 # ── Preconditions ────────────────────────────────────────────────────────────
 if [ "$(id -u)" -ne 0 ]; then
  die "must run as root (needs to write 0400 files under /etc/vault.d)"
 fi
 for bin in vault jq; do
  command -v "$bin" >/dev/null 2>&1 \
    || die "required binary not found: ${bin}"
 done
 [ -f "$VAULT_CONFIG_FILE" ] \
  || die "config not found: ${VAULT_CONFIG_FILE} — run systemd-vault.sh first"
 # ── Helpers ──────────────────────────────────────────────────────────────────
 # vault_reachable — true iff `vault status` can reach the server.
 #   Exit codes from `vault status`:
 #     0 = reachable, initialized, unsealed
 #     2 = reachable, sealed (or uninitialized)
 #     1 = unreachable / other error
 #   We treat 0 and 2 as "reachable". `|| status=$?` avoids set -e tripping
 #   on the expected sealed-is-also-fine case.
 vault_reachable() {
  local status=0
  vault status -format=json >/dev/null 2>&1 || status=$?
  [ "$status" -eq 0 ] || [ "$status" -eq 2 ]
 }
 # vault_initialized — echoes "true" / "false" / "" (empty on parse failure
 # or unreachable vault). Always returns 0 so that `x="$(vault_initialized)"`
 # is safe under `set -euo pipefail`.
 #
 # Key subtlety: `vault status` exits 2 when Vault is sealed OR uninitialized
 # — the exact state we need to *observe* on first run. Without the
 # `|| true` guard, pipefail + set -e inside a standalone assignment would
 # propagate that exit 2 to the outer script and abort before we ever call
 # `vault operator init`. We capture `vault status`'s output to a variable
 # first (pipefail-safe), then feed it to jq separately.
 vault_initialized() {
  local out=""
  out="$(vault status -format=json 2>/dev/null || true)"
  [ -n "$out" ] || { printf ''; return 0; }
  printf '%s' "$out" | jq -r '.initialized' 2>/dev/null || printf ''
 }
 # write_secret_file PATH CONTENT
 #   Write CONTENT to PATH atomically with 0400 root:root and no trailing
 #   newline. mktemp+install keeps perms tight for the whole lifetime of
 #   the file on disk — no 0644-then-chmod window.
 write_secret_file() {
  local path="$1" content="$2"
  local tmp
  tmp="$(mktemp)"
  printf '%s' "$content" > "$tmp"
  install -m 0400 -o root -g root "$tmp" "$path"
  rm -f "$tmp"
 }
 # ── Ensure vault is reachable ────────────────────────────────────────────────
 if ! vault_reachable; then
  log "vault not reachable at ${VAULT_ADDR} — starting temporary server"
  spawned_log="$(mktemp)"
  vault server -config="$VAULT_CONFIG_FILE" >"$spawned_log" 2>&1 &
  spawned_pid=$!
  # Poll for readiness. Vault's API listener comes up before notify-ready
  # in Type=notify mode, but well inside a few seconds even on cold boots.
  ready=0
  for _ in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
    if vault_reachable; then
      ready=1
      break
    fi
    sleep 1
  done
  if [ "$ready" -ne 1 ]; then
    log "vault did not become reachable within 15s — server log follows:"
    if [ -f "$spawned_log" ]; then
      sed 's/^/[vault-server] /' "$spawned_log" >&2 || true
    fi
    die "failed to start temporary vault server"
  fi
  log "temporary vault ready (pid=${spawned_pid})"
 fi
 # ── Idempotency gate ─────────────────────────────────────────────────────────
 initialized="$(vault_initialized)"
 if [ "$initialized" = "true" ] && [ -f "$UNSEAL_KEY_FILE" ]; then
  log "vault already initialized and unseal.key present — no-op"
  exit 0
 fi
 if [ "$initialized" = "true" ] && [ ! -f "$UNSEAL_KEY_FILE" ]; then
  die "vault is initialized but ${UNSEAL_KEY_FILE} is missing — cannot recover the unseal key; restore from backup or wipe ${VAULT_CONFIG_FILE%/*}/data and re-run"
 fi
 if [ "$initialized" != "false" ]; then
  die "unexpected initialized state: '${initialized}' (expected 'true' or 'false')"
 fi
 # ── Initialize ───────────────────────────────────────────────────────────────
 log "initializing vault (key-shares=1, key-threshold=1)"
 init_json="$(vault operator init \
  -key-shares=1 \
  -key-threshold=1 \
  -format=json)" \
  || die "vault operator init failed"
 unseal_key="$(printf '%s' "$init_json" | jq -er '.unseal_keys_b64[0]')" \
  || die "failed to extract unseal key from init response"
 root_token="$(printf '%s' "$init_json" | jq -er '.root_token')" \
  || die "failed to extract root token from init response"
 # Best-effort scrub of init_json from the env (the captured key+token still
 # sit in the local vars above — there's no clean way to wipe bash memory).
 unset init_json
 # ── Persist keys ─────────────────────────────────────────────────────────────
 log "writing ${UNSEAL_KEY_FILE} (0400 root)"
 write_secret_file "$UNSEAL_KEY_FILE" "$unseal_key"
 log "writing ${ROOT_TOKEN_FILE} (0400 root)"
 write_secret_file "$ROOT_TOKEN_FILE" "$root_token"
 # ── Unseal in the current process ────────────────────────────────────────────
 log "unsealing vault"
 vault operator unseal "$unseal_key" >/dev/null \
  || die "vault operator unseal failed"
 log "done — vault initialized + unsealed + keys persisted"
--- a/lib/issue-lifecycle.sh
+++ b/lib/issue-lifecycle.sh
@ -79,6 +79,27 @@ _ilc_backlog_id()      { _ilc_ensure_label_id "backlog"     "#0075ca"; }
 _ilc_in_progress_id()  { _ilc_ensure_label_id "in-progress" "#1d76db"; }
 _ilc_blocked_id()      { _ilc_ensure_label_id "blocked"     "#e11d48"; }
 # ---------------------------------------------------------------------------
 # Labels that indicate an issue belongs to a non-dev agent workflow.
 # Any issue carrying one of these should NOT be touched by dev-poll's
 # stale-detection or orphan-recovery logic.  See issue #608.
 # ---------------------------------------------------------------------------
 _ILC_NON_DEV_LABELS="bug-report vision in-triage prediction/unreviewed prediction/dismissed action formula"
 # issue_is_dev_claimable COMMA_SEPARATED_LABELS
 # Returns 0 if the issue's labels are compatible with dev-agent ownership,
 # 1 if any non-dev label is present (meaning another agent owns this issue).
 issue_is_dev_claimable() {
  local labels="$1"
  local lbl
  for lbl in $_ILC_NON_DEV_LABELS; do
    if echo ",$labels," | grep -qF ",$lbl,"; then
      return 1
    fi
  done
  return 0
 }
 # ---------------------------------------------------------------------------
 # issue_claim — assign issue to bot, add "in-progress" label, remove "backlog".
 # Args: issue_number
--- a/lib/load-project.sh
+++ b/lib/load-project.sh
@ -97,28 +97,18 @@ done <<< "$_PROJECT_VARS"
 # FORGE_URL: TOML forge_url > existing FORGE_URL > default
 export FORGE_URL="${FORGE_URL:-http://localhost:3000}"
 if [ -n "$FORGE_REPO" ]; then
-  export FORGE_API="${FORGE_URL}/api/v1/repos/${FORGE_REPO}"
+  export FORGE_API_BASE="${FORGE_URL}/api/v1"
  export FORGE_API="${FORGE_API_BASE}/repos/${FORGE_REPO}"
  export FORGE_WEB="${FORGE_URL}/${FORGE_REPO}"
  # Extract repo owner (first path segment of owner/repo)
  export FORGE_REPO_OWNER="${FORGE_REPO%%/*}"
 fi
-# Derive PROJECT_REPO_ROOT if not explicitly set
+# PROJECT_REPO_ROOT and OPS_REPO_ROOT: no fallback derivation from USER/HOME.
-if [ -z "${PROJECT_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" ]; then
+# These must be set by the entrypoint (container) or the TOML (host CLI).
-  export PROJECT_REPO_ROOT="/home/${USER}/${PROJECT_NAME}"
+# Inside the container, the entrypoint exports the correct paths before agent
-fi
+# scripts source env.sh; the TOML's host-perspective paths are skipped by the
-
+# DISINTO_CONTAINER guard above.
 # Derive OPS_REPO_ROOT if not explicitly set
 if [ -z "${OPS_REPO_ROOT:-}" ] && [ -n "${PROJECT_NAME:-}" ]; then
  export OPS_REPO_ROOT="/home/${USER}/${PROJECT_NAME}-ops"
 fi
 # Inside the container, always derive repo paths from PROJECT_NAME — the TOML
 # carries host-perspective paths that do not exist in the container filesystem.
 if [ "${DISINTO_CONTAINER:-}" = "1" ] && [ -n "${PROJECT_NAME:-}" ]; then
  export PROJECT_REPO_ROOT="/home/agent/repos/${PROJECT_NAME}"
  export OPS_REPO_ROOT="/home/agent/repos/${PROJECT_NAME}-ops"
 fi
 # Derive FORGE_OPS_REPO if not explicitly set
 if [ -z "${FORGE_OPS_REPO:-}" ] && [ -n "${FORGE_REPO:-}" ]; then
--- a/lib/mirrors.sh
+++ b/lib/mirrors.sh
@ -1,8 +1,10 @@
 #!/usr/bin/env bash
-# mirrors.sh — Push primary branch + tags to configured mirror remotes.
+# mirrors.sh — Mirror helpers: push to remotes + register pull mirrors via API.
 #
 # Usage: source lib/mirrors.sh; mirror_push
 #        source lib/mirrors.sh; mirror_pull_register <clone_url> <owner> <repo_name> [interval]
 # Requires: PROJECT_REPO_ROOT, PRIMARY_BRANCH, MIRROR_* vars from load-project.sh
 #           FORGE_API_BASE, FORGE_TOKEN for pull-mirror registration
 # shellcheck disable=SC2154  # globals set by load-project.sh / calling script
@ -37,3 +39,73 @@ mirror_push() {
    log "mirror: pushed to ${name} (pid $!)"
  done
 }
 # ---------------------------------------------------------------------------
 # mirror_pull_register — register a Forgejo pull mirror via the /repos/migrate API.
 #
 # Creates a new repo as a pull mirror of an external source.  Works against
 # empty target repos (the repo is created by the API call itself).
 #
 # Usage:
 #   mirror_pull_register <clone_url> <owner> <repo_name> [interval]
 #
 # Args:
 #   clone_url  — HTTPS URL of the source repo (e.g. https://codeberg.org/johba/disinto.git)
 #   owner      — Forgejo org or user that will own the mirror repo
 #   repo_name  — name of the new mirror repo on Forgejo
 #   interval   — sync interval (default: "8h0m0s"; Forgejo duration format)
 #
 # Requires:
 #   FORGE_API_BASE, FORGE_TOKEN (from env.sh)
 #
 # Returns 0 on success, 1 on failure.  Prints the new repo JSON to stdout.
 # ---------------------------------------------------------------------------
 mirror_pull_register() {
  local clone_url="$1"
  local owner="$2"
  local repo_name="$3"
  local interval="${4:-8h0m0s}"
  if [ -z "${FORGE_API_BASE:-}" ] || [ -z "${FORGE_TOKEN:-}" ]; then
    echo "ERROR: FORGE_API_BASE and FORGE_TOKEN must be set" >&2
    return 1
  fi
  if [ -z "$clone_url" ] || [ -z "$owner" ] || [ -z "$repo_name" ]; then
    echo "Usage: mirror_pull_register <clone_url> <owner> <repo_name> [interval]" >&2
    return 1
  fi
  local payload
  payload=$(jq -n \
    --arg clone_addr "$clone_url" \
    --arg repo_name  "$repo_name" \
    --arg repo_owner "$owner" \
    --arg interval   "$interval" \
    '{
      clone_addr:      $clone_addr,
      repo_name:       $repo_name,
      repo_owner:      $repo_owner,
      mirror:          true,
      mirror_interval: $interval,
      service:         "git"
    }')
  local http_code body
  body=$(curl -s -w "\n%{http_code}" -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
    "${FORGE_API_BASE}/repos/migrate" \
    -d "$payload")
  http_code=$(printf '%s' "$body" | tail -n1)
  body=$(printf '%s' "$body" | sed '$d')
  if [ "$http_code" -ge 200 ] && [ "$http_code" -lt 300 ]; then
    printf '%s\n' "$body"
    return 0
  else
    echo "ERROR: mirror_pull_register failed (HTTP ${http_code}): ${body}" >&2
    return 1
  fi
 }
--- a/lib/ops-setup.sh
+++ b/lib/ops-setup.sh
@ -5,10 +5,10 @@
 #   source "$(dirname "$0")/../lib/ops-setup.sh"
 #
 # Required globals: FORGE_URL, FORGE_TOKEN, FACTORY_ROOT
-# Optional: admin_token (falls back to FORGE_TOKEN for admin operations)
+# Optional: HUMAN_TOKEN (falls back to FORGE_TOKEN for admin operations)
 #
 # Functions:
-#   setup_ops_repo <forge_url> <ops_slug> <ops_root> [primary_branch]
+#   setup_ops_repo <forge_url> <ops_slug> <ops_root> [primary_branch] [admin_token]
 #     - Create ops repo on Forgejo if it doesn't exist
 #     - Configure bot collaborators with appropriate permissions
 #     - Clone or initialize ops repo locally
@ -26,6 +26,7 @@ set -euo pipefail
 setup_ops_repo() {
  local forge_url="$1" ops_slug="$2" ops_root="$3" primary_branch="${4:-main}"
  local admin_token="${5:-${HUMAN_TOKEN:-${FORGE_TOKEN}}}"
  local org_name="${ops_slug%%/*}"
  local ops_name="${ops_slug##*/}"
@ -53,30 +54,57 @@ setup_ops_repo() {
  # If not found, try to create it in the configured namespace
  if [ -z "$actual_ops_slug" ]; then
    echo "Creating ops repo in namespace: ${org_name}"
-    # Create org if it doesn't exist
+
-    curl -sf -X POST \
+    # Determine if target namespace is a user or an org
-      -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+    local ns_type=""
-      -H "Content-Type: application/json" \
+    if curl -sf -H "Authorization: token ${admin_token}" \
-      "${forge_url}/api/v1/orgs" \
+      "${forge_url}/api/v1/users/${org_name}" >/dev/null 2>&1; then
-      -d "{\"username\":\"${org_name}\",\"visibility\":\"public\"}" >/dev/null 2>&1 || true
+      # User endpoint exists - check if it's an org
      if curl -sf -H "Authorization: token ${admin_token}" \
        "${forge_url}/api/v1/users/${org_name}" | grep -q '"is_org":true'; then
        ns_type="org"
      else
        ns_type="user"
      fi
    elif curl -sf -H "Authorization: token ${admin_token}" \
      "${forge_url}/api/v1/orgs/${org_name}" >/dev/null 2>&1; then
      # Org endpoint exists
      ns_type="org"
    fi
    local create_endpoint="" via_msg=""
    if [ "$ns_type" = "org" ]; then
      # Org namespace — use org API
      create_endpoint="/api/v1/orgs/${org_name}/repos"
      # Create org if it doesn't exist
      curl -sf -X POST \
        -H "Authorization: token ${admin_token}" \
        -H "Content-Type: application/json" \
        "${forge_url}/api/v1/orgs" \
        -d "{\"username\":\"${org_name}\",\"visibility\":\"public\"}" >/dev/null 2>&1 || true
    else
      # User namespace — use admin API (requires admin token)
      create_endpoint="/api/v1/admin/users/${org_name}/repos"
      via_msg=" (via admin API)"
    fi
    if curl -sf -X POST \
-      -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+      -H "Authorization: token ${admin_token}" \
      -H "Content-Type: application/json" \
-      "${forge_url}/api/v1/orgs/${org_name}/repos" \
+      "${forge_url}${create_endpoint}" \
      -d "{\"name\":\"${ops_name}\",\"auto_init\":true,\"default_branch\":\"${primary_branch}\",\"description\":\"Operational data for ${org_name}/${ops_name%-ops}\"}" >/dev/null 2>&1; then
      actual_ops_slug="${org_name}/${ops_name}"
-      echo "Ops repo: ${actual_ops_slug} created on Forgejo"
+      echo "Ops repo: ${actual_ops_slug} created on Forgejo${via_msg}"
    else
      # Fallback: use admin API to create repo under the target namespace
      http_code=$(curl -s -o /dev/null -w "%{http_code}" \
        -X POST \
-        -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+        -H "Authorization: token ${admin_token}" \
        -H "Content-Type: application/json" \
-        "${forge_url}/api/v1/admin/users/${org_name}/repos" \
+        "${forge_url}${create_endpoint}" \
        -d "{\"name\":\"${ops_name}\",\"auto_init\":true,\"default_branch\":\"${primary_branch}\",\"description\":\"Operational data for ${org_name}/${ops_name%-ops}\"}" 2>/dev/null || echo "0")
      if [ "$http_code" = "201" ]; then
        actual_ops_slug="${org_name}/${ops_name}"
-        echo "Ops repo: ${actual_ops_slug} created on Forgejo (via admin API)"
+        echo "Ops repo: ${actual_ops_slug} created on Forgejo${via_msg}"
      else
        echo "Error: failed to create ops repo '${org_name}/${ops_name}' (HTTP ${http_code})" >&2
        return 1
@ -104,7 +132,7 @@ setup_ops_repo() {
  for bot_user in "${!bot_permissions[@]}"; do
    bot_perm="${bot_permissions[$bot_user]}"
    if curl -sf -X PUT \
-      -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+      -H "Authorization: token ${admin_token}" \
      -H "Content-Type: application/json" \
      "${forge_url}/api/v1/repos/${actual_ops_slug}/collaborators/${bot_user}" \
      -d "{\"permission\":\"${bot_perm}\"}" >/dev/null 2>&1; then
@ -116,7 +144,7 @@ setup_ops_repo() {
  # Add disinto-admin as admin collaborator
  if curl -sf -X PUT \
-    -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
+    -H "Authorization: token ${admin_token}" \
    -H "Content-Type: application/json" \
    "${forge_url}/api/v1/repos/${actual_ops_slug}/collaborators/disinto-admin" \
    -d '{"permission":"admin"}' >/dev/null 2>&1; then
@ -125,11 +153,10 @@ setup_ops_repo() {
    echo "  ! disinto-admin = admin (already set or failed)"
  fi
-  # Clone ops repo locally if not present
+  # Clone ops repo locally if not present — use clean URL, credential helper
  # supplies auth (#604).
  if [ ! -d "${ops_root}/.git" ]; then
-    local auth_url
+    local clone_url="${forge_url}/${actual_ops_slug}.git"
    auth_url=$(printf '%s' "$forge_url" | sed "s|://|://dev-bot:${FORGE_TOKEN}@|")
    local clone_url="${auth_url}/${actual_ops_slug}.git"
    echo "Cloning: ops repo -> ${ops_root}"
    if git clone --quiet "$clone_url" "$ops_root" 2>/dev/null; then
      echo "Ops repo: ${actual_ops_slug} cloned successfully"
@ -287,30 +314,37 @@ migrate_ops_repo() {
  echo "── Ops repo migration ───────────────────────────────────"
  echo "Checking ${ops_root} for missing directories and files..."
  # Change to ops_root directory to ensure all git operations use the correct repo
  # This prevents "fatal: not in a git directory" errors from stray git commands
  local orig_dir
  orig_dir=$(pwd)
  cd "$ops_root" || {
    echo "Error: failed to change to ${ops_root}" >&2
    return 1
  }
  local migrated=false
  # Canonical ops repo structure (post #407)
  # Directories to ensure exist with .gitkeep files
  local -a dir_keepfiles=(
-    "${ops_root}/vault/pending/.gitkeep"
+    "vault/pending/.gitkeep"
-    "${ops_root}/vault/approved/.gitkeep"
+    "vault/approved/.gitkeep"
-    "${ops_root}/vault/fired/.gitkeep"
+    "vault/fired/.gitkeep"
-    "${ops_root}/vault/rejected/.gitkeep"
+    "vault/rejected/.gitkeep"
-    "${ops_root}/knowledge/.gitkeep"
+    "knowledge/.gitkeep"
-    "${ops_root}/evidence/engagement/.gitkeep"
+    "evidence/engagement/.gitkeep"
-    "${ops_root}/evidence/red-team/.gitkeep"
+    "evidence/red-team/.gitkeep"
-    "${ops_root}/evidence/holdout/.gitkeep"
+    "evidence/holdout/.gitkeep"
-    "${ops_root}/evidence/evolution/.gitkeep"
+    "evidence/evolution/.gitkeep"
-    "${ops_root}/evidence/user-test/.gitkeep"
+    "evidence/user-test/.gitkeep"
-    "${ops_root}/sprints/.gitkeep"
+    "sprints/.gitkeep"
  )
  # Create missing directories and .gitkeep files
  for keepfile in "${dir_keepfiles[@]}"; do
    local dir
    dir=$(dirname "$keepfile")
    if [ ! -f "$keepfile" ]; then
-      mkdir -p "$dir"
+      mkdir -p "$(dirname "$keepfile")"
      touch "$keepfile"
      echo "  + Created: ${keepfile}"
      migrated=true
@ -319,9 +353,9 @@ migrate_ops_repo() {
  # Template files to create if missing (starter content)
  local -a template_files=(
-    "${ops_root}/portfolio.md"
+    "portfolio.md"
-    "${ops_root}/prerequisites.md"
+    "prerequisites.md"
-    "${ops_root}/RESOURCES.md"
+    "RESOURCES.md"
  )
  for tfile in "${template_files[@]}"; do
@ -343,26 +377,33 @@ migrate_ops_repo() {
  # Commit and push changes if any were made
  if [ "$migrated" = true ]; then
    # Auto-configure repo-local git identity if missing
-    if [ -z "$(git -C "$ops_root" config user.name 2>/dev/null)" ]; then
+    if [ -z "$(git config user.name 2>/dev/null)" ]; then
-      git -C "$ops_root" config user.name "disinto-admin"
+      git config user.name "disinto-admin"
    fi
-    if [ -z "$(git -C "$ops_root" config user.email 2>/dev/null)" ]; then
+    if [ -z "$(git config user.email 2>/dev/null)" ]; then
-      git -C "$ops_root" config user.email "disinto-admin@localhost"
+      git config user.email "disinto-admin@localhost"
    fi
-    git -C "$ops_root" add -A
+    git add -A
-    if ! git -C "$ops_root" diff --cached --quiet 2>/dev/null; then
+    if ! git diff --cached --quiet 2>/dev/null; then
-      git -C "$ops_root" commit -m "chore: migrate ops repo structure to canonical layout" -q
+      if ! git commit -m "chore: migrate ops repo structure to canonical layout" -q; then
        echo "Error: failed to commit migration changes" >&2
        cd "$orig_dir"
        return 1
      fi
      # Push if remote exists
-      if git -C "$ops_root" remote get-url origin >/dev/null 2>&1; then
+      if git remote get-url origin >/dev/null 2>&1; then
-        if git -C "$ops_root" push origin "${primary_branch}" -q 2>/dev/null; then
+        if ! git push origin "${primary_branch}" -q 2>/dev/null; then
          echo "Migrated:  ops repo structure updated and pushed"
        else
          echo "Warning: failed to push migration to ops repo" >&2
        else
          echo "Migrated:  ops repo structure updated and pushed"
        fi
      fi
    fi
  else
    echo "  (all directories and files already present)"
  fi
  # Return to original directory
  cd "$orig_dir"
 }
--- a/lib/release.sh
+++ b/lib/release.sh
@ -18,8 +18,8 @@
 # =============================================================================
 set -euo pipefail
-# Source vault.sh for _vault_log helper
+# Source action-vault.sh for _vault_log helper
-source "${FACTORY_ROOT}/lib/vault.sh"
+source "${FACTORY_ROOT}/lib/action-vault.sh"
 # Assert required globals are set before using this module.
 _assert_release_globals() {
--- a/lib/sprint-filer.sh
+++ b/lib/sprint-filer.sh
@ -0,0 +1,585 @@
 #!/usr/bin/env bash
 # =============================================================================
 # sprint-filer.sh — Parse merged sprint PRs and file sub-issues via filer-bot
 #
 # Invoked by the ops-filer Woodpecker pipeline after a sprint PR merges on the
 # ops repo main branch.  Parses each sprints/*.md file for a structured
 # ## Sub-issues block (filer:begin/end markers), then creates idempotent
 # Forgejo issues on the project repo using FORGE_FILER_TOKEN.
 #
 # Permission model (#764):
 #   filer-bot has issues:write on the project repo.
 #   architect-bot is read-only on the project repo.
 #
 # Usage:
 #   sprint-filer.sh <sprint-file.md>          — file sub-issues from one sprint
 #   sprint-filer.sh --all <sprints-dir>       — scan all sprint files in dir
 #
 # Environment:
 #   FORGE_FILER_TOKEN   — filer-bot API token (issues:write on project repo)
 #   FORGE_API           — project repo API base (e.g. http://forgejo:3000/api/v1/repos/org/repo)
 #   FORGE_API_BASE      — API base URL (e.g. http://forgejo:3000/api/v1)
 # =============================================================================
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # Source env.sh only if not already loaded (allows standalone + sourced use)
 if [ -z "${FACTORY_ROOT:-}" ]; then
  FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
  # shellcheck source=env.sh
  source "$SCRIPT_DIR/env.sh"
 fi
 # ── Logging ──────────────────────────────────────────────────────────────
 LOG_AGENT="${LOG_AGENT:-filer}"
 filer_log() {
  printf '[%s] %s: %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$LOG_AGENT" "$*" >&2
 }
 # ── Validate required environment ────────────────────────────────────────
 : "${FORGE_FILER_TOKEN:?sprint-filer.sh requires FORGE_FILER_TOKEN}"
 : "${FORGE_API:?sprint-filer.sh requires FORGE_API}"
 # ── Paginated Forgejo API fetch ──────────────────────────────────────────
 # Reuses forge_api_all from lib/env.sh with FORGE_FILER_TOKEN.
 # Args: api_path (e.g. /issues?state=all&type=issues)
 # Output: merged JSON array to stdout
 filer_api_all() { forge_api_all "$1" "$FORGE_FILER_TOKEN"; }
 # ── Parse sub-issues block from a sprint markdown file ───────────────────
 # Extracts the YAML-in-markdown between <!-- filer:begin --> and <!-- filer:end -->
 # Args: sprint_file_path
 # Output: the raw sub-issues block (YAML lines) to stdout
 # Returns: 0 if block found, 1 if not found or malformed
 parse_subissues_block() {
  local sprint_file="$1"
  if [ ! -f "$sprint_file" ]; then
    filer_log "ERROR: sprint file not found: ${sprint_file}"
    return 1
  fi
  local in_block=false
  local block=""
  local found=false
  while IFS= read -r line; do
    if [[ "$line" == *"<!-- filer:begin -->"* ]]; then
      in_block=true
      found=true
      continue
    fi
    if [[ "$line" == *"<!-- filer:end -->"* ]]; then
      in_block=false
      continue
    fi
    if [ "$in_block" = true ]; then
      block+="${line}"$'\n'
    fi
  done < "$sprint_file"
  if [ "$found" = false ]; then
    filer_log "No filer:begin/end block found in ${sprint_file}"
    return 1
  fi
  if [ "$in_block" = true ]; then
    filer_log "ERROR: malformed sub-issues block in ${sprint_file} — filer:begin without filer:end"
    return 1
  fi
  if [ -z "$block" ]; then
    filer_log "WARNING: empty sub-issues block in ${sprint_file}"
    return 1
  fi
  printf '%s' "$block"
 }
 # ── Extract vision issue number from sprint file ─────────────────────────
 # Looks for "#N" references specifically in the "## Vision issues" section
 # to avoid picking up cross-links or related-issue mentions earlier in the file.
 # Falls back to first #N in the file if no "## Vision issues" section found.
 # Args: sprint_file_path
 # Output: first vision issue number found
 extract_vision_issue() {
  local sprint_file="$1"
  # Try to extract from "## Vision issues" section first
  local in_section=false
  local result=""
  while IFS= read -r line; do
    if [[ "$line" =~ ^##[[:space:]]+Vision[[:space:]]+issues ]]; then
      in_section=true
      continue
    fi
    # Stop at next heading
    if [ "$in_section" = true ] && [[ "$line" =~ ^## ]]; then
      break
    fi
    if [ "$in_section" = true ]; then
      result=$(printf '%s' "$line" | grep -oE '#[0-9]+' | head -1 | tr -d '#')
      if [ -n "$result" ]; then
        printf '%s' "$result"
        return 0
      fi
    fi
  done < "$sprint_file"
  # Fallback: first #N in the entire file
  grep -oE '#[0-9]+' "$sprint_file" | head -1 | tr -d '#'
 }
 # ── Extract sprint slug from file path ───────────────────────────────────
 # Args: sprint_file_path
 # Output: slug (filename without .md)
 extract_sprint_slug() {
  local sprint_file="$1"
  basename "$sprint_file" .md
 }
 # ── Parse individual sub-issue entries from the block ────────────────────
 # The block is a simple YAML-like format:
 #   - id: foo
 #     title: "..."
 #     labels: [backlog, priority]
 #     depends_on: [bar]
 #     body: |
 #       multi-line body
 #
 # Args: raw_block (via stdin)
 # Output: JSON array of sub-issue objects
 parse_subissue_entries() {
  local block
  block=$(cat)
  # Use awk to parse the YAML-like structure into JSON
  printf '%s' "$block" | awk '
  BEGIN {
    printf "["
    first = 1
    inbody = 0
    id = ""; title = ""; labels = ""; depends = ""; body = ""
  }
  function flush_entry() {
    if (id == "") return
    if (!first) printf ","
    first = 0
    # Escape JSON special characters in body
    gsub(/\\/, "\\\\", body)
    gsub(/"/, "\\\"", body)
    gsub(/\t/, "\\t", body)
    # Replace newlines with \n for JSON
    gsub(/\n/, "\\n", body)
    # Remove trailing \n
    sub(/\\n$/, "", body)
    # Clean up title (remove surrounding quotes)
    gsub(/^"/, "", title)
    gsub(/"$/, "", title)
    printf "{\"id\":\"%s\",\"title\":\"%s\",\"labels\":%s,\"depends_on\":%s,\"body\":\"%s\"}", id, title, labels, depends, body
    id = ""; title = ""; labels = "[]"; depends = "[]"; body = ""
    inbody = 0
  }
  /^- id:/ {
    flush_entry()
    sub(/^- id: */, "")
    id = $0
    labels = "[]"
    depends = "[]"
    next
  }
  /^  title:/ {
    sub(/^  title: */, "")
    title = $0
    # Remove surrounding quotes
    gsub(/^"/, "", title)
    gsub(/"$/, "", title)
    next
  }
  /^  labels:/ {
    sub(/^  labels: */, "")
    # Convert [a, b] to JSON array ["a","b"]
    gsub(/\[/, "", $0)
    gsub(/\]/, "", $0)
    n = split($0, arr, /, */)
    labels = "["
    for (i = 1; i <= n; i++) {
      gsub(/^ */, "", arr[i])
      gsub(/ *$/, "", arr[i])
      if (arr[i] != "") {
        if (i > 1) labels = labels ","
        labels = labels "\"" arr[i] "\""
      }
    }
    labels = labels "]"
    next
  }
  /^  depends_on:/ {
    sub(/^  depends_on: */, "")
    gsub(/\[/, "", $0)
    gsub(/\]/, "", $0)
    n = split($0, arr, /, */)
    depends = "["
    for (i = 1; i <= n; i++) {
      gsub(/^ */, "", arr[i])
      gsub(/ *$/, "", arr[i])
      if (arr[i] != "") {
        if (i > 1) depends = depends ","
        depends = depends "\"" arr[i] "\""
      }
    }
    depends = depends "]"
    next
  }
  /^  body: *\|/ {
    inbody = 1
    body = ""
    next
  }
  inbody && /^    / {
    sub(/^    /, "")
    body = body $0 "\n"
    next
  }
  inbody && !/^    / && !/^$/ {
    inbody = 0
    # This line starts a new field or entry — re-process it
    # (awk does not support re-scanning, so handle common cases)
    if ($0 ~ /^- id:/) {
      flush_entry()
      sub(/^- id: */, "")
      id = $0
      labels = "[]"
      depends = "[]"
    }
  }
  END {
    flush_entry()
    printf "]"
  }
  '
 }
 # ── Check if sub-issue already exists (idempotency) ─────────────────────
 # Searches for the decomposed-from marker in existing issues.
 # Args: vision_issue_number sprint_slug subissue_id
 # Returns: 0 if already exists, 1 if not
 subissue_exists() {
  local vision_issue="$1"
  local sprint_slug="$2"
  local subissue_id="$3"
  local marker="<!-- decomposed-from: #${vision_issue}, sprint: ${sprint_slug}, id: ${subissue_id} -->"
  # Search all issues (paginated) for the exact marker
  local issues_json
  issues_json=$(filer_api_all "/issues?state=all&type=issues")
  if printf '%s' "$issues_json" | jq -e --arg marker "$marker" \
    '[.[] | select(.body // "" | contains($marker))] | length > 0' >/dev/null 2>&1; then
    return 0  # Already exists
  fi
  return 1  # Does not exist
 }
 # ── Resolve label names to IDs ───────────────────────────────────────────
 # Args: label_names_json (JSON array of strings)
 # Output: JSON array of label IDs
 resolve_label_ids() {
  local label_names_json="$1"
  # Fetch all labels from project repo
  local all_labels
  all_labels=$(curl -sf -H "Authorization: token ${FORGE_FILER_TOKEN}" \
    "${FORGE_API}/labels" 2>/dev/null) || all_labels="[]"
  # Map names to IDs
  printf '%s' "$label_names_json" | jq -r '.[]' | while IFS= read -r label_name; do
    [ -z "$label_name" ] && continue
    printf '%s' "$all_labels" | jq -r --arg name "$label_name" \
      '.[] | select(.name == $name) | .id' 2>/dev/null
  done | jq -Rs 'split("\n") | map(select(. != "") | tonumber)'
 }
 # ── Add in-progress label to vision issue ────────────────────────────────
 # Args: vision_issue_number
 add_inprogress_label() {
  local issue_num="$1"
  local labels_json
  labels_json=$(curl -sf -H "Authorization: token ${FORGE_FILER_TOKEN}" \
    "${FORGE_API}/labels" 2>/dev/null) || return 1
  local label_id
  label_id=$(printf '%s' "$labels_json" | jq -r '.[] | select(.name == "in-progress") | .id' 2>/dev/null) || true
  if [ -z "$label_id" ]; then
    filer_log "WARNING: in-progress label not found"
    return 1
  fi
  if curl -sf -X POST \
    -H "Authorization: token ${FORGE_FILER_TOKEN}" \
    -H "Content-Type: application/json" \
    "${FORGE_API}/issues/${issue_num}/labels" \
    -d "{\"labels\": [${label_id}]}" >/dev/null 2>&1; then
    filer_log "Added in-progress label to vision issue #${issue_num}"
    return 0
  else
    filer_log "WARNING: failed to add in-progress label to vision issue #${issue_num}"
    return 1
  fi
 }
 # ── File sub-issues from a sprint file ───────────────────────────────────
 # This is the main entry point. Parses the sprint file, extracts sub-issues,
 # and creates them idempotently via the Forgejo API.
 # Args: sprint_file_path
 # Returns: 0 on success, 1 on any error (fail-fast)
 file_subissues() {
  local sprint_file="$1"
  filer_log "Processing sprint file: ${sprint_file}"
  # Extract metadata
  local vision_issue sprint_slug
  vision_issue=$(extract_vision_issue "$sprint_file")
  sprint_slug=$(extract_sprint_slug "$sprint_file")
  if [ -z "$vision_issue" ]; then
    filer_log "ERROR: could not extract vision issue number from ${sprint_file}"
    return 1
  fi
  filer_log "Vision issue: #${vision_issue}, sprint slug: ${sprint_slug}"
  # Parse the sub-issues block
  local raw_block
  raw_block=$(parse_subissues_block "$sprint_file") || return 1
  # Parse individual entries
  local entries_json
  entries_json=$(printf '%s' "$raw_block" | parse_subissue_entries)
  # Validate parsing produced valid JSON
  if ! printf '%s' "$entries_json" | jq empty 2>/dev/null; then
    filer_log "ERROR: failed to parse sub-issues block as valid JSON in ${sprint_file}"
    return 1
  fi
  local entry_count
  entry_count=$(printf '%s' "$entries_json" | jq 'length')
  if [ "$entry_count" -eq 0 ]; then
    filer_log "WARNING: no sub-issue entries found in ${sprint_file}"
    return 1
  fi
  filer_log "Found ${entry_count} sub-issue(s) to file"
  # File each sub-issue (fail-fast on first error)
  local filed_count=0
  local i=0
  while [ "$i" -lt "$entry_count" ]; do
    local entry
    entry=$(printf '%s' "$entries_json" | jq ".[$i]")
    local subissue_id subissue_title subissue_body labels_json
    subissue_id=$(printf '%s' "$entry" | jq -r '.id')
    subissue_title=$(printf '%s' "$entry" | jq -r '.title')
    subissue_body=$(printf '%s' "$entry" | jq -r '.body')
    labels_json=$(printf '%s' "$entry" | jq -c '.labels')
    if [ -z "$subissue_id" ] || [ "$subissue_id" = "null" ]; then
      filer_log "ERROR: sub-issue entry at index ${i} has no id — aborting"
      return 1
    fi
    if [ -z "$subissue_title" ] || [ "$subissue_title" = "null" ]; then
      filer_log "ERROR: sub-issue '${subissue_id}' has no title — aborting"
      return 1
    fi
    # Idempotency check
    if subissue_exists "$vision_issue" "$sprint_slug" "$subissue_id"; then
      filer_log "Sub-issue '${subissue_id}' already exists — skipping"
      i=$((i + 1))
      continue
    fi
    # Append decomposed-from marker to body
    local marker="<!-- decomposed-from: #${vision_issue}, sprint: ${sprint_slug}, id: ${subissue_id} -->"
    local full_body="${subissue_body}
 ${marker}"
    # Resolve label names to IDs
    local label_ids
    label_ids=$(resolve_label_ids "$labels_json")
    # Build issue payload using jq for safe JSON construction
    local payload
    payload=$(jq -n \
      --arg title "$subissue_title" \
      --arg body "$full_body" \
      --argjson labels "$label_ids" \
      '{title: $title, body: $body, labels: $labels}')
    # Create the issue
    local response
    response=$(curl -sf -X POST \
      -H "Authorization: token ${FORGE_FILER_TOKEN}" \
      -H "Content-Type: application/json" \
      "${FORGE_API}/issues" \
      -d "$payload" 2>/dev/null) || {
      filer_log "ERROR: failed to create sub-issue '${subissue_id}' — aborting (${filed_count}/${entry_count} filed so far)"
      return 1
    }
    local new_issue_num
    new_issue_num=$(printf '%s' "$response" | jq -r '.number // empty')
    filer_log "Filed sub-issue '${subissue_id}' as #${new_issue_num}: ${subissue_title}"
    filed_count=$((filed_count + 1))
    i=$((i + 1))
  done
  # Add in-progress label to the vision issue
  add_inprogress_label "$vision_issue" || true
  filer_log "Successfully filed ${filed_count}/${entry_count} sub-issue(s) for sprint ${sprint_slug}"
  return 0
 }
 # ── Vision lifecycle: close completed vision issues ──────────────────────
 # Checks open vision issues and closes any whose sub-issues are all closed.
 # Uses the decomposed-from marker to find sub-issues.
 check_and_close_completed_visions() {
  filer_log "Checking for vision issues with all sub-issues complete..."
  local vision_issues_json
  vision_issues_json=$(filer_api_all "/issues?labels=vision&state=open")
  if [ "$vision_issues_json" = "[]" ] || [ "$vision_issues_json" = "null" ]; then
    filer_log "No open vision issues found"
    return 0
  fi
  local all_issues
  all_issues=$(filer_api_all "/issues?state=all&type=issues")
  local vision_nums
  vision_nums=$(printf '%s' "$vision_issues_json" | jq -r '.[].number' 2>/dev/null) || return 0
  local closed_count=0
  while IFS= read -r vid; do
    [ -z "$vid" ] && continue
    # Find sub-issues with decomposed-from marker for this vision
    local sub_issues
    sub_issues=$(printf '%s' "$all_issues" | jq --arg vid "$vid" \
      '[.[] | select(.body // "" | contains("<!-- decomposed-from: #" + $vid))]')
    local sub_count
    sub_count=$(printf '%s' "$sub_issues" | jq 'length')
    # No sub-issues means not ready to close
    [ "$sub_count" -eq 0 ] && continue
    # Check if all are closed
    local open_count
    open_count=$(printf '%s' "$sub_issues" | jq '[.[] | select(.state != "closed")] | length')
    if [ "$open_count" -gt 0 ]; then
      continue
    fi
    # All sub-issues closed — close the vision issue
    filer_log "All ${sub_count} sub-issues for vision #${vid} are closed — closing vision"
    local comment_body
    comment_body="## Vision Issue Completed
 All sub-issues have been implemented and merged. This vision issue is now closed.
 ---
 *Automated closure by filer-bot · $(date -u '+%Y-%m-%d %H:%M UTC')*"
    local comment_payload
    comment_payload=$(jq -n --arg body "$comment_body" '{body: $body}')
    curl -sf -X POST \
      -H "Authorization: token ${FORGE_FILER_TOKEN}" \
      -H "Content-Type: application/json" \
      "${FORGE_API}/issues/${vid}/comments" \
      -d "$comment_payload" >/dev/null 2>&1 || true
    curl -sf -X PATCH \
      -H "Authorization: token ${FORGE_FILER_TOKEN}" \
      -H "Content-Type: application/json" \
      "${FORGE_API}/issues/${vid}" \
      -d '{"state":"closed"}' >/dev/null 2>&1 || true
    closed_count=$((closed_count + 1))
  done <<< "$vision_nums"
  if [ "$closed_count" -gt 0 ]; then
    filer_log "Closed ${closed_count} vision issue(s)"
  fi
 }
 # ── Main ─────────────────────────────────────────────────────────────────
 main() {
  if [ "${1:-}" = "--all" ]; then
    local sprints_dir="${2:?Usage: sprint-filer.sh --all <sprints-dir>}"
    local exit_code=0
    for sprint_file in "${sprints_dir}"/*.md; do
      [ -f "$sprint_file" ] || continue
      # Only process files with filer:begin markers
      if ! grep -q '<!-- filer:begin -->' "$sprint_file"; then
        continue
      fi
      if ! file_subissues "$sprint_file"; then
        filer_log "ERROR: failed to process ${sprint_file}"
        exit_code=1
      fi
    done
    # Run vision lifecycle check after filing
    check_and_close_completed_visions || true
    return "$exit_code"
  elif [ -n "${1:-}" ]; then
    file_subissues "$1"
    # Run vision lifecycle check after filing
    check_and_close_completed_visions || true
  else
    echo "Usage: sprint-filer.sh <sprint-file.md>" >&2
    echo "       sprint-filer.sh --all <sprints-dir>" >&2
    return 1
  fi
 }
 # Run main only when executed directly (not when sourced for testing)
 if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  main "$@"
 fi
--- a/nomad/client.hcl
+++ b/nomad/client.hcl
@ -0,0 +1,88 @@
 # =============================================================================
 # nomad/client.hcl — Docker driver + host_volume declarations
 #
 # Part of the Nomad+Vault migration (S0.2, issue #822). Deployed to
 # /etc/nomad.d/client.hcl on the factory dev box alongside server.hcl.
 #
 # This file owns: Docker driver plugin config + host_volume pre-wiring.
 # server.hcl owns: agent role, bind, ports, data_dir.
 #
 # NOTE: Nomad merges every *.hcl under -config=/etc/nomad.d, so declaring
 # a second `client { ... }` block here augments (not replaces) the one in
 # server.hcl. On a single-node setup this file could be inlined into
 # server.hcl — the split is for readability, not semantics.
 #
 # host_volume declarations let Nomad jobspecs mount factory state by name
 # (volume = "forgejo-data", etc.) without coupling host paths into jobspec
 # HCL. Host paths under /srv/disinto/* are created out-of-band by the
 # orchestrator (S0.4) before any job references them.
 # =============================================================================
 client {
  # forgejo git server data (repos, avatars, attachments).
  host_volume "forgejo-data" {
    path      = "/srv/disinto/forgejo-data"
    read_only = false
  }
  # woodpecker CI data (pipeline artifacts, sqlite db).
  host_volume "woodpecker-data" {
    path      = "/srv/disinto/woodpecker-data"
    read_only = false
  }
  # agent runtime data (claude config, logs, phase files).
  host_volume "agent-data" {
    path      = "/srv/disinto/agent-data"
    read_only = false
  }
  # per-project git clones and worktrees.
  host_volume "project-repos" {
    path      = "/srv/disinto/project-repos"
    read_only = false
  }
  # caddy config + ACME state.
  host_volume "caddy-data" {
    path      = "/srv/disinto/caddy-data"
    read_only = false
  }
  # disinto chat transcripts + attachments.
  host_volume "chat-history" {
    path      = "/srv/disinto/chat-history"
    read_only = false
  }
  # ops repo clone (vault actions, sprint artifacts, knowledge).
  host_volume "ops-repo" {
    path      = "/srv/disinto/ops-repo"
    read_only = false
  }
 }
 # Docker task driver. `volumes.enabled = true` is required so jobspecs
 # can mount host_volume declarations defined above. `allow_privileged`
 # stays false — no factory workload needs privileged containers today,
 # and flipping it is an audit-worthy change.
 plugin "docker" {
  config {
    allow_privileged = false
    volumes {
      enabled = true
    }
    # Leave images behind when jobs stop, so short job churn doesn't thrash
    # the image cache. Factory disk is not constrained; `docker system prune`
    # is the escape hatch.
    gc {
      image       = false
      container   = true
      dangling_containers {
        enabled = true
      }
    }
  }
 }
--- a/nomad/server.hcl
+++ b/nomad/server.hcl
@ -0,0 +1,53 @@
 # =============================================================================
 # nomad/server.hcl — Single-node combined server+client configuration
 #
 # Part of the Nomad+Vault migration (S0.2, issue #822). Deployed to
 # /etc/nomad.d/server.hcl on the factory dev box alongside client.hcl.
 #
 # This file owns: agent role, ports, bind, data directory.
 # client.hcl owns: Docker driver plugin config + host_volume declarations.
 #
 # NOTE: On single-node setups these two files could be merged into one
 # (Nomad auto-merges every *.hcl under -config=/etc/nomad.d). The split is
 # purely for readability — role/bind/port vs. plugin/volume wiring.
 #
 # This is a factory dev-box baseline — TLS, ACLs, gossip encryption, and
 # consul/vault integration are deliberately absent and land in later steps.
 # =============================================================================
 data_dir  = "/var/lib/nomad"
 bind_addr = "127.0.0.1"
 log_level = "INFO"
 # All Nomad agent traffic stays on localhost — the factory box does not
 # federate with peers. Ports are the Nomad defaults, pinned here so that
 # future changes to these numbers are a visible diff.
 ports {
  http = 4646
  rpc  = 4647
  serf = 4648
 }
 # Single-node combined mode: this agent is both the only server and the
 # only client. bootstrap_expect=1 makes the server quorum-of-one.
 server {
  enabled          = true
  bootstrap_expect = 1
 }
 client {
  enabled = true
 }
 # Advertise localhost to self to avoid surprises if the default IP
 # autodetection picks a transient interface (e.g. docker0, wg0).
 advertise {
  http = "127.0.0.1"
  rpc  = "127.0.0.1"
  serf = "127.0.0.1"
 }
 # UI on by default — same bind as http, no TLS (localhost only).
 ui {
  enabled = true
 }
--- a/nomad/vault.hcl
+++ b/nomad/vault.hcl
@ -0,0 +1,41 @@
 # =============================================================================
 # nomad/vault.hcl — Single-node Vault configuration (dev-persisted seal)
 #
 # Part of the Nomad+Vault migration (S0.3, issue #823). Deployed to
 # /etc/vault.d/vault.hcl on the factory dev box.
 #
 # Seal model: the single unseal key lives on disk at /etc/vault.d/unseal.key
 # (0400 root) and is read by systemd ExecStartPost on every boot. This is
 # the factory-dev-box-acceptable tradeoff — seal-key theft equals vault
 # theft, but we avoid running a second Vault to auto-unseal the first.
 #
 # This is a factory dev-box baseline — TLS, HA, Raft storage, and audit
 # devices are deliberately absent. Storage is the `file` backend (single
 # node only). Listener is localhost-only, so no external TLS is needed.
 # =============================================================================
 # File storage backend — single-node only, no HA, no raft. State lives in
 # /var/lib/vault/data which is created (root:root 0700) by
 # lib/init/nomad/systemd-vault.sh before the unit starts.
 storage "file" {
  path = "/var/lib/vault/data"
 }
 # Localhost-only listener. TLS is disabled because all callers are on the
 # same box — flipping this to tls_disable=false is an audit-worthy change
 # paired with cert provisioning.
 listener "tcp" {
  address     = "127.0.0.1:8200"
  tls_disable = true
 }
 # mlock prevents Vault's in-memory secrets from being swapped to disk. We
 # keep it enabled; the systemd unit grants CAP_IPC_LOCK so mlock() succeeds.
 disable_mlock = false
 # Advertised API address — used by Vault clients on this host. Matches
 # the listener above.
 api_addr = "http://127.0.0.1:8200"
 # UI on by default — same bind as listener, no TLS (localhost only).
 ui = true
--- a/planner/AGENTS.md
+++ b/planner/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Planner Agent
 **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints),
@ -34,7 +34,9 @@ will then sections) and marks the prerequisite as blocked-on-vault in the tree.
 Deduplication: checks pending/ + approved/ + fired/ before creating.
 Phase 4 (journal-and-memory): write updated prerequisite tree + daily journal
 entry (committed to ops repo) and update `$OPS_REPO_ROOT/knowledge/planner-memory.md`.
-Phase 5 (commit-ops): commit all ops repo changes, push directly.
+Phase 5 (commit-ops): commit all ops repo changes to a `planner/run-YYYY-MM-DD`
 branch, then create a PR and walk it to merge via review-bot (`pr_create` →
 `pr_walk_to_merge`), mirroring the architect's ops flow. No direct push to main.
 AGENTS.md maintenance is handled by the Gardener.
 **Artifacts use `$OPS_REPO_ROOT`**: All planner artifacts (journal,
@ -55,7 +57,7 @@ nervous system component, not work.
  creates tmux session, injects formula prompt, monitors phase file, handles crash recovery, cleans up
 - `formulas/run-planner.toml` — Execution spec: six steps (preflight,
  prediction-triage, update-prerequisite-tree, file-at-constraints,
-  journal-and-memory, commit-and-pr) with `needs` dependencies. Claude
+  journal-and-memory, commit-ops-changes) with `needs` dependencies. Claude
  executes all steps in a single interactive session with tool access
 - `formulas/groom-backlog.toml` — Grooming formula for backlog triage and
  grooming. (Note: the planner no longer dispatches breakdown mode — complex
--- a/planner/planner-run.sh
+++ b/planner/planner-run.sh
@ -10,7 +10,9 @@
 #   2. Load formula (formulas/run-planner.toml)
 #   3. Context: VISION.md, AGENTS.md, ops:RESOURCES.md, structural graph,
 #      planner memory, journal entries
-#   4. agent_run(worktree, prompt) → Claude plans, may push knowledge updates
+#   4. Create ops branch planner/run-YYYY-MM-DD for changes
 #   5. agent_run(worktree, prompt) → Claude plans, commits to ops branch
 #   6. If ops branch has commits: pr_create → pr_walk_to_merge (review-bot)
 #
 # Usage:
 #   planner-run.sh [projects/disinto.toml]   # project config (default: disinto)
@ -22,10 +24,11 @@ FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
 # Accept project config from argument; default to disinto (planner is disinto infrastructure)
 export PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}"
 # Set override BEFORE sourcing env.sh so it survives any later re-source of
 # env.sh from nested shells / claude -p tools (#762, #747)
 export FORGE_TOKEN_OVERRIDE="${FORGE_PLANNER_TOKEN:-}"
 # shellcheck source=../lib/env.sh
 source "$FACTORY_ROOT/lib/env.sh"
 # Use planner-bot's own Forgejo identity (#747)
 FORGE_TOKEN="${FORGE_PLANNER_TOKEN:-${FORGE_TOKEN}}"
 # shellcheck source=../lib/formula-session.sh
 source "$FACTORY_ROOT/lib/formula-session.sh"
 # shellcheck source=../lib/worktree.sh
@ -34,6 +37,10 @@ source "$FACTORY_ROOT/lib/worktree.sh"
 source "$FACTORY_ROOT/lib/guard.sh"
 # shellcheck source=../lib/agent-sdk.sh
 source "$FACTORY_ROOT/lib/agent-sdk.sh"
 # shellcheck source=../lib/ci-helpers.sh
 source "$FACTORY_ROOT/lib/ci-helpers.sh"
 # shellcheck source=../lib/pr-lifecycle.sh
 source "$FACTORY_ROOT/lib/pr-lifecycle.sh"
 LOG_FILE="${DISINTO_LOG_DIR}/planner/planner.log"
 # shellcheck disable=SC2034  # consumed by agent-sdk.sh
@ -90,7 +97,7 @@ log "sha=${CURRENT_SHA:0:8} ops=${CURRENT_OPS_SHA:0:8} unreviewed=${unreviewed_c
 # ── Resolve forge remote for git operations ─────────────────────────────
 # Run git operations from the project checkout, not the baked code dir
-cd "$PROJECT_REPO_ROOT" || exit 1
+cd "$PROJECT_REPO_ROOT"
 resolve_forge_remote
@ -145,12 +152,69 @@ ${PROMPT_FOOTER}"
 # ── Create worktree ──────────────────────────────────────────────────────
 formula_worktree_setup "$WORKTREE"
 # ── Prepare ops branch for PR-based merge (#765) ────────────────────────
 PLANNER_OPS_BRANCH="planner/run-$(date -u +%Y-%m-%d)"
 (
  cd "$OPS_REPO_ROOT"
  git fetch origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
  git checkout "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
  git pull --ff-only origin "${PRIMARY_BRANCH}" --quiet 2>/dev/null || true
  # Create (or reset to) a fresh branch from PRIMARY_BRANCH
  git checkout -B "$PLANNER_OPS_BRANCH" "origin/${PRIMARY_BRANCH}" --quiet 2>/dev/null || \
    git checkout -b "$PLANNER_OPS_BRANCH" --quiet 2>/dev/null || true
 )
 log "ops branch: ${PLANNER_OPS_BRANCH}"
 # ── Run agent ─────────────────────────────────────────────────────────────
 export CLAUDE_MODEL="opus"
 agent_run --worktree "$WORKTREE" "$PROMPT"
 log "agent_run complete"
 # ── PR lifecycle: create PR on ops repo and walk to merge (#765) ─────────
 OPS_FORGE_API="${FORGE_API_BASE}/repos/${FORGE_OPS_REPO}"
 ops_has_commits=false
 if ! git -C "$OPS_REPO_ROOT" diff --quiet "origin/${PRIMARY_BRANCH}..${PLANNER_OPS_BRANCH}" 2>/dev/null; then
  ops_has_commits=true
 fi
 if [ "$ops_has_commits" = "true" ]; then
  log "ops branch has commits — creating PR"
  # Push the branch to the ops remote
  git -C "$OPS_REPO_ROOT" push origin "$PLANNER_OPS_BRANCH" --quiet 2>/dev/null || \
    git -C "$OPS_REPO_ROOT" push --force-with-lease origin "$PLANNER_OPS_BRANCH" 2>/dev/null
  # Temporarily point FORGE_API at the ops repo for pr-lifecycle functions
  ORIG_FORGE_API="$FORGE_API"
  export FORGE_API="$OPS_FORGE_API"
  # Ops repo typically has no Woodpecker CI — skip CI polling
  ORIG_WOODPECKER_REPO_ID="${WOODPECKER_REPO_ID:-2}"
  export WOODPECKER_REPO_ID="0"
  PR_NUM=$(pr_create "$PLANNER_OPS_BRANCH" \
    "chore: planner run $(date -u +%Y-%m-%d)" \
    "Automated planner run — updates prerequisite tree, memory, and vault items." \
    "${PRIMARY_BRANCH}" \
    "$OPS_FORGE_API") || true
  if [ -n "$PR_NUM" ]; then
    log "ops PR #${PR_NUM} created — walking to merge"
    SESSION_ID=$(cat "$SID_FILE" 2>/dev/null || echo "planner-$$")
    pr_walk_to_merge "$PR_NUM" "$SESSION_ID" "$OPS_REPO_ROOT" 1 2 || {
      log "ops PR #${PR_NUM} walk finished: ${_PR_WALK_EXIT_REASON:-unknown}"
    }
    log "ops PR #${PR_NUM} result: ${_PR_WALK_EXIT_REASON:-unknown}"
  else
    log "WARNING: failed to create ops PR for branch ${PLANNER_OPS_BRANCH}"
  fi
  # Restore original FORGE_API
  export FORGE_API="$ORIG_FORGE_API"
  export WOODPECKER_REPO_ID="$ORIG_WOODPECKER_REPO_ID"
 else
  log "no ops changes — skipping PR creation"
 fi
 # Persist watermarks so next run can skip if nothing changed
 mkdir -p "$FACTORY_ROOT/state"
 echo "$CURRENT_SHA" > "$LAST_SHA_FILE"
--- a/predictor/AGENTS.md
+++ b/predictor/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Predictor Agent
 **Role**: Abstract adversary (the "goblin"). Runs a 2-step formula
--- a/predictor/predictor-run.sh
+++ b/predictor/predictor-run.sh
@ -23,10 +23,11 @@ FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
 # Accept project config from argument; default to disinto
 export PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}"
 # Set override BEFORE sourcing env.sh so it survives any later re-source of
 # env.sh from nested shells / claude -p tools (#762, #747)
 export FORGE_TOKEN_OVERRIDE="${FORGE_PREDICTOR_TOKEN:-}"
 # shellcheck source=../lib/env.sh
 source "$FACTORY_ROOT/lib/env.sh"
 # Use predictor-bot's own Forgejo identity (#747)
 FORGE_TOKEN="${FORGE_PREDICTOR_TOKEN:-${FORGE_TOKEN}}"
 # shellcheck source=../lib/formula-session.sh
 source "$FACTORY_ROOT/lib/formula-session.sh"
 # shellcheck source=../lib/worktree.sh
@ -64,7 +65,7 @@ log "--- Predictor run start ---"
 # ── Resolve forge remote for git operations ─────────────────────────────
 # Run git operations from the project checkout, not the baked code dir
-cd "$PROJECT_REPO_ROOT" || exit 1
+cd "$PROJECT_REPO_ROOT"
 resolve_forge_remote
--- a/projects/disinto.toml.example
+++ b/projects/disinto.toml.example
@ -23,6 +23,24 @@ check_prs            = true
 check_dev_agent      = true
 check_pipeline_stall = false
 # Agent scheduling configuration
 #
 # These values are passed to the agents container as environment variables.
 # The default values (6 hours each) work well for stable production projects.
 # For active development on the disinto factory itself, you may want to
 # configure shorter intervals:
 #
 #   GARDENER_INTERVAL=3600    # 1 hour (default: 21600 = 6 hours)
 #   ARCHITECT_INTERVAL=540    # 9 minutes (default: 21600 = 6 hours)
 #   PLANNER_INTERVAL=660      # 11 minutes (default: 43200 = 12 hours)
 #
 # These can be set in docker-compose.yml environment section or in a .env file.
 #
 # [agents.schedule]
 #   gardener_interval = 21600  # seconds (default: 21600 = 6 hours)
 #   architect_interval  = 21600  # seconds (default: 21600 = 6 hours)
 #   planner_interval    = 43200  # seconds (default: 43200 = 12 hours)
 # Local-model agents (optional) — configure to use llama-server or similar
 # for local LLM inference. Each agent gets its own container with isolated
 # credentials and configuration.
--- a/review/AGENTS.md
+++ b/review/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Review Agent
 **Role**: AI-powered PR review — post structured findings and formal
--- a/review/review-pr.sh
+++ b/review/review-pr.sh
@ -227,6 +227,7 @@ PROMPT=$(cat "${REVIEW_TMPDIR}/prompt.md")
 status "running review"
 rm -f "$OUTPUT_FILE"
 export CLAUDE_MODEL="sonnet"
 export CLAUDE_TIMEOUT="${CLAUDE_TIMEOUT:-900}"   # 15 min — reviews shouldn't take longer
 if [ "$IS_RE_REVIEW" = true ] && [ -n "$_AGENT_SESSION_ID" ]; then
  agent_run --resume "$_AGENT_SESSION_ID" --worktree "$WORKTREE" "$PROMPT"
--- a/site/collect-engagement.sh
+++ b/site/collect-engagement.sh
@ -59,6 +59,21 @@ fi
 mkdir -p "$EVIDENCE_DIR"
 # Verify input is Caddy JSON format (not Combined Log Format or other)
 first_line=$(grep -m1 '.' "$CADDY_LOG" || true)
 if [ -z "$first_line" ]; then
  log "WARN: Caddy access log is empty at ${CADDY_LOG}"
  echo "WARN: Caddy access log is empty — nothing to parse." >&2
  exit 0
 fi
 if ! printf '%s\n' "$first_line" | jq empty 2>/dev/null; then
  preview="${first_line:0:200}"
  log "ERROR: Input file is not Caddy JSON format (expected structured JSON access log). Got: ${preview}"
  echo "ERROR: Input file is not Caddy JSON format (expected structured JSON access log)." >&2
  echo "Got: ${preview}" >&2
  exit 1
 fi
 # ── Parse access log ────────────────────────────────────────────────────────
 log "Parsing ${CADDY_LOG} for entries since $(date -u -d "@${CUTOFF_TS}" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "${CUTOFF_TS}")"
@ -122,7 +137,8 @@ PAGES=$(printf '%s\n' "$PARSED" | jq -c '
 ')
 TOTAL_REQUESTS=$(printf '%s\n' "$PARSED" | wc -l | tr -d ' ')
-PAGE_VIEWS=$(printf '%s\n' "$PAGES" | grep -c . || echo 0)
+PAGE_VIEWS=$(printf '%s\n' "$PAGES" | grep -c . || true)
 PAGE_VIEWS="${PAGE_VIEWS:-0}"
 UNIQUE_VISITORS=$(printf '%s\n' "$PAGES" | jq -r '.ip' | sort -u | wc -l | tr -d ' ')
 # Top pages by hit count
--- a/supervisor/AGENTS.md
+++ b/supervisor/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 7069b729f77de1687aeeac327e44098a608cf567 -->
+<!-- last-reviewed: c363ee0aea2ae447daab28c2c850d6abefc8c6b5 -->
 # Supervisor Agent
 **Role**: Health monitoring and auto-remediation, executed as a formula-driven
@ -7,13 +7,11 @@ then runs an interactive Claude session (sonnet) that assesses health, auto-fixe
 issues, and writes a daily journal. When blocked on external
 resources or human decisions, files vault items instead of escalating directly.
-**Trigger**: `supervisor-run.sh` is invoked by the polling loop in `docker/edge/entrypoint-edge.sh`
+**Trigger**: `supervisor-run.sh` is invoked by two polling loops:
-every 20 minutes (line 50-53). Sources `lib/guard.sh` and calls `check_active supervisor` first
+- **Agents container** (`docker/agents/entrypoint.sh`): every `SUPERVISOR_INTERVAL` seconds (default 1200 = 20 min). Controlled by the `supervisor` role in `AGENT_ROLES` (included in the default seven-role set since P1/#801). Logs to `supervisor.log` in the agents container.
-— skips if `$FACTORY_ROOT/state/.supervisor-active` is absent. Then runs `claude -p` via
+- **Edge container** (`docker/edge/entrypoint-edge.sh`): separate loop in the edge container (line 169-172). Runs independently of the agents container's polling schedule.
-`agent-sdk.sh`, injects `formulas/run-supervisor.toml` with pre-collected metrics as context,
+
-and cleans up on completion or timeout (20 min max session). Note: the supervisor runs in the
+Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `check_active supervisor` first — skips if `$FACTORY_ROOT/state/.supervisor-active` is absent. Then runs `claude -p` via `agent-sdk.sh`, injects `formulas/run-supervisor.toml` with pre-collected metrics as context, and cleans up on completion or timeout.
 **edge container** (`entrypoint-edge.sh`), not the agent container — this distinction matters
 for operators debugging the factory.
 **Key files**:
 - `supervisor/supervisor-run.sh` — Polling loop participant + orchestrator: lock, memory guard,
@ -39,6 +37,7 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).
 **Environment variables consumed**:
 - `FORGE_TOKEN`, `FORGE_SUPERVISOR_TOKEN` (falls back to FORGE_TOKEN), `FORGE_REPO`, `FORGE_API`, `PROJECT_NAME`, `PROJECT_REPO_ROOT`, `OPS_REPO_ROOT`
 - `PRIMARY_BRANCH`, `CLAUDE_MODEL` (set to sonnet by supervisor-run.sh)
 - `SUPERVISOR_INTERVAL` — polling interval in seconds for agents container (default 1200 = 20 min)
 - `WOODPECKER_TOKEN`, `WOODPECKER_SERVER`, `WOODPECKER_DB_PASSWORD`, `WOODPECKER_DB_USER`, `WOODPECKER_DB_HOST`, `WOODPECKER_DB_NAME` — CI database queries
 **Degraded mode (Issue #544)**: When `OPS_REPO_ROOT` is not set or the directory doesn't exist, the supervisor runs in degraded mode:
--- a/supervisor/supervisor-run.sh
+++ b/supervisor/supervisor-run.sh
@ -25,10 +25,11 @@ FACTORY_ROOT="$(dirname "$SCRIPT_DIR")"
 # Accept project config from argument; default to disinto
 export PROJECT_TOML="${1:-$FACTORY_ROOT/projects/disinto.toml}"
 # Set override BEFORE sourcing env.sh so it survives any later re-source of
 # env.sh from nested shells / claude -p tools (#762, #747)
 export FORGE_TOKEN_OVERRIDE="${FORGE_SUPERVISOR_TOKEN:-}"
 # shellcheck source=../lib/env.sh
 source "$FACTORY_ROOT/lib/env.sh"
 # Use supervisor-bot's own Forgejo identity (#747)
 FORGE_TOKEN="${FORGE_SUPERVISOR_TOKEN:-${FORGE_TOKEN}}"
 # shellcheck source=../lib/formula-session.sh
 source "$FACTORY_ROOT/lib/formula-session.sh"
 # shellcheck source=../lib/worktree.sh
@ -86,9 +87,7 @@ log "--- Supervisor run start ---"
 # ── Resolve forge remote for git operations ─────────────────────────────
 # Run git operations from the project checkout, not the baked code dir
-cd "$PROJECT_REPO_ROOT" || exit 1
+cd "$PROJECT_REPO_ROOT"
 resolve_forge_remote
 # ── Housekeeping: clean up stale crashed worktrees (>24h) ────────────────
 cleanup_stale_crashed_worktrees 24
--- a/tests/lib-hvault.bats
+++ b/tests/lib-hvault.bats
@ -0,0 +1,215 @@
 #!/usr/bin/env bats
 # tests/lib-hvault.bats — Unit tests for lib/hvault.sh
 #
 # Runs against a dev-mode Vault server (single binary, no LXC needed).
 # CI launches vault server -dev inline before running these tests.
 VAULT_BIN="${VAULT_BIN:-vault}"
 setup_file() {
  export TEST_DIR
  TEST_DIR="$(cd "$(dirname "$BATS_TEST_FILENAME")/.." && pwd)"
  # Start dev-mode vault on a random port
  export VAULT_DEV_PORT
  VAULT_DEV_PORT="$(shuf -i 18200-18299 -n 1)"
  export VAULT_ADDR="http://127.0.0.1:${VAULT_DEV_PORT}"
  "$VAULT_BIN" server -dev \
    -dev-listen-address="127.0.0.1:${VAULT_DEV_PORT}" \
    -dev-root-token-id="test-root-token" \
    -dev-no-store-token \
    &>"${BATS_FILE_TMPDIR}/vault.log" &
  export VAULT_PID=$!
  export VAULT_TOKEN="test-root-token"
  # Wait for vault to be ready (up to 10s)
  local i=0
  while ! curl -sf "${VAULT_ADDR}/v1/sys/health" >/dev/null 2>&1; do
    sleep 0.5
    i=$((i + 1))
    if [ "$i" -ge 20 ]; then
      echo "Vault failed to start. Log:" >&2
      cat "${BATS_FILE_TMPDIR}/vault.log" >&2
      return 1
    fi
  done
 }
 teardown_file() {
  if [ -n "${VAULT_PID:-}" ]; then
    kill "$VAULT_PID" 2>/dev/null || true
    wait "$VAULT_PID" 2>/dev/null || true
  fi
 }
 setup() {
  # Source the module under test
  source "${TEST_DIR}/lib/hvault.sh"
  export VAULT_ADDR VAULT_TOKEN
 }
 # ── hvault_kv_put + hvault_kv_get ────────────────────────────────────────────
@test "hvault_kv_put writes and hvault_kv_get reads a secret" {
  run hvault_kv_put "test/myapp" "username=admin" "password=s3cret"
  [ "$status" -eq 0 ]
  run hvault_kv_get "test/myapp"
  [ "$status" -eq 0 ]
  echo "$output" | jq -e '.username == "admin"'
  echo "$output" | jq -e '.password == "s3cret"'
 }
@test "hvault_kv_get extracts a single key" {
  hvault_kv_put "test/single" "foo=bar" "baz=qux"
  run hvault_kv_get "test/single" "foo"
  [ "$status" -eq 0 ]
  [ "$output" = "bar" ]
 }
@test "hvault_kv_get fails for missing key" {
  hvault_kv_put "test/keymiss" "exists=yes"
  run hvault_kv_get "test/keymiss" "nope"
  [ "$status" -ne 0 ]
 }
@test "hvault_kv_get fails for missing path" {
  run hvault_kv_get "test/does-not-exist-$(date +%s)"
  [ "$status" -ne 0 ]
 }
@test "hvault_kv_put fails without KEY=VAL" {
  run hvault_kv_put "test/bad"
  [ "$status" -ne 0 ]
  echo "$output" | grep -q '"error":true' || echo "$stderr" | grep -q '"error":true'
 }
@test "hvault_kv_put rejects malformed pair (no =)" {
  run hvault_kv_put "test/bad2" "noequals"
  [ "$status" -ne 0 ]
 }
@test "hvault_kv_get fails without PATH" {
  run hvault_kv_get
  [ "$status" -ne 0 ]
 }
 # ── hvault_kv_list ───────────────────────────────────────────────────────────
@test "hvault_kv_list lists keys at a path" {
  hvault_kv_put "test/listdir/a" "k=1"
  hvault_kv_put "test/listdir/b" "k=2"
  run hvault_kv_list "test/listdir"
  [ "$status" -eq 0 ]
  echo "$output" | jq -e '. | length >= 2'
  echo "$output" | jq -e 'index("a")'
  echo "$output" | jq -e 'index("b")'
 }
@test "hvault_kv_list fails on nonexistent path" {
  run hvault_kv_list "test/no-such-path-$(date +%s)"
  [ "$status" -ne 0 ]
 }
@test "hvault_kv_list fails without PATH" {
  run hvault_kv_list
  [ "$status" -ne 0 ]
 }
 # ── hvault_policy_apply ──────────────────────────────────────────────────────
@test "hvault_policy_apply creates a policy" {
  local pfile="${BATS_TEST_TMPDIR}/test-policy.hcl"
  cat > "$pfile" <<'HCL'
 path "secret/data/test/*" {
  capabilities = ["read"]
 }
 HCL
  run hvault_policy_apply "test-reader" "$pfile"
  [ "$status" -eq 0 ]
  # Verify the policy exists via Vault API
  run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
    "${VAULT_ADDR}/v1/sys/policies/acl/test-reader"
  [ "$status" -eq 0 ]
  echo "$output" | jq -e '.data.policy' | grep -q "secret/data/test"
 }
@test "hvault_policy_apply is idempotent" {
  local pfile="${BATS_TEST_TMPDIR}/idem-policy.hcl"
  printf 'path "secret/*" { capabilities = ["list"] }\n' > "$pfile"
  run hvault_policy_apply "idem-policy" "$pfile"
  [ "$status" -eq 0 ]
  # Apply again — should succeed
  run hvault_policy_apply "idem-policy" "$pfile"
  [ "$status" -eq 0 ]
 }
@test "hvault_policy_apply fails with missing file" {
  run hvault_policy_apply "bad-policy" "/nonexistent/policy.hcl"
  [ "$status" -ne 0 ]
 }
@test "hvault_policy_apply fails without args" {
  run hvault_policy_apply
  [ "$status" -ne 0 ]
 }
 # ── hvault_token_lookup ──────────────────────────────────────────────────────
@test "hvault_token_lookup returns token info" {
  run hvault_token_lookup
  [ "$status" -eq 0 ]
  echo "$output" | jq -e '.policies'
  echo "$output" | jq -e '.accessor'
  echo "$output" | jq -e 'has("ttl")'
 }
@test "hvault_token_lookup fails without VAULT_TOKEN" {
  unset VAULT_TOKEN
  run hvault_token_lookup
  [ "$status" -ne 0 ]
 }
@test "hvault_token_lookup fails without VAULT_ADDR" {
  unset VAULT_ADDR
  run hvault_token_lookup
  [ "$status" -ne 0 ]
 }
 # ── hvault_jwt_login ─────────────────────────────────────────────────────────
@test "hvault_jwt_login fails without VAULT_ADDR" {
  unset VAULT_ADDR
  run hvault_jwt_login "myrole" "fakejwt"
  [ "$status" -ne 0 ]
 }
@test "hvault_jwt_login fails without args" {
  run hvault_jwt_login
  [ "$status" -ne 0 ]
 }
@test "hvault_jwt_login returns error for unconfigured jwt auth" {
  # JWT auth backend is not enabled in dev mode by default — expect failure
  run hvault_jwt_login "myrole" "eyJhbGciOiJSUzI1NiJ9.fake.sig"
  [ "$status" -ne 0 ]
 }
 # ── Env / prereq errors ─────────────────────────────────────────────────────
@test "all functions fail with structured JSON error when VAULT_ADDR unset" {
  unset VAULT_ADDR
  for fn in hvault_kv_get hvault_kv_put hvault_kv_list hvault_policy_apply hvault_token_lookup; do
    run $fn "dummy" "dummy"
    [ "$status" -ne 0 ]
  done
 }
--- a/tests/mock-forgejo.py
+++ b/tests/mock-forgejo.py
@ -505,8 +505,9 @@ class ForgejoHandler(BaseHTTPRequestHandler):
        require_token(self)
        parts = self.path.split("/")
-        if len(parts) >= 6:
+        # /api/v1/admin/users/{username}/repos → parts[5] is the username
-            target_user = parts[4]
+        if len(parts) >= 7:
            target_user = parts[5]
        else:
            json_response(self, 400, {"message": "username required"})
            return
--- a/Show more
+++ b/Show more