Merge pull request 'fix: vision(#623 ): end-to-end subpath routing smoke test for Forgejo + Woodpecker + chat (#1025 )' (#1063 ) from fix/issue-1025-3 into main

fix: convert bash array to POSIX for-loop in caddyfile-routing-test
Step ran in alpine:3.19 with default /bin/sh (busybox ash) which does not support bash array syntax. REQUIRED_HANDLERS=(...) + "${ARR[@]}" failed with "syntax error: unexpected (". Inlined the handler list into a single space-separated for-loop that works under POSIX sh. No behavioral change; same 6 handlers checked. Fixes edge-subpath/caddyfile-routing-test exit 2 on pipelines targeting fix/issue-1025-3 — see #1025.
2026-04-20 11:01:13 +00:00 · 2026-04-20 10:47:12 +00:00 · 2026-04-20 10:44:17 +00:00 · 2026-04-20 08:54:08 +00:00 · 2026-04-20 08:46:33 +00:00 · 2026-04-20 08:44:05 +00:00
34 changed files with 2454 additions and 172 deletions
--- a/.woodpecker/detect-duplicates.py
+++ b/.woodpecker/detect-duplicates.py
@ -294,6 +294,10 @@ def main() -> int:
        "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern",
        # Standard lib source block shared across formula-driven agent run scripts
        "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)",
        # Test data for duplicate service detection tests (#850)
        # Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh
        "334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)",
        "d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)",
        # Common vault-seed script patterns: logging helpers + flag parsing
        # Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh
        "843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)",
@ -301,6 +305,10 @@ def main() -> int:
        "9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)",
        "9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)",
        "5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)",
        # forgejo-bootstrap.sh follows wp-oauth-register.sh pattern (issue #1069)
        "2b80185e4ae2b54e2e01f33e5555c688": "Standard header (set -euo pipefail, SCRIPT_DIR, REPO_ROOT) (forgejo-bootstrap + wp-oauth-register)",
        "38a1f20a60d69f0d6bfb06a0532b3bd7": "Logging helpers + DRY_RUN init (forgejo-bootstrap + wp-oauth-register)",
        "4dd3c526fa29bdaa88b274c3d7d01032": "Flag parsing loop + case start (forgejo-bootstrap + wp-oauth-register)",
        # Common vault-seed script preamble + precondition patterns
        # Shared across tools/vault-seed-{forgejo,agents,woodpecker}.sh
        "dff3675c151fcdbd2fef798826ae919b": "Vault-seed preamble: set -euo + path setup + source hvault.sh + KV_MOUNT",
@ -311,10 +319,6 @@ def main() -> int:
        # Common vault-seed script flag parsing patterns
        # Shared across tools/vault-seed-{forgejo,ops-repo}.sh
        "6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)",
        # Test data for duplicate service detection tests (#850)
        # Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh
        "334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)",
        "d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)",
        "a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)",
        "e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)",
        "c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)",
--- a/.woodpecker/edge-subpath.yml
+++ b/.woodpecker/edge-subpath.yml
@ -0,0 +1,317 @@
 # =============================================================================
 # .woodpecker/edge-subpath.yml — Edge subpath routing static checks
 #
 # Static validation for edge subpath routing configuration. This pipeline does
 # NOT run live service curls — it validates the configuration that would be
 # used by a deployed edge proxy.
 #
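 # Checks (numbered to match the steps below):
 #   1. shellcheck-smoke — lint tests/smoke-edge-subpath.sh and
 #      tests/test-caddyfile-routing.sh
 #   2. render-caddyfile — write a mock Caddyfile with the Nomad templates
 #      expanded to static values
 #   3. caddy-validate — run `caddy validate` on the rendered Caddyfile
 #   4. caddyfile-routing-test — verify routing block shape of the rendered file
 #   5. test-caddyfile-routing — verify routing block shape of the Caddyfile
 #      template embedded in nomad/jobs/edge.hcl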
 #
 # Triggers:
 #   - Every pull request (the `when` block below sets no path filter, so the
 #     pipeline is not limited to edge-related files)
 #
 # Environment variables (inherited from WOODPECKER_ENVIRONMENT):
 #   EDGE_BASE_URL      — Edge proxy URL for reference (default: http://localhost)
 #   EDGE_TIMEOUT       — Request timeout in seconds (default: 30)
 #   EDGE_MAX_RETRIES   — Max retries per request (default: 3)
 # =============================================================================
 # Run on pull_request events only; no path or branch filter is applied here.
 when:
  event: pull_request
 steps:
  # ── 1. ShellCheck on smoke script ────────────────────────────────────────
  # `shellcheck` validates bash syntax, style, and common pitfalls.
  # Exit codes:
  #   0 — all checks passed
  #   1 — one or more issues found
  # Lints both shell test scripts in a single shellcheck invocation; the step
  # fails if shellcheck reports a finding for either file.
  - name: shellcheck-smoke
    image: koalaman/shellcheck-alpine:stable
    commands:
      # --severity=warning: only findings of severity "warning" or higher count
      - shellcheck --severity=warning tests/smoke-edge-subpath.sh tests/test-caddyfile-routing.sh
  # ── 2. Caddyfile template rendering ───────────────────────────────────────
  # Render a mock Caddyfile for validation. The template uses Nomad's
  # templating syntax ({{ range ... }}) which must be processed before Caddy
  # can validate it. We render a mock version with Nomad templates expanded
  # to static values for validation purposes.
  # Writes a static mock Caddyfile to edge-render/Caddyfile and copies it to
  # edge-render/Caddyfile.rendered, the path the later steps read.
  - name: render-caddyfile
    image: alpine:3.19
    commands:
      - apk add --no-cache coreutils
      - |
        set -e
        mkdir -p edge-render
        # Mock Caddyfile with the Nomad templates already expanded. The quoted
        # heredoc delimiter keeps {$FORWARD_AUTH_SECRET} literal so that Caddy,
        # not the shell, sees the placeholder.
        cat > edge-render/Caddyfile <<'CADDYFILE'
        # Caddyfile — edge proxy configuration (Nomad-rendered)
        # Staging upstream discovered via Nomad service registration.

        :80 {
            # Redirect root to Forgejo
            handle / {
                redir /forge/ 302
            }

            # Reverse proxy to Forgejo
            handle /forge/* {
                reverse_proxy 127.0.0.1:3000
            }

            # Reverse proxy to Woodpecker CI
            handle /ci/* {
                reverse_proxy 127.0.0.1:8000
            }

            # Reverse proxy to staging — dynamic port via Nomad service discovery
            handle /staging/* {
                reverse_proxy 127.0.0.1:8081
            }

            # Chat service — reverse proxy to disinto-chat backend (#705)
            # OAuth routes bypass forward_auth — unauthenticated users need these (#709)
            handle /chat/login {
                reverse_proxy 127.0.0.1:8080
            }
            handle /chat/oauth/callback {
                reverse_proxy 127.0.0.1:8080
            }
            # Defense-in-depth: forward_auth stamps X-Forwarded-User from session (#709)
            handle /chat/* {
                forward_auth 127.0.0.1:8080 {
                    uri /chat/auth/verify
                    copy_headers X-Forwarded-User
                    header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}
                }
                reverse_proxy 127.0.0.1:8080
            }
        }
        CADDYFILE
        cp edge-render/Caddyfile edge-render/Caddyfile.rendered
        echo "Caddyfile rendered successfully"
  # ── 3. Caddy config validation ───────────────────────────────────────────
  # `caddy validate` checks Caddyfile syntax and configuration.
  # This validates the rendered Caddyfile against Caddy's parser.
  # Exit codes:
  #   0 — configuration is valid
  #   1 — configuration has errors
  # Downloads a Caddy binary and runs `caddy validate` against the file written
  # by the render-caddyfile step (edge-render/Caddyfile.rendered).
  # NOTE(review): the download URL names no version, so the binary is not
  # pinned — confirm this is intended.
  - name: caddy-validate
    image: alpine:3.19
    commands:
      - apk add --no-cache ca-certificates curl
      # -f: exit non-zero on HTTP 4xx/5xx instead of saving the error body as
      #     /tmp/caddy (which would only fail later with a misleading exec error)
      # -L: follow redirects issued by the download endpoint
      - curl -fsSL -o /tmp/caddy "https://caddyserver.com/api/download?os=linux&arch=amd64"
      - chmod +x /tmp/caddy
      # Fails fast if the downloaded file is not a runnable binary
      - /tmp/caddy version
      - /tmp/caddy validate --config edge-render/Caddyfile.rendered --adapter caddyfile
  # ── 4. Caddyfile routing block shape test ─────────────────────────────────
  # Verify that the Caddyfile contains all required routing blocks:
  #   - /forge/ — Forgejo subpath
  #   - /ci/ — Woodpecker subpath
  #   - /staging/ — Staging subpath
  #   - /chat/ — Chat subpath with forward_auth
  #
  # This is a unit test that validates the expected structure without
  # requiring a running Caddy instance.
  # Greps the rendered Caddyfile for every required routing directive. All
  # checks run even after a failure; the step exits 1 at the end if any failed.
  - name: caddyfile-routing-test
    image: alpine:3.19
    commands:
      - apk add --no-cache grep coreutils
      - |
        set -e
        CADDYFILE="edge-render/Caddyfile.rendered"
        FAILED=0
        # expect PATTERN PASS_MSG FAIL_MSG
        # Searches the rendered Caddyfile for PATTERN, prints the matching
        # verdict line, and records a failure in FAILED without aborting.
        expect() {
          if grep -q "$1" "$CADDYFILE"; then
            echo "[PASS] $2"
          else
            echo "[FAIL] $3"
            FAILED=1
          fi
        }
        echo "=== Validating Caddyfile routing blocks ==="
        # Required subpath handlers, as a plain word list:
        # alpine's /bin/sh has no arrays.
        for handler in "handle /forge/\*" "handle /ci/\*" "handle /staging/\*" "handle /chat/login" "handle /chat/oauth/callback" "handle /chat/\*"; do
          expect "$handler" "Found handler: $handler" "Missing handler: $handler"
        done
        # forward_auth must appear within the 5 lines after the /chat/* handler
        if grep -A5 "handle /chat/\*" "$CADDYFILE" | grep -q "forward_auth"; then
          echo "[PASS] forward_auth block found for /chat/*"
        else
          echo "[FAIL] forward_auth block missing for /chat/*"
          FAILED=1
        fi
        # Upstream targets: Forgejo, Woodpecker, Chat
        expect "reverse_proxy 127.0.0.1:3000" "Forgejo reverse_proxy configured (port 3000)" "Forgejo reverse_proxy not configured"
        expect "reverse_proxy 127.0.0.1:8000" "Woodpecker reverse_proxy configured (port 8000)" "Woodpecker reverse_proxy not configured"
        expect "reverse_proxy 127.0.0.1:8080" "Chat reverse_proxy configured (port 8080)" "Chat reverse_proxy not configured"
        # Root must redirect to /forge/
        expect "redir /forge/ 302" "Root redirect to /forge/ configured" "Root redirect to /forge/ not configured"
        echo ""
        if [ "$FAILED" -ne 0 ]; then
          echo "=== Routing block validation failed ===" >&2
          exit 1
        fi
        echo "=== All routing blocks validated ==="
        exit 0
  # ── 5. Standalone Caddyfile routing test ─────────────────────────────────
  # Run the standalone unit test for Caddyfile routing block validation.
  # This test extracts the Caddyfile template from edge.hcl and validates
  # its structure without requiring a running Caddy instance.
  # Extracts the Caddyfile template embedded in nomad/jobs/edge.hcl and greps
  # it for every required routing directive. All checks run even after a
  # failure; the step exits 1 at the end if any failed.
  - name: test-caddyfile-routing
    image: alpine:3.19
    commands:
      - apk add --no-cache grep coreutils
      - |
        set -e
        EDGE_TEMPLATE="nomad/jobs/edge.hcl"
        echo "=== Extracting Caddyfile template from $EDGE_TEMPLATE ==="
        # First sed keeps the lines from the `data = <<EOT` opener through the
        # closing EOT; second sed swaps the opener for a marker comment and
        # drops the closing EOT line.
        CADDYFILE=$(sed -n '/data[[:space:]]*=[[:space:]]*<<[Ee][Oo][Tt]/,/^EOT$/p' "$EDGE_TEMPLATE" | sed '1s/.*/# Caddyfile extracted from Nomad template/; $d')
        if [ -z "$CADDYFILE" ]; then
          echo "ERROR: Could not extract Caddyfile template from $EDGE_TEMPLATE" >&2
          exit 1
        fi
        echo "Caddyfile template extracted successfully"
        echo ""
        FAILED=0
        # expect PATTERN LABEL
        # Searches the extracted template for PATTERN, prints [PASS]/[FAIL]
        # with LABEL, and records a failure in FAILED without aborting.
        expect() {
          if echo "$CADDYFILE" | grep -q "$1"; then
            echo "[PASS] $2"
          else
            echo "[FAIL] $2"
            FAILED=1
          fi
        }
        # Forgejo subpath
        expect "handle /forge/\*" "Forgejo handle block"
        expect "reverse_proxy 127.0.0.1:3000" "Forgejo reverse_proxy (port 3000)"
        # Woodpecker subpath
        expect "handle /ci/\*" "Woodpecker handle block"
        expect "reverse_proxy 127.0.0.1:8000" "Woodpecker reverse_proxy (port 8000)"
        # Staging subpath
        expect "handle /staging/\*" "Staging handle block"
        expect "nomadService" "Staging Nomad service discovery"
        # Chat subpath
        expect "handle /chat/login" "Chat login handle block"
        expect "handle /chat/oauth/callback" "Chat OAuth callback handle block"
        expect "handle /chat/\*" "Chat catch-all handle block"
        expect "reverse_proxy 127.0.0.1:8080" "Chat reverse_proxy (port 8080)"
        # forward_auth must appear within the 10 lines after the /chat/* handler
        if echo "$CADDYFILE" | grep -A10 "handle /chat/\*" | grep -q "forward_auth"; then
          echo "[PASS] forward_auth block for /chat/*"
        else
          echo "[FAIL] forward_auth block for /chat/*"
          FAILED=1
        fi
        # Root redirect
        expect "redir /forge/ 302" "Root redirect to /forge/"
        echo ""
        if [ "$FAILED" -ne 0 ]; then
          echo "=== Routing block validation failed ===" >&2
          exit 1
        fi
        echo "=== All routing blocks validated ==="
        exit 0
--- a/AGENTS.md
+++ b/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Disinto — Agent Instructions
 ## What this repo is
--- a/architect/AGENTS.md
+++ b/architect/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Architect — Agent Instructions
 ## What this agent is
--- a/bin/disinto
+++ b/bin/disinto
@ -12,6 +12,7 @@
 #   disinto secrets <subcommand>        Manage encrypted secrets
 #   disinto run <action-id>              Run action in ephemeral runner container
 #   disinto ci-logs <pipeline> [--step <name>]  Read CI logs from Woodpecker SQLite
 #   disinto backup create <outfile>     Export factory state for migration
 #
 # Usage:
 #   disinto init https://github.com/user/repo
@ -39,7 +40,9 @@ source "${FACTORY_ROOT}/lib/generators.sh"
 source "${FACTORY_ROOT}/lib/forge-push.sh"
 source "${FACTORY_ROOT}/lib/ci-setup.sh"
 source "${FACTORY_ROOT}/lib/release.sh"
 source "${FACTORY_ROOT}/lib/backup.sh"
 source "${FACTORY_ROOT}/lib/claude-config.sh"
 source "${FACTORY_ROOT}/lib/disinto/backup.sh"  # backup create/import
 # ── Helpers ──────────────────────────────────────────────────────────────────
@ -62,7 +65,9 @@ Usage:
  disinto hire-an-agent <agent-name> <role> [--formula <path>] [--local-model <url>] [--model <name>]
                                     Hire a new agent (create user + .profile repo; re-run to rotate credentials)
  disinto agent <subcommand>           Manage agent state (enable/disable)
  disinto backup create <outfile>      Export factory state (issues + ops bundle)
  disinto edge <verb> [options]        Manage edge tunnel registrations
  disinto backup <subcommand>          Backup and restore factory state
 Edge subcommands:
  register [project]    Register a new tunnel (generates keypair if needed)
@ -101,6 +106,18 @@ Hire an agent options:
 CI logs options:
  --step <name>        Filter logs to a specific step (e.g., smoke-init)
 Backup subcommands:
  create <file>        Create backup of factory state to tarball
  import <file>        Restore factory state from backup tarball
 Import behavior:
  - Unpacks tarball to temp directory
  - Creates disinto repo via Forgejo API (mirror config is manual)
  - Creates disinto-ops repo and pushes refs from bundle
  - Imports issues from issues/*.json (idempotent - skips existing)
  - Logs issue number mapping (Forgejo auto-assigns numbers)
  - Prints summary: created X repos, pushed Y refs, imported Z issues, skipped W
 EOF
  exit 1
 }
@ -821,6 +838,11 @@ _disinto_init_nomad() {
        fi
        echo "[deploy] [dry-run] nomad job validate ${jobspec_path}"
        echo "[deploy] [dry-run] nomad job run -detach ${jobspec_path}"
        # Post-deploy: forgejo-bootstrap
        if [ "$svc" = "forgejo" ]; then
          local bootstrap_script="${FACTORY_ROOT}/lib/init/nomad/forgejo-bootstrap.sh"
          echo "[deploy] [dry-run] [post-deploy] would run ${bootstrap_script}"
        fi
      done
      echo "[deploy] dry-run complete"
    fi
@ -1035,7 +1057,27 @@ _disinto_init_nomad() {
          echo "Error: deploy.sh must run as root and sudo is not installed" >&2
          exit 1
        fi
-        sudo -n -- "${deploy_cmd[@]}" || exit $?
+        sudo -n --preserve-env=FORGE_ADMIN_PASS,FORGE_TOKEN,FORGE_URL -- "${deploy_cmd[@]}" || exit $?
      fi
      # Post-deploy: bootstrap Forgejo admin user after forgejo deployment
      if [ "$svc" = "forgejo" ]; then
        echo ""
        echo "── Bootstrapping Forgejo admin user ───────────────────────"
        local bootstrap_script="${FACTORY_ROOT}/lib/init/nomad/forgejo-bootstrap.sh"
        if [ -x "$bootstrap_script" ]; then
          if [ "$(id -u)" -eq 0 ]; then
            "$bootstrap_script" || exit $?
          else
            if ! command -v sudo >/dev/null 2>&1; then
              echo "Error: forgejo-bootstrap.sh must run as root and sudo is not installed" >&2
              exit 1
            fi
            sudo -n --preserve-env=FORGE_ADMIN_PASS,FORGE_TOKEN,FORGE_URL -- "$bootstrap_script" || exit $?
          fi
        else
          echo "warning: forgejo-bootstrap.sh not found or not executable" >&2
        fi
      fi
    done
@ -2893,6 +2935,33 @@ EOF
  esac
 }
 # ── backup command ────────────────────────────────────────────────────────────
 # Usage: disinto backup <subcommand> [args]
 # Subcommands:
 #   create <outfile.tar.gz>  Create backup of factory state
 #   import <infile.tar.gz>   Restore factory state from backup
 disinto_backup() {
  # Dispatch `disinto backup <subcommand>`. Arguments after the subcommand are
  # passed through to backup_create / backup_import. Any other subcommand (or
  # none) prints usage on stderr and exits 1.
  local verb="${1:-}"
  # Drop the subcommand only when one was actually given.
  if [ "$#" -gt 0 ]; then
    shift
  fi

  if [ "$verb" = "create" ]; then
    backup_create "$@"
  elif [ "$verb" = "import" ]; then
    backup_import "$@"
  else
    printf '%s\n' \
      "Usage: disinto backup <subcommand> [args]" \
      "" \
      "Subcommands:" \
      "  create <outfile.tar.gz>  Create backup of factory state" \
      "  import <infile.tar.gz>   Restore factory state from backup" >&2
    exit 1
  fi
 }
 # ── Main dispatch ────────────────────────────────────────────────────────────
 case "${1:-}" in
@ -2909,6 +2978,7 @@ case "${1:-}" in
  hire-an-agent)   shift; disinto_hire_an_agent "$@" ;;
  agent)           shift; disinto_agent "$@" ;;
  edge)            shift; disinto_edge "$@" ;;
  backup)          shift; disinto_backup "$@" ;;
  -h|--help)       usage ;;
  *)               usage ;;
 esac
--- a/dev/AGENTS.md
+++ b/dev/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Dev Agent
 **Role**: Implement issues autonomously — write code, push branches, address
--- a/docker/edge/entrypoint-edge.sh
+++ b/docker/edge/entrypoint-edge.sh
@ -173,11 +173,15 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}"
  sleep 1200  # 20 minutes
 done) &
-# ── Load required secrets from secrets/*.enc (#777) ────────────────────
+# ── Load optional secrets from secrets/*.enc (#777) ────────────────────
-# Edge container declares its required secrets; missing ones cause a hard fail.
+# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to
 # SCP access logs from a remote edge host. When age key or secrets dir is
 # missing, or any secret fails to decrypt, log a warning and skip the cron.
 # Caddy itself does not depend on these secrets.
 _AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt"
 _SECRETS_DIR="/opt/disinto/secrets"
 EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG"
 EDGE_ENGAGEMENT_READY=0  # Assume not ready until proven otherwise
 _edge_decrypt_secret() {
  local enc_path="${_SECRETS_DIR}/${1}.enc"
@ -192,47 +196,53 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then
    export "$_secret_name=$_val"
  done
  if [ -n "$_missing" ]; then
-    echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2
+    echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2
-    echo "  Run 'disinto secrets add <NAME>' for each missing secret." >&2
+    echo "  collect-engagement cron will be skipped. Run 'disinto secrets add <NAME>' to enable." >&2
-    echo "  If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2
+    EDGE_ENGAGEMENT_READY=0
-    exit 1
+  else
    echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2
    EDGE_ENGAGEMENT_READY=1
  fi
  echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2
 else
-  echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2
+  echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2
-  echo "  Ensure age is installed and secrets/*.enc files are present." >&2
+  echo "  collect-engagement cron will be skipped. Run 'disinto secrets add <NAME>' to enable." >&2
-  exit 1
+  EDGE_ENGAGEMENT_READY=0
 fi
 # Start daily engagement collection cron loop in background (#745)
 # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that
 # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777).
-(while true; do
+# Guarded: only start if EDGE_ENGAGEMENT_READY=1.
-  # Calculate seconds until next 23:50 UTC
+if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then
-  _now=$(date -u +%s)
+  (while true; do
-  _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0)
+    # Calculate seconds until next 23:50 UTC
-  if [ "$_target" -le "$_now" ]; then
+    _now=$(date -u +%s)
-    _target=$(( _target + 86400 ))
+    _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0)
-  fi
+    if [ "$_target" -le "$_now" ]; then
-  _sleep_secs=$(( _target - _now ))
+      _target=$(( _target + 86400 ))
-  echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2
+    fi
-  sleep "$_sleep_secs"
+    _sleep_secs=$(( _target - _now ))
-  _fetch_log="/tmp/caddy-access-log-fetch.log"
+    echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2
-  _ssh_key_file=$(mktemp)
+    sleep "$_sleep_secs"
-  printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file"
+    _fetch_log="/tmp/caddy-access-log-fetch.log"
-  chmod 0600 "$_ssh_key_file"
+    _ssh_key_file=$(mktemp)
-  scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
+    printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file"
-    "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \
+    chmod 0600 "$_ssh_key_file"
-    "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true
+    scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
-  rm -f "$_ssh_key_file"
+      "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \
-  if [ -s "$_fetch_log" ]; then
+      "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true
-    CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \
+    rm -f "$_ssh_key_file"
-      | tee -a /opt/disinto-logs/collect-engagement.log || true
+    if [ -s "$_fetch_log" ]; then
-  else
+      CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \
-    echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2
+        | tee -a /opt/disinto-logs/collect-engagement.log || true
-  fi
+    else
-  rm -f "$_fetch_log"
+      echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2
-done) &
+    fi
    rm -f "$_fetch_log"
  done) &
 else
  echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2
 fi
 # Nomad template renders Caddyfile to /local/Caddyfile via service discovery;
 # copy it into the expected location if present (compose uses the mounted path).
--- a/docs/nomad-cutover-runbook.md
+++ b/docs/nomad-cutover-runbook.md
@ -0,0 +1,183 @@
 # Nomad Cutover Runbook
 End-to-end procedure to cut over the disinto factory from docker-compose on
 disinto-dev-box to Nomad on disinto-nomad-box.
 **Target**: disinto-nomad-box (10.10.10.216) becomes production; disinto-dev-box
 stays warm for rollback.
 **Downtime budget**: <5 min blue-green flip.
 **Data scope**: Forgejo issues + disinto-ops git bundle only. Everything else is
 regenerated or discarded. OAuth secrets are regenerated on fresh init (all
 sessions invalidated).
 ---
 ## 1. Pre-cutover readiness checklist
 - [ ] Nomad + Vault stack healthy on a fresh wipe+init (step 5 verified)
 - [ ] Codeberg mirror current — `git log` parity between dev-box Forgejo and
      Codeberg
 - [ ] SSH key pair generated for nomad-box, registered on DO edge (see §4.6)
 - [ ] Companion tools landed:
  - `disinto backup create` (#1057)
  - `disinto backup import` (#1058)
 - [ ] Backup tarball produced and tested against a scratch LXC (see §3)
 ---
 ## 2. Pre-cutover artifact: backup
 On disinto-dev-box:
 ```bash
 ./bin/disinto backup create /tmp/disinto-backup-$(date +%Y%m%d).tar.gz
 ```
 Copy the tarball to nomad-box (and optionally to a local workstation for
 safekeeping):
 ```bash
 scp /tmp/disinto-backup-*.tar.gz nomad-box:/tmp/
 ```
 ---
 ## 3. Pre-cutover dry-run
 On a throwaway LXC:
 ```bash
 lxc launch ubuntu:24.04 cutover-dryrun
 # inside the container:
 disinto init --backend=nomad --import-env .env --with edge
 ./bin/disinto backup import /tmp/disinto-backup-*.tar.gz
 ```
 Verify:
 - Issue count matches source Forgejo
 - disinto-ops repo refs match source bundle
 Destroy the LXC once satisfied:
 ```bash
 lxc delete cutover-dryrun --force
 ```
 ---
 ## 4. Cutover T-0 (operator executes; <5 min target)
 ### 4.1 Stop dev-box services
 ```bash
 # On disinto-dev-box — stop, do NOT remove volumes (rollback needs them)
 docker-compose stop
 ```
 ### 4.2 Provision nomad-box (if not already done)
 ```bash
 # On disinto-nomad-box
 disinto init --backend=nomad --import-env .env --with edge
 ```
 ### 4.3 Import backup
 ```bash
 # On disinto-nomad-box
 ./bin/disinto backup import /tmp/disinto-backup-*.tar.gz
 ```
 ### 4.4 Configure Codeberg pull mirror
 Manual, one-time step in the new Forgejo UI:
 1. Create a mirror repository pointing at the Codeberg upstream
 2. Confirm initial sync completes
 ### 4.5 Claude login
 ```bash
 # On disinto-nomad-box
 claude login
 ```
 Set up Anthropic OAuth so agents can authenticate.
 ### 4.6 Autossh tunnel swap
 > **Operator step** — cross-host, no dev-agent involvement. Do NOT automate.
 1. Stop the tunnel on dev-box:
   ```bash
   # On disinto-dev-box
   systemctl stop reverse-tunnel
   ```
 2. Copy or regenerate the tunnel unit on nomad-box:
   ```bash
   # Copy from dev-box, or let init regenerate it
   scp dev-box:/etc/systemd/system/reverse-tunnel.service \
       nomad-box:/etc/systemd/system/
   ```
 3. Register nomad-box's public key on DO edge:
   ```bash
   # On DO edge box — same restricted-command as the dev-box key
   echo "<nomad-box-pubkey>" >> /home/johba/.ssh/authorized_keys
   ```
 4. Start the tunnel on nomad-box:
   ```bash
   # On disinto-nomad-box
   systemctl enable --now reverse-tunnel
   ```
 5. Verify end-to-end:
   ```bash
   curl https://self.disinto.ai/api/v1/version
   # Should return the new box's Forgejo version
   ```
 ---
 ## 5. Post-cutover smoke
 - [ ] `curl https://self.disinto.ai` → Forgejo welcome page
 - [ ] Create a test PR → Woodpecker pipeline runs → agents assign and work
 - [ ] Claude chat login via Forgejo OAuth succeeds
 ---
 ## 6. Rollback (if any step 4 gate fails)
 1. Stop the tunnel on nomad-box:
   ```bash
   systemctl stop reverse-tunnel   # on nomad-box
   ```
 2. Restore the tunnel on dev-box:
   ```bash
   systemctl start reverse-tunnel  # on dev-box
   ```
 3. Bring dev-box services back up:
   ```bash
   docker-compose up -d            # on dev-box
   ```
 4. DO Caddy config is unchanged — traffic restores in <5 min.
 5. File a post-mortem issue. Keep nomad-box state intact for debugging.
 ---
 ## 7. Post-stable cleanup (T+1 week)
 - `docker-compose down -v` on dev-box
 - Archive `/var/lib/docker/volumes/disinto_*` to cold storage
 - Delete disinto-dev-box LXC or keep as permanent rollback reserve (operator
  decision)
--- a/gardener/AGENTS.md
+++ b/gardener/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Gardener Agent
 **Role**: Backlog grooming — detect duplicate issues, missing acceptance
--- a/gardener/pending-actions.json
+++ b/gardener/pending-actions.json
@ -1,52 +1 @@
-[
+[]
  {
    "action": "edit_body",
    "issue": 1025,
    "body": "## Prior art: PR #1033 (open, branch `fix/issue-1025` retained)\n\nFirst attempt by dev-qwen2 (head `f692dd2`). Test script (`tests/smoke-edge-subpath.sh`, 13.8 KB) and pipeline (`.woodpecker/edge-subpath.yml`) both landed and look reasonable, but the **CI harness design is wrong**: the pipeline boots a bare `alpine:3.19` container and runs the smoke script directly against `BASE_URL=http://localhost`, with no stack to test against.\n\n**This is a harness design gap, not a script bug.** The smoke script itself is a reasonable post-deploy tool — the mistake was trying to exercise it as a hermetic CI step.\n\n**Approach (Option 1 — split the work):**\n\nKeep `tests/smoke-edge-subpath.sh` as an out-of-CI post-deploy tool (accepts `BASE_URL` env var). Replace the CI pipeline step that tries to curl a live stack with static checks only: `shellcheck`, `caddy validate` on the generated Caddyfile, and a template-substitution unit test that verifies routing block shape.\n\nBranch `fix/issue-1025` is preserved at `f692dd2` — the smoke script body is reusable; only the pipeline harness needs a rethink.\n\n**Timeline:**\n- 2026-04-19 09:14 — dev-qwen2 last pushed `f692dd2`\n- 3 pipelines (#1378/#1380/#1382) all fail: no service to curl (connection refused)\n\n## Acceptance criteria\n- [ ] `.woodpecker/edge-subpath.yml` pipeline runs `shellcheck` on `tests/smoke-edge-subpath.sh` with no live service curl\n- [ ] `caddy validate` runs on the generated Caddyfile in CI (template-substitution unit test)\n- [ ] A template-substitution test verifies the Caddyfile routing block shape (forge/ci/staging/chat paths)\n- [ ] `tests/smoke-edge-subpath.sh` accepts `BASE_URL` env var for post-deploy staging runs\n- [ ] CI green (no connection-refused failures on Woodpecker)\n\n## Affected files\n- `.woodpecker/edge-subpath.yml` — pipeline config (static checks only, no service curl)\n- `tests/smoke-edge-subpath.sh` — out-of-CI smoke script (reusable from PR #1033)\n\n## 
Dependencies\n- #1038 should land first to unblock local edge staging runs (optional — CI fix is independent)"
  },
  {
    "action": "remove_label",
    "issue": 1025,
    "label": "blocked"
  },
  {
    "action": "add_label",
    "issue": 1025,
    "label": "backlog"
  },
  {
    "action": "edit_body",
    "issue": 1038,
    "body": "## Problem\n\n`disinto-edge` crashloops on any deployment that has not opted into the age-encrypted secret store (#777), because the edge entrypoint treats four secrets as unconditionally required:\n\n```\nFATAL: age key (/home/agent/.config/sops/age/keys.txt) or secrets dir (/opt/disinto/secrets) not found — cannot load required secrets\n```\n\nObserved on `disinto-dev-box` (container `disinto-edge`, restarting every ~30s), which blocks PR #1033 (edge-subpath smoke test) and any other work that depends on a running edge.\n\n## Root cause\n\n`docker/edge/entrypoint-edge.sh:176-205` requires:\n\n- `~/.config/sops/age/keys.txt`\n- `/opt/disinto/secrets/` with `.enc` files for `CADDY_SSH_KEY`, `CADDY_SSH_HOST`, `CADDY_SSH_USER`, `CADDY_ACCESS_LOG`.\n\nThese four secrets feed exactly one feature: the daily 23:50 UTC `collect-engagement.sh` cron (#745), which SCPs Caddy access logs from a **remote production edge host** for engagement parsing. On a local factory box or any deployment that has not set up a remote edge, this code path has no target — yet its absence kills the whole edge container.\n\n## Fix\n\nMake the secrets block **optional**. When age key or secrets dir is missing, or any of the four CADDY_ secrets fail to decrypt, log a warning and skip the `collect-engagement` cron loop. 
Caddy itself does not depend on these secrets and should start normally.\n\nThe concrete edit is around lines 176-205 of `docker/edge/entrypoint-edge.sh` — guard the secret-loading block with a check for the age key and secrets dir, set `EDGE_ENGAGEMENT_READY=0` on failure, and skip cron registration when `EDGE_ENGAGEMENT_READY != 1`.\n\n## Acceptance criteria\n- [ ] `docker/edge/entrypoint-edge.sh` loads CADDY_ secrets optionally — missing age key or secrets dir logs a warning and continues, does not FATAL\n- [ ] Caddy starts normally when CADDY_ secrets are absent\n- [ ] `collect-engagement` cron is skipped (not registered) when engagement secrets are unavailable\n- [ ] On deployments WITH secrets configured, behavior is unchanged (collect-engagement cron still fires at 23:50 UTC)\n- [ ] CI green\n\n## Affected files\n- `docker/edge/entrypoint-edge.sh` — lines 176-205, secrets loading block made optional"
  },
  {
    "action": "remove_label",
    "issue": 1038,
    "label": "blocked"
  },
  {
    "action": "add_label",
    "issue": 1038,
    "label": "backlog"
  },
  {
    "action": "edit_body",
    "issue": 850,
    "body": "## Problem\n\nWhen the compose generator emits the same service name twice — e.g. both the legacy `ENABLE_LLAMA_AGENT=1` branch and a matching `[agents.llama]` TOML block produce an `agents-llama:` key — the failure is deferred all the way to `docker compose` YAML parsing:\n\n```\nfailed to parse /home/johba/disinto/docker-compose.yml: yaml: construct errors:\n  line 4: line 431: mapping key \"agents-llama\" already defined at line 155\n```\n\nBy then, the user has already paid the cost of: pre-build binary downloads, generator run, Caddyfile regeneration. The only hint about what went wrong is a line number in a generated file. Root cause (dual activation) is not surfaced.\n\n## Fix\n\nAdd a generate-time guard to `lib/generators.sh`:\n\n- After collecting all service blocks to emit, compare the set of service names against duplicates.\n- If a duplicate is detected, abort with a clear message naming both sources of truth (e.g. `\"agents-llama\" emitted twice — from ENABLE_LLAMA_AGENT=1 and from [agents.llama] in projects/disinto.toml; remove one`).\n\n## Prior art: PR #872 (closed, branch `fix/issue-850` retained)\n\ndev-qwen's first attempt (`db009e3`) landed the dup-detection logic in `lib/generators.sh` correctly (unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases), but the smoke test fails on CI.\n\n**Why the smoke test fails:** sections 1-7 of `smoke-init.sh` already run `bin/disinto init`, materializing `docker-compose.yml`. Section 8 re-invokes `bin/disinto init` to verify the dup guard fires — but `_generate_compose_impl` early-returns with `\"Compose: already exists, skipping\"` before reaching the dup-check.\n\n**Suggested fix:** in `tests/smoke-init.sh` section 8 (around line 452, before the second `bin/disinto init` invocation), add:\n\n```bash\nrm -f \"${FACTORY_ROOT}/docker-compose.yml\"\n```\n\nso the generator actually runs and the dup-detection path is exercised. 
Do **not** hoist the dup-check above the early-return.\n\nThe branch `fix/issue-850` is preserved as a starting point — pick up from `db009e3` and patch the smoke-test cleanup.\n\nRelated: #846.\n\n## Acceptance criteria\n- [ ] `bin/disinto init` with a config that would produce duplicate service names aborts with a clear error message naming both sources (e.g. `ENABLE_LLAMA_AGENT=1` and `[agents.llama]` TOML block)\n- [ ] `tests/smoke-init.sh` section 8 removes `docker-compose.yml` before re-invoking `disinto init` so the dup guard is exercised\n- [ ] Unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases\n- [ ] CI green (smoke-init.sh section 8 no longer skips dup detection)\n\n## Affected files\n- `lib/generators.sh` — duplicate service name check after collecting all service blocks\n- `tests/smoke-init.sh` — section 8: add `rm -f \\${FACTORY_ROOT}/docker-compose.yml` before second `disinto init`"
  },
  {
    "action": "remove_label",
    "issue": 850,
    "label": "blocked"
  },
  {
    "action": "add_label",
    "issue": 850,
    "label": "backlog"
  },
  {
    "action": "comment",
    "issue": 758,
    "body": "This issue is the critical path blocker for #820 (ops repo re-seed) and #982 (collect-engagement commit fix). Both are in the backlog and ready to merge, but cannot run until ops repo branch protection is resolved. Needs admin/human action to change Forgejo branch protection settings on disinto-ops — no code change can unblock this."
  }
 ]
--- a/lib/AGENTS.md
+++ b/lib/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 0bb04545d47fb43b2cab0a1f4406c2a2b57f4eba -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Shared Helpers (`lib/`)
 All agents source `lib/env.sh` as their first action. Additional helpers are
@ -7,7 +7,7 @@ sourced as needed.
 | File | What it provides | Sourced by |
 |---|---|---|
 | `lib/env.sh` | Loads `.env`, sets `FACTORY_ROOT`, exports project config (`FORGE_REPO`, `PROJECT_NAME`, etc.), defines `log()`, `forge_api()`, `forge_api_all()` (paginates all pages; accepts optional second TOKEN parameter, defaults to `$FORGE_TOKEN`; handles invalid/empty JSON responses gracefully — returns empty on parse error instead of crashing), `woodpecker_api()`, `wpdb()`, `memory_guard()` (skips agent if RAM < threshold), `load_secret()` (secret-source abstraction — see below). Auto-loads project TOML if `PROJECT_TOML` is set. Exports per-agent tokens (`FORGE_PLANNER_TOKEN`, `FORGE_GARDENER_TOKEN`, `FORGE_VAULT_TOKEN`, `FORGE_SUPERVISOR_TOKEN`, `FORGE_PREDICTOR_TOKEN`) — each falls back to `$FORGE_TOKEN` if not set. **Vault-only token guard (AD-006)**: `unset GITHUB_TOKEN CLAWHUB_TOKEN` so agents never hold external-action tokens — only the runner container receives them. **Container note**: when `DISINTO_CONTAINER=1`, `.env` is NOT re-sourced — compose already injects env vars (including `FORGE_URL=http://forgejo:3000`) and re-sourcing would clobber them. **Save/restore scope (#364)**: only `FORGE_URL` is preserved across `.env` re-sourcing (compose injects `http://forgejo:3000`, `.env` has `http://localhost:3000`). `FORGE_TOKEN` is NOT preserved so refreshed tokens in `.env` take effect immediately. **Per-agent token override (#762)**: agent run scripts export `FORGE_TOKEN_OVERRIDE=<agent-specific-token>` BEFORE sourcing `env.sh`; `env.sh` applies this override at lines 98-100, ensuring the correct identity survives any re-sourcing of `env.sh` by nested shells or `claude -p` invocations. **Required env var**: `FORGE_PASS` — bot password for git HTTP push (Forgejo 11.x rejects API tokens for `git push`, #361). **Hard preconditions (#674)**: `USER` and `HOME` must be exported by the entrypoint before sourcing. When `PROJECT_TOML` is set, `PROJECT_REPO_ROOT`, `PRIMARY_BRANCH`, and `OPS_REPO_ROOT` must also be set (by entrypoint or TOML). 
**`load_secret NAME [DEFAULT]` (#793)**: backend-agnostic secret resolution. Precedence: (1) `/secrets/<NAME>.env` — Nomad-rendered template, (2) current environment — already set by `.env.enc` / compose, (3) `secrets/<NAME>.enc` — age-encrypted per-key file (decrypted on demand, cached in process env), (4) DEFAULT or empty. Consumers call `$(load_secret GITHUB_TOKEN)` instead of `${GITHUB_TOKEN}` — identical behavior whether secrets come from Docker compose injection or Nomad Vault templates. | Every agent |
-| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \<reason>" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status <sha>` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number <sha>` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote <repo_id> <pipeline_num> <environment>` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). `ci_get_logs <pipeline_number> [--step <name>]` — reads CI logs from Woodpecker SQLite database via `lib/ci-log-reader.py`; outputs last 200 lines to stdout. Requires mounted woodpecker-data volume at /woodpecker-data. | dev-poll, review-poll, review-pr |
+| `lib/ci-helpers.sh` | `ci_passed()` — returns 0 if CI state is "success" (or no CI configured). `ci_required_for_pr()` — returns 0 if PR has code files (CI required), 1 if non-code only (CI not required). `is_infra_step()` — returns 0 if a single CI step failure matches infra heuristics (clone/git exit 128, any exit 137, log timeout patterns). `classify_pipeline_failure()` — returns "infra \<reason>" if any failed Woodpecker step matches infra heuristics via `is_infra_step()`, else "code". `ensure_priority_label()` — looks up (or creates) the `priority` label and returns its ID; caches in `_PRIORITY_LABEL_ID`. `ci_commit_status <sha>` — queries Woodpecker directly for CI state, falls back to forge commit status API. `ci_pipeline_number <sha>` — returns the Woodpecker pipeline number for a commit, falls back to parsing forge status `target_url`. `ci_promote <repo_id> <pipeline_num> <environment>` — promotes a pipeline to a named Woodpecker environment (vault-gated deployment: vault approves, vault-fire calls this — vault redesign in progress, see #73-#77). `ci_get_logs <pipeline_number> [--step <name>]` — reads CI logs from Woodpecker SQLite database via `lib/ci-log-reader.py`; outputs last 200 lines to stdout. Requires mounted woodpecker-data volume at /woodpecker-data. `ci_get_step_logs <pipeline_num> <step_id>` — fetches per-step logs via Woodpecker REST API (`/repos/{id}/logs/{pipeline}/{step_id}`); returns raw log data for a single step. Used by `pr_poll_ci()` to build per-workflow/per-step CI diagnostics (#1051). | dev-poll, review-poll, review-pr |
 | `lib/ci-debug.sh` | CLI tool for Woodpecker CI: `list`, `status`, `logs`, `failures` subcommands. Not sourced — run directly. | Humans / dev-agent (tool access) |
 | `lib/ci-log-reader.py` | Python tool: reads CI logs from Woodpecker SQLite database. `<pipeline_number> [--step <name>]` — returns last 200 lines from failed steps (or specified step). Used by `ci_get_logs()` in ci-helpers.sh. Requires `WOODPECKER_DATA_DIR` (default: /woodpecker-data). | ci-helpers.sh |
 | `lib/load-project.sh` | Parses a `projects/*.toml` file into env vars (`PROJECT_NAME`, `FORGE_REPO`, `WOODPECKER_REPO_ID`, monitoring toggles, mirror config, etc.). Also exports `FORGE_REPO_OWNER` (the owner component of `FORGE_REPO`, e.g. `disinto-admin` from `disinto-admin/disinto`). Reads `repo_root` and `ops_repo_root` from the TOML for host-CLI callers. **Container path handling (#674)**: no longer derives `PROJECT_REPO_ROOT` or `OPS_REPO_ROOT` inside the script — container entrypoints export the correct paths before agent scripts source `env.sh`, and the `DISINTO_CONTAINER` guard (line 90) skips TOML overrides when those vars are already set. | env.sh (when `PROJECT_TOML` is set) |
@ -20,7 +20,7 @@ sourced as needed.
 | `lib/stack-lock.sh` | File-based lock protocol for singleton project stack access. `stack_lock_acquire(holder, project)` — polls until free, breaks stale heartbeats (>10 min old), claims lock. `stack_lock_release(project)` — deletes lock file. `stack_lock_check(project)` — inspect current lock state. `stack_lock_heartbeat(project)` — update heartbeat timestamp (callers must call every 2 min while holding). Lock files at `~/data/locks/<project>-stack.lock`. | docker/edge/dispatcher.sh, reproduce formula |
 | `lib/tea-helpers.sh` | `tea_file_issue(title, body, labels...)` — create issue via tea CLI with secret scanning; sets `FILED_ISSUE_NUM`. `tea_relabel(issue_num, labels...)` — replace labels using tea's `edit` subcommand (not `label`). `tea_comment(issue_num, body)` — add comment with secret scanning. `tea_close(issue_num)` — close issue. All use `TEA_LOGIN` and `FORGE_REPO` from env.sh. Labels by name (no ID lookup). Tea binary download verified via sha256 checksum. Sourced by env.sh when `tea` binary is available. | env.sh (conditional) |
 | `lib/worktree.sh` | Reusable git worktree management: `worktree_create(path, branch, [base_ref])` — create worktree, checkout base, fetch submodules. `worktree_recover(path, branch, [remote])` — detect existing worktree, reuse if on correct branch (sets `_WORKTREE_REUSED`), otherwise clean and recreate. `worktree_cleanup(path)` — `git worktree remove --force`, clear Claude Code project cache (`~/.claude/projects/` matching path). `worktree_cleanup_stale([max_age_hours])` — scan `/tmp` for orphaned worktrees older than threshold, skip preserved and active tmux worktrees, prune. `worktree_preserve(path, reason)` — mark worktree as preserved for debugging (writes `.worktree-preserved` marker, skipped by stale cleanup). | dev-agent.sh, supervisor-run.sh, planner-run.sh, predictor-run.sh, gardener-run.sh |
-| `lib/pr-lifecycle.sh` | Reusable PR lifecycle library: `pr_create()`, `pr_find_by_branch()`, `pr_poll_ci()`, `pr_poll_review()`, `pr_merge()`, `pr_is_merged()`, `pr_walk_to_merge()`, `build_phase_protocol_prompt()`. Requires `lib/ci-helpers.sh`. | dev-agent.sh (future) |
+| `lib/pr-lifecycle.sh` | Reusable PR lifecycle library: `pr_create()`, `pr_find_by_branch()`, `pr_poll_ci()`, `pr_poll_review()`, `pr_merge()`, `pr_is_merged()`, `pr_walk_to_merge()`, `build_phase_protocol_prompt()`. `pr_poll_ci()` builds a **per-workflow/per-step CI diagnostics prompt** (#1051): on failure, each failed workflow gets its own section with step name, exit code (annotated with standard meanings for 126/127/128), and step-local log tail (via `ci_get_step_logs`); passing workflows are listed explicitly so agents don't waste fix attempts on them. Falls back to legacy combined-log fetch if per-step API is unavailable. Requires `lib/ci-helpers.sh`. | dev-agent.sh (future) |
 | `lib/issue-lifecycle.sh` | Reusable issue lifecycle library: `issue_claim()` (add in-progress, remove backlog), `issue_release()` (remove in-progress, add backlog), `issue_block()` (post diagnostic comment with secret redaction, add blocked label), `issue_close()`, `issue_check_deps()` (parse deps, check transitive closure; sets `_ISSUE_BLOCKED_BY`, `_ISSUE_SUGGESTION`), `issue_suggest_next()` (find next unblocked backlog issue; sets `_ISSUE_NEXT`), `issue_post_refusal()` (structured refusal comment with dedup). Label IDs cached in globals on first lookup. Sources `lib/secret-scan.sh`. | dev-agent.sh (future) |
 | `lib/action-vault.sh` | **Vault PR helper** — create vault action PRs on ops repo via Forgejo API (works from containers without SSH). `vault_request <action_id> <toml_content>` validates TOML (using `validate_vault_action` from `action-vault/vault-env.sh`), creates branch `vault/<action-id>`, writes `vault/actions/<action-id>.toml`, creates PR targeting `main` with title `vault: <action-id>` and body from context field, returns PR number. Idempotent: if PR exists, returns existing number. **Low-tier bypass**: if the action's `blast_radius` classifies as `low` (via `action-vault/classify.sh`), `vault_request` calls `_vault_commit_direct()` which commits directly to ops `main` using `FORGE_ADMIN_TOKEN` — no PR, no approval wait. Returns `0` (not a PR number) for direct commits. Requires `FORGE_TOKEN`, `FORGE_ADMIN_TOKEN` (low-tier only), `FORGE_URL`, `FORGE_REPO`, `FORGE_OPS_REPO`. Uses the calling agent's own token (saves/restores `FORGE_TOKEN` around sourcing `vault-env.sh`), so approval workflow respects individual agent identities. | dev-agent (vault actions), future vault dispatcher |
 | `lib/branch-protection.sh` | Branch protection helpers for Forgejo repos. `setup_vault_branch_protection()` — configures admin-only merge protection on main (require 1 approval, restrict merge to admin role, block direct pushes). `setup_profile_branch_protection()` — same protection for `.profile` repos. `verify_branch_protection()` — checks protection is correctly configured. `remove_branch_protection()` — removes protection (cleanup/testing). Handles race condition after initial push: retries with backoff if Forgejo hasn't processed the branch yet. Requires `FORGE_TOKEN`, `FORGE_URL`, `FORGE_OPS_REPO`. | bin/disinto (hire-an-agent) |
@ -30,7 +30,9 @@ sourced as needed.
 | `lib/git-creds.sh` | Shared git credential helper configuration. `configure_git_creds([HOME_DIR] [RUN_AS_CMD])` — writes a static credential helper script and configures git globally to use password-based HTTP auth (Forgejo 11.x rejects API tokens for `git push`, #361). **Retry on cold boot (#741)**: resolves bot username from `FORGE_TOKEN` with 5 retries (exponential backoff 1-5s); fails loudly and returns 1 if Forgejo is unreachable — never falls back to a wrong hardcoded default (exports `BOT_USER` on success). `repair_baked_cred_urls([--as RUN_AS_CMD] DIR ...)` — rewrites any git remote URLs that have credentials baked in to use clean URLs instead; uses `safe.directory` bypass for root-owned repos (#671). Requires `FORGE_PASS`, `FORGE_URL`, `FORGE_TOKEN`. | entrypoints (agents, edge) |
 | `lib/ops-setup.sh` | `setup_ops_repo()` — creates ops repo on Forgejo if it doesn't exist, configures bot collaborators, clones/initializes ops repo locally, seeds directory structure (vault, knowledge, evidence, sprints). Evidence subdirectories seeded: engagement/, red-team/, holdout/, evolution/, user-test/. Also seeds sprints/ for architect output. Exports `_ACTUAL_OPS_SLUG`. `migrate_ops_repo(ops_root, [primary_branch])` — idempotent migration helper that seeds missing directories and .gitkeep files on existing ops repos (pre-#407 deployments). | bin/disinto (init) |
 | `lib/ci-setup.sh` | `_install_cron_impl()` — installs crontab entries for bare-metal deployments (compose mode uses polling loop instead). `_create_forgejo_oauth_app()` — generic helper to create an OAuth2 app on Forgejo (shared by Woodpecker and chat). `_create_woodpecker_oauth_impl()` — creates Woodpecker OAuth2 app (thin wrapper). `_create_chat_oauth_impl()` — creates disinto-chat OAuth2 app, writes `CHAT_OAUTH_CLIENT_ID`/`CHAT_OAUTH_CLIENT_SECRET` to `.env` (#708). `_generate_woodpecker_token_impl()` — auto-generates WOODPECKER_TOKEN via OAuth2 flow. `_activate_woodpecker_repo_impl()` — activates repo in Woodpecker. All gated by `_load_ci_context()` which validates required env vars. | bin/disinto (init) |
-| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; `CLAUDE_BIN_DIR` volume mount removed from agents/llama services — only `reproduce` and `edge` still use the host-mounted CLI (#992); adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, 
`generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
+| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (**duplicate service detection**: tracks service names during generation, aborts with `ERROR: Duplicate service name '$name' detected` on conflict; detection state is reset between calls so idempotent reinvocation is safe, #850) (uses `codeberg.org/forgejo/forgejo:11.0` tag; `CLAUDE_BIN_DIR` volume mount removed from agents/llama services — only `reproduce` and `edge` still use the host-mounted CLI (#992); adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so 
unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
 | `lib/backup.sh` | Factory backup creation. `backup_create <outfile.tar.gz>` — exports factory state: fetches all issues (open+closed) from the project and ops repos via Forgejo API, bundles the ops repo as a git bundle, and writes a tarball. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_REPO`, `FORGE_OPS_REPO`, `OPS_REPO_ROOT`. Sourced by `bin/disinto backup create` (#1057). | bin/disinto (backup create) |
 | `lib/disinto/backup.sh` | Factory backup restore. `backup_import <infile.tar.gz>` — restores from a backup tarball: creates missing repos via Forgejo API, imports issues (idempotent — skips by number if present), unpacks ops repo git bundle. Idempotent: running twice produces same end state with no errors. Requires `FORGE_URL`, `FORGE_TOKEN`. Sourced by `bin/disinto backup import` (#1058). | bin/disinto (backup import) |
 | `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses `<!-- filer:begin --> ... <!-- filer:end -->` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `<!-- decomposed-from: #<vision>, sprint: <slug>, id: <id> -->` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) |
 | `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) |
 | `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) |
--- a/lib/agent-sdk.sh
+++ b/lib/agent-sdk.sh
@ -52,8 +52,9 @@ claude_run_with_watchdog() {
  out_file=$(mktemp) || return 1
  trap 'rm -f "$out_file"' RETURN
-  # Start claude in background, capturing stdout to temp file
+  # Start claude in new process group (setsid creates new session, $pid is PGID leader)
-  "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
+  # All children of claude will inherit this process group
  setsid "${cmd[@]}" > "$out_file" 2>>"$LOGFILE" &
  pid=$!
  # Background watchdog: poll for final result marker
@ -84,12 +85,12 @@ claude_run_with_watchdog() {
      sleep "$grace"
      if kill -0 "$pid" 2>/dev/null; then
        log "watchdog: claude -p idle for ${grace}s after final result; SIGTERM"
-        kill -TERM "$pid" 2>/dev/null || true
+        kill -TERM -- "-$pid" 2>/dev/null || true
        # Give it a moment to clean up
        sleep 5
        if kill -0 "$pid" 2>/dev/null; then
          log "watchdog: force kill after SIGTERM timeout"
-          kill -KILL "$pid" 2>/dev/null || true
+          kill -KILL -- "-$pid" 2>/dev/null || true
        fi
      fi
    fi
@ -100,16 +101,16 @@ claude_run_with_watchdog() {
  timeout --foreground "${CLAUDE_TIMEOUT:-7200}" tail --pid="$pid" -f /dev/null 2>/dev/null
  rc=$?
-  # Clean up the watchdog
+  # Clean up the watchdog (target process group if it spawned children)
-  kill "$grace_pid" 2>/dev/null || true
+  kill -- "-$grace_pid" 2>/dev/null || true
  wait "$grace_pid" 2>/dev/null || true
-  # When timeout fires (rc=124), explicitly kill the orphaned claude process
+  # When timeout fires (rc=124), explicitly kill the orphaned claude process group
  # tail --pid is a passive waiter, not a supervisor
  if [ "$rc" -eq 124 ]; then
-    kill "$pid" 2>/dev/null || true
+    kill -TERM -- "-$pid" 2>/dev/null || true
    sleep 1
-    kill -KILL "$pid" 2>/dev/null || true
+    kill -KILL -- "-$pid" 2>/dev/null || true
  fi
  # Output the captured stdout
--- a/lib/backup.sh
+++ b/lib/backup.sh
@ -0,0 +1,136 @@
 #!/usr/bin/env bash
 # =============================================================================
 # disinto backup — export factory state for migration
 #
 # Usage: source this file, then call backup_create <outfile.tar.gz>
 # Requires: FORGE_URL, FORGE_TOKEN, FORGE_REPO, FORGE_OPS_REPO, OPS_REPO_ROOT
 # =============================================================================
 set -euo pipefail
 # Fetch all issues (open + closed) for a repo slug and emit the normalized JSON array.
 # Usage:    _backup_fetch_issues <org/repo>
 # Requires: FORGE_TOKEN, and FORGE_API_BASE or FORGE_URL (the API base falls
 #           back to "${FORGE_URL}/api/v1" when FORGE_API_BASE is unset).
 # Output:   JSON array of {number, title, body, labels, state} sorted by number.
 # Returns:  0 on success, 1 when a page cannot be fetched or is not a JSON array.
 _backup_fetch_issues() {
  local repo_slug="$1"
  local api_base="${FORGE_API_BASE:-${FORGE_URL:?FORGE_API_BASE or FORGE_URL must be set}/api/v1}"
  local api_url="${api_base}/repos/${repo_slug}"
  local page_size=50
  local all_issues="[]"
  local state page page_items count
  for state in open closed; do
    page=1
    while true; do
      page_items=$(curl -sf -X GET \
        -H "Authorization: token ${FORGE_TOKEN}" \
        -H "Content-Type: application/json" \
        "${api_url}/issues?state=${state}&type=issues&limit=${page_size}&page=${page}") || {
        echo "ERROR: failed to fetch ${state} issues from ${repo_slug} (page ${page})" >&2
        return 1
      }
      # A backup must not silently shrink: anything that is not a JSON array
      # (HTML error page, truncated body, ...) is a hard failure.
      count=$(printf '%s' "$page_items" \
        | jq 'if type == "array" then length else error("not an array") end' 2>/dev/null) || {
        echo "ERROR: unexpected response for ${state} issues from ${repo_slug} (page ${page})" >&2
        return 1
      }
      # Stop only on an empty page. The server may cap the page size below
      # the requested limit, so a short page does not prove it was the last.
      case "$count" in
        ''|0) break ;;
      esac
      all_issues=$(printf '%s\n%s' "$all_issues" "$page_items" | jq -s 'add')
      page=$((page + 1))
    done
  done
  # Normalize to the schema: number, title, body, labels, state
  printf '%s' "$all_issues" | jq '[.[] | {
    number: .number,
    title: .title,
    body: .body,
    labels: [.labels[]?.name],
    state: .state
  }] | sort_by(.number)'
 }
 # Create a backup tarball of factory state.
 # Usage: backup_create <outfile.tar.gz>
 backup_create() {
  # Builds <outfile.tar.gz> containing metadata.json, issues/ (one JSON file
  # for the main repo and one for the ops repo) and repos/ (a git bundle of
  # the ops repo checkout).
  # Requires FORGE_URL, FORGE_TOKEN, FORGE_REPO and OPS_REPO_ROOT;
  # FORGE_OPS_REPO defaults to "${FORGE_REPO}-ops".
  # Returns 1 on a usage error or an invalid OPS_REPO_ROOT.
  local outfile="${1:-}"
  if [ -z "$outfile" ]; then
    echo "Error: output file required" >&2
    echo "Usage: disinto backup create <outfile.tar.gz>" >&2
    return 1
  fi
  # Resolve to an absolute path so the tarball location does not depend on
  # the working directory later on
  case "$outfile" in
    /*) ;;
    *) outfile="$(pwd)/${outfile}" ;;
  esac
  # Validate required env
  : "${FORGE_URL:?FORGE_URL must be set}"
  : "${FORGE_TOKEN:?FORGE_TOKEN must be set}"
  : "${FORGE_REPO:?FORGE_REPO must be set}"
  local forge_ops_repo="${FORGE_OPS_REPO:-${FORGE_REPO}-ops}"
  local ops_repo_root="${OPS_REPO_ROOT:-}"
  if [ -z "$ops_repo_root" ] || [ ! -d "$ops_repo_root/.git" ]; then
    echo "Error: OPS_REPO_ROOT (${ops_repo_root:-<unset>}) is not a valid git repo" >&2
    return 1
  fi
  local tmpdir
  tmpdir=$(mktemp -d)
  # This file is sourced, so the EXIT trap belongs to the calling script.
  # Remember whatever it had installed so it can be put back afterwards
  # instead of being wiped by the `trap - EXIT` below.
  local prev_exit_trap
  prev_exit_trap=$(trap -p EXIT)
  trap 'rm -rf "$tmpdir"' EXIT
  local project_name="${FORGE_REPO##*/}"
  echo "=== disinto backup create ==="
  echo "Forge: ${FORGE_URL}"
  echo "Repos: ${FORGE_REPO}, ${forge_ops_repo}"
  # ── 1. Export issues ──────────────────────────────────────────────────────
  mkdir -p "${tmpdir}/issues"
  echo "Fetching issues for ${FORGE_REPO}..."
  _backup_fetch_issues "$FORGE_REPO" > "${tmpdir}/issues/${project_name}.json"
  local main_count
  main_count=$(jq 'length' "${tmpdir}/issues/${project_name}.json")
  echo "  ${main_count} issues exported"
  echo "Fetching issues for ${forge_ops_repo}..."
  _backup_fetch_issues "$forge_ops_repo" > "${tmpdir}/issues/${project_name}-ops.json"
  local ops_count
  ops_count=$(jq 'length' "${tmpdir}/issues/${project_name}-ops.json")
  echo "  ${ops_count} issues exported"
  # ── 2. Git bundle of ops repo ────────────────────────────────────────────
  mkdir -p "${tmpdir}/repos"
  echo "Creating git bundle for ${forge_ops_repo}..."
  git -C "$ops_repo_root" bundle create "${tmpdir}/repos/${project_name}-ops.bundle" --all 2>&1
  echo "  bundle created ($(du -h "${tmpdir}/repos/${project_name}-ops.bundle" | cut -f1))"
  # ── 3. Metadata ──────────────────────────────────────────────────────────
  local created_at
  created_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  jq -n \
    --arg created_at "$created_at" \
    --arg source_host "$(hostname)" \
    --argjson schema_version 1 \
    --arg forgejo_url "$FORGE_URL" \
    '{
      created_at: $created_at,
      source_host: $source_host,
      schema_version: $schema_version,
      forgejo_url: $forgejo_url
    }' > "${tmpdir}/metadata.json"
  # ── 4. Pack tarball ──────────────────────────────────────────────────────
  echo "Creating tarball: ${outfile}"
  tar -czf "$outfile" -C "$tmpdir" metadata.json issues repos
  local size
  size=$(du -h "$outfile" | cut -f1)
  echo "=== Backup complete: ${outfile} (${size}) ==="
  # Clean up before returning — the EXIT trap references the local $tmpdir
  # which goes out of scope after return, causing 'unbound variable' under set -u.
  trap - EXIT
  # Reinstate the caller's EXIT trap (trap -p prints it in re-usable form).
  if [ -n "$prev_exit_trap" ]; then
    eval "$prev_exit_trap"
  fi
  rm -rf "$tmpdir"
 }
--- a/lib/ci-helpers.sh
+++ b/lib/ci-helpers.sh
@ -247,6 +247,31 @@ ci_promote() {
  echo "$new_num"
 }
 # ci_get_step_logs <pipeline_num> <step_id>
 # Fetches logs for a single CI step via the Woodpecker API.
 # Requires: WOODPECKER_REPO_ID, woodpecker_api() (from env.sh)
 # Returns: 0 on success, 1 on failure. Outputs log text to stdout.
 #
 # Usage:
 #   ci_get_step_logs 1423 5    # Get logs for step ID 5 in pipeline 1423
 ci_get_step_logs() {
  # Default both arguments to empty so a missing one reaches the usage
  # message instead of aborting with "unbound variable" under `set -u`.
  local pipeline_num="${1:-}" step_id="${2:-}"
  if [ -z "$pipeline_num" ] || [ -z "$step_id" ]; then
    echo "Usage: ci_get_step_logs <pipeline_num> <step_id>" >&2
    return 1
  fi
  if [ -z "${WOODPECKER_REPO_ID:-}" ] || [ "${WOODPECKER_REPO_ID}" = "0" ]; then
    echo "ERROR: WOODPECKER_REPO_ID not set or zero" >&2
    return 1
  fi
  # Fetch first, then parse: in a plain pipeline the exit status is jq's, so
  # a failed API call would look like "success with no logs" unless the
  # caller happens to run with pipefail.
  local raw_logs
  raw_logs=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/logs/${pipeline_num}/${step_id}" \
    --max-time 15 2>/dev/null) || return 1
  # Each entry's .data field is one chunk of log text; entries without it are skipped.
  printf '%s' "$raw_logs" | jq -r '.[].data // empty' 2>/dev/null || return 1
 }
 # ci_get_logs <pipeline_number> [--step <step_name>]
 # Reads CI logs from the Woodpecker SQLite database.
 # Requires: WOODPECKER_DATA_DIR env var or mounted volume at /woodpecker-data
--- a/lib/disinto/backup.sh
+++ b/lib/disinto/backup.sh
@ -0,0 +1,391 @@
 #!/usr/bin/env bash
 # =============================================================================
 # backup.sh — backup/restore utilities for disinto factory state
 #
 # Subcommands:
 #   create <outfile.tar.gz>  Create backup of factory state
 #   import <infile.tar.gz>   Restore factory state from backup
 #
 # Usage:
 #   source "${FACTORY_ROOT}/lib/disinto/backup.sh"
 #   backup_import <tarball>
 #
 # Environment:
 #   FORGE_URL    - Forgejo instance URL (target)
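 #   FORGE_TOKEN  - Admin token for target Forgejo
 #   FORGE_REPO / FORGE_OPS_REPO - target repo slugs that backup_import uses for
 #                  the disinto.json / disinto-ops.json issue files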
 #
 # Idempotency:
 #   - Repos: created via API if missing
 #   - Issues: check if exists by number, skip if present
 #   - Runs twice = same end state, no errors
 # =============================================================================
 set -euo pipefail
 # ── Helper: log with timestamp ───────────────────────────────────────────────
 backup_log() {
  # Emit "[YYYY-MM-DD HH:MM:SS] <message>" on stdout using the local clock.
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1"
 }
 # ── Helper: create repo if it doesn't exist ─────────────────────────────────
 # Usage: backup_create_repo_if_missing <slug>
 # Returns: 0 if repo exists or was created, 1 on error
 # Side effect: increments BACKUP_CREATED_REPOS when a repo is created.
 backup_create_repo_if_missing() {
  local slug="$1"
  local org_name="${slug%%/*}"
  local repo_name="${slug##*/}"
  local api="${FORGE_URL}/api/v1"
  # Check if repo exists
  if curl -sf --max-time 5 \
    -H "Authorization: token ${FORGE_TOKEN}" \
    "${api}/repos/${slug}" >/dev/null 2>&1; then
    backup_log "Repo ${slug} already exists"
    return 0
  fi
  backup_log "Creating repo ${slug}..."
  # Build the request bodies with jq so names containing quotes or
  # backslashes cannot produce malformed JSON.
  local org_payload repo_payload
  org_payload=$(jq -cn --arg username "$org_name" \
    '{username: $username, visibility: "public"}')
  repo_payload=$(jq -cn --arg name "$repo_name" \
    '{name: $name, auto_init: false, default_branch: "main"}')
  # Create org if needed (best effort: failure is ignored, e.g. when the
  # org is already there)
  curl -sf -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
    "${api}/orgs" \
    -d "$org_payload" >/dev/null 2>&1 || true
  # Try the org endpoint first, then fall back to the admin endpoint.
  local endpoint response
  for endpoint in "orgs/${org_name}/repos" "admin/users/${org_name}/repos"; do
    response=$(curl -sf -X POST \
      -H "Authorization: token ${FORGE_TOKEN}" \
      -H "Content-Type: application/json" \
      "${api}/${endpoint}" \
      -d "$repo_payload" 2>/dev/null) \
      || response=""
    # Success means the response is a JSON object carrying an id. A plain
    # "contains a digit" test would accept almost any body.
    if [ -n "$response" ] \
      && printf '%s' "$response" | jq -e '.id != null' >/dev/null 2>&1; then
      case "$endpoint" in
        admin/*) backup_log "Created repo ${slug} (via admin API)" ;;
        *) backup_log "Created repo ${slug}" ;;
      esac
      # Default to 0 so the helper also works outside backup_import under set -u.
      BACKUP_CREATED_REPOS=$(( ${BACKUP_CREATED_REPOS:-0} + 1 ))
      return 0
    fi
  done
  backup_log "ERROR: failed to create repo ${slug}" >&2
  return 1
 }
 # ── Helper: check if issue exists by number ──────────────────────────────────
 # Usage: backup_issue_exists <slug> <issue_number>
 # Returns: 0 if exists, 1 if not
 backup_issue_exists() {
  local slug="$1"
  local issue_num="$2"
  local issue_url="${FORGE_URL}/api/v1/repos/${slug}/issues/${issue_num}"
  # curl -f turns an HTTP error status into a non-zero exit code, which
  # becomes this function's return status; the response body is discarded.
  curl -sf --max-time 5 \
    -H "Authorization: token ${FORGE_TOKEN}" \
    "$issue_url" >/dev/null 2>&1
 }
 # ── Helper: create issue (Forgejo assigns the number) ────────────────────────
 # Note: Forgejo API auto-assigns next integer; we accept renumbering and log mapping
 # Usage:   backup_create_issue <slug> <original_number> <title> <body> [labels...]
 # Output:  the new issue number on stdout (log lines go to stderr so that
 #          $(backup_create_issue ...) captures only the number)
 # Returns: 0 on success, 1 on failure
 # Side effect: appends "<original>:<new>" to BACKUP_MAPPING_FILE when it is set.
 backup_create_issue() {
  local slug="$1"
  local original_num="$2"
  local title="$3"
  local body="$4"
  shift 4
  local api="${FORGE_URL}/api/v1/repos/${slug}"
  # Resolve the requested label names to numeric IDs with a single request.
  # Names the target repo does not have are dropped.
  local label_ids="[]"
  if [ $# -gt 0 ]; then
    local repo_labels wanted
    repo_labels=$(curl -sf --max-time 5 \
      -H "Authorization: token ${FORGE_TOKEN}" \
      "${api}/labels" 2>/dev/null) || repo_labels="[]"
    wanted=$(printf '%s\n' "$@" | jq -R . | jq -cs .)
    # Names are passed as data (--argjson), never spliced into the filter,
    # and .id stays a JSON number as the create-issue payload requires.
    label_ids=$(printf '%s' "$repo_labels" | jq -c --argjson wanted "$wanted" \
      '[.[] | select(.name as $n | any($wanted[]; . == $n)) | .id]' 2>/dev/null) \
      || label_ids="[]"
    [ -n "$label_ids" ] || label_ids="[]"
  fi
  # Build payload
  local payload
  payload=$(jq -cn \
    --arg title "$title" \
    --arg body "$body" \
    --argjson labels "$label_ids" \
    '{title: $title, body: $body, labels: $labels}')
  local response
  response=$(curl -sf -X POST \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
    "${api}/issues" \
    -d "$payload" 2>/dev/null) || {
    backup_log "ERROR: failed to create issue '${title}'" >&2
    return 1
  }
  local new_num
  new_num=$(printf '%s' "$response" | jq -r '.number // empty' 2>/dev/null) || new_num=""
  if [ -z "$new_num" ]; then
    backup_log "ERROR: no issue number in response for issue '${title}'" >&2
    return 1
  fi
  # Log the mapping
  if [ -n "${BACKUP_MAPPING_FILE:-}" ]; then
    echo "${original_num}:${new_num}" >> "${BACKUP_MAPPING_FILE}"
  fi
  backup_log "Created issue '${title}' as #${new_num} (original: #${original_num})" >&2
  echo "$new_num"
 }
 # ── Step 1: Unpack tarball to temp dir ───────────────────────────────────────
 # Usage: backup_unpack_tarball <tarball>
 # Returns: temp dir path via BACKUP_TEMP_DIR
 backup_unpack_tarball() {
  # Extracts <tarball> into a fresh temp dir published as BACKUP_TEMP_DIR.
  # On any failure after the directory is created it is removed again and
  # the function returns 1.
  local tarball="$1"
  [ -f "$tarball" ] || {
    backup_log "ERROR: tarball not found: ${tarball}" >&2
    return 1
  }
  BACKUP_TEMP_DIR=$(mktemp -d -t disinto-backup.XXXXXX)
  backup_log "Unpacking ${tarball} to ${BACKUP_TEMP_DIR}"
  # Record the first failure reason so both error paths share one cleanup.
  local failure=""
  if ! tar -xzf "$tarball" -C "$BACKUP_TEMP_DIR"; then
    failure="failed to unpack tarball"
  elif [ ! -d "${BACKUP_TEMP_DIR}/repos" ]; then
    # Verify expected structure
    failure="tarball missing 'repos/' directory"
  fi
  if [ -n "$failure" ]; then
    backup_log "ERROR: ${failure}" >&2
    rm -rf "$BACKUP_TEMP_DIR"
    return 1
  fi
  backup_log "Tarball unpacked successfully"
 }
 # ── Step 2: disinto repo — create via Forgejo API, trigger sync (manual) ─────
 # Usage: backup_import_disinto_repo
 # Returns: 0 on success, 1 on failure
 backup_import_disinto_repo() {
  backup_log "Step 2: Configuring disinto repo..."
  # Create disinto repo if missing. The status is checked explicitly: this
  # function is called from an `if !` condition, where `set -e` is
  # suspended, so an unchecked failure would be reported as success.
  if ! backup_create_repo_if_missing "disinto-admin/disinto"; then
    return 1
  fi
  # Note: Manual mirror configuration recommended (avoids SSH deploy-key handling)
  backup_log "Note: Configure Codeberg → Forgejo pull mirror manually"
  backup_log "  Run on Forgejo admin panel: Repository Settings → Repository Mirroring"
  backup_log "  Source: ssh://git@codeberg.org/johba/disinto.git"
  backup_log "  Mirror: disinto-admin/disinto"
  backup_log "  Or use: git clone --mirror ssh://git@codeberg.org/johba/disinto.git"
  backup_log "          cd disinto.git && git push --mirror ${FORGE_URL}/disinto-admin/disinto.git"
  return 0
 }
 # ── Step 3: disinto-ops repo — create empty, push from bundle ────────────────
 # Usage: backup_import_disinto_ops_repo
 # Returns: 0 on success, 1 on failure
 backup_import_disinto_ops_repo() {
  # Restores the ops repo: bare-clones the bundle from the unpacked backup
  # and mirror-pushes every ref to disinto-admin/disinto-ops on the target.
  # A missing bundle is a warning, not an error.
  # Side effect: adds the number of pushed refs to BACKUP_PUSHED_REFS.
  backup_log "Step 3: Configuring disinto-ops repo from bundle..."
  local bundle_path="${BACKUP_TEMP_DIR}/repos/disinto-ops.bundle"
  if [ ! -f "$bundle_path" ]; then
    backup_log "WARNING: Bundle not found at ${bundle_path}, skipping"
    return 0
  fi
  # Create ops repo if missing (nothing to push to otherwise)
  if ! backup_create_repo_if_missing "disinto-admin/disinto-ops"; then
    return 1
  fi
  # Clone bundle and push to Forgejo
  local clone_dir
  clone_dir=$(mktemp -d -t disinto-ops-clone.XXXXXX)
  local bare_repo="${clone_dir}/disinto-ops.git"
  backup_log "Cloning bundle to ${clone_dir}"
  if ! git clone --bare "$bundle_path" "$bare_repo"; then
    backup_log "ERROR: failed to clone bundle" >&2
    rm -rf "$clone_dir"
    return 1
  fi
  # Push all refs to Forgejo.
  # `git -C` is used instead of `cd … && git push`: in `if ! cd X && git push`
  # the `!` binds to `cd` alone, so a successful cd short-circuits the list
  # and the push is skipped while the step still reports success. It also
  # avoids changing the caller's working directory.
  backup_log "Pushing refs to Forgejo..."
  if ! git -C "$bare_repo" push --mirror "${FORGE_URL}/disinto-admin/disinto-ops.git" 2>&1; then
    backup_log "ERROR: failed to push refs" >&2
    rm -rf "$clone_dir"
    return 1
  fi
  local ref_count
  ref_count=$(git -C "$bare_repo" show-ref | wc -l)
  BACKUP_PUSHED_REFS=$(( ${BACKUP_PUSHED_REFS:-0} + ref_count ))
  backup_log "Pushed ${ref_count} refs to disinto-ops"
  rm -rf "$clone_dir"
  return 0
 }
 # ── Step 4: Import issues from backup ────────────────────────────────────────
 # Usage: backup_import_issues <slug> <issues_file>
 #        issues_file is a JSON array of issues (per create schema)
 # Returns: 0 on success
 backup_import_issues() {
  # Imports every issue of <issues_file> (a JSON array of
  # {number,title,body,labels,state}) into <slug>. Issues whose number
  # already exists on the target are skipped.
  # Side effect: updates BACKUP_CREATED_ISSUES and BACKUP_SKIPPED_ISSUES.
  local slug="$1"
  local issues_file="$2"
  if [ ! -f "$issues_file" ]; then
    backup_log "No issues file found, skipping"
    return 0
  fi
  local count
  count=$(jq 'length' "$issues_file") || {
    backup_log "ERROR: cannot read issues from ${issues_file}" >&2
    return 1
  }
  backup_log "Importing ${count} issues from ${issues_file}"
  local created=0
  local skipped=0
  local failed=0
  local i issue_num title body label new_num
  local -a labels
  for ((i = 0; i < count; i++)); do
    # `// empty` / `// ""` keep a JSON null from becoming the literal
    # string "null" in the recreated issue.
    issue_num=$(jq -r --argjson i "$i" '.[$i].number // empty' "$issues_file")
    title=$(jq -r --argjson i "$i" '.[$i].title // ""' "$issues_file")
    body=$(jq -r --argjson i "$i" '.[$i].body // ""' "$issues_file")
    if [ -z "$issue_num" ]; then
      backup_log "WARNING: skipping issue without number at index ${i}"
      continue
    fi
    # Check if issue already exists
    if backup_issue_exists "$slug" "$issue_num"; then
      backup_log "Issue #${issue_num} already exists, skipping"
      skipped=$((skipped + 1))
      continue
    fi
    # Extract labels
    labels=()
    while IFS= read -r label; do
      [ -n "$label" ] && labels+=("$label")
    done < <(jq -r --argjson i "$i" '.[$i].labels[]? // empty' "$issues_file")
    # Create issue. ${labels[@]+…} expands to nothing for an empty array
    # without tripping `set -u` on bash older than 4.4.
    if new_num=$(backup_create_issue "$slug" "$issue_num" "$title" "$body" \
      ${labels[@]+"${labels[@]}"}); then
      created=$((created + 1))
    else
      failed=$((failed + 1))
    fi
  done
  BACKUP_CREATED_ISSUES=$(( ${BACKUP_CREATED_ISSUES:-0} + created ))
  BACKUP_SKIPPED_ISSUES=$(( ${BACKUP_SKIPPED_ISSUES:-0} + skipped ))
  backup_log "Created ${created} issues, skipped ${skipped}"
  if [ "$failed" -gt 0 ]; then
    backup_log "WARNING: ${failed} issues could not be created in ${slug}" >&2
  fi
  return 0
 }
 # ── Main: import subcommand ──────────────────────────────────────────────────
 # Usage: backup_import <tarball>
 backup_import() {
  # Restores factory state from <tarball> into the Forgejo at FORGE_URL:
  # unpack → ensure disinto repo → push ops bundle → import issues.
  # Terminates the shell via `exit` (0 on success, 1 on failure), so it is
  # meant to be the last thing the calling script does.
  local tarball="${1:-}"
  if [ -z "$tarball" ]; then
    echo "Usage: backup_import <tarball>" >&2
    exit 1
  fi
  # Validate required environment
  [ -n "${FORGE_URL:-}" ] || { echo "Error: FORGE_URL not set" >&2; exit 1; }
  [ -n "${FORGE_TOKEN:-}" ] || { echo "Error: FORGE_TOKEN not set" >&2; exit 1; }
  # Resolve the target slugs up front. Without defaults an unset variable
  # would abort under `set -u` only in step 4, after repos were already
  # created and refs pushed.
  local main_slug="${FORGE_REPO:-disinto-admin/disinto}"
  local ops_slug="${FORGE_OPS_REPO:-disinto-admin/disinto-ops}"
  backup_log "=== Backup Import Started ==="
  backup_log "Target: ${FORGE_URL}"
  backup_log "Tarball: ${tarball}"
  # Initialize counters
  BACKUP_CREATED_REPOS=0
  BACKUP_PUSHED_REFS=0
  BACKUP_CREATED_ISSUES=0
  BACKUP_SKIPPED_ISSUES=0
  # Mapping file: one "<original>:<new>" line per created issue. It starts
  # empty; seeding it with a JSON document would be corrupted by the
  # line-based appends that follow.
  BACKUP_MAPPING_FILE=$(mktemp -t disinto-mapping.XXXXXX)
  : > "$BACKUP_MAPPING_FILE"
  # Step 1: Unpack tarball (removes its own temp dir on failure)
  if ! backup_unpack_tarball "$tarball"; then
    exit 1
  fi
  # Step 2: disinto repo
  if ! backup_import_disinto_repo; then
    rm -rf "$BACKUP_TEMP_DIR"
    exit 1
  fi
  # Step 3: disinto-ops repo
  if ! backup_import_disinto_ops_repo; then
    rm -rf "$BACKUP_TEMP_DIR"
    exit 1
  fi
  # Step 4: Import issues — iterate issues/<slug>.json files, each is a JSON array
  local issues_file slug_filename slug
  for issues_file in "${BACKUP_TEMP_DIR}/issues"/*.json; do
    # An unmatched glob stays literal; skip it.
    [ -f "$issues_file" ] || continue
    slug_filename=$(basename "$issues_file" .json)
    # Map slug-filename → forgejo-slug: "disinto" → FORGE_REPO,
    # "disinto-ops" → FORGE_OPS_REPO, anything else → disinto-admin/<name>
    case "$slug_filename" in
      "disinto") slug="$main_slug" ;;
      "disinto-ops") slug="$ops_slug" ;;
      *) slug="disinto-admin/${slug_filename}" ;;
    esac
    backup_log "Processing issues from ${slug_filename}.json (${slug})"
    backup_import_issues "$slug" "$issues_file"
  done
  # Summary
  backup_log "=== Backup Import Complete ==="
  backup_log "Created ${BACKUP_CREATED_REPOS} repos"
  backup_log "Pushed ${BACKUP_PUSHED_REFS} refs"
  backup_log "Imported ${BACKUP_CREATED_ISSUES} issues"
  backup_log "Skipped ${BACKUP_SKIPPED_ISSUES} (already present)"
  backup_log "Issue mapping saved to: ${BACKUP_MAPPING_FILE}"
  # Cleanup
  rm -rf "$BACKUP_TEMP_DIR"
  exit 0
 }
 # ── Entry point: if sourced, don't run; if executed directly, run import ────
 if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  # Executed directly (not sourced): the first argument is the tarball to import.
  [ $# -ge 1 ] || {
    echo "Usage: $0 <tarball>" >&2
    exit 1
  }
  backup_import "$1"
 fi
--- a/lib/generators.sh
+++ b/lib/generators.sh
@ -313,6 +313,10 @@ _generate_compose_impl() {
    return 0
  fi
  # Reset duplicate detection state for fresh run
  _seen_services=()
  _service_sources=()
  # Initialize duplicate detection with base services defined in the template
  _record_service "forgejo" "base compose template" || return 1
  _record_service "woodpecker" "base compose template" || return 1
@ -401,6 +405,9 @@ services:
      WOODPECKER_SERVER: localhost:9000
      WOODPECKER_AGENT_SECRET: ${WOODPECKER_AGENT_SECRET:-}
      WOODPECKER_GRPC_SECURE: "false"
      WOODPECKER_GRPC_KEEPALIVE_TIME: "10s"
      WOODPECKER_GRPC_KEEPALIVE_TIMEOUT: "20s"
      WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS: "true"
      WOODPECKER_HEALTHCHECK_ADDR: ":3333"
      WOODPECKER_BACKEND_DOCKER_NETWORK: ${WOODPECKER_CI_NETWORK:-disinto_disinto-net}
      WOODPECKER_MAX_WORKFLOWS: 1
--- a/lib/init/nomad/deploy.sh
+++ b/lib/init/nomad/deploy.sh
@ -19,10 +19,12 @@
 #   JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360)
 #   JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
 #                            JOB_READY_TIMEOUT_FORGEJO=300)
 #                            Built-in: JOB_READY_TIMEOUT_CHAT=600
 #
 # Exit codes:
 #   0  success (all jobs deployed and healthy, or dry-run completed)
-#   1  failure (validation error, timeout, or nomad command failure)
+#   1  failure (validation error, or one or more jobs unhealthy after all
 #      jobs submitted — deploy does NOT cascade-skip on timeout)
 #
 # Idempotency:
 #   Running twice back-to-back on a healthy cluster is a no-op. Jobs that are
@ -35,7 +37,11 @@ SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
 JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"
 # Per-job built-in defaults (override with JOB_READY_TIMEOUT_<JOBNAME> env var)
 JOB_READY_TIMEOUT_CHAT="${JOB_READY_TIMEOUT_CHAT:-600}"
 DRY_RUN=0
 FAILED_JOBS=()  # jobs that timed out or failed deployment
 log() { printf '[deploy] %s\n' "$*" >&2; }
 die() { printf '[deploy] ERROR: %s\n' "$*" >&2; exit 1; }
@ -168,6 +174,43 @@ _wait_job_running() {
  return 1
 }
 # ── Helper: _run_post_deploy <job_name> ─────────────────────────────────────
 # Runs post-deploy scripts for a job after it becomes healthy.
 # Currently supports: forgejo → run forgejo-bootstrap.sh
 #
 # Args:
 #   job_name — name of the deployed job
 #
 # Returns:
 #   0 on success (script ran or not applicable)
 #   1 on failure
 # ─────────────────────────────────────────────────────────────────────────────
 _run_post_deploy() {
  local job_name="$1"
  local post_deploy_script
  # Only forgejo has a post-deploy hook; every other job is a logged no-op.
  if [ "$job_name" != "forgejo" ]; then
    log "no post-deploy script for ${job_name}, skipping"
    return 0
  fi
  post_deploy_script="${SCRIPT_ROOT}/forgejo-bootstrap.sh"
  # A missing or non-executable hook is not an error.
  if [ ! -x "$post_deploy_script" ]; then
    log "no post-deploy script found for ${job_name}, skipping"
    return 0
  fi
  log "running post-deploy script for ${job_name}"
  if ! "$post_deploy_script"; then
    log "ERROR: post-deploy script failed for ${job_name}"
    return 1
  fi
  log "post-deploy script completed for ${job_name}"
  return 0
 }
 # ── Main: deploy each job in order ───────────────────────────────────────────
 for job_name in "${JOBS[@]}"; do
  jobspec_path="${REPO_ROOT}/nomad/jobs/${job_name}.hcl"
@ -186,6 +229,9 @@ for job_name in "${JOBS[@]}"; do
    log "[dry-run] nomad job validate ${jobspec_path}"
    log "[dry-run] nomad job run -detach ${jobspec_path}"
    log "[dry-run] (would wait for '${job_name}' to become healthy for ${job_timeout}s)"
    case "$job_name" in
      forgejo) log "[dry-run] [post-deploy] would run forgejo-bootstrap.sh" ;;
    esac
    continue
  fi
@ -215,7 +261,13 @@ for job_name in "${JOBS[@]}"; do
  # 4. Wait for healthy state
  if ! _wait_job_running "$job_name" "$job_timeout"; then
-    die "deployment for job '${job_name}' did not reach successful state"
+    log "WARNING: deployment for job '${job_name}' did not reach successful state — continuing with remaining jobs"
    FAILED_JOBS+=("$job_name")
  else
    # 5. Run post-deploy scripts (only if job reached healthy state)
    if ! _run_post_deploy "$job_name"; then
      die "post-deploy script failed for job '${job_name}'"
    fi
  fi
 done
@ -223,4 +275,17 @@ if [ "$DRY_RUN" -eq 1 ]; then
  log "dry-run complete"
 fi
 # ── Final health summary ─────────────────────────────────────────────────────
 if [ "${#FAILED_JOBS[@]}" -gt 0 ]; then
  log ""
  log "=== DEPLOY SUMMARY ==="
  log "The following jobs did NOT reach healthy state:"
  for failed in "${FAILED_JOBS[@]}"; do
    log "  - ${failed}"
  done
  log "All other jobs were submitted and healthy."
  log "======================"
  exit 1
 fi
 exit 0
--- a/lib/init/nomad/forgejo-bootstrap.sh
+++ b/lib/init/nomad/forgejo-bootstrap.sh
@ -0,0 +1,215 @@
 #!/usr/bin/env bash
 # =============================================================================
 # lib/init/nomad/forgejo-bootstrap.sh — Bootstrap Forgejo admin user
 #
 # Part of the Nomad+Vault migration (S2.4, issue #1069). Creates the
 # disinto-admin user in Forgejo if it doesn't exist, enabling:
 #   - First-login success without manual intervention
 #   - PAT generation via API (required for disinto backup import #1058)
 #
 # The script is idempotent — re-running after success is a no-op.
 #
 # Scope:
 #   - Checks if user 'disinto-admin' exists via GET /api/v1/users/{username}
 #   - If not: POST /api/v1/admin/users to create admin user
 #   - Uses FORGE_ADMIN_PASS from environment (required)
 #
 # Idempotency contract:
 #   - User 'disinto-admin' exists → skip creation, log
 #     "[forgejo-bootstrap] admin user already exists"
 #   - User creation fails with "user already exists" → treat as success
 #
 # Preconditions:
 #   - Forgejo reachable at $FORGE_URL (default: http://127.0.0.1:3000)
 #   - Forgejo admin token at $FORGE_TOKEN (from Vault or env)
 #   - FORGE_ADMIN_PASS set (env var with admin password)
 #
 # Requires:
 #   - curl, jq
 #
 # Usage:
 #   lib/init/nomad/forgejo-bootstrap.sh
 #   lib/init/nomad/forgejo-bootstrap.sh --dry-run
 #
 # Exit codes:
 #   0  success (user created + ready, or already exists)
 #   1  precondition / API failure
 # =============================================================================
 set -euo pipefail
 # ── Configuration ────────────────────────────────────────────────────────────
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
 # shellcheck source=../../../lib/hvault.sh
 source "${REPO_ROOT}/lib/hvault.sh"
 # Configuration
 FORGE_URL="${FORGE_URL:-http://127.0.0.1:3000}"
 FORGE_TOKEN="${FORGE_TOKEN:-}"
 FORGE_ADMIN_USER="${DISINTO_ADMIN_USER:-disinto-admin}"
 FORGE_ADMIN_EMAIL="${DISINTO_ADMIN_EMAIL:-admin@disinto.local}"
 # Derive FORGE_ADMIN_PASS from common env var patterns
 # Priority: explicit FORGE_ADMIN_PASS > DISINTO_FORGE_ADMIN_PASS > FORGEJO_ADMIN_PASS
 FORGE_ADMIN_PASS="${FORGE_ADMIN_PASS:-${DISINTO_FORGE_ADMIN_PASS:-${FORGEJO_ADMIN_PASS:-}}}"
 LOG_TAG="[forgejo-bootstrap]"
 log() { printf '%s %s\n' "$LOG_TAG" "$*" >&2; }
 die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }
 # ── Flag parsing ─────────────────────────────────────────────────────────────
 DRY_RUN="${DRY_RUN:-0}"
 for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=1 ;;
    -h|--help)
      printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
      printf 'Bootstrap Forgejo admin user if it does not exist.\n'
      printf 'Idempotent: re-running is a no-op.\n\n'
      printf 'Environment:\n'
      printf '  FORGE_URL          Forgejo base URL (default: http://127.0.0.1:3000)\n'
      printf '  FORGE_TOKEN        Forgejo admin token (from Vault or env)\n'
      printf '  FORGE_ADMIN_PASS   Admin password (required)\n'
      printf '  DISINTO_ADMIN_USER Username for admin account (default: disinto-admin)\n'
      printf '  DISINTO_ADMIN_EMAIL Admin email (default: admin@disinto.local)\n\n'
      printf '  --dry-run   Print planned actions without modifying Forgejo.\n'
      exit 0
      ;;
    *) die "invalid argument: ${arg}  (try --help)" ;;
  esac
 done
 # ── Precondition checks ──────────────────────────────────────────────────────
 log "── Precondition check ──"
 if [ -z "$FORGE_URL" ]; then
  die "FORGE_URL is not set"
 fi
 if [ -z "$FORGE_ADMIN_PASS" ]; then
  die "FORGE_ADMIN_PASS is not set (required for admin user creation)"
 fi
 # Resolve FORGE_TOKEN from Vault if not set in env
 if [ -z "$FORGE_TOKEN" ]; then
  log "reading FORGE_TOKEN from Vault at kv/disinto/shared/forge/token"
  _hvault_default_env
  token_raw="$(hvault_get_or_empty "kv/data/disinto/shared/forge/token" 2>/dev/null)" || true
  if [ -n "$token_raw" ]; then
    FORGE_TOKEN="$(printf '%s' "$token_raw" | jq -r '.data.data.token // empty' 2>/dev/null)" || true
  fi
  if [ -z "$FORGE_TOKEN" ]; then
    die "FORGE_TOKEN not set and not found in Vault"
  fi
  log "forge token loaded from Vault"
 fi
 # ── Step 1/3: Check if admin user already exists ─────────────────────────────
 log "── Step 1/3: check if admin user '${FORGE_ADMIN_USER}' exists ──"
 # Exact-match lookup via GET /api/v1/users/{username}.
 # Start from "does not exist" so the state variables are always defined,
 # including the case where the request succeeds with an empty body.
 admin_user_exists=false
 user_id=""
 user_lookup_raw=""
 lookup_rc=0
 user_lookup_raw=$(curl -sf --max-time 10 \
  "${FORGE_URL}/api/v1/users/${FORGE_ADMIN_USER}" 2>/dev/null) || lookup_rc=$?
 if [ "$lookup_rc" -eq 22 ]; then
  # With -f, curl exits 22 when the server answers with an HTTP error
  # status — the 404 sent for an absent user lands here.
  log "admin user '${FORGE_ADMIN_USER}' not found"
 elif [ "$lookup_rc" -ne 0 ]; then
  # Any other curl failure (e.g. 7 = could not connect, 28 = timeout)
  log "warning: failed to lookup user (Forgejo may not be ready yet)"
 elif [ -n "$user_lookup_raw" ]; then
  admin_user_exists=true
  user_id=$(printf '%s' "$user_lookup_raw" | jq -r '.id // empty' 2>/dev/null) || true
  if [ -n "$user_id" ]; then
    log "admin user '${FORGE_ADMIN_USER}' already exists (user_id: ${user_id})"
  fi
 fi
 # ── Step 2/3: Create admin user if needed ────────────────────────────────────
 if [ "$admin_user_exists" = false ]; then
  log "creating admin user '${FORGE_ADMIN_USER}'"
  if [ "$DRY_RUN" -eq 1 ]; then
    log "[dry-run] would create admin user with:"
    log "[dry-run]   username: ${FORGE_ADMIN_USER}"
    log "[dry-run]   email:    ${FORGE_ADMIN_EMAIL}"
    log "[dry-run]   admin:    true"
    log "[dry-run]   must_change_password: false"
  else
    # Build the body with jq so a password containing quotes or backslashes
    # still yields valid JSON.
    create_payload=$(jq -cn \
      --arg username "$FORGE_ADMIN_USER" \
      --arg email "$FORGE_ADMIN_EMAIL" \
      --arg password "$FORGE_ADMIN_PASS" \
      '{username: $username, email: $email, password: $password,
        admin: true, must_change_password: false}')
    # Send the request exactly once. No -f, so the error body is kept; the
    # HTTP status is appended on its own last line via -w. The body is fed
    # on stdin so the password does not appear in curl's argument list.
    create_raw=$(curl -s --max-time 30 -X POST \
      -H "Authorization: token ${FORGE_TOKEN}" \
      -H "Content-Type: application/json" \
      -w '\n%{http_code}' \
      "${FORGE_URL}/api/v1/admin/users" \
      --data @- <<<"$create_payload" 2>/dev/null) \
      || die "failed to create admin user in Forgejo: request did not complete"
    create_status="${create_raw##*$'\n'}"
    create_response="${create_raw%$'\n'*}"
    case "$create_status" in
      2??)
        # Extract user_id from response
        user_id=$(printf '%s' "$create_response" | jq -r '.id // empty' 2>/dev/null) || true
        if [ -z "$user_id" ]; then
          die "failed to extract user_id from Forgejo response"
        fi
        admin_user_exists=true
        log "admin user '${FORGE_ADMIN_USER}' created (user_id: ${user_id})"
        ;;
      *)
        # "already exists" means another run created the user between the
        # lookup and this request — that is success, not an error.
        if printf '%s' "$create_response" | grep -qi 'already exists'; then
          log "admin user '${FORGE_ADMIN_USER}' already exists (race condition handled)"
          admin_user_exists=true
        else
          die "failed to create admin user in Forgejo: HTTP ${create_status}: ${create_response:-unknown error}"
        fi
        ;;
    esac
  fi
 else
  log "admin user '${FORGE_ADMIN_USER}' already exists — skipping creation"
 fi
 # ── Step 3/3: Verify user was created and is admin ───────────────────────────
 log "── Step 3/3: verify admin user is properly configured ──"
 # Dry-run stops here: there is nothing to verify when nothing was changed.
 if [ "$DRY_RUN" -eq 1 ]; then
  log "[dry-run] would verify admin user configuration"
  log "done — [dry-run] complete"
  exit 0
 fi
 # Log in as the admin user with basic auth and read back its own profile.
 if ! verify_response=$(curl -sf --max-time 10 \
  -u "${FORGE_ADMIN_USER}:${FORGE_ADMIN_PASS}" \
  "${FORGE_URL}/api/v1/user" 2>/dev/null); then
  die "failed to verify admin user credentials"
 fi
 is_admin=$(printf '%s' "$verify_response" | jq -r '.is_admin // false' 2>/dev/null) || true
 login=$(printf '%s' "$verify_response" | jq -r '.login // empty' 2>/dev/null) || true
 # The profile must carry the admin flag and the expected login name.
 [ "$is_admin" = "true" ] \
  || die "admin user '${FORGE_ADMIN_USER}' is not marked as admin"
 [ "$login" = "$FORGE_ADMIN_USER" ] \
  || die "admin user login mismatch: expected '${FORGE_ADMIN_USER}', got '${login}'"
 log "admin user verified: login=${login}, is_admin=${is_admin}"
 log "done — Forgejo admin user is ready"
 exit 0
--- a/lib/issue-lifecycle.sh
+++ b/lib/issue-lifecycle.sh
@ -157,9 +157,10 @@ issue_claim() {
    return 1
  fi
-  local ip_id bl_id
+  local ip_id bl_id bk_id
  ip_id=$(_ilc_in_progress_id)
  bl_id=$(_ilc_backlog_id)
  bk_id=$(_ilc_blocked_id)
  if [ -n "$ip_id" ]; then
    curl -sf -X POST \
      -H "Authorization: token ${FORGE_TOKEN}" \
@ -172,6 +173,12 @@ issue_claim() {
      -H "Authorization: token ${FORGE_TOKEN}" \
      "${FORGE_API}/issues/${issue}/labels/${bl_id}" >/dev/null 2>&1 || true
  fi
  # Clear blocked label on re-claim — starting work is implicit resolution of prior block
  if [ -n "$bk_id" ]; then
    curl -sf -X DELETE \
      -H "Authorization: token ${FORGE_TOKEN}" \
      "${FORGE_API}/issues/${issue}/labels/${bk_id}" >/dev/null 2>&1 || true
  fi
  _ilc_log "claimed issue #${issue}"
  return 0
 }
--- a/lib/ops-setup.sh
+++ b/lib/ops-setup.sh
@ -198,6 +198,7 @@ setup_ops_repo() {
  [ -f "${ops_root}/evidence/holdout/.gitkeep" ] || { touch "${ops_root}/evidence/holdout/.gitkeep"; seeded=true; }
  [ -f "${ops_root}/evidence/evolution/.gitkeep" ] || { touch "${ops_root}/evidence/evolution/.gitkeep"; seeded=true; }
  [ -f "${ops_root}/evidence/user-test/.gitkeep" ] || { touch "${ops_root}/evidence/user-test/.gitkeep"; seeded=true; }
  [ -f "${ops_root}/knowledge/.gitkeep" ] || { touch "${ops_root}/knowledge/.gitkeep"; seeded=true; }
  if [ ! -f "${ops_root}/README.md" ]; then
    cat > "${ops_root}/README.md" <<OPSEOF
@ -362,13 +363,54 @@ migrate_ops_repo() {
    if [ ! -f "$tfile" ]; then
      local title
      title=$(basename "$tfile" | sed 's/\.md$//; s/_/ /g' | sed 's/\b\(.\)/\u\1/g')
-      {
+      case "$tfile" in
-        echo "# ${title}"
+        portfolio.md)
-        echo ""
+          {
-        echo "## Overview"
+            echo "# ${title}"
-        echo ""
+            echo ""
-        echo "<!-- Add content here -->"
+            echo "## Addressables"
-      } > "$tfile"
+            echo ""
            echo "<!-- Add addressables here -->"
            echo ""
            echo "## Observables"
            echo ""
            echo "<!-- Add observables here -->"
          } > "$tfile"
          ;;
        RESOURCES.md)
          {
            echo "# ${title}"
            echo ""
            echo "## Accounts"
            echo ""
            echo "<!-- Add account references here -->"
            echo ""
            echo "## Tokens"
            echo ""
            echo "<!-- Add token references here -->"
            echo ""
            echo "## Infrastructure"
            echo ""
            echo "<!-- Add infrastructure inventory here -->"
          } > "$tfile"
          ;;
        prerequisites.md)
          {
            echo "# ${title}"
            echo ""
            echo "<!-- Add dependency graph here -->"
          } > "$tfile"
          ;;
        *)
          {
            echo "# ${title}"
            echo ""
            echo "## Overview"
            echo ""
            echo "<!-- Add content here -->"
          } > "$tfile"
          ;;
      esac
      echo "  + Created: ${tfile}"
      migrated=true
    fi
--- a/lib/pr-lifecycle.sh
+++ b/lib/pr-lifecycle.sh
@ -429,19 +429,100 @@ pr_walk_to_merge() {
      _prl_log "CI failed — invoking agent (attempt ${ci_fix_count}/${max_ci_fixes})"
-      # Get CI logs from SQLite database if available
+      # Build per-workflow/per-step CI diagnostics prompt
-      local ci_logs=""
+      local ci_prompt_body=""
-      if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then
+      local passing_workflows=""
-        ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs=""
+      local built_diagnostics=false
      if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${WOODPECKER_REPO_ID:-}" ]; then
        local pip_json
        pip_json=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines/${_PR_CI_PIPELINE}" 2>/dev/null) || pip_json=""
        if [ -n "$pip_json" ]; then
          local wf_count
          wf_count=$(printf '%s' "$pip_json" | jq '[.workflows[]?] | length' 2>/dev/null) || wf_count=0
          if [ "$wf_count" -gt 0 ]; then
            built_diagnostics=true
            local wf_idx=0
            while [ "$wf_idx" -lt "$wf_count" ]; do
              local wf_name wf_state
              wf_name=$(printf '%s' "$pip_json" | jq -r ".workflows[$wf_idx].name // \"workflow-$wf_idx\"" 2>/dev/null)
              wf_state=$(printf '%s' "$pip_json" | jq -r ".workflows[$wf_idx].state // \"unknown\"" 2>/dev/null)
              if [ "$wf_state" = "failure" ] || [ "$wf_state" = "error" ] || [ "$wf_state" = "killed" ]; then
                # Collect failed children for this workflow
                local failed_children
                failed_children=$(printf '%s' "$pip_json" | jq -r "
                  .workflows[$wf_idx].children[]? |
                  select(.state == \"failure\" or .state == \"error\" or .state == \"killed\") |
                  \"\(.name)\t\(.exit_code)\t\(.pid)\"" 2>/dev/null) || failed_children=""
                ci_prompt_body="${ci_prompt_body}
 --- Failed workflow: ${wf_name} ---"
                if [ -n "$failed_children" ]; then
                  while IFS=$'\t' read -r step_name step_exit step_pid; do
                    [ -z "$step_name" ] && continue
                    local exit_annotation=""
                    case "$step_exit" in
                      126) exit_annotation=" (permission denied or not executable)" ;;
                      127) exit_annotation=" (command not found)" ;;
                      128) exit_annotation=" (invalid exit argument / signal+128)" ;;
                    esac
                    ci_prompt_body="${ci_prompt_body}
  Step: ${step_name}
  Exit code: ${step_exit}${exit_annotation}"
                    # Fetch per-step logs
                    if [ -n "$step_pid" ] && [ "$step_pid" != "null" ]; then
                      local step_logs
                      step_logs=$(ci_get_step_logs "$_PR_CI_PIPELINE" "$step_pid" 2>/dev/null | tail -50) || step_logs=""
                      if [ -n "$step_logs" ]; then
                        ci_prompt_body="${ci_prompt_body}
  Log tail (last 50 lines):
 \`\`\`
 ${step_logs}
 \`\`\`"
                      fi
                    fi
                  done <<< "$failed_children"
                else
                  ci_prompt_body="${ci_prompt_body}
  (no failed step details available)"
                fi
              else
                # Track passing/other workflows
                if [ -n "$passing_workflows" ]; then
                  passing_workflows="${passing_workflows}, ${wf_name}"
                else
                  passing_workflows="${wf_name}"
                fi
              fi
              wf_idx=$((wf_idx + 1))
            done
          fi
        fi
      fi
-      local logs_section=""
+      # Fallback: use legacy log fetch if per-workflow diagnostics unavailable
-      if [ -n "$ci_logs" ]; then
+      if [ "$built_diagnostics" = false ]; then
-        logs_section="
+        local ci_logs=""
        if [ -n "$_PR_CI_PIPELINE" ] && [ -n "${FACTORY_ROOT:-}" ]; then
          ci_logs=$(ci_get_logs "$_PR_CI_PIPELINE" 2>/dev/null | tail -50) || ci_logs=""
        fi
        if [ -n "$ci_logs" ]; then
          ci_prompt_body="
 CI Log Output (last 50 lines):
 \`\`\`
 ${ci_logs}
-\`\`\`
+\`\`\`"
        fi
      fi
      local passing_line=""
      if [ -n "$passing_workflows" ]; then
        passing_line="
 Passing workflows (do not modify): ${passing_workflows}
 "
      fi
@ -450,9 +531,10 @@ ${ci_logs}
 Pipeline: #${_PR_CI_PIPELINE:-?}
 Failure type: ${_PR_CI_FAILURE_TYPE:-unknown}
-
+${passing_line}
 Error log:
-${_PR_CI_ERROR_LOG:-No logs available.}${logs_section}
+${_PR_CI_ERROR_LOG:-No logs available.}
 ${ci_prompt_body}
 Fix the issue, run tests, commit, rebase on ${PRIMARY_BRANCH}, and push:
  git fetch ${remote} ${PRIMARY_BRANCH} && git rebase ${remote}/${PRIMARY_BRANCH}
--- a/nomad/AGENTS.md
+++ b/nomad/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: 0bb04545d47fb43b2cab0a1f4406c2a2b57f4eba -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # nomad/ — Agent Instructions
 Nomad + Vault HCL for the factory's single-node cluster. These files are
--- a/nomad/jobs/woodpecker-agent.hcl
+++ b/nomad/jobs/woodpecker-agent.hcl
@ -57,7 +57,7 @@ job "woodpecker-agent" {
      check {
        type     = "http"
        path     = "/healthz"
-        interval = "15s"
+        interval = "10s"
        timeout  = "3s"
      }
    }
@ -89,10 +89,13 @@ job "woodpecker-agent" {
      # Nomad's port stanza to the allocation's IP (not localhost), so the
      # agent must use the LXC's eth0 IP, not 127.0.0.1.
      env {
-        WOODPECKER_SERVER         = "${attr.unique.network.ip-address}:9000"
+        WOODPECKER_SERVER                   = "${attr.unique.network.ip-address}:9000"
-        WOODPECKER_GRPC_SECURE    = "false"
+        WOODPECKER_GRPC_SECURE              = "false"
-        WOODPECKER_MAX_WORKFLOWS  = "1"
+        WOODPECKER_GRPC_KEEPALIVE_TIME      = "10s"
-        WOODPECKER_HEALTHCHECK_ADDR = ":3333"
+        WOODPECKER_GRPC_KEEPALIVE_TIMEOUT   = "20s"
        WOODPECKER_GRPC_KEEPALIVE_PERMIT_WITHOUT_CALLS = "true"
        WOODPECKER_MAX_WORKFLOWS            = "1"
        WOODPECKER_HEALTHCHECK_ADDR         = ":3333"
      }
      # ── Vault-templated agent secret ──────────────────────────────────
--- a/planner/AGENTS.md
+++ b/planner/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Planner Agent
 **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints),
--- a/predictor/AGENTS.md
+++ b/predictor/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Predictor Agent
 **Role**: Abstract adversary (the "goblin"). Runs a 2-step formula
--- a/review/AGENTS.md
+++ b/review/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Review Agent
 **Role**: AI-powered PR review — post structured findings and formal
--- a/review/review-pr.sh
+++ b/review/review-pr.sh
@ -52,8 +52,35 @@ REVIEW_TMPDIR=$(mktemp -d)
 log() { printf '[%s] PR#%s %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" >> "$LOGFILE"; }
 status() { printf '[%s] PR #%s: %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$PR_NUMBER" "$*" > "$STATUSFILE"; log "$*"; }
-cleanup() { rm -rf "$REVIEW_TMPDIR" "$LOCKFILE" "$STATUSFILE" "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"; }
+
-trap cleanup EXIT
+# cleanup — remove temp files (NOT lockfile — cleanup_on_exit handles that)
 cleanup() {
  # Temp artefacts only: review tmpdir, status file and the per-PR graph JSON.
  # The lockfile is intentionally not part of this list.
  local -a scratch_paths=(
    "$REVIEW_TMPDIR"
    "$STATUSFILE"
    "/tmp/${PROJECT_NAME}-review-graph-${PR_NUMBER}.json"
  )
  rm -rf "${scratch_paths[@]}"
 }
 # cleanup_on_exit — defensive cleanup: remove lockfile if we own it, kill residual children
 # This handles the case where review-pr.sh is terminated unexpectedly (e.g., watchdog SIGTERM)
 # Installed as the handler for EXIT, INT and TERM; exits with the status that
 # was current when the handler was entered.
 cleanup_on_exit() {
  local ec=$?
  # Disarm the traps before doing anything else: the `exit` at the end would
  # otherwise re-fire the EXIT trap when we arrived here via INT/TERM and run
  # this whole handler a second time.
  trap - EXIT INT TERM
  # Read the lockfile once; an unreadable or missing file counts as "not ours".
  local lock_owner=""
  if [ -f "$LOCKFILE" ]; then
    lock_owner=$(cat "$LOCKFILE" 2>/dev/null) || lock_owner=""
  fi
  # Remove lockfile only if we own it (PID matches $$)
  if [ -n "$lock_owner" ] && [ "$lock_owner" = "$$" ]; then
    rm -f "$LOCKFILE"
    log "cleanup_on_exit: removed lockfile (we owned it)"
  fi
  # Kill any direct children that may have been spawned by this process
  # (e.g., bash -c commands from Claude's Bash tool that didn't get reaped)
  pkill -P $$ 2>/dev/null || true
  # Call the main cleanup function to remove temp files
  cleanup
  exit "$ec"
 }
 trap cleanup_on_exit EXIT INT TERM
 # Note: cleanup_on_exit runs on every exit path (normal exit, INT, TERM). It removes
 # the lockfile only when the file still contains our PID; the early-exit paths and
 # the success path below also remove the lockfile explicitly before exiting.
 # =============================================================================
 # LOG ROTATION
@ -104,6 +131,7 @@ if [ "$PR_STATE" != "open" ]; then
  log "SKIP: state=${PR_STATE}"
  worktree_cleanup "$WORKTREE"
  rm -f "$OUTPUT_FILE" "$SID_FILE" 2>/dev/null || true
  rm -f "$LOCKFILE"
  exit 0
 fi
@ -113,7 +141,7 @@ fi
 CI_STATE=$(ci_commit_status "$PR_SHA")
 CI_NOTE=""
 if ! ci_passed "$CI_STATE"; then
-  ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; exit 0; }
+  ci_required_for_pr "$PR_NUMBER" && { log "SKIP: CI=${CI_STATE}"; rm -f "$LOCKFILE"; exit 0; }
  CI_NOTE=" (not required — non-code PR)"
 fi
@ -123,10 +151,10 @@ fi
 ALL_COMMENTS=$(forge_api_all "/issues/${PR_NUMBER}/comments")
 HAS_CMT=$(printf '%s' "$ALL_COMMENTS" | jq --arg s "$PR_SHA" \
  '[.[]|select(.body|contains("<!-- reviewed: "+$s+" -->"))]|length')
-[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; exit 0; }
+[ "${HAS_CMT:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: reviewed ${PR_SHA:0:7}"; rm -f "$LOCKFILE"; exit 0; }
 HAS_FML=$(forge_api_all "/pulls/${PR_NUMBER}/reviews" | jq --arg s "$PR_SHA" \
  '[.[]|select(.commit_id==$s)|select(.state!="COMMENT")]|length')
-[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; exit 0; }
+[ "${HAS_FML:-0}" -gt 0 ] && [ "$FORCE" != "--force" ] && { log "SKIP: formal review"; rm -f "$LOCKFILE"; exit 0; }
 # =============================================================================
 # RE-REVIEW DETECTION
@ -324,3 +352,7 @@ esac
 profile_write_journal "review-${PR_NUMBER}" "Review PR #${PR_NUMBER} (${VERDICT})" "${VERDICT,,}" "" || true
 log "DONE: ${VERDICT} (re-review: ${IS_RE_REVIEW})"
 # Remove lockfile on successful completion. cleanup_on_exit would also remove it
 # if we own it; doing it here releases the lock explicitly before the EXIT trap runs.
 rm -f "$LOCKFILE"
--- a/site/collect-engagement.sh
+++ b/site/collect-engagement.sh
@ -209,3 +209,72 @@ jq -nc \
 log "Engagement report written to ${OUTPUT}: ${UNIQUE_VISITORS} visitors, ${PAGE_VIEWS} page views"
 echo "Engagement report: ${UNIQUE_VISITORS} unique visitors, ${PAGE_VIEWS} page views → ${OUTPUT}"
 # ── Commit evidence to ops repo via Forgejo API ─────────────────────────────
 # Push one evidence JSON file to the ops repo through the Forgejo contents API.
 #   $1 — path to the local evidence file; its basename (minus .json) is used as the report date
 # Returns 0 when the file was created/updated or when the commit is skipped
 # (no ops checkout / missing credentials); returns 1 when the API write fails.
 commit_evidence_via_forgejo() {
  local evidence_file="$1"
  local report_date
  report_date=$(basename "$evidence_file" .json)
  local file_path="evidence/engagement/${report_date}.json"
  # Guard: ops repo checkout must be present
  if ! { [ -n "${OPS_REPO_ROOT:-}" ] && [ -d "${OPS_REPO_ROOT}/.git" ]; }; then
    log "SKIP: OPS_REPO_ROOT not set or not a git repo — evidence file not committed"
    return 0
  fi
  # Guard: Forgejo credentials must be present
  # NOTE(review): FORGE_OPS_REPO is required here but not used below — confirm intent.
  if ! { [ -n "${FORGE_TOKEN:-}" ] && [ -n "${FORGE_URL:-}" ] && [ -n "${FORGE_OPS_REPO:-}" ]; }; then
    log "SKIP: Forgejo credentials not available (FORGE_TOKEN/FORGE_URL/FORGE_OPS_REPO) — evidence file not committed"
    return 0
  fi
  # Base64-encode the payload as required by the contents API
  local content
  content=$(base64 < "$evidence_file")
  local ops_owner="${OPS_FORGE_OWNER:-${FORGE_REPO%%/*}}"
  local ops_repo="${OPS_FORGE_REPO:-${PROJECT_NAME:-disinto}-ops}"
  local contents_url="${FORGE_URL}/api/v1/repos/${ops_owner}/${ops_repo}/contents/${file_path}"
  # Probe for an existing file; any curl failure is treated as "not present"
  local existing
  existing=$(curl -sf \
    -H "Authorization: token ${FORGE_TOKEN}" \
    "$contents_url" \
    2>/dev/null || echo "")
  # Decide between update (PUT, needs the current blob sha) and create (POST)
  local verb payload ok_msg err_msg
  if [ -n "$existing" ] && printf '%s' "$existing" | jq -e '.sha' >/dev/null 2>&1; then
    local sha
    sha=$(printf '%s' "$existing" | jq -r '.sha')
    verb="PUT"
    payload=$(jq -nc --arg content "$content" --arg sha "$sha" --arg msg "evidence: engagement ${report_date}" \
      '{message: $msg, content: $content, sha: $sha}')
    ok_msg="Updated evidence file in ops repo: ${file_path}"
    err_msg="ERROR: failed to update evidence file in ops repo"
  else
    verb="POST"
    payload=$(jq -nc --arg content "$content" --arg msg "evidence: engagement ${report_date}" \
      '{message: $msg, content: $content}')
    ok_msg="Created evidence file in ops repo: ${file_path}"
    err_msg="ERROR: failed to create evidence file in ops repo"
  fi
  if curl -sf -X "$verb" \
    -H "Authorization: token ${FORGE_TOKEN}" \
    -H "Content-Type: application/json" \
    "$contents_url" \
    -d "$payload" >/dev/null 2>&1; then
    log "$ok_msg"
    return 0
  fi
  log "$err_msg"
  return 1
 }
 # Attempt to commit evidence (non-fatal — data collection succeeded even if commit fails)
 commit_evidence_via_forgejo "$OUTPUT" || log "WARNING: evidence commit skipped or failed — file exists locally at ${OUTPUT}"
--- a/supervisor/AGENTS.md
+++ b/supervisor/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # Supervisor Agent
 **Role**: Health monitoring and auto-remediation, executed as a formula-driven
--- a/tests/smoke-edge-subpath.sh
+++ b/tests/smoke-edge-subpath.sh
@ -0,0 +1,310 @@
 #!/usr/bin/env bash
 # =============================================================================
 # smoke-edge-subpath.sh — End-to-end subpath routing smoke test
 #
 # Verifies Forgejo, Woodpecker, and chat function correctly under subpaths:
 #   - Forgejo at /forge/
 #   - Woodpecker at /ci/
 #   - Chat at /chat/
 #   - Staging at /staging/
 #
 # Usage:
 #   smoke-edge-subpath.sh [--base-url BASE_URL]
 #
 # Environment variables:
 #   BASE_URL         — Edge proxy URL (default: http://localhost)
 #   EDGE_TIMEOUT     — Request timeout in seconds (default: 30)
 #   EDGE_MAX_RETRIES — Max retries per request (default: 3)
 #
 # Exit codes:
 #   0 — All checks passed
 #   1 — One or more checks failed
 # =============================================================================
 set -euo pipefail
 # Directory holding this script; used to locate sibling repo paths
 SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # Pull in shared helpers when present; absence is not an error
 . "${SCRIPT_DIR}/../lib/env.sh" 2>/dev/null || true
 # ─────────────────────────────────────────────────────────────────────────────
 # Configuration
 # ─────────────────────────────────────────────────────────────────────────────
 # Environment overrides win; otherwise fall back to the documented defaults
 : "${BASE_URL:=http://localhost}"
 : "${EDGE_TIMEOUT:=30}"
 : "${EDGE_MAX_RETRIES:=3}"
 # Subpaths exercised by the checks below
 FORGE_PATH="/forge/"
 CI_PATH="/ci/"
 CHAT_PATH="/chat/"
 STAGING_PATH="/staging/"
 # Result counters
 FAILED=0 PASSED=0 SKIPPED=0
 # ─────────────────────────────────────────────────────────────────────────────
 # Logging helpers
 # ─────────────────────────────────────────────────────────────────────────────
 # Informational line; does not touch any counter.
 log_info() {
  printf '[INFO] %s\n' "$*"
 }
 # Record and print a passing check.
 log_pass() {
  printf '[PASS] %s\n' "$*"
  PASSED=$((PASSED + 1))
 }
 # Record and print a failing check.
 log_fail() {
  printf '[FAIL] %s\n' "$*"
  FAILED=$((FAILED + 1))
 }
 # Record and print a skipped check.
 log_skip() {
  printf '[SKIP] %s\n' "$*"
  SKIPPED=$((SKIPPED + 1))
 }
 # Section banner surrounded by blank lines.
 log_section() {
  printf '\n=== %s ===\n\n' "$*"
 }
 # ─────────────────────────────────────────────────────────────────────────────
 # HTTP helpers
 # ─────────────────────────────────────────────────────────────────────────────
 # Make an HTTP request with retry logic
 # Usage: http_request <method> <url> [curl options...]
 # Returns: HTTP status code on stdout (and nothing else); exit status 1 once
 #          EDGE_MAX_RETRIES attempts have failed. Diagnostics are written to
 #          stderr so callers doing status=$(http_request ...) capture only the code.
 http_request() {
  local method="$1"
  local url="$2"
  shift 2
  local attempt=0
  local status
  # `-X HEAD` only renames the method and leaves curl waiting for a body;
  # --head is the supported way to issue a HEAD request.
  local -a method_args=(-X "$method")
  if [ "$method" = "HEAD" ]; then
    method_args=(--head)
  fi
  while [ "$attempt" -lt "$EDGE_MAX_RETRIES" ]; do
    # The response body/headers are discarded; only %{http_code} reaches stdout.
    # The URL must be passed explicitly — "$@" holds only the extra options.
    if status=$(curl -sS -o /dev/null -w '%{http_code}' \
      "${method_args[@]}" \
      --max-time "$EDGE_TIMEOUT" \
      "$@" "$url"); then
      echo "$status"
      return 0
    fi
    attempt=$((attempt + 1))
    log_info "Retry $attempt/$EDGE_MAX_RETRIES for $url" >&2
    sleep 1
  done
  log_fail "Max retries exceeded for $url" >&2
  return 1
 }
 # GET <url> [curl options...] — prints the status code via http_request
 http_get() {
  http_request "GET" "$1" "${@:2}"
 }
 # HEAD <url> [curl options...] — prints the status code via http_request
 http_head() {
  http_request "HEAD" "$1" "${@:2}"
 }
 # GET <url> [curl options...] — prints the response body instead of the status
 http_get_body() {
  curl -sS --max-time "$EDGE_TIMEOUT" "${@:2}" "$1"
 }
 # ─────────────────────────────────────────────────────────────────────────────
 # Test functions
 # ─────────────────────────────────────────────────────────────────────────────
 # Check 1: a HEAD request to the root path must answer with a 302.
 test_root_redirect() {
  log_section "Test 1: Root redirect to /forge/"
  local status
  # A failed request must be recorded as a failed check, not abort the whole
  # script via `set -e` before the summary is printed.
  status=$(http_head "$BASE_URL/") || status="000"
  if [ "$status" = "302" ]; then
    log_pass "Root / redirects with 302"
  else
    log_fail "Expected 302 redirect from /, got status $status"
  fi
 }
 # Check 2: Forgejo under FORGE_PATH must answer with a 2xx/3xx status.
 test_forgejo_subpath() {
  log_section "Test 2: Forgejo at /forge/"
  local status
  # Fall back to 000 so an unreachable endpoint is counted as a failure
  # instead of terminating the script through `set -e`.
  status=$(http_head "$BASE_URL${FORGE_PATH}") || status="000"
  if [ "$status" -ge 200 ] && [ "$status" -lt 400 ]; then
    log_pass "Forgejo at ${BASE_URL}${FORGE_PATH} returns status $status"
  else
    log_fail "Forgejo at ${BASE_URL}${FORGE_PATH} returned unexpected status $status"
  fi
 }
 # Check 3: Woodpecker under CI_PATH must answer with a 2xx/3xx status.
 test_woodpecker_subpath() {
  log_section "Test 3: Woodpecker at /ci/"
  local status
  # Fall back to 000 so an unreachable endpoint is counted as a failure
  # instead of terminating the script through `set -e`.
  status=$(http_head "$BASE_URL${CI_PATH}") || status="000"
  if [ "$status" -ge 200 ] && [ "$status" -lt 400 ]; then
    log_pass "Woodpecker at ${BASE_URL}${CI_PATH} returns status $status"
  else
    log_fail "Woodpecker at ${BASE_URL}${CI_PATH} returned unexpected status $status"
  fi
 }
 # Check 4: the chat login and OAuth callback endpoints under CHAT_PATH must
 # each answer with a 2xx/3xx status. Each endpoint is recorded separately.
 test_chat_subpath() {
  log_section "Test 4: Chat at /chat/"
  local status
  # Test chat login endpoint (000 on request failure keeps `set -e` from
  # aborting the script before the remaining checks and the summary run)
  status=$(http_head "$BASE_URL${CHAT_PATH}login") || status="000"
  if [ "$status" -ge 200 ] && [ "$status" -lt 400 ]; then
    log_pass "Chat login at ${BASE_URL}${CHAT_PATH}login returns status $status"
  else
    log_fail "Chat login at ${BASE_URL}${CHAT_PATH}login returned unexpected status $status"
  fi
  # Test chat OAuth callback endpoint
  status=$(http_head "$BASE_URL${CHAT_PATH}oauth/callback") || status="000"
  if [ "$status" -ge 200 ] && [ "$status" -lt 400 ]; then
    log_pass "Chat OAuth callback at ${BASE_URL}${CHAT_PATH}oauth/callback returns status $status"
  else
    log_fail "Chat OAuth callback at ${BASE_URL}${CHAT_PATH}oauth/callback returned unexpected status $status"
  fi
 }
 # Check 5: staging under STAGING_PATH must answer with a 2xx/3xx status.
 test_staging_subpath() {
  log_section "Test 5: Staging at /staging/"
  local status
  # Fall back to 000 so an unreachable endpoint is counted as a failure
  # instead of terminating the script through `set -e`.
  status=$(http_head "$BASE_URL${STAGING_PATH}") || status="000"
  if [ "$status" -ge 200 ] && [ "$status" -lt 400 ]; then
    log_pass "Staging at ${BASE_URL}${STAGING_PATH} returns status $status"
  else
    log_fail "Staging at ${BASE_URL}${STAGING_PATH} returned unexpected status $status"
  fi
 }
 # Check 6: an unauthenticated request to the chat auth-verify endpoint should
 # be rejected with 401. A 2xx/3xx answer is logged as a skip; anything else
 # (including a failed request) is a failure.
 test_forward_auth_rejection() {
  log_section "Test 6: Forward auth on /chat/* rejects unauthenticated requests"
  # Request a protected chat endpoint without auth header
  # Should return 401 (Unauthorized) due to forward_auth
  local status
  # 000 on request failure: record it below rather than abort under `set -e`.
  status=$(http_head "$BASE_URL${CHAT_PATH}auth/verify") || status="000"
  if [ "$status" = "401" ]; then
    log_pass "Unauthenticated /chat/auth/verify returns 401 (forward_auth working)"
  elif [ "$status" -ge 200 ] && [ "$status" -lt 400 ]; then
    log_skip "Unauthenticated /chat/auth/verify returns $status (forward_auth may be disabled)"
  else
    log_fail "Expected 401 for unauthenticated /chat/auth/verify, got status $status"
  fi
 }
 # Check 7: the login/oauth/callback path under FORGE_PATH must answer with a
 # 2xx/3xx status.
 test_forgejo_oauth_callback() {
  log_section "Test 7: Forgejo OAuth callback for Woodpecker under subpath"
  # Test that Forgejo OAuth callback path works (Woodpecker OAuth integration)
  local status
  # 000 on request failure: record it below rather than abort under `set -e`.
  status=$(http_head "$BASE_URL${FORGE_PATH}login/oauth/callback") || status="000"
  if [ "$status" -ge 200 ] && [ "$status" -lt 400 ]; then
    log_pass "Forgejo OAuth callback at ${BASE_URL}${FORGE_PATH}login/oauth/callback works"
  else
    log_fail "Forgejo OAuth callback returned unexpected status $status"
  fi
 }
 # ─────────────────────────────────────────────────────────────────────────────
 # Main
 # ─────────────────────────────────────────────────────────────────────────────
 # Entry point: run every check in order, print the counters, and exit 1 when
 # at least one check failed (0 otherwise).
 main() {
  log_info "Starting subpath routing smoke test"
  log_info "Base URL: $BASE_URL"
  log_info "Timeout: ${EDGE_TIMEOUT}s, Max retries: ${EDGE_MAX_RETRIES}"
  # Run all tests
  local check
  for check in \
    test_root_redirect \
    test_forgejo_subpath \
    test_woodpecker_subpath \
    test_chat_subpath \
    test_staging_subpath \
    test_forward_auth_rejection \
    test_forgejo_oauth_callback; do
    "$check"
  done
  # Summary
  log_section "Test Summary"
  log_info "Passed: $PASSED"
  log_info "Failed: $FAILED"
  log_info "Skipped: $SKIPPED"
  if [ "$FAILED" -le 0 ]; then
    log_pass "All tests passed!"
    exit 0
  fi
  log_fail "Some tests failed"
  exit 1
 }
 # Parse arguments
 # Supported: --base-url URL, --base-url=URL, --help. Anything else is an error.
 while [[ $# -gt 0 ]]; do
  case "$1" in
    --base-url)
      # Without this guard a trailing `--base-url` dies under `set -u` with an
      # opaque "unbound variable" message instead of a usage error.
      if [[ $# -lt 2 ]]; then
        echo "Option --base-url requires a value" >&2
        exit 1
      fi
      BASE_URL="$2"
      shift 2
      ;;
    --base-url=*)
      BASE_URL="${1#*=}"
      shift
      ;;
    --help)
      echo "Usage: $0 [options]"
      echo ""
      echo "Options:"
      echo "  --base-url URL     Set base URL (default: http://localhost)"
      echo "  --help             Show this help message"
      echo ""
      echo "Environment variables:"
      echo "  BASE_URL           Base URL for edge proxy (default: http://localhost)"
      echo "  EDGE_TIMEOUT       Request timeout in seconds (default: 30)"
      echo "  EDGE_MAX_RETRIES   Max retries per request (default: 3)"
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
 done
 main
--- a/tests/smoke-init.sh
+++ b/tests/smoke-init.sh
@ -15,6 +15,7 @@
 set -euo pipefail
 FACTORY_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
 export FACTORY_ROOT_REAL="$FACTORY_ROOT"
 # Always use localhost for mock Forgejo (in case FORGE_URL is set from docker-compose)
 export FORGE_URL="http://localhost:3000"
 MOCK_BIN="/tmp/smoke-mock-bin"
@ -427,14 +428,12 @@ rm -rf /tmp/smoke-claude-shared /tmp/smoke-home-claude
 # ── 8. Test duplicate service name detection ──────────────────────────────
 echo "=== 8/8 Testing duplicate service name detection ==="
-# Clean up for duplicate test
+# Isolated factory root — do NOT touch the real ${FACTORY_ROOT}/projects/
-rm -f "${FACTORY_ROOT}/projects/duplicate-test.toml"
+SMOKE_DUP_ROOT=$(mktemp -d)
-rm -f "${FACTORY_ROOT}/docker-compose.yml"
+mkdir -p "$SMOKE_DUP_ROOT/projects"
-
+cat > "$SMOKE_DUP_ROOT/projects/duplicate-test.toml" <<'TOMLEOF'
 # Create a TOML that would conflict with ENABLE_LLAMA_AGENT
 cat > "${FACTORY_ROOT}/projects/duplicate-test.toml" <<'TOMLEOF'
 name = "duplicate-test"
-description = "Test project for duplicate service detection"
+description = "dup-detection smoke"
 [ci]
 woodpecker_repo_id = "999"
@ -446,26 +445,29 @@ roles = ["dev"]
 forge_user = "llama-bot"
 TOMLEOF
-# Run disinto init with ENABLE_LLAMA_AGENT=1
+# Call the generator directly — no `disinto init` to overwrite the TOML.
-# This should fail because [agents.llama] conflicts with ENABLE_LLAMA_AGENT
+# FACTORY_ROOT tells generators.sh where projects/ + compose_file live.
-export ENABLE_LLAMA_AGENT="1"
+(
-export FORGE_URL="http://localhost:3000"
+  export FACTORY_ROOT="$SMOKE_DUP_ROOT"
-export SMOKE_FORGE_URL="$FORGE_URL"
+  export ENABLE_LLAMA_AGENT=1
-export FORGE_ADMIN_PASS="smoke-test-password-123"
+  # shellcheck disable=SC1091
-export SKIP_PUSH=true
+  source "${FACTORY_ROOT_REAL:-$(cd "$(dirname "$0")/.." && pwd)}/lib/generators.sh"
  # Use a temp file to capture output since pipefail will kill the pipeline
  # when _generate_compose_impl returns non-zero
  _generate_compose_impl > /tmp/smoke-dup-output.txt 2>&1 || true
  if grep -q "Duplicate service name" /tmp/smoke-dup-output.txt; then
    pass "Duplicate service detection: conflict between ENABLE_LLAMA_AGENT and [agents.llama] reported"
    rm -f /tmp/smoke-dup-output.txt
    exit 0
  else
    fail "Duplicate service detection: no error raised for ENABLE_LLAMA_AGENT + [agents.llama]"
    cat /tmp/smoke-dup-output.txt >&2
    rm -f /tmp/smoke-dup-output.txt
    exit 1
  fi
 ) || FAILED=1
-if bash "${FACTORY_ROOT}/bin/disinto" init \
+rm -rf "$SMOKE_DUP_ROOT"
  "duplicate-test" \
  --bare --yes \
  --forge-url "$FORGE_URL" \
  --repo-root "/tmp/smoke-test-repo" 2>&1 | grep -q "Duplicate service name 'agents-llama'"; then
  pass "Duplicate service detection: correctly detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]"
 else
  fail "Duplicate service detection: should have detected conflict between ENABLE_LLAMA_AGENT and [agents.llama]"
 fi
 # Clean up
 rm -f "${FACTORY_ROOT}/projects/duplicate-test.toml"
 unset ENABLE_LLAMA_AGENT
 # ── Summary ──────────────────────────────────────────────────────────────────
--- a/tests/test-caddyfile-routing.sh
+++ b/tests/test-caddyfile-routing.sh
@ -0,0 +1,231 @@
 #!/usr/bin/env bash
 # =============================================================================
 # test-caddyfile-routing.sh — Caddyfile routing block unit test
 #
 # Extracts the Caddyfile template from nomad/jobs/edge.hcl and validates its
 # structure without requiring a running Caddy instance.
 #
 # Checks:
 #   - Forgejo subpath (/forge/* -> :3000)
 #   - Woodpecker subpath (/ci/* -> :8000)
 #   - Staging subpath (/staging/* -> nomadService discovery)
 #   - Chat subpath (/chat/* with forward_auth and OAuth routes)
 #   - Root redirect to /forge/
 #
 # Usage:
 #   test-caddyfile-routing.sh
 #
 # Exit codes:
 #   0 — All checks passed
 #   1 — One or more checks failed
 # =============================================================================
 set -euo pipefail
 # Resolve this script's directory, the repo root above it, and the HCL job
 # file whose embedded Caddyfile template is validated.
 SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 REPO_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)
 EDGE_TEMPLATE="${REPO_ROOT}/nomad/jobs/edge.hcl"
 # Result counters
 FAILED=0 PASSED=0
 # ─────────────────────────────────────────────────────────────────────────────
 # Logging helpers
 # ─────────────────────────────────────────────────────────────────────────────
 # Informational line; does not touch any counter.
 tr_info() {
  printf '[INFO] %s\n' "$*"
 }
 # Record and print a passing check.
 tr_pass() {
  printf '[PASS] %s\n' "$*"
  PASSED=$((PASSED + 1))
 }
 # Record and print a failing check.
 tr_fail() {
  printf '[FAIL] %s\n' "$*"
  FAILED=$((FAILED + 1))
 }
 # Section banner surrounded by blank lines.
 tr_section() {
  printf '\n=== %s ===\n\n' "$*"
 }
 # ─────────────────────────────────────────────────────────────────────────────
 # Caddyfile extraction
 # ─────────────────────────────────────────────────────────────────────────────
 # Print the Caddyfile embedded in a Nomad job file.
 #   $1 — path to the HCL file
 # Takes the lines from `data = <<EOT` through a line consisting of `EOT`,
 # replaces the opening line with a marker comment and drops the closing line.
 # Returns 1 (with a message on stderr) when nothing could be extracted.
 extract_caddyfile() {
  local template_file="$1"
  local heredoc_range='/data[[:space:]]*=[[:space:]]*<<[Ee][Oo][Tt]/,/^EOT$/p'
  local rewrite_edges='1s/.*/# Caddyfile extracted from Nomad template/; $d'
  local extracted
  extracted=$(sed -n "$heredoc_range" "$template_file" | sed "$rewrite_edges")
  if [ -n "$extracted" ]; then
    echo "$extracted"
    return 0
  fi
  echo "ERROR: Could not extract Caddyfile template from $template_file" >&2
  return 1
 }
 # ─────────────────────────────────────────────────────────────────────────────
 # Validation functions
 # ─────────────────────────────────────────────────────────────────────────────
 # Forgejo: expects a `handle /forge/*` block and a reverse_proxy to 127.0.0.1:3000
 # somewhere in $CADDYFILE. Each expectation is recorded as its own pass/fail.
 check_forgejo_routing() {
  tr_section "Validating Forgejo routing"
  # handle block for /forge/*
  if ! grep -q "handle /forge/\*" <<< "$CADDYFILE"; then
    tr_fail "Missing Forgejo handle block (handle /forge/*)"
  else
    tr_pass "Forgejo handle block (handle /forge/*)"
  fi
  # upstream on port 3000
  if ! grep -q "reverse_proxy 127.0.0.1:3000" <<< "$CADDYFILE"; then
    tr_fail "Missing Forgejo reverse_proxy (127.0.0.1:3000)"
  else
    tr_pass "Forgejo reverse_proxy configured (127.0.0.1:3000)"
  fi
 }
 # Woodpecker: expects a `handle /ci/*` block and a reverse_proxy to 127.0.0.1:8000
 # somewhere in $CADDYFILE. Each expectation is recorded as its own pass/fail.
 check_woodpecker_routing() {
  tr_section "Validating Woodpecker routing"
  # handle block for /ci/*
  if ! grep -q "handle /ci/\*" <<< "$CADDYFILE"; then
    tr_fail "Missing Woodpecker handle block (handle /ci/*)"
  else
    tr_pass "Woodpecker handle block (handle /ci/*)"
  fi
  # upstream on port 8000
  if ! grep -q "reverse_proxy 127.0.0.1:8000" <<< "$CADDYFILE"; then
    tr_fail "Missing Woodpecker reverse_proxy (127.0.0.1:8000)"
  else
    tr_pass "Woodpecker reverse_proxy configured (127.0.0.1:8000)"
  fi
 }
 # Staging: expects a `handle /staging/*` block and a `nomadService` lookup
 # somewhere in $CADDYFILE. Each expectation is recorded as its own pass/fail.
 check_staging_routing() {
  tr_section "Validating Staging routing"
  # handle block for /staging/*
  if ! grep -q "handle /staging/\*" <<< "$CADDYFILE"; then
    tr_fail "Missing Staging handle block (handle /staging/*)"
  else
    tr_pass "Staging handle block (handle /staging/*)"
  fi
  # nomadService discovery (dynamic port)
  if ! grep -q "nomadService" <<< "$CADDYFILE"; then
    tr_fail "Missing Nomad service discovery for staging"
  else
    tr_pass "Staging uses Nomad service discovery"
  fi
 }
 # Chat: six independent expectations against $CADDYFILE — login handle,
 # OAuth-callback handle, /chat/* catch-all, reverse_proxy to 127.0.0.1:8080,
 # a forward_auth directive within 10 lines after a `handle /chat/*` line,
 # and the forward_auth verify URI. Each is recorded as its own pass/fail.
 check_chat_routing() {
  tr_section "Validating Chat routing"
  # login endpoint
  if ! grep -q "handle /chat/login" <<< "$CADDYFILE"; then
    tr_fail "Missing Chat login handle block (handle /chat/login)"
  else
    tr_pass "Chat login handle block (handle /chat/login)"
  fi
  # OAuth callback endpoint
  if ! grep -q "handle /chat/oauth/callback" <<< "$CADDYFILE"; then
    tr_fail "Missing Chat OAuth callback handle block (handle /chat/oauth/callback)"
  else
    tr_pass "Chat OAuth callback handle block (handle /chat/oauth/callback)"
  fi
  # catch-all for /chat/*
  if ! grep -q "handle /chat/\*" <<< "$CADDYFILE"; then
    tr_fail "Missing Chat catch-all handle block (handle /chat/*)"
  else
    tr_pass "Chat catch-all handle block (handle /chat/*)"
  fi
  # upstream on port 8080
  if ! grep -q "reverse_proxy 127.0.0.1:8080" <<< "$CADDYFILE"; then
    tr_fail "Missing Chat reverse_proxy (127.0.0.1:8080)"
  else
    tr_pass "Chat reverse_proxy configured (127.0.0.1:8080)"
  fi
  # forward_auth must appear shortly after the /chat/* handle line
  if ! grep -A10 "handle /chat/\*" <<< "$CADDYFILE" | grep -q "forward_auth"; then
    tr_fail "Missing forward_auth block for /chat/*"
  else
    tr_pass "forward_auth block configured for /chat/*"
  fi
  # forward_auth URI
  if ! grep -q "uri /chat/auth/verify" <<< "$CADDYFILE"; then
    tr_fail "Missing forward_auth URI (/chat/auth/verify)"
  else
    tr_pass "forward_auth URI configured (/chat/auth/verify)"
  fi
 }
 # Root: expects a `redir /forge/ 302` directive somewhere in $CADDYFILE.
 check_root_redirect() {
  tr_section "Validating root redirect"
  # root redirect to /forge/
  if ! grep -q "redir /forge/ 302" <<< "$CADDYFILE"; then
    tr_fail "Missing root redirect to /forge/"
  else
    tr_pass "Root redirect to /forge/ configured (302)"
  fi
 }
 # ─────────────────────────────────────────────────────────────────────────────
 # Main
 # ─────────────────────────────────────────────────────────────────────────────
 # Entry point: extract the Caddyfile template from $EDGE_TEMPLATE into the
 # global CADDYFILE, run every check_* validator, print the counters, and exit
 # 1 when extraction or any check failed (0 otherwise).
 main() {
  tr_info "Extracting Caddyfile template from $EDGE_TEMPLATE"
  # Extract Caddyfile. The `|| CADDYFILE=""` matters: under `set -e` a failing
  # command substitution would terminate the script right here, making the
  # explicit failure report below unreachable.
  CADDYFILE=$(extract_caddyfile "$EDGE_TEMPLATE") || CADDYFILE=""
  if [ -z "$CADDYFILE" ]; then
    tr_fail "Could not extract Caddyfile template"
    exit 1
  fi
  tr_pass "Caddyfile template extracted successfully"
  # Run all validation checks
  check_forgejo_routing
  check_woodpecker_routing
  check_staging_routing
  check_chat_routing
  check_root_redirect
  # Summary
  tr_section "Test Summary"
  tr_info "Passed: $PASSED"
  tr_info "Failed: $FAILED"
  if [ "$FAILED" -gt 0 ]; then
    tr_fail "Some checks failed"
    exit 1
  fi
  tr_pass "All routing blocks validated!"
  exit 0
 }
 main
--- a/tests/test-watchdog-process-group.sh
+++ b/tests/test-watchdog-process-group.sh
@ -0,0 +1,129 @@
 #!/usr/bin/env bash
 # test-watchdog-process-group.sh — Test that claude_run_with_watchdog kills orphan children
 #
 # This test verifies that when claude_run_with_watchdog terminates the Claude process,
 # all child processes (including those spawned by Claude's Bash tool) are also killed.
 #
 # Reproducer scenario:
 #   1. Create a fake "claude" stub that:
 #      a. Spawns a long-running child process (sleep 3600)
 #      b. Writes a result marker to stdout to trigger idle detection
 #      c. Stays running
 #   2. Run claude_run_with_watchdog with the stub
 #   3. Before the fix: sleep child survives (orphaned to PID 1)
 #   4. After the fix: sleep child dies (killed as part of process group with -PID)
 #
 # Usage: ./tests/test-watchdog-process-group.sh
 # Strict mode: abort on command failure, on unset variables and on any failing
 # pipeline stage.
 set -euo pipefail
 # Despite the name this is the parent of the directory that holds this script
 # (note the "/.."), i.e. the directory lib/agent-sdk.sh is resolved against.
 SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
 # Private scratch directory. mktemp creates it atomically under an
 # unpredictable name, so a pre-existing or attacker-planted
 # /tmp/test-watchdog-<pid> can never be reused by accident.
 TEST_TMP="$(mktemp -d "${TMPDIR:-/tmp}/test-watchdog-XXXXXX")"
 LOGFILE="${TEST_TMP}/log.txt"
 # Overall verdict; fail() flips it to false.
 PASS=true
 # Remove the scratch directory when the script ends. After a failed run
 # (PASS=false) the directory is kept, because the final message points the
 # reader at the log file stored inside it.
 # shellcheck disable=SC2317
 cleanup_test() {
  if [ "${PASS:-true}" = true ]; then
    rm -rf "$TEST_TMP"
  fi
 }
 # Clean up exactly once, through the EXIT trap. An INT/TERM handler that merely
 # returns lets bash resume the script after the signal (with the scratch
 # directory already gone), so those signals exit explicitly with the
 # conventional 128+signal status, which in turn fires the EXIT trap.
 trap cleanup_test EXIT
 trap 'exit 130' INT
 trap 'exit 143' TERM
 mkdir -p "$TEST_TMP"
 # Print a "[TEST] "-prefixed message to stdout and append the same line to
 # $LOGFILE. All arguments are joined into one message.
 log() {
  local line="[TEST] $*"
  printf '%s\n' "$line" | tee -a "$LOGFILE"
 }
 # Record a failed check: print/append a "[TEST] FAIL: " line (stdout and
 # $LOGFILE), then mark the overall run as failed via the global PASS flag.
 fail() {
  local line="[TEST] FAIL: $*"
  printf '%s\n' "$line" | tee -a "$LOGFILE"
  PASS=false
 }
 # Record a successful check: print a "[TEST] PASS: " line to stdout and append
 # it to $LOGFILE. The global PASS flag is left untouched.
 pass() {
  local line="[TEST] PASS: $*"
  printf '%s\n' "$line" | tee -a "$LOGFILE"
 }
 # Export required environment variables
 export CLAUDE_TIMEOUT=10       # Short timeout for testing
 export CLAUDE_IDLE_GRACE=2     # Short grace period for testing
 export LOGFILE="${LOGFILE}"    # Required by agent-sdk.sh
 # Create a fake claude stub that, in this order:
 # 1. Writes a result marker to stdout (to trigger the watchdog's idle-after-result path)
 # 2. Spawns a long-running child process (sleep 3600) that will become an orphan if only the parent is killed
 # 3. Stays running (looping over further sleep 3600 children) so the watchdog has to kill it
 # The heredoc delimiter is quoted, so nothing in the body is expanded by this
 # script: $! and $CHILD_PID are evaluated only when the stub itself runs.
 cat > "${TEST_TMP}/fake-claude" << 'FAKE_CLAUDE_EOF'
 #!/usr/bin/env bash
 # Fake claude that spawns a child and stays running
 # Simulates Claude's behavior when it spawns a Bash tool command
 # Write result marker to stdout (triggers watchdog idle detection)
 echo '{"type":"result","session_id":"test-session-123","verdict":"APPROVE"}'
 # Spawn a child that simulates Claude's Bash tool hanging
 # This is the process that should be killed when the parent is terminated
 sleep 3600 &
 CHILD_PID=$!
 # Log the child PID for debugging
 echo "FAKE_CLAUDE_CHILD_PID=$CHILD_PID" >&2
 # Stay running - sleep in a loop so the watchdog can kill us
 while true; do
  sleep 3600 &
  wait $! 2>/dev/null || true
 done
 FAKE_CLAUDE_EOF
 # The stub is passed to the watchdog as a command path, so make it executable
 chmod +x "${TEST_TMP}/fake-claude"
 log "Testing claude_run_with_watchdog process group cleanup..."
 # Source the library and run claude_run_with_watchdog
 # NOTE(review): the function is actually invoked in the child shell below,
 # which sources the library again; this parent-shell source looks redundant — confirm.
 cd "$SCRIPT_DIR"
 source lib/agent-sdk.sh
 log "Starting claude_run_with_watchdog with fake claude..."
 # Run the function directly (not as a script)
 # We need to capture output and redirect stderr
 # Details of the invocation:
 #   - `timeout 35` caps the whole run so a watchdog that never fires cannot hang the test
 #   - the command string is double-quoted, so ${SCRIPT_DIR}, ${LOGFILE},
 #     ${TEST_TMP} and ${OUTPUT_FILE} are expanded here, while the escaped \$?
 #     is evaluated by the child shell
 #   - stdout and stderr of the watchdog both land in $OUTPUT_FILE
 #   - `|| true` keeps `set -e` from aborting the test on a non-zero status
 #     from the child shell or from timeout; the verdict comes from the
 #     process checks that follow
 OUTPUT_FILE="${TEST_TMP}/output.txt"
 timeout 35 bash -c "
  source '${SCRIPT_DIR}/lib/agent-sdk.sh'
  CLAUDE_TIMEOUT=10 CLAUDE_IDLE_GRACE=2 LOGFILE='${LOGFILE}' claude_run_with_watchdog '${TEST_TMP}/fake-claude' > '${OUTPUT_FILE}' 2>&1
  exit \$?
 " || true
 # Give the watchdog a moment to clean up
 log "Waiting for cleanup..."
 sleep 5
 # More precise check: look for sleep 3600 processes
 # These would be the orphans from our fake claude
 # NOTE: any unrelated "sleep 3600" running on the host is counted as well.
 #
 # `grep -c` and `pgrep -c` already print "0" when nothing matches, but they
 # also exit non-zero. The fallback therefore has to be `|| true`: with
 # `|| echo "0"` the captured value becomes the two lines "0" and "0", and the
 # integer comparison below fails with "integer expression expected".
 # The process list is captured once so the count and the printed listing
 # always agree, and so a vanished process cannot abort the script via
 # `set -e -o pipefail`.
 ORPHAN_LIST=$(pgrep -a sleep 2>/dev/null | grep "sleep 3600" || true)
 ORPHAN_COUNT=$(printf '%s' "$ORPHAN_LIST" | grep -c . || true)
 if [ "${ORPHAN_COUNT:-0}" -gt 0 ]; then
  log "Found $ORPHAN_COUNT orphan sleep 3600 processes:"
  printf '%s\n' "$ORPHAN_LIST" | tee -a "$LOGFILE"
  fail "Orphan children found - process group cleanup did not work"
 else
  pass "No orphan children found - process group cleanup worked"
 fi
 # Also verify that the fake claude itself is not running
 # (an empty result, e.g. pgrep unavailable, is treated as 0)
 FAKE_CLAUDE_COUNT=$(pgrep -c -f "fake-claude" 2>/dev/null || true)
 if [ "${FAKE_CLAUDE_COUNT:-0}" -gt 0 ]; then
  log "Found $FAKE_CLAUDE_COUNT fake-claude processes still running"
  fail "Fake claude process(es) still running"
 else
  pass "Fake claude process terminated"
 fi
 # Summary: translate the accumulated PASS flag into the exit status
 # (0 when every check passed, 1 as soon as one called fail)
 echo ""
 if [ "$PASS" != true ]; then
  log "Some tests failed. See log at $LOGFILE"
  exit 1
 fi
 log "All tests passed!"
 exit 0
--- a/vault/policies/AGENTS.md
+++ b/vault/policies/AGENTS.md
@ -1,4 +1,4 @@
-<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
+<!-- last-reviewed: 88222503d5a2ff1d25e9f1cb254ed31f13ccea7f -->
 # vault/policies/ — Agent Instructions
 HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per