diff --git a/.gitignore b/.gitignore index 21c6fbc..a29450c 100644 --- a/.gitignore +++ b/.gitignore @@ -20,7 +20,6 @@ metrics/supervisor-metrics.jsonl # OS .DS_Store dev/ci-fixes-*.json -gardener/dust.jsonl # Individual encrypted secrets (managed by disinto secrets add) secrets/ diff --git a/.woodpecker/detect-duplicates.py b/.woodpecker/detect-duplicates.py index 0485833..473bb18 100644 --- a/.woodpecker/detect-duplicates.py +++ b/.woodpecker/detect-duplicates.py @@ -294,6 +294,45 @@ def main() -> int: "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern", # Standard lib source block shared across formula-driven agent run scripts "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)", + # Test data for duplicate service detection tests (#850) + # Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh + "334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)", + "d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)", + # Common vault-seed script patterns: logging helpers + flag parsing + # Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh + "843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)", + "ee51df9642f2ef37af73b0c15f4d8406": "Logging helpers + DRY_RUN loop start (vault-seed-woodpecker + wp-oauth-register)", + "9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)", + "9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)", + "5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)", + # forgejo-bootstrap.sh follows 
wp-oauth-register.sh pattern (issue #1069) + "2b80185e4ae2b54e2e01f33e5555c688": "Standard header (set -euo pipefail, SCRIPT_DIR, REPO_ROOT) (forgejo-bootstrap + wp-oauth-register)", + "38a1f20a60d69f0d6bfb06a0532b3bd7": "Logging helpers + DRY_RUN init (forgejo-bootstrap + wp-oauth-register)", + "4dd3c526fa29bdaa88b274c3d7d01032": "Flag parsing loop + case start (forgejo-bootstrap + wp-oauth-register)", + # Common vault-seed script preamble + precondition patterns + # Shared across tools/vault-seed-{forgejo,agents,woodpecker}.sh + "dff3675c151fcdbd2fef798826ae919b": "Vault-seed preamble: set -euo + path setup + source hvault.sh + KV_MOUNT", + "1cd9f0d083e24e6e6b2071db9b6dae09": "Vault-seed preconditions: binary check loop + VAULT_ADDR guard", + "63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die", + "34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup", + "71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die", + # Common vault-seed script flag parsing patterns + # Shared across tools/vault-seed-{forgejo,ops-repo}.sh + "6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)", + "a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)", + "e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)", + "c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)", + "106f4071e88f841b3208b01144cd1c39": "vault-seed case pattern dry-run end (forgejo + ops-repo)", + "c15506dcb6bb340b25d1c39d442dd2e6": "vault-seed help text + invalid arg handler (forgejo + ops-repo)", + "1feecd3b3caf00045fae938ddf2811de": "vault-seed invalid arg handler (forgejo + ops-repo)", + "919780d5e7182715344f5aa02b191294": "vault-seed invalid arg + esac pattern (forgejo + ops-repo)", + "8dce1d292bce8e60ef4c0665b62945b0": 
"vault-seed esac + binary check loop (forgejo + ops-repo)", + "ca043687143a5b47bd54e65a99ce8ee8": "vault-seed binary check loop start (forgejo + ops-repo)", + "aefd9f655411a955395e6e5995ddbe6f": "vault-seed binary check pattern (forgejo + ops-repo)", + "60f0c46deb5491599457efb4048918e5": "vault-seed VAULT_ADDR + hvault_token_lookup check (forgejo + ops-repo)", + "f6838f581ef6b4d82b55268389032769": "vault-seed VAULT_ADDR + hvault_token_lookup die (forgejo + ops-repo)", + # Common shell control-flow: if → return 1 → fi → fi (env.sh + register.sh) + "a8bdb7f1a5d8cbd0a5921b17b6cf6f4d": "Common shell control-flow (return 1 / fi / fi / return 0 / }) (env.sh + register.sh)", } if not sh_files: diff --git a/.woodpecker/edge-subpath.yml b/.woodpecker/edge-subpath.yml new file mode 100644 index 0000000..2c11980 --- /dev/null +++ b/.woodpecker/edge-subpath.yml @@ -0,0 +1,317 @@ +# ============================================================================= +# .woodpecker/edge-subpath.yml — Edge subpath routing static checks +# +# Static validation for edge subpath routing configuration. This pipeline does +# NOT run live service curls — it validates the configuration that would be +# used by a deployed edge proxy. +# +# Checks: +# 1. shellcheck — syntax check on tests/smoke-edge-subpath.sh +# 2. caddy validate — validate the Caddyfile template syntax +# 3. caddyfile-routing-test — verify Caddyfile routing block shape +# 4. test-caddyfile-routing — run standalone unit test for Caddyfile structure +# +# Triggers: +# - Pull requests that modify edge-related files +# +# Environment variables (inherited from WOODPECKER_ENVIRONMENT): +# EDGE_BASE_URL — Edge proxy URL for reference (default: http://localhost) +# EDGE_TIMEOUT — Request timeout in seconds (default: 30) +# EDGE_MAX_RETRIES — Max retries per request (default: 3) +# ============================================================================= + +when: + event: pull_request + +steps: + # ── 1. 
ShellCheck on smoke script ──────────────────────────────────────── + # `shellcheck` validates bash syntax, style, and common pitfalls. + # Exit codes: + # 0 — all checks passed + # 1 — one or more issues found + - name: shellcheck-smoke + image: koalaman/shellcheck-alpine:stable + commands: + - shellcheck --severity=warning tests/smoke-edge-subpath.sh tests/test-caddyfile-routing.sh + + # ── 2. Caddyfile template rendering ─────────────────────────────────────── + # Render a mock Caddyfile for validation. The template uses Nomad's + # templating syntax ({{ range ... }}) which must be processed before Caddy + # can validate it. We render a mock version with Nomad templates expanded + # to static values for validation purposes. + - name: render-caddyfile + image: alpine:3.19 + commands: + - apk add --no-cache coreutils + - | + set -e + mkdir -p edge-render + # Render mock Caddyfile with Nomad templates expanded + { + echo '# Caddyfile — edge proxy configuration (Nomad-rendered)' + echo '# Staging upstream discovered via Nomad service registration.' 
+ echo '' + echo ':80 {' + echo ' # Redirect root to Forgejo' + echo ' handle / {' + echo ' redir /forge/ 302' + echo ' }' + echo '' + echo ' # Reverse proxy to Forgejo' + echo ' handle /forge/* {' + echo ' reverse_proxy 127.0.0.1:3000' + echo ' }' + echo '' + echo ' # Reverse proxy to Woodpecker CI' + echo ' handle /ci/* {' + echo ' reverse_proxy 127.0.0.1:8000' + echo ' }' + echo '' + echo ' # Reverse proxy to staging — dynamic port via Nomad service discovery' + echo ' handle /staging/* {' + echo ' reverse_proxy 127.0.0.1:8081' + echo ' }' + echo '' + echo ' # Chat service — reverse proxy to disinto-chat backend (#705)' + echo ' # OAuth routes bypass forward_auth — unauthenticated users need these (#709)' + echo ' handle /chat/login {' + echo ' reverse_proxy 127.0.0.1:8080' + echo ' }' + echo ' handle /chat/oauth/callback {' + echo ' reverse_proxy 127.0.0.1:8080' + echo ' }' + echo ' # Defense-in-depth: forward_auth stamps X-Forwarded-User from session (#709)' + echo ' handle /chat/* {' + echo ' forward_auth 127.0.0.1:8080 {' + echo ' uri /chat/auth/verify' + echo ' copy_headers X-Forwarded-User' + echo ' header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}' + echo ' }' + echo ' reverse_proxy 127.0.0.1:8080' + echo ' }' + echo '}' + } > edge-render/Caddyfile + cp edge-render/Caddyfile edge-render/Caddyfile.rendered + echo "Caddyfile rendered successfully" + + # ── 3. Caddy config validation ─────────────────────────────────────────── + # `caddy validate` checks Caddyfile syntax and configuration. + # This validates the rendered Caddyfile against Caddy's parser. 
+ # Exit codes: + # 0 — configuration is valid + # 1 — configuration has errors + - name: caddy-validate + image: alpine:3.19 + commands: + - apk add --no-cache ca-certificates curl + - curl -sS -o /tmp/caddy "https://caddyserver.com/api/download?os=linux&arch=amd64" + - chmod +x /tmp/caddy + - /tmp/caddy version + - /tmp/caddy validate --config edge-render/Caddyfile.rendered --adapter caddyfile + + # ── 4. Caddyfile routing block shape test ───────────────────────────────── + # Verify that the Caddyfile contains all required routing blocks: + # - /forge/ — Forgejo subpath + # - /ci/ — Woodpecker subpath + # - /staging/ — Staging subpath + # - /chat/ — Chat subpath with forward_auth + # + # This is a unit test that validates the expected structure without + # requiring a running Caddy instance. + - name: caddyfile-routing-test + image: alpine:3.19 + commands: + - apk add --no-cache grep coreutils + - | + set -e + + CADDYFILE="edge-render/Caddyfile.rendered" + + echo "=== Validating Caddyfile routing blocks ===" + + # Check that all required subpath handlers exist + # POSIX-safe loop (alpine /bin/sh has no arrays) + FAILED=0 + for handler in "handle /forge/\*" "handle /ci/\*" "handle /staging/\*" "handle /chat/login" "handle /chat/oauth/callback" "handle /chat/\*"; do + if grep -q "$handler" "$CADDYFILE"; then + echo "[PASS] Found handler: $handler" + else + echo "[FAIL] Missing handler: $handler" + FAILED=1 + fi + done + + # Check forward_auth block exists for /chat/* + if grep -A5 "handle /chat/\*" "$CADDYFILE" | grep -q "forward_auth"; then + echo "[PASS] forward_auth block found for /chat/*" + else + echo "[FAIL] forward_auth block missing for /chat/*" + FAILED=1 + fi + + # Check reverse_proxy to Forgejo (port 3000) + if grep -q "reverse_proxy 127.0.0.1:3000" "$CADDYFILE"; then + echo "[PASS] Forgejo reverse_proxy configured (port 3000)" + else + echo "[FAIL] Forgejo reverse_proxy not configured" + FAILED=1 + fi + + # Check reverse_proxy to Woodpecker (port 
8000) + if grep -q "reverse_proxy 127.0.0.1:8000" "$CADDYFILE"; then + echo "[PASS] Woodpecker reverse_proxy configured (port 8000)" + else + echo "[FAIL] Woodpecker reverse_proxy not configured" + FAILED=1 + fi + + # Check reverse_proxy to Chat (port 8080) + if grep -q "reverse_proxy 127.0.0.1:8080" "$CADDYFILE"; then + echo "[PASS] Chat reverse_proxy configured (port 8080)" + else + echo "[FAIL] Chat reverse_proxy not configured" + FAILED=1 + fi + + # Check root redirect to /forge/ + if grep -q "redir /forge/ 302" "$CADDYFILE"; then + echo "[PASS] Root redirect to /forge/ configured" + else + echo "[FAIL] Root redirect to /forge/ not configured" + FAILED=1 + fi + + echo "" + if [ $FAILED -eq 0 ]; then + echo "=== All routing blocks validated ===" + exit 0 + else + echo "=== Routing block validation failed ===" >&2 + exit 1 + fi + + # ── 5. Standalone Caddyfile routing test ───────────────────────────────── + # Run the standalone unit test for Caddyfile routing block validation. + # This test extracts the Caddyfile template from edge.hcl and validates + # its structure without requiring a running Caddy instance. 
+ - name: test-caddyfile-routing + image: alpine:3.19 + commands: + - apk add --no-cache grep coreutils + - | + set -e + EDGE_TEMPLATE="nomad/jobs/edge.hcl" + + echo "=== Extracting Caddyfile template from $EDGE_TEMPLATE ===" + + # Extract the Caddyfile template (content between <&2 + exit 1 + fi + + echo "Caddyfile template extracted successfully" + echo "" + + FAILED=0 + + # Check Forgejo subpath + if echo "$CADDYFILE" | grep -q "handle /forge/\*"; then + echo "[PASS] Forgejo handle block" + else + echo "[FAIL] Forgejo handle block" + FAILED=1 + fi + + if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:3000"; then + echo "[PASS] Forgejo reverse_proxy (port 3000)" + else + echo "[FAIL] Forgejo reverse_proxy (port 3000)" + FAILED=1 + fi + + # Check Woodpecker subpath + if echo "$CADDYFILE" | grep -q "handle /ci/\*"; then + echo "[PASS] Woodpecker handle block" + else + echo "[FAIL] Woodpecker handle block" + FAILED=1 + fi + + if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:8000"; then + echo "[PASS] Woodpecker reverse_proxy (port 8000)" + else + echo "[FAIL] Woodpecker reverse_proxy (port 8000)" + FAILED=1 + fi + + # Check Staging subpath + if echo "$CADDYFILE" | grep -q "handle /staging/\*"; then + echo "[PASS] Staging handle block" + else + echo "[FAIL] Staging handle block" + FAILED=1 + fi + + if echo "$CADDYFILE" | grep -q "nomadService"; then + echo "[PASS] Staging Nomad service discovery" + else + echo "[FAIL] Staging Nomad service discovery" + FAILED=1 + fi + + # Check Chat subpath + if echo "$CADDYFILE" | grep -q "handle /chat/login"; then + echo "[PASS] Chat login handle block" + else + echo "[FAIL] Chat login handle block" + FAILED=1 + fi + + if echo "$CADDYFILE" | grep -q "handle /chat/oauth/callback"; then + echo "[PASS] Chat OAuth callback handle block" + else + echo "[FAIL] Chat OAuth callback handle block" + FAILED=1 + fi + + if echo "$CADDYFILE" | grep -q "handle /chat/\*"; then + echo "[PASS] Chat catch-all handle block" + else + echo 
"[FAIL] Chat catch-all handle block" + FAILED=1 + fi + + if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:8080"; then + echo "[PASS] Chat reverse_proxy (port 8080)" + else + echo "[FAIL] Chat reverse_proxy (port 8080)" + FAILED=1 + fi + + # Check forward_auth for chat + if echo "$CADDYFILE" | grep -A10 "handle /chat/\*" | grep -q "forward_auth"; then + echo "[PASS] forward_auth block for /chat/*" + else + echo "[FAIL] forward_auth block for /chat/*" + FAILED=1 + fi + + # Check root redirect + if echo "$CADDYFILE" | grep -q "redir /forge/ 302"; then + echo "[PASS] Root redirect to /forge/" + else + echo "[FAIL] Root redirect to /forge/" + FAILED=1 + fi + + echo "" + if [ $FAILED -eq 0 ]; then + echo "=== All routing blocks validated ===" + exit 0 + else + echo "=== Routing block validation failed ===" >&2 + exit 1 + fi diff --git a/AGENTS.md b/AGENTS.md index ad3867b..52ea01f 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is @@ -37,19 +37,20 @@ disinto/ (code repo) │ examples/ — example vault action TOMLs (promote, publish, release, webhook-call) ├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh │ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure) -│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825) -├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh -│ jobs/ — Nomad jobspecs 
(forgejo.hcl reads Vault secrets via template stanza, S2.4) +│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825); wp-oauth-register.sh (Forgejo OAuth2 app + Vault KV seeder for Woodpecker, S3.3); deploy.sh (dependency-ordered Nomad job deploy + health-wait, S4) +├── nomad/ server.hcl, client.hcl (allow_privileged for woodpecker-agent, S3-fix-5), vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh +│ jobs/ — Nomad jobspecs: forgejo.hcl (Vault secrets via template, S2.4); woodpecker-server.hcl + woodpecker-agent.hcl (host-net, docker.sock, Vault KV, S3.1-S3.2); agents.hcl (7 roles, llama, Vault-templated bot tokens, S4.1); vault-runner.hcl (parameterized batch dispatch, S5.3); staging.hcl (Caddy file-server, dynamic port — edge discovers via service registration, S5.2); chat.hcl (Claude chat UI, tmpfs via mount block, Vault OAuth secrets, S5.2); edge.hcl (Caddy proxy + dispatcher sidecar, S5.1) ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks) -├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/) -├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh) -│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh, vault-seed-forgejo.sh — Vault provisioning (S2.1/S2.2) +├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge (Caddy + chat server subprocess + dispatcher), chat (server.py, ui/ — copied into edge image at build time) +├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh; register.sh enforces: reserved-name blocklist, admin-approved allowlist via /var/lib/disinto/allowlist.json, per-caller attribution via --as forced-command arg stored as registered_by, append-only audit log 
at /var/log/disinto/edge-register.log, ownership check on deregister requiring pubkey match) +│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh — Vault provisioning (S2.1/S2.2) +│ vault-seed-.sh — per-service Vault secret seeders; auto-invoked by `bin/disinto --with ` (add a new file to support a new service) ├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md) ├── site/ disinto.ai website content -├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, disinto-init-nomad.bats) +├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, lib-generators.bats, vault-import.bats, disinto-init-nomad.bats) ├── templates/ Issue templates -├── bin/ The `disinto` CLI script +├── bin/ The `disinto` CLI script (`--with ` deploys services + runs their Vault seeders) ├── disinto-factory/ Setup documentation and skill ├── state/ Runtime state ├── .woodpecker/ Woodpecker CI pipeline configs diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 7f8b1f4..98c0e04 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index 2b676a3..b0893c4 100755 --- a/bin/disinto +++ b/bin/disinto @@ -12,6 +12,7 @@ # disinto secrets Manage encrypted secrets # disinto run Run action in ephemeral runner container # disinto ci-logs [--step ] Read CI logs from Woodpecker SQLite +# disinto backup create Export factory state for migration # # Usage: # disinto init https://github.com/user/repo @@ -39,7 +40,9 @@ source "${FACTORY_ROOT}/lib/generators.sh" source "${FACTORY_ROOT}/lib/forge-push.sh" source "${FACTORY_ROOT}/lib/ci-setup.sh" source "${FACTORY_ROOT}/lib/release.sh" +source "${FACTORY_ROOT}/lib/backup.sh" source "${FACTORY_ROOT}/lib/claude-config.sh" +source "${FACTORY_ROOT}/lib/disinto/backup.sh" # backup create/import # ── Helpers ────────────────────────────────────────────────────────────────── @@ -62,7 +65,9 
@@ Usage: disinto hire-an-agent [--formula ] [--local-model ] [--model ] Hire a new agent (create user + .profile repo; re-run to rotate credentials) disinto agent Manage agent state (enable/disable) + disinto backup create Export factory state (issues + ops bundle) disinto edge [options] Manage edge tunnel registrations + disinto backup Backup and restore factory state Edge subcommands: register [project] Register a new tunnel (generates keypair if needed) @@ -82,7 +87,7 @@ Init options: --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) --backend Orchestration backend: docker (default) | nomad - --with (nomad) Deploy services: forgejo[,...] (S1.3) + --with (nomad) Deploy services: forgejo,woodpecker,agents,staging,chat,edge[,...] (S1.3, S3.4, S4.2, S5.2, S5.5) --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) @@ -101,6 +106,18 @@ Hire an agent options: CI logs options: --step Filter logs to a specific step (e.g., smoke-init) + +Backup subcommands: + create Create backup of factory state to tarball + import Restore factory state from backup tarball + +Import behavior: + - Unpacks tarball to temp directory + - Creates disinto repo via Forgejo API (mirror config is manual) + - Creates disinto-ops repo and pushes refs from bundle + - Imports issues from issues/*.json (idempotent - skips existing) + - Logs issue number mapping (Forgejo auto-assigns numbers) + - Prints summary: created X repos, pushed Y refs, imported Z issues, skipped W EOF exit 1 } @@ -670,6 +687,7 @@ _disinto_init_nomad() { local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}" local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh" + local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh" local 
vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh" local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh" local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh" @@ -690,15 +708,22 @@ _disinto_init_nomad() { # --empty combined with --with or any --import-* flag, so reaching # this branch with those set is a bug in the caller. # - # On the default (non-empty) path, vault-apply-policies.sh and - # vault-nomad-auth.sh are invoked unconditionally — they are idempotent - # and cheap to re-run, and subsequent --with deployments depend on - # them. vault-import.sh is invoked only when an --import-* flag is set. + # On the default (non-empty) path, vault-engines.sh (enables the kv/ + # mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked + # unconditionally — they are idempotent and cheap to re-run, and + # subsequent --with deployments depend on them. vault-import.sh is + # invoked only when an --import-* flag is set. vault-engines.sh runs + # first because every policy and role below references kv/disinto/* + # paths, which 403 if the engine is not yet mounted (issue #912). local import_any=false if [ -n "$import_env" ] || [ -n "$import_sops" ]; then import_any=true fi if [ "$empty" != "true" ]; then + if [ ! -x "$vault_engines_sh" ]; then + echo "Error: ${vault_engines_sh} not found or not executable" >&2 + exit 1 + fi if [ ! -x "$vault_policies_sh" ]; then echo "Error: ${vault_policies_sh} not found or not executable" >&2 exit 1 @@ -737,10 +762,15 @@ _disinto_init_nomad() { exit 0 fi - # Vault policies + auth are invoked on every nomad real-run path - # regardless of --import-* flags (they're idempotent; S2.1 + S2.3). - # Mirror that ordering in the dry-run plan so the operator sees the - # full sequence Step 2 will execute. + # Vault engines + policies + auth are invoked on every nomad real-run + # path regardless of --import-* flags (they're idempotent; S2.1 + S2.3). 
+ # Engines runs first because policies/roles/templates all reference the + # kv/ mount it enables (issue #912). Mirror that ordering in the + # dry-run plan so the operator sees the full sequence Step 2 will + # execute. + echo "── Vault engines dry-run ──────────────────────────────" + echo "[engines] [dry-run] ${vault_engines_sh} --dry-run" + echo "" echo "── Vault policies dry-run ─────────────────────────────" echo "[policies] [dry-run] ${vault_policies_sh} --dry-run" echo "" @@ -770,19 +800,37 @@ _disinto_init_nomad() { fi if [ -n "$with_services" ]; then - echo "── Deploy services dry-run ────────────────────────────" - echo "[deploy] services to deploy: ${with_services}" - local IFS=',' - for svc in $with_services; do - svc=$(echo "$svc" | xargs) # trim whitespace - # Validate known services first + # Interleaved seed/deploy per service (S2.6, #928, #948): match the + # real-run path so dry-run output accurately represents execution order. + # Build ordered deploy list: only include services present in with_services + local DEPLOY_ORDER="" + for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do + if echo ",$with_services," | grep -q ",$ordered_svc,"; then + DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" + fi + done + + local IFS=' ' + echo "[deploy] deployment order: ${DEPLOY_ORDER}" + for svc in $DEPLOY_ORDER; do + # Seed this service (if seed script exists) + local seed_name="$svc" case "$svc" in - forgejo) ;; - *) - echo "Error: unknown service '${svc}' — known: forgejo" >&2 - exit 1 - ;; + woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; + agents) seed_name="agents" ;; + chat) seed_name="chat" ;; + edge) seed_name="ops-repo" ;; esac + local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" + if [ -x "$seed_script" ]; then + echo "── Vault seed dry-run ─────────────────────────────────" + echo "[seed] [dry-run] ${seed_script} --dry-run" + echo "" + fi + + # Deploy this service 
+ echo "── Deploy services dry-run ────────────────────────────" + echo "[deploy] services to deploy: ${with_services}" local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 @@ -790,9 +838,40 @@ _disinto_init_nomad() { fi echo "[deploy] [dry-run] nomad job validate ${jobspec_path}" echo "[deploy] [dry-run] nomad job run -detach ${jobspec_path}" + # Post-deploy: forgejo-bootstrap + if [ "$svc" = "forgejo" ]; then + local bootstrap_script="${FACTORY_ROOT}/lib/init/nomad/forgejo-bootstrap.sh" + echo "[deploy] [dry-run] [post-deploy] would run ${bootstrap_script}" + fi done echo "[deploy] dry-run complete" fi + + # Dry-run vault-runner (unconditionally, not gated by --with) + echo "" + echo "── Vault-runner dry-run ───────────────────────────────────" + local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" + if [ -f "$vault_runner_path" ]; then + echo "[deploy] vault-runner: [dry-run] nomad job validate ${vault_runner_path}" + echo "[deploy] vault-runner: [dry-run] nomad job run -detach ${vault_runner_path}" + else + echo "[deploy] vault-runner: jobspec not found, skipping" + fi + + # Build custom images dry-run (if agents, chat, or edge services are included) + if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then + echo "" + echo "── Build images dry-run ──────────────────────────────" + if echo ",$with_services," | grep -q ",agents,"; then + echo "[build] [dry-run] docker build -t disinto/agents:local -f ${FACTORY_ROOT}/docker/agents/Dockerfile ${FACTORY_ROOT}" + fi + if echo ",$with_services," | grep -q ",chat,"; then + echo "[build] [dry-run] docker build -t disinto/chat:local -f ${FACTORY_ROOT}/docker/chat/Dockerfile ${FACTORY_ROOT}/docker/chat" + fi + if echo ",$with_services," | grep -q ",edge,"; then + echo "[build] [dry-run] docker build -t disinto/edge:local -f ${FACTORY_ROOT}/docker/edge/Dockerfile ${FACTORY_ROOT}/docker/edge" + fi + fi 
exit 0 fi @@ -814,6 +893,22 @@ _disinto_init_nomad() { exit 0 fi + # Enable Vault secret engines (S2.1 / issue #912) — must precede + # policies/auth/import because every policy and every import target + # addresses paths under kv/. Idempotent, safe to re-run. + echo "" + echo "── Enabling Vault secret engines ──────────────────────" + local -a engines_cmd=("$vault_engines_sh") + if [ "$(id -u)" -eq 0 ]; then + "${engines_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${engines_cmd[@]}" || exit $? + fi + # Apply Vault policies (S2.1) — idempotent, safe to re-run. echo "" echo "── Applying Vault policies ────────────────────────────" @@ -864,44 +959,147 @@ _disinto_init_nomad() { echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" fi - # Deploy services if requested - if [ -n "$with_services" ]; then + # Build custom images required by Nomad jobs (S4.2, S5.2, S5.5) — before deploy. + # Single-node factory dev box: no multi-node pull needed, no registry auth. + # Can upgrade to approach B (registry push/pull) later if multi-node. + if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then echo "" - echo "── Deploying services ─────────────────────────────────" - local -a deploy_cmd=("$deploy_sh") - # Split comma-separated service list into positional args - local IFS=',' - for svc in $with_services; do - svc=$(echo "$svc" | xargs) # trim whitespace - if ! 
echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then - echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2 - exit 1 + echo "── Building custom images ─────────────────────────────" + if echo ",$with_services," | grep -q ",agents,"; then + local tag="disinto/agents:local" + echo "── Building $tag ─────────────────────────────" + docker build -t "$tag" -f "${FACTORY_ROOT}/docker/agents/Dockerfile" "${FACTORY_ROOT}" 2>&1 | tail -5 + fi + if echo ",$with_services," | grep -q ",chat,"; then + local tag="disinto/chat:local" + echo "── Building $tag ─────────────────────────────" + docker build -t "$tag" -f "${FACTORY_ROOT}/docker/chat/Dockerfile" "${FACTORY_ROOT}/docker/chat" 2>&1 | tail -5 + fi + if echo ",$with_services," | grep -q ",edge,"; then + local tag="disinto/edge:local" + echo "── Building $tag ─────────────────────────────" + docker build -t "$tag" -f "${FACTORY_ROOT}/docker/edge/Dockerfile" "${FACTORY_ROOT}/docker/edge" 2>&1 | tail -5 + fi + fi + + # Interleaved seed/deploy per service (S2.6, #928, #948). + # We interleave seed + deploy per service (not batch all seeds then all deploys) + # so that OAuth-dependent services can reach their dependencies during seeding. + # E.g., seed-forgejo → deploy-forgejo → seed-woodpecker (OAuth can now reach + # running forgejo) → deploy-woodpecker. 
+ if [ -n "$with_services" ]; then + local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}" + + # Build ordered deploy list (S3.4, S4.2, S5.2, S5.5): forgejo → woodpecker-server → woodpecker-agent → agents → staging → chat → edge + local DEPLOY_ORDER="" + for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do + if echo ",$with_services," | grep -q ",$ordered_svc,"; then + DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" fi - # Validate known services FIRST (before jobspec check) + done + + local IFS=' ' + for svc in $DEPLOY_ORDER; do + # Seed this service (if seed script exists) + local seed_name="$svc" case "$svc" in - forgejo) ;; - *) - echo "Error: unknown service '${svc}' — known: forgejo" >&2 - exit 1 + woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; + agents) seed_name="agents" ;; + chat) seed_name="chat" ;; + edge) seed_name="ops-repo" ;; + esac + local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" + if [ -x "$seed_script" ]; then + echo "" + echo "── Seeding Vault for ${seed_name} ───────────────────────────" + if [ "$(id -u)" -eq 0 ]; then + VAULT_ADDR="$vault_addr" "$seed_script" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-seed-${seed_name}.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $? + fi + fi + + # Deploy this service + echo "" + echo "── Deploying ${svc} ───────────────────────────────────────" + + # Seed host volumes before deployment (if needed) + case "$svc" in + staging) + # Seed site-content host volume (/srv/disinto/docker) with static content + # The staging jobspec mounts this volume read-only to /srv/site + local site_content_src="${FACTORY_ROOT}/docker/index.html" + local site_content_dst="/srv/disinto/docker" + if [ -f "$site_content_src" ] && [ -d "$site_content_dst" ]; then + if ! 
cmp -s "$site_content_src" "${site_content_dst}/index.html" 2>/dev/null; then + echo "[staging] seeding site-content volume..." + cp "$site_content_src" "${site_content_dst}/index.html" + fi + fi ;; esac - # Check jobspec exists + local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 exit 1 fi - deploy_cmd+=("$svc") + + local -a deploy_cmd=("$deploy_sh" "$svc") + if [ "$(id -u)" -eq 0 ]; then + "${deploy_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: deploy.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n --preserve-env=FORGE_ADMIN_PASS,FORGE_TOKEN,FORGE_URL -- "${deploy_cmd[@]}" || exit $? + fi + + # Post-deploy: bootstrap Forgejo admin user after forgejo deployment + if [ "$svc" = "forgejo" ]; then + echo "" + echo "── Bootstrapping Forgejo admin user ───────────────────────" + local bootstrap_script="${FACTORY_ROOT}/lib/init/nomad/forgejo-bootstrap.sh" + if [ -x "$bootstrap_script" ]; then + if [ "$(id -u)" -eq 0 ]; then + "$bootstrap_script" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: forgejo-bootstrap.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n --preserve-env=FORGE_ADMIN_PASS,FORGE_TOKEN,FORGE_URL -- "$bootstrap_script" || exit $? + fi + else + echo "warning: forgejo-bootstrap.sh not found or not executable" >&2 + fi + fi done - if [ "$(id -u)" -eq 0 ]; then - "${deploy_cmd[@]}" || exit $? - else - if ! 
command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 + # Run vault-runner (unconditionally, not gated by --with) — infrastructure job + # vault-runner is always present since it's needed for vault action dispatch + echo "" + echo "── Running vault-runner ────────────────────────────────────" + local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" + if [ -f "$vault_runner_path" ]; then + echo "[deploy] vault-runner: running Nomad job (infrastructure)" + local -a vault_runner_cmd=("$deploy_sh" "vault-runner") + if [ "$(id -u)" -eq 0 ]; then + "${vault_runner_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: deploy.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${vault_runner_cmd[@]}" || exit $? fi - sudo -n -- "${deploy_cmd[@]}" || exit $? + else + echo "[deploy] vault-runner: jobspec not found, skipping" fi # Print final summary @@ -919,9 +1117,24 @@ _disinto_init_nomad() { echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)" fi echo "Deployed: ${with_services}" - if echo "$with_services" | grep -q "forgejo"; then + if echo ",$with_services," | grep -q ",forgejo,"; then echo "Ports: forgejo: 3000" fi + if echo ",$with_services," | grep -q ",woodpecker-server,"; then + echo " woodpecker-server: 8000" + fi + if echo ",$with_services," | grep -q ",woodpecker-agent,"; then + echo " woodpecker-agent: (agent connected)" + fi + if echo ",$with_services," | grep -q ",agents,"; then + echo " agents: (polling loop running)" + fi + if echo ",$with_services," | grep -q ",staging,"; then + echo " staging: (internal, no external port)" + fi + if echo ",$with_services," | grep -q ",chat,"; then + echo " chat: 8080" + fi echo "────────────────────────────────────────────────────────" fi @@ -1007,6 +1220,70 @@ disinto_init() { exit 1 fi + # Normalize --with services (S3.4): expand 
'woodpecker' shorthand to + # 'woodpecker-server,woodpecker-agent', auto-include forgejo when + # woodpecker is requested (OAuth dependency), and validate all names. + if [ -n "$with_services" ]; then + # Expand 'woodpecker' (bare) → 'woodpecker-server,woodpecker-agent'. + # Must not match already-expanded 'woodpecker-server'/'woodpecker-agent'. + local expanded="" + local IFS=',' + for _svc in $with_services; do + _svc=$(echo "$_svc" | xargs) + case "$_svc" in + woodpecker) _svc="woodpecker-server,woodpecker-agent" ;; + agents) _svc="agents" ;; + esac + expanded="${expanded:+${expanded},}${_svc}" + done + with_services="$expanded" + unset IFS + + # Auto-include forgejo when woodpecker is requested + if echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent," \ + && ! echo ",$with_services," | grep -q ",forgejo,"; then + echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)" + with_services="forgejo,${with_services}" + fi + + # Auto-include forgejo and woodpecker when agents is requested + if echo ",$with_services," | grep -q ",agents,"; then + if ! echo ",$with_services," | grep -q ",forgejo,"; then + echo "Note: --with agents implies --with forgejo (agents need forge)" + with_services="forgejo,${with_services}" + fi + if ! echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent,"; then + echo "Note: --with agents implies --with woodpecker (agents need CI)" + with_services="${with_services},woodpecker-server,woodpecker-agent" + fi + fi + + # Auto-include all dependencies when edge is requested (S5.5) + if echo ",$with_services," | grep -q ",edge,"; then + # Edge depends on all backend services + for dep in forgejo woodpecker-server woodpecker-agent agents staging chat; do + if ! 
echo ",$with_services," | grep -q ",${dep},"; then + echo "Note: --with edge implies --with ${dep} (edge depends on all backend services)" + with_services="${with_services},${dep}" + fi + done + fi + + # Validate all service names are known + local IFS=',' + for _svc in $with_services; do + _svc=$(echo "$_svc" | xargs) + case "$_svc" in + forgejo|woodpecker-server|woodpecker-agent|agents|staging|chat|edge) ;; + *) + echo "Error: unknown service '${_svc}' — known: forgejo, woodpecker-server, woodpecker-agent, agents, staging, chat, edge" >&2 + exit 1 + ;; + esac + done + unset IFS + fi + # --import-* flag validation (S2.5). These three flags form an import # triple and must be consistent before dispatch: sops encryption is # useless without the age key to decrypt it, so either both --import-sops @@ -1197,6 +1474,36 @@ p.write_text(text) exit 0 fi + # Configure Forgejo and Woodpecker URLs when EDGE_TUNNEL_FQDN is set. + # In subdomain mode, uses per-service FQDNs at root path instead of subpath URLs. + # Must run BEFORE generate_compose so the .env file is available for variable substitution. + if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then + local routing_mode="${EDGE_ROUTING_MODE:-subpath}" + # Create .env file if it doesn't exist yet (needed before compose generation) + if [ "$bare" = false ] && [ ! -f "${FACTORY_ROOT}/.env" ]; then + touch "${FACTORY_ROOT}/.env" + fi + if [ "$routing_mode" = "subdomain" ]; then + # Subdomain mode: Forgejo at forge..disinto.ai (root path) + if ! grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then + echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN_FORGE:-forge.${EDGE_TUNNEL_FQDN}}/" >> "${FACTORY_ROOT}/.env" + fi + # Subdomain mode: Woodpecker at ci..disinto.ai (root path) + if ! 
grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then + echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN_CI:-ci.${EDGE_TUNNEL_FQDN}}" >> "${FACTORY_ROOT}/.env" + fi + else + # Subpath mode: Forgejo ROOT_URL with /forge/ subpath (trailing slash required) + if ! grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then + echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN}/forge/" >> "${FACTORY_ROOT}/.env" + fi + # Subpath mode: Woodpecker WOODPECKER_HOST with /ci subpath (no trailing slash for v3) + if ! grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then + echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN}/ci" >> "${FACTORY_ROOT}/.env" + fi + fi + fi + # Generate compose files (unless --bare) if [ "$bare" = false ]; then local forge_port @@ -1211,18 +1518,6 @@ p.write_text(text) touch "${FACTORY_ROOT}/.env" fi - # Configure Forgejo and Woodpecker subpath URLs when EDGE_TUNNEL_FQDN is set - if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then - # Forgejo ROOT_URL with /forge/ subpath (note trailing slash - Forgejo needs it) - if ! grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN}/forge/" >> "${FACTORY_ROOT}/.env" - fi - # Woodpecker WOODPECKER_HOST with /ci subpath (no trailing slash for v3) - if ! grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN}/ci" >> "${FACTORY_ROOT}/.env" - fi - fi - # Prompt for FORGE_ADMIN_PASS before setup_forge # This ensures the password is set before Forgejo user creation prompt_admin_password "${FACTORY_ROOT}/.env" @@ -1326,9 +1621,15 @@ p.write_text(text) create_woodpecker_oauth "$forge_url" "$forge_repo" # Create OAuth2 app on Forgejo for disinto-chat (#708) + # In subdomain mode, callback is at chat. root instead of /chat/ subpath. 
local chat_redirect_uri if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then - chat_redirect_uri="https://${EDGE_TUNNEL_FQDN}/chat/oauth/callback" + local chat_routing_mode="${EDGE_ROUTING_MODE:-subpath}" + if [ "$chat_routing_mode" = "subdomain" ]; then + chat_redirect_uri="https://${EDGE_TUNNEL_FQDN_CHAT:-chat.${EDGE_TUNNEL_FQDN}}/oauth/callback" + else + chat_redirect_uri="https://${EDGE_TUNNEL_FQDN}/chat/oauth/callback" + fi else chat_redirect_uri="http://localhost/chat/oauth/callback" fi @@ -2528,15 +2829,29 @@ disinto_edge() { # Write to .env (replace existing entries to avoid duplicates) local tmp_env tmp_env=$(mktemp) - grep -Ev "^EDGE_TUNNEL_(HOST|PORT|FQDN)=" "$env_file" > "$tmp_env" 2>/dev/null || true + grep -Ev "^EDGE_TUNNEL_(HOST|PORT|FQDN|FQDN_FORGE|FQDN_CI|FQDN_CHAT)=" "$env_file" > "$tmp_env" 2>/dev/null || true mv "$tmp_env" "$env_file" echo "EDGE_TUNNEL_HOST=${edge_host}" >> "$env_file" echo "EDGE_TUNNEL_PORT=${port}" >> "$env_file" echo "EDGE_TUNNEL_FQDN=${fqdn}" >> "$env_file" + # Subdomain mode: write per-service FQDNs (#1028) + local reg_routing_mode="${EDGE_ROUTING_MODE:-subpath}" + if [ "$reg_routing_mode" = "subdomain" ]; then + echo "EDGE_TUNNEL_FQDN_FORGE=forge.${fqdn}" >> "$env_file" + echo "EDGE_TUNNEL_FQDN_CI=ci.${fqdn}" >> "$env_file" + echo "EDGE_TUNNEL_FQDN_CHAT=chat.${fqdn}" >> "$env_file" + fi + echo "Registered: ${project}" echo " Port: ${port}" echo " FQDN: ${fqdn}" + if [ "$reg_routing_mode" = "subdomain" ]; then + echo " Mode: subdomain" + echo " Forge: forge.${fqdn}" + echo " CI: ci.${fqdn}" + echo " Chat: chat.${fqdn}" + fi echo " Saved to: ${env_file}" ;; @@ -2570,12 +2885,23 @@ disinto_edge() { edge_host="${EDGE_HOST:-edge.disinto.ai}" fi + # Read tunnel pubkey for ownership proof + local secrets_dir="${FACTORY_ROOT}/secrets" + local tunnel_pubkey="${secrets_dir}/tunnel_key.pub" + if [ ! 
-f "$tunnel_pubkey" ]; then + echo "Error: tunnel keypair not found at ${tunnel_pubkey}" >&2 + echo "Cannot prove ownership without the tunnel public key." >&2 + exit 1 + fi + local pubkey + pubkey=$(tr -d '\n' < "$tunnel_pubkey") + # SSH to edge host and deregister echo "Deregistering tunnel for ${project} on ${edge_host}..." local response response=$(ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \ "disinto-register@${edge_host}" \ - "deregister ${project}" 2>&1) || { + "deregister ${project} ${pubkey}" 2>&1) || { echo "Error: failed to deregister tunnel" >&2 echo "Response: ${response}" >&2 exit 1 @@ -2658,6 +2984,33 @@ EOF esac } +# ── backup command ──────────────────────────────────────────────────────────── +# Usage: disinto backup [args] +# Subcommands: +# create Create backup of factory state +# import Restore factory state from backup +disinto_backup() { + local subcmd="${1:-}" + shift || true + + case "$subcmd" in + create) + backup_create "$@" + ;; + import) + backup_import "$@" + ;; + *) + echo "Usage: disinto backup [args]" >&2 + echo "" >&2 + echo "Subcommands:" >&2 + echo " create Create backup of factory state" >&2 + echo " import Restore factory state from backup" >&2 + exit 1 + ;; + esac +} + # ── Main dispatch ──────────────────────────────────────────────────────────── case "${1:-}" in @@ -2674,6 +3027,7 @@ case "${1:-}" in hire-an-agent) shift; disinto_hire_an_agent "$@" ;; agent) shift; disinto_agent "$@" ;; edge) shift; disinto_edge "$@" ;; + backup) shift; disinto_backup "$@" ;; -h|--help) usage ;; *) usage ;; esac diff --git a/dev/AGENTS.md b/dev/AGENTS.md index 13d9736..d48f6b6 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/docker-compose.yml b/docker-compose.yml index ba8c77c..6206b2c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,7 +15,6 @@ services: - project-repos:/home/agent/repos - 
${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro - - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -78,7 +77,6 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro - - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -139,7 +137,6 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro - - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -211,8 +208,8 @@ services: edge: build: - context: docker/edge - dockerfile: Dockerfile + context: . 
+ dockerfile: docker/edge/Dockerfile image: disinto/edge:latest container_name: disinto-edge security_opt: @@ -223,6 +220,8 @@ services: - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/root/.claude.json:ro - ${CLAUDE_DIR:-${HOME}/.claude}:/root/.claude:ro - disinto-logs:/opt/disinto-logs + # Chat history persistence (merged from chat container, #1083) + - ${CHAT_HISTORY_DIR:-./state/chat-history}:/var/lib/chat/history environment: - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-} - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} @@ -234,6 +233,17 @@ services: - PRIMARY_BRANCH=main - DISINTO_CONTAINER=1 - FORGE_ADMIN_USERS=disinto-admin,vault-bot,admin + # Chat env vars (merged from chat container into edge, #1083) + - CHAT_HOST=127.0.0.1 + - CHAT_PORT=8080 + - CHAT_OAUTH_CLIENT_ID=${CHAT_OAUTH_CLIENT_ID:-} + - CHAT_OAUTH_CLIENT_SECRET=${CHAT_OAUTH_CLIENT_SECRET:-} + - DISINTO_CHAT_ALLOWED_USERS=${DISINTO_CHAT_ALLOWED_USERS:-} + - FORWARD_AUTH_SECRET=${FORWARD_AUTH_SECRET:-} + - EDGE_TUNNEL_FQDN=${EDGE_TUNNEL_FQDN:-} + - EDGE_TUNNEL_FQDN_CHAT=${EDGE_TUNNEL_FQDN_CHAT:-} + - EDGE_ROUTING_MODE=${EDGE_ROUTING_MODE:-subpath} + # Rate limiting removed (#1084) ports: - "80:80" - "443:443" diff --git a/docker/agents/Dockerfile b/docker/agents/Dockerfile index 1bcba89..fa3b2d8 100644 --- a/docker/agents/Dockerfile +++ b/docker/agents/Dockerfile @@ -1,21 +1,26 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ - bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ + bash curl git jq tmux nodejs npm python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ && pip3 install --break-system-packages networkx tomlkit \ && rm -rf /var/lib/apt/lists/* # Pre-built binaries (copied from docker/agents/bin/) # SOPS — encrypted data decryption tool -COPY docker/agents/bin/sops /usr/local/bin/sops -RUN chmod +x /usr/local/bin/sops +# Download sops binary (replaces manual COPY of 
vendored binary) +ARG SOPS_VERSION=3.9.4 +RUN curl -fsSL "https://github.com/getsops/sops/releases/download/v${SOPS_VERSION}/sops-v${SOPS_VERSION}.linux.amd64" \ + -o /usr/local/bin/sops && chmod +x /usr/local/bin/sops # tea CLI — official Gitea/Forgejo CLI for issue/label/comment operations -COPY docker/agents/bin/tea /usr/local/bin/tea -RUN chmod +x /usr/local/bin/tea +# Download tea binary (replaces manual COPY of vendored binary) +ARG TEA_VERSION=0.9.2 +RUN curl -fsSL "https://dl.gitea.com/tea/${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \ + -o /usr/local/bin/tea && chmod +x /usr/local/bin/tea -# Claude CLI is mounted from the host via docker-compose volume. -# No internet access to cli.anthropic.com required at build time. +# Install Claude Code CLI — agent runtime for all LLM backends (llama, Claude API). +# The CLI is the execution environment; ANTHROPIC_BASE_URL selects the model provider. +RUN npm install -g @anthropic-ai/claude-code@2.1.84 # Non-root user RUN useradd -m -u 1000 -s /bin/bash agent diff --git a/docker/chat/Dockerfile b/docker/chat/Dockerfile deleted file mode 100644 index 3d89863..0000000 --- a/docker/chat/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -# disinto-chat — minimal HTTP backend for Claude chat UI -# -# Small Debian slim base with Python runtime. -# Chosen for simplicity and small image size (~100MB). -# -# Image size: ~100MB (well under the 200MB ceiling) -# -# The claude binary is mounted from the host at runtime via docker-compose, -# not baked into the image — same pattern as the agents container. 
- -FROM debian:bookworm-slim - -# Install Python (no build-time network access needed) -RUN apt-get update && apt-get install -y --no-install-recommends \ - python3 \ - && rm -rf /var/lib/apt/lists/* - -# Non-root user — fixed UID 10001 for sandbox hardening (#706) -RUN useradd -m -u 10001 -s /bin/bash chat - -# Copy application files -COPY server.py /usr/local/bin/server.py -COPY entrypoint-chat.sh /entrypoint-chat.sh -COPY ui/ /var/chat/ui/ - -RUN chmod +x /entrypoint-chat.sh /usr/local/bin/server.py - -USER chat -WORKDIR /var/chat - -EXPOSE 8080 -HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ - CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')" || exit 1 - -ENTRYPOINT ["/entrypoint-chat.sh"] diff --git a/docker/chat/entrypoint-chat.sh b/docker/chat/entrypoint-chat.sh deleted file mode 100755 index 00fbe53..0000000 --- a/docker/chat/entrypoint-chat.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# entrypoint-chat.sh — Start the disinto-chat backend server -# -# Exec-replace pattern: this script is the container entrypoint and runs -# the server directly (no wrapper needed). Logs to stdout for docker logs. - -LOGFILE="/tmp/chat.log" - -log() { - printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" | tee -a "$LOGFILE" -} - -# Sandbox sanity checks (#706) — fail fast if isolation is broken -if [ -e /var/run/docker.sock ]; then - log "FATAL: /var/run/docker.sock is accessible — sandbox violation" - exit 1 -fi -if [ "$(id -u)" = "0" ]; then - log "FATAL: running as root (uid 0) — sandbox violation" - exit 1 -fi - -# Verify Claude CLI is available (expected via volume mount from host). -if ! 
command -v claude &>/dev/null; then - log "FATAL: claude CLI not found in PATH" - log "Mount the host binary into the container, e.g.:" - log " volumes:" - log " - /usr/local/bin/claude:/usr/local/bin/claude:ro" - exit 1 -fi -log "Claude CLI: $(claude --version 2>&1 || true)" - -# Start the Python server (exec-replace so signals propagate correctly) -log "Starting disinto-chat server on port 8080..." -exec python3 /usr/local/bin/server.py diff --git a/docker/chat/server.py b/docker/chat/server.py index 6748354..48944d1 100644 --- a/docker/chat/server.py +++ b/docker/chat/server.py @@ -20,9 +20,15 @@ OAuth flow: 6. Redirects to /chat/ The claude binary is expected to be mounted from the host at /usr/local/bin/claude. + +Workspace access: + - CHAT_WORKSPACE_DIR environment variable: bind-mounted project working tree + - Claude invocation uses --permission-mode acceptEdits for code modification + - CWD is set to workspace directory when configured, enabling Claude to + inspect, explain, or modify code scoped to that tree only """ -import datetime +import asyncio import json import os import re @@ -30,21 +36,33 @@ import secrets import subprocess import sys import time +import threading from http.server import HTTPServer, BaseHTTPRequestHandler +from socketserver import ThreadingMixIn from urllib.parse import urlparse, parse_qs, urlencode +import socket +import struct +import base64 +import hashlib # Configuration -HOST = os.environ.get("CHAT_HOST", "0.0.0.0") +HOST = os.environ.get("CHAT_HOST", "127.0.0.1") PORT = int(os.environ.get("CHAT_PORT", 8080)) UI_DIR = "/var/chat/ui" STATIC_DIR = os.path.join(UI_DIR, "static") CLAUDE_BIN = "/usr/local/bin/claude" +# Workspace directory: bind-mounted project working tree for Claude access +# Defaults to empty; when set, Claude can read/write to this directory +WORKSPACE_DIR = os.environ.get("CHAT_WORKSPACE_DIR", "") + # OAuth configuration FORGE_URL = os.environ.get("FORGE_URL", "http://localhost:3000") CHAT_OAUTH_CLIENT_ID = 
os.environ.get("CHAT_OAUTH_CLIENT_ID", "") CHAT_OAUTH_CLIENT_SECRET = os.environ.get("CHAT_OAUTH_CLIENT_SECRET", "") EDGE_TUNNEL_FQDN = os.environ.get("EDGE_TUNNEL_FQDN", "") +EDGE_TUNNEL_FQDN_CHAT = os.environ.get("EDGE_TUNNEL_FQDN_CHAT", "") +EDGE_ROUTING_MODE = os.environ.get("EDGE_ROUTING_MODE", "subpath") # Shared secret for Caddy forward_auth verify endpoint (#709). # When set, only requests carrying this value in X-Forward-Auth-Secret are @@ -52,10 +70,6 @@ EDGE_TUNNEL_FQDN = os.environ.get("EDGE_TUNNEL_FQDN", "") # (acceptable during local dev; production MUST set this). FORWARD_AUTH_SECRET = os.environ.get("FORWARD_AUTH_SECRET", "") -# Rate limiting / cost caps (#711) -CHAT_MAX_REQUESTS_PER_HOUR = int(os.environ.get("CHAT_MAX_REQUESTS_PER_HOUR", 60)) -CHAT_MAX_REQUESTS_PER_DAY = int(os.environ.get("CHAT_MAX_REQUESTS_PER_DAY", 500)) -CHAT_MAX_TOKENS_PER_DAY = int(os.environ.get("CHAT_MAX_TOKENS_PER_DAY", 1000000)) # Allowed users - disinto-admin always allowed; CSV allowlist extends it _allowed_csv = os.environ.get("DISINTO_CHAT_ALLOWED_USERS", "") @@ -81,11 +95,10 @@ _sessions = {} # Pending OAuth state tokens: state -> expires (float) _oauth_states = {} -# Per-user rate limiting state (#711) -# user -> list of request timestamps (for sliding-window hourly/daily caps) -_request_log = {} -# user -> {"tokens": int, "date": "YYYY-MM-DD"} -_daily_tokens = {} + +# WebSocket message queues per user +# user -> asyncio.Queue (for streaming messages to connected clients) +_websocket_queues = {} # MIME types for static files MIME_TYPES = { @@ -99,9 +112,22 @@ MIME_TYPES = { ".ico": "image/x-icon", } +# WebSocket subprotocol for chat streaming +WEBSOCKET_SUBPROTOCOL = "chat-stream-v1" + +# WebSocket opcodes +OPCODE_CONTINUATION = 0x0 +OPCODE_TEXT = 0x1 +OPCODE_BINARY = 0x2 +OPCODE_CLOSE = 0x8 +OPCODE_PING = 0x9 +OPCODE_PONG = 0xA + def _build_callback_uri(): """Build the OAuth callback URI based on tunnel configuration.""" + if EDGE_ROUTING_MODE == "subdomain" and 
EDGE_TUNNEL_FQDN_CHAT: + return f"https://{EDGE_TUNNEL_FQDN_CHAT}/oauth/callback" if EDGE_TUNNEL_FQDN: return f"https://{EDGE_TUNNEL_FQDN}/chat/oauth/callback" return "http://localhost/chat/oauth/callback" @@ -187,69 +213,9 @@ def _fetch_user(access_token): return None -# ============================================================================= -# Rate Limiting Functions (#711) -# ============================================================================= - -def _check_rate_limit(user): - """Check per-user rate limits. Returns (allowed, retry_after, reason) (#711). - - Checks hourly request cap, daily request cap, and daily token cap. - """ - now = time.time() - one_hour_ago = now - 3600 - today = datetime.date.today().isoformat() - - # Prune old entries from request log - timestamps = _request_log.get(user, []) - timestamps = [t for t in timestamps if t > now - 86400] - _request_log[user] = timestamps - - # Hourly request cap - hourly = [t for t in timestamps if t > one_hour_ago] - if len(hourly) >= CHAT_MAX_REQUESTS_PER_HOUR: - oldest_in_window = min(hourly) - retry_after = int(oldest_in_window + 3600 - now) + 1 - return False, max(retry_after, 1), "hourly request limit" - - # Daily request cap - start_of_day = time.mktime(datetime.date.today().timetuple()) - daily = [t for t in timestamps if t >= start_of_day] - if len(daily) >= CHAT_MAX_REQUESTS_PER_DAY: - next_day = start_of_day + 86400 - retry_after = int(next_day - now) + 1 - return False, max(retry_after, 1), "daily request limit" - - # Daily token cap - token_info = _daily_tokens.get(user, {"tokens": 0, "date": today}) - if token_info["date"] != today: - token_info = {"tokens": 0, "date": today} - _daily_tokens[user] = token_info - if token_info["tokens"] >= CHAT_MAX_TOKENS_PER_DAY: - next_day = start_of_day + 86400 - retry_after = int(next_day - now) + 1 - return False, max(retry_after, 1), "daily token limit" - - return True, 0, "" - - -def _record_request(user): - """Record a request timestamp for 
the user (#711).""" - _request_log.setdefault(user, []).append(time.time()) - - -def _record_tokens(user, tokens): - """Record token usage for the user (#711).""" - today = datetime.date.today().isoformat() - token_info = _daily_tokens.get(user, {"tokens": 0, "date": today}) - if token_info["date"] != today: - token_info = {"tokens": 0, "date": today} - token_info["tokens"] += tokens - _daily_tokens[user] = token_info - def _parse_stream_json(output): - """Parse stream-json output from claude --print (#711). + """Parse stream-json output from claude --print. Returns (text_content, total_tokens). Falls back gracefully if the usage event is absent or malformed. @@ -295,6 +261,313 @@ def _parse_stream_json(output): return "".join(text_parts), total_tokens +# ============================================================================= +# WebSocket Handler Class +# ============================================================================= + +class _WebSocketHandler: + """Handle WebSocket connections for chat streaming.""" + + def __init__(self, reader, writer, user, message_queue): + self.reader = reader + self.writer = writer + self.user = user + self.message_queue = message_queue + self.closed = False + + async def accept_connection(self, sec_websocket_key, sec_websocket_protocol=None): + """Accept the WebSocket handshake. + + The HTTP request has already been parsed by BaseHTTPRequestHandler, + so we use the provided key and protocol instead of re-reading from socket. + """ + # Validate subprotocol + if sec_websocket_protocol and sec_websocket_protocol != WEBSOCKET_SUBPROTOCOL: + self._send_http_error( + 400, + "Bad Request", + f"Unsupported subprotocol. 
Expected: {WEBSOCKET_SUBPROTOCOL}", + ) + self._close_connection() + return False + + # Generate accept key + accept_key = self._generate_accept_key(sec_websocket_key) + + # Send handshake response + response = ( + "HTTP/1.1 101 Switching Protocols\r\n" + "Upgrade: websocket\r\n" + "Connection: Upgrade\r\n" + f"Sec-WebSocket-Accept: {accept_key}\r\n" + ) + + if sec_websocket_protocol: + response += f"Sec-WebSocket-Protocol: {sec_websocket_protocol}\r\n" + + response += "\r\n" + self.writer.write(response.encode("utf-8")) + await self.writer.drain() + return True + + def _generate_accept_key(self, sec_key): + """Generate the Sec-WebSocket-Accept key.""" + GUID = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11" + combined = sec_key + GUID + sha1 = hashlib.sha1(combined.encode("utf-8")) + return base64.b64encode(sha1.digest()).decode("utf-8") + + async def _read_line(self): + """Read a line from the socket.""" + data = await self.reader.read(1) + line = "" + while data: + if data == b"\r": + data = await self.reader.read(1) + continue + if data == b"\n": + return line + line += data.decode("utf-8", errors="replace") + data = await self.reader.read(1) + return line + + def _send_http_error(self, code, title, message): + """Send an HTTP error response.""" + response = ( + f"HTTP/1.1 {code} {title}\r\n" + "Content-Type: text/plain; charset=utf-8\r\n" + "Content-Length: " + str(len(message)) + "\r\n" + "\r\n" + + message + ) + try: + self.writer.write(response.encode("utf-8")) + self.writer.drain() + except Exception: + pass + + def _close_connection(self): + """Close the connection.""" + try: + self.writer.close() + except Exception: + pass + + async def send_text(self, data): + """Send a text frame.""" + if self.closed: + return + try: + frame = self._encode_frame(OPCODE_TEXT, data.encode("utf-8")) + self.writer.write(frame) + await self.writer.drain() + except Exception as e: + print(f"WebSocket send error: {e}", file=sys.stderr) + + async def send_binary(self, data): + """Send 
a binary frame.""" + if self.closed: + return + try: + if isinstance(data, str): + data = data.encode("utf-8") + frame = self._encode_frame(OPCODE_BINARY, data) + self.writer.write(frame) + await self.writer.drain() + except Exception as e: + print(f"WebSocket send error: {e}", file=sys.stderr) + + def _encode_frame(self, opcode, payload): + """Encode a WebSocket frame.""" + frame = bytearray() + frame.append(0x80 | opcode) # FIN + opcode + + length = len(payload) + if length < 126: + frame.append(length) + elif length < 65536: + frame.append(126) + frame.extend(struct.pack(">H", length)) + else: + frame.append(127) + frame.extend(struct.pack(">Q", length)) + + frame.extend(payload) + return bytes(frame) + + async def _decode_frame(self): + """Decode a WebSocket frame. Returns (opcode, payload).""" + try: + # Read first two bytes (use readexactly for guaranteed length) + header = await self.reader.readexactly(2) + + fin = (header[0] >> 7) & 1 + opcode = header[0] & 0x0F + masked = (header[1] >> 7) & 1 + length = header[1] & 0x7F + + # Extended payload length + if length == 126: + ext = await self.reader.readexactly(2) + length = struct.unpack(">H", ext)[0] + elif length == 127: + ext = await self.reader.readexactly(8) + length = struct.unpack(">Q", ext)[0] + + # Masking key + if masked: + mask_key = await self.reader.readexactly(4) + + # Payload + payload = await self.reader.readexactly(length) + + # Unmask if needed + if masked: + payload = bytes(b ^ mask_key[i % 4] for i, b in enumerate(payload)) + + return opcode, payload + except Exception as e: + print(f"WebSocket decode error: {e}", file=sys.stderr) + return None, None + + async def handle_connection(self): + """Handle the WebSocket connection loop.""" + try: + while not self.closed: + opcode, payload = await self._decode_frame() + if opcode is None: + break + + if opcode == OPCODE_CLOSE: + await self._send_close() + break + elif opcode == OPCODE_PING: + await self._send_pong(payload) + elif opcode == 
OPCODE_PONG: + pass # Ignore pong + elif opcode in (OPCODE_TEXT, OPCODE_BINARY): + # Handle text messages from client (e.g., chat_request) + try: + msg = payload.decode("utf-8") + data = json.loads(msg) + if data.get("type") == "chat_request": + # Invoke Claude with the message + await self._handle_chat_request(data.get("message", "")) + except (json.JSONDecodeError, UnicodeDecodeError): + pass + + # Check if we should stop waiting for messages + if self.closed: + break + + except Exception as e: + print(f"WebSocket connection error: {e}", file=sys.stderr) + finally: + self._close_connection() + # Clean up the message queue on disconnect + if self.user in _websocket_queues: + del _websocket_queues[self.user] + + async def _send_close(self): + """Send a close frame.""" + try: + # Close code 1000 = normal closure + frame = self._encode_frame(OPCODE_CLOSE, struct.pack(">H", 1000)) + self.writer.write(frame) + await self.writer.drain() + except Exception: + pass + + async def _send_pong(self, payload): + """Send a pong frame.""" + try: + frame = self._encode_frame(OPCODE_PONG, payload) + self.writer.write(frame) + await self.writer.drain() + except Exception: + pass + + async def _handle_chat_request(self, message): + """Handle a chat_request WebSocket frame by invoking Claude.""" + if not message: + return + + # Validate Claude binary exists + if not os.path.exists(CLAUDE_BIN): + await self.send_text(json.dumps({ + "type": "error", + "message": "Claude CLI not found", + })) + return + + try: + # Build claude command with permission mode (acceptEdits allows file edits) + claude_args = [CLAUDE_BIN, "--print", "--output-format", "stream-json", "--permission-mode", "acceptEdits", message] + + # Spawn claude --print with stream-json for streaming output + # Set cwd to workspace directory if configured, allowing Claude to access project code + cwd = WORKSPACE_DIR if WORKSPACE_DIR else None + proc = subprocess.Popen( + claude_args, + stdout=subprocess.PIPE, + 
stderr=subprocess.PIPE, + text=True, + cwd=cwd, + bufsize=1, + ) + + # Stream output line by line + for line in iter(proc.stdout.readline, ""): + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + etype = event.get("type", "") + + # Extract text content from content_block_delta events + if etype == "content_block_delta": + delta = event.get("delta", {}) + if delta.get("type") == "text_delta": + text = delta.get("text", "") + if text: + # Send tokens to client + await self.send_text(text) + + # Check for usage event to know when complete + if etype == "result": + pass # Will send complete after loop + + except json.JSONDecodeError: + pass + + # Wait for process to complete + proc.wait() + + if proc.returncode != 0: + await self.send_text(json.dumps({ + "type": "error", + "message": f"Claude CLI failed with exit code {proc.returncode}", + })) + return + + # Send complete signal + await self.send_text(json.dumps({ + "type": "complete", + })) + + except FileNotFoundError: + await self.send_text(json.dumps({ + "type": "error", + "message": "Claude CLI not found", + })) + except Exception as e: + await self.send_text(json.dumps({ + "type": "error", + "message": str(e), + })) + + # ============================================================================= # Conversation History Functions (#710) # ============================================================================= @@ -544,9 +817,9 @@ class ChatHandler(BaseHTTPRequestHandler): self.serve_static(path) return - # Reserved WebSocket endpoint (future use) - if path == "/ws" or path.startswith("/ws"): - self.send_error_page(501, "WebSocket upgrade not yet implemented") + # WebSocket upgrade endpoint + if path == "/chat/ws" or path == "/ws" or path.startswith("/ws"): + self.handle_websocket_upgrade() return # 404 for unknown paths @@ -736,33 +1009,13 @@ class ChatHandler(BaseHTTPRequestHandler): except IOError as e: self.send_error_page(500, f"Error reading file: {e}") - def 
_send_rate_limit_response(self, retry_after, reason): - """Send a 429 response with Retry-After header and HTMX fragment (#711).""" - body = ( - f'
<div class="message error">'
-            f"Rate limit exceeded: {reason}. "
-            f"Please try again in {retry_after} seconds."
-            f"</div>
" - ) - self.send_response(429) - self.send_header("Retry-After", str(retry_after)) - self.send_header("Content-Type", "text/html; charset=utf-8") - self.send_header("Content-Length", str(len(body.encode("utf-8")))) - self.end_headers() - self.wfile.write(body.encode("utf-8")) - + def handle_chat(self, user): """ Handle chat requests by spawning `claude --print` with the user message. - Enforces per-user rate limits and tracks token usage (#711). + Streams tokens over WebSocket if connected. """ - # Check rate limits before processing (#711) - allowed, retry_after, reason = _check_rate_limit(user) - if not allowed: - self._send_rate_limit_response(retry_after, reason) - return - # Read request body content_length = int(self.headers.get("Content-Length", 0)) if content_length == 0: @@ -799,23 +1052,63 @@ class ChatHandler(BaseHTTPRequestHandler): if not conv_id or not _validate_conversation_id(conv_id): conv_id = _generate_conversation_id() - # Record request for rate limiting (#711) - _record_request(user) - try: # Save user message to history _write_message(user, conv_id, "user", message) + # Build claude command with permission mode (acceptEdits allows file edits) + claude_args = [CLAUDE_BIN, "--print", "--output-format", "stream-json", "--permission-mode", "acceptEdits", message] + # Spawn claude --print with stream-json for token tracking (#711) + # Set cwd to workspace directory if configured, allowing Claude to access project code + cwd = WORKSPACE_DIR if WORKSPACE_DIR else None proc = subprocess.Popen( - [CLAUDE_BIN, "--print", "--output-format", "stream-json", message], + claude_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + cwd=cwd, + bufsize=1, # Line buffered ) - raw_output = proc.stdout.read() + # Stream output line by line + response_parts = [] + total_tokens = 0 + for line in iter(proc.stdout.readline, ""): + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + etype = event.get("type", "") + # Extract 
text content from content_block_delta events + if etype == "content_block_delta": + delta = event.get("delta", {}) + if delta.get("type") == "text_delta": + text = delta.get("text", "") + if text: + response_parts.append(text) + # Stream to WebSocket if connected + if user in _websocket_queues: + try: + _websocket_queues[user].put_nowait(text) + except Exception: + pass # Client disconnected + + # Parse usage from result event + if etype == "result": + usage = event.get("usage", {}) + total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) + elif "usage" in event: + usage = event["usage"] + if isinstance(usage, dict): + total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) + + except json.JSONDecodeError: + pass + + # Wait for process to complete error_output = proc.stderr.read() if error_output: print(f"Claude stderr: {error_output}", file=sys.stderr) @@ -826,20 +1119,12 @@ class ChatHandler(BaseHTTPRequestHandler): self.send_error_page(500, f"Claude CLI failed with exit code {proc.returncode}") return - # Parse stream-json for text and token usage (#711) - response, total_tokens = _parse_stream_json(raw_output) - - # Track token usage - does not block *this* request (#711) - if total_tokens > 0: - _record_tokens(user, total_tokens) - print( - f"Token usage: user={user} tokens={total_tokens}", - file=sys.stderr, - ) + # Combine response parts + response = "".join(response_parts) # Fall back to raw output if stream-json parsing yielded no text if not response: - response = raw_output + response = proc.stdout.getvalue() if hasattr(proc.stdout, 'getvalue') else "" # Save assistant response to history _write_message(user, conv_id, "assistant", response) @@ -909,6 +1194,106 @@ class ChatHandler(BaseHTTPRequestHandler): self.end_headers() self.wfile.write(json.dumps({"conversation_id": conv_id}, ensure_ascii=False).encode("utf-8")) + @staticmethod + def push_to_websocket(user, message): + """Push a message to a WebSocket 
connection for a user. + + This is called from the chat handler to stream tokens to connected clients. + The message is added to the user's WebSocket message queue. + """ + # Get the message queue from the WebSocket handler's queue + # We store the queue in a global dict keyed by user + if user in _websocket_queues: + _websocket_queues[user].put_nowait(message) + + def handle_websocket_upgrade(self): + """Handle WebSocket upgrade request for chat streaming.""" + # Check session cookie + user = _validate_session(self.headers.get("Cookie")) + if not user: + self.send_error_page(401, "Unauthorized: no valid session") + return + + # Create message queue for this user + _websocket_queues[user] = asyncio.Queue() + + # Get WebSocket upgrade headers from the HTTP request + sec_websocket_key = self.headers.get("Sec-WebSocket-Key", "") + sec_websocket_protocol = self.headers.get("Sec-WebSocket-Protocol", "") + + # Validate Sec-WebSocket-Key + if not sec_websocket_key: + self.send_error_page(400, "Bad Request", "Missing Sec-WebSocket-Key") + return + + # Get the socket from the connection + sock = self.connection + sock.setblocking(False) + + # Create async server to handle the connection + async def handle_ws(): + try: + # Wrap the socket in asyncio streams using open_connection + reader, writer = await asyncio.open_connection(sock=sock) + + # Create WebSocket handler + ws_handler = _WebSocketHandler(reader, writer, user, _websocket_queues[user]) + + # Accept the connection (pass headers from HTTP request) + if not await ws_handler.accept_connection(sec_websocket_key, sec_websocket_protocol): + return + + # Start a task to read from the queue and send to client + async def send_stream(): + while not ws_handler.closed: + try: + data = await asyncio.wait_for(ws_handler.message_queue.get(), timeout=1.0) + await ws_handler.send_text(data) + except asyncio.TimeoutError: + # Send ping to keep connection alive + try: + frame = ws_handler._encode_frame(OPCODE_PING, b"") + 
writer.write(frame) + await writer.drain() + except Exception: + break + except Exception as e: + print(f"Send stream error: {e}", file=sys.stderr) + break + + # Start sending task + send_task = asyncio.create_task(send_stream()) + + # Handle incoming WebSocket frames + await ws_handler.handle_connection() + + # Cancel send task + send_task.cancel() + try: + await send_task + except asyncio.CancelledError: + pass + + except Exception as e: + print(f"WebSocket handler error: {e}", file=sys.stderr) + finally: + try: + writer.close() + await writer.wait_closed() + except Exception: + pass + + # Run the async handler in a thread + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(handle_ws()) + except Exception as e: + print(f"WebSocket error: {e}", file=sys.stderr) + finally: + loop.close() + sock.close() + def do_DELETE(self): """Handle DELETE requests.""" parsed = urlparse(self.path) @@ -944,12 +1329,6 @@ def main(): print("forward_auth secret configured (#709)", file=sys.stderr) else: print("WARNING: FORWARD_AUTH_SECRET not set - verify endpoint unrestricted", file=sys.stderr) - print( - f"Rate limits (#711): {CHAT_MAX_REQUESTS_PER_HOUR}/hr, " - f"{CHAT_MAX_REQUESTS_PER_DAY}/day, " - f"{CHAT_MAX_TOKENS_PER_DAY} tokens/day", - file=sys.stderr, - ) httpd.serve_forever() diff --git a/docker/chat/ui/index.html b/docker/chat/ui/index.html index bd920f9..b045873 100644 --- a/docker/chat/ui/index.html +++ b/docker/chat/ui/index.html @@ -430,6 +430,10 @@ return div.innerHTML.replace(/\n/g, '
'); } + // WebSocket connection for streaming + let ws = null; + let wsMessageId = null; + // Send message handler async function sendMessage() { const message = textarea.value.trim(); @@ -449,6 +453,14 @@ await createNewConversation(); } + // Try WebSocket streaming first, fall back to fetch + if (window.location.protocol === 'https:' || window.location.hostname === 'localhost') { + if (tryWebSocketSend(message)) { + return; + } + } + + // Fallback to fetch try { // Use fetch with URLSearchParams for application/x-www-form-urlencoded const params = new URLSearchParams(); @@ -485,6 +497,111 @@ } } + // Try to send message via WebSocket streaming + function tryWebSocketSend(message) { + try { + // Generate a unique message ID for this request + wsMessageId = Date.now().toString(36) + Math.random().toString(36).substr(2); + + // Connect to WebSocket + const wsUrl = window.location.protocol === 'https:' + ? `wss://${window.location.host}/chat/ws` + : `ws://${window.location.host}/chat/ws`; + + ws = new WebSocket(wsUrl); + + ws.onopen = function() { + // Send the message as JSON with message ID + const data = { + type: 'chat_request', + message_id: wsMessageId, + message: message, + conversation_id: currentConversationId + }; + ws.send(JSON.stringify(data)); + }; + + ws.onmessage = function(event) { + try { + const data = JSON.parse(event.data); + + if (data.type === 'token') { + // Stream a token to the UI + addTokenToLastMessage(data.token); + } else if (data.type === 'complete') { + // Streaming complete + closeWebSocket(); + textarea.disabled = false; + sendBtn.disabled = false; + sendBtn.textContent = 'Send'; + textarea.focus(); + messagesDiv.scrollTop = messagesDiv.scrollHeight; + loadConversations(); + } else if (data.type === 'error') { + addSystemMessage(`Error: ${data.message}`); + closeWebSocket(); + textarea.disabled = false; + sendBtn.disabled = false; + sendBtn.textContent = 'Send'; + textarea.focus(); + } + } catch (e) { + console.error('Failed to parse 
WebSocket message:', e); + } + }; + + ws.onerror = function(error) { + console.error('WebSocket error:', error); + addSystemMessage('WebSocket connection error. Falling back to regular chat.'); + closeWebSocket(); + sendMessage(); // Retry with fetch + }; + + ws.onclose = function() { + wsMessageId = null; + }; + + return true; // WebSocket attempt started + + } catch (error) { + console.error('Failed to create WebSocket:', error); + return false; // Fall back to fetch + } + } + + // Add a token to the last assistant message (for streaming) + function addTokenToLastMessage(token) { + const messages = messagesDiv.querySelectorAll('.message.assistant'); + if (messages.length === 0) { + // No assistant message yet, create one + const msgDiv = document.createElement('div'); + msgDiv.className = 'message assistant'; + msgDiv.innerHTML = ` +
<div class="role">assistant</div>
+                <div class="content streaming"></div>
+ `; + messagesDiv.appendChild(msgDiv); + } + + const lastMsg = messagesDiv.querySelector('.message.assistant .content.streaming'); + if (lastMsg) { + lastMsg.textContent += token; + messagesDiv.scrollTop = messagesDiv.scrollHeight; + } + } + + // Close WebSocket connection + function closeWebSocket() { + if (ws) { + ws.onopen = null; + ws.onmessage = null; + ws.onerror = null; + ws.onclose = null; + ws.close(); + ws = null; + } + } + // Event listeners sendBtn.addEventListener('click', sendMessage); diff --git a/docker/edge/Dockerfile b/docker/edge/Dockerfile index eca7d7e..507c39b 100644 --- a/docker/edge/Dockerfile +++ b/docker/edge/Dockerfile @@ -1,6 +1,12 @@ FROM caddy:latest -RUN apk add --no-cache bash jq curl git docker-cli python3 openssh-client autossh -COPY entrypoint-edge.sh /usr/local/bin/entrypoint-edge.sh +RUN apk add --no-cache bash jq curl git docker-cli python3 openssh-client autossh \ + nodejs npm +# Claude Code CLI — chat backend runtime (merged from docker/chat, #1083) +RUN npm install -g @anthropic-ai/claude-code@2.1.84 +COPY docker/edge/entrypoint-edge.sh /usr/local/bin/entrypoint-edge.sh +# Chat server and UI (merged from docker/chat into edge, #1083) +COPY docker/chat/server.py /usr/local/bin/chat-server.py +COPY docker/chat/ui/ /var/chat/ui/ VOLUME /data diff --git a/docker/edge/dispatcher.sh b/docker/edge/dispatcher.sh index a48abf2..282342a 100755 --- a/docker/edge/dispatcher.sh +++ b/docker/edge/dispatcher.sh @@ -560,10 +560,168 @@ _launch_runner_docker() { # _launch_runner_nomad ACTION_ID SECRETS_CSV MOUNTS_CSV # -# Nomad backend stub — will be implemented in migration Step 5. +# Dispatches a vault-runner batch job via `nomad job dispatch`. +# Polls `nomad job status` until terminal state (completed/failed). +# Reads exit code from allocation and writes .result.json. 
+# +# Usage: _launch_runner_nomad +# Returns: exit code of the nomad job (0=success, non-zero=failure) _launch_runner_nomad() { - echo "nomad backend not yet implemented" >&2 - return 1 + local action_id="$1" + local secrets_csv="$2" + local mounts_csv="$3" + + log "Dispatching vault-runner batch job via Nomad for action: ${action_id}" + + # Dispatch the parameterized batch job + # The vault-runner job expects meta: action_id, secrets_csv + # Note: mounts_csv is not passed as meta (not declared in vault-runner.hcl) + local dispatch_output + dispatch_output=$(nomad job dispatch \ + -detach \ + -meta action_id="$action_id" \ + -meta secrets_csv="$secrets_csv" \ + vault-runner 2>&1) || { + log "ERROR: Failed to dispatch vault-runner job for ${action_id}" + log "Dispatch output: ${dispatch_output}" + write_result "$action_id" 1 "Nomad dispatch failed: ${dispatch_output}" + return 1 + } + + # Extract dispatched job ID from output (format: "vault-runner/dispatch--") + local dispatched_job_id + dispatched_job_id=$(echo "$dispatch_output" | grep -oP '(?<=Dispatched Job ID = ).+' || true) + + if [ -z "$dispatched_job_id" ]; then + log "ERROR: Could not extract dispatched job ID from nomad output" + log "Dispatch output: ${dispatch_output}" + write_result "$action_id" 1 "Could not extract dispatched job ID from nomad output" + return 1 + fi + + log "Dispatched vault-runner with job ID: ${dispatched_job_id}" + + # Poll job status until terminal state + # Batch jobs transition: running -> completed/failed + local max_wait=300 # 5 minutes max wait + local elapsed=0 + local poll_interval=5 + local alloc_id="" + + log "Polling nomad job status for ${dispatched_job_id}..." 
+ + while [ "$elapsed" -lt "$max_wait" ]; do + # Get job status with JSON output for the dispatched child job + local job_status_json + job_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { + log "ERROR: Failed to get job status for ${dispatched_job_id}" + write_result "$action_id" 1 "Failed to get job status for ${dispatched_job_id}" + return 1 + } + + # Check job status field (transitions to "dead" on completion) + local job_state + job_state=$(echo "$job_status_json" | jq -r '.Status // empty' 2>/dev/null) || job_state="" + + # Check allocation state directly + alloc_id=$(echo "$job_status_json" | jq -r '.Allocations[0]?.ID // empty' 2>/dev/null) || alloc_id="" + + if [ -n "$alloc_id" ]; then + local alloc_state + alloc_state=$(nomad alloc status -short "$alloc_id" 2>/dev/null || true) + + case "$alloc_state" in + *completed*|*success*|*dead*) + log "Allocation ${alloc_id} reached terminal state: ${alloc_state}" + break + ;; + *running*|*pending*|*starting*) + log "Allocation ${alloc_id} still running (state: ${alloc_state})..." 
+ ;; + *failed*|*crashed*) + log "Allocation ${alloc_id} failed (state: ${alloc_state})" + break + ;; + esac + fi + + # Also check job-level state + case "$job_state" in + dead) + log "Job ${dispatched_job_id} reached terminal state: ${job_state}" + break + ;; + failed) + log "Job ${dispatched_job_id} failed" + break + ;; + esac + + sleep "$poll_interval" + elapsed=$((elapsed + poll_interval)) + done + + if [ "$elapsed" -ge "$max_wait" ]; then + log "ERROR: Timeout waiting for vault-runner job to complete" + write_result "$action_id" 1 "Timeout waiting for nomad job to complete" + return 1 + fi + + # Get final job status and exit code + local final_status_json + final_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { + log "ERROR: Failed to get final job status" + write_result "$action_id" 1 "Failed to get final job status" + return 1 + } + + # Get allocation exit code + local exit_code=0 + local logs="" + + if [ -n "$alloc_id" ]; then + # Get allocation logs + logs=$(nomad alloc logs -short "$alloc_id" 2>/dev/null || true) + + # Try to get exit code from alloc status JSON + # Nomad alloc status -json has .TaskStates[""].Events[].ExitCode + local alloc_exit_code + alloc_exit_code=$(nomad alloc status -json "$alloc_id" 2>/dev/null | jq -r '.TaskStates["runner"].Events[-1].ExitCode // empty' 2>/dev/null) || alloc_exit_code="" + + if [ -n "$alloc_exit_code" ] && [ "$alloc_exit_code" != "null" ]; then + exit_code="$alloc_exit_code" + fi + fi + + # If we couldn't get exit code from alloc, check job state as fallback + # Note: "dead" = terminal state for batch jobs (includes successful completion) + # Only "failed" indicates actual failure + if [ "$exit_code" -eq 0 ]; then + local final_state + final_state=$(echo "$final_status_json" | jq -r '.Status // empty' 2>/dev/null) || final_state="" + + case "$final_state" in + failed) + exit_code=1 + ;; + esac + fi + + # Truncate logs if too long + if [ ${#logs} -gt 1000 ]; then + logs="${logs: -1000}" 
+ fi + + # Write result file + write_result "$action_id" "$exit_code" "$logs" + + if [ "$exit_code" -eq 0 ]; then + log "Vault-runner job completed successfully for action: ${action_id}" + else + log "Vault-runner job failed for action: ${action_id} (exit code: ${exit_code})" + fi + + return "$exit_code" } # Launch runner for the given action (backend-agnostic orchestrator) @@ -1051,11 +1209,8 @@ main() { # Validate backend selection at startup case "$DISPATCHER_BACKEND" in - docker) ;; - nomad) - log "ERROR: nomad backend not yet implemented" - echo "nomad backend not yet implemented" >&2 - exit 1 + docker|nomad) + log "Using ${DISPATCHER_BACKEND} backend for vault-runner dispatch" ;; *) log "ERROR: unknown DISPATCHER_BACKEND=${DISPATCHER_BACKEND}" diff --git a/docker/edge/entrypoint-edge.sh b/docker/edge/entrypoint-edge.sh index 1b5f94f..a1511ff 100755 --- a/docker/edge/entrypoint-edge.sh +++ b/docker/edge/entrypoint-edge.sh @@ -173,11 +173,15 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}" sleep 1200 # 20 minutes done) & -# ── Load required secrets from secrets/*.enc (#777) ──────────────────── -# Edge container declares its required secrets; missing ones cause a hard fail. +# ── Load optional secrets from secrets/*.enc (#777) ──────────────────── +# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to +# SCP access logs from a remote edge host. When age key or secrets dir is +# missing, or any secret fails to decrypt, log a warning and skip the cron. +# Caddy itself does not depend on these secrets. 
_AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt" _SECRETS_DIR="/opt/disinto/secrets" EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG" +EDGE_ENGAGEMENT_READY=0 # Assume not ready until proven otherwise _edge_decrypt_secret() { local enc_path="${_SECRETS_DIR}/${1}.enc" @@ -192,47 +196,63 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then export "$_secret_name=$_val" done if [ -n "$_missing" ]; then - echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2 - echo " Run 'disinto secrets add ' for each missing secret." >&2 - echo " If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2 - exit 1 + echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2 + echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 + EDGE_ENGAGEMENT_READY=0 + else + echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2 + EDGE_ENGAGEMENT_READY=1 fi - echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2 else - echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2 - echo " Ensure age is installed and secrets/*.enc files are present." >&2 - exit 1 + echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2 + echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 + EDGE_ENGAGEMENT_READY=0 fi # Start daily engagement collection cron loop in background (#745) # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777). 
-(while true; do - # Calculate seconds until next 23:50 UTC - _now=$(date -u +%s) - _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) - if [ "$_target" -le "$_now" ]; then - _target=$(( _target + 86400 )) - fi - _sleep_secs=$(( _target - _now )) - echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 - sleep "$_sleep_secs" - _fetch_log="/tmp/caddy-access-log-fetch.log" - _ssh_key_file=$(mktemp) - printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" - chmod 0600 "$_ssh_key_file" - scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ - "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ - "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true - rm -f "$_ssh_key_file" - if [ -s "$_fetch_log" ]; then - CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ - | tee -a /opt/disinto-logs/collect-engagement.log || true - else - echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 - fi - rm -f "$_fetch_log" -done) & +# Guarded: only start if EDGE_ENGAGEMENT_READY=1. 
+if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then + (while true; do + # Calculate seconds until next 23:50 UTC + _now=$(date -u +%s) + _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) + if [ "$_target" -le "$_now" ]; then + _target=$(( _target + 86400 )) + fi + _sleep_secs=$(( _target - _now )) + echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 + sleep "$_sleep_secs" + _fetch_log="/tmp/caddy-access-log-fetch.log" + _ssh_key_file=$(mktemp) + printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" + chmod 0600 "$_ssh_key_file" + scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ + "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true + rm -f "$_ssh_key_file" + if [ -s "$_fetch_log" ]; then + CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ + | tee -a /opt/disinto-logs/collect-engagement.log || true + else + echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 + fi + rm -f "$_fetch_log" + done) & +else + echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2 +fi + +# Start chat server in background (#1083 — merged from docker/chat into edge) +(python3 /usr/local/bin/chat-server.py 2>&1 | tee -a /opt/disinto-logs/chat.log) & + +# Nomad template renders Caddyfile to /local/Caddyfile via service discovery; +# copy it into the expected location if present (compose uses the mounted path). 
+if [ -f /local/Caddyfile ]; then + cp /local/Caddyfile /etc/caddy/Caddyfile + echo "edge: using Nomad-rendered Caddyfile from /local/Caddyfile" >&2 +fi # Caddy as main process — run in foreground via wait so background jobs survive # (exec replaces the shell, which can orphan backgrounded subshells) diff --git a/docs/nomad-cutover-runbook.md b/docs/nomad-cutover-runbook.md new file mode 100644 index 0000000..e0956cc --- /dev/null +++ b/docs/nomad-cutover-runbook.md @@ -0,0 +1,183 @@ +# Nomad Cutover Runbook + +End-to-end procedure to cut over the disinto factory from docker-compose on +disinto-dev-box to Nomad on disinto-nomad-box. + +**Target**: disinto-nomad-box (10.10.10.216) becomes production; disinto-dev-box +stays warm for rollback. + +**Downtime budget**: <5 min blue-green flip. + +**Data scope**: Forgejo issues + disinto-ops git bundle only. Everything else is +regenerated or discarded. OAuth secrets are regenerated on fresh init (all +sessions invalidated). + +--- + +## 1. Pre-cutover readiness checklist + +- [ ] Nomad + Vault stack healthy on a fresh wipe+init (step 5 verified) +- [ ] Codeberg mirror current — `git log` parity between dev-box Forgejo and + Codeberg +- [ ] SSH key pair generated for nomad-box, registered on DO edge (see §4.6) +- [ ] Companion tools landed: + - `disinto backup create` (#1057) + - `disinto backup import` (#1058) +- [ ] Backup tarball produced and tested against a scratch LXC (see §3) + +--- + +## 2. Pre-cutover artifact: backup + +On disinto-dev-box: + +```bash +./bin/disinto backup create /tmp/disinto-backup-$(date +%Y%m%d).tar.gz +``` + +Copy the tarball to nomad-box (and optionally to a local workstation for +safekeeping): + +```bash +scp /tmp/disinto-backup-*.tar.gz nomad-box:/tmp/ +``` + +--- + +## 3. 
Pre-cutover dry-run + +On a throwaway LXC: + +```bash +lxc launch ubuntu:24.04 cutover-dryrun +# inside the container: +disinto init --backend=nomad --import-env .env --with edge +./bin/disinto backup import /tmp/disinto-backup-*.tar.gz +``` + +Verify: + +- Issue count matches source Forgejo +- disinto-ops repo refs match source bundle + +Destroy the LXC once satisfied: + +```bash +lxc delete cutover-dryrun --force +``` + +--- + +## 4. Cutover T-0 (operator executes; <5 min target) + +### 4.1 Stop dev-box services + +```bash +# On disinto-dev-box — stop, do NOT remove volumes (rollback needs them) +docker-compose stop +``` + +### 4.2 Provision nomad-box (if not already done) + +```bash +# On disinto-nomad-box +disinto init --backend=nomad --import-env .env --with edge +``` + +### 4.3 Import backup + +```bash +# On disinto-nomad-box +./bin/disinto backup import /tmp/disinto-backup-*.tar.gz +``` + +### 4.4 Configure Codeberg pull mirror + +Manual, one-time step in the new Forgejo UI: + +1. Create a mirror repository pointing at the Codeberg upstream +2. Confirm initial sync completes + +### 4.5 Claude login + +```bash +# On disinto-nomad-box +claude login +``` + +Set up Anthropic OAuth so agents can authenticate. + +### 4.6 Autossh tunnel swap + +> **Operator step** — cross-host, no dev-agent involvement. Do NOT automate. + +1. Stop the tunnel on dev-box: + ```bash + # On disinto-dev-box + systemctl stop reverse-tunnel + ``` + +2. Copy or regenerate the tunnel unit on nomad-box: + ```bash + # Copy from dev-box, or let init regenerate it + scp dev-box:/etc/systemd/system/reverse-tunnel.service \ + nomad-box:/etc/systemd/system/ + ``` + +3. Register nomad-box's public key on DO edge: + ```bash + # On DO edge box — same restricted-command as the dev-box key + echo "" >> /home/johba/.ssh/authorized_keys + ``` + +4. Start the tunnel on nomad-box: + ```bash + # On disinto-nomad-box + systemctl enable --now reverse-tunnel + ``` + +5. 
Verify end-to-end: + ```bash + curl https://self.disinto.ai/api/v1/version + # Should return the new box's Forgejo version + ``` + +--- + +## 5. Post-cutover smoke + +- [ ] `curl https://self.disinto.ai` → Forgejo welcome page +- [ ] Create a test PR → Woodpecker pipeline runs → agents assign and work +- [ ] Claude chat login via Forgejo OAuth succeeds + +--- + +## 6. Rollback (if any step 4 gate fails) + +1. Stop the tunnel on nomad-box: + ```bash + systemctl stop reverse-tunnel # on nomad-box + ``` + +2. Restore the tunnel on dev-box: + ```bash + systemctl start reverse-tunnel # on dev-box + ``` + +3. Bring dev-box services back up: + ```bash + docker-compose up -d # on dev-box + ``` + +4. DO Caddy config is unchanged — traffic restores in <5 min. + +5. File a post-mortem issue. Keep nomad-box state intact for debugging. + +--- + +## 7. Post-stable cleanup (T+1 week) + +- `docker-compose down -v` on dev-box +- Archive `/var/lib/docker/volumes/disinto_*` to cold storage +- Delete disinto-dev-box LXC or keep as permanent rollback reserve (operator + decision) diff --git a/formulas/run-supervisor.toml b/formulas/run-supervisor.toml index f31e6bc..4101252 100644 --- a/formulas/run-supervisor.toml +++ b/formulas/run-supervisor.toml @@ -29,7 +29,7 @@ and injected into your prompt above. Review them now. 1. Read the injected metrics data carefully (System Resources, Docker, Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs, - CI Pipelines, Open PRs, Issue Status, Stale Worktrees). + CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**). Note: preflight.sh auto-removes PHASE:escalate files for closed issues (24h grace period). Check the "Stale Phase Cleanup" section for any files cleaned or in grace period this run. @@ -75,6 +75,10 @@ Categorize every finding from the metrics into priority levels. 
- Dev/action sessions in PHASE:escalate for > 24h (session timeout) (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight; this check covers sessions where the issue is still open) +- **Woodpecker agent unhealthy** — see "Woodpecker Agent Health" section in preflight: + - Container not running or in unhealthy state + - gRPC errors >= 3 in last 20 minutes + - Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes ### P3 — Factory degraded - PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed @@ -100,6 +104,15 @@ For each finding from the health assessment, decide and execute an action. ### Auto-fixable (execute these directly) +**P2 Woodpecker agent unhealthy:** +The supervisor-run.sh script automatically handles WP agent recovery: +- Detects unhealthy state via preflight.sh health checks +- Restarts container via `docker restart` +- Scans for `blocked: ci_exhausted` issues updated in last 30 minutes +- Unassigns and removes blocked label from affected issues +- Posts recovery comment with infra-flake context +- Avoids duplicate restarts via 5-minute cooldown in history file + **P0 Memory crisis:** # Kill stale one-shot claude processes (>3h old) pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true @@ -248,6 +261,11 @@ Format: - (or "No actions needed") + ### WP Agent Recovery (if applicable) + - WP agent restart: