diff --git a/.gitignore b/.gitignore index a29450c..21c6fbc 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ metrics/supervisor-metrics.jsonl # OS .DS_Store dev/ci-fixes-*.json +gardener/dust.jsonl # Individual encrypted secrets (managed by disinto secrets add) secrets/ diff --git a/.woodpecker/detect-duplicates.py b/.woodpecker/detect-duplicates.py index 473bb18..0485833 100644 --- a/.woodpecker/detect-duplicates.py +++ b/.woodpecker/detect-duplicates.py @@ -294,45 +294,6 @@ def main() -> int: "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern", # Standard lib source block shared across formula-driven agent run scripts "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)", - # Test data for duplicate service detection tests (#850) - # Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh - "334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)", - "d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)", - # Common vault-seed script patterns: logging helpers + flag parsing - # Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh - "843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)", - "ee51df9642f2ef37af73b0c15f4d8406": "Logging helpers + DRY_RUN loop start (vault-seed-woodpecker + wp-oauth-register)", - "9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)", - "9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)", - "5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)", - # forgejo-bootstrap.sh follows wp-oauth-register.sh pattern (issue #1069) - "2b80185e4ae2b54e2e01f33e5555c688": "Standard header (set -euo pipefail, SCRIPT_DIR, REPO_ROOT) (forgejo-bootstrap + wp-oauth-register)", - "38a1f20a60d69f0d6bfb06a0532b3bd7": "Logging helpers + DRY_RUN init (forgejo-bootstrap + wp-oauth-register)", - "4dd3c526fa29bdaa88b274c3d7d01032": "Flag parsing loop + case start (forgejo-bootstrap + wp-oauth-register)", - # Common vault-seed script preamble + precondition patterns - # Shared across tools/vault-seed-{forgejo,agents,woodpecker}.sh - "dff3675c151fcdbd2fef798826ae919b": "Vault-seed preamble: set -euo + path setup + source hvault.sh + KV_MOUNT", - "1cd9f0d083e24e6e6b2071db9b6dae09": "Vault-seed preconditions: binary check loop + VAULT_ADDR guard", - "63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die", - "34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup", - "71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die", - # Common vault-seed script flag parsing patterns - # Shared across tools/vault-seed-{forgejo,ops-repo}.sh - "6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)", - "a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)", - "e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)", - "c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)", - "106f4071e88f841b3208b01144cd1c39": "vault-seed case pattern dry-run end (forgejo + ops-repo)", - "c15506dcb6bb340b25d1c39d442dd2e6": "vault-seed help text + invalid arg handler (forgejo + ops-repo)", - "1feecd3b3caf00045fae938ddf2811de": "vault-seed invalid arg handler (forgejo + ops-repo)", - "919780d5e7182715344f5aa02b191294": "vault-seed invalid arg + esac pattern (forgejo + ops-repo)", - "8dce1d292bce8e60ef4c0665b62945b0": "vault-seed esac + binary check loop (forgejo + ops-repo)", - "ca043687143a5b47bd54e65a99ce8ee8": "vault-seed binary check loop start (forgejo + ops-repo)", - "aefd9f655411a955395e6e5995ddbe6f": "vault-seed binary check pattern (forgejo + ops-repo)", - "60f0c46deb5491599457efb4048918e5": "vault-seed VAULT_ADDR + hvault_token_lookup check (forgejo + ops-repo)", - "f6838f581ef6b4d82b55268389032769": "vault-seed VAULT_ADDR + hvault_token_lookup die (forgejo + ops-repo)", - # Common shell control-flow: if → return 1 → fi → fi (env.sh + register.sh) - "a8bdb7f1a5d8cbd0a5921b17b6cf6f4d": "Common shell control-flow (return 1 / fi / fi / return 0 / }) (env.sh + register.sh)", } if not sh_files: diff --git a/.woodpecker/edge-subpath.yml b/.woodpecker/edge-subpath.yml deleted file mode 100644 index 2c11980..0000000 --- a/.woodpecker/edge-subpath.yml +++ /dev/null @@ -1,317 +0,0 @@ -# ============================================================================= -# .woodpecker/edge-subpath.yml — Edge subpath routing static checks -# -# Static validation for edge subpath routing configuration. This pipeline does -# NOT run live service curls — it validates the configuration that would be -# used by a deployed edge proxy. -# -# Checks: -# 1. shellcheck — syntax check on tests/smoke-edge-subpath.sh -# 2. caddy validate — validate the Caddyfile template syntax -# 3. caddyfile-routing-test — verify Caddyfile routing block shape -# 4. test-caddyfile-routing — run standalone unit test for Caddyfile structure -# -# Triggers: -# - Pull requests that modify edge-related files -# -# Environment variables (inherited from WOODPECKER_ENVIRONMENT): -# EDGE_BASE_URL — Edge proxy URL for reference (default: http://localhost) -# EDGE_TIMEOUT — Request timeout in seconds (default: 30) -# EDGE_MAX_RETRIES — Max retries per request (default: 3) -# ============================================================================= - -when: - event: pull_request - -steps: - # ── 1. ShellCheck on smoke script ──────────────────────────────────────── - # `shellcheck` validates bash syntax, style, and common pitfalls. - # Exit codes: - # 0 — all checks passed - # 1 — one or more issues found - - name: shellcheck-smoke - image: koalaman/shellcheck-alpine:stable - commands: - - shellcheck --severity=warning tests/smoke-edge-subpath.sh tests/test-caddyfile-routing.sh - - # ── 2. Caddyfile template rendering ─────────────────────────────────────── - # Render a mock Caddyfile for validation. The template uses Nomad's - # templating syntax ({{ range ... }}) which must be processed before Caddy - # can validate it. We render a mock version with Nomad templates expanded - # to static values for validation purposes. - - name: render-caddyfile - image: alpine:3.19 - commands: - - apk add --no-cache coreutils - - | - set -e - mkdir -p edge-render - # Render mock Caddyfile with Nomad templates expanded - { - echo '# Caddyfile — edge proxy configuration (Nomad-rendered)' - echo '# Staging upstream discovered via Nomad service registration.' - echo '' - echo ':80 {' - echo ' # Redirect root to Forgejo' - echo ' handle / {' - echo ' redir /forge/ 302' - echo ' }' - echo '' - echo ' # Reverse proxy to Forgejo' - echo ' handle /forge/* {' - echo ' reverse_proxy 127.0.0.1:3000' - echo ' }' - echo '' - echo ' # Reverse proxy to Woodpecker CI' - echo ' handle /ci/* {' - echo ' reverse_proxy 127.0.0.1:8000' - echo ' }' - echo '' - echo ' # Reverse proxy to staging — dynamic port via Nomad service discovery' - echo ' handle /staging/* {' - echo ' reverse_proxy 127.0.0.1:8081' - echo ' }' - echo '' - echo ' # Chat service — reverse proxy to disinto-chat backend (#705)' - echo ' # OAuth routes bypass forward_auth — unauthenticated users need these (#709)' - echo ' handle /chat/login {' - echo ' reverse_proxy 127.0.0.1:8080' - echo ' }' - echo ' handle /chat/oauth/callback {' - echo ' reverse_proxy 127.0.0.1:8080' - echo ' }' - echo ' # Defense-in-depth: forward_auth stamps X-Forwarded-User from session (#709)' - echo ' handle /chat/* {' - echo ' forward_auth 127.0.0.1:8080 {' - echo ' uri /chat/auth/verify' - echo ' copy_headers X-Forwarded-User' - echo ' header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}' - echo ' }' - echo ' reverse_proxy 127.0.0.1:8080' - echo ' }' - echo '}' - } > edge-render/Caddyfile - cp edge-render/Caddyfile edge-render/Caddyfile.rendered - echo "Caddyfile rendered successfully" - - # ── 3. Caddy config validation ─────────────────────────────────────────── - # `caddy validate` checks Caddyfile syntax and configuration. - # This validates the rendered Caddyfile against Caddy's parser. - # Exit codes: - # 0 — configuration is valid - # 1 — configuration has errors - - name: caddy-validate - image: alpine:3.19 - commands: - - apk add --no-cache ca-certificates curl - - curl -sS -o /tmp/caddy "https://caddyserver.com/api/download?os=linux&arch=amd64" - - chmod +x /tmp/caddy - - /tmp/caddy version - - /tmp/caddy validate --config edge-render/Caddyfile.rendered --adapter caddyfile - - # ── 4. Caddyfile routing block shape test ───────────────────────────────── - # Verify that the Caddyfile contains all required routing blocks: - # - /forge/ — Forgejo subpath - # - /ci/ — Woodpecker subpath - # - /staging/ — Staging subpath - # - /chat/ — Chat subpath with forward_auth - # - # This is a unit test that validates the expected structure without - # requiring a running Caddy instance. - - name: caddyfile-routing-test - image: alpine:3.19 - commands: - - apk add --no-cache grep coreutils - - | - set -e - - CADDYFILE="edge-render/Caddyfile.rendered" - - echo "=== Validating Caddyfile routing blocks ===" - - # Check that all required subpath handlers exist - # POSIX-safe loop (alpine /bin/sh has no arrays) - FAILED=0 - for handler in "handle /forge/\*" "handle /ci/\*" "handle /staging/\*" "handle /chat/login" "handle /chat/oauth/callback" "handle /chat/\*"; do - if grep -q "$handler" "$CADDYFILE"; then - echo "[PASS] Found handler: $handler" - else - echo "[FAIL] Missing handler: $handler" - FAILED=1 - fi - done - - # Check forward_auth block exists for /chat/* - if grep -A5 "handle /chat/\*" "$CADDYFILE" | grep -q "forward_auth"; then - echo "[PASS] forward_auth block found for /chat/*" - else - echo "[FAIL] forward_auth block missing for /chat/*" - FAILED=1 - fi - - # Check reverse_proxy to Forgejo (port 3000) - if grep -q "reverse_proxy 127.0.0.1:3000" "$CADDYFILE"; then - echo "[PASS] Forgejo reverse_proxy configured (port 3000)" - else - echo "[FAIL] Forgejo reverse_proxy not configured" - FAILED=1 - fi - - # Check reverse_proxy to Woodpecker (port 8000) - if grep -q "reverse_proxy 127.0.0.1:8000" "$CADDYFILE"; then - echo "[PASS] Woodpecker reverse_proxy configured (port 8000)" - else - echo "[FAIL] Woodpecker reverse_proxy not configured" - FAILED=1 - fi - - # Check reverse_proxy to Chat (port 8080) - if grep -q "reverse_proxy 127.0.0.1:8080" "$CADDYFILE"; then - echo "[PASS] Chat reverse_proxy configured (port 8080)" - else - echo "[FAIL] Chat reverse_proxy not configured" - FAILED=1 - fi - - # Check root redirect to /forge/ - if grep -q "redir /forge/ 302" "$CADDYFILE"; then - echo "[PASS] Root redirect to /forge/ configured" - else - echo "[FAIL] Root redirect to /forge/ not configured" - FAILED=1 - fi - - echo "" - if [ $FAILED -eq 0 ]; then - echo "=== All routing blocks validated ===" - exit 0 - else - echo "=== Routing block validation failed ===" >&2 - exit 1 - fi - - # ── 5. Standalone Caddyfile routing test ───────────────────────────────── - # Run the standalone unit test for Caddyfile routing block validation. - # This test extracts the Caddyfile template from edge.hcl and validates - # its structure without requiring a running Caddy instance. - - name: test-caddyfile-routing - image: alpine:3.19 - commands: - - apk add --no-cache grep coreutils - - | - set -e - EDGE_TEMPLATE="nomad/jobs/edge.hcl" - - echo "=== Extracting Caddyfile template from $EDGE_TEMPLATE ===" - - # Extract the Caddyfile template (content between <&2 - exit 1 - fi - - echo "Caddyfile template extracted successfully" - echo "" - - FAILED=0 - - # Check Forgejo subpath - if echo "$CADDYFILE" | grep -q "handle /forge/\*"; then - echo "[PASS] Forgejo handle block" - else - echo "[FAIL] Forgejo handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:3000"; then - echo "[PASS] Forgejo reverse_proxy (port 3000)" - else - echo "[FAIL] Forgejo reverse_proxy (port 3000)" - FAILED=1 - fi - - # Check Woodpecker subpath - if echo "$CADDYFILE" | grep -q "handle /ci/\*"; then - echo "[PASS] Woodpecker handle block" - else - echo "[FAIL] Woodpecker handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:8000"; then - echo "[PASS] Woodpecker reverse_proxy (port 8000)" - else - echo "[FAIL] Woodpecker reverse_proxy (port 8000)" - FAILED=1 - fi - - # Check Staging subpath - if echo "$CADDYFILE" | grep -q "handle /staging/\*"; then - echo "[PASS] Staging handle block" - else - echo "[FAIL] Staging handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "nomadService"; then - echo "[PASS] Staging Nomad service discovery" - else - echo "[FAIL] Staging Nomad service discovery" - FAILED=1 - fi - - # Check Chat subpath - if echo "$CADDYFILE" | grep -q "handle /chat/login"; then - echo "[PASS] Chat login handle block" - else - echo "[FAIL] Chat login handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "handle /chat/oauth/callback"; then - echo "[PASS] Chat OAuth callback handle block" - else - echo "[FAIL] Chat OAuth callback handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "handle /chat/\*"; then - echo "[PASS] Chat catch-all handle block" - else - echo "[FAIL] Chat catch-all handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:8080"; then - echo "[PASS] Chat reverse_proxy (port 8080)" - else - echo "[FAIL] Chat reverse_proxy (port 8080)" - FAILED=1 - fi - - # Check forward_auth for chat - if echo "$CADDYFILE" | grep -A10 "handle /chat/\*" | grep -q "forward_auth"; then - echo "[PASS] forward_auth block for /chat/*" - else - echo "[FAIL] forward_auth block for /chat/*" - FAILED=1 - fi - - # Check root redirect - if echo "$CADDYFILE" | grep -q "redir /forge/ 302"; then - echo "[PASS] Root redirect to /forge/" - else - echo "[FAIL] Root redirect to /forge/" - FAILED=1 - fi - - echo "" - if [ $FAILED -eq 0 ]; then - echo "=== All routing blocks validated ===" - exit 0 - else - echo "=== Routing block validation failed ===" >&2 - exit 1 - fi diff --git a/AGENTS.md b/AGENTS.md index 52ea01f..ad3867b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is @@ -37,20 +37,19 @@ disinto/ (code repo) │ examples/ — example vault action TOMLs (promote, publish, release, webhook-call) ├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh │ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure) -│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825); wp-oauth-register.sh (Forgejo OAuth2 app + Vault KV seeder for Woodpecker, S3.3); deploy.sh (dependency-ordered Nomad job deploy + health-wait, S4) -├── nomad/ server.hcl, client.hcl (allow_privileged for woodpecker-agent, S3-fix-5), vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh -│ jobs/ — Nomad jobspecs: forgejo.hcl (Vault secrets via template, S2.4); woodpecker-server.hcl + woodpecker-agent.hcl (host-net, docker.sock, Vault KV, S3.1-S3.2); agents.hcl (7 roles, llama, Vault-templated bot tokens, S4.1); vault-runner.hcl (parameterized batch dispatch, S5.3); staging.hcl (Caddy file-server, dynamic port — edge discovers via service registration, S5.2); chat.hcl (Claude chat UI, tmpfs via mount block, Vault OAuth secrets, S5.2); edge.hcl (Caddy proxy + dispatcher sidecar, S5.1) +│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825) +├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh +│ jobs/ — Nomad jobspecs (forgejo.hcl reads Vault secrets via template stanza, S2.4) ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks) -├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge (Caddy + chat server subprocess + dispatcher), chat (server.py, ui/ — copied into edge image at build time) -├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh; register.sh enforces: reserved-name blocklist, admin-approved allowlist via /var/lib/disinto/allowlist.json, per-caller attribution via --as forced-command arg stored as registered_by, append-only audit log at /var/log/disinto/edge-register.log, ownership check on deregister requiring pubkey match) -│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh — Vault provisioning (S2.1/S2.2) -│ vault-seed-.sh — per-service Vault secret seeders; auto-invoked by `bin/disinto --with ` (add a new file to support a new service) +├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/) +├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh) +│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh, vault-seed-forgejo.sh — Vault provisioning (S2.1/S2.2) ├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md) ├── site/ disinto.ai website content -├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, lib-generators.bats, vault-import.bats, disinto-init-nomad.bats) +├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, disinto-init-nomad.bats) ├── templates/ Issue templates -├── bin/ The `disinto` CLI script (`--with ` deploys services + runs their Vault seeders) +├── bin/ The `disinto` CLI script ├── disinto-factory/ Setup documentation and skill ├── state/ Runtime state ├── .woodpecker/ Woodpecker CI pipeline configs diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 98c0e04..7f8b1f4 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index b0893c4..2b676a3 100755 --- a/bin/disinto +++ b/bin/disinto @@ -12,7 +12,6 @@ # disinto secrets Manage encrypted secrets # disinto run Run action in ephemeral runner container # disinto ci-logs [--step ] Read CI logs from Woodpecker SQLite -# disinto backup create Export factory state for migration # # Usage: # disinto init https://github.com/user/repo @@ -40,9 +39,7 @@ source "${FACTORY_ROOT}/lib/generators.sh" source "${FACTORY_ROOT}/lib/forge-push.sh" source "${FACTORY_ROOT}/lib/ci-setup.sh" source "${FACTORY_ROOT}/lib/release.sh" -source "${FACTORY_ROOT}/lib/backup.sh" source "${FACTORY_ROOT}/lib/claude-config.sh" -source "${FACTORY_ROOT}/lib/disinto/backup.sh" # backup create/import # ── Helpers ────────────────────────────────────────────────────────────────── @@ -65,9 +62,7 @@ Usage: disinto hire-an-agent [--formula ] [--local-model ] [--model ] Hire a new agent (create user + .profile repo; re-run to rotate credentials) disinto agent Manage agent state (enable/disable) - disinto backup create Export factory state (issues + ops bundle) disinto edge [options] Manage edge tunnel registrations - disinto backup Backup and restore factory state Edge subcommands: register [project] Register a new tunnel (generates keypair if needed) @@ -87,7 +82,7 @@ Init options: --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) --backend Orchestration backend: docker (default) | nomad - --with (nomad) Deploy services: forgejo,woodpecker,agents,staging,chat,edge[,...] (S1.3, S3.4, S4.2, S5.2, S5.5) + --with (nomad) Deploy services: forgejo[,...] (S1.3) --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) @@ -106,18 +101,6 @@ Hire an agent options: CI logs options: --step Filter logs to a specific step (e.g., smoke-init) - -Backup subcommands: - create Create backup of factory state to tarball - import Restore factory state from backup tarball - -Import behavior: - - Unpacks tarball to temp directory - - Creates disinto repo via Forgejo API (mirror config is manual) - - Creates disinto-ops repo and pushes refs from bundle - - Imports issues from issues/*.json (idempotent - skips existing) - - Logs issue number mapping (Forgejo auto-assigns numbers) - - Prints summary: created X repos, pushed Y refs, imported Z issues, skipped W EOF exit 1 } @@ -687,7 +670,6 @@ _disinto_init_nomad() { local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}" local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh" - local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh" local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh" local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh" local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh" @@ -708,22 +690,15 @@ _disinto_init_nomad() { # --empty combined with --with or any --import-* flag, so reaching # this branch with those set is a bug in the caller. # - # On the default (non-empty) path, vault-engines.sh (enables the kv/ - # mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked - # unconditionally — they are idempotent and cheap to re-run, and - # subsequent --with deployments depend on them. vault-import.sh is - # invoked only when an --import-* flag is set. vault-engines.sh runs - # first because every policy and role below references kv/disinto/* - # paths, which 403 if the engine is not yet mounted (issue #912). + # On the default (non-empty) path, vault-apply-policies.sh and + # vault-nomad-auth.sh are invoked unconditionally — they are idempotent + # and cheap to re-run, and subsequent --with deployments depend on + # them. vault-import.sh is invoked only when an --import-* flag is set. local import_any=false if [ -n "$import_env" ] || [ -n "$import_sops" ]; then import_any=true fi if [ "$empty" != "true" ]; then - if [ ! -x "$vault_engines_sh" ]; then - echo "Error: ${vault_engines_sh} not found or not executable" >&2 - exit 1 - fi if [ ! -x "$vault_policies_sh" ]; then echo "Error: ${vault_policies_sh} not found or not executable" >&2 exit 1 @@ -762,15 +737,10 @@ _disinto_init_nomad() { exit 0 fi - # Vault engines + policies + auth are invoked on every nomad real-run - # path regardless of --import-* flags (they're idempotent; S2.1 + S2.3). - # Engines runs first because policies/roles/templates all reference the - # kv/ mount it enables (issue #912). Mirror that ordering in the - # dry-run plan so the operator sees the full sequence Step 2 will - # execute. - echo "── Vault engines dry-run ──────────────────────────────" - echo "[engines] [dry-run] ${vault_engines_sh} --dry-run" - echo "" + # Vault policies + auth are invoked on every nomad real-run path + # regardless of --import-* flags (they're idempotent; S2.1 + S2.3). + # Mirror that ordering in the dry-run plan so the operator sees the + # full sequence Step 2 will execute. echo "── Vault policies dry-run ─────────────────────────────" echo "[policies] [dry-run] ${vault_policies_sh} --dry-run" echo "" @@ -800,37 +770,19 @@ _disinto_init_nomad() { fi if [ -n "$with_services" ]; then - # Interleaved seed/deploy per service (S2.6, #928, #948): match the - # real-run path so dry-run output accurately represents execution order. - # Build ordered deploy list: only include services present in with_services - local DEPLOY_ORDER="" - for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do - if echo ",$with_services," | grep -q ",$ordered_svc,"; then - DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" - fi - done - - local IFS=' ' - echo "[deploy] deployment order: ${DEPLOY_ORDER}" - for svc in $DEPLOY_ORDER; do - # Seed this service (if seed script exists) - local seed_name="$svc" + echo "── Deploy services dry-run ────────────────────────────" + echo "[deploy] services to deploy: ${with_services}" + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + # Validate known services first case "$svc" in - woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; - agents) seed_name="agents" ;; - chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; + forgejo) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo" >&2 + exit 1 + ;; esac - local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" - if [ -x "$seed_script" ]; then - echo "── Vault seed dry-run ─────────────────────────────────" - echo "[seed] [dry-run] ${seed_script} --dry-run" - echo "" - fi - - # Deploy this service - echo "── Deploy services dry-run ────────────────────────────" - echo "[deploy] services to deploy: ${with_services}" local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 @@ -838,40 +790,9 @@ _disinto_init_nomad() { fi echo "[deploy] [dry-run] nomad job validate ${jobspec_path}" echo "[deploy] [dry-run] nomad job run -detach ${jobspec_path}" - # Post-deploy: forgejo-bootstrap - if [ "$svc" = "forgejo" ]; then - local bootstrap_script="${FACTORY_ROOT}/lib/init/nomad/forgejo-bootstrap.sh" - echo "[deploy] [dry-run] [post-deploy] would run ${bootstrap_script}" - fi done echo "[deploy] dry-run complete" fi - - # Dry-run vault-runner (unconditionally, not gated by --with) - echo "" - echo "── Vault-runner dry-run ───────────────────────────────────" - local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" - if [ -f "$vault_runner_path" ]; then - echo "[deploy] vault-runner: [dry-run] nomad job validate ${vault_runner_path}" - echo "[deploy] vault-runner: [dry-run] nomad job run -detach ${vault_runner_path}" - else - echo "[deploy] vault-runner: jobspec not found, skipping" - fi - - # Build custom images dry-run (if agents, chat, or edge services are included) - if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then - echo "" - echo "── Build images dry-run ──────────────────────────────" - if echo ",$with_services," | grep -q ",agents,"; then - echo "[build] [dry-run] docker build -t disinto/agents:local -f ${FACTORY_ROOT}/docker/agents/Dockerfile ${FACTORY_ROOT}" - fi - if echo ",$with_services," | grep -q ",chat,"; then - echo "[build] [dry-run] docker build -t disinto/chat:local -f ${FACTORY_ROOT}/docker/chat/Dockerfile ${FACTORY_ROOT}/docker/chat" - fi - if echo ",$with_services," | grep -q ",edge,"; then - echo "[build] [dry-run] docker build -t disinto/edge:local -f ${FACTORY_ROOT}/docker/edge/Dockerfile ${FACTORY_ROOT}/docker/edge" - fi - fi exit 0 fi @@ -893,22 +814,6 @@ _disinto_init_nomad() { exit 0 fi - # Enable Vault secret engines (S2.1 / issue #912) — must precede - # policies/auth/import because every policy and every import target - # addresses paths under kv/. Idempotent, safe to re-run. - echo "" - echo "── Enabling Vault secret engines ──────────────────────" - local -a engines_cmd=("$vault_engines_sh") - if [ "$(id -u)" -eq 0 ]; then - "${engines_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${engines_cmd[@]}" || exit $? - fi - # Apply Vault policies (S2.1) — idempotent, safe to re-run. echo "" echo "── Applying Vault policies ────────────────────────────" @@ -959,147 +864,44 @@ _disinto_init_nomad() { echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" fi - # Build custom images required by Nomad jobs (S4.2, S5.2, S5.5) — before deploy. - # Single-node factory dev box: no multi-node pull needed, no registry auth. - # Can upgrade to approach B (registry push/pull) later if multi-node. - if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then - echo "" - echo "── Building custom images ─────────────────────────────" - if echo ",$with_services," | grep -q ",agents,"; then - local tag="disinto/agents:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/agents/Dockerfile" "${FACTORY_ROOT}" 2>&1 | tail -5 - fi - if echo ",$with_services," | grep -q ",chat,"; then - local tag="disinto/chat:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/chat/Dockerfile" "${FACTORY_ROOT}/docker/chat" 2>&1 | tail -5 - fi - if echo ",$with_services," | grep -q ",edge,"; then - local tag="disinto/edge:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/edge/Dockerfile" "${FACTORY_ROOT}/docker/edge" 2>&1 | tail -5 - fi - fi - - # Interleaved seed/deploy per service (S2.6, #928, #948). - # We interleave seed + deploy per service (not batch all seeds then all deploys) - # so that OAuth-dependent services can reach their dependencies during seeding. - # E.g., seed-forgejo → deploy-forgejo → seed-woodpecker (OAuth can now reach - # running forgejo) → deploy-woodpecker. + # Deploy services if requested if [ -n "$with_services" ]; then - local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}" - - # Build ordered deploy list (S3.4, S4.2, S5.2, S5.5): forgejo → woodpecker-server → woodpecker-agent → agents → staging → chat → edge - local DEPLOY_ORDER="" - for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do - if echo ",$with_services," | grep -q ",$ordered_svc,"; then - DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" + echo "" + echo "── Deploying services ─────────────────────────────────" + local -a deploy_cmd=("$deploy_sh") + # Split comma-separated service list into positional args + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then + echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2 + exit 1 fi - done - - local IFS=' ' - for svc in $DEPLOY_ORDER; do - # Seed this service (if seed script exists) - local seed_name="$svc" + # Validate known services FIRST (before jobspec check) case "$svc" in - woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; - agents) seed_name="agents" ;; - chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; - esac - local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" - if [ -x "$seed_script" ]; then - echo "" - echo "── Seeding Vault for ${seed_name} ───────────────────────────" - if [ "$(id -u)" -eq 0 ]; then - VAULT_ADDR="$vault_addr" "$seed_script" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-seed-${seed_name}.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $? - fi - fi - - # Deploy this service - echo "" - echo "── Deploying ${svc} ───────────────────────────────────────" - - # Seed host volumes before deployment (if needed) - case "$svc" in - staging) - # Seed site-content host volume (/srv/disinto/docker) with static content - # The staging jobspec mounts this volume read-only to /srv/site - local site_content_src="${FACTORY_ROOT}/docker/index.html" - local site_content_dst="/srv/disinto/docker" - if [ -f "$site_content_src" ] && [ -d "$site_content_dst" ]; then - if ! cmp -s "$site_content_src" "${site_content_dst}/index.html" 2>/dev/null; then - echo "[staging] seeding site-content volume..." - cp "$site_content_src" "${site_content_dst}/index.html" - fi - fi + forgejo) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo" >&2 + exit 1 ;; esac - + # Check jobspec exists local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 exit 1 fi - - local -a deploy_cmd=("$deploy_sh" "$svc") - if [ "$(id -u)" -eq 0 ]; then - "${deploy_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n --preserve-env=FORGE_ADMIN_PASS,FORGE_TOKEN,FORGE_URL -- "${deploy_cmd[@]}" || exit $? - fi - - # Post-deploy: bootstrap Forgejo admin user after forgejo deployment - if [ "$svc" = "forgejo" ]; then - echo "" - echo "── Bootstrapping Forgejo admin user ───────────────────────" - local bootstrap_script="${FACTORY_ROOT}/lib/init/nomad/forgejo-bootstrap.sh" - if [ -x "$bootstrap_script" ]; then - if [ "$(id -u)" -eq 0 ]; then - "$bootstrap_script" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: forgejo-bootstrap.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n --preserve-env=FORGE_ADMIN_PASS,FORGE_TOKEN,FORGE_URL -- "$bootstrap_script" || exit $? - fi - else - echo "warning: forgejo-bootstrap.sh not found or not executable" >&2 - fi - fi + deploy_cmd+=("$svc") done - # Run vault-runner (unconditionally, not gated by --with) — infrastructure job - # vault-runner is always present since it's needed for vault action dispatch - echo "" - echo "── Running vault-runner ────────────────────────────────────" - local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" - if [ -f "$vault_runner_path" ]; then - echo "[deploy] vault-runner: running Nomad job (infrastructure)" - local -a vault_runner_cmd=("$deploy_sh" "vault-runner") - if [ "$(id -u)" -eq 0 ]; then - "${vault_runner_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${vault_runner_cmd[@]}" || exit $? - fi + if [ "$(id -u)" -eq 0 ]; then + "${deploy_cmd[@]}" || exit $? else - echo "[deploy] vault-runner: jobspec not found, skipping" + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: deploy.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${deploy_cmd[@]}" || exit $? fi # Print final summary @@ -1117,24 +919,9 @@ _disinto_init_nomad() { echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)" fi echo "Deployed: ${with_services}" - if echo ",$with_services," | grep -q ",forgejo,"; then + if echo "$with_services" | grep -q "forgejo"; then echo "Ports: forgejo: 3000" fi - if echo ",$with_services," | grep -q ",woodpecker-server,"; then - echo " woodpecker-server: 8000" - fi - if echo ",$with_services," | grep -q ",woodpecker-agent,"; then - echo " woodpecker-agent: (agent connected)" - fi - if echo ",$with_services," | grep -q ",agents,"; then - echo " agents: (polling loop running)" - fi - if echo ",$with_services," | grep -q ",staging,"; then - echo " staging: (internal, no external port)" - fi - if echo ",$with_services," | grep -q ",chat,"; then - echo " chat: 8080" - fi echo "────────────────────────────────────────────────────────" fi @@ -1220,70 +1007,6 @@ disinto_init() { exit 1 fi - # Normalize --with services (S3.4): expand 'woodpecker' shorthand to - # 'woodpecker-server,woodpecker-agent', auto-include forgejo when - # woodpecker is requested (OAuth dependency), and validate all names. - if [ -n "$with_services" ]; then - # Expand 'woodpecker' (bare) → 'woodpecker-server,woodpecker-agent'. - # Must not match already-expanded 'woodpecker-server'/'woodpecker-agent'. - local expanded="" - local IFS=',' - for _svc in $with_services; do - _svc=$(echo "$_svc" | xargs) - case "$_svc" in - woodpecker) _svc="woodpecker-server,woodpecker-agent" ;; - agents) _svc="agents" ;; - esac - expanded="${expanded:+${expanded},}${_svc}" - done - with_services="$expanded" - unset IFS - - # Auto-include forgejo when woodpecker is requested - if echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent," \ - && ! echo ",$with_services," | grep -q ",forgejo,"; then - echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)" - with_services="forgejo,${with_services}" - fi - - # Auto-include forgejo and woodpecker when agents is requested - if echo ",$with_services," | grep -q ",agents,"; then - if ! echo ",$with_services," | grep -q ",forgejo,"; then - echo "Note: --with agents implies --with forgejo (agents need forge)" - with_services="forgejo,${with_services}" - fi - if ! echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent,"; then - echo "Note: --with agents implies --with woodpecker (agents need CI)" - with_services="${with_services},woodpecker-server,woodpecker-agent" - fi - fi - - # Auto-include all dependencies when edge is requested (S5.5) - if echo ",$with_services," | grep -q ",edge,"; then - # Edge depends on all backend services - for dep in forgejo woodpecker-server woodpecker-agent agents staging chat; do - if ! echo ",$with_services," | grep -q ",${dep},"; then - echo "Note: --with edge implies --with ${dep} (edge depends on all backend services)" - with_services="${with_services},${dep}" - fi - done - fi - - # Validate all service names are known - local IFS=',' - for _svc in $with_services; do - _svc=$(echo "$_svc" | xargs) - case "$_svc" in - forgejo|woodpecker-server|woodpecker-agent|agents|staging|chat|edge) ;; - *) - echo "Error: unknown service '${_svc}' — known: forgejo, woodpecker-server, woodpecker-agent, agents, staging, chat, edge" >&2 - exit 1 - ;; - esac - done - unset IFS - fi - # --import-* flag validation (S2.5). These three flags form an import # triple and must be consistent before dispatch: sops encryption is # useless without the age key to decrypt it, so either both --import-sops @@ -1474,36 +1197,6 @@ p.write_text(text) exit 0 fi - # Configure Forgejo and Woodpecker URLs when EDGE_TUNNEL_FQDN is set. - # In subdomain mode, uses per-service FQDNs at root path instead of subpath URLs. - # Must run BEFORE generate_compose so the .env file is available for variable substitution. - if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then - local routing_mode="${EDGE_ROUTING_MODE:-subpath}" - # Create .env file if it doesn't exist yet (needed before compose generation) - if [ "$bare" = false ] && [ ! -f "${FACTORY_ROOT}/.env" ]; then - touch "${FACTORY_ROOT}/.env" - fi - if [ "$routing_mode" = "subdomain" ]; then - # Subdomain mode: Forgejo at forge..disinto.ai (root path) - if ! grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN_FORGE:-forge.${EDGE_TUNNEL_FQDN}}/" >> "${FACTORY_ROOT}/.env" - fi - # Subdomain mode: Woodpecker at ci..disinto.ai (root path) - if ! grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN_CI:-ci.${EDGE_TUNNEL_FQDN}}" >> "${FACTORY_ROOT}/.env" - fi - else - # Subpath mode: Forgejo ROOT_URL with /forge/ subpath (trailing slash required) - if ! grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN}/forge/" >> "${FACTORY_ROOT}/.env" - fi - # Subpath mode: Woodpecker WOODPECKER_HOST with /ci subpath (no trailing slash for v3) - if ! grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN}/ci" >> "${FACTORY_ROOT}/.env" - fi - fi - fi - # Generate compose files (unless --bare) if [ "$bare" = false ]; then local forge_port @@ -1518,6 +1211,18 @@ p.write_text(text) touch "${FACTORY_ROOT}/.env" fi + # Configure Forgejo and Woodpecker subpath URLs when EDGE_TUNNEL_FQDN is set + if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then + # Forgejo ROOT_URL with /forge/ subpath (note trailing slash - Forgejo needs it) + if ! grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then + echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN}/forge/" >> "${FACTORY_ROOT}/.env" + fi + # Woodpecker WOODPECKER_HOST with /ci subpath (no trailing slash for v3) + if ! grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then + echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN}/ci" >> "${FACTORY_ROOT}/.env" + fi + fi + # Prompt for FORGE_ADMIN_PASS before setup_forge # This ensures the password is set before Forgejo user creation prompt_admin_password "${FACTORY_ROOT}/.env" @@ -1621,15 +1326,9 @@ p.write_text(text) create_woodpecker_oauth "$forge_url" "$forge_repo" # Create OAuth2 app on Forgejo for disinto-chat (#708) - # In subdomain mode, callback is at chat. root instead of /chat/ subpath. local chat_redirect_uri if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then - local chat_routing_mode="${EDGE_ROUTING_MODE:-subpath}" - if [ "$chat_routing_mode" = "subdomain" ]; then - chat_redirect_uri="https://${EDGE_TUNNEL_FQDN_CHAT:-chat.${EDGE_TUNNEL_FQDN}}/oauth/callback" - else - chat_redirect_uri="https://${EDGE_TUNNEL_FQDN}/chat/oauth/callback" - fi + chat_redirect_uri="https://${EDGE_TUNNEL_FQDN}/chat/oauth/callback" else chat_redirect_uri="http://localhost/chat/oauth/callback" fi @@ -2829,29 +2528,15 @@ disinto_edge() { # Write to .env (replace existing entries to avoid duplicates) local tmp_env tmp_env=$(mktemp) - grep -Ev "^EDGE_TUNNEL_(HOST|PORT|FQDN|FQDN_FORGE|FQDN_CI|FQDN_CHAT)=" "$env_file" > "$tmp_env" 2>/dev/null || true + grep -Ev "^EDGE_TUNNEL_(HOST|PORT|FQDN)=" "$env_file" > "$tmp_env" 2>/dev/null || true mv "$tmp_env" "$env_file" echo "EDGE_TUNNEL_HOST=${edge_host}" >> "$env_file" echo "EDGE_TUNNEL_PORT=${port}" >> "$env_file" echo "EDGE_TUNNEL_FQDN=${fqdn}" >> "$env_file" - # Subdomain mode: write per-service FQDNs (#1028) - local reg_routing_mode="${EDGE_ROUTING_MODE:-subpath}" - if [ "$reg_routing_mode" = "subdomain" ]; then - echo "EDGE_TUNNEL_FQDN_FORGE=forge.${fqdn}" >> "$env_file" - echo "EDGE_TUNNEL_FQDN_CI=ci.${fqdn}" >> "$env_file" - echo "EDGE_TUNNEL_FQDN_CHAT=chat.${fqdn}" >> "$env_file" - fi - echo "Registered: ${project}" echo " Port: ${port}" echo " FQDN: ${fqdn}" - if [ "$reg_routing_mode" = "subdomain" ]; then - echo " Mode: subdomain" - echo " Forge: forge.${fqdn}" - echo " CI: ci.${fqdn}" - echo " Chat: chat.${fqdn}" - fi echo " Saved to: ${env_file}" ;; @@ -2885,23 +2570,12 @@ disinto_edge() { edge_host="${EDGE_HOST:-edge.disinto.ai}" fi - # Read tunnel pubkey for ownership proof - local secrets_dir="${FACTORY_ROOT}/secrets" - local tunnel_pubkey="${secrets_dir}/tunnel_key.pub" - if [ ! -f "$tunnel_pubkey" ]; then - echo "Error: tunnel keypair not found at ${tunnel_pubkey}" >&2 - echo "Cannot prove ownership without the tunnel public key." >&2 - exit 1 - fi - local pubkey - pubkey=$(tr -d '\n' < "$tunnel_pubkey") - # SSH to edge host and deregister echo "Deregistering tunnel for ${project} on ${edge_host}..." local response response=$(ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \ "disinto-register@${edge_host}" \ - "deregister ${project} ${pubkey}" 2>&1) || { + "deregister ${project}" 2>&1) || { echo "Error: failed to deregister tunnel" >&2 echo "Response: ${response}" >&2 exit 1 @@ -2984,33 +2658,6 @@ EOF esac } -# ── backup command ──────────────────────────────────────────────────────────── -# Usage: disinto backup [args] -# Subcommands: -# create Create backup of factory state -# import Restore factory state from backup -disinto_backup() { - local subcmd="${1:-}" - shift || true - - case "$subcmd" in - create) - backup_create "$@" - ;; - import) - backup_import "$@" - ;; - *) - echo "Usage: disinto backup [args]" >&2 - echo "" >&2 - echo "Subcommands:" >&2 - echo " create Create backup of factory state" >&2 - echo " import Restore factory state from backup" >&2 - exit 1 - ;; - esac -} - # ── Main dispatch ──────────────────────────────────────────────────────────── case "${1:-}" in @@ -3027,7 +2674,6 @@ case "${1:-}" in hire-an-agent) shift; disinto_hire_an_agent "$@" ;; agent) shift; disinto_agent "$@" ;; edge) shift; disinto_edge "$@" ;; - backup) shift; disinto_backup "$@" ;; -h|--help) usage ;; *) usage ;; esac diff --git a/dev/AGENTS.md b/dev/AGENTS.md index d48f6b6..13d9736 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/docker-compose.yml b/docker-compose.yml index 6206b2c..ba8c77c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,6 +15,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -77,6 +78,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -137,6 +139,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -208,8 +211,8 @@ services: edge: build: - context: . - dockerfile: docker/edge/Dockerfile + context: docker/edge + dockerfile: Dockerfile image: disinto/edge:latest container_name: disinto-edge security_opt: @@ -220,8 +223,6 @@ services: - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/root/.claude.json:ro - ${CLAUDE_DIR:-${HOME}/.claude}:/root/.claude:ro - disinto-logs:/opt/disinto-logs - # Chat history persistence (merged from chat container, #1083) - - ${CHAT_HISTORY_DIR:-./state/chat-history}:/var/lib/chat/history environment: - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-} - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} @@ -233,17 +234,6 @@ services: - PRIMARY_BRANCH=main - DISINTO_CONTAINER=1 - FORGE_ADMIN_USERS=disinto-admin,vault-bot,admin - # Chat env vars (merged from chat container into edge, #1083) - - CHAT_HOST=127.0.0.1 - - CHAT_PORT=8080 - - CHAT_OAUTH_CLIENT_ID=${CHAT_OAUTH_CLIENT_ID:-} - - CHAT_OAUTH_CLIENT_SECRET=${CHAT_OAUTH_CLIENT_SECRET:-} - - DISINTO_CHAT_ALLOWED_USERS=${DISINTO_CHAT_ALLOWED_USERS:-} - - FORWARD_AUTH_SECRET=${FORWARD_AUTH_SECRET:-} - - EDGE_TUNNEL_FQDN=${EDGE_TUNNEL_FQDN:-} - - EDGE_TUNNEL_FQDN_CHAT=${EDGE_TUNNEL_FQDN_CHAT:-} - - EDGE_ROUTING_MODE=${EDGE_ROUTING_MODE:-subpath} - # Rate limiting removed (#1084) ports: - "80:80" - "443:443" diff --git a/docker/agents/Dockerfile b/docker/agents/Dockerfile index fa3b2d8..1bcba89 100644 --- a/docker/agents/Dockerfile +++ b/docker/agents/Dockerfile @@ -1,26 +1,21 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ - bash curl git jq tmux nodejs npm python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ + bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ && pip3 install --break-system-packages networkx tomlkit \ && rm -rf /var/lib/apt/lists/* # Pre-built binaries (copied from docker/agents/bin/) # SOPS — encrypted data decryption tool -# Download sops binary (replaces manual COPY of vendored binary) -ARG SOPS_VERSION=3.9.4 -RUN curl -fsSL "https://github.com/getsops/sops/releases/download/v${SOPS_VERSION}/sops-v${SOPS_VERSION}.linux.amd64" \ - -o /usr/local/bin/sops && chmod +x /usr/local/bin/sops +COPY docker/agents/bin/sops /usr/local/bin/sops +RUN chmod +x /usr/local/bin/sops # tea CLI — official Gitea/Forgejo CLI for issue/label/comment operations -# Download tea binary (replaces manual COPY of vendored binary) -ARG TEA_VERSION=0.9.2 -RUN curl -fsSL "https://dl.gitea.com/tea/${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \ - -o /usr/local/bin/tea && chmod +x /usr/local/bin/tea +COPY docker/agents/bin/tea /usr/local/bin/tea +RUN chmod +x /usr/local/bin/tea -# Install Claude Code CLI — agent runtime for all LLM backends (llama, Claude API). -# The CLI is the execution environment; ANTHROPIC_BASE_URL selects the model provider. -RUN npm install -g @anthropic-ai/claude-code@2.1.84 +# Claude CLI is mounted from the host via docker-compose volume. +# No internet access to cli.anthropic.com required at build time. # Non-root user RUN useradd -m -u 1000 -s /bin/bash agent diff --git a/docker/chat/Dockerfile b/docker/chat/Dockerfile new file mode 100644 index 0000000..3d89863 --- /dev/null +++ b/docker/chat/Dockerfile @@ -0,0 +1,35 @@ +# disinto-chat — minimal HTTP backend for Claude chat UI +# +# Small Debian slim base with Python runtime. +# Chosen for simplicity and small image size (~100MB). +# +# Image size: ~100MB (well under the 200MB ceiling) +# +# The claude binary is mounted from the host at runtime via docker-compose, +# not baked into the image — same pattern as the agents container. + +FROM debian:bookworm-slim + +# Install Python (no build-time network access needed) +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 \ + && rm -rf /var/lib/apt/lists/* + +# Non-root user — fixed UID 10001 for sandbox hardening (#706) +RUN useradd -m -u 10001 -s /bin/bash chat + +# Copy application files +COPY server.py /usr/local/bin/server.py +COPY entrypoint-chat.sh /entrypoint-chat.sh +COPY ui/ /var/chat/ui/ + +RUN chmod +x /entrypoint-chat.sh /usr/local/bin/server.py + +USER chat +WORKDIR /var/chat + +EXPOSE 8080 +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')" || exit 1 + +ENTRYPOINT ["/entrypoint-chat.sh"] diff --git a/docker/chat/entrypoint-chat.sh b/docker/chat/entrypoint-chat.sh new file mode 100755 index 0000000..00fbe53 --- /dev/null +++ b/docker/chat/entrypoint-chat.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -euo pipefail + +# entrypoint-chat.sh — Start the disinto-chat backend server +# +# Exec-replace pattern: this script is the container entrypoint and runs +# the server directly (no wrapper needed). Logs to stdout for docker logs. + +LOGFILE="/tmp/chat.log" + +log() { + printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" | tee -a "$LOGFILE" +} + +# Sandbox sanity checks (#706) — fail fast if isolation is broken +if [ -e /var/run/docker.sock ]; then + log "FATAL: /var/run/docker.sock is accessible — sandbox violation" + exit 1 +fi +if [ "$(id -u)" = "0" ]; then + log "FATAL: running as root (uid 0) — sandbox violation" + exit 1 +fi + +# Verify Claude CLI is available (expected via volume mount from host). +if ! command -v claude &>/dev/null; then + log "FATAL: claude CLI not found in PATH" + log "Mount the host binary into the container, e.g.:" + log " volumes:" + log " - /usr/local/bin/claude:/usr/local/bin/claude:ro" + exit 1 +fi +log "Claude CLI: $(claude --version 2>&1 || true)" + +# Start the Python server (exec-replace so signals propagate correctly) +log "Starting disinto-chat server on port 8080..." +exec python3 /usr/local/bin/server.py diff --git a/docker/chat/server.py b/docker/chat/server.py index 48944d1..6748354 100644 --- a/docker/chat/server.py +++ b/docker/chat/server.py @@ -20,15 +20,9 @@ OAuth flow: 6. Redirects to /chat/ The claude binary is expected to be mounted from the host at /usr/local/bin/claude. - -Workspace access: - - CHAT_WORKSPACE_DIR environment variable: bind-mounted project working tree - - Claude invocation uses --permission-mode acceptEdits for code modification - - CWD is set to workspace directory when configured, enabling Claude to - inspect, explain, or modify code scoped to that tree only """ -import asyncio +import datetime import json import os import re @@ -36,33 +30,21 @@ import secrets import subprocess import sys import time -import threading from http.server import HTTPServer, BaseHTTPRequestHandler -from socketserver import ThreadingMixIn from urllib.parse import urlparse, parse_qs, urlencode -import socket -import struct -import base64 -import hashlib # Configuration -HOST = os.environ.get("CHAT_HOST", "127.0.0.1") +HOST = os.environ.get("CHAT_HOST", "0.0.0.0") PORT = int(os.environ.get("CHAT_PORT", 8080)) UI_DIR = "/var/chat/ui" STATIC_DIR = os.path.join(UI_DIR, "static") CLAUDE_BIN = "/usr/local/bin/claude" -# Workspace directory: bind-mounted project working tree for Claude access -# Defaults to empty; when set, Claude can read/write to this directory -WORKSPACE_DIR = os.environ.get("CHAT_WORKSPACE_DIR", "") - # OAuth configuration FORGE_URL = os.environ.get("FORGE_URL", "http://localhost:3000") CHAT_OAUTH_CLIENT_ID = os.environ.get("CHAT_OAUTH_CLIENT_ID", "") CHAT_OAUTH_CLIENT_SECRET = os.environ.get("CHAT_OAUTH_CLIENT_SECRET", "") EDGE_TUNNEL_FQDN = os.environ.get("EDGE_TUNNEL_FQDN", "") -EDGE_TUNNEL_FQDN_CHAT = os.environ.get("EDGE_TUNNEL_FQDN_CHAT", "") -EDGE_ROUTING_MODE = os.environ.get("EDGE_ROUTING_MODE", "subpath") # Shared secret for Caddy forward_auth verify endpoint (#709). # When set, only requests carrying this value in X-Forward-Auth-Secret are @@ -70,6 +52,10 @@ EDGE_ROUTING_MODE = os.environ.get("EDGE_ROUTING_MODE", "subpath") # (acceptable during local dev; production MUST set this). FORWARD_AUTH_SECRET = os.environ.get("FORWARD_AUTH_SECRET", "") +# Rate limiting / cost caps (#711) +CHAT_MAX_REQUESTS_PER_HOUR = int(os.environ.get("CHAT_MAX_REQUESTS_PER_HOUR", 60)) +CHAT_MAX_REQUESTS_PER_DAY = int(os.environ.get("CHAT_MAX_REQUESTS_PER_DAY", 500)) +CHAT_MAX_TOKENS_PER_DAY = int(os.environ.get("CHAT_MAX_TOKENS_PER_DAY", 1000000)) # Allowed users - disinto-admin always allowed; CSV allowlist extends it _allowed_csv = os.environ.get("DISINTO_CHAT_ALLOWED_USERS", "") @@ -95,10 +81,11 @@ _sessions = {} # Pending OAuth state tokens: state -> expires (float) _oauth_states = {} - -# WebSocket message queues per user -# user -> asyncio.Queue (for streaming messages to connected clients) -_websocket_queues = {} +# Per-user rate limiting state (#711) +# user -> list of request timestamps (for sliding-window hourly/daily caps) +_request_log = {} +# user -> {"tokens": int, "date": "YYYY-MM-DD"} +_daily_tokens = {} # MIME types for static files MIME_TYPES = { @@ -112,22 +99,9 @@ MIME_TYPES = { ".ico": "image/x-icon", } -# WebSocket subprotocol for chat streaming -WEBSOCKET_SUBPROTOCOL = "chat-stream-v1" - -# WebSocket opcodes -OPCODE_CONTINUATION = 0x0 -OPCODE_TEXT = 0x1 -OPCODE_BINARY = 0x2 -OPCODE_CLOSE = 0x8 -OPCODE_PING = 0x9 -OPCODE_PONG = 0xA - def _build_callback_uri(): """Build the OAuth callback URI based on tunnel configuration.""" - if EDGE_ROUTING_MODE == "subdomain" and EDGE_TUNNEL_FQDN_CHAT: - return f"https://{EDGE_TUNNEL_FQDN_CHAT}/oauth/callback" if EDGE_TUNNEL_FQDN: return f"https://{EDGE_TUNNEL_FQDN}/chat/oauth/callback" return "http://localhost/chat/oauth/callback" @@ -213,9 +187,69 @@ def _fetch_user(access_token): return None +# ============================================================================= +# Rate Limiting Functions (#711) +# ============================================================================= + +def _check_rate_limit(user): + """Check per-user rate limits. Returns (allowed, retry_after, reason) (#711). + + Checks hourly request cap, daily request cap, and daily token cap. + """ + now = time.time() + one_hour_ago = now - 3600 + today = datetime.date.today().isoformat() + + # Prune old entries from request log + timestamps = _request_log.get(user, []) + timestamps = [t for t in timestamps if t > now - 86400] + _request_log[user] = timestamps + + # Hourly request cap + hourly = [t for t in timestamps if t > one_hour_ago] + if len(hourly) >= CHAT_MAX_REQUESTS_PER_HOUR: + oldest_in_window = min(hourly) + retry_after = int(oldest_in_window + 3600 - now) + 1 + return False, max(retry_after, 1), "hourly request limit" + + # Daily request cap + start_of_day = time.mktime(datetime.date.today().timetuple()) + daily = [t for t in timestamps if t >= start_of_day] + if len(daily) >= CHAT_MAX_REQUESTS_PER_DAY: + next_day = start_of_day + 86400 + retry_after = int(next_day - now) + 1 + return False, max(retry_after, 1), "daily request limit" + + # Daily token cap + token_info = _daily_tokens.get(user, {"tokens": 0, "date": today}) + if token_info["date"] != today: + token_info = {"tokens": 0, "date": today} + _daily_tokens[user] = token_info + if token_info["tokens"] >= CHAT_MAX_TOKENS_PER_DAY: + next_day = start_of_day + 86400 + retry_after = int(next_day - now) + 1 + return False, max(retry_after, 1), "daily token limit" + + return True, 0, "" + + +def _record_request(user): + """Record a request timestamp for the user (#711).""" + _request_log.setdefault(user, []).append(time.time()) + + +def _record_tokens(user, tokens): + """Record token usage for the user (#711).""" + today = datetime.date.today().isoformat() + token_info = _daily_tokens.get(user, {"tokens": 0, "date": today}) + if token_info["date"] != today: + token_info = {"tokens": 0, "date": today} + token_info["tokens"] += tokens + _daily_tokens[user] = token_info + def _parse_stream_json(output): - """Parse stream-json output from claude --print. + """Parse stream-json output from claude --print (#711). Returns (text_content, total_tokens). Falls back gracefully if the usage event is absent or malformed. @@ -261,313 +295,6 @@ def _parse_stream_json(output): return "".join(text_parts), total_tokens -# ============================================================================= -# WebSocket Handler Class -# ============================================================================= - -class _WebSocketHandler: - """Handle WebSocket connections for chat streaming.""" - - def __init__(self, reader, writer, user, message_queue): - self.reader = reader - self.writer = writer - self.user = user - self.message_queue = message_queue - self.closed = False - - async def accept_connection(self, sec_websocket_key, sec_websocket_protocol=None): - """Accept the WebSocket handshake. - - The HTTP request has already been parsed by BaseHTTPRequestHandler, - so we use the provided key and protocol instead of re-reading from socket. - """ - # Validate subprotocol - if sec_websocket_protocol and sec_websocket_protocol != WEBSOCKET_SUBPROTOCOL: - self._send_http_error( - 400, - "Bad Request", - f"Unsupported subprotocol. Expected: {WEBSOCKET_SUBPROTOCOL}", - ) - self._close_connection() - return False - - # Generate accept key - accept_key = self._generate_accept_key(sec_websocket_key) - - # Send handshake response - response = ( - "HTTP/1.1 101 Switching Protocols\r\n" - "Upgrade: websocket\r\n" - "Connection: Upgrade\r\n" - f"Sec-WebSocket-Accept: {accept_key}\r\n" - ) - - if sec_websocket_protocol: - response += f"Sec-WebSocket-Protocol: {sec_websocket_protocol}\r\n" - - response += "\r\n" - self.writer.write(response.encode("utf-8")) - await self.writer.drain() - return True - - def _generate_accept_key(self, sec_key): - """Generate the Sec-WebSocket-Accept key.""" - GUID = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11" - combined = sec_key + GUID - sha1 = hashlib.sha1(combined.encode("utf-8")) - return base64.b64encode(sha1.digest()).decode("utf-8") - - async def _read_line(self): - """Read a line from the socket.""" - data = await self.reader.read(1) - line = "" - while data: - if data == b"\r": - data = await self.reader.read(1) - continue - if data == b"\n": - return line - line += data.decode("utf-8", errors="replace") - data = await self.reader.read(1) - return line - - def _send_http_error(self, code, title, message): - """Send an HTTP error response.""" - response = ( - f"HTTP/1.1 {code} {title}\r\n" - "Content-Type: text/plain; charset=utf-8\r\n" - "Content-Length: " + str(len(message)) + "\r\n" - "\r\n" - + message - ) - try: - self.writer.write(response.encode("utf-8")) - self.writer.drain() - except Exception: - pass - - def _close_connection(self): - """Close the connection.""" - try: - self.writer.close() - except Exception: - pass - - async def send_text(self, data): - """Send a text frame.""" - if self.closed: - return - try: - frame = self._encode_frame(OPCODE_TEXT, data.encode("utf-8")) - self.writer.write(frame) - await self.writer.drain() - except Exception as e: - print(f"WebSocket send error: {e}", file=sys.stderr) - - async def send_binary(self, data): - """Send a binary frame.""" - if self.closed: - return - try: - if isinstance(data, str): - data = data.encode("utf-8") - frame = self._encode_frame(OPCODE_BINARY, data) - self.writer.write(frame) - await self.writer.drain() - except Exception as e: - print(f"WebSocket send error: {e}", file=sys.stderr) - - def _encode_frame(self, opcode, payload): - """Encode a WebSocket frame.""" - frame = bytearray() - frame.append(0x80 | opcode) # FIN + opcode - - length = len(payload) - if length < 126: - frame.append(length) - elif length < 65536: - frame.append(126) - frame.extend(struct.pack(">H", length)) - else: - frame.append(127) - frame.extend(struct.pack(">Q", length)) - - frame.extend(payload) - return bytes(frame) - - async def _decode_frame(self): - """Decode a WebSocket frame. Returns (opcode, payload).""" - try: - # Read first two bytes (use readexactly for guaranteed length) - header = await self.reader.readexactly(2) - - fin = (header[0] >> 7) & 1 - opcode = header[0] & 0x0F - masked = (header[1] >> 7) & 1 - length = header[1] & 0x7F - - # Extended payload length - if length == 126: - ext = await self.reader.readexactly(2) - length = struct.unpack(">H", ext)[0] - elif length == 127: - ext = await self.reader.readexactly(8) - length = struct.unpack(">Q", ext)[0] - - # Masking key - if masked: - mask_key = await self.reader.readexactly(4) - - # Payload - payload = await self.reader.readexactly(length) - - # Unmask if needed - if masked: - payload = bytes(b ^ mask_key[i % 4] for i, b in enumerate(payload)) - - return opcode, payload - except Exception as e: - print(f"WebSocket decode error: {e}", file=sys.stderr) - return None, None - - async def handle_connection(self): - """Handle the WebSocket connection loop.""" - try: - while not self.closed: - opcode, payload = await self._decode_frame() - if opcode is None: - break - - if opcode == OPCODE_CLOSE: - await self._send_close() - break - elif opcode == OPCODE_PING: - await self._send_pong(payload) - elif opcode == OPCODE_PONG: - pass # Ignore pong - elif opcode in (OPCODE_TEXT, OPCODE_BINARY): - # Handle text messages from client (e.g., chat_request) - try: - msg = payload.decode("utf-8") - data = json.loads(msg) - if data.get("type") == "chat_request": - # Invoke Claude with the message - await self._handle_chat_request(data.get("message", "")) - except (json.JSONDecodeError, UnicodeDecodeError): - pass - - # Check if we should stop waiting for messages - if self.closed: - break - - except Exception as e: - print(f"WebSocket connection error: {e}", file=sys.stderr) - finally: - self._close_connection() - # Clean up the message queue on disconnect - if self.user in _websocket_queues: - del _websocket_queues[self.user] - - async def _send_close(self): - """Send a close frame.""" - try: - # Close code 1000 = normal closure - frame = self._encode_frame(OPCODE_CLOSE, struct.pack(">H", 1000)) - self.writer.write(frame) - await self.writer.drain() - except Exception: - pass - - async def _send_pong(self, payload): - """Send a pong frame.""" - try: - frame = self._encode_frame(OPCODE_PONG, payload) - self.writer.write(frame) - await self.writer.drain() - except Exception: - pass - - async def _handle_chat_request(self, message): - """Handle a chat_request WebSocket frame by invoking Claude.""" - if not message: - return - - # Validate Claude binary exists - if not os.path.exists(CLAUDE_BIN): - await self.send_text(json.dumps({ - "type": "error", - "message": "Claude CLI not found", - })) - return - - try: - # Build claude command with permission mode (acceptEdits allows file edits) - claude_args = [CLAUDE_BIN, "--print", "--output-format", "stream-json", "--permission-mode", "acceptEdits", message] - - # Spawn claude --print with stream-json for streaming output - # Set cwd to workspace directory if configured, allowing Claude to access project code - cwd = WORKSPACE_DIR if WORKSPACE_DIR else None - proc = subprocess.Popen( - claude_args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - cwd=cwd, - bufsize=1, - ) - - # Stream output line by line - for line in iter(proc.stdout.readline, ""): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - etype = event.get("type", "") - - # Extract text content from content_block_delta events - if etype == "content_block_delta": - delta = event.get("delta", {}) - if delta.get("type") == "text_delta": - text = delta.get("text", "") - if text: - # Send tokens to client - await self.send_text(text) - - # Check for usage event to know when complete - if etype == "result": - pass # Will send complete after loop - - except json.JSONDecodeError: - pass - - # Wait for process to complete - proc.wait() - - if proc.returncode != 0: - await self.send_text(json.dumps({ - "type": "error", - "message": f"Claude CLI failed with exit code {proc.returncode}", - })) - return - - # Send complete signal - await self.send_text(json.dumps({ - "type": "complete", - })) - - except FileNotFoundError: - await self.send_text(json.dumps({ - "type": "error", - "message": "Claude CLI not found", - })) - except Exception as e: - await self.send_text(json.dumps({ - "type": "error", - "message": str(e), - })) - - # ============================================================================= # Conversation History Functions (#710) # ============================================================================= @@ -817,9 +544,9 @@ class ChatHandler(BaseHTTPRequestHandler): self.serve_static(path) return - # WebSocket upgrade endpoint - if path == "/chat/ws" or path == "/ws" or path.startswith("/ws"): - self.handle_websocket_upgrade() + # Reserved WebSocket endpoint (future use) + if path == "/ws" or path.startswith("/ws"): + self.send_error_page(501, "WebSocket upgrade not yet implemented") return # 404 for unknown paths @@ -1009,13 +736,33 @@ class ChatHandler(BaseHTTPRequestHandler): except IOError as e: self.send_error_page(500, f"Error reading file: {e}") - + def _send_rate_limit_response(self, retry_after, reason): + """Send a 429 response with Retry-After header and HTMX fragment (#711).""" + body = ( + f'
' + f"Rate limit exceeded: {reason}. " + f"Please try again in {retry_after} seconds." + f"
" + ) + self.send_response(429) + self.send_header("Retry-After", str(retry_after)) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body.encode("utf-8")))) + self.end_headers() + self.wfile.write(body.encode("utf-8")) + def handle_chat(self, user): """ Handle chat requests by spawning `claude --print` with the user message. - Streams tokens over WebSocket if connected. + Enforces per-user rate limits and tracks token usage (#711). """ + # Check rate limits before processing (#711) + allowed, retry_after, reason = _check_rate_limit(user) + if not allowed: + self._send_rate_limit_response(retry_after, reason) + return + # Read request body content_length = int(self.headers.get("Content-Length", 0)) if content_length == 0: @@ -1052,63 +799,23 @@ class ChatHandler(BaseHTTPRequestHandler): if not conv_id or not _validate_conversation_id(conv_id): conv_id = _generate_conversation_id() + # Record request for rate limiting (#711) + _record_request(user) + try: # Save user message to history _write_message(user, conv_id, "user", message) - # Build claude command with permission mode (acceptEdits allows file edits) - claude_args = [CLAUDE_BIN, "--print", "--output-format", "stream-json", "--permission-mode", "acceptEdits", message] - # Spawn claude --print with stream-json for token tracking (#711) - # Set cwd to workspace directory if configured, allowing Claude to access project code - cwd = WORKSPACE_DIR if WORKSPACE_DIR else None proc = subprocess.Popen( - claude_args, + [CLAUDE_BIN, "--print", "--output-format", "stream-json", message], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - cwd=cwd, - bufsize=1, # Line buffered ) - # Stream output line by line - response_parts = [] - total_tokens = 0 - for line in iter(proc.stdout.readline, ""): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - etype = event.get("type", "") + raw_output = proc.stdout.read() - # Extract text content from content_block_delta events - if etype == "content_block_delta": - delta = event.get("delta", {}) - if delta.get("type") == "text_delta": - text = delta.get("text", "") - if text: - response_parts.append(text) - # Stream to WebSocket if connected - if user in _websocket_queues: - try: - _websocket_queues[user].put_nowait(text) - except Exception: - pass # Client disconnected - - # Parse usage from result event - if etype == "result": - usage = event.get("usage", {}) - total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) - elif "usage" in event: - usage = event["usage"] - if isinstance(usage, dict): - total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) - - except json.JSONDecodeError: - pass - - # Wait for process to complete error_output = proc.stderr.read() if error_output: print(f"Claude stderr: {error_output}", file=sys.stderr) @@ -1119,12 +826,20 @@ class ChatHandler(BaseHTTPRequestHandler): self.send_error_page(500, f"Claude CLI failed with exit code {proc.returncode}") return - # Combine response parts - response = "".join(response_parts) + # Parse stream-json for text and token usage (#711) + response, total_tokens = _parse_stream_json(raw_output) + + # Track token usage - does not block *this* request (#711) + if total_tokens > 0: + _record_tokens(user, total_tokens) + print( + f"Token usage: user={user} tokens={total_tokens}", + file=sys.stderr, + ) # Fall back to raw output if stream-json parsing yielded no text if not response: - response = proc.stdout.getvalue() if hasattr(proc.stdout, 'getvalue') else "" + response = raw_output # Save assistant response to history _write_message(user, conv_id, "assistant", response) @@ -1194,106 +909,6 @@ class ChatHandler(BaseHTTPRequestHandler): self.end_headers() self.wfile.write(json.dumps({"conversation_id": conv_id}, ensure_ascii=False).encode("utf-8")) - @staticmethod - def push_to_websocket(user, message): - """Push a message to a WebSocket connection for a user. - - This is called from the chat handler to stream tokens to connected clients. - The message is added to the user's WebSocket message queue. - """ - # Get the message queue from the WebSocket handler's queue - # We store the queue in a global dict keyed by user - if user in _websocket_queues: - _websocket_queues[user].put_nowait(message) - - def handle_websocket_upgrade(self): - """Handle WebSocket upgrade request for chat streaming.""" - # Check session cookie - user = _validate_session(self.headers.get("Cookie")) - if not user: - self.send_error_page(401, "Unauthorized: no valid session") - return - - # Create message queue for this user - _websocket_queues[user] = asyncio.Queue() - - # Get WebSocket upgrade headers from the HTTP request - sec_websocket_key = self.headers.get("Sec-WebSocket-Key", "") - sec_websocket_protocol = self.headers.get("Sec-WebSocket-Protocol", "") - - # Validate Sec-WebSocket-Key - if not sec_websocket_key: - self.send_error_page(400, "Bad Request", "Missing Sec-WebSocket-Key") - return - - # Get the socket from the connection - sock = self.connection - sock.setblocking(False) - - # Create async server to handle the connection - async def handle_ws(): - try: - # Wrap the socket in asyncio streams using open_connection - reader, writer = await asyncio.open_connection(sock=sock) - - # Create WebSocket handler - ws_handler = _WebSocketHandler(reader, writer, user, _websocket_queues[user]) - - # Accept the connection (pass headers from HTTP request) - if not await ws_handler.accept_connection(sec_websocket_key, sec_websocket_protocol): - return - - # Start a task to read from the queue and send to client - async def send_stream(): - while not ws_handler.closed: - try: - data = await asyncio.wait_for(ws_handler.message_queue.get(), timeout=1.0) - await ws_handler.send_text(data) - except asyncio.TimeoutError: - # Send ping to keep connection alive - try: - frame = ws_handler._encode_frame(OPCODE_PING, b"") - writer.write(frame) - await writer.drain() - except Exception: - break - except Exception as e: - print(f"Send stream error: {e}", file=sys.stderr) - break - - # Start sending task - send_task = asyncio.create_task(send_stream()) - - # Handle incoming WebSocket frames - await ws_handler.handle_connection() - - # Cancel send task - send_task.cancel() - try: - await send_task - except asyncio.CancelledError: - pass - - except Exception as e: - print(f"WebSocket handler error: {e}", file=sys.stderr) - finally: - try: - writer.close() - await writer.wait_closed() - except Exception: - pass - - # Run the async handler in a thread - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - loop.run_until_complete(handle_ws()) - except Exception as e: - print(f"WebSocket error: {e}", file=sys.stderr) - finally: - loop.close() - sock.close() - def do_DELETE(self): """Handle DELETE requests.""" parsed = urlparse(self.path) @@ -1329,6 +944,12 @@ def main(): print("forward_auth secret configured (#709)", file=sys.stderr) else: print("WARNING: FORWARD_AUTH_SECRET not set - verify endpoint unrestricted", file=sys.stderr) + print( + f"Rate limits (#711): {CHAT_MAX_REQUESTS_PER_HOUR}/hr, " + f"{CHAT_MAX_REQUESTS_PER_DAY}/day, " + f"{CHAT_MAX_TOKENS_PER_DAY} tokens/day", + file=sys.stderr, + ) httpd.serve_forever() diff --git a/docker/chat/ui/index.html b/docker/chat/ui/index.html index b045873..bd920f9 100644 --- a/docker/chat/ui/index.html +++ b/docker/chat/ui/index.html @@ -430,10 +430,6 @@ return div.innerHTML.replace(/\n/g, '
'); } - // WebSocket connection for streaming - let ws = null; - let wsMessageId = null; - // Send message handler async function sendMessage() { const message = textarea.value.trim(); @@ -453,14 +449,6 @@ await createNewConversation(); } - // Try WebSocket streaming first, fall back to fetch - if (window.location.protocol === 'https:' || window.location.hostname === 'localhost') { - if (tryWebSocketSend(message)) { - return; - } - } - - // Fallback to fetch try { // Use fetch with URLSearchParams for application/x-www-form-urlencoded const params = new URLSearchParams(); @@ -497,111 +485,6 @@ } } - // Try to send message via WebSocket streaming - function tryWebSocketSend(message) { - try { - // Generate a unique message ID for this request - wsMessageId = Date.now().toString(36) + Math.random().toString(36).substr(2); - - // Connect to WebSocket - const wsUrl = window.location.protocol === 'https:' - ? `wss://${window.location.host}/chat/ws` - : `ws://${window.location.host}/chat/ws`; - - ws = new WebSocket(wsUrl); - - ws.onopen = function() { - // Send the message as JSON with message ID - const data = { - type: 'chat_request', - message_id: wsMessageId, - message: message, - conversation_id: currentConversationId - }; - ws.send(JSON.stringify(data)); - }; - - ws.onmessage = function(event) { - try { - const data = JSON.parse(event.data); - - if (data.type === 'token') { - // Stream a token to the UI - addTokenToLastMessage(data.token); - } else if (data.type === 'complete') { - // Streaming complete - closeWebSocket(); - textarea.disabled = false; - sendBtn.disabled = false; - sendBtn.textContent = 'Send'; - textarea.focus(); - messagesDiv.scrollTop = messagesDiv.scrollHeight; - loadConversations(); - } else if (data.type === 'error') { - addSystemMessage(`Error: ${data.message}`); - closeWebSocket(); - textarea.disabled = false; - sendBtn.disabled = false; - sendBtn.textContent = 'Send'; - textarea.focus(); - } - } catch (e) { - console.error('Failed to parse WebSocket message:', e); - } - }; - - ws.onerror = function(error) { - console.error('WebSocket error:', error); - addSystemMessage('WebSocket connection error. Falling back to regular chat.'); - closeWebSocket(); - sendMessage(); // Retry with fetch - }; - - ws.onclose = function() { - wsMessageId = null; - }; - - return true; // WebSocket attempt started - - } catch (error) { - console.error('Failed to create WebSocket:', error); - return false; // Fall back to fetch - } - } - - // Add a token to the last assistant message (for streaming) - function addTokenToLastMessage(token) { - const messages = messagesDiv.querySelectorAll('.message.assistant'); - if (messages.length === 0) { - // No assistant message yet, create one - const msgDiv = document.createElement('div'); - msgDiv.className = 'message assistant'; - msgDiv.innerHTML = ` -
assistant
-
- `; - messagesDiv.appendChild(msgDiv); - } - - const lastMsg = messagesDiv.querySelector('.message.assistant .content.streaming'); - if (lastMsg) { - lastMsg.textContent += token; - messagesDiv.scrollTop = messagesDiv.scrollHeight; - } - } - - // Close WebSocket connection - function closeWebSocket() { - if (ws) { - ws.onopen = null; - ws.onmessage = null; - ws.onerror = null; - ws.onclose = null; - ws.close(); - ws = null; - } - } - // Event listeners sendBtn.addEventListener('click', sendMessage); diff --git a/docker/edge/Dockerfile b/docker/edge/Dockerfile index 507c39b..eca7d7e 100644 --- a/docker/edge/Dockerfile +++ b/docker/edge/Dockerfile @@ -1,12 +1,6 @@ FROM caddy:latest -RUN apk add --no-cache bash jq curl git docker-cli python3 openssh-client autossh \ - nodejs npm -# Claude Code CLI — chat backend runtime (merged from docker/chat, #1083) -RUN npm install -g @anthropic-ai/claude-code@2.1.84 -COPY docker/edge/entrypoint-edge.sh /usr/local/bin/entrypoint-edge.sh -# Chat server and UI (merged from docker/chat into edge, #1083) -COPY docker/chat/server.py /usr/local/bin/chat-server.py -COPY docker/chat/ui/ /var/chat/ui/ +RUN apk add --no-cache bash jq curl git docker-cli python3 openssh-client autossh +COPY entrypoint-edge.sh /usr/local/bin/entrypoint-edge.sh VOLUME /data diff --git a/docker/edge/dispatcher.sh b/docker/edge/dispatcher.sh index 282342a..a48abf2 100755 --- a/docker/edge/dispatcher.sh +++ b/docker/edge/dispatcher.sh @@ -560,168 +560,10 @@ _launch_runner_docker() { # _launch_runner_nomad ACTION_ID SECRETS_CSV MOUNTS_CSV # -# Dispatches a vault-runner batch job via `nomad job dispatch`. -# Polls `nomad job status` until terminal state (completed/failed). -# Reads exit code from allocation and writes .result.json. -# -# Usage: _launch_runner_nomad -# Returns: exit code of the nomad job (0=success, non-zero=failure) +# Nomad backend stub — will be implemented in migration Step 5. _launch_runner_nomad() { - local action_id="$1" - local secrets_csv="$2" - local mounts_csv="$3" - - log "Dispatching vault-runner batch job via Nomad for action: ${action_id}" - - # Dispatch the parameterized batch job - # The vault-runner job expects meta: action_id, secrets_csv - # Note: mounts_csv is not passed as meta (not declared in vault-runner.hcl) - local dispatch_output - dispatch_output=$(nomad job dispatch \ - -detach \ - -meta action_id="$action_id" \ - -meta secrets_csv="$secrets_csv" \ - vault-runner 2>&1) || { - log "ERROR: Failed to dispatch vault-runner job for ${action_id}" - log "Dispatch output: ${dispatch_output}" - write_result "$action_id" 1 "Nomad dispatch failed: ${dispatch_output}" - return 1 - } - - # Extract dispatched job ID from output (format: "vault-runner/dispatch--") - local dispatched_job_id - dispatched_job_id=$(echo "$dispatch_output" | grep -oP '(?<=Dispatched Job ID = ).+' || true) - - if [ -z "$dispatched_job_id" ]; then - log "ERROR: Could not extract dispatched job ID from nomad output" - log "Dispatch output: ${dispatch_output}" - write_result "$action_id" 1 "Could not extract dispatched job ID from nomad output" - return 1 - fi - - log "Dispatched vault-runner with job ID: ${dispatched_job_id}" - - # Poll job status until terminal state - # Batch jobs transition: running -> completed/failed - local max_wait=300 # 5 minutes max wait - local elapsed=0 - local poll_interval=5 - local alloc_id="" - - log "Polling nomad job status for ${dispatched_job_id}..." - - while [ "$elapsed" -lt "$max_wait" ]; do - # Get job status with JSON output for the dispatched child job - local job_status_json - job_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { - log "ERROR: Failed to get job status for ${dispatched_job_id}" - write_result "$action_id" 1 "Failed to get job status for ${dispatched_job_id}" - return 1 - } - - # Check job status field (transitions to "dead" on completion) - local job_state - job_state=$(echo "$job_status_json" | jq -r '.Status // empty' 2>/dev/null) || job_state="" - - # Check allocation state directly - alloc_id=$(echo "$job_status_json" | jq -r '.Allocations[0]?.ID // empty' 2>/dev/null) || alloc_id="" - - if [ -n "$alloc_id" ]; then - local alloc_state - alloc_state=$(nomad alloc status -short "$alloc_id" 2>/dev/null || true) - - case "$alloc_state" in - *completed*|*success*|*dead*) - log "Allocation ${alloc_id} reached terminal state: ${alloc_state}" - break - ;; - *running*|*pending*|*starting*) - log "Allocation ${alloc_id} still running (state: ${alloc_state})..." - ;; - *failed*|*crashed*) - log "Allocation ${alloc_id} failed (state: ${alloc_state})" - break - ;; - esac - fi - - # Also check job-level state - case "$job_state" in - dead) - log "Job ${dispatched_job_id} reached terminal state: ${job_state}" - break - ;; - failed) - log "Job ${dispatched_job_id} failed" - break - ;; - esac - - sleep "$poll_interval" - elapsed=$((elapsed + poll_interval)) - done - - if [ "$elapsed" -ge "$max_wait" ]; then - log "ERROR: Timeout waiting for vault-runner job to complete" - write_result "$action_id" 1 "Timeout waiting for nomad job to complete" - return 1 - fi - - # Get final job status and exit code - local final_status_json - final_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { - log "ERROR: Failed to get final job status" - write_result "$action_id" 1 "Failed to get final job status" - return 1 - } - - # Get allocation exit code - local exit_code=0 - local logs="" - - if [ -n "$alloc_id" ]; then - # Get allocation logs - logs=$(nomad alloc logs -short "$alloc_id" 2>/dev/null || true) - - # Try to get exit code from alloc status JSON - # Nomad alloc status -json has .TaskStates[""].Events[].ExitCode - local alloc_exit_code - alloc_exit_code=$(nomad alloc status -json "$alloc_id" 2>/dev/null | jq -r '.TaskStates["runner"].Events[-1].ExitCode // empty' 2>/dev/null) || alloc_exit_code="" - - if [ -n "$alloc_exit_code" ] && [ "$alloc_exit_code" != "null" ]; then - exit_code="$alloc_exit_code" - fi - fi - - # If we couldn't get exit code from alloc, check job state as fallback - # Note: "dead" = terminal state for batch jobs (includes successful completion) - # Only "failed" indicates actual failure - if [ "$exit_code" -eq 0 ]; then - local final_state - final_state=$(echo "$final_status_json" | jq -r '.Status // empty' 2>/dev/null) || final_state="" - - case "$final_state" in - failed) - exit_code=1 - ;; - esac - fi - - # Truncate logs if too long - if [ ${#logs} -gt 1000 ]; then - logs="${logs: -1000}" - fi - - # Write result file - write_result "$action_id" "$exit_code" "$logs" - - if [ "$exit_code" -eq 0 ]; then - log "Vault-runner job completed successfully for action: ${action_id}" - else - log "Vault-runner job failed for action: ${action_id} (exit code: ${exit_code})" - fi - - return "$exit_code" + echo "nomad backend not yet implemented" >&2 + return 1 } # Launch runner for the given action (backend-agnostic orchestrator) @@ -1209,8 +1051,11 @@ main() { # Validate backend selection at startup case "$DISPATCHER_BACKEND" in - docker|nomad) - log "Using ${DISPATCHER_BACKEND} backend for vault-runner dispatch" + docker) ;; + nomad) + log "ERROR: nomad backend not yet implemented" + echo "nomad backend not yet implemented" >&2 + exit 1 ;; *) log "ERROR: unknown DISPATCHER_BACKEND=${DISPATCHER_BACKEND}" diff --git a/docker/edge/entrypoint-edge.sh b/docker/edge/entrypoint-edge.sh index a1511ff..1b5f94f 100755 --- a/docker/edge/entrypoint-edge.sh +++ b/docker/edge/entrypoint-edge.sh @@ -173,15 +173,11 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}" sleep 1200 # 20 minutes done) & -# ── Load optional secrets from secrets/*.enc (#777) ──────────────────── -# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to -# SCP access logs from a remote edge host. When age key or secrets dir is -# missing, or any secret fails to decrypt, log a warning and skip the cron. -# Caddy itself does not depend on these secrets. +# ── Load required secrets from secrets/*.enc (#777) ──────────────────── +# Edge container declares its required secrets; missing ones cause a hard fail. _AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt" _SECRETS_DIR="/opt/disinto/secrets" EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG" -EDGE_ENGAGEMENT_READY=0 # Assume not ready until proven otherwise _edge_decrypt_secret() { local enc_path="${_SECRETS_DIR}/${1}.enc" @@ -196,63 +192,47 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then export "$_secret_name=$_val" done if [ -n "$_missing" ]; then - echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 - else - echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2 - EDGE_ENGAGEMENT_READY=1 + echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2 + echo " Run 'disinto secrets add ' for each missing secret." >&2 + echo " If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2 + exit 1 fi + echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2 else - echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 + echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2 + echo " Ensure age is installed and secrets/*.enc files are present." >&2 + exit 1 fi # Start daily engagement collection cron loop in background (#745) # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777). -# Guarded: only start if EDGE_ENGAGEMENT_READY=1. -if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then - (while true; do - # Calculate seconds until next 23:50 UTC - _now=$(date -u +%s) - _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) - if [ "$_target" -le "$_now" ]; then - _target=$(( _target + 86400 )) - fi - _sleep_secs=$(( _target - _now )) - echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 - sleep "$_sleep_secs" - _fetch_log="/tmp/caddy-access-log-fetch.log" - _ssh_key_file=$(mktemp) - printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" - chmod 0600 "$_ssh_key_file" - scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ - "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ - "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true - rm -f "$_ssh_key_file" - if [ -s "$_fetch_log" ]; then - CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ - | tee -a /opt/disinto-logs/collect-engagement.log || true - else - echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 - fi - rm -f "$_fetch_log" - done) & -else - echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2 -fi - -# Start chat server in background (#1083 — merged from docker/chat into edge) -(python3 /usr/local/bin/chat-server.py 2>&1 | tee -a /opt/disinto-logs/chat.log) & - -# Nomad template renders Caddyfile to /local/Caddyfile via service discovery; -# copy it into the expected location if present (compose uses the mounted path). -if [ -f /local/Caddyfile ]; then - cp /local/Caddyfile /etc/caddy/Caddyfile - echo "edge: using Nomad-rendered Caddyfile from /local/Caddyfile" >&2 -fi +(while true; do + # Calculate seconds until next 23:50 UTC + _now=$(date -u +%s) + _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) + if [ "$_target" -le "$_now" ]; then + _target=$(( _target + 86400 )) + fi + _sleep_secs=$(( _target - _now )) + echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 + sleep "$_sleep_secs" + _fetch_log="/tmp/caddy-access-log-fetch.log" + _ssh_key_file=$(mktemp) + printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" + chmod 0600 "$_ssh_key_file" + scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ + "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true + rm -f "$_ssh_key_file" + if [ -s "$_fetch_log" ]; then + CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ + | tee -a /opt/disinto-logs/collect-engagement.log || true + else + echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 + fi + rm -f "$_fetch_log" +done) & # Caddy as main process — run in foreground via wait so background jobs survive # (exec replaces the shell, which can orphan backgrounded subshells) diff --git a/docs/nomad-cutover-runbook.md b/docs/nomad-cutover-runbook.md deleted file mode 100644 index e0956cc..0000000 --- a/docs/nomad-cutover-runbook.md +++ /dev/null @@ -1,183 +0,0 @@ -# Nomad Cutover Runbook - -End-to-end procedure to cut over the disinto factory from docker-compose on -disinto-dev-box to Nomad on disinto-nomad-box. - -**Target**: disinto-nomad-box (10.10.10.216) becomes production; disinto-dev-box -stays warm for rollback. - -**Downtime budget**: <5 min blue-green flip. - -**Data scope**: Forgejo issues + disinto-ops git bundle only. Everything else is -regenerated or discarded. OAuth secrets are regenerated on fresh init (all -sessions invalidated). - ---- - -## 1. Pre-cutover readiness checklist - -- [ ] Nomad + Vault stack healthy on a fresh wipe+init (step 5 verified) -- [ ] Codeberg mirror current — `git log` parity between dev-box Forgejo and - Codeberg -- [ ] SSH key pair generated for nomad-box, registered on DO edge (see §4.6) -- [ ] Companion tools landed: - - `disinto backup create` (#1057) - - `disinto backup import` (#1058) -- [ ] Backup tarball produced and tested against a scratch LXC (see §3) - ---- - -## 2. Pre-cutover artifact: backup - -On disinto-dev-box: - -```bash -./bin/disinto backup create /tmp/disinto-backup-$(date +%Y%m%d).tar.gz -``` - -Copy the tarball to nomad-box (and optionally to a local workstation for -safekeeping): - -```bash -scp /tmp/disinto-backup-*.tar.gz nomad-box:/tmp/ -``` - ---- - -## 3. Pre-cutover dry-run - -On a throwaway LXC: - -```bash -lxc launch ubuntu:24.04 cutover-dryrun -# inside the container: -disinto init --backend=nomad --import-env .env --with edge -./bin/disinto backup import /tmp/disinto-backup-*.tar.gz -``` - -Verify: - -- Issue count matches source Forgejo -- disinto-ops repo refs match source bundle - -Destroy the LXC once satisfied: - -```bash -lxc delete cutover-dryrun --force -``` - ---- - -## 4. Cutover T-0 (operator executes; <5 min target) - -### 4.1 Stop dev-box services - -```bash -# On disinto-dev-box — stop, do NOT remove volumes (rollback needs them) -docker-compose stop -``` - -### 4.2 Provision nomad-box (if not already done) - -```bash -# On disinto-nomad-box -disinto init --backend=nomad --import-env .env --with edge -``` - -### 4.3 Import backup - -```bash -# On disinto-nomad-box -./bin/disinto backup import /tmp/disinto-backup-*.tar.gz -``` - -### 4.4 Configure Codeberg pull mirror - -Manual, one-time step in the new Forgejo UI: - -1. Create a mirror repository pointing at the Codeberg upstream -2. Confirm initial sync completes - -### 4.5 Claude login - -```bash -# On disinto-nomad-box -claude login -``` - -Set up Anthropic OAuth so agents can authenticate. - -### 4.6 Autossh tunnel swap - -> **Operator step** — cross-host, no dev-agent involvement. Do NOT automate. - -1. Stop the tunnel on dev-box: - ```bash - # On disinto-dev-box - systemctl stop reverse-tunnel - ``` - -2. Copy or regenerate the tunnel unit on nomad-box: - ```bash - # Copy from dev-box, or let init regenerate it - scp dev-box:/etc/systemd/system/reverse-tunnel.service \ - nomad-box:/etc/systemd/system/ - ``` - -3. Register nomad-box's public key on DO edge: - ```bash - # On DO edge box — same restricted-command as the dev-box key - echo "" >> /home/johba/.ssh/authorized_keys - ``` - -4. Start the tunnel on nomad-box: - ```bash - # On disinto-nomad-box - systemctl enable --now reverse-tunnel - ``` - -5. Verify end-to-end: - ```bash - curl https://self.disinto.ai/api/v1/version - # Should return the new box's Forgejo version - ``` - ---- - -## 5. Post-cutover smoke - -- [ ] `curl https://self.disinto.ai` → Forgejo welcome page -- [ ] Create a test PR → Woodpecker pipeline runs → agents assign and work -- [ ] Claude chat login via Forgejo OAuth succeeds - ---- - -## 6. Rollback (if any step 4 gate fails) - -1. Stop the tunnel on nomad-box: - ```bash - systemctl stop reverse-tunnel # on nomad-box - ``` - -2. Restore the tunnel on dev-box: - ```bash - systemctl start reverse-tunnel # on dev-box - ``` - -3. Bring dev-box services back up: - ```bash - docker-compose up -d # on dev-box - ``` - -4. DO Caddy config is unchanged — traffic restores in <5 min. - -5. File a post-mortem issue. Keep nomad-box state intact for debugging. - ---- - -## 7. Post-stable cleanup (T+1 week) - -- `docker-compose down -v` on dev-box -- Archive `/var/lib/docker/volumes/disinto_*` to cold storage -- Delete disinto-dev-box LXC or keep as permanent rollback reserve (operator - decision) diff --git a/formulas/run-supervisor.toml b/formulas/run-supervisor.toml index 4101252..f31e6bc 100644 --- a/formulas/run-supervisor.toml +++ b/formulas/run-supervisor.toml @@ -29,7 +29,7 @@ and injected into your prompt above. Review them now. 1. Read the injected metrics data carefully (System Resources, Docker, Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs, - CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**). + CI Pipelines, Open PRs, Issue Status, Stale Worktrees). Note: preflight.sh auto-removes PHASE:escalate files for closed issues (24h grace period). Check the "Stale Phase Cleanup" section for any files cleaned or in grace period this run. @@ -75,10 +75,6 @@ Categorize every finding from the metrics into priority levels. - Dev/action sessions in PHASE:escalate for > 24h (session timeout) (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight; this check covers sessions where the issue is still open) -- **Woodpecker agent unhealthy** — see "Woodpecker Agent Health" section in preflight: - - Container not running or in unhealthy state - - gRPC errors >= 3 in last 20 minutes - - Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes ### P3 — Factory degraded - PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed @@ -104,15 +100,6 @@ For each finding from the health assessment, decide and execute an action. ### Auto-fixable (execute these directly) -**P2 Woodpecker agent unhealthy:** -The supervisor-run.sh script automatically handles WP agent recovery: -- Detects unhealthy state via preflight.sh health checks -- Restarts container via `docker restart` -- Scans for `blocked: ci_exhausted` issues updated in last 30 minutes -- Unassigns and removes blocked label from affected issues -- Posts recovery comment with infra-flake context -- Avoids duplicate restarts via 5-minute cooldown in history file - **P0 Memory crisis:** # Kill stale one-shot claude processes (>3h old) pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true @@ -261,11 +248,6 @@ Format: - (or "No actions needed") - ### WP Agent Recovery (if applicable) - - WP agent restart: