diff --git a/.env.example b/.env.example index a1f24d5..c1c0b98 100644 --- a/.env.example +++ b/.env.example @@ -32,10 +32,13 @@ FORGE_URL=http://localhost:3000 # [CONFIG] local Forgejo instance # - FORGE_PASS_DEV_QWEN2 # Name conversion: tr 'a-z-' 'A-Z_' (lowercase→UPPER, hyphens→underscores). # The compose generator looks these up via the agent's `forge_user` field in -# the project TOML. Configure local-model agents via [agents.X] sections in -# projects/*.toml — this is the canonical activation path. +# the project TOML. The pre-existing `dev-qwen` llama agent uses +# FORGE_TOKEN_LLAMA / FORGE_PASS_LLAMA (kept for backwards-compat with the +# legacy `ENABLE_LLAMA_AGENT=1` single-agent path). FORGE_TOKEN= # [SECRET] dev-bot API token (default for all agents) FORGE_PASS= # [SECRET] dev-bot password for git HTTP push (#361) +FORGE_TOKEN_LLAMA= # [SECRET] dev-qwen API token (for agents-llama) +FORGE_PASS_LLAMA= # [SECRET] dev-qwen password for git HTTP push FORGE_REVIEW_TOKEN= # [SECRET] review-bot API token FORGE_REVIEW_PASS= # [SECRET] review-bot password for git HTTP push FORGE_PLANNER_TOKEN= # [SECRET] planner-bot API token @@ -104,6 +107,13 @@ FORWARD_AUTH_SECRET= # [SECRET] Shared secret for Caddy ↔ # Store all project secrets here so formulas reference env vars, never hardcode. BASE_RPC_URL= # [SECRET] on-chain RPC endpoint +# ── Local Qwen dev agent (optional) ────────────────────────────────────── +# Set ENABLE_LLAMA_AGENT=1 to emit agents-llama in docker-compose.yml. +# Requires a running llama-server reachable at ANTHROPIC_BASE_URL. +# See docs/agents-llama.md for details. +ENABLE_LLAMA_AGENT=0 # [CONFIG] 1 = enable agents-llama service +ANTHROPIC_BASE_URL= # [CONFIG] e.g. 
http://host.docker.internal:8081 + # ── Tuning ──────────────────────────────────────────────────────────────── CLAUDE_TIMEOUT=7200 # [CONFIG] max seconds per Claude invocation diff --git a/.gitignore b/.gitignore index a29450c..21c6fbc 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ metrics/supervisor-metrics.jsonl # OS .DS_Store dev/ci-fixes-*.json +gardener/dust.jsonl # Individual encrypted secrets (managed by disinto secrets add) secrets/ diff --git a/.woodpecker/detect-duplicates.py b/.woodpecker/detect-duplicates.py index 473bb18..0485833 100644 --- a/.woodpecker/detect-duplicates.py +++ b/.woodpecker/detect-duplicates.py @@ -294,45 +294,6 @@ def main() -> int: "9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern", # Standard lib source block shared across formula-driven agent run scripts "330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)", - # Test data for duplicate service detection tests (#850) - # Intentionally duplicated TOML blocks in smoke-init.sh and test-duplicate-service-detection.sh - "334967b8b4f1a8d3b0b9b8e0912f3bfb": "Test TOML: [agents.llama] block header (smoke-init.sh + test-duplicate-service-detection.sh)", - "d82f30077e5bb23b5fc01db003033d5d": "Test TOML: [agents.llama] block body (smoke-init.sh + test-duplicate-service-detection.sh)", - # Common vault-seed script patterns: logging helpers + flag parsing - # Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh - "843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)", - "ee51df9642f2ef37af73b0c15f4d8406": "Logging helpers + DRY_RUN loop start (vault-seed-woodpecker + wp-oauth-register)", - "9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)", - "9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + 
wp-oauth-register)", - "5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)", - # forgejo-bootstrap.sh follows wp-oauth-register.sh pattern (issue #1069) - "2b80185e4ae2b54e2e01f33e5555c688": "Standard header (set -euo pipefail, SCRIPT_DIR, REPO_ROOT) (forgejo-bootstrap + wp-oauth-register)", - "38a1f20a60d69f0d6bfb06a0532b3bd7": "Logging helpers + DRY_RUN init (forgejo-bootstrap + wp-oauth-register)", - "4dd3c526fa29bdaa88b274c3d7d01032": "Flag parsing loop + case start (forgejo-bootstrap + wp-oauth-register)", - # Common vault-seed script preamble + precondition patterns - # Shared across tools/vault-seed-{forgejo,agents,woodpecker}.sh - "dff3675c151fcdbd2fef798826ae919b": "Vault-seed preamble: set -euo + path setup + source hvault.sh + KV_MOUNT", - "1cd9f0d083e24e6e6b2071db9b6dae09": "Vault-seed preconditions: binary check loop + VAULT_ADDR guard", - "63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die", - "34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup", - "71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die", - # Common vault-seed script flag parsing patterns - # Shared across tools/vault-seed-{forgejo,ops-repo}.sh - "6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)", - "a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)", - "e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)", - "c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)", - "106f4071e88f841b3208b01144cd1c39": "vault-seed case pattern dry-run end (forgejo + ops-repo)", - "c15506dcb6bb340b25d1c39d442dd2e6": "vault-seed help text + invalid arg handler (forgejo + ops-repo)", - "1feecd3b3caf00045fae938ddf2811de": "vault-seed invalid arg 
handler (forgejo + ops-repo)", - "919780d5e7182715344f5aa02b191294": "vault-seed invalid arg + esac pattern (forgejo + ops-repo)", - "8dce1d292bce8e60ef4c0665b62945b0": "vault-seed esac + binary check loop (forgejo + ops-repo)", - "ca043687143a5b47bd54e65a99ce8ee8": "vault-seed binary check loop start (forgejo + ops-repo)", - "aefd9f655411a955395e6e5995ddbe6f": "vault-seed binary check pattern (forgejo + ops-repo)", - "60f0c46deb5491599457efb4048918e5": "vault-seed VAULT_ADDR + hvault_token_lookup check (forgejo + ops-repo)", - "f6838f581ef6b4d82b55268389032769": "vault-seed VAULT_ADDR + hvault_token_lookup die (forgejo + ops-repo)", - # Common shell control-flow: if → return 1 → fi → fi (env.sh + register.sh) - "a8bdb7f1a5d8cbd0a5921b17b6cf6f4d": "Common shell control-flow (return 1 / fi / fi / return 0 / }) (env.sh + register.sh)", } if not sh_files: diff --git a/.woodpecker/edge-subpath.yml b/.woodpecker/edge-subpath.yml deleted file mode 100644 index 2c11980..0000000 --- a/.woodpecker/edge-subpath.yml +++ /dev/null @@ -1,317 +0,0 @@ -# ============================================================================= -# .woodpecker/edge-subpath.yml — Edge subpath routing static checks -# -# Static validation for edge subpath routing configuration. This pipeline does -# NOT run live service curls — it validates the configuration that would be -# used by a deployed edge proxy. -# -# Checks: -# 1. shellcheck — syntax check on tests/smoke-edge-subpath.sh -# 2. caddy validate — validate the Caddyfile template syntax -# 3. caddyfile-routing-test — verify Caddyfile routing block shape -# 4. 
test-caddyfile-routing — run standalone unit test for Caddyfile structure -# -# Triggers: -# - Pull requests that modify edge-related files -# -# Environment variables (inherited from WOODPECKER_ENVIRONMENT): -# EDGE_BASE_URL — Edge proxy URL for reference (default: http://localhost) -# EDGE_TIMEOUT — Request timeout in seconds (default: 30) -# EDGE_MAX_RETRIES — Max retries per request (default: 3) -# ============================================================================= - -when: - event: pull_request - -steps: - # ── 1. ShellCheck on smoke script ──────────────────────────────────────── - # `shellcheck` validates bash syntax, style, and common pitfalls. - # Exit codes: - # 0 — all checks passed - # 1 — one or more issues found - - name: shellcheck-smoke - image: koalaman/shellcheck-alpine:stable - commands: - - shellcheck --severity=warning tests/smoke-edge-subpath.sh tests/test-caddyfile-routing.sh - - # ── 2. Caddyfile template rendering ─────────────────────────────────────── - # Render a mock Caddyfile for validation. The template uses Nomad's - # templating syntax ({{ range ... }}) which must be processed before Caddy - # can validate it. We render a mock version with Nomad templates expanded - # to static values for validation purposes. - - name: render-caddyfile - image: alpine:3.19 - commands: - - apk add --no-cache coreutils - - | - set -e - mkdir -p edge-render - # Render mock Caddyfile with Nomad templates expanded - { - echo '# Caddyfile — edge proxy configuration (Nomad-rendered)' - echo '# Staging upstream discovered via Nomad service registration.' 
- echo '' - echo ':80 {' - echo ' # Redirect root to Forgejo' - echo ' handle / {' - echo ' redir /forge/ 302' - echo ' }' - echo '' - echo ' # Reverse proxy to Forgejo' - echo ' handle /forge/* {' - echo ' reverse_proxy 127.0.0.1:3000' - echo ' }' - echo '' - echo ' # Reverse proxy to Woodpecker CI' - echo ' handle /ci/* {' - echo ' reverse_proxy 127.0.0.1:8000' - echo ' }' - echo '' - echo ' # Reverse proxy to staging — dynamic port via Nomad service discovery' - echo ' handle /staging/* {' - echo ' reverse_proxy 127.0.0.1:8081' - echo ' }' - echo '' - echo ' # Chat service — reverse proxy to disinto-chat backend (#705)' - echo ' # OAuth routes bypass forward_auth — unauthenticated users need these (#709)' - echo ' handle /chat/login {' - echo ' reverse_proxy 127.0.0.1:8080' - echo ' }' - echo ' handle /chat/oauth/callback {' - echo ' reverse_proxy 127.0.0.1:8080' - echo ' }' - echo ' # Defense-in-depth: forward_auth stamps X-Forwarded-User from session (#709)' - echo ' handle /chat/* {' - echo ' forward_auth 127.0.0.1:8080 {' - echo ' uri /chat/auth/verify' - echo ' copy_headers X-Forwarded-User' - echo ' header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}' - echo ' }' - echo ' reverse_proxy 127.0.0.1:8080' - echo ' }' - echo '}' - } > edge-render/Caddyfile - cp edge-render/Caddyfile edge-render/Caddyfile.rendered - echo "Caddyfile rendered successfully" - - # ── 3. Caddy config validation ─────────────────────────────────────────── - # `caddy validate` checks Caddyfile syntax and configuration. - # This validates the rendered Caddyfile against Caddy's parser. 
- # Exit codes: - # 0 — configuration is valid - # 1 — configuration has errors - - name: caddy-validate - image: alpine:3.19 - commands: - - apk add --no-cache ca-certificates curl - - curl -sS -o /tmp/caddy "https://caddyserver.com/api/download?os=linux&arch=amd64" - - chmod +x /tmp/caddy - - /tmp/caddy version - - /tmp/caddy validate --config edge-render/Caddyfile.rendered --adapter caddyfile - - # ── 4. Caddyfile routing block shape test ───────────────────────────────── - # Verify that the Caddyfile contains all required routing blocks: - # - /forge/ — Forgejo subpath - # - /ci/ — Woodpecker subpath - # - /staging/ — Staging subpath - # - /chat/ — Chat subpath with forward_auth - # - # This is a unit test that validates the expected structure without - # requiring a running Caddy instance. - - name: caddyfile-routing-test - image: alpine:3.19 - commands: - - apk add --no-cache grep coreutils - - | - set -e - - CADDYFILE="edge-render/Caddyfile.rendered" - - echo "=== Validating Caddyfile routing blocks ===" - - # Check that all required subpath handlers exist - # POSIX-safe loop (alpine /bin/sh has no arrays) - FAILED=0 - for handler in "handle /forge/\*" "handle /ci/\*" "handle /staging/\*" "handle /chat/login" "handle /chat/oauth/callback" "handle /chat/\*"; do - if grep -q "$handler" "$CADDYFILE"; then - echo "[PASS] Found handler: $handler" - else - echo "[FAIL] Missing handler: $handler" - FAILED=1 - fi - done - - # Check forward_auth block exists for /chat/* - if grep -A5 "handle /chat/\*" "$CADDYFILE" | grep -q "forward_auth"; then - echo "[PASS] forward_auth block found for /chat/*" - else - echo "[FAIL] forward_auth block missing for /chat/*" - FAILED=1 - fi - - # Check reverse_proxy to Forgejo (port 3000) - if grep -q "reverse_proxy 127.0.0.1:3000" "$CADDYFILE"; then - echo "[PASS] Forgejo reverse_proxy configured (port 3000)" - else - echo "[FAIL] Forgejo reverse_proxy not configured" - FAILED=1 - fi - - # Check reverse_proxy to Woodpecker (port 
8000) - if grep -q "reverse_proxy 127.0.0.1:8000" "$CADDYFILE"; then - echo "[PASS] Woodpecker reverse_proxy configured (port 8000)" - else - echo "[FAIL] Woodpecker reverse_proxy not configured" - FAILED=1 - fi - - # Check reverse_proxy to Chat (port 8080) - if grep -q "reverse_proxy 127.0.0.1:8080" "$CADDYFILE"; then - echo "[PASS] Chat reverse_proxy configured (port 8080)" - else - echo "[FAIL] Chat reverse_proxy not configured" - FAILED=1 - fi - - # Check root redirect to /forge/ - if grep -q "redir /forge/ 302" "$CADDYFILE"; then - echo "[PASS] Root redirect to /forge/ configured" - else - echo "[FAIL] Root redirect to /forge/ not configured" - FAILED=1 - fi - - echo "" - if [ $FAILED -eq 0 ]; then - echo "=== All routing blocks validated ===" - exit 0 - else - echo "=== Routing block validation failed ===" >&2 - exit 1 - fi - - # ── 5. Standalone Caddyfile routing test ───────────────────────────────── - # Run the standalone unit test for Caddyfile routing block validation. - # This test extracts the Caddyfile template from edge.hcl and validates - # its structure without requiring a running Caddy instance. 
- - name: test-caddyfile-routing - image: alpine:3.19 - commands: - - apk add --no-cache grep coreutils - - | - set -e - EDGE_TEMPLATE="nomad/jobs/edge.hcl" - - echo "=== Extracting Caddyfile template from $EDGE_TEMPLATE ===" - - # Extract the Caddyfile template (content between <&2 - exit 1 - fi - - echo "Caddyfile template extracted successfully" - echo "" - - FAILED=0 - - # Check Forgejo subpath - if echo "$CADDYFILE" | grep -q "handle /forge/\*"; then - echo "[PASS] Forgejo handle block" - else - echo "[FAIL] Forgejo handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:3000"; then - echo "[PASS] Forgejo reverse_proxy (port 3000)" - else - echo "[FAIL] Forgejo reverse_proxy (port 3000)" - FAILED=1 - fi - - # Check Woodpecker subpath - if echo "$CADDYFILE" | grep -q "handle /ci/\*"; then - echo "[PASS] Woodpecker handle block" - else - echo "[FAIL] Woodpecker handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:8000"; then - echo "[PASS] Woodpecker reverse_proxy (port 8000)" - else - echo "[FAIL] Woodpecker reverse_proxy (port 8000)" - FAILED=1 - fi - - # Check Staging subpath - if echo "$CADDYFILE" | grep -q "handle /staging/\*"; then - echo "[PASS] Staging handle block" - else - echo "[FAIL] Staging handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "nomadService"; then - echo "[PASS] Staging Nomad service discovery" - else - echo "[FAIL] Staging Nomad service discovery" - FAILED=1 - fi - - # Check Chat subpath - if echo "$CADDYFILE" | grep -q "handle /chat/login"; then - echo "[PASS] Chat login handle block" - else - echo "[FAIL] Chat login handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "handle /chat/oauth/callback"; then - echo "[PASS] Chat OAuth callback handle block" - else - echo "[FAIL] Chat OAuth callback handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "handle /chat/\*"; then - echo "[PASS] Chat catch-all handle block" - else - echo 
"[FAIL] Chat catch-all handle block" - FAILED=1 - fi - - if echo "$CADDYFILE" | grep -q "reverse_proxy 127.0.0.1:8080"; then - echo "[PASS] Chat reverse_proxy (port 8080)" - else - echo "[FAIL] Chat reverse_proxy (port 8080)" - FAILED=1 - fi - - # Check forward_auth for chat - if echo "$CADDYFILE" | grep -A10 "handle /chat/\*" | grep -q "forward_auth"; then - echo "[PASS] forward_auth block for /chat/*" - else - echo "[FAIL] forward_auth block for /chat/*" - FAILED=1 - fi - - # Check root redirect - if echo "$CADDYFILE" | grep -q "redir /forge/ 302"; then - echo "[PASS] Root redirect to /forge/" - else - echo "[FAIL] Root redirect to /forge/" - FAILED=1 - fi - - echo "" - if [ $FAILED -eq 0 ]; then - echo "=== All routing blocks validated ===" - exit 0 - else - echo "=== Routing block validation failed ===" >&2 - exit 1 - fi diff --git a/.woodpecker/nomad-validate.yml b/.woodpecker/nomad-validate.yml index 5a1cc7c..1ea6d2d 100644 --- a/.woodpecker/nomad-validate.yml +++ b/.woodpecker/nomad-validate.yml @@ -1,21 +1,25 @@ # ============================================================================= # .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts # -# Part of the Nomad+Vault migration (S0.5, issue #825; extended in S2.6, -# issue #884). Locks in the "no-ad-hoc-steps" principle: every HCL/shell -# artifact under nomad/, lib/init/nomad/, vault/policies/, plus the -# `disinto init` dispatcher and vault/roles.yaml, gets checked before it -# can land. +# Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the +# "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or +# lib/init/nomad/, plus the `disinto init` dispatcher, gets checked +# before it can land. 
+# +# Also includes Vault policy validation (S2.6, issue #884): +# - vault policy fmt -check on vault/policies/*.hcl +# - vault policy validate on each policy file +# - roles.yaml validator (yamllint + policy reference check) +# - secret-scan gate on policy files # # Triggers on PRs (and pushes) that touch any of: # nomad/** — HCL configs (server, client, vault) -# lib/init/nomad/** — cluster-up / install / systemd / vault-init / -# vault-nomad-auth (S2.6 trigger: vault-*.sh -# is a subset of this glob) +# lib/init/nomad/** — cluster-up / install / systemd / vault-init # bin/disinto — `disinto init --backend=nomad` dispatcher # tests/disinto-init-nomad.bats — the bats suite itself -# vault/policies/** — Vault ACL policy HCL files (S2.1, S2.6) -# vault/roles.yaml — JWT-auth role bindings (S2.3, S2.6) +# vault/policies/*.hcl — Vault ACL policies (S2.6) +# vault/roles.yaml — JWT auth role definitions (S2.6) +# lib/init/nomad/vault-*.sh — Vault init scripts (S2.6) # .woodpecker/nomad-validate.yml — the pipeline definition # # Steps (all fail-closed — any error blocks merge): @@ -24,22 +28,12 @@ # nomad/jobs/*.hcl (new jobspecs get # CI coverage automatically) # 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl -# 4. vault-policy-fmt — `vault policy fmt` idempotence check on -# every vault/policies/*.hcl (format drift = -# CI fail; non-destructive via cp+diff) -# 5. vault-policy-validate — HCL syntax + capability validation for every -# vault/policies/*.hcl via `vault policy write` -# against an inline dev-mode Vault server -# 6. vault-roles-validate — yamllint + role→policy reference check on -# vault/roles.yaml (every referenced policy -# must exist as vault/policies/<policy>.hcl) -# 7. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto -# 8. 
bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests -# -# Secret-scan coverage: vault/policies/*.hcl is already scanned by the -# P11 gate (.woodpecker/secret-scan.yml, issue #798) — its trigger path -# `vault/**/*` covers everything under this directory. We intentionally -# do NOT duplicate that gate here; one scanner, one source of truth. +# 4. vault-policy-fmt — `vault policy fmt -check` on vault/policies/*.hcl +# 5. vault-policy-validate — `vault policy validate` on each policy file +# 6. vault-roles-validate — yamllint + policy reference check +# 7. vault-secret-scan — scan policy files for embedded secrets +# 8. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto +# 9. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests # # Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 / # vault 1.18.5). Bump there AND here together — drift = CI passing on @@ -53,8 +47,9 @@ when: - "lib/init/nomad/**" - "bin/disinto" - "tests/disinto-init-nomad.bats" - - "vault/policies/**" + - "vault/policies/*.hcl" - "vault/roles.yaml" + - "lib/init/nomad/vault-*.sh" - ".woodpecker/nomad-validate.yml" # Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is @@ -144,21 +139,14 @@ steps: *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;; esac - # ── 4. Vault policy fmt idempotence check ──────────────────────────────── - # `vault policy fmt <file>` formats a local HCL policy file in place. - # There's no `-check`/dry-run flag (vault 1.18.5), so we implement a - # non-destructive check as cp → fmt-on-copy → diff against original. - # Any diff means the committed file would be rewritten by `vault policy - # fmt` — failure steers the author to run `vault policy fmt <file>` - # locally before pushing. + # ── 4. Vault policy fmt -check ─────────────────────────────────────────── + # `vault policy fmt -check` is non-destructive; it reads each policy file + # and compares against the formatted version. 
Any difference means the file + # needs formatting. This enforces consistent indentation (2-space), no + # trailing whitespace, and proper HCL formatting conventions. # - # Scope: vault/policies/*.hcl only. The `[ -f "$f" ]` guard handles the - # no-match case (POSIX sh does not nullglob) so an empty policies/ - # directory does not fail this step. - # - # Note: `vault policy fmt` is purely local (HCL text transform) and does - # not require a running Vault server, which is why this step can run - # without starting one. + # CI runs this BEFORE vault policy validate because unformatted HCL can + # sometimes cause confusing validation errors. - name: vault-policy-fmt image: hashicorp/vault:1.18.5 commands: @@ -167,153 +155,215 @@ steps: failed=0 for f in vault/policies/*.hcl; do [ -f "$f" ] || continue - tmp="/tmp/$(basename "$f").fmt" - cp "$f" "$tmp" - vault policy fmt "$tmp" >/dev/null 2>&1 - if ! diff -u "$f" "$tmp"; then - echo "ERROR: $f is not formatted — run 'vault policy fmt $f' locally" >&2 + echo "fmt-check: $f" + if ! vault policy fmt -check "$f" > /dev/null 2>&1; then + echo " ERROR: $f is not formatted correctly" >&2 + vault policy fmt -check "$f" >&2 || true failed=1 fi done if [ "$failed" -gt 0 ]; then - echo "vault-policy-fmt: formatting drift detected" >&2 + echo "vault-policy-fmt: formatting errors found" >&2 exit 1 fi echo "vault-policy-fmt: all policies formatted correctly" - # ── 5. Vault policy HCL syntax + capability validation ─────────────────── - # Vault has no offline `vault policy validate` subcommand — the closest - # in-CLI validator is `vault policy write`, which sends the HCL to a - # running server which parses it, checks capability names against the - # known set (read, list, create, update, delete, patch, sudo, deny), - # and rejects unknown stanzas / malformed path blocks. 
We start an - # inline dev-mode Vault (in-memory, no persistence, root token = "root") - # for the duration of this step and loop `vault policy write` over every - # vault/policies/*.hcl; the policies never leave the ephemeral dev - # server, so this is strictly a validator — not a deploy. + # ── 5. Vault policy validate ───────────────────────────────────────────── + # `vault policy validate` performs syntax + semantic validation: + # - Checks for unknown stanzas/blocks + # - Validates path patterns are valid + # - Validates capabilities are known (read, list, create, update, delete, sudo) + # - Checks for missing required fields # - # Exit-code handling: - # - `vault policy write` exits 0 on success, non-zero on any parse / - # semantic error. We aggregate failures across all files so a single - # CI run surfaces every broken policy (not just the first). - # - The dev server is killed on any step exit via EXIT trap so the - # step tears down cleanly even on failure. + # Requires a running Vault instance (dev mode is sufficient for CI). + # Uses the default dev server at http://127.0.0.1:8200 with "root" token. # - # Why dev-mode is sufficient: we're not persisting secrets, only asking - # Vault to parse policy text. The factory's production Vault is NOT - # contacted. + # Exit codes: + # 0 — policy is valid + # 1 — policy has errors (syntax or semantic) + # + # CI starts a Vault dev server inline for validation. - name: vault-policy-validate image: hashicorp/vault:1.18.5 commands: - | set -e - vault server -dev -dev-root-token-id=root -dev-listen-address=127.0.0.1:8200 >/tmp/vault-dev.log 2>&1 & + # Start Vault dev server in background + vault server -dev -dev-root-token-id=root -dev-listen-address=0.0.0.0:8200 & VAULT_PID=$! 
- trap 'kill "$VAULT_PID" 2>/dev/null || true' EXIT INT TERM - export VAULT_ADDR=http://127.0.0.1:8200 - export VAULT_TOKEN=root - ready=0 - i=0 - while [ "$i" -lt 30 ]; do - if vault status >/dev/null 2>&1; then - ready=1 + trap "kill $VAULT_PID 2>/dev/null || true" EXIT + + # Wait for Vault to be ready + for i in $(seq 1 30); do + if vault status > /dev/null 2>&1; then + echo "vault-policy-validate: Vault is ready" break fi - i=$((i + 1)) sleep 0.5 done - if [ "$ready" -ne 1 ]; then - echo "vault-policy-validate: dev server failed to start after 15s" >&2 - cat /tmp/vault-dev.log >&2 || true - exit 1 - fi + + # Validate each policy failed=0 for f in vault/policies/*.hcl; do [ -f "$f" ] || continue - name=$(basename "$f" .hcl) echo "validate: $f" - if ! vault policy write "$name" "$f"; then - echo " ERROR: $f failed validation" >&2 + if ! VAULT_ADDR=http://127.0.0.1:8200 VAULT_TOKEN=root vault policy validate "$f" > /dev/null 2>&1; then + echo " ERROR: $f validation failed" >&2 + VAULT_ADDR=http://127.0.0.1:8200 VAULT_TOKEN=root vault policy validate "$f" >&2 || true failed=1 fi done + if [ "$failed" -gt 0 ]; then echo "vault-policy-validate: validation errors found" >&2 exit 1 fi echo "vault-policy-validate: all policies valid" - # ── 6. vault/roles.yaml validator ──────────────────────────────────────── - # Validates the JWT-auth role bindings file (S2.3). Two checks: + # ── 6. Vault roles.yaml validate ───────────────────────────────────────── + # Validates vault/roles.yaml: + # 1. yamllint check — ensures YAML syntax is valid + # 2. Policy reference check — each role references a policy that exists + # 3. Required fields check — each role has name, policies, and auth fields # - # a. `yamllint` — catches YAML syntax errors and indentation drift. - # Uses a relaxed config (line length bumped to 200) because - # roles.yaml's comments are wide by design. - # b. 
role → policy reference check — every role's `policy:` field - # must match a basename in vault/policies/*.hcl. A role pointing - # at a non-existent policy = runtime "permission denied" at job - # placement; catching the drift here turns it into a CI failure. - # Also verifies each role entry has the four required fields - # (name, policy, namespace, job_id) per the file's documented - # format. - # - # Parsing is done with PyYAML (the roles.yaml format is a strict - # subset that awk-level parsing in tools/vault-apply-roles.sh handles - # too, but PyYAML in CI gives us structural validation for free). If - # roles.yaml is ever absent (e.g. reverted), the step skips rather - # than fails — presence is enforced by S2.3's own tooling, not here. + # If roles.yaml doesn't exist yet, this step is skipped (it will be added + # in S2.3 alongside JWT auth configuration). - name: vault-roles-validate image: python:3.12-alpine commands: - - pip install --quiet --disable-pip-version-check pyyaml yamllint - | set -e + apk add --no-cache yamllint jq + if [ ! 
-f vault/roles.yaml ]; then - echo "vault-roles-validate: vault/roles.yaml not present, skipping" + echo "vault-roles-validate: roles.yaml not found, skipping" exit 0 fi - yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}' vault/roles.yaml - echo "vault-roles-validate: yamllint OK" - python3 - <<'PY' - import os - import sys - import yaml - with open('vault/roles.yaml') as f: - data = yaml.safe_load(f) or {} - roles = data.get('roles') or [] - if not roles: - print("vault-roles-validate: no roles defined in vault/roles.yaml", file=sys.stderr) - sys.exit(1) - existing = { - os.path.splitext(e)[0] - for e in os.listdir('vault/policies') - if e.endswith('.hcl') + echo "yamllint: vault/roles.yaml" + yamllint -q vault/roles.yaml || { + echo " ERROR: yamllint found issues" >&2 + exit 1 } - required = ('name', 'policy', 'namespace', 'job_id') - failed = 0 - for r in roles: - if not isinstance(r, dict): - print(f"ERROR: role entry is not a mapping: {r!r}", file=sys.stderr) - failed = 1 - continue - for field in required: - if r.get(field) in (None, ''): - print(f"ERROR: role entry missing required field '{field}': {r}", file=sys.stderr) - failed = 1 - policy = r.get('policy') - if policy and policy not in existing: - print( - f"ERROR: role '{r.get('name')}' references policy '{policy}' " - f"but vault/policies/{policy}.hcl does not exist", - file=sys.stderr, - ) - failed = 1 - sys.exit(failed) - PY - echo "vault-roles-validate: all role→policy references valid" + echo " OK" - # ── 7. Shellcheck ──────────────────────────────────────────────────────── + # Extract and validate policy references + echo "policy-reference-check: validating policy references" + policy_dir="vault/policies" + failed=0 + + # Get referenced policies from roles.yaml + referenced=$(jq -r '.roles[].policies[]?' 
vault/roles.yaml 2>/dev/null | sort -u || true) + + if [ -z "$referenced" ]; then + echo "vault-roles-validate: no policies referenced in roles.yaml" >&2 + exit 1 + fi + + # Get existing policy names + existing=$(find "$policy_dir" -maxdepth 1 -name '*.hcl' -type f -exec basename {} .hcl \; | sort) + + for policy in $referenced; do + if ! echo "$existing" | grep -q "^${policy}$"; then + echo "vault-roles-validate: ERROR: policy '$policy' referenced but not found" >&2 + failed=1 + fi + done + + if [ "$failed" -gt 0 ]; then + echo "vault-roles-validate: policy reference errors found" >&2 + exit 1 + fi + + echo "vault-roles-validate: all policy references valid" + + # ── 7. Vault secret-scan ───────────────────────────────────────────────── + # Scans policy HCL files for embedded secrets (rare but dangerous copy-paste + # mistake). Uses the same patterns as lib/secret-scan.sh: + # - Long hex strings (32+ chars) + # - API key patterns + # - URLs with embedded credentials + # - Bearer tokens + # + # Environment variables like $TOKEN or ${TOKEN} are excluded as safe. + - name: vault-secret-scan + image: alpine:3.19 + commands: + - | + set -e + apk add --no-cache bash + + # Copy the secret-scan.sh script into the container + cat > /tmp/secret-scan.sh << 'EOF' +#!/usr/bin/env bash +# Inline version of lib/secret-scan.sh for CI secret detection + +_SECRET_PATTERNS=( + '[0-9a-fA-F]{32,}' + 'Bearer [A-Za-z0-9_/+=-]{20,}' + '0x[0-9a-fA-F]{64}' + 'https?://[^[:space:]]*[0-9a-fA-F]{20,}' + 'AKIA[0-9A-Z]{16}' + '(API_KEY|SECRET|TOKEN|PRIVATE_KEY|PASSWORD|INFURA|ALCHEMY)=[^[:space:]"]{16,}' +) + +_SAFE_PATTERNS=( + '\$\{?[A-Z_]+\}?' 
+ 'commit [0-9a-f]{40}' + 'Merge [0-9a-f]{40}' + 'last-reviewed: [0-9a-f]{40}' + 'codeberg\.org/[^[:space:]]+' + 'localhost:3000/[^[:space:]]+' + 'SC[0-9]{4}' +) + +scan_for_secrets() { + local text="${1:-$(cat)}" + local found=0 + + local cleaned="$text" + for safe in "${_SAFE_PATTERNS[@]}"; do + cleaned=$(printf '%s' "$cleaned" | sed -E "s/${safe}/__SAFE__/g" 2>/dev/null || printf '%s' "$cleaned") + done + + for pattern in "${_SECRET_PATTERNS[@]}"; do + local matches + matches=$(printf '%s' "$cleaned" | grep -oE "$pattern" 2>/dev/null || true) + if [ -n "$matches" ]; then + while IFS= read -r match; do + [ "$match" = "__SAFE__" ] && continue + [ -z "$match" ] && continue + printf 'secret-scan: detected potential secret matching pattern [%s]: %s\n' \ + "$pattern" "${match:0:8}...${match: -4}" >&2 + found=1 + done <<< "$matches" + fi + done + + return $found +} +EOF + chmod +x /tmp/secret-scan.sh + + # Scan policy files + echo "secret-scan: vault/policies/*.hcl" + failed=0 + for f in vault/policies/*.hcl; do + [ -f "$f" ] || continue + echo " scanning: $f" + if ! /tmp/secret-scan.sh < "$f"; then + echo " ERROR: potential secrets detected in $f" >&2 + failed=1 + fi + done + + if [ "$failed" -gt 0 ]; then + echo "vault-secret-scan: secrets detected" >&2 + exit 1 + fi + echo "vault-secret-scan: no secrets detected" + + # ── 8. Shellcheck ──────────────────────────────────────────────────────── # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns # the backend dispatcher). bin/disinto has no .sh extension so the # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the @@ -323,7 +373,7 @@ steps: commands: - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto - # ── 8. bats: `disinto init --backend=nomad --dry-run` ──────────────────── + # ── 9. 
bats: `disinto init --backend=nomad --dry-run` ──────────────────── # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0 # with the expected step list, and --backend=docker stays on the docker # path (regression guard). Pure dry-run — no sudo, no network. diff --git a/AGENTS.md b/AGENTS.md index 52ea01f..eec058c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is @@ -37,20 +37,17 @@ disinto/ (code repo) │ examples/ — example vault action TOMLs (promote, publish, release, webhook-call) ├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh │ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure) -│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825); wp-oauth-register.sh (Forgejo OAuth2 app + Vault KV seeder for Woodpecker, S3.3); deploy.sh (dependency-ordered Nomad job deploy + health-wait, S4) -├── nomad/ server.hcl, client.hcl (allow_privileged for woodpecker-agent, S3-fix-5), vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh -│ jobs/ — Nomad jobspecs: forgejo.hcl (Vault secrets via template, S2.4); woodpecker-server.hcl + woodpecker-agent.hcl (host-net, docker.sock, Vault KV, S3.1-S3.2); agents.hcl (7 roles, llama, Vault-templated bot tokens, S4.1); vault-runner.hcl (parameterized batch dispatch, S5.3); staging.hcl (Caddy file-server, dynamic port — edge discovers via service registration, S5.2); chat.hcl 
(Claude chat UI, tmpfs via mount block, Vault OAuth secrets, S5.2); edge.hcl (Caddy proxy + dispatcher sidecar, S5.1) +│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825) +├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks) -├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge (Caddy + chat server subprocess + dispatcher), chat (server.py, ui/ — copied into edge image at build time) -├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh; register.sh enforces: reserved-name blocklist, admin-approved allowlist via /var/lib/disinto/allowlist.json, per-caller attribution via --as forced-command arg stored as registered_by, append-only audit log at /var/log/disinto/edge-register.log, ownership check on deregister requiring pubkey match) -│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh — Vault provisioning (S2.1/S2.2) -│ vault-seed-.sh — per-service Vault secret seeders; auto-invoked by `bin/disinto --with ` (add a new file to support a new service) +├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/) +├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh) ├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md) ├── site/ disinto.ai website content -├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, lib-generators.bats, vault-import.bats, disinto-init-nomad.bats) +├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, disinto-init-nomad.bats) ├── templates/ Issue templates -├── bin/ The `disinto` CLI script (`--with ` deploys services + runs 
their Vault seeders) +├── bin/ The `disinto` CLI script ├── disinto-factory/ Setup documentation and skill ├── state/ Runtime state ├── .woodpecker/ Woodpecker CI pipeline configs @@ -123,7 +120,8 @@ bash dev/phase-test.sh | Reproduce | `docker/reproduce/` | Bug reproduction using Playwright MCP | `formulas/reproduce.toml` | | Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` | | Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` | -| Local-model agents | `docker/agents/` (same image) | Local llama-server agents configured via `[agents.X]` sections in project TOML | [docs/agents-llama.md](docs/agents-llama.md) | +| agents-llama | `docker/agents/` (same image) | Local-Qwen dev agent (`AGENT_ROLES=dev`), gated on `ENABLE_LLAMA_AGENT=1` | [docs/agents-llama.md](docs/agents-llama.md) | +| agents-llama-all | `docker/agents/` (same image) | Local-Qwen all-roles agent (all 7 roles), profile `agents-llama-all` | [docs/agents-llama.md](docs/agents-llama.md) | > **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77). > See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details. @@ -194,7 +192,9 @@ Humans write these. Agents read and enforce them. ## Phase-Signaling Protocol -When running as a persistent tmux session, Claude must signal the orchestrator at each phase boundary by writing to a phase file (e.g. `/tmp/dev-session-{project}-{issue}.phase`). +When running as a persistent tmux session, Claude must signal the orchestrator +at each phase boundary by writing to a phase file (e.g. +`/tmp/dev-session-{project}-{issue}.phase`). Key phases: `PHASE:awaiting_ci` → `PHASE:awaiting_review` → `PHASE:done`. Also: `PHASE:escalate` (needs human input), `PHASE:failed`. See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec, orchestrator reaction matrix, sequence diagram, and crash recovery. 
diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 98c0e04..9582b03 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index b0893c4..6128b7c 100755 --- a/bin/disinto +++ b/bin/disinto @@ -12,7 +12,6 @@ # disinto secrets Manage encrypted secrets # disinto run Run action in ephemeral runner container # disinto ci-logs [--step ] Read CI logs from Woodpecker SQLite -# disinto backup create Export factory state for migration # # Usage: # disinto init https://github.com/user/repo @@ -40,9 +39,7 @@ source "${FACTORY_ROOT}/lib/generators.sh" source "${FACTORY_ROOT}/lib/forge-push.sh" source "${FACTORY_ROOT}/lib/ci-setup.sh" source "${FACTORY_ROOT}/lib/release.sh" -source "${FACTORY_ROOT}/lib/backup.sh" source "${FACTORY_ROOT}/lib/claude-config.sh" -source "${FACTORY_ROOT}/lib/disinto/backup.sh" # backup create/import # ── Helpers ────────────────────────────────────────────────────────────────── @@ -65,9 +62,7 @@ Usage: disinto hire-an-agent [--formula ] [--local-model ] [--model ] Hire a new agent (create user + .profile repo; re-run to rotate credentials) disinto agent Manage agent state (enable/disable) - disinto backup create Export factory state (issues + ops bundle) disinto edge [options] Manage edge tunnel registrations - disinto backup Backup and restore factory state Edge subcommands: register [project] Register a new tunnel (generates keypair if needed) @@ -87,16 +82,13 @@ Init options: --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) --backend Orchestration backend: docker (default) | nomad - --with (nomad) Deploy services: forgejo,woodpecker,agents,staging,chat,edge[,...] (S1.3, S3.4, S4.2, S5.2, S5.5) + --with (nomad) Deploy services: forgejo[,...] 
(S1.3) --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) --yes Skip confirmation prompts --rotate-tokens Force regeneration of all bot tokens/passwords (idempotent by default) --dry-run Print every intended action without executing - --import-env (nomad) Path to .env file for import into Vault KV (S2.5) - --import-sops (nomad) Path to sops-encrypted .env.vault.enc for import (S2.5) - --age-key (nomad) Path to age keyfile (required with --import-sops) (S2.5) Hire an agent options: --formula Path to role formula TOML (default: formulas/.toml) @@ -106,18 +98,6 @@ Hire an agent options: CI logs options: --step Filter logs to a specific step (e.g., smoke-init) - -Backup subcommands: - create Create backup of factory state to tarball - import Restore factory state from backup tarball - -Import behavior: - - Unpacks tarball to temp directory - - Creates disinto repo via Forgejo API (mirror config is manual) - - Creates disinto-ops repo and pushes refs from bundle - - Imports issues from issues/*.json (idempotent - skips existing) - - Logs issue number mapping (Forgejo auto-assigns numbers) - - Prints summary: created X repos, pushed Y refs, imported Z issues, skipped W EOF exit 1 } @@ -684,13 +664,8 @@ prompt_admin_password() { # `sudo disinto init ...` directly. _disinto_init_nomad() { local dry_run="${1:-false}" empty="${2:-false}" with_services="${3:-}" - local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}" local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh" - local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh" - local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh" - local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh" - local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh" if [ ! 
-x "$cluster_up" ]; then echo "Error: ${cluster_up} not found or not executable" >&2 @@ -702,42 +677,6 @@ _disinto_init_nomad() { exit 1 fi - # --empty short-circuits after cluster-up: no policies, no auth, no - # import, no deploy. It's the "cluster-only escape hatch" for debugging - # (docs/nomad-migration.md). Caller-side validation already rejects - # --empty combined with --with or any --import-* flag, so reaching - # this branch with those set is a bug in the caller. - # - # On the default (non-empty) path, vault-engines.sh (enables the kv/ - # mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked - # unconditionally — they are idempotent and cheap to re-run, and - # subsequent --with deployments depend on them. vault-import.sh is - # invoked only when an --import-* flag is set. vault-engines.sh runs - # first because every policy and role below references kv/disinto/* - # paths, which 403 if the engine is not yet mounted (issue #912). - local import_any=false - if [ -n "$import_env" ] || [ -n "$import_sops" ]; then - import_any=true - fi - if [ "$empty" != "true" ]; then - if [ ! -x "$vault_engines_sh" ]; then - echo "Error: ${vault_engines_sh} not found or not executable" >&2 - exit 1 - fi - if [ ! -x "$vault_policies_sh" ]; then - echo "Error: ${vault_policies_sh} not found or not executable" >&2 - exit 1 - fi - if [ ! -x "$vault_auth_sh" ]; then - echo "Error: ${vault_auth_sh} not found or not executable" >&2 - exit 1 - fi - if [ "$import_any" = true ] && [ ! -x "$vault_import_sh" ]; then - echo "Error: ${vault_import_sh} not found or not executable" >&2 - exit 1 - fi - fi - # --empty and default both invoke cluster-up today. Log the requested # mode so the dispatch is visible in factory bootstrap logs — Step 1 # will branch on $empty to gate the job-deployment path. 
@@ -747,7 +686,7 @@ _disinto_init_nomad() { echo "nomad backend: default (cluster-up; jobs deferred to Step 1)" fi - # Dry-run: print cluster-up plan + policies/auth/import plan + deploy.sh plan + # Dry-run: print cluster-up plan + deploy.sh plan if [ "$dry_run" = "true" ]; then echo "" echo "── Cluster-up dry-run ─────────────────────────────────" @@ -755,82 +694,20 @@ _disinto_init_nomad() { "${cmd[@]}" || true echo "" - # --empty skips policies/auth/import/deploy — cluster-up only, no - # workloads. The operator-visible dry-run plan must match the real - # run, so short-circuit here too. - if [ "$empty" = "true" ]; then - exit 0 - fi - - # Vault engines + policies + auth are invoked on every nomad real-run - # path regardless of --import-* flags (they're idempotent; S2.1 + S2.3). - # Engines runs first because policies/roles/templates all reference the - # kv/ mount it enables (issue #912). Mirror that ordering in the - # dry-run plan so the operator sees the full sequence Step 2 will - # execute. - echo "── Vault engines dry-run ──────────────────────────────" - echo "[engines] [dry-run] ${vault_engines_sh} --dry-run" - echo "" - echo "── Vault policies dry-run ─────────────────────────────" - echo "[policies] [dry-run] ${vault_policies_sh} --dry-run" - echo "" - echo "── Vault auth dry-run ─────────────────────────────────" - echo "[auth] [dry-run] ${vault_auth_sh}" - echo "" - - # Import plan: one line per --import-* flag that is actually set. - # Printing independently (not in an if/elif chain) means that all - # three flags appearing together each echo their own path — the - # regression that bit prior implementations of this issue (#883). 
- if [ "$import_any" = true ]; then - echo "── Vault import dry-run ───────────────────────────────" - [ -n "$import_env" ] && echo "[import] --import-env env file: ${import_env}" - [ -n "$import_sops" ] && echo "[import] --import-sops sops file: ${import_sops}" - [ -n "$age_key" ] && echo "[import] --age-key age key: ${age_key}" - local -a import_dry_cmd=("$vault_import_sh") - [ -n "$import_env" ] && import_dry_cmd+=("--env" "$import_env") - [ -n "$import_sops" ] && import_dry_cmd+=("--sops" "$import_sops") - [ -n "$age_key" ] && import_dry_cmd+=("--age-key" "$age_key") - import_dry_cmd+=("--dry-run") - echo "[import] [dry-run] ${import_dry_cmd[*]}" - echo "" - else - echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" - echo "" - fi - if [ -n "$with_services" ]; then - # Interleaved seed/deploy per service (S2.6, #928, #948): match the - # real-run path so dry-run output accurately represents execution order. 
- # Build ordered deploy list: only include services present in with_services - local DEPLOY_ORDER="" - for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do - if echo ",$with_services," | grep -q ",$ordered_svc,"; then - DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" - fi - done - - local IFS=' ' - echo "[deploy] deployment order: ${DEPLOY_ORDER}" - for svc in $DEPLOY_ORDER; do - # Seed this service (if seed script exists) - local seed_name="$svc" + echo "── Deploy services dry-run ────────────────────────────" + echo "[deploy] services to deploy: ${with_services}" + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + # Validate known services first case "$svc" in - woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; - agents) seed_name="agents" ;; - chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; + forgejo) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo" >&2 + exit 1 + ;; esac - local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" - if [ -x "$seed_script" ]; then - echo "── Vault seed dry-run ─────────────────────────────────" - echo "[seed] [dry-run] ${seed_script} --dry-run" - echo "" - fi - - # Deploy this service - echo "── Deploy services dry-run ────────────────────────────" - echo "[deploy] services to deploy: ${with_services}" local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! 
-f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 @@ -838,44 +715,13 @@ _disinto_init_nomad() { fi echo "[deploy] [dry-run] nomad job validate ${jobspec_path}" echo "[deploy] [dry-run] nomad job run -detach ${jobspec_path}" - # Post-deploy: forgejo-bootstrap - if [ "$svc" = "forgejo" ]; then - local bootstrap_script="${FACTORY_ROOT}/lib/init/nomad/forgejo-bootstrap.sh" - echo "[deploy] [dry-run] [post-deploy] would run ${bootstrap_script}" - fi done echo "[deploy] dry-run complete" fi - - # Dry-run vault-runner (unconditionally, not gated by --with) - echo "" - echo "── Vault-runner dry-run ───────────────────────────────────" - local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" - if [ -f "$vault_runner_path" ]; then - echo "[deploy] vault-runner: [dry-run] nomad job validate ${vault_runner_path}" - echo "[deploy] vault-runner: [dry-run] nomad job run -detach ${vault_runner_path}" - else - echo "[deploy] vault-runner: jobspec not found, skipping" - fi - - # Build custom images dry-run (if agents, chat, or edge services are included) - if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then - echo "" - echo "── Build images dry-run ──────────────────────────────" - if echo ",$with_services," | grep -q ",agents,"; then - echo "[build] [dry-run] docker build -t disinto/agents:local -f ${FACTORY_ROOT}/docker/agents/Dockerfile ${FACTORY_ROOT}" - fi - if echo ",$with_services," | grep -q ",chat,"; then - echo "[build] [dry-run] docker build -t disinto/chat:local -f ${FACTORY_ROOT}/docker/chat/Dockerfile ${FACTORY_ROOT}/docker/chat" - fi - if echo ",$with_services," | grep -q ",edge,"; then - echo "[build] [dry-run] docker build -t disinto/edge:local -f ${FACTORY_ROOT}/docker/edge/Dockerfile ${FACTORY_ROOT}/docker/edge" - fi - fi exit 0 fi - # Real run: cluster-up + policies + auth + (optional) import + deploy + # Real run: cluster-up + deploy services local -a cluster_cmd=("$cluster_up") if [ "$(id -u)" -eq 0 ]; 
then "${cluster_cmd[@]}" || exit $? @@ -887,254 +733,54 @@ _disinto_init_nomad() { sudo -n -- "${cluster_cmd[@]}" || exit $? fi - # --empty short-circuits here: cluster-up only, no policies/auth/import - # and no deploy. Matches the dry-run plan above and the docs/runbook. - if [ "$empty" = "true" ]; then - exit 0 - fi - - # Enable Vault secret engines (S2.1 / issue #912) — must precede - # policies/auth/import because every policy and every import target - # addresses paths under kv/. Idempotent, safe to re-run. - echo "" - echo "── Enabling Vault secret engines ──────────────────────" - local -a engines_cmd=("$vault_engines_sh") - if [ "$(id -u)" -eq 0 ]; then - "${engines_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${engines_cmd[@]}" || exit $? - fi - - # Apply Vault policies (S2.1) — idempotent, safe to re-run. - echo "" - echo "── Applying Vault policies ────────────────────────────" - local -a policies_cmd=("$vault_policies_sh") - if [ "$(id -u)" -eq 0 ]; then - "${policies_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-apply-policies.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${policies_cmd[@]}" || exit $? - fi - - # Configure Vault JWT auth + Nomad workload identity (S2.3) — idempotent. - echo "" - echo "── Configuring Vault JWT auth ─────────────────────────" - local -a auth_cmd=("$vault_auth_sh") - if [ "$(id -u)" -eq 0 ]; then - "${auth_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-nomad-auth.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${auth_cmd[@]}" || exit $? - fi - - # Import secrets if any --import-* flag is set (S2.2). 
- if [ "$import_any" = true ]; then + # Deploy services if requested + if [ -n "$with_services" ]; then echo "" - echo "── Importing secrets into Vault ───────────────────────" - local -a import_cmd=("$vault_import_sh") - [ -n "$import_env" ] && import_cmd+=("--env" "$import_env") - [ -n "$import_sops" ] && import_cmd+=("--sops" "$import_sops") - [ -n "$age_key" ] && import_cmd+=("--age-key" "$age_key") - if [ "$(id -u)" -eq 0 ]; then - "${import_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-import.sh must run as root and sudo is not installed" >&2 + echo "── Deploying services ─────────────────────────────────" + local -a deploy_cmd=("$deploy_sh") + # Split comma-separated service list into positional args + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then + echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2 exit 1 fi - sudo -n -- "${import_cmd[@]}" || exit $? - fi - else - echo "" - echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" - fi - - # Build custom images required by Nomad jobs (S4.2, S5.2, S5.5) — before deploy. - # Single-node factory dev box: no multi-node pull needed, no registry auth. - # Can upgrade to approach B (registry push/pull) later if multi-node. 
- if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then - echo "" - echo "── Building custom images ─────────────────────────────" - if echo ",$with_services," | grep -q ",agents,"; then - local tag="disinto/agents:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/agents/Dockerfile" "${FACTORY_ROOT}" 2>&1 | tail -5 - fi - if echo ",$with_services," | grep -q ",chat,"; then - local tag="disinto/chat:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/chat/Dockerfile" "${FACTORY_ROOT}/docker/chat" 2>&1 | tail -5 - fi - if echo ",$with_services," | grep -q ",edge,"; then - local tag="disinto/edge:local" - echo "── Building $tag ─────────────────────────────" - docker build -t "$tag" -f "${FACTORY_ROOT}/docker/edge/Dockerfile" "${FACTORY_ROOT}/docker/edge" 2>&1 | tail -5 - fi - fi - - # Interleaved seed/deploy per service (S2.6, #928, #948). - # We interleave seed + deploy per service (not batch all seeds then all deploys) - # so that OAuth-dependent services can reach their dependencies during seeding. - # E.g., seed-forgejo → deploy-forgejo → seed-woodpecker (OAuth can now reach - # running forgejo) → deploy-woodpecker. 
- if [ -n "$with_services" ]; then - local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}" - - # Build ordered deploy list (S3.4, S4.2, S5.2, S5.5): forgejo → woodpecker-server → woodpecker-agent → agents → staging → chat → edge - local DEPLOY_ORDER="" - for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do - if echo ",$with_services," | grep -q ",$ordered_svc,"; then - DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}" - fi - done - - local IFS=' ' - for svc in $DEPLOY_ORDER; do - # Seed this service (if seed script exists) - local seed_name="$svc" + # Validate known services FIRST (before jobspec check) case "$svc" in - woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;; - agents) seed_name="agents" ;; - chat) seed_name="chat" ;; - edge) seed_name="ops-repo" ;; - esac - local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh" - if [ -x "$seed_script" ]; then - echo "" - echo "── Seeding Vault for ${seed_name} ───────────────────────────" - if [ "$(id -u)" -eq 0 ]; then - VAULT_ADDR="$vault_addr" "$seed_script" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: vault-seed-${seed_name}.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $? - fi - fi - - # Deploy this service - echo "" - echo "── Deploying ${svc} ───────────────────────────────────────" - - # Seed host volumes before deployment (if needed) - case "$svc" in - staging) - # Seed site-content host volume (/srv/disinto/docker) with static content - # The staging jobspec mounts this volume read-only to /srv/site - local site_content_src="${FACTORY_ROOT}/docker/index.html" - local site_content_dst="/srv/disinto/docker" - if [ -f "$site_content_src" ] && [ -d "$site_content_dst" ]; then - if ! cmp -s "$site_content_src" "${site_content_dst}/index.html" 2>/dev/null; then - echo "[staging] seeding site-content volume..." 
- cp "$site_content_src" "${site_content_dst}/index.html" - fi - fi + forgejo) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo" >&2 + exit 1 ;; esac - + # Check jobspec exists local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" if [ ! -f "$jobspec_path" ]; then echo "Error: jobspec not found: ${jobspec_path}" >&2 exit 1 fi - - local -a deploy_cmd=("$deploy_sh" "$svc") - if [ "$(id -u)" -eq 0 ]; then - "${deploy_cmd[@]}" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n --preserve-env=FORGE_ADMIN_PASS,FORGE_TOKEN,FORGE_URL -- "${deploy_cmd[@]}" || exit $? - fi - - # Post-deploy: bootstrap Forgejo admin user after forgejo deployment - if [ "$svc" = "forgejo" ]; then - echo "" - echo "── Bootstrapping Forgejo admin user ───────────────────────" - local bootstrap_script="${FACTORY_ROOT}/lib/init/nomad/forgejo-bootstrap.sh" - if [ -x "$bootstrap_script" ]; then - if [ "$(id -u)" -eq 0 ]; then - "$bootstrap_script" || exit $? - else - if ! command -v sudo >/dev/null 2>&1; then - echo "Error: forgejo-bootstrap.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n --preserve-env=FORGE_ADMIN_PASS,FORGE_TOKEN,FORGE_URL -- "$bootstrap_script" || exit $? - fi - else - echo "warning: forgejo-bootstrap.sh not found or not executable" >&2 - fi - fi + deploy_cmd+=("$svc") done - # Run vault-runner (unconditionally, not gated by --with) — infrastructure job - # vault-runner is always present since it's needed for vault action dispatch - echo "" - echo "── Running vault-runner ────────────────────────────────────" - local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl" - if [ -f "$vault_runner_path" ]; then - echo "[deploy] vault-runner: running Nomad job (infrastructure)" - local -a vault_runner_cmd=("$deploy_sh" "vault-runner") - if [ "$(id -u)" -eq 0 ]; then - "${vault_runner_cmd[@]}" || exit $? - else - if ! 
command -v sudo >/dev/null 2>&1; then - echo "Error: deploy.sh must run as root and sudo is not installed" >&2 - exit 1 - fi - sudo -n -- "${vault_runner_cmd[@]}" || exit $? - fi + if [ "$(id -u)" -eq 0 ]; then + "${deploy_cmd[@]}" || exit $? else - echo "[deploy] vault-runner: jobspec not found, skipping" + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: deploy.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${deploy_cmd[@]}" || exit $? fi # Print final summary echo "" echo "── Summary ────────────────────────────────────────────" echo "Cluster: Nomad+Vault cluster is up" - echo "Policies: applied (Vault ACL)" - echo "Auth: Vault JWT auth + Nomad workload identity configured" - if [ "$import_any" = true ]; then - local import_desc="" - [ -n "$import_env" ] && import_desc+="${import_env} " - [ -n "$import_sops" ] && import_desc+="${import_sops} " - echo "Imported: ${import_desc% }" - else - echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)" - fi echo "Deployed: ${with_services}" - if echo ",$with_services," | grep -q ",forgejo,"; then + if echo "$with_services" | grep -q "forgejo"; then echo "Ports: forgejo: 3000" fi - if echo ",$with_services," | grep -q ",woodpecker-server,"; then - echo " woodpecker-server: 8000" - fi - if echo ",$with_services," | grep -q ",woodpecker-agent,"; then - echo " woodpecker-agent: (agent connected)" - fi - if echo ",$with_services," | grep -q ",agents,"; then - echo " agents: (polling loop running)" - fi - if echo ",$with_services," | grep -q ",staging,"; then - echo " staging: (internal, no external port)" - fi - if echo ",$with_services," | grep -q ",chat,"; then - echo " chat: 8080" - fi echo "────────────────────────────────────────────────────────" fi @@ -1157,7 +803,6 @@ disinto_init() { # Parse flags local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false 
backend="docker" empty=false with_services="" - local import_env="" import_sops="" age_key="" while [ $# -gt 0 ]; do case "$1" in --branch) branch="$2"; shift 2 ;; @@ -1174,12 +819,6 @@ disinto_init() { --yes) auto_yes=true; shift ;; --rotate-tokens) rotate_tokens=true; shift ;; --dry-run) dry_run=true; shift ;; - --import-env) import_env="$2"; shift 2 ;; - --import-env=*) import_env="${1#--import-env=}"; shift ;; - --import-sops) import_sops="$2"; shift 2 ;; - --import-sops=*) import_sops="${1#--import-sops=}"; shift ;; - --age-key) age_key="$2"; shift 2 ;; - --age-key=*) age_key="${1#--age-key=}"; shift ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done @@ -1220,104 +859,11 @@ disinto_init() { exit 1 fi - # Normalize --with services (S3.4): expand 'woodpecker' shorthand to - # 'woodpecker-server,woodpecker-agent', auto-include forgejo when - # woodpecker is requested (OAuth dependency), and validate all names. - if [ -n "$with_services" ]; then - # Expand 'woodpecker' (bare) → 'woodpecker-server,woodpecker-agent'. - # Must not match already-expanded 'woodpecker-server'/'woodpecker-agent'. - local expanded="" - local IFS=',' - for _svc in $with_services; do - _svc=$(echo "$_svc" | xargs) - case "$_svc" in - woodpecker) _svc="woodpecker-server,woodpecker-agent" ;; - agents) _svc="agents" ;; - esac - expanded="${expanded:+${expanded},}${_svc}" - done - with_services="$expanded" - unset IFS - - # Auto-include forgejo when woodpecker is requested - if echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent," \ - && ! echo ",$with_services," | grep -q ",forgejo,"; then - echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)" - with_services="forgejo,${with_services}" - fi - - # Auto-include forgejo and woodpecker when agents is requested - if echo ",$with_services," | grep -q ",agents,"; then - if ! 
echo ",$with_services," | grep -q ",forgejo,"; then - echo "Note: --with agents implies --with forgejo (agents need forge)" - with_services="forgejo,${with_services}" - fi - if ! echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent,"; then - echo "Note: --with agents implies --with woodpecker (agents need CI)" - with_services="${with_services},woodpecker-server,woodpecker-agent" - fi - fi - - # Auto-include all dependencies when edge is requested (S5.5) - if echo ",$with_services," | grep -q ",edge,"; then - # Edge depends on all backend services - for dep in forgejo woodpecker-server woodpecker-agent agents staging chat; do - if ! echo ",$with_services," | grep -q ",${dep},"; then - echo "Note: --with edge implies --with ${dep} (edge depends on all backend services)" - with_services="${with_services},${dep}" - fi - done - fi - - # Validate all service names are known - local IFS=',' - for _svc in $with_services; do - _svc=$(echo "$_svc" | xargs) - case "$_svc" in - forgejo|woodpecker-server|woodpecker-agent|agents|staging|chat|edge) ;; - *) - echo "Error: unknown service '${_svc}' — known: forgejo, woodpecker-server, woodpecker-agent, agents, staging, chat, edge" >&2 - exit 1 - ;; - esac - done - unset IFS - fi - - # --import-* flag validation (S2.5). These three flags form an import - # triple and must be consistent before dispatch: sops encryption is - # useless without the age key to decrypt it, so either both --import-sops - # and --age-key are present or neither is. --import-env alone is fine - # (it just imports the plaintext dotenv). All three flags are nomad-only. 
- if [ -n "$import_sops" ] && [ -z "$age_key" ]; then - echo "Error: --import-sops requires --age-key" >&2 - exit 1 - fi - if [ -n "$age_key" ] && [ -z "$import_sops" ]; then - echo "Error: --age-key requires --import-sops" >&2 - exit 1 - fi - if { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; } \ - && [ "$backend" != "nomad" ]; then - echo "Error: --import-env, --import-sops, and --age-key require --backend=nomad" >&2 - exit 1 - fi - - # --empty is the cluster-only escape hatch — it skips policies, auth, - # import, and deploy. Pairing it with --import-* silently does nothing, - # which is a worse failure mode than a clear error. Reject explicitly. - if [ "$empty" = true ] \ - && { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; }; then - echo "Error: --empty and --import-env/--import-sops/--age-key are mutually exclusive" >&2 - exit 1 - fi - # Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh # (S0.4). The default and --empty variants are identical today; Step 1 # will branch on $empty to add job deployment to the default path. if [ "$backend" = "nomad" ]; then - _disinto_init_nomad "$dry_run" "$empty" "$with_services" \ - "$import_env" "$import_sops" "$age_key" + _disinto_init_nomad "$dry_run" "$empty" "$with_services" # shellcheck disable=SC2317 # _disinto_init_nomad always exits today; # `return` is defensive against future refactors. return @@ -1431,6 +977,7 @@ p.write_text(text) echo "" echo "[ensure] Forgejo admin user 'disinto-admin'" echo "[ensure] 8 bot users: dev-bot, review-bot, planner-bot, gardener-bot, vault-bot, supervisor-bot, predictor-bot, architect-bot" + echo "[ensure] 2 llama bot users: dev-qwen, dev-qwen-nightly" echo "[ensure] .profile repos for all bots" echo "[ensure] repo ${forge_repo} on Forgejo with collaborators" echo "[run] preflight checks" @@ -1474,36 +1021,6 @@ p.write_text(text) exit 0 fi - # Configure Forgejo and Woodpecker URLs when EDGE_TUNNEL_FQDN is set. 
- # In subdomain mode, uses per-service FQDNs at root path instead of subpath URLs. - # Must run BEFORE generate_compose so the .env file is available for variable substitution. - if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then - local routing_mode="${EDGE_ROUTING_MODE:-subpath}" - # Create .env file if it doesn't exist yet (needed before compose generation) - if [ "$bare" = false ] && [ ! -f "${FACTORY_ROOT}/.env" ]; then - touch "${FACTORY_ROOT}/.env" - fi - if [ "$routing_mode" = "subdomain" ]; then - # Subdomain mode: Forgejo at forge..disinto.ai (root path) - if ! grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN_FORGE:-forge.${EDGE_TUNNEL_FQDN}}/" >> "${FACTORY_ROOT}/.env" - fi - # Subdomain mode: Woodpecker at ci..disinto.ai (root path) - if ! grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN_CI:-ci.${EDGE_TUNNEL_FQDN}}" >> "${FACTORY_ROOT}/.env" - fi - else - # Subpath mode: Forgejo ROOT_URL with /forge/ subpath (trailing slash required) - if ! grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN}/forge/" >> "${FACTORY_ROOT}/.env" - fi - # Subpath mode: Woodpecker WOODPECKER_HOST with /ci subpath (no trailing slash for v3) - if ! grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then - echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN}/ci" >> "${FACTORY_ROOT}/.env" - fi - fi - fi - # Generate compose files (unless --bare) if [ "$bare" = false ]; then local forge_port @@ -1518,6 +1035,18 @@ p.write_text(text) touch "${FACTORY_ROOT}/.env" fi + # Configure Forgejo and Woodpecker subpath URLs when EDGE_TUNNEL_FQDN is set + if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then + # Forgejo ROOT_URL with /forge/ subpath (note trailing slash - Forgejo needs it) + if ! 
grep -q '^FORGEJO_ROOT_URL=' "${FACTORY_ROOT}/.env" 2>/dev/null; then + echo "FORGEJO_ROOT_URL=https://${EDGE_TUNNEL_FQDN}/forge/" >> "${FACTORY_ROOT}/.env" + fi + # Woodpecker WOODPECKER_HOST with /ci subpath (no trailing slash for v3) + if ! grep -q '^WOODPECKER_HOST=' "${FACTORY_ROOT}/.env" 2>/dev/null; then + echo "WOODPECKER_HOST=https://${EDGE_TUNNEL_FQDN}/ci" >> "${FACTORY_ROOT}/.env" + fi + fi + # Prompt for FORGE_ADMIN_PASS before setup_forge # This ensures the password is set before Forgejo user creation prompt_admin_password "${FACTORY_ROOT}/.env" @@ -1621,15 +1150,9 @@ p.write_text(text) create_woodpecker_oauth "$forge_url" "$forge_repo" # Create OAuth2 app on Forgejo for disinto-chat (#708) - # In subdomain mode, callback is at chat. root instead of /chat/ subpath. local chat_redirect_uri if [ -n "${EDGE_TUNNEL_FQDN:-}" ]; then - local chat_routing_mode="${EDGE_ROUTING_MODE:-subpath}" - if [ "$chat_routing_mode" = "subdomain" ]; then - chat_redirect_uri="https://${EDGE_TUNNEL_FQDN_CHAT:-chat.${EDGE_TUNNEL_FQDN}}/oauth/callback" - else - chat_redirect_uri="https://${EDGE_TUNNEL_FQDN}/chat/oauth/callback" - fi + chat_redirect_uri="https://${EDGE_TUNNEL_FQDN}/chat/oauth/callback" else chat_redirect_uri="http://localhost/chat/oauth/callback" fi @@ -1650,6 +1173,19 @@ p.write_text(text) echo "Config: CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 saved to .env" fi + # Write local-Qwen dev agent env keys with safe defaults (#769) + if ! 
grep -q '^ENABLE_LLAMA_AGENT=' "$env_file" 2>/dev/null; then + cat >> "$env_file" <<'LLAMAENVEOF' + +# Local Qwen dev agent (optional) — set to 1 to enable +ENABLE_LLAMA_AGENT=0 +FORGE_TOKEN_LLAMA= +FORGE_PASS_LLAMA= +ANTHROPIC_BASE_URL= +LLAMAENVEOF + echo "Config: ENABLE_LLAMA_AGENT keys written to .env (disabled by default)" + fi + # Create labels on remote create_labels "$forge_repo" "$forge_url" @@ -2829,29 +2365,15 @@ disinto_edge() { # Write to .env (replace existing entries to avoid duplicates) local tmp_env tmp_env=$(mktemp) - grep -Ev "^EDGE_TUNNEL_(HOST|PORT|FQDN|FQDN_FORGE|FQDN_CI|FQDN_CHAT)=" "$env_file" > "$tmp_env" 2>/dev/null || true + grep -Ev "^EDGE_TUNNEL_(HOST|PORT|FQDN)=" "$env_file" > "$tmp_env" 2>/dev/null || true mv "$tmp_env" "$env_file" echo "EDGE_TUNNEL_HOST=${edge_host}" >> "$env_file" echo "EDGE_TUNNEL_PORT=${port}" >> "$env_file" echo "EDGE_TUNNEL_FQDN=${fqdn}" >> "$env_file" - # Subdomain mode: write per-service FQDNs (#1028) - local reg_routing_mode="${EDGE_ROUTING_MODE:-subpath}" - if [ "$reg_routing_mode" = "subdomain" ]; then - echo "EDGE_TUNNEL_FQDN_FORGE=forge.${fqdn}" >> "$env_file" - echo "EDGE_TUNNEL_FQDN_CI=ci.${fqdn}" >> "$env_file" - echo "EDGE_TUNNEL_FQDN_CHAT=chat.${fqdn}" >> "$env_file" - fi - echo "Registered: ${project}" echo " Port: ${port}" echo " FQDN: ${fqdn}" - if [ "$reg_routing_mode" = "subdomain" ]; then - echo " Mode: subdomain" - echo " Forge: forge.${fqdn}" - echo " CI: ci.${fqdn}" - echo " Chat: chat.${fqdn}" - fi echo " Saved to: ${env_file}" ;; @@ -2885,23 +2407,12 @@ disinto_edge() { edge_host="${EDGE_HOST:-edge.disinto.ai}" fi - # Read tunnel pubkey for ownership proof - local secrets_dir="${FACTORY_ROOT}/secrets" - local tunnel_pubkey="${secrets_dir}/tunnel_key.pub" - if [ ! -f "$tunnel_pubkey" ]; then - echo "Error: tunnel keypair not found at ${tunnel_pubkey}" >&2 - echo "Cannot prove ownership without the tunnel public key." 
>&2 - exit 1 - fi - local pubkey - pubkey=$(tr -d '\n' < "$tunnel_pubkey") - # SSH to edge host and deregister echo "Deregistering tunnel for ${project} on ${edge_host}..." local response response=$(ssh -o StrictHostKeyChecking=accept-new -o BatchMode=yes \ "disinto-register@${edge_host}" \ - "deregister ${project} ${pubkey}" 2>&1) || { + "deregister ${project}" 2>&1) || { echo "Error: failed to deregister tunnel" >&2 echo "Response: ${response}" >&2 exit 1 @@ -2984,33 +2495,6 @@ EOF esac } -# ── backup command ──────────────────────────────────────────────────────────── -# Usage: disinto backup [args] -# Subcommands: -# create Create backup of factory state -# import Restore factory state from backup -disinto_backup() { - local subcmd="${1:-}" - shift || true - - case "$subcmd" in - create) - backup_create "$@" - ;; - import) - backup_import "$@" - ;; - *) - echo "Usage: disinto backup [args]" >&2 - echo "" >&2 - echo "Subcommands:" >&2 - echo " create Create backup of factory state" >&2 - echo " import Restore factory state from backup" >&2 - exit 1 - ;; - esac -} - # ── Main dispatch ──────────────────────────────────────────────────────────── case "${1:-}" in @@ -3027,7 +2511,6 @@ case "${1:-}" in hire-an-agent) shift; disinto_hire_an_agent "$@" ;; agent) shift; disinto_agent "$@" ;; edge) shift; disinto_edge "$@" ;; - backup) shift; disinto_backup "$@" ;; -h|--help) usage ;; *) usage ;; esac diff --git a/dev/AGENTS.md b/dev/AGENTS.md index d48f6b6..481bb1f 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/docker-compose.yml b/docker-compose.yml index 6206b2c..ba8c77c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,6 +15,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - 
${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -77,6 +78,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -137,6 +139,7 @@ services: - project-repos:/home/agent/repos - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro + - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - woodpecker-data:/woodpecker-data:ro @@ -208,8 +211,8 @@ services: edge: build: - context: . 
- dockerfile: docker/edge/Dockerfile + context: docker/edge + dockerfile: Dockerfile image: disinto/edge:latest container_name: disinto-edge security_opt: @@ -220,8 +223,6 @@ services: - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/root/.claude.json:ro - ${CLAUDE_DIR:-${HOME}/.claude}:/root/.claude:ro - disinto-logs:/opt/disinto-logs - # Chat history persistence (merged from chat container, #1083) - - ${CHAT_HISTORY_DIR:-./state/chat-history}:/var/lib/chat/history environment: - FORGE_SUPERVISOR_TOKEN=${FORGE_SUPERVISOR_TOKEN:-} - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} @@ -233,17 +234,6 @@ services: - PRIMARY_BRANCH=main - DISINTO_CONTAINER=1 - FORGE_ADMIN_USERS=disinto-admin,vault-bot,admin - # Chat env vars (merged from chat container into edge, #1083) - - CHAT_HOST=127.0.0.1 - - CHAT_PORT=8080 - - CHAT_OAUTH_CLIENT_ID=${CHAT_OAUTH_CLIENT_ID:-} - - CHAT_OAUTH_CLIENT_SECRET=${CHAT_OAUTH_CLIENT_SECRET:-} - - DISINTO_CHAT_ALLOWED_USERS=${DISINTO_CHAT_ALLOWED_USERS:-} - - FORWARD_AUTH_SECRET=${FORWARD_AUTH_SECRET:-} - - EDGE_TUNNEL_FQDN=${EDGE_TUNNEL_FQDN:-} - - EDGE_TUNNEL_FQDN_CHAT=${EDGE_TUNNEL_FQDN_CHAT:-} - - EDGE_ROUTING_MODE=${EDGE_ROUTING_MODE:-subpath} - # Rate limiting removed (#1084) ports: - "80:80" - "443:443" diff --git a/docker/agents/Dockerfile b/docker/agents/Dockerfile index fa3b2d8..1bcba89 100644 --- a/docker/agents/Dockerfile +++ b/docker/agents/Dockerfile @@ -1,26 +1,21 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ - bash curl git jq tmux nodejs npm python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ + bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ && pip3 install --break-system-packages networkx tomlkit \ && rm -rf /var/lib/apt/lists/* # Pre-built binaries (copied from docker/agents/bin/) # SOPS — encrypted data decryption tool -# Download sops binary (replaces manual COPY of vendored binary) -ARG 
SOPS_VERSION=3.9.4 -RUN curl -fsSL "https://github.com/getsops/sops/releases/download/v${SOPS_VERSION}/sops-v${SOPS_VERSION}.linux.amd64" \ - -o /usr/local/bin/sops && chmod +x /usr/local/bin/sops +COPY docker/agents/bin/sops /usr/local/bin/sops +RUN chmod +x /usr/local/bin/sops # tea CLI — official Gitea/Forgejo CLI for issue/label/comment operations -# Download tea binary (replaces manual COPY of vendored binary) -ARG TEA_VERSION=0.9.2 -RUN curl -fsSL "https://dl.gitea.com/tea/${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \ - -o /usr/local/bin/tea && chmod +x /usr/local/bin/tea +COPY docker/agents/bin/tea /usr/local/bin/tea +RUN chmod +x /usr/local/bin/tea -# Install Claude Code CLI — agent runtime for all LLM backends (llama, Claude API). -# The CLI is the execution environment; ANTHROPIC_BASE_URL selects the model provider. -RUN npm install -g @anthropic-ai/claude-code@2.1.84 +# Claude CLI is mounted from the host via docker-compose volume. +# No internet access to cli.anthropic.com required at build time. # Non-root user RUN useradd -m -u 1000 -s /bin/bash agent diff --git a/docker/agents/entrypoint.sh b/docker/agents/entrypoint.sh index 7c58674..89a520b 100644 --- a/docker/agents/entrypoint.sh +++ b/docker/agents/entrypoint.sh @@ -17,38 +17,6 @@ set -euo pipefail # - predictor: every 24 hours (288 iterations * 5 min) # - supervisor: every SUPERVISOR_INTERVAL seconds (default: 1200 = 20 min) -# ── Migration check: reject ENABLE_LLAMA_AGENT ─────────────────────────────── -# #846: The legacy ENABLE_LLAMA_AGENT env flag is no longer supported. -# Activation is now done exclusively via [agents.X] sections in project TOML. -# If this legacy flag is detected, fail immediately with a migration message. -if [ "${ENABLE_LLAMA_AGENT:-}" = "1" ]; then - cat <<'MIGRATION_ERR' -FATAL: ENABLE_LLAMA_AGENT is no longer supported. - -The legacy ENABLE_LLAMA_AGENT=1 flag has been removed (#846). -Activation is now done exclusively via [agents.X] sections in projects/*.toml. 
- -To migrate: - 1. Remove ENABLE_LLAMA_AGENT from your .env or .env.enc file - 2. Add an [agents.] section to your project TOML: - - [agents.dev-qwen] - base_url = "http://your-llama-server:8081" - model = "unsloth/Qwen3.5-35B-A3B" - api_key = "sk-no-key-required" - roles = ["dev"] - forge_user = "dev-qwen" - compact_pct = 60 - poll_interval = 60 - - 3. Run: disinto init - 4. Start the agent: docker compose up -d agents-dev-qwen - -See docs/agents-llama.md for full details. -MIGRATION_ERR - exit 1 -fi - DISINTO_BAKED="/home/agent/disinto" DISINTO_LIVE="/home/agent/repos/_factory" DISINTO_DIR="$DISINTO_BAKED" # start with baked copy; switched to live checkout after bootstrap @@ -378,19 +346,15 @@ bootstrap_factory_repo # This prevents the silent-zombie mode where the polling loop matches zero files # and does nothing forever. validate_projects_dir() { - # NOTE: compgen -G exits non-zero when no matches exist, so piping it through - # `wc -l` under `set -eo pipefail` aborts the script before the FATAL branch - # can log a diagnostic (#877). Use the conditional form already adopted at - # lines above (see bootstrap_factory_repo, PROJECT_NAME parsing). - if ! compgen -G "${DISINTO_DIR}/projects/*.toml" >/dev/null 2>&1; then + local toml_count + toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" 2>/dev/null | wc -l) + if [ "$toml_count" -eq 0 ]; then log "FATAL: No real .toml files found in ${DISINTO_DIR}/projects/" log "Expected at least one project config file (e.g., disinto.toml)" log "The directory only contains *.toml.example template files." log "Mount the host ./projects volume or copy real .toml files into the container." 
exit 1 fi - local toml_count - toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" | wc -l) log "Projects directory validated: ${toml_count} real .toml file(s) found" } diff --git a/docker/chat/Dockerfile b/docker/chat/Dockerfile new file mode 100644 index 0000000..3d89863 --- /dev/null +++ b/docker/chat/Dockerfile @@ -0,0 +1,35 @@ +# disinto-chat — minimal HTTP backend for Claude chat UI +# +# Small Debian slim base with Python runtime. +# Chosen for simplicity and small image size (~100MB). +# +# Image size: ~100MB (well under the 200MB ceiling) +# +# The claude binary is mounted from the host at runtime via docker-compose, +# not baked into the image — same pattern as the agents container. + +FROM debian:bookworm-slim + +# Install Python (no build-time network access needed) +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 \ + && rm -rf /var/lib/apt/lists/* + +# Non-root user — fixed UID 10001 for sandbox hardening (#706) +RUN useradd -m -u 10001 -s /bin/bash chat + +# Copy application files +COPY server.py /usr/local/bin/server.py +COPY entrypoint-chat.sh /entrypoint-chat.sh +COPY ui/ /var/chat/ui/ + +RUN chmod +x /entrypoint-chat.sh /usr/local/bin/server.py + +USER chat +WORKDIR /var/chat + +EXPOSE 8080 +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health')" || exit 1 + +ENTRYPOINT ["/entrypoint-chat.sh"] diff --git a/docker/chat/entrypoint-chat.sh b/docker/chat/entrypoint-chat.sh new file mode 100755 index 0000000..00fbe53 --- /dev/null +++ b/docker/chat/entrypoint-chat.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -euo pipefail + +# entrypoint-chat.sh — Start the disinto-chat backend server +# +# Exec-replace pattern: this script is the container entrypoint and runs +# the server directly (no wrapper needed). Logs to stdout for docker logs. 
+ +LOGFILE="/tmp/chat.log" + +log() { + printf '[%s] %s\n' "$(date -u '+%Y-%m-%d %H:%M:%S UTC')" "$*" | tee -a "$LOGFILE" +} + +# Sandbox sanity checks (#706) — fail fast if isolation is broken +if [ -e /var/run/docker.sock ]; then + log "FATAL: /var/run/docker.sock is accessible — sandbox violation" + exit 1 +fi +if [ "$(id -u)" = "0" ]; then + log "FATAL: running as root (uid 0) — sandbox violation" + exit 1 +fi + +# Verify Claude CLI is available (expected via volume mount from host). +if ! command -v claude &>/dev/null; then + log "FATAL: claude CLI not found in PATH" + log "Mount the host binary into the container, e.g.:" + log " volumes:" + log " - /usr/local/bin/claude:/usr/local/bin/claude:ro" + exit 1 +fi +log "Claude CLI: $(claude --version 2>&1 || true)" + +# Start the Python server (exec-replace so signals propagate correctly) +log "Starting disinto-chat server on port 8080..." +exec python3 /usr/local/bin/server.py diff --git a/docker/chat/server.py b/docker/chat/server.py index 48944d1..6748354 100644 --- a/docker/chat/server.py +++ b/docker/chat/server.py @@ -20,15 +20,9 @@ OAuth flow: 6. Redirects to /chat/ The claude binary is expected to be mounted from the host at /usr/local/bin/claude. 
- -Workspace access: - - CHAT_WORKSPACE_DIR environment variable: bind-mounted project working tree - - Claude invocation uses --permission-mode acceptEdits for code modification - - CWD is set to workspace directory when configured, enabling Claude to - inspect, explain, or modify code scoped to that tree only """ -import asyncio +import datetime import json import os import re @@ -36,33 +30,21 @@ import secrets import subprocess import sys import time -import threading from http.server import HTTPServer, BaseHTTPRequestHandler -from socketserver import ThreadingMixIn from urllib.parse import urlparse, parse_qs, urlencode -import socket -import struct -import base64 -import hashlib # Configuration -HOST = os.environ.get("CHAT_HOST", "127.0.0.1") +HOST = os.environ.get("CHAT_HOST", "0.0.0.0") PORT = int(os.environ.get("CHAT_PORT", 8080)) UI_DIR = "/var/chat/ui" STATIC_DIR = os.path.join(UI_DIR, "static") CLAUDE_BIN = "/usr/local/bin/claude" -# Workspace directory: bind-mounted project working tree for Claude access -# Defaults to empty; when set, Claude can read/write to this directory -WORKSPACE_DIR = os.environ.get("CHAT_WORKSPACE_DIR", "") - # OAuth configuration FORGE_URL = os.environ.get("FORGE_URL", "http://localhost:3000") CHAT_OAUTH_CLIENT_ID = os.environ.get("CHAT_OAUTH_CLIENT_ID", "") CHAT_OAUTH_CLIENT_SECRET = os.environ.get("CHAT_OAUTH_CLIENT_SECRET", "") EDGE_TUNNEL_FQDN = os.environ.get("EDGE_TUNNEL_FQDN", "") -EDGE_TUNNEL_FQDN_CHAT = os.environ.get("EDGE_TUNNEL_FQDN_CHAT", "") -EDGE_ROUTING_MODE = os.environ.get("EDGE_ROUTING_MODE", "subpath") # Shared secret for Caddy forward_auth verify endpoint (#709). # When set, only requests carrying this value in X-Forward-Auth-Secret are @@ -70,6 +52,10 @@ EDGE_ROUTING_MODE = os.environ.get("EDGE_ROUTING_MODE", "subpath") # (acceptable during local dev; production MUST set this). 
FORWARD_AUTH_SECRET = os.environ.get("FORWARD_AUTH_SECRET", "") +# Rate limiting / cost caps (#711) +CHAT_MAX_REQUESTS_PER_HOUR = int(os.environ.get("CHAT_MAX_REQUESTS_PER_HOUR", 60)) +CHAT_MAX_REQUESTS_PER_DAY = int(os.environ.get("CHAT_MAX_REQUESTS_PER_DAY", 500)) +CHAT_MAX_TOKENS_PER_DAY = int(os.environ.get("CHAT_MAX_TOKENS_PER_DAY", 1000000)) # Allowed users - disinto-admin always allowed; CSV allowlist extends it _allowed_csv = os.environ.get("DISINTO_CHAT_ALLOWED_USERS", "") @@ -95,10 +81,11 @@ _sessions = {} # Pending OAuth state tokens: state -> expires (float) _oauth_states = {} - -# WebSocket message queues per user -# user -> asyncio.Queue (for streaming messages to connected clients) -_websocket_queues = {} +# Per-user rate limiting state (#711) +# user -> list of request timestamps (for sliding-window hourly/daily caps) +_request_log = {} +# user -> {"tokens": int, "date": "YYYY-MM-DD"} +_daily_tokens = {} # MIME types for static files MIME_TYPES = { @@ -112,22 +99,9 @@ MIME_TYPES = { ".ico": "image/x-icon", } -# WebSocket subprotocol for chat streaming -WEBSOCKET_SUBPROTOCOL = "chat-stream-v1" - -# WebSocket opcodes -OPCODE_CONTINUATION = 0x0 -OPCODE_TEXT = 0x1 -OPCODE_BINARY = 0x2 -OPCODE_CLOSE = 0x8 -OPCODE_PING = 0x9 -OPCODE_PONG = 0xA - def _build_callback_uri(): """Build the OAuth callback URI based on tunnel configuration.""" - if EDGE_ROUTING_MODE == "subdomain" and EDGE_TUNNEL_FQDN_CHAT: - return f"https://{EDGE_TUNNEL_FQDN_CHAT}/oauth/callback" if EDGE_TUNNEL_FQDN: return f"https://{EDGE_TUNNEL_FQDN}/chat/oauth/callback" return "http://localhost/chat/oauth/callback" @@ -213,9 +187,69 @@ def _fetch_user(access_token): return None +# ============================================================================= +# Rate Limiting Functions (#711) +# ============================================================================= + +def _check_rate_limit(user): + """Check per-user rate limits. Returns (allowed, retry_after, reason) (#711). 
+ + Checks hourly request cap, daily request cap, and daily token cap. + """ + now = time.time() + one_hour_ago = now - 3600 + today = datetime.date.today().isoformat() + + # Prune old entries from request log + timestamps = _request_log.get(user, []) + timestamps = [t for t in timestamps if t > now - 86400] + _request_log[user] = timestamps + + # Hourly request cap + hourly = [t for t in timestamps if t > one_hour_ago] + if len(hourly) >= CHAT_MAX_REQUESTS_PER_HOUR: + oldest_in_window = min(hourly) + retry_after = int(oldest_in_window + 3600 - now) + 1 + return False, max(retry_after, 1), "hourly request limit" + + # Daily request cap + start_of_day = time.mktime(datetime.date.today().timetuple()) + daily = [t for t in timestamps if t >= start_of_day] + if len(daily) >= CHAT_MAX_REQUESTS_PER_DAY: + next_day = start_of_day + 86400 + retry_after = int(next_day - now) + 1 + return False, max(retry_after, 1), "daily request limit" + + # Daily token cap + token_info = _daily_tokens.get(user, {"tokens": 0, "date": today}) + if token_info["date"] != today: + token_info = {"tokens": 0, "date": today} + _daily_tokens[user] = token_info + if token_info["tokens"] >= CHAT_MAX_TOKENS_PER_DAY: + next_day = start_of_day + 86400 + retry_after = int(next_day - now) + 1 + return False, max(retry_after, 1), "daily token limit" + + return True, 0, "" + + +def _record_request(user): + """Record a request timestamp for the user (#711).""" + _request_log.setdefault(user, []).append(time.time()) + + +def _record_tokens(user, tokens): + """Record token usage for the user (#711).""" + today = datetime.date.today().isoformat() + token_info = _daily_tokens.get(user, {"tokens": 0, "date": today}) + if token_info["date"] != today: + token_info = {"tokens": 0, "date": today} + token_info["tokens"] += tokens + _daily_tokens[user] = token_info + def _parse_stream_json(output): - """Parse stream-json output from claude --print. + """Parse stream-json output from claude --print (#711). 
Returns (text_content, total_tokens). Falls back gracefully if the usage event is absent or malformed. @@ -261,313 +295,6 @@ def _parse_stream_json(output): return "".join(text_parts), total_tokens -# ============================================================================= -# WebSocket Handler Class -# ============================================================================= - -class _WebSocketHandler: - """Handle WebSocket connections for chat streaming.""" - - def __init__(self, reader, writer, user, message_queue): - self.reader = reader - self.writer = writer - self.user = user - self.message_queue = message_queue - self.closed = False - - async def accept_connection(self, sec_websocket_key, sec_websocket_protocol=None): - """Accept the WebSocket handshake. - - The HTTP request has already been parsed by BaseHTTPRequestHandler, - so we use the provided key and protocol instead of re-reading from socket. - """ - # Validate subprotocol - if sec_websocket_protocol and sec_websocket_protocol != WEBSOCKET_SUBPROTOCOL: - self._send_http_error( - 400, - "Bad Request", - f"Unsupported subprotocol. 
Expected: {WEBSOCKET_SUBPROTOCOL}", - ) - self._close_connection() - return False - - # Generate accept key - accept_key = self._generate_accept_key(sec_websocket_key) - - # Send handshake response - response = ( - "HTTP/1.1 101 Switching Protocols\r\n" - "Upgrade: websocket\r\n" - "Connection: Upgrade\r\n" - f"Sec-WebSocket-Accept: {accept_key}\r\n" - ) - - if sec_websocket_protocol: - response += f"Sec-WebSocket-Protocol: {sec_websocket_protocol}\r\n" - - response += "\r\n" - self.writer.write(response.encode("utf-8")) - await self.writer.drain() - return True - - def _generate_accept_key(self, sec_key): - """Generate the Sec-WebSocket-Accept key.""" - GUID = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11" - combined = sec_key + GUID - sha1 = hashlib.sha1(combined.encode("utf-8")) - return base64.b64encode(sha1.digest()).decode("utf-8") - - async def _read_line(self): - """Read a line from the socket.""" - data = await self.reader.read(1) - line = "" - while data: - if data == b"\r": - data = await self.reader.read(1) - continue - if data == b"\n": - return line - line += data.decode("utf-8", errors="replace") - data = await self.reader.read(1) - return line - - def _send_http_error(self, code, title, message): - """Send an HTTP error response.""" - response = ( - f"HTTP/1.1 {code} {title}\r\n" - "Content-Type: text/plain; charset=utf-8\r\n" - "Content-Length: " + str(len(message)) + "\r\n" - "\r\n" - + message - ) - try: - self.writer.write(response.encode("utf-8")) - self.writer.drain() - except Exception: - pass - - def _close_connection(self): - """Close the connection.""" - try: - self.writer.close() - except Exception: - pass - - async def send_text(self, data): - """Send a text frame.""" - if self.closed: - return - try: - frame = self._encode_frame(OPCODE_TEXT, data.encode("utf-8")) - self.writer.write(frame) - await self.writer.drain() - except Exception as e: - print(f"WebSocket send error: {e}", file=sys.stderr) - - async def send_binary(self, data): - """Send 
a binary frame.""" - if self.closed: - return - try: - if isinstance(data, str): - data = data.encode("utf-8") - frame = self._encode_frame(OPCODE_BINARY, data) - self.writer.write(frame) - await self.writer.drain() - except Exception as e: - print(f"WebSocket send error: {e}", file=sys.stderr) - - def _encode_frame(self, opcode, payload): - """Encode a WebSocket frame.""" - frame = bytearray() - frame.append(0x80 | opcode) # FIN + opcode - - length = len(payload) - if length < 126: - frame.append(length) - elif length < 65536: - frame.append(126) - frame.extend(struct.pack(">H", length)) - else: - frame.append(127) - frame.extend(struct.pack(">Q", length)) - - frame.extend(payload) - return bytes(frame) - - async def _decode_frame(self): - """Decode a WebSocket frame. Returns (opcode, payload).""" - try: - # Read first two bytes (use readexactly for guaranteed length) - header = await self.reader.readexactly(2) - - fin = (header[0] >> 7) & 1 - opcode = header[0] & 0x0F - masked = (header[1] >> 7) & 1 - length = header[1] & 0x7F - - # Extended payload length - if length == 126: - ext = await self.reader.readexactly(2) - length = struct.unpack(">H", ext)[0] - elif length == 127: - ext = await self.reader.readexactly(8) - length = struct.unpack(">Q", ext)[0] - - # Masking key - if masked: - mask_key = await self.reader.readexactly(4) - - # Payload - payload = await self.reader.readexactly(length) - - # Unmask if needed - if masked: - payload = bytes(b ^ mask_key[i % 4] for i, b in enumerate(payload)) - - return opcode, payload - except Exception as e: - print(f"WebSocket decode error: {e}", file=sys.stderr) - return None, None - - async def handle_connection(self): - """Handle the WebSocket connection loop.""" - try: - while not self.closed: - opcode, payload = await self._decode_frame() - if opcode is None: - break - - if opcode == OPCODE_CLOSE: - await self._send_close() - break - elif opcode == OPCODE_PING: - await self._send_pong(payload) - elif opcode == 
OPCODE_PONG: - pass # Ignore pong - elif opcode in (OPCODE_TEXT, OPCODE_BINARY): - # Handle text messages from client (e.g., chat_request) - try: - msg = payload.decode("utf-8") - data = json.loads(msg) - if data.get("type") == "chat_request": - # Invoke Claude with the message - await self._handle_chat_request(data.get("message", "")) - except (json.JSONDecodeError, UnicodeDecodeError): - pass - - # Check if we should stop waiting for messages - if self.closed: - break - - except Exception as e: - print(f"WebSocket connection error: {e}", file=sys.stderr) - finally: - self._close_connection() - # Clean up the message queue on disconnect - if self.user in _websocket_queues: - del _websocket_queues[self.user] - - async def _send_close(self): - """Send a close frame.""" - try: - # Close code 1000 = normal closure - frame = self._encode_frame(OPCODE_CLOSE, struct.pack(">H", 1000)) - self.writer.write(frame) - await self.writer.drain() - except Exception: - pass - - async def _send_pong(self, payload): - """Send a pong frame.""" - try: - frame = self._encode_frame(OPCODE_PONG, payload) - self.writer.write(frame) - await self.writer.drain() - except Exception: - pass - - async def _handle_chat_request(self, message): - """Handle a chat_request WebSocket frame by invoking Claude.""" - if not message: - return - - # Validate Claude binary exists - if not os.path.exists(CLAUDE_BIN): - await self.send_text(json.dumps({ - "type": "error", - "message": "Claude CLI not found", - })) - return - - try: - # Build claude command with permission mode (acceptEdits allows file edits) - claude_args = [CLAUDE_BIN, "--print", "--output-format", "stream-json", "--permission-mode", "acceptEdits", message] - - # Spawn claude --print with stream-json for streaming output - # Set cwd to workspace directory if configured, allowing Claude to access project code - cwd = WORKSPACE_DIR if WORKSPACE_DIR else None - proc = subprocess.Popen( - claude_args, - stdout=subprocess.PIPE, - 
stderr=subprocess.PIPE, - text=True, - cwd=cwd, - bufsize=1, - ) - - # Stream output line by line - for line in iter(proc.stdout.readline, ""): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - etype = event.get("type", "") - - # Extract text content from content_block_delta events - if etype == "content_block_delta": - delta = event.get("delta", {}) - if delta.get("type") == "text_delta": - text = delta.get("text", "") - if text: - # Send tokens to client - await self.send_text(text) - - # Check for usage event to know when complete - if etype == "result": - pass # Will send complete after loop - - except json.JSONDecodeError: - pass - - # Wait for process to complete - proc.wait() - - if proc.returncode != 0: - await self.send_text(json.dumps({ - "type": "error", - "message": f"Claude CLI failed with exit code {proc.returncode}", - })) - return - - # Send complete signal - await self.send_text(json.dumps({ - "type": "complete", - })) - - except FileNotFoundError: - await self.send_text(json.dumps({ - "type": "error", - "message": "Claude CLI not found", - })) - except Exception as e: - await self.send_text(json.dumps({ - "type": "error", - "message": str(e), - })) - - # ============================================================================= # Conversation History Functions (#710) # ============================================================================= @@ -817,9 +544,9 @@ class ChatHandler(BaseHTTPRequestHandler): self.serve_static(path) return - # WebSocket upgrade endpoint - if path == "/chat/ws" or path == "/ws" or path.startswith("/ws"): - self.handle_websocket_upgrade() + # Reserved WebSocket endpoint (future use) + if path == "/ws" or path.startswith("/ws"): + self.send_error_page(501, "WebSocket upgrade not yet implemented") return # 404 for unknown paths @@ -1009,13 +736,33 @@ class ChatHandler(BaseHTTPRequestHandler): except IOError as e: self.send_error_page(500, f"Error reading file: {e}") - + def 
_send_rate_limit_response(self, retry_after, reason): + """Send a 429 response with Retry-After header and HTMX fragment (#711).""" + body = ( + f'
' + f"Rate limit exceeded: {reason}. " + f"Please try again in {retry_after} seconds." + f"
" + ) + self.send_response(429) + self.send_header("Retry-After", str(retry_after)) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body.encode("utf-8")))) + self.end_headers() + self.wfile.write(body.encode("utf-8")) + def handle_chat(self, user): """ Handle chat requests by spawning `claude --print` with the user message. - Streams tokens over WebSocket if connected. + Enforces per-user rate limits and tracks token usage (#711). """ + # Check rate limits before processing (#711) + allowed, retry_after, reason = _check_rate_limit(user) + if not allowed: + self._send_rate_limit_response(retry_after, reason) + return + # Read request body content_length = int(self.headers.get("Content-Length", 0)) if content_length == 0: @@ -1052,63 +799,23 @@ class ChatHandler(BaseHTTPRequestHandler): if not conv_id or not _validate_conversation_id(conv_id): conv_id = _generate_conversation_id() + # Record request for rate limiting (#711) + _record_request(user) + try: # Save user message to history _write_message(user, conv_id, "user", message) - # Build claude command with permission mode (acceptEdits allows file edits) - claude_args = [CLAUDE_BIN, "--print", "--output-format", "stream-json", "--permission-mode", "acceptEdits", message] - # Spawn claude --print with stream-json for token tracking (#711) - # Set cwd to workspace directory if configured, allowing Claude to access project code - cwd = WORKSPACE_DIR if WORKSPACE_DIR else None proc = subprocess.Popen( - claude_args, + [CLAUDE_BIN, "--print", "--output-format", "stream-json", message], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, - cwd=cwd, - bufsize=1, # Line buffered ) - # Stream output line by line - response_parts = [] - total_tokens = 0 - for line in iter(proc.stdout.readline, ""): - line = line.strip() - if not line: - continue - try: - event = json.loads(line) - etype = event.get("type", "") + raw_output = proc.stdout.read() - # Extract 
text content from content_block_delta events - if etype == "content_block_delta": - delta = event.get("delta", {}) - if delta.get("type") == "text_delta": - text = delta.get("text", "") - if text: - response_parts.append(text) - # Stream to WebSocket if connected - if user in _websocket_queues: - try: - _websocket_queues[user].put_nowait(text) - except Exception: - pass # Client disconnected - - # Parse usage from result event - if etype == "result": - usage = event.get("usage", {}) - total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) - elif "usage" in event: - usage = event["usage"] - if isinstance(usage, dict): - total_tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0) - - except json.JSONDecodeError: - pass - - # Wait for process to complete error_output = proc.stderr.read() if error_output: print(f"Claude stderr: {error_output}", file=sys.stderr) @@ -1119,12 +826,20 @@ class ChatHandler(BaseHTTPRequestHandler): self.send_error_page(500, f"Claude CLI failed with exit code {proc.returncode}") return - # Combine response parts - response = "".join(response_parts) + # Parse stream-json for text and token usage (#711) + response, total_tokens = _parse_stream_json(raw_output) + + # Track token usage - does not block *this* request (#711) + if total_tokens > 0: + _record_tokens(user, total_tokens) + print( + f"Token usage: user={user} tokens={total_tokens}", + file=sys.stderr, + ) # Fall back to raw output if stream-json parsing yielded no text if not response: - response = proc.stdout.getvalue() if hasattr(proc.stdout, 'getvalue') else "" + response = raw_output # Save assistant response to history _write_message(user, conv_id, "assistant", response) @@ -1194,106 +909,6 @@ class ChatHandler(BaseHTTPRequestHandler): self.end_headers() self.wfile.write(json.dumps({"conversation_id": conv_id}, ensure_ascii=False).encode("utf-8")) - @staticmethod - def push_to_websocket(user, message): - """Push a message to a WebSocket 
connection for a user. - - This is called from the chat handler to stream tokens to connected clients. - The message is added to the user's WebSocket message queue. - """ - # Get the message queue from the WebSocket handler's queue - # We store the queue in a global dict keyed by user - if user in _websocket_queues: - _websocket_queues[user].put_nowait(message) - - def handle_websocket_upgrade(self): - """Handle WebSocket upgrade request for chat streaming.""" - # Check session cookie - user = _validate_session(self.headers.get("Cookie")) - if not user: - self.send_error_page(401, "Unauthorized: no valid session") - return - - # Create message queue for this user - _websocket_queues[user] = asyncio.Queue() - - # Get WebSocket upgrade headers from the HTTP request - sec_websocket_key = self.headers.get("Sec-WebSocket-Key", "") - sec_websocket_protocol = self.headers.get("Sec-WebSocket-Protocol", "") - - # Validate Sec-WebSocket-Key - if not sec_websocket_key: - self.send_error_page(400, "Bad Request", "Missing Sec-WebSocket-Key") - return - - # Get the socket from the connection - sock = self.connection - sock.setblocking(False) - - # Create async server to handle the connection - async def handle_ws(): - try: - # Wrap the socket in asyncio streams using open_connection - reader, writer = await asyncio.open_connection(sock=sock) - - # Create WebSocket handler - ws_handler = _WebSocketHandler(reader, writer, user, _websocket_queues[user]) - - # Accept the connection (pass headers from HTTP request) - if not await ws_handler.accept_connection(sec_websocket_key, sec_websocket_protocol): - return - - # Start a task to read from the queue and send to client - async def send_stream(): - while not ws_handler.closed: - try: - data = await asyncio.wait_for(ws_handler.message_queue.get(), timeout=1.0) - await ws_handler.send_text(data) - except asyncio.TimeoutError: - # Send ping to keep connection alive - try: - frame = ws_handler._encode_frame(OPCODE_PING, b"") - 
writer.write(frame) - await writer.drain() - except Exception: - break - except Exception as e: - print(f"Send stream error: {e}", file=sys.stderr) - break - - # Start sending task - send_task = asyncio.create_task(send_stream()) - - # Handle incoming WebSocket frames - await ws_handler.handle_connection() - - # Cancel send task - send_task.cancel() - try: - await send_task - except asyncio.CancelledError: - pass - - except Exception as e: - print(f"WebSocket handler error: {e}", file=sys.stderr) - finally: - try: - writer.close() - await writer.wait_closed() - except Exception: - pass - - # Run the async handler in a thread - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - try: - loop.run_until_complete(handle_ws()) - except Exception as e: - print(f"WebSocket error: {e}", file=sys.stderr) - finally: - loop.close() - sock.close() - def do_DELETE(self): """Handle DELETE requests.""" parsed = urlparse(self.path) @@ -1329,6 +944,12 @@ def main(): print("forward_auth secret configured (#709)", file=sys.stderr) else: print("WARNING: FORWARD_AUTH_SECRET not set - verify endpoint unrestricted", file=sys.stderr) + print( + f"Rate limits (#711): {CHAT_MAX_REQUESTS_PER_HOUR}/hr, " + f"{CHAT_MAX_REQUESTS_PER_DAY}/day, " + f"{CHAT_MAX_TOKENS_PER_DAY} tokens/day", + file=sys.stderr, + ) httpd.serve_forever() diff --git a/docker/chat/ui/index.html b/docker/chat/ui/index.html index b045873..bd920f9 100644 --- a/docker/chat/ui/index.html +++ b/docker/chat/ui/index.html @@ -430,10 +430,6 @@ return div.innerHTML.replace(/\n/g, '
'); } - // WebSocket connection for streaming - let ws = null; - let wsMessageId = null; - // Send message handler async function sendMessage() { const message = textarea.value.trim(); @@ -453,14 +449,6 @@ await createNewConversation(); } - // Try WebSocket streaming first, fall back to fetch - if (window.location.protocol === 'https:' || window.location.hostname === 'localhost') { - if (tryWebSocketSend(message)) { - return; - } - } - - // Fallback to fetch try { // Use fetch with URLSearchParams for application/x-www-form-urlencoded const params = new URLSearchParams(); @@ -497,111 +485,6 @@ } } - // Try to send message via WebSocket streaming - function tryWebSocketSend(message) { - try { - // Generate a unique message ID for this request - wsMessageId = Date.now().toString(36) + Math.random().toString(36).substr(2); - - // Connect to WebSocket - const wsUrl = window.location.protocol === 'https:' - ? `wss://${window.location.host}/chat/ws` - : `ws://${window.location.host}/chat/ws`; - - ws = new WebSocket(wsUrl); - - ws.onopen = function() { - // Send the message as JSON with message ID - const data = { - type: 'chat_request', - message_id: wsMessageId, - message: message, - conversation_id: currentConversationId - }; - ws.send(JSON.stringify(data)); - }; - - ws.onmessage = function(event) { - try { - const data = JSON.parse(event.data); - - if (data.type === 'token') { - // Stream a token to the UI - addTokenToLastMessage(data.token); - } else if (data.type === 'complete') { - // Streaming complete - closeWebSocket(); - textarea.disabled = false; - sendBtn.disabled = false; - sendBtn.textContent = 'Send'; - textarea.focus(); - messagesDiv.scrollTop = messagesDiv.scrollHeight; - loadConversations(); - } else if (data.type === 'error') { - addSystemMessage(`Error: ${data.message}`); - closeWebSocket(); - textarea.disabled = false; - sendBtn.disabled = false; - sendBtn.textContent = 'Send'; - textarea.focus(); - } - } catch (e) { - console.error('Failed to parse 
WebSocket message:', e); - } - }; - - ws.onerror = function(error) { - console.error('WebSocket error:', error); - addSystemMessage('WebSocket connection error. Falling back to regular chat.'); - closeWebSocket(); - sendMessage(); // Retry with fetch - }; - - ws.onclose = function() { - wsMessageId = null; - }; - - return true; // WebSocket attempt started - - } catch (error) { - console.error('Failed to create WebSocket:', error); - return false; // Fall back to fetch - } - } - - // Add a token to the last assistant message (for streaming) - function addTokenToLastMessage(token) { - const messages = messagesDiv.querySelectorAll('.message.assistant'); - if (messages.length === 0) { - // No assistant message yet, create one - const msgDiv = document.createElement('div'); - msgDiv.className = 'message assistant'; - msgDiv.innerHTML = ` -
assistant
-
- `; - messagesDiv.appendChild(msgDiv); - } - - const lastMsg = messagesDiv.querySelector('.message.assistant .content.streaming'); - if (lastMsg) { - lastMsg.textContent += token; - messagesDiv.scrollTop = messagesDiv.scrollHeight; - } - } - - // Close WebSocket connection - function closeWebSocket() { - if (ws) { - ws.onopen = null; - ws.onmessage = null; - ws.onerror = null; - ws.onclose = null; - ws.close(); - ws = null; - } - } - // Event listeners sendBtn.addEventListener('click', sendMessage); diff --git a/docker/edge/Dockerfile b/docker/edge/Dockerfile index 507c39b..eca7d7e 100644 --- a/docker/edge/Dockerfile +++ b/docker/edge/Dockerfile @@ -1,12 +1,6 @@ FROM caddy:latest -RUN apk add --no-cache bash jq curl git docker-cli python3 openssh-client autossh \ - nodejs npm -# Claude Code CLI — chat backend runtime (merged from docker/chat, #1083) -RUN npm install -g @anthropic-ai/claude-code@2.1.84 -COPY docker/edge/entrypoint-edge.sh /usr/local/bin/entrypoint-edge.sh -# Chat server and UI (merged from docker/chat into edge, #1083) -COPY docker/chat/server.py /usr/local/bin/chat-server.py -COPY docker/chat/ui/ /var/chat/ui/ +RUN apk add --no-cache bash jq curl git docker-cli python3 openssh-client autossh +COPY entrypoint-edge.sh /usr/local/bin/entrypoint-edge.sh VOLUME /data diff --git a/docker/edge/dispatcher.sh b/docker/edge/dispatcher.sh index 282342a..a48abf2 100755 --- a/docker/edge/dispatcher.sh +++ b/docker/edge/dispatcher.sh @@ -560,168 +560,10 @@ _launch_runner_docker() { # _launch_runner_nomad ACTION_ID SECRETS_CSV MOUNTS_CSV # -# Dispatches a vault-runner batch job via `nomad job dispatch`. -# Polls `nomad job status` until terminal state (completed/failed). -# Reads exit code from allocation and writes .result.json. -# -# Usage: _launch_runner_nomad -# Returns: exit code of the nomad job (0=success, non-zero=failure) +# Nomad backend stub — will be implemented in migration Step 5. 
_launch_runner_nomad() { - local action_id="$1" - local secrets_csv="$2" - local mounts_csv="$3" - - log "Dispatching vault-runner batch job via Nomad for action: ${action_id}" - - # Dispatch the parameterized batch job - # The vault-runner job expects meta: action_id, secrets_csv - # Note: mounts_csv is not passed as meta (not declared in vault-runner.hcl) - local dispatch_output - dispatch_output=$(nomad job dispatch \ - -detach \ - -meta action_id="$action_id" \ - -meta secrets_csv="$secrets_csv" \ - vault-runner 2>&1) || { - log "ERROR: Failed to dispatch vault-runner job for ${action_id}" - log "Dispatch output: ${dispatch_output}" - write_result "$action_id" 1 "Nomad dispatch failed: ${dispatch_output}" - return 1 - } - - # Extract dispatched job ID from output (format: "vault-runner/dispatch--") - local dispatched_job_id - dispatched_job_id=$(echo "$dispatch_output" | grep -oP '(?<=Dispatched Job ID = ).+' || true) - - if [ -z "$dispatched_job_id" ]; then - log "ERROR: Could not extract dispatched job ID from nomad output" - log "Dispatch output: ${dispatch_output}" - write_result "$action_id" 1 "Could not extract dispatched job ID from nomad output" - return 1 - fi - - log "Dispatched vault-runner with job ID: ${dispatched_job_id}" - - # Poll job status until terminal state - # Batch jobs transition: running -> completed/failed - local max_wait=300 # 5 minutes max wait - local elapsed=0 - local poll_interval=5 - local alloc_id="" - - log "Polling nomad job status for ${dispatched_job_id}..." 
- - while [ "$elapsed" -lt "$max_wait" ]; do - # Get job status with JSON output for the dispatched child job - local job_status_json - job_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { - log "ERROR: Failed to get job status for ${dispatched_job_id}" - write_result "$action_id" 1 "Failed to get job status for ${dispatched_job_id}" - return 1 - } - - # Check job status field (transitions to "dead" on completion) - local job_state - job_state=$(echo "$job_status_json" | jq -r '.Status // empty' 2>/dev/null) || job_state="" - - # Check allocation state directly - alloc_id=$(echo "$job_status_json" | jq -r '.Allocations[0]?.ID // empty' 2>/dev/null) || alloc_id="" - - if [ -n "$alloc_id" ]; then - local alloc_state - alloc_state=$(nomad alloc status -short "$alloc_id" 2>/dev/null || true) - - case "$alloc_state" in - *completed*|*success*|*dead*) - log "Allocation ${alloc_id} reached terminal state: ${alloc_state}" - break - ;; - *running*|*pending*|*starting*) - log "Allocation ${alloc_id} still running (state: ${alloc_state})..." 
- ;; - *failed*|*crashed*) - log "Allocation ${alloc_id} failed (state: ${alloc_state})" - break - ;; - esac - fi - - # Also check job-level state - case "$job_state" in - dead) - log "Job ${dispatched_job_id} reached terminal state: ${job_state}" - break - ;; - failed) - log "Job ${dispatched_job_id} failed" - break - ;; - esac - - sleep "$poll_interval" - elapsed=$((elapsed + poll_interval)) - done - - if [ "$elapsed" -ge "$max_wait" ]; then - log "ERROR: Timeout waiting for vault-runner job to complete" - write_result "$action_id" 1 "Timeout waiting for nomad job to complete" - return 1 - fi - - # Get final job status and exit code - local final_status_json - final_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || { - log "ERROR: Failed to get final job status" - write_result "$action_id" 1 "Failed to get final job status" - return 1 - } - - # Get allocation exit code - local exit_code=0 - local logs="" - - if [ -n "$alloc_id" ]; then - # Get allocation logs - logs=$(nomad alloc logs -short "$alloc_id" 2>/dev/null || true) - - # Try to get exit code from alloc status JSON - # Nomad alloc status -json has .TaskStates[""].Events[].ExitCode - local alloc_exit_code - alloc_exit_code=$(nomad alloc status -json "$alloc_id" 2>/dev/null | jq -r '.TaskStates["runner"].Events[-1].ExitCode // empty' 2>/dev/null) || alloc_exit_code="" - - if [ -n "$alloc_exit_code" ] && [ "$alloc_exit_code" != "null" ]; then - exit_code="$alloc_exit_code" - fi - fi - - # If we couldn't get exit code from alloc, check job state as fallback - # Note: "dead" = terminal state for batch jobs (includes successful completion) - # Only "failed" indicates actual failure - if [ "$exit_code" -eq 0 ]; then - local final_state - final_state=$(echo "$final_status_json" | jq -r '.Status // empty' 2>/dev/null) || final_state="" - - case "$final_state" in - failed) - exit_code=1 - ;; - esac - fi - - # Truncate logs if too long - if [ ${#logs} -gt 1000 ]; then - logs="${logs: -1000}" 
- fi - - # Write result file - write_result "$action_id" "$exit_code" "$logs" - - if [ "$exit_code" -eq 0 ]; then - log "Vault-runner job completed successfully for action: ${action_id}" - else - log "Vault-runner job failed for action: ${action_id} (exit code: ${exit_code})" - fi - - return "$exit_code" + echo "nomad backend not yet implemented" >&2 + return 1 } # Launch runner for the given action (backend-agnostic orchestrator) @@ -1209,8 +1051,11 @@ main() { # Validate backend selection at startup case "$DISPATCHER_BACKEND" in - docker|nomad) - log "Using ${DISPATCHER_BACKEND} backend for vault-runner dispatch" + docker) ;; + nomad) + log "ERROR: nomad backend not yet implemented" + echo "nomad backend not yet implemented" >&2 + exit 1 ;; *) log "ERROR: unknown DISPATCHER_BACKEND=${DISPATCHER_BACKEND}" diff --git a/docker/edge/entrypoint-edge.sh b/docker/edge/entrypoint-edge.sh index a1511ff..1b5f94f 100755 --- a/docker/edge/entrypoint-edge.sh +++ b/docker/edge/entrypoint-edge.sh @@ -173,15 +173,11 @@ PROJECT_TOML="${PROJECT_TOML:-projects/disinto.toml}" sleep 1200 # 20 minutes done) & -# ── Load optional secrets from secrets/*.enc (#777) ──────────────────── -# Engagement collection (collect-engagement.sh) requires CADDY_ secrets to -# SCP access logs from a remote edge host. When age key or secrets dir is -# missing, or any secret fails to decrypt, log a warning and skip the cron. -# Caddy itself does not depend on these secrets. +# ── Load required secrets from secrets/*.enc (#777) ──────────────────── +# Edge container declares its required secrets; missing ones cause a hard fail. 
_AGE_KEY_FILE="${HOME}/.config/sops/age/keys.txt" _SECRETS_DIR="/opt/disinto/secrets" EDGE_REQUIRED_SECRETS="CADDY_SSH_KEY CADDY_SSH_HOST CADDY_SSH_USER CADDY_ACCESS_LOG" -EDGE_ENGAGEMENT_READY=0 # Assume not ready until proven otherwise _edge_decrypt_secret() { local enc_path="${_SECRETS_DIR}/${1}.enc" @@ -196,63 +192,47 @@ if [ -f "$_AGE_KEY_FILE" ] && [ -d "$_SECRETS_DIR" ]; then export "$_secret_name=$_val" done if [ -n "$_missing" ]; then - echo "WARN: required engagement secrets missing from secrets/*.enc:${_missing}" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 - else - echo "edge: loaded required engagement secrets: ${EDGE_REQUIRED_SECRETS}" >&2 - EDGE_ENGAGEMENT_READY=1 + echo "FATAL: required secrets missing from secrets/*.enc:${_missing}" >&2 + echo " Run 'disinto secrets add ' for each missing secret." >&2 + echo " If migrating from .env.vault.enc, run 'disinto secrets migrate-from-vault' first." >&2 + exit 1 fi + echo "edge: loaded required secrets: ${EDGE_REQUIRED_SECRETS}" >&2 else - echo "WARN: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — engagement secrets unavailable" >&2 - echo " collect-engagement cron will be skipped. Run 'disinto secrets add ' to enable." >&2 - EDGE_ENGAGEMENT_READY=0 + echo "FATAL: age key (${_AGE_KEY_FILE}) or secrets dir (${_SECRETS_DIR}) not found — cannot load required secrets" >&2 + echo " Ensure age is installed and secrets/*.enc files are present." >&2 + exit 1 fi # Start daily engagement collection cron loop in background (#745) # Runs collect-engagement.sh daily at ~23:50 UTC via a sleep loop that # calculates seconds until the next 23:50 window. SSH key from secrets/*.enc (#777). -# Guarded: only start if EDGE_ENGAGEMENT_READY=1. 
-if [ "$EDGE_ENGAGEMENT_READY" -eq 1 ]; then - (while true; do - # Calculate seconds until next 23:50 UTC - _now=$(date -u +%s) - _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) - if [ "$_target" -le "$_now" ]; then - _target=$(( _target + 86400 )) - fi - _sleep_secs=$(( _target - _now )) - echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 - sleep "$_sleep_secs" - _fetch_log="/tmp/caddy-access-log-fetch.log" - _ssh_key_file=$(mktemp) - printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" - chmod 0600 "$_ssh_key_file" - scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ - "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ - "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true - rm -f "$_ssh_key_file" - if [ -s "$_fetch_log" ]; then - CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ - | tee -a /opt/disinto-logs/collect-engagement.log || true - else - echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 - fi - rm -f "$_fetch_log" - done) & -else - echo "edge: collect-engagement cron skipped (EDGE_ENGAGEMENT_READY=0)" >&2 -fi - -# Start chat server in background (#1083 — merged from docker/chat into edge) -(python3 /usr/local/bin/chat-server.py 2>&1 | tee -a /opt/disinto-logs/chat.log) & - -# Nomad template renders Caddyfile to /local/Caddyfile via service discovery; -# copy it into the expected location if present (compose uses the mounted path). 
-if [ -f /local/Caddyfile ]; then - cp /local/Caddyfile /etc/caddy/Caddyfile - echo "edge: using Nomad-rendered Caddyfile from /local/Caddyfile" >&2 -fi +(while true; do + # Calculate seconds until next 23:50 UTC + _now=$(date -u +%s) + _target=$(date -u -d "today 23:50" +%s 2>/dev/null || date -u -d "23:50" +%s 2>/dev/null || echo 0) + if [ "$_target" -le "$_now" ]; then + _target=$(( _target + 86400 )) + fi + _sleep_secs=$(( _target - _now )) + echo "edge: collect-engagement scheduled in ${_sleep_secs}s (next 23:50 UTC)" >&2 + sleep "$_sleep_secs" + _fetch_log="/tmp/caddy-access-log-fetch.log" + _ssh_key_file=$(mktemp) + printf '%s\n' "$CADDY_SSH_KEY" > "$_ssh_key_file" + chmod 0600 "$_ssh_key_file" + scp -i "$_ssh_key_file" -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + "${CADDY_SSH_USER}@${CADDY_SSH_HOST}:${CADDY_ACCESS_LOG}" \ + "$_fetch_log" 2>&1 | tee -a /opt/disinto-logs/collect-engagement.log || true + rm -f "$_ssh_key_file" + if [ -s "$_fetch_log" ]; then + CADDY_ACCESS_LOG="$_fetch_log" bash /opt/disinto/site/collect-engagement.sh 2>&1 \ + | tee -a /opt/disinto-logs/collect-engagement.log || true + else + echo "edge: collect-engagement: fetched log is empty, skipping parse" >&2 + fi + rm -f "$_fetch_log" +done) & # Caddy as main process — run in foreground via wait so background jobs survive # (exec replaces the shell, which can orphan backgrounded subshells) diff --git a/docs/agents-llama.md b/docs/agents-llama.md index b3a1334..bc973b7 100644 --- a/docs/agents-llama.md +++ b/docs/agents-llama.md @@ -2,12 +2,9 @@ Local-model agents run the same agent code as the Claude-backed agents, but connect to a local llama-server (or compatible OpenAI-API endpoint) instead of -the Anthropic API. This document describes the canonical activation flow using +the Anthropic API. This document describes the current activation flow using `disinto hire-an-agent` and `[agents.X]` TOML configuration. 
-> **Note:** The legacy `ENABLE_LLAMA_AGENT=1` env flag has been removed (#846). -> Activation is now done exclusively via `[agents.X]` sections in project TOML. - ## Overview Local-model agents are configured via `[agents.]` sections in diff --git a/docs/nomad-cutover-runbook.md b/docs/nomad-cutover-runbook.md deleted file mode 100644 index e0956cc..0000000 --- a/docs/nomad-cutover-runbook.md +++ /dev/null @@ -1,183 +0,0 @@ -# Nomad Cutover Runbook - -End-to-end procedure to cut over the disinto factory from docker-compose on -disinto-dev-box to Nomad on disinto-nomad-box. - -**Target**: disinto-nomad-box (10.10.10.216) becomes production; disinto-dev-box -stays warm for rollback. - -**Downtime budget**: <5 min blue-green flip. - -**Data scope**: Forgejo issues + disinto-ops git bundle only. Everything else is -regenerated or discarded. OAuth secrets are regenerated on fresh init (all -sessions invalidated). - ---- - -## 1. Pre-cutover readiness checklist - -- [ ] Nomad + Vault stack healthy on a fresh wipe+init (step 5 verified) -- [ ] Codeberg mirror current — `git log` parity between dev-box Forgejo and - Codeberg -- [ ] SSH key pair generated for nomad-box, registered on DO edge (see §4.6) -- [ ] Companion tools landed: - - `disinto backup create` (#1057) - - `disinto backup import` (#1058) -- [ ] Backup tarball produced and tested against a scratch LXC (see §3) - ---- - -## 2. Pre-cutover artifact: backup - -On disinto-dev-box: - -```bash -./bin/disinto backup create /tmp/disinto-backup-$(date +%Y%m%d).tar.gz -``` - -Copy the tarball to nomad-box (and optionally to a local workstation for -safekeeping): - -```bash -scp /tmp/disinto-backup-*.tar.gz nomad-box:/tmp/ -``` - ---- - -## 3. 
Pre-cutover dry-run - -On a throwaway LXC: - -```bash -lxc launch ubuntu:24.04 cutover-dryrun -# inside the container: -disinto init --backend=nomad --import-env .env --with edge -./bin/disinto backup import /tmp/disinto-backup-*.tar.gz -``` - -Verify: - -- Issue count matches source Forgejo -- disinto-ops repo refs match source bundle - -Destroy the LXC once satisfied: - -```bash -lxc delete cutover-dryrun --force -``` - ---- - -## 4. Cutover T-0 (operator executes; <5 min target) - -### 4.1 Stop dev-box services - -```bash -# On disinto-dev-box — stop, do NOT remove volumes (rollback needs them) -docker-compose stop -``` - -### 4.2 Provision nomad-box (if not already done) - -```bash -# On disinto-nomad-box -disinto init --backend=nomad --import-env .env --with edge -``` - -### 4.3 Import backup - -```bash -# On disinto-nomad-box -./bin/disinto backup import /tmp/disinto-backup-*.tar.gz -``` - -### 4.4 Configure Codeberg pull mirror - -Manual, one-time step in the new Forgejo UI: - -1. Create a mirror repository pointing at the Codeberg upstream -2. Confirm initial sync completes - -### 4.5 Claude login - -```bash -# On disinto-nomad-box -claude login -``` - -Set up Anthropic OAuth so agents can authenticate. - -### 4.6 Autossh tunnel swap - -> **Operator step** — cross-host, no dev-agent involvement. Do NOT automate. - -1. Stop the tunnel on dev-box: - ```bash - # On disinto-dev-box - systemctl stop reverse-tunnel - ``` - -2. Copy or regenerate the tunnel unit on nomad-box: - ```bash - # Copy from dev-box, or let init regenerate it - scp dev-box:/etc/systemd/system/reverse-tunnel.service \ - nomad-box:/etc/systemd/system/ - ``` - -3. Register nomad-box's public key on DO edge: - ```bash - # On DO edge box — same restricted-command as the dev-box key - echo "" >> /home/johba/.ssh/authorized_keys - ``` - -4. Start the tunnel on nomad-box: - ```bash - # On disinto-nomad-box - systemctl enable --now reverse-tunnel - ``` - -5. 
Verify end-to-end: - ```bash - curl https://self.disinto.ai/api/v1/version - # Should return the new box's Forgejo version - ``` - ---- - -## 5. Post-cutover smoke - -- [ ] `curl https://self.disinto.ai` → Forgejo welcome page -- [ ] Create a test PR → Woodpecker pipeline runs → agents assign and work -- [ ] Claude chat login via Forgejo OAuth succeeds - ---- - -## 6. Rollback (if any step 4 gate fails) - -1. Stop the tunnel on nomad-box: - ```bash - systemctl stop reverse-tunnel # on nomad-box - ``` - -2. Restore the tunnel on dev-box: - ```bash - systemctl start reverse-tunnel # on dev-box - ``` - -3. Bring dev-box services back up: - ```bash - docker-compose up -d # on dev-box - ``` - -4. DO Caddy config is unchanged — traffic restores in <5 min. - -5. File a post-mortem issue. Keep nomad-box state intact for debugging. - ---- - -## 7. Post-stable cleanup (T+1 week) - -- `docker-compose down -v` on dev-box -- Archive `/var/lib/docker/volumes/disinto_*` to cold storage -- Delete disinto-dev-box LXC or keep as permanent rollback reserve (operator - decision) diff --git a/docs/nomad-migration.md b/docs/nomad-migration.md deleted file mode 100644 index 02ff023..0000000 --- a/docs/nomad-migration.md +++ /dev/null @@ -1,124 +0,0 @@ - -# Nomad+Vault migration — cutover-day runbook - -`disinto init --backend=nomad` is the single entry-point that turns a fresh -LXC (with the disinto repo cloned) into a running Nomad+Vault cluster with -policies applied, JWT workload-identity auth configured, secrets imported -from the old docker stack, and services deployed. - -## Cutover-day invocation - -On the new LXC, as root (or an operator with NOPASSWD sudo): - -```bash -# Copy the plaintext .env + sops-encrypted .env.vault.enc + age keyfile -# from the old box first (out of band — SSH, USB, whatever your ops -# procedure allows). 
Then: - -sudo ./bin/disinto init \ - --backend=nomad \ - --import-env /tmp/.env \ - --import-sops /tmp/.env.vault.enc \ - --age-key /tmp/keys.txt \ - --with forgejo -``` - -This runs, in order: - -1. **`lib/init/nomad/cluster-up.sh`** (S0) — installs Nomad + Vault - binaries, writes `/etc/nomad.d/*`, initializes Vault, starts both - services, waits for the Nomad node to become ready. -2. **`tools/vault-apply-policies.sh`** (S2.1) — syncs every - `vault/policies/*.hcl` into Vault as an ACL policy. Idempotent. -3. **`lib/init/nomad/vault-nomad-auth.sh`** (S2.3) — enables Vault's - JWT auth method at `jwt-nomad`, points it at Nomad's JWKS, writes - one role per policy, reloads Nomad so jobs can exchange - workload-identity tokens for Vault tokens. Idempotent. -4. **`tools/vault-import.sh`** (S2.2) — reads `/tmp/.env` and the - sops-decrypted `/tmp/.env.vault.enc`, writes them to the KV paths - matching the S2.1 policy layout (`kv/disinto/bots/*`, `kv/disinto/shared/*`, - `kv/disinto/runner/*`). Idempotent (overwrites KV v2 data in place). -5. **`lib/init/nomad/deploy.sh forgejo`** (S1) — validates + runs the - `nomad/jobs/forgejo.hcl` jobspec. Forgejo reads its admin creds from - Vault via the `template` stanza (S2.4). - -## Flag summary - -| Flag | Meaning | -|---|---| -| `--backend=nomad` | Switch the init dispatcher to the Nomad+Vault path (instead of docker compose). | -| `--empty` | Bring the cluster up, skip policies/auth/import/deploy. Escape hatch for debugging. | -| `--with forgejo[,…]` | Deploy these services after the cluster is up. | -| `--import-env PATH` | Plaintext `.env` from the old stack. Optional. | -| `--import-sops PATH` | Sops-encrypted `.env.vault.enc` from the old stack. Requires `--age-key`. | -| `--age-key PATH` | Age keyfile used to decrypt `--import-sops`. Requires `--import-sops`. | -| `--dry-run` | Print the full plan (cluster-up + policies + auth + import + deploy) and exit. Touches nothing. 
| - -### Flag validation - -- `--import-sops` without `--age-key` → error. -- `--age-key` without `--import-sops` → error. -- `--import-env` alone (no sops) → OK (imports just the plaintext `.env`). -- `--backend=docker` with any `--import-*` flag → error. -- `--empty` with any `--import-*` flag → error (mutually exclusive: `--empty` - skips the import step, so pairing them silently discards the import - intent). - -## Idempotency - -Every layer is idempotent by design. Re-running the same command on an -already-provisioned box is a no-op at every step: - -- **Cluster-up:** second run detects running `nomad`/`vault` systemd - units and state files, skips re-init. -- **Policies:** byte-for-byte compare against on-server policy text; - "unchanged" for every untouched file. -- **Auth:** skips auth-method create if `jwt-nomad/` already enabled, - skips config write if the JWKS + algs match, skips server.hcl write if - the file on disk is identical to the repo copy. -- **Import:** KV v2 writes overwrite in place (same path, same keys, - same values → no new version). -- **Deploy:** `nomad job run` is declarative; same jobspec → no new - allocation. - -## Dry-run - -```bash -./bin/disinto init --backend=nomad \ - --import-env /tmp/.env \ - --import-sops /tmp/.env.vault.enc \ - --age-key /tmp/keys.txt \ - --with forgejo \ - --dry-run -``` - -Prints the five-section plan — cluster-up, policies, auth, import, -deploy — with every path and every argv that would be executed. No -network, no sudo, no state mutation. See -`tests/disinto-init-nomad.bats` for the exact output shape. - -## No-import path - -If you already have `kv/disinto/*` seeded by other means (manual -`vault kv put`, a replica, etc.), omit all three `--import-*` flags. 
-`disinto init --backend=nomad --with forgejo` still applies policies, -configures auth, and deploys — but skips the import step with: - -``` -[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services -``` - -Forgejo's template stanza will fail to render (and thus the allocation -will stall) until those KV paths exist — so either import them or seed -them first. - -## Secret hygiene - -- Never log a secret value. The CLI only prints paths (`--import-env`, - `--age-key`) and KV *paths* (`kv/disinto/bots/review/token`), never - the values themselves. `tools/vault-import.sh` is the only thing that - reads the values, and it pipes them directly into Vault's HTTP API. -- The age keyfile must be mode 0400 — `vault-import.sh` refuses to - source a keyfile with looser permissions. -- `VAULT_ADDR` must be localhost during import — the import tool - refuses to run against a remote Vault, preventing accidental exposure. diff --git a/formulas/release.sh b/formulas/release.sh index 6526d1a..b8c4eb6 100644 --- a/formulas/release.sh +++ b/formulas/release.sh @@ -178,8 +178,8 @@ log "Tagged disinto/agents:${RELEASE_VERSION}" log "Step 6/6: Restarting agent containers" -docker compose stop agents 2>/dev/null || true -docker compose up -d agents +docker compose stop agents agents-llama 2>/dev/null || true +docker compose up -d agents agents-llama log "Agent containers restarted" # ── Done ───────────────────────────────────────────────────────────────── diff --git a/formulas/release.toml b/formulas/release.toml index ccd7f95..f702f42 100644 --- a/formulas/release.toml +++ b/formulas/release.toml @@ -189,10 +189,10 @@ Restart agent containers to use the new image. - docker compose pull agents 2. Stop and remove existing agent containers: - - docker compose down agents + - docker compose down agents agents-llama 2>/dev/null || true 3. 
Start agents with new image: - - docker compose up -d agents + - docker compose up -d agents agents-llama 4. Wait for containers to be healthy: - for i in {1..30}; do @@ -203,7 +203,7 @@ Restart agent containers to use the new image. - done 5. Verify containers are running: - - docker compose ps agents + - docker compose ps agents agents-llama 6. Log restart: - echo "Restarted agents containers" diff --git a/formulas/run-supervisor.toml b/formulas/run-supervisor.toml index 4101252..f31e6bc 100644 --- a/formulas/run-supervisor.toml +++ b/formulas/run-supervisor.toml @@ -29,7 +29,7 @@ and injected into your prompt above. Review them now. 1. Read the injected metrics data carefully (System Resources, Docker, Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs, - CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**). + CI Pipelines, Open PRs, Issue Status, Stale Worktrees). Note: preflight.sh auto-removes PHASE:escalate files for closed issues (24h grace period). Check the "Stale Phase Cleanup" section for any files cleaned or in grace period this run. @@ -75,10 +75,6 @@ Categorize every finding from the metrics into priority levels. - Dev/action sessions in PHASE:escalate for > 24h (session timeout) (Note: PHASE:escalate files for closed issues are auto-cleaned by preflight; this check covers sessions where the issue is still open) -- **Woodpecker agent unhealthy** — see "Woodpecker Agent Health" section in preflight: - - Container not running or in unhealthy state - - gRPC errors >= 3 in last 20 minutes - - Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes ### P3 — Factory degraded - PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed @@ -104,15 +100,6 @@ For each finding from the health assessment, decide and execute an action. 
### Auto-fixable (execute these directly) -**P2 Woodpecker agent unhealthy:** -The supervisor-run.sh script automatically handles WP agent recovery: -- Detects unhealthy state via preflight.sh health checks -- Restarts container via `docker restart` -- Scans for `blocked: ci_exhausted` issues updated in last 30 minutes -- Unassigns and removes blocked label from affected issues -- Posts recovery comment with infra-flake context -- Avoids duplicate restarts via 5-minute cooldown in history file - **P0 Memory crisis:** # Kill stale one-shot claude processes (>3h old) pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true @@ -261,11 +248,6 @@ Format: - (or "No actions needed") - ### WP Agent Recovery (if applicable) - - WP agent restart: