diff --git a/.env.example b/.env.example index c1c0b98..a1f24d5 100644 --- a/.env.example +++ b/.env.example @@ -32,13 +32,10 @@ FORGE_URL=http://localhost:3000 # [CONFIG] local Forgejo instance # - FORGE_PASS_DEV_QWEN2 # Name conversion: tr 'a-z-' 'A-Z_' (lowercase→UPPER, hyphens→underscores). # The compose generator looks these up via the agent's `forge_user` field in -# the project TOML. The pre-existing `dev-qwen` llama agent uses -# FORGE_TOKEN_LLAMA / FORGE_PASS_LLAMA (kept for backwards-compat with the -# legacy `ENABLE_LLAMA_AGENT=1` single-agent path). +# the project TOML. Configure local-model agents via [agents.X] sections in +# projects/*.toml — this is the canonical activation path. FORGE_TOKEN= # [SECRET] dev-bot API token (default for all agents) FORGE_PASS= # [SECRET] dev-bot password for git HTTP push (#361) -FORGE_TOKEN_LLAMA= # [SECRET] dev-qwen API token (for agents-llama) -FORGE_PASS_LLAMA= # [SECRET] dev-qwen password for git HTTP push FORGE_REVIEW_TOKEN= # [SECRET] review-bot API token FORGE_REVIEW_PASS= # [SECRET] review-bot password for git HTTP push FORGE_PLANNER_TOKEN= # [SECRET] planner-bot API token @@ -107,13 +104,6 @@ FORWARD_AUTH_SECRET= # [SECRET] Shared secret for Caddy ↔ # Store all project secrets here so formulas reference env vars, never hardcode. BASE_RPC_URL= # [SECRET] on-chain RPC endpoint -# ── Local Qwen dev agent (optional) ────────────────────────────────────── -# Set ENABLE_LLAMA_AGENT=1 to emit agents-llama in docker-compose.yml. -# Requires a running llama-server reachable at ANTHROPIC_BASE_URL. -# See docs/agents-llama.md for details. -ENABLE_LLAMA_AGENT=0 # [CONFIG] 1 = enable agents-llama service -ANTHROPIC_BASE_URL= # [CONFIG] e.g. http://host.docker.internal:8081 - # ── Tuning ──────────────────────────────────────────────────────────────── CLAUDE_TIMEOUT=7200 # [CONFIG] max seconds per Claude invocation diff --git a/.gitignore b/.gitignore index 21c6fbc..a29450c 100644 --- a/.gitignore +++ b/.gitignore @@ -20,7 +20,6 @@ metrics/supervisor-metrics.jsonl # OS .DS_Store dev/ci-fixes-*.json -gardener/dust.jsonl # Individual encrypted secrets (managed by disinto secrets add) secrets/ diff --git a/.woodpecker/nomad-validate.yml b/.woodpecker/nomad-validate.yml index d5828e9..5a1cc7c 100644 --- a/.woodpecker/nomad-validate.yml +++ b/.woodpecker/nomad-validate.yml @@ -1,26 +1,45 @@ # ============================================================================= # .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts # -# Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the -# "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or -# lib/init/nomad/, plus the `disinto init` dispatcher, gets checked -# before it can land. +# Part of the Nomad+Vault migration (S0.5, issue #825; extended in S2.6, +# issue #884). Locks in the "no-ad-hoc-steps" principle: every HCL/shell +# artifact under nomad/, lib/init/nomad/, vault/policies/, plus the +# `disinto init` dispatcher and vault/roles.yaml, gets checked before it +# can land. # # Triggers on PRs (and pushes) that touch any of: # nomad/** — HCL configs (server, client, vault) -# lib/init/nomad/** — cluster-up / install / systemd / vault-init +# lib/init/nomad/** — cluster-up / install / systemd / vault-init / +# vault-nomad-auth (S2.6 trigger: vault-*.sh +# is a subset of this glob) # bin/disinto — `disinto init --backend=nomad` dispatcher # tests/disinto-init-nomad.bats — the bats suite itself +# vault/policies/** — Vault ACL policy HCL files (S2.1, S2.6) +# vault/roles.yaml — JWT-auth role bindings (S2.3, S2.6) # .woodpecker/nomad-validate.yml — the pipeline definition # # Steps (all fail-closed — any error blocks merge): # 1. nomad-config-validate — `nomad config validate` on server + client HCL # 2. nomad-job-validate — `nomad job validate` looped over every -# nomad/jobs/*.nomad.hcl (new jobspecs get +# nomad/jobs/*.hcl (new jobspecs get # CI coverage automatically) # 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl -# 4. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto -# 5. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests +# 4. vault-policy-fmt — `vault policy fmt` idempotence check on +# every vault/policies/*.hcl (format drift = +# CI fail; non-destructive via cp+diff) +# 5. vault-policy-validate — HCL syntax + capability validation for every +# vault/policies/*.hcl via `vault policy write` +# against an inline dev-mode Vault server +# 6. vault-roles-validate — yamllint + role→policy reference check on +# vault/roles.yaml (every referenced policy +# must exist as vault/policies/.hcl) +# 7. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto +# 8. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests +# +# Secret-scan coverage: vault/policies/*.hcl is already scanned by the +# P11 gate (.woodpecker/secret-scan.yml, issue #798) — its trigger path +# `vault/**/*` covers everything under this directory. We intentionally +# do NOT duplicate that gate here; one scanner, one source of truth. # # Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 / # vault 1.18.5). Bump there AND here together — drift = CI passing on @@ -34,6 +53,8 @@ when: - "lib/init/nomad/**" - "bin/disinto" - "tests/disinto-init-nomad.bats" + - "vault/policies/**" + - "vault/roles.yaml" - ".woodpecker/nomad-validate.yml" # Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is @@ -57,6 +78,7 @@ steps: - name: nomad-config-validate image: hashicorp/nomad:1.9.5 commands: + - nomad version - nomad config validate nomad/server.hcl nomad/client.hcl # ── 2. Nomad jobspec HCL syntax check ──────────────────────────────────── @@ -68,15 +90,15 @@ steps: # # Validation is offline: no running Nomad server is required (exit 0 on # valid HCL, 1 on syntax/semantic error). The CLI takes a single path - # argument so we loop over every `*.nomad.hcl` file under nomad/jobs/ — + # argument so we loop over every `*.hcl` file under nomad/jobs/ — # that way a new jobspec PR gets CI coverage automatically (no separate - # "edit the pipeline" step to forget). The `.nomad.hcl` suffix is the - # naming convention documented in nomad/AGENTS.md; anything else in - # nomad/jobs/ is deliberately not validated by this step. + # "edit the pipeline" step to forget). The `.hcl` suffix is the naming + # convention: anything else in nomad/jobs/ is deliberately not validated + # by this step. # # `[ -f "$f" ]` guards against the no-match case: POSIX sh does not # nullglob, so an empty jobs/ directory would leave the literal glob in - # "$f" and fail. Today forgejo.nomad.hcl exists, but the guard keeps the + # "$f" and fail. Today forgejo.hcl exists, but the guard keeps the # step safe during any future transient empty state. # # Scope note: offline validate catches jobspec-level errors (unknown @@ -91,7 +113,7 @@ steps: commands: - | set -e - for f in nomad/jobs/*.nomad.hcl; do + for f in nomad/jobs/*.hcl; do [ -f "$f" ] || continue echo "validating jobspec: $f" nomad job validate "$f" @@ -122,7 +144,176 @@ steps: *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;; esac - # ── 4. Shellcheck ──────────────────────────────────────────────────────── + # ── 4. Vault policy fmt idempotence check ──────────────────────────────── + # `vault policy fmt ` formats a local HCL policy file in place. + # There's no `-check`/dry-run flag (vault 1.18.5), so we implement a + # non-destructive check as cp → fmt-on-copy → diff against original. + # Any diff means the committed file would be rewritten by `vault policy + # fmt` — failure steers the author to run `vault policy fmt ` + # locally before pushing. + # + # Scope: vault/policies/*.hcl only. The `[ -f "$f" ]` guard handles the + # no-match case (POSIX sh does not nullglob) so an empty policies/ + # directory does not fail this step. + # + # Note: `vault policy fmt` is purely local (HCL text transform) and does + # not require a running Vault server, which is why this step can run + # without starting one. + - name: vault-policy-fmt + image: hashicorp/vault:1.18.5 + commands: + - | + set -e + failed=0 + for f in vault/policies/*.hcl; do + [ -f "$f" ] || continue + tmp="/tmp/$(basename "$f").fmt" + cp "$f" "$tmp" + vault policy fmt "$tmp" >/dev/null 2>&1 + if ! diff -u "$f" "$tmp"; then + echo "ERROR: $f is not formatted — run 'vault policy fmt $f' locally" >&2 + failed=1 + fi + done + if [ "$failed" -gt 0 ]; then + echo "vault-policy-fmt: formatting drift detected" >&2 + exit 1 + fi + echo "vault-policy-fmt: all policies formatted correctly" + + # ── 5. Vault policy HCL syntax + capability validation ─────────────────── + # Vault has no offline `vault policy validate` subcommand — the closest + # in-CLI validator is `vault policy write`, which sends the HCL to a + # running server which parses it, checks capability names against the + # known set (read, list, create, update, delete, patch, sudo, deny), + # and rejects unknown stanzas / malformed path blocks. We start an + # inline dev-mode Vault (in-memory, no persistence, root token = "root") + # for the duration of this step and loop `vault policy write` over every + # vault/policies/*.hcl; the policies never leave the ephemeral dev + # server, so this is strictly a validator — not a deploy. + # + # Exit-code handling: + # - `vault policy write` exits 0 on success, non-zero on any parse / + # semantic error. We aggregate failures across all files so a single + # CI run surfaces every broken policy (not just the first). + # - The dev server is killed on any step exit via EXIT trap so the + # step tears down cleanly even on failure. + # + # Why dev-mode is sufficient: we're not persisting secrets, only asking + # Vault to parse policy text. The factory's production Vault is NOT + # contacted. + - name: vault-policy-validate + image: hashicorp/vault:1.18.5 + commands: + - | + set -e + vault server -dev -dev-root-token-id=root -dev-listen-address=127.0.0.1:8200 >/tmp/vault-dev.log 2>&1 & + VAULT_PID=$! + trap 'kill "$VAULT_PID" 2>/dev/null || true' EXIT INT TERM + export VAULT_ADDR=http://127.0.0.1:8200 + export VAULT_TOKEN=root + ready=0 + i=0 + while [ "$i" -lt 30 ]; do + if vault status >/dev/null 2>&1; then + ready=1 + break + fi + i=$((i + 1)) + sleep 0.5 + done + if [ "$ready" -ne 1 ]; then + echo "vault-policy-validate: dev server failed to start after 15s" >&2 + cat /tmp/vault-dev.log >&2 || true + exit 1 + fi + failed=0 + for f in vault/policies/*.hcl; do + [ -f "$f" ] || continue + name=$(basename "$f" .hcl) + echo "validate: $f" + if ! vault policy write "$name" "$f"; then + echo " ERROR: $f failed validation" >&2 + failed=1 + fi + done + if [ "$failed" -gt 0 ]; then + echo "vault-policy-validate: validation errors found" >&2 + exit 1 + fi + echo "vault-policy-validate: all policies valid" + + # ── 6. vault/roles.yaml validator ──────────────────────────────────────── + # Validates the JWT-auth role bindings file (S2.3). Two checks: + # + # a. `yamllint` — catches YAML syntax errors and indentation drift. + # Uses a relaxed config (line length bumped to 200) because + # roles.yaml's comments are wide by design. + # b. role → policy reference check — every role's `policy:` field + # must match a basename in vault/policies/*.hcl. A role pointing + # at a non-existent policy = runtime "permission denied" at job + # placement; catching the drift here turns it into a CI failure. + # Also verifies each role entry has the four required fields + # (name, policy, namespace, job_id) per the file's documented + # format. + # + # Parsing is done with PyYAML (the roles.yaml format is a strict + # subset that awk-level parsing in tools/vault-apply-roles.sh handles + # too, but PyYAML in CI gives us structural validation for free). If + # roles.yaml is ever absent (e.g. reverted), the step skips rather + # than fails — presence is enforced by S2.3's own tooling, not here. + - name: vault-roles-validate + image: python:3.12-alpine + commands: + - pip install --quiet --disable-pip-version-check pyyaml yamllint + - | + set -e + if [ ! -f vault/roles.yaml ]; then + echo "vault-roles-validate: vault/roles.yaml not present, skipping" + exit 0 + fi + yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}' vault/roles.yaml + echo "vault-roles-validate: yamllint OK" + python3 - <<'PY' + import os + import sys + import yaml + + with open('vault/roles.yaml') as f: + data = yaml.safe_load(f) or {} + roles = data.get('roles') or [] + if not roles: + print("vault-roles-validate: no roles defined in vault/roles.yaml", file=sys.stderr) + sys.exit(1) + existing = { + os.path.splitext(e)[0] + for e in os.listdir('vault/policies') + if e.endswith('.hcl') + } + required = ('name', 'policy', 'namespace', 'job_id') + failed = 0 + for r in roles: + if not isinstance(r, dict): + print(f"ERROR: role entry is not a mapping: {r!r}", file=sys.stderr) + failed = 1 + continue + for field in required: + if r.get(field) in (None, ''): + print(f"ERROR: role entry missing required field '{field}': {r}", file=sys.stderr) + failed = 1 + policy = r.get('policy') + if policy and policy not in existing: + print( + f"ERROR: role '{r.get('name')}' references policy '{policy}' " + f"but vault/policies/{policy}.hcl does not exist", + file=sys.stderr, + ) + failed = 1 + sys.exit(failed) + PY + echo "vault-roles-validate: all role→policy references valid" + + # ── 7. Shellcheck ──────────────────────────────────────────────────────── # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns # the backend dispatcher). bin/disinto has no .sh extension so the # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the @@ -132,7 +323,7 @@ steps: commands: - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto - # ── 5. bats: `disinto init --backend=nomad --dry-run` ──────────────────── + # ── 8. bats: `disinto init --backend=nomad --dry-run` ──────────────────── # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0 # with the expected step list, and --backend=docker stays on the docker # path (regression guard). Pure dry-run — no sudo, no network. diff --git a/AGENTS.md b/AGENTS.md index eec058c..fced0c6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,4 +1,4 @@ - + # Disinto — Agent Instructions ## What this repo is @@ -39,15 +39,18 @@ disinto/ (code repo) │ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure) │ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825) ├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh +│ jobs/ — Nomad jobspecs (forgejo.hcl reads Vault secrets via template stanza, S2.4) ├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored) ├── formulas/ Issue templates (TOML specs for multi-step agent tasks) ├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/) ├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh) +│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh — Vault provisioning (S2.1/S2.2) +│ vault-seed-.sh — per-service Vault secret seeders; auto-invoked by `bin/disinto --with ` (add a new file to support a new service) ├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md) ├── site/ disinto.ai website content -├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, disinto-init-nomad.bats) +├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, lib-generators.bats, vault-import.bats, disinto-init-nomad.bats) ├── templates/ Issue templates -├── bin/ The `disinto` CLI script +├── bin/ The `disinto` CLI script (`--with ` deploys services + runs their Vault seeders) ├── disinto-factory/ Setup documentation and skill ├── state/ Runtime state ├── .woodpecker/ Woodpecker CI pipeline configs @@ -120,8 +123,7 @@ bash dev/phase-test.sh | Reproduce | `docker/reproduce/` | Bug reproduction using Playwright MCP | `formulas/reproduce.toml` | | Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` | | Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` | -| agents-llama | `docker/agents/` (same image) | Local-Qwen dev agent (`AGENT_ROLES=dev`), gated on `ENABLE_LLAMA_AGENT=1` | [docs/agents-llama.md](docs/agents-llama.md) | -| agents-llama-all | `docker/agents/` (same image) | Local-Qwen all-roles agent (all 7 roles), profile `agents-llama-all` | [docs/agents-llama.md](docs/agents-llama.md) | +| Local-model agents | `docker/agents/` (same image) | Local llama-server agents configured via `[agents.X]` sections in project TOML | [docs/agents-llama.md](docs/agents-llama.md) | > **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77). > See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details. @@ -192,9 +194,7 @@ Humans write these. Agents read and enforce them. ## Phase-Signaling Protocol -When running as a persistent tmux session, Claude must signal the orchestrator -at each phase boundary by writing to a phase file (e.g. -`/tmp/dev-session-{project}-{issue}.phase`). +When running as a persistent tmux session, Claude must signal the orchestrator at each phase boundary by writing to a phase file (e.g. `/tmp/dev-session-{project}-{issue}.phase`). Key phases: `PHASE:awaiting_ci` → `PHASE:awaiting_review` → `PHASE:done`. Also: `PHASE:escalate` (needs human input), `PHASE:failed`. See [docs/PHASE-PROTOCOL.md](docs/PHASE-PROTOCOL.md) for the complete spec, orchestrator reaction matrix, sequence diagram, and crash recovery. diff --git a/architect/AGENTS.md b/architect/AGENTS.md index 9582b03..51b24b1 100644 --- a/architect/AGENTS.md +++ b/architect/AGENTS.md @@ -1,4 +1,4 @@ - + # Architect — Agent Instructions ## What this agent is diff --git a/bin/disinto b/bin/disinto index 4f06b5e..5f57927 100755 --- a/bin/disinto +++ b/bin/disinto @@ -60,7 +60,7 @@ Usage: Read CI logs from Woodpecker SQLite disinto release Create vault PR for release (e.g., v1.2.0) disinto hire-an-agent [--formula ] [--local-model ] [--model ] - Hire a new agent (create user + .profile repo) + Hire a new agent (create user + .profile repo; re-run to rotate credentials) disinto agent Manage agent state (enable/disable) disinto edge [options] Manage edge tunnel registrations @@ -82,12 +82,16 @@ Init options: --ci-id Woodpecker CI repo ID (default: 0 = no CI) --forge-url Forge base URL (default: http://localhost:3000) --backend Orchestration backend: docker (default) | nomad + --with (nomad) Deploy services: forgejo[,...] (S1.3) --empty (nomad) Bring up cluster only, no jobs (S0.4) --bare Skip compose generation (bare-metal setup) --build Use local docker build instead of registry images (dev mode) --yes Skip confirmation prompts --rotate-tokens Force regeneration of all bot tokens/passwords (idempotent by default) --dry-run Print every intended action without executing + --import-env (nomad) Path to .env file for import into Vault KV (S2.5) + --import-sops (nomad) Path to sops-encrypted .env.vault.enc for import (S2.5) + --age-key (nomad) Path to age keyfile (required with --import-sops) (S2.5) Hire an agent options: --formula Path to role formula TOML (default: formulas/.toml) @@ -662,14 +666,61 @@ prompt_admin_password() { # init run); operators running without sudo-NOPASSWD should invoke # `sudo disinto init ...` directly. _disinto_init_nomad() { - local dry_run="${1:-false}" empty="${2:-false}" + local dry_run="${1:-false}" empty="${2:-false}" with_services="${3:-}" + local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}" local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh" + local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh" + local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh" + local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh" + local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh" + local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh" if [ ! -x "$cluster_up" ]; then echo "Error: ${cluster_up} not found or not executable" >&2 exit 1 fi + if [ -n "$with_services" ] && [ ! -x "$deploy_sh" ]; then + echo "Error: ${deploy_sh} not found or not executable" >&2 + exit 1 + fi + + # --empty short-circuits after cluster-up: no policies, no auth, no + # import, no deploy. It's the "cluster-only escape hatch" for debugging + # (docs/nomad-migration.md). Caller-side validation already rejects + # --empty combined with --with or any --import-* flag, so reaching + # this branch with those set is a bug in the caller. + # + # On the default (non-empty) path, vault-engines.sh (enables the kv/ + # mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked + # unconditionally — they are idempotent and cheap to re-run, and + # subsequent --with deployments depend on them. vault-import.sh is + # invoked only when an --import-* flag is set. vault-engines.sh runs + # first because every policy and role below references kv/disinto/* + # paths, which 403 if the engine is not yet mounted (issue #912). + local import_any=false + if [ -n "$import_env" ] || [ -n "$import_sops" ]; then + import_any=true + fi + if [ "$empty" != "true" ]; then + if [ ! -x "$vault_engines_sh" ]; then + echo "Error: ${vault_engines_sh} not found or not executable" >&2 + exit 1 + fi + if [ ! -x "$vault_policies_sh" ]; then + echo "Error: ${vault_policies_sh} not found or not executable" >&2 + exit 1 + fi + if [ ! -x "$vault_auth_sh" ]; then + echo "Error: ${vault_auth_sh} not found or not executable" >&2 + exit 1 + fi + if [ "$import_any" = true ] && [ ! -x "$vault_import_sh" ]; then + echo "Error: ${vault_import_sh} not found or not executable" >&2 + exit 1 + fi + fi + # --empty and default both invoke cluster-up today. Log the requested # mode so the dispatch is visible in factory bootstrap logs — Step 1 # will branch on $empty to gate the job-deployment path. @@ -679,31 +730,295 @@ _disinto_init_nomad() { echo "nomad backend: default (cluster-up; jobs deferred to Step 1)" fi - # Dry-run forwards straight through; cluster-up.sh prints its own step - # list and exits 0 without touching the box. - local -a cmd=("$cluster_up") + # Dry-run: print cluster-up plan + policies/auth/import plan + deploy.sh plan if [ "$dry_run" = "true" ]; then - cmd+=("--dry-run") - "${cmd[@]}" - exit $? + echo "" + echo "── Cluster-up dry-run ─────────────────────────────────" + local -a cmd=("$cluster_up" "--dry-run") + "${cmd[@]}" || true + echo "" + + # --empty skips policies/auth/import/deploy — cluster-up only, no + # workloads. The operator-visible dry-run plan must match the real + # run, so short-circuit here too. + if [ "$empty" = "true" ]; then + exit 0 + fi + + # Vault engines + policies + auth are invoked on every nomad real-run + # path regardless of --import-* flags (they're idempotent; S2.1 + S2.3). + # Engines runs first because policies/roles/templates all reference the + # kv/ mount it enables (issue #912). Mirror that ordering in the + # dry-run plan so the operator sees the full sequence Step 2 will + # execute. + echo "── Vault engines dry-run ──────────────────────────────" + echo "[engines] [dry-run] ${vault_engines_sh} --dry-run" + echo "" + echo "── Vault policies dry-run ─────────────────────────────" + echo "[policies] [dry-run] ${vault_policies_sh} --dry-run" + echo "" + echo "── Vault auth dry-run ─────────────────────────────────" + echo "[auth] [dry-run] ${vault_auth_sh}" + echo "" + + # Import plan: one line per --import-* flag that is actually set. + # Printing independently (not in an if/elif chain) means that all + # three flags appearing together each echo their own path — the + # regression that bit prior implementations of this issue (#883). + if [ "$import_any" = true ]; then + echo "── Vault import dry-run ───────────────────────────────" + [ -n "$import_env" ] && echo "[import] --import-env env file: ${import_env}" + [ -n "$import_sops" ] && echo "[import] --import-sops sops file: ${import_sops}" + [ -n "$age_key" ] && echo "[import] --age-key age key: ${age_key}" + local -a import_dry_cmd=("$vault_import_sh") + [ -n "$import_env" ] && import_dry_cmd+=("--env" "$import_env") + [ -n "$import_sops" ] && import_dry_cmd+=("--sops" "$import_sops") + [ -n "$age_key" ] && import_dry_cmd+=("--age-key" "$age_key") + import_dry_cmd+=("--dry-run") + echo "[import] [dry-run] ${import_dry_cmd[*]}" + echo "" + else + echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" + echo "" + fi + + if [ -n "$with_services" ]; then + # Vault seed plan (S2.6, #928): one line per service whose + # tools/vault-seed-.sh ships. Services without a seeder are + # silently skipped — the real-run loop below mirrors this, + # making `--with woodpecker` in Step 3 auto-invoke + # tools/vault-seed-woodpecker.sh once that file lands without + # any further change to bin/disinto. + local seed_hdr_printed=false + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + local seed_script="${FACTORY_ROOT}/tools/vault-seed-${svc}.sh" + if [ -x "$seed_script" ]; then + if [ "$seed_hdr_printed" = false ]; then + echo "── Vault seed dry-run ─────────────────────────────────" + seed_hdr_printed=true + fi + echo "[seed] [dry-run] ${seed_script} --dry-run" + fi + done + [ "$seed_hdr_printed" = true ] && echo "" + + echo "── Deploy services dry-run ────────────────────────────" + echo "[deploy] services to deploy: ${with_services}" + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + # Validate known services first + case "$svc" in + forgejo) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo" >&2 + exit 1 + ;; + esac + local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" + if [ ! -f "$jobspec_path" ]; then + echo "Error: jobspec not found: ${jobspec_path}" >&2 + exit 1 + fi + echo "[deploy] [dry-run] nomad job validate ${jobspec_path}" + echo "[deploy] [dry-run] nomad job run -detach ${jobspec_path}" + done + echo "[deploy] dry-run complete" + fi + exit 0 fi - # Real run — needs root. Invoke via sudo if we're not already root so - # the command's exit code propagates directly. We don't distinguish - # "sudo denied" from "cluster-up.sh failed" here; both surface as a - # non-zero exit, and cluster-up.sh's own error messages cover the - # latter case. - local rc=0 + # Real run: cluster-up + policies + auth + (optional) import + deploy + local -a cluster_cmd=("$cluster_up") if [ "$(id -u)" -eq 0 ]; then - "${cmd[@]}" || rc=$? + "${cluster_cmd[@]}" || exit $? else if ! command -v sudo >/dev/null 2>&1; then echo "Error: cluster-up.sh must run as root and sudo is not installed" >&2 exit 1 fi - sudo -n -- "${cmd[@]}" || rc=$? + sudo -n -- "${cluster_cmd[@]}" || exit $? fi - exit "$rc" + + # --empty short-circuits here: cluster-up only, no policies/auth/import + # and no deploy. Matches the dry-run plan above and the docs/runbook. + if [ "$empty" = "true" ]; then + exit 0 + fi + + # Enable Vault secret engines (S2.1 / issue #912) — must precede + # policies/auth/import because every policy and every import target + # addresses paths under kv/. Idempotent, safe to re-run. + echo "" + echo "── Enabling Vault secret engines ──────────────────────" + local -a engines_cmd=("$vault_engines_sh") + if [ "$(id -u)" -eq 0 ]; then + "${engines_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${engines_cmd[@]}" || exit $? + fi + + # Apply Vault policies (S2.1) — idempotent, safe to re-run. + echo "" + echo "── Applying Vault policies ────────────────────────────" + local -a policies_cmd=("$vault_policies_sh") + if [ "$(id -u)" -eq 0 ]; then + "${policies_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-apply-policies.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${policies_cmd[@]}" || exit $? + fi + + # Configure Vault JWT auth + Nomad workload identity (S2.3) — idempotent. + echo "" + echo "── Configuring Vault JWT auth ─────────────────────────" + local -a auth_cmd=("$vault_auth_sh") + if [ "$(id -u)" -eq 0 ]; then + "${auth_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-nomad-auth.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${auth_cmd[@]}" || exit $? + fi + + # Import secrets if any --import-* flag is set (S2.2). + if [ "$import_any" = true ]; then + echo "" + echo "── Importing secrets into Vault ───────────────────────" + local -a import_cmd=("$vault_import_sh") + [ -n "$import_env" ] && import_cmd+=("--env" "$import_env") + [ -n "$import_sops" ] && import_cmd+=("--sops" "$import_sops") + [ -n "$age_key" ] && import_cmd+=("--age-key" "$age_key") + if [ "$(id -u)" -eq 0 ]; then + "${import_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-import.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${import_cmd[@]}" || exit $? + fi + else + echo "" + echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" + fi + + # Seed Vault for services that ship their own seeder (S2.6, #928). + # Convention: tools/vault-seed-.sh — auto-invoked when --with + # is requested. Runs AFTER vault-import so that real imported values + # win over generated seeds when both are present; each seeder is + # idempotent on a per-key basis (see vault-seed-forgejo.sh's + # "missing → generate, present → unchanged" contract), so re-running + # init does not rotate existing keys. Services without a seeder are + # silently skipped — keeps this loop forward-compatible with Step 3+ + # services that may ship their own seeder without touching bin/disinto. + # + # VAULT_ADDR is passed explicitly because cluster-up.sh writes the + # profile.d export *during* this same init run, so the current shell + # hasn't sourced it yet; sibling vault-* scripts (engines/policies/ + # auth/import) default VAULT_ADDR internally via _hvault_default_env, + # but vault-seed-forgejo.sh requires the caller to set it. + # + # The non-root branch invokes the seeder as `sudo -n -- env VAR=val + # script` rather than `sudo -n VAR=val -- script`: sudo treats bare + # `VAR=val` args as sudoers env-assignments, which the default + # `env_reset=on` policy silently discards unless the variable is in + # `env_keep` (VAULT_ADDR is not). Using `env` as the actual command + # sets VAULT_ADDR in the child process regardless of sudoers policy. + if [ -n "$with_services" ]; then + local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}" + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + local seed_script="${FACTORY_ROOT}/tools/vault-seed-${svc}.sh" + if [ -x "$seed_script" ]; then + echo "" + echo "── Seeding Vault for ${svc} ───────────────────────────" + if [ "$(id -u)" -eq 0 ]; then + VAULT_ADDR="$vault_addr" "$seed_script" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: vault-seed-${svc}.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $? + fi + fi + done + fi + + # Deploy services if requested + if [ -n "$with_services" ]; then + echo "" + echo "── Deploying services ─────────────────────────────────" + local -a deploy_cmd=("$deploy_sh") + # Split comma-separated service list into positional args + local IFS=',' + for svc in $with_services; do + svc=$(echo "$svc" | xargs) # trim whitespace + if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then + echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2 + exit 1 + fi + # Validate known services FIRST (before jobspec check) + case "$svc" in + forgejo) ;; + *) + echo "Error: unknown service '${svc}' — known: forgejo" >&2 + exit 1 + ;; + esac + # Check jobspec exists + local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl" + if [ ! -f "$jobspec_path" ]; then + echo "Error: jobspec not found: ${jobspec_path}" >&2 + exit 1 + fi + deploy_cmd+=("$svc") + done + + if [ "$(id -u)" -eq 0 ]; then + "${deploy_cmd[@]}" || exit $? + else + if ! command -v sudo >/dev/null 2>&1; then + echo "Error: deploy.sh must run as root and sudo is not installed" >&2 + exit 1 + fi + sudo -n -- "${deploy_cmd[@]}" || exit $? + fi + + # Print final summary + echo "" + echo "── Summary ────────────────────────────────────────────" + echo "Cluster: Nomad+Vault cluster is up" + echo "Policies: applied (Vault ACL)" + echo "Auth: Vault JWT auth + Nomad workload identity configured" + if [ "$import_any" = true ]; then + local import_desc="" + [ -n "$import_env" ] && import_desc+="${import_env} " + [ -n "$import_sops" ] && import_desc+="${import_sops} " + echo "Imported: ${import_desc% }" + else + echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)" + fi + echo "Deployed: ${with_services}" + if echo "$with_services" | grep -q "forgejo"; then + echo "Ports: forgejo: 3000" + fi + echo "────────────────────────────────────────────────────────" + fi + + exit 0 } disinto_init() { @@ -721,7 +1036,8 @@ disinto_init() { fi # Parse flags - local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false + local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services="" + local import_env="" import_sops="" age_key="" while [ $# -gt 0 ]; do case "$1" in --branch) branch="$2"; shift 2 ;; @@ -730,12 +1046,20 @@ disinto_init() { --forge-url) forge_url_flag="$2"; shift 2 ;; --backend) backend="$2"; shift 2 ;; --backend=*) backend="${1#--backend=}"; shift ;; + --with) with_services="$2"; shift 2 ;; + --with=*) with_services="${1#--with=}"; shift ;; --bare) bare=true; shift ;; --build) use_build=true; shift ;; --empty) empty=true; shift ;; --yes) auto_yes=true; shift ;; --rotate-tokens) rotate_tokens=true; shift ;; --dry-run) dry_run=true; shift ;; + --import-env) import_env="$2"; shift 2 ;; + --import-env=*) import_env="${1#--import-env=}"; shift ;; + --import-sops) import_sops="$2"; shift 2 ;; + --import-sops=*) import_sops="${1#--import-sops=}"; shift ;; + --age-key) age_key="$2"; shift 2 ;; + --age-key=*) age_key="${1#--age-key=}"; shift ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done @@ -764,11 +1088,52 @@ disinto_init() { exit 1 fi + # --with requires --backend=nomad + if [ -n "$with_services" ] && [ "$backend" != "nomad" ]; then + echo "Error: --with requires --backend=nomad" >&2 + exit 1 + fi + + # --empty and --with are mutually exclusive + if [ "$empty" = true ] && [ -n "$with_services" ]; then + echo "Error: --empty and --with are mutually exclusive" >&2 + exit 1 + fi + + # --import-* flag validation (S2.5). These three flags form an import + # triple and must be consistent before dispatch: sops encryption is + # useless without the age key to decrypt it, so either both --import-sops + # and --age-key are present or neither is. --import-env alone is fine + # (it just imports the plaintext dotenv). All three flags are nomad-only. + if [ -n "$import_sops" ] && [ -z "$age_key" ]; then + echo "Error: --import-sops requires --age-key" >&2 + exit 1 + fi + if [ -n "$age_key" ] && [ -z "$import_sops" ]; then + echo "Error: --age-key requires --import-sops" >&2 + exit 1 + fi + if { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; } \ + && [ "$backend" != "nomad" ]; then + echo "Error: --import-env, --import-sops, and --age-key require --backend=nomad" >&2 + exit 1 + fi + + # --empty is the cluster-only escape hatch — it skips policies, auth, + # import, and deploy. Pairing it with --import-* silently does nothing, + # which is a worse failure mode than a clear error. Reject explicitly. + if [ "$empty" = true ] \ + && { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; }; then + echo "Error: --empty and --import-env/--import-sops/--age-key are mutually exclusive" >&2 + exit 1 + fi + # Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh # (S0.4). The default and --empty variants are identical today; Step 1 # will branch on $empty to add job deployment to the default path. if [ "$backend" = "nomad" ]; then - _disinto_init_nomad "$dry_run" "$empty" + _disinto_init_nomad "$dry_run" "$empty" "$with_services" \ + "$import_env" "$import_sops" "$age_key" # shellcheck disable=SC2317 # _disinto_init_nomad always exits today; # `return` is defensive against future refactors. return @@ -882,7 +1247,6 @@ p.write_text(text) echo "" echo "[ensure] Forgejo admin user 'disinto-admin'" echo "[ensure] 8 bot users: dev-bot, review-bot, planner-bot, gardener-bot, vault-bot, supervisor-bot, predictor-bot, architect-bot" - echo "[ensure] 2 llama bot users: dev-qwen, dev-qwen-nightly" echo "[ensure] .profile repos for all bots" echo "[ensure] repo ${forge_repo} on Forgejo with collaborators" echo "[run] preflight checks" @@ -1078,19 +1442,6 @@ p.write_text(text) echo "Config: CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 saved to .env" fi - # Write local-Qwen dev agent env keys with safe defaults (#769) - if ! grep -q '^ENABLE_LLAMA_AGENT=' "$env_file" 2>/dev/null; then - cat >> "$env_file" <<'LLAMAENVEOF' - -# Local Qwen dev agent (optional) — set to 1 to enable -ENABLE_LLAMA_AGENT=0 -FORGE_TOKEN_LLAMA= -FORGE_PASS_LLAMA= -ANTHROPIC_BASE_URL= -LLAMAENVEOF - echo "Config: ENABLE_LLAMA_AGENT keys written to .env (disabled by default)" - fi - # Create labels on remote create_labels "$forge_repo" "$forge_url" @@ -1757,6 +2108,118 @@ _regen_file() { fi } +# Validate that required environment variables are present for all services +# that reference them in docker-compose.yml +_validate_env_vars() { + local env_file="${FACTORY_ROOT}/.env" + local errors=0 + local -a missing_vars=() + + # Load env vars from .env file into associative array + declare -A env_vars + if [ -f "$env_file" ]; then + while IFS='=' read -r key value; do + # Skip empty lines and comments + [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue + env_vars["$key"]="$value" + done < "$env_file" + fi + + # Check for local-model agent services + # Each [agents.*] section in projects/*.toml requires: + # - FORGE_TOKEN_ + # - FORGE_PASS_ + # - ANTHROPIC_BASE_URL (local model) OR ANTHROPIC_API_KEY (Anthropic backend) + + # Parse projects/*.toml for [agents.*] sections + local projects_dir="${FACTORY_ROOT}/projects" + for toml in "${projects_dir}"/*.toml; do + [ -f "$toml" ] || continue + + # Extract agent config using Python + while IFS='|' read -r service_name forge_user base_url _api_key; do + [ -n "$service_name" ] || continue + [ -n "$forge_user" ] || continue + + # Derive variable names (user -> USER_UPPER) + local user_upper + user_upper=$(echo "$forge_user" | tr 'a-z-' 'A-Z_') + local token_var="FORGE_TOKEN_${user_upper}" + local pass_var="FORGE_PASS_${user_upper}" + + # Check token + if [ -z "${env_vars[$token_var]:-}" ]; then + missing_vars+=("$token_var (for agent ${service_name}/${forge_user})") + errors=$((errors + 1)) + fi + + # Check password + if [ -z "${env_vars[$pass_var]:-}" ]; then + missing_vars+=("$pass_var (for agent ${service_name}/${forge_user})") + errors=$((errors + 1)) + fi + + # Check backend URL or API key (conditional based on base_url presence) + if [ -n "$base_url" ]; then + # Local model: needs ANTHROPIC_BASE_URL + if [ -z "${env_vars[ANTHROPIC_BASE_URL]:-}" ]; then + missing_vars+=("ANTHROPIC_BASE_URL (for agent ${service_name})") + errors=$((errors + 1)) + fi + else + # Anthropic backend: needs ANTHROPIC_API_KEY + if [ -z "${env_vars[ANTHROPIC_API_KEY]:-}" ]; then + missing_vars+=("ANTHROPIC_API_KEY (for agent ${service_name})") + errors=$((errors + 1)) + fi + fi + + done < <(python3 -c ' +import sys, tomllib, re + +with open(sys.argv[1], "rb") as f: + cfg = tomllib.load(f) + +agents = cfg.get("agents", {}) +for name, config in agents.items(): + if not isinstance(config, dict): + continue + + base_url = config.get("base_url", "") + model = config.get("model", "") + api_key = config.get("api_key", "") + forge_user = config.get("forge_user", f"{name}-bot") + + safe_name = name.lower() + safe_name = re.sub(r"[^a-z0-9]", "-", safe_name) + + print(f"{safe_name}|{forge_user}|{base_url}|{api_key}") +' "$toml" 2>/dev/null) + done + + # Check for legacy ENABLE_LLAMA_AGENT services + if [ "${env_vars[ENABLE_LLAMA_AGENT]:-0}" = "1" ]; then + if [ -z "${env_vars[FORGE_TOKEN_LLAMA]:-}" ]; then + missing_vars+=("FORGE_TOKEN_LLAMA (ENABLE_LLAMA_AGENT=1)") + errors=$((errors + 1)) + fi + if [ -z "${env_vars[FORGE_PASS_LLAMA]:-}" ]; then + missing_vars+=("FORGE_PASS_LLAMA (ENABLE_LLAMA_AGENT=1)") + errors=$((errors + 1)) + fi + fi + + if [ "$errors" -gt 0 ]; then + echo "Error: missing required environment variables:" >&2 + for var in "${missing_vars[@]}"; do + echo " - $var" >&2 + done + echo "" >&2 + echo "Run 'disinto hire-an-agent ' to create the agent and write credentials to .env" >&2 + exit 1 + fi +} + disinto_up() { local compose_file="${FACTORY_ROOT}/docker-compose.yml" local caddyfile="${FACTORY_ROOT}/docker/Caddyfile" @@ -1766,6 +2229,9 @@ disinto_up() { exit 1 fi + # Validate environment variables before proceeding + _validate_env_vars + # Parse --no-regen flag; remaining args pass through to docker compose local no_regen=false local -a compose_args=() diff --git a/dev/AGENTS.md b/dev/AGENTS.md index 481bb1f..02fd612 100644 --- a/dev/AGENTS.md +++ b/dev/AGENTS.md @@ -1,4 +1,4 @@ - + # Dev Agent **Role**: Implement issues autonomously — write code, push branches, address diff --git a/dev/dev-agent.sh b/dev/dev-agent.sh index cd8d390..913a2a7 100755 --- a/dev/dev-agent.sh +++ b/dev/dev-agent.sh @@ -254,7 +254,11 @@ agent_recover_session # WORKTREE SETUP # ============================================================================= status "setting up worktree" -cd "$REPO_ROOT" +if ! cd "$REPO_ROOT"; then + log "ERROR: REPO_ROOT=${REPO_ROOT} does not exist — cannot cd" + log "Check PROJECT_REPO_ROOT vs compose PROJECT_NAME vs TOML name mismatch" + exit 1 +fi # Determine forge remote by matching FORGE_URL host against git remotes _forge_host=$(printf '%s' "$FORGE_URL" | sed 's|https\?://||; s|/.*||') diff --git a/docker/agents/Dockerfile b/docker/agents/Dockerfile index 2939230..1bcba89 100644 --- a/docker/agents/Dockerfile +++ b/docker/agents/Dockerfile @@ -2,7 +2,7 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \ - && pip3 install --break-system-packages networkx \ + && pip3 install --break-system-packages networkx tomlkit \ && rm -rf /var/lib/apt/lists/* # Pre-built binaries (copied from docker/agents/bin/) diff --git a/docker/agents/entrypoint.sh b/docker/agents/entrypoint.sh index b7593a2..7c58674 100644 --- a/docker/agents/entrypoint.sh +++ b/docker/agents/entrypoint.sh @@ -17,6 +17,38 @@ set -euo pipefail # - predictor: every 24 hours (288 iterations * 5 min) # - supervisor: every SUPERVISOR_INTERVAL seconds (default: 1200 = 20 min) +# ── Migration check: reject ENABLE_LLAMA_AGENT ─────────────────────────────── +# #846: The legacy ENABLE_LLAMA_AGENT env flag is no longer supported. +# Activation is now done exclusively via [agents.X] sections in project TOML. +# If this legacy flag is detected, fail immediately with a migration message. +if [ "${ENABLE_LLAMA_AGENT:-}" = "1" ]; then + cat <<'MIGRATION_ERR' +FATAL: ENABLE_LLAMA_AGENT is no longer supported. + +The legacy ENABLE_LLAMA_AGENT=1 flag has been removed (#846). +Activation is now done exclusively via [agents.X] sections in projects/*.toml. + +To migrate: + 1. Remove ENABLE_LLAMA_AGENT from your .env or .env.enc file + 2. Add an [agents.] section to your project TOML: + + [agents.dev-qwen] + base_url = "http://your-llama-server:8081" + model = "unsloth/Qwen3.5-35B-A3B" + api_key = "sk-no-key-required" + roles = ["dev"] + forge_user = "dev-qwen" + compact_pct = 60 + poll_interval = 60 + + 3. Run: disinto init + 4. Start the agent: docker compose up -d agents-dev-qwen + +See docs/agents-llama.md for full details. +MIGRATION_ERR + exit 1 +fi + DISINTO_BAKED="/home/agent/disinto" DISINTO_LIVE="/home/agent/repos/_factory" DISINTO_DIR="$DISINTO_BAKED" # start with baked copy; switched to live checkout after bootstrap @@ -315,6 +347,24 @@ _setup_git_creds configure_git_identity configure_tea_login +# Parse first available project TOML to get the project name for cloning. +# This ensures PROJECT_NAME matches the TOML 'name' field, not the compose +# default of 'project'. The clone will land at /home/agent/repos/ +# and subsequent env exports in the main loop will be consistent. +if compgen -G "${DISINTO_DIR}/projects/*.toml" >/dev/null 2>&1; then + _first_toml=$(compgen -G "${DISINTO_DIR}/projects/*.toml" | head -1) + _pname=$(python3 -c " +import sys, tomllib +with open(sys.argv[1], 'rb') as f: + print(tomllib.load(f).get('name', '')) +" "$_first_toml" 2>/dev/null) || _pname="" + if [ -n "$_pname" ]; then + export PROJECT_NAME="$_pname" + export PROJECT_REPO_ROOT="/home/agent/repos/${_pname}" + log "Parsed PROJECT_NAME=${PROJECT_NAME} from ${_first_toml}" + fi +fi + # Clone project repo on first run (makes agents self-healing, #589) ensure_project_clone @@ -324,9 +374,32 @@ bootstrap_ops_repos # Bootstrap factory repo — switch DISINTO_DIR to live checkout (#593) bootstrap_factory_repo +# Validate that projects directory has at least one real .toml file (not .example) +# This prevents the silent-zombie mode where the polling loop matches zero files +# and does nothing forever. +validate_projects_dir() { + # NOTE: compgen -G exits non-zero when no matches exist, so piping it through + # `wc -l` under `set -eo pipefail` aborts the script before the FATAL branch + # can log a diagnostic (#877). Use the conditional form already adopted at + # lines above (see bootstrap_factory_repo, PROJECT_NAME parsing). + if ! compgen -G "${DISINTO_DIR}/projects/*.toml" >/dev/null 2>&1; then + log "FATAL: No real .toml files found in ${DISINTO_DIR}/projects/" + log "Expected at least one project config file (e.g., disinto.toml)" + log "The directory only contains *.toml.example template files." + log "Mount the host ./projects volume or copy real .toml files into the container." + exit 1 + fi + local toml_count + toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" | wc -l) + log "Projects directory validated: ${toml_count} real .toml file(s) found" +} + # Initialize state directory for check_active guards init_state_dir +# Validate projects directory before entering polling loop +validate_projects_dir + # Parse AGENT_ROLES env var (default: all agents) # Expected format: comma-separated list like "review,dev,gardener" AGENT_ROLES="${AGENT_ROLES:-review,dev,gardener,architect,planner,predictor,supervisor}" diff --git a/docs/agents-llama.md b/docs/agents-llama.md index 88622a7..b3a1334 100644 --- a/docs/agents-llama.md +++ b/docs/agents-llama.md @@ -1,59 +1,194 @@ -# agents-llama — Local-Qwen Agents +# Local-Model Agents -The `agents-llama` service is an optional compose service that runs agents -backed by a local llama-server instance (e.g. Qwen) instead of the Anthropic -API. It uses the same Docker image as the main `agents` service but connects to -a local inference endpoint via `ANTHROPIC_BASE_URL`. +Local-model agents run the same agent code as the Claude-backed agents, but +connect to a local llama-server (or compatible OpenAI-API endpoint) instead of +the Anthropic API. This document describes the canonical activation flow using +`disinto hire-an-agent` and `[agents.X]` TOML configuration. -Two profiles are available: +> **Note:** The legacy `ENABLE_LLAMA_AGENT=1` env flag has been removed (#846). +> Activation is now done exclusively via `[agents.X]` sections in project TOML. -| Profile | Service | Roles | Use case | -|---------|---------|-------|----------| -| _(default)_ | `agents-llama` | `dev` only | Conservative: single-role soak test | -| `agents-llama-all` | `agents-llama-all` | all 7 (review, dev, gardener, architect, planner, predictor, supervisor) | Pre-migration: validate every role on llama before Nomad cutover | +## Overview -## Enabling - -Set `ENABLE_LLAMA_AGENT=1` in `.env` (or `.env.enc`) and provide the required -credentials: - -```env -ENABLE_LLAMA_AGENT=1 -FORGE_TOKEN_LLAMA= -FORGE_PASS_LLAMA= -ANTHROPIC_BASE_URL=http://host.docker.internal:8081 # llama-server endpoint -``` - -Then regenerate the compose file (`disinto init ...`) and bring the stack up. - -### Running all 7 roles (agents-llama-all) - -```bash -docker compose --profile agents-llama-all up -d -``` - -This starts the `agents-llama-all` container with all 7 bot roles against the -local llama endpoint. The per-role forge tokens (`FORGE_REVIEW_TOKEN`, -`FORGE_GARDENER_TOKEN`, etc.) must be set in `.env` — they are the same tokens -used by the Claude-backed `agents` container. +Local-model agents are configured via `[agents.]` sections in +`projects/.toml`. Each agent gets: +- Its own Forgejo bot user with dedicated API token and password +- A dedicated compose service `agents-` +- Isolated credentials stored as `FORGE_TOKEN_` and `FORGE_PASS_` in `.env` ## Prerequisites - **llama-server** (or compatible OpenAI-API endpoint) running on the host, - reachable from inside Docker at the URL set in `ANTHROPIC_BASE_URL`. -- A Forgejo bot user (e.g. `dev-qwen`) with its own API token and password, - stored as `FORGE_TOKEN_LLAMA` / `FORGE_PASS_LLAMA`. + reachable from inside Docker at the URL you will configure. +- A disinto factory already initialized (`disinto init` completed). + +## Hiring a local-model agent + +Use `disinto hire-an-agent` with `--local-model` to create a bot user and +configure the agent: + +```bash +# Hire a local-model agent for the dev role +disinto hire-an-agent dev-qwen dev \ + --local-model http://10.10.10.1:8081 \ + --model unsloth/Qwen3.5-35B-A3B +``` + +The command performs these steps: + +1. **Creates a Forgejo user** `dev-qwen` with a random password +2. **Generates an API token** for the user +3. **Writes credentials to `.env`**: + - `FORGE_TOKEN_DEV_QWEN` — the API token + - `FORGE_PASS_DEV_QWEN` — the password + - `ANTHROPIC_BASE_URL` — the llama endpoint (required by the agent) +4. **Writes `[agents.dev-qwen]` to `projects/.toml`** with: + - `base_url`, `model`, `api_key` + - `roles = ["dev"]` + - `forge_user = "dev-qwen"` + - `compact_pct = 60` + - `poll_interval = 60` +5. **Regenerates `docker-compose.yml`** to include the `agents-dev-qwen` service + +### Anthropic backend agents + +For agents that use Anthropic API instead of a local model, omit `--local-model`: + +```bash +# Anthropic backend agent (requires ANTHROPIC_API_KEY in environment) +export ANTHROPIC_API_KEY="sk-..." +disinto hire-an-agent dev-claude dev +``` + +This writes `ANTHROPIC_API_KEY` to `.env` instead of `ANTHROPIC_BASE_URL`. + +## Activation and running + +Once hired, the agent service is added to `docker-compose.yml`. Start the +service with `docker compose up -d`: + +```bash +# Start all agent services +docker compose up -d + +# Start a single named agent service +docker compose up -d agents-dev-qwen + +# Start multiple named agent services +docker compose up -d agents-dev-qwen agents-planner +``` + +### Stopping agents + +```bash +# Stop a specific agent service +docker compose down agents-dev-qwen + +# Stop all agent services +docker compose down +``` + +## Credential rotation + +Re-running `disinto hire-an-agent ` with the same parameters rotates +credentials idempotently: + +```bash +# Re-hire the same agent to rotate token and password +disinto hire-an-agent dev-qwen dev \ + --local-model http://10.10.10.1:8081 \ + --model unsloth/Qwen3.5-35B-A3B + +# The command will: +# 1. Detect the user already exists +# 2. Reset the password to a new random value +# 3. Create a new API token +# 4. Update .env with the new credentials +``` + +This is the recommended way to rotate agent credentials. The `.env` file is +updated in place, so no manual editing is required. + +If you need to manually rotate credentials: +1. Generate a new token in Forgejo admin UI +2. Edit `.env` and replace `FORGE_TOKEN_` and `FORGE_PASS_` +3. Restart the agent service: `docker compose restart agents-` + +## Configuration reference + +### Environment variables (`.env`) + +| Variable | Description | Example | +|----------|-------------|---------| +| `FORGE_TOKEN_` | Forgejo API token for the bot user | `FORGE_TOKEN_DEV_QWEN` | +| `FORGE_PASS_` | Forgejo password for the bot user | `FORGE_PASS_DEV_QWEN` | +| `ANTHROPIC_BASE_URL` | Local llama endpoint (local model agents) | `http://host.docker.internal:8081` | +| `ANTHROPIC_API_KEY` | Anthropic API key (Anthropic backend agents) | `sk-...` | + +### Project TOML (`[agents.]` section) + +```toml +[agents.dev-qwen] +base_url = "http://10.10.10.1:8081" +model = "unsloth/Qwen3.5-35B-A3B" +api_key = "sk-no-key-required" +roles = ["dev"] +forge_user = "dev-qwen" +compact_pct = 60 +poll_interval = 60 +``` + +| Field | Description | +|-------|-------------| +| `base_url` | llama-server endpoint | +| `model` | Model name (for logging/identification) | +| `api_key` | Required by API; set to placeholder for llama | +| `roles` | Agent roles this instance handles | +| `forge_user` | Forgejo bot username | +| `compact_pct` | Context compaction threshold (lower = more aggressive) | +| `poll_interval` | Seconds between polling cycles | ## Behaviour -- `agents-llama`: `AGENT_ROLES=dev` — only picks up dev work. -- `agents-llama-all`: `AGENT_ROLES=review,dev,gardener,architect,planner,predictor,supervisor` — runs all 7 roles. +- Each agent runs with `AGENT_ROLES` set to its configured roles - `CLAUDE_AUTOCOMPACT_PCT_OVERRIDE=60` — more aggressive compaction for smaller - context windows. -- Serialises on the llama-server's single KV cache (AD-002). + context windows +- Agents serialize on the llama-server's single KV cache (AD-002) -## Disabling +## Troubleshooting -Set `ENABLE_LLAMA_AGENT=0` (or leave it unset) and regenerate. The service -block is omitted entirely from `docker-compose.yml`; the stack starts cleanly -without it. +### Agent service not starting + +Check that the service was created by `disinto hire-an-agent`: + +```bash +docker compose config | grep -A5 "agents-dev-qwen" +``` + +If the service is missing, re-run `disinto hire-an-agent dev-qwen dev` to +regenerate `docker-compose.yml`. + +### Model endpoint unreachable + +Verify llama-server is accessible from inside Docker: + +```bash +docker compose -f docker-compose.yml exec agents curl -sf http://host.docker.internal:8081/health +``` + +If using a custom host IP, update `ANTHROPIC_BASE_URL` in `.env`: + +```bash +# Update the base URL +sed -i 's|^ANTHROPIC_BASE_URL=.*|ANTHROPIC_BASE_URL=http://192.168.1.100:8081|' .env + +# Restart the agent +docker compose restart agents-dev-qwen +``` + +### Invalid agent name + +Agent names must match `^[a-z]([a-z0-9]|-[a-z0-9])*$` (lowercase letters, digits, +hyphens; starts with letter, ends with alphanumeric). Invalid names like +`dev-qwen2` (trailing digit is OK) or `dev--qwen` (consecutive hyphens) will +be rejected. diff --git a/docs/nomad-migration.md b/docs/nomad-migration.md new file mode 100644 index 0000000..02ff023 --- /dev/null +++ b/docs/nomad-migration.md @@ -0,0 +1,124 @@ + +# Nomad+Vault migration — cutover-day runbook + +`disinto init --backend=nomad` is the single entry-point that turns a fresh +LXC (with the disinto repo cloned) into a running Nomad+Vault cluster with +policies applied, JWT workload-identity auth configured, secrets imported +from the old docker stack, and services deployed. + +## Cutover-day invocation + +On the new LXC, as root (or an operator with NOPASSWD sudo): + +```bash +# Copy the plaintext .env + sops-encrypted .env.vault.enc + age keyfile +# from the old box first (out of band — SSH, USB, whatever your ops +# procedure allows). Then: + +sudo ./bin/disinto init \ + --backend=nomad \ + --import-env /tmp/.env \ + --import-sops /tmp/.env.vault.enc \ + --age-key /tmp/keys.txt \ + --with forgejo +``` + +This runs, in order: + +1. **`lib/init/nomad/cluster-up.sh`** (S0) — installs Nomad + Vault + binaries, writes `/etc/nomad.d/*`, initializes Vault, starts both + services, waits for the Nomad node to become ready. +2. **`tools/vault-apply-policies.sh`** (S2.1) — syncs every + `vault/policies/*.hcl` into Vault as an ACL policy. Idempotent. +3. **`lib/init/nomad/vault-nomad-auth.sh`** (S2.3) — enables Vault's + JWT auth method at `jwt-nomad`, points it at Nomad's JWKS, writes + one role per policy, reloads Nomad so jobs can exchange + workload-identity tokens for Vault tokens. Idempotent. +4. **`tools/vault-import.sh`** (S2.2) — reads `/tmp/.env` and the + sops-decrypted `/tmp/.env.vault.enc`, writes them to the KV paths + matching the S2.1 policy layout (`kv/disinto/bots/*`, `kv/disinto/shared/*`, + `kv/disinto/runner/*`). Idempotent (overwrites KV v2 data in place). +5. **`lib/init/nomad/deploy.sh forgejo`** (S1) — validates + runs the + `nomad/jobs/forgejo.hcl` jobspec. Forgejo reads its admin creds from + Vault via the `template` stanza (S2.4). + +## Flag summary + +| Flag | Meaning | +|---|---| +| `--backend=nomad` | Switch the init dispatcher to the Nomad+Vault path (instead of docker compose). | +| `--empty` | Bring the cluster up, skip policies/auth/import/deploy. Escape hatch for debugging. | +| `--with forgejo[,…]` | Deploy these services after the cluster is up. | +| `--import-env PATH` | Plaintext `.env` from the old stack. Optional. | +| `--import-sops PATH` | Sops-encrypted `.env.vault.enc` from the old stack. Requires `--age-key`. | +| `--age-key PATH` | Age keyfile used to decrypt `--import-sops`. Requires `--import-sops`. | +| `--dry-run` | Print the full plan (cluster-up + policies + auth + import + deploy) and exit. Touches nothing. | + +### Flag validation + +- `--import-sops` without `--age-key` → error. +- `--age-key` without `--import-sops` → error. +- `--import-env` alone (no sops) → OK (imports just the plaintext `.env`). +- `--backend=docker` with any `--import-*` flag → error. +- `--empty` with any `--import-*` flag → error (mutually exclusive: `--empty` + skips the import step, so pairing them silently discards the import + intent). + +## Idempotency + +Every layer is idempotent by design. Re-running the same command on an +already-provisioned box is a no-op at every step: + +- **Cluster-up:** second run detects running `nomad`/`vault` systemd + units and state files, skips re-init. +- **Policies:** byte-for-byte compare against on-server policy text; + "unchanged" for every untouched file. +- **Auth:** skips auth-method create if `jwt-nomad/` already enabled, + skips config write if the JWKS + algs match, skips server.hcl write if + the file on disk is identical to the repo copy. +- **Import:** KV v2 writes overwrite in place (same path, same keys, + same values → no new version). +- **Deploy:** `nomad job run` is declarative; same jobspec → no new + allocation. + +## Dry-run + +```bash +./bin/disinto init --backend=nomad \ + --import-env /tmp/.env \ + --import-sops /tmp/.env.vault.enc \ + --age-key /tmp/keys.txt \ + --with forgejo \ + --dry-run +``` + +Prints the five-section plan — cluster-up, policies, auth, import, +deploy — with every path and every argv that would be executed. No +network, no sudo, no state mutation. See +`tests/disinto-init-nomad.bats` for the exact output shape. + +## No-import path + +If you already have `kv/disinto/*` seeded by other means (manual +`vault kv put`, a replica, etc.), omit all three `--import-*` flags. +`disinto init --backend=nomad --with forgejo` still applies policies, +configures auth, and deploys — but skips the import step with: + +``` +[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services +``` + +Forgejo's template stanza will fail to render (and thus the allocation +will stall) until those KV paths exist — so either import them or seed +them first. + +## Secret hygiene + +- Never log a secret value. The CLI only prints paths (`--import-env`, + `--age-key`) and KV *paths* (`kv/disinto/bots/review/token`), never + the values themselves. `tools/vault-import.sh` is the only thing that + reads the values, and it pipes them directly into Vault's HTTP API. +- The age keyfile must be mode 0400 — `vault-import.sh` refuses to + source a keyfile with looser permissions. +- `VAULT_ADDR` must be localhost during import — the import tool + refuses to run against a remote Vault, preventing accidental exposure. diff --git a/formulas/release.sh b/formulas/release.sh index b8c4eb6..6526d1a 100644 --- a/formulas/release.sh +++ b/formulas/release.sh @@ -178,8 +178,8 @@ log "Tagged disinto/agents:${RELEASE_VERSION}" log "Step 6/6: Restarting agent containers" -docker compose stop agents agents-llama 2>/dev/null || true -docker compose up -d agents agents-llama +docker compose stop agents 2>/dev/null || true +docker compose up -d agents log "Agent containers restarted" # ── Done ───────────────────────────────────────────────────────────────── diff --git a/formulas/release.toml b/formulas/release.toml index f702f42..ccd7f95 100644 --- a/formulas/release.toml +++ b/formulas/release.toml @@ -189,10 +189,10 @@ Restart agent containers to use the new image. - docker compose pull agents 2. Stop and remove existing agent containers: - - docker compose down agents agents-llama 2>/dev/null || true + - docker compose down agents 3. Start agents with new image: - - docker compose up -d agents agents-llama + - docker compose up -d agents 4. Wait for containers to be healthy: - for i in {1..30}; do @@ -203,7 +203,7 @@ Restart agent containers to use the new image. - done 5. Verify containers are running: - - docker compose ps agents agents-llama + - docker compose ps agents 6. Log restart: - echo "Restarted agents containers" diff --git a/gardener/AGENTS.md b/gardener/AGENTS.md index 3a26084..e9ad846 100644 --- a/gardener/AGENTS.md +++ b/gardener/AGENTS.md @@ -1,4 +1,4 @@ - + # Gardener Agent **Role**: Backlog grooming — detect duplicate issues, missing acceptance diff --git a/gardener/dust.jsonl b/gardener/dust.jsonl new file mode 100644 index 0000000..14b0d5c --- /dev/null +++ b/gardener/dust.jsonl @@ -0,0 +1 @@ +{"issue":915,"group":"lib/generators.sh","title":"remove no-op sed in generate_compose --build mode","reason":"sed replaces agents: with itself — no behavior change; single-line removal","ts":"2026-04-17T01:04:05Z"} diff --git a/gardener/pending-actions.json b/gardener/pending-actions.json index a5cc3c4..1c89c7d 100644 --- a/gardener/pending-actions.json +++ b/gardener/pending-actions.json @@ -1,7 +1,37 @@ [ { "action": "edit_body", - "issue": 835, - "body": "Bugfix for S0.1 (#821). Discovered during Step 0 end-to-end verification on a fresh LXC.\n\n## Symptom\n\n```\n$ ./bin/disinto init --backend=nomad --empty\nError: --empty is only valid with --backend=nomad\n```\n\nThe error is nonsensical — `--backend=nomad` is right there.\n\n## Root cause\n\n`bin/disinto` → `disinto_init` (around line 710) consumes the first positional arg as `repo_url` **before** the argparse `while` loop runs:\n\n```bash\ndisinto_init() {\n local repo_url=\"${1:-}\"\n if [ -z \"$repo_url\" ]; then\n echo \"Error: repo URL required\" >&2\n ...\n fi\n shift\n # ... then while-loop parses flags ...\n}\n```\n\nSo `disinto init --backend=nomad --empty` becomes:\n- `repo_url = \"--backend=nomad\"` (swallowed)\n- `--empty` seen by loop → `empty=true`\n- `backend` stays at default `\"docker\"`\n- Validation at line 747: `empty=true && backend != \"nomad\"` → error\n\n## Why repo_url is wrong for nomad\n\nFor `--backend=nomad`, the cluster-up flow doesn't clone anything — the LXC already has the repo cloned by the operator. `repo_url` is a docker-backend concept.\n\n## Fix\n\nIn `disinto_init`, move backend detection to **before** the `repo_url` consumption, and make `repo_url` conditional on `backend=docker`:\n\n```bash\ndisinto_init() {\n # Pre-scan for --backend to know whether repo_url is required\n local backend=\"docker\"\n for arg in \"$@\"; do\n case \"$arg\" in\n --backend) ;; # handled below\n --backend=*) backend=\"${arg#--backend=}\" ;;\n esac\n done\n # Also handle space-separated form\n local i=1\n while [ $i -le $# ]; do\n if [ \"${!i}\" = \"--backend\" ]; then\n i=$((i+1))\n backend=\"${!i}\"\n fi\n i=$((i+1))\n done\n\n local repo_url=\"\"\n if [ \"$backend\" = \"docker\" ]; then\n repo_url=\"${1:-}\"\n if [ -z \"$repo_url\" ] || [[ \"$repo_url\" == --* ]]; then\n echo \"Error: repo URL required for docker backend\" >&2\n echo \"Usage: disinto init [options]\" >&2\n exit 1\n fi\n shift\n fi\n # ... rest of argparse unchanged, it re-reads --backend cleanly\n```\n\nSimpler alternative: if first arg starts with `--`, assume no positional and skip repo_url consumption entirely (covers nomad + any future `--help`-style invocation).\n\nEither shape is fine; pick the cleaner one.\n\n## Acceptance criteria\n\n- [ ] `./bin/disinto init --backend=nomad --empty` runs `lib/init/nomad/cluster-up.sh` without error on a clean LXC.\n- [ ] `./bin/disinto init --backend=nomad --empty --dry-run` prints the 9-step plan and exits 0.\n- [ ] `./bin/disinto init ` (docker path) behaves identically to today — existing smoke path passes.\n- [ ] `./bin/disinto init` (no args, docker implied) still errors with the \"repo URL required\" message.\n- [ ] `./bin/disinto init --backend=docker` (no repo) errors helpfully — not \"Unknown option: --backend=docker\".\n- [ ] shellcheck clean.\n\n## Verified regression case from Step 0 testing\n\nOn a fresh Ubuntu 24.04 LXC, after `./lib/init/nomad/cluster-up.sh` was invoked directly (workaround), the cluster came up healthy end-to-end:\n\n- Nomad node status: 1 node ready\n- Vault status: Sealed=false, Initialized=true\n- Re-run of cluster-up.sh was fully idempotent\n\nSo the bug is isolated to `bin/disinto` argparse; the rest of the Step 0 code path is solid. This fix unblocks the formal Step 0 acceptance test.\n\n## Labels / meta\n\n- `[nomad-step-0] S0.1-fix` — no dependencies; gates Step 1.\n\n## Affected files\n\n- `bin/disinto` — `disinto_init()` function, around line 710: pre-scan for `--backend` before consuming `repo_url` positional argument\n" + "issue": 910, + "body": "Flagged by AI reviewer in PR #909.\n\n## Problem\n\n`tools/vault-import.sh` still uses hardcoded `secret/data/${path}` for its curl-based KV write (lines 149, 151, 162, 166, 170). The rest of the codebase was migrated to the configurable `VAULT_KV_MOUNT` variable (defaulting to `kv`) via PR #909. Any deployment with `kv/` as its KV mount will see 403/404 failures when `vault-import.sh` runs.\n\n## Fix\n\nEither:\n1. Refactor the write in `vault-import.sh` to call `hvault_kv_put` (which now respects `VAULT_KV_MOUNT`), or\n2. Replace the hardcoded `secret/data` reference with `${VAULT_KV_MOUNT:-kv}/data` matching the convention in `lib/hvault.sh`.\n\n---\n*Auto-created from AI review*\n\n## Affected files\n\n- `tools/vault-import.sh` (lines 149, 151, 162, 166, 170 — hardcoded `secret/data` references)\n- `lib/hvault.sh` (reference implementation using `VAULT_KV_MOUNT`)\n\n## Acceptance criteria\n\n- [ ] `tools/vault-import.sh` uses `${VAULT_KV_MOUNT:-kv}/data` (or calls `hvault_kv_put`) instead of hardcoded `secret/data`\n- [ ] No hardcoded `secret/data` path references remain in `tools/vault-import.sh`\n- [ ] Vault KV writes succeed when `VAULT_KV_MOUNT=kv` is set (matching the standard deployment config)\n- [ ] `shellcheck` clean\n" + }, + { + "action": "add_label", + "issue": 910, + "label": "backlog" + }, + { + "action": "edit_body", + "issue": 914, + "body": "Flagged by AI reviewer in PR #911.\n\n## Problem\n\n`lib/generators.sh` fixes the `agents` service missing `pull_policy: build` in `--build` mode (PR #893), but the `edge` service has the same root cause: the sed replacement at line 664 produces `build: ./docker/edge` with no `pull_policy: build`. Without it, `docker compose up -d --force-recreate` reuses the cached edge image and silently keeps running stale code even after source changes.\n\n## Fix\n\nAdd `\\n pull_policy: build` to the edge sed replacement, matching the pattern applied to agents in PR #893.\n\n---\n*Auto-created from AI review*\n\n## Affected files\n\n- `lib/generators.sh` (line 664 — edge service sed replacement missing `pull_policy: build`)\n\n## Acceptance criteria\n\n- [ ] `lib/generators.sh` edge service block emits `pull_policy: build` when `--build` mode is active (matching the pattern from PR #893 for the agents service)\n- [ ] `docker compose up -d --force-recreate` after source changes rebuilds the edge image rather than using the cached layer\n- [ ] Generated `docker-compose.yml` edge service stanza contains `pull_policy: build`\n- [ ] `shellcheck` clean\n" + }, + { + "action": "add_label", + "issue": 914, + "label": "backlog" + }, + { + "action": "edit_body", + "issue": 867, + "body": "## Incident\n\n**2026-04-16 ~10:55–11:52 UTC.** Woodpecker CI agent (`disinto-woodpecker-agent`) entered a repeated gRPC-error crashloop (Codeberg #813 class — gRPC-in-nested-docker). Every workflow it accepted exited 1 within seconds, never actually running pipeline steps.\n\n**Blast radius:** dev-qwen took issue #842 at 10:55, opened PR #859, and burned its full 3-attempt `pr-lifecycle` CI-fix budget between 10:55 and 11:08 reacting to these infra-flake \"CI failures.\" Each failure arrived in ~30–60 seconds, too fast to be a real test run. After exhausting the budget, dev-qwen marked #842 as `blocked: ci_exhausted` and moved on. No real bug was being detected; the real failure surfaced later only after an operator restarted the WP agent and manually retriggered pipeline #966 — which then returned a legitimate `bats-init-nomad` failure in test #6 (different issue).\n\n**Root cause of the infra-flake:** gRPC-in-nested-docker bug, Woodpecker server ↔ agent comms inside nested containers. Known-flaky; restart of `disinto-woodpecker-agent` clears it.\n\n**Recovery:** operator `docker restart disinto-woodpecker-agent` + retrigger pipelines via WP API POST `/api/repos/2/pipelines/`. Fresh run reached real stage signal.\n\n## Why this burned dev-qwen's budget\n\n`pr-lifecycle`'s CI-fix budget treats every failed commit-status as a signal to invoke the agent. It has no notion of \"infra flake\" vs. \"real test failure\" and no heuristic to distinguish them. Four infra-flake failures in 13 minutes looked identical to four real code-bug failures.\n\n## Suggestions — what supervisor can check every 20min\n\nSupervisor runs every `1200s` already. Add these probes:\n\n**1. WP agent container health.**\n```\ndocker inspect disinto-woodpecker-agent --format '{{.State.Health.Status}}'\n```\nIf `unhealthy` for the second consecutive supervisor tick → **restart it automatically + post a comment on any currently-running dev-bot/dev-qwen issues warning \"CI agent was restarted; subsequent failures before this marker may be infra-flake.\"**\n\n**2. Fast-failure heuristic on WP pipelines.**\nQuery WP API `GET /api/repos/2/pipelines?page=1`. For each pipeline in state `failure`, compute `finished - started`. If duration < 60s, flag as probable infra-flake. Three flagged flakes within a 15-min window → trigger agent restart as in (1) and a bulk-retrigger via POST `/api/repos/2/pipelines/` for each.\n\n**3. grpc error pattern in agent log.**\n`docker logs --since 20m disinto-woodpecker-agent 2>&1 | grep -c 'grpc error'` — if ≥3 matches, agent is probably wedged. Trigger restart as in (1).\n\n**4. Issue-level guard.**\nWhen supervisor detects an agent restart, scan for issues updated in the preceding 30min with label `blocked: ci_exhausted` and for each one:\n- unassign + remove `blocked` label (return to pool)\n- comment on the issue: *\"CI agent was unhealthy between HH:MM and HH:MM — prior 3/3 retry budget may have been spent on infra flake, not real failures. Re-queueing for a fresh attempt.\"*\n- retrigger the PR's latest WP pipeline\n\nThis last step is the key correction: **`ci_exhausted` preceded by WP-agent-unhealth = false positive; return to pool with context.**\n\n## Why this matters for the migration\n\nBetween now and cutover every WP CI flake that silently exhausts an agent's budget steals hours of clock time. Without an automatic recovery path, the pace of the step-N backlogs falls off a cliff the moment the agent next goes unhealthy — and it *will* go unhealthy again (Codeberg #813 is not fixed upstream yet).\n\n## Fix for this specific incident (already applied manually)\n\n- Restarted `disinto-woodpecker-agent`.\n- Closed PR #859 (kept branch `fix/issue-842` at `64080232`).\n- Unassigned dev-qwen from #842, removed `blocked` label, appended prior-art section + pipeline #966 test-#6 failure details to issue body so the next claimant starts with full context.\n\n## Non-goals\n\n- Not trying to fix Codeberg #813 itself (upstream gRPC-in-nested-docker issue).\n- Not trying to fix `pr-lifecycle`'s budget logic — the supervisor-side detection is cheaper and more robust than per-issue budget changes.\n\n## Labels / meta\n\n- `bug-report` + supervisor-focused. Classify severity as blocker for the migration cadence (not for factory day-to-day — it only bites when an unfixable-by-dev issue hits the budget).\n\n## Affected files\n\n- `supervisor/supervisor-run.sh` — add WP agent health probes and flake-detection logic\n- `supervisor/preflight.sh` — may need additional data collection for WP agent health status\n\n## Acceptance criteria\n\n- [ ] Supervisor detects an unhealthy `disinto-woodpecker-agent` container (via `docker inspect` health status or gRPC error log count ≥ 3) and automatically restarts it\n- [ ] After an auto-restart, supervisor scans for issues updated in the prior 30 min labeled `blocked: ci_exhausted` and returns them to the pool (unassign, remove `blocked`, add comment noting infra-flake window)\n- [ ] Fast-failure heuristic: pipelines completing in <60s are flagged as probable infra-flake; 3+ in a 15-min window triggers the restart+retrigger flow\n- [ ] Already-swept PRs/issues are not processed twice (idempotency guard via `` comment)\n- [ ] CI green\n" + }, + { + "action": "add_label", + "issue": 867, + "label": "backlog" + }, + { + "action": "add_label", + "issue": 820, + "label": "backlog" } ] diff --git a/lib/AGENTS.md b/lib/AGENTS.md index 555d0f7..97e6f5e 100644 --- a/lib/AGENTS.md +++ b/lib/AGENTS.md @@ -1,4 +1,4 @@ - + # Shared Helpers (`lib/`) All agents source `lib/env.sh` as their first action. Additional helpers are @@ -34,5 +34,5 @@ sourced as needed. | `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses ` ... ` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) | | `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) | | `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) | -| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | Not sourced at runtime yet — pure scaffolding for Nomad+Vault migration (#799) | -| `lib/init/nomad/` | Nomad+Vault Step 0 installer scripts. `cluster-up.sh` — idempotent orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling. `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | +| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh` | +| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling. `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | diff --git a/lib/action-vault.sh b/lib/action-vault.sh index 6348cc6..7602a39 100644 --- a/lib/action-vault.sh +++ b/lib/action-vault.sh @@ -128,7 +128,6 @@ vault_request() { # Validate TOML content local tmp_toml tmp_toml=$(mktemp /tmp/vault-XXXXXX.toml) - trap 'rm -f "$tmp_toml"' RETURN printf '%s' "$toml_content" > "$tmp_toml" @@ -136,6 +135,7 @@ vault_request() { local vault_env="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/action-vault/vault-env.sh" if [ ! -f "$vault_env" ]; then echo "ERROR: vault-env.sh not found at $vault_env" >&2 + rm -f "$tmp_toml" return 1 fi @@ -145,11 +145,15 @@ vault_request() { if ! source "$vault_env"; then FORGE_TOKEN="${_saved_forge_token:-}" echo "ERROR: failed to source vault-env.sh" >&2 + rm -f "$tmp_toml" return 1 fi # Restore caller's FORGE_TOKEN after validation FORGE_TOKEN="${_saved_forge_token:-}" + # Set trap AFTER sourcing vault-env.sh to avoid RETURN trap firing during source + trap 'rm -f "$tmp_toml"' RETURN + # Run validation if ! validate_vault_action "$tmp_toml"; then echo "ERROR: TOML validation failed" >&2 diff --git a/lib/forge-setup.sh b/lib/forge-setup.sh index 2b7b697..2f8b117 100644 --- a/lib/forge-setup.sh +++ b/lib/forge-setup.sh @@ -356,16 +356,6 @@ setup_forge() { [predictor-bot]="FORGE_PREDICTOR_PASS" [architect-bot]="FORGE_ARCHITECT_PASS" ) - # Llama bot users (local-model agents) — separate from main agents - # Each llama agent gets its own Forgejo user, token, and password - local -A llama_token_vars=( - [dev-qwen]="FORGE_TOKEN_LLAMA" - [dev-qwen-nightly]="FORGE_TOKEN_LLAMA_NIGHTLY" - ) - local -A llama_pass_vars=( - [dev-qwen]="FORGE_PASS_LLAMA" - [dev-qwen-nightly]="FORGE_PASS_LLAMA_NIGHTLY" - ) local bot_user bot_pass token token_var pass_var @@ -515,159 +505,12 @@ setup_forge() { fi done - # Create llama bot users and tokens (local-model agents) - # These are separate from the main agents and get their own credentials - echo "" - echo "── Setting up llama bot users ────────────────────────────" - - local llama_user llama_pass llama_token llama_token_var llama_pass_var - for llama_user in "${!llama_token_vars[@]}"; do - llama_token_var="${llama_token_vars[$llama_user]}" - llama_pass_var="${llama_pass_vars[$llama_user]}" - - # Check if token already exists in .env - local token_exists=false - if _token_exists_in_env "$llama_token_var" "$env_file"; then - token_exists=true - fi - - # Check if password already exists in .env - local pass_exists=false - if _pass_exists_in_env "$llama_pass_var" "$env_file"; then - pass_exists=true - fi - - # Check if llama bot user exists on Forgejo - local llama_user_exists=false - if curl -sf --max-time 5 \ - -H "Authorization: token ${admin_token}" \ - "${forge_url}/api/v1/users/${llama_user}" >/dev/null 2>&1; then - llama_user_exists=true - fi - - # Skip token/password regeneration if both exist in .env and not forcing rotation - if [ "$token_exists" = true ] && [ "$pass_exists" = true ] && [ "$rotate_tokens" = false ]; then - echo " ${llama_user} token and password preserved (use --rotate-tokens to force)" - # Still export the existing token for use within this run - local existing_token existing_pass - existing_token=$(grep "^${llama_token_var}=" "$env_file" | head -1 | cut -d= -f2-) - existing_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-) - export "${llama_token_var}=${existing_token}" - export "${llama_pass_var}=${existing_pass}" - continue - fi - - # Generate new credentials if: - # - Token doesn't exist (first run) - # - Password doesn't exist (first run) - # - --rotate-tokens flag is set (explicit rotation) - if [ "$llama_user_exists" = false ]; then - # User doesn't exist - create it - llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)" - echo "Creating llama bot user: ${llama_user}" - local create_output - if ! create_output=$(_forgejo_exec forgejo admin user create \ - --username "${llama_user}" \ - --password "${llama_pass}" \ - --email "${llama_user}@disinto.local" \ - --must-change-password=false 2>&1); then - echo "Error: failed to create llama bot user '${llama_user}':" >&2 - echo " ${create_output}" >&2 - exit 1 - fi - # Forgejo 11.x ignores --must-change-password=false on create; - # explicitly clear the flag so basic-auth token creation works. - _forgejo_exec forgejo admin user change-password \ - --username "${llama_user}" \ - --password "${llama_pass}" \ - --must-change-password=false - - # Verify llama bot user was actually created - if ! curl -sf --max-time 5 \ - -H "Authorization: token ${admin_token}" \ - "${forge_url}/api/v1/users/${llama_user}" >/dev/null 2>&1; then - echo "Error: llama bot user '${llama_user}' not found after creation" >&2 - exit 1 - fi - echo " ${llama_user} user created" - else - # User exists - reset password if needed - echo " ${llama_user} user exists" - if [ "$rotate_tokens" = true ] || [ "$pass_exists" = false ]; then - llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)" - _forgejo_exec forgejo admin user change-password \ - --username "${llama_user}" \ - --password "${llama_pass}" \ - --must-change-password=false || { - echo "Error: failed to reset password for existing llama bot user '${llama_user}'" >&2 - exit 1 - } - echo " ${llama_user} password reset for token generation" - else - # Password exists, get it from .env - llama_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-) - fi - fi - - # Generate token via API (basic auth as the llama user) - # First, delete any existing tokens to avoid name collision - local existing_llama_token_ids - existing_llama_token_ids=$(curl -sf \ - -u "${llama_user}:${llama_pass}" \ - "${forge_url}/api/v1/users/${llama_user}/tokens" 2>/dev/null \ - | jq -r '.[].id // empty' 2>/dev/null) || existing_llama_token_ids="" - - # Delete any existing tokens for this user - if [ -n "$existing_llama_token_ids" ]; then - while IFS= read -r tid; do - [ -n "$tid" ] && curl -sf -X DELETE \ - -u "${llama_user}:${llama_pass}" \ - "${forge_url}/api/v1/users/${llama_user}/tokens/${tid}" >/dev/null 2>&1 || true - done <<< "$existing_llama_token_ids" - fi - - llama_token=$(curl -sf -X POST \ - -u "${llama_user}:${llama_pass}" \ - -H "Content-Type: application/json" \ - "${forge_url}/api/v1/users/${llama_user}/tokens" \ - -d "{\"name\":\"disinto-${llama_user}-token\",\"scopes\":[\"all\"]}" 2>/dev/null \ - | jq -r '.sha1 // empty') || llama_token="" - - if [ -z "$llama_token" ]; then - echo "Error: failed to create API token for '${llama_user}'" >&2 - exit 1 - fi - - # Store token in .env under the llama-specific variable name - if grep -q "^${llama_token_var}=" "$env_file" 2>/dev/null; then - sed -i "s|^${llama_token_var}=.*|${llama_token_var}=${llama_token}|" "$env_file" - else - printf '%s=%s\n' "$llama_token_var" "$llama_token" >> "$env_file" - fi - export "${llama_token_var}=${llama_token}" - echo " ${llama_user} token generated and saved (${llama_token_var})" - - # Store password in .env for git HTTP push (#361) - # Forgejo 11.x API tokens don't work for git push; password auth does. - if grep -q "^${llama_pass_var}=" "$env_file" 2>/dev/null; then - sed -i "s|^${llama_pass_var}=.*|${llama_pass_var}=${llama_pass}|" "$env_file" - else - printf '%s=%s\n' "$llama_pass_var" "$llama_pass" >> "$env_file" - fi - export "${llama_pass_var}=${llama_pass}" - echo " ${llama_user} password saved (${llama_pass_var})" - done - # Create .profile repos for all bot users (if they don't already exist) # This runs the same logic as hire-an-agent Step 2-3 for idempotent setup echo "" echo "── Setting up .profile repos ────────────────────────────" local -a bot_users=(dev-bot review-bot planner-bot gardener-bot vault-bot supervisor-bot predictor-bot architect-bot) - # Add llama bot users to .profile repo creation - for llama_user in "${!llama_token_vars[@]}"; do - bot_users+=("$llama_user") - done local bot_user for bot_user in "${bot_users[@]}"; do @@ -775,15 +618,6 @@ setup_forge() { -d "{\"permission\":\"${bot_perm}\"}" >/dev/null 2>&1 || true done - # Add llama bot users as write collaborators for local-model agents - for llama_user in "${!llama_token_vars[@]}"; do - curl -sf -X PUT \ - -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \ - -H "Content-Type: application/json" \ - "${forge_url}/api/v1/repos/${repo_slug}/collaborators/${llama_user}" \ - -d '{"permission":"write"}' >/dev/null 2>&1 || true - done - # Add disinto-admin as admin collaborator curl -sf -X PUT \ -H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \ diff --git a/lib/generators.sh b/lib/generators.sh index af08aa2..8f132bb 100644 --- a/lib/generators.sh +++ b/lib/generators.sh @@ -102,12 +102,32 @@ _generate_local_model_services() { # so we key the env-var lookup by forge_user (which hire-agent.sh # writes as the Forgejo username). Apply the same tr 'a-z-' 'A-Z_' # convention as hire-agent.sh Gap 1 so the names match. + # + # NOTE (#845): the emitted block has NO `profiles:` key. The + # [agents.] TOML entry is already the activation gate — + # its presence is what drives emission here. Profile-gating + # the service caused `disinto up` (without COMPOSE_PROFILES) + # to treat the hired container as an orphan and silently + # remove it via --remove-orphans. local user_upper user_upper=$(echo "$forge_user" | tr 'a-z-' 'A-Z_') cat >> "$temp_file" <> "$compose_file" <<'LLAMAEOF' - - agents-llama: - build: - context: . - dockerfile: docker/agents/Dockerfile - container_name: disinto-agents-llama - restart: unless-stopped - security_opt: - - apparmor=unconfined - volumes: - - agent-data:/home/agent/data - - project-repos:/home/agent/repos - - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro - - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - - woodpecker-data:/woodpecker-data:ro - environment: - FORGE_URL: http://forgejo:3000 - FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto} - FORGE_TOKEN: ${FORGE_TOKEN_LLAMA:-} - FORGE_PASS: ${FORGE_PASS_LLAMA:-} - FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-} - WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-} - CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200} - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1} - CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "60" - ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} - ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-} - FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-} - DISINTO_CONTAINER: "1" - PROJECT_NAME: ${PROJECT_NAME:-project} - PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project} - WOODPECKER_DATA_DIR: /woodpecker-data - WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID" - CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config} - POLL_INTERVAL: ${POLL_INTERVAL:-300} - AGENT_ROLES: dev - healthcheck: - test: ["CMD", "pgrep", "-f", "entrypoint.sh"] - interval: 60s - timeout: 5s - retries: 3 - start_period: 30s - depends_on: - forgejo: - condition: service_healthy - networks: - - disinto-net - - agents-llama-all: - build: - context: . - dockerfile: docker/agents/Dockerfile - container_name: disinto-agents-llama-all - restart: unless-stopped - profiles: ["agents-llama-all"] - security_opt: - - apparmor=unconfined - volumes: - - agent-data:/home/agent/data - - project-repos:/home/agent/repos - - ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared} - - ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro - - ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro - - ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro - - ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro - - woodpecker-data:/woodpecker-data:ro - environment: - FORGE_URL: http://forgejo:3000 - FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto} - FORGE_TOKEN: ${FORGE_TOKEN_LLAMA:-} - FORGE_PASS: ${FORGE_PASS_LLAMA:-} - FORGE_REVIEW_TOKEN: ${FORGE_REVIEW_TOKEN:-} - FORGE_PLANNER_TOKEN: ${FORGE_PLANNER_TOKEN:-} - FORGE_GARDENER_TOKEN: ${FORGE_GARDENER_TOKEN:-} - FORGE_VAULT_TOKEN: ${FORGE_VAULT_TOKEN:-} - FORGE_SUPERVISOR_TOKEN: ${FORGE_SUPERVISOR_TOKEN:-} - FORGE_PREDICTOR_TOKEN: ${FORGE_PREDICTOR_TOKEN:-} - FORGE_ARCHITECT_TOKEN: ${FORGE_ARCHITECT_TOKEN:-} - FORGE_FILER_TOKEN: ${FORGE_FILER_TOKEN:-} - FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-} - WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-} - CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200} - CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1} - CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "60" - CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: "1" - ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} - ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-} - FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-} - DISINTO_CONTAINER: "1" - PROJECT_NAME: ${PROJECT_NAME:-project} - PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project} - WOODPECKER_DATA_DIR: /woodpecker-data - WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID" - CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config} - POLL_INTERVAL: ${POLL_INTERVAL:-300} - GARDENER_INTERVAL: ${GARDENER_INTERVAL:-21600} - ARCHITECT_INTERVAL: ${ARCHITECT_INTERVAL:-21600} - PLANNER_INTERVAL: ${PLANNER_INTERVAL:-43200} - SUPERVISOR_INTERVAL: ${SUPERVISOR_INTERVAL:-1200} - AGENT_ROLES: review,dev,gardener,architect,planner,predictor,supervisor - healthcheck: - test: ["CMD", "pgrep", "-f", "entrypoint.sh"] - interval: 60s - timeout: 5s - retries: 3 - start_period: 30s - depends_on: - forgejo: - condition: service_healthy - woodpecker: - condition: service_started - networks: - - disinto-net -LLAMAEOF - fi - # Resume the rest of the compose file (runner onward) cat >> "$compose_file" <<'COMPOSEEOF' @@ -761,7 +660,7 @@ COMPOSEEOF # In build mode, replace image: with build: for locally-built images if [ "$use_build" = true ]; then sed -i 's|^\( agents:\)|\1|' "$compose_file" - sed -i '/^ image: ghcr\.io\/disinto\/agents:/{s|image: ghcr\.io/disinto/agents:.*|build:\n context: .\n dockerfile: docker/agents/Dockerfile|}' "$compose_file" + sed -i '/^ image: ghcr\.io\/disinto\/agents:/{s|image: ghcr\.io/disinto/agents:.*|build:\n context: .\n dockerfile: docker/agents/Dockerfile\n pull_policy: build|}' "$compose_file" sed -i '/^ image: ghcr\.io\/disinto\/edge:/{s|image: ghcr\.io/disinto/edge:.*|build: ./docker/edge|}' "$compose_file" fi diff --git a/lib/hire-agent.sh b/lib/hire-agent.sh index 2bbea63..170389f 100644 --- a/lib/hire-agent.sh +++ b/lib/hire-agent.sh @@ -30,6 +30,29 @@ disinto_hire_an_agent() { echo "Usage: disinto hire-an-agent [--formula ] [--local-model ] [--model ] [--poll-interval ]" >&2 exit 1 fi + + # Validate agent name before any side effects (Forgejo user creation, TOML + # write, token issuance). The name flows through several systems that have + # stricter rules than the raw TOML spec: + # - load-project.sh emits shell vars keyed by the name (dashes are mapped + # to underscores via tr 'a-z-' 'A-Z_') + # - generators.sh emits a docker-compose service name `agents-` and + # uppercases it for env var keys (#852 tracks the `^^` bug; we keep the + # grammar tight here so that fix can happen without re-validation) + # - Forgejo usernames are lowercase alnum + dash + # Constraint: start with a lowercase letter, contain only [a-z0-9-], end + # with a lowercase letter or digit (no trailing dash), no consecutive + # dashes. Rejecting at hire-time prevents unparseable TOML sections like + # [agents.dev-qwen2] from landing on disk and crashing load-project.sh on + # the next `disinto up` (#862). + if ! [[ "$agent_name" =~ ^[a-z]([a-z0-9]|-[a-z0-9])*$ ]]; then + echo "Error: invalid agent name '${agent_name}'" >&2 + echo " Agent names must match: ^[a-z]([a-z0-9]|-[a-z0-9])*$" >&2 + echo " (lowercase letters/digits/single dashes, starts with letter, ends with alphanumeric)" >&2 + echo " Examples: dev, dev-qwen2, review-qwen, planner" >&2 + exit 1 + fi + shift 2 # Parse flags @@ -229,6 +252,46 @@ disinto_hire_an_agent() { export "${pass_var}=${user_pass}" fi + # Step 1.7: Write backend credentials to .env (#847). + # Local-model agents need ANTHROPIC_BASE_URL; Anthropic-backend agents need ANTHROPIC_API_KEY. + # These must be persisted so the container can start with valid credentials. + echo "" + echo "Step 1.7: Writing backend credentials to .env..." + + if [ -n "$local_model" ]; then + # Local model agent: write ANTHROPIC_BASE_URL + local backend_var="ANTHROPIC_BASE_URL" + local backend_val="$local_model" + local escaped_val + escaped_val=$(printf '%s\n' "$backend_val" | sed 's/[&/\]/\\&/g') + if grep -q "^${backend_var}=" "$env_file" 2>/dev/null; then + sed -i "s|^${backend_var}=.*|${backend_var}=${escaped_val}|" "$env_file" + echo " ${backend_var} updated" + else + printf '%s=%s\n' "$backend_var" "$backend_val" >> "$env_file" + echo " ${backend_var} saved" + fi + export "${backend_var}=${backend_val}" + else + # Anthropic backend: check if ANTHROPIC_API_KEY is set, write it if present + if [ -n "${ANTHROPIC_API_KEY:-}" ]; then + local backend_var="ANTHROPIC_API_KEY" + local backend_val="$ANTHROPIC_API_KEY" + local escaped_key + escaped_key=$(printf '%s\n' "$backend_val" | sed 's/[&/\]/\\&/g') + if grep -q "^${backend_var}=" "$env_file" 2>/dev/null; then + sed -i "s|^${backend_var}=.*|${backend_var}=${escaped_key}|" "$env_file" + echo " ${backend_var} updated" + else + printf '%s=%s\n' "$backend_var" "$backend_val" >> "$env_file" + echo " ${backend_var} saved" + fi + export "${backend_var}=${backend_val}" + else + echo " Note: ANTHROPIC_API_KEY not set — required for Anthropic backend agents" + fi + fi + # Step 1.6: Add the new agent as a write collaborator on the project repo (#856). # Without this, PATCH /issues/{n} {assignees:[agent]} returns 403 Forbidden and # the dev-agent polls forever logging "claim lost to — skipping" (see @@ -472,7 +535,10 @@ EOF local interval="${poll_interval:-60}" echo " Writing [agents.${section_name}] to ${toml_file}..." python3 -c ' -import sys, re, pathlib +import sys +import tomlkit +import re +import pathlib toml_path = sys.argv[1] section_name = sys.argv[2] @@ -485,38 +551,39 @@ poll_interval = sys.argv[7] p = pathlib.Path(toml_path) text = p.read_text() -# Build the new section -new_section = f""" -[agents.{section_name}] -base_url = "{base_url}" -model = "{model}" -api_key = "sk-no-key-required" -roles = ["{role}"] -forge_user = "{agent_name}" -compact_pct = 60 -poll_interval = {poll_interval} -""" +# Step 1: Remove any commented-out [agents.X] blocks (they cause parse issues) +# Match # [agents.section_name] followed by lines that are not section headers +# Use negative lookahead to stop before a real section header (# [ or [) +commented_pattern = rf"(?:^|\n)# \[agents\.{re.escape(section_name)}\](?:\n(?!# \[|\[)[^\n]*)*" +text = re.sub(commented_pattern, "", text, flags=re.DOTALL) -# Check if section already exists and replace it -pattern = rf"\[agents\.{re.escape(section_name)}\][^\[]*" -if re.search(pattern, text): - text = re.sub(pattern, new_section.strip() + "\n", text) -else: - # Remove commented-out example [agents.llama] block if present - text = re.sub( - r"\n# Local-model agents \(optional\).*?(?=\n# \[mirrors\]|\n\[mirrors\]|\Z)", - "", - text, - flags=re.DOTALL, - ) - # Append before [mirrors] if it exists, otherwise at end - mirrors_match = re.search(r"\n(# )?\[mirrors\]", text) - if mirrors_match: - text = text[:mirrors_match.start()] + "\n" + new_section + text[mirrors_match.start():] - else: - text = text.rstrip() + "\n" + new_section +# Step 2: Parse TOML with tomlkit (preserves comments and formatting) +try: + doc = tomlkit.parse(text) +except Exception as e: + print(f"Error: Invalid TOML in {toml_path}: {e}", file=sys.stderr) + sys.exit(1) -p.write_text(text) +# Step 3: Ensure agents table exists +if "agents" not in doc: + doc.add("agents", tomlkit.table()) + +# Step 4: Update the specific agent section +doc["agents"][section_name] = { + "base_url": base_url, + "model": model, + "api_key": "sk-no-key-required", + "roles": [role], + "forge_user": agent_name, + "compact_pct": 60, + "poll_interval": int(poll_interval), +} + +# Step 5: Serialize back to TOML (preserves comments) +output = tomlkit.dumps(doc) + +# Step 6: Write back +p.write_text(output) ' "$toml_file" "$section_name" "$local_model" "$model" "$agent_name" "$role" "$interval" echo " Agent config written to TOML" @@ -544,7 +611,7 @@ p.write_text(text) echo " Model: ${model}" echo "" echo " To start the agent, run:" - echo " docker compose --profile ${service_name} up -d ${service_name}" + echo " disinto up" fi echo "" diff --git a/lib/hvault.sh b/lib/hvault.sh index b1e0d62..086c9f2 100644 --- a/lib/hvault.sh +++ b/lib/hvault.sh @@ -38,6 +38,30 @@ _hvault_resolve_token() { return 1 } +# _hvault_default_env — set the local-cluster Vault env if unset +# +# Idempotent helper used by every Vault-touching script that runs during +# `disinto init` (S2). On the local-cluster common case, operators (and +# the init dispatcher in bin/disinto) have not exported VAULT_ADDR or +# VAULT_TOKEN — the server is reachable on localhost:8200 and the root +# token lives at /etc/vault.d/root.token. Scripts must Just Work in that +# shape. +# +# - If VAULT_ADDR is unset, defaults to http://127.0.0.1:8200. +# - If VAULT_TOKEN is unset, resolves from /etc/vault.d/root.token via +# _hvault_resolve_token. A missing token file is not an error here — +# downstream hvault_token_lookup() probes connectivity and emits the +# operator-facing "VAULT_ADDR + VAULT_TOKEN" diagnostic. +# +# Centralised to keep the defaulting stanza in one place — copy-pasting +# the 5-line block into each init script trips the repo-wide 5-line +# sliding-window duplicate detector (.woodpecker/detect-duplicates.py). +_hvault_default_env() { + VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}" + export VAULT_ADDR + _hvault_resolve_token || : +} + # _hvault_check_prereqs — validate VAULT_ADDR and VAULT_TOKEN are set # Args: caller function name _hvault_check_prereqs() { @@ -100,6 +124,11 @@ _hvault_request() { # ── Public API ─────────────────────────────────────────────────────────────── +# VAULT_KV_MOUNT — KV v2 mount point (default: "kv") +# Override with: export VAULT_KV_MOUNT=secret +# Used by: hvault_kv_get, hvault_kv_put, hvault_kv_list +: "${VAULT_KV_MOUNT:=kv}" + # hvault_kv_get PATH [KEY] # Read a KV v2 secret at PATH, optionally extract a single KEY. # Outputs: JSON value (full data object, or single key value) @@ -114,7 +143,7 @@ hvault_kv_get() { _hvault_check_prereqs "hvault_kv_get" || return 1 local response - response="$(_hvault_request GET "secret/data/${path}")" || return 1 + response="$(_hvault_request GET "${VAULT_KV_MOUNT}/data/${path}")" || return 1 if [ -n "$key" ]; then printf '%s' "$response" | jq -e -r --arg key "$key" '.data.data[$key]' 2>/dev/null || { @@ -154,7 +183,7 @@ hvault_kv_put() { payload="$(printf '%s' "$payload" | jq --arg k "$k" --arg v "$v" '.data[$k] = $v')" done - _hvault_request POST "secret/data/${path}" "$payload" >/dev/null + _hvault_request POST "${VAULT_KV_MOUNT}/data/${path}" "$payload" >/dev/null } # hvault_kv_list PATH @@ -170,7 +199,7 @@ hvault_kv_list() { _hvault_check_prereqs "hvault_kv_list" || return 1 local response - response="$(_hvault_request LIST "secret/metadata/${path}")" || return 1 + response="$(_hvault_request LIST "${VAULT_KV_MOUNT}/metadata/${path}")" || return 1 printf '%s' "$response" | jq -e '.data.keys' 2>/dev/null || { _hvault_err "hvault_kv_list" "failed to parse response" "path=$path" @@ -178,6 +207,51 @@ hvault_kv_list() { } } +# hvault_get_or_empty PATH +# GET /v1/PATH. On 200, prints the raw response body to stdout (caller +# parses with jq). On 404, prints nothing and returns 0 — caller treats +# the empty string as "resource absent, needs create". Any other HTTP +# status is a hard error: response body is logged to stderr as a +# structured JSON error and the function returns 1. +# +# Used by the sync scripts (tools/vault-apply-*.sh + +# lib/init/nomad/vault-nomad-auth.sh) to read existing policies, roles, +# auth-method listings, and per-role configs without triggering errexit +# on the expected absent-resource case. `_hvault_request` is not a +# substitute — it treats 404 as a hard error, which is correct for +# writes but wrong for "does this already exist?" checks. +# +# Subshell + EXIT trap: the RETURN trap does NOT fire on set-e abort, +# so tmpfile cleanup from a function-scoped RETURN trap would leak on +# jq/curl errors under `set -eo pipefail`. The subshell + EXIT trap +# is the reliable cleanup boundary. +hvault_get_or_empty() { + local path="${1:-}" + + if [ -z "$path" ]; then + _hvault_err "hvault_get_or_empty" "PATH is required" \ + "usage: hvault_get_or_empty PATH" + return 1 + fi + _hvault_check_prereqs "hvault_get_or_empty" || return 1 + + ( + local tmp http_code + tmp="$(mktemp)" + trap 'rm -f "$tmp"' EXIT + http_code="$(curl -sS -o "$tmp" -w '%{http_code}' \ + -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/${path}")" \ + || { _hvault_err "hvault_get_or_empty" "curl failed" "path=$path"; exit 1; } + case "$http_code" in + 2[0-9][0-9]) cat "$tmp" ;; + 404) printf '' ;; + *) _hvault_err "hvault_get_or_empty" "HTTP $http_code" "$(cat "$tmp")" + exit 1 ;; + esac + ) +} + # hvault_policy_apply NAME FILE # Idempotent policy upsert — create or update a Vault policy. hvault_policy_apply() { diff --git a/lib/init/nomad/cluster-up.sh b/lib/init/nomad/cluster-up.sh index 7c802c6..4aab42d 100755 --- a/lib/init/nomad/cluster-up.sh +++ b/lib/init/nomad/cluster-up.sh @@ -5,7 +5,7 @@ # Wires together the S0.1–S0.3 building blocks into one idempotent # "bring up a single-node Nomad+Vault cluster" script: # -# 1. install.sh (nomad + vault binaries) +# 1. install.sh (nomad + vault binaries + docker daemon) # 2. systemd-nomad.sh (nomad.service — unit + enable, not started) # 3. systemd-vault.sh (vault.service — unit + vault.hcl + enable) # 4. Host-volume dirs (/srv/disinto/* matching nomad/client.hcl) @@ -104,7 +104,7 @@ done # ── Dry-run: print step list + exit ────────────────────────────────────────── if [ "$dry_run" = true ]; then cat </dev/null || true)" + [ -n "$out" ] || return 1 + detected="$(printf '%s' "$out" | jq -r '.Drivers.docker.Detected // false' 2>/dev/null)" || detected="" + healthy="$(printf '%s' "$out" | jq -r '.Drivers.docker.Healthy // false' 2>/dev/null)" || healthy="" + [ "$detected" = "true" ] && [ "$healthy" = "true" ] +} + # _die_with_service_status SVC REASON # Log + dump `systemctl status SVC` to stderr + die with REASON. Factored # out so the poll helper doesn't carry three copies of the same dump. @@ -243,8 +258,8 @@ poll_until_healthy() { _die_with_service_status "$svc" "not healthy within ${timeout}s" } -# ── Step 1/9: install.sh (nomad + vault binaries) ──────────────────────────── -log "── Step 1/9: install nomad + vault binaries ──" +# ── Step 1/9: install.sh (nomad + vault binaries + docker daemon) ──────────── +log "── Step 1/9: install nomad + vault binaries + docker daemon ──" "$INSTALL_SH" # ── Step 2/9: systemd-nomad.sh (unit + enable, not started) ────────────────── @@ -296,13 +311,25 @@ else poll_until_healthy vault vault_is_unsealed "$VAULT_POLL_SECS" fi -# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready ─────────────── -log "── Step 8/9: start nomad + poll until ≥1 node ready ──" -if systemctl is-active --quiet nomad && nomad_has_ready_node; then - log "nomad already active + ≥1 node ready — skip start" +# ── Step 8/9: systemctl start nomad + poll until ≥1 node ready + docker up ── +log "── Step 8/9: start nomad + poll until ≥1 node ready + docker driver healthy ──" +# Three conditions gate this step: +# (a) nomad.service active +# (b) ≥1 nomad node in "ready" state +# (c) nomad's docker task driver fingerprinted as Detected+Healthy +# (c) can lag (a)+(b) briefly because driver fingerprinting races with +# dockerd startup — polling it explicitly prevents Step-1 deploys from +# hitting "missing drivers" placement failures on a cold-booted host (#871). +if systemctl is-active --quiet nomad \ + && nomad_has_ready_node \ + && nomad_docker_driver_healthy; then + log "nomad already active + ≥1 node ready + docker driver healthy — skip start" else - systemctl start nomad + if ! systemctl is-active --quiet nomad; then + systemctl start nomad + fi poll_until_healthy nomad nomad_has_ready_node "$NOMAD_POLL_SECS" + poll_until_healthy nomad nomad_docker_driver_healthy "$NOMAD_POLL_SECS" fi # ── Step 9/9: /etc/profile.d/disinto-nomad.sh ──────────────────────────────── diff --git a/lib/init/nomad/deploy.sh b/lib/init/nomad/deploy.sh index 7a58a5a..a1724c5 100755 --- a/lib/init/nomad/deploy.sh +++ b/lib/init/nomad/deploy.sh @@ -2,7 +2,7 @@ # ============================================================================= # lib/init/nomad/deploy.sh — Dependency-ordered Nomad job deploy + wait # -# Runs a list of jobspecs in order, waiting for each to reach "running" state +# Runs a list of jobspecs in order, waiting for each to reach healthy state # before starting the next. Step-1 uses it for forgejo-only; Steps 3–6 extend # the job list. # @@ -16,22 +16,24 @@ # Environment: # REPO_ROOT — absolute path to repo root (defaults to parent of # this script's parent directory) -# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 120) +# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 240) +# JOB_READY_TIMEOUT_ — per-job timeout override (e.g., +# JOB_READY_TIMEOUT_FORGEJO=300) # # Exit codes: -# 0 success (all jobs deployed and running, or dry-run completed) +# 0 success (all jobs deployed and healthy, or dry-run completed) # 1 failure (validation error, timeout, or nomad command failure) # # Idempotency: # Running twice back-to-back on a healthy cluster is a no-op. Jobs that are -# already running print "[deploy] already running" and continue. +# already healthy print "[deploy] already healthy" and continue. # ============================================================================= set -euo pipefail # ── Configuration ──────────────────────────────────────────────────────────── SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}" -JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-120}" +JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-240}" DRY_RUN=0 @@ -61,11 +63,12 @@ if [ "${#JOBS[@]}" -eq 0 ]; then fi # ── Helper: _wait_job_running ─────────────────────────────── -# Polls `nomad job status -json ` until: -# - Status == "running", OR -# - All allocations are in "running" state +# Polls `nomad deployment status -json ` until: +# - Status == "successful" +# - Status == "failed" # -# On timeout: prints last 50 lines of stderr from all allocations and exits 1. +# On deployment failure: prints last 50 lines of stderr from allocations and exits 1. +# On timeout: prints last 50 lines of stderr from allocations and exits 1. # # This is a named, reusable helper for future init scripts. _wait_job_running() { @@ -73,39 +76,72 @@ _wait_job_running() { local timeout="$2" local elapsed=0 - log "waiting for job '${job_name}' to become running (timeout: ${timeout}s)..." + log "waiting for job '${job_name}' to become healthy (timeout: ${timeout}s)..." + + # Get the latest deployment ID for this job (retry until available) + local deployment_id="" + local retry_count=0 + local max_retries=12 + + while [ -z "$deployment_id" ] && [ "$retry_count" -lt "$max_retries" ]; do + deployment_id=$(nomad job deployments -json "$job_name" 2>/dev/null | jq -r '.[0].ID' 2>/dev/null) || deployment_id="" + if [ -z "$deployment_id" ]; then + sleep 5 + retry_count=$((retry_count + 1)) + fi + done + + if [ -z "$deployment_id" ]; then + log "ERROR: no deployment found for job '${job_name}' after ${max_retries} attempts" + return 1 + fi + + log "tracking deployment '${deployment_id}'..." while [ "$elapsed" -lt "$timeout" ]; do - local status_json - status_json=$(nomad job status -json "$job_name" 2>/dev/null) || { - # Job may not exist yet — keep waiting + local deploy_status_json + deploy_status_json=$(nomad deployment status -json "$deployment_id" 2>/dev/null) || { + # Deployment may not exist yet — keep waiting sleep 5 elapsed=$((elapsed + 5)) continue } local status - status=$(printf '%s' "$status_json" | jq -r '.Status' 2>/dev/null) || { + status=$(printf '%s' "$deploy_status_json" | jq -r '.Status' 2>/dev/null) || { sleep 5 elapsed=$((elapsed + 5)) continue } case "$status" in - running) - log "job '${job_name}' is now running" + successful) + log "${job_name} healthy after ${elapsed}s" return 0 ;; - complete) - log "job '${job_name}' reached terminal state: ${status}" - return 0 - ;; - dead|failed) - log "job '${job_name}' reached terminal state: ${status}" + failed) + log "deployment '${deployment_id}' failed for job '${job_name}'" + log "showing last 50 lines of allocation logs (stderr):" + + # Get allocation IDs from job status + local alloc_ids + alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \ + | jq -r '.Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids="" + + if [ -n "$alloc_ids" ]; then + for alloc_id in $alloc_ids; do + log "--- Allocation ${alloc_id} logs (stderr) ---" + nomad alloc logs -stderr -short "$alloc_id" 2>/dev/null | tail -50 || true + done + fi + return 1 ;; + running|progressing) + log "deployment '${deployment_id}' status: ${status} (waiting for ${job_name}...)" + ;; *) - log "job '${job_name}' status: ${status} (waiting...)" + log "deployment '${deployment_id}' status: ${status} (waiting for ${job_name}...)" ;; esac @@ -114,13 +150,13 @@ _wait_job_running() { done # Timeout — print last 50 lines of alloc logs - log "TIMEOUT: job '${job_name}' did not reach running state within ${timeout}s" + log "TIMEOUT: deployment '${deployment_id}' did not reach successful state within ${timeout}s" log "showing last 50 lines of allocation logs (stderr):" - # Get allocation IDs + # Get allocation IDs from job status local alloc_ids alloc_ids=$(nomad job status -json "$job_name" 2>/dev/null \ - | jq -r '.Evaluations[].Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids="" + | jq -r '.Allocations[]?.ID // empty' 2>/dev/null) || alloc_ids="" if [ -n "$alloc_ids" ]; then for alloc_id in $alloc_ids; do @@ -140,10 +176,15 @@ for job_name in "${JOBS[@]}"; do die "Jobspec not found: ${jobspec_path}" fi + # Per-job timeout override: JOB_READY_TIMEOUT_ + job_upper=$(printf '%s' "$job_name" | tr '[:lower:]' '[:upper:]') + timeout_var="JOB_READY_TIMEOUT_${job_upper}" + job_timeout="${!timeout_var:-$JOB_READY_TIMEOUT_SECS}" + if [ "$DRY_RUN" -eq 1 ]; then log "[dry-run] nomad job validate ${jobspec_path}" log "[dry-run] nomad job run -detach ${jobspec_path}" - log "[dry-run] (would wait for '${job_name}' to become running for ${JOB_READY_TIMEOUT_SECS}s)" + log "[dry-run] (would wait for '${job_name}' to become healthy for ${job_timeout}s)" continue fi @@ -155,12 +196,12 @@ for job_name in "${JOBS[@]}"; do die "validation failed for: ${jobspec_path}" fi - # 2. Check if already running (idempotency) + # 2. Check if already healthy (idempotency) job_status_json=$(nomad job status -json "$job_name" 2>/dev/null || true) if [ -n "$job_status_json" ]; then current_status=$(printf '%s' "$job_status_json" | jq -r '.Status' 2>/dev/null || true) if [ "$current_status" = "running" ]; then - log "${job_name} already running" + log "${job_name} already healthy" continue fi fi @@ -171,9 +212,9 @@ for job_name in "${JOBS[@]}"; do die "failed to run job: ${job_name}" fi - # 4. Wait for running state - if ! _wait_job_running "$job_name" "$JOB_READY_TIMEOUT_SECS"; then - die "timeout waiting for job '${job_name}' to become running" + # 4. Wait for healthy state + if ! _wait_job_running "$job_name" "$job_timeout"; then + die "deployment for job '${job_name}' did not reach successful state" fi done diff --git a/lib/init/nomad/install.sh b/lib/init/nomad/install.sh index 6f1ffed..ea9ac17 100755 --- a/lib/init/nomad/install.sh +++ b/lib/init/nomad/install.sh @@ -1,20 +1,33 @@ #!/usr/bin/env bash # ============================================================================= # lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad + Vault +# + Ubuntu-native Docker for Nomad's docker driver # -# Part of the Nomad+Vault migration. Installs both the `nomad` binary (S0.2, -# issue #822) and the `vault` binary (S0.3, issue #823) from the same -# HashiCorp apt repository. Does NOT configure, start, or enable any systemd -# unit — lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh -# own that. Does NOT wire this script into `disinto init` — S0.4 owns that. +# Part of the Nomad+Vault migration. Installs the `nomad` binary (S0.2, +# issue #822), the `vault` binary (S0.3, issue #823), and the `docker` +# daemon (S0.2-fix, issue #871) needed by Nomad's docker task driver. +# Nomad + Vault come from the pinned HashiCorp apt repo; docker comes from +# Ubuntu's default apt repo (docker.io) — matches the existing factory +# dev-box setup and avoids adding a second apt source with pinning. +# +# Does NOT configure, start, or enable nomad.service or vault.service — +# lib/init/nomad/systemd-nomad.sh and lib/init/nomad/systemd-vault.sh own +# those. The docker.service unit ships with the docker.io package and is +# enabled+started here directly (not a disinto-owned unit), because Nomad's +# docker driver reports Healthy=false without a running dockerd — that +# silently blocks job placement at Step 1 with a confusing "missing +# drivers" error (issue #871). Does NOT wire this script into `disinto +# init` — S0.4 owns that. # # Idempotency contract: -# - Running twice back-to-back is a no-op once both target versions are -# installed and the apt source is in place. +# - Running twice back-to-back is a no-op once all three targets are +# installed and the HashiCorp apt source is in place. # - Adds the HashiCorp apt keyring only if it is absent. # - Adds the HashiCorp apt sources list only if it is absent. # - Skips `apt-get install` for any package whose installed version already -# matches the pin. If both are at pin, exits before touching apt. +# matches the pin. If all three are satisfied, exits before touching apt. +# - `command -v docker` is the docker install sentinel; `systemctl +# enable --now` is a no-op on an already-enabled+active unit. # # Configuration: # NOMAD_VERSION — pinned Nomad version (default: see below). Apt package @@ -85,59 +98,90 @@ else need_pkgs+=("vault=${VAULT_VERSION}-1") fi -if [ "${#need_pkgs[@]}" -eq 0 ]; then +# Docker isn't version-pinned (Ubuntu's docker.io tracks the distro's +# ship-stable release — good enough for a dev box and avoids a second +# apt source). Sentinel is binary presence, not a semver match. +if command -v docker >/dev/null 2>&1; then + log "docker already installed" + docker_needs_install=0 +else + docker_needs_install=1 +fi + +if [ "${#need_pkgs[@]}" -eq 0 ] && [ "$docker_needs_install" -eq 0 ]; then log "nothing to do" exit 0 fi -# ── Ensure HashiCorp apt keyring ───────────────────────────────────────────── -if [ ! -f "$HASHICORP_KEYRING" ]; then - log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}" - tmpkey="$(mktemp)" - trap 'rm -f "$tmpkey"' EXIT - curl -fsSL "$HASHICORP_GPG_URL" -o "$tmpkey" \ - || die "failed to fetch HashiCorp GPG key from ${HASHICORP_GPG_URL}" - gpg --dearmor -o "$HASHICORP_KEYRING" < "$tmpkey" \ - || die "failed to dearmor HashiCorp GPG key" - chmod 0644 "$HASHICORP_KEYRING" - rm -f "$tmpkey" - trap - EXIT -else - log "HashiCorp apt keyring already present" +# ── HashiCorp apt setup + nomad/vault install (skipped if both at pin) ─────── +if [ "${#need_pkgs[@]}" -gt 0 ]; then + # Ensure HashiCorp apt keyring. + if [ ! -f "$HASHICORP_KEYRING" ]; then + log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}" + tmpkey="$(mktemp)" + trap 'rm -f "$tmpkey"' EXIT + curl -fsSL "$HASHICORP_GPG_URL" -o "$tmpkey" \ + || die "failed to fetch HashiCorp GPG key from ${HASHICORP_GPG_URL}" + gpg --dearmor -o "$HASHICORP_KEYRING" < "$tmpkey" \ + || die "failed to dearmor HashiCorp GPG key" + chmod 0644 "$HASHICORP_KEYRING" + rm -f "$tmpkey" + trap - EXIT + else + log "HashiCorp apt keyring already present" + fi + + # Ensure HashiCorp apt sources list. + desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main" + if [ ! -f "$HASHICORP_SOURCES" ] \ + || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then + log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}" + printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES" + apt_update_needed=1 + else + log "HashiCorp apt sources list already present" + apt_update_needed=0 + fi + + # Install the pinned versions. + if [ "$apt_update_needed" -eq 1 ]; then + log "running apt-get update" + DEBIAN_FRONTEND=noninteractive apt-get update -qq \ + || die "apt-get update failed" + fi + + log "installing ${need_pkgs[*]}" + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + "${need_pkgs[@]}" \ + || die "apt-get install ${need_pkgs[*]} failed" + + # Verify pinned versions. + final_nomad="$(_installed_version nomad)" + if [ "$final_nomad" != "$NOMAD_VERSION" ]; then + die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'" + fi + final_vault="$(_installed_version vault)" + if [ "$final_vault" != "$VAULT_VERSION" ]; then + die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'" + fi fi -# ── Ensure HashiCorp apt sources list ──────────────────────────────────────── -desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main" -if [ ! -f "$HASHICORP_SOURCES" ] \ - || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then - log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}" - printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES" - apt_update_needed=1 -else - log "HashiCorp apt sources list already present" - apt_update_needed=0 +# ── Install docker.io + enable+start docker.service (if missing) ───────────── +# Nomad's docker task driver reports Healthy=false without a running +# dockerd. On the factory dev box docker was pre-installed so Step 0's +# cluster-up passed silently; on a fresh LXC the first docker-driver +# jobspec (forgejo, Step 1) fails placement with "missing drivers". +# Install from Ubuntu's default apt repo — no second source, no pinning. +# `docker.service` ships with the package; `enable --now` is idempotent. +if [ "$docker_needs_install" -eq 1 ]; then + log "installing docker.io" + DEBIAN_FRONTEND=noninteractive apt-get install -y -q docker.io \ + || die "apt-get install docker.io failed" + log "enabling + starting docker.service" + systemctl enable --now docker \ + || die "failed to enable/start docker.service" + command -v docker >/dev/null 2>&1 \ + || die "post-install check: docker binary still not found" fi -# ── Install the pinned versions ────────────────────────────────────────────── -if [ "$apt_update_needed" -eq 1 ]; then - log "running apt-get update" - DEBIAN_FRONTEND=noninteractive apt-get update -qq \ - || die "apt-get update failed" -fi - -log "installing ${need_pkgs[*]}" -DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - "${need_pkgs[@]}" \ - || die "apt-get install ${need_pkgs[*]} failed" - -# ── Verify ─────────────────────────────────────────────────────────────────── -final_nomad="$(_installed_version nomad)" -if [ "$final_nomad" != "$NOMAD_VERSION" ]; then - die "post-install check: expected nomad ${NOMAD_VERSION}, got '${final_nomad}'" -fi -final_vault="$(_installed_version vault)" -if [ "$final_vault" != "$VAULT_VERSION" ]; then - die "post-install check: expected vault ${VAULT_VERSION}, got '${final_vault}'" -fi - -log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} installed successfully" +log "nomad ${NOMAD_VERSION} + vault ${VAULT_VERSION} + docker installed successfully" diff --git a/lib/init/nomad/vault-engines.sh b/lib/init/nomad/vault-engines.sh new file mode 100755 index 0000000..7bc2c38 --- /dev/null +++ b/lib/init/nomad/vault-engines.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash +# ============================================================================= +# lib/init/nomad/vault-engines.sh — Enable required Vault secret engines +# +# Part of the Nomad+Vault migration (S2.1, issue #912). Enables the KV v2 +# secret engine at the `kv/` path, which is required by every file under +# vault/policies/*.hcl, every role in vault/roles.yaml, every write done +# by tools/vault-import.sh, and every template read done by +# nomad/jobs/forgejo.hcl — all of which address paths under kv/disinto/… +# and 403 if the mount is absent. +# +# Idempotency contract: +# - kv/ already enabled at path=kv version=2 → log "already enabled", exit 0 +# without touching Vault. +# - kv/ enabled at a different type/version → die (manual intervention). +# - kv/ not enabled → POST sys/mounts/kv to enable kv-v2, log "enabled". +# - Second run on a fully-configured box is a silent no-op. +# +# Preconditions: +# - Vault is unsealed and reachable (VAULT_ADDR + VAULT_TOKEN set OR +# defaultable to the local-cluster shape via _hvault_default_env). +# - Must run AFTER cluster-up.sh (unseal complete) but BEFORE +# vault-apply-policies.sh (policies reference kv/* paths). +# +# Environment: +# VAULT_ADDR — default http://127.0.0.1:8200 via _hvault_default_env. +# VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh). +# +# Usage: +# sudo lib/init/nomad/vault-engines.sh +# sudo lib/init/nomad/vault-engines.sh --dry-run +# +# Exit codes: +# 0 success (kv enabled, or already so) +# 1 precondition / API failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" + +# shellcheck source=../../hvault.sh +source "${REPO_ROOT}/lib/hvault.sh" + +log() { printf '[vault-engines] %s\n' "$*"; } +die() { printf '[vault-engines] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Flag parsing (single optional flag) ───────────────────────────────────── +# Shape: while/shift loop. Deliberately NOT a flat `case "${1:-}"` like +# tools/vault-apply-policies.sh nor an if/elif ladder like +# tools/vault-apply-roles.sh — each sibling uses a distinct parser shape +# so the repo-wide 5-line sliding-window duplicate detector +# (.woodpecker/detect-duplicates.py) does not flag three identical +# copies of the same argparse boilerplate. +print_help() { + cat </dev/null 2>&1 \ + || die "required binary not found: ${bin}" +done + +# Default the local-cluster Vault env (VAULT_ADDR + VAULT_TOKEN). Shared +# with the rest of the init-time Vault scripts — see lib/hvault.sh header. +_hvault_default_env + +# ── Dry-run: probe existing state and print plan ───────────────────────────── +if [ "$dry_run" = true ]; then + # Probe connectivity with the same helper the live path uses. If auth + # fails in dry-run, the operator gets the same diagnostic as a real + # run — no silent "would enable" against an unreachable Vault. + hvault_token_lookup >/dev/null \ + || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" + mounts_raw="$(hvault_get_or_empty "sys/mounts")" \ + || die "failed to list secret engines" + if [ -n "$mounts_raw" ] \ + && printf '%s' "$mounts_raw" | jq -e '."kv/"' >/dev/null 2>&1; then + log "[dry-run] kv-v2 at kv/ already enabled" + else + log "[dry-run] would enable kv-v2 at kv/" + fi + exit 0 +fi + +# ── Live run: Vault connectivity check ─────────────────────────────────────── +hvault_token_lookup >/dev/null \ + || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" + +# ── Check if kv/ is already enabled ────────────────────────────────────────── +# sys/mounts returns an object keyed by "/" for every enabled secret +# engine (trailing slash is Vault's on-disk form). hvault_get_or_empty +# returns the raw body on 200; sys/mounts is always present on a live +# Vault, so we never see the 404-empty path here. +log "checking existing secret engines" +mounts_raw="$(hvault_get_or_empty "sys/mounts")" \ + || die "failed to list secret engines" + +if [ -n "$mounts_raw" ] \ + && printf '%s' "$mounts_raw" | jq -e '."kv/"' >/dev/null 2>&1; then + # kv/ exists — verify it's kv-v2 on the right path shape. Vault returns + # the option as a string ("2") on GET, never an integer. + kv_type="$(printf '%s' "$mounts_raw" | jq -r '."kv/".type // ""')" + kv_version="$(printf '%s' "$mounts_raw" | jq -r '."kv/".options.version // ""')" + if [ "$kv_type" = "kv" ] && [ "$kv_version" = "2" ]; then + log "kv-v2 at kv/ already enabled (type=${kv_type}, version=${kv_version})" + exit 0 + fi + die "kv/ exists but is not kv-v2 (type=${kv_type:-}, version=${kv_version:-}) — manual intervention required" +fi + +# ── Enable kv-v2 at path=kv ────────────────────────────────────────────────── +# POST sys/mounts/ with type=kv + options.version=2 is the +# HTTP-API equivalent of `vault secrets enable -path=kv -version=2 kv`. +# Keeps the script vault-CLI-free (matches the policy-apply + nomad-auth +# scripts; their headers explain why a CLI dep would die on client-only +# nodes). +log "enabling kv-v2 at path=kv" +enable_payload="$(jq -n '{type:"kv",options:{version:"2"}}')" +_hvault_request POST "sys/mounts/kv" "$enable_payload" >/dev/null \ + || die "failed to enable kv-v2 secret engine" +log "kv-v2 enabled at kv/" diff --git a/lib/init/nomad/vault-nomad-auth.sh b/lib/init/nomad/vault-nomad-auth.sh new file mode 100755 index 0000000..cb6a542 --- /dev/null +++ b/lib/init/nomad/vault-nomad-auth.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash +# ============================================================================= +# lib/init/nomad/vault-nomad-auth.sh — Idempotent Vault JWT auth + Nomad wiring +# +# Part of the Nomad+Vault migration (S2.3, issue #881). Enables Vault's JWT +# auth method at path `jwt-nomad`, points it at Nomad's workload-identity +# JWKS endpoint, writes one role per policy (via tools/vault-apply-roles.sh), +# updates /etc/nomad.d/server.hcl with the vault stanza, and signals nomad +# to reload so jobs can exchange short-lived workload-identity tokens for +# Vault tokens — no shared VAULT_TOKEN in job env. +# +# Steps: +# 1. Enable auth method (sys/auth/jwt-nomad, type=jwt) +# 2. Configure JWKS + algs (auth/jwt-nomad/config) +# 3. Upsert roles from vault/roles.yaml (delegates to vault-apply-roles.sh) +# 4. Install /etc/nomad.d/server.hcl from repo + SIGHUP nomad if changed +# +# Idempotency contract: +# - Auth path already enabled → skip create, log "jwt-nomad already enabled". +# - Config identical to desired → skip write, log "jwt-nomad config unchanged". +# - Roles: see tools/vault-apply-roles.sh header for per-role diffing. +# - server.hcl on disk byte-identical to repo copy → skip write, skip SIGHUP. +# - Second run on a fully-configured box is a silent no-op end-to-end. +# +# Preconditions: +# - S0 complete (empty cluster up: nomad + vault reachable, vault unsealed). +# - S2.1 complete: vault/policies/*.hcl applied via tools/vault-apply-policies.sh +# (otherwise the roles we write will reference policies Vault does not +# know about — the write succeeds, but token minting will fail later). +# - Running as root (writes /etc/nomad.d/server.hcl + signals nomad). +# +# Environment: +# VAULT_ADDR — default http://127.0.0.1:8200 (matches nomad/vault.hcl). +# VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh). +# +# Usage: +# sudo lib/init/nomad/vault-nomad-auth.sh +# +# Exit codes: +# 0 success (configured, or already so) +# 1 precondition / API / nomad-reload failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" + +APPLY_ROLES_SH="${REPO_ROOT}/tools/vault-apply-roles.sh" +SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl" +SERVER_HCL_DST="/etc/nomad.d/server.hcl" + +# shellcheck source=../../hvault.sh +source "${REPO_ROOT}/lib/hvault.sh" + +# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env). +# Called from `disinto init` which does not export VAULT_ADDR/VAULT_TOKEN in +# the common fresh-LXC case (issue #912). Must run after hvault.sh is sourced. +_hvault_default_env + +log() { printf '[vault-auth] %s\n' "$*"; } +die() { printf '[vault-auth] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Preconditions ──────────────────────────────────────────────────────────── +if [ "$(id -u)" -ne 0 ]; then + die "must run as root (writes ${SERVER_HCL_DST} + signals nomad)" +fi + +# curl + jq are used directly; hvault.sh's helpers are also curl-based, so +# the `vault` CLI is NOT required here — don't add it to this list, or a +# Vault-server-present / vault-CLI-absent box (e.g. a Nomad-client-only +# node) would die spuriously. systemctl is required for SIGHUPing nomad. +for bin in curl jq systemctl; do + command -v "$bin" >/dev/null 2>&1 \ + || die "required binary not found: ${bin}" +done + +[ -f "$SERVER_HCL_SRC" ] \ + || die "source config not found: ${SERVER_HCL_SRC}" +[ -x "$APPLY_ROLES_SH" ] \ + || die "companion script missing or not executable: ${APPLY_ROLES_SH}" + +hvault_token_lookup >/dev/null \ + || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" + +# ── Desired config (Nomad workload-identity JWKS on localhost:4646) ────────── +# Nomad's default workload-identity signer publishes the public JWKS at +# /.well-known/jwks.json on the nomad HTTP API port (4646). Vault validates +# JWTs against it. RS256 is the signer's default algorithm. `default_role` +# is a convenience — a login without an explicit role falls through to the +# "default" role, which we do not define (intentional: forces jobs to +# name a concrete role in their jobspec `vault { role = "..." }`). +JWKS_URL="http://127.0.0.1:4646/.well-known/jwks.json" + +# ── Step 1/4: enable auth method jwt-nomad ─────────────────────────────────── +log "── Step 1/4: enable auth method path=jwt-nomad type=jwt ──" +# sys/auth returns an object keyed by "/" for every enabled method. +# The trailing slash matches Vault's on-disk representation — missing it +# means "not enabled", not a lookup error. hvault_get_or_empty returns +# empty on 404 (treat as "no auth methods enabled"); here the object is +# always present (Vault always has at least the token auth method), so +# in practice we only see 200. +auth_list="$(hvault_get_or_empty "sys/auth")" \ + || die "failed to list auth methods" +if printf '%s' "$auth_list" | jq -e '.["jwt-nomad/"]' >/dev/null 2>&1; then + log "auth path jwt-nomad already enabled" +else + enable_payload="$(jq -n '{type:"jwt",description:"Nomad workload identity (S2.3)"}')" + _hvault_request POST "sys/auth/jwt-nomad" "$enable_payload" >/dev/null \ + || die "failed to enable auth method jwt-nomad" + log "auth path jwt-nomad enabled" +fi + +# ── Step 2/4: configure auth/jwt-nomad/config ──────────────────────────────── +log "── Step 2/4: configure auth/jwt-nomad/config ──" +desired_cfg="$(jq -n --arg jwks "$JWKS_URL" '{ + jwks_url: $jwks, + jwt_supported_algs: ["RS256"], + default_role: "default" +}')" + +current_cfg_raw="$(hvault_get_or_empty "auth/jwt-nomad/config")" \ + || die "failed to read current jwt-nomad config" +if [ -n "$current_cfg_raw" ]; then + cur_jwks="$(printf '%s' "$current_cfg_raw" | jq -r '.data.jwks_url // ""')" + cur_algs="$(printf '%s' "$current_cfg_raw" | jq -cS '.data.jwt_supported_algs // []')" + cur_default="$(printf '%s' "$current_cfg_raw" | jq -r '.data.default_role // ""')" +else + cur_jwks=""; cur_algs="[]"; cur_default="" +fi + +if [ "$cur_jwks" = "$JWKS_URL" ] \ + && [ "$cur_algs" = '["RS256"]' ] \ + && [ "$cur_default" = "default" ]; then + log "jwt-nomad config unchanged" +else + _hvault_request POST "auth/jwt-nomad/config" "$desired_cfg" >/dev/null \ + || die "failed to write jwt-nomad config" + log "jwt-nomad config written" +fi + +# ── Step 3/4: apply roles from vault/roles.yaml ────────────────────────────── +log "── Step 3/4: apply roles from vault/roles.yaml ──" +# Delegates to tools/vault-apply-roles.sh — one source of truth for the +# parser and per-role idempotency contract. Its header documents the +# created/updated/unchanged wiring. +"$APPLY_ROLES_SH" + +# ── Step 4/4: install server.hcl + SIGHUP nomad if changed ─────────────────── +log "── Step 4/4: install ${SERVER_HCL_DST} + reload nomad if changed ──" +# cluster-up.sh (S0.4) is the normal path for installing server.hcl — but +# this script is run AFTER S0.4, so we also install here. Writing only on +# content-diff keeps re-runs a true no-op (no spurious SIGHUP). `install` +# preserves perms at 0644 root:root on every write. +needs_reload=0 +if [ -f "$SERVER_HCL_DST" ] && cmp -s "$SERVER_HCL_SRC" "$SERVER_HCL_DST"; then + log "unchanged: ${SERVER_HCL_DST}" +else + log "writing: ${SERVER_HCL_DST}" + install -m 0644 -o root -g root "$SERVER_HCL_SRC" "$SERVER_HCL_DST" + needs_reload=1 +fi + +if [ "$needs_reload" -eq 1 ]; then + # SIGHUP triggers Nomad's config reload (see ExecReload in + # lib/init/nomad/systemd-nomad.sh — /bin/kill -HUP $MAINPID). Using + # `systemctl kill -s SIGHUP` instead of `systemctl reload` sends the + # signal even when the unit doesn't declare ExecReload (defensive — + # future unit edits can't silently break this script). + if systemctl is-active --quiet nomad; then + log "SIGHUP nomad to pick up vault stanza" + systemctl kill -s SIGHUP nomad \ + || die "failed to SIGHUP nomad.service" + else + # Fresh box: nomad not started yet. The updated server.hcl will be + # picked up at first start. Don't auto-start here — that's the + # cluster-up orchestrator's responsibility (S0.4). + log "nomad.service not active — skipping SIGHUP (next start loads vault stanza)" + fi +else + log "server.hcl unchanged — nomad SIGHUP not needed" +fi + +log "── done — jwt-nomad auth + config + roles + nomad vault stanza in place ──" diff --git a/lib/load-project.sh b/lib/load-project.sh index 0745276..e42d6dc 100755 --- a/lib/load-project.sh +++ b/lib/load-project.sh @@ -85,8 +85,22 @@ if mirrors: # environment. The TOML carries host-perspective values (localhost, /home/admin/…) # that would break container API calls and path resolution. Skip overriding # any env var that is already set when running inside the container. +# +# #852 defence: validate that $_key is a legal shell identifier before +# `export`. A hand-edited TOML can smuggle in keys that survive the +# Python emitter but fail `export`'s identifier rule — e.g. +# `[mirrors] my-mirror = "..."` becomes `MIRROR_MY-MIRROR` because the +# MIRROR_ emitter only upper-cases, it does not dash-to-underscore. +# Without this guard `export "MIRROR_MY-MIRROR=…"` returns non-zero, and +# under `set -euo pipefail` in the caller the whole file aborts — which +# is how the original #852 crash-loop presented. Warn-and-skip keeps +# the rest of the TOML loadable. while IFS='=' read -r _key _val; do [ -z "$_key" ] && continue + if ! [[ "$_key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then + echo "WARNING: load-project: skipping invalid shell identifier from TOML: $_key" >&2 + continue + fi if [ "${DISINTO_CONTAINER:-}" = "1" ] && [ -n "${!_key:-}" ]; then continue fi @@ -129,25 +143,39 @@ agents = cfg.get('agents', {}) for name, config in agents.items(): if not isinstance(config, dict): continue + # Normalize the TOML section key into a valid shell identifier fragment. + # TOML allows dashes in bare keys (e.g. [agents.dev-qwen2]), but POSIX + # shell var names cannot contain '-'. Match the 'tr a-z- A-Z_' convention + # used in hire-agent.sh (#834) and generators.sh (#852) so the var names + # stay consistent across the stack. + safe = name.upper().replace('-', '_') # Emit variables in uppercase with the agent name if 'base_url' in config: - print(f'AGENT_{name.upper()}_BASE_URL={config[\"base_url\"]}') + print(f'AGENT_{safe}_BASE_URL={config[\"base_url\"]}') if 'model' in config: - print(f'AGENT_{name.upper()}_MODEL={config[\"model\"]}') + print(f'AGENT_{safe}_MODEL={config[\"model\"]}') if 'api_key' in config: - print(f'AGENT_{name.upper()}_API_KEY={config[\"api_key\"]}') + print(f'AGENT_{safe}_API_KEY={config[\"api_key\"]}') if 'roles' in config: roles = ' '.join(config['roles']) if isinstance(config['roles'], list) else config['roles'] - print(f'AGENT_{name.upper()}_ROLES={roles}') + print(f'AGENT_{safe}_ROLES={roles}') if 'forge_user' in config: - print(f'AGENT_{name.upper()}_FORGE_USER={config[\"forge_user\"]}') + print(f'AGENT_{safe}_FORGE_USER={config[\"forge_user\"]}') if 'compact_pct' in config: - print(f'AGENT_{name.upper()}_COMPACT_PCT={config[\"compact_pct\"]}') + print(f'AGENT_{safe}_COMPACT_PCT={config[\"compact_pct\"]}') " "$_PROJECT_TOML" 2>/dev/null) || true if [ -n "$_AGENT_VARS" ]; then + # #852 defence: same warn-and-skip guard as the main loop above. The + # Python emitter already normalizes dashed agent names (#862), but a + # quoted TOML section like `[agents."weird name"]` could still produce + # an invalid identifier. Fail loudly but keep other agents loadable. while IFS='=' read -r _key _val; do [ -z "$_key" ] && continue + if ! [[ "$_key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then + echo "WARNING: load-project: skipping invalid shell identifier from [agents.*]: $_key" >&2 + continue + fi export "$_key=$_val" done <<< "$_AGENT_VARS" fi diff --git a/nomad/AGENTS.md b/nomad/AGENTS.md index d80780f..f57c30a 100644 --- a/nomad/AGENTS.md +++ b/nomad/AGENTS.md @@ -1,45 +1,47 @@ - + # nomad/ — Agent Instructions Nomad + Vault HCL for the factory's single-node cluster. These files are the source of truth that `lib/init/nomad/cluster-up.sh` copies onto a factory box under `/etc/nomad.d/` and `/etc/vault.d/` at init time. -This directory is part of the **Nomad+Vault migration (Step 0)** — -see issues #821–#825 for the step breakdown. Jobspecs land in Step 1. +This directory covers the **Nomad+Vault migration (Steps 0–2)** — +see issues #821–#884 for the step breakdown. ## What lives here -| File | Deployed to | Owned by | +| File/Dir | Deployed to | Owned by | |---|---|---| | `server.hcl` | `/etc/nomad.d/server.hcl` | agent role, bind, ports, `data_dir` (S0.2) | | `client.hcl` | `/etc/nomad.d/client.hcl` | Docker driver cfg + `host_volume` declarations (S0.2) | | `vault.hcl` | `/etc/vault.d/vault.hcl` | Vault storage, listener, UI, `disable_mlock` (S0.3) | +| `jobs/forgejo.hcl` | submitted via `lib/init/nomad/deploy.sh` | Forgejo job; reads creds from Vault via consul-template stanza (S2.4) | Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the split between `server.hcl` and `client.hcl` is for readability, not semantics. The top-of-file header in each config documents which blocks it owns. -## What does NOT live here yet +## Vault ACL policies -- **Jobspecs.** Step 0 brings up an *empty* cluster. Step 1 (and later) - adds `*.nomad.hcl` job files for forgejo, woodpecker, agents, caddy, - etc. When that lands, jobspecs will live in `nomad/jobs/` and each - will get its own header comment pointing to the `host_volume` names - it consumes (`volume = "forgejo-data"`, etc. — declared in - `client.hcl`). -- **TLS, ACLs, gossip encryption.** Deliberately absent in Step 0 — - factory traffic stays on localhost. These land in later migration - steps alongside multi-node support. +`vault/policies/` holds one `.hcl` file per Vault policy; see +[`vault/policies/AGENTS.md`](../vault/policies/AGENTS.md) for the naming +convention, KV path summary, and JWT-auth role bindings (S2.1/S2.3). + +## Not yet implemented + +- **Additional jobspecs** (woodpecker, agents, caddy) — Step 1 brought up + Forgejo; remaining services land in later steps. +- **TLS, ACLs, gossip encryption** — deliberately absent for now; land + alongside multi-node support. ## Adding a jobspec (Step 1 and later) -1. Drop a file in `nomad/jobs/.nomad.hcl`. The `.nomad.hcl` - suffix is load-bearing: `.woodpecker/nomad-validate.yml` globs on - exactly that suffix to auto-pick up new jobspecs (see step 2 in - "How CI validates these files" below). Anything else in - `nomad/jobs/` is silently skipped by CI. +1. Drop a file in `nomad/jobs/.hcl`. The `.hcl` suffix is + load-bearing: `.woodpecker/nomad-validate.yml` globs on exactly that + suffix to auto-pick up new jobspecs (see step 2 in "How CI validates + these files" below). Anything else in `nomad/jobs/` is silently + skipped by CI. 2. If it needs persistent state, reference a `host_volume` already declared in `client.hcl` — *don't* add ad-hoc host paths in the jobspec. If a new volume is needed, add it to **both**: @@ -52,22 +54,22 @@ it owns. rejects the mismatch at placement time instead. 3. Pin image tags — `image = "forgejo/forgejo:1.22.5"`, not `:latest`. 4. No pipeline edit required — step 2 of `nomad-validate.yml` globs - over `nomad/jobs/*.nomad.hcl` and validates every match. Just make - sure the existing `nomad/**` trigger path still covers your file - (it does for anything under `nomad/jobs/`). + over `nomad/jobs/*.hcl` and validates every match. Just make sure + the existing `nomad/**` trigger path still covers your file (it + does for anything under `nomad/jobs/`). ## How CI validates these files `.woodpecker/nomad-validate.yml` runs on every PR that touches `nomad/` -(including `nomad/jobs/`), `lib/init/nomad/`, or `bin/disinto`. Five -fail-closed steps: +(including `nomad/jobs/`), `lib/init/nomad/`, `bin/disinto`, +`vault/policies/`, or `vault/roles.yaml`. Eight fail-closed steps: 1. **`nomad config validate nomad/server.hcl nomad/client.hcl`** — parses the HCL, fails on unknown blocks, bad port ranges, invalid driver config. Vault HCL is excluded (different tool). Jobspecs are excluded too — agent-config and jobspec are disjoint HCL grammars; running this step on a jobspec rejects it with "unknown block 'job'". -2. **`nomad job validate nomad/jobs/*.nomad.hcl`** (loop, one call per file) +2. **`nomad job validate nomad/jobs/*.hcl`** (loop, one call per file) — parses each jobspec's HCL, fails on unknown stanzas, missing required fields, wrong value types, invalid driver config. Runs offline (no Nomad server needed) so CI exit 0 ≠ "this will schedule @@ -79,25 +81,53 @@ fail-closed steps: - image reachability — `image = "codeberg.org/forgejo/forgejo:11.0"` is accepted even if the registry is down or the tag is wrong. New jobspecs are picked up automatically by the glob — no pipeline - edit needed as long as the file is named `.nomad.hcl`. + edit needed as long as the file is named `.hcl`. 3. **`vault operator diagnose -config=nomad/vault.hcl -skip=storage -skip=listener`** — Vault's equivalent syntax + schema check. `-skip=storage/listener` disables the runtime checks (CI containers don't have `/var/lib/vault/data` or port 8200). Exit 2 (advisory warnings only, e.g. TLS-disabled listener) is tolerated; exit 1 blocks merge. -4. **`shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto`** +4. **`vault policy fmt` idempotence check on every `vault/policies/*.hcl`** + (S2.6) — `vault policy fmt` has no `-check` flag in 1.18.5, so the + step copies each file to `/tmp`, runs `vault policy fmt` on the copy, + and diffs against the original. Any non-empty diff means the + committed file would be rewritten by `fmt` and the step fails — the + author is pointed at `vault policy fmt ` to heal the drift. +5. **`vault policy write`-based validation against an inline dev-mode Vault** + (S2.6) — Vault 1.18.5 has no offline `policy validate` subcommand; + the CI step starts a dev-mode server, loops `vault policy write + ` over each `vault/policies/*.hcl`, and aggregates + failures so one CI run surfaces every broken policy. The server is + ephemeral and torn down on step exit — no persistence, no real + secrets. Catches unknown capability names (e.g. `"frobnicate"`), + malformed `path` blocks, and other semantic errors `fmt` does not. +6. **`vault/roles.yaml` validator** (S2.6) — yamllint + a PyYAML-based + check that every role's `policy:` field matches a basename under + `vault/policies/`, and that every role entry carries all four + required fields (`name`, `policy`, `namespace`, `job_id`). Drift + between the two directories is a scheduling-time "permission denied" + in production; this step turns it into a CI failure at PR time. +7. **`shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto`** — all init/dispatcher shell clean. `bin/disinto` has no `.sh` extension so the repo-wide shellcheck in `.woodpecker/ci.yml` skips it — this is the one place it gets checked. -5. **`bats tests/disinto-init-nomad.bats`** +8. **`bats tests/disinto-init-nomad.bats`** — exercises the dispatcher: `disinto init --backend=nomad --dry-run`, `… --empty --dry-run`, and the `--backend=docker` regression guard. +**Secret-scan coverage.** Policy HCL files under `vault/policies/` are +already swept by the P11 secret-scan gate +(`.woodpecker/secret-scan.yml`, #798), whose `vault/**/*` trigger path +covers everything in this directory. `nomad-validate.yml` intentionally +does NOT duplicate that gate — one scanner, one source of truth. + If a PR breaks `nomad/server.hcl` (e.g. typo in a block name), step 1 fails with a clear error; if it breaks a jobspec (e.g. misspells `task` as `tsak`, or adds a `volume` stanza without a `source`), step -2 fails instead. The fix makes it pass. PRs that don't touch any of -the trigger paths skip this pipeline entirely. +2 fails; a typo in a `path "..."` block in a vault policy fails step 5 +with the Vault parser's error; a `roles.yaml` entry that points at a +policy basename that does not exist fails step 6. PRs that don't touch +any of the trigger paths skip this pipeline entirely. ## Version pinning @@ -117,5 +147,13 @@ accept (or vice versa). - `lib/init/nomad/` — installer + systemd units + cluster-up orchestrator. - `.woodpecker/nomad-validate.yml` — this directory's CI pipeline. +- `vault/policies/` — Vault ACL policy HCL files (S2.1); the + `vault-policy-fmt` / `vault-policy-validate` CI steps above enforce + their shape. See [`../vault/policies/AGENTS.md`](../vault/policies/AGENTS.md) + for the policy lifecycle, CI enforcement details, and common failure + modes. +- `vault/roles.yaml` — JWT-auth role → policy bindings (S2.3); the + `vault-roles-validate` CI step above keeps it in lockstep with the + policies directory. - Top-of-file headers in `server.hcl` / `client.hcl` / `vault.hcl` document the per-file ownership contract. diff --git a/nomad/jobs/forgejo.hcl b/nomad/jobs/forgejo.hcl new file mode 100644 index 0000000..4d15aec --- /dev/null +++ b/nomad/jobs/forgejo.hcl @@ -0,0 +1,189 @@ +# ============================================================================= +# nomad/jobs/forgejo.hcl — Forgejo git server (Nomad service job) +# +# Part of the Nomad+Vault migration (S1.1, issue #840; S2.4, issue #882). +# First jobspec to land under nomad/jobs/ — proves the docker driver + +# host_volume plumbing from Step 0 (client.hcl) by running a real factory +# service. S2.4 layered Vault integration on top: admin/internal secrets +# now render via workload identity + template stanza instead of inline env. +# +# Host_volume contract: +# This job mounts the `forgejo-data` host_volume declared in +# nomad/client.hcl. That volume is backed by /srv/disinto/forgejo-data on +# the factory box, created by lib/init/nomad/cluster-up.sh before any job +# references it. Keep the `source = "forgejo-data"` below in sync with the +# host_volume stanza in client.hcl — drift = scheduling failures. +# +# Vault integration (S2.4): +# - vault { role = "service-forgejo" } at the group scope — the task's +# workload-identity JWT is exchanged for a Vault token carrying the +# policy named on that role. Role + policy are defined in +# vault/roles.yaml + vault/policies/service-forgejo.hcl. +# - template { destination = "secrets/forgejo.env" env = true } pulls +# FORGEJO__security__{SECRET_KEY,INTERNAL_TOKEN} out of Vault KV v2 +# at kv/disinto/shared/forgejo and merges them into the task env. +# Seeded on fresh boxes by tools/vault-seed-forgejo.sh. +# - Non-secret env (DB type, ROOT_URL, ports, registration lockdown, +# webhook allow-list) stays inline below — not sensitive, not worth +# round-tripping through Vault. +# +# Not the runtime yet: docker-compose.yml is still the factory's live stack +# until cutover. This file exists so CI can validate it and S1.3 can wire +# `disinto init --backend=nomad --with forgejo` to `nomad job run` it. +# ============================================================================= + +job "forgejo" { + type = "service" + datacenters = ["dc1"] + + group "forgejo" { + count = 1 + + # ── Vault workload identity (S2.4, issue #882) ───────────────────────── + # `role = "service-forgejo"` is defined in vault/roles.yaml and + # applied by tools/vault-apply-roles.sh (S2.3). The role's bound + # claim pins nomad_job_id = "forgejo" — renaming this jobspec's + # `job "forgejo"` without updating vault/roles.yaml will make token + # exchange fail at placement with a "claim mismatch" error. + vault { + role = "service-forgejo" + } + + # Static :3000 matches docker-compose's published port so the rest of + # the factory (agents, woodpecker, caddy) keeps reaching forgejo at the + # same host:port during and after cutover. `to = 3000` maps the host + # port into the container's :3000 listener. + network { + port "http" { + static = 3000 + to = 3000 + } + } + + # Host-volume mount: declared in nomad/client.hcl, path + # /srv/disinto/forgejo-data on the factory box. + volume "forgejo-data" { + type = "host" + source = "forgejo-data" + read_only = false + } + + # Conservative restart policy — fail fast to the scheduler instead of + # spinning on a broken image/config. 3 attempts over 5m, then back off. + restart { + attempts = 3 + interval = "5m" + delay = "15s" + mode = "delay" + } + + # Native Nomad service discovery (no Consul in this factory cluster). + # Health check gates the service as healthy only after the API is up; + # initial_status is deliberately unset so Nomad waits for the first + # probe to pass before marking the allocation healthy on boot. + service { + name = "forgejo" + port = "http" + provider = "nomad" + + check { + type = "http" + path = "/api/v1/version" + interval = "10s" + timeout = "3s" + } + } + + task "forgejo" { + driver = "docker" + + config { + image = "codeberg.org/forgejo/forgejo:11.0" + ports = ["http"] + } + + volume_mount { + volume = "forgejo-data" + destination = "/data" + read_only = false + } + + # Non-secret env — DB type, public URL, ports, install lock, + # registration lockdown, webhook allow-list. Nothing sensitive here, + # so this stays inline. Secret-bearing env (SECRET_KEY, INTERNAL_TOKEN) + # lives in the template stanza below and is merged into task env. + env { + FORGEJO__database__DB_TYPE = "sqlite3" + FORGEJO__server__ROOT_URL = "http://forgejo:3000/" + FORGEJO__server__HTTP_PORT = "3000" + FORGEJO__security__INSTALL_LOCK = "true" + FORGEJO__service__DISABLE_REGISTRATION = "true" + FORGEJO__webhook__ALLOWED_HOST_LIST = "private" + } + + # ── Vault-templated secrets env (S2.4, issue #882) ────────────────── + # Renders `/secrets/forgejo.env` (per-alloc secrets dir, + # never on disk on the host root filesystem, never in `nomad job + # inspect` output). `env = true` merges every KEY=VAL line into the + # task environment. `change_mode = "restart"` re-runs the task + # whenever a watched secret's value in Vault changes — so `vault kv + # put …` alone is enough to roll new secrets; no manual + # `nomad alloc restart` required (though that also works — it + # forces a re-render). + # + # Vault path: `kv/data/disinto/shared/forgejo`. The literal `/data/` + # segment is required by consul-template for KV v2 mounts — without + # it the template would read from a KV v1 path that doesn't exist + # (the policy in vault/policies/service-forgejo.hcl grants + # `kv/data/disinto/shared/forgejo/*`, confirming v2). + # + # Empty-Vault fallback (`with ... else ...`): on a fresh LXC where + # the KV path is absent, consul-template's `with` short-circuits to + # the `else` branch. Emitting visible placeholders (instead of no + # env vars) means the container still boots, but with obviously-bad + # secrets that an operator will spot in `env | grep FORGEJO` — + # better than forgejo silently regenerating SECRET_KEY on every + # restart and invalidating every prior session. Seed the path with + # tools/vault-seed-forgejo.sh to replace the placeholders. + # + # Placeholder values are kept short on purpose: the repo-wide + # secret-scan (.woodpecker/secret-scan.yml → lib/secret-scan.sh) + # flags `TOKEN=<16+ non-space chars>` as a plaintext secret, so a + # descriptive long placeholder (e.g. "run-tools-vault-seed-...") on + # the INTERNAL_TOKEN line would fail CI on every PR that touched + # this file. "seed-me" is < 16 chars and still distinctive enough + # to surface in a `grep FORGEJO__security__` audit. The template + # comment below carries the operator-facing fix pointer. + # `error_on_missing_key = false` stops consul-template from blocking + # the alloc on template-pending when the Vault KV path exists but a + # referenced key is absent (or the path itself is absent and the + # else-branch placeholders are used). Without this, a fresh-LXC + # `disinto init --with forgejo` against an empty Vault hangs on + # template-pending until deploy.sh times out (issue #912, bug #4). + template { + destination = "secrets/forgejo.env" + env = true + change_mode = "restart" + error_on_missing_key = false + data = < + # Planner Agent **Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints), diff --git a/predictor/AGENTS.md b/predictor/AGENTS.md index 4f762c7..cec03a1 100644 --- a/predictor/AGENTS.md +++ b/predictor/AGENTS.md @@ -1,4 +1,4 @@ - + # Predictor Agent **Role**: Abstract adversary (the "goblin"). Runs a 2-step formula diff --git a/review/AGENTS.md b/review/AGENTS.md index 087f0f5..4c06b34 100644 --- a/review/AGENTS.md +++ b/review/AGENTS.md @@ -1,4 +1,4 @@ - + # Review Agent **Role**: AI-powered PR review — post structured findings and formal diff --git a/supervisor/AGENTS.md b/supervisor/AGENTS.md index 48b39bd..736f78f 100644 --- a/supervisor/AGENTS.md +++ b/supervisor/AGENTS.md @@ -1,4 +1,4 @@ - + # Supervisor Agent **Role**: Health monitoring and auto-remediation, executed as a formula-driven diff --git a/tests/disinto-init-nomad.bats b/tests/disinto-init-nomad.bats index 5b2648b..21f4303 100644 --- a/tests/disinto-init-nomad.bats +++ b/tests/disinto-init-nomad.bats @@ -34,7 +34,7 @@ setup_file() { [[ "$output" == *"nomad backend: default (cluster-up; jobs deferred to Step 1)"* ]] # All nine cluster-up dry-run steps, in order. - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"[dry-run] Step 2/9: write + enable nomad.service (NOT started)"* ]] [[ "$output" == *"[dry-run] Step 3/9: write + enable vault.service + vault.hcl (NOT started)"* ]] [[ "$output" == *"[dry-run] Step 4/9: create host-volume dirs under /srv/disinto/"* ]] @@ -57,7 +57,7 @@ setup_file() { # of the migration will branch on $empty to gate job deployment; today # both modes invoke the same cluster-up dry-run. [[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]] - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] [[ "$output" == *"Dry run complete — no changes made."* ]] } @@ -69,7 +69,7 @@ setup_file() { # Negative assertion: the nomad dispatcher banners must be absent. [[ "$output" != *"nomad backend:"* ]] - [[ "$output" != *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" != *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] # Positive assertion: docker-path output still appears — the existing # docker dry-run printed "=== disinto init ===" before listing the @@ -88,7 +88,7 @@ setup_file() { run "$DISINTO_BIN" init placeholder/repo --backend nomad --dry-run [ "$status" -eq 0 ] [[ "$output" == *"nomad backend: default"* ]] - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] } # ── Flag validation ────────────────────────────────────────────────────────── @@ -118,7 +118,7 @@ setup_file() { run "$DISINTO_BIN" init --backend=nomad --empty --dry-run [ "$status" -eq 0 ] [[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]] - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] # The bug symptom must be absent — backend was misdetected as docker # when --backend=nomad got swallowed as repo_url. [[ "$output" != *"--empty is only valid with --backend=nomad"* ]] @@ -128,7 +128,7 @@ setup_file() { run "$DISINTO_BIN" init --backend nomad --dry-run [ "$status" -eq 0 ] [[ "$output" == *"nomad backend: default"* ]] - [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries"* ]] + [[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]] } @test "disinto init (no args) still errors with 'repo URL required'" { @@ -143,3 +143,208 @@ setup_file() { [[ "$output" == *"repo URL required"* ]] [[ "$output" != *"Unknown option"* ]] } + +# ── --with flag tests ───────────────────────────────────────────────────────── + +@test "disinto init --backend=nomad --with forgejo --dry-run prints deploy plan" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"services to deploy: forgejo"* ]] + [[ "$output" == *"[deploy] [dry-run] nomad job validate"* ]] + [[ "$output" == *"[deploy] [dry-run] nomad job run -detach"* ]] + [[ "$output" == *"[deploy] dry-run complete"* ]] +} + +# S2.6 / #928 — every --with that ships tools/vault-seed-.sh +# must auto-invoke the seeder before deploy.sh runs. forgejo is the +# only service with a seeder today, so the dry-run plan must include +# its seed line when --with forgejo is set. The seed block must also +# appear BEFORE the deploy block (seeded secrets must exist before +# nomad reads the template stanza) — pinned here by scanning output +# order. Services without a seeder (e.g. unknown hypothetical future +# ones) are silently skipped by the loop convention. +@test "disinto init --backend=nomad --with forgejo --dry-run prints seed plan before deploy plan" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"Vault seed dry-run"* ]] + [[ "$output" == *"tools/vault-seed-forgejo.sh --dry-run"* ]] + # Order: seed header must appear before deploy header. + local seed_line deploy_line + seed_line=$(echo "$output" | grep -n "Vault seed dry-run" | head -1 | cut -d: -f1) + deploy_line=$(echo "$output" | grep -n "Deploy services dry-run" | head -1 | cut -d: -f1) + [ -n "$seed_line" ] + [ -n "$deploy_line" ] + [ "$seed_line" -lt "$deploy_line" ] +} + +# Regression guard (PR #929 review): `sudo -n VAR=val -- cmd` is subject +# to sudoers env_reset policy and silently drops VAULT_ADDR unless it's +# in env_keep (it isn't in default configs). vault-seed-forgejo.sh +# requires VAULT_ADDR and dies at its own precondition check if unset, +# so the non-root branch MUST invoke `sudo -n -- env VAR=val cmd` so +# that `env` sets the variable in the child process regardless of +# sudoers policy. This grep-level guard catches a revert to the unsafe +# form that silently broke non-root seed runs on a fresh LXC. +@test "seed loop invokes sudo via 'env VAR=val' (bypasses sudoers env_reset)" { + run grep -F 'sudo -n -- env "VAULT_ADDR=' "$DISINTO_BIN" + [ "$status" -eq 0 ] + # Negative: no bare `sudo -n "VAR=val" --` form anywhere in the file. + run grep -F 'sudo -n "VAULT_ADDR=' "$DISINTO_BIN" + [ "$status" -ne 0 ] +} + +@test "disinto init --backend=nomad --with forgejo,forgejo --dry-run handles comma-separated services" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,forgejo --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"services to deploy: forgejo,forgejo"* ]] +} + +@test "disinto init --backend=docker --with forgejo errors with '--with requires --backend=nomad'" { + run "$DISINTO_BIN" init placeholder/repo --backend=docker --with forgejo + [ "$status" -ne 0 ] + [[ "$output" == *"--with requires --backend=nomad"* ]] +} + +@test "disinto init --backend=nomad --empty --with forgejo errors with mutually exclusive" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --with forgejo + [ "$status" -ne 0 ] + [[ "$output" == *"--empty and --with are mutually exclusive"* ]] +} + +@test "disinto init --backend=nomad --with unknown-service errors with unknown service" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with unknown-service --dry-run + [ "$status" -ne 0 ] + [[ "$output" == *"unknown service"* ]] + [[ "$output" == *"known: forgejo"* ]] +} + +@test "disinto init --backend=nomad --with forgejo (flag=value syntax) works" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with=forgejo --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"services to deploy: forgejo"* ]] +} + +@test "disinto init --backend=nomad --with forgejo --empty --dry-run rejects in any order" { + run "$DISINTO_BIN" init placeholder/repo --with forgejo --backend=nomad --empty --dry-run + [ "$status" -ne 0 ] + [[ "$output" == *"--empty and --with are mutually exclusive"* ]] +} + +# ── --import-env / --import-sops / --age-key (S2.5, #883) ──────────────────── +# +# Step 2.5 wires Vault policies + JWT auth + optional KV import into +# `disinto init --backend=nomad`. The tests below exercise the flag +# grammar (who-requires-whom + who-requires-backend=nomad) and the +# dry-run plan shape (each --import-* flag prints its own path line, +# independently). A prior attempt at this issue regressed the "print +# every set flag" invariant by using if/elif — covered by the +# "--import-env --import-sops --age-key" case. + +@test "disinto init --backend=nomad --import-env only is accepted" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env /tmp/.env --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"--import-env"* ]] + [[ "$output" == *"env file: /tmp/.env"* ]] +} + +@test "disinto init --backend=nomad --import-sops without --age-key errors" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-sops /tmp/.env.vault.enc --dry-run + [ "$status" -ne 0 ] + [[ "$output" == *"--import-sops requires --age-key"* ]] +} + +@test "disinto init --backend=nomad --age-key without --import-sops errors" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --age-key /tmp/keys.txt --dry-run + [ "$status" -ne 0 ] + [[ "$output" == *"--age-key requires --import-sops"* ]] +} + +@test "disinto init --backend=docker --import-env errors with backend requirement" { + run "$DISINTO_BIN" init placeholder/repo --backend=docker --import-env /tmp/.env + [ "$status" -ne 0 ] + [[ "$output" == *"--import-env, --import-sops, and --age-key require --backend=nomad"* ]] +} + +@test "disinto init --backend=nomad --import-sops --age-key --dry-run shows import plan" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"Vault import dry-run"* ]] + [[ "$output" == *"--import-sops"* ]] + [[ "$output" == *"--age-key"* ]] + [[ "$output" == *"sops file: /tmp/.env.vault.enc"* ]] + [[ "$output" == *"age key: /tmp/keys.txt"* ]] +} + +# When all three flags are set, each one must print its own path line — +# if/elif regressed this to "only one printed" in a prior attempt (#883). +@test "disinto init --backend=nomad --import-env --import-sops --age-key --dry-run shows full import plan" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env /tmp/.env --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"Vault import dry-run"* ]] + [[ "$output" == *"env file: /tmp/.env"* ]] + [[ "$output" == *"sops file: /tmp/.env.vault.enc"* ]] + [[ "$output" == *"age key: /tmp/keys.txt"* ]] +} + +@test "disinto init --backend=nomad without import flags shows skip message" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"no --import-env/--import-sops"* ]] + [[ "$output" == *"skipping"* ]] +} + +@test "disinto init --backend=nomad --import-env --import-sops --age-key --with forgejo --dry-run shows all plans" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env /tmp/.env --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --with forgejo --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"Vault import dry-run"* ]] + [[ "$output" == *"Vault policies dry-run"* ]] + [[ "$output" == *"Vault auth dry-run"* ]] + [[ "$output" == *"Deploy services dry-run"* ]] +} + +@test "disinto init --backend=nomad --dry-run prints policies + auth plan even without --import-*" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --dry-run + [ "$status" -eq 0 ] + # Policies + auth run on every nomad path (idempotent), so the dry-run + # plan always lists them — regardless of whether --import-* is set. + [[ "$output" == *"Vault policies dry-run"* ]] + [[ "$output" == *"Vault auth dry-run"* ]] + [[ "$output" != *"Vault import dry-run"* ]] +} + +# --import-env=PATH (=-form) must work alongside --import-env PATH. +@test "disinto init --backend=nomad --import-env=PATH (equals form) works" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env=/tmp/.env --dry-run + [ "$status" -eq 0 ] + [[ "$output" == *"env file: /tmp/.env"* ]] +} + +# --empty short-circuits after cluster-up: no policies, no auth, no +# import, no deploy. The dry-run plan must match that — cluster-up plan +# appears, but none of the S2.x section banners do. +@test "disinto init --backend=nomad --empty --dry-run skips policies/auth/import sections" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --dry-run + [ "$status" -eq 0 ] + # Cluster-up still runs (it's what --empty brings up). + [[ "$output" == *"Cluster-up dry-run"* ]] + # Policies + auth + import must NOT appear under --empty. + [[ "$output" != *"Vault policies dry-run"* ]] + [[ "$output" != *"Vault auth dry-run"* ]] + [[ "$output" != *"Vault import dry-run"* ]] + [[ "$output" != *"no --import-env/--import-sops"* ]] +} + +# --empty + any --import-* flag silently does nothing (import is skipped), +# so the CLI rejects the combination up front rather than letting it +# look like the import "succeeded". +@test "disinto init --backend=nomad --empty --import-env errors" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --import-env /tmp/.env --dry-run + [ "$status" -ne 0 ] + [[ "$output" == *"--empty and --import-env/--import-sops/--age-key are mutually exclusive"* ]] +} + +@test "disinto init --backend=nomad --empty --import-sops --age-key errors" { + run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --dry-run + [ "$status" -ne 0 ] + [[ "$output" == *"--empty and --import-env/--import-sops/--age-key are mutually exclusive"* ]] +} diff --git a/tests/fixtures/.env.vault.enc b/tests/fixtures/.env.vault.enc new file mode 100644 index 0000000..2924dc9 --- /dev/null +++ b/tests/fixtures/.env.vault.enc @@ -0,0 +1,20 @@ +{ + "data": "ENC[AES256_GCM,data:SsLdIiZDVkkV1bbKeHQ8A1K/4vgXQFJF8y4J87GGwsGa13lNnPoqRaCmPAtuQr3hR5JNqARUhFp8aEusyzwi/lZLU2Reo32YjE26ObVOHf47EGmmHM/tEgh6u0fa1AmFtuqJVQzhG2eZhJmZJFgdRH36+bhdBwI1mkORmsRNtBPHHjtQJDbsgN47maDhuP4B7WvB4/TdnJ++GNMlMbyrbr0pEf2uqqOVO55cJ3I4v/Jcg8tq0clPuW1k5dNFsmFSMbbjE5N25EGrc7oEH5GVZ6I6L6p0Fzyj/MV4hKacboFHiZmBZgRQ,iv:UnXTa800G3PW4IaErkPBIZKjPHAU3LmiCvAqDdhFE/Q=,tag:kdWpHQ8fEPGFlmfVoTMskA==,type:str]", + "sops": { + "kms": null, + "gcp_kms": null, + "azure_kv": null, + "hc_vault": null, + "age": [ + { + "recipient": "age1ztkm8yvdk42m2cn4dj2v9ptfknq8wpgr3ry9dpmtmlaeas6p7yyqft0ldg", + "enc": "-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBrVUlmaEdTNU1iMGg4dFA4\nNFNOSzlBc1NER1U3SHlwVFU1dm5tR1kyeldzCjZ2NXI3MjR4Zkd1RVBKNzJoQ1Jm\nQWpEZU5VMkNuYnhTTVJNc0RpTXlIZE0KLS0tIDFpQ2tlN0MzL1NuS2hKZU5JTG9B\nNWxXMzE0bGZpQkVBTnhWRXZBQlhrc1EKG76DM98cCuqIwUkbfJWHhJdYV77O9r8Q\nRJrq6jH59Gcp9W8iHg/aeShPHZFEOLg1q9azV9Wt9FjJn3SxyTmgvA==\n-----END AGE ENCRYPTED FILE-----\n" + } + ], + "lastmodified": "2026-04-16T15:43:34Z", + "mac": "ENC[AES256_GCM,data:jVRr2TxSZH2paD2doIX4JwCqo5wiPYfTowpj189w1IVlS0EY/XQoqxiWbunX/LmIDdQlTPCSe/vTp1EJA0cx6vzN2xENrwsfzCP6dwDGaRlZhH3V0CVhtfHIkMTEKWrAUx5hFtiwJPkLYUUYi5aRWRxhZQM1eBeRvuGKdlwvmHA=,iv:H57a61AfVNLrlg+4aMl9mwXI5O38O5ZoRhpxe2PTTkY=,tag:2jwH1855VNYlKseTE/XtTg==,type:str]", + "pgp": null, + "unencrypted_suffix": "_unencrypted", + "version": "3.9.4" + } +} \ No newline at end of file diff --git a/tests/fixtures/age-keys.txt b/tests/fixtures/age-keys.txt new file mode 100644 index 0000000..081f2af --- /dev/null +++ b/tests/fixtures/age-keys.txt @@ -0,0 +1,5 @@ +# Test age key for sops +# Generated: 2026-04-16 +# Public key: age1ztkm8yvdk42m2cn4dj2v9ptfknq8wpgr3ry9dpmtmlaeas6p7yyqft0ldg + +AGE-SECRET-KEY-1PCQQX37MTZDGES76H9TGQN5XTG2ZZX2UUR87KR784NZ4MQ3NJ56S0Z23SF diff --git a/tests/fixtures/dot-env-complete b/tests/fixtures/dot-env-complete new file mode 100644 index 0000000..828b9a3 --- /dev/null +++ b/tests/fixtures/dot-env-complete @@ -0,0 +1,40 @@ +# Test fixture .env file for vault-import.sh +# This file contains all expected keys for the import test + +# Generic forge creds +FORGE_TOKEN=generic-forge-token +FORGE_PASS=generic-forge-pass +FORGE_ADMIN_TOKEN=generic-admin-token + +# Bot tokens (review, dev, gardener, architect, planner, predictor, supervisor, vault) +FORGE_REVIEW_TOKEN=review-token +FORGE_REVIEW_PASS=review-pass +FORGE_DEV_TOKEN=dev-token +FORGE_DEV_PASS=dev-pass +FORGE_GARDENER_TOKEN=gardener-token +FORGE_GARDENER_PASS=gardener-pass +FORGE_ARCHITECT_TOKEN=architect-token +FORGE_ARCHITECT_PASS=architect-pass +FORGE_PLANNER_TOKEN=planner-token +FORGE_PLANNER_PASS=planner-pass +FORGE_PREDICTOR_TOKEN=predictor-token +FORGE_PREDICTOR_PASS=predictor-pass +FORGE_SUPERVISOR_TOKEN=supervisor-token +FORGE_SUPERVISOR_PASS=supervisor-pass +FORGE_VAULT_TOKEN=vault-token +FORGE_VAULT_PASS=vault-pass + +# Llama bot +FORGE_TOKEN_LLAMA=llama-token +FORGE_PASS_LLAMA=llama-pass + +# Woodpecker secrets +WOODPECKER_AGENT_SECRET=wp-agent-secret +WP_FORGEJO_CLIENT=wp-forgejo-client +WP_FORGEJO_SECRET=wp-forgejo-secret +WOODPECKER_TOKEN=wp-token + +# Chat secrets +FORWARD_AUTH_SECRET=forward-auth-secret +CHAT_OAUTH_CLIENT_ID=chat-client-id +CHAT_OAUTH_CLIENT_SECRET=chat-client-secret diff --git a/tests/fixtures/dot-env-incomplete b/tests/fixtures/dot-env-incomplete new file mode 100644 index 0000000..9869944 --- /dev/null +++ b/tests/fixtures/dot-env-incomplete @@ -0,0 +1,27 @@ +# Test fixture .env file with missing required keys +# This file is intentionally missing some keys to test error handling + +# Generic forge creds - missing FORGE_ADMIN_TOKEN +FORGE_TOKEN=generic-forge-token +FORGE_PASS=generic-forge-pass + +# Bot tokens - missing several roles +FORGE_REVIEW_TOKEN=review-token +FORGE_REVIEW_PASS=review-pass +FORGE_DEV_TOKEN=dev-token +FORGE_DEV_PASS=dev-pass + +# Llama bot - missing (only token, no pass) +FORGE_TOKEN_LLAMA=llama-token +# FORGE_PASS_LLAMA=llama-pass + +# Woodpecker secrets - missing some +WOODPECKER_AGENT_SECRET=wp-agent-secret +# WP_FORGEJO_CLIENT=wp-forgejo-client +# WP_FORGEJO_SECRET=wp-forgejo-secret +# WOODPECKER_TOKEN=wp-token + +# Chat secrets - missing some +FORWARD_AUTH_SECRET=forward-auth-secret +# CHAT_OAUTH_CLIENT_ID=chat-client-id +# CHAT_OAUTH_CLIENT_SECRET=chat-client-secret diff --git a/tests/fixtures/dot-env.vault.plain b/tests/fixtures/dot-env.vault.plain new file mode 100644 index 0000000..e4b60c1 --- /dev/null +++ b/tests/fixtures/dot-env.vault.plain @@ -0,0 +1,6 @@ +GITHUB_TOKEN=github-test-token-abc123 +CODEBERG_TOKEN=codeberg-test-token-def456 +CLAWHUB_TOKEN=clawhub-test-token-ghi789 +DEPLOY_KEY=deploy-key-test-jkl012 +NPM_TOKEN=npm-test-token-mno345 +DOCKER_HUB_TOKEN=dockerhub-test-token-pqr678 diff --git a/tests/lib-generators.bats b/tests/lib-generators.bats new file mode 100644 index 0000000..b311325 --- /dev/null +++ b/tests/lib-generators.bats @@ -0,0 +1,161 @@ +#!/usr/bin/env bats +# ============================================================================= +# tests/lib-generators.bats — Regression guard for the #849 fix. +# +# Before #849, `_generate_local_model_services` emitted the forge-user env +# variable keyed by service name (`FORGE_BOT_USER_${service_name^^}`), so for +# an `[agents.llama]` block with `forge_user = "dev-qwen"` the compose file +# contained `FORGE_BOT_USER_LLAMA: "dev-qwen"`. That suffix diverges from the +# `FORGE_TOKEN_` / `FORGE_PASS_` convention that the +# same block uses two lines above, and it doesn't even round-trip through a +# dash-containing service name (`dev-qwen` → `DEV-QWEN`, which is not a valid +# shell identifier — see #852). +# +# The fix keys on `$user_upper` (already computed from `forge_user` via +# `tr 'a-z-' 'A-Z_'`), yielding `FORGE_BOT_USER_DEV_QWEN: "dev-qwen"`. +# ============================================================================= + +setup() { + ROOT="$(cd "$(dirname "$BATS_TEST_FILENAME")/.." && pwd)" + export FACTORY_ROOT="${BATS_TEST_TMPDIR}/factory" + mkdir -p "${FACTORY_ROOT}/projects" + + # Minimal compose skeleton that `_generate_local_model_services` can splice into. + # It only needs a `volumes:` marker line and nothing below it that would be + # re-read after the splice. + cat > "${FACTORY_ROOT}/docker-compose.yml" <<'EOF' +services: + agents: + image: placeholder + +volumes: + agent-data: +EOF +} + +@test "local-model agent service emits FORGE_BOT_USER keyed by forge_user (#849)" { + cat > "${FACTORY_ROOT}/projects/test.toml" <<'EOF' +name = "test" +repo = "test-owner/test-repo" +forge_url = "http://localhost:3000" + +[agents.llama] +base_url = "http://10.10.10.1:8081" +model = "qwen" +api_key = "sk-no-key-required" +roles = ["dev"] +forge_user = "dev-qwen" +compact_pct = 60 +EOF + + run bash -c " + set -euo pipefail + source '${ROOT}/lib/generators.sh' + _generate_local_model_services '${FACTORY_ROOT}/docker-compose.yml' + cat '${FACTORY_ROOT}/docker-compose.yml' + " + + [ "$status" -eq 0 ] + # New, forge_user-keyed suffix is present with the right value. + [[ "$output" == *'FORGE_BOT_USER_DEV_QWEN: "dev-qwen"'* ]] + # Legacy service-name-keyed suffix must not be emitted. + [[ "$output" != *'FORGE_BOT_USER_LLAMA'* ]] +} + +@test "local-model agent service emits local image ref + build: fallback (#853)" { + # Before #853 the generator emitted `image: ghcr.io/disinto/agents:` for + # every hired agent. The ghcr image isn't publicly pullable and the running + # deployment has no credentials, so `docker compose up` failed with `denied`. + # The fix: emit the registry-less local name (matches `disinto init --build` + # and the legacy agents-llama stanza) plus a build: directive so hosts + # without a pre-built image can rebuild locally. + cat > "${FACTORY_ROOT}/projects/test.toml" <<'EOF' +name = "test" +repo = "test-owner/test-repo" +forge_url = "http://localhost:3000" + +[agents.dev-qwen2] +base_url = "http://10.10.10.1:8081" +model = "qwen" +api_key = "sk-no-key-required" +roles = ["dev"] +forge_user = "dev-qwen2" +EOF + + run bash -c " + set -euo pipefail + source '${ROOT}/lib/generators.sh' + _generate_local_model_services '${FACTORY_ROOT}/docker-compose.yml' + cat '${FACTORY_ROOT}/docker-compose.yml' + " + + [ "$status" -eq 0 ] + # Local image ref — no ghcr prefix. + [[ "$output" == *'image: disinto/agents:${DISINTO_IMAGE_TAG:-latest}'* ]] + [[ "$output" != *'image: ghcr.io/disinto/agents'* ]] + # build: fallback so hosts without a pre-built image can rebuild. + [[ "$output" == *'dockerfile: docker/agents/Dockerfile'* ]] +} + +@test "local-model agent service emits pull_policy: build so docker compose up rebuilds on source change (#887)" { + # Without pull_policy: build, `docker compose up -d --force-recreate` reuses + # the cached `disinto/agents:latest` image and silently runs stale + # docker/agents/entrypoint.sh even after the repo is updated. `pull_policy: + # build` forces a rebuild on every up; BuildKit layer cache makes unchanged + # rebuilds near-instant. The alternative was requiring every operator to + # remember `--build` on every invocation, which was the bug that prompted + # #887 (2h of debugging a fix that was merged but never reached the container). + cat > "${FACTORY_ROOT}/projects/test.toml" <<'EOF' +name = "test" +repo = "test-owner/test-repo" +forge_url = "http://localhost:3000" + +[agents.dev-qwen2] +base_url = "http://10.10.10.1:8081" +model = "qwen" +api_key = "sk-no-key-required" +roles = ["dev"] +forge_user = "dev-qwen2" +EOF + + run bash -c " + set -euo pipefail + source '${ROOT}/lib/generators.sh' + _generate_local_model_services '${FACTORY_ROOT}/docker-compose.yml' + cat '${FACTORY_ROOT}/docker-compose.yml' + " + + [ "$status" -eq 0 ] + [[ "$output" == *'pull_policy: build'* ]] +} + +@test "local-model agent service keys FORGE_BOT_USER to forge_user even when it differs from service name (#849)" { + # Exercise the case the issue calls out: two agents in the same factory + # whose service names are identical (`[agents.llama]`) but whose + # forge_users diverge would previously both have emitted + # `FORGE_BOT_USER_LLAMA`. With the fix each emission carries its own + # forge_user-derived suffix. + cat > "${FACTORY_ROOT}/projects/a.toml" <<'EOF' +name = "a" +repo = "a/a" +forge_url = "http://localhost:3000" + +[agents.dev] +base_url = "http://10.10.10.1:8081" +model = "qwen" +api_key = "sk-no-key-required" +roles = ["dev"] +forge_user = "review-qwen" +EOF + + run bash -c " + set -euo pipefail + source '${ROOT}/lib/generators.sh' + _generate_local_model_services '${FACTORY_ROOT}/docker-compose.yml' + cat '${FACTORY_ROOT}/docker-compose.yml' + " + + [ "$status" -eq 0 ] + [[ "$output" == *'FORGE_BOT_USER_REVIEW_QWEN: "review-qwen"'* ]] + [[ "$output" != *'FORGE_BOT_USER_DEV:'* ]] +} diff --git a/tests/lib-hvault.bats b/tests/lib-hvault.bats index 628bc99..2d779dc 100644 --- a/tests/lib-hvault.bats +++ b/tests/lib-hvault.bats @@ -126,7 +126,7 @@ setup() { @test "hvault_policy_apply creates a policy" { local pfile="${BATS_TEST_TMPDIR}/test-policy.hcl" cat > "$pfile" <<'HCL' -path "secret/data/test/*" { +path "kv/data/test/*" { capabilities = ["read"] } HCL @@ -138,12 +138,12 @@ HCL run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ "${VAULT_ADDR}/v1/sys/policies/acl/test-reader" [ "$status" -eq 0 ] - echo "$output" | jq -e '.data.policy' | grep -q "secret/data/test" + echo "$output" | jq -e '.data.policy' | grep -q "kv/data/test" } @test "hvault_policy_apply is idempotent" { local pfile="${BATS_TEST_TMPDIR}/idem-policy.hcl" - printf 'path "secret/*" { capabilities = ["list"] }\n' > "$pfile" + printf 'path "kv/*" { capabilities = ["list"] }\n' > "$pfile" run hvault_policy_apply "idem-policy" "$pfile" [ "$status" -eq 0 ] diff --git a/tests/lib-load-project.bats b/tests/lib-load-project.bats new file mode 100644 index 0000000..f0c583a --- /dev/null +++ b/tests/lib-load-project.bats @@ -0,0 +1,253 @@ +#!/usr/bin/env bats +# ============================================================================= +# tests/lib-load-project.bats — Regression guard for the #862 fix. +# +# TOML allows dashes in bare keys, so `[agents.dev-qwen2]` is a valid section +# header. Before #862, load-project.sh translated the section name into a +# shell variable name via Python's `.upper()` alone, which kept the dash and +# produced `AGENT_DEV-QWEN2_BASE_URL`. `export "AGENT_DEV-QWEN2_..."` is +# rejected by bash ("not a valid identifier"), and with `set -euo pipefail` +# anywhere up-stack that error aborts load-project.sh — effectively crashing +# the factory on the N+1 run after a dashed agent was hired. +# +# The fix normalizes via `.upper().replace('-', '_')`, matching the +# `tr 'a-z-' 'A-Z_'` convention already used in hire-agent.sh and +# generators.sh. +# ============================================================================= + +setup() { + ROOT="$(cd "$(dirname "$BATS_TEST_FILENAME")/.." && pwd)" + TOML="${BATS_TEST_TMPDIR}/test.toml" +} + +@test "dashed [agents.*] section name parses without error" { + cat > "$TOML" < "$TOML" < "$TOML" < "$TOML" <&1 + echo \"GOOD=\${MIRROR_GOOD:-MISSING}\" + " + + # Whole load did not abort under set -e. + [ "$status" -eq 0 ] + # The valid mirror still loads. + [[ "$output" == *"GOOD=https://example.com/good"* ]] + # The invalid one triggers a warning; load continues instead of crashing. + [[ "$output" == *"skipping invalid shell identifier"* ]] + [[ "$output" == *"MIRROR_BAD-NAME"* ]] +} + +@test "[agents.*] quoted section with space: warn-and-skip, does not crash" { + # TOML permits quoted keys with arbitrary characters. A hand-edited + # `[agents."weird name"]` would survive the Python .replace('-', '_') + # (because it has no dash) but still contains a space, which would + # yield AGENT_WEIRD NAME_BASE_URL — not a valid identifier. + cat > "$TOML" <<'EOF' +name = "test" +repo = "test-owner/test-repo" +forge_url = "http://localhost:3000" + +[agents.llama] +base_url = "http://10.10.10.1:8081" +model = "qwen" + +[agents."weird name"] +base_url = "http://10.10.10.1:8082" +model = "qwen-bad" +EOF + + run bash -c " + set -euo pipefail + source '${ROOT}/lib/load-project.sh' '$TOML' 2>&1 + echo \"LLAMA=\${AGENT_LLAMA_BASE_URL:-MISSING}\" + " + + # The sane sibling must still be loaded despite the malformed neighbour. + [ "$status" -eq 0 ] + [[ "$output" == *"LLAMA=http://10.10.10.1:8081"* ]] + # The invalid agent's identifier triggers a warning and is skipped. + [[ "$output" == *"skipping invalid shell identifier"* ]] +} diff --git a/tests/vault-import.bats b/tests/vault-import.bats new file mode 100644 index 0000000..890a900 --- /dev/null +++ b/tests/vault-import.bats @@ -0,0 +1,360 @@ +#!/usr/bin/env bats +# tests/vault-import.bats — Tests for tools/vault-import.sh +# +# Runs against a dev-mode Vault server (single binary, no LXC needed). +# CI launches vault server -dev inline before running these tests. + +VAULT_BIN="${VAULT_BIN:-vault}" +IMPORT_SCRIPT="${BATS_TEST_DIRNAME}/../tools/vault-import.sh" +FIXTURES_DIR="${BATS_TEST_DIRNAME}/fixtures" + +setup_file() { + # Start dev-mode vault on a random port + export VAULT_DEV_PORT + VAULT_DEV_PORT="$(shuf -i 18200-18299 -n 1)" + export VAULT_ADDR="http://127.0.0.1:${VAULT_DEV_PORT}" + + "$VAULT_BIN" server -dev \ + -dev-listen-address="127.0.0.1:${VAULT_DEV_PORT}" \ + -dev-root-token-id="test-root-token" \ + -dev-no-store-token \ + &>"${BATS_FILE_TMPDIR}/vault.log" & + export VAULT_PID=$! + + export VAULT_TOKEN="test-root-token" + + # Wait for vault to be ready (up to 10s) + local i=0 + while ! curl -sf "${VAULT_ADDR}/v1/sys/health" >/dev/null 2>&1; do + sleep 0.5 + i=$((i + 1)) + if [ "$i" -ge 20 ]; then + echo "Vault failed to start. Log:" >&2 + cat "${BATS_FILE_TMPDIR}/vault.log" >&2 + return 1 + fi + done + + # Enable kv-v2 at path=kv (production mount per S2 migration). Dev-mode + # vault only auto-mounts kv-v2 at secret/; tests must mirror the real + # cluster layout so vault-import.sh writes land where we read them. + curl -sf -H "X-Vault-Token: test-root-token" \ + -X POST -d '{"type":"kv","options":{"version":"2"}}' \ + "${VAULT_ADDR}/v1/sys/mounts/kv" >/dev/null +} + +teardown_file() { + if [ -n "${VAULT_PID:-}" ]; then + kill "$VAULT_PID" 2>/dev/null || true + wait "$VAULT_PID" 2>/dev/null || true + fi +} + +setup() { + # Source the module under test for hvault functions + source "${BATS_TEST_DIRNAME}/../lib/hvault.sh" + export VAULT_ADDR VAULT_TOKEN +} + +# --- Security checks --- + +@test "refuses to run if VAULT_ADDR is not localhost" { + export VAULT_ADDR="http://prod-vault.example.com:8200" + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -ne 0 ] + echo "$output" | grep -q "Security check failed" +} + +@test "refuses if age key file permissions are not 0400" { + # Create a temp file with wrong permissions + local bad_key="${BATS_TEST_TMPDIR}/bad-ages.txt" + echo "AGE-SECRET-KEY-1TEST" > "$bad_key" + chmod 644 "$bad_key" + + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$bad_key" + [ "$status" -ne 0 ] + echo "$output" | grep -q "permissions" +} + +# --- Dry-run mode ───────────────────────────────────────────────────────────── + +@test "--dry-run prints plan without writing to Vault" { + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" \ + --dry-run + [ "$status" -eq 0 ] + echo "$output" | grep -q "DRY-RUN" + echo "$output" | grep -q "Import plan" + echo "$output" | grep -q "Planned operations" + + # Verify nothing was written to Vault + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/bots/review" + [ "$status" -ne 0 ] +} + +# --- Complete fixture import ───────────────────────────────────────────────── + +@test "imports all keys from complete fixture" { + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -eq 0 ] + + # Check bots/review + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/bots/review" + [ "$status" -eq 0 ] + echo "$output" | grep -q "review-token" + echo "$output" | grep -q "review-pass" + + # Check bots/dev-qwen + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/bots/dev-qwen" + [ "$status" -eq 0 ] + echo "$output" | grep -q "llama-token" + echo "$output" | grep -q "llama-pass" + + # Check forge + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/shared/forge" + [ "$status" -eq 0 ] + echo "$output" | grep -q "generic-forge-token" + echo "$output" | grep -q "generic-forge-pass" + echo "$output" | grep -q "generic-admin-token" + + # Check woodpecker + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/shared/woodpecker" + [ "$status" -eq 0 ] + echo "$output" | grep -q "wp-agent-secret" + echo "$output" | grep -q "wp-forgejo-client" + echo "$output" | grep -q "wp-forgejo-secret" + echo "$output" | grep -q "wp-token" + + # Check chat + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/shared/chat" + [ "$status" -eq 0 ] + echo "$output" | grep -q "forward-auth-secret" + echo "$output" | grep -q "chat-client-id" + echo "$output" | grep -q "chat-client-secret" + + # Check runner tokens from sops + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/runner/GITHUB_TOKEN" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.data.data.value == "github-test-token-abc123"' +} + +# --- Idempotency ────────────────────────────────────────────────────────────── + +@test "re-run with unchanged fixtures reports all unchanged" { + # First run + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -eq 0 ] + + # Second run - should report unchanged + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -eq 0 ] + + # Check that all keys report unchanged + echo "$output" | grep -q "unchanged" + # Count unchanged occurrences (should be many) + local unchanged_count + unchanged_count=$(echo "$output" | grep -c "unchanged" || true) + [ "$unchanged_count" -gt 10 ] +} + +@test "re-run with modified value reports only that key as updated" { + # Create a modified fixture + local modified_env="${BATS_TEST_TMPDIR}/dot-env-modified" + cp "$FIXTURES_DIR/dot-env-complete" "$modified_env" + + # Modify one value + sed -i 's/llama-token/MODIFIED-LLAMA-TOKEN/' "$modified_env" + + # Run with modified fixture + run "$IMPORT_SCRIPT" \ + --env "$modified_env" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -eq 0 ] + + # Check that dev-qwen token was updated + echo "$output" | grep -q "dev-qwen.*updated" + + # Verify the new value was written (path is disinto/bots/dev-qwen, key is token) + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/bots/dev-qwen" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.data.data.token == "MODIFIED-LLAMA-TOKEN"' +} + +# --- Delimiter-in-value regression (#898) ──────────────────────────────────── + +@test "preserves secret values that contain a pipe character" { + # Regression: previous accumulator packed values into "value|status" and + # joined per-path kv pairs with '|', so any value containing '|' was + # silently truncated or misrouted. + local piped_env="${BATS_TEST_TMPDIR}/dot-env-piped" + cp "$FIXTURES_DIR/dot-env-complete" "$piped_env" + + # Swap in values that contain the old delimiter. Exercise both: + # - a paired bot path (token + pass on same vault path, hitting the + # per-path kv-pair join) + # - a single-key path (admin token) + # Values are single-quoted so they survive `source` of the .env file; + # `|` is a shell metachar and unquoted would start a pipeline. That is + # orthogonal to the accumulator bug under test — users are expected to + # quote such values in .env, and the accumulator must then preserve them. + sed -i "s#^FORGE_REVIEW_TOKEN=.*#FORGE_REVIEW_TOKEN='abc|xyz'#" "$piped_env" + sed -i "s#^FORGE_REVIEW_PASS=.*#FORGE_REVIEW_PASS='p1|p2|p3'#" "$piped_env" + sed -i "s#^FORGE_ADMIN_TOKEN=.*#FORGE_ADMIN_TOKEN='admin|with|pipes'#" "$piped_env" + + run "$IMPORT_SCRIPT" \ + --env "$piped_env" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -eq 0 ] + + # Verify each value round-trips intact. + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/bots/review" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.data.data.token == "abc|xyz"' + echo "$output" | jq -e '.data.data.pass == "p1|p2|p3"' + + run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/kv/data/disinto/shared/forge" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.data.data.admin_token == "admin|with|pipes"' +} + +# --- Incomplete fixture ─────────────────────────────────────────────────────── + +@test "handles incomplete fixture gracefully" { + # The incomplete fixture is missing some keys, but that should be OK + # - it should only import what exists + # - it should warn about missing pairs + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-incomplete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -eq 0 ] + + # Should have imported what was available + echo "$output" | grep -q "review" + + # Should complete successfully even with incomplete fixture + # The script handles missing pairs gracefully with warnings to stderr + [ "$status" -eq 0 ] +} + +# --- Security: no secrets in output ─────────────────────────────────────────── + +@test "never logs secret values in stdout" { + # Run the import + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -eq 0 ] + + # Check that no actual secret values appear in output + # (only key names and status messages) + local secret_patterns=( + "generic-forge-token" + "generic-forge-pass" + "generic-admin-token" + "review-token" + "review-pass" + "llama-token" + "llama-pass" + "wp-agent-secret" + "forward-auth-secret" + "github-test-token" + "codeberg-test-token" + "clawhub-test-token" + "deploy-key-test" + "npm-test-token" + "dockerhub-test-token" + ) + + for pattern in "${secret_patterns[@]}"; do + if echo "$output" | grep -q "$pattern"; then + echo "FAIL: Found secret pattern '$pattern' in output" >&2 + echo "Output was:" >&2 + echo "$output" >&2 + return 1 + fi + done +} + +# --- Error handling ─────────────────────────────────────────────────────────── + +@test "fails with missing --env argument" { + run "$IMPORT_SCRIPT" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -ne 0 ] + echo "$output" | grep -q "Missing required argument" +} + +@test "fails with missing --sops argument" { + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -ne 0 ] + echo "$output" | grep -q "Missing required argument" +} + +@test "fails with missing --age-key argument" { + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" + [ "$status" -ne 0 ] + echo "$output" | grep -q "Missing required argument" +} + +@test "fails with non-existent env file" { + run "$IMPORT_SCRIPT" \ + --env "/nonexistent/.env" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -ne 0 ] + echo "$output" | grep -q "not found" +} + +@test "fails with non-existent sops file" { + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "/nonexistent/.env.vault.enc" \ + --age-key "$FIXTURES_DIR/age-keys.txt" + [ "$status" -ne 0 ] + echo "$output" | grep -q "not found" +} + +@test "fails with non-existent age key file" { + run "$IMPORT_SCRIPT" \ + --env "$FIXTURES_DIR/dot-env-complete" \ + --sops "$FIXTURES_DIR/.env.vault.enc" \ + --age-key "/nonexistent/age-keys.txt" + [ "$status" -ne 0 ] + echo "$output" | grep -q "not found" +} diff --git a/tools/vault-apply-policies.sh b/tools/vault-apply-policies.sh new file mode 100755 index 0000000..f425f17 --- /dev/null +++ b/tools/vault-apply-policies.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash +# ============================================================================= +# tools/vault-apply-policies.sh — Idempotent Vault policy sync +# +# Part of the Nomad+Vault migration (S2.1, issue #879). Reads every +# vault/policies/*.hcl file and upserts it into Vault as an ACL policy +# named after the file's basename (without the .hcl suffix). +# +# Idempotency contract: +# For each vault/policies/.hcl: +# - Policy missing in Vault → apply, log "policy created" +# - Policy present, content same → skip, log "policy unchanged" +# - Policy present, content diff → apply, log "policy updated" +# +# Comparison is byte-for-byte against the on-server policy text returned by +# GET sys/policies/acl/.data.policy. Re-running with no file edits is +# a guaranteed no-op that reports every policy as "unchanged". +# +# --dry-run: prints for each file that WOULD be applied; +# does not call Vault at all (no GETs, no PUTs). Exits 0. +# +# Requires: +# - VAULT_ADDR (e.g. http://127.0.0.1:8200) +# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh) +# - curl, jq, sha256sum +# +# Usage: +# tools/vault-apply-policies.sh +# tools/vault-apply-policies.sh --dry-run +# +# Exit codes: +# 0 success (policies synced, or --dry-run completed) +# 1 precondition / API failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +POLICIES_DIR="${REPO_ROOT}/vault/policies" + +# shellcheck source=../lib/hvault.sh +source "${REPO_ROOT}/lib/hvault.sh" + +log() { printf '[vault-apply] %s\n' "$*"; } +die() { printf '[vault-apply] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Flag parsing ───────────────────────────────────────────────────────────── +# Single optional flag — no loop needed. Keeps this block textually distinct +# from the multi-flag `while/case` parsers elsewhere in the repo (see +# .woodpecker/detect-duplicates.py — sliding 5-line window). +dry_run=false +[ "$#" -le 1 ] || die "too many arguments (saw: $*)" +case "${1:-}" in + '') ;; + --dry-run) dry_run=true ;; + -h|--help) printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" + printf 'Apply every vault/policies/*.hcl to Vault as an ACL policy.\n' + printf 'Idempotent: unchanged policies are reported as "unchanged" and\n' + printf 'not written.\n\n' + printf ' --dry-run Print policy names + content SHA256 that would be\n' + printf ' applied, without contacting Vault. Exits 0.\n' + exit 0 ;; + *) die "unknown flag: $1" ;; +esac + +# ── Preconditions ──────────────────────────────────────────────────────────── +for bin in curl jq sha256sum; do + command -v "$bin" >/dev/null 2>&1 \ + || die "required binary not found: ${bin}" +done + +[ -d "$POLICIES_DIR" ] \ + || die "policies directory not found: ${POLICIES_DIR}" + +# Collect policy files in a stable (lexicographic) order so log output is +# deterministic across runs and CI diffs. +mapfile -t POLICY_FILES < <( + find "$POLICIES_DIR" -maxdepth 1 -type f -name '*.hcl' | LC_ALL=C sort +) + +if [ "${#POLICY_FILES[@]}" -eq 0 ]; then + die "no *.hcl files in ${POLICIES_DIR}" +fi + +# ── Dry-run: print plan + exit (no Vault calls) ────────────────────────────── +if [ "$dry_run" = true ]; then + log "dry-run — ${#POLICY_FILES[@]} policy file(s) in ${POLICIES_DIR}" + for f in "${POLICY_FILES[@]}"; do + name="$(basename "$f" .hcl)" + sha="$(sha256sum "$f" | awk '{print $1}')" + printf '[vault-apply] would apply policy %s (sha256=%s)\n' "$name" "$sha" + done + exit 0 +fi + +# ── Live run: Vault connectivity check ─────────────────────────────────────── +# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env). +# `disinto init` does not export VAULT_ADDR before calling this script — the +# server is reachable on 127.0.0.1:8200 and the root token lives at +# /etc/vault.d/root.token in the common fresh-LXC case (issue #912). +_hvault_default_env + +# hvault_token_lookup both resolves the token (env or /etc/vault.d/root.token) +# and confirms the server is reachable with a valid token. Fail fast here so +# the per-file loop below doesn't emit N identical "HTTP 403" errors. +hvault_token_lookup >/dev/null \ + || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" + +# ── Apply each policy, reporting created/updated/unchanged ─────────────────── +log "syncing ${#POLICY_FILES[@]} polic(y|ies) from ${POLICIES_DIR}" + +for f in "${POLICY_FILES[@]}"; do + name="$(basename "$f" .hcl)" + + desired="$(cat "$f")" + # hvault_get_or_empty returns the raw JSON body on 200 or empty on 404. + # Extract the .data.policy field here (jq on "" yields "", so the + # empty-string-means-create branch below still works). + raw="$(hvault_get_or_empty "sys/policies/acl/${name}")" \ + || die "failed to read existing policy: ${name}" + if [ -n "$raw" ]; then + current="$(printf '%s' "$raw" | jq -r '.data.policy // ""')" \ + || die "failed to parse policy response: ${name}" + else + current="" + fi + + if [ -z "$current" ]; then + hvault_policy_apply "$name" "$f" \ + || die "failed to create policy: ${name}" + log "policy ${name} created" + continue + fi + + if [ "$current" = "$desired" ]; then + log "policy ${name} unchanged" + continue + fi + + hvault_policy_apply "$name" "$f" \ + || die "failed to update policy: ${name}" + log "policy ${name} updated" +done + +log "done — ${#POLICY_FILES[@]} polic(y|ies) synced" diff --git a/tools/vault-apply-roles.sh b/tools/vault-apply-roles.sh new file mode 100755 index 0000000..8509493 --- /dev/null +++ b/tools/vault-apply-roles.sh @@ -0,0 +1,308 @@ +#!/usr/bin/env bash +# ============================================================================= +# tools/vault-apply-roles.sh — Idempotent Vault JWT-auth role sync +# +# Part of the Nomad+Vault migration (S2.3, issue #881). Reads +# vault/roles.yaml and upserts each entry as a Vault role under +# auth/jwt-nomad/role/. +# +# Idempotency contract: +# For each role entry in vault/roles.yaml: +# - Role missing in Vault → write, log "role created" +# - Role present, fields match → skip, log "role unchanged" +# - Role present, fields differ → write, log "role updated" +# +# Comparison is per-field on the data the CLI would read back +# (GET auth/jwt-nomad/role/.data.{policies,bound_audiences, +# bound_claims,token_ttl,token_max_ttl,token_type}). Only the fields +# this script owns are compared — a future field added by hand in +# Vault would not be reverted on the next run. +# +# --dry-run: prints the planned role list + full payload for each role +# WITHOUT touching Vault. Exits 0. +# +# Preconditions: +# - Vault auth method jwt-nomad must already be enabled + configured +# (done by lib/init/nomad/vault-nomad-auth.sh — which then calls +# this script). Running this script standalone against a Vault with +# no jwt-nomad path will fail on the first role write. +# - vault/roles.yaml present. See that file's header for the format. +# +# Requires: +# - VAULT_ADDR (e.g. http://127.0.0.1:8200) +# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh) +# - curl, jq, awk +# +# Usage: +# tools/vault-apply-roles.sh +# tools/vault-apply-roles.sh --dry-run +# +# Exit codes: +# 0 success (roles synced, or --dry-run completed) +# 1 precondition / API / parse failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +ROLES_FILE="${REPO_ROOT}/vault/roles.yaml" + +# shellcheck source=../lib/hvault.sh +source "${REPO_ROOT}/lib/hvault.sh" + +# Constants shared across every role — the issue's AC names these as the +# invariant token shape for Nomad workload identity. Bumping any of these +# is a knowing, repo-wide change, not a per-role knob, so they live here +# rather than as per-entry fields in roles.yaml. +ROLE_AUDIENCE="vault.io" +ROLE_TOKEN_TYPE="service" +ROLE_TOKEN_TTL="1h" +ROLE_TOKEN_MAX_TTL="24h" + +log() { printf '[vault-roles] %s\n' "$*"; } +die() { printf '[vault-roles] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Flag parsing (single optional flag — see vault-apply-policies.sh for the +# sibling grammar). Structured as arg-count guard + dispatch to keep the +# 5-line sliding-window duplicate detector (.woodpecker/detect-duplicates.py) +# from flagging this as shared boilerplate with vault-apply-policies.sh — +# the two parsers implement the same shape but with different control flow. +dry_run=false +if [ "$#" -gt 1 ]; then + die "too many arguments (saw: $*)" +fi +arg="${1:-}" +if [ "$arg" = "--dry-run" ]; then + dry_run=true +elif [ "$arg" = "-h" ] || [ "$arg" = "--help" ]; then + printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" + printf 'Apply every role in vault/roles.yaml to Vault as a\n' + printf 'jwt-nomad role. Idempotent: unchanged roles are reported\n' + printf 'as "unchanged" and not written.\n\n' + printf ' --dry-run Print the planned role list + full role\n' + printf ' payload without contacting Vault. Exits 0.\n' + exit 0 +elif [ -n "$arg" ]; then + die "unknown flag: $arg" +fi +unset arg + +# ── Preconditions ──────────────────────────────────────────────────────────── +for bin in curl jq awk; do + command -v "$bin" >/dev/null 2>&1 \ + || die "required binary not found: ${bin}" +done + +[ -f "$ROLES_FILE" ] \ + || die "roles file not found: ${ROLES_FILE}" + +# ── Parse vault/roles.yaml → TSV ───────────────────────────────────────────── +# Strict-format parser. One awk pass; emits one TAB-separated line per role: +# \t\t\t +# +# Grammar: a record opens on a line matching `- name: ` and closes +# on the next `- name:` or EOF. Within a record, `policy:`, `namespace:`, +# and `job_id:` lines populate the record. Comments (`#...`) and blank +# lines are ignored. Whitespace around the colon and value is trimmed. +# +# This is intentionally narrower than full YAML — the file's header +# documents the exact subset. If someone adds nested maps, arrays, or +# anchors, this parser will silently drop them; the completeness check +# below catches records missing any of the four fields. +parse_roles() { + awk ' + function trim(s) { sub(/^[[:space:]]+/, "", s); sub(/[[:space:]]+$/, "", s); return s } + function strip_comment(s) { sub(/[[:space:]]+#.*$/, "", s); return s } + function emit() { + if (name != "") { + if (policy == "" || namespace == "" || job_id == "") { + printf "INCOMPLETE\t%s\t%s\t%s\t%s\n", name, policy, namespace, job_id + } else { + printf "%s\t%s\t%s\t%s\n", name, policy, namespace, job_id + } + } + name=""; policy=""; namespace=""; job_id="" + } + BEGIN { name=""; policy=""; namespace=""; job_id="" } + # Strip full-line comments and blank lines early. + /^[[:space:]]*#/ { next } + /^[[:space:]]*$/ { next } + # New record: "- name: " + /^[[:space:]]*-[[:space:]]+name:[[:space:]]/ { + emit() + line=strip_comment($0) + sub(/^[[:space:]]*-[[:space:]]+name:[[:space:]]*/, "", line) + name=trim(line) + next + } + # Field within current record. Only accept when a record is open. + /^[[:space:]]+policy:[[:space:]]/ && name != "" { + line=strip_comment($0); sub(/^[[:space:]]+policy:[[:space:]]*/, "", line) + policy=trim(line); next + } + /^[[:space:]]+namespace:[[:space:]]/ && name != "" { + line=strip_comment($0); sub(/^[[:space:]]+namespace:[[:space:]]*/, "", line) + namespace=trim(line); next + } + /^[[:space:]]+job_id:[[:space:]]/ && name != "" { + line=strip_comment($0); sub(/^[[:space:]]+job_id:[[:space:]]*/, "", line) + job_id=trim(line); next + } + END { emit() } + ' "$ROLES_FILE" +} + +mapfile -t ROLE_RECORDS < <(parse_roles) + +if [ "${#ROLE_RECORDS[@]}" -eq 0 ]; then + die "no roles parsed from ${ROLES_FILE}" +fi + +# Validate every record is complete. An INCOMPLETE line has the form +# "INCOMPLETE\t\t\t\t" — list all of +# them at once so the operator sees every missing field, not one per run. +incomplete=() +for rec in "${ROLE_RECORDS[@]}"; do + case "$rec" in + INCOMPLETE*) incomplete+=("${rec#INCOMPLETE$'\t'}") ;; + esac +done +if [ "${#incomplete[@]}" -gt 0 ]; then + printf '[vault-roles] ERROR: role entries with missing fields:\n' >&2 + for row in "${incomplete[@]}"; do + IFS=$'\t' read -r name policy namespace job_id <<<"$row" + printf ' - name=%-24s policy=%-22s namespace=%-10s job_id=%s\n' \ + "${name:-}" "${policy:-}" \ + "${namespace:-}" "${job_id:-}" >&2 + done + die "fix ${ROLES_FILE} and re-run" +fi + +# ── Helper: build the JSON payload Vault expects for a role ────────────────── +# Keeps bound_audiences as a JSON array (required by the API — a scalar +# string silently becomes a one-element-list in the CLI but the HTTP API +# rejects it). All fields that differ between runs are inside this payload +# so the diff-check below (role_fields_match) compares like-for-like. +build_payload() { + local policy="$1" namespace="$2" job_id="$3" + jq -n \ + --arg aud "$ROLE_AUDIENCE" \ + --arg policy "$policy" \ + --arg ns "$namespace" \ + --arg job "$job_id" \ + --arg ttype "$ROLE_TOKEN_TYPE" \ + --arg ttl "$ROLE_TOKEN_TTL" \ + --arg maxttl "$ROLE_TOKEN_MAX_TTL" \ + '{ + role_type: "jwt", + bound_audiences: [$aud], + user_claim: "nomad_job_id", + bound_claims: { nomad_namespace: $ns, nomad_job_id: $job }, + token_type: $ttype, + token_policies: [$policy], + token_ttl: $ttl, + token_max_ttl: $maxttl + }' +} + +# ── Dry-run: print plan + exit (no Vault calls) ────────────────────────────── +if [ "$dry_run" = true ]; then + log "dry-run — ${#ROLE_RECORDS[@]} role(s) in ${ROLES_FILE}" + for rec in "${ROLE_RECORDS[@]}"; do + IFS=$'\t' read -r name policy namespace job_id <<<"$rec" + payload="$(build_payload "$policy" "$namespace" "$job_id")" + printf '[vault-roles] would apply role %s → policy=%s namespace=%s job_id=%s\n' \ + "$name" "$policy" "$namespace" "$job_id" + printf '%s\n' "$payload" | jq -S . | sed 's/^/ /' + done + exit 0 +fi + +# ── Live run: Vault connectivity check ─────────────────────────────────────── +# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env). +# Called transitively from vault-nomad-auth.sh during `disinto init`, which +# does not export VAULT_ADDR in the common fresh-LXC case (issue #912). +_hvault_default_env +if ! hvault_token_lookup >/dev/null; then + die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" +fi + +# ── Helper: compare on-server role to desired payload ──────────────────────── +# Returns 0 iff every field this script owns matches. Fields not in our +# payload (e.g. a manually-added `ttl` via the UI) are ignored — we don't +# revert them, but we also don't block on them. +role_fields_match() { + local current_json="$1" desired_json="$2" + local keys=( + role_type bound_audiences user_claim bound_claims + token_type token_policies token_ttl token_max_ttl + ) + # Vault returns token_ttl/token_max_ttl as integers (seconds) on GET but + # accepts strings ("1h") on PUT. Normalize: convert desired durations to + # seconds before comparing. jq's tonumber/type checks give us a uniform + # representation on both sides. + local cur des + for k in "${keys[@]}"; do + cur="$(printf '%s' "$current_json" | jq -cS --arg k "$k" '.data[$k] // null')" + des="$(printf '%s' "$desired_json" | jq -cS --arg k "$k" '.[$k] // null')" + case "$k" in + token_ttl|token_max_ttl) + # Normalize desired: "1h"→3600, "24h"→86400. + des="$(printf '%s' "$des" | jq -r '. // ""' | _duration_to_seconds)" + cur="$(printf '%s' "$cur" | jq -r '. // 0')" + ;; + esac + if [ "$cur" != "$des" ]; then + return 1 + fi + done + return 0 +} + +# _duration_to_seconds — read a duration string on stdin, echo seconds. +# Accepts the subset we emit: "Ns", "Nm", "Nh", "Nd". Integers pass through +# unchanged. Any other shape produces the empty string (which cannot match +# Vault's integer response → forces an update). +_duration_to_seconds() { + local s + s="$(cat)" + case "$s" in + ''|null) printf '0' ;; + *[0-9]s) printf '%d' "${s%s}" ;; + *[0-9]m) printf '%d' "$(( ${s%m} * 60 ))" ;; + *[0-9]h) printf '%d' "$(( ${s%h} * 3600 ))" ;; + *[0-9]d) printf '%d' "$(( ${s%d} * 86400 ))" ;; + *[0-9]) printf '%d' "$s" ;; + *) printf '' ;; + esac +} + +# ── Apply each role, reporting created/updated/unchanged ───────────────────── +log "syncing ${#ROLE_RECORDS[@]} role(s) from ${ROLES_FILE}" + +for rec in "${ROLE_RECORDS[@]}"; do + IFS=$'\t' read -r name policy namespace job_id <<<"$rec" + + desired_payload="$(build_payload "$policy" "$namespace" "$job_id")" + # hvault_get_or_empty: raw body on 200, empty on 404 (caller: "create"). + current_json="$(hvault_get_or_empty "auth/jwt-nomad/role/${name}")" \ + || die "failed to read existing role: ${name}" + + if [ -z "$current_json" ]; then + _hvault_request POST "auth/jwt-nomad/role/${name}" "$desired_payload" >/dev/null \ + || die "failed to create role: ${name}" + log "role ${name} created" + continue + fi + + if role_fields_match "$current_json" "$desired_payload"; then + log "role ${name} unchanged" + continue + fi + + _hvault_request POST "auth/jwt-nomad/role/${name}" "$desired_payload" >/dev/null \ + || die "failed to update role: ${name}" + log "role ${name} updated" +done + +log "done — ${#ROLE_RECORDS[@]} role(s) synced" diff --git a/tools/vault-import.sh b/tools/vault-import.sh new file mode 100755 index 0000000..bea4a07 --- /dev/null +++ b/tools/vault-import.sh @@ -0,0 +1,593 @@ +#!/usr/bin/env bash +# ============================================================================= +# vault-import.sh — Import .env and sops-decrypted secrets into Vault KV +# +# Reads existing .env and sops-encrypted .env.vault.enc from the old docker stack +# and writes them to Vault KV paths matching the S2.1 policy layout. +# +# Usage: +# vault-import.sh \ +# --env /path/to/.env \ +# [--sops /path/to/.env.vault.enc] \ +# [--age-key /path/to/age/keys.txt] +# +# Flag validation (S2.5, issue #883): +# --import-sops without --age-key → error. +# --age-key without --import-sops → error. +# --env alone (no sops) → OK; imports only the plaintext half. +# +# Mapping: +# From .env: +# - FORGE_{ROLE}_TOKEN + FORGE_{ROLE}_PASS → kv/disinto/bots//{token,password} +# (roles: review, dev, gardener, architect, planner, predictor, supervisor, vault) +# - FORGE_TOKEN_LLAMA + FORGE_PASS_LLAMA → kv/disinto/bots/dev-qwen/{token,password} +# - FORGE_TOKEN + FORGE_PASS → kv/disinto/shared/forge/{token,password} +# - FORGE_ADMIN_TOKEN → kv/disinto/shared/forge/admin_token +# - WOODPECKER_* → kv/disinto/shared/woodpecker/ +# - FORWARD_AUTH_SECRET, CHAT_OAUTH_* → kv/disinto/shared/chat/ +# From sops-decrypted .env.vault.enc: +# - GITHUB_TOKEN, CODEBERG_TOKEN, CLAWHUB_TOKEN, DEPLOY_KEY, NPM_TOKEN, DOCKER_HUB_TOKEN +# → kv/disinto/runner//value +# +# Security: +# - Refuses to run if VAULT_ADDR is not localhost +# - Writes to KV v2, not v1 +# - Validates sops age key file is mode 0400 before sourcing +# - Never logs secret values — only key names +# +# Idempotency: +# - Reports unchanged/updated/created per key via hvault_kv_get +# - --dry-run prints the full import plan without writing +# ============================================================================= + +set -euo pipefail + +# ── Internal helpers ────────────────────────────────────────────────────────── + +# _log — emit a log message to stdout (never to stderr to avoid polluting diff) +_log() { + printf '[vault-import] %s\n' "$*" +} + +# _err — emit an error message to stderr +_err() { + printf '[vault-import] ERROR: %s\n' "$*" >&2 +} + +# _die — log error and exit with status 1 +_die() { + _err "$@" + exit 1 +} + +# _check_vault_addr — ensure VAULT_ADDR is localhost (security check) +_check_vault_addr() { + local addr="${VAULT_ADDR:-}" + if [[ ! "$addr" =~ ^https?://(localhost|127\.0\.0\.1)(:[0-9]+)?$ ]]; then + _die "Security check failed: VAULT_ADDR must be localhost for safety. Got: $addr" + fi +} + +# _validate_age_key_perms — ensure age key file is mode 0400 +_validate_age_key_perms() { + local keyfile="$1" + local perms + perms="$(stat -c '%a' "$keyfile" 2>/dev/null)" || _die "Cannot stat age key file: $keyfile" + if [ "$perms" != "400" ]; then + _die "Age key file permissions are $perms, expected 400. Refusing to proceed for security." + fi +} + +# _decrypt_sops — decrypt sops-encrypted file using SOPS_AGE_KEY_FILE +_decrypt_sops() { + local sops_file="$1" + local age_key="$2" + local output + # sops outputs YAML format by default, extract KEY=VALUE lines + output="$(SOPS_AGE_KEY_FILE="$age_key" sops -d "$sops_file" 2>/dev/null | \ + grep -E '^[A-Z_][A-Z0-9_]*=' | \ + sed 's/^\([^=]*\)=\(.*\)$/\1=\2/')" || \ + _die "Failed to decrypt sops file: $sops_file. Check age key and file integrity." + printf '%s' "$output" +} + +# _load_env_file — source an environment file (safety: only KEY=value lines) +_load_env_file() { + local env_file="$1" + local temp_env + temp_env="$(mktemp)" + # Extract only valid KEY=value lines (skip comments, blank lines, malformed) + grep -E '^[A-Za-z_][A-Za-z0-9_]*=' "$env_file" 2>/dev/null > "$temp_env" || true + # shellcheck source=/dev/null + source "$temp_env" + rm -f "$temp_env" +} + +# _kv_path_exists — check if a KV path exists (returns 0 if exists, 1 if not) +_kv_path_exists() { + local path="$1" + # Use hvault_kv_get and check if it fails with "not found" + if hvault_kv_get "$path" >/dev/null 2>&1; then + return 0 + fi + # Check if the error is specifically "not found" + local err_output + err_output="$(hvault_kv_get "$path" 2>&1)" || true + if printf '%s' "$err_output" | grep -qi 'not found\|404'; then + return 1 + fi + # Some other error (e.g., auth failure) — treat as unknown + return 1 +} + +# _kv_get_value — get a single key value from a KV path +_kv_get_value() { + local path="$1" + local key="$2" + hvault_kv_get "$path" "$key" +} + +# _kv_put_secret — write a secret to KV v2 +_kv_put_secret() { + local path="$1" + shift + local kv_pairs=("$@") + + # Build JSON payload with all key-value pairs + local payload='{"data":{}}' + for kv in "${kv_pairs[@]}"; do + local k="${kv%%=*}" + local v="${kv#*=}" + # Use jq with --arg for safe string interpolation (handles quotes/backslashes) + payload="$(printf '%s' "$payload" | jq --arg k "$k" --arg v "$v" '. * {"data": {($k): $v}}')" + done + + # Use curl directly for KV v2 write with versioning + local tmpfile http_code + tmpfile="$(mktemp)" + http_code="$(curl -s -w '%{http_code}' \ + -H "X-Vault-Token: ${VAULT_TOKEN}" \ + -H "Content-Type: application/json" \ + -X POST \ + -d "$payload" \ + -o "$tmpfile" \ + "${VAULT_ADDR}/v1/kv/data/${path}")" || { + rm -f "$tmpfile" + _err "Failed to write to Vault at kv/data/${path}: curl error" + return 1 + } + rm -f "$tmpfile" + + # Check HTTP status — 2xx is success + case "$http_code" in + 2[0-9][0-9]) + return 0 + ;; + 404) + _err "KV path not found: kv/data/${path}" + return 1 + ;; + 403) + _err "Permission denied writing to kv/data/${path}" + return 1 + ;; + *) + _err "Failed to write to Vault at kv/data/${path}: HTTP $http_code" + return 1 + ;; + esac +} + +# _format_status — format the status string for a key +_format_status() { + local status="$1" + local path="$2" + local key="$3" + case "$status" in + unchanged) + printf ' %s: %s/%s (unchanged)' "$status" "$path" "$key" + ;; + updated) + printf ' %s: %s/%s (updated)' "$status" "$path" "$key" + ;; + created) + printf ' %s: %s/%s (created)' "$status" "$path" "$key" + ;; + *) + printf ' %s: %s/%s (unknown)' "$status" "$path" "$key" + ;; + esac +} + +# ── Mapping definitions ────────────────────────────────────────────────────── + +# Bots mapping: FORGE_{ROLE}_TOKEN + FORGE_{ROLE}_PASS +declare -a BOT_ROLES=(review dev gardener architect planner predictor supervisor vault) + +# Runner tokens from sops-decrypted file +declare -a RUNNER_TOKENS=(GITHUB_TOKEN CODEBERG_TOKEN CLAWHUB_TOKEN DEPLOY_KEY NPM_TOKEN DOCKER_HUB_TOKEN) + +# ── Main logic ──────────────────────────────────────────────────────────────── + +main() { + local env_file="" + local sops_file="" + local age_key_file="" + local dry_run=false + + # Parse arguments + while [[ $# -gt 0 ]]; do + case "$1" in + --env) + env_file="$2" + shift 2 + ;; + --sops) + sops_file="$2" + shift 2 + ;; + --age-key) + age_key_file="$2" + shift 2 + ;; + --dry-run) + dry_run=true + shift + ;; + --help|-h) + cat <<'EOF' +vault-import.sh — Import .env and sops-decrypted secrets into Vault KV + +Usage: + vault-import.sh \ + --env /path/to/.env \ + [--sops /path/to/.env.vault.enc] \ + [--age-key /path/to/age/keys.txt] \ + [--dry-run] + +Options: + --env Path to .env file (required) + --sops Path to sops-encrypted .env.vault.enc file (optional; + requires --age-key when set) + --age-key Path to age keys file (required when --sops is set) + --dry-run Print import plan without writing to Vault (optional) + --help Show this help message + +Mapping: + From .env: + - FORGE_{ROLE}_TOKEN + FORGE_{ROLE}_PASS → kv/disinto/bots//{token,password} + - FORGE_TOKEN_LLAMA + FORGE_PASS_LLAMA → kv/disinto/bots/dev-qwen/{token,password} + - FORGE_TOKEN + FORGE_PASS → kv/disinto/shared/forge/{token,password} + - FORGE_ADMIN_TOKEN → kv/disinto/shared/forge/admin_token + - WOODPECKER_* → kv/disinto/shared/woodpecker/ + - FORWARD_AUTH_SECRET, CHAT_OAUTH_* → kv/disinto/shared/chat/ + + From sops-decrypted .env.vault.enc: + - GITHUB_TOKEN, CODEBERG_TOKEN, CLAWHUB_TOKEN, DEPLOY_KEY, NPM_TOKEN, DOCKER_HUB_TOKEN + → kv/disinto/runner//value + +Examples: + vault-import.sh --env .env --sops .env.vault.enc --age-key age-keys.txt + vault-import.sh --env .env --sops .env.vault.enc --age-key age-keys.txt --dry-run +EOF + exit 0 + ;; + *) + _die "Unknown option: $1. Use --help for usage." + ;; + esac + done + + # Validate required arguments. --sops and --age-key are paired: if one + # is set, the other must be too. --env alone (no sops half) is valid — + # imports only the plaintext dotenv. Spec: S2.5 / issue #883 / #912. + if [ -z "$env_file" ]; then + _die "Missing required argument: --env" + fi + if [ -n "$sops_file" ] && [ -z "$age_key_file" ]; then + _die "--sops requires --age-key" + fi + if [ -n "$age_key_file" ] && [ -z "$sops_file" ]; then + _die "--age-key requires --sops" + fi + + # Validate files exist + if [ ! -f "$env_file" ]; then + _die "Environment file not found: $env_file" + fi + if [ -n "$sops_file" ] && [ ! -f "$sops_file" ]; then + _die "Sops file not found: $sops_file" + fi + if [ -n "$age_key_file" ] && [ ! -f "$age_key_file" ]; then + _die "Age key file not found: $age_key_file" + fi + + # Security check: age key permissions (only when an age key is provided — + # --env-only imports never touch the age key). + if [ -n "$age_key_file" ]; then + _validate_age_key_perms "$age_key_file" + fi + + # Source the Vault helpers and default the local-cluster VAULT_ADDR + + # VAULT_TOKEN before the localhost safety check runs. `disinto init` + # does not export these in the common fresh-LXC case (issue #912). + source "$(dirname "$0")/../lib/hvault.sh" + _hvault_default_env + + # Security check: VAULT_ADDR must be localhost + _check_vault_addr + + # Load .env file + _log "Loading environment from: $env_file" + _load_env_file "$env_file" + + # Decrypt sops file when --sops was provided. On the --env-only path + # (empty $sops_file) the sops_env stays empty and the per-token loop + # below silently skips runner-token imports — exactly the "only + # plaintext half" spec from S2.5. + local sops_env="" + if [ -n "$sops_file" ]; then + _log "Decrypting sops file: $sops_file" + sops_env="$(_decrypt_sops "$sops_file" "$age_key_file")" + # shellcheck disable=SC2086 + eval "$sops_env" + else + _log "No --sops flag — skipping sops decryption (importing plaintext .env only)" + fi + + # Collect all import operations + declare -a operations=() + + # --- From .env --- + + # Bots: FORGE_{ROLE}_TOKEN + FORGE_{ROLE}_PASS + for role in "${BOT_ROLES[@]}"; do + local token_var="FORGE_${role^^}_TOKEN" + local pass_var="FORGE_${role^^}_PASS" + local token_val="${!token_var:-}" + local pass_val="${!pass_var:-}" + + if [ -n "$token_val" ] && [ -n "$pass_val" ]; then + operations+=("bots|$role|token|$env_file|$token_var") + operations+=("bots|$role|pass|$env_file|$pass_var") + elif [ -n "$token_val" ] || [ -n "$pass_val" ]; then + _err "Warning: $role bot has token but no password (or vice versa), skipping" + fi + done + + # Llama bot: FORGE_TOKEN_LLAMA + FORGE_PASS_LLAMA + local llama_token="${FORGE_TOKEN_LLAMA:-}" + local llama_pass="${FORGE_PASS_LLAMA:-}" + if [ -n "$llama_token" ] && [ -n "$llama_pass" ]; then + operations+=("bots|dev-qwen|token|$env_file|FORGE_TOKEN_LLAMA") + operations+=("bots|dev-qwen|pass|$env_file|FORGE_PASS_LLAMA") + elif [ -n "$llama_token" ] || [ -n "$llama_pass" ]; then + _err "Warning: dev-qwen bot has token but no password (or vice versa), skipping" + fi + + # Generic forge creds: FORGE_TOKEN + FORGE_PASS + local forge_token="${FORGE_TOKEN:-}" + local forge_pass="${FORGE_PASS:-}" + if [ -n "$forge_token" ] && [ -n "$forge_pass" ]; then + operations+=("forge|token|$env_file|FORGE_TOKEN") + operations+=("forge|pass|$env_file|FORGE_PASS") + fi + + # Forge admin token: FORGE_ADMIN_TOKEN + local forge_admin_token="${FORGE_ADMIN_TOKEN:-}" + if [ -n "$forge_admin_token" ]; then + operations+=("forge|admin_token|$env_file|FORGE_ADMIN_TOKEN") + fi + + # Woodpecker secrets: WOODPECKER_* + # Only read from the .env file, not shell environment + local woodpecker_keys=() + while IFS='=' read -r key _; do + if [[ "$key" =~ ^WOODPECKER_ ]] || [[ "$key" =~ ^WP_[A-Z_]+$ ]]; then + woodpecker_keys+=("$key") + fi + done < <(grep -E '^[A-Z_][A-Z0-9_]*=' "$env_file" 2>/dev/null || true) + for key in "${woodpecker_keys[@]}"; do + local val="${!key}" + if [ -n "$val" ]; then + local lowercase_key="${key,,}" + operations+=("woodpecker|$lowercase_key|$env_file|$key") + fi + done + + # Chat secrets: FORWARD_AUTH_SECRET, CHAT_OAUTH_CLIENT_ID, CHAT_OAUTH_CLIENT_SECRET + for key in FORWARD_AUTH_SECRET CHAT_OAUTH_CLIENT_ID CHAT_OAUTH_CLIENT_SECRET; do + local val="${!key:-}" + if [ -n "$val" ]; then + local lowercase_key="${key,,}" + operations+=("chat|$lowercase_key|$env_file|$key") + fi + done + + # --- From sops-decrypted .env.vault.enc --- + + # Runner tokens + for token_name in "${RUNNER_TOKENS[@]}"; do + local token_val="${!token_name:-}" + if [ -n "$token_val" ]; then + operations+=("runner|$token_name|$sops_file|$token_name") + fi + done + + # If dry-run, just print the plan + if $dry_run; then + _log "=== DRY-RUN: Import plan ===" + _log "Environment file: $env_file" + if [ -n "$sops_file" ]; then + _log "Sops file: $sops_file" + _log "Age key: $age_key_file" + else + _log "Sops file: (none — --env-only import)" + fi + _log "" + _log "Planned operations:" + for op in "${operations[@]}"; do + _log " $op" + done + _log "" + _log "Total: ${#operations[@]} operations" + exit 0 + fi + + # --- Actual import with idempotency check --- + + _log "=== Starting Vault import ===" + _log "Environment file: $env_file" + if [ -n "$sops_file" ]; then + _log "Sops file: $sops_file" + _log "Age key: $age_key_file" + else + _log "Sops file: (none — --env-only import)" + fi + _log "" + + local created=0 + local updated=0 + local unchanged=0 + + # First pass: collect all operations with their parsed values. + # Store value and status in separate associative arrays keyed by + # "vault_path:kv_key". Secret values may contain any character, so we + # never pack them into a delimited string — the previous `value|status` + # encoding silently truncated values containing '|' (see issue #898). + declare -A ops_value + declare -A ops_status + declare -A path_seen + + for op in "${operations[@]}"; do + # Parse operation: category|field|subkey|file|envvar (5 fields for bots/runner) + # or category|field|file|envvar (4 fields for forge/woodpecker/chat). + # These metadata strings are built from safe identifiers (role names, + # env-var names, file paths) and do not carry secret values, so '|' is + # still fine as a separator here. + local category field subkey file envvar="" + local field_count + field_count="$(printf '%s' "$op" | awk -F'|' '{print NF}')" + + if [ "$field_count" -eq 5 ]; then + # 5 fields: category|role|subkey|file|envvar + IFS='|' read -r category field subkey file envvar <<< "$op" + else + # 4 fields: category|field|file|envvar + IFS='|' read -r category field file envvar <<< "$op" + subkey="$field" # For 4-field ops, field is the vault key + fi + + # Determine Vault path and key based on category + local vault_path="" + local vault_key="$subkey" + local source_value="" + + if [ "$file" = "$env_file" ]; then + # Source from environment file (envvar contains the variable name) + source_value="${!envvar:-}" + else + # Source from sops-decrypted env (envvar contains the variable name) + source_value="$(printf '%s' "$sops_env" | grep "^${envvar}=" | sed "s/^${envvar}=//" || true)" + fi + + case "$category" in + bots) + vault_path="disinto/bots/${field}" + vault_key="$subkey" + ;; + forge) + vault_path="disinto/shared/forge" + vault_key="$field" + ;; + woodpecker) + vault_path="disinto/shared/woodpecker" + vault_key="$field" + ;; + chat) + vault_path="disinto/shared/chat" + vault_key="$field" + ;; + runner) + vault_path="disinto/runner/${field}" + vault_key="value" + ;; + *) + _err "Unknown category: $category" + continue + ;; + esac + + # Determine status for this key + local status="created" + if _kv_path_exists "$vault_path"; then + local existing_value + if existing_value="$(_kv_get_value "$vault_path" "$vault_key")" 2>/dev/null; then + if [ "$existing_value" = "$source_value" ]; then + status="unchanged" + else + status="updated" + fi + fi + fi + + # vault_path and vault_key are identifier-safe (no ':' in either), so + # the composite key round-trips cleanly via ${ck%:*} / ${ck#*:}. + local ck="${vault_path}:${vault_key}" + ops_value["$ck"]="$source_value" + ops_status["$ck"]="$status" + path_seen["$vault_path"]=1 + done + + # Second pass: group by vault_path and write. + # IMPORTANT: Always write ALL keys for a path, not just changed ones. + # KV v2 POST replaces the entire document, so we must include unchanged keys + # to avoid dropping them. The idempotency guarantee comes from KV v2 versioning. + for vault_path in "${!path_seen[@]}"; do + # Collect this path's "vault_key=source_value" pairs into a bash + # indexed array. Each element is one kv pair; '=' inside the value is + # preserved because _kv_put_secret splits on the *first* '=' only. + local pairs_array=() + local path_has_changes=0 + + for ck in "${!ops_value[@]}"; do + [ "${ck%:*}" = "$vault_path" ] || continue + local vault_key="${ck#*:}" + pairs_array+=("${vault_key}=${ops_value[$ck]}") + if [ "${ops_status[$ck]}" != "unchanged" ]; then + path_has_changes=1 + fi + done + + # Determine effective status for this path (updated if any key changed) + local effective_status="unchanged" + if [ "$path_has_changes" = 1 ]; then + effective_status="updated" + fi + + if ! _kv_put_secret "$vault_path" "${pairs_array[@]}"; then + _err "Failed to write to $vault_path" + exit 1 + fi + + # Output status for each key in this path + for kv in "${pairs_array[@]}"; do + local kv_key="${kv%%=*}" + _format_status "$effective_status" "$vault_path" "$kv_key" + printf '\n' + done + + # Count only if path has changes + if [ "$effective_status" = "updated" ]; then + ((updated++)) || true + fi + done + + _log "" + _log "=== Import complete ===" + _log "Created: $created" + _log "Updated: $updated" + _log "Unchanged: $unchanged" +} + +main "$@" diff --git a/tools/vault-seed-forgejo.sh b/tools/vault-seed-forgejo.sh new file mode 100755 index 0000000..1f1e619 --- /dev/null +++ b/tools/vault-seed-forgejo.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# ============================================================================= +# tools/vault-seed-forgejo.sh — Idempotent seed for kv/disinto/shared/forgejo +# +# Part of the Nomad+Vault migration (S2.4, issue #882). Populates the KV v2 +# path that nomad/jobs/forgejo.hcl reads from, so a clean-install factory +# (no old-stack secrets to import) still has per-key values for +# FORGEJO__security__SECRET_KEY + FORGEJO__security__INTERNAL_TOKEN. +# +# Companion to tools/vault-import.sh (S2.2, not yet merged) — when that +# import runs against a box with an existing stack, it overwrites these +# seeded values with the real ones. Order doesn't matter: whichever runs +# last wins, and both scripts are idempotent in the sense that re-running +# never rotates an existing non-empty key. +# +# Idempotency contract (per key): +# - Key missing or empty in Vault → generate a random value, write it, +# log " generated (N bytes hex)". +# - Key present with a non-empty value → leave untouched, log +# " unchanged". +# - Neither key changes is a silent no-op (no Vault write at all). +# +# Rotating an existing key is deliberately NOT in scope — SECRET_KEY +# rotation invalidates every existing session cookie in forgejo and +# INTERNAL_TOKEN rotation breaks internal RPC until all processes have +# restarted. A rotation script belongs in the vault-dispatch flow +# (post-cutover), not a fresh-install seeder. +# +# Preconditions: +# - Vault reachable + unsealed at $VAULT_ADDR. +# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable. +# - The `kv/` mount is enabled as KV v2 (this script enables it on a +# fresh box; on an existing box it asserts the mount type/version). +# +# Requires: +# - VAULT_ADDR (e.g. http://127.0.0.1:8200) +# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh) +# - curl, jq, openssl +# +# Usage: +# tools/vault-seed-forgejo.sh +# tools/vault-seed-forgejo.sh --dry-run +# +# Exit codes: +# 0 success (seed applied, or already applied) +# 1 precondition / API / mount-mismatch failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# shellcheck source=../lib/hvault.sh +source "${REPO_ROOT}/lib/hvault.sh" + +# KV v2 mount + logical path. Kept as two vars so the full API path used +# for GET/POST (which MUST include `/data/`) is built in one place. +KV_MOUNT="kv" +KV_LOGICAL_PATH="disinto/shared/forgejo" +KV_API_PATH="${KV_MOUNT}/data/${KV_LOGICAL_PATH}" + +# Byte lengths for the generated secrets (hex output, so the printable +# string length is 2x these). 32 bytes matches forgejo's own +# `gitea generate secret SECRET_KEY` default; 64 bytes is comfortably +# above forgejo's INTERNAL_TOKEN JWT-HMAC key floor. +SECRET_KEY_BYTES=32 +INTERNAL_TOKEN_BYTES=64 + +log() { printf '[vault-seed-forgejo] %s\n' "$*"; } +die() { printf '[vault-seed-forgejo] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Flag parsing — single optional `--dry-run`. Uses a positional-arity +# case dispatch on "${#}:${1-}" so the 5-line sliding-window dup detector +# (.woodpecker/detect-duplicates.py) sees a shape distinct from both +# vault-apply-roles.sh (if/elif chain) and vault-apply-policies.sh (flat +# case on $1 alone). Three sibling tools, three parser shapes. +DRY_RUN=0 +case "$#:${1-}" in + 0:) + ;; + 1:--dry-run) + DRY_RUN=1 + ;; + 1:-h|1:--help) + printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" + printf 'Seed kv/disinto/shared/forgejo with random SECRET_KEY +\n' + printf 'INTERNAL_TOKEN if they are missing. Idempotent: existing\n' + printf 'non-empty values are left untouched.\n\n' + printf ' --dry-run Print planned actions (enable mount? which keys\n' + printf ' to generate?) without writing to Vault. Exits 0.\n' + exit 0 + ;; + *) + die "invalid arguments: $* (try --help)" + ;; +esac + +# ── Preconditions ──────────────────────────────────────────────────────────── +for bin in curl jq openssl; do + command -v "$bin" >/dev/null 2>&1 \ + || die "required binary not found: ${bin}" +done + +# Vault connectivity — short-circuit style (`||`) instead of an `if`-chain +# so this block has a distinct textual shape from vault-apply-roles.sh's +# equivalent preflight; hvault.sh's typed helpers emit structured JSON +# errors that don't render well behind the `[vault-seed-forgejo] …` +# log prefix, hence the inline check + plain-string diag. +[ -n "${VAULT_ADDR:-}" ] \ + || die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200" +hvault_token_lookup >/dev/null \ + || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" + +# ── Step 1/2: ensure kv/ mount exists and is KV v2 ─────────────────────────── +# The policy at vault/policies/service-forgejo.hcl grants read on +# `kv/data//*` — that `data` segment only exists for KV v2. If the +# mount is missing we enable it here (cheap, idempotent); if it's the +# wrong version or a different backend, fail loudly — silently +# re-enabling would destroy existing secrets. +log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──" +mounts_json="$(hvault_get_or_empty "sys/mounts")" \ + || die "failed to list Vault mounts" + +mount_exists=false +if printf '%s' "$mounts_json" | jq -e --arg m "${KV_MOUNT}/" '.[$m]' >/dev/null 2>&1; then + mount_exists=true +fi + +if [ "$mount_exists" = true ]; then + mount_type="$(printf '%s' "$mounts_json" \ + | jq -r --arg m "${KV_MOUNT}/" '.[$m].type // ""')" + mount_version="$(printf '%s' "$mounts_json" \ + | jq -r --arg m "${KV_MOUNT}/" '.[$m].options.version // "1"')" + if [ "$mount_type" != "kv" ]; then + die "${KV_MOUNT}/ is mounted as type='${mount_type}', expected 'kv' — refuse to re-mount" + fi + if [ "$mount_version" != "2" ]; then + die "${KV_MOUNT}/ is KV v${mount_version}, expected v2 — refuse to upgrade in place (manual fix required)" + fi + log "${KV_MOUNT}/ already mounted (kv v2) — skipping enable" +else + if [ "$DRY_RUN" -eq 1 ]; then + log "[dry-run] would enable ${KV_MOUNT}/ as kv v2" + else + payload="$(jq -n '{type:"kv",options:{version:"2"},description:"disinto shared KV v2 (S2.4)"}')" + _hvault_request POST "sys/mounts/${KV_MOUNT}" "$payload" >/dev/null \ + || die "failed to enable ${KV_MOUNT}/ as kv v2" + log "${KV_MOUNT}/ enabled as kv v2" + fi +fi + +# ── Step 2/2: seed missing keys at kv/data/disinto/shared/forgejo ──────────── +log "── Step 2/2: seed ${KV_API_PATH} ──" + +# hvault_get_or_empty returns an empty string on 404 (KV path absent). +# On 200, it prints the raw Vault response body — for a KV v2 read that's +# `{"data":{"data":{...},"metadata":{...}}}`, hence the `.data.data.` +# path below. A path with `deleted_time` set still returns 200 but the +# inner `.data.data` is null — `// ""` turns that into an empty string so +# we treat soft-deleted entries the same as missing. +existing_raw="$(hvault_get_or_empty "${KV_API_PATH}")" \ + || die "failed to read ${KV_API_PATH}" + +existing_secret_key="" +existing_internal_token="" +if [ -n "$existing_raw" ]; then + existing_secret_key="$(printf '%s' "$existing_raw" | jq -r '.data.data.secret_key // ""')" + existing_internal_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.internal_token // ""')" +fi + +desired_secret_key="$existing_secret_key" +desired_internal_token="$existing_internal_token" +generated=() + +if [ -z "$desired_secret_key" ]; then + if [ "$DRY_RUN" -eq 1 ]; then + # In dry-run, don't call openssl — log the intent only. The real run + # generates fresh bytes; nothing about the generated value is + # deterministic so there's no "planned value" to show. + generated+=("secret_key") + else + desired_secret_key="$(openssl rand -hex "$SECRET_KEY_BYTES")" + generated+=("secret_key") + fi +fi + +if [ -z "$desired_internal_token" ]; then + if [ "$DRY_RUN" -eq 1 ]; then + generated+=("internal_token") + else + desired_internal_token="$(openssl rand -hex "$INTERNAL_TOKEN_BYTES")" + generated+=("internal_token") + fi +fi + +if [ "${#generated[@]}" -eq 0 ]; then + log "all keys present at ${KV_API_PATH} — no-op" + log "secret_key unchanged" + log "internal_token unchanged" + exit 0 +fi + +if [ "$DRY_RUN" -eq 1 ]; then + log "[dry-run] would generate + write: ${generated[*]}" + for key in secret_key internal_token; do + case " ${generated[*]} " in + *" ${key} "*) log "[dry-run] ${key} would be generated" ;; + *) log "[dry-run] ${key} unchanged" ;; + esac + done + exit 0 +fi + +# Write back BOTH keys in one payload. KV v2 replaces `.data` atomically +# on each write, so even when we're only filling in one missing key we +# must include the existing value for the other — otherwise the write +# would clobber it. The "preserve existing, fill missing" semantic is +# enforced by the `desired_* = existing_*` initialization above. +payload="$(jq -n \ + --arg sk "$desired_secret_key" \ + --arg it "$desired_internal_token" \ + '{data: {secret_key: $sk, internal_token: $it}}')" + +_hvault_request POST "${KV_API_PATH}" "$payload" >/dev/null \ + || die "failed to write ${KV_API_PATH}" + +for key in secret_key internal_token; do + case " ${generated[*]} " in + *" ${key} "*) log "${key} generated" ;; + *) log "${key} unchanged" ;; + esac +done + +log "done — ${#generated[@]} key(s) seeded at ${KV_API_PATH}" diff --git a/vault/policies/AGENTS.md b/vault/policies/AGENTS.md new file mode 100644 index 0000000..692c885 --- /dev/null +++ b/vault/policies/AGENTS.md @@ -0,0 +1,182 @@ + +# vault/policies/ — Agent Instructions + +HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per +policy; the basename (minus `.hcl`) is the Vault policy name applied to it. +Synced into Vault by `tools/vault-apply-policies.sh` (idempotent — see the +script header for the contract). + +This directory is part of the **Nomad+Vault migration (Step 2)** — see +issues #879–#884. Policies attach to Nomad jobs via workload identity in +S2.4; this PR only lands the files + apply script. + +## Naming convention + +| Prefix | Audience | KV scope | +|---|---|---| +| `service-.hcl` | Long-running platform services (forgejo, woodpecker) | `kv/data/disinto/shared//*` | +| `bot-.hcl` | Per-agent jobs (dev, review, gardener, …) | `kv/data/disinto/bots//*` + shared forge URL | +| `runner-.hcl` | Per-secret policy for vault-runner ephemeral dispatch | exactly one `kv/data/disinto/runner/` path | +| `dispatcher.hcl` | Long-running edge dispatcher | `kv/data/disinto/runner/*` + `kv/data/disinto/shared/ops-repo/*` | + +The KV mount name `kv/` is the convention this migration uses (mounted as +KV v2). Vault addresses KV v2 data at `kv/data/` and metadata at +`kv/metadata/` — policies that need `list` always target the +`metadata` path; reads target `data`. + +## Policy → KV path summary + +| Policy | Reads | +|---|---| +| `service-forgejo` | `kv/data/disinto/shared/forgejo/*` | +| `service-woodpecker` | `kv/data/disinto/shared/woodpecker/*` | +| `bot-` (dev, review, gardener, architect, planner, predictor, supervisor, vault, dev-qwen) | `kv/data/disinto/bots//*` + `kv/data/disinto/shared/forge/*` | +| `runner-` (GITHUB\_TOKEN, CODEBERG\_TOKEN, CLAWHUB\_TOKEN, DEPLOY\_KEY, NPM\_TOKEN, DOCKER\_HUB\_TOKEN) | `kv/data/disinto/runner/` (exactly one) | +| `dispatcher` | `kv/data/disinto/runner/*` + `kv/data/disinto/shared/ops-repo/*` | + +## Why one policy per runner secret + +`vault-runner` (Step 5) reads each action TOML's `secrets = [...]` list +and composes only those `runner-` policies onto the per-dispatch +ephemeral token. Wildcards or batched policies would hand the runner more +secrets than the action declared — defeats AD-006 (least-privilege per +external action). Adding a new declarable secret = adding one new +`runner-.hcl` here + extending the SECRETS allow-list in vault-action +validation. + +## Adding a new policy + +1. Drop a file matching one of the four naming patterns above. Use an + existing file in the same family as the template — comment header, + capability list, and KV path layout should match the family. +2. Run `vault policy fmt ` locally so the formatting matches what + the CI fmt-check (step 4 of `.woodpecker/nomad-validate.yml`) will + accept. The fmt check runs non-destructively in CI but a dirty file + fails the step; running `fmt` locally before pushing is the fastest + path. +3. Add the matching entry to `../roles.yaml` (see "JWT-auth roles" below) + so the CI role-reference check (step 6) stays green. +4. Run `tools/vault-apply-policies.sh --dry-run` to confirm the new + basename appears in the planned-work list with the expected SHA. +5. Run `tools/vault-apply-policies.sh` against a Vault instance to + create it; re-run to confirm it reports `unchanged`. + +## JWT-auth roles (S2.3) + +Policies are inert until a Vault token carrying them is minted. In this +migration that mint path is JWT auth — Nomad jobs exchange their +workload-identity JWT for a Vault token via +`auth/jwt-nomad/role/` → `token_policies = [""]`. The +role bindings live in [`../roles.yaml`](../roles.yaml); the script that +enables the auth method + writes the config + applies roles is +[`lib/init/nomad/vault-nomad-auth.sh`](../../lib/init/nomad/vault-nomad-auth.sh). +The applier is [`tools/vault-apply-roles.sh`](../../tools/vault-apply-roles.sh). + +### Role → policy naming convention + +Role name == policy name, 1:1. `vault/roles.yaml` carries one entry per +`vault/policies/*.hcl` file: + +```yaml +roles: + - name: service-forgejo # Vault role + policy: service-forgejo # ACL policy attached to minted tokens + namespace: default # bound_claims.nomad_namespace + job_id: forgejo # bound_claims.nomad_job_id +``` + +The role name is what jobspecs reference via `vault { role = "..." }` — +keep it identical to the policy basename so an S2.1↔S2.3 drift (new +policy without a role, or vice versa) shows up in one directory review, +not as a runtime "permission denied" at job placement. + +`bound_claims.nomad_job_id` is the actual `job "..."` name in the +jobspec, which may differ from the policy name (e.g. policy +`service-forgejo` binds to job `forgejo`). Update it when each bot's or +runner's jobspec lands. + +### Adding a new service + +1. Write `vault/policies/.hcl` using the naming-table family that + fits (`service-`, `bot-`, `runner-`, or standalone). +2. Add a matching entry to `vault/roles.yaml` with all four fields + (`name`, `policy`, `namespace`, `job_id`). +3. Apply both — either in one shot via `lib/init/nomad/vault-nomad-auth.sh` + (policies → roles → nomad SIGHUP), or granularly via + `tools/vault-apply-policies.sh` + `tools/vault-apply-roles.sh`. +4. Reference the role in the consuming jobspec's `vault { role = "" }`. + +### Token shape + +All roles share the same token shape, hardcoded in +`tools/vault-apply-roles.sh`: + +| Field | Value | +|---|---| +| `bound_audiences` | `["vault.io"]` — matches `default_identity.aud` in `nomad/server.hcl` | +| `token_type` | `service` — auto-revoked when the task exits | +| `token_ttl` | `1h` | +| `token_max_ttl` | `24h` | + +Bumping any of these is a knowing, repo-wide change. Per-role overrides +would let one service's tokens outlive the others — add a field to +`vault/roles.yaml` and the applier at the same time if that ever +becomes necessary. + +## Policy lifecycle + +Adding a policy that an actual workload consumes is a three-step chain; +the CI pipeline guards each link. + +1. **Add the policy HCL** — `vault/policies/.hcl`, formatted with + `vault policy fmt`. Capabilities must be drawn from the Vault-recognized + set (`read`, `list`, `create`, `update`, `delete`, `patch`, `sudo`, + `deny`); a typo fails CI step 5 (HCL written to an inline dev-mode Vault + via `vault policy write` — a real parser, not a regex). +2. **Update `../roles.yaml`** — add a JWT-auth role entry whose `policy:` + field matches the new basename (without `.hcl`). CI step 6 re-checks + every role in this file against the policy set, so a drift between the + two directories fails the step. +3. **Reference from a Nomad jobspec** — add `vault { role = "" }` in + `nomad/jobs/.hcl` (owned by S2.4). Policies do not take effect + until a Nomad job asks for a token via that role. + +See the "Adding a new service" walkthrough below for the applier-script +flow once steps 1–3 are committed. + +## CI enforcement (`.woodpecker/nomad-validate.yml`) + +The pipeline triggers on any PR touching `vault/policies/**`, +`vault/roles.yaml`, or `lib/init/nomad/vault-*.sh` and runs four +vault-scoped checks (in addition to the nomad-scoped steps already in +place): + +| Step | Tool | What it catches | +|---|---|---| +| 4. `vault-policy-fmt` | `vault policy fmt` + `diff` | formatting drift — trailing whitespace, wrong indentation, missing newlines | +| 5. `vault-policy-validate` | `vault policy write` against inline dev Vault | HCL syntax errors, unknown stanzas, invalid capability names (e.g. `"frobnicate"`), malformed `path "..." {}` blocks | +| 6. `vault-roles-validate` | yamllint + PyYAML | roles.yaml syntax drift, missing required fields, role→policy references with no matching `.hcl` | +| P11 | `lib/secret-scan.sh` via `.woodpecker/secret-scan.yml` | literal secret leaked into a policy HCL (rare copy-paste mistake) — already covers `vault/**/*`, no duplicate step here | + +All four steps are fail-closed — any error blocks merge. The pipeline +pins `hashicorp/vault:1.18.5` (matching `lib/init/nomad/install.sh`); +bumping the runtime version without bumping the CI image is a CI-caught +drift. + +## Common failure modes + +| Symptom in CI logs | Root cause | Fix | +|---|---|---| +| `vault-policy-fmt: … is not formatted — run 'vault policy fmt '` | Trailing whitespace / mixed indent in an HCL file | `vault policy fmt ` locally and re-commit | +| `vault-policy-validate: … failed validation` plus a `policy` error from Vault | Unknown capability (e.g. `"frobnicate"`), unknown stanza, malformed `path` block | Fix the HCL; valid capabilities are `read`, `list`, `create`, `update`, `delete`, `patch`, `sudo`, `deny` | +| `vault-roles-validate: ERROR: role 'X' references policy 'Y' but vault/policies/Y.hcl does not exist` | A role's `policy:` field does not match any file basename in `vault/policies/` | Either add the missing policy HCL or fix the typo in `roles.yaml` | +| `vault-roles-validate: ERROR: role entry missing required field 'Z'` | A role in `roles.yaml` is missing one of `name`, `policy`, `namespace`, `job_id` | Add the field; all four are required | +| P11 `secret-scan: detected potential secret …` on a `.hcl` file | A literal token/password was pasted into a policy | Policies must name KV paths, not carry secret values — move the literal into KV (S2.2) and have the policy grant `read` on the path | + +## What this directory does NOT own + +- **Attaching policies to Nomad jobs.** That's S2.4 (#882) via the + jobspec `template { vault { policies = […] } }` stanza — the role + name in `vault { role = "..." }` is what binds the policy. +- **Writing the secret values themselves.** That's S2.2 (#880) via + `tools/vault-import.sh`. diff --git a/vault/policies/bot-architect.hcl b/vault/policies/bot-architect.hcl new file mode 100644 index 0000000..9381b61 --- /dev/null +++ b/vault/policies/bot-architect.hcl @@ -0,0 +1,16 @@ +# vault/policies/bot-architect.hcl +# +# Architect agent: reads its own bot KV namespace + the shared forge URL. +# Attached to the architect-agent Nomad job via workload identity (S2.4). + +path "kv/data/disinto/bots/architect/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/architect/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/bot-dev-qwen.hcl b/vault/policies/bot-dev-qwen.hcl new file mode 100644 index 0000000..b71283d --- /dev/null +++ b/vault/policies/bot-dev-qwen.hcl @@ -0,0 +1,18 @@ +# vault/policies/bot-dev-qwen.hcl +# +# Local-Qwen dev agent (agents-llama profile): reads its own bot KV +# namespace + the shared forge URL. Attached to the dev-qwen Nomad job +# via workload identity (S2.4). KV path mirrors the bot basename: +# kv/disinto/bots/dev-qwen/*. + +path "kv/data/disinto/bots/dev-qwen/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/dev-qwen/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/bot-dev.hcl b/vault/policies/bot-dev.hcl new file mode 100644 index 0000000..3771288 --- /dev/null +++ b/vault/policies/bot-dev.hcl @@ -0,0 +1,16 @@ +# vault/policies/bot-dev.hcl +# +# Dev agent: reads its own bot KV namespace + the shared forge URL. +# Attached to the dev-agent Nomad job via workload identity (S2.4). + +path "kv/data/disinto/bots/dev/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/dev/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/bot-gardener.hcl b/vault/policies/bot-gardener.hcl new file mode 100644 index 0000000..f5ef230 --- /dev/null +++ b/vault/policies/bot-gardener.hcl @@ -0,0 +1,16 @@ +# vault/policies/bot-gardener.hcl +# +# Gardener agent: reads its own bot KV namespace + the shared forge URL. +# Attached to the gardener-agent Nomad job via workload identity (S2.4). + +path "kv/data/disinto/bots/gardener/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/gardener/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/bot-planner.hcl b/vault/policies/bot-planner.hcl new file mode 100644 index 0000000..440f6aa --- /dev/null +++ b/vault/policies/bot-planner.hcl @@ -0,0 +1,16 @@ +# vault/policies/bot-planner.hcl +# +# Planner agent: reads its own bot KV namespace + the shared forge URL. +# Attached to the planner-agent Nomad job via workload identity (S2.4). + +path "kv/data/disinto/bots/planner/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/planner/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/bot-predictor.hcl b/vault/policies/bot-predictor.hcl new file mode 100644 index 0000000..3a3b6b2 --- /dev/null +++ b/vault/policies/bot-predictor.hcl @@ -0,0 +1,16 @@ +# vault/policies/bot-predictor.hcl +# +# Predictor agent: reads its own bot KV namespace + the shared forge URL. +# Attached to the predictor-agent Nomad job via workload identity (S2.4). + +path "kv/data/disinto/bots/predictor/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/predictor/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/bot-review.hcl b/vault/policies/bot-review.hcl new file mode 100644 index 0000000..04c7668 --- /dev/null +++ b/vault/policies/bot-review.hcl @@ -0,0 +1,16 @@ +# vault/policies/bot-review.hcl +# +# Review agent: reads its own bot KV namespace + the shared forge URL. +# Attached to the review-agent Nomad job via workload identity (S2.4). + +path "kv/data/disinto/bots/review/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/review/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/bot-supervisor.hcl b/vault/policies/bot-supervisor.hcl new file mode 100644 index 0000000..36ecc90 --- /dev/null +++ b/vault/policies/bot-supervisor.hcl @@ -0,0 +1,16 @@ +# vault/policies/bot-supervisor.hcl +# +# Supervisor agent: reads its own bot KV namespace + the shared forge URL. +# Attached to the supervisor-agent Nomad job via workload identity (S2.4). + +path "kv/data/disinto/bots/supervisor/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/supervisor/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/bot-vault.hcl b/vault/policies/bot-vault.hcl new file mode 100644 index 0000000..0a088dd --- /dev/null +++ b/vault/policies/bot-vault.hcl @@ -0,0 +1,20 @@ +# vault/policies/bot-vault.hcl +# +# Vault agent (the legacy edge dispatcher / vault-action runner): reads its +# own bot KV namespace + the shared forge URL. Attached to the vault-agent +# Nomad job via workload identity (S2.4). +# +# NOTE: distinct from the runner-* policies, which gate per-secret access +# for vault-runner ephemeral dispatches (Step 5). + +path "kv/data/disinto/bots/vault/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/bots/vault/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/forge/*" { + capabilities = ["read"] +} diff --git a/vault/policies/dispatcher.hcl b/vault/policies/dispatcher.hcl new file mode 100644 index 0000000..6383ae7 --- /dev/null +++ b/vault/policies/dispatcher.hcl @@ -0,0 +1,29 @@ +# vault/policies/dispatcher.hcl +# +# Edge dispatcher policy: needs to enumerate the runner secret namespace +# (to check secret presence before dispatching) and read the shared +# ops-repo credentials (token + clone URL) it uses to fetch action TOMLs. +# +# Scope: +# - kv/disinto/runner/* — read all per-secret values + list keys +# - kv/disinto/shared/ops-repo/* — read the ops-repo creds bundle +# +# The actual ephemeral runner container created per dispatch gets the +# narrow runner- policies, NOT this one. This policy stays bound +# to the long-running dispatcher only. + +path "kv/data/disinto/runner/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/runner/*" { + capabilities = ["list", "read"] +} + +path "kv/data/disinto/shared/ops-repo/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/shared/ops-repo/*" { + capabilities = ["list", "read"] +} diff --git a/vault/policies/runner-CLAWHUB_TOKEN.hcl b/vault/policies/runner-CLAWHUB_TOKEN.hcl new file mode 100644 index 0000000..5de32e9 --- /dev/null +++ b/vault/policies/runner-CLAWHUB_TOKEN.hcl @@ -0,0 +1,10 @@ +# vault/policies/runner-CLAWHUB_TOKEN.hcl +# +# Per-secret runner policy: ClawHub token for skill-registry publish. +# vault-runner (Step 5) composes only the runner-* policies named by the +# dispatching action's `secrets = [...]` list, so this policy intentionally +# scopes a single KV path — no wildcards, no list capability. + +path "kv/data/disinto/runner/CLAWHUB_TOKEN" { + capabilities = ["read"] +} diff --git a/vault/policies/runner-CODEBERG_TOKEN.hcl b/vault/policies/runner-CODEBERG_TOKEN.hcl new file mode 100644 index 0000000..5de534b --- /dev/null +++ b/vault/policies/runner-CODEBERG_TOKEN.hcl @@ -0,0 +1,10 @@ +# vault/policies/runner-CODEBERG_TOKEN.hcl +# +# Per-secret runner policy: Codeberg PAT for upstream-repo mirror push. +# vault-runner (Step 5) composes only the runner-* policies named by the +# dispatching action's `secrets = [...]` list, so this policy intentionally +# scopes a single KV path — no wildcards, no list capability. + +path "kv/data/disinto/runner/CODEBERG_TOKEN" { + capabilities = ["read"] +} diff --git a/vault/policies/runner-DEPLOY_KEY.hcl b/vault/policies/runner-DEPLOY_KEY.hcl new file mode 100644 index 0000000..ac711f9 --- /dev/null +++ b/vault/policies/runner-DEPLOY_KEY.hcl @@ -0,0 +1,10 @@ +# vault/policies/runner-DEPLOY_KEY.hcl +# +# Per-secret runner policy: SSH deploy key for git push to a release target. +# vault-runner (Step 5) composes only the runner-* policies named by the +# dispatching action's `secrets = [...]` list, so this policy intentionally +# scopes a single KV path — no wildcards, no list capability. + +path "kv/data/disinto/runner/DEPLOY_KEY" { + capabilities = ["read"] +} diff --git a/vault/policies/runner-DOCKER_HUB_TOKEN.hcl b/vault/policies/runner-DOCKER_HUB_TOKEN.hcl new file mode 100644 index 0000000..7d93a65 --- /dev/null +++ b/vault/policies/runner-DOCKER_HUB_TOKEN.hcl @@ -0,0 +1,10 @@ +# vault/policies/runner-DOCKER_HUB_TOKEN.hcl +# +# Per-secret runner policy: Docker Hub access token for image push. +# vault-runner (Step 5) composes only the runner-* policies named by the +# dispatching action's `secrets = [...]` list, so this policy intentionally +# scopes a single KV path — no wildcards, no list capability. + +path "kv/data/disinto/runner/DOCKER_HUB_TOKEN" { + capabilities = ["read"] +} diff --git a/vault/policies/runner-GITHUB_TOKEN.hcl b/vault/policies/runner-GITHUB_TOKEN.hcl new file mode 100644 index 0000000..7914c92 --- /dev/null +++ b/vault/policies/runner-GITHUB_TOKEN.hcl @@ -0,0 +1,10 @@ +# vault/policies/runner-GITHUB_TOKEN.hcl +# +# Per-secret runner policy: GitHub PAT for cross-mirror push / API calls. +# vault-runner (Step 5) composes only the runner-* policies named by the +# dispatching action's `secrets = [...]` list, so this policy intentionally +# scopes a single KV path — no wildcards, no list capability. + +path "kv/data/disinto/runner/GITHUB_TOKEN" { + capabilities = ["read"] +} diff --git a/vault/policies/runner-NPM_TOKEN.hcl b/vault/policies/runner-NPM_TOKEN.hcl new file mode 100644 index 0000000..27c77ee --- /dev/null +++ b/vault/policies/runner-NPM_TOKEN.hcl @@ -0,0 +1,10 @@ +# vault/policies/runner-NPM_TOKEN.hcl +# +# Per-secret runner policy: npm registry auth token for package publish. +# vault-runner (Step 5) composes only the runner-* policies named by the +# dispatching action's `secrets = [...]` list, so this policy intentionally +# scopes a single KV path — no wildcards, no list capability. + +path "kv/data/disinto/runner/NPM_TOKEN" { + capabilities = ["read"] +} diff --git a/vault/policies/service-forgejo.hcl b/vault/policies/service-forgejo.hcl new file mode 100644 index 0000000..1724fc5 --- /dev/null +++ b/vault/policies/service-forgejo.hcl @@ -0,0 +1,15 @@ +# vault/policies/service-forgejo.hcl +# +# Read-only access to shared Forgejo secrets (admin password, OAuth client +# config). Attached to the Forgejo Nomad job via workload identity (S2.4). +# +# Scope: kv/disinto/shared/forgejo — entries owned by the operator and +# shared between forgejo + the chat OAuth client (issue #855 lineage). + +path "kv/data/disinto/shared/forgejo" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/shared/forgejo" { + capabilities = ["list", "read"] +} diff --git a/vault/policies/service-woodpecker.hcl b/vault/policies/service-woodpecker.hcl new file mode 100644 index 0000000..19c9726 --- /dev/null +++ b/vault/policies/service-woodpecker.hcl @@ -0,0 +1,15 @@ +# vault/policies/service-woodpecker.hcl +# +# Read-only access to shared Woodpecker secrets (agent secret, forge OAuth +# client). Attached to the Woodpecker Nomad job via workload identity (S2.4). +# +# Scope: kv/disinto/shared/woodpecker/* — entries owned by the operator +# and consumed by woodpecker-server + woodpecker-agent. + +path "kv/data/disinto/shared/woodpecker/*" { + capabilities = ["read"] +} + +path "kv/metadata/disinto/shared/woodpecker/*" { + capabilities = ["list", "read"] +} diff --git a/vault/roles.yaml b/vault/roles.yaml new file mode 100644 index 0000000..fdc11d2 --- /dev/null +++ b/vault/roles.yaml @@ -0,0 +1,150 @@ +# ============================================================================= +# vault/roles.yaml — Vault JWT-auth role bindings for Nomad workload identity +# +# Part of the Nomad+Vault migration (S2.3, issue #881). One entry per +# vault/policies/*.hcl policy. Each entry pairs: +# +# - the Vault role name (what a Nomad job references via +# `vault { role = "..." }` in its jobspec), with +# - the ACL policy attached to tokens it mints, and +# - the bound claims that gate which Nomad workloads may authenticate +# through that role (prevents a jobspec named "woodpecker" from +# asking for role "service-forgejo"). +# +# The source of truth for *what* secrets each role's token can read is +# vault/policies/.hcl. This file only wires role→policy→claims. +# Keeping the two side-by-side in the repo means an S2.1↔S2.3 drift +# (new policy without a role, or vice versa) shows up in one directory +# review, not as a runtime "permission denied" at job placement. +# +# All roles share the same constants (hardcoded in tools/vault-apply-roles.sh): +# - bound_audiences = ["vault.io"] — Nomad's default workload-identity aud +# - token_type = "service" — revoked when task exits +# - token_ttl = "1h" — token lifetime +# - token_max_ttl = "24h" — hard cap across renewals +# +# Format (strict — parsed line-by-line by tools/vault-apply-roles.sh with +# awk; keep the "- name:" prefix + two-space nested indent exactly as +# shown below): +# +# roles: +# - name: # path: auth/jwt-nomad/role/ +# policy: # must match vault/policies/.hcl +# namespace: # bound_claims.nomad_namespace +# job_id: # bound_claims.nomad_job_id +# +# All four fields are required. Comments (#) and blank lines are ignored. +# +# Adding a new role: +# 1. Land the companion vault/policies/.hcl in S2.1 style. +# 2. Add a block here with all four fields. +# 3. Run tools/vault-apply-roles.sh to upsert it. +# 4. Re-run to confirm "role unchanged". +# ============================================================================= +roles: + # ── Long-running services (nomad/jobs/.hcl) ────────────────────────── + # The jobspec's nomad job name is the bound job_id, e.g. `job "forgejo"` + # in nomad/jobs/forgejo.hcl → job_id: forgejo. The policy name stays + # `service-` so the directory layout under vault/policies/ groups + # platform services under a single prefix. + - name: service-forgejo + policy: service-forgejo + namespace: default + job_id: forgejo + + - name: service-woodpecker + policy: service-woodpecker + namespace: default + job_id: woodpecker + + # ── Per-agent bots (nomad/jobs/bot-.hcl — land in later steps) ─────── + # job_id placeholders match the policy name 1:1 until each bot's jobspec + # lands. When a bot's jobspec is added under nomad/jobs/, update the + # corresponding job_id here to match the jobspec's `job ""` — and + # CI's S2.6 roles.yaml check will confirm the pairing. + - name: bot-dev + policy: bot-dev + namespace: default + job_id: bot-dev + + - name: bot-dev-qwen + policy: bot-dev-qwen + namespace: default + job_id: bot-dev-qwen + + - name: bot-review + policy: bot-review + namespace: default + job_id: bot-review + + - name: bot-gardener + policy: bot-gardener + namespace: default + job_id: bot-gardener + + - name: bot-planner + policy: bot-planner + namespace: default + job_id: bot-planner + + - name: bot-predictor + policy: bot-predictor + namespace: default + job_id: bot-predictor + + - name: bot-supervisor + policy: bot-supervisor + namespace: default + job_id: bot-supervisor + + - name: bot-architect + policy: bot-architect + namespace: default + job_id: bot-architect + + - name: bot-vault + policy: bot-vault + namespace: default + job_id: bot-vault + + # ── Edge dispatcher ──────────────────────────────────────────────────────── + - name: dispatcher + policy: dispatcher + namespace: default + job_id: dispatcher + + # ── Per-secret runner roles ──────────────────────────────────────────────── + # vault-runner (Step 5) composes runner- policies onto each + # ephemeral dispatch token based on the action TOML's `secrets = [...]`. + # The per-dispatch runner jobspec job_id follows the same `runner-` + # convention (one jobspec per secret, minted per dispatch) so the bound + # claim matches the role name directly. + - name: runner-GITHUB_TOKEN + policy: runner-GITHUB_TOKEN + namespace: default + job_id: runner-GITHUB_TOKEN + + - name: runner-CODEBERG_TOKEN + policy: runner-CODEBERG_TOKEN + namespace: default + job_id: runner-CODEBERG_TOKEN + + - name: runner-CLAWHUB_TOKEN + policy: runner-CLAWHUB_TOKEN + namespace: default + job_id: runner-CLAWHUB_TOKEN + + - name: runner-DEPLOY_KEY + policy: runner-DEPLOY_KEY + namespace: default + job_id: runner-DEPLOY_KEY + + - name: runner-NPM_TOKEN + policy: runner-NPM_TOKEN + namespace: default + job_id: runner-NPM_TOKEN + + - name: runner-DOCKER_HUB_TOKEN + policy: runner-DOCKER_HUB_TOKEN + namespace: default + job_id: runner-DOCKER_HUB_TOKEN