# =============================================================================
# .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts
#
# Part of the Nomad+Vault migration (S0.5, issue #825; extended in S2.6,
# issue #884). Locks in the "no-ad-hoc-steps" principle: every HCL/shell
# artifact under nomad/, lib/init/nomad/, vault/policies/, plus the
# `disinto init` dispatcher and vault/roles.yaml, gets checked before it
# can land.
#
# Triggers on PRs (and pushes) that touch any of:
#   nomad/**                        — HCL configs (server, client, vault)
#   lib/init/nomad/**               — cluster-up / install / systemd / vault-init /
#                                     vault-nomad-auth (S2.6 trigger: vault-*.sh
#                                     is a subset of this glob)
#   bin/disinto                     — `disinto init --backend=nomad` dispatcher
#   tests/disinto-init-nomad.bats   — the bats suite itself
#   vault/policies/**               — Vault ACL policy HCL files (S2.1, S2.6)
#   vault/roles.yaml                — JWT-auth role bindings (S2.3, S2.6)
#   .woodpecker/nomad-validate.yml  — the pipeline definition
#
# Steps (all fail-closed — any error blocks merge):
#   1. nomad-config-validate   — `nomad config validate` on server + client HCL
#   2. nomad-job-validate      — `nomad job validate` looped over every
#                                nomad/jobs/*.hcl (new jobspecs get
#                                CI coverage automatically)
#   3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl
#   4. vault-policy-fmt        — `vault policy fmt` idempotence check on
#                                every vault/policies/*.hcl (format drift =
#                                CI fail; non-destructive via cp+diff)
#   5. vault-policy-validate   — HCL syntax + capability validation for every
#                                vault/policies/*.hcl via `vault policy write`
#                                against an inline dev-mode Vault server
#   6. vault-roles-validate    — yamllint + role→policy reference check on
#                                vault/roles.yaml (every referenced policy
#                                must exist as vault/policies/<name>.hcl)
#   7. shellcheck-nomad        — shellcheck the cluster-up + install scripts + disinto
#   8. bats-init-nomad         — `disinto init --backend=nomad --dry-run` smoke tests
#
# Secret-scan coverage: vault/policies/*.hcl is already scanned by the
# P11 gate (.woodpecker/secret-scan.yml, issue #798) — its trigger path
# `vault/**/*` covers everything under this directory. We intentionally
# do NOT duplicate that gate here; one scanner, one source of truth.
#
# Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 /
# vault 1.18.5). Bump there AND here together — drift = CI passing on
# syntax the runtime would reject.
# =============================================================================

when:
  - event: [push, pull_request]
    path:
      - "nomad/**"
      - "lib/init/nomad/**"
      - "bin/disinto"
      - "tests/disinto-init-nomad.bats"
      - "vault/policies/**"
      - "vault/roles.yaml"
      - ".woodpecker/nomad-validate.yml"

# Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is
# configured with REQUIRE_SIGN_IN, so anonymous git clones fail (exit 128).
# FORGE_TOKEN is injected globally via WOODPECKER_ENVIRONMENT.
clone:
  git:
    image: alpine/git
    commands:
      - AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
      - git clone --depth 1 "$AUTH_URL" .
      - git fetch --depth 1 origin "$CI_COMMIT_REF"
      - git checkout FETCH_HEAD

steps:
  # ── 1. Nomad HCL syntax check ────────────────────────────────────────────
  # `nomad config validate` parses server.hcl + client.hcl and fails on any
  # HCL/semantic error (unknown block, invalid port range, bad driver cfg).
  # vault.hcl is excluded — it's a Vault config, not Nomad, so it goes
  # through the vault-operator-diagnose step instead.
  - name: nomad-config-validate
    image: hashicorp/nomad:1.9.5
    commands:
      - nomad version
      - nomad config validate nomad/server.hcl nomad/client.hcl

  # ── 2. Nomad jobspec HCL syntax check ────────────────────────────────────
  # `nomad job validate` is a *different* tool from `nomad config validate` —
  # the former parses jobspec HCL (job/group/task blocks, driver config,
  # volume refs, network ports), the latter parses agent config HCL
  # (server/client blocks). Running step 1 on a jobspec would reject it
  # with "unknown block 'job'", and vice versa. Hence two separate steps.
  #
  # Validation is offline: no running Nomad server is required (exit 0 on
  # valid HCL, 1 on syntax/semantic error). The CLI takes a single path
  # argument so we loop over every `*.hcl` file under nomad/jobs/ —
  # that way a new jobspec PR gets CI coverage automatically (no separate
  # "edit the pipeline" step to forget). The `.hcl` suffix is the naming
  # convention: anything else in nomad/jobs/ is deliberately not validated
  # by this step.
  #
  # `[ -f "$f" ]` guards against the no-match case: POSIX sh does not
  # nullglob, so an empty jobs/ directory would leave the literal glob in
  # "$f" and fail. Today forgejo.hcl exists, but the guard keeps the
  # step safe during any future transient empty state.
  #
  # Scope note: offline validate catches jobspec-level errors (unknown
  # stanzas, missing required fields, wrong value types, invalid driver
  # config). It does NOT resolve cross-file references like host_volume
  # source names against nomad/client.hcl — that mismatch surfaces at
  # scheduling time on the live cluster, not here. The paired-write rule
  # in nomad/AGENTS.md ("add to both client.hcl and cluster-up.sh") is the
  # primary guardrail for that class of drift.
  - name: nomad-job-validate
    image: hashicorp/nomad:1.9.5
    commands:
      - |
        set -e
        for f in nomad/jobs/*.hcl; do
          [ -f "$f" ] || continue
          echo "validating jobspec: $f"
          nomad job validate "$f"
        done

  # ── 3. Vault HCL syntax check ────────────────────────────────────────────
  # `vault operator diagnose` loads the config and runs a suite of checks.
  # Exit codes:
  #   0 — all checks green
  #   1 — at least one hard failure (bad HCL, bad schema, unreachable storage)
  #   2 — advisory warnings only (no hard failure)
  # Our factory dev-box vault.hcl deliberately runs TLS-disabled on a
  # localhost-only listener (documented in nomad/vault.hcl), which triggers
  # an advisory "Check Listener TLS" warning → exit 2. The config still
  # parses, so we tolerate exit 2 and fail only on exit 1 or crashes.
  # -skip=storage/-skip=listener disables the runtime-only checks (vault's
  # container has /vault/file so storage is fine, but explicit skip is cheap
  # insurance against future container-image drift).
  - name: vault-operator-diagnose
    image: hashicorp/vault:1.18.5
    commands:
      - |
        rc=0
        vault operator diagnose -config=nomad/vault.hcl -skip=storage -skip=listener || rc=$?
        case "$rc" in
          0) echo "vault config: all checks green" ;;
          2) echo "vault config: parse OK (rc=2 — advisory warnings only; TLS-disabled on localhost listener is by design)" ;;
          *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;;
        esac

  # ── 4. Vault policy fmt idempotence check ────────────────────────────────
  # `vault policy fmt <file>` formats a local HCL policy file in place.
  # There's no `-check`/dry-run flag (vault 1.18.5), so we implement a
  # non-destructive check as cp → fmt-on-copy → diff against original.
  # Any diff means the committed file would be rewritten by `vault policy
  # fmt` — failure steers the author to run `vault policy fmt <file>`
  # locally before pushing.
  #
  # Scope: vault/policies/*.hcl only. The `[ -f "$f" ]` guard handles the
  # no-match case (POSIX sh does not nullglob) so an empty policies/
  # directory does not fail this step.
  #
  # Exit-code handling: a policy that `vault policy fmt` itself rejects
  # (e.g. unparseable HCL) is reported with fmt's own output and counted as
  # a failure, and the loop keeps going so one CI run surfaces every
  # offending file. fmt's output is captured rather than discarded so the
  # step never dies silently under `set -e`.
  #
  # Note: `vault policy fmt` is purely local (HCL text transform) and does
  # not require a running Vault server, which is why this step can run
  # without starting one.
  - name: vault-policy-fmt
    image: hashicorp/vault:1.18.5
    commands:
      - |
        set -e
        failed=0
        for f in vault/policies/*.hcl; do
          [ -f "$f" ] || continue
          tmp="/tmp/$(basename "$f").fmt"
          cp "$f" "$tmp"
          # Run fmt inside an `if` so `set -e` does not abort on failure;
          # keep its output so a parse error is visible in the CI log.
          if ! fmt_out=$(vault policy fmt "$tmp" 2>&1); then
            echo "ERROR: 'vault policy fmt' failed on $f:" >&2
            printf '%s\n' "$fmt_out" >&2
            failed=1
            continue
          fi
          if ! diff -u "$f" "$tmp"; then
            echo "ERROR: $f is not formatted — run 'vault policy fmt $f' locally" >&2
            failed=1
          fi
        done
        if [ "$failed" -gt 0 ]; then
          echo "vault-policy-fmt: formatting drift detected" >&2
          exit 1
        fi
        echo "vault-policy-fmt: all policies formatted correctly"

  # ── 5. Vault policy HCL syntax + capability validation ───────────────────
  # Vault has no offline `vault policy validate` subcommand — the closest
  # in-CLI validator is `vault policy write`, which sends the HCL to a
  # running server which parses it, checks capability names against the
  # known set (read, list, create, update, delete, patch, sudo, deny),
  # and rejects unknown stanzas / malformed path blocks. We start an
  # inline dev-mode Vault (in-memory, no persistence, root token = "root")
  # for the duration of this step and loop `vault policy write` over every
  # vault/policies/*.hcl; the policies never leave the ephemeral dev
  # server, so this is strictly a validator — not a deploy.
  #
  # Exit-code handling:
  #   - `vault policy write` exits 0 on success, non-zero on any parse /
  #     semantic error. We aggregate failures across all files so a single
  #     CI run surfaces every broken policy (not just the first).
  #   - The dev server is killed on any step exit via EXIT trap so the
  #     step tears down cleanly even on failure.
  #
  # Why dev-mode is sufficient: we're not persisting secrets, only asking
  # Vault to parse policy text. The factory's production Vault is NOT
  # contacted.
  - name: vault-policy-validate
    image: hashicorp/vault:1.18.5
    commands:
      - |
        set -e
        vault server -dev -dev-root-token-id=root -dev-listen-address=127.0.0.1:8200 >/tmp/vault-dev.log 2>&1 &
        VAULT_PID=$!
        trap 'kill "$VAULT_PID" 2>/dev/null || true' EXIT INT TERM
        export VAULT_ADDR=http://127.0.0.1:8200
        export VAULT_TOKEN=root
        # Poll for readiness: 30 attempts x 0.5s = 15s ceiling.
        ready=0
        i=0
        while [ "$i" -lt 30 ]; do
          if vault status >/dev/null 2>&1; then
            ready=1
            break
          fi
          i=$((i + 1))
          sleep 0.5
        done
        if [ "$ready" -ne 1 ]; then
          echo "vault-policy-validate: dev server failed to start after 15s" >&2
          cat /tmp/vault-dev.log >&2 || true
          exit 1
        fi
        failed=0
        for f in vault/policies/*.hcl; do
          [ -f "$f" ] || continue
          name=$(basename "$f" .hcl)
          echo "validate: $f"
          if ! vault policy write "$name" "$f"; then
            echo " ERROR: $f failed validation" >&2
            failed=1
          fi
        done
        if [ "$failed" -gt 0 ]; then
          echo "vault-policy-validate: validation errors found" >&2
          exit 1
        fi
        echo "vault-policy-validate: all policies valid"

  # ── 6. vault/roles.yaml validator ────────────────────────────────────────
  # Validates the JWT-auth role bindings file (S2.3). Two checks:
  #
  #   a. `yamllint` — catches YAML syntax errors and indentation drift.
  #      Uses a relaxed config (line length bumped to 200) because
  #      roles.yaml's comments are wide by design.
  #   b. role → policy reference check — every role's `policy:` field
  #      must match a basename in vault/policies/*.hcl. A role pointing
  #      at a non-existent policy = runtime "permission denied" at job
  #      placement; catching the drift here turns it into a CI failure.
  #      Also verifies each role entry has the four required fields
  #      (name, policy, namespace, job_id) per the file's documented
  #      format.
  #
  # Parsing is done with PyYAML (the roles.yaml format is a strict
  # subset that awk-level parsing in tools/vault-apply-roles.sh handles
  # too, but PyYAML in CI gives us structural validation for free). If
  # roles.yaml is ever absent (e.g. reverted), the step skips rather
  # than fails — presence is enforced by S2.3's own tooling, not here.
  # A roles.yaml whose top level is not a mapping is rejected with an
  # explicit error instead of a Python traceback.
  - name: vault-roles-validate
    image: python:3.12-alpine
    commands:
      - pip install --quiet --disable-pip-version-check pyyaml yamllint
      - |
        set -e
        if [ ! -f vault/roles.yaml ]; then
          echo "vault-roles-validate: vault/roles.yaml not present, skipping"
          exit 0
        fi
        yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}' vault/roles.yaml
        echo "vault-roles-validate: yamllint OK"
        python3 - <<'PY'
        import os
        import sys
        import yaml

        with open('vault/roles.yaml') as f:
            data = yaml.safe_load(f) or {}

        # Guard the .get() below: a bare list/scalar at the top level is a
        # structural error, not a crash.
        if not isinstance(data, dict):
            print("vault-roles-validate: vault/roles.yaml top level must be a mapping", file=sys.stderr)
            sys.exit(1)

        roles = data.get('roles') or []
        if not roles:
            print("vault-roles-validate: no roles defined in vault/roles.yaml", file=sys.stderr)
            sys.exit(1)

        # Policy names that actually exist on disk (basename without .hcl).
        existing = {
            os.path.splitext(e)[0]
            for e in os.listdir('vault/policies')
            if e.endswith('.hcl')
        }
        required = ('name', 'policy', 'namespace', 'job_id')
        failed = 0
        for r in roles:
            if not isinstance(r, dict):
                print(f"ERROR: role entry is not a mapping: {r!r}", file=sys.stderr)
                failed = 1
                continue
            for field in required:
                if r.get(field) in (None, ''):
                    print(f"ERROR: role entry missing required field '{field}': {r}", file=sys.stderr)
                    failed = 1
            policy = r.get('policy')
            if policy and policy not in existing:
                print(
                    f"ERROR: role '{r.get('name')}' references policy '{policy}' "
                    f"but vault/policies/{policy}.hcl does not exist",
                    file=sys.stderr,
                )
                failed = 1
        sys.exit(failed)
        PY
        echo "vault-roles-validate: all role→policy references valid"

  # ── 7. Shellcheck ────────────────────────────────────────────────────────
  # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns
  # the backend dispatcher). bin/disinto has no .sh extension so the
  # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the
  # one place it gets checked.
  - name: shellcheck-nomad
    image: koalaman/shellcheck-alpine:stable
    commands:
      - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto

  # ── 8. bats: `disinto init --backend=nomad --dry-run` ────────────────────
  # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0
  # with the expected step list, and --backend=docker stays on the docker
  # path (regression guard). Pure dry-run — no sudo, no network.
  - name: bats-init-nomad
    image: alpine:3.19
    commands:
      - apk add --no-cache bash bats
      - bats tests/disinto-init-nomad.bats