# =============================================================================
# .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts
#
# Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the
# "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or
# lib/init/nomad/, plus the `disinto init` dispatcher, gets checked
# before it can land.
#
# Also includes Vault policy validation (S2.6, issue #884):
#   - format round-trip check on vault/policies/*.hcl
#   - `vault policy write` (dev server) on each policy file
#   - roles.yaml validator (yamllint + policy reference check)
#   - secret-scan gate on policy files
#
# Triggers on PRs (and pushes) that touch any of:
#   nomad/**                        — HCL configs (server, client, vault)
#   lib/init/nomad/**               — cluster-up / install / systemd / vault-init
#   bin/disinto                     — `disinto init --backend=nomad` dispatcher
#   tests/disinto-init-nomad.bats   — the bats suite itself
#   vault/policies/*.hcl            — Vault ACL policies (S2.6)
#   vault/roles.yaml                — JWT auth role definitions (S2.6)
#   lib/init/nomad/vault-*.sh       — Vault init scripts (S2.6)
#   .woodpecker/nomad-validate.yml  — the pipeline definition
#
# Steps (all fail-closed — any error blocks merge):
#   1. nomad-config-validate   — `nomad config validate` on server + client HCL
#   2. nomad-job-validate      — `nomad job validate` looped over every
#                                nomad/jobs/*.hcl (new jobspecs get
#                                CI coverage automatically)
#   3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl
#   4. vault-policy-fmt        — `vault policy fmt` round-trip check on vault/policies/*.hcl
#   5. vault-policy-validate   — `vault policy write` against a dev server per policy file
#   6. vault-roles-validate    — yamllint + policy reference check
#   7. vault-secret-scan       — scan policy files for embedded secrets
#   8. shellcheck-nomad        — shellcheck the cluster-up + install scripts + disinto
#   9. bats-init-nomad         — `disinto init --backend=nomad --dry-run` smoke tests
#
# Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 /
# vault 1.18.5). Bump there AND here together — drift = CI passing on
# syntax the runtime would reject.
# =============================================================================

when:
  - event: [push, pull_request]
    path:
      - "nomad/**"
      - "lib/init/nomad/**"
      - "bin/disinto"
      - "tests/disinto-init-nomad.bats"
      - "vault/policies/*.hcl"
      - "vault/roles.yaml"
      - "lib/init/nomad/vault-*.sh"
      - ".woodpecker/nomad-validate.yml"

# Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is
# configured with REQUIRE_SIGN_IN, so anonymous git clones fail (exit 128).
# FORGE_TOKEN is injected globally via WOODPECKER_ENVIRONMENT.
clone:
  git:
    image: alpine/git
    commands:
      - AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
      - git clone --depth 1 "$AUTH_URL" .
      - git fetch --depth 1 origin "$CI_COMMIT_REF"
      - git checkout FETCH_HEAD

steps:
  # ── 1. Nomad HCL syntax check ────────────────────────────────────────────
  # `nomad config validate` parses server.hcl + client.hcl and fails on any
  # HCL/semantic error (unknown block, invalid port range, bad driver cfg).
  # vault.hcl is excluded — it's a Vault config, not Nomad, so it goes
  # through the vault-operator-diagnose step instead.
  - name: nomad-config-validate
    image: hashicorp/nomad:1.9.5
    commands:
      - nomad version
      - nomad config validate nomad/server.hcl nomad/client.hcl

  # ── 2. Nomad jobspec HCL syntax check ────────────────────────────────────
  # `nomad job validate` is a *different* tool from `nomad config validate` —
  # the former parses jobspec HCL (job/group/task blocks, driver config,
  # volume refs, network ports), the latter parses agent config HCL
  # (server/client blocks). Running step 1 on a jobspec would reject it
  # with "unknown block 'job'", and vice versa. Hence two separate steps.
  #
  # Validation is offline: no running Nomad server is required (exit 0 on
  # valid HCL, 1 on syntax/semantic error). The CLI takes a single path
  # argument so we loop over every `*.hcl` file under nomad/jobs/ —
  # that way a new jobspec PR gets CI coverage automatically (no separate
  # "edit the pipeline" step to forget). The `.hcl` suffix is the naming
  # convention: anything else in nomad/jobs/ is deliberately not validated
  # by this step.
  #
  # `[ -f "$f" ]` guards against the no-match case: POSIX sh does not
  # nullglob, so an empty jobs/ directory would leave the literal glob in
  # "$f" and fail. Today forgejo.hcl exists, but the guard keeps the
  # step safe during any future transient empty state.
  #
  # Scope note: offline validate catches jobspec-level errors (unknown
  # stanzas, missing required fields, wrong value types, invalid driver
  # config). It does NOT resolve cross-file references like host_volume
  # source names against nomad/client.hcl — that mismatch surfaces at
  # scheduling time on the live cluster, not here. The paired-write rule
  # in nomad/AGENTS.md ("add to both client.hcl and cluster-up.sh") is the
  # primary guardrail for that class of drift.
  - name: nomad-job-validate
    image: hashicorp/nomad:1.9.5
    commands:
      - |
        set -e
        for f in nomad/jobs/*.hcl; do
          [ -f "$f" ] || continue
          echo "validating jobspec: $f"
          nomad job validate "$f"
        done

  # ── 3. Vault HCL syntax check ────────────────────────────────────────────
  # `vault operator diagnose` loads the config and runs a suite of checks.
  # Exit codes:
  #   0 — all checks green
  #   1 — at least one hard failure (bad HCL, bad schema, unreachable storage)
  #   2 — advisory warnings only (no hard failure)
  # Our factory dev-box vault.hcl deliberately runs TLS-disabled on a
  # localhost-only listener (documented in nomad/vault.hcl), which triggers
  # an advisory "Check Listener TLS" warning → exit 2. The config still
  # parses, so we tolerate exit 2 and fail only on exit 1 or crashes.
  # -skip=storage/-skip=listener disables the runtime-only checks (vault's
  # container has /vault/file so storage is fine, but explicit skip is cheap
  # insurance against future container-image drift).
  - name: vault-operator-diagnose
    image: hashicorp/vault:1.18.5
    commands:
      - |
        rc=0
        vault operator diagnose -config=nomad/vault.hcl -skip=storage -skip=listener || rc=$?
        case "$rc" in
          0) echo "vault config: all checks green" ;;
          2) echo "vault config: parse OK (rc=2 — advisory warnings only; TLS-disabled on localhost listener is by design)" ;;
          *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;;
        esac

  # ── 4. Vault policy formatting check ─────────────────────────────────────
  # `vault policy fmt` has no -check flag — it rewrites the file in place.
  # To keep this step non-destructive we format a scratch copy of each policy
  # and compare it against the original; any difference means the file needs
  # formatting. This enforces consistent indentation (2-space), no trailing
  # whitespace, and proper HCL formatting conventions.
  #
  # CI runs this BEFORE the policy validation step because unformatted HCL
  # can sometimes cause confusing validation errors.
  - name: vault-policy-fmt
    image: hashicorp/vault:1.18.5
    commands:
      - |
        set -e
        failed=0
        for f in vault/policies/*.hcl; do
          [ -f "$f" ] || continue
          echo "fmt-check: $f"
          # Format a throwaway copy, then diff — the original is never touched.
          cp "$f" /tmp/fmt-check.hcl
          vault policy fmt /tmp/fmt-check.hcl > /dev/null
          if ! cmp -s "$f" /tmp/fmt-check.hcl; then
            echo "  ERROR: $f is not formatted correctly" >&2
            diff -u "$f" /tmp/fmt-check.hcl >&2 || true
            failed=1
          fi
        done
        if [ "$failed" -gt 0 ]; then
          echo "vault-policy-fmt: formatting errors found" >&2
          exit 1
        fi
        echo "vault-policy-fmt: all policies formatted correctly"

  # ── 5. Vault policy validate ─────────────────────────────────────────────
  # The Vault CLI has no `policy validate` subcommand, so each policy is
  # validated by uploading it with `vault policy write` to a throwaway dev
  # server. The server parses the HCL and rejects:
  #   - unknown stanzas/blocks
  #   - invalid path patterns
  #   - unknown capabilities (valid: read, list, create, update, delete, sudo)
  #   - missing required fields
  # Nothing persists — the dev server is in-memory and dies with the step.
  #
  # The dev server listens on http://127.0.0.1:8200 with a "root" token.
  # VAULT_ADDR must be exported explicitly: the CLI default is
  # https://127.0.0.1:8200 (TLS), which would never reach the plain-HTTP
  # dev listener — the readiness probe and the writes both depend on it.
  - name: vault-policy-validate
    image: hashicorp/vault:1.18.5
    commands:
      - |
        set -e
        export VAULT_ADDR=http://127.0.0.1:8200
        export VAULT_TOKEN=root
        # Start Vault dev server in background
        vault server -dev -dev-root-token-id=root -dev-listen-address=0.0.0.0:8200 &
        VAULT_PID=$!
        trap 'kill $VAULT_PID 2>/dev/null || true' EXIT
        # Wait for Vault to be ready; fail closed if it never comes up
        ready=0
        for i in $(seq 1 30); do
          if vault status > /dev/null 2>&1; then
            echo "vault-policy-validate: Vault is ready"
            ready=1
            break
          fi
          sleep 0.5
        done
        if [ "$ready" -ne 1 ]; then
          echo "vault-policy-validate: Vault dev server failed to start" >&2
          exit 1
        fi
        # Upload each policy under a throwaway ci-* name to validate it
        failed=0
        for f in vault/policies/*.hcl; do
          [ -f "$f" ] || continue
          echo "validate: $f"
          name="ci-$(basename "$f" .hcl)"
          if ! vault policy write "$name" "$f" > /dev/null 2>&1; then
            echo "  ERROR: $f validation failed" >&2
            vault policy write "$name" "$f" >&2 || true
            failed=1
          fi
        done
        if [ "$failed" -gt 0 ]; then
          echo "vault-policy-validate: validation errors found" >&2
          exit 1
        fi
        echo "vault-policy-validate: all policies valid"

  # ── 6. Vault roles.yaml validate ─────────────────────────────────────────
  # Validates vault/roles.yaml:
  #   1. yamllint check — ensures YAML syntax is valid
  #   2. Policy reference check — each role references a policy that exists
  #   3. Required fields check — each role has name, policies, and auth fields
  #
  # jq only understands JSON, not YAML, so checks 2–3 run as a small PyYAML
  # script (the image ships Python; `pip install yamllint` pulls in PyYAML).
  #
  # If roles.yaml doesn't exist yet, this step is skipped (it will be added
  # in S2.3 alongside JWT auth configuration).
  - name: vault-roles-validate
    image: python:3.12-alpine
    commands:
      - |
        set -e
        pip install --quiet yamllint
        if [ ! -f vault/roles.yaml ]; then
          echo "vault-roles-validate: roles.yaml not found, skipping"
          exit 0
        fi
        echo "yamllint: vault/roles.yaml"
        yamllint -q vault/roles.yaml || {
          echo "  ERROR: yamllint found issues" >&2
          exit 1
        }
        echo "  OK"
        echo "policy-reference-check: validating policy references"
        python3 - << 'EOF'
        import pathlib
        import sys

        import yaml

        data = yaml.safe_load(pathlib.Path("vault/roles.yaml").read_text()) or {}
        roles = data.get("roles", [])
        failed = 0

        # Required fields: every role needs name, policies, and auth.
        for i, role in enumerate(roles):
            for field in ("name", "policies", "auth"):
                if field not in role:
                    print(f"vault-roles-validate: ERROR: role #{i} missing required field '{field}'",
                          file=sys.stderr)
                    failed = 1

        referenced = sorted({p for r in roles for p in (r.get("policies") or [])})
        if not referenced:
            print("vault-roles-validate: no policies referenced in roles.yaml", file=sys.stderr)
            sys.exit(1)

        # Policy names are the .hcl basenames under vault/policies/.
        existing = {f.stem for f in pathlib.Path("vault/policies").glob("*.hcl")}
        for policy in referenced:
            if policy not in existing:
                print(f"vault-roles-validate: ERROR: policy '{policy}' referenced but not found",
                      file=sys.stderr)
                failed = 1

        if failed:
            print("vault-roles-validate: policy reference errors found", file=sys.stderr)
            sys.exit(1)
        print("vault-roles-validate: all policy references valid")
        EOF

  # ── 7. Vault secret-scan ─────────────────────────────────────────────────
  # Scans policy HCL files for embedded secrets (rare but dangerous copy-paste
  # mistake). Uses the same patterns as lib/secret-scan.sh:
  #   - Long hex strings (32+ chars)
  #   - API key patterns
  #   - URLs with embedded credentials
  #   - Bearer tokens
  #
  # Environment variables like $TOKEN or ${TOKEN} are excluded as safe.
  - name: vault-secret-scan
    image: alpine:3.19
    commands:
      - |
        set -e
        apk add --no-cache bash
        # Copy the secret-scan.sh script into the container
        cat > /tmp/secret-scan.sh << 'EOF'
        #!/usr/bin/env bash
        # Inline version of lib/secret-scan.sh for CI secret detection
        _SECRET_PATTERNS=(
          '[0-9a-fA-F]{32,}'
          'Bearer [A-Za-z0-9_/+=-]{20,}'
          '0x[0-9a-fA-F]{64}'
          'https?://[^[:space:]]*[0-9a-fA-F]{20,}'
          'AKIA[0-9A-Z]{16}'
          '(API_KEY|SECRET|TOKEN|PRIVATE_KEY|PASSWORD|INFURA|ALCHEMY)=[^[:space:]"]{16,}'
        )
        _SAFE_PATTERNS=(
          '\$\{?[A-Z_]+\}?'
          'commit [0-9a-f]{40}'
          'Merge [0-9a-f]{40}'
          'last-reviewed: [0-9a-f]{40}'
          'codeberg\.org/[^[:space:]]+'
          'localhost:3000/[^[:space:]]+'
          'SC[0-9]{4}'
        )
        scan_for_secrets() {
          local text="${1:-$(cat)}"
          local found=0
          local cleaned="$text"
          for safe in "${_SAFE_PATTERNS[@]}"; do
            cleaned=$(printf '%s' "$cleaned" | sed -E "s/${safe}/__SAFE__/g" 2>/dev/null || printf '%s' "$cleaned")
          done
          for pattern in "${_SECRET_PATTERNS[@]}"; do
            local matches
            matches=$(printf '%s' "$cleaned" | grep -oE "$pattern" 2>/dev/null || true)
            if [ -n "$matches" ]; then
              while IFS= read -r match; do
                [ "$match" = "__SAFE__" ] && continue
                [ -z "$match" ] && continue
                printf 'secret-scan: detected potential secret matching pattern [%s]: %s\n' \
                  "$pattern" "${match:0:8}...${match: -4}" >&2
                found=1
              done <<< "$matches"
            fi
          done
          return $found
        }
        # Actually run the scan on stdin — without this call the script only
        # defines the function and always exits 0, and the gate never fires.
        scan_for_secrets
        EOF
        chmod +x /tmp/secret-scan.sh
        # Scan policy files
        echo "secret-scan: vault/policies/*.hcl"
        failed=0
        for f in vault/policies/*.hcl; do
          [ -f "$f" ] || continue
          echo "  scanning: $f"
          if ! /tmp/secret-scan.sh < "$f"; then
            echo "  ERROR: potential secrets detected in $f" >&2
            failed=1
          fi
        done
        if [ "$failed" -gt 0 ]; then
          echo "vault-secret-scan: secrets detected" >&2
          exit 1
        fi
        echo "vault-secret-scan: no secrets detected"

  # ── 8. Shellcheck ────────────────────────────────────────────────────────
  # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns
  # the backend dispatcher). bin/disinto has no .sh extension so the
  # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the
  # one place it gets checked.
  - name: shellcheck-nomad
    image: koalaman/shellcheck-alpine:stable
    commands:
      - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto

  # ── 9. bats: `disinto init --backend=nomad --dry-run` ────────────────────
  # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0
  # with the expected step list, and --backend=docker stays on the docker
  # path (regression guard). Pure dry-run — no sudo, no network.
  - name: bats-init-nomad
    image: alpine:3.19
    commands:
      - apk add --no-cache bash bats
      - bats tests/disinto-init-nomad.bats