fix: [nomad-step-2] S2.6 — CI: vault policy fmt + validate + roles.yaml check (#884)

2026-04-16 16:45:43 +00:00 · 2026-04-16 16:45:43 +00:00 · 108b928cfc
commit 108b928cfc
parent 88e49b9e9d
5 changed files with 725 additions and 9 deletions
--- a/.woodpecker/nomad-validate.yml
+++ b/.woodpecker/nomad-validate.yml
@ -6,11 +6,20 @@
 # lib/init/nomad/, plus the `disinto init` dispatcher, gets checked
 # before it can land.
 #
+# Also includes Vault policy validation (S2.6, issue #884):
+#   - vault policy fmt -check on vault/policies/*.hcl
+#   - vault policy validate on each policy file
+#   - roles.yaml validator (yamllint + policy reference check)
+#   - secret-scan gate on policy files
+#
 # Triggers on PRs (and pushes) that touch any of:
 #   nomad/**              — HCL configs (server, client, vault)
 #   lib/init/nomad/**     — cluster-up / install / systemd / vault-init
 #   bin/disinto           — `disinto init --backend=nomad` dispatcher
 #   tests/disinto-init-nomad.bats — the bats suite itself
+#   vault/policies/*.hcl  — Vault ACL policies (S2.6)
+#   vault/roles.yaml      — JWT auth role definitions (S2.6)
+#   lib/init/nomad/vault-*.sh — Vault init scripts (S2.6)
 #   .woodpecker/nomad-validate.yml — the pipeline definition
 #
 # Steps (all fail-closed — any error blocks merge):
@ -19,8 +28,12 @@
 #                                 nomad/jobs/*.hcl (new jobspecs get
 #                                 CI coverage automatically)
 #   3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl
-#   4. shellcheck-nomad        — shellcheck the cluster-up + install scripts + disinto
-#   5. bats-init-nomad         — `disinto init --backend=nomad --dry-run` smoke tests
+#   4. vault-policy-fmt        — `vault policy fmt -check` on vault/policies/*.hcl
+#   5. vault-policy-validate   — `vault policy validate` on each policy file
+#   6. vault-roles-validate    — yamllint + policy reference check
+#   7. vault-secret-scan       — scan policy files for embedded secrets
+#   8. shellcheck-nomad        — shellcheck the cluster-up + install scripts + disinto
+#   9. bats-init-nomad         — `disinto init --backend=nomad --dry-run` smoke tests
 #
 # Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 /
 # vault 1.18.5). Bump there AND here together — drift = CI passing on
@ -34,6 +47,9 @@ when:
      - "lib/init/nomad/**"
      - "bin/disinto"
      - "tests/disinto-init-nomad.bats"
+      - "vault/policies/*.hcl"
+      - "vault/roles.yaml"
+      - "lib/init/nomad/vault-*.sh"
      - ".woodpecker/nomad-validate.yml"

 # Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is
@ -123,7 +139,231 @@ steps:
          *) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;;
        esac

-  # ── 4. Shellcheck ────────────────────────────────────────────────────────
+  # ── 4. Vault policy fmt check ────────────────────────────────────────────
+  # `vault policy fmt` accepts only a PATH and rewrites that file in place;
+  # it has no `-check` flag. To keep the check non-destructive, each policy
+  # is copied to a scratch file, the copy is formatted, and the result is
+  # diffed against the committed file. Any difference means the file needs
+  # formatting; a formatter failure means the file could not be processed
+  # as a policy at all.
+  #
+  # CI runs this BEFORE vault-policy-validate because unformatted HCL can
+  # sometimes cause confusing validation errors.
+  - name: vault-policy-fmt
+    image: hashicorp/vault:1.18.5
+    commands:
+      - |
+        set -e
+        failed=0
+        for f in vault/policies/*.hcl; do
+          # An unmatched glob stays literal; skip it instead of failing.
+          [ -f "$f" ] || continue
+          echo "fmt-check: $f"
+          # Format a copy so the checked-out file is never modified.
+          scratch=$(mktemp)
+          cp "$f" "$scratch"
+          if ! vault policy fmt "$scratch" > /dev/null; then
+            echo "  ERROR: vault policy fmt could not process $f" >&2
+            failed=1
+          elif ! diff -u "$f" "$scratch" >&2; then
+            # The diff above shows exactly what the formatter would change.
+            echo "  ERROR: $f is not formatted correctly" >&2
+            failed=1
+          fi
+          rm -f "$scratch"
+        done
+        if [ "$failed" -gt 0 ]; then
+          echo "vault-policy-fmt: formatting errors found" >&2
+          exit 1
+        fi
+        echo "vault-policy-fmt: all policies formatted correctly"
+
+  # ── 5. Vault policy validate ─────────────────────────────────────────────
+  # The Vault CLI has no offline `vault policy validate` subcommand, so each
+  # policy is validated by writing it to a throwaway dev-mode server with
+  # `vault policy write`. The server parses the policy on write and the
+  # command exits non-zero when the policy is rejected.
+  #
+  # Requires a running Vault instance (dev mode is sufficient for CI).
+  # The dev server listens on plain http, so VAULT_ADDR must be set for
+  # EVERY client call — the CLI default is https://127.0.0.1:8200, which
+  # would make the readiness probe fail forever.
+  #
+  # Policies are written under a `ci-validate-` prefix so a file named after
+  # a built-in policy (e.g. root.hcl, default.hcl) cannot collide with it.
+  # Nothing persists: the dev server is in-memory and is killed on exit.
+  - name: vault-policy-validate
+    image: hashicorp/vault:1.18.5
+    commands:
+      - |
+        set -e
+        export VAULT_ADDR=http://127.0.0.1:8200
+        export VAULT_TOKEN=root
+
+        # Start Vault dev server in background
+        vault server -dev -dev-root-token-id=root -dev-listen-address=0.0.0.0:8200 &
+        VAULT_PID=$!
+        trap 'kill "$VAULT_PID" 2>/dev/null || true' EXIT
+
+        # Wait (up to ~15s) for Vault to be ready; fail closed if it never is.
+        ready=0
+        for i in $(seq 1 30); do
+          if vault status > /dev/null 2>&1; then
+            ready=1
+            break
+          fi
+          sleep 0.5
+        done
+        if [ "$ready" -ne 1 ]; then
+          echo "vault-policy-validate: Vault dev server did not become ready" >&2
+          exit 1
+        fi
+        echo "vault-policy-validate: Vault is ready"
+
+        # Validate each policy
+        failed=0
+        for f in vault/policies/*.hcl; do
+          [ -f "$f" ] || continue
+          echo "validate: $f"
+          name="ci-validate-$(basename "$f" .hcl)"
+          # stderr is left attached so the server's rejection reason is shown.
+          if ! vault policy write "$name" "$f" > /dev/null; then
+            echo "  ERROR: $f validation failed" >&2
+            failed=1
+          fi
+        done
+
+        if [ "$failed" -gt 0 ]; then
+          echo "vault-policy-validate: validation errors found" >&2
+          exit 1
+        fi
+        echo "vault-policy-validate: all policies valid"
+
+  # ── 6. Vault roles.yaml validate ─────────────────────────────────────────
+  # Validates vault/roles.yaml:
+  #   1. yamllint check — ensures YAML syntax is valid
+  #   2. Policy reference check — every policy named under a role's
+  #      `policies` key must exist as vault/policies/<name>.hcl
+  #
+  # The reference check parses the file with PyYAML (pulled in as a yamllint
+  # dependency). jq is not used: it reads JSON only and cannot parse YAML.
+  # NOTE(review): the `roles` -> `policies` layout is inferred from the jq
+  # query this step used previously — confirm against the real schema.
+  # A required-fields check (name/auth) is not implemented here.
+  #
+  # If roles.yaml doesn't exist yet, this step is skipped (it will be added
+  # in S2.3 alongside JWT auth configuration).
+  - name: vault-roles-validate
+    image: python:3.12-alpine
+    commands:
+      - |
+        set -e
+
+        if [ ! -f vault/roles.yaml ]; then
+          echo "vault-roles-validate: roles.yaml not found, skipping"
+          exit 0
+        fi
+
+        # Installed with pip so yamllint and PyYAML land in the image's own
+        # interpreter (the one `python3` resolves to).
+        pip install --no-cache-dir --quiet yamllint
+
+        echo "yamllint: vault/roles.yaml"
+        yamllint -q vault/roles.yaml || {
+          echo "  ERROR: yamllint found issues" >&2
+          exit 1
+        }
+        echo "  OK"
+
+        # Extract and validate policy references
+        echo "policy-reference-check: validating policy references"
+        python3 - <<'PY'
+        import pathlib
+        import sys
+
+        import yaml
+
+        doc = yaml.safe_load(pathlib.Path("vault/roles.yaml").read_text())
+        roles = doc.get("roles") if isinstance(doc, dict) else None
+        # Accept both a list of roles and a mapping of role-name -> role.
+        if isinstance(roles, dict):
+            roles = list(roles.values())
+
+        referenced = set()
+        for role in roles or []:
+            policies = role.get("policies") if isinstance(role, dict) else None
+            if isinstance(policies, str):
+                policies = [policies]
+            referenced.update(str(p) for p in policies or [])
+
+        if not referenced:
+            print("vault-roles-validate: no policies referenced in roles.yaml",
+                  file=sys.stderr)
+            sys.exit(1)
+
+        # Policy name == file name without the .hcl suffix (top level only).
+        existing = {
+            p.stem for p in pathlib.Path("vault/policies").glob("*.hcl")
+            if p.is_file()
+        }
+        missing = sorted(referenced - existing)
+        for policy in missing:
+            print(f"vault-roles-validate: ERROR: policy '{policy}' "
+                  "referenced but not found", file=sys.stderr)
+        if missing:
+            print("vault-roles-validate: policy reference errors found",
+                  file=sys.stderr)
+            sys.exit(1)
+
+        print("vault-roles-validate: all policy references valid")
+        PY
+
+  # ── 7. Vault secret-scan ─────────────────────────────────────────────────
+  # Scans policy HCL files for embedded secrets (rare but dangerous copy-paste
+  # mistake). Uses the same patterns as lib/secret-scan.sh:
+  #   - Long hex strings (32+ chars)
+  #   - API key patterns
+  #   - URLs with embedded credentials
+  #   - Bearer tokens
+  #
+  # Upper-case environment variable references (bare or brace-wrapped) are
+  # masked out as safe before matching.
+  #
+  # The scanner is written to /tmp via a heredoc. The heredoc body MUST stay
+  # indented at the same level as the surrounding script: a less-indented
+  # line would terminate the YAML block scalar and break the pipeline file.
+  # The common indent is stripped by YAML, so the shell sees it at column 0.
+  #
+  # NOTE(review): the bash code below relies on dollar-brace expansions;
+  # confirm Woodpecker's pipeline-level variable substitution leaves them
+  # intact (they may need escaping with a doubled dollar sign).
+  - name: vault-secret-scan
+    image: alpine:3.19
+    commands:
+      - |
+        set -e
+        apk add --no-cache bash
+
+        # Write the scanner script into the container
+        cat > /tmp/secret-scan.sh << 'EOF'
+        #!/usr/bin/env bash
+        # Inline version of lib/secret-scan.sh for CI secret detection.
+        # Reads text from $1 or stdin; exits 1 if a potential secret is found.
+
+        _SECRET_PATTERNS=(
+          '[0-9a-fA-F]{32,}'
+          'Bearer [A-Za-z0-9_/+=-]{20,}'
+          '0x[0-9a-fA-F]{64}'
+          'https?://[^[:space:]]*[0-9a-fA-F]{20,}'
+          'AKIA[0-9A-Z]{16}'
+          '(API_KEY|SECRET|TOKEN|PRIVATE_KEY|PASSWORD|INFURA|ALCHEMY)=[^[:space:]"]{16,}'
+        )
+
+        _SAFE_PATTERNS=(
+          '\$\{?[A-Z_]+\}?'
+          'commit [0-9a-f]{40}'
+          'Merge [0-9a-f]{40}'
+          'last-reviewed: [0-9a-f]{40}'
+          'codeberg\.org/[^[:space:]]+'
+          'localhost:3000/[^[:space:]]+'
+          'SC[0-9]{4}'
+        )
+
+        scan_for_secrets() {
+          local text="${1:-$(cat)}"
+          local found=0
+
+          # Mask known-safe text first so it cannot trigger a secret pattern.
+          local cleaned="$text"
+          for safe in "${_SAFE_PATTERNS[@]}"; do
+            cleaned=$(printf '%s' "$cleaned" | sed -E "s/${safe}/__SAFE__/g" 2>/dev/null || printf '%s' "$cleaned")
+          done
+
+          for pattern in "${_SECRET_PATTERNS[@]}"; do
+            local matches
+            matches=$(printf '%s' "$cleaned" | grep -oE "$pattern" 2>/dev/null || true)
+            if [ -n "$matches" ]; then
+              while IFS= read -r match; do
+                [ "$match" = "__SAFE__" ] && continue
+                [ -z "$match" ] && continue
+                # Only a redacted head/tail of the match is printed.
+                printf 'secret-scan: detected potential secret matching pattern [%s]: %s\n' \
+                  "$pattern" "${match:0:8}...${match: -4}" >&2
+                found=1
+              done <<< "$matches"
+            fi
+          done
+
+          return $found
+        }
+
+        # Run the scan; without this call the script only defines the
+        # function and would exit 0 for every input.
+        scan_for_secrets "$@"
+        EOF
+        chmod +x /tmp/secret-scan.sh
+
+        # Scan policy files
+        echo "secret-scan: vault/policies/*.hcl"
+        failed=0
+        for f in vault/policies/*.hcl; do
+          [ -f "$f" ] || continue
+          echo "  scanning: $f"
+          if ! /tmp/secret-scan.sh < "$f"; then
+            echo "  ERROR: potential secrets detected in $f" >&2
+            failed=1
+          fi
+        done
+
+        if [ "$failed" -gt 0 ]; then
+          echo "vault-secret-scan: secrets detected" >&2
+          exit 1
+        fi
+        echo "vault-secret-scan: no secrets detected"
+
+  # ── 8. Shellcheck ────────────────────────────────────────────────────────
  # Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns
  # the backend dispatcher). bin/disinto has no .sh extension so the
  # repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the
@ -133,7 +373,7 @@ steps:
    commands:
      - shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto

-  # ── 5. bats: `disinto init --backend=nomad --dry-run` ────────────────────
+  # ── 9. bats: `disinto init --backend=nomad --dry-run` ────────────────────
  # Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0
  # with the expected step list, and --backend=docker stays on the docker
  # path (regression guard). Pure dry-run — no sudo, no network.