disinto/.woodpecker/nomad-validate.yml

144 lines
7.3 KiB
YAML

# =============================================================================
# .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts
#
# Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the
# "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or
# lib/init/nomad/, plus the `disinto init` dispatcher, gets checked
# before it can land.
#
# Triggers on PRs (and pushes) that touch any of:
# nomad/** — HCL configs (server, client, vault)
# lib/init/nomad/** — cluster-up / install / systemd / vault-init
# bin/disinto — `disinto init --backend=nomad` dispatcher
# tests/disinto-init-nomad.bats — the bats suite itself
# .woodpecker/nomad-validate.yml — the pipeline definition
#
# Steps (all fail-closed — any error blocks merge):
# 1. nomad-config-validate — `nomad config validate` on server + client HCL
# 2. nomad-job-validate — `nomad job validate` looped over every
# nomad/jobs/*.hcl (new jobspecs get
# CI coverage automatically)
# 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl
# 4. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto
# 5. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests
#
# Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 /
# vault 1.18.5). Bump there AND here together — drift = CI passing on
# syntax the runtime would reject.
# =============================================================================
when:
- event: [push, pull_request]
path:
- "nomad/**"
- "lib/init/nomad/**"
- "bin/disinto"
- "tests/disinto-init-nomad.bats"
- ".woodpecker/nomad-validate.yml"
# Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is
# configured with REQUIRE_SIGN_IN, so anonymous git clones fail (exit 128).
# FORGE_TOKEN is injected globally via WOODPECKER_ENVIRONMENT.
clone:
git:
image: alpine/git
commands:
- AUTH_URL=$(printf '%s' "$CI_REPO_CLONE_URL" | sed "s|://|://token:$FORGE_TOKEN@|")
- git clone --depth 1 "$AUTH_URL" .
- git fetch --depth 1 origin "$CI_COMMIT_REF"
- git checkout FETCH_HEAD
steps:
# ── 1. Nomad HCL syntax check ────────────────────────────────────────────
# `nomad config validate` parses server.hcl + client.hcl and fails on any
# HCL/semantic error (unknown block, invalid port range, bad driver cfg).
# vault.hcl is excluded — it's a Vault config, not Nomad, so it goes
# through the vault-operator-diagnose step instead.
- name: nomad-config-validate
image: hashicorp/nomad:1.9.5
commands:
- nomad version
- nomad config validate nomad/server.hcl nomad/client.hcl
# ── 2. Nomad jobspec HCL syntax check ────────────────────────────────────
# `nomad job validate` is a *different* tool from `nomad config validate` —
# the former parses jobspec HCL (job/group/task blocks, driver config,
# volume refs, network ports), the latter parses agent config HCL
# (server/client blocks). Running step 1 on a jobspec would reject it
# with "unknown block 'job'", and vice versa. Hence two separate steps.
#
# Validation is offline: no running Nomad server is required (exit 0 on
# valid HCL, 1 on syntax/semantic error). The CLI takes a single path
# argument so we loop over every `*.hcl` file under nomad/jobs/ —
# that way a new jobspec PR gets CI coverage automatically (no separate
# "edit the pipeline" step to forget). The `.hcl` suffix is the naming
# convention: anything else in nomad/jobs/ is deliberately not validated
# by this step.
#
# `[ -f "$f" ]` guards against the no-match case: POSIX sh does not
# nullglob, so an empty jobs/ directory would leave the literal glob in
# "$f" and fail. Today forgejo.hcl exists, but the guard keeps the
# step safe during any future transient empty state.
#
# Scope note: offline validate catches jobspec-level errors (unknown
# stanzas, missing required fields, wrong value types, invalid driver
# config). It does NOT resolve cross-file references like host_volume
# source names against nomad/client.hcl — that mismatch surfaces at
# scheduling time on the live cluster, not here. The paired-write rule
# in nomad/AGENTS.md ("add to both client.hcl and cluster-up.sh") is the
# primary guardrail for that class of drift.
- name: nomad-job-validate
image: hashicorp/nomad:1.9.5
commands:
- |
set -e
for f in nomad/jobs/*.hcl; do
[ -f "$f" ] || continue
echo "validating jobspec: $f"
nomad job validate "$f"
done
# ── 3. Vault HCL syntax check ────────────────────────────────────────────
# `vault operator diagnose` loads the config and runs a suite of checks.
# Exit codes:
# 0 — all checks green
# 1 — at least one hard failure (bad HCL, bad schema, unreachable storage)
# 2 — advisory warnings only (no hard failure)
# Our factory dev-box vault.hcl deliberately runs TLS-disabled on a
# localhost-only listener (documented in nomad/vault.hcl), which triggers
# an advisory "Check Listener TLS" warning → exit 2. The config still
# parses, so we tolerate exit 2 and fail only on exit 1 or crashes.
# -skip=storage/-skip=listener disables the runtime-only checks (vault's
# container has /vault/file so storage is fine, but explicit skip is cheap
# insurance against future container-image drift).
- name: vault-operator-diagnose
image: hashicorp/vault:1.18.5
commands:
- |
rc=0
vault operator diagnose -config=nomad/vault.hcl -skip=storage -skip=listener || rc=$?
case "$rc" in
0) echo "vault config: all checks green" ;;
2) echo "vault config: parse OK (rc=2 — advisory warnings only; TLS-disabled on localhost listener is by design)" ;;
*) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;;
esac
# ── 4. Shellcheck ────────────────────────────────────────────────────────
# Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns
# the backend dispatcher). bin/disinto has no .sh extension so the
# repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the
# one place it gets checked.
- name: shellcheck-nomad
image: koalaman/shellcheck-alpine:stable
commands:
- shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto
# ── 5. bats: `disinto init --backend=nomad --dry-run` ────────────────────
# Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0
# with the expected step list, and --backend=docker stays on the docker
# path (regression guard). Pure dry-run — no sudo, no network.
- name: bats-init-nomad
image: alpine:3.19
commands:
- apk add --no-cache bash bats
- bats tests/disinto-init-nomad.bats