#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/vault-nomad-auth.sh — Idempotent Vault JWT auth + Nomad wiring
#
# Part of the Nomad+Vault migration (S2.3, issue #881). Enables Vault's JWT
# auth method at path `jwt-nomad`, points it at Nomad's workload-identity
# JWKS endpoint, writes one role per policy (via tools/vault-apply-roles.sh),
# updates /etc/nomad.d/server.hcl with the vault stanza, and signals nomad
# to reload so jobs can exchange short-lived workload-identity tokens for
# Vault tokens — no shared VAULT_TOKEN in job env.
#
# Steps:
#   1. Enable auth method      (sys/auth/jwt-nomad, type=jwt)
#   2. Configure JWKS + algs   (auth/jwt-nomad/config)
#   3. Upsert roles from vault/roles.yaml (delegates to vault-apply-roles.sh)
#   4. Install /etc/nomad.d/server.hcl from repo + SIGHUP nomad if changed
#
# Idempotency contract:
#   - Auth path already enabled → skip create, log
#     "auth path jwt-nomad already enabled".
#   - Config identical to desired → skip write, log "jwt-nomad config unchanged".
#   - Roles: see tools/vault-apply-roles.sh header for per-role diffing.
#   - server.hcl on disk byte-identical to repo copy → skip write, skip SIGHUP.
#   - Second run on a fully-configured box changes nothing end-to-end (it
#     still prints the per-step log lines).
#
# Preconditions:
#   - S0 complete (empty cluster up: nomad + vault reachable, vault unsealed).
#   - S2.1 complete: vault/policies/*.hcl applied via tools/vault-apply-policies.sh
#     (otherwise the roles we write will reference policies Vault does not
#     know about — the write succeeds, but token minting will fail later).
#   - Running as root (writes /etc/nomad.d/server.hcl + signals nomad).
#
# Environment:
#   VAULT_ADDR  — default http://127.0.0.1:8200 (matches nomad/vault.hcl).
#   VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh).
#
# Usage:
#   sudo lib/init/nomad/vault-nomad-auth.sh
#
# Exit codes:
#   0  success (configured, or already so)
#   1  precondition / API / role-apply / install / nomad-reload failure
# =============================================================================
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"

APPLY_ROLES_SH="${REPO_ROOT}/tools/vault-apply-roles.sh"
SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl"
SERVER_HCL_DST="/etc/nomad.d/server.hcl"

# shellcheck source=../../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
_hvault_default_env

# log MSG... — progress line on stdout, prefixed with the script tag.
log() { printf '[vault-auth] %s\n' "$*"; }
# die MSG... — error line on stderr, then exit 1 (the only failure exit code).
die() { printf '[vault-auth] ERROR: %s\n' "$*" >&2; exit 1; }

# ── Preconditions ────────────────────────────────────────────────────────────
if [ "$(id -u)" -ne 0 ]; then
  die "must run as root (writes ${SERVER_HCL_DST} + signals nomad)"
fi

# curl + jq are used directly; hvault.sh's helpers are also curl-based, so
# the `vault` CLI is NOT required here — don't add it to this list, or a
# Vault-server-present / vault-CLI-absent box (e.g. a Nomad-client-only
# node) would die spuriously. systemctl is required for SIGHUPing nomad.
# cmp + install are used by step 4; they are checked up front because a
# missing `cmp` would otherwise read as "files differ" inside the `if` and
# cause a rewrite + SIGHUP on every run, silently breaking idempotency.
for bin in curl jq systemctl cmp install; do
  command -v "$bin" >/dev/null 2>&1 \
    || die "required binary not found: ${bin}"
done

[ -f "$SERVER_HCL_SRC" ] \
  || die "source config not found: ${SERVER_HCL_SRC}"
[ -x "$APPLY_ROLES_SH" ] \
  || die "companion script missing or not executable: ${APPLY_ROLES_SH}"

hvault_token_lookup >/dev/null \
  || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"

# ── Desired config (Nomad workload-identity JWKS on localhost:4646) ──────────
# Nomad's default workload-identity signer publishes the public JWKS at
# /.well-known/jwks.json on the nomad HTTP API port (4646). Vault validates
# JWTs against it. RS256 is the signer's default algorithm. `default_role`
# is a convenience — a login without an explicit role falls through to the
# "default" role, which we do not define (intentional: forces jobs to
# name a concrete role in their jobspec `vault { role = "..." }`).
JWKS_URL="http://127.0.0.1:4646/.well-known/jwks.json"

# ── Step 1/4: enable auth method jwt-nomad ───────────────────────────────────
log "── Step 1/4: enable auth method path=jwt-nomad type=jwt ──"
# sys/auth returns an object keyed by "<path>/" for every enabled method.
# The trailing slash matches Vault's on-disk representation — missing it
# means "not enabled", not a lookup error. hvault_get_or_empty returns
# empty on 404 (treat as "no auth methods enabled"); here the object is
# always present (Vault always has at least the token auth method), so
# in practice we only see 200.
auth_list="$(hvault_get_or_empty "sys/auth")" \
  || die "failed to list auth methods"

# `jq -e` exits non-zero when the key is absent/null (or the input is empty),
# which routes us to the "enable" branch.
if printf '%s' "$auth_list" | jq -e '.["jwt-nomad/"]' >/dev/null 2>&1; then
  log "auth path jwt-nomad already enabled"
else
  enable_payload="$(jq -n '{type:"jwt",description:"Nomad workload identity (S2.3)"}')"
  _hvault_request POST "sys/auth/jwt-nomad" "$enable_payload" >/dev/null \
    || die "failed to enable auth method jwt-nomad"
  log "auth path jwt-nomad enabled"
fi

# ── Step 2/4: configure auth/jwt-nomad/config ────────────────────────────────
log "── Step 2/4: configure auth/jwt-nomad/config ──"

desired_cfg="$(jq -n --arg jwks "$JWKS_URL" '{
  jwks_url: $jwks,
  jwt_supported_algs: ["RS256"],
  default_role: "default"
}')"

current_cfg_raw="$(hvault_get_or_empty "auth/jwt-nomad/config")" \
  || die "failed to read current jwt-nomad config"

# Extract only the three fields we manage; absent fields collapse to
# ""/[] so a never-configured mount compares as "different".
if [ -n "$current_cfg_raw" ]; then
  cur_jwks="$(printf '%s' "$current_cfg_raw" | jq -r '.data.jwks_url // ""')"
  cur_algs="$(printf '%s' "$current_cfg_raw" | jq -cS '.data.jwt_supported_algs // []')"
  cur_default="$(printf '%s' "$current_cfg_raw" | jq -r '.data.default_role // ""')"
else
  cur_jwks=""
  cur_algs="[]"
  cur_default=""
fi

# cur_algs is compact JSON (jq -c), hence the literal '["RS256"]' comparison.
if [ "$cur_jwks" = "$JWKS_URL" ] \
  && [ "$cur_algs" = '["RS256"]' ] \
  && [ "$cur_default" = "default" ]; then
  log "jwt-nomad config unchanged"
else
  _hvault_request POST "auth/jwt-nomad/config" "$desired_cfg" >/dev/null \
    || die "failed to write jwt-nomad config"
  log "jwt-nomad config written"
fi

# ── Step 3/4: apply roles from vault/roles.yaml ──────────────────────────────
log "── Step 3/4: apply roles from vault/roles.yaml ──"
# Delegates to tools/vault-apply-roles.sh — one source of truth for the
# parser and per-role idempotency contract. Its header documents the
# created/updated/unchanged wiring. The explicit `|| die` normalises any
# non-zero status from the companion script to our documented exit code 1.
"$APPLY_ROLES_SH" \
  || die "failed to apply roles via ${APPLY_ROLES_SH}"

# ── Step 4/4: install server.hcl + SIGHUP nomad if changed ───────────────────
log "── Step 4/4: install ${SERVER_HCL_DST} + reload nomad if changed ──"
# cluster-up.sh (S0.4) is the normal path for installing server.hcl — but
# this script is run AFTER S0.4, so we also install here. Writing only on
# content-diff keeps re-runs a true no-op (no spurious SIGHUP). `install`
# sets perms to 0644 root:root on every write.
needs_reload=0
if [ -f "$SERVER_HCL_DST" ] && cmp -s "$SERVER_HCL_SRC" "$SERVER_HCL_DST"; then
  log "unchanged: ${SERVER_HCL_DST}"
else
  log "writing: ${SERVER_HCL_DST}"
  install -m 0644 -o root -g root "$SERVER_HCL_SRC" "$SERVER_HCL_DST" \
    || die "failed to install ${SERVER_HCL_DST}"
  needs_reload=1
fi

if [ "$needs_reload" -eq 1 ]; then
  # SIGHUP triggers Nomad's config reload (see ExecReload in
  # lib/init/nomad/systemd-nomad.sh — /bin/kill -HUP $MAINPID). Using
  # `systemctl kill -s SIGHUP` instead of `systemctl reload` sends the
  # signal even when the unit doesn't declare ExecReload (defensive —
  # future unit edits can't silently break this script).
  #
  # NOTE(review): if the SIGHUP below fails we die AFTER server.hcl has
  # already been installed, so a plain re-run will see the file as
  # unchanged and skip the signal — nomad must then be reloaded manually.
  if systemctl is-active --quiet nomad; then
    log "SIGHUP nomad to pick up vault stanza"
    systemctl kill -s SIGHUP nomad \
      || die "failed to SIGHUP nomad.service"
  else
    # Fresh box: nomad not started yet. The updated server.hcl will be
    # picked up at first start. Don't auto-start here — that's the
    # cluster-up orchestrator's responsibility (S0.4).
    log "nomad.service not active — skipping SIGHUP (next start loads vault stanza)"
  fi
else
  log "server.hcl unchanged — nomad SIGHUP not needed"
fi

log "── done — jwt-nomad auth + config + roles + nomad vault stanza in place ──"