fix: [nomad-step-2] S2.3 — vault-nomad-auth.sh (enable JWT auth + roles + nomad workload identity) (#881)
All checks were successful
All checks were successful
Wires Nomad → Vault via workload identity so jobs can exchange their short-lived JWT for a Vault token carrying the policies in vault/policies/ — no shared VAULT_TOKEN in job env. - `lib/init/nomad/vault-nomad-auth.sh` — idempotent script: enable jwt auth at path `jwt-nomad`, config JWKS/algs, apply roles, install server.hcl + SIGHUP nomad on change. - `tools/vault-apply-roles.sh` — companion sync script (S2.1 sibling); reads vault/roles.yaml and upserts each Vault role under auth/jwt-nomad/role/<name> with created/updated/unchanged semantics. - `vault/roles.yaml` — declarative role→policy→bound_claims map; one entry per vault/policies/*.hcl. Keeps S2.1 policies and S2.3 role bindings visible side-by-side at review time. - `nomad/server.hcl` — adds vault stanza (enabled, address, default_identity.aud=["vault.io"], ttl=1h). - `lib/hvault.sh` — new `hvault_get_or_empty` helper shared between vault-apply-policies.sh, vault-apply-roles.sh, and vault-nomad-auth.sh; reads a Vault endpoint and distinguishes 200 / 404 / other. - `vault/policies/AGENTS.md` — extends S2.1 docs with JWT-auth role naming convention, token shape, and the "add new service" flow. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
88e49b9e9d
commit
8efef9f1bb
7 changed files with 776 additions and 35 deletions
|
|
@ -178,6 +178,51 @@ hvault_kv_list() {
|
|||
}
|
||||
}
|
||||
|
||||
# hvault_get_or_empty PATH
|
||||
# GET /v1/PATH. On 200, prints the raw response body to stdout (caller
|
||||
# parses with jq). On 404, prints nothing and returns 0 — caller treats
|
||||
# the empty string as "resource absent, needs create". Any other HTTP
|
||||
# status is a hard error: response body is logged to stderr as a
|
||||
# structured JSON error and the function returns 1.
|
||||
#
|
||||
# Used by the sync scripts (tools/vault-apply-*.sh +
|
||||
# lib/init/nomad/vault-nomad-auth.sh) to read existing policies, roles,
|
||||
# auth-method listings, and per-role configs without triggering errexit
|
||||
# on the expected absent-resource case. `_hvault_request` is not a
|
||||
# substitute — it treats 404 as a hard error, which is correct for
|
||||
# writes but wrong for "does this already exist?" checks.
|
||||
#
|
||||
# Subshell + EXIT trap: the RETURN trap does NOT fire on set-e abort,
|
||||
# so tmpfile cleanup from a function-scoped RETURN trap would leak on
|
||||
# jq/curl errors under `set -eo pipefail`. The subshell + EXIT trap
|
||||
# is the reliable cleanup boundary.
|
||||
hvault_get_or_empty() {
|
||||
local path="${1:-}"
|
||||
|
||||
if [ -z "$path" ]; then
|
||||
_hvault_err "hvault_get_or_empty" "PATH is required" \
|
||||
"usage: hvault_get_or_empty PATH"
|
||||
return 1
|
||||
fi
|
||||
_hvault_check_prereqs "hvault_get_or_empty" || return 1
|
||||
|
||||
(
|
||||
local tmp http_code
|
||||
tmp="$(mktemp)"
|
||||
trap 'rm -f "$tmp"' EXIT
|
||||
http_code="$(curl -sS -o "$tmp" -w '%{http_code}' \
|
||||
-H "X-Vault-Token: ${VAULT_TOKEN}" \
|
||||
"${VAULT_ADDR}/v1/${path}")" \
|
||||
|| { _hvault_err "hvault_get_or_empty" "curl failed" "path=$path"; exit 1; }
|
||||
case "$http_code" in
|
||||
2[0-9][0-9]) cat "$tmp" ;;
|
||||
404) printf '' ;;
|
||||
*) _hvault_err "hvault_get_or_empty" "HTTP $http_code" "$(cat "$tmp")"
|
||||
exit 1 ;;
|
||||
esac
|
||||
)
|
||||
}
|
||||
|
||||
# hvault_policy_apply NAME FILE
|
||||
# Idempotent policy upsert — create or update a Vault policy.
|
||||
hvault_policy_apply() {
|
||||
|
|
|
|||
177
lib/init/nomad/vault-nomad-auth.sh
Executable file
177
lib/init/nomad/vault-nomad-auth.sh
Executable file
|
|
@ -0,0 +1,177 @@
|
|||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# lib/init/nomad/vault-nomad-auth.sh — Idempotent Vault JWT auth + Nomad wiring
|
||||
#
|
||||
# Part of the Nomad+Vault migration (S2.3, issue #881). Enables Vault's JWT
|
||||
# auth method at path `jwt-nomad`, points it at Nomad's workload-identity
|
||||
# JWKS endpoint, writes one role per policy (via tools/vault-apply-roles.sh),
|
||||
# updates /etc/nomad.d/server.hcl with the vault stanza, and signals nomad
|
||||
# to reload so jobs can exchange short-lived workload-identity tokens for
|
||||
# Vault tokens — no shared VAULT_TOKEN in job env.
|
||||
#
|
||||
# Steps:
|
||||
# 1. Enable auth method (sys/auth/jwt-nomad, type=jwt)
|
||||
# 2. Configure JWKS + algs (auth/jwt-nomad/config)
|
||||
# 3. Upsert roles from vault/roles.yaml (delegates to vault-apply-roles.sh)
|
||||
# 4. Install /etc/nomad.d/server.hcl from repo + SIGHUP nomad if changed
|
||||
#
|
||||
# Idempotency contract:
|
||||
# - Auth path already enabled → skip create, log "jwt-nomad already enabled".
|
||||
# - Config identical to desired → skip write, log "jwt-nomad config unchanged".
|
||||
# - Roles: see tools/vault-apply-roles.sh header for per-role diffing.
|
||||
# - server.hcl on disk byte-identical to repo copy → skip write, skip SIGHUP.
|
||||
# - Second run on a fully-configured box is a silent no-op end-to-end.
|
||||
#
|
||||
# Preconditions:
|
||||
# - S0 complete (empty cluster up: nomad + vault reachable, vault unsealed).
|
||||
# - S2.1 complete: vault/policies/*.hcl applied via tools/vault-apply-policies.sh
|
||||
# (otherwise the roles we write will reference policies Vault does not
|
||||
# know about — the write succeeds, but token minting will fail later).
|
||||
# - Running as root (writes /etc/nomad.d/server.hcl + signals nomad).
|
||||
#
|
||||
# Environment:
|
||||
# VAULT_ADDR — default http://127.0.0.1:8200 (matches nomad/vault.hcl).
|
||||
# VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh).
|
||||
#
|
||||
# Usage:
|
||||
# sudo lib/init/nomad/vault-nomad-auth.sh
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 success (configured, or already so)
|
||||
# 1 precondition / API / nomad-reload failure
|
||||
# =============================================================================
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
|
||||
|
||||
APPLY_ROLES_SH="${REPO_ROOT}/tools/vault-apply-roles.sh"
|
||||
SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl"
|
||||
SERVER_HCL_DST="/etc/nomad.d/server.hcl"
|
||||
|
||||
VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}"
|
||||
export VAULT_ADDR
|
||||
|
||||
# shellcheck source=../../hvault.sh
|
||||
source "${REPO_ROOT}/lib/hvault.sh"
|
||||
|
||||
log() { printf '[vault-auth] %s\n' "$*"; }
|
||||
die() { printf '[vault-auth] ERROR: %s\n' "$*" >&2; exit 1; }
|
||||
|
||||
# ── Preconditions ────────────────────────────────────────────────────────────
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
die "must run as root (writes ${SERVER_HCL_DST} + signals nomad)"
|
||||
fi
|
||||
|
||||
for bin in curl jq vault systemctl; do
|
||||
command -v "$bin" >/dev/null 2>&1 \
|
||||
|| die "required binary not found: ${bin}"
|
||||
done
|
||||
|
||||
[ -f "$SERVER_HCL_SRC" ] \
|
||||
|| die "source config not found: ${SERVER_HCL_SRC}"
|
||||
[ -x "$APPLY_ROLES_SH" ] \
|
||||
|| die "companion script missing or not executable: ${APPLY_ROLES_SH}"
|
||||
|
||||
hvault_token_lookup >/dev/null \
|
||||
|| die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
|
||||
|
||||
# ── Desired config (Nomad workload-identity JWKS on localhost:4646) ──────────
|
||||
# Nomad's default workload-identity signer publishes the public JWKS at
|
||||
# /.well-known/jwks.json on the nomad HTTP API port (4646). Vault validates
|
||||
# JWTs against it. RS256 is the signer's default algorithm. `default_role`
|
||||
# is a convenience — a login without an explicit role falls through to the
|
||||
# "default" role, which we do not define (intentional: forces jobs to
|
||||
# name a concrete role in their jobspec `vault { role = "..." }`).
|
||||
JWKS_URL="http://127.0.0.1:4646/.well-known/jwks.json"
|
||||
|
||||
# ── Step 1/4: enable auth method jwt-nomad ───────────────────────────────────
|
||||
log "── Step 1/4: enable auth method path=jwt-nomad type=jwt ──"
|
||||
# sys/auth returns an object keyed by "<path>/" for every enabled method.
|
||||
# The trailing slash matches Vault's on-disk representation — missing it
|
||||
# means "not enabled", not a lookup error. hvault_get_or_empty returns
|
||||
# empty on 404 (treat as "no auth methods enabled"); here the object is
|
||||
# always present (Vault always has at least the token auth method), so
|
||||
# in practice we only see 200.
|
||||
auth_list="$(hvault_get_or_empty "sys/auth")" \
|
||||
|| die "failed to list auth methods"
|
||||
if printf '%s' "$auth_list" | jq -e '.["jwt-nomad/"]' >/dev/null 2>&1; then
|
||||
log "auth path jwt-nomad already enabled"
|
||||
else
|
||||
enable_payload="$(jq -n '{type:"jwt",description:"Nomad workload identity (S2.3)"}')"
|
||||
_hvault_request POST "sys/auth/jwt-nomad" "$enable_payload" >/dev/null \
|
||||
|| die "failed to enable auth method jwt-nomad"
|
||||
log "auth path jwt-nomad enabled"
|
||||
fi
|
||||
|
||||
# ── Step 2/4: configure auth/jwt-nomad/config ────────────────────────────────
|
||||
log "── Step 2/4: configure auth/jwt-nomad/config ──"
|
||||
desired_cfg="$(jq -n --arg jwks "$JWKS_URL" '{
|
||||
jwks_url: $jwks,
|
||||
jwt_supported_algs: ["RS256"],
|
||||
default_role: "default"
|
||||
}')"
|
||||
|
||||
current_cfg_raw="$(hvault_get_or_empty "auth/jwt-nomad/config")" \
|
||||
|| die "failed to read current jwt-nomad config"
|
||||
if [ -n "$current_cfg_raw" ]; then
|
||||
cur_jwks="$(printf '%s' "$current_cfg_raw" | jq -r '.data.jwks_url // ""')"
|
||||
cur_algs="$(printf '%s' "$current_cfg_raw" | jq -cS '.data.jwt_supported_algs // []')"
|
||||
cur_default="$(printf '%s' "$current_cfg_raw" | jq -r '.data.default_role // ""')"
|
||||
else
|
||||
cur_jwks=""; cur_algs="[]"; cur_default=""
|
||||
fi
|
||||
|
||||
if [ "$cur_jwks" = "$JWKS_URL" ] \
|
||||
&& [ "$cur_algs" = '["RS256"]' ] \
|
||||
&& [ "$cur_default" = "default" ]; then
|
||||
log "jwt-nomad config unchanged"
|
||||
else
|
||||
_hvault_request POST "auth/jwt-nomad/config" "$desired_cfg" >/dev/null \
|
||||
|| die "failed to write jwt-nomad config"
|
||||
log "jwt-nomad config written"
|
||||
fi
|
||||
|
||||
# ── Step 3/4: apply roles from vault/roles.yaml ──────────────────────────────
|
||||
log "── Step 3/4: apply roles from vault/roles.yaml ──"
|
||||
# Delegates to tools/vault-apply-roles.sh — one source of truth for the
|
||||
# parser and per-role idempotency contract. Its header documents the
|
||||
# created/updated/unchanged wiring.
|
||||
"$APPLY_ROLES_SH"
|
||||
|
||||
# ── Step 4/4: install server.hcl + SIGHUP nomad if changed ───────────────────
|
||||
log "── Step 4/4: install ${SERVER_HCL_DST} + reload nomad if changed ──"
|
||||
# cluster-up.sh (S0.4) is the normal path for installing server.hcl — but
|
||||
# this script is run AFTER S0.4, so we also install here. Writing only on
|
||||
# content-diff keeps re-runs a true no-op (no spurious SIGHUP). `install`
|
||||
# preserves perms at 0644 root:root on every write.
|
||||
needs_reload=0
|
||||
if [ -f "$SERVER_HCL_DST" ] && cmp -s "$SERVER_HCL_SRC" "$SERVER_HCL_DST"; then
|
||||
log "unchanged: ${SERVER_HCL_DST}"
|
||||
else
|
||||
log "writing: ${SERVER_HCL_DST}"
|
||||
install -m 0644 -o root -g root "$SERVER_HCL_SRC" "$SERVER_HCL_DST"
|
||||
needs_reload=1
|
||||
fi
|
||||
|
||||
if [ "$needs_reload" -eq 1 ]; then
|
||||
# SIGHUP triggers Nomad's config reload (see ExecReload in
|
||||
# lib/init/nomad/systemd-nomad.sh — /bin/kill -HUP $MAINPID). Using
|
||||
# `systemctl kill -s SIGHUP` instead of `systemctl reload` sends the
|
||||
# signal even when the unit doesn't declare ExecReload (defensive —
|
||||
# future unit edits can't silently break this script).
|
||||
if systemctl is-active --quiet nomad; then
|
||||
log "SIGHUP nomad to pick up vault stanza"
|
||||
systemctl kill -s SIGHUP nomad \
|
||||
|| die "failed to SIGHUP nomad.service"
|
||||
else
|
||||
# Fresh box: nomad not started yet. The updated server.hcl will be
|
||||
# picked up at first start. Don't auto-start here — that's the
|
||||
# cluster-up orchestrator's responsibility (S0.4).
|
||||
log "nomad.service not active — skipping SIGHUP (next start loads vault stanza)"
|
||||
fi
|
||||
else
|
||||
log "server.hcl unchanged — nomad SIGHUP not needed"
|
||||
fi
|
||||
|
||||
log "── done — jwt-nomad auth + config + roles + nomad vault stanza in place ──"
|
||||
Loading…
Add table
Add a link
Reference in a new issue