From 8efef9f1bb63d3049ac7d6864840cc280ba8631b Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 16 Apr 2026 16:44:22 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20[nomad-step-2]=20S2.3=20=E2=80=94=20vaul?= =?UTF-8?q?t-nomad-auth.sh=20(enable=20JWT=20auth=20+=20roles=20+=20nomad?= =?UTF-8?q?=20workload=20identity)=20(#881)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires Nomad → Vault via workload identity so jobs can exchange their short-lived JWT for a Vault token carrying the policies in vault/policies/ — no shared VAULT_TOKEN in job env. - `lib/init/nomad/vault-nomad-auth.sh` — idempotent script: enable jwt auth at path `jwt-nomad`, config JWKS/algs, apply roles, install server.hcl + SIGHUP nomad on change. - `tools/vault-apply-roles.sh` — companion sync script (S2.1 sibling); reads vault/roles.yaml and upserts each Vault role under auth/jwt-nomad/role/ with created/updated/unchanged semantics. - `vault/roles.yaml` — declarative role→policy→bound_claims map; one entry per vault/policies/*.hcl. Keeps S2.1 policies and S2.3 role bindings visible side-by-side at review time. - `nomad/server.hcl` — adds vault stanza (enabled, address, default_identity.aud=["vault.io"], ttl=1h). - `lib/hvault.sh` — new `hvault_get_or_empty` helper shared between vault-apply-policies.sh, vault-apply-roles.sh, and vault-nomad-auth.sh; reads a Vault endpoint and distinguishes 200 / 404 / other. - `vault/policies/AGENTS.md` — extends S2.1 docs with JWT-auth role naming convention, token shape, and the "add new service" flow. Co-Authored-By: Claude Opus 4.6 (1M context) --- lib/hvault.sh | 45 +++++ lib/init/nomad/vault-nomad-auth.sh | 177 +++++++++++++++++ nomad/server.hcl | 23 +++ tools/vault-apply-policies.sh | 42 +--- tools/vault-apply-roles.sh | 307 +++++++++++++++++++++++++++++ vault/policies/AGENTS.md | 67 ++++++- vault/roles.yaml | 150 ++++++++++++++ 7 files changed, 776 insertions(+), 35 deletions(-) create mode 100755 lib/init/nomad/vault-nomad-auth.sh create mode 100755 tools/vault-apply-roles.sh create mode 100644 vault/roles.yaml diff --git a/lib/hvault.sh b/lib/hvault.sh index b1e0d62..c0e8f23 100644 --- a/lib/hvault.sh +++ b/lib/hvault.sh @@ -178,6 +178,51 @@ hvault_kv_list() { } } +# hvault_get_or_empty PATH +# GET /v1/PATH. On 200, prints the raw response body to stdout (caller +# parses with jq). On 404, prints nothing and returns 0 — caller treats +# the empty string as "resource absent, needs create". Any other HTTP +# status is a hard error: response body is logged to stderr as a +# structured JSON error and the function returns 1. +# +# Used by the sync scripts (tools/vault-apply-*.sh + +# lib/init/nomad/vault-nomad-auth.sh) to read existing policies, roles, +# auth-method listings, and per-role configs without triggering errexit +# on the expected absent-resource case. `_hvault_request` is not a +# substitute — it treats 404 as a hard error, which is correct for +# writes but wrong for "does this already exist?" checks. +# +# Subshell + EXIT trap: the RETURN trap does NOT fire on set-e abort, +# so tmpfile cleanup from a function-scoped RETURN trap would leak on +# jq/curl errors under `set -eo pipefail`. The subshell + EXIT trap +# is the reliable cleanup boundary. +hvault_get_or_empty() { + local path="${1:-}" + + if [ -z "$path" ]; then + _hvault_err "hvault_get_or_empty" "PATH is required" \ + "usage: hvault_get_or_empty PATH" + return 1 + fi + _hvault_check_prereqs "hvault_get_or_empty" || return 1 + + ( + local tmp http_code + tmp="$(mktemp)" + trap 'rm -f "$tmp"' EXIT + http_code="$(curl -sS -o "$tmp" -w '%{http_code}' \ + -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/${path}")" \ + || { _hvault_err "hvault_get_or_empty" "curl failed" "path=$path"; exit 1; } + case "$http_code" in + 2[0-9][0-9]) cat "$tmp" ;; + 404) printf '' ;; + *) _hvault_err "hvault_get_or_empty" "HTTP $http_code" "$(cat "$tmp")" + exit 1 ;; + esac + ) +} + # hvault_policy_apply NAME FILE # Idempotent policy upsert — create or update a Vault policy. hvault_policy_apply() { diff --git a/lib/init/nomad/vault-nomad-auth.sh b/lib/init/nomad/vault-nomad-auth.sh new file mode 100755 index 0000000..9feca27 --- /dev/null +++ b/lib/init/nomad/vault-nomad-auth.sh @@ -0,0 +1,177 @@ +#!/usr/bin/env bash +# ============================================================================= +# lib/init/nomad/vault-nomad-auth.sh — Idempotent Vault JWT auth + Nomad wiring +# +# Part of the Nomad+Vault migration (S2.3, issue #881). Enables Vault's JWT +# auth method at path `jwt-nomad`, points it at Nomad's workload-identity +# JWKS endpoint, writes one role per policy (via tools/vault-apply-roles.sh), +# updates /etc/nomad.d/server.hcl with the vault stanza, and signals nomad +# to reload so jobs can exchange short-lived workload-identity tokens for +# Vault tokens — no shared VAULT_TOKEN in job env. +# +# Steps: +# 1. Enable auth method (sys/auth/jwt-nomad, type=jwt) +# 2. Configure JWKS + algs (auth/jwt-nomad/config) +# 3. Upsert roles from vault/roles.yaml (delegates to vault-apply-roles.sh) +# 4. Install /etc/nomad.d/server.hcl from repo + SIGHUP nomad if changed +# +# Idempotency contract: +# - Auth path already enabled → skip create, log "jwt-nomad already enabled". +# - Config identical to desired → skip write, log "jwt-nomad config unchanged". +# - Roles: see tools/vault-apply-roles.sh header for per-role diffing. +# - server.hcl on disk byte-identical to repo copy → skip write, skip SIGHUP. +# - Second run on a fully-configured box is a silent no-op end-to-end. +# +# Preconditions: +# - S0 complete (empty cluster up: nomad + vault reachable, vault unsealed). +# - S2.1 complete: vault/policies/*.hcl applied via tools/vault-apply-policies.sh +# (otherwise the roles we write will reference policies Vault does not +# know about — the write succeeds, but token minting will fail later). +# - Running as root (writes /etc/nomad.d/server.hcl + signals nomad). +# +# Environment: +# VAULT_ADDR — default http://127.0.0.1:8200 (matches nomad/vault.hcl). +# VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh). +# +# Usage: +# sudo lib/init/nomad/vault-nomad-auth.sh +# +# Exit codes: +# 0 success (configured, or already so) +# 1 precondition / API / nomad-reload failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" + +APPLY_ROLES_SH="${REPO_ROOT}/tools/vault-apply-roles.sh" +SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl" +SERVER_HCL_DST="/etc/nomad.d/server.hcl" + +VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}" +export VAULT_ADDR + +# shellcheck source=../../hvault.sh +source "${REPO_ROOT}/lib/hvault.sh" + +log() { printf '[vault-auth] %s\n' "$*"; } +die() { printf '[vault-auth] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Preconditions ──────────────────────────────────────────────────────────── +if [ "$(id -u)" -ne 0 ]; then + die "must run as root (writes ${SERVER_HCL_DST} + signals nomad)" +fi + +for bin in curl jq vault systemctl; do + command -v "$bin" >/dev/null 2>&1 \ + || die "required binary not found: ${bin}" +done + +[ -f "$SERVER_HCL_SRC" ] \ + || die "source config not found: ${SERVER_HCL_SRC}" +[ -x "$APPLY_ROLES_SH" ] \ + || die "companion script missing or not executable: ${APPLY_ROLES_SH}" + +hvault_token_lookup >/dev/null \ + || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" + +# ── Desired config (Nomad workload-identity JWKS on localhost:4646) ────────── +# Nomad's default workload-identity signer publishes the public JWKS at +# /.well-known/jwks.json on the nomad HTTP API port (4646). Vault validates +# JWTs against it. RS256 is the signer's default algorithm. `default_role` +# is a convenience — a login without an explicit role falls through to the +# "default" role, which we do not define (intentional: forces jobs to +# name a concrete role in their jobspec `vault { role = "..." }`). +JWKS_URL="http://127.0.0.1:4646/.well-known/jwks.json" + +# ── Step 1/4: enable auth method jwt-nomad ─────────────────────────────────── +log "── Step 1/4: enable auth method path=jwt-nomad type=jwt ──" +# sys/auth returns an object keyed by "/" for every enabled method. +# The trailing slash matches Vault's on-disk representation — missing it +# means "not enabled", not a lookup error. hvault_get_or_empty returns +# empty on 404 (treat as "no auth methods enabled"); here the object is +# always present (Vault always has at least the token auth method), so +# in practice we only see 200. +auth_list="$(hvault_get_or_empty "sys/auth")" \ + || die "failed to list auth methods" +if printf '%s' "$auth_list" | jq -e '.["jwt-nomad/"]' >/dev/null 2>&1; then + log "auth path jwt-nomad already enabled" +else + enable_payload="$(jq -n '{type:"jwt",description:"Nomad workload identity (S2.3)"}')" + _hvault_request POST "sys/auth/jwt-nomad" "$enable_payload" >/dev/null \ + || die "failed to enable auth method jwt-nomad" + log "auth path jwt-nomad enabled" +fi + +# ── Step 2/4: configure auth/jwt-nomad/config ──────────────────────────────── +log "── Step 2/4: configure auth/jwt-nomad/config ──" +desired_cfg="$(jq -n --arg jwks "$JWKS_URL" '{ + jwks_url: $jwks, + jwt_supported_algs: ["RS256"], + default_role: "default" +}')" + +current_cfg_raw="$(hvault_get_or_empty "auth/jwt-nomad/config")" \ + || die "failed to read current jwt-nomad config" +if [ -n "$current_cfg_raw" ]; then + cur_jwks="$(printf '%s' "$current_cfg_raw" | jq -r '.data.jwks_url // ""')" + cur_algs="$(printf '%s' "$current_cfg_raw" | jq -cS '.data.jwt_supported_algs // []')" + cur_default="$(printf '%s' "$current_cfg_raw" | jq -r '.data.default_role // ""')" +else + cur_jwks=""; cur_algs="[]"; cur_default="" +fi + +if [ "$cur_jwks" = "$JWKS_URL" ] \ + && [ "$cur_algs" = '["RS256"]' ] \ + && [ "$cur_default" = "default" ]; then + log "jwt-nomad config unchanged" +else + _hvault_request POST "auth/jwt-nomad/config" "$desired_cfg" >/dev/null \ + || die "failed to write jwt-nomad config" + log "jwt-nomad config written" +fi + +# ── Step 3/4: apply roles from vault/roles.yaml ────────────────────────────── +log "── Step 3/4: apply roles from vault/roles.yaml ──" +# Delegates to tools/vault-apply-roles.sh — one source of truth for the +# parser and per-role idempotency contract. Its header documents the +# created/updated/unchanged wiring. +"$APPLY_ROLES_SH" + +# ── Step 4/4: install server.hcl + SIGHUP nomad if changed ─────────────────── +log "── Step 4/4: install ${SERVER_HCL_DST} + reload nomad if changed ──" +# cluster-up.sh (S0.4) is the normal path for installing server.hcl — but +# this script is run AFTER S0.4, so we also install here. Writing only on +# content-diff keeps re-runs a true no-op (no spurious SIGHUP). `install` +# preserves perms at 0644 root:root on every write. +needs_reload=0 +if [ -f "$SERVER_HCL_DST" ] && cmp -s "$SERVER_HCL_SRC" "$SERVER_HCL_DST"; then + log "unchanged: ${SERVER_HCL_DST}" +else + log "writing: ${SERVER_HCL_DST}" + install -m 0644 -o root -g root "$SERVER_HCL_SRC" "$SERVER_HCL_DST" + needs_reload=1 +fi + +if [ "$needs_reload" -eq 1 ]; then + # SIGHUP triggers Nomad's config reload (see ExecReload in + # lib/init/nomad/systemd-nomad.sh — /bin/kill -HUP $MAINPID). Using + # `systemctl kill -s SIGHUP` instead of `systemctl reload` sends the + # signal even when the unit doesn't declare ExecReload (defensive — + # future unit edits can't silently break this script). + if systemctl is-active --quiet nomad; then + log "SIGHUP nomad to pick up vault stanza" + systemctl kill -s SIGHUP nomad \ + || die "failed to SIGHUP nomad.service" + else + # Fresh box: nomad not started yet. The updated server.hcl will be + # picked up at first start. Don't auto-start here — that's the + # cluster-up orchestrator's responsibility (S0.4). + log "nomad.service not active — skipping SIGHUP (next start loads vault stanza)" + fi +else + log "server.hcl unchanged — nomad SIGHUP not needed" +fi + +log "── done — jwt-nomad auth + config + roles + nomad vault stanza in place ──" diff --git a/nomad/server.hcl b/nomad/server.hcl index 27c8b9c..98c54f3 100644 --- a/nomad/server.hcl +++ b/nomad/server.hcl @@ -51,3 +51,26 @@ advertise { ui { enabled = true } + +# ─── Vault integration (S2.3, issue #881) ─────────────────────────────────── +# Nomad jobs exchange their short-lived workload-identity JWT (signed by +# nomad's built-in signer at /.well-known/jwks.json on :4646) for a Vault +# token carrying the policies named by the role in `vault { role = "..." }` +# of each jobspec — no shared VAULT_TOKEN in job env. +# +# The JWT auth path (jwt-nomad) + per-role bindings live on the Vault +# side, written by lib/init/nomad/vault-nomad-auth.sh + tools/vault-apply-roles.sh. +# Roles are defined in vault/roles.yaml. +# +# `default_identity.aud = ["vault.io"]` matches bound_audiences on every +# role in vault/roles.yaml — a drift here would silently break every job's +# Vault token exchange at placement time. +vault { + enabled = true + address = "http://127.0.0.1:8200" + + default_identity { + aud = ["vault.io"] + ttl = "1h" + } +} diff --git a/tools/vault-apply-policies.sh b/tools/vault-apply-policies.sh index 222f04f..85fc233 100755 --- a/tools/vault-apply-policies.sh +++ b/tools/vault-apply-policies.sh @@ -103,37 +103,6 @@ fi hvault_token_lookup >/dev/null \ || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" -# ── Helper: fetch the on-server policy text, or empty if absent ────────────── -# Echoes the current policy content on stdout. A 404 (policy does not exist -# yet) is a non-error — we print nothing and exit 0 so the caller can treat -# the empty string as "needs create". Any other non-2xx is a hard failure. -# -# Uses a subshell + EXIT trap (not RETURN) for tmpfile cleanup: the RETURN -# trap does NOT fire on set-e abort, so if jq below tripped errexit the -# tmpfile would leak. Subshell exit propagates via the function's last- -# command exit status. -fetch_current_policy() { - local name="$1" - ( - local tmp http_code - tmp="$(mktemp)" - trap 'rm -f "$tmp"' EXIT - http_code="$(curl -sS -o "$tmp" -w '%{http_code}' \ - -H "X-Vault-Token: ${VAULT_TOKEN}" \ - "${VAULT_ADDR}/v1/sys/policies/acl/${name}")" \ - || { printf '[vault-apply] ERROR: curl failed for policy %s\n' "$name" >&2; exit 1; } - case "$http_code" in - 200) jq -r '.data.policy // ""' < "$tmp" ;; - 404) printf '' ;; # absent — caller treats as "create" - *) - printf '[vault-apply] ERROR: HTTP %s fetching policy %s:\n' "$http_code" "$name" >&2 - cat "$tmp" >&2 - exit 1 - ;; - esac - ) -} - # ── Apply each policy, reporting created/updated/unchanged ─────────────────── log "syncing ${#POLICY_FILES[@]} polic(y|ies) from ${POLICIES_DIR}" @@ -141,8 +110,17 @@ for f in "${POLICY_FILES[@]}"; do name="$(basename "$f" .hcl)" desired="$(cat "$f")" - current="$(fetch_current_policy "$name")" \ + # hvault_get_or_empty returns the raw JSON body on 200 or empty on 404. + # Extract the .data.policy field here (jq on "" yields "", so the + # empty-string-means-create branch below still works). + raw="$(hvault_get_or_empty "sys/policies/acl/${name}")" \ || die "failed to read existing policy: ${name}" + if [ -n "$raw" ]; then + current="$(printf '%s' "$raw" | jq -r '.data.policy // ""')" \ + || die "failed to parse policy response: ${name}" + else + current="" + fi if [ -z "$current" ]; then hvault_policy_apply "$name" "$f" \ diff --git a/tools/vault-apply-roles.sh b/tools/vault-apply-roles.sh new file mode 100755 index 0000000..2f02eb6 --- /dev/null +++ b/tools/vault-apply-roles.sh @@ -0,0 +1,307 @@ +#!/usr/bin/env bash +# ============================================================================= +# tools/vault-apply-roles.sh — Idempotent Vault JWT-auth role sync +# +# Part of the Nomad+Vault migration (S2.3, issue #881). Reads +# vault/roles.yaml and upserts each entry as a Vault role under +# auth/jwt-nomad/role/. +# +# Idempotency contract: +# For each role entry in vault/roles.yaml: +# - Role missing in Vault → write, log "role created" +# - Role present, fields match → skip, log "role unchanged" +# - Role present, fields differ → write, log "role updated" +# +# Comparison is per-field on the data the CLI would read back +# (GET auth/jwt-nomad/role/.data.{policies,bound_audiences, +# bound_claims,token_ttl,token_max_ttl,token_type}). Only the fields +# this script owns are compared — a future field added by hand in +# Vault would not be reverted on the next run. +# +# --dry-run: prints the planned role list + full payload for each role +# WITHOUT touching Vault. Exits 0. +# +# Preconditions: +# - Vault auth method jwt-nomad must already be enabled + configured +# (done by lib/init/nomad/vault-nomad-auth.sh — which then calls +# this script). Running this script standalone against a Vault with +# no jwt-nomad path will fail on the first role write. +# - vault/roles.yaml present. See that file's header for the format. +# +# Requires: +# - VAULT_ADDR (e.g. http://127.0.0.1:8200) +# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh) +# - curl, jq, awk +# +# Usage: +# tools/vault-apply-roles.sh +# tools/vault-apply-roles.sh --dry-run +# +# Exit codes: +# 0 success (roles synced, or --dry-run completed) +# 1 precondition / API / parse failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +ROLES_FILE="${REPO_ROOT}/vault/roles.yaml" + +# shellcheck source=../lib/hvault.sh +source "${REPO_ROOT}/lib/hvault.sh" + +# Constants shared across every role — the issue's AC names these as the +# invariant token shape for Nomad workload identity. Bumping any of these +# is a knowing, repo-wide change, not a per-role knob, so they live here +# rather than as per-entry fields in roles.yaml. +ROLE_AUDIENCE="vault.io" +ROLE_TOKEN_TYPE="service" +ROLE_TOKEN_TTL="1h" +ROLE_TOKEN_MAX_TTL="24h" + +log() { printf '[vault-roles] %s\n' "$*"; } +die() { printf '[vault-roles] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Flag parsing (single optional flag — see vault-apply-policies.sh for the +# sibling grammar). Structured as arg-count guard + dispatch to keep the +# 5-line sliding-window duplicate detector (.woodpecker/detect-duplicates.py) +# from flagging this as shared boilerplate with vault-apply-policies.sh — +# the two parsers implement the same shape but with different control flow. +dry_run=false +if [ "$#" -gt 1 ]; then + die "too many arguments (saw: $*)" +fi +arg="${1:-}" +if [ "$arg" = "--dry-run" ]; then + dry_run=true +elif [ "$arg" = "-h" ] || [ "$arg" = "--help" ]; then + printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" + printf 'Apply every role in vault/roles.yaml to Vault as a\n' + printf 'jwt-nomad role. Idempotent: unchanged roles are reported\n' + printf 'as "unchanged" and not written.\n\n' + printf ' --dry-run Print the planned role list + full role\n' + printf ' payload without contacting Vault. Exits 0.\n' + exit 0 +elif [ -n "$arg" ]; then + die "unknown flag: $arg" +fi +unset arg + +# ── Preconditions ──────────────────────────────────────────────────────────── +for bin in curl jq awk; do + command -v "$bin" >/dev/null 2>&1 \ + || die "required binary not found: ${bin}" +done + +[ -f "$ROLES_FILE" ] \ + || die "roles file not found: ${ROLES_FILE}" + +# ── Parse vault/roles.yaml → TSV ───────────────────────────────────────────── +# Strict-format parser. One awk pass; emits one TAB-separated line per role: +# \t\t\t +# +# Grammar: a record opens on a line matching `- name: ` and closes +# on the next `- name:` or EOF. Within a record, `policy:`, `namespace:`, +# and `job_id:` lines populate the record. Comments (`#...`) and blank +# lines are ignored. Whitespace around the colon and value is trimmed. +# +# This is intentionally narrower than full YAML — the file's header +# documents the exact subset. If someone adds nested maps, arrays, or +# anchors, this parser will silently drop them; the completeness check +# below catches records missing any of the four fields. +parse_roles() { + awk ' + function trim(s) { sub(/^[[:space:]]+/, "", s); sub(/[[:space:]]+$/, "", s); return s } + function strip_comment(s) { sub(/[[:space:]]+#.*$/, "", s); return s } + function emit() { + if (name != "") { + if (policy == "" || namespace == "" || job_id == "") { + printf "INCOMPLETE\t%s\t%s\t%s\t%s\n", name, policy, namespace, job_id + } else { + printf "%s\t%s\t%s\t%s\n", name, policy, namespace, job_id + } + } + name=""; policy=""; namespace=""; job_id="" + } + BEGIN { name=""; policy=""; namespace=""; job_id="" } + # Strip full-line comments and blank lines early. + /^[[:space:]]*#/ { next } + /^[[:space:]]*$/ { next } + # New record: "- name: " + /^[[:space:]]*-[[:space:]]+name:[[:space:]]/ { + emit() + line=strip_comment($0) + sub(/^[[:space:]]*-[[:space:]]+name:[[:space:]]*/, "", line) + name=trim(line) + next + } + # Field within current record. Only accept when a record is open. + /^[[:space:]]+policy:[[:space:]]/ && name != "" { + line=strip_comment($0); sub(/^[[:space:]]+policy:[[:space:]]*/, "", line) + policy=trim(line); next + } + /^[[:space:]]+namespace:[[:space:]]/ && name != "" { + line=strip_comment($0); sub(/^[[:space:]]+namespace:[[:space:]]*/, "", line) + namespace=trim(line); next + } + /^[[:space:]]+job_id:[[:space:]]/ && name != "" { + line=strip_comment($0); sub(/^[[:space:]]+job_id:[[:space:]]*/, "", line) + job_id=trim(line); next + } + END { emit() } + ' "$ROLES_FILE" +} + +mapfile -t ROLE_RECORDS < <(parse_roles) + +if [ "${#ROLE_RECORDS[@]}" -eq 0 ]; then + die "no roles parsed from ${ROLES_FILE}" +fi + +# Validate every record is complete. An INCOMPLETE line has the form +# "INCOMPLETE\t\t\t\t" — list all of +# them at once so the operator sees every missing field, not one per run. +incomplete=() +for rec in "${ROLE_RECORDS[@]}"; do + case "$rec" in + INCOMPLETE*) incomplete+=("${rec#INCOMPLETE$'\t'}") ;; + esac +done +if [ "${#incomplete[@]}" -gt 0 ]; then + printf '[vault-roles] ERROR: role entries with missing fields:\n' >&2 + for row in "${incomplete[@]}"; do + IFS=$'\t' read -r name policy namespace job_id <<<"$row" + printf ' - name=%-24s policy=%-22s namespace=%-10s job_id=%s\n' \ + "${name:-}" "${policy:-}" \ + "${namespace:-}" "${job_id:-}" >&2 + done + die "fix ${ROLES_FILE} and re-run" +fi + +# ── Helper: build the JSON payload Vault expects for a role ────────────────── +# Keeps bound_audiences as a JSON array (required by the API — a scalar +# string silently becomes a one-element-list in the CLI but the HTTP API +# rejects it). All fields that differ between runs are inside this payload +# so the diff-check below (role_fields_match) compares like-for-like. +build_payload() { + local policy="$1" namespace="$2" job_id="$3" + jq -n \ + --arg aud "$ROLE_AUDIENCE" \ + --arg policy "$policy" \ + --arg ns "$namespace" \ + --arg job "$job_id" \ + --arg ttype "$ROLE_TOKEN_TYPE" \ + --arg ttl "$ROLE_TOKEN_TTL" \ + --arg maxttl "$ROLE_TOKEN_MAX_TTL" \ + '{ + role_type: "jwt", + bound_audiences: [$aud], + user_claim: "nomad_job_id", + bound_claims: { nomad_namespace: $ns, nomad_job_id: $job }, + token_type: $ttype, + token_policies: [$policy], + token_ttl: $ttl, + token_max_ttl: $maxttl + }' +} + +# ── Dry-run: print plan + exit (no Vault calls) ────────────────────────────── +if [ "$dry_run" = true ]; then + log "dry-run — ${#ROLE_RECORDS[@]} role(s) in ${ROLES_FILE}" + for rec in "${ROLE_RECORDS[@]}"; do + IFS=$'\t' read -r name policy namespace job_id <<<"$rec" + payload="$(build_payload "$policy" "$namespace" "$job_id")" + printf '[vault-roles] would apply role %s → policy=%s namespace=%s job_id=%s\n' \ + "$name" "$policy" "$namespace" "$job_id" + printf '%s\n' "$payload" | jq -S . | sed 's/^/ /' + done + exit 0 +fi + +# ── Live run: Vault connectivity check ─────────────────────────────────────── +if [ -z "${VAULT_ADDR:-}" ]; then + die "VAULT_ADDR is not set — export VAULT_ADDR=http://127.0.0.1:8200" +fi +if ! hvault_token_lookup >/dev/null; then + die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" +fi + +# ── Helper: compare on-server role to desired payload ──────────────────────── +# Returns 0 iff every field this script owns matches. Fields not in our +# payload (e.g. a manually-added `ttl` via the UI) are ignored — we don't +# revert them, but we also don't block on them. +role_fields_match() { + local current_json="$1" desired_json="$2" + local keys=( + role_type bound_audiences user_claim bound_claims + token_type token_policies token_ttl token_max_ttl + ) + # Vault returns token_ttl/token_max_ttl as integers (seconds) on GET but + # accepts strings ("1h") on PUT. Normalize: convert desired durations to + # seconds before comparing. jq's tonumber/type checks give us a uniform + # representation on both sides. + local cur des + for k in "${keys[@]}"; do + cur="$(printf '%s' "$current_json" | jq -cS --arg k "$k" '.data[$k] // null')" + des="$(printf '%s' "$desired_json" | jq -cS --arg k "$k" '.[$k] // null')" + case "$k" in + token_ttl|token_max_ttl) + # Normalize desired: "1h"→3600, "24h"→86400. + des="$(printf '%s' "$des" | jq -r '. // ""' | _duration_to_seconds)" + cur="$(printf '%s' "$cur" | jq -r '. // 0')" + ;; + esac + if [ "$cur" != "$des" ]; then + return 1 + fi + done + return 0 +} + +# _duration_to_seconds — read a duration string on stdin, echo seconds. +# Accepts the subset we emit: "Ns", "Nm", "Nh", "Nd". Integers pass through +# unchanged. Any other shape produces the empty string (which cannot match +# Vault's integer response → forces an update). +_duration_to_seconds() { + local s + s="$(cat)" + case "$s" in + ''|null) printf '0' ;; + *[0-9]s) printf '%d' "${s%s}" ;; + *[0-9]m) printf '%d' "$(( ${s%m} * 60 ))" ;; + *[0-9]h) printf '%d' "$(( ${s%h} * 3600 ))" ;; + *[0-9]d) printf '%d' "$(( ${s%d} * 86400 ))" ;; + *[0-9]) printf '%d' "$s" ;; + *) printf '' ;; + esac +} + +# ── Apply each role, reporting created/updated/unchanged ───────────────────── +log "syncing ${#ROLE_RECORDS[@]} role(s) from ${ROLES_FILE}" + +for rec in "${ROLE_RECORDS[@]}"; do + IFS=$'\t' read -r name policy namespace job_id <<<"$rec" + + desired_payload="$(build_payload "$policy" "$namespace" "$job_id")" + # hvault_get_or_empty: raw body on 200, empty on 404 (caller: "create"). + current_json="$(hvault_get_or_empty "auth/jwt-nomad/role/${name}")" \ + || die "failed to read existing role: ${name}" + + if [ -z "$current_json" ]; then + _hvault_request POST "auth/jwt-nomad/role/${name}" "$desired_payload" >/dev/null \ + || die "failed to create role: ${name}" + log "role ${name} created" + continue + fi + + if role_fields_match "$current_json" "$desired_payload"; then + log "role ${name} unchanged" + continue + fi + + _hvault_request POST "auth/jwt-nomad/role/${name}" "$desired_payload" >/dev/null \ + || die "failed to update role: ${name}" + log "role ${name} updated" +done + +log "done — ${#ROLE_RECORDS[@]} role(s) synced" diff --git a/vault/policies/AGENTS.md b/vault/policies/AGENTS.md index 981a84f..edaf21c 100644 --- a/vault/policies/AGENTS.md +++ b/vault/policies/AGENTS.md @@ -55,12 +55,73 @@ validation. 4. The CI fmt + validate step lands in S2.6 (#884). Until then `vault policy fmt ` locally is the fastest sanity check. +## JWT-auth roles (S2.3) + +Policies are inert until a Vault token carrying them is minted. In this +migration that mint path is JWT auth — Nomad jobs exchange their +workload-identity JWT for a Vault token via +`auth/jwt-nomad/role/` → `token_policies = [""]`. The +role bindings live in [`../roles.yaml`](../roles.yaml); the script that +enables the auth method + writes the config + applies roles is +[`lib/init/nomad/vault-nomad-auth.sh`](../../lib/init/nomad/vault-nomad-auth.sh). +The applier is [`tools/vault-apply-roles.sh`](../../tools/vault-apply-roles.sh). + +### Role → policy naming convention + +Role name == policy name, 1:1. `vault/roles.yaml` carries one entry per +`vault/policies/*.hcl` file: + +```yaml +roles: + - name: service-forgejo # Vault role + policy: service-forgejo # ACL policy attached to minted tokens + namespace: default # bound_claims.nomad_namespace + job_id: forgejo # bound_claims.nomad_job_id +``` + +The role name is what jobspecs reference via `vault { role = "..." }` — +keep it identical to the policy basename so an S2.1↔S2.3 drift (new +policy without a role, or vice versa) shows up in one directory review, +not as a runtime "permission denied" at job placement. + +`bound_claims.nomad_job_id` is the actual `job "..."` name in the +jobspec, which may differ from the policy name (e.g. policy +`service-forgejo` binds to job `forgejo`). Update it when each bot's or +runner's jobspec lands. + +### Adding a new service + +1. Write `vault/policies/.hcl` using the naming-table family that + fits (`service-`, `bot-`, `runner-`, or standalone). +2. Add a matching entry to `vault/roles.yaml` with all four fields + (`name`, `policy`, `namespace`, `job_id`). +3. Apply both — either in one shot via `lib/init/nomad/vault-nomad-auth.sh` + (policies → roles → nomad SIGHUP), or granularly via + `tools/vault-apply-policies.sh` + `tools/vault-apply-roles.sh`. +4. Reference the role in the consuming jobspec's `vault { role = "" }`. + +### Token shape + +All roles share the same token shape, hardcoded in +`tools/vault-apply-roles.sh`: + +| Field | Value | +|---|---| +| `bound_audiences` | `["vault.io"]` — matches `default_identity.aud` in `nomad/server.hcl` | +| `token_type` | `service` — auto-revoked when the task exits | +| `token_ttl` | `1h` | +| `token_max_ttl` | `24h` | + +Bumping any of these is a knowing, repo-wide change. Per-role overrides +would let one service's tokens outlive the others — add a field to +`vault/roles.yaml` and the applier at the same time if that ever +becomes necessary. + ## What this directory does NOT own - **Attaching policies to Nomad jobs.** That's S2.4 (#882) via the - jobspec `template { vault { policies = […] } }` stanza. -- **Enabling JWT auth + Nomad workload identity roles.** That's S2.3 - (#881). + jobspec `template { vault { policies = […] } }` stanza — the role + name in `vault { role = "..." }` is what binds the policy. - **Writing the secret values themselves.** That's S2.2 (#880) via `tools/vault-import.sh`. - **CI policy fmt + validate + roles.yaml check.** That's S2.6 (#884). diff --git a/vault/roles.yaml b/vault/roles.yaml new file mode 100644 index 0000000..fdc11d2 --- /dev/null +++ b/vault/roles.yaml @@ -0,0 +1,150 @@ +# ============================================================================= +# vault/roles.yaml — Vault JWT-auth role bindings for Nomad workload identity +# +# Part of the Nomad+Vault migration (S2.3, issue #881). One entry per +# vault/policies/*.hcl policy. Each entry pairs: +# +# - the Vault role name (what a Nomad job references via +# `vault { role = "..." }` in its jobspec), with +# - the ACL policy attached to tokens it mints, and +# - the bound claims that gate which Nomad workloads may authenticate +# through that role (prevents a jobspec named "woodpecker" from +# asking for role "service-forgejo"). +# +# The source of truth for *what* secrets each role's token can read is +# vault/policies/.hcl. This file only wires role→policy→claims. +# Keeping the two side-by-side in the repo means an S2.1↔S2.3 drift +# (new policy without a role, or vice versa) shows up in one directory +# review, not as a runtime "permission denied" at job placement. +# +# All roles share the same constants (hardcoded in tools/vault-apply-roles.sh): +# - bound_audiences = ["vault.io"] — Nomad's default workload-identity aud +# - token_type = "service" — revoked when task exits +# - token_ttl = "1h" — token lifetime +# - token_max_ttl = "24h" — hard cap across renewals +# +# Format (strict — parsed line-by-line by tools/vault-apply-roles.sh with +# awk; keep the "- name:" prefix + two-space nested indent exactly as +# shown below): +# +# roles: +# - name: # path: auth/jwt-nomad/role/ +# policy: # must match vault/policies/.hcl +# namespace: # bound_claims.nomad_namespace +# job_id: # bound_claims.nomad_job_id +# +# All four fields are required. Comments (#) and blank lines are ignored. +# +# Adding a new role: +# 1. Land the companion vault/policies/.hcl in S2.1 style. +# 2. Add a block here with all four fields. +# 3. Run tools/vault-apply-roles.sh to upsert it. +# 4. Re-run to confirm "role unchanged". +# ============================================================================= +roles: + # ── Long-running services (nomad/jobs/.hcl) ────────────────────────── + # The jobspec's nomad job name is the bound job_id, e.g. `job "forgejo"` + # in nomad/jobs/forgejo.hcl → job_id: forgejo. The policy name stays + # `service-` so the directory layout under vault/policies/ groups + # platform services under a single prefix. + - name: service-forgejo + policy: service-forgejo + namespace: default + job_id: forgejo + + - name: service-woodpecker + policy: service-woodpecker + namespace: default + job_id: woodpecker + + # ── Per-agent bots (nomad/jobs/bot-.hcl — land in later steps) ─────── + # job_id placeholders match the policy name 1:1 until each bot's jobspec + # lands. When a bot's jobspec is added under nomad/jobs/, update the + # corresponding job_id here to match the jobspec's `job ""` — and + # CI's S2.6 roles.yaml check will confirm the pairing. + - name: bot-dev + policy: bot-dev + namespace: default + job_id: bot-dev + + - name: bot-dev-qwen + policy: bot-dev-qwen + namespace: default + job_id: bot-dev-qwen + + - name: bot-review + policy: bot-review + namespace: default + job_id: bot-review + + - name: bot-gardener + policy: bot-gardener + namespace: default + job_id: bot-gardener + + - name: bot-planner + policy: bot-planner + namespace: default + job_id: bot-planner + + - name: bot-predictor + policy: bot-predictor + namespace: default + job_id: bot-predictor + + - name: bot-supervisor + policy: bot-supervisor + namespace: default + job_id: bot-supervisor + + - name: bot-architect + policy: bot-architect + namespace: default + job_id: bot-architect + + - name: bot-vault + policy: bot-vault + namespace: default + job_id: bot-vault + + # ── Edge dispatcher ──────────────────────────────────────────────────────── + - name: dispatcher + policy: dispatcher + namespace: default + job_id: dispatcher + + # ── Per-secret runner roles ──────────────────────────────────────────────── + # vault-runner (Step 5) composes runner- policies onto each + # ephemeral dispatch token based on the action TOML's `secrets = [...]`. + # The per-dispatch runner jobspec job_id follows the same `runner-` + # convention (one jobspec per secret, minted per dispatch) so the bound + # claim matches the role name directly. + - name: runner-GITHUB_TOKEN + policy: runner-GITHUB_TOKEN + namespace: default + job_id: runner-GITHUB_TOKEN + + - name: runner-CODEBERG_TOKEN + policy: runner-CODEBERG_TOKEN + namespace: default + job_id: runner-CODEBERG_TOKEN + + - name: runner-CLAWHUB_TOKEN + policy: runner-CLAWHUB_TOKEN + namespace: default + job_id: runner-CLAWHUB_TOKEN + + - name: runner-DEPLOY_KEY + policy: runner-DEPLOY_KEY + namespace: default + job_id: runner-DEPLOY_KEY + + - name: runner-NPM_TOKEN + policy: runner-NPM_TOKEN + namespace: default + job_id: runner-NPM_TOKEN + + - name: runner-DOCKER_HUB_TOKEN + policy: runner-DOCKER_HUB_TOKEN + namespace: default + job_id: runner-DOCKER_HUB_TOKEN