diff --git a/lib/AGENTS.md b/lib/AGENTS.md index 8807a69..555d0f7 100644 --- a/lib/AGENTS.md +++ b/lib/AGENTS.md @@ -34,5 +34,5 @@ sourced as needed. | `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses ` ... ` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) | | `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) | | `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) | -| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). 
`hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh` | -| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling. `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | +| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. 
All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh` (Nomad+Vault migration scaffolding, #799) | +| `lib/init/nomad/` | Nomad+Vault Step 0 installer scripts. `cluster-up.sh` — idempotent orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling. `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` | diff --git a/lib/hvault.sh b/lib/hvault.sh index c0e8f23..b1e0d62 100644 --- a/lib/hvault.sh +++ b/lib/hvault.sh @@ -178,51 +178,6 @@ hvault_kv_list() { } } -# hvault_get_or_empty PATH -# GET /v1/PATH. On 200, prints the raw response body to stdout (caller -# parses with jq). On 404, prints nothing and returns 0 — caller treats -# the empty string as "resource absent, needs create". Any other HTTP -# status is a hard error: response body is logged to stderr as a -# structured JSON error and the function returns 1. -# -# Used by the sync scripts (tools/vault-apply-*.sh + -# lib/init/nomad/vault-nomad-auth.sh) to read existing policies, roles, -# auth-method listings, and per-role configs without triggering errexit -# on the expected absent-resource case. `_hvault_request` is not a -# substitute — it treats 404 as a hard error, which is correct for -# writes but wrong for "does this already exist?" checks. 
-# -# Subshell + EXIT trap: the RETURN trap does NOT fire on set-e abort, -# so tmpfile cleanup from a function-scoped RETURN trap would leak on -# jq/curl errors under `set -eo pipefail`. The subshell + EXIT trap -# is the reliable cleanup boundary. -hvault_get_or_empty() { - local path="${1:-}" - - if [ -z "$path" ]; then - _hvault_err "hvault_get_or_empty" "PATH is required" \ - "usage: hvault_get_or_empty PATH" - return 1 - fi - _hvault_check_prereqs "hvault_get_or_empty" || return 1 - - ( - local tmp http_code - tmp="$(mktemp)" - trap 'rm -f "$tmp"' EXIT - http_code="$(curl -sS -o "$tmp" -w '%{http_code}' \ - -H "X-Vault-Token: ${VAULT_TOKEN}" \ - "${VAULT_ADDR}/v1/${path}")" \ - || { _hvault_err "hvault_get_or_empty" "curl failed" "path=$path"; exit 1; } - case "$http_code" in - 2[0-9][0-9]) cat "$tmp" ;; - 404) printf '' ;; - *) _hvault_err "hvault_get_or_empty" "HTTP $http_code" "$(cat "$tmp")" - exit 1 ;; - esac - ) -} - # hvault_policy_apply NAME FILE # Idempotent policy upsert — create or update a Vault policy. hvault_policy_apply() { diff --git a/lib/init/nomad/vault-nomad-auth.sh b/lib/init/nomad/vault-nomad-auth.sh deleted file mode 100755 index 8a75e21..0000000 --- a/lib/init/nomad/vault-nomad-auth.sh +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# lib/init/nomad/vault-nomad-auth.sh — Idempotent Vault JWT auth + Nomad wiring -# -# Part of the Nomad+Vault migration (S2.3, issue #881). Enables Vault's JWT -# auth method at path `jwt-nomad`, points it at Nomad's workload-identity -# JWKS endpoint, writes one role per policy (via tools/vault-apply-roles.sh), -# updates /etc/nomad.d/server.hcl with the vault stanza, and signals nomad -# to reload so jobs can exchange short-lived workload-identity tokens for -# Vault tokens — no shared VAULT_TOKEN in job env. -# -# Steps: -# 1. Enable auth method (sys/auth/jwt-nomad, type=jwt) -# 2. 
Configure JWKS + algs (auth/jwt-nomad/config) -# 3. Upsert roles from vault/roles.yaml (delegates to vault-apply-roles.sh) -# 4. Install /etc/nomad.d/server.hcl from repo + SIGHUP nomad if changed -# -# Idempotency contract: -# - Auth path already enabled → skip create, log "jwt-nomad already enabled". -# - Config identical to desired → skip write, log "jwt-nomad config unchanged". -# - Roles: see tools/vault-apply-roles.sh header for per-role diffing. -# - server.hcl on disk byte-identical to repo copy → skip write, skip SIGHUP. -# - Second run on a fully-configured box is a silent no-op end-to-end. -# -# Preconditions: -# - S0 complete (empty cluster up: nomad + vault reachable, vault unsealed). -# - S2.1 complete: vault/policies/*.hcl applied via tools/vault-apply-policies.sh -# (otherwise the roles we write will reference policies Vault does not -# know about — the write succeeds, but token minting will fail later). -# - Running as root (writes /etc/nomad.d/server.hcl + signals nomad). -# -# Environment: -# VAULT_ADDR — default http://127.0.0.1:8200 (matches nomad/vault.hcl). -# VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh). -# -# Usage: -# sudo lib/init/nomad/vault-nomad-auth.sh -# -# Exit codes: -# 0 success (configured, or already so) -# 1 precondition / API / nomad-reload failure -# ============================================================================= -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" - -APPLY_ROLES_SH="${REPO_ROOT}/tools/vault-apply-roles.sh" -SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl" -SERVER_HCL_DST="/etc/nomad.d/server.hcl" - -VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}" -export VAULT_ADDR - -# shellcheck source=../../hvault.sh -source "${REPO_ROOT}/lib/hvault.sh" - -log() { printf '[vault-auth] %s\n' "$*"; } -die() { printf '[vault-auth] ERROR: %s\n' "$*" >&2; exit 1; } - -# ── Preconditions ──────────────────────────────────────────────────────────── -if [ "$(id -u)" -ne 0 ]; then - die "must run as root (writes ${SERVER_HCL_DST} + signals nomad)" -fi - -# curl + jq are used directly; hvault.sh's helpers are also curl-based, so -# the `vault` CLI is NOT required here — don't add it to this list, or a -# Vault-server-present / vault-CLI-absent box (e.g. a Nomad-client-only -# node) would die spuriously. systemctl is required for SIGHUPing nomad. -for bin in curl jq systemctl; do - command -v "$bin" >/dev/null 2>&1 \ - || die "required binary not found: ${bin}" -done - -[ -f "$SERVER_HCL_SRC" ] \ - || die "source config not found: ${SERVER_HCL_SRC}" -[ -x "$APPLY_ROLES_SH" ] \ - || die "companion script missing or not executable: ${APPLY_ROLES_SH}" - -hvault_token_lookup >/dev/null \ - || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" - -# ── Desired config (Nomad workload-identity JWKS on localhost:4646) ────────── -# Nomad's default workload-identity signer publishes the public JWKS at -# /.well-known/jwks.json on the nomad HTTP API port (4646). Vault validates -# JWTs against it. RS256 is the signer's default algorithm. `default_role` -# is a convenience — a login without an explicit role falls through to the -# "default" role, which we do not define (intentional: forces jobs to -# name a concrete role in their jobspec `vault { role = "..." }`). 
-JWKS_URL="http://127.0.0.1:4646/.well-known/jwks.json" - -# ── Step 1/4: enable auth method jwt-nomad ─────────────────────────────────── -log "── Step 1/4: enable auth method path=jwt-nomad type=jwt ──" -# sys/auth returns an object keyed by "/" for every enabled method. -# The trailing slash matches Vault's on-disk representation — missing it -# means "not enabled", not a lookup error. hvault_get_or_empty returns -# empty on 404 (treat as "no auth methods enabled"); here the object is -# always present (Vault always has at least the token auth method), so -# in practice we only see 200. -auth_list="$(hvault_get_or_empty "sys/auth")" \ - || die "failed to list auth methods" -if printf '%s' "$auth_list" | jq -e '.["jwt-nomad/"]' >/dev/null 2>&1; then - log "auth path jwt-nomad already enabled" -else - enable_payload="$(jq -n '{type:"jwt",description:"Nomad workload identity (S2.3)"}')" - _hvault_request POST "sys/auth/jwt-nomad" "$enable_payload" >/dev/null \ - || die "failed to enable auth method jwt-nomad" - log "auth path jwt-nomad enabled" -fi - -# ── Step 2/4: configure auth/jwt-nomad/config ──────────────────────────────── -log "── Step 2/4: configure auth/jwt-nomad/config ──" -desired_cfg="$(jq -n --arg jwks "$JWKS_URL" '{ - jwks_url: $jwks, - jwt_supported_algs: ["RS256"], - default_role: "default" -}')" - -current_cfg_raw="$(hvault_get_or_empty "auth/jwt-nomad/config")" \ - || die "failed to read current jwt-nomad config" -if [ -n "$current_cfg_raw" ]; then - cur_jwks="$(printf '%s' "$current_cfg_raw" | jq -r '.data.jwks_url // ""')" - cur_algs="$(printf '%s' "$current_cfg_raw" | jq -cS '.data.jwt_supported_algs // []')" - cur_default="$(printf '%s' "$current_cfg_raw" | jq -r '.data.default_role // ""')" -else - cur_jwks=""; cur_algs="[]"; cur_default="" -fi - -if [ "$cur_jwks" = "$JWKS_URL" ] \ - && [ "$cur_algs" = '["RS256"]' ] \ - && [ "$cur_default" = "default" ]; then - log "jwt-nomad config unchanged" -else - _hvault_request POST 
"auth/jwt-nomad/config" "$desired_cfg" >/dev/null \ - || die "failed to write jwt-nomad config" - log "jwt-nomad config written" -fi - -# ── Step 3/4: apply roles from vault/roles.yaml ────────────────────────────── -log "── Step 3/4: apply roles from vault/roles.yaml ──" -# Delegates to tools/vault-apply-roles.sh — one source of truth for the -# parser and per-role idempotency contract. Its header documents the -# created/updated/unchanged wiring. -"$APPLY_ROLES_SH" - -# ── Step 4/4: install server.hcl + SIGHUP nomad if changed ─────────────────── -log "── Step 4/4: install ${SERVER_HCL_DST} + reload nomad if changed ──" -# cluster-up.sh (S0.4) is the normal path for installing server.hcl — but -# this script is run AFTER S0.4, so we also install here. Writing only on -# content-diff keeps re-runs a true no-op (no spurious SIGHUP). `install` -# preserves perms at 0644 root:root on every write. -needs_reload=0 -if [ -f "$SERVER_HCL_DST" ] && cmp -s "$SERVER_HCL_SRC" "$SERVER_HCL_DST"; then - log "unchanged: ${SERVER_HCL_DST}" -else - log "writing: ${SERVER_HCL_DST}" - install -m 0644 -o root -g root "$SERVER_HCL_SRC" "$SERVER_HCL_DST" - needs_reload=1 -fi - -if [ "$needs_reload" -eq 1 ]; then - # SIGHUP triggers Nomad's config reload (see ExecReload in - # lib/init/nomad/systemd-nomad.sh — /bin/kill -HUP $MAINPID). Using - # `systemctl kill -s SIGHUP` instead of `systemctl reload` sends the - # signal even when the unit doesn't declare ExecReload (defensive — - # future unit edits can't silently break this script). - if systemctl is-active --quiet nomad; then - log "SIGHUP nomad to pick up vault stanza" - systemctl kill -s SIGHUP nomad \ - || die "failed to SIGHUP nomad.service" - else - # Fresh box: nomad not started yet. The updated server.hcl will be - # picked up at first start. Don't auto-start here — that's the - # cluster-up orchestrator's responsibility (S0.4). 
- log "nomad.service not active — skipping SIGHUP (next start loads vault stanza)" - fi -else - log "server.hcl unchanged — nomad SIGHUP not needed" -fi - -log "── done — jwt-nomad auth + config + roles + nomad vault stanza in place ──" diff --git a/nomad/server.hcl b/nomad/server.hcl index 98c54f3..27c8b9c 100644 --- a/nomad/server.hcl +++ b/nomad/server.hcl @@ -51,26 +51,3 @@ advertise { ui { enabled = true } - -# ─── Vault integration (S2.3, issue #881) ─────────────────────────────────── -# Nomad jobs exchange their short-lived workload-identity JWT (signed by -# nomad's built-in signer at /.well-known/jwks.json on :4646) for a Vault -# token carrying the policies named by the role in `vault { role = "..." }` -# of each jobspec — no shared VAULT_TOKEN in job env. -# -# The JWT auth path (jwt-nomad) + per-role bindings live on the Vault -# side, written by lib/init/nomad/vault-nomad-auth.sh + tools/vault-apply-roles.sh. -# Roles are defined in vault/roles.yaml. -# -# `default_identity.aud = ["vault.io"]` matches bound_audiences on every -# role in vault/roles.yaml — a drift here would silently break every job's -# Vault token exchange at placement time. -vault { - enabled = true - address = "http://127.0.0.1:8200" - - default_identity { - aud = ["vault.io"] - ttl = "1h" - } -} diff --git a/tools/vault-apply-policies.sh b/tools/vault-apply-policies.sh index 85fc233..222f04f 100755 --- a/tools/vault-apply-policies.sh +++ b/tools/vault-apply-policies.sh @@ -103,6 +103,37 @@ fi hvault_token_lookup >/dev/null \ || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" +# ── Helper: fetch the on-server policy text, or empty if absent ────────────── +# Echoes the current policy content on stdout. A 404 (policy does not exist +# yet) is a non-error — we print nothing and exit 0 so the caller can treat +# the empty string as "needs create". Any other non-2xx is a hard failure. 
+# +# Uses a subshell + EXIT trap (not RETURN) for tmpfile cleanup: the RETURN +# trap does NOT fire on set-e abort, so if jq below tripped errexit the +# tmpfile would leak. Subshell exit propagates via the function's last- +# command exit status. +fetch_current_policy() { + local name="$1" + ( + local tmp http_code + tmp="$(mktemp)" + trap 'rm -f "$tmp"' EXIT + http_code="$(curl -sS -o "$tmp" -w '%{http_code}' \ + -H "X-Vault-Token: ${VAULT_TOKEN}" \ + "${VAULT_ADDR}/v1/sys/policies/acl/${name}")" \ + || { printf '[vault-apply] ERROR: curl failed for policy %s\n' "$name" >&2; exit 1; } + case "$http_code" in + 200) jq -r '.data.policy // ""' < "$tmp" ;; + 404) printf '' ;; # absent — caller treats as "create" + *) + printf '[vault-apply] ERROR: HTTP %s fetching policy %s:\n' "$http_code" "$name" >&2 + cat "$tmp" >&2 + exit 1 + ;; + esac + ) +} + # ── Apply each policy, reporting created/updated/unchanged ─────────────────── log "syncing ${#POLICY_FILES[@]} polic(y|ies) from ${POLICIES_DIR}" @@ -110,17 +141,8 @@ for f in "${POLICY_FILES[@]}"; do name="$(basename "$f" .hcl)" desired="$(cat "$f")" - # hvault_get_or_empty returns the raw JSON body on 200 or empty on 404. - # Extract the .data.policy field here (jq on "" yields "", so the - # empty-string-means-create branch below still works). 
- raw="$(hvault_get_or_empty "sys/policies/acl/${name}")" \ + current="$(fetch_current_policy "$name")" \ || die "failed to read existing policy: ${name}" - if [ -n "$raw" ]; then - current="$(printf '%s' "$raw" | jq -r '.data.policy // ""')" \ - || die "failed to parse policy response: ${name}" - else - current="" - fi if [ -z "$current" ]; then hvault_policy_apply "$name" "$f" \ diff --git a/tools/vault-apply-roles.sh b/tools/vault-apply-roles.sh deleted file mode 100755 index 2f02eb6..0000000 --- a/tools/vault-apply-roles.sh +++ /dev/null @@ -1,307 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# tools/vault-apply-roles.sh — Idempotent Vault JWT-auth role sync -# -# Part of the Nomad+Vault migration (S2.3, issue #881). Reads -# vault/roles.yaml and upserts each entry as a Vault role under -# auth/jwt-nomad/role/. -# -# Idempotency contract: -# For each role entry in vault/roles.yaml: -# - Role missing in Vault → write, log "role created" -# - Role present, fields match → skip, log "role unchanged" -# - Role present, fields differ → write, log "role updated" -# -# Comparison is per-field on the data the CLI would read back -# (GET auth/jwt-nomad/role/.data.{policies,bound_audiences, -# bound_claims,token_ttl,token_max_ttl,token_type}). Only the fields -# this script owns are compared — a future field added by hand in -# Vault would not be reverted on the next run. -# -# --dry-run: prints the planned role list + full payload for each role -# WITHOUT touching Vault. Exits 0. -# -# Preconditions: -# - Vault auth method jwt-nomad must already be enabled + configured -# (done by lib/init/nomad/vault-nomad-auth.sh — which then calls -# this script). Running this script standalone against a Vault with -# no jwt-nomad path will fail on the first role write. -# - vault/roles.yaml present. See that file's header for the format. -# -# Requires: -# - VAULT_ADDR (e.g. 
http://127.0.0.1:8200) -# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh) -# - curl, jq, awk -# -# Usage: -# tools/vault-apply-roles.sh -# tools/vault-apply-roles.sh --dry-run -# -# Exit codes: -# 0 success (roles synced, or --dry-run completed) -# 1 precondition / API / parse failure -# ============================================================================= -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -ROLES_FILE="${REPO_ROOT}/vault/roles.yaml" - -# shellcheck source=../lib/hvault.sh -source "${REPO_ROOT}/lib/hvault.sh" - -# Constants shared across every role — the issue's AC names these as the -# invariant token shape for Nomad workload identity. Bumping any of these -# is a knowing, repo-wide change, not a per-role knob, so they live here -# rather than as per-entry fields in roles.yaml. -ROLE_AUDIENCE="vault.io" -ROLE_TOKEN_TYPE="service" -ROLE_TOKEN_TTL="1h" -ROLE_TOKEN_MAX_TTL="24h" - -log() { printf '[vault-roles] %s\n' "$*"; } -die() { printf '[vault-roles] ERROR: %s\n' "$*" >&2; exit 1; } - -# ── Flag parsing (single optional flag — see vault-apply-policies.sh for the -# sibling grammar). Structured as arg-count guard + dispatch to keep the -# 5-line sliding-window duplicate detector (.woodpecker/detect-duplicates.py) -# from flagging this as shared boilerplate with vault-apply-policies.sh — -# the two parsers implement the same shape but with different control flow. -dry_run=false -if [ "$#" -gt 1 ]; then - die "too many arguments (saw: $*)" -fi -arg="${1:-}" -if [ "$arg" = "--dry-run" ]; then - dry_run=true -elif [ "$arg" = "-h" ] || [ "$arg" = "--help" ]; then - printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" - printf 'Apply every role in vault/roles.yaml to Vault as a\n' - printf 'jwt-nomad role. 
Idempotent: unchanged roles are reported\n' - printf 'as "unchanged" and not written.\n\n' - printf ' --dry-run Print the planned role list + full role\n' - printf ' payload without contacting Vault. Exits 0.\n' - exit 0 -elif [ -n "$arg" ]; then - die "unknown flag: $arg" -fi -unset arg - -# ── Preconditions ──────────────────────────────────────────────────────────── -for bin in curl jq awk; do - command -v "$bin" >/dev/null 2>&1 \ - || die "required binary not found: ${bin}" -done - -[ -f "$ROLES_FILE" ] \ - || die "roles file not found: ${ROLES_FILE}" - -# ── Parse vault/roles.yaml → TSV ───────────────────────────────────────────── -# Strict-format parser. One awk pass; emits one TAB-separated line per role: -# \t\t\t -# -# Grammar: a record opens on a line matching `- name: ` and closes -# on the next `- name:` or EOF. Within a record, `policy:`, `namespace:`, -# and `job_id:` lines populate the record. Comments (`#...`) and blank -# lines are ignored. Whitespace around the colon and value is trimmed. -# -# This is intentionally narrower than full YAML — the file's header -# documents the exact subset. If someone adds nested maps, arrays, or -# anchors, this parser will silently drop them; the completeness check -# below catches records missing any of the four fields. -parse_roles() { - awk ' - function trim(s) { sub(/^[[:space:]]+/, "", s); sub(/[[:space:]]+$/, "", s); return s } - function strip_comment(s) { sub(/[[:space:]]+#.*$/, "", s); return s } - function emit() { - if (name != "") { - if (policy == "" || namespace == "" || job_id == "") { - printf "INCOMPLETE\t%s\t%s\t%s\t%s\n", name, policy, namespace, job_id - } else { - printf "%s\t%s\t%s\t%s\n", name, policy, namespace, job_id - } - } - name=""; policy=""; namespace=""; job_id="" - } - BEGIN { name=""; policy=""; namespace=""; job_id="" } - # Strip full-line comments and blank lines early. 
- /^[[:space:]]*#/ { next } - /^[[:space:]]*$/ { next } - # New record: "- name: " - /^[[:space:]]*-[[:space:]]+name:[[:space:]]/ { - emit() - line=strip_comment($0) - sub(/^[[:space:]]*-[[:space:]]+name:[[:space:]]*/, "", line) - name=trim(line) - next - } - # Field within current record. Only accept when a record is open. - /^[[:space:]]+policy:[[:space:]]/ && name != "" { - line=strip_comment($0); sub(/^[[:space:]]+policy:[[:space:]]*/, "", line) - policy=trim(line); next - } - /^[[:space:]]+namespace:[[:space:]]/ && name != "" { - line=strip_comment($0); sub(/^[[:space:]]+namespace:[[:space:]]*/, "", line) - namespace=trim(line); next - } - /^[[:space:]]+job_id:[[:space:]]/ && name != "" { - line=strip_comment($0); sub(/^[[:space:]]+job_id:[[:space:]]*/, "", line) - job_id=trim(line); next - } - END { emit() } - ' "$ROLES_FILE" -} - -mapfile -t ROLE_RECORDS < <(parse_roles) - -if [ "${#ROLE_RECORDS[@]}" -eq 0 ]; then - die "no roles parsed from ${ROLES_FILE}" -fi - -# Validate every record is complete. An INCOMPLETE line has the form -# "INCOMPLETE\t\t\t\t" — list all of -# them at once so the operator sees every missing field, not one per run. -incomplete=() -for rec in "${ROLE_RECORDS[@]}"; do - case "$rec" in - INCOMPLETE*) incomplete+=("${rec#INCOMPLETE$'\t'}") ;; - esac -done -if [ "${#incomplete[@]}" -gt 0 ]; then - printf '[vault-roles] ERROR: role entries with missing fields:\n' >&2 - for row in "${incomplete[@]}"; do - IFS=$'\t' read -r name policy namespace job_id <<<"$row" - printf ' - name=%-24s policy=%-22s namespace=%-10s job_id=%s\n' \ - "${name:-}" "${policy:-}" \ - "${namespace:-}" "${job_id:-}" >&2 - done - die "fix ${ROLES_FILE} and re-run" -fi - -# ── Helper: build the JSON payload Vault expects for a role ────────────────── -# Keeps bound_audiences as a JSON array (required by the API — a scalar -# string silently becomes a one-element-list in the CLI but the HTTP API -# rejects it). 
All fields that differ between runs are inside this payload -# so the diff-check below (role_fields_match) compares like-for-like. -build_payload() { - local policy="$1" namespace="$2" job_id="$3" - jq -n \ - --arg aud "$ROLE_AUDIENCE" \ - --arg policy "$policy" \ - --arg ns "$namespace" \ - --arg job "$job_id" \ - --arg ttype "$ROLE_TOKEN_TYPE" \ - --arg ttl "$ROLE_TOKEN_TTL" \ - --arg maxttl "$ROLE_TOKEN_MAX_TTL" \ - '{ - role_type: "jwt", - bound_audiences: [$aud], - user_claim: "nomad_job_id", - bound_claims: { nomad_namespace: $ns, nomad_job_id: $job }, - token_type: $ttype, - token_policies: [$policy], - token_ttl: $ttl, - token_max_ttl: $maxttl - }' -} - -# ── Dry-run: print plan + exit (no Vault calls) ────────────────────────────── -if [ "$dry_run" = true ]; then - log "dry-run — ${#ROLE_RECORDS[@]} role(s) in ${ROLES_FILE}" - for rec in "${ROLE_RECORDS[@]}"; do - IFS=$'\t' read -r name policy namespace job_id <<<"$rec" - payload="$(build_payload "$policy" "$namespace" "$job_id")" - printf '[vault-roles] would apply role %s → policy=%s namespace=%s job_id=%s\n' \ - "$name" "$policy" "$namespace" "$job_id" - printf '%s\n' "$payload" | jq -S . | sed 's/^/ /' - done - exit 0 -fi - -# ── Live run: Vault connectivity check ─────────────────────────────────────── -if [ -z "${VAULT_ADDR:-}" ]; then - die "VAULT_ADDR is not set — export VAULT_ADDR=http://127.0.0.1:8200" -fi -if ! hvault_token_lookup >/dev/null; then - die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN" -fi - -# ── Helper: compare on-server role to desired payload ──────────────────────── -# Returns 0 iff every field this script owns matches. Fields not in our -# payload (e.g. a manually-added `ttl` via the UI) are ignored — we don't -# revert them, but we also don't block on them. 
-role_fields_match() { - local current_json="$1" desired_json="$2" - local keys=( - role_type bound_audiences user_claim bound_claims - token_type token_policies token_ttl token_max_ttl - ) - # Vault returns token_ttl/token_max_ttl as integers (seconds) on GET but - # accepts strings ("1h") on PUT. Normalize: convert desired durations to - # seconds before comparing. jq's tonumber/type checks give us a uniform - # representation on both sides. - local cur des - for k in "${keys[@]}"; do - cur="$(printf '%s' "$current_json" | jq -cS --arg k "$k" '.data[$k] // null')" - des="$(printf '%s' "$desired_json" | jq -cS --arg k "$k" '.[$k] // null')" - case "$k" in - token_ttl|token_max_ttl) - # Normalize desired: "1h"→3600, "24h"→86400. - des="$(printf '%s' "$des" | jq -r '. // ""' | _duration_to_seconds)" - cur="$(printf '%s' "$cur" | jq -r '. // 0')" - ;; - esac - if [ "$cur" != "$des" ]; then - return 1 - fi - done - return 0 -} - -# _duration_to_seconds — read a duration string on stdin, echo seconds. -# Accepts the subset we emit: "Ns", "Nm", "Nh", "Nd". Integers pass through -# unchanged. Any other shape produces the empty string (which cannot match -# Vault's integer response → forces an update). -_duration_to_seconds() { - local s - s="$(cat)" - case "$s" in - ''|null) printf '0' ;; - *[0-9]s) printf '%d' "${s%s}" ;; - *[0-9]m) printf '%d' "$(( ${s%m} * 60 ))" ;; - *[0-9]h) printf '%d' "$(( ${s%h} * 3600 ))" ;; - *[0-9]d) printf '%d' "$(( ${s%d} * 86400 ))" ;; - *[0-9]) printf '%d' "$s" ;; - *) printf '' ;; - esac -} - -# ── Apply each role, reporting created/updated/unchanged ───────────────────── -log "syncing ${#ROLE_RECORDS[@]} role(s) from ${ROLES_FILE}" - -for rec in "${ROLE_RECORDS[@]}"; do - IFS=$'\t' read -r name policy namespace job_id <<<"$rec" - - desired_payload="$(build_payload "$policy" "$namespace" "$job_id")" - # hvault_get_or_empty: raw body on 200, empty on 404 (caller: "create"). 
- current_json="$(hvault_get_or_empty "auth/jwt-nomad/role/${name}")" \ - || die "failed to read existing role: ${name}" - - if [ -z "$current_json" ]; then - _hvault_request POST "auth/jwt-nomad/role/${name}" "$desired_payload" >/dev/null \ - || die "failed to create role: ${name}" - log "role ${name} created" - continue - fi - - if role_fields_match "$current_json" "$desired_payload"; then - log "role ${name} unchanged" - continue - fi - - _hvault_request POST "auth/jwt-nomad/role/${name}" "$desired_payload" >/dev/null \ - || die "failed to update role: ${name}" - log "role ${name} updated" -done - -log "done — ${#ROLE_RECORDS[@]} role(s) synced"
diff --git a/vault/policies/AGENTS.md b/vault/policies/AGENTS.md index edaf21c..981a84f 100644 --- a/vault/policies/AGENTS.md +++ b/vault/policies/AGENTS.md @@ -55,73 +55,12 @@ validation. 4. The CI fmt + validate step lands in S2.6 (#884). Until then `vault policy fmt ` locally is the fastest sanity check. -## JWT-auth roles (S2.3) - -Policies are inert until a Vault token carrying them is minted. In this -migration that mint path is JWT auth — Nomad jobs exchange their -workload-identity JWT for a Vault token via -`auth/jwt-nomad/role/` → `token_policies = [""]`. The -role bindings live in [`../roles.yaml`](../roles.yaml); the script that -enables the auth method + writes the config + applies roles is -[`lib/init/nomad/vault-nomad-auth.sh`](../../lib/init/nomad/vault-nomad-auth.sh). -The applier is [`tools/vault-apply-roles.sh`](../../tools/vault-apply-roles.sh). - -### Role → policy naming convention - -Role name == policy name, 1:1. `vault/roles.yaml` carries one entry per -`vault/policies/*.hcl` file: - -```yaml -roles: - - name: service-forgejo # Vault role - policy: service-forgejo # ACL policy attached to minted tokens - namespace: default # bound_claims.nomad_namespace - job_id: forgejo # bound_claims.nomad_job_id -``` - -The role name is what jobspecs reference via `vault { role = "..." }` — -keep it identical to the policy basename so an S2.1↔S2.3 drift (new -policy without a role, or vice versa) shows up in one directory review, -not as a runtime "permission denied" at job placement. - -`bound_claims.nomad_job_id` is the actual `job "..."` name in the -jobspec, which may differ from the policy name (e.g. policy -`service-forgejo` binds to job `forgejo`). Update it when each bot's or -runner's jobspec lands. - -### Adding a new service - -1. Write `vault/policies/.hcl` using the naming-table family that - fits (`service-`, `bot-`, `runner-`, or standalone). -2. 
Add a matching entry to `vault/roles.yaml` with all four fields - (`name`, `policy`, `namespace`, `job_id`). -3. Apply both — either in one shot via `lib/init/nomad/vault-nomad-auth.sh` - (policies → roles → nomad SIGHUP), or granularly via - `tools/vault-apply-policies.sh` + `tools/vault-apply-roles.sh`. -4. Reference the role in the consuming jobspec's `vault { role = "" }`. - -### Token shape - -All roles share the same token shape, hardcoded in -`tools/vault-apply-roles.sh`: - -| Field | Value | -|---|---| -| `bound_audiences` | `["vault.io"]` — matches `default_identity.aud` in `nomad/server.hcl` | -| `token_type` | `service` — auto-revoked when the task exits | -| `token_ttl` | `1h` | -| `token_max_ttl` | `24h` | - -Bumping any of these is a knowing, repo-wide change. Per-role overrides -would let one service's tokens outlive the others — add a field to -`vault/roles.yaml` and the applier at the same time if that ever -becomes necessary. - ## What this directory does NOT own - **Attaching policies to Nomad jobs.** That's S2.4 (#882) via the - jobspec `template { vault { policies = […] } }` stanza — the role - name in `vault { role = "..." }` is what binds the policy. + jobspec `template { vault { policies = […] } }` stanza. +- **Enabling JWT auth + Nomad workload identity roles.** That's S2.3 + (#881). - **Writing the secret values themselves.** That's S2.2 (#880) via `tools/vault-import.sh`. - **CI policy fmt + validate + roles.yaml check.** That's S2.6 (#884). diff --git a/vault/roles.yaml b/vault/roles.yaml deleted file mode 100644 index fdc11d2..0000000 --- a/vault/roles.yaml +++ /dev/null @@ -1,150 +0,0 @@ -# ============================================================================= -# vault/roles.yaml — Vault JWT-auth role bindings for Nomad workload identity -# -# Part of the Nomad+Vault migration (S2.3, issue #881). One entry per -# vault/policies/*.hcl policy. 
Each entry pairs: -# -# - the Vault role name (what a Nomad job references via -# `vault { role = "..." }` in its jobspec), with -# - the ACL policy attached to tokens it mints, and -# - the bound claims that gate which Nomad workloads may authenticate -# through that role (prevents a jobspec named "woodpecker" from -# asking for role "service-forgejo"). -# -# The source of truth for *what* secrets each role's token can read is -# vault/policies/.hcl. This file only wires role→policy→claims. -# Keeping the two side-by-side in the repo means an S2.1↔S2.3 drift -# (new policy without a role, or vice versa) shows up in one directory -# review, not as a runtime "permission denied" at job placement. -# -# All roles share the same constants (hardcoded in tools/vault-apply-roles.sh): -# - bound_audiences = ["vault.io"] — Nomad's default workload-identity aud -# - token_type = "service" — revoked when task exits -# - token_ttl = "1h" — token lifetime -# - token_max_ttl = "24h" — hard cap across renewals -# -# Format (strict — parsed line-by-line by tools/vault-apply-roles.sh with -# awk; keep the "- name:" prefix + two-space nested indent exactly as -# shown below): -# -# roles: -# - name: # path: auth/jwt-nomad/role/ -# policy: # must match vault/policies/.hcl -# namespace: # bound_claims.nomad_namespace -# job_id: # bound_claims.nomad_job_id -# -# All four fields are required. Comments (#) and blank lines are ignored. -# -# Adding a new role: -# 1. Land the companion vault/policies/.hcl in S2.1 style. -# 2. Add a block here with all four fields. -# 3. Run tools/vault-apply-roles.sh to upsert it. -# 4. Re-run to confirm "role unchanged". -# ============================================================================= -roles: - # ── Long-running services (nomad/jobs/.hcl) ────────────────────────── - # The jobspec's nomad job name is the bound job_id, e.g. `job "forgejo"` - # in nomad/jobs/forgejo.hcl → job_id: forgejo. 
The policy name stays - # `service-` so the directory layout under vault/policies/ groups - # platform services under a single prefix. - - name: service-forgejo - policy: service-forgejo - namespace: default - job_id: forgejo - - - name: service-woodpecker - policy: service-woodpecker - namespace: default - job_id: woodpecker - - # ── Per-agent bots (nomad/jobs/bot-.hcl — land in later steps) ─────── - # job_id placeholders match the policy name 1:1 until each bot's jobspec - # lands. When a bot's jobspec is added under nomad/jobs/, update the - # corresponding job_id here to match the jobspec's `job ""` — and - # CI's S2.6 roles.yaml check will confirm the pairing. - - name: bot-dev - policy: bot-dev - namespace: default - job_id: bot-dev - - - name: bot-dev-qwen - policy: bot-dev-qwen - namespace: default - job_id: bot-dev-qwen - - - name: bot-review - policy: bot-review - namespace: default - job_id: bot-review - - - name: bot-gardener - policy: bot-gardener - namespace: default - job_id: bot-gardener - - - name: bot-planner - policy: bot-planner - namespace: default - job_id: bot-planner - - - name: bot-predictor - policy: bot-predictor - namespace: default - job_id: bot-predictor - - - name: bot-supervisor - policy: bot-supervisor - namespace: default - job_id: bot-supervisor - - - name: bot-architect - policy: bot-architect - namespace: default - job_id: bot-architect - - - name: bot-vault - policy: bot-vault - namespace: default - job_id: bot-vault - - # ── Edge dispatcher ──────────────────────────────────────────────────────── - - name: dispatcher - policy: dispatcher - namespace: default - job_id: dispatcher - - # ── Per-secret runner roles ──────────────────────────────────────────────── - # vault-runner (Step 5) composes runner- policies onto each - # ephemeral dispatch token based on the action TOML's `secrets = [...]`. 
- # The per-dispatch runner jobspec job_id follows the same `runner-` - # convention (one jobspec per secret, minted per dispatch) so the bound - # claim matches the role name directly. - - name: runner-GITHUB_TOKEN - policy: runner-GITHUB_TOKEN - namespace: default - job_id: runner-GITHUB_TOKEN - - - name: runner-CODEBERG_TOKEN - policy: runner-CODEBERG_TOKEN - namespace: default - job_id: runner-CODEBERG_TOKEN - - - name: runner-CLAWHUB_TOKEN - policy: runner-CLAWHUB_TOKEN - namespace: default - job_id: runner-CLAWHUB_TOKEN - - - name: runner-DEPLOY_KEY - policy: runner-DEPLOY_KEY - namespace: default - job_id: runner-DEPLOY_KEY - - - name: runner-NPM_TOKEN - policy: runner-NPM_TOKEN - namespace: default - job_id: runner-NPM_TOKEN - - - name: runner-DOCKER_HUB_TOKEN - policy: runner-DOCKER_HUB_TOKEN - namespace: default - job_id: runner-DOCKER_HUB_TOKEN