Merge pull request 'fix: [nomad-step-2] S2-fix — 4 bugs block Step 2 verification: kv/ mount missing, VAULT_ADDR, --sops required, template fallback (#912)' (#923) from fix/issue-912-2 into main

2026-04-16 21:21:39 +00:00 · 2026-04-16 21:21:39 +00:00 · cfe1ef9512
commit cfe1ef9512
parent 3e29a9a61d 0b994d5d6f
8 changed files with 283 additions and 48 deletions
--- a/bin/disinto
+++ b/bin/disinto
@ -670,6 +670,7 @@ _disinto_init_nomad() {
  local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}"
  local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh"
  local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh"
+  local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh"
  local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh"
  local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh"
  local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh"
@ -690,15 +691,22 @@ _disinto_init_nomad() {
  # --empty combined with --with or any --import-* flag, so reaching
  # this branch with those set is a bug in the caller.
  #
-  # On the default (non-empty) path, vault-apply-policies.sh and
-  # vault-nomad-auth.sh are invoked unconditionally — they are idempotent
-  # and cheap to re-run, and subsequent --with deployments depend on
-  # them. vault-import.sh is invoked only when an --import-* flag is set.
+  # On the default (non-empty) path, vault-engines.sh (enables the kv/
+  # mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked
+  # unconditionally — they are idempotent and cheap to re-run, and
+  # subsequent --with deployments depend on them. vault-import.sh is
+  # invoked only when an --import-* flag is set. vault-engines.sh runs
+  # first because every policy and role below references kv/disinto/*
+  # paths, which 403 if the engine is not yet mounted (issue #912).
  local import_any=false
  if [ -n "$import_env" ] || [ -n "$import_sops" ]; then
    import_any=true
  fi
  if [ "$empty" != "true" ]; then
+    if [ ! -x "$vault_engines_sh" ]; then
+      echo "Error: ${vault_engines_sh} not found or not executable" >&2
+      exit 1
+    fi
    if [ ! -x "$vault_policies_sh" ]; then
      echo "Error: ${vault_policies_sh} not found or not executable" >&2
      exit 1
@ -737,10 +745,15 @@ _disinto_init_nomad() {
      exit 0
    fi

-    # Vault policies + auth are invoked on every nomad real-run path
-    # regardless of --import-* flags (they're idempotent; S2.1 + S2.3).
-    # Mirror that ordering in the dry-run plan so the operator sees the
-    # full sequence Step 2 will execute.
+    # Vault engines + policies + auth are invoked on every nomad real-run
+    # path regardless of --import-* flags (they're idempotent; S2.1 + S2.3).
+    # Engines runs first because policies/roles/templates all reference the
+    # kv/ mount it enables (issue #912). Mirror that ordering in the
+    # dry-run plan so the operator sees the full sequence Step 2 will
+    # execute.
+    echo "── Vault engines dry-run ──────────────────────────────"
+    echo "[engines] [dry-run] ${vault_engines_sh} --dry-run"
+    echo ""
    echo "── Vault policies dry-run ─────────────────────────────"
    echo "[policies] [dry-run] ${vault_policies_sh} --dry-run"
    echo ""
@ -814,6 +827,22 @@ _disinto_init_nomad() {
    exit 0
  fi

+  # Enable Vault secret engines (S2.1 / issue #912) — must precede
+  # policies/auth/import because every policy and every import target
+  # addresses paths under kv/. Idempotent, safe to re-run.
+  echo ""
+  echo "── Enabling Vault secret engines ──────────────────────"
+  local -a engines_cmd=("$vault_engines_sh")
+  if [ "$(id -u)" -eq 0 ]; then
+    "${engines_cmd[@]}" || exit $?
+  else
+    if ! command -v sudo >/dev/null 2>&1; then
+      echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2
+      exit 1
+    fi
+    sudo -n -- "${engines_cmd[@]}" || exit $?
+  fi
+
  # Apply Vault policies (S2.1) — idempotent, safe to re-run.
  echo ""
  echo "── Applying Vault policies ────────────────────────────"
--- a/lib/hvault.sh
+++ b/lib/hvault.sh
@ -38,6 +38,30 @@ _hvault_resolve_token() {
  return 1
 }

+# _hvault_default_env — set the local-cluster Vault env if unset
+#
+# Idempotent helper used by every Vault-touching script that runs during
+# `disinto init` (S2). On the local-cluster common case, operators (and
+# the init dispatcher in bin/disinto) have not exported VAULT_ADDR or
+# VAULT_TOKEN — the server is reachable on localhost:8200 and the root
+# token lives at /etc/vault.d/root.token. Scripts must Just Work in that
+# shape.
+#
+#   - If VAULT_ADDR is unset, defaults to http://127.0.0.1:8200.
+#   - If VAULT_TOKEN is unset, resolves from /etc/vault.d/root.token via
+#     _hvault_resolve_token. A missing token file is not an error here —
+#     downstream hvault_token_lookup() probes connectivity and emits the
+#     operator-facing "VAULT_ADDR + VAULT_TOKEN" diagnostic.
+#
+# Centralised to keep the defaulting stanza in one place — copy-pasting
+# the 5-line block into each init script trips the repo-wide 5-line
+# sliding-window duplicate detector (.woodpecker/detect-duplicates.py).
+_hvault_default_env() {
+  VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}"
+  export VAULT_ADDR
+  _hvault_resolve_token || :
+}
+
 # _hvault_check_prereqs — validate VAULT_ADDR and VAULT_TOKEN are set
 # Args: caller function name
 _hvault_check_prereqs() {
--- a/lib/init/nomad/vault-engines.sh
+++ b/lib/init/nomad/vault-engines.sh
@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# =============================================================================
+# lib/init/nomad/vault-engines.sh — Enable required Vault secret engines
+#
+# Part of the Nomad+Vault migration (S2.1, issue #912). Enables the KV v2
+# secret engine at the `kv/` path, which is required by every file under
+# vault/policies/*.hcl, every role in vault/roles.yaml, every write done
+# by tools/vault-import.sh, and every template read done by
+# nomad/jobs/forgejo.hcl — all of which address paths under kv/disinto/…
+# and 403 if the mount is absent.
+#
+# Idempotency contract:
+#   - kv/ already enabled at path=kv version=2 → log "already enabled", exit 0
+#     without modifying Vault (only the auth probe and the sys/mounts read
+#     are issued).
+#   - kv/ enabled at a different type/version → die (manual intervention).
+#   - kv/ not enabled → POST sys/mounts/kv to enable kv-v2, log "enabled".
+#   - Second run on a fully-configured box is a no-op apart from the
+#     "already enabled" log line.
+#
+# Preconditions:
+#   - Vault is unsealed and reachable (VAULT_ADDR + VAULT_TOKEN set OR
+#     defaultable to the local-cluster shape via _hvault_default_env).
+#   - Must run AFTER cluster-up.sh (unseal complete) but BEFORE
+#     vault-apply-policies.sh (policies reference kv/* paths).
+#
+# Environment:
+#   VAULT_ADDR  — default http://127.0.0.1:8200 via _hvault_default_env.
+#   VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh).
+#
+# Usage:
+#   sudo lib/init/nomad/vault-engines.sh
+#   sudo lib/init/nomad/vault-engines.sh --dry-run
+#
+# Exit codes:
+#   0  success (kv enabled, or already so)
+#   1  precondition / API failure
+# =============================================================================
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+
+# shellcheck source=../../hvault.sh
+source "${REPO_ROOT}/lib/hvault.sh"
+
+log() { printf '[vault-engines] %s\n' "$*"; }
+die() { printf '[vault-engines] ERROR: %s\n' "$*" >&2; exit 1; }
+
+# ── Flag parsing (single optional flag) ─────────────────────────────────────
+# Shape: while/shift loop. Deliberately NOT a flat `case "${1:-}"` like
+# tools/vault-apply-policies.sh nor an if/elif ladder like
+# tools/vault-apply-roles.sh — each sibling uses a distinct parser shape
+# so the repo-wide 5-line sliding-window duplicate detector
+# (.woodpecker/detect-duplicates.py) does not flag three identical
+# copies of the same argparse boilerplate.
+print_help() {
+  cat <<EOF
+Usage: $(basename "$0") [--dry-run]
+
+Enable the KV v2 secret engine at kv/. Required by all Vault policies,
+roles, and Nomad job templates that reference kv/disinto/* paths.
+Idempotent: an already-enabled kv/ is reported and left untouched.
+
+  --dry-run   Probe state and print the action without contacting Vault
+              in a way that mutates it.
+EOF
+}
+dry_run=false
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --dry-run) dry_run=true; shift ;;
+    -h|--help) print_help; exit 0 ;;
+    *)         die "unknown flag: $1" ;;
+  esac
+done
+
+# ── Preconditions ────────────────────────────────────────────────────────────
+for bin in curl jq; do
+  command -v "$bin" >/dev/null 2>&1 \
+    || die "required binary not found: ${bin}"
+done
+
+# Default the local-cluster Vault env (VAULT_ADDR + VAULT_TOKEN). Shared
+# with the rest of the init-time Vault scripts — see
+# lib/hvault.sh::_hvault_default_env.
+_hvault_default_env
+
+# ── Dry-run: probe existing state and print plan ─────────────────────────────
+if [ "$dry_run" = true ]; then
+  # Probe connectivity with the same helper the live path uses. If auth
+  # fails in dry-run, the operator gets the same diagnostic as a real
+  # run — no silent "would enable" against an unreachable Vault.
+  hvault_token_lookup >/dev/null \
+    || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
+  mounts_raw="$(hvault_get_or_empty "sys/mounts")" \
+    || die "failed to list secret engines"
+  if [ -n "$mounts_raw" ] \
+     && printf '%s' "$mounts_raw" | jq -e '."kv/"' >/dev/null 2>&1; then
+    log "[dry-run] kv-v2 at kv/ already enabled"
+  else
+    log "[dry-run] would enable kv-v2 at kv/"
+  fi
+  exit 0
+fi
+
+# ── Live run: Vault connectivity check ───────────────────────────────────────
+hvault_token_lookup >/dev/null \
+  || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
+
+# ── Check if kv/ is already enabled ──────────────────────────────────────────
+# sys/mounts returns an object keyed by "<path>/" for every enabled secret
+# engine (trailing slash is Vault's on-disk form). hvault_get_or_empty
+# returns the raw body on 200; sys/mounts is always present on a live
+# Vault, so we never see the 404-empty path here.
+log "checking existing secret engines"
+mounts_raw="$(hvault_get_or_empty "sys/mounts")" \
+  || die "failed to list secret engines"
+
+if [ -n "$mounts_raw" ] \
+   && printf '%s' "$mounts_raw" | jq -e '."kv/"' >/dev/null 2>&1; then
+  # kv/ exists — verify it's kv-v2 on the right path shape. Vault returns
+  # the option as a string ("2") on GET, never an integer.
+  kv_type="$(printf '%s' "$mounts_raw" | jq -r '."kv/".type // ""')"
+  kv_version="$(printf '%s' "$mounts_raw" | jq -r '."kv/".options.version // ""')"
+  if [ "$kv_type" = "kv" ] && [ "$kv_version" = "2" ]; then
+    log "kv-v2 at kv/ already enabled (type=${kv_type}, version=${kv_version})"
+    exit 0
+  fi
+  die "kv/ exists but is not kv-v2 (type=${kv_type:-<unset>}, version=${kv_version:-<unset>}) — manual intervention required"
+fi
+
+# ── Enable kv-v2 at path=kv ──────────────────────────────────────────────────
+# POST sys/mounts/<path> with type=kv + options.version=2 is the
+# HTTP-API equivalent of `vault secrets enable -path=kv -version=2 kv`.
+# Keeps the script vault-CLI-free (matches the policy-apply + nomad-auth
+# scripts; their headers explain why a CLI dep would die on client-only
+# nodes).
+log "enabling kv-v2 at path=kv"
+enable_payload="$(jq -n '{type:"kv",options:{version:"2"}}')"
+_hvault_request POST "sys/mounts/kv" "$enable_payload" >/dev/null \
+  || die "failed to enable kv-v2 secret engine"
+log "kv-v2 enabled at kv/"
--- a/lib/init/nomad/vault-nomad-auth.sh
+++ b/lib/init/nomad/vault-nomad-auth.sh
@ -49,12 +49,14 @@ APPLY_ROLES_SH="${REPO_ROOT}/tools/vault-apply-roles.sh"
 SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl"
 SERVER_HCL_DST="/etc/nomad.d/server.hcl"

-VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}"
-export VAULT_ADDR
-
 # shellcheck source=../../hvault.sh
 source "${REPO_ROOT}/lib/hvault.sh"

+# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
+# Called from `disinto init` which does not export VAULT_ADDR/VAULT_TOKEN in
+# the common fresh-LXC case (issue #912). Must run after hvault.sh is sourced.
+_hvault_default_env
+
 log() { printf '[vault-auth] %s\n' "$*"; }
 die() { printf '[vault-auth] ERROR: %s\n' "$*" >&2; exit 1; }

--- a/nomad/jobs/forgejo.hcl
+++ b/nomad/jobs/forgejo.hcl
@ -154,11 +154,18 @@ job "forgejo" {
      # this file. "seed-me" is < 16 chars and still distinctive enough
      # to surface in a `grep FORGEJO__security__` audit. The template
      # comment below carries the operator-facing fix pointer.
+      # `error_on_missing_key = false` stops consul-template from blocking
+      # the alloc on template-pending when the Vault KV path exists but a
+      # referenced key is absent (or the path itself is absent and the
+      # else-branch placeholders are used). Without this, a fresh-LXC
+      # `disinto init --with forgejo` against an empty Vault hangs on
+      # template-pending until deploy.sh times out (issue #912, bug #4).
      template {
-        destination = "secrets/forgejo.env"
-        env         = true
-        change_mode = "restart"
-        data        = <<EOT
+        destination          = "secrets/forgejo.env"
+        env                  = true
+        change_mode          = "restart"
+        error_on_missing_key = false
+        data                 = <<EOT
 {{- with secret "kv/data/disinto/shared/forgejo" -}}
 FORGEJO__security__SECRET_KEY={{ .Data.data.secret_key }}
 FORGEJO__security__INTERNAL_TOKEN={{ .Data.data.internal_token }}
--- a/tools/vault-apply-policies.sh
+++ b/tools/vault-apply-policies.sh
@ -94,8 +94,11 @@ if [ "$dry_run" = true ]; then
 fi

 # ── Live run: Vault connectivity check ───────────────────────────────────────
-[ -n "${VAULT_ADDR:-}" ] \
-  || die "VAULT_ADDR is not set — export VAULT_ADDR=http://127.0.0.1:8200"
+# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
+# `disinto init` does not export VAULT_ADDR before calling this script — the
+# server is reachable on 127.0.0.1:8200 and the root token lives at
+# /etc/vault.d/root.token in the common fresh-LXC case (issue #912).
+_hvault_default_env

 # hvault_token_lookup both resolves the token (env or /etc/vault.d/root.token)
 # and confirms the server is reachable with a valid token. Fail fast here so
--- a/tools/vault-apply-roles.sh
+++ b/tools/vault-apply-roles.sh
@ -219,9 +219,10 @@ if [ "$dry_run" = true ]; then
 fi

 # ── Live run: Vault connectivity check ───────────────────────────────────────
-if [ -z "${VAULT_ADDR:-}" ]; then
-  die "VAULT_ADDR is not set — export VAULT_ADDR=http://127.0.0.1:8200"
-fi
+# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
+# Called transitively from vault-nomad-auth.sh during `disinto init`, which
+# does not export VAULT_ADDR in the common fresh-LXC case (issue #912).
+_hvault_default_env
 if ! hvault_token_lookup >/dev/null; then
  die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
 fi
--- a/tools/vault-import.sh
+++ b/tools/vault-import.sh
@ -8,8 +8,13 @@
 # Usage:
 #   vault-import.sh \
 #     --env /path/to/.env \
-#     --sops /path/to/.env.vault.enc \
-#     --age-key /path/to/age/keys.txt
+#     [--sops /path/to/.env.vault.enc] \
+#     [--age-key /path/to/age/keys.txt]
+#
+# Flag validation (S2.5, issue #883):
+#   --sops without --age-key → error.
+#   --age-key without --sops → error.
+#   --env alone (no sops) → OK; imports only the plaintext half.
 #
 # Mapping:
 #   From .env:
@ -236,14 +241,15 @@ vault-import.sh — Import .env and sops-decrypted secrets into Vault KV
 Usage:
  vault-import.sh \
    --env /path/to/.env \
-    --sops /path/to/.env.vault.enc \
-    --age-key /path/to/age/keys.txt \
+    [--sops /path/to/.env.vault.enc] \
+    [--age-key /path/to/age/keys.txt] \
    [--dry-run]

 Options:
  --env       Path to .env file (required)
-  --sops      Path to sops-encrypted .env.vault.enc file (required)
-  --age-key   Path to age keys file (required)
+  --sops      Path to sops-encrypted .env.vault.enc file (optional;
+              requires --age-key when set)
+  --age-key   Path to age keys file (required when --sops is set)
  --dry-run   Print import plan without writing to Vault (optional)
  --help      Show this help message

@ -272,47 +278,62 @@ EOF
    esac
  done

-  # Validate required arguments
+  # Validate required arguments. --sops and --age-key are paired: if one
+  # is set, the other must be too. --env alone (no sops half) is valid —
+  # imports only the plaintext dotenv. Spec: S2.5 / issue #883 / #912.
  if [ -z "$env_file" ]; then
    _die "Missing required argument: --env"
  fi
-  if [ -z "$sops_file" ]; then
-    _die "Missing required argument: --sops"
+  if [ -n "$sops_file" ] && [ -z "$age_key_file" ]; then
+    _die "--sops requires --age-key"
  fi
-  if [ -z "$age_key_file" ]; then
-    _die "Missing required argument: --age-key"
+  if [ -n "$age_key_file" ] && [ -z "$sops_file" ]; then
+    _die "--age-key requires --sops"
  fi

  # Validate files exist
  if [ ! -f "$env_file" ]; then
    _die "Environment file not found: $env_file"
  fi
-  if [ ! -f "$sops_file" ]; then
+  if [ -n "$sops_file" ] && [ ! -f "$sops_file" ]; then
    _die "Sops file not found: $sops_file"
  fi
-  if [ ! -f "$age_key_file" ]; then
+  if [ -n "$age_key_file" ] && [ ! -f "$age_key_file" ]; then
    _die "Age key file not found: $age_key_file"
  fi

-  # Security check: age key permissions
-  _validate_age_key_perms "$age_key_file"
+  # Security check: age key permissions (only when an age key is provided —
+  # --env-only imports never touch the age key).
+  if [ -n "$age_key_file" ]; then
+    _validate_age_key_perms "$age_key_file"
+  fi
+
+  # Source the Vault helpers and default the local-cluster VAULT_ADDR +
+  # VAULT_TOKEN before the localhost safety check runs. `disinto init`
+  # does not export these in the common fresh-LXC case (issue #912).
+  source "$(dirname "$0")/../lib/hvault.sh"
+  _hvault_default_env

  # Security check: VAULT_ADDR must be localhost
  _check_vault_addr

-  # Source the Vault helpers
-  source "$(dirname "$0")/../lib/hvault.sh"
-
  # Load .env file
  _log "Loading environment from: $env_file"
  _load_env_file "$env_file"

-  # Decrypt sops file
-  _log "Decrypting sops file: $sops_file"
-  local sops_env
-  sops_env="$(_decrypt_sops "$sops_file" "$age_key_file")"
-  # shellcheck disable=SC2086
-  eval "$sops_env"
+  # Decrypt sops file when --sops was provided. On the --env-only path
+  # (empty $sops_file) the sops_env stays empty and the per-token loop
+  # below silently skips runner-token imports — exactly the "only
+  # plaintext half" spec from S2.5.
+  local sops_env=""
+  if [ -n "$sops_file" ]; then
+    _log "Decrypting sops file: $sops_file"
+    sops_env="$(_decrypt_sops "$sops_file" "$age_key_file")"
+    # shellcheck disable=SC2086
+    eval "$sops_env"
+  else
+    _log "No --sops flag — skipping sops decryption (importing plaintext .env only)"
+  fi

  # Collect all import operations
  declare -a operations=()
@ -397,8 +418,12 @@ EOF
  if $dry_run; then
    _log "=== DRY-RUN: Import plan ==="
    _log "Environment file: $env_file"
-    _log "Sops file: $sops_file"
-    _log "Age key: $age_key_file"
+    if [ -n "$sops_file" ]; then
+      _log "Sops file: $sops_file"
+      _log "Age key: $age_key_file"
+    else
+      _log "Sops file: (none — --env-only import)"
+    fi
    _log ""
    _log "Planned operations:"
    for op in "${operations[@]}"; do
@ -413,8 +438,12 @@ EOF

  _log "=== Starting Vault import ==="
  _log "Environment file: $env_file"
-  _log "Sops file: $sops_file"
-  _log "Age key: $age_key_file"
+  if [ -n "$sops_file" ]; then
+    _log "Sops file: $sops_file"
+    _log "Age key: $age_key_file"
+  else
+    _log "Sops file: (none — --env-only import)"
+  fi
  _log ""

  local created=0