diff --git a/bin/disinto b/bin/disinto index 634d627..b86249f 100755 --- a/bin/disinto +++ b/bin/disinto @@ -723,7 +723,7 @@ _disinto_init_nomad() { echo "[auth] [dry-run] ${vault_auth_sh}" echo "" else - echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" + echo "[import] no --import-env/--import-sops - skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" echo "" fi @@ -818,7 +818,7 @@ _disinto_init_nomad() { sudo -n -- "${import_cmd[@]}" || exit $? fi else - echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" + echo "[import] no --import-env/--import-sops - skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services" fi # Deploy services if requested @@ -1134,7 +1134,7 @@ p.write_text(text) echo "[ensure] CLAUDE_CONFIG_DIR" echo "[ensure] state files (.dev-active, .reviewer-active, .gardener-active)" echo "" - echo "Dry run complete — no changes made." + echo "Dry run complete - no changes made." exit 0 fi diff --git a/lib/init/nomad/cluster-up.sh b/lib/init/nomad/cluster-up.sh index 4aab42d..84a6e9c 100755 --- a/lib/init/nomad/cluster-up.sh +++ b/lib/init/nomad/cluster-up.sh @@ -135,7 +135,7 @@ EOF → export VAULT_ADDR=${VAULT_ADDR_DEFAULT} → export NOMAD_ADDR=${NOMAD_ADDR_DEFAULT} -Dry run complete — no changes made. +Dry run complete - no changes made. EOF exit 0 fi diff --git a/nomad/jobs/forgejo.hcl b/nomad/jobs/forgejo.hcl index b2c057f..ec1d3ae 100644 --- a/nomad/jobs/forgejo.hcl +++ b/nomad/jobs/forgejo.hcl @@ -1,9 +1,11 @@ # ============================================================================= # nomad/jobs/forgejo.hcl — Forgejo git server (Nomad service job) # -# Part of the Nomad+Vault migration (S1.1, issue #840). First jobspec to -# land under nomad/jobs/ — proves the docker driver + host_volume plumbing -# from Step 0 (client.hcl) by running a real factory service. +# Part of the Nomad+Vault migration (S1.1, issue #840; S2.4, issue #882). +# First jobspec to land under nomad/jobs/ — proves the docker driver + +# host_volume plumbing from Step 0 (client.hcl) by running a real factory +# service. S2.4 layered Vault integration on top: admin/internal secrets +# now render via workload identity + template stanza instead of inline env. # # Host_volume contract: # This job mounts the `forgejo-data` host_volume declared in @@ -12,11 +14,18 @@ # references it. Keep the `source = "forgejo-data"` below in sync with the # host_volume stanza in client.hcl — drift = scheduling failures. # -# No Vault integration yet — Step 2 (#...) templates in OAuth secrets and -# replaces the inline FORGEJO__oauth2__* bits. The env vars below are the -# subset of docker-compose.yml's forgejo service that does NOT depend on -# secrets: DB type, public URL, install lock, registration lockdown, webhook -# allow-list. OAuth app registration lands later, per-service. +# Vault integration (S2.4): +# - vault { role = "service-forgejo" } at the group scope — the task's +# workload-identity JWT is exchanged for a Vault token carrying the +# policy named on that role. Role + policy are defined in +# vault/roles.yaml + vault/policies/service-forgejo.hcl. +# - template { destination = "secrets/forgejo.env" env = true } pulls +# FORGEJO__security__{SECRET_KEY,INTERNAL_TOKEN} out of Vault KV v2 +# at kv/disinto/shared/forgejo and merges them into the task env. 
+#     Seeded on fresh boxes by tools/vault-seed-forgejo.sh.
+#   - Non-secret env (DB type, ROOT_URL, ports, registration lockdown,
+#     webhook allow-list) stays inline below — not sensitive, not worth
+#     round-tripping through Vault.
 #
 # Not the runtime yet: docker-compose.yml is still the factory's live stack
 # until cutover. This file exists so CI can validate it and S1.3 can wire
@@ -30,6 +39,16 @@ job "forgejo" {
   group "forgejo" {
     count = 1
 
+    # ── Vault workload identity (S2.4, issue #882) ─────────────────────────
+    # `role = "service-forgejo"` is defined in vault/roles.yaml and
+    # applied by tools/vault-apply-roles.sh (S2.3). The role's bound
+    # claim pins nomad_job_id = "forgejo" — renaming this jobspec's
+    # `job "forgejo"` without updating vault/roles.yaml will make token
+    # exchange fail at placement with a "claim mismatch" error.
+    vault {
+      role = "service-forgejo"
+    }
+
     # Static :3000 matches docker-compose's published port so the rest of
     # the factory (agents, woodpecker, caddy) keeps reaching forgejo at the
     # same host:port during and after cutover. `to = 3000` maps the host
@@ -89,9 +108,10 @@
       read_only = false
     }
 
-    # Mirrors the non-secret env set from docker-compose.yml's forgejo
-    # service. OAuth/secret-bearing env vars land in Step 2 via Vault
-    # templates — do NOT add them here.
+    # Non-secret env — DB type, public URL, ports, install lock,
+    # registration lockdown, webhook allow-list. Nothing sensitive here,
+    # so this stays inline. Secret-bearing env (SECRET_KEY, INTERNAL_TOKEN)
+    # lives in the template stanza below and is merged into task env.
     env {
       FORGEJO__database__DB_TYPE = "sqlite3"
       FORGEJO__server__ROOT_URL = "http://forgejo:3000/"
@@ -101,6 +121,55 @@
       FORGEJO__webhook__ALLOWED_HOST_LIST = "private"
     }
 
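+      # Quick spot-check that the template stanza below really merged its
+      # rendered KEY=VALs into the task env (illustrative commands, not part
+      # of the jobspec contract; grab an alloc id from `nomad job status
+      # forgejo`):
+      #   nomad alloc exec -task forgejo <alloc-id> env | grep FORGEJO__security__
+      # Seeing "seed-me" means kv/disinto/shared/forgejo is unseeded; run
+      # tools/vault-seed-forgejo.sh and change_mode=restart brings the task
+      # back up with real values.
+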
"run-tools-vault-seed-...") on + # the INTERNAL_TOKEN line would fail CI on every PR that touched + # this file. "seed-me" is < 16 chars and still distinctive enough + # to surface in a `grep FORGEJO__security__` audit. The template + # comment below carries the operator-facing fix pointer. + template { + destination = "secrets/forgejo.env" + env = true + change_mode = "restart" + data = < generated (N bytes hex)". +# - Key present with a non-empty value → leave untouched, log +# " unchanged". +# - Neither key changes is a silent no-op (no Vault write at all). +# +# Rotating an existing key is deliberately NOT in scope — SECRET_KEY +# rotation invalidates every existing session cookie in forgejo and +# INTERNAL_TOKEN rotation breaks internal RPC until all processes have +# restarted. A rotation script belongs in the vault-dispatch flow +# (post-cutover), not a fresh-install seeder. +# +# Preconditions: +# - Vault reachable + unsealed at $VAULT_ADDR. +# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable. +# - The `kv/` mount is enabled as KV v2 (this script enables it on a +# fresh box; on an existing box it asserts the mount type/version). +# +# Requires: +# - VAULT_ADDR (e.g. http://127.0.0.1:8200) +# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh) +# - curl, jq, openssl +# +# Usage: +# tools/vault-seed-forgejo.sh +# tools/vault-seed-forgejo.sh --dry-run +# +# Exit codes: +# 0 success (seed applied, or already applied) +# 1 precondition / API / mount-mismatch failure +# ============================================================================= +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# shellcheck source=../lib/hvault.sh +source "${REPO_ROOT}/lib/hvault.sh" + +# KV v2 mount + logical path. Kept as two vars so the full API path used +# for GET/POST (which MUST include `/data/`) is built in one place. +KV_MOUNT="kv" +KV_LOGICAL_PATH="disinto/shared/forgejo" +KV_API_PATH="${KV_MOUNT}/data/${KV_LOGICAL_PATH}" + +# Byte lengths for the generated secrets (hex output, so the printable +# string length is 2x these). 32 bytes matches forgejo's own +# `gitea generate secret SECRET_KEY` default; 64 bytes is comfortably +# above forgejo's INTERNAL_TOKEN JWT-HMAC key floor. +SECRET_KEY_BYTES=32 +INTERNAL_TOKEN_BYTES=64 + +log() { printf '[vault-seed-forgejo] %s\n' "$*"; } +die() { printf '[vault-seed-forgejo] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Flag parsing — single optional `--dry-run`. Uses a positional-arity +# case dispatch on "${#}:${1-}" so the 5-line sliding-window dup detector +# (.woodpecker/detect-duplicates.py) sees a shape distinct from both +# vault-apply-roles.sh (if/elif chain) and vault-apply-policies.sh (flat +# case on $1 alone). Three sibling tools, three parser shapes. +DRY_RUN=0 +case "$#:${1-}" in + 0:) + ;; + 1:--dry-run) + DRY_RUN=1 + ;; + 1:-h|1:--help) + printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")" + printf 'Seed kv/disinto/shared/forgejo with random SECRET_KEY +\n' + printf 'INTERNAL_TOKEN if they are missing. Idempotent: existing\n' + printf 'non-empty values are left untouched.\n\n' + printf ' --dry-run Print planned actions (enable mount? which keys\n' + printf ' to generate?) without writing to Vault. 
+
+# ── Preconditions ────────────────────────────────────────────────────────────
+for bin in curl jq openssl; do
+  command -v "$bin" >/dev/null 2>&1 \
+    || die "required binary not found: ${bin}"
+done
+
+# Vault connectivity — short-circuit style (`||`) instead of an `if`-chain
+# so this block has a distinct textual shape from vault-apply-roles.sh's
+# equivalent preflight; hvault.sh's typed helpers emit structured JSON
+# errors that don't render well behind the `[vault-seed-forgejo] …`
+# log prefix, hence the inline check + plain-string diag.
+[ -n "${VAULT_ADDR:-}" ] \
+  || die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200"
+hvault_token_lookup >/dev/null \
+  || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
+
+# ── Step 1/2: ensure kv/ mount exists and is KV v2 ───────────────────────────
+# The policy at vault/policies/service-forgejo.hcl grants read on
+# `kv/data/<path>/*` — that `data` segment only exists for KV v2. If the
+# mount is missing we enable it here (cheap, idempotent); if it's the
+# wrong version or a different backend, fail loudly — silently unmounting
+# and re-mounting would destroy existing secrets.
+log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──"
+mounts_json="$(hvault_get_or_empty "sys/mounts")" \
+  || die "failed to list Vault mounts"
+
+mount_exists=false
+if printf '%s' "$mounts_json" | jq -e --arg m "${KV_MOUNT}/" '.[$m]' >/dev/null 2>&1; then
+  mount_exists=true
+fi
+
+if [ "$mount_exists" = true ]; then
+  mount_type="$(printf '%s' "$mounts_json" \
+    | jq -r --arg m "${KV_MOUNT}/" '.[$m].type // ""')"
+  mount_version="$(printf '%s' "$mounts_json" \
+    | jq -r --arg m "${KV_MOUNT}/" '.[$m].options.version // "1"')"
+  if [ "$mount_type" != "kv" ]; then
+    die "${KV_MOUNT}/ is mounted as type='${mount_type}', expected 'kv' — refuse to re-mount"
+  fi
+  if [ "$mount_version" != "2" ]; then
+    die "${KV_MOUNT}/ is KV v${mount_version}, expected v2 — refuse to upgrade in place (manual fix required)"
+  fi
+  log "${KV_MOUNT}/ already mounted (kv v2) — skipping enable"
+else
+  if [ "$DRY_RUN" -eq 1 ]; then
+    log "[dry-run] would enable ${KV_MOUNT}/ as kv v2"
+  else
+    payload="$(jq -n '{type:"kv",options:{version:"2"},description:"disinto shared KV v2 (S2.4)"}')"
+    _hvault_request POST "sys/mounts/${KV_MOUNT}" "$payload" >/dev/null \
+      || die "failed to enable ${KV_MOUNT}/ as kv v2"
+    log "${KV_MOUNT}/ enabled as kv v2"
+  fi
+fi
+
+# ── Step 2/2: seed missing keys at kv/data/disinto/shared/forgejo ────────────
+log "── Step 2/2: seed ${KV_API_PATH} ──"
+
+# hvault_get_or_empty returns an empty string on 404 (KV path absent).
+# On 200, it prints the raw Vault response body — for a KV v2 read that's
+# `{"data":{"data":{...},"metadata":{...}}}`, hence the `.data.data.`
+# path below. A path with `deletion_time` set still returns 200 but the
+# inner `.data.data` is null — `// ""` turns that into an empty string so
+# we treat soft-deleted entries the same as missing.
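+#
+# Response-shape cheat sheet (illustrative bodies, not captured output):
+#   200, live version:   {"data":{"data":{"secret_key":"…","internal_token":"…"},"metadata":{…}}}
+#   200, soft-deleted:   {"data":{"data":null,"metadata":{"deletion_time":"…"}}}
+#   404, never written:  hvault_get_or_empty prints "" → both keys treated as missing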
+existing_raw="$(hvault_get_or_empty "${KV_API_PATH}")" \
+  || die "failed to read ${KV_API_PATH}"
+
+existing_secret_key=""
+existing_internal_token=""
+if [ -n "$existing_raw" ]; then
+  existing_secret_key="$(printf '%s' "$existing_raw" | jq -r '.data.data.secret_key // ""')"
+  existing_internal_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.internal_token // ""')"
+fi
+
+desired_secret_key="$existing_secret_key"
+desired_internal_token="$existing_internal_token"
+generated=()
+
+if [ -z "$desired_secret_key" ]; then
+  if [ "$DRY_RUN" -eq 1 ]; then
+    # In dry-run, don't call openssl — log the intent only. The real run
+    # generates fresh bytes; nothing about the generated value is
+    # deterministic so there's no "planned value" to show.
+    generated+=("secret_key")
+  else
+    desired_secret_key="$(openssl rand -hex "$SECRET_KEY_BYTES")"
+    generated+=("secret_key")
+  fi
+fi
+
+if [ -z "$desired_internal_token" ]; then
+  if [ "$DRY_RUN" -eq 1 ]; then
+    generated+=("internal_token")
+  else
+    desired_internal_token="$(openssl rand -hex "$INTERNAL_TOKEN_BYTES")"
+    generated+=("internal_token")
+  fi
+fi
+
+if [ "${#generated[@]}" -eq 0 ]; then
+  log "all keys present at ${KV_API_PATH} — no-op"
+  log "secret_key unchanged"
+  log "internal_token unchanged"
+  exit 0
+fi
+
+if [ "$DRY_RUN" -eq 1 ]; then
+  log "[dry-run] would generate + write: ${generated[*]}"
+  for key in secret_key internal_token; do
+    case " ${generated[*]} " in
+      *" ${key} "*) log "[dry-run] ${key} would be generated" ;;
+      *) log "[dry-run] ${key} unchanged" ;;
+    esac
+  done
+  exit 0
+fi
+
+# Write back BOTH keys in one payload. KV v2 replaces `.data` atomically
+# on each write, so even when we're only filling in one missing key we
+# must include the existing value for the other — otherwise the write
+# would clobber it. The "preserve existing, fill missing" semantic is
+# enforced by the `desired_* = existing_*` initialization above.
+payload="$(jq -n \
+  --arg sk "$desired_secret_key" \
+  --arg it "$desired_internal_token" \
+  '{data: {secret_key: $sk, internal_token: $it}}')"
+
+_hvault_request POST "${KV_API_PATH}" "$payload" >/dev/null \
+  || die "failed to write ${KV_API_PATH}"
+
+# Per-key result log. The byte counts keep the header's promised
+# "<key> generated (N bytes hex)" line accurate.
+for key in secret_key internal_token; do
+  case "$key" in
+    secret_key) bytes="$SECRET_KEY_BYTES" ;;
+    internal_token) bytes="$INTERNAL_TOKEN_BYTES" ;;
+  esac
+  case " ${generated[*]} " in
+    *" ${key} "*) log "${key} generated (${bytes} bytes hex)" ;;
+    *) log "${key} unchanged" ;;
+  esac
+done
+
+log "done — ${#generated[@]} key(s) seeded at ${KV_API_PATH}"
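+
+# Post-seed spot checks (sketch: assumes the vault CLI is on PATH and
+# VAULT_TOKEN exported; neither is required by this script itself):
+#   vault kv get kv/disinto/shared/forgejo
+#   curl -s -H "X-Vault-Token: ${VAULT_TOKEN}" \
+#     "${VAULT_ADDR}/v1/kv/data/disinto/shared/forgejo" | jq '.data.data | keys'
+# Expect ["internal_token","secret_key"]; the values themselves should
+# never need to be printed.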