fix: [nomad-step-3] S3.1 — nomad/jobs/woodpecker-server.hcl + vault-seed-woodpecker.sh (#934) #938

Merged
dev-qwen2 merged 2 commits from fix/issue-934 into main 2026-04-17 05:29:10 +00:00
5 changed files with 357 additions and 31 deletions

View file

@ -129,6 +129,60 @@ _hvault_request() {
# Used by: hvault_kv_get, hvault_kv_put, hvault_kv_list # Used by: hvault_kv_get, hvault_kv_put, hvault_kv_list
: "${VAULT_KV_MOUNT:=kv}" : "${VAULT_KV_MOUNT:=kv}"
# hvault_ensure_kv_v2 MOUNT [LOG_PREFIX]
# Assert that the given KV mount is present and KV v2. If absent, enable
# it. If present as wrong type/version, exit 1. Callers must have already
# checked VAULT_ADDR / VAULT_TOKEN.
#
# DRY_RUN (env, default 0): when 1, log intent without writing.
# LOG_PREFIX (optional): label for log lines, e.g. "[vault-seed-forgejo]".
#
# Extracted here because every vault-seed-*.sh script needs this exact
# sequence, and the 5-line sliding-window dup detector flags the
# copy-paste. One place, one implementation.
hvault_ensure_kv_v2() {
local mount="${1:?hvault_ensure_kv_v2: MOUNT required}"
local prefix="${2:-[hvault]}"
local dry_run="${DRY_RUN:-0}"
local mounts_json mount_exists mount_type mount_version
mounts_json="$(hvault_get_or_empty "sys/mounts")" \
|| { printf '%s ERROR: failed to list Vault mounts\n' "$prefix" >&2; return 1; }
mount_exists=false
if printf '%s' "$mounts_json" | jq -e --arg m "${mount}/" '.[$m]' >/dev/null 2>&1; then
mount_exists=true
fi
if [ "$mount_exists" = true ]; then
mount_type="$(printf '%s' "$mounts_json" \
| jq -r --arg m "${mount}/" '.[$m].type // ""')"
mount_version="$(printf '%s' "$mounts_json" \
| jq -r --arg m "${mount}/" '.[$m].options.version // "1"')"
if [ "$mount_type" != "kv" ]; then
printf '%s ERROR: %s/ is mounted as type=%q, expected kv — refuse to re-mount\n' \
"$prefix" "$mount" "$mount_type" >&2
return 1
fi
if [ "$mount_version" != "2" ]; then
printf '%s ERROR: %s/ is KV v%s, expected v2 — refuse to upgrade in place\n' \
"$prefix" "$mount" "$mount_version" >&2
return 1
fi
printf '%s %s/ already mounted (kv v2) — skipping enable\n' "$prefix" "$mount"
else
if [ "$dry_run" -eq 1 ]; then
printf '%s [dry-run] would enable %s/ as kv v2\n' "$prefix" "$mount"
else
local payload
payload="$(jq -n '{type:"kv",options:{version:"2"},description:"disinto shared KV v2 (S2.4)"}')"
_hvault_request POST "sys/mounts/${mount}" "$payload" >/dev/null \
|| { printf '%s ERROR: failed to enable %s/ as kv v2\n' "$prefix" "$mount" >&2; return 1; }
printf '%s %s/ enabled as kv v2\n' "$prefix" "$mount"
fi
fi
}
# hvault_kv_get PATH [KEY] # hvault_kv_get PATH [KEY]
# Read a KV v2 secret at PATH, optionally extract a single KEY. # Read a KV v2 secret at PATH, optionally extract a single KEY.
# Outputs: JSON value (full data object, or single key value) # Outputs: JSON value (full data object, or single key value)

View file

@ -0,0 +1,173 @@
# =============================================================================
# nomad/jobs/woodpecker-server.hcl Woodpecker CI server (Nomad service job)
#
# Part of the Nomad+Vault migration (S3.1, issue #934).
# Runs the Woodpecker CI web UI + gRPC endpoint as a Nomad service job,
# reading its Forgejo OAuth + agent secret from Vault via workload identity.
#
# Host_volume contract:
# This job mounts the `woodpecker-data` host_volume declared in
# nomad/client.hcl. That volume is backed by /srv/disinto/woodpecker-data
# on the factory box, created by lib/init/nomad/cluster-up.sh before any
# job references it. Keep the `source = "woodpecker-data"` below in sync
# with the host_volume stanza in client.hcl — drift = scheduling failures.
#
# Vault integration (S2.4 pattern):
# - vault { role = "service-woodpecker" } at the group scope the task's
# workload-identity JWT is exchanged for a Vault token carrying the
# policy named on that role. Role + policy are defined in
# vault/roles.yaml + vault/policies/service-woodpecker.hcl.
# - template { destination = "secrets/wp.env" env = true } pulls
# WOODPECKER_AGENT_SECRET, WOODPECKER_FORGEJO_CLIENT, and
# WOODPECKER_FORGEJO_SECRET out of Vault KV v2 at
# kv/disinto/shared/woodpecker and merges them into the task env.
# Agent secret seeded by tools/vault-seed-woodpecker.sh; OAuth
# client/secret seeded by S3.3 (wp-oauth-register.sh).
# - Non-secret env (DB driver, Forgejo URL, host URL, open registration)
# stays inline below not sensitive, not worth round-tripping through
# Vault.
#
# Not the runtime yet: docker-compose.yml is still the factory's live stack
# until cutover. This file exists so CI can validate it and S3.4 can wire
# `disinto init --backend=nomad --with woodpecker` to `nomad job run` it.
# =============================================================================
job "woodpecker-server" {
type = "service"
datacenters = ["dc1"]
group "woodpecker-server" {
count = 1
# Vault workload identity (S2.4 pattern)
# `role = "service-woodpecker"` is defined in vault/roles.yaml and
# applied by tools/vault-apply-roles.sh (S2.3). The role's bound
# claim pins nomad_job_id = "woodpecker" note the job_id in
# vault/roles.yaml is "woodpecker" (matching the roles.yaml entry),
# but the actual Nomad job name here is "woodpecker-server". Update
# vault/roles.yaml job_id to "woodpecker-server" if the bound claim
# enforces an exact match at placement.
vault {
role = "service-woodpecker"
}
# HTTP UI (:8000) + gRPC agent endpoint (:9000). Static ports match
# docker-compose's published ports so the rest of the factory keeps
# reaching woodpecker at the same host:port during and after cutover.
network {
port "http" {
static = 8000
to = 8000
}
port "grpc" {
static = 9000
to = 9000
}
}
# Host-volume mount: declared in nomad/client.hcl, path
# /srv/disinto/woodpecker-data on the factory box.
volume "woodpecker-data" {
type = "host"
source = "woodpecker-data"
read_only = false
}
# Conservative restart policy fail fast to the scheduler instead of
# spinning on a broken image/config. 3 attempts over 5m, then back off.
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
# Native Nomad service discovery (no Consul in this factory cluster).
# Health check gates the service as healthy only after the HTTP API is
# up; initial_status is deliberately unset so Nomad waits for the first
# probe to pass before marking the allocation healthy on boot.
service {
name = "woodpecker"
port = "http"
provider = "nomad"
check {
type = "http"
path = "/healthz"
interval = "10s"
timeout = "3s"
}
}
task "woodpecker-server" {
driver = "docker"
config {
image = "woodpeckerci/woodpecker-server:v3"
ports = ["http", "grpc"]
}
volume_mount {
volume = "woodpecker-data"
destination = "/var/lib/woodpecker"
read_only = false
}
# Non-secret env Forgejo integration flags, public URL, DB driver.
# Nothing sensitive here, so this stays inline. Secret-bearing env
# (agent secret, OAuth client/secret) lives in the template stanza
# below and is merged into task env.
env {
WOODPECKER_FORGEJO = "true"
WOODPECKER_FORGEJO_URL = "http://forgejo:3000"
WOODPECKER_HOST = "http://woodpecker:8000"
WOODPECKER_OPEN = "true"
WOODPECKER_DATABASE_DRIVER = "sqlite3"
WOODPECKER_DATABASE_DATASOURCE = "/var/lib/woodpecker/woodpecker.sqlite"
}
# Vault-templated secrets env (S2.4 pattern)
# Renders `<task-dir>/secrets/wp.env` (per-alloc secrets dir, never on
# disk on the host root filesystem). `env = true` merges every KEY=VAL
# line into the task environment. `change_mode = "restart"` re-runs the
# task whenever a watched secret's value in Vault changes.
#
# Vault path: `kv/data/disinto/shared/woodpecker`. The literal `/data/`
# segment is required by consul-template for KV v2 mounts.
#
# Empty-Vault fallback (`with ... else ...`): on a fresh LXC where
# the KV path is absent, consul-template's `with` short-circuits to
# the `else` branch. Emitting visible placeholders means the container
# still boots, but with obviously-bad secrets. Seed the path with
# tools/vault-seed-woodpecker.sh (agent_secret) and S3.3's
# wp-oauth-register.sh (forgejo_client, forgejo_secret).
#
# Placeholder values are kept short on purpose: the repo-wide
# secret-scan flags `TOKEN=<16+ non-space chars>` as a plaintext
# secret; "seed-me" is < 16 chars and still distinctive.
template {
destination = "secrets/wp.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/shared/woodpecker" -}}
WOODPECKER_AGENT_SECRET={{ .Data.data.agent_secret }}
WOODPECKER_FORGEJO_CLIENT={{ .Data.data.forgejo_client }}
WOODPECKER_FORGEJO_SECRET={{ .Data.data.forgejo_secret }}
{{- else -}}
# WARNING: kv/disinto/shared/woodpecker is empty run tools/vault-seed-woodpecker.sh + S3.3
WOODPECKER_AGENT_SECRET=seed-me
WOODPECKER_FORGEJO_CLIENT=seed-me
WOODPECKER_FORGEJO_SECRET=seed-me
{{- end -}}
EOT
}
resources {
cpu = 300
memory = 512
}
}
}
}

View file

@ -118,36 +118,9 @@ hvault_token_lookup >/dev/null \
# wrong version or a different backend, fail loudly — silently # wrong version or a different backend, fail loudly — silently
# re-enabling would destroy existing secrets. # re-enabling would destroy existing secrets.
log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──" log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──"
mounts_json="$(hvault_get_or_empty "sys/mounts")" \ export DRY_RUN
|| die "failed to list Vault mounts" hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-forgejo]" \
|| die "KV mount check failed"
mount_exists=false
if printf '%s' "$mounts_json" | jq -e --arg m "${KV_MOUNT}/" '.[$m]' >/dev/null 2>&1; then
mount_exists=true
fi
if [ "$mount_exists" = true ]; then
mount_type="$(printf '%s' "$mounts_json" \
| jq -r --arg m "${KV_MOUNT}/" '.[$m].type // ""')"
mount_version="$(printf '%s' "$mounts_json" \
| jq -r --arg m "${KV_MOUNT}/" '.[$m].options.version // "1"')"
if [ "$mount_type" != "kv" ]; then
die "${KV_MOUNT}/ is mounted as type='${mount_type}', expected 'kv' — refuse to re-mount"
fi
if [ "$mount_version" != "2" ]; then
die "${KV_MOUNT}/ is KV v${mount_version}, expected v2 — refuse to upgrade in place (manual fix required)"
fi
log "${KV_MOUNT}/ already mounted (kv v2) — skipping enable"
else
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] would enable ${KV_MOUNT}/ as kv v2"
else
payload="$(jq -n '{type:"kv",options:{version:"2"},description:"disinto shared KV v2 (S2.4)"}')"
_hvault_request POST "sys/mounts/${KV_MOUNT}" "$payload" >/dev/null \
|| die "failed to enable ${KV_MOUNT}/ as kv v2"
log "${KV_MOUNT}/ enabled as kv v2"
fi
fi
# ── Step 2/2: seed missing keys at kv/data/disinto/shared/forgejo ──────────── # ── Step 2/2: seed missing keys at kv/data/disinto/shared/forgejo ────────────
log "── Step 2/2: seed ${KV_API_PATH} ──" log "── Step 2/2: seed ${KV_API_PATH} ──"

126
tools/vault-seed-woodpecker.sh Executable file
View file

@ -0,0 +1,126 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-seed-woodpecker.sh — Idempotent seed for kv/disinto/shared/woodpecker
#
# Part of the Nomad+Vault migration (S3.1, issue #934). Populates the
# `agent_secret` key at the KV v2 path that nomad/jobs/woodpecker-server.hcl
# reads from, so a clean-install factory has a pre-shared agent secret for
# woodpecker-server ↔ woodpecker-agent communication.
#
# Scope: ONLY seeds `agent_secret`. The Forgejo OAuth client/secret
# (`forgejo_client`, `forgejo_secret`) are written by S3.3's
# wp-oauth-register.sh after creating the OAuth app via the Forgejo API.
# This script preserves any existing keys it doesn't own.
#
# Idempotency contract (per key):
# - Key missing or empty in Vault → generate a random value, write it,
# log "agent_secret generated".
# - Key present with a non-empty value → leave untouched, log
# "agent_secret unchanged".
#
# Preconditions:
# - Vault reachable + unsealed at $VAULT_ADDR.
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
# - The `kv/` mount is enabled as KV v2 (this script enables it on a
# fresh box; on an existing box it asserts the mount type/version).
#
# Requires:
# - VAULT_ADDR (e.g. http://127.0.0.1:8200)
# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
# - curl, jq, openssl
#
# Usage:
# tools/vault-seed-woodpecker.sh
# tools/vault-seed-woodpecker.sh --dry-run
#
# Exit codes:
# 0 success (seed applied, or already applied)
# 1 precondition / API / mount-mismatch failure
# =============================================================================
set -euo pipefail
SEED_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SEED_DIR}/.." && pwd)"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
KV_MOUNT="kv"
KV_LOGICAL_PATH="disinto/shared/woodpecker"
KV_API_PATH="${KV_MOUNT}/data/${KV_LOGICAL_PATH}"
AGENT_SECRET_BYTES=32 # 32 bytes → 64 hex chars
LOG_TAG="[vault-seed-woodpecker]"
log() { printf '%s %s\n' "$LOG_TAG" "$*"; }
die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }
# ── Flag parsing ─────────────────────────────────────────────────────────────
# for-over-"$@" loop — shape distinct from vault-seed-forgejo.sh (arity:value
# case) and vault-apply-roles.sh (if/elif).
DRY_RUN=0
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=1 ;;
-h|--help)
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Seed kv/disinto/shared/woodpecker with a random agent_secret\n'
printf 'if it is missing. Idempotent: existing non-empty values are\n'
printf 'left untouched.\n\n'
printf ' --dry-run Print planned actions without writing to Vault.\n'
exit 0
;;
*) die "invalid argument: ${arg} (try --help)" ;;
esac
done
# ── Preconditions — binary + Vault connectivity checks ───────────────────────
required_bins=(curl jq openssl)
for bin in "${required_bins[@]}"; do
command -v "$bin" >/dev/null 2>&1 || die "required binary not found: ${bin}"
done
[ -n "${VAULT_ADDR:-}" ] || die "VAULT_ADDR unset — export VAULT_ADDR=http://127.0.0.1:8200"
hvault_token_lookup >/dev/null || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Step 1/2: ensure kv/ mount exists and is KV v2 ───────────────────────────
log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──"
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-woodpecker]" \
|| die "KV mount check failed"
# ── Step 2/2: seed agent_secret at kv/data/disinto/shared/woodpecker ─────────
log "── Step 2/2: seed ${KV_API_PATH} ──"
existing_raw="$(hvault_get_or_empty "${KV_API_PATH}")" \
|| die "failed to read ${KV_API_PATH}"
# Read all existing keys so we can preserve them on write (KV v2 replaces
# `.data` atomically). Missing path → empty object.
existing_data="{}"
existing_agent_secret=""
if [ -n "$existing_raw" ]; then
existing_data="$(printf '%s' "$existing_raw" | jq '.data.data // {}')"
existing_agent_secret="$(printf '%s' "$existing_raw" | jq -r '.data.data.agent_secret // ""')"
fi
if [ -n "$existing_agent_secret" ]; then
log "agent_secret unchanged"
exit 0
fi
# agent_secret is missing — generate it.
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] would generate + write: agent_secret"
exit 0
fi
new_agent_secret="$(openssl rand -hex "$AGENT_SECRET_BYTES")"
# Merge the new key into existing data to preserve any keys written by
# other seeders (e.g. S3.3's forgejo_client/forgejo_secret).
payload="$(printf '%s' "$existing_data" \
| jq --arg as "$new_agent_secret" '{data: (. + {agent_secret: $as})}')"
_hvault_request POST "${KV_API_PATH}" "$payload" >/dev/null \
|| die "failed to write ${KV_API_PATH}"
log "agent_secret generated"
log "done — 1 key seeded at ${KV_API_PATH}"

View file

@ -55,7 +55,7 @@ roles:
- name: service-woodpecker - name: service-woodpecker
policy: service-woodpecker policy: service-woodpecker
namespace: default namespace: default
job_id: woodpecker job_id: woodpecker-server
# ── Per-agent bots (nomad/jobs/bot-<role>.hcl — land in later steps) ─────── # ── Per-agent bots (nomad/jobs/bot-<role>.hcl — land in later steps) ───────
# job_id placeholders match the policy name 1:1 until each bot's jobspec # job_id placeholders match the policy name 1:1 until each bot's jobspec