Merge pull request 'fix: [nomad-step-0] S0.2 — install nomad + systemd unit + nomad/server.hcl/client.hcl (#822)' (#827) from fix/issue-822 into main
All checks were successful
ci/woodpecker/push/ci Pipeline was successful

This commit is contained in:
dev-bot 2026-04-16 06:15:32 +00:00
commit 75bec43c4a
4 changed files with 389 additions and 0 deletions

118
lib/init/nomad/install.sh Executable file
View file

@ -0,0 +1,118 @@
#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad
#
# Part of the Nomad+Vault migration (S0.2, issue #822). Installs the `nomad`
# binary from the HashiCorp apt repository. Does NOT install Vault — S0.3
# owns that. Does NOT configure, start, or enable a systemd unit —
# lib/init/nomad/systemd-nomad.sh owns that. Does NOT wire this script into
# `disinto init` — S0.4 owns that.
#
# Idempotency contract:
#   - Running twice back-to-back is a no-op once the target version is
#     installed and the apt source is in place.
#   - Adds the HashiCorp apt keyring only if it is absent.
#   - Adds the HashiCorp apt sources list only if it is absent.
#   - Skips apt entirely when the installed version already matches
#     ${NOMAD_VERSION} (fast-path exit below).
#
# Configuration:
#   NOMAD_VERSION — pinned Nomad version (default: see below). The apt
#                   package name is versioned as "nomad=<version>-1".
#
# Usage:
#   sudo NOMAD_VERSION=1.9.5 lib/init/nomad/install.sh
#
# Exit codes:
#   0  success (installed or already present)
#   1  precondition failure (not Debian/Ubuntu, missing tools, not root)
# =============================================================================
set -euo pipefail

# Pin to a specific Nomad 1.x release. Bump here, not at call sites.
NOMAD_VERSION="${NOMAD_VERSION:-1.9.5}"

HASHICORP_KEYRING="/usr/share/keyrings/hashicorp-archive-keyring.gpg"
HASHICORP_SOURCES="/etc/apt/sources.list.d/hashicorp.list"
HASHICORP_GPG_URL="https://apt.releases.hashicorp.com/gpg"
HASHICORP_REPO_URL="https://apt.releases.hashicorp.com"

log() { printf '[install-nomad] %s\n' "$*"; }
die() { printf '[install-nomad] ERROR: %s\n' "$*" >&2; exit 1; }

# ── Preconditions ────────────────────────────────────────────────────────────
if [ "$(id -u)" -ne 0 ]; then
  die "must run as root (needs apt-get + /usr/share/keyrings write access)"
fi
for bin in apt-get gpg curl lsb_release; do
  command -v "$bin" >/dev/null 2>&1 \
    || die "required binary not found: ${bin}"
done

CODENAME="$(lsb_release -cs)"
[ -n "$CODENAME" ] || die "lsb_release returned empty codename"

# ── Fast-path: already at desired version? ───────────────────────────────────
installed_version=""
if command -v nomad >/dev/null 2>&1; then
  # `nomad version` prints e.g. "Nomad v1.9.5" on the first line.
  installed_version="$(nomad version 2>/dev/null \
    | awk 'NR==1 {sub(/^v/, "", $2); print $2; exit}')"
fi
if [ "$installed_version" = "$NOMAD_VERSION" ]; then
  log "nomad ${NOMAD_VERSION} already installed — nothing to do"
  exit 0
fi

# ── Ensure HashiCorp apt keyring ─────────────────────────────────────────────
if [ ! -f "$HASHICORP_KEYRING" ]; then
  log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}"
  tmpkey="$(mktemp)"
  trap 'rm -f "$tmpkey"' EXIT
  curl -fsSL "$HASHICORP_GPG_URL" -o "$tmpkey" \
    || die "failed to fetch HashiCorp GPG key from ${HASHICORP_GPG_URL}"
  gpg --dearmor -o "$HASHICORP_KEYRING" < "$tmpkey" \
    || die "failed to dearmor HashiCorp GPG key"
  chmod 0644 "$HASHICORP_KEYRING"
  rm -f "$tmpkey"
  trap - EXIT
else
  log "HashiCorp apt keyring already present"
fi

# ── Ensure HashiCorp apt sources list ────────────────────────────────────────
desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main"
if [ ! -f "$HASHICORP_SOURCES" ] \
    || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then
  log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}"
  printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES"
else
  log "HashiCorp apt sources list already present"
fi

# ── Install the pinned version ───────────────────────────────────────────────
# Always refresh the package index here. We only reach this point when the
# pinned version is NOT installed (the fast-path above exits otherwise), and
# even if the sources list pre-existed (e.g. baked into a machine image, or
# NOMAD_VERSION was bumped after the last update) the local apt cache may be
# stale or never populated — `apt-get install nomad=<ver>-1` fails outright
# when the pinned version is missing from the cache. `update -qq` on a warm
# cache is cheap, so unconditional is the safe choice.
log "running apt-get update"
DEBIAN_FRONTEND=noninteractive apt-get update -qq \
  || die "apt-get update failed"

# HashiCorp apt packages use the "<version>-1" package-revision suffix.
pkg_spec="nomad=${NOMAD_VERSION}-1"
log "installing ${pkg_spec}"
# --allow-downgrades: honoring the pin must also work when a NEWER nomad is
# already on the box (e.g. after a manual upgrade); without it apt refuses
# to move to the older pinned version.
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
  --allow-downgrades "$pkg_spec" \
  || die "apt-get install ${pkg_spec} failed"

# ── Verify ───────────────────────────────────────────────────────────────────
final_version="$(nomad version 2>/dev/null \
  | awk 'NR==1 {sub(/^v/, "", $2); print $2; exit}')"
if [ "$final_version" != "$NOMAD_VERSION" ]; then
  die "post-install check: expected ${NOMAD_VERSION}, got '${final_version}'"
fi
log "nomad ${NOMAD_VERSION} installed successfully"

130
lib/init/nomad/systemd-nomad.sh Executable file
View file

@ -0,0 +1,130 @@
#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/systemd-nomad.sh — Idempotent systemd unit installer for Nomad
#
# Part of the Nomad+Vault migration (S0.2, issue #822). Writes
# /etc/systemd/system/nomad.service pointing at /etc/nomad.d/ and runs
# `systemctl enable nomad` WITHOUT starting the service — we don't launch
# the cluster until S0.4 wires everything together.
#
# Idempotency contract:
#   - Existing unit file is NOT rewritten when on-disk content already
#     matches the desired content (avoids spurious `daemon-reload`).
#   - `systemctl enable` on an already-enabled unit is a no-op.
#   - This script is safe to run unconditionally before every factory boot.
#
# Preconditions:
#   - nomad binary installed (see lib/init/nomad/install.sh)
#   - /etc/nomad.d/ will hold server.hcl / client.hcl (placed by S0.4)
#
# Usage:
#   sudo lib/init/nomad/systemd-nomad.sh
#
# Exit codes:
#   0  success (unit installed + enabled, or already so)
#   1  precondition failure (not root, no systemctl, no nomad binary)
# =============================================================================
set -euo pipefail

UNIT_PATH="/etc/systemd/system/nomad.service"
NOMAD_CONFIG_DIR="/etc/nomad.d"
NOMAD_DATA_DIR="/var/lib/nomad"

log() { printf '[systemd-nomad] %s\n' "$*"; }
die() { printf '[systemd-nomad] ERROR: %s\n' "$*" >&2; exit 1; }

# ── Preconditions ────────────────────────────────────────────────────────────
if [ "$(id -u)" -ne 0 ]; then
  die "must run as root (needs write access to ${UNIT_PATH})"
fi
command -v systemctl >/dev/null 2>&1 \
  || die "systemctl not found (systemd is required)"
NOMAD_BIN="$(command -v nomad 2>/dev/null || true)"
[ -n "$NOMAD_BIN" ] \
  || die "nomad binary not found — run lib/init/nomad/install.sh first"

# ── Desired unit content ─────────────────────────────────────────────────────
# Upstream-recommended baseline (https://developer.hashicorp.com/nomad/docs/install/production/deployment-guide)
# trimmed for a single-node combined server+client dev box.
# - Wants=/After= network-online: nomad must have networking up.
# - User/Group=root: the Docker driver needs root to talk to dockerd.
# - LimitNOFILE/LimitNPROC=infinity: avoid Nomad's startup warning.
# - KillSignal=SIGINT: triggers Nomad's graceful shutdown path.
# - Restart=on-failure with a bounded burst to avoid crash-loops eating the
#   journal when /etc/nomad.d/ is mis-configured. Per systemd.unit(5),
#   StartLimitBurst=/StartLimitIntervalSec= are [Unit] options (they were
#   moved out of [Service] in systemd v230; the [Service] spelling is only a
#   legacy-compat alias), so they live in [Unit] below.
# `read -d ''` returns non-zero at EOF, hence `|| true` under `set -e`.
# \$MAINPID is escaped so systemd — not this shell — expands it.
read -r -d '' DESIRED_UNIT <<EOF || true
[Unit]
Description=Nomad
Documentation=https://developer.hashicorp.com/nomad/docs
Wants=network-online.target
After=network-online.target
# When Docker is present, ensure dockerd is up before nomad starts — the
# Docker task driver needs the daemon socket available at startup. Wants= on
# a unit that does not exist is harmless.
Wants=docker.service
After=docker.service
# Bound restart attempts: at most 3 starts per 10s window before the unit
# enters the failed state instead of crash-looping.
StartLimitBurst=3
StartLimitIntervalSec=10

[Service]
Type=notify
User=root
Group=root
ExecReload=/bin/kill -HUP \$MAINPID
ExecStart=${NOMAD_BIN} agent -config=${NOMAD_CONFIG_DIR}
KillMode=process
KillSignal=SIGINT
LimitNOFILE=infinity
LimitNPROC=infinity
Restart=on-failure
RestartSec=2
TasksMax=infinity
OOMScoreAdjust=-1000

[Install]
WantedBy=multi-user.target
EOF

# ── Ensure config + data dirs exist ──────────────────────────────────────────
# We do not populate /etc/nomad.d/ here (that's S0.4). We do create the
# directory so `nomad agent -config=/etc/nomad.d` doesn't error if the unit
# is started before hcl files are dropped in.
for d in "$NOMAD_CONFIG_DIR" "$NOMAD_DATA_DIR"; do
  if [ ! -d "$d" ]; then
    log "creating ${d}"
    install -d -m 0755 "$d"
  fi
done

# ── Install unit file only if content differs ────────────────────────────────
needs_reload=0
if [ ! -f "$UNIT_PATH" ] \
    || ! printf '%s\n' "$DESIRED_UNIT" | cmp -s - "$UNIT_PATH"; then
  log "writing unit → ${UNIT_PATH}"
  tmp="$(mktemp)"
  trap 'rm -f "$tmp"' EXIT
  printf '%s\n' "$DESIRED_UNIT" > "$tmp"
  install -m 0644 -o root -g root "$tmp" "$UNIT_PATH"
  rm -f "$tmp"
  trap - EXIT
  needs_reload=1
else
  log "unit file already up to date"
fi

# ── Reload + enable ──────────────────────────────────────────────────────────
if [ "$needs_reload" -eq 1 ]; then
  log "systemctl daemon-reload"
  systemctl daemon-reload
fi
if systemctl is-enabled --quiet nomad.service 2>/dev/null; then
  log "nomad.service already enabled"
else
  log "systemctl enable nomad"
  systemctl enable nomad.service >/dev/null
fi
log "done — unit installed and enabled (NOT started; S0.4 brings the cluster up)"

88
nomad/client.hcl Normal file
View file

@ -0,0 +1,88 @@
# =============================================================================
# nomad/client.hcl — Docker driver + host_volume declarations
#
# Part of the Nomad+Vault migration (S0.2, issue #822). Deployed to
# /etc/nomad.d/client.hcl on the factory dev box alongside server.hcl.
#
# This file owns:  Docker driver plugin config + host_volume pre-wiring.
# server.hcl owns: agent role, bind, ports, data_dir.
#
# NOTE: Nomad merges every *.hcl under -config=/etc/nomad.d, so declaring
# a second `client { ... }` block here augments (not replaces) the one in
# server.hcl. On a single-node setup this file could be inlined into
# server.hcl — the split is for readability, not semantics.
#
# host_volume declarations let Nomad jobspecs mount factory state by name
# (volume = "forgejo-data", etc.) without coupling host paths into jobspec
# HCL. Host paths under /srv/disinto/* are created out-of-band by the
# orchestrator (S0.4) before any job references them.
# =============================================================================

client {
  # forgejo git server data (repos, avatars, attachments).
  host_volume "forgejo-data" {
    path      = "/srv/disinto/forgejo-data"
    read_only = false
  }

  # woodpecker CI data (pipeline artifacts, sqlite db).
  host_volume "woodpecker-data" {
    path      = "/srv/disinto/woodpecker-data"
    read_only = false
  }

  # agent runtime data (claude config, logs, phase files).
  host_volume "agent-data" {
    path      = "/srv/disinto/agent-data"
    read_only = false
  }

  # per-project git clones and worktrees.
  host_volume "project-repos" {
    path      = "/srv/disinto/project-repos"
    read_only = false
  }

  # caddy config + ACME state.
  host_volume "caddy-data" {
    path      = "/srv/disinto/caddy-data"
    read_only = false
  }

  # disinto chat transcripts + attachments.
  host_volume "chat-history" {
    path      = "/srv/disinto/chat-history"
    read_only = false
  }

  # ops repo clone (vault actions, sprint artifacts, knowledge).
  host_volume "ops-repo" {
    path      = "/srv/disinto/ops-repo"
    read_only = false
  }
}

# Docker task driver. `volumes.enabled = true` is required so jobspecs
# can mount host_volume declarations defined above. `allow_privileged`
# stays false — no factory workload needs privileged containers today,
# and flipping it is an audit-worthy change.
plugin "docker" {
  config {
    allow_privileged = false

    volumes {
      enabled = true
    }

    # Leave images behind when jobs stop, so short job churn doesn't thrash
    # the image cache. Factory disk is not constrained; `docker system prune`
    # is the escape hatch.
    gc {
      image     = false
      container = true

      dangling_containers {
        enabled = true
      }
    }
  }
}

53
nomad/server.hcl Normal file
View file

@ -0,0 +1,53 @@
# =============================================================================
# nomad/server.hcl — Single-node combined server+client configuration
#
# Part of the Nomad+Vault migration (S0.2, issue #822). Deployed to
# /etc/nomad.d/server.hcl on the factory dev box alongside client.hcl.
#
# This file owns:  agent role, ports, bind, data directory.
# client.hcl owns: Docker driver plugin config + host_volume declarations.
#
# NOTE: On single-node setups these two files could be merged into one
# (Nomad auto-merges every *.hcl under -config=/etc/nomad.d). The split is
# purely for readability — role/bind/port vs. plugin/volume wiring.
#
# This is a factory dev-box baseline — TLS, ACLs, gossip encryption, and
# consul/vault integration are deliberately absent and land in later steps.
# =============================================================================

data_dir  = "/var/lib/nomad"
bind_addr = "127.0.0.1"
log_level = "INFO"

# All Nomad agent traffic stays on localhost — the factory box does not
# federate with peers. Ports are the Nomad defaults, pinned here so that
# future changes to these numbers are a visible diff.
ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

# Single-node combined mode: this agent is both the only server and the
# only client. bootstrap_expect=1 makes the server quorum-of-one.
server {
  enabled          = true
  bootstrap_expect = 1
}

client {
  enabled = true
}

# Advertise localhost to self to avoid surprises if the default IP
# autodetection picks a transient interface (e.g. docker0, wg0).
advertise {
  http = "127.0.0.1"
  rpc  = "127.0.0.1"
  serf = "127.0.0.1"
}

# UI on by default — same bind as http, no TLS (localhost only).
ui {
  enabled = true
}