fix: [nomad-step-0] S0.2 — install nomad + systemd unit + nomad/server.hcl/client.hcl (#822) #827
4 changed files with 389 additions and 0 deletions
118
lib/init/nomad/install.sh
Executable file
118
lib/init/nomad/install.sh
Executable file
|
|
@ -0,0 +1,118 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# =============================================================================
|
||||||
|
# lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad
|
||||||
|
#
|
||||||
|
# Part of the Nomad+Vault migration (S0.2, issue #822). Installs the `nomad`
|
||||||
|
# binary from the HashiCorp apt repository. Does NOT install Vault — S0.3
|
||||||
|
# owns that. Does NOT configure, start, or enable a systemd unit —
|
||||||
|
# lib/init/nomad/systemd-nomad.sh owns that. Does NOT wire this script into
|
||||||
|
# `disinto init` — S0.4 owns that.
|
||||||
|
#
|
||||||
|
# Idempotency contract:
|
||||||
|
# - Running twice back-to-back is a no-op once the target version is
|
||||||
|
# installed and the apt source is in place.
|
||||||
|
# - Adds the HashiCorp apt keyring only if it is absent.
|
||||||
|
# - Adds the HashiCorp apt sources list only if it is absent.
|
||||||
|
# - Skips `apt-get install` entirely when the installed version already
|
||||||
|
# matches ${NOMAD_VERSION}.
|
||||||
|
#
|
||||||
|
# Configuration:
|
||||||
|
# NOMAD_VERSION — pinned Nomad version (default: see below). The apt
|
||||||
|
# package name is versioned as "nomad=<version>-1".
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# sudo NOMAD_VERSION=1.9.5 lib/init/nomad/install.sh
|
||||||
|
#
|
||||||
|
# Exit codes:
|
||||||
|
# 0 success (installed or already present)
|
||||||
|
# 1 precondition failure (not Debian/Ubuntu, missing tools, not root)
|
||||||
|
# =============================================================================
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Pin to a specific Nomad 1.x release. Bump here, not at call sites.
|
||||||
|
NOMAD_VERSION="${NOMAD_VERSION:-1.9.5}"
|
||||||
|
|
||||||
|
HASHICORP_KEYRING="/usr/share/keyrings/hashicorp-archive-keyring.gpg"
|
||||||
|
HASHICORP_SOURCES="/etc/apt/sources.list.d/hashicorp.list"
|
||||||
|
HASHICORP_GPG_URL="https://apt.releases.hashicorp.com/gpg"
|
||||||
|
HASHICORP_REPO_URL="https://apt.releases.hashicorp.com"
|
||||||
|
|
||||||
|
# Write an informational message to stdout, tagged with this script's name.
# All arguments are joined with single spaces into one line.
log() {
  printf '[install-nomad] %s\n' "$*"
}
|
||||||
|
# Report a fatal error on stderr (tagged with this script's name) and
# terminate the script with exit status 1.
die() {
  printf '[install-nomad] ERROR: %s\n' "$*" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# ── Preconditions ────────────────────────────────────────────────────────────
# Root is required: the script calls apt-get and writes the keyring.
[ "$(id -u)" -eq 0 ] \
  || die "must run as root (needs apt-get + /usr/share/keyrings write access)"

# Every external tool used further down must be resolvable on PATH.
for required_tool in apt-get gpg curl lsb_release; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    die "required binary not found: ${required_tool}"
  fi
done

# The distribution codename selects the suite in the apt sources line.
CODENAME="$(lsb_release -cs)"
if [ -z "$CODENAME" ]; then
  die "lsb_release returned empty codename"
fi
|
||||||
|
|
||||||
|
# ── Fast-path: already at desired version? ───────────────────────────────────
# Ask the nomad binary on PATH (if there is one) for its version and exit
# early when it already matches the pin.
installed_version=""
if command -v nomad >/dev/null 2>&1; then
  # `nomad version` prints e.g. "Nomad v1.9.5" on the first line.
  # The trailing `|| true` matters under `set -e` + `pipefail`: a nomad
  # binary that is present but fails to run, or a SIGPIPE caused by awk
  # exiting after the first line, would otherwise abort the script here
  # without any message instead of falling through to the install below.
  installed_version="$(nomad version 2>/dev/null \
    | awk 'NR==1 {sub(/^v/, "", $2); print $2; exit}')" || true
fi

if [ "$installed_version" = "$NOMAD_VERSION" ]; then
  log "nomad ${NOMAD_VERSION} already installed — nothing to do"
  exit 0
fi
|
||||||
|
|
||||||
|
# ── Ensure HashiCorp apt keyring ─────────────────────────────────────────────
# The keyring is built in a temp file next to its final location and renamed
# into place only after gpg succeeded and produced non-empty output. Writing
# straight to the final path could leave an empty or truncated keyring behind
# after a failed or interrupted run, which later runs would then accept as
# "already present". For the same reason the presence check uses -s
# (exists and non-empty) rather than -f.
if [ ! -s "$HASHICORP_KEYRING" ]; then
  log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}"
  tmpkey=""
  tmpring=""
  # Install the cleanup trap before creating the temp files so neither leaks
  # if the second mktemp fails.
  trap 'rm -f "$tmpkey" "$tmpring"' EXIT
  tmpkey="$(mktemp)"
  # Same directory as the target, so the final mv is a same-filesystem rename.
  tmpring="$(mktemp "${HASHICORP_KEYRING}.XXXXXX")"
  curl -fsSL "$HASHICORP_GPG_URL" -o "$tmpkey" \
    || die "failed to fetch HashiCorp GPG key from ${HASHICORP_GPG_URL}"
  # --batch keeps gpg non-interactive; output goes through stdout because
  # `-o` would prompt before overwriting the file mktemp already created.
  gpg --batch --dearmor < "$tmpkey" > "$tmpring" \
    || die "failed to dearmor HashiCorp GPG key"
  [ -s "$tmpring" ] || die "failed to dearmor HashiCorp GPG key"
  chmod 0644 "$tmpring"
  mv -f "$tmpring" "$HASHICORP_KEYRING"
  rm -f "$tmpkey"
  trap - EXIT
else
  log "HashiCorp apt keyring already present"
fi
|
||||||
|
|
||||||
|
# ── Ensure HashiCorp apt sources list ────────────────────────────────────────
# The sources file is rewritten only when it is missing or does not contain
# exactly the desired line.
desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main"
if [ ! -f "$HASHICORP_SOURCES" ] \
  || ! grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then
  log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}"
  printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES"
else
  log "HashiCorp apt sources list already present"
fi

# ── Install the pinned version ───────────────────────────────────────────────
# Always refresh the package index before installing. This point is reached
# only when an install is actually required (the fast-path above exits
# otherwise), and an unchanged sources file says nothing about the index:
# an earlier run may have written the file and then failed in
# `apt-get update`, or NOMAD_VERSION may have been bumped to a release the
# cached index does not list yet. Skipping the refresh in those cases makes
# the pinned install fail.
log "running apt-get update"
DEBIAN_FRONTEND=noninteractive apt-get update -qq \
  || die "apt-get update failed"

# HashiCorp apt packages use the "<version>-1" package-revision suffix.
pkg_spec="nomad=${NOMAD_VERSION}-1"
log "installing ${pkg_spec}"
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
  "$pkg_spec" \
  || die "apt-get install ${pkg_spec} failed"
|
||||||
|
|
||||||
|
# ── Verify ───────────────────────────────────────────────────────────────────
# Re-read the version from the binary now on PATH and compare it to the pin.
# The `|| true` keeps `set -e` + `pipefail` from aborting silently when
# `nomad version` fails (or awk's early exit triggers SIGPIPE), so the
# explicit die below is what reports the mismatch.
final_version="$(nomad version 2>/dev/null \
  | awk 'NR==1 {sub(/^v/, "", $2); print $2; exit}')" || true
if [ "$final_version" != "$NOMAD_VERSION" ]; then
  die "post-install check: expected ${NOMAD_VERSION}, got '${final_version}'"
fi

log "nomad ${NOMAD_VERSION} installed successfully"
|
||||||
130
lib/init/nomad/systemd-nomad.sh
Executable file
130
lib/init/nomad/systemd-nomad.sh
Executable file
|
|
@ -0,0 +1,130 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# =============================================================================
|
||||||
|
# lib/init/nomad/systemd-nomad.sh — Idempotent systemd unit installer for Nomad
|
||||||
|
#
|
||||||
|
# Part of the Nomad+Vault migration (S0.2, issue #822). Writes
|
||||||
|
# /etc/systemd/system/nomad.service pointing at /etc/nomad.d/ and runs
|
||||||
|
# `systemctl enable nomad` WITHOUT starting the service — we don't launch
|
||||||
|
# the cluster until S0.4 wires everything together.
|
||||||
|
#
|
||||||
|
# Idempotency contract:
|
||||||
|
# - Existing unit file is NOT rewritten when on-disk content already
|
||||||
|
# matches the desired content (avoids spurious `daemon-reload`).
|
||||||
|
# - `systemctl enable` on an already-enabled unit is a no-op.
|
||||||
|
# - This script is safe to run unconditionally before every factory boot.
|
||||||
|
#
|
||||||
|
# Preconditions:
|
||||||
|
# - nomad binary installed (see lib/init/nomad/install.sh)
|
||||||
|
# - /etc/nomad.d/ will hold server.hcl / client.hcl (placed by S0.4)
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# sudo lib/init/nomad/systemd-nomad.sh
|
||||||
|
#
|
||||||
|
# Exit codes:
|
||||||
|
# 0 success (unit installed + enabled, or already so)
|
||||||
|
# 1 precondition failure (not root, no systemctl, no nomad binary)
|
||||||
|
# =============================================================================
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
UNIT_PATH="/etc/systemd/system/nomad.service"
|
||||||
|
NOMAD_CONFIG_DIR="/etc/nomad.d"
|
||||||
|
NOMAD_DATA_DIR="/var/lib/nomad"
|
||||||
|
|
||||||
|
# Write an informational message to stdout, tagged with this script's name.
# All arguments are joined with single spaces into one line.
log() {
  printf '[systemd-nomad] %s\n' "$*"
}
|
||||||
|
# Report a fatal error on stderr (tagged with this script's name) and
# terminate the script with exit status 1.
die() {
  printf '[systemd-nomad] ERROR: %s\n' "$*" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# ── Preconditions ────────────────────────────────────────────────────────────
# Root is required to write the unit file.
[ "$(id -u)" -eq 0 ] \
  || die "must run as root (needs write access to ${UNIT_PATH})"

if ! command -v systemctl >/dev/null 2>&1; then
  die "systemctl not found (systemd is required)"
fi

# Resolve the nomad binary; its path is embedded in the unit's ExecStart=.
# `|| true` keeps a failed lookup from tripping `set -e`, so the explicit
# check below can report it.
NOMAD_BIN="$(command -v nomad 2>/dev/null || true)"
if [ -z "$NOMAD_BIN" ]; then
  die "nomad binary not found — run lib/init/nomad/install.sh first"
fi
|
||||||
|
|
||||||
|
# ── Desired unit content ─────────────────────────────────────────────────────
# Upstream-recommended baseline (https://developer.hashicorp.com/nomad/docs/install/production/deployment-guide)
# trimmed for a single-node combined server+client dev box.
#   - Wants=/After= network-online: nomad must have networking up.
#   - StartLimitBurst/StartLimitIntervalSec: bound the restart burst to avoid
#     crash-loops eating the journal when /etc/nomad.d/ is mis-configured.
#     These are [Unit] settings; in [Service] systemd ignores
#     StartLimitIntervalSec= as an unknown key, so they are declared in
#     [Unit].
#   - User/Group=root: the Docker driver needs root to talk to dockerd.
#   - LimitNOFILE/LimitNPROC=infinity: avoid Nomad's startup warning.
#   - KillSignal=SIGINT: triggers Nomad's graceful shutdown path.
#   - Restart=on-failure: restart after failures, within the limit above.
#
# The heredoc delimiter is unquoted so ${NOMAD_BIN} and ${NOMAD_CONFIG_DIR}
# expand here, while \$MAINPID is escaped to reach the unit file literally.
# `read -d ''` reads to end-of-input and then returns non-zero, hence the
# `|| true`.
read -r -d '' DESIRED_UNIT <<EOF || true
[Unit]
Description=Nomad
Documentation=https://developer.hashicorp.com/nomad/docs
Wants=network-online.target
After=network-online.target

# When Docker is present, ensure dockerd is up before nomad starts — the
# Docker task driver needs the daemon socket available at startup.
Wants=docker.service
After=docker.service

StartLimitBurst=3
StartLimitIntervalSec=10

[Service]
Type=notify
User=root
Group=root
ExecReload=/bin/kill -HUP \$MAINPID
ExecStart=${NOMAD_BIN} agent -config=${NOMAD_CONFIG_DIR}
KillMode=process
KillSignal=SIGINT
LimitNOFILE=infinity
LimitNPROC=infinity
Restart=on-failure
RestartSec=2
TasksMax=infinity
OOMScoreAdjust=-1000

[Install]
WantedBy=multi-user.target
EOF
|
||||||
|
|
||||||
|
# ── Ensure config + data dirs exist ──────────────────────────────────────────
# /etc/nomad.d/ is not populated here (that's S0.4). The directories are
# created so `nomad agent -config=/etc/nomad.d` doesn't error if the unit is
# started before the hcl files are dropped in. Existing directories are left
# untouched.
for dir_path in "$NOMAD_CONFIG_DIR" "$NOMAD_DATA_DIR"; do
  [ -d "$dir_path" ] && continue
  log "creating ${dir_path}"
  install -d -m 0755 "$dir_path"
done
|
||||||
|
|
||||||
|
# ── Install unit file only if content differs ────────────────────────────────
# needs_reload records whether the on-disk unit was (re)written, so that
# `systemctl daemon-reload` runs only when something actually changed.
needs_reload=0
if [ -f "$UNIT_PATH" ] \
  && printf '%s\n' "$DESIRED_UNIT" | cmp -s - "$UNIT_PATH"; then
  log "unit file already up to date"
else
  log "writing unit → ${UNIT_PATH}"
  # Stage the content in a temp file, then let install(1) place it with the
  # right owner and mode; the trap removes the temp file on an early exit.
  staged_unit="$(mktemp)"
  trap 'rm -f "$staged_unit"' EXIT
  printf '%s\n' "$DESIRED_UNIT" > "$staged_unit"
  install -m 0644 -o root -g root "$staged_unit" "$UNIT_PATH"
  rm -f "$staged_unit"
  trap - EXIT
  needs_reload=1
fi
|
||||||
|
|
||||||
|
# ── Reload + enable ──────────────────────────────────────────────────────────
# systemd re-reads unit files only when the unit was written above.
if [ "$needs_reload" -eq 1 ]; then
  log "systemctl daemon-reload"
  systemctl daemon-reload
fi

# Enable the unit unless it already is; the service is never started here.
if ! systemctl is-enabled --quiet nomad.service 2>/dev/null; then
  log "systemctl enable nomad"
  systemctl enable nomad.service >/dev/null
else
  log "nomad.service already enabled"
fi

log "done — unit installed and enabled (NOT started; S0.4 brings the cluster up)"
|
||||||
88
nomad/client.hcl
Normal file
88
nomad/client.hcl
Normal file
|
|
@ -0,0 +1,88 @@
|
||||||
|
# =============================================================================
|
||||||
|
# nomad/client.hcl — Docker driver + host_volume declarations
|
||||||
|
#
|
||||||
|
# Part of the Nomad+Vault migration (S0.2, issue #822). Deployed to
|
||||||
|
# /etc/nomad.d/client.hcl on the factory dev box alongside server.hcl.
|
||||||
|
#
|
||||||
|
# This file owns: Docker driver plugin config + host_volume pre-wiring.
|
||||||
|
# server.hcl owns: agent role, bind, ports, data_dir.
|
||||||
|
#
|
||||||
|
# NOTE: Nomad merges every *.hcl under -config=/etc/nomad.d, so declaring
|
||||||
|
# a second `client { ... }` block here augments (not replaces) the one in
|
||||||
|
# server.hcl. On a single-node setup this file could be inlined into
|
||||||
|
# server.hcl — the split is for readability, not semantics.
|
||||||
|
#
|
||||||
|
# host_volume declarations let Nomad jobspecs mount factory state by name
|
||||||
|
# (volume = "forgejo-data", etc.) without coupling host paths into jobspec
|
||||||
|
# HCL. Host paths under /srv/disinto/* are created out-of-band by the
|
||||||
|
# orchestrator (S0.4) before any job references them.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Named host volumes. Each entry maps a volume name that jobspecs can refer
# to onto a directory under /srv/disinto/; all of them are writable.
client {
  # Forgejo git server data: repos, avatars, attachments.
  host_volume "forgejo-data" {
    path      = "/srv/disinto/forgejo-data"
    read_only = false
  }

  # Woodpecker CI data: pipeline artifacts, sqlite db.
  host_volume "woodpecker-data" {
    path      = "/srv/disinto/woodpecker-data"
    read_only = false
  }

  # Agent runtime data: claude config, logs, phase files.
  host_volume "agent-data" {
    path      = "/srv/disinto/agent-data"
    read_only = false
  }

  # Per-project git clones and worktrees.
  host_volume "project-repos" {
    path      = "/srv/disinto/project-repos"
    read_only = false
  }

  # Caddy config + ACME state.
  host_volume "caddy-data" {
    path      = "/srv/disinto/caddy-data"
    read_only = false
  }

  # Disinto chat transcripts + attachments.
  host_volume "chat-history" {
    path      = "/srv/disinto/chat-history"
    read_only = false
  }

  # Ops repo clone: vault actions, sprint artifacts, knowledge.
  host_volume "ops-repo" {
    path      = "/srv/disinto/ops-repo"
    read_only = false
  }
}
|
||||||
|
|
||||||
|
# Docker task driver. `volumes.enabled = true` lets tasks bind-mount
# absolute host paths through the driver's own `volumes` option; the
# host_volume declarations above are consumed separately, via
# `volume` / `volume_mount` in the jobspec. `allow_privileged` stays
# false — no factory workload needs privileged containers today, and
# flipping it is an audit-worthy change.
|
||||||
|
plugin "docker" {
  config {
    # Privileged containers are not permitted.
    allow_privileged = false

    # Host-path volume support in the Docker driver is switched on.
    volumes {
      enabled = true
    }

    # Keep images around when jobs stop, so short job churn doesn't thrash
    # the image cache. Factory disk is not constrained; `docker system prune`
    # is the escape hatch. Container GC and dangling-container cleanup are
    # both enabled.
    gc {
      image     = false
      container = true

      dangling_containers {
        enabled = true
      }
    }
  }
}
|
||||||
53
nomad/server.hcl
Normal file
53
nomad/server.hcl
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
# =============================================================================
|
||||||
|
# nomad/server.hcl — Single-node combined server+client configuration
|
||||||
|
#
|
||||||
|
# Part of the Nomad+Vault migration (S0.2, issue #822). Deployed to
|
||||||
|
# /etc/nomad.d/server.hcl on the factory dev box alongside client.hcl.
|
||||||
|
#
|
||||||
|
# This file owns: agent role, ports, bind, data directory.
|
||||||
|
# client.hcl owns: Docker driver plugin config + host_volume declarations.
|
||||||
|
#
|
||||||
|
# NOTE: On single-node setups these two files could be merged into one
|
||||||
|
# (Nomad auto-merges every *.hcl under -config=/etc/nomad.d). The split is
|
||||||
|
# purely for readability — role/bind/port vs. plugin/volume wiring.
|
||||||
|
#
|
||||||
|
# This is a factory dev-box baseline — TLS, ACLs, gossip encryption, and
|
||||||
|
# consul/vault integration are deliberately absent and land in later steps.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Agent-wide settings: state directory, listen address, log verbosity.
data_dir  = "/var/lib/nomad"
bind_addr = "127.0.0.1"
log_level = "INFO"

# All Nomad agent traffic stays on localhost — the factory box does not
# federate with peers. These are the Nomad default ports, pinned here so
# that any future change to the numbers shows up as a visible diff.
ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

# Single-node combined mode: this agent is both the only server and the
# only client. bootstrap_expect = 1 makes the server a quorum of one.
server {
  enabled          = true
  bootstrap_expect = 1
}

client {
  enabled = true
}

# Advertise localhost explicitly, to avoid surprises if the default IP
# autodetection picks a transient interface (e.g. docker0, wg0).
advertise {
  http = "127.0.0.1"
  rpc  = "127.0.0.1"
  serf = "127.0.0.1"
}

# UI enabled — same bind as http, no TLS (localhost only).
ui {
  enabled = true
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue