From 06ead3a19ddc9c34cd7f971aa05f5a670f3883e2 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 16 Apr 2026 06:04:02 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20[nomad-step-0]=20S0.2=20=E2=80=94=20inst?= =?UTF-8?q?all=20nomad=20+=20systemd=20unit=20+=20nomad/server.hcl/client.?= =?UTF-8?q?hcl=20(#822)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lands the Nomad install + baseline HCL config for the single-node factory dev box. Nothing is wired into `disinto init` yet — S0.4 does that. - lib/init/nomad/install.sh: idempotent apt install pinned to NOMAD_VERSION (default 1.9.5). Adds HashiCorp apt keyring and sources list only if absent; fast-paths when the pinned version is already installed. - lib/init/nomad/systemd-nomad.sh: writes /etc/systemd/system/nomad.service (rewrites only when content differs), creates /etc/nomad.d and /var/lib/nomad, runs `systemctl enable nomad` WITHOUT starting. - nomad/server.hcl: single-node combined server+client role. bootstrap_expect=1, localhost bind, default ports pinned explicitly, UI enabled. No TLS/ACL — factory dev box baseline. - nomad/client.hcl: Docker task driver (allow_privileged=false, volumes enabled) and host_volume pre-wiring for forgejo-data, woodpecker-data, agent-data, project-repos, caddy-data, chat-history, ops-repo under /srv/disinto/*. Verified: `nomad config validate nomad/*.hcl` reports "Configuration is valid!" (with expected TLS/bootstrap warnings for a dev box). Shellcheck clean across the repo. 
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 lib/init/nomad/install.sh       | 129 +++++++++++++++++++++++++++++++
 lib/init/nomad/systemd-nomad.sh | 130 ++++++++++++++++++++++++++++++++
 nomad/client.hcl                |  88 +++++++++++++++++++++
 nomad/server.hcl                |  53 +++++++++++++
 4 files changed, 400 insertions(+)
 create mode 100755 lib/init/nomad/install.sh
 create mode 100755 lib/init/nomad/systemd-nomad.sh
 create mode 100644 nomad/client.hcl
 create mode 100644 nomad/server.hcl

diff --git a/lib/init/nomad/install.sh b/lib/init/nomad/install.sh
new file mode 100755
index 0000000..43397fd
--- /dev/null
+++ b/lib/init/nomad/install.sh
@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+# =============================================================================
+# lib/init/nomad/install.sh — Idempotent apt install of HashiCorp Nomad
+#
+# Part of the Nomad+Vault migration (S0.2, issue #822). Installs the `nomad`
+# binary from the HashiCorp apt repository. Does NOT install Vault — S0.3
+# owns that. Does NOT configure, start, or enable a systemd unit —
+# lib/init/nomad/systemd-nomad.sh owns that. Does NOT wire this script into
+# `disinto init` — S0.4 owns that.
+#
+# Idempotency contract:
+#   - Running twice back-to-back is a no-op: once the installed version
+#     matches ${NOMAD_VERSION} the script exits before touching anything.
+#   - Adds the HashiCorp apt keyring only if it is absent or empty. The
+#     keyring is staged next to its final path and renamed into place, so
+#     a failed download/dearmor never leaves a truncated keyring behind.
+#   - Rewrites the HashiCorp apt sources list only if it is absent or
+#     does not contain the expected entry.
+#   - Whenever an install is needed, `apt-get update` runs first, so a run
+#     that died after writing the sources list but before refreshing the
+#     package index is repaired by simply re-running the script.
+#
+# Configuration:
+#   NOMAD_VERSION — pinned Nomad version (default: see below). The apt
+#                   package is requested as "nomad=${NOMAD_VERSION}-1".
+#
+# Usage:
+#   sudo NOMAD_VERSION=1.9.5 lib/init/nomad/install.sh
+#
+# Exit codes:
+#   0 success (installed or already present)
+#   1 failure: precondition not met (not root, missing tools), or a
+#     download / apt / post-install verification step failed
+# =============================================================================
+set -euo pipefail
+
+# Pin to a specific Nomad 1.x release. Bump here, not at call sites.
+NOMAD_VERSION="${NOMAD_VERSION:-1.9.5}"
+
+HASHICORP_KEYRING="/usr/share/keyrings/hashicorp-archive-keyring.gpg"
+HASHICORP_SOURCES="/etc/apt/sources.list.d/hashicorp.list"
+HASHICORP_GPG_URL="https://apt.releases.hashicorp.com/gpg"
+HASHICORP_REPO_URL="https://apt.releases.hashicorp.com"
+
+log() { printf '[install-nomad] %s\n' "$*"; }
+die() { printf '[install-nomad] ERROR: %s\n' "$*" >&2; exit 1; }
+
+# Print the installed Nomad version without its leading "v" (first line of
+# `nomad version` is e.g. "Nomad v1.9.5"), or nothing if nomad is missing.
+# The trailing `|| true` keeps a missing binary, or a SIGPIPE from awk
+# exiting early, from aborting the script under `set -e -o pipefail`.
+nomad_version() {
+  nomad version 2>/dev/null \
+    | awk 'NR==1 {sub(/^v/, "", $2); print $2; exit}' \
+    || true
+}
+
+# ── Preconditions ────────────────────────────────────────────────────────────
+if [ "$(id -u)" -ne 0 ]; then
+  die "must run as root (needs apt-get + /usr/share/keyrings write access)"
+fi
+
+for bin in apt-get gpg curl lsb_release; do
+  command -v "$bin" >/dev/null 2>&1 \
+    || die "required binary not found: ${bin}"
+done
+
+CODENAME="$(lsb_release -cs)"
+[ -n "$CODENAME" ] || die "lsb_release returned empty codename"
+
+# ── Fast-path: already at desired version? ───────────────────────────────────
+installed_version="$(nomad_version)"
+if [ "$installed_version" = "$NOMAD_VERSION" ]; then
+  log "nomad ${NOMAD_VERSION} already installed — nothing to do"
+  exit 0
+fi
+
+# ── Ensure HashiCorp apt keyring ─────────────────────────────────────────────
+if [ ! -s "$HASHICORP_KEYRING" ]; then
+  log "adding HashiCorp apt keyring → ${HASHICORP_KEYRING}"
+  tmpkey="$(mktemp)"
+  tmpring="${HASHICORP_KEYRING}.tmp.$$"
+  trap 'rm -f "$tmpkey" "$tmpring"' EXIT
+  curl -fsSL "$HASHICORP_GPG_URL" -o "$tmpkey" \
+    || die "failed to fetch HashiCorp GPG key from ${HASHICORP_GPG_URL}"
+  gpg --batch --yes --dearmor -o "$tmpring" < "$tmpkey" \
+    || die "failed to dearmor HashiCorp GPG key"
+  chmod 0644 "$tmpring"
+  mv -f "$tmpring" "$HASHICORP_KEYRING"
+  rm -f "$tmpkey"
+  trap - EXIT
+else
+  log "HashiCorp apt keyring already present"
+fi
+
+# ── Ensure HashiCorp apt sources list ────────────────────────────────────────
+desired_source="deb [signed-by=${HASHICORP_KEYRING}] ${HASHICORP_REPO_URL} ${CODENAME} main"
+if [ -f "$HASHICORP_SOURCES" ] \
+  && grep -qxF "$desired_source" "$HASHICORP_SOURCES"; then
+  log "HashiCorp apt sources list already present"
+else
+  log "writing HashiCorp apt sources list → ${HASHICORP_SOURCES}"
+  printf '%s\n' "$desired_source" > "$HASHICORP_SOURCES"
+fi
+
+# ── Install the pinned version ───────────────────────────────────────────────
+# Always refresh the package index: we only get here when an install is
+# needed, and the index can lack the HashiCorp repo even though the sources
+# list exists (e.g. an earlier run failed before its own `apt-get update`).
+log "running apt-get update"
+DEBIAN_FRONTEND=noninteractive apt-get update -qq \
+  || die "apt-get update failed"
+
+# HashiCorp apt packages use the "-1" package-revision suffix.
+# --allow-downgrades: the pin is authoritative; without it `apt-get -y`
+# refuses to replace a newer nomad that is already installed.
+pkg_spec="nomad=${NOMAD_VERSION}-1"
+log "installing ${pkg_spec}"
+DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+  --allow-downgrades "$pkg_spec" \
+  || die "apt-get install ${pkg_spec} failed"
+
+# ── Verify ───────────────────────────────────────────────────────────────────
+final_version="$(nomad_version)"
+if [ "$final_version" != "$NOMAD_VERSION" ]; then
+  die "post-install check: expected ${NOMAD_VERSION}, got '${final_version}'"
+fi
+
+log "nomad ${NOMAD_VERSION} installed successfully"
diff --git a/lib/init/nomad/systemd-nomad.sh b/lib/init/nomad/systemd-nomad.sh
new file mode 100755
index 0000000..e9db191
--- /dev/null
+++ b/lib/init/nomad/systemd-nomad.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+# =============================================================================
+# lib/init/nomad/systemd-nomad.sh — Idempotent systemd unit installer for Nomad
+#
+# Part of the Nomad+Vault migration (S0.2, issue #822). Writes
+# /etc/systemd/system/nomad.service pointing at /etc/nomad.d/ and runs
+# `systemctl enable nomad` WITHOUT starting the service — we don't launch
+# the cluster until S0.4 wires everything together.
+#
+# Idempotency contract:
+#   - Existing unit file is NOT rewritten when on-disk content already
+#     matches the desired content (avoids spurious `daemon-reload`).
+#   - `systemctl enable` on an already-enabled unit is a no-op.
+#   - This script is safe to run unconditionally before every factory boot.
+# +# Preconditions: +# - nomad binary installed (see lib/init/nomad/install.sh) +# - /etc/nomad.d/ will hold server.hcl / client.hcl (placed by S0.4) +# +# Usage: +# sudo lib/init/nomad/systemd-nomad.sh +# +# Exit codes: +# 0 success (unit installed + enabled, or already so) +# 1 precondition failure (not root, no systemctl, no nomad binary) +# ============================================================================= +set -euo pipefail + +UNIT_PATH="/etc/systemd/system/nomad.service" +NOMAD_CONFIG_DIR="/etc/nomad.d" +NOMAD_DATA_DIR="/var/lib/nomad" + +log() { printf '[systemd-nomad] %s\n' "$*"; } +die() { printf '[systemd-nomad] ERROR: %s\n' "$*" >&2; exit 1; } + +# ── Preconditions ──────────────────────────────────────────────────────────── +if [ "$(id -u)" -ne 0 ]; then + die "must run as root (needs write access to ${UNIT_PATH})" +fi + +command -v systemctl >/dev/null 2>&1 \ + || die "systemctl not found (systemd is required)" + +NOMAD_BIN="$(command -v nomad 2>/dev/null || true)" +[ -n "$NOMAD_BIN" ] \ + || die "nomad binary not found — run lib/init/nomad/install.sh first" + +# ── Desired unit content ───────────────────────────────────────────────────── +# Upstream-recommended baseline (https://developer.hashicorp.com/nomad/docs/install/production/deployment-guide) +# trimmed for a single-node combined server+client dev box. +# - Wants=/After= network-online: nomad must have networking up. +# - User/Group=root: the Docker driver needs root to talk to dockerd. +# - LimitNOFILE/LimitNPROC=infinity: avoid Nomad's startup warning. +# - KillSignal=SIGINT: triggers Nomad's graceful shutdown path. +# - Restart=on-failure with a bounded burst to avoid crash-loops eating the +# journal when /etc/nomad.d/ is mis-configured. 
+read -r -d '' DESIRED_UNIT < "$tmp" + install -m 0644 -o root -g root "$tmp" "$UNIT_PATH" + rm -f "$tmp" + trap - EXIT + needs_reload=1 +else + log "unit file already up to date" +fi + +# ── Reload + enable ────────────────────────────────────────────────────────── +if [ "$needs_reload" -eq 1 ]; then + log "systemctl daemon-reload" + systemctl daemon-reload +fi + +if systemctl is-enabled --quiet nomad.service 2>/dev/null; then + log "nomad.service already enabled" +else + log "systemctl enable nomad" + systemctl enable nomad.service >/dev/null +fi + +log "done — unit installed and enabled (NOT started; S0.4 brings the cluster up)" diff --git a/nomad/client.hcl b/nomad/client.hcl new file mode 100644 index 0000000..b90d5c1 --- /dev/null +++ b/nomad/client.hcl @@ -0,0 +1,88 @@ +# ============================================================================= +# nomad/client.hcl — Docker driver + host_volume declarations +# +# Part of the Nomad+Vault migration (S0.2, issue #822). Deployed to +# /etc/nomad.d/client.hcl on the factory dev box alongside server.hcl. +# +# This file owns: Docker driver plugin config + host_volume pre-wiring. +# server.hcl owns: agent role, bind, ports, data_dir. +# +# NOTE: Nomad merges every *.hcl under -config=/etc/nomad.d, so declaring +# a second `client { ... }` block here augments (not replaces) the one in +# server.hcl. On a single-node setup this file could be inlined into +# server.hcl — the split is for readability, not semantics. +# +# host_volume declarations let Nomad jobspecs mount factory state by name +# (volume = "forgejo-data", etc.) without coupling host paths into jobspec +# HCL. Host paths under /srv/disinto/* are created out-of-band by the +# orchestrator (S0.4) before any job references them. +# ============================================================================= + +client { + # forgejo git server data (repos, avatars, attachments). 
+  host_volume "forgejo-data" {
+    path      = "/srv/disinto/forgejo-data"
+    read_only = false
+  }
+
+  # woodpecker CI data (pipeline artifacts, sqlite db).
+  host_volume "woodpecker-data" {
+    path      = "/srv/disinto/woodpecker-data"
+    read_only = false
+  }
+
+  # agent runtime data (claude config, logs, phase files).
+  host_volume "agent-data" {
+    path      = "/srv/disinto/agent-data"
+    read_only = false
+  }
+
+  # per-project git clones and worktrees.
+  host_volume "project-repos" {
+    path      = "/srv/disinto/project-repos"
+    read_only = false
+  }
+
+  # caddy config + ACME state.
+  host_volume "caddy-data" {
+    path      = "/srv/disinto/caddy-data"
+    read_only = false
+  }
+
+  # disinto chat transcripts + attachments.
+  host_volume "chat-history" {
+    path      = "/srv/disinto/chat-history"
+    read_only = false
+  }
+
+  # ops repo clone (vault actions, sprint artifacts, knowledge).
+  host_volume "ops-repo" {
+    path      = "/srv/disinto/ops-repo"
+    read_only = false
+  }
+}
+
+# Docker task driver. `volumes.enabled = true` allows the driver's own host
+# bind mounts; NOTE(review): host_volume mounts may not need it — confirm.
+# `allow_privileged` stays false — no factory workload needs privileged
+# containers today, and flipping it is an audit-worthy change.
+plugin "docker" {
+  config {
+    allow_privileged = false
+
+    volumes {
+      enabled = true
+    }
+
+    # Leave images behind when jobs stop, so short job churn doesn't thrash
+    # the image cache. Factory disk is not constrained; `docker system prune`
+    # is the escape hatch.
+    gc {
+      image     = false
+      container = true
+      dangling_containers {
+        enabled = true
+      }
+    }
+  }
+}
diff --git a/nomad/server.hcl b/nomad/server.hcl
new file mode 100644
index 0000000..27c8b9c
--- /dev/null
+++ b/nomad/server.hcl
@@ -0,0 +1,53 @@
+# =============================================================================
+# nomad/server.hcl — Single-node combined server+client configuration
+#
+# Part of the Nomad+Vault migration (S0.2, issue #822). Deployed to
+# /etc/nomad.d/server.hcl on the factory dev box alongside client.hcl.
+#
+# This file owns: agent role, ports, bind, data directory.
+# client.hcl owns: Docker driver plugin config + host_volume declarations.
+#
+# NOTE: On single-node setups these two files could be merged into one
+# (Nomad auto-merges every *.hcl under -config=/etc/nomad.d). The split is
+# purely for readability — role/bind/port vs. plugin/volume wiring.
+#
+# This is a factory dev-box baseline — TLS, ACLs, gossip encryption, and
+# consul/vault integration are deliberately absent and land in later steps.
+# =============================================================================
+
+data_dir  = "/var/lib/nomad"
+bind_addr = "127.0.0.1"
+log_level = "INFO"
+
+# All Nomad agent traffic stays on localhost — the factory box does not
+# federate with peers. Ports are the Nomad defaults, pinned here so that
+# future changes to these numbers are a visible diff.
+ports {
+  http = 4646
+  rpc  = 4647
+  serf = 4648
+}
+
+# Single-node combined mode: this agent is both the only server and the
+# only client. bootstrap_expect=1 makes the server quorum-of-one.
+server {
+  enabled          = true
+  bootstrap_expect = 1
+}
+
+client {
+  enabled = true
+}
+
+# Advertise localhost to self to avoid surprises if the default IP
+# autodetection picks a transient interface (e.g. docker0, wg0).
+advertise {
+  http = "127.0.0.1"
+  rpc  = "127.0.0.1"
+  serf = "127.0.0.1"
+}
+
+# UI on by default — same bind as http, no TLS (localhost only).
+ui {
+  enabled = true
+}
-- 
2.49.1