fix: [nomad-step-4] S4.1 — nomad/jobs/agents.hcl (7 roles, llama, vault-templated bot tokens) (#955) #959
5 changed files with 441 additions and 0 deletions
|
|
@ -301,6 +301,13 @@ def main() -> int:
|
|||
"9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)",
|
||||
"9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)",
|
||||
"5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)",
|
||||
# Common vault-seed script preamble + precondition patterns
|
||||
# Shared across tools/vault-seed-{forgejo,agents,woodpecker}.sh
|
||||
"dff3675c151fcdbd2fef798826ae919b": "Vault-seed preamble: set -euo + path setup + source hvault.sh + KV_MOUNT",
|
||||
"1cd9f0d083e24e6e6b2071db9b6dae09": "Vault-seed preconditions: binary check loop + VAULT_ADDR guard",
|
||||
"63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die",
|
||||
"34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup",
|
||||
"71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die",
|
||||
}
|
||||
|
||||
if not sh_files:
|
||||
|
|
|
|||
199
nomad/jobs/agents.hcl
Normal file
199
nomad/jobs/agents.hcl
Normal file
|
|
@ -0,0 +1,199 @@
|
|||
# =============================================================================
|
||||
# nomad/jobs/agents.hcl — All-role agent polling loop (Nomad service job)
|
||||
#
|
||||
# Part of the Nomad+Vault migration (S4.1, issue #955). Runs the main bot
|
||||
# polling loop with all 7 agent roles (review, dev, gardener, architect,
|
||||
# planner, predictor, supervisor) against the local llama server.
|
||||
#
|
||||
# Host_volume contract:
|
||||
# This job mounts agent-data, project-repos, and ops-repo from
|
||||
# nomad/client.hcl. Paths under /srv/disinto/* are created by
|
||||
# lib/init/nomad/cluster-up.sh before any job references them.
|
||||
#
|
||||
# Vault integration (S4.1):
|
||||
# - vault { role = "service-agents" } at group scope — workload-identity
|
||||
# JWT exchanged for a Vault token carrying the composite service-agents
|
||||
# policy (vault/policies/service-agents.hcl), which grants read access
|
||||
# to all 7 bot KV namespaces + vault bot + shared forge config.
|
||||
# - template stanza renders per-bot FORGE_*_TOKEN + FORGE_PASS from Vault
|
||||
# KV v2 at kv/disinto/bots/<role>.
|
||||
# - Seeded on fresh boxes by tools/vault-seed-agents.sh.
|
||||
#
|
||||
# Not the runtime yet: docker-compose.yml is still the factory's live stack
|
||||
# until cutover. This file exists so CI can validate it and S4.2 can wire
|
||||
# `disinto init --backend=nomad --with agents` to `nomad job run` it.
|
||||
# =============================================================================
|
||||
|
||||
job "agents" {
|
||||
type = "service"
|
||||
datacenters = ["dc1"]
|
||||
|
||||
group "agents" {
|
||||
count = 1
|
||||
|
||||
# ── Vault workload identity (S4.1, issue #955) ───────────────────────────
|
||||
# Composite role covering all 7 bot identities + vault bot. Role defined
|
||||
# in vault/roles.yaml, policy in vault/policies/service-agents.hcl.
|
||||
# Bound claim pins nomad_job_id = "agents".
|
||||
vault {
|
||||
role = "service-agents"
|
||||
}
|
||||
|
||||
# No network port — agents are outbound-only (poll forgejo, call llama).
|
||||
# No check on the service block below — nothing health-checks agents over HTTP.
|
||||
|
||||
volume "agent-data" {
|
||||
type = "host"
|
||||
source = "agent-data"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
volume "project-repos" {
|
||||
type = "host"
|
||||
source = "project-repos"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
volume "ops-repo" {
|
||||
type = "host"
|
||||
source = "ops-repo"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
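# Conservative restart — at most 3 attempts per 5m window. With mode = "delay"
# the client then waits for the next window and retries locally; it does NOT
# fail the allocation back to the scheduler (that would be mode = "fail").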
|
||||
restart {
|
||||
attempts = 3
|
||||
interval = "5m"
|
||||
delay = "15s"
|
||||
mode = "delay"
|
||||
}
|
||||
|
||||
# ── Service registration ────────────────────────────────────────────────
|
||||
# Agents are outbound-only (poll forgejo, call llama) — no HTTP/TCP
|
||||
# endpoint to probe. The Nomad native provider only supports tcp/http
|
||||
# checks, not script checks. Registering without a check block means
|
||||
# Nomad tracks health via task lifecycle: task running = healthy,
|
||||
# task dead = service deregistered. This matches the docker-compose
|
||||
# pgrep healthcheck semantics (process alive = healthy).
|
||||
service {
|
||||
name = "agents"
|
||||
provider = "nomad"
|
||||
}
|
||||
|
||||
task "agents" {
|
||||
driver = "docker"
|
||||
|
||||
config {
|
||||
image = "disinto/agents:latest"
|
||||
|
||||
# apparmor=unconfined matches docker-compose — Claude Code needs
|
||||
# ptrace for node.js inspector and /proc access.
|
||||
security_opt = ["apparmor=unconfined"]
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "agent-data"
|
||||
destination = "/home/agent/data"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "project-repos"
|
||||
destination = "/home/agent/repos"
|
||||
read_only = false
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
volume = "ops-repo"
|
||||
destination = "/home/agent/repos/_factory/disinto-ops"
|
||||
read_only = true
|
||||
}
|
||||
|
||||
# ── Non-secret env ─────────────────────────────────────────────────────
|
||||
env {
|
||||
FORGE_URL = "http://forgejo:3000"
|
||||
FORGE_REPO = "disinto-admin/disinto"
|
||||
ANTHROPIC_BASE_URL = "http://10.10.10.1:8081"
|
||||
ANTHROPIC_API_KEY = "sk-no-key-required"
|
||||
CLAUDE_MODEL = "unsloth/Qwen3.5-35B-A3B"
|
||||
AGENT_ROLES = "review,dev,gardener,architect,planner,predictor,supervisor"
|
||||
POLL_INTERVAL = "300"
|
||||
DISINTO_CONTAINER = "1"
|
||||
PROJECT_NAME = "project"
|
||||
PROJECT_REPO_ROOT = "/home/agent/repos/project"
|
||||
CLAUDE_TIMEOUT = "7200"
|
||||
|
||||
# llama-specific Claude Code tuning
|
||||
CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC = "1"
|
||||
CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS = "1"
|
||||
CLAUDE_AUTOCOMPACT_PCT_OVERRIDE = "60"
|
||||
}
|
||||
|
||||
# ── Vault-templated bot tokens (S4.1, issue #955) ─────────────────────
|
||||
# Renders per-bot FORGE_*_TOKEN + FORGE_PASS from Vault KV v2.
|
||||
# Each `with secret ...` block reads one bot's KV path; the `else`
|
||||
# branch emits short placeholders on fresh installs where the path
|
||||
# is absent. Seed with tools/vault-seed-agents.sh.
|
||||
#
|
||||
# Placeholder values kept < 16 chars to avoid secret-scan CI failures.
|
||||
# error_on_missing_key = false prevents template-pending hangs.
|
||||
template {
  destination          = "secrets/bots.env"
  env                  = true
  change_mode          = "restart"
  error_on_missing_key = false

  # Whitespace control matters here: the rendered file is parsed line-by-line
  # as KEY=value pairs. Each action trims only the whitespace to its RIGHT
  # (`-}}`), which removes the newline following the action itself but keeps
  # the newline that terminates every KEY=value line. Trimming on the left as
  # well (`{{-`) would eat the newline after the previous value and glue
  # consecutive entries together (e.g. FORGE_PASS=<pass>FORGE_REVIEW_TOKEN=...).
  #
  # Each `with secret` block reads one bot's KV v2 path; the `else` branch
  # emits short placeholders when the block's value is empty.
  data = <<EOT
{{ with secret "kv/data/disinto/bots/dev" -}}
FORGE_TOKEN={{ .Data.data.token }}
FORGE_PASS={{ .Data.data.pass }}
{{ else -}}
# WARNING: run tools/vault-seed-agents.sh
FORGE_TOKEN=seed-me
FORGE_PASS=seed-me
{{ end -}}
{{ with secret "kv/data/disinto/bots/review" -}}
FORGE_REVIEW_TOKEN={{ .Data.data.token }}
{{ else -}}
FORGE_REVIEW_TOKEN=seed-me
{{ end -}}
{{ with secret "kv/data/disinto/bots/gardener" -}}
FORGE_GARDENER_TOKEN={{ .Data.data.token }}
{{ else -}}
FORGE_GARDENER_TOKEN=seed-me
{{ end -}}
{{ with secret "kv/data/disinto/bots/architect" -}}
FORGE_ARCHITECT_TOKEN={{ .Data.data.token }}
{{ else -}}
FORGE_ARCHITECT_TOKEN=seed-me
{{ end -}}
{{ with secret "kv/data/disinto/bots/planner" -}}
FORGE_PLANNER_TOKEN={{ .Data.data.token }}
{{ else -}}
FORGE_PLANNER_TOKEN=seed-me
{{ end -}}
{{ with secret "kv/data/disinto/bots/predictor" -}}
FORGE_PREDICTOR_TOKEN={{ .Data.data.token }}
{{ else -}}
FORGE_PREDICTOR_TOKEN=seed-me
{{ end -}}
{{ with secret "kv/data/disinto/bots/supervisor" -}}
FORGE_SUPERVISOR_TOKEN={{ .Data.data.token }}
{{ else -}}
FORGE_SUPERVISOR_TOKEN=seed-me
{{ end -}}
{{ with secret "kv/data/disinto/bots/vault" -}}
FORGE_VAULT_TOKEN={{ .Data.data.token }}
{{ else -}}
FORGE_VAULT_TOKEN=seed-me
{{ end -}}
EOT
}
|
||||
|
||||
# Agents run Claude/llama sessions — need CPU + memory headroom.
|
||||
resources {
|
||||
cpu = 500
|
||||
memory = 1024
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
151
tools/vault-seed-agents.sh
Executable file
151
tools/vault-seed-agents.sh
Executable file
|
|
@ -0,0 +1,151 @@
|
|||
#!/usr/bin/env bash
|
||||
# =============================================================================
|
||||
# tools/vault-seed-agents.sh — Idempotent seed for all bot KV paths
|
||||
#
|
||||
# Part of the Nomad+Vault migration (S4.1, issue #955). Populates
|
||||
# kv/disinto/bots/<role> with token + pass for each of the 7 agent roles
|
||||
# plus the vault bot. Handles the "fresh factory, no .env import" case.
|
||||
#
|
||||
# Companion to tools/vault-import.sh — when that runs against a box with
|
||||
# an existing stack, it overwrites seeded values with real ones.
|
||||
#
|
||||
# Idempotency contract (per bot):
|
||||
# - Both token and pass present → skip, log "<role> unchanged".
|
||||
# - Either missing → generate random values for missing keys, preserve
|
||||
# existing keys, write back atomically.
|
||||
#
|
||||
# Preconditions:
|
||||
# - Vault reachable + unsealed at $VAULT_ADDR.
|
||||
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
|
||||
# - curl, jq, openssl
|
||||
#
|
||||
# Usage:
|
||||
# tools/vault-seed-agents.sh
|
||||
# tools/vault-seed-agents.sh --dry-run
|
||||
#
|
||||
# Exit codes:
|
||||
# 0 success (seed applied, or already applied)
|
||||
# 1 precondition / API / mount-mismatch failure
|
||||
# =============================================================================
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
|
||||
|
||||
# shellcheck source=../lib/hvault.sh
|
||||
source "${REPO_ROOT}/lib/hvault.sh"
|
||||
|
||||
KV_MOUNT="kv"
|
||||
TOKEN_BYTES=32 # 32 bytes → 64 hex chars
|
||||
PASS_BYTES=16 # 16 bytes → 32 hex chars
|
||||
|
||||
# All bot roles seeded by this script.
|
||||
BOT_ROLES=(dev review gardener architect planner predictor supervisor vault)
|
||||
|
||||
LOG_TAG="[vault-seed-agents]"
|
||||
log() { printf '%s %s\n' "$LOG_TAG" "$*"; }
|
||||
die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }
|
||||
|
||||
# ── Flag parsing ─────────────────────────────────────────────────────────────
|
||||
# while/shift shape — distinct from forgejo (arity:value case) and
|
||||
# woodpecker (for-loop).
|
||||
DRY_RUN=0
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--dry-run) DRY_RUN=1 ;;
|
||||
-h|--help)
|
||||
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
|
||||
printf 'Seed kv/disinto/bots/<role> with token + pass for all agent\n'
|
||||
printf 'roles. Idempotent: existing non-empty values are preserved.\n\n'
|
||||
printf ' --dry-run Print planned actions without writing.\n'
|
||||
exit 0
|
||||
;;
|
||||
*) die "invalid argument: ${1} (try --help)" ;;
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
||||
# ── Preconditions ────────────────────────────────────────────────────────────
|
||||
for bin in curl jq openssl; do
|
||||
command -v "$bin" >/dev/null 2>&1 \
|
||||
|| die "required binary not found: ${bin}"
|
||||
done
|
||||
[ -n "${VAULT_ADDR:-}" ] \
|
||||
|| die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200"
|
||||
hvault_token_lookup >/dev/null \
|
||||
|| die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
|
||||
|
||||
# ── Step 1: ensure kv/ mount exists and is KV v2 ────────────────────────────
|
||||
log "── Step 1: ensure ${KV_MOUNT}/ is KV v2 ──"
|
||||
export DRY_RUN
|
||||
hvault_ensure_kv_v2 "$KV_MOUNT" "${LOG_TAG}" \
|
||||
|| die "KV mount check failed"
|
||||
|
||||
# ── Step 2: seed each bot role ───────────────────────────────────────────────
|
||||
total_generated=0
|
||||
|
||||
for role in "${BOT_ROLES[@]}"; do
|
||||
kv_logical="disinto/bots/${role}"
|
||||
kv_api="${KV_MOUNT}/data/${kv_logical}"
|
||||
|
||||
log "── seed ${kv_logical} ──"
|
||||
|
||||
existing_raw="$(hvault_get_or_empty "${kv_api}")" \
|
||||
|| die "failed to read ${kv_api}"
|
||||
|
||||
existing_token=""
|
||||
existing_pass=""
|
||||
existing_data="{}"
|
||||
if [ -n "$existing_raw" ]; then
|
||||
existing_data="$(printf '%s' "$existing_raw" | jq '.data.data // {}')"
|
||||
existing_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.token // ""')"
|
||||
existing_pass="$(printf '%s' "$existing_raw" | jq -r '.data.data.pass // ""')"
|
||||
fi
|
||||
|
||||
generated=()
|
||||
|
||||
if [ -z "$existing_token" ]; then
|
||||
generated+=("token")
|
||||
fi
|
||||
if [ -z "$existing_pass" ]; then
|
||||
generated+=("pass")
|
||||
fi
|
||||
|
||||
if [ "${#generated[@]}" -eq 0 ]; then
|
||||
log "${role}: unchanged"
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ "$DRY_RUN" -eq 1 ]; then
|
||||
log "[dry-run] ${role}: would generate ${generated[*]}"
|
||||
total_generated=$(( total_generated + ${#generated[@]} ))
|
||||
continue
|
||||
fi
|
||||
|
||||
desired_token="$existing_token"
|
||||
desired_pass="$existing_pass"
|
||||
|
||||
for key in "${generated[@]}"; do
|
||||
case "$key" in
|
||||
token) desired_token="$(openssl rand -hex "$TOKEN_BYTES")" ;;
|
||||
pass) desired_pass="$(openssl rand -hex "$PASS_BYTES")" ;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Merge new keys into existing data to preserve any keys we don't own.
|
||||
payload="$(printf '%s' "$existing_data" \
|
||||
| jq --arg t "$desired_token" --arg p "$desired_pass" \
|
||||
'{data: (. + {token: $t, pass: $p})}')"
|
||||
|
||||
_hvault_request POST "${kv_api}" "$payload" >/dev/null \
|
||||
|| die "failed to write ${kv_api}"
|
||||
|
||||
log "${role}: generated ${generated[*]}"
|
||||
total_generated=$(( total_generated + ${#generated[@]} ))
|
||||
done
|
||||
|
||||
# Final summary. total_generated is also incremented on the --dry-run path
# (for keys that WOULD be generated), so report it as planned work there
# instead of claiming keys were seeded when nothing was written.
if [ "$total_generated" -eq 0 ]; then
  log "all bot paths already seeded — no-op"
elif [ "$DRY_RUN" -eq 1 ]; then
  log "[dry-run] done — ${total_generated} key(s) would be seeded across ${#BOT_ROLES[@]} bot paths"
else
  log "done — ${total_generated} key(s) seeded across ${#BOT_ROLES[@]} bot paths"
fi
|
||||
76
vault/policies/service-agents.hcl
Normal file
76
vault/policies/service-agents.hcl
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
# vault/policies/service-agents.hcl
|
||||
#
|
||||
# Composite policy for the `agents` Nomad job (S4.1, issue #955).
|
||||
# Grants read access to all 7 agent-role bot KV namespaces, the vault bot,
# and shared forge config,
|
||||
# so a single job running all agent roles can pull per-bot tokens from
|
||||
# Vault via workload identity.
|
||||
|
||||
# ── Per-bot KV paths (token + pass per role) ─────────────────────────────────
|
||||
path "kv/data/disinto/bots/dev" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "kv/metadata/disinto/bots/dev" {
|
||||
capabilities = ["list", "read"]
|
||||
}
|
||||
|
||||
path "kv/data/disinto/bots/review" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "kv/metadata/disinto/bots/review" {
|
||||
capabilities = ["list", "read"]
|
||||
}
|
||||
|
||||
path "kv/data/disinto/bots/gardener" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "kv/metadata/disinto/bots/gardener" {
|
||||
capabilities = ["list", "read"]
|
||||
}
|
||||
|
||||
path "kv/data/disinto/bots/architect" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "kv/metadata/disinto/bots/architect" {
|
||||
capabilities = ["list", "read"]
|
||||
}
|
||||
|
||||
path "kv/data/disinto/bots/planner" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "kv/metadata/disinto/bots/planner" {
|
||||
capabilities = ["list", "read"]
|
||||
}
|
||||
|
||||
path "kv/data/disinto/bots/predictor" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "kv/metadata/disinto/bots/predictor" {
|
||||
capabilities = ["list", "read"]
|
||||
}
|
||||
|
||||
path "kv/data/disinto/bots/supervisor" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "kv/metadata/disinto/bots/supervisor" {
|
||||
capabilities = ["list", "read"]
|
||||
}
|
||||
|
||||
path "kv/data/disinto/bots/vault" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
||||
path "kv/metadata/disinto/bots/vault" {
|
||||
capabilities = ["list", "read"]
|
||||
}
|
||||
|
||||
# ── Shared forge config (URL, bot usernames) ─────────────────────────────────
|
||||
path "kv/data/disinto/shared/forge" {
|
||||
capabilities = ["read"]
|
||||
}
|
||||
|
|
@ -62,6 +62,14 @@ roles:
|
|||
namespace: default
|
||||
job_id: woodpecker-agent
|
||||
|
||||
# ── Agents composite (nomad/jobs/agents.hcl — S4.1) ──────────────────────
|
||||
# Single job running all 7 agent roles. Uses a composite policy
|
||||
# (vault/policies/service-agents.hcl) that unions all bot KV paths.
|
||||
- name: service-agents
|
||||
policy: service-agents
|
||||
namespace: default
|
||||
job_id: agents
|
||||
|
||||
# ── Per-agent bots (nomad/jobs/bot-<role>.hcl — land in later steps) ───────
|
||||
# job_id placeholders match the policy name 1:1 until each bot's jobspec
|
||||
# lands. When a bot's jobspec is added under nomad/jobs/, update the
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue