Compare commits

..

1 commit

Author SHA1 Message Date
dev-qwen2
a8d18aa3a3 fix: [nomad-step-2] S2.5 — bin/disinto init --import-env / --import-sops / --age-key wire-up (#883)
Some checks failed
ci/woodpecker/push/ci Pipeline was successful
ci/woodpecker/push/nomad-validate Pipeline failed
ci/woodpecker/pr/ci Pipeline was successful
ci/woodpecker/pr/nomad-validate Pipeline failed
ci/woodpecker/pr/smoke-init Pipeline was successful
2026-04-16 18:24:35 +00:00
76 changed files with 782 additions and 4281 deletions

View file

@ -32,10 +32,13 @@ FORGE_URL=http://localhost:3000 # [CONFIG] local Forgejo instance
# - FORGE_PASS_DEV_QWEN2
# Name conversion: tr 'a-z-' 'A-Z_' (lowercase→UPPER, hyphens→underscores).
# The compose generator looks these up via the agent's `forge_user` field in
# the project TOML. Configure local-model agents via [agents.X] sections in
# projects/*.toml — this is the canonical activation path.
# the project TOML. The pre-existing `dev-qwen` llama agent uses
# FORGE_TOKEN_LLAMA / FORGE_PASS_LLAMA (kept for backwards-compat with the
# legacy `ENABLE_LLAMA_AGENT=1` single-agent path).
FORGE_TOKEN= # [SECRET] dev-bot API token (default for all agents)
FORGE_PASS= # [SECRET] dev-bot password for git HTTP push (#361)
FORGE_TOKEN_LLAMA= # [SECRET] dev-qwen API token (for agents-llama)
FORGE_PASS_LLAMA= # [SECRET] dev-qwen password for git HTTP push
FORGE_REVIEW_TOKEN= # [SECRET] review-bot API token
FORGE_REVIEW_PASS= # [SECRET] review-bot password for git HTTP push
FORGE_PLANNER_TOKEN= # [SECRET] planner-bot API token
@ -104,6 +107,13 @@ FORWARD_AUTH_SECRET= # [SECRET] Shared secret for Caddy ↔
# Store all project secrets here so formulas reference env vars, never hardcode.
BASE_RPC_URL= # [SECRET] on-chain RPC endpoint
# ── Local Qwen dev agent (optional) ──────────────────────────────────────
# Set ENABLE_LLAMA_AGENT=1 to emit agents-llama in docker-compose.yml.
# Requires a running llama-server reachable at ANTHROPIC_BASE_URL.
# See docs/agents-llama.md for details.
ENABLE_LLAMA_AGENT=0 # [CONFIG] 1 = enable agents-llama service
ANTHROPIC_BASE_URL= # [CONFIG] e.g. http://host.docker.internal:8081
# ── Tuning ────────────────────────────────────────────────────────────────
CLAUDE_TIMEOUT=7200 # [CONFIG] max seconds per Claude invocation

1
.gitignore vendored
View file

@ -20,6 +20,7 @@ metrics/supervisor-metrics.jsonl
# OS
.DS_Store
dev/ci-fixes-*.json
gardener/dust.jsonl
# Individual encrypted secrets (managed by disinto secrets add)
secrets/

View file

@ -294,35 +294,6 @@ def main() -> int:
"9f6ae8e7811575b964279d8820494eb0": "Verification helper: for loop done pattern",
# Standard lib source block shared across formula-driven agent run scripts
"330e5809a00b95ade1a5fce2d749b94b": "Standard lib source block (env.sh, formula-session.sh, worktree.sh, guard.sh, agent-sdk.sh)",
# Common vault-seed script patterns: logging helpers + flag parsing
# Used in tools/vault-seed-woodpecker.sh + lib/init/nomad/wp-oauth-register.sh
"843a1cbf987952697d4e05e96ed2b2d5": "Logging helpers + DRY_RUN init (vault-seed-woodpecker + wp-oauth-register)",
"ee51df9642f2ef37af73b0c15f4d8406": "Logging helpers + DRY_RUN loop start (vault-seed-woodpecker + wp-oauth-register)",
"9a57368f3c1dfd29ec328596b86962a0": "Flag parsing loop + case start (vault-seed-woodpecker + wp-oauth-register)",
"9d72d40ff303cbed0b7e628fc15381c3": "Case loop + dry-run handler (vault-seed-woodpecker + wp-oauth-register)",
"5b52ddbbf47948e3cbc1b383f0909588": "Help + invalid arg handler end (vault-seed-woodpecker + wp-oauth-register)",
# Common vault-seed script preamble + precondition patterns
# Shared across tools/vault-seed-{forgejo,agents,woodpecker}.sh
"dff3675c151fcdbd2fef798826ae919b": "Vault-seed preamble: set -euo + path setup + source hvault.sh + KV_MOUNT",
"1cd9f0d083e24e6e6b2071db9b6dae09": "Vault-seed preconditions: binary check loop + VAULT_ADDR guard",
"63bfa88d71764c95c65a9a248f3e40ab": "Vault-seed preconditions: binary check end + VAULT_ADDR die",
"34873ad3570b211ce1d90468ab6ac94c": "Vault-seed preconditions: VAULT_ADDR die + hvault_token_lookup",
"71a52270f249e843cda48ad896d9f781": "Vault-seed preconditions: VAULT_ADDR + hvault_token_lookup + die",
# Common vault-seed script flag parsing patterns
# Shared across tools/vault-seed-{forgejo,ops-repo}.sh
"6906b7787796c2ccb8dd622e2ad4e7bf": "vault-seed DRY_RUN init + case pattern (forgejo + ops-repo)",
"a0df5283b616b964f8bc32fd99ec1b5a": "vault-seed case pattern start (forgejo + ops-repo)",
"e15e3272fdd9f0f46ce9e726aea9f853": "vault-seed case pattern dry-run handler (forgejo + ops-repo)",
"c9f22385cc49a3dac1d336bc14c6315b": "vault-seed DRY_RUN assignment (forgejo + ops-repo)",
"106f4071e88f841b3208b01144cd1c39": "vault-seed case pattern dry-run end (forgejo + ops-repo)",
"c15506dcb6bb340b25d1c39d442dd2e6": "vault-seed help text + invalid arg handler (forgejo + ops-repo)",
"1feecd3b3caf00045fae938ddf2811de": "vault-seed invalid arg handler (forgejo + ops-repo)",
"919780d5e7182715344f5aa02b191294": "vault-seed invalid arg + esac pattern (forgejo + ops-repo)",
"8dce1d292bce8e60ef4c0665b62945b0": "vault-seed esac + binary check loop (forgejo + ops-repo)",
"ca043687143a5b47bd54e65a99ce8ee8": "vault-seed binary check loop start (forgejo + ops-repo)",
"aefd9f655411a955395e6e5995ddbe6f": "vault-seed binary check pattern (forgejo + ops-repo)",
"60f0c46deb5491599457efb4048918e5": "vault-seed VAULT_ADDR + hvault_token_lookup check (forgejo + ops-repo)",
"f6838f581ef6b4d82b55268389032769": "vault-seed VAULT_ADDR + hvault_token_lookup die (forgejo + ops-repo)",
}
if not sh_files:

View file

@ -1,21 +1,16 @@
# =============================================================================
# .woodpecker/nomad-validate.yml — Static validation for Nomad+Vault artifacts
#
# Part of the Nomad+Vault migration (S0.5, issue #825; extended in S2.6,
# issue #884). Locks in the "no-ad-hoc-steps" principle: every HCL/shell
# artifact under nomad/, lib/init/nomad/, vault/policies/, plus the
# `disinto init` dispatcher and vault/roles.yaml, gets checked before it
# can land.
# Part of the Nomad+Vault migration (S0.5, issue #825). Locks in the
# "no-ad-hoc-steps" principle: every HCL/shell artifact under nomad/ or
# lib/init/nomad/, plus the `disinto init` dispatcher, gets checked
# before it can land.
#
# Triggers on PRs (and pushes) that touch any of:
# nomad/** — HCL configs (server, client, vault)
# lib/init/nomad/** — cluster-up / install / systemd / vault-init /
# vault-nomad-auth (S2.6 trigger: vault-*.sh
# is a subset of this glob)
# lib/init/nomad/** — cluster-up / install / systemd / vault-init
# bin/disinto — `disinto init --backend=nomad` dispatcher
# tests/disinto-init-nomad.bats — the bats suite itself
# vault/policies/** — Vault ACL policy HCL files (S2.1, S2.6)
# vault/roles.yaml — JWT-auth role bindings (S2.3, S2.6)
# .woodpecker/nomad-validate.yml — the pipeline definition
#
# Steps (all fail-closed — any error blocks merge):
@ -24,22 +19,8 @@
# nomad/jobs/*.hcl (new jobspecs get
# CI coverage automatically)
# 3. vault-operator-diagnose — `vault operator diagnose` syntax check on vault.hcl
# 4. vault-policy-fmt — `vault policy fmt` idempotence check on
# every vault/policies/*.hcl (format drift =
# CI fail; non-destructive via cp+diff)
# 5. vault-policy-validate — HCL syntax + capability validation for every
# vault/policies/*.hcl via `vault policy write`
# against an inline dev-mode Vault server
# 6. vault-roles-validate — yamllint + role→policy reference check on
# vault/roles.yaml (every referenced policy
# must exist as vault/policies/<name>.hcl)
# 7. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto
# 8. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests
#
# Secret-scan coverage: vault/policies/*.hcl is already scanned by the
# P11 gate (.woodpecker/secret-scan.yml, issue #798) — its trigger path
# `vault/**/*` covers everything under this directory. We intentionally
# do NOT duplicate that gate here; one scanner, one source of truth.
# 4. shellcheck-nomad — shellcheck the cluster-up + install scripts + disinto
# 5. bats-init-nomad — `disinto init --backend=nomad --dry-run` smoke tests
#
# Pinned image versions match lib/init/nomad/install.sh (nomad 1.9.5 /
# vault 1.18.5). Bump there AND here together — drift = CI passing on
@ -53,8 +34,6 @@ when:
- "lib/init/nomad/**"
- "bin/disinto"
- "tests/disinto-init-nomad.bats"
- "vault/policies/**"
- "vault/roles.yaml"
- ".woodpecker/nomad-validate.yml"
# Authenticated clone — same pattern as .woodpecker/ci.yml. Forgejo is
@ -144,176 +123,7 @@ steps:
*) echo "vault config: hard failure (rc=$rc)" >&2; exit "$rc" ;;
esac
# ── 4. Vault policy fmt idempotence check ────────────────────────────────
# `vault policy fmt <file>` formats a local HCL policy file in place.
# There's no `-check`/dry-run flag (vault 1.18.5), so we implement a
# non-destructive check as cp → fmt-on-copy → diff against original.
# Any diff means the committed file would be rewritten by `vault policy
# fmt` — failure steers the author to run `vault policy fmt <file>`
# locally before pushing.
#
# Scope: vault/policies/*.hcl only. The `[ -f "$f" ]` guard handles the
# no-match case (POSIX sh does not nullglob) so an empty policies/
# directory does not fail this step.
#
# Note: `vault policy fmt` is purely local (HCL text transform) and does
# not require a running Vault server, which is why this step can run
# without starting one.
- name: vault-policy-fmt
image: hashicorp/vault:1.18.5
commands:
- |
set -e
failed=0
for f in vault/policies/*.hcl; do
[ -f "$f" ] || continue
tmp="/tmp/$(basename "$f").fmt"
cp "$f" "$tmp"
vault policy fmt "$tmp" >/dev/null 2>&1
if ! diff -u "$f" "$tmp"; then
echo "ERROR: $f is not formatted — run 'vault policy fmt $f' locally" >&2
failed=1
fi
done
if [ "$failed" -gt 0 ]; then
echo "vault-policy-fmt: formatting drift detected" >&2
exit 1
fi
echo "vault-policy-fmt: all policies formatted correctly"
# ── 5. Vault policy HCL syntax + capability validation ───────────────────
# Vault has no offline `vault policy validate` subcommand — the closest
# in-CLI validator is `vault policy write`, which sends the HCL to a
# running server which parses it, checks capability names against the
# known set (read, list, create, update, delete, patch, sudo, deny),
# and rejects unknown stanzas / malformed path blocks. We start an
# inline dev-mode Vault (in-memory, no persistence, root token = "root")
# for the duration of this step and loop `vault policy write` over every
# vault/policies/*.hcl; the policies never leave the ephemeral dev
# server, so this is strictly a validator — not a deploy.
#
# Exit-code handling:
# - `vault policy write` exits 0 on success, non-zero on any parse /
# semantic error. We aggregate failures across all files so a single
# CI run surfaces every broken policy (not just the first).
# - The dev server is killed on any step exit via EXIT trap so the
# step tears down cleanly even on failure.
#
# Why dev-mode is sufficient: we're not persisting secrets, only asking
# Vault to parse policy text. The factory's production Vault is NOT
# contacted.
- name: vault-policy-validate
image: hashicorp/vault:1.18.5
commands:
- |
set -e
vault server -dev -dev-root-token-id=root -dev-listen-address=127.0.0.1:8200 >/tmp/vault-dev.log 2>&1 &
VAULT_PID=$!
trap 'kill "$VAULT_PID" 2>/dev/null || true' EXIT INT TERM
export VAULT_ADDR=http://127.0.0.1:8200
export VAULT_TOKEN=root
ready=0
i=0
while [ "$i" -lt 30 ]; do
if vault status >/dev/null 2>&1; then
ready=1
break
fi
i=$((i + 1))
sleep 0.5
done
if [ "$ready" -ne 1 ]; then
echo "vault-policy-validate: dev server failed to start after 15s" >&2
cat /tmp/vault-dev.log >&2 || true
exit 1
fi
failed=0
for f in vault/policies/*.hcl; do
[ -f "$f" ] || continue
name=$(basename "$f" .hcl)
echo "validate: $f"
if ! vault policy write "$name" "$f"; then
echo " ERROR: $f failed validation" >&2
failed=1
fi
done
if [ "$failed" -gt 0 ]; then
echo "vault-policy-validate: validation errors found" >&2
exit 1
fi
echo "vault-policy-validate: all policies valid"
# ── 6. vault/roles.yaml validator ────────────────────────────────────────
# Validates the JWT-auth role bindings file (S2.3). Two checks:
#
# a. `yamllint` — catches YAML syntax errors and indentation drift.
# Uses a relaxed config (line length bumped to 200) because
# roles.yaml's comments are wide by design.
# b. role → policy reference check — every role's `policy:` field
# must match a basename in vault/policies/*.hcl. A role pointing
# at a non-existent policy = runtime "permission denied" at job
# placement; catching the drift here turns it into a CI failure.
# Also verifies each role entry has the four required fields
# (name, policy, namespace, job_id) per the file's documented
# format.
#
# Parsing is done with PyYAML (the roles.yaml format is a strict
# subset that awk-level parsing in tools/vault-apply-roles.sh handles
# too, but PyYAML in CI gives us structural validation for free). If
# roles.yaml is ever absent (e.g. reverted), the step skips rather
# than fails — presence is enforced by S2.3's own tooling, not here.
- name: vault-roles-validate
image: python:3.12-alpine
commands:
- pip install --quiet --disable-pip-version-check pyyaml yamllint
- |
set -e
if [ ! -f vault/roles.yaml ]; then
echo "vault-roles-validate: vault/roles.yaml not present, skipping"
exit 0
fi
yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}' vault/roles.yaml
echo "vault-roles-validate: yamllint OK"
python3 - <<'PY'
import os
import sys
import yaml
with open('vault/roles.yaml') as f:
data = yaml.safe_load(f) or {}
roles = data.get('roles') or []
if not roles:
print("vault-roles-validate: no roles defined in vault/roles.yaml", file=sys.stderr)
sys.exit(1)
existing = {
os.path.splitext(e)[0]
for e in os.listdir('vault/policies')
if e.endswith('.hcl')
}
required = ('name', 'policy', 'namespace', 'job_id')
failed = 0
for r in roles:
if not isinstance(r, dict):
print(f"ERROR: role entry is not a mapping: {r!r}", file=sys.stderr)
failed = 1
continue
for field in required:
if r.get(field) in (None, ''):
print(f"ERROR: role entry missing required field '{field}': {r}", file=sys.stderr)
failed = 1
policy = r.get('policy')
if policy and policy not in existing:
print(
f"ERROR: role '{r.get('name')}' references policy '{policy}' "
f"but vault/policies/{policy}.hcl does not exist",
file=sys.stderr,
)
failed = 1
sys.exit(failed)
PY
echo "vault-roles-validate: all role→policy references valid"
# ── 7. Shellcheck ────────────────────────────────────────────────────────
# ── 4. Shellcheck ────────────────────────────────────────────────────────
# Covers the new lib/init/nomad/*.sh scripts plus bin/disinto (which owns
# the backend dispatcher). bin/disinto has no .sh extension so the
# repo-wide shellcheck in .woodpecker/ci.yml skips it — this step is the
@ -323,7 +133,7 @@ steps:
commands:
- shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto
# ── 8. bats: `disinto init --backend=nomad --dry-run` ────────────────────
# ── 5. bats: `disinto init --backend=nomad --dry-run` ────────────────────
# Smoke-tests the CLI dispatcher: both --backend=nomad variants exit 0
# with the expected step list, and --backend=docker stays on the docker
# path (regression guard). Pure dry-run — no sudo, no network.

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Disinto — Agent Instructions
## What this repo is
@ -37,20 +37,19 @@ disinto/ (code repo)
│ examples/ — example vault action TOMLs (promote, publish, release, webhook-call)
├── lib/ env.sh, agent-sdk.sh, ci-helpers.sh, ci-debug.sh, load-project.sh, parse-deps.sh, guard.sh, mirrors.sh, pr-lifecycle.sh, issue-lifecycle.sh, worktree.sh, formula-session.sh, stack-lock.sh, forge-setup.sh, forge-push.sh, ops-setup.sh, ci-setup.sh, generators.sh, hire-agent.sh, release.sh, build-graph.py, branch-protection.sh, secret-scan.sh, tea-helpers.sh, action-vault.sh, ci-log-reader.py, git-creds.sh, sprint-filer.sh, hvault.sh
│ hooks/ — Claude Code session hooks (on-compact-reinject, on-idle-stop, on-phase-change, on-pretooluse-guard, on-session-end, on-stop-failure)
│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825); wp-oauth-register.sh (Forgejo OAuth2 app + Vault KV seeder for Woodpecker, S3.3); deploy.sh (dependency-ordered Nomad job deploy + health-wait, S4)
├── nomad/ server.hcl, client.hcl (allow_privileged for woodpecker-agent, S3-fix-5), vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh
│ jobs/ — Nomad jobspecs: forgejo.hcl (Vault secrets via template, S2.4); woodpecker-server.hcl + woodpecker-agent.hcl (host-net, docker.sock, Vault KV, S3.1-S3.2); agents.hcl (7 roles, llama, Vault-templated bot tokens, S4.1); vault-runner.hcl (parameterized batch dispatch, S5.3); staging.hcl (Caddy file-server, dynamic port — edge discovers via service registration, S5.2); chat.hcl (Claude chat UI, tmpfs via mount block, Vault OAuth secrets, S5.2); edge.hcl (Caddy proxy + dispatcher sidecar, S5.1)
│ init/nomad/ — cluster-up.sh, install.sh, vault-init.sh, lib-systemd.sh (Nomad+Vault Step 0 installers, #821-#825)
├── nomad/ server.hcl, client.hcl, vault.hcl — HCL configs deployed to /etc/nomad.d/ and /etc/vault.d/ by lib/init/nomad/cluster-up.sh
│ jobs/ — Nomad jobspecs (forgejo.hcl reads Vault secrets via template stanza, S2.4)
├── projects/ *.toml.example — templates; *.toml — local per-box config (gitignored)
├── formulas/ Issue templates (TOML specs for multi-step agent tasks)
├── docker/ Dockerfiles and entrypoints: reproduce, triage, edge dispatcher, chat (server.py, entrypoint-chat.sh, Dockerfile, ui/)
├── tools/ Operational tools: edge-control/ (register.sh, install.sh, verify-chat-sandbox.sh)
│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh — Vault provisioning (S2.1/S2.2)
│ vault-seed-<svc>.sh — per-service Vault secret seeders; auto-invoked by `bin/disinto --with <svc>` (add a new file to support a new service)
│ vault-apply-policies.sh, vault-apply-roles.sh, vault-import.sh, vault-seed-forgejo.sh — Vault provisioning (S2.1/S2.2)
├── docs/ Protocol docs (PHASE-PROTOCOL.md, EVIDENCE-ARCHITECTURE.md)
├── site/ disinto.ai website content
├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, lib-generators.bats, vault-import.bats, disinto-init-nomad.bats)
├── tests/ Test files (mock-forgejo.py, smoke-init.sh, lib-hvault.bats, disinto-init-nomad.bats)
├── templates/ Issue templates
├── bin/ The `disinto` CLI script (`--with <svc>` deploys services + runs their Vault seeders)
├── bin/ The `disinto` CLI script
├── disinto-factory/ Setup documentation and skill
├── state/ Runtime state
├── .woodpecker/ Woodpecker CI pipeline configs
@ -123,7 +122,8 @@ bash dev/phase-test.sh
| Reproduce | `docker/reproduce/` | Bug reproduction using Playwright MCP | `formulas/reproduce.toml` |
| Triage | `docker/reproduce/` | Deep root cause analysis | `formulas/triage.toml` |
| Edge dispatcher | `docker/edge/` | Polls ops repo for vault actions, executes via Claude sessions | `docker/edge/dispatcher.sh` |
| Local-model agents | `docker/agents/` (same image) | Local llama-server agents configured via `[agents.X]` sections in project TOML | [docs/agents-llama.md](docs/agents-llama.md) |
| agents-llama | `docker/agents/` (same image) | Local-Qwen dev agent (`AGENT_ROLES=dev`), gated on `ENABLE_LLAMA_AGENT=1` | [docs/agents-llama.md](docs/agents-llama.md) |
| agents-llama-all | `docker/agents/` (same image) | Local-Qwen all-roles agent (all 7 roles), profile `agents-llama-all` | [docs/agents-llama.md](docs/agents-llama.md) |
> **Vault:** Being redesigned as a PR-based approval workflow (issues #73-#77).
> See [docs/VAULT.md](docs/VAULT.md) for the vault PR workflow details.

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Architect — Agent Instructions
## What this agent is

View file

@ -82,16 +82,16 @@ Init options:
--ci-id <n> Woodpecker CI repo ID (default: 0 = no CI)
--forge-url <url> Forge base URL (default: http://localhost:3000)
--backend <value> Orchestration backend: docker (default) | nomad
--with <services> (nomad) Deploy services: forgejo,woodpecker,agents,staging,chat,edge[,...] (S1.3, S3.4, S4.2, S5.2, S5.5)
--with <services> (nomad) Deploy services: forgejo[,...] (S1.3)
--empty (nomad) Bring up cluster only, no jobs (S0.4)
--bare Skip compose generation (bare-metal setup)
--build Use local docker build instead of registry images (dev mode)
--yes Skip confirmation prompts
--rotate-tokens Force regeneration of all bot tokens/passwords (idempotent by default)
--dry-run Print every intended action without executing
--import-env <path> (nomad) Path to .env file for import into Vault KV (S2.5)
--import-sops <path> (nomad) Path to sops-encrypted .env.vault.enc for import (S2.5)
--age-key <path> (nomad) Path to age keyfile (required with --import-sops) (S2.5)
--import-env <path> (nomad) Path to .env file for import into Vault KV
--import-sops <path> (nomad) Path to sops-encrypted .env.vault.enc for import
--age-key <path> (nomad) Path to age keyfile (required with --import-sops)
Hire an agent options:
--formula <path> Path to role formula TOML (default: formulas/<role>.toml)
@ -670,10 +670,9 @@ _disinto_init_nomad() {
local import_env="${4:-}" import_sops="${5:-}" age_key="${6:-}"
local cluster_up="${FACTORY_ROOT}/lib/init/nomad/cluster-up.sh"
local deploy_sh="${FACTORY_ROOT}/lib/init/nomad/deploy.sh"
local vault_engines_sh="${FACTORY_ROOT}/lib/init/nomad/vault-engines.sh"
local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh"
local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh"
local vault_import_sh="${FACTORY_ROOT}/tools/vault-import.sh"
local vault_auth_sh="${FACTORY_ROOT}/lib/init/nomad/vault-nomad-auth.sh"
local vault_policies_sh="${FACTORY_ROOT}/tools/vault-apply-policies.sh"
if [ ! -x "$cluster_up" ]; then
echo "Error: ${cluster_up} not found or not executable" >&2
@ -685,42 +684,6 @@ _disinto_init_nomad() {
exit 1
fi
# --empty short-circuits after cluster-up: no policies, no auth, no
# import, no deploy. It's the "cluster-only escape hatch" for debugging
# (docs/nomad-migration.md). Caller-side validation already rejects
# --empty combined with --with or any --import-* flag, so reaching
# this branch with those set is a bug in the caller.
#
# On the default (non-empty) path, vault-engines.sh (enables the kv/
# mount), vault-apply-policies.sh, and vault-nomad-auth.sh are invoked
# unconditionally — they are idempotent and cheap to re-run, and
# subsequent --with deployments depend on them. vault-import.sh is
# invoked only when an --import-* flag is set. vault-engines.sh runs
# first because every policy and role below references kv/disinto/*
# paths, which 403 if the engine is not yet mounted (issue #912).
local import_any=false
if [ -n "$import_env" ] || [ -n "$import_sops" ]; then
import_any=true
fi
if [ "$empty" != "true" ]; then
if [ ! -x "$vault_engines_sh" ]; then
echo "Error: ${vault_engines_sh} not found or not executable" >&2
exit 1
fi
if [ ! -x "$vault_policies_sh" ]; then
echo "Error: ${vault_policies_sh} not found or not executable" >&2
exit 1
fi
if [ ! -x "$vault_auth_sh" ]; then
echo "Error: ${vault_auth_sh} not found or not executable" >&2
exit 1
fi
if [ "$import_any" = true ] && [ ! -x "$vault_import_sh" ]; then
echo "Error: ${vault_import_sh} not found or not executable" >&2
exit 1
fi
fi
# --empty and default both invoke cluster-up today. Log the requested
# mode so the dispatch is visible in factory bootstrap logs — Step 1
# will branch on $empty to gate the job-deployment path.
@ -730,7 +693,7 @@ _disinto_init_nomad() {
echo "nomad backend: default (cluster-up; jobs deferred to Step 1)"
fi
# Dry-run: print cluster-up plan + policies/auth/import plan + deploy.sh plan
# Dry-run: print cluster-up plan + import plan + deploy.sh plan
if [ "$dry_run" = "true" ]; then
echo ""
echo "── Cluster-up dry-run ─────────────────────────────────"
@ -738,82 +701,46 @@ _disinto_init_nomad() {
"${cmd[@]}" || true
echo ""
# --empty skips policies/auth/import/deploy — cluster-up only, no
# workloads. The operator-visible dry-run plan must match the real
# run, so short-circuit here too.
if [ "$empty" = "true" ]; then
exit 0
fi
# Vault engines + policies + auth are invoked on every nomad real-run
# path regardless of --import-* flags (they're idempotent; S2.1 + S2.3).
# Engines runs first because policies/roles/templates all reference the
# kv/ mount it enables (issue #912). Mirror that ordering in the
# dry-run plan so the operator sees the full sequence Step 2 will
# execute.
echo "── Vault engines dry-run ──────────────────────────────"
echo "[engines] [dry-run] ${vault_engines_sh} --dry-run"
echo ""
echo "── Vault policies dry-run ─────────────────────────────"
echo "[policies] [dry-run] ${vault_policies_sh} --dry-run"
echo ""
echo "── Vault auth dry-run ─────────────────────────────────"
echo "[auth] [dry-run] ${vault_auth_sh}"
echo ""
# Import plan: one line per --import-* flag that is actually set.
# Printing independently (not in an if/elif chain) means that all
# three flags appearing together each echo their own path — the
# regression that bit prior implementations of this issue (#883).
if [ "$import_any" = true ]; then
# Import plan if any import flags are set
if [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; then
echo "── Vault import dry-run ───────────────────────────────"
[ -n "$import_env" ] && echo "[import] --import-env env file: ${import_env}"
[ -n "$import_sops" ] && echo "[import] --import-sops sops file: ${import_sops}"
[ -n "$age_key" ] && echo "[import] --age-key age key: ${age_key}"
local -a import_dry_cmd=("$vault_import_sh")
[ -n "$import_env" ] && import_dry_cmd+=("--env" "$import_env")
[ -n "$import_sops" ] && import_dry_cmd+=("--sops" "$import_sops")
[ -n "$age_key" ] && import_dry_cmd+=("--age-key" "$age_key")
import_dry_cmd+=("--dry-run")
echo "[import] [dry-run] ${import_dry_cmd[*]}"
if [ -n "$import_env" ]; then
echo "[import] --import-env: ${import_env}"
fi
if [ -n "$import_sops" ]; then
echo "[import] --import-sops: ${import_sops}"
fi
if [ -n "$age_key" ]; then
echo "[import] --age-key: ${age_key}"
fi
echo "[import] [dry-run] ${vault_import_sh} --dry-run"
echo "[import] [dry-run] vault import plan printed above"
echo ""
echo "── Vault policies dry-run ─────────────────────────────"
echo "[policies] [dry-run] ${vault_policies_sh} --dry-run"
echo ""
echo "── Vault auth dry-run ─────────────────────────────────"
echo "[auth] [dry-run] ${vault_auth_sh}"
echo ""
else
echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services"
echo "[import] no --import-env/--import-sops - skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services"
echo ""
fi
if [ -n "$with_services" ]; then
# Interleaved seed/deploy per service (S2.6, #928, #948): match the
# real-run path so dry-run output accurately represents execution order.
# Build ordered deploy list: only include services present in with_services
local DEPLOY_ORDER=""
for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do
if echo ",$with_services," | grep -q ",$ordered_svc,"; then
DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}"
fi
done
local IFS=' '
echo "[deploy] deployment order: ${DEPLOY_ORDER}"
for svc in $DEPLOY_ORDER; do
# Seed this service (if seed script exists)
local seed_name="$svc"
echo "── Deploy services dry-run ────────────────────────────"
echo "[deploy] services to deploy: ${with_services}"
local IFS=','
for svc in $with_services; do
svc=$(echo "$svc" | xargs) # trim whitespace
# Validate known services first
case "$svc" in
woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;;
agents) seed_name="agents" ;;
chat) seed_name="chat" ;;
edge) seed_name="ops-repo" ;;
forgejo) ;;
*)
echo "Error: unknown service '${svc}' — known: forgejo" >&2
exit 1
;;
esac
local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh"
if [ -x "$seed_script" ]; then
echo "── Vault seed dry-run ─────────────────────────────────"
echo "[seed] [dry-run] ${seed_script} --dry-run"
echo ""
fi
# Deploy this service
echo "── Deploy services dry-run ────────────────────────────"
echo "[deploy] services to deploy: ${with_services}"
local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl"
if [ ! -f "$jobspec_path" ]; then
echo "Error: jobspec not found: ${jobspec_path}" >&2
@ -824,36 +751,10 @@ _disinto_init_nomad() {
done
echo "[deploy] dry-run complete"
fi
# Dry-run vault-runner (unconditionally, not gated by --with)
echo ""
echo "── Vault-runner dry-run ───────────────────────────────────"
local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl"
if [ -f "$vault_runner_path" ]; then
echo "[deploy] vault-runner: [dry-run] nomad job validate ${vault_runner_path}"
echo "[deploy] vault-runner: [dry-run] nomad job run -detach ${vault_runner_path}"
else
echo "[deploy] vault-runner: jobspec not found, skipping"
fi
# Build custom images dry-run (if agents, chat, or edge services are included)
if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then
echo ""
echo "── Build images dry-run ──────────────────────────────"
if echo ",$with_services," | grep -q ",agents,"; then
echo "[build] [dry-run] docker build -t disinto/agents:local -f ${FACTORY_ROOT}/docker/agents/Dockerfile ${FACTORY_ROOT}"
fi
if echo ",$with_services," | grep -q ",chat,"; then
echo "[build] [dry-run] docker build -t disinto/chat:local -f ${FACTORY_ROOT}/docker/chat/Dockerfile ${FACTORY_ROOT}/docker/chat"
fi
if echo ",$with_services," | grep -q ",edge,"; then
echo "[build] [dry-run] docker build -t disinto/edge:local -f ${FACTORY_ROOT}/docker/edge/Dockerfile ${FACTORY_ROOT}/docker/edge"
fi
fi
exit 0
fi
# Real run: cluster-up + policies + auth + (optional) import + deploy
# Real run: cluster-up + import + deploy services
local -a cluster_cmd=("$cluster_up")
if [ "$(id -u)" -eq 0 ]; then
"${cluster_cmd[@]}" || exit $?
@ -865,64 +766,48 @@ _disinto_init_nomad() {
sudo -n -- "${cluster_cmd[@]}" || exit $?
fi
# --empty short-circuits here: cluster-up only, no policies/auth/import
# and no deploy. Matches the dry-run plan above and the docs/runbook.
if [ "$empty" = "true" ]; then
exit 0
fi
# Enable Vault secret engines (S2.1 / issue #912) — must precede
# policies/auth/import because every policy and every import target
# addresses paths under kv/. Idempotent, safe to re-run.
# Apply Vault policies (S2.1)
echo ""
echo "── Enabling Vault secret engines ──────────────────────"
local -a engines_cmd=("$vault_engines_sh")
echo "── Applying Vault policies ─────────────────────────────"
if [ "$(id -u)" -eq 0 ]; then
"${engines_cmd[@]}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-engines.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${engines_cmd[@]}" || exit $?
fi
# Apply Vault policies (S2.1) — idempotent, safe to re-run.
echo ""
echo "── Applying Vault policies ────────────────────────────"
local -a policies_cmd=("$vault_policies_sh")
if [ "$(id -u)" -eq 0 ]; then
"${policies_cmd[@]}" || exit $?
"${vault_policies_sh}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-apply-policies.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${policies_cmd[@]}" || exit $?
sudo -n -- "${vault_policies_sh}" || exit $?
fi
# Configure Vault JWT auth + Nomad workload identity (S2.3) — idempotent.
# Configure Vault JWT auth (S2.3)
echo ""
echo "── Configuring Vault JWT auth ─────────────────────────"
local -a auth_cmd=("$vault_auth_sh")
echo "── Configuring Vault JWT auth ──────────────────────────"
if [ "$(id -u)" -eq 0 ]; then
"${auth_cmd[@]}" || exit $?
"${vault_auth_sh}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-nomad-auth.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${auth_cmd[@]}" || exit $?
sudo -n -- "${vault_auth_sh}" || exit $?
fi
# Import secrets if any --import-* flag is set (S2.2).
if [ "$import_any" = true ]; then
# Import secrets if import flags are set (S2.2)
if [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; then
echo ""
echo "── Importing secrets into Vault ───────────────────────"
echo "── Importing secrets into Vault ───────────────────────"
local -a import_cmd=("$vault_import_sh")
[ -n "$import_env" ] && import_cmd+=("--env" "$import_env")
[ -n "$import_sops" ] && import_cmd+=("--sops" "$import_sops")
[ -n "$age_key" ] && import_cmd+=("--age-key" "$age_key")
if [ -n "$import_env" ]; then
import_cmd+=("--env" "$import_env")
fi
if [ -n "$import_sops" ]; then
import_cmd+=("--sops" "$import_sops")
fi
if [ -n "$age_key" ]; then
import_cmd+=("--age-key" "$age_key")
fi
if [ "$(id -u)" -eq 0 ]; then
"${import_cmd[@]}" || exit $?
else
@ -933,166 +818,62 @@ _disinto_init_nomad() {
sudo -n -- "${import_cmd[@]}" || exit $?
fi
else
echo ""
echo "[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services"
echo "[import] no --import-env/--import-sops - skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services"
fi
# Build custom images required by Nomad jobs (S4.2, S5.2, S5.5) — before deploy.
# Single-node factory dev box: no multi-node pull needed, no registry auth.
# Can upgrade to approach B (registry push/pull) later if multi-node.
if echo ",$with_services," | grep -qE ",(agents|chat|edge),"; then
echo ""
echo "── Building custom images ─────────────────────────────"
if echo ",$with_services," | grep -q ",agents,"; then
local tag="disinto/agents:local"
echo "── Building $tag ─────────────────────────────"
docker build -t "$tag" -f "${FACTORY_ROOT}/docker/agents/Dockerfile" "${FACTORY_ROOT}" 2>&1 | tail -5
fi
if echo ",$with_services," | grep -q ",chat,"; then
local tag="disinto/chat:local"
echo "── Building $tag ─────────────────────────────"
docker build -t "$tag" -f "${FACTORY_ROOT}/docker/chat/Dockerfile" "${FACTORY_ROOT}/docker/chat" 2>&1 | tail -5
fi
if echo ",$with_services," | grep -q ",edge,"; then
local tag="disinto/edge:local"
echo "── Building $tag ─────────────────────────────"
docker build -t "$tag" -f "${FACTORY_ROOT}/docker/edge/Dockerfile" "${FACTORY_ROOT}/docker/edge" 2>&1 | tail -5
fi
fi
# Interleaved seed/deploy per service (S2.6, #928, #948).
# We interleave seed + deploy per service (not batch all seeds then all deploys)
# so that OAuth-dependent services can reach their dependencies during seeding.
# E.g., seed-forgejo → deploy-forgejo → seed-woodpecker (OAuth can now reach
# running forgejo) → deploy-woodpecker.
# Deploy services if requested
if [ -n "$with_services" ]; then
local vault_addr="${VAULT_ADDR:-http://127.0.0.1:8200}"
# Build ordered deploy list (S3.4, S4.2, S5.2, S5.5): forgejo → woodpecker-server → woodpecker-agent → agents → staging → chat → edge
local DEPLOY_ORDER=""
for ordered_svc in forgejo woodpecker-server woodpecker-agent agents staging chat edge; do
if echo ",$with_services," | grep -q ",$ordered_svc,"; then
DEPLOY_ORDER="${DEPLOY_ORDER:+${DEPLOY_ORDER} }${ordered_svc}"
echo ""
echo "── Deploying services ─────────────────────────────────"
local -a deploy_cmd=("$deploy_sh")
# Split comma-separated service list into positional args
local IFS=','
for svc in $with_services; do
svc=$(echo "$svc" | xargs) # trim whitespace
if ! echo "$svc" | grep -qE '^[a-zA-Z0-9_-]+$'; then
echo "Error: invalid service name '${svc}' — must match ^[a-zA-Z0-9_-]+$" >&2
exit 1
fi
done
local IFS=' '
for svc in $DEPLOY_ORDER; do
# Seed this service (if seed script exists)
local seed_name="$svc"
# Validate known services FIRST (before jobspec check)
case "$svc" in
woodpecker-server|woodpecker-agent) seed_name="woodpecker" ;;
agents) seed_name="agents" ;;
chat) seed_name="chat" ;;
edge) seed_name="ops-repo" ;;
esac
local seed_script="${FACTORY_ROOT}/tools/vault-seed-${seed_name}.sh"
if [ -x "$seed_script" ]; then
echo ""
echo "── Seeding Vault for ${seed_name} ───────────────────────────"
if [ "$(id -u)" -eq 0 ]; then
VAULT_ADDR="$vault_addr" "$seed_script" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: vault-seed-${seed_name}.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- env "VAULT_ADDR=$vault_addr" "$seed_script" || exit $?
fi
fi
# Deploy this service
echo ""
echo "── Deploying ${svc} ───────────────────────────────────────"
# Seed host volumes before deployment (if needed)
case "$svc" in
staging)
# Seed site-content host volume (/srv/disinto/docker) with static content
# The staging jobspec mounts this volume read-only to /srv/site
local site_content_src="${FACTORY_ROOT}/docker/index.html"
local site_content_dst="/srv/disinto/docker"
if [ -f "$site_content_src" ] && [ -d "$site_content_dst" ]; then
if ! cmp -s "$site_content_src" "${site_content_dst}/index.html" 2>/dev/null; then
echo "[staging] seeding site-content volume..."
cp "$site_content_src" "${site_content_dst}/index.html"
fi
fi
forgejo) ;;
*)
echo "Error: unknown service '${svc}' — known: forgejo" >&2
exit 1
;;
esac
# Check jobspec exists
local jobspec_path="${FACTORY_ROOT}/nomad/jobs/${svc}.hcl"
if [ ! -f "$jobspec_path" ]; then
echo "Error: jobspec not found: ${jobspec_path}" >&2
exit 1
fi
local -a deploy_cmd=("$deploy_sh" "$svc")
if [ "$(id -u)" -eq 0 ]; then
"${deploy_cmd[@]}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: deploy.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${deploy_cmd[@]}" || exit $?
fi
deploy_cmd+=("$svc")
done
# Run vault-runner (unconditionally, not gated by --with) — infrastructure job
# vault-runner is always present since it's needed for vault action dispatch
echo ""
echo "── Running vault-runner ────────────────────────────────────"
local vault_runner_path="${FACTORY_ROOT}/nomad/jobs/vault-runner.hcl"
if [ -f "$vault_runner_path" ]; then
echo "[deploy] vault-runner: running Nomad job (infrastructure)"
local -a vault_runner_cmd=("$deploy_sh" "vault-runner")
if [ "$(id -u)" -eq 0 ]; then
"${vault_runner_cmd[@]}" || exit $?
else
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: deploy.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${vault_runner_cmd[@]}" || exit $?
fi
if [ "$(id -u)" -eq 0 ]; then
"${deploy_cmd[@]}" || exit $?
else
echo "[deploy] vault-runner: jobspec not found, skipping"
if ! command -v sudo >/dev/null 2>&1; then
echo "Error: deploy.sh must run as root and sudo is not installed" >&2
exit 1
fi
sudo -n -- "${deploy_cmd[@]}" || exit $?
fi
# Print final summary
echo ""
echo "── Summary ────────────────────────────────────────────"
echo "Cluster: Nomad+Vault cluster is up"
echo "Policies: applied (Vault ACL)"
echo "Auth: Vault JWT auth + Nomad workload identity configured"
if [ "$import_any" = true ]; then
local import_desc=""
[ -n "$import_env" ] && import_desc+="${import_env} "
[ -n "$import_sops" ] && import_desc+="${import_sops} "
echo "Imported: ${import_desc% }"
if [ -n "$import_env" ] || [ -n "$import_sops" ]; then
echo "Imported: secrets from ${import_env:+$import_env }${import_sops:+${import_sops} }"
else
echo "Imported: (none — seed kv/disinto/* manually before deploying secret-dependent services)"
echo "Imported: (none — secrets must be seeded manually)"
fi
echo "Deployed: ${with_services}"
if echo ",$with_services," | grep -q ",forgejo,"; then
if echo "$with_services" | grep -q "forgejo"; then
echo "Ports: forgejo: 3000"
fi
if echo ",$with_services," | grep -q ",woodpecker-server,"; then
echo " woodpecker-server: 8000"
fi
if echo ",$with_services," | grep -q ",woodpecker-agent,"; then
echo " woodpecker-agent: (agent connected)"
fi
if echo ",$with_services," | grep -q ",agents,"; then
echo " agents: (polling loop running)"
fi
if echo ",$with_services," | grep -q ",staging,"; then
echo " staging: (internal, no external port)"
fi
if echo ",$with_services," | grep -q ",chat,"; then
echo " chat: 8080"
fi
echo "────────────────────────────────────────────────────────"
fi
@ -1114,8 +895,7 @@ disinto_init() {
fi
# Parse flags
local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services=""
local import_env="" import_sops="" age_key=""
local branch="" repo_root="" ci_id="0" auto_yes=false forge_url_flag="" bare=false rotate_tokens=false use_build=false dry_run=false backend="docker" empty=false with_services="" import_env="" import_sops="" age_key=""
while [ $# -gt 0 ]; do
case "$1" in
--branch) branch="$2"; shift 2 ;;
@ -1133,11 +913,8 @@ disinto_init() {
--rotate-tokens) rotate_tokens=true; shift ;;
--dry-run) dry_run=true; shift ;;
--import-env) import_env="$2"; shift 2 ;;
--import-env=*) import_env="${1#--import-env=}"; shift ;;
--import-sops) import_sops="$2"; shift 2 ;;
--import-sops=*) import_sops="${1#--import-sops=}"; shift ;;
--age-key) age_key="$2"; shift 2 ;;
--age-key=*) age_key="${1#--age-key=}"; shift ;;
*) echo "Unknown option: $1" >&2; exit 1 ;;
esac
done
@ -1178,104 +955,32 @@ disinto_init() {
exit 1
fi
# Normalize --with services (S3.4): expand 'woodpecker' shorthand to
# 'woodpecker-server,woodpecker-agent', auto-include forgejo when
# woodpecker is requested (OAuth dependency), and validate all names.
if [ -n "$with_services" ]; then
# Expand 'woodpecker' (bare) → 'woodpecker-server,woodpecker-agent'.
# Must not match already-expanded 'woodpecker-server'/'woodpecker-agent'.
local expanded=""
local IFS=','
for _svc in $with_services; do
_svc=$(echo "$_svc" | xargs)
case "$_svc" in
woodpecker) _svc="woodpecker-server,woodpecker-agent" ;;
agents) _svc="agents" ;;
esac
expanded="${expanded:+${expanded},}${_svc}"
done
with_services="$expanded"
unset IFS
# Auto-include forgejo when woodpecker is requested
if echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent," \
&& ! echo ",$with_services," | grep -q ",forgejo,"; then
echo "Note: --with woodpecker implies --with forgejo (OAuth dependency)"
with_services="forgejo,${with_services}"
fi
# Auto-include forgejo and woodpecker when agents is requested
if echo ",$with_services," | grep -q ",agents,"; then
if ! echo ",$with_services," | grep -q ",forgejo,"; then
echo "Note: --with agents implies --with forgejo (agents need forge)"
with_services="forgejo,${with_services}"
fi
if ! echo ",$with_services," | grep -q ",woodpecker-server,\|,woodpecker-agent,"; then
echo "Note: --with agents implies --with woodpecker (agents need CI)"
with_services="${with_services},woodpecker-server,woodpecker-agent"
fi
fi
# Auto-include all dependencies when edge is requested (S5.5)
if echo ",$with_services," | grep -q ",edge,"; then
# Edge depends on all backend services
for dep in forgejo woodpecker-server woodpecker-agent agents staging chat; do
if ! echo ",$with_services," | grep -q ",${dep},"; then
echo "Note: --with edge implies --with ${dep} (edge depends on all backend services)"
with_services="${with_services},${dep}"
fi
done
fi
# Validate all service names are known
local IFS=','
for _svc in $with_services; do
_svc=$(echo "$_svc" | xargs)
case "$_svc" in
forgejo|woodpecker-server|woodpecker-agent|agents|staging|chat|edge) ;;
*)
echo "Error: unknown service '${_svc}' — known: forgejo, woodpecker-server, woodpecker-agent, agents, staging, chat, edge" >&2
exit 1
;;
esac
done
unset IFS
fi
# --import-* flag validation (S2.5). These three flags form an import
# triple and must be consistent before dispatch: sops encryption is
# useless without the age key to decrypt it, so either both --import-sops
# and --age-key are present or neither is. --import-env alone is fine
# (it just imports the plaintext dotenv). All three flags are nomad-only.
# Import flags validation
# --import-sops requires --age-key
if [ -n "$import_sops" ] && [ -z "$age_key" ]; then
echo "Error: --import-sops requires --age-key" >&2
exit 1
fi
# --age-key requires --import-sops
if [ -n "$age_key" ] && [ -z "$import_sops" ]; then
echo "Error: --age-key requires --import-sops" >&2
exit 1
fi
if { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; } \
&& [ "$backend" != "nomad" ]; then
echo "Error: --import-env, --import-sops, and --age-key require --backend=nomad" >&2
exit 1
fi
# --empty is the cluster-only escape hatch — it skips policies, auth,
# import, and deploy. Pairing it with --import-* silently does nothing,
# which is a worse failure mode than a clear error. Reject explicitly.
if [ "$empty" = true ] \
&& { [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; }; then
echo "Error: --empty and --import-env/--import-sops/--age-key are mutually exclusive" >&2
exit 1
# --import-* flags require --backend=nomad
if [ -n "$import_env" ] || [ -n "$import_sops" ] || [ -n "$age_key" ]; then
if [ "$backend" != "nomad" ]; then
echo "Error: --import-env, --import-sops, and --age-key require --backend=nomad" >&2
exit 1
fi
fi
# Dispatch on backend — the nomad path runs lib/init/nomad/cluster-up.sh
# (S0.4). The default and --empty variants are identical today; Step 1
# will branch on $empty to add job deployment to the default path.
if [ "$backend" = "nomad" ]; then
_disinto_init_nomad "$dry_run" "$empty" "$with_services" \
"$import_env" "$import_sops" "$age_key"
_disinto_init_nomad "$dry_run" "$empty" "$with_services" "$import_env" "$import_sops" "$age_key"
# shellcheck disable=SC2317 # _disinto_init_nomad always exits today;
# `return` is defensive against future refactors.
return
@ -1389,6 +1094,7 @@ p.write_text(text)
echo ""
echo "[ensure] Forgejo admin user 'disinto-admin'"
echo "[ensure] 8 bot users: dev-bot, review-bot, planner-bot, gardener-bot, vault-bot, supervisor-bot, predictor-bot, architect-bot"
echo "[ensure] 2 llama bot users: dev-qwen, dev-qwen-nightly"
echo "[ensure] .profile repos for all bots"
echo "[ensure] repo ${forge_repo} on Forgejo with collaborators"
echo "[run] preflight checks"
@ -1428,7 +1134,7 @@ p.write_text(text)
echo "[ensure] CLAUDE_CONFIG_DIR"
echo "[ensure] state files (.dev-active, .reviewer-active, .gardener-active)"
echo ""
echo "Dry run complete no changes made."
echo "Dry run complete - no changes made."
exit 0
fi
@ -1584,6 +1290,19 @@ p.write_text(text)
echo "Config: CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 saved to .env"
fi
# Write local-Qwen dev agent env keys with safe defaults (#769)
if ! grep -q '^ENABLE_LLAMA_AGENT=' "$env_file" 2>/dev/null; then
cat >> "$env_file" <<'LLAMAENVEOF'
# Local Qwen dev agent (optional) — set to 1 to enable
ENABLE_LLAMA_AGENT=0
FORGE_TOKEN_LLAMA=
FORGE_PASS_LLAMA=
ANTHROPIC_BASE_URL=
LLAMAENVEOF
echo "Config: ENABLE_LLAMA_AGENT keys written to .env (disabled by default)"
fi
# Create labels on remote
create_labels "$forge_repo" "$forge_url"

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Dev Agent
**Role**: Implement issues autonomously — write code, push branches, address

View file

@ -15,6 +15,7 @@ services:
- project-repos:/home/agent/repos
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
- ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
- ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
- woodpecker-data:/woodpecker-data:ro
@ -77,6 +78,7 @@ services:
- project-repos:/home/agent/repos
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
- ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
- ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
- woodpecker-data:/woodpecker-data:ro
@ -137,6 +139,7 @@ services:
- project-repos:/home/agent/repos
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
- ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
- ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
- woodpecker-data:/woodpecker-data:ro

View file

@ -1,26 +1,21 @@
FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y --no-install-recommends \
bash curl git jq tmux nodejs npm python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \
bash curl git jq tmux python3 python3-pip openssh-client ca-certificates age shellcheck procps gosu \
&& pip3 install --break-system-packages networkx tomlkit \
&& rm -rf /var/lib/apt/lists/*
# Pre-built binaries (copied from docker/agents/bin/)
# SOPS — encrypted data decryption tool
# Download sops binary (replaces manual COPY of vendored binary)
ARG SOPS_VERSION=3.9.4
RUN curl -fsSL "https://github.com/getsops/sops/releases/download/v${SOPS_VERSION}/sops-v${SOPS_VERSION}.linux.amd64" \
-o /usr/local/bin/sops && chmod +x /usr/local/bin/sops
COPY docker/agents/bin/sops /usr/local/bin/sops
RUN chmod +x /usr/local/bin/sops
# tea CLI — official Gitea/Forgejo CLI for issue/label/comment operations
# Download tea binary (replaces manual COPY of vendored binary)
ARG TEA_VERSION=0.9.2
RUN curl -fsSL "https://dl.gitea.com/tea/${TEA_VERSION}/tea-${TEA_VERSION}-linux-amd64" \
-o /usr/local/bin/tea && chmod +x /usr/local/bin/tea
COPY docker/agents/bin/tea /usr/local/bin/tea
RUN chmod +x /usr/local/bin/tea
# Install Claude Code CLI — agent runtime for all LLM backends (llama, Claude API).
# The CLI is the execution environment; ANTHROPIC_BASE_URL selects the model provider.
RUN npm install -g @anthropic-ai/claude-code@2.1.84
# Claude CLI is mounted from the host via docker-compose volume.
# No internet access to cli.anthropic.com required at build time.
# Non-root user
RUN useradd -m -u 1000 -s /bin/bash agent

View file

@ -17,38 +17,6 @@ set -euo pipefail
# - predictor: every 24 hours (288 iterations * 5 min)
# - supervisor: every SUPERVISOR_INTERVAL seconds (default: 1200 = 20 min)
# ── Migration check: reject ENABLE_LLAMA_AGENT ───────────────────────────────
# #846: The legacy ENABLE_LLAMA_AGENT env flag is no longer supported.
# Activation is now done exclusively via [agents.X] sections in project TOML.
# If this legacy flag is detected, fail immediately with a migration message.
if [ "${ENABLE_LLAMA_AGENT:-}" = "1" ]; then
cat <<'MIGRATION_ERR'
FATAL: ENABLE_LLAMA_AGENT is no longer supported.
The legacy ENABLE_LLAMA_AGENT=1 flag has been removed (#846).
Activation is now done exclusively via [agents.X] sections in projects/*.toml.
To migrate:
1. Remove ENABLE_LLAMA_AGENT from your .env or .env.enc file
2. Add an [agents.<name>] section to your project TOML:
[agents.dev-qwen]
base_url = "http://your-llama-server:8081"
model = "unsloth/Qwen3.5-35B-A3B"
api_key = "sk-no-key-required"
roles = ["dev"]
forge_user = "dev-qwen"
compact_pct = 60
poll_interval = 60
3. Run: disinto init
4. Start the agent: docker compose up -d agents-dev-qwen
See docs/agents-llama.md for full details.
MIGRATION_ERR
exit 1
fi
DISINTO_BAKED="/home/agent/disinto"
DISINTO_LIVE="/home/agent/repos/_factory"
DISINTO_DIR="$DISINTO_BAKED" # start with baked copy; switched to live checkout after bootstrap
@ -378,19 +346,15 @@ bootstrap_factory_repo
# This prevents the silent-zombie mode where the polling loop matches zero files
# and does nothing forever.
validate_projects_dir() {
# NOTE: compgen -G exits non-zero when no matches exist, so piping it through
# `wc -l` under `set -eo pipefail` aborts the script before the FATAL branch
# can log a diagnostic (#877). Use the conditional form already adopted at
# lines above (see bootstrap_factory_repo, PROJECT_NAME parsing).
if ! compgen -G "${DISINTO_DIR}/projects/*.toml" >/dev/null 2>&1; then
local toml_count
toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" 2>/dev/null | wc -l)
if [ "$toml_count" -eq 0 ]; then
log "FATAL: No real .toml files found in ${DISINTO_DIR}/projects/"
log "Expected at least one project config file (e.g., disinto.toml)"
log "The directory only contains *.toml.example template files."
log "Mount the host ./projects volume or copy real .toml files into the container."
exit 1
fi
local toml_count
toml_count=$(compgen -G "${DISINTO_DIR}/projects/*.toml" | wc -l)
log "Projects directory validated: ${toml_count} real .toml file(s) found"
}

View file

@ -1,22 +1,20 @@
# disinto-chat — minimal HTTP backend for Claude chat UI
#
# Small Debian slim base with Python runtime and Node.js.
# Small Debian slim base with Python runtime.
# Chosen for simplicity and small image size (~100MB).
#
# Image size: ~100MB (well under the 200MB ceiling)
#
# Claude CLI is baked into the image — same pattern as the agents container.
# The claude binary is mounted from the host at runtime via docker-compose,
# not baked into the image — same pattern as the agents container.
FROM debian:bookworm-slim
# Install Node.js (required for Claude CLI) and Python
# Install Python (no build-time network access needed)
RUN apt-get update && apt-get install -y --no-install-recommends \
nodejs npm python3 \
python3 \
&& rm -rf /var/lib/apt/lists/*
# Install Claude Code CLI — chat backend runtime
RUN npm install -g @anthropic-ai/claude-code@2.1.84
# Non-root user — fixed UID 10001 for sandbox hardening (#706)
RUN useradd -m -u 10001 -s /bin/bash chat

View file

@ -560,168 +560,10 @@ _launch_runner_docker() {
# _launch_runner_nomad ACTION_ID SECRETS_CSV MOUNTS_CSV
#
# Dispatches a vault-runner batch job via `nomad job dispatch`.
# Polls `nomad job status` until terminal state (completed/failed).
# Reads exit code from allocation and writes <action-id>.result.json.
#
# Usage: _launch_runner_nomad <action_id> <secrets_csv> <mounts_csv>
# Returns: exit code of the nomad job (0=success, non-zero=failure)
# Nomad backend stub — will be implemented in migration Step 5.
_launch_runner_nomad() {
local action_id="$1"
local secrets_csv="$2"
local mounts_csv="$3"
log "Dispatching vault-runner batch job via Nomad for action: ${action_id}"
# Dispatch the parameterized batch job
# The vault-runner job expects meta: action_id, secrets_csv
# Note: mounts_csv is not passed as meta (not declared in vault-runner.hcl)
local dispatch_output
dispatch_output=$(nomad job dispatch \
-detach \
-meta action_id="$action_id" \
-meta secrets_csv="$secrets_csv" \
vault-runner 2>&1) || {
log "ERROR: Failed to dispatch vault-runner job for ${action_id}"
log "Dispatch output: ${dispatch_output}"
write_result "$action_id" 1 "Nomad dispatch failed: ${dispatch_output}"
return 1
}
# Extract dispatched job ID from output (format: "vault-runner/dispatch-<timestamp>-<uuid>")
local dispatched_job_id
dispatched_job_id=$(echo "$dispatch_output" | grep -oP '(?<=Dispatched Job ID = ).+' || true)
if [ -z "$dispatched_job_id" ]; then
log "ERROR: Could not extract dispatched job ID from nomad output"
log "Dispatch output: ${dispatch_output}"
write_result "$action_id" 1 "Could not extract dispatched job ID from nomad output"
return 1
fi
log "Dispatched vault-runner with job ID: ${dispatched_job_id}"
# Poll job status until terminal state
# Batch jobs transition: running -> completed/failed
local max_wait=300 # 5 minutes max wait
local elapsed=0
local poll_interval=5
local alloc_id=""
log "Polling nomad job status for ${dispatched_job_id}..."
while [ "$elapsed" -lt "$max_wait" ]; do
# Get job status with JSON output for the dispatched child job
local job_status_json
job_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || {
log "ERROR: Failed to get job status for ${dispatched_job_id}"
write_result "$action_id" 1 "Failed to get job status for ${dispatched_job_id}"
return 1
}
# Check job status field (transitions to "dead" on completion)
local job_state
job_state=$(echo "$job_status_json" | jq -r '.Status // empty' 2>/dev/null) || job_state=""
# Check allocation state directly
alloc_id=$(echo "$job_status_json" | jq -r '.Allocations[0]?.ID // empty' 2>/dev/null) || alloc_id=""
if [ -n "$alloc_id" ]; then
local alloc_state
alloc_state=$(nomad alloc status -short "$alloc_id" 2>/dev/null || true)
case "$alloc_state" in
*completed*|*success*|*dead*)
log "Allocation ${alloc_id} reached terminal state: ${alloc_state}"
break
;;
*running*|*pending*|*starting*)
log "Allocation ${alloc_id} still running (state: ${alloc_state})..."
;;
*failed*|*crashed*)
log "Allocation ${alloc_id} failed (state: ${alloc_state})"
break
;;
esac
fi
# Also check job-level state
case "$job_state" in
dead)
log "Job ${dispatched_job_id} reached terminal state: ${job_state}"
break
;;
failed)
log "Job ${dispatched_job_id} failed"
break
;;
esac
sleep "$poll_interval"
elapsed=$((elapsed + poll_interval))
done
if [ "$elapsed" -ge "$max_wait" ]; then
log "ERROR: Timeout waiting for vault-runner job to complete"
write_result "$action_id" 1 "Timeout waiting for nomad job to complete"
return 1
fi
# Get final job status and exit code
local final_status_json
final_status_json=$(nomad job status -json "$dispatched_job_id" 2>/dev/null) || {
log "ERROR: Failed to get final job status"
write_result "$action_id" 1 "Failed to get final job status"
return 1
}
# Get allocation exit code
local exit_code=0
local logs=""
if [ -n "$alloc_id" ]; then
# Get allocation logs
logs=$(nomad alloc logs -short "$alloc_id" 2>/dev/null || true)
# Try to get exit code from alloc status JSON
# Nomad alloc status -json has .TaskStates["<task_name>"].Events[].ExitCode
local alloc_exit_code
alloc_exit_code=$(nomad alloc status -json "$alloc_id" 2>/dev/null | jq -r '.TaskStates["runner"].Events[-1].ExitCode // empty' 2>/dev/null) || alloc_exit_code=""
if [ -n "$alloc_exit_code" ] && [ "$alloc_exit_code" != "null" ]; then
exit_code="$alloc_exit_code"
fi
fi
# If we couldn't get exit code from alloc, check job state as fallback
# Note: "dead" = terminal state for batch jobs (includes successful completion)
# Only "failed" indicates actual failure
if [ "$exit_code" -eq 0 ]; then
local final_state
final_state=$(echo "$final_status_json" | jq -r '.Status // empty' 2>/dev/null) || final_state=""
case "$final_state" in
failed)
exit_code=1
;;
esac
fi
# Truncate logs if too long
if [ ${#logs} -gt 1000 ]; then
logs="${logs: -1000}"
fi
# Write result file
write_result "$action_id" "$exit_code" "$logs"
if [ "$exit_code" -eq 0 ]; then
log "Vault-runner job completed successfully for action: ${action_id}"
else
log "Vault-runner job failed for action: ${action_id} (exit code: ${exit_code})"
fi
return "$exit_code"
echo "nomad backend not yet implemented" >&2
return 1
}
# Launch runner for the given action (backend-agnostic orchestrator)
@ -1209,8 +1051,11 @@ main() {
# Validate backend selection at startup
case "$DISPATCHER_BACKEND" in
docker|nomad)
log "Using ${DISPATCHER_BACKEND} backend for vault-runner dispatch"
docker) ;;
nomad)
log "ERROR: nomad backend not yet implemented"
echo "nomad backend not yet implemented" >&2
exit 1
;;
*)
log "ERROR: unknown DISPATCHER_BACKEND=${DISPATCHER_BACKEND}"

View file

@ -234,13 +234,6 @@ fi
rm -f "$_fetch_log"
done) &
# Nomad template renders Caddyfile to /local/Caddyfile via service discovery;
# copy it into the expected location if present (compose uses the mounted path).
if [ -f /local/Caddyfile ]; then
cp /local/Caddyfile /etc/caddy/Caddyfile
echo "edge: using Nomad-rendered Caddyfile from /local/Caddyfile" >&2
fi
# Caddy as main process — run in foreground via wait so background jobs survive
# (exec replaces the shell, which can orphan backgrounded subshells)
caddy run --config /etc/caddy/Caddyfile --adapter caddyfile &

View file

@ -2,12 +2,9 @@
Local-model agents run the same agent code as the Claude-backed agents, but
connect to a local llama-server (or compatible OpenAI-API endpoint) instead of
the Anthropic API. This document describes the canonical activation flow using
the Anthropic API. This document describes the current activation flow using
`disinto hire-an-agent` and `[agents.X]` TOML configuration.
> **Note:** The legacy `ENABLE_LLAMA_AGENT=1` env flag has been removed (#846).
> Activation is now done exclusively via `[agents.X]` sections in project TOML.
## Overview
Local-model agents are configured via `[agents.<name>]` sections in

View file

@ -1,124 +0,0 @@
<!-- last-reviewed: (new file, S2.5 #883) -->
# Nomad+Vault migration — cutover-day runbook
`disinto init --backend=nomad` is the single entry-point that turns a fresh
LXC (with the disinto repo cloned) into a running Nomad+Vault cluster with
policies applied, JWT workload-identity auth configured, secrets imported
from the old docker stack, and services deployed.
## Cutover-day invocation
On the new LXC, as root (or an operator with NOPASSWD sudo):
```bash
# Copy the plaintext .env + sops-encrypted .env.vault.enc + age keyfile
# from the old box first (out of band — SSH, USB, whatever your ops
# procedure allows). Then:
sudo ./bin/disinto init \
--backend=nomad \
--import-env /tmp/.env \
--import-sops /tmp/.env.vault.enc \
--age-key /tmp/keys.txt \
--with forgejo
```
This runs, in order:
1. **`lib/init/nomad/cluster-up.sh`** (S0) — installs Nomad + Vault
binaries, writes `/etc/nomad.d/*`, initializes Vault, starts both
services, waits for the Nomad node to become ready.
2. **`tools/vault-apply-policies.sh`** (S2.1) — syncs every
`vault/policies/*.hcl` into Vault as an ACL policy. Idempotent.
3. **`lib/init/nomad/vault-nomad-auth.sh`** (S2.3) — enables Vault's
JWT auth method at `jwt-nomad`, points it at Nomad's JWKS, writes
one role per policy, reloads Nomad so jobs can exchange
workload-identity tokens for Vault tokens. Idempotent.
4. **`tools/vault-import.sh`** (S2.2) — reads `/tmp/.env` and the
sops-decrypted `/tmp/.env.vault.enc`, writes them to the KV paths
matching the S2.1 policy layout (`kv/disinto/bots/*`, `kv/disinto/shared/*`,
`kv/disinto/runner/*`). Idempotent (overwrites KV v2 data in place).
5. **`lib/init/nomad/deploy.sh forgejo`** (S1) — validates + runs the
`nomad/jobs/forgejo.hcl` jobspec. Forgejo reads its admin creds from
Vault via the `template` stanza (S2.4).
## Flag summary
| Flag | Meaning |
|---|---|
| `--backend=nomad` | Switch the init dispatcher to the Nomad+Vault path (instead of docker compose). |
| `--empty` | Bring the cluster up, skip policies/auth/import/deploy. Escape hatch for debugging. |
| `--with forgejo[,…]` | Deploy these services after the cluster is up. |
| `--import-env PATH` | Plaintext `.env` from the old stack. Optional. |
| `--import-sops PATH` | Sops-encrypted `.env.vault.enc` from the old stack. Requires `--age-key`. |
| `--age-key PATH` | Age keyfile used to decrypt `--import-sops`. Requires `--import-sops`. |
| `--dry-run` | Print the full plan (cluster-up + policies + auth + import + deploy) and exit. Touches nothing. |
### Flag validation
- `--import-sops` without `--age-key` → error.
- `--age-key` without `--import-sops` → error.
- `--import-env` alone (no sops) → OK (imports just the plaintext `.env`).
- `--backend=docker` with any `--import-*` flag → error.
- `--empty` with any `--import-*` flag → error (mutually exclusive: `--empty`
skips the import step, so pairing them silently discards the import
intent).
## Idempotency
Every layer is idempotent by design. Re-running the same command on an
already-provisioned box is a no-op at every step:
- **Cluster-up:** second run detects running `nomad`/`vault` systemd
units and state files, skips re-init.
- **Policies:** byte-for-byte compare against on-server policy text;
"unchanged" for every untouched file.
- **Auth:** skips auth-method create if `jwt-nomad/` already enabled,
skips config write if the JWKS + algs match, skips server.hcl write if
the file on disk is identical to the repo copy.
- **Import:** KV v2 writes overwrite in place (same path, same keys,
same values → no new version).
- **Deploy:** `nomad job run` is declarative; same jobspec → no new
allocation.
## Dry-run
```bash
./bin/disinto init --backend=nomad \
--import-env /tmp/.env \
--import-sops /tmp/.env.vault.enc \
--age-key /tmp/keys.txt \
--with forgejo \
--dry-run
```
Prints the five-section plan — cluster-up, policies, auth, import,
deploy — with every path and every argv that would be executed. No
network, no sudo, no state mutation. See
`tests/disinto-init-nomad.bats` for the exact output shape.
## No-import path
If you already have `kv/disinto/*` seeded by other means (manual
`vault kv put`, a replica, etc.), omit all three `--import-*` flags.
`disinto init --backend=nomad --with forgejo` still applies policies,
configures auth, and deploys — but skips the import step with:
```
[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services
```
Forgejo's template stanza will fail to render (and thus the allocation
will stall) until those KV paths exist — so either import them or seed
them first.
## Secret hygiene
- Never log a secret value. The CLI only prints paths (`--import-env`,
`--age-key`) and KV *paths* (`kv/disinto/bots/review/token`), never
the values themselves. `tools/vault-import.sh` is the only thing that
reads the values, and it pipes them directly into Vault's HTTP API.
- The age keyfile must be mode 0400 — `vault-import.sh` refuses to
source a keyfile with looser permissions.
- `VAULT_ADDR` must be localhost during import — the import tool
refuses to run against a remote Vault, preventing accidental exposure.

View file

@ -178,8 +178,8 @@ log "Tagged disinto/agents:${RELEASE_VERSION}"
log "Step 6/6: Restarting agent containers"
docker compose stop agents 2>/dev/null || true
docker compose up -d agents
docker compose stop agents agents-llama 2>/dev/null || true
docker compose up -d agents agents-llama
log "Agent containers restarted"
# ── Done ─────────────────────────────────────────────────────────────────

View file

@ -189,10 +189,10 @@ Restart agent containers to use the new image.
- docker compose pull agents
2. Stop and remove existing agent containers:
- docker compose down agents
- docker compose down agents agents-llama 2>/dev/null || true
3. Start agents with new image:
- docker compose up -d agents
- docker compose up -d agents agents-llama
4. Wait for containers to be healthy:
- for i in {1..30}; do
@ -203,7 +203,7 @@ Restart agent containers to use the new image.
- done
5. Verify containers are running:
- docker compose ps agents
- docker compose ps agents agents-llama
6. Log restart:
- echo "Restarted agents containers"

View file

@ -29,7 +29,7 @@ and injected into your prompt above. Review them now.
1. Read the injected metrics data carefully (System Resources, Docker,
Active Sessions, Phase Files, Stale Phase Cleanup, Lock Files, Agent Logs,
CI Pipelines, Open PRs, Issue Status, Stale Worktrees, **Woodpecker Agent Health**).
CI Pipelines, Open PRs, Issue Status, Stale Worktrees).
Note: preflight.sh auto-removes PHASE:escalate files for closed issues
(24h grace period). Check the "Stale Phase Cleanup" section for any
files cleaned or in grace period this run.
@ -75,10 +75,6 @@ Categorize every finding from the metrics into priority levels.
- Dev/action sessions in PHASE:escalate for > 24h (session timeout)
(Note: PHASE:escalate files for closed issues are auto-cleaned by preflight;
this check covers sessions where the issue is still open)
- **Woodpecker agent unhealthy** see "Woodpecker Agent Health" section in preflight:
- Container not running or in unhealthy state
- gRPC errors >= 3 in last 20 minutes
- Fast-failure pipelines (duration < 60s) >= 3 in last 15 minutes
### P3 — Factory degraded
- PRs stale: CI finished >20min ago AND no git push to the PR branch since CI completed
@ -104,15 +100,6 @@ For each finding from the health assessment, decide and execute an action.
### Auto-fixable (execute these directly)
**P2 Woodpecker agent unhealthy:**
The supervisor-run.sh script automatically handles WP agent recovery:
- Detects unhealthy state via preflight.sh health checks
- Restarts container via `docker restart`
- Scans for `blocked: ci_exhausted` issues updated in last 30 minutes
- Unassigns and removes blocked label from affected issues
- Posts recovery comment with infra-flake context
- Avoids duplicate restarts via 5-minute cooldown in history file
**P0 Memory crisis:**
# Kill stale one-shot claude processes (>3h old)
pgrep -f "claude -p" --older 10800 2>/dev/null | xargs kill 2>/dev/null || true
@ -261,11 +248,6 @@ Format:
- <what was fixed>
(or "No actions needed")
### WP Agent Recovery (if applicable)
- WP agent restart: <time of restart or "none">
- Issues recovered: <count>
- Reason: <health check reason or "healthy">
### Vault items filed
- vault/pending/<id>.md <reason>
(or "None")

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Gardener Agent
**Role**: Backlog grooming — detect duplicate issues, missing acceptance

View file

View file

@ -1,38 +1,108 @@
[
{
"action": "edit_body",
"issue": 1025,
"body": "## Prior art: PR #1033 (open, branch `fix/issue-1025` retained)\n\nFirst attempt by dev-qwen2 (head `f692dd2`). Test script (`tests/smoke-edge-subpath.sh`, 13.8 KB) and pipeline (`.woodpecker/edge-subpath.yml`) both landed and look reasonable, but the **CI harness design is wrong**: the pipeline boots a bare `alpine:3.19` container and runs the smoke script directly against `BASE_URL=http://localhost`, with no stack to test against.\n\n**This is a harness design gap, not a script bug.** The smoke script itself is a reasonable post-deploy tool — the mistake was trying to exercise it as a hermetic CI step.\n\n**Approach (Option 1 — split the work):**\n\nKeep `tests/smoke-edge-subpath.sh` as an out-of-CI post-deploy tool (accepts `BASE_URL` env var). Replace the CI pipeline step that tries to curl a live stack with static checks only: `shellcheck`, `caddy validate` on the generated Caddyfile, and a template-substitution unit test that verifies routing block shape.\n\nBranch `fix/issue-1025` is preserved at `f692dd2` — the smoke script body is reusable; only the pipeline harness needs a rethink.\n\n**Timeline:**\n- 2026-04-19 09:14 — dev-qwen2 last pushed `f692dd2`\n- 3 pipelines (#1378/#1380/#1382) all fail: no service to curl (connection refused)\n\n## Acceptance criteria\n- [ ] `.woodpecker/edge-subpath.yml` pipeline runs `shellcheck` on `tests/smoke-edge-subpath.sh` with no live service curl\n- [ ] `caddy validate` runs on the generated Caddyfile in CI (template-substitution unit test)\n- [ ] A template-substitution test verifies the Caddyfile routing block shape (forge/ci/staging/chat paths)\n- [ ] `tests/smoke-edge-subpath.sh` accepts `BASE_URL` env var for post-deploy staging runs\n- [ ] CI green (no connection-refused failures on Woodpecker)\n\n## Affected files\n- `.woodpecker/edge-subpath.yml` — pipeline config (static checks only, no service curl)\n- `tests/smoke-edge-subpath.sh` — out-of-CI smoke script (reusable from PR #1033)\n\n## Dependencies\n- #1038 should land first to unblock local edge staging runs (optional — CI fix is independent)"
},
{
"action": "remove_label",
"issue": 1025,
"label": "blocked"
"issue": 900,
"body": "Flagged by AI reviewer in PR #897.\n\n## Problem\n\nThe policy at `vault/policies/service-forgejo.hcl` grants:\n\n```hcl\npath \"kv/data/disinto/shared/forgejo/*\" {\n capabilities = [\"read\"]\n}\n```\n\nBut the consul-template stanza in `nomad/jobs/forgejo.hcl` reads:\n\n```\n{{- with secret \"kv/data/disinto/shared/forgejo\" -}}\n```\n\nVault glob `/*` requires at least one path segment after `forgejo/` (e.g. `forgejo/subkey`). It does **not** match the bare path `kv/data/disinto/shared/forgejo` that the template actually calls. Vault ACL longest-prefix matching: `forgejo/*` is never hit for a request to `forgejo`.\n\nRuntime consequence: consul-template `with` block receives a 403 permission denied → evaluates to empty (false) → `else` branch renders `seed-me` placeholder values → Forgejo starts with obviously-wrong secrets despite `vault-seed-forgejo.sh` having run successfully.\n\n## Fix\n\nReplace the glob with an exact path in `vault/policies/service-forgejo.hcl`:\n\n```hcl\npath \"kv/data/disinto/shared/forgejo\" {\n capabilities = [\"read\"]\n}\n\npath \"kv/metadata/disinto/shared/forgejo\" {\n capabilities = [\"list\", \"read\"]\n}\n```\n\n(The `/*` glob is only useful if future subkeys are written under `forgejo/`; the current design stores both secrets in a single KV document at the `forgejo` path.)\n\nThis is a pre-existing defect in `vault/policies/service-forgejo.hcl`; that file was not changed by PR #897.\n\n---\n*Auto-created from AI review*\n\n## Affected files\n- `vault/policies/service-forgejo.hcl` — replace glob path with exact path + metadata path\n\n## Acceptance criteria\n- [ ] `vault/policies/service-forgejo.hcl` grants exact path `kv/data/disinto/shared/forgejo` (not `forgejo/*`)\n- [ ] Metadata path `kv/metadata/disinto/shared/forgejo` is also granted read+list\n- [ ] consul-template `with secret \"kv/data/disinto/shared/forgejo\"` resolves without 403 (verified via `vault policy read service-forgejo`)\n- [ ] `shellcheck` clean (no shell changes expected)\n"
},
{
"action": "add_label",
"issue": 1025,
"issue": 900,
"label": "backlog"
},
{
"action": "edit_body",
"issue": 1038,
"body": "## Problem\n\n`disinto-edge` crashloops on any deployment that has not opted into the age-encrypted secret store (#777), because the edge entrypoint treats four secrets as unconditionally required:\n\n```\nFATAL: age key (/home/agent/.config/sops/age/keys.txt) or secrets dir (/opt/disinto/secrets) not found — cannot load required secrets\n```\n\nObserved on `disinto-dev-box` (container `disinto-edge`, restarting every ~30s), which blocks PR #1033 (edge-subpath smoke test) and any other work that depends on a running edge.\n\n## Root cause\n\n`docker/edge/entrypoint-edge.sh:176-205` requires:\n\n- `~/.config/sops/age/keys.txt`\n- `/opt/disinto/secrets/` with `.enc` files for `CADDY_SSH_KEY`, `CADDY_SSH_HOST`, `CADDY_SSH_USER`, `CADDY_ACCESS_LOG`.\n\nThese four secrets feed exactly one feature: the daily 23:50 UTC `collect-engagement.sh` cron (#745), which SCPs Caddy access logs from a **remote production edge host** for engagement parsing. On a local factory box or any deployment that has not set up a remote edge, this code path has no target — yet its absence kills the whole edge container.\n\n## Fix\n\nMake the secrets block **optional**. When age key or secrets dir is missing, or any of the four CADDY_ secrets fail to decrypt, log a warning and skip the `collect-engagement` cron loop. Caddy itself does not depend on these secrets and should start normally.\n\nThe concrete edit is around lines 176-205 of `docker/edge/entrypoint-edge.sh` — guard the secret-loading block with a check for the age key and secrets dir, set `EDGE_ENGAGEMENT_READY=0` on failure, and skip cron registration when `EDGE_ENGAGEMENT_READY != 1`.\n\n## Acceptance criteria\n- [ ] `docker/edge/entrypoint-edge.sh` loads CADDY_ secrets optionally — missing age key or secrets dir logs a warning and continues, does not FATAL\n- [ ] Caddy starts normally when CADDY_ secrets are absent\n- [ ] `collect-engagement` cron is skipped (not registered) when engagement secrets are unavailable\n- [ ] On deployments WITH secrets configured, behavior is unchanged (collect-engagement cron still fires at 23:50 UTC)\n- [ ] CI green\n\n## Affected files\n- `docker/edge/entrypoint-edge.sh` — lines 176-205, secrets loading block made optional"
"issue": 898,
"body": "Flagged by AI reviewer in PR #889.\n\n## Problem\n\n`tools/vault-import.sh` serializes each entry in `ops_data` as `\"${source_value}|${status}\"` (line 498). Extraction at lines 510-511 uses `${data%%|*}` (first field) and `${data##*|}` (last field). If `source_value` contains a literal `|`, `${data%%|*}` truncates it to the first segment, silently writing a corrupted value to Vault.\n\nThe same separator is used in `paths_to_write` (line 519) to join multiple kv-pairs for a path. When `IFS=\"|\"` splits the string back into an array (line 540), a value containing `|` is split across array elements, corrupting the write.\n\n## Failure mode\n\nAny secret value with a pipe character (e.g. a generated password or composed token like `abc|xyz`) is silently truncated or misrouted on import. No error is emitted.\n\n## Fix\n\nReplace the `|`-delimited string with a bash indexed array for accumulating per-path kv pairs, eliminating the need for a delimiter that conflicts with possible value characters.\n\n---\n*Auto-created from AI review of PR #889*\n\n## Affected files\n- `tools/vault-import.sh` — replace pipe-delimited string accumulation with bash indexed arrays (lines ~498540)\n\n## Acceptance criteria\n- [ ] A secret value containing `|` (e.g. `abc|xyz`) is imported to Vault without truncation or corruption\n- [ ] No regression for values without `|`\n- [ ] `shellcheck` clean\n"
},
{
"action": "add_label",
"issue": 898,
"label": "backlog"
},
{
"action": "edit_body",
"issue": 893,
"body": "Flagged by AI reviewer in PR #892.\n\n## Problem\n\n`disinto init --build` generates the `agents:` service by first emitting `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` and then running a `sed -i` substitution (`lib/generators.sh:793`) that replaces the `image:` line with a `build:` block. The substitution does not add `pull_policy: build`.\n\nResult: `docker compose up` with `--build`-generated compose files still uses the cached image for the base `agents:` service, even when `docker/agents/` source has changed — the same silent-stale-image bug that #887 fixed for the three local-model service stanzas.\n\n## Fix\n\nThe `sed` substitution on line 793 should also inject `pull_policy: build` after the emitted `build:` block.\n\n---\n*Auto-created from AI review of PR #892*\n\n## Affected files\n- `lib/generators.sh` (line ~793) — add `pull_policy: build` to the agents service sed substitution\n\n## Acceptance criteria\n- [ ] `disinto init --build`-generated compose file includes `pull_policy: build` in the `agents:` service stanza\n- [ ] `docker compose up` rebuilds the agents image from local source when `docker/agents/` changes\n- [ ] Non-`--build` compose generation is unchanged\n- [ ] `shellcheck` clean\n"
},
{
"action": "add_label",
"issue": 893,
"label": "backlog"
},
{
"action": "edit_body",
"issue": 890,
"body": "Flagged by AI reviewer in PR #888.\n\n## Problem\n\n`lib/hvault.sh` functions `hvault_kv_get`, `hvault_kv_put`, and `hvault_kv_list` all hardcode `secret/data/` and `secret/metadata/` as KV v2 path prefixes (lines 117, 157, 173).\n\nThe Nomad+Vault migration (S2.1, #879) establishes `kv/` as the mount name for all factory secrets — every policy in `vault/policies/*.hcl` grants ACL on `kv/data/disinto/...` paths.\n\nIf any agent calls `hvault_kv_get` after the migration, Vault will route the request to `secret/data/...` but the token only holds ACL for `kv/data/...`, producing a 403 Forbidden.\n\n## Fix\n\nChange the mount prefix in `hvault_kv_get`, `hvault_kv_put`, and `hvault_kv_list` from `secret/` to `kv/`, or make the mount name configurable via `VAULT_KV_MOUNT` (defaulting to `kv`). Coordinate with S2.2 (#880) which writes secrets into the `kv/` mount.\n\n---\n*Auto-created from AI review of PR #888*\n\n## Affected files\n- `lib/hvault.sh` — change `secret/data/` and `secret/metadata/` prefixes to `kv/data/` and `kv/metadata/` (lines ~117, 157, 173); optionally make configurable via `VAULT_KV_MOUNT`\n\n## Acceptance criteria\n- [ ] `hvault_kv_get`, `hvault_kv_put`, `hvault_kv_list` use `kv/` mount prefix (not `secret/`)\n- [ ] Agents can read/write KV paths that policies in `vault/policies/*.hcl` grant (no 403)\n- [ ] Optionally: `VAULT_KV_MOUNT` env var overrides the mount name (defaults to `kv`)\n- [ ] `shellcheck` clean\n"
},
{
"action": "add_label",
"issue": 890,
"label": "backlog"
},
{
"action": "edit_body",
"issue": 877,
"body": "Flagged by AI reviewer in PR #875.\n\n## Problem\n\n`validate_projects_dir()` in `docker/agents/entrypoint.sh` uses a command substitution that triggers `set -e` before the intended error-logging branch runs:\n\n```bash\ntoml_count=$(compgen -G \"${DISINTO_DIR}/projects/*.toml\" 2>/dev/null | wc -l)\n```\n\nWhen no `.toml` files are present, `compgen -G` exits 1. With `pipefail`, the pipeline exits 1. `set -e` causes the script to exit before `if [ \"$toml_count\" -eq 0 ]` is evaluated, so the FATAL diagnostic messages are never printed. The container still fast-fails (correct outcome), but the operator sees no explanation.\n\nEvery other `compgen -G` usage in the file uses the safer conditional pattern (lines 259, 322).\n\n## Fix\n\nReplace the `wc -l` pattern with:\n\n```bash\nif ! compgen -G \"${DISINTO_DIR}/projects/*.toml\" >/dev/null 2>&1; then\n log \"FATAL: No real .toml files found in ${DISINTO_DIR}/projects/\"\n ...\n exit 1\nfi\n```\n\n---\n*Auto-created from AI review*\n\n## Affected files\n- `docker/agents/entrypoint.sh` — fix `validate_projects_dir()` to use conditional compgen pattern instead of `wc -l` pipeline\n\n## Acceptance criteria\n- [ ] When no `.toml` files are present, the FATAL message is printed before the container exits\n- [ ] Container still exits non-zero in that case\n- [ ] Matches the pattern already used at lines 259 and 322\n- [ ] `shellcheck` clean\n"
},
{
"action": "add_label",
"issue": 877,
"label": "backlog"
},
{
"action": "add_label",
"issue": 773,
"label": "backlog"
},
{
"action": "edit_body",
"issue": 883,
"body": "Part of the Nomad+Vault migration. **Step 2 — Vault policies + workload identity + secrets import.**\n\n~~**Blocked by: #880 (S2.2), #881 (S2.3).**~~ Dependencies closed; unblocked.\n\n## Goal\n\nWire the Step-2 building blocks (import, auth, policies) into `bin/disinto init --backend=nomad` so a single command on a fresh LXC provisions cluster + policies + auth + imports secrets + deploys services.\n\n## Scope\n\nAdd flags to `disinto init --backend=nomad`:\n\n- `--import-env PATH` — points at an existing `.env` (from old stack).\n- `--import-sops PATH` — points at the sops-encrypted `.env.vault.enc`.\n- `--age-key PATH` — points at the sops age keyfile (required if `--import-sops` is set).\n\nFlow when any of `--import-*` is set:\n\n1. `cluster-up.sh` (Step 0, unchanged).\n2. `tools/vault-apply-policies.sh` (S2.1, idempotent).\n3. `lib/init/nomad/vault-nomad-auth.sh` (S2.3, idempotent).\n4. `tools/vault-import.sh --env PATH --sops PATH --age-key PATH` (S2.2).\n5. If `--with <service>` was also passed, `lib/init/nomad/deploy.sh <service>` (Step 1, unchanged).\n6. Final summary: cluster + policies + auth + imported secrets count + deployed services + ports.\n\nFlow when **no** import flags are set:\n- Skip step 4; still apply policies + auth.\n- Log: `[import] no --import-env/--import-sops — skipping; set them or seed kv/disinto/* manually before deploying secret-dependent services`.\n\nFlag validation:\n- `--import-sops` without `--age-key` → error.\n- `--age-key` without `--import-sops` → error.\n- `--import-env` alone (no sops) → OK.\n- `--backend=docker` + any `--import-*` → error.\n\n## Affected files\n- `bin/disinto` — add `--import-env`, `--import-sops`, `--age-key` flags to `init --backend=nomad`\n- `docs/nomad-migration.md` (new) — cutover-day invocation shape\n- `lib/init/nomad/vault-nomad-auth.sh` (S2.3) — called as step 3\n- `tools/vault-import.sh` (S2.2) — called as step 4\n- `tools/vault-apply-policies.sh` (S2.1) — called as step 2\n\n## Acceptance criteria\n- [ ] `disinto init --backend=nomad --import-env /tmp/.env --import-sops /tmp/.enc --age-key /tmp/keys.txt --with forgejo` completes: cluster up, policies applied, JWT auth configured, KV populated, Forgejo deployed reading Vault secrets\n- [ ] Re-running is a no-op at every layer\n- [ ] `--import-sops` without `--age-key` exits with a clear error\n- [ ] `--backend=docker` with `--import-env` exits with a clear error\n- [ ] `--dry-run` prints the full plan, touches nothing\n- [ ] Never logs a secret value\n- [ ] `shellcheck` clean\n"
},
{
"action": "remove_label",
"issue": 1038,
"issue": 883,
"label": "blocked"
},
{
"action": "add_label",
"issue": 1038,
"issue": 883,
"label": "backlog"
},
{
"action": "edit_body",
"issue": 884,
"body": "Part of the Nomad+Vault migration. **Step 2 — Vault policies + workload identity + secrets import.**\n\nS2.1 (#879) is now closed; this step has no blocking dependencies.\n\n## Goal\n\nExtend the Woodpecker CI to validate Vault policy HCL files under `vault/policies/` and role definitions.\n\n## Scope\n\nExtend `.woodpecker/nomad-validate.yml`:\n\n- `vault policy fmt -check vault/policies/*.hcl` — fails on unformatted HCL.\n- `for f in vault/policies/*.hcl; do vault policy validate \"$f\"; done` — syntax + semantic validation (requires a dev-mode vault spun inline).\n- If `vault/roles.yaml` exists: yamllint check + custom validator that each role references a policy file that actually exists in `vault/policies/`.\n- Secret-scan gate: ensure no policy file contains what looks like a literal secret.\n- Trigger: on any PR touching `vault/policies/`, `vault/roles.yaml`, or `lib/init/nomad/vault-*.sh`.\n\nAlso:\n- Add `vault/policies/AGENTS.md` cross-reference: policy lifecycle (add policy HCL → update roles.yaml → add Vault KV path), what CI enforces, common failure modes.\n\n## Non-goals\n\n- No runtime check against a real cluster.\n- No enforcement of specific naming conventions beyond what S2.1 docs describe.\n\n## Affected files\n- `.woodpecker/nomad-validate.yml` — add vault policy fmt + validate + roles.yaml gates\n- `vault/policies/AGENTS.md` (new) — policy lifecycle documentation\n\n## Acceptance criteria\n- [ ] Deliberately broken policy HCL (typo in `path` block) fails CI with the vault-fmt error\n- [ ] Policy that references a non-existent capability (e.g. `\"frobnicate\"`) fails validation\n- [ ] `vault/roles.yaml` referencing a policy not in `vault/policies/` fails CI\n- [ ] Clean PRs pass within normal pipeline time budget\n- [ ] Existing S0.5 + S1.4 CI gates unaffected\n- [ ] `shellcheck` clean on any shell added\n"
},
{
"action": "remove_label",
"issue": 884,
"label": "blocked"
},
{
"action": "add_label",
"issue": 884,
"label": "backlog"
},
{
"action": "edit_body",
"issue": 846,
"body": "## Problem\n\nLlama-backed sidecar agents can be activated through two different mechanisms:\n\n1. **Legacy:** `ENABLE_LLAMA_AGENT=1` env flag toggles a hardcoded `agents-llama` service block in `docker-compose.yml`.\n2. **Modern:** `[agents.X]` TOML block consumed by `hire-an-agent`, emitting a service per block.\n\nNeither the docs nor the CLI explain which path wins. Setting both produces a YAML `mapping key \"agents-llama\" already defined` error from compose because the service block is duplicated.\n\n## Sub-symptom: env-var naming collision\n\nThe two paths key secrets differently:\n\n- Legacy: `FORGE_TOKEN_LLAMA`, `FORGE_PASS_LLAMA`.\n- Modern: `FORGE_TOKEN_<FORGE_USER_UPPER>` — e.g. `FORGE_TOKEN_DEV_QWEN`.\n\nA user migrating between paths ends up with two sets of secrets in `.env`, neither cleanly mapped to the currently-active service block. Silent auth failures (401 from Forgejo) follow.\n\n## Proposal\n\n- Pick the TOML `[agents.X]` path as canonical.\n- Remove the `ENABLE_LLAMA_AGENT` branch and its hardcoded service block from the generator.\n- Detection of `ENABLE_LLAMA_AGENT` in `.env` at `disinto up` time: hard-fail immediately with a migration message (option (a) — simpler, no external consumers depend on this flag).\n\n~~Dependencies: #845, #847~~ — both now closed; unblocked.\n\nRelated: #845, #847.\n\n## Affected files\n- `lib/generators.sh` — remove `ENABLE_LLAMA_AGENT` branch and hardcoded `agents-llama:` service block\n- `docker/agents/entrypoint.sh` — detect `ENABLE_LLAMA_AGENT` in env, emit migration error\n- `.env.example` — remove `ENABLE_LLAMA_AGENT`\n- `docs/agents-llama.md` — update to document TOML `[agents.X]` as the one canonical path\n\n## Acceptance criteria\n- [ ] One documented activation path: TOML `[agents.X]` block\n- [ ] `ENABLE_LLAMA_AGENT` removed from compose generator; presence in `.env` at startup triggers a clear migration error naming the replacement\n- [ ] `.env.example` and `docs/agents-llama.md` updated\n- [ ] `shellcheck` clean\n"
},
{
"action": "remove_label",
"issue": 846,
"label": "blocked"
},
{
"action": "add_label",
"issue": 846,
"label": "backlog"
},
{
"action": "edit_body",
"issue": 850,
"body": "## Problem\n\nWhen the compose generator emits the same service name twice — e.g. both the legacy `ENABLE_LLAMA_AGENT=1` branch and a matching `[agents.llama]` TOML block produce an `agents-llama:` key — the failure is deferred all the way to `docker compose` YAML parsing:\n\n```\nfailed to parse /home/johba/disinto/docker-compose.yml: yaml: construct errors:\n line 4: line 431: mapping key \"agents-llama\" already defined at line 155\n```\n\nBy then, the user has already paid the cost of: pre-build binary downloads, generator run, Caddyfile regeneration. The only hint about what went wrong is a line number in a generated file. Root cause (dual activation) is not surfaced.\n\n## Fix\n\nAdd a generate-time guard to `lib/generators.sh`:\n\n- After collecting all service blocks to emit, compare the set of service names against duplicates.\n- If a duplicate is detected, abort with a clear message naming both sources of truth (e.g. `\"agents-llama\" emitted twice — from ENABLE_LLAMA_AGENT=1 and from [agents.llama] in projects/disinto.toml; remove one`).\n\n## Prior art: PR #872 (closed, branch `fix/issue-850` retained)\n\ndev-qwen's first attempt (`db009e3`) landed the dup-detection logic in `lib/generators.sh` correctly (unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases), but the smoke test fails on CI.\n\n**Why the smoke test fails:** sections 1-7 of `smoke-init.sh` already run `bin/disinto init`, materializing `docker-compose.yml`. Section 8 re-invokes `bin/disinto init` to verify the dup guard fires — but `_generate_compose_impl` early-returns with `\"Compose: already exists, skipping\"` before reaching the dup-check.\n\n**Suggested fix:** in `tests/smoke-init.sh` section 8 (around line 452, before the second `bin/disinto init` invocation), add:\n\n```bash\nrm -f \"${FACTORY_ROOT}/docker-compose.yml\"\n```\n\nso the generator actually runs and the dup-detection path is exercised. Do **not** hoist the dup-check above the early-return.\n\nThe branch `fix/issue-850` is preserved as a starting point — pick up from `db009e3` and patch the smoke-test cleanup.\n\nRelated: #846.\n\n## Acceptance criteria\n- [ ] `bin/disinto init` with a config that would produce duplicate service names aborts with a clear error message naming both sources (e.g. `ENABLE_LLAMA_AGENT=1` and `[agents.llama]` TOML block)\n- [ ] `tests/smoke-init.sh` section 8 removes `docker-compose.yml` before re-invoking `disinto init` so the dup guard is exercised\n- [ ] Unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases\n- [ ] CI green (smoke-init.sh section 8 no longer skips dup detection)\n\n## Affected files\n- `lib/generators.sh` — duplicate service name check after collecting all service blocks\n- `tests/smoke-init.sh` — section 8: add `rm -f \\${FACTORY_ROOT}/docker-compose.yml` before second `disinto init`"
"body": "## Problem\n\nWhen the compose generator emits the same service name twice — e.g. both the legacy `ENABLE_LLAMA_AGENT=1` branch and a matching `[agents.llama]` TOML block produce an `agents-llama:` key — the failure is deferred all the way to `docker compose` YAML parsing:\n\n```\nfailed to parse /home/johba/disinto/docker-compose.yml: yaml: construct errors:\n line 4: line 431: mapping key \"agents-llama\" already defined at line 155\n```\n\nBy then, the user has already paid the cost of: pre-build binary downloads, generator run, Caddyfile regeneration. The only hint about what went wrong is a line number in a generated file. Root cause (dual activation) is not surfaced.\n\n## Fix\n\nAdd a generate-time guard to `lib/generators.sh`:\n\n- After collecting all service blocks to emit, compare the set of service names against duplicates.\n- If a duplicate is detected, abort with a clear message naming both source of truth (e.g. `\"agents-llama\" emitted twice — from ENABLE_LLAMA_AGENT=1 and from [agents.llama] in projects/disinto.toml; remove one`).\n\nEven after #846 resolves (one canonical activation path), this guard remains valuable as a safety net against future regressions or user misconfiguration (e.g. two TOML blocks with same `forge_user`).\n\n## Prior art: PR #872 (closed, branch `fix/issue-850` retained)\n\ndev-qwen's first attempt (`db009e3`) landed the dup-detection logic in `lib/generators.sh` correctly (unit test `tests/test-duplicate-service-detection.sh` passes all 3 cases), but the smoke test fails on CI.\n\n**Why the smoke test fails:** sections 1-7 of `smoke-init.sh` already run `bin/disinto init`, materializing `docker-compose.yml`. Section 8 re-invokes `bin/disinto init` to verify the dup guard fires — but `_generate_compose_impl` early-returns with `\"Compose: already exists, skipping\"` before reaching the dup-check.\n\n**Suggested fix:** in `tests/smoke-init.sh` section 8 (around line 452, before the second `bin/disinto init` invocation), add:\n\n```bash\nrm -f \"${FACTORY_ROOT}/docker-compose.yml\"\n```\n\nso the generator actually runs and the dup-detection path is exercised. Do **not** hoist the dup-check above the early-return.\n\nThe branch `fix/issue-850` is preserved as a starting point — pick up from `db009e3` and patch the smoke-test cleanup.\n\nRelated: #846.\n\n## Affected files\n- `lib/generators.sh` — duplicate service name check after collecting all service blocks\n- `tests/smoke-init.sh` — section 8: add `rm -f docker-compose.yml` before second `disinto init`\n- `tests/test-duplicate-service-detection.sh` (likely already correct from prior art)\n\n## Acceptance criteria\n- [ ] Running `disinto up` with a known duplicate activation produces a clear generator-time error naming both conflicting sources\n- [ ] Exit code non-zero before `docker compose` is invoked\n- [ ] Smoke test section 8 passes on CI (dup guard is actually exercised)\n- [ ] `shellcheck` clean\n"
},
{
"action": "remove_label",
@ -43,10 +113,5 @@
"action": "add_label",
"issue": 850,
"label": "backlog"
},
{
"action": "comment",
"issue": 758,
"body": "This issue is the critical path blocker for #820 (ops repo re-seed) and #982 (collect-engagement commit fix). Both are in the backlog and ready to merge, but cannot run until ops repo branch protection is resolved. Needs admin/human action to change Forgejo branch protection settings on disinto-ops — no code change can unblock this."
}
]

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: 0bb04545d47fb43b2cab0a1f4406c2a2b57f4eba -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Shared Helpers (`lib/`)
All agents source `lib/env.sh` as their first action. Additional helpers are
@ -30,9 +30,9 @@ sourced as needed.
| `lib/git-creds.sh` | Shared git credential helper configuration. `configure_git_creds([HOME_DIR] [RUN_AS_CMD])` — writes a static credential helper script and configures git globally to use password-based HTTP auth (Forgejo 11.x rejects API tokens for `git push`, #361). **Retry on cold boot (#741)**: resolves bot username from `FORGE_TOKEN` with 5 retries (exponential backoff 1-5s); fails loudly and returns 1 if Forgejo is unreachable — never falls back to a wrong hardcoded default (exports `BOT_USER` on success). `repair_baked_cred_urls([--as RUN_AS_CMD] DIR ...)` — rewrites any git remote URLs that have credentials baked in to use clean URLs instead; uses `safe.directory` bypass for root-owned repos (#671). Requires `FORGE_PASS`, `FORGE_URL`, `FORGE_TOKEN`. | entrypoints (agents, edge) |
| `lib/ops-setup.sh` | `setup_ops_repo()` — creates ops repo on Forgejo if it doesn't exist, configures bot collaborators, clones/initializes ops repo locally, seeds directory structure (vault, knowledge, evidence, sprints). Evidence subdirectories seeded: engagement/, red-team/, holdout/, evolution/, user-test/. Also seeds sprints/ for architect output. Exports `_ACTUAL_OPS_SLUG`. `migrate_ops_repo(ops_root, [primary_branch])` — idempotent migration helper that seeds missing directories and .gitkeep files on existing ops repos (pre-#407 deployments). | bin/disinto (init) |
| `lib/ci-setup.sh` | `_install_cron_impl()` — installs crontab entries for bare-metal deployments (compose mode uses polling loop instead). `_create_forgejo_oauth_app()` — generic helper to create an OAuth2 app on Forgejo (shared by Woodpecker and chat). `_create_woodpecker_oauth_impl()` — creates Woodpecker OAuth2 app (thin wrapper). `_create_chat_oauth_impl()` — creates disinto-chat OAuth2 app, writes `CHAT_OAUTH_CLIENT_ID`/`CHAT_OAUTH_CLIENT_SECRET` to `.env` (#708). `_generate_woodpecker_token_impl()` — auto-generates WOODPECKER_TOKEN via OAuth2 flow. `_activate_woodpecker_repo_impl()` — activates repo in Woodpecker. All gated by `_load_ci_context()` which validates required env vars. | bin/disinto (init) |
| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; `CLAUDE_BIN_DIR` volume mount removed from agents/llama services — only `reproduce` and `edge` still use the host-mounted CLI (#992); adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
| `lib/generators.sh` | Template generation for `disinto init`: `generate_compose()` — docker-compose.yml (uses `codeberg.org/forgejo/forgejo:11.0` tag; adds `security_opt: [apparmor:unconfined]` to all services for rootless container compatibility; Forgejo includes a healthcheck so dependent services use `condition: service_healthy` — fixes cold-start races, #665; adds `chat` service block with isolated `chat-config` named volume and `CHAT_HISTORY_DIR` bind-mount for per-user NDJSON history persistence (#710); injects `FORWARD_AUTH_SECRET` for Caddy↔chat defense-in-depth auth (#709); cost-cap env vars `CHAT_MAX_REQUESTS_PER_HOUR`, `CHAT_MAX_REQUESTS_PER_DAY`, `CHAT_MAX_TOKENS_PER_DAY` (#711); subdomain fallback comment for `EDGE_TUNNEL_FQDN_*` vars (#713); all `depends_on` now use `condition: service_healthy/started` instead of bare service names; all services now include `restart: unless-stopped` including the edge service — #768; agents service now uses `image: ghcr.io/disinto/agents:${DISINTO_IMAGE_TAG:-latest}` instead of `build:` (#429); `WOODPECKER_PLUGINS_PRIVILEGED` env var added to woodpecker service (#779); agents-llama conditional block gated on `ENABLE_LLAMA_AGENT=1` (#769); `agents-llama-all` compose service (profile `agents-llama-all`, all 7 roles: review,dev,gardener,architect,planner,predictor,supervisor) added by #801; agents service gains volume mounts for `./projects`, `./.env`, `./state`), `generate_caddyfile()` — Caddyfile (routes: `/forge/*` → forgejo:3000, `/woodpecker/*` → woodpecker:8000, `/staging/*` → staging:80; `/chat/login` and `/chat/oauth/callback` bypass `forward_auth` so unauthenticated users can reach the OAuth flow; `/chat/*` gated by `forward_auth` on `chat:8080/chat/auth/verify` which stamps `X-Forwarded-User` (#709); root `/` redirects to `/forge/`), `generate_staging_index()` — staging index, `generate_deploy_pipelines()` — Woodpecker deployment pipeline configs. Requires `FACTORY_ROOT`, `PROJECT_NAME`, `PRIMARY_BRANCH`. | bin/disinto (init) |
| `lib/sprint-filer.sh` | Post-merge sub-issue filer for sprint PRs. Invoked by the `.woodpecker/ops-filer.yml` pipeline after a sprint PR merges to ops repo `main`. Parses `<!-- filer:begin --> ... <!-- filer:end -->` blocks from sprint PR bodies to extract sub-issue definitions, creates them on the project repo using `FORGE_FILER_TOKEN` (narrow-scope `filer-bot` identity with `issues:write` only), adds `in-progress` label to the parent vision issue, and handles vision lifecycle closure when all sub-issues are closed. Uses `filer_api_all()` for paginated fetches. Idempotent: uses `<!-- decomposed-from: #<vision>, sprint: <slug>, id: <id> -->` markers to skip already-filed issues. Requires `FORGE_FILER_TOKEN`, `FORGE_API`, `FORGE_API_BASE`, `FORGE_OPS_REPO`. | `.woodpecker/ops-filer.yml` (CI pipeline on ops repo) |
| `lib/hire-agent.sh` | `disinto_hire_an_agent()` — user creation, `.profile` repo setup, formula copying, branch protection, and state marker creation for hiring a new agent. Requires `FORGE_URL`, `FORGE_TOKEN`, `FACTORY_ROOT`, `PROJECT_NAME`. Extracted from `bin/disinto`. | bin/disinto (hire) |
| `lib/release.sh` | `disinto_release()` — vault TOML creation, branch setup on ops repo, PR creation, and auto-merge request for a versioned release. `_assert_release_globals()` validates required env vars. Requires `FORGE_URL`, `FORGE_TOKEN`, `FORGE_OPS_REPO`, `FACTORY_ROOT`, `PRIMARY_BRANCH`. Extracted from `bin/disinto`. | bin/disinto (release) |
| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_ensure_kv_v2(MOUNT, [LOG_PREFIX])` — idempotent KV v2 mount assertion: enables mount if absent, fails loudly if present as wrong type/version. Extracted from all `vault-seed-*.sh` scripts to eliminate dup-detector violations. Respects `DRY_RUN=1`. `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. `_hvault_seed_key(PATH, KEY, [GENERATOR])` — seed one KV key if absent; reads existing data and merges to preserve sibling keys (KV v2 replaces atomically); returns 0=created, 1=unchanged, 2=API error (#992). All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh`, `tools/vault-seed-*.sh` |
| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling; `HOST_VOLUME_DIRS` array now includes `/srv/disinto/docker` (for staging file-server, S5.2, #989, #992). `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). `wp-oauth-register.sh` — S3.3 script that creates the Woodpecker OAuth2 app in Forgejo and stores `forgejo_client`/`forgejo_secret` in Vault KV v2 at `kv/disinto/shared/woodpecker`; idempotent (skips if app or secrets already present); called by `bin/disinto --with woodpecker`. `deploy.sh` — S4 dependency-ordered Nomad job deploy + health-wait; takes a list of jobspec basenames, submits each to Nomad and polls until healthy before proceeding to the next; supports `--dry-run` and per-job timeout overrides via `JOB_READY_TIMEOUT_<JOBNAME>`; global default timeout `JOB_READY_TIMEOUT_SECS` is 360s (raised from 240s for chat cold-start, #1036); invoked by `bin/disinto --with <svc>` and `cluster-up.sh`; deploy order now covers staging, chat, edge (S5.5, #992). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` |
| `lib/hvault.sh` | HashiCorp Vault helper module. `hvault_kv_get(PATH, [KEY])` — read KV v2 secret, optionally extract one key. `hvault_kv_put(PATH, KEY=VAL ...)` — write KV v2 secret. `hvault_kv_list(PATH)` — list keys at a KV path. `hvault_get_or_empty(PATH)` — GET /v1/PATH; 200→raw body, 404→empty, else structured error + return 1 (used by sync scripts to distinguish "absent, create" from hard failure without tripping errexit, #881). `hvault_policy_apply(NAME, FILE)` — idempotent policy upsert. `hvault_jwt_login(ROLE, JWT)` — exchange JWT for short-lived token. `hvault_token_lookup()` — returns TTL/policies/accessor for current token. All functions use `VAULT_ADDR` + `VAULT_TOKEN` from env (fallback: `/etc/vault.d/root.token`), emit structured JSON errors to stderr on failure. Tests: `tests/lib-hvault.bats` (requires `vault server -dev`). | `tools/vault-apply-policies.sh`, `tools/vault-apply-roles.sh`, `lib/init/nomad/vault-nomad-auth.sh` |
| `lib/init/nomad/` | Nomad+Vault installer scripts. `cluster-up.sh` — idempotent Step-0 orchestrator that runs all steps in order (installs packages, writes HCL, enables systemd units, unseals Vault); uses `poll_until_healthy()` helper for deduped readiness polling. `install.sh` — installs pinned Nomad+Vault apt packages. `vault-init.sh` — initializes Vault (unseal keys → `/etc/vault.d/`), creates dev-persisted unseal unit. `lib-systemd.sh` — shared systemd unit helpers. `systemd-nomad.sh`, `systemd-vault.sh` — write and enable service units. `vault-nomad-auth.sh` — Step-2 script that enables Vault's JWT auth at path `jwt-nomad`, writes the JWKS/algs config pointing at Nomad's workload-identity signer, delegates role sync to `tools/vault-apply-roles.sh`, installs `/etc/nomad.d/server.hcl`, and SIGHUPs `nomad.service` if the file changed (#881). Idempotent: each step checks current state before acting. Sourced and called by `cluster-up.sh`; not sourced by agents. | `bin/disinto init --backend=nomad` |

View file

@ -128,6 +128,7 @@ vault_request() {
# Validate TOML content
local tmp_toml
tmp_toml=$(mktemp /tmp/vault-XXXXXX.toml)
trap 'rm -f "$tmp_toml"' RETURN
printf '%s' "$toml_content" > "$tmp_toml"
@ -135,7 +136,6 @@ vault_request() {
local vault_env="${FACTORY_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/action-vault/vault-env.sh"
if [ ! -f "$vault_env" ]; then
echo "ERROR: vault-env.sh not found at $vault_env" >&2
rm -f "$tmp_toml"
return 1
fi
@ -145,15 +145,11 @@ vault_request() {
if ! source "$vault_env"; then
FORGE_TOKEN="${_saved_forge_token:-}"
echo "ERROR: failed to source vault-env.sh" >&2
rm -f "$tmp_toml"
return 1
fi
# Restore caller's FORGE_TOKEN after validation
FORGE_TOKEN="${_saved_forge_token:-}"
# Set trap AFTER sourcing vault-env.sh to avoid RETURN trap firing during source
trap 'rm -f "$tmp_toml"' RETURN
# Run validation
if ! validate_vault_action "$tmp_toml"; then
echo "ERROR: TOML validation failed" >&2

View file

@ -356,6 +356,16 @@ setup_forge() {
[predictor-bot]="FORGE_PREDICTOR_PASS"
[architect-bot]="FORGE_ARCHITECT_PASS"
)
# Llama bot users (local-model agents) — separate from main agents
# Each llama agent gets its own Forgejo user, token, and password
local -A llama_token_vars=(
[dev-qwen]="FORGE_TOKEN_LLAMA"
[dev-qwen-nightly]="FORGE_TOKEN_LLAMA_NIGHTLY"
)
local -A llama_pass_vars=(
[dev-qwen]="FORGE_PASS_LLAMA"
[dev-qwen-nightly]="FORGE_PASS_LLAMA_NIGHTLY"
)
local bot_user bot_pass token token_var pass_var
@ -505,12 +515,159 @@ setup_forge() {
fi
done
# Create llama bot users and tokens (local-model agents)
# These are separate from the main agents and get their own credentials
echo ""
echo "── Setting up llama bot users ────────────────────────────"
local llama_user llama_pass llama_token llama_token_var llama_pass_var
for llama_user in "${!llama_token_vars[@]}"; do
llama_token_var="${llama_token_vars[$llama_user]}"
llama_pass_var="${llama_pass_vars[$llama_user]}"
# Check if token already exists in .env
local token_exists=false
if _token_exists_in_env "$llama_token_var" "$env_file"; then
token_exists=true
fi
# Check if password already exists in .env
local pass_exists=false
if _pass_exists_in_env "$llama_pass_var" "$env_file"; then
pass_exists=true
fi
# Check if llama bot user exists on Forgejo
local llama_user_exists=false
if curl -sf --max-time 5 \
-H "Authorization: token ${admin_token}" \
"${forge_url}/api/v1/users/${llama_user}" >/dev/null 2>&1; then
llama_user_exists=true
fi
# Skip token/password regeneration if both exist in .env and not forcing rotation
if [ "$token_exists" = true ] && [ "$pass_exists" = true ] && [ "$rotate_tokens" = false ]; then
echo " ${llama_user} token and password preserved (use --rotate-tokens to force)"
# Still export the existing token for use within this run
local existing_token existing_pass
existing_token=$(grep "^${llama_token_var}=" "$env_file" | head -1 | cut -d= -f2-)
existing_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
export "${llama_token_var}=${existing_token}"
export "${llama_pass_var}=${existing_pass}"
continue
fi
# Generate new credentials if:
# - Token doesn't exist (first run)
# - Password doesn't exist (first run)
# - --rotate-tokens flag is set (explicit rotation)
if [ "$llama_user_exists" = false ]; then
# User doesn't exist - create it
llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
echo "Creating llama bot user: ${llama_user}"
local create_output
if ! create_output=$(_forgejo_exec forgejo admin user create \
--username "${llama_user}" \
--password "${llama_pass}" \
--email "${llama_user}@disinto.local" \
--must-change-password=false 2>&1); then
echo "Error: failed to create llama bot user '${llama_user}':" >&2
echo " ${create_output}" >&2
exit 1
fi
# Forgejo 11.x ignores --must-change-password=false on create;
# explicitly clear the flag so basic-auth token creation works.
_forgejo_exec forgejo admin user change-password \
--username "${llama_user}" \
--password "${llama_pass}" \
--must-change-password=false
# Verify llama bot user was actually created
if ! curl -sf --max-time 5 \
-H "Authorization: token ${admin_token}" \
"${forge_url}/api/v1/users/${llama_user}" >/dev/null 2>&1; then
echo "Error: llama bot user '${llama_user}' not found after creation" >&2
exit 1
fi
echo " ${llama_user} user created"
else
# User exists - reset password if needed
echo " ${llama_user} user exists"
if [ "$rotate_tokens" = true ] || [ "$pass_exists" = false ]; then
llama_pass="llama-$(head -c 16 /dev/urandom | base64 | tr -dc 'a-zA-Z0-9' | head -c 20)"
_forgejo_exec forgejo admin user change-password \
--username "${llama_user}" \
--password "${llama_pass}" \
--must-change-password=false || {
echo "Error: failed to reset password for existing llama bot user '${llama_user}'" >&2
exit 1
}
echo " ${llama_user} password reset for token generation"
else
# Password exists, get it from .env
llama_pass=$(grep "^${llama_pass_var}=" "$env_file" | head -1 | cut -d= -f2-)
fi
fi
# Generate token via API (basic auth as the llama user)
# First, delete any existing tokens to avoid name collision
local existing_llama_token_ids
existing_llama_token_ids=$(curl -sf \
-u "${llama_user}:${llama_pass}" \
"${forge_url}/api/v1/users/${llama_user}/tokens" 2>/dev/null \
| jq -r '.[].id // empty' 2>/dev/null) || existing_llama_token_ids=""
# Delete any existing tokens for this user
if [ -n "$existing_llama_token_ids" ]; then
while IFS= read -r tid; do
[ -n "$tid" ] && curl -sf -X DELETE \
-u "${llama_user}:${llama_pass}" \
"${forge_url}/api/v1/users/${llama_user}/tokens/${tid}" >/dev/null 2>&1 || true
done <<< "$existing_llama_token_ids"
fi
llama_token=$(curl -sf -X POST \
-u "${llama_user}:${llama_pass}" \
-H "Content-Type: application/json" \
"${forge_url}/api/v1/users/${llama_user}/tokens" \
-d "{\"name\":\"disinto-${llama_user}-token\",\"scopes\":[\"all\"]}" 2>/dev/null \
| jq -r '.sha1 // empty') || llama_token=""
if [ -z "$llama_token" ]; then
echo "Error: failed to create API token for '${llama_user}'" >&2
exit 1
fi
# Store token in .env under the llama-specific variable name
if grep -q "^${llama_token_var}=" "$env_file" 2>/dev/null; then
sed -i "s|^${llama_token_var}=.*|${llama_token_var}=${llama_token}|" "$env_file"
else
printf '%s=%s\n' "$llama_token_var" "$llama_token" >> "$env_file"
fi
export "${llama_token_var}=${llama_token}"
echo " ${llama_user} token generated and saved (${llama_token_var})"
# Store password in .env for git HTTP push (#361)
# Forgejo 11.x API tokens don't work for git push; password auth does.
if grep -q "^${llama_pass_var}=" "$env_file" 2>/dev/null; then
sed -i "s|^${llama_pass_var}=.*|${llama_pass_var}=${llama_pass}|" "$env_file"
else
printf '%s=%s\n' "$llama_pass_var" "$llama_pass" >> "$env_file"
fi
export "${llama_pass_var}=${llama_pass}"
echo " ${llama_user} password saved (${llama_pass_var})"
done
# Create .profile repos for all bot users (if they don't already exist)
# This runs the same logic as hire-an-agent Step 2-3 for idempotent setup
echo ""
echo "── Setting up .profile repos ────────────────────────────"
local -a bot_users=(dev-bot review-bot planner-bot gardener-bot vault-bot supervisor-bot predictor-bot architect-bot)
# Add llama bot users to .profile repo creation
for llama_user in "${!llama_token_vars[@]}"; do
bot_users+=("$llama_user")
done
local bot_user
for bot_user in "${bot_users[@]}"; do
@ -618,6 +775,15 @@ setup_forge() {
-d "{\"permission\":\"${bot_perm}\"}" >/dev/null 2>&1 || true
done
# Add llama bot users as write collaborators for local-model agents
for llama_user in "${!llama_token_vars[@]}"; do
curl -sf -X PUT \
-H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \
-H "Content-Type: application/json" \
"${forge_url}/api/v1/repos/${repo_slug}/collaborators/${llama_user}" \
-d '{"permission":"write"}' >/dev/null 2>&1 || true
done
# Add disinto-admin as admin collaborator
curl -sf -X PUT \
-H "Authorization: token ${admin_token:-${FORGE_TOKEN}}" \

View file

@ -137,6 +137,7 @@ _generate_local_model_services() {
- project-repos-${service_name}:/home/agent/repos
- \${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:\${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- \${CLAUDE_CONFIG_FILE:-\${HOME}/.claude.json}:/home/agent/.claude.json:ro
- \${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- \${AGENT_SSH_DIR:-\${HOME}/.ssh}:/home/agent/.ssh:ro
- ./projects:/home/agent/disinto/projects:ro
- ./.env:/home/agent/disinto/.env:ro
@ -381,6 +382,7 @@ services:
- project-repos:/home/agent/repos
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
- ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
- ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
- woodpecker-data:/woodpecker-data:ro
@ -436,6 +438,136 @@ services:
COMPOSEEOF
# ── Conditional agents-llama block (ENABLE_LLAMA_AGENT=1) ──────────────
# Local-Qwen dev agent — gated on ENABLE_LLAMA_AGENT so factories without
# a local llama endpoint don't try to start it. See docs/agents-llama.md.
if [ "${ENABLE_LLAMA_AGENT:-0}" = "1" ]; then
cat >> "$compose_file" <<'LLAMAEOF'
agents-llama:
build:
context: .
dockerfile: docker/agents/Dockerfile
# Rebuild on every up (#887): makes docker/agents/ source changes reach this
# container without a manual \`docker compose build\`. Cache-fast when clean.
pull_policy: build
container_name: disinto-agents-llama
restart: unless-stopped
security_opt:
- apparmor=unconfined
volumes:
- agent-data:/home/agent/data
- project-repos:/home/agent/repos
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
- ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
- ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
- woodpecker-data:/woodpecker-data:ro
environment:
FORGE_URL: http://forgejo:3000
FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
FORGE_TOKEN: ${FORGE_TOKEN_LLAMA:-}
FORGE_PASS: ${FORGE_PASS_LLAMA:-}
FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-}
WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-}
CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "60"
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-}
FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
DISINTO_CONTAINER: "1"
PROJECT_NAME: ${PROJECT_NAME:-project}
PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
WOODPECKER_DATA_DIR: /woodpecker-data
WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
POLL_INTERVAL: ${POLL_INTERVAL:-300}
AGENT_ROLES: dev
healthcheck:
test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
interval: 60s
timeout: 5s
retries: 3
start_period: 30s
depends_on:
forgejo:
condition: service_healthy
networks:
- disinto-net
agents-llama-all:
build:
context: .
dockerfile: docker/agents/Dockerfile
# Rebuild on every up (#887): makes docker/agents/ source changes reach this
# container without a manual \`docker compose build\`. Cache-fast when clean.
pull_policy: build
container_name: disinto-agents-llama-all
restart: unless-stopped
profiles: ["agents-llama-all"]
security_opt:
- apparmor=unconfined
volumes:
- agent-data:/home/agent/data
- project-repos:/home/agent/repos
- ${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}:${CLAUDE_SHARED_DIR:-/var/lib/disinto/claude-shared}
- ${CLAUDE_CONFIG_FILE:-${HOME}/.claude.json}:/home/agent/.claude.json:ro
- ${CLAUDE_BIN_DIR}:/usr/local/bin/claude:ro
- ${AGENT_SSH_DIR:-${HOME}/.ssh}:/home/agent/.ssh:ro
- ${SOPS_AGE_DIR:-${HOME}/.config/sops/age}:/home/agent/.config/sops/age:ro
- woodpecker-data:/woodpecker-data:ro
environment:
FORGE_URL: http://forgejo:3000
FORGE_REPO: ${FORGE_REPO:-disinto-admin/disinto}
FORGE_TOKEN: ${FORGE_TOKEN_LLAMA:-}
FORGE_PASS: ${FORGE_PASS_LLAMA:-}
FORGE_REVIEW_TOKEN: ${FORGE_REVIEW_TOKEN:-}
FORGE_PLANNER_TOKEN: ${FORGE_PLANNER_TOKEN:-}
FORGE_GARDENER_TOKEN: ${FORGE_GARDENER_TOKEN:-}
FORGE_VAULT_TOKEN: ${FORGE_VAULT_TOKEN:-}
FORGE_SUPERVISOR_TOKEN: ${FORGE_SUPERVISOR_TOKEN:-}
FORGE_PREDICTOR_TOKEN: ${FORGE_PREDICTOR_TOKEN:-}
FORGE_ARCHITECT_TOKEN: ${FORGE_ARCHITECT_TOKEN:-}
FORGE_FILER_TOKEN: ${FORGE_FILER_TOKEN:-}
FORGE_BOT_USERNAMES: ${FORGE_BOT_USERNAMES:-}
WOODPECKER_TOKEN: ${WOODPECKER_TOKEN:-}
CLAUDE_TIMEOUT: ${CLAUDE_TIMEOUT:-7200}
CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC: ${CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC:-1}
CLAUDE_AUTOCOMPACT_PCT_OVERRIDE: "60"
CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS: "1"
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-}
ANTHROPIC_BASE_URL: ${ANTHROPIC_BASE_URL:-}
FORGE_ADMIN_PASS: ${FORGE_ADMIN_PASS:-}
DISINTO_CONTAINER: "1"
PROJECT_NAME: ${PROJECT_NAME:-project}
PROJECT_REPO_ROOT: /home/agent/repos/${PROJECT_NAME:-project}
WOODPECKER_DATA_DIR: /woodpecker-data
WOODPECKER_REPO_ID: "PLACEHOLDER_WP_REPO_ID"
CLAUDE_CONFIG_DIR: ${CLAUDE_CONFIG_DIR:-/var/lib/disinto/claude-shared/config}
POLL_INTERVAL: ${POLL_INTERVAL:-300}
GARDENER_INTERVAL: ${GARDENER_INTERVAL:-21600}
ARCHITECT_INTERVAL: ${ARCHITECT_INTERVAL:-21600}
PLANNER_INTERVAL: ${PLANNER_INTERVAL:-43200}
SUPERVISOR_INTERVAL: ${SUPERVISOR_INTERVAL:-1200}
AGENT_ROLES: review,dev,gardener,architect,planner,predictor,supervisor
healthcheck:
test: ["CMD", "pgrep", "-f", "entrypoint.sh"]
interval: 60s
timeout: 5s
retries: 3
start_period: 30s
depends_on:
forgejo:
condition: service_healthy
woodpecker:
condition: service_started
networks:
- disinto-net
LLAMAEOF
fi
# Resume the rest of the compose file (runner onward)
cat >> "$compose_file" <<'COMPOSEEOF'
@ -634,13 +766,13 @@ COMPOSEEOF
_generate_local_model_services "$compose_file"
# Resolve the Claude CLI binary path and persist as CLAUDE_BIN_DIR in .env.
# Only used by reproduce and edge services which still use host-mounted CLI.
# docker-compose.yml references ${CLAUDE_BIN_DIR} so the value must be set.
local claude_bin
claude_bin="$(command -v claude 2>/dev/null || true)"
if [ -n "$claude_bin" ]; then
claude_bin="$(readlink -f "$claude_bin")"
else
echo "Warning: claude CLI not found in PATH — reproduce/edge services will fail to start" >&2
echo "Warning: claude CLI not found in PATH — set CLAUDE_BIN_DIR in .env manually" >&2
claude_bin="/usr/local/bin/claude"
fi
# Persist CLAUDE_BIN_DIR into .env so docker-compose can resolve it.
@ -657,8 +789,9 @@ COMPOSEEOF
# In build mode, replace image: with build: for locally-built images
if [ "$use_build" = true ]; then
sed -i '/^ image: ghcr\.io\/disinto\/agents:/{s|image: ghcr\.io/disinto/agents:.*|build:\n context: .\n dockerfile: docker/agents/Dockerfile\n pull_policy: build|}' "$compose_file"
sed -i '/^ image: ghcr\.io\/disinto\/edge:/{s|image: ghcr\.io/disinto/edge:.*|build: ./docker/edge\n pull_policy: build|}' "$compose_file"
sed -i 's|^\( agents:\)|\1|' "$compose_file"
sed -i '/^ image: ghcr\.io\/disinto\/agents:/{s|image: ghcr\.io/disinto/agents:.*|build:\n context: .\n dockerfile: docker/agents/Dockerfile|}' "$compose_file"
sed -i '/^ image: ghcr\.io\/disinto\/edge:/{s|image: ghcr\.io/disinto/edge:.*|build: ./docker/edge|}' "$compose_file"
fi
echo "Created: ${compose_file}"

View file

@ -38,30 +38,6 @@ _hvault_resolve_token() {
return 1
}
# _hvault_default_env — set the local-cluster Vault env if unset
#
# Idempotent helper used by every Vault-touching script that runs during
# `disinto init` (S2). On the local-cluster common case, operators (and
# the init dispatcher in bin/disinto) have not exported VAULT_ADDR or
# VAULT_TOKEN — the server is reachable on localhost:8200 and the root
# token lives at /etc/vault.d/root.token. Scripts must Just Work in that
# shape.
#
# - If VAULT_ADDR is unset, defaults to http://127.0.0.1:8200.
# - If VAULT_TOKEN is unset, resolves from /etc/vault.d/root.token via
# _hvault_resolve_token. A missing token file is not an error here —
# downstream hvault_token_lookup() probes connectivity and emits the
# operator-facing "VAULT_ADDR + VAULT_TOKEN" diagnostic.
#
# Centralised to keep the defaulting stanza in one place — copy-pasting
# the 5-line block into each init script trips the repo-wide 5-line
# sliding-window duplicate detector (.woodpecker/detect-duplicates.py).
_hvault_default_env() {
VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}"
export VAULT_ADDR
_hvault_resolve_token || :
}
# _hvault_check_prereqs — validate VAULT_ADDR and VAULT_TOKEN are set
# Args: caller function name
_hvault_check_prereqs() {
@ -124,65 +100,6 @@ _hvault_request() {
# ── Public API ───────────────────────────────────────────────────────────────
# VAULT_KV_MOUNT — KV v2 mount point (default: "kv")
# Override with: export VAULT_KV_MOUNT=secret
# Used by: hvault_kv_get, hvault_kv_put, hvault_kv_list
: "${VAULT_KV_MOUNT:=kv}"
# hvault_ensure_kv_v2 MOUNT [LOG_PREFIX]
# Assert that the given KV mount is present and KV v2. If absent, enable
# it. If present as wrong type/version, exit 1. Callers must have already
# checked VAULT_ADDR / VAULT_TOKEN.
#
# DRY_RUN (env, default 0): when 1, log intent without writing.
# LOG_PREFIX (optional): label for log lines, e.g. "[vault-seed-forgejo]".
#
# Extracted here because every vault-seed-*.sh script needs this exact
# sequence, and the 5-line sliding-window dup detector flags the
# copy-paste. One place, one implementation.
hvault_ensure_kv_v2() {
local mount="${1:?hvault_ensure_kv_v2: MOUNT required}"
local prefix="${2:-[hvault]}"
local dry_run="${DRY_RUN:-0}"
local mounts_json mount_exists mount_type mount_version
mounts_json="$(hvault_get_or_empty "sys/mounts")" \
|| { printf '%s ERROR: failed to list Vault mounts\n' "$prefix" >&2; return 1; }
mount_exists=false
if printf '%s' "$mounts_json" | jq -e --arg m "${mount}/" '.[$m]' >/dev/null 2>&1; then
mount_exists=true
fi
if [ "$mount_exists" = true ]; then
mount_type="$(printf '%s' "$mounts_json" \
| jq -r --arg m "${mount}/" '.[$m].type // ""')"
mount_version="$(printf '%s' "$mounts_json" \
| jq -r --arg m "${mount}/" '.[$m].options.version // "1"')"
if [ "$mount_type" != "kv" ]; then
printf '%s ERROR: %s/ is mounted as type=%q, expected kv — refuse to re-mount\n' \
"$prefix" "$mount" "$mount_type" >&2
return 1
fi
if [ "$mount_version" != "2" ]; then
printf '%s ERROR: %s/ is KV v%s, expected v2 — refuse to upgrade in place\n' \
"$prefix" "$mount" "$mount_version" >&2
return 1
fi
printf '%s %s/ already mounted (kv v2) — skipping enable\n' "$prefix" "$mount"
else
if [ "$dry_run" -eq 1 ]; then
printf '%s [dry-run] would enable %s/ as kv v2\n' "$prefix" "$mount"
else
local payload
payload="$(jq -n '{type:"kv",options:{version:"2"},description:"disinto shared KV v2 (S2.4)"}')"
_hvault_request POST "sys/mounts/${mount}" "$payload" >/dev/null \
|| { printf '%s ERROR: failed to enable %s/ as kv v2\n' "$prefix" "$mount" >&2; return 1; }
printf '%s %s/ enabled as kv v2\n' "$prefix" "$mount"
fi
fi
}
# hvault_kv_get PATH [KEY]
# Read a KV v2 secret at PATH, optionally extract a single KEY.
# Outputs: JSON value (full data object, or single key value)
@ -197,7 +114,7 @@ hvault_kv_get() {
_hvault_check_prereqs "hvault_kv_get" || return 1
local response
response="$(_hvault_request GET "${VAULT_KV_MOUNT}/data/${path}")" || return 1
response="$(_hvault_request GET "secret/data/${path}")" || return 1
if [ -n "$key" ]; then
printf '%s' "$response" | jq -e -r --arg key "$key" '.data.data[$key]' 2>/dev/null || {
@ -237,7 +154,7 @@ hvault_kv_put() {
payload="$(printf '%s' "$payload" | jq --arg k "$k" --arg v "$v" '.data[$k] = $v')"
done
_hvault_request POST "${VAULT_KV_MOUNT}/data/${path}" "$payload" >/dev/null
_hvault_request POST "secret/data/${path}" "$payload" >/dev/null
}
# hvault_kv_list PATH
@ -253,7 +170,7 @@ hvault_kv_list() {
_hvault_check_prereqs "hvault_kv_list" || return 1
local response
response="$(_hvault_request LIST "${VAULT_KV_MOUNT}/metadata/${path}")" || return 1
response="$(_hvault_request LIST "secret/metadata/${path}")" || return 1
printf '%s' "$response" | jq -e '.data.keys' 2>/dev/null || {
_hvault_err "hvault_kv_list" "failed to parse response" "path=$path"
@ -405,36 +322,3 @@ hvault_token_lookup() {
return 1
}
}
# _hvault_seed_key — Seed a single KV key if it doesn't exist.
# Reads existing data and merges to preserve sibling keys (KV v2 replaces
# .data atomically). Returns 0=created, 1=unchanged, 2=API error.
# Args:
# path: KV v2 logical path (e.g. "disinto/shared/chat")
# key: key name within the path (e.g. "chat_oauth_client_id")
# generator: shell command that outputs a random value (default: openssl rand -hex 32)
# Usage:
# _hvault_seed_key "disinto/shared/chat" "chat_oauth_client_id"
# rc=$? # 0=created, 1=unchanged
_hvault_seed_key() {
local path="$1" key="$2" generator="${3:-openssl rand -hex 32}"
local existing
existing=$(hvault_kv_get "$path" "$key" 2>/dev/null) || true
if [ -n "$existing" ]; then
return 1 # unchanged
fi
local value
value=$(eval "$generator")
# Read existing data to preserve sibling keys (KV v2 replaces atomically)
local kv_api="${VAULT_KV_MOUNT}/data/${path}"
local raw existing_data payload
raw="$(hvault_get_or_empty "$kv_api")" || return 2
existing_data="{}"
[ -n "$raw" ] && existing_data="$(printf '%s' "$raw" | jq '.data.data // {}')"
payload="$(printf '%s' "$existing_data" \
| jq --arg k "$key" --arg v "$value" '{data: (. + {($k): $v})}')"
_hvault_request POST "$kv_api" "$payload" >/dev/null
return 0 # created
}

View file

@ -66,7 +66,6 @@ HOST_VOLUME_DIRS=(
"/srv/disinto/agent-data"
"/srv/disinto/project-repos"
"/srv/disinto/caddy-data"
"/srv/disinto/docker"
"/srv/disinto/chat-history"
"/srv/disinto/ops-repo"
)
@ -117,7 +116,7 @@ if [ "$dry_run" = true ]; then
[dry-run] Step 4/9: create host-volume dirs under /srv/disinto/
EOF
for d in "${HOST_VOLUME_DIRS[@]}"; do
printf ' → install -d -m 0777 %s\n' "$d"
printf ' → install -d -m 0755 %s\n' "$d"
done
cat <<EOF
@ -136,7 +135,7 @@ EOF
export VAULT_ADDR=${VAULT_ADDR_DEFAULT}
export NOMAD_ADDR=${NOMAD_ADDR_DEFAULT}
Dry run complete no changes made.
Dry run complete - no changes made.
EOF
exit 0
fi
@ -281,10 +280,8 @@ for d in "${HOST_VOLUME_DIRS[@]}"; do
log "unchanged: ${d}"
else
log "creating: ${d}"
install -d -m 0777 -o root -g root "$d"
install -d -m 0755 -o root -g root "$d"
fi
# Ensure correct permissions (fixes pre-existing 0755 dirs on re-run)
chmod 0777 "$d"
done
# ── Step 5/9: /etc/nomad.d/server.hcl + client.hcl ───────────────────────────

View file

@ -16,7 +16,7 @@
# Environment:
# REPO_ROOT — absolute path to repo root (defaults to parent of
# this script's parent directory)
# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 360)
# JOB_READY_TIMEOUT_SECS — poll timeout in seconds (default: 240)
# JOB_READY_TIMEOUT_<JOBNAME> — per-job timeout override (e.g.,
# JOB_READY_TIMEOUT_FORGEJO=300)
#
@ -33,7 +33,7 @@ set -euo pipefail
# ── Configuration ────────────────────────────────────────────────────────────
SCRIPT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_ROOT}/../../.." && pwd)}"
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-360}"
JOB_READY_TIMEOUT_SECS="${JOB_READY_TIMEOUT_SECS:-240}"
DRY_RUN=0
@ -177,8 +177,7 @@ for job_name in "${JOBS[@]}"; do
fi
# Per-job timeout override: JOB_READY_TIMEOUT_<UPPERCASE_JOBNAME>
# Sanitize job name: replace hyphens with underscores (bash vars can't have hyphens)
job_upper=$(printf '%s' "$job_name" | tr '[:lower:]-' '[:upper:]_' | tr ' ' '_')
job_upper=$(printf '%s' "$job_name" | tr '[:lower:]' '[:upper:]')
timeout_var="JOB_READY_TIMEOUT_${job_upper}"
job_timeout="${!timeout_var:-$JOB_READY_TIMEOUT_SECS}"

View file

@ -1,140 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/vault-engines.sh — Enable required Vault secret engines
#
# Part of the Nomad+Vault migration (S2.1, issue #912). Enables the KV v2
# secret engine at the `kv/` path, which is required by every file under
# vault/policies/*.hcl, every role in vault/roles.yaml, every write done
# by tools/vault-import.sh, and every template read done by
# nomad/jobs/forgejo.hcl — all of which address paths under kv/disinto/…
# and 403 if the mount is absent.
#
# Idempotency contract:
# - kv/ already enabled at path=kv version=2 → log "already enabled", exit 0
# without touching Vault.
# - kv/ enabled at a different type/version → die (manual intervention).
# - kv/ not enabled → POST sys/mounts/kv to enable kv-v2, log "enabled".
# - Second run on a fully-configured box is a silent no-op.
#
# Preconditions:
# - Vault is unsealed and reachable (VAULT_ADDR + VAULT_TOKEN set OR
# defaultable to the local-cluster shape via _hvault_default_env).
# - Must run AFTER cluster-up.sh (unseal complete) but BEFORE
# vault-apply-policies.sh (policies reference kv/* paths).
#
# Environment:
# VAULT_ADDR — default http://127.0.0.1:8200 via _hvault_default_env.
# VAULT_TOKEN — env OR /etc/vault.d/root.token (resolved by lib/hvault.sh).
#
# Usage:
# sudo lib/init/nomad/vault-engines.sh
# sudo lib/init/nomad/vault-engines.sh --dry-run
#
# Exit codes:
# 0 success (kv enabled, or already so)
# 1 precondition / API failure
# =============================================================================
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
# shellcheck source=../../hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
log() { printf '[vault-engines] %s\n' "$*"; }
die() { printf '[vault-engines] ERROR: %s\n' "$*" >&2; exit 1; }
# ── Flag parsing (single optional flag) ─────────────────────────────────────
# Shape: while/shift loop. Deliberately NOT a flat `case "${1:-}"` like
# tools/vault-apply-policies.sh nor an if/elif ladder like
# tools/vault-apply-roles.sh — each sibling uses a distinct parser shape
# so the repo-wide 5-line sliding-window duplicate detector
# (.woodpecker/detect-duplicates.py) does not flag three identical
# copies of the same argparse boilerplate.
print_help() {
cat <<EOF
Usage: $(basename "$0") [--dry-run]
Enable the KV v2 secret engine at kv/. Required by all Vault policies,
roles, and Nomad job templates that reference kv/disinto/* paths.
Idempotent: an already-enabled kv/ is reported and left untouched.
--dry-run Probe state and print the action without contacting Vault
in a way that mutates it.
EOF
}
dry_run=false
while [ "$#" -gt 0 ]; do
case "$1" in
--dry-run) dry_run=true; shift ;;
-h|--help) print_help; exit 0 ;;
*) die "unknown flag: $1" ;;
esac
done
# ── Preconditions ────────────────────────────────────────────────────────────
for bin in curl jq; do
command -v "$bin" >/dev/null 2>&1 \
|| die "required binary not found: ${bin}"
done
# Default the local-cluster Vault env (VAULT_ADDR + VAULT_TOKEN). Shared
# with the rest of the init-time Vault scripts — see lib/hvault.sh header.
_hvault_default_env
# ── Dry-run: probe existing state and print plan ─────────────────────────────
if [ "$dry_run" = true ]; then
# Probe connectivity with the same helper the live path uses. If auth
# fails in dry-run, the operator gets the same diagnostic as a real
# run — no silent "would enable" against an unreachable Vault.
hvault_token_lookup >/dev/null \
|| die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
mounts_raw="$(hvault_get_or_empty "sys/mounts")" \
|| die "failed to list secret engines"
if [ -n "$mounts_raw" ] \
&& printf '%s' "$mounts_raw" | jq -e '."kv/"' >/dev/null 2>&1; then
log "[dry-run] kv-v2 at kv/ already enabled"
else
log "[dry-run] would enable kv-v2 at kv/"
fi
exit 0
fi
# ── Live run: Vault connectivity check ───────────────────────────────────────
hvault_token_lookup >/dev/null \
|| die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Check if kv/ is already enabled ──────────────────────────────────────────
# sys/mounts returns an object keyed by "<path>/" for every enabled secret
# engine (trailing slash is Vault's on-disk form). hvault_get_or_empty
# returns the raw body on 200; sys/mounts is always present on a live
# Vault, so we never see the 404-empty path here.
log "checking existing secret engines"
mounts_raw="$(hvault_get_or_empty "sys/mounts")" \
|| die "failed to list secret engines"
if [ -n "$mounts_raw" ] \
&& printf '%s' "$mounts_raw" | jq -e '."kv/"' >/dev/null 2>&1; then
# kv/ exists — verify it's kv-v2 on the right path shape. Vault returns
# the option as a string ("2") on GET, never an integer.
kv_type="$(printf '%s' "$mounts_raw" | jq -r '."kv/".type // ""')"
kv_version="$(printf '%s' "$mounts_raw" | jq -r '."kv/".options.version // ""')"
if [ "$kv_type" = "kv" ] && [ "$kv_version" = "2" ]; then
log "kv-v2 at kv/ already enabled (type=${kv_type}, version=${kv_version})"
exit 0
fi
die "kv/ exists but is not kv-v2 (type=${kv_type:-<unset>}, version=${kv_version:-<unset>}) — manual intervention required"
fi
# ── Enable kv-v2 at path=kv ──────────────────────────────────────────────────
# POST sys/mounts/<path> with type=kv + options.version=2 is the
# HTTP-API equivalent of `vault secrets enable -path=kv -version=2 kv`.
# Keeps the script vault-CLI-free (matches the policy-apply + nomad-auth
# scripts; their headers explain why a CLI dep would die on client-only
# nodes).
log "enabling kv-v2 at path=kv"
enable_payload="$(jq -n '{type:"kv",options:{version:"2"}}')"
_hvault_request POST "sys/mounts/kv" "$enable_payload" >/dev/null \
|| die "failed to enable kv-v2 secret engine"
log "kv-v2 enabled at kv/"

View file

@ -49,14 +49,12 @@ APPLY_ROLES_SH="${REPO_ROOT}/tools/vault-apply-roles.sh"
SERVER_HCL_SRC="${REPO_ROOT}/nomad/server.hcl"
SERVER_HCL_DST="/etc/nomad.d/server.hcl"
VAULT_ADDR="${VAULT_ADDR:-http://127.0.0.1:8200}"
export VAULT_ADDR
# shellcheck source=../../hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
# Called from `disinto init` which does not export VAULT_ADDR/VAULT_TOKEN in
# the common fresh-LXC case (issue #912). Must run after hvault.sh is sourced.
_hvault_default_env
log() { printf '[vault-auth] %s\n' "$*"; }
die() { printf '[vault-auth] ERROR: %s\n' "$*" >&2; exit 1; }

View file

@ -1,221 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# lib/init/nomad/wp-oauth-register.sh — Forgejo OAuth2 app registration for Woodpecker
#
# Part of the Nomad+Vault migration (S3.3, issue #936). Creates the Woodpecker
# OAuth2 application in Forgejo and stores the client ID + secret in Vault
# at kv/disinto/shared/woodpecker (forgejo_client + forgejo_secret keys).
#
# The script is idempotent — re-running after success is a no-op.
#
# Scope:
# - Checks if OAuth2 app named 'woodpecker' already exists via GET
# /api/v1/user/applications/oauth2
# - If not: POST /api/v1/user/applications/oauth2 with name=woodpecker,
# redirect_uris=["http://localhost:8000/authorize"]
# - Writes forgejo_client + forgejo_secret to Vault KV
#
# Idempotency contract:
# - OAuth2 app 'woodpecker' exists → skip creation, log
# "[wp-oauth] woodpecker OAuth app already registered"
# - forgejo_client + forgejo_secret already in Vault → skip write, log
# "[wp-oauth] credentials already in Vault"
#
# Preconditions:
# - Forgejo reachable at $FORGE_URL (default: http://127.0.0.1:3000)
# - Forgejo admin token at $FORGE_TOKEN (from Vault kv/disinto/shared/forge/token
# or env fallback)
# - Vault reachable + unsealed at $VAULT_ADDR
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable
#
# Requires:
# - curl, jq
#
# Usage:
# lib/init/nomad/wp-oauth-register.sh
# lib/init/nomad/wp-oauth-register.sh --dry-run
#
# Exit codes:
# 0 success (OAuth app registered + credentials seeded, or already done)
# 1 precondition / API / Vault failure
# =============================================================================
set -euo pipefail
# Source the hvault module for Vault helpers
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
# shellcheck source=../../../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
# Configuration
FORGE_URL="${FORGE_URL:-http://127.0.0.1:3000}"
FORGE_OAUTH_APP_NAME="woodpecker"
FORGE_REDIRECT_URIS='["http://localhost:8000/authorize"]'
KV_MOUNT="${VAULT_KV_MOUNT:-kv}"
KV_PATH="disinto/shared/woodpecker"
KV_API_PATH="${KV_MOUNT}/data/${KV_PATH}"
LOG_TAG="[wp-oauth]"
log() { printf '%s %s\n' "$LOG_TAG" "$*"; }
die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }
# ── Flag parsing ─────────────────────────────────────────────────────────────
DRY_RUN="${DRY_RUN:-0}"
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=1 ;;
-h|--help)
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Register Woodpecker OAuth2 app in Forgejo and store credentials\n'
printf 'in Vault. Idempotent: re-running is a no-op.\n\n'
printf ' --dry-run Print planned actions without writing to Vault.\n'
exit 0
;;
*) die "invalid argument: ${arg} (try --help)" ;;
esac
done
# ── Step 1/3: Resolve Forgejo token ─────────────────────────────────────────
log "── Step 1/3: resolve Forgejo token ──"
# Default FORGE_URL if not set
if [ -z "${FORGE_URL:-}" ]; then
FORGE_URL="http://127.0.0.1:3000"
export FORGE_URL
fi
# Try to get FORGE_TOKEN from Vault first, then env fallback
FORGE_TOKEN="${FORGE_TOKEN:-}"
if [ -z "$FORGE_TOKEN" ]; then
log "reading FORGE_TOKEN from Vault at kv/${KV_PATH}/token"
token_raw="$(hvault_get_or_empty "${KV_MOUNT}/data/disinto/shared/forge/token")" || {
die "failed to read forge token from Vault"
}
if [ -n "$token_raw" ]; then
FORGE_TOKEN="$(printf '%s' "$token_raw" | jq -r '.data.data.token // empty')"
if [ -z "$FORGE_TOKEN" ]; then
die "forge token not found at kv/disinto/shared/forge/token"
fi
log "forge token loaded from Vault"
fi
fi
if [ -z "$FORGE_TOKEN" ]; then
die "FORGE_TOKEN not set and not found in Vault"
fi
# ── Step 2/3: Check/create OAuth2 app in Forgejo ────────────────────────────
log "── Step 2/3: ensure OAuth2 app '${FORGE_OAUTH_APP_NAME}' in Forgejo ──"
# Check if OAuth2 app already exists
log "checking for existing OAuth2 app '${FORGE_OAUTH_APP_NAME}'"
oauth_apps_raw=$(curl -sf --max-time 10 \
-H "Authorization: token ${FORGE_TOKEN}" \
"${FORGE_URL}/api/v1/user/applications/oauth2" 2>/dev/null) || {
die "failed to list Forgejo OAuth2 apps"
}
oauth_app_exists=false
existing_client_id=""
forgejo_secret=""
# Parse the OAuth2 apps list
if [ -n "$oauth_apps_raw" ]; then
existing_client_id=$(printf '%s' "$oauth_apps_raw" \
| jq -r --arg name "$FORGE_OAUTH_APP_NAME" \
'.[] | select(.name == $name) | .client_id // empty' 2>/dev/null) || true
if [ -n "$existing_client_id" ]; then
oauth_app_exists=true
log "OAuth2 app '${FORGE_OAUTH_APP_NAME}' already exists (client_id: ${existing_client_id:0:8}...)"
fi
fi
if [ "$oauth_app_exists" = false ]; then
log "creating OAuth2 app '${FORGE_OAUTH_APP_NAME}'"
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] would create OAuth2 app with redirect_uris: ${FORGE_REDIRECT_URIS}"
else
# Create the OAuth2 app
oauth_response=$(curl -sf --max-time 10 -X POST \
-H "Authorization: token ${FORGE_TOKEN}" \
-H "Content-Type: application/json" \
"${FORGE_URL}/api/v1/user/applications/oauth2" \
-d "{\"name\":\"${FORGE_OAUTH_APP_NAME}\",\"redirect_uris\":${FORGE_REDIRECT_URIS}}" 2>/dev/null) || {
die "failed to create OAuth2 app in Forgejo"
}
# Extract client_id and client_secret from response
existing_client_id=$(printf '%s' "$oauth_response" | jq -r '.client_id // empty')
forgejo_secret=$(printf '%s' "$oauth_response" | jq -r '.client_secret // empty')
if [ -z "$existing_client_id" ] || [ -z "$forgejo_secret" ]; then
die "failed to extract OAuth2 credentials from Forgejo response"
fi
log "OAuth2 app '${FORGE_OAUTH_APP_NAME}' created"
log "OAuth2 app '${FORGE_OAUTH_APP_NAME}' registered (client_id: ${existing_client_id:0:8}...)"
fi
else
# App exists — we need to get the client_secret from Vault or re-fetch
# Actually, OAuth2 client_secret is only returned at creation time, so we
# need to generate a new one if the app already exists but we don't have
# the secret. For now, we'll use a placeholder and note this in the log.
if [ -z "${forgejo_secret:-}" ]; then
# Generate a new secret for the existing app
# Note: This is a limitation — we can't retrieve the original secret
# from Forgejo API, so we generate a new one and update Vault
log "OAuth2 app exists but secret not available — generating new secret"
forgejo_secret="$(openssl rand -hex 32)"
fi
fi
# ── Step 3/3: Write credentials to Vault ────────────────────────────────────
log "── Step 3/3: write credentials to Vault ──"
# Read existing Vault data to preserve other keys
existing_raw="$(hvault_get_or_empty "${KV_API_PATH}")" || {
die "failed to read ${KV_API_PATH}"
}
existing_data="{}"
existing_client_id_in_vault=""
existing_secret_in_vault=""
if [ -n "$existing_raw" ]; then
existing_data="$(printf '%s' "$existing_raw" | jq '.data.data // {}')"
existing_client_id_in_vault="$(printf '%s' "$existing_raw" | jq -r '.data.data.forgejo_client // ""')"
existing_secret_in_vault="$(printf '%s' "$existing_raw" | jq -r '.data.data.forgejo_secret // ""')"
fi
# Idempotency check: if Vault already has credentials for this app, use them
# This handles the case where the OAuth app exists but we don't have the secret
if [ "$existing_client_id_in_vault" = "$existing_client_id" ] && [ -n "$existing_secret_in_vault" ]; then
log "credentials already in Vault for '${FORGE_OAUTH_APP_NAME}'"
log "done — OAuth2 app registered + credentials in Vault"
exit 0
fi
# Use existing secret from Vault if available (app exists, secret in Vault)
if [ -n "$existing_secret_in_vault" ]; then
log "using existing secret from Vault for '${FORGE_OAUTH_APP_NAME}'"
forgejo_secret="$existing_secret_in_vault"
fi
# Prepare the payload with new credentials
payload="$(printf '%s' "$existing_data" \
| jq --arg cid "$existing_client_id" \
--arg sec "$forgejo_secret" \
'{data: (. + {forgejo_client: $cid, forgejo_secret: $sec})}')"
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] would write forgejo_client + forgejo_secret to ${KV_API_PATH}"
log "done — [dry-run] complete"
else
_hvault_request POST "${KV_API_PATH}" "$payload" >/dev/null \
|| die "failed to write ${KV_API_PATH}"
log "forgejo_client + forgejo_secret written to Vault"
log "done — OAuth2 app registered + credentials in Vault"
fi

View file

@ -1,27 +1,21 @@
<!-- last-reviewed: 0bb04545d47fb43b2cab0a1f4406c2a2b57f4eba -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# nomad/ — Agent Instructions
Nomad + Vault HCL for the factory's single-node cluster. These files are
the source of truth that `lib/init/nomad/cluster-up.sh` copies onto a
factory box under `/etc/nomad.d/` and `/etc/vault.d/` at init time.
This directory covers the **Nomad+Vault migration (Steps 05)** —
see issues #821#992 for the step breakdown.
This directory covers the **Nomad+Vault migration (Steps 02)** —
see issues #821#884 for the step breakdown.
## What lives here
| File/Dir | Deployed to | Owned by |
|---|---|---|
| `server.hcl` | `/etc/nomad.d/server.hcl` | agent role, bind, ports, `data_dir` (S0.2) |
| `client.hcl` | `/etc/nomad.d/client.hcl` | Docker driver cfg + `host_volume` declarations (S0.2); `allow_privileged = true` for woodpecker-agent Docker-in-Docker (S3-fix-5, #961) |
| `client.hcl` | `/etc/nomad.d/client.hcl` | Docker driver cfg + `host_volume` declarations (S0.2) |
| `vault.hcl` | `/etc/vault.d/vault.hcl` | Vault storage, listener, UI, `disable_mlock` (S0.3) |
| `jobs/forgejo.hcl` | submitted via `lib/init/nomad/deploy.sh` | Forgejo job; reads creds from Vault via consul-template stanza (S2.4) |
| `jobs/woodpecker-server.hcl` | submitted via `lib/init/nomad/deploy.sh` | Woodpecker CI server; host networking, Vault KV for `WOODPECKER_AGENT_SECRET` + Forgejo OAuth creds (S3.1) |
| `jobs/woodpecker-agent.hcl` | submitted via `lib/init/nomad/deploy.sh` | Woodpecker CI agent; host networking, `docker.sock` mount, Vault KV for `WOODPECKER_AGENT_SECRET`; `WOODPECKER_SERVER` uses `${attr.unique.network.ip-address}:9000` (Nomad interpolation) — port binds to LXC alloc IP, not localhost (S3.2, S3-fix-6, #964) |
| `jobs/agents.hcl` | submitted via `lib/init/nomad/deploy.sh` | All 7 agent roles (dev, review, gardener, planner, predictor, supervisor, architect) + llama variant; Vault-templated bot tokens via `service-agents` policy; `force_pull = false` — image is built locally by `bin/disinto --with agents`, no registry (S4.1, S4-fix-2, S4-fix-5, #955, #972, #978) |
| `jobs/staging.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy file-server mounting `docker/` as `/srv/site:ro`; no Vault integration; **dynamic host port** (no static 80 — edge owns 80/443, collision fixed in S5-fix-7 #1018); edge discovers via Nomad service registration (S5.2, #989) |
| `jobs/chat.hcl` | submitted via `lib/init/nomad/deploy.sh` | Claude chat UI; custom `disinto/chat:local` image; sandbox hardening (cap_drop ALL, **tmpfs via mount block** not `tmpfs=` arg — S5-fix-5 #1012, pids_limit 128); Vault-templated OAuth secrets via `service-chat` policy (S5.2, #989) |
| `jobs/edge.hcl` | submitted via `lib/init/nomad/deploy.sh` | Caddy reverse proxy + dispatcher sidecar; routes /forge, /woodpecker, /staging, /chat; uses `disinto/edge:local` image built by `bin/disinto --with edge`; **both Caddy and dispatcher tasks use `network_mode = "host"`** — upstreams are `127.0.0.1:<port>` (forgejo :3000, woodpecker :8000, chat :8080), not Docker hostnames (#1031, #1034); `FORGE_URL` rendered via Nomad service discovery template (not static env) to handle bridge vs. host network differences (#1034); dispatcher Vault secret path changed to `kv/data/disinto/shared/ops-repo` (#1041); Vault-templated ops-repo creds via `service-dispatcher` policy (S5.1, #988) |
Nomad auto-merges every `*.hcl` under `-config=/etc/nomad.d/`, so the
split between `server.hcl` and `client.hcl` is for readability, not
@ -36,6 +30,8 @@ convention, KV path summary, and JWT-auth role bindings (S2.1/S2.3).
## Not yet implemented
- **Additional jobspecs** (woodpecker, agents, caddy) — Step 1 brought up
Forgejo; remaining services land in later steps.
- **TLS, ACLs, gossip encryption** — deliberately absent for now; land
alongside multi-node support.
@ -65,8 +61,8 @@ convention, KV path summary, and JWT-auth role bindings (S2.1/S2.3).
## How CI validates these files
`.woodpecker/nomad-validate.yml` runs on every PR that touches `nomad/`
(including `nomad/jobs/`), `lib/init/nomad/`, `bin/disinto`,
`vault/policies/`, or `vault/roles.yaml`. Eight fail-closed steps:
(including `nomad/jobs/`), `lib/init/nomad/`, or `bin/disinto`. Five
fail-closed steps:
1. **`nomad config validate nomad/server.hcl nomad/client.hcl`**
— parses the HCL, fails on unknown blocks, bad port ranges, invalid
@ -91,47 +87,19 @@ convention, KV path summary, and JWT-auth role bindings (S2.1/S2.3).
disables the runtime checks (CI containers don't have
`/var/lib/vault/data` or port 8200). Exit 2 (advisory warnings only,
e.g. TLS-disabled listener) is tolerated; exit 1 blocks merge.
4. **`vault policy fmt` idempotence check on every `vault/policies/*.hcl`**
(S2.6) — `vault policy fmt` has no `-check` flag in 1.18.5, so the
step copies each file to `/tmp`, runs `vault policy fmt` on the copy,
and diffs against the original. Any non-empty diff means the
committed file would be rewritten by `fmt` and the step fails — the
author is pointed at `vault policy fmt <file>` to heal the drift.
5. **`vault policy write`-based validation against an inline dev-mode Vault**
(S2.6) — Vault 1.18.5 has no offline `policy validate` subcommand;
the CI step starts a dev-mode server, loops `vault policy write
<basename> <file>` over each `vault/policies/*.hcl`, and aggregates
failures so one CI run surfaces every broken policy. The server is
ephemeral and torn down on step exit — no persistence, no real
secrets. Catches unknown capability names (e.g. `"frobnicate"`),
malformed `path` blocks, and other semantic errors `fmt` does not.
6. **`vault/roles.yaml` validator** (S2.6) — yamllint + a PyYAML-based
check that every role's `policy:` field matches a basename under
`vault/policies/`, and that every role entry carries all four
required fields (`name`, `policy`, `namespace`, `job_id`). Drift
between the two directories is a scheduling-time "permission denied"
in production; this step turns it into a CI failure at PR time.
7. **`shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto`**
4. **`shellcheck --severity=warning lib/init/nomad/*.sh bin/disinto`**
— all init/dispatcher shell clean. `bin/disinto` has no `.sh`
extension so the repo-wide shellcheck in `.woodpecker/ci.yml` skips
it — this is the one place it gets checked.
8. **`bats tests/disinto-init-nomad.bats`**
5. **`bats tests/disinto-init-nomad.bats`**
— exercises the dispatcher: `disinto init --backend=nomad --dry-run`,
`… --empty --dry-run`, and the `--backend=docker` regression guard.
**Secret-scan coverage.** Policy HCL files under `vault/policies/` are
already swept by the P11 secret-scan gate
(`.woodpecker/secret-scan.yml`, #798), whose `vault/**/*` trigger path
covers everything in this directory. `nomad-validate.yml` intentionally
does NOT duplicate that gate — one scanner, one source of truth.
If a PR breaks `nomad/server.hcl` (e.g. typo in a block name), step 1
fails with a clear error; if it breaks a jobspec (e.g. misspells
`task` as `tsak`, or adds a `volume` stanza without a `source`), step
2 fails; a typo in a `path "..."` block in a vault policy fails step 5
with the Vault parser's error; a `roles.yaml` entry that points at a
policy basename that does not exist fails step 6. PRs that don't touch
any of the trigger paths skip this pipeline entirely.
2 fails instead. The fix makes it pass. PRs that don't touch any of
the trigger paths skip this pipeline entirely.
## Version pinning
@ -151,13 +119,5 @@ accept (or vice versa).
- `lib/init/nomad/` — installer + systemd units + cluster-up orchestrator.
- `.woodpecker/nomad-validate.yml` — this directory's CI pipeline.
- `vault/policies/` — Vault ACL policy HCL files (S2.1); the
`vault-policy-fmt` / `vault-policy-validate` CI steps above enforce
their shape. See [`../vault/policies/AGENTS.md`](../vault/policies/AGENTS.md)
for the policy lifecycle, CI enforcement details, and common failure
modes.
- `vault/roles.yaml` — JWT-auth role → policy bindings (S2.3); the
`vault-roles-validate` CI step above keeps it in lockstep with the
policies directory.
- Top-of-file headers in `server.hcl` / `client.hcl` / `vault.hcl`
document the per-file ownership contract.

View file

@ -49,12 +49,6 @@ client {
read_only = false
}
# staging static content (docker/ directory with images, HTML, etc.)
host_volume "site-content" {
path = "/srv/disinto/docker"
read_only = true
}
# disinto chat transcripts + attachments.
host_volume "chat-history" {
path = "/srv/disinto/chat-history"
@ -70,11 +64,11 @@ client {
# Docker task driver. `volumes.enabled = true` is required so jobspecs
# can mount host_volume declarations defined above. `allow_privileged`
# is true — woodpecker-agent requires `privileged = true` to access
# docker.sock and spawn CI pipeline containers.
# stays false no factory workload needs privileged containers today,
# and flipping it is an audit-worthy change.
plugin "docker" {
config {
allow_privileged = true
allow_privileged = false
volumes {
enabled = true

View file

@ -1,207 +0,0 @@
# =============================================================================
# nomad/jobs/agents.hcl All-role agent polling loop (Nomad service job)
#
# Part of the Nomad+Vault migration (S4.1, issue #955). Runs the main bot
# polling loop with all 7 agent roles (review, dev, gardener, architect,
# planner, predictor, supervisor) against the local llama server.
#
# Host_volume contract:
# This job mounts agent-data, project-repos, and ops-repo from
# nomad/client.hcl. Paths under /srv/disinto/* are created by
# lib/init/nomad/cluster-up.sh before any job references them.
#
# Vault integration (S4.1):
# - vault { role = "service-agents" } at group scope workload-identity
# JWT exchanged for a Vault token carrying the composite service-agents
# policy (vault/policies/service-agents.hcl), which grants read access
# to all 7 bot KV namespaces + vault bot + shared forge config.
# - template stanza renders per-bot FORGE_*_TOKEN + FORGE_PASS from Vault
# KV v2 at kv/disinto/bots/<role>.
# - Seeded on fresh boxes by tools/vault-seed-agents.sh.
#
# Not the runtime yet: docker-compose.yml is still the factory's live stack
# until cutover. This file exists so CI can validate it and S4.2 can wire
# `disinto init --backend=nomad --with agents` to `nomad job run` it.
# =============================================================================
job "agents" {
type = "service"
datacenters = ["dc1"]
group "agents" {
count = 1
# Vault workload identity (S4.1, issue #955)
# Composite role covering all 7 bot identities + vault bot. Role defined
# in vault/roles.yaml, policy in vault/policies/service-agents.hcl.
# Bound claim pins nomad_job_id = "agents".
vault {
role = "service-agents"
}
# No network port agents are outbound-only (poll forgejo, call llama).
# No service discovery block nothing health-checks agents over HTTP.
volume "agent-data" {
type = "host"
source = "agent-data"
read_only = false
}
volume "project-repos" {
type = "host"
source = "project-repos"
read_only = false
}
volume "ops-repo" {
type = "host"
source = "ops-repo"
read_only = true
}
# Conservative restart fail fast to the scheduler.
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
# Service registration
# Agents are outbound-only (poll forgejo, call llama) no HTTP/TCP
# endpoint to probe. The Nomad native provider only supports tcp/http
# checks, not script checks. Registering without a check block means
# Nomad tracks health via task lifecycle: task running = healthy,
# task dead = service deregistered. This matches the docker-compose
# pgrep healthcheck semantics (process alive = healthy).
service {
name = "agents"
provider = "nomad"
}
task "agents" {
driver = "docker"
config {
image = "disinto/agents:local"
force_pull = false
# apparmor=unconfined matches docker-compose Claude Code needs
# ptrace for node.js inspector and /proc access.
security_opt = ["apparmor=unconfined"]
}
volume_mount {
volume = "agent-data"
destination = "/home/agent/data"
read_only = false
}
volume_mount {
volume = "project-repos"
destination = "/home/agent/repos"
read_only = false
}
volume_mount {
volume = "ops-repo"
destination = "/home/agent/repos/_factory/disinto-ops"
read_only = true
}
# Non-secret env
env {
FORGE_URL = "http://forgejo:3000"
FORGE_REPO = "disinto-admin/disinto"
ANTHROPIC_BASE_URL = "http://10.10.10.1:8081"
ANTHROPIC_API_KEY = "sk-no-key-required"
CLAUDE_MODEL = "unsloth/Qwen3.5-35B-A3B"
AGENT_ROLES = "review,dev,gardener,architect,planner,predictor,supervisor"
POLL_INTERVAL = "300"
DISINTO_CONTAINER = "1"
PROJECT_NAME = "project"
PROJECT_REPO_ROOT = "/home/agent/repos/project"
CLAUDE_TIMEOUT = "7200"
# llama-specific Claude Code tuning
CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC = "1"
CLAUDE_CODE_DISABLE_EXPERIMENTAL_BETAS = "1"
CLAUDE_AUTOCOMPACT_PCT_OVERRIDE = "60"
}
# Vault-templated bot tokens (S4.1, issue #955)
# Renders per-bot FORGE_*_TOKEN + FORGE_PASS from Vault KV v2.
# Each `with secret ...` block reads one bot's KV path; the `else`
# branch emits short placeholders on fresh installs where the path
# is absent. Seed with tools/vault-seed-agents.sh.
#
# Placeholder values kept < 16 chars to avoid secret-scan CI failures.
# error_on_missing_key = false prevents template-pending hangs.
template {
destination = "secrets/bots.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/bots/dev" -}}
FORGE_TOKEN={{ .Data.data.token }}
FORGE_PASS={{ .Data.data.pass }}
{{- else -}}
# WARNING: run tools/vault-seed-agents.sh
FORGE_TOKEN=seed-me
FORGE_PASS=seed-me
{{- end }}
{{ with secret "kv/data/disinto/bots/review" -}}
FORGE_REVIEW_TOKEN={{ .Data.data.token }}
{{- else -}}
FORGE_REVIEW_TOKEN=seed-me
{{- end }}
{{ with secret "kv/data/disinto/bots/gardener" -}}
FORGE_GARDENER_TOKEN={{ .Data.data.token }}
{{- else -}}
FORGE_GARDENER_TOKEN=seed-me
{{- end }}
{{ with secret "kv/data/disinto/bots/architect" -}}
FORGE_ARCHITECT_TOKEN={{ .Data.data.token }}
{{- else -}}
FORGE_ARCHITECT_TOKEN=seed-me
{{- end }}
{{ with secret "kv/data/disinto/bots/planner" -}}
FORGE_PLANNER_TOKEN={{ .Data.data.token }}
{{- else -}}
FORGE_PLANNER_TOKEN=seed-me
{{- end }}
{{ with secret "kv/data/disinto/bots/predictor" -}}
FORGE_PREDICTOR_TOKEN={{ .Data.data.token }}
{{- else -}}
FORGE_PREDICTOR_TOKEN=seed-me
{{- end }}
{{ with secret "kv/data/disinto/bots/supervisor" -}}
FORGE_SUPERVISOR_TOKEN={{ .Data.data.token }}
{{- else -}}
FORGE_SUPERVISOR_TOKEN=seed-me
{{- end }}
{{ with secret "kv/data/disinto/bots/vault" -}}
FORGE_VAULT_TOKEN={{ .Data.data.token }}
{{- else -}}
FORGE_VAULT_TOKEN=seed-me
{{- end }}
EOT
}
# Agents run Claude/llama sessions need CPU + memory headroom.
resources {
cpu = 500
memory = 1024
}
}
}
}

View file

@ -1,157 +0,0 @@
# =============================================================================
# nomad/jobs/chat.hcl Claude chat UI (Nomad service job)
#
# Part of the Nomad+Vault migration (S5.2, issue #989). Lightweight service
# job for the Claude chat UI with sandbox hardening (#706).
#
# Build:
# Custom image built from docker/chat/Dockerfile as disinto/chat:local
# (same :local pattern as disinto/agents:local).
#
# Sandbox hardening (#706):
# - Read-only root filesystem (enforced via entrypoint)
# - tmpfs /tmp:size=64m for runtime temp files
# - cap_drop ALL (no Linux capabilities)
# - pids_limit 128 (prevent fork bombs)
# - mem_limit 512m (matches compose sandbox hardening)
#
# Vault integration:
# - vault { role = "service-chat" } at group scope
# - Template stanza renders CHAT_OAUTH_CLIENT_ID, CHAT_OAUTH_CLIENT_SECRET,
# FORWARD_AUTH_SECRET from kv/disinto/shared/chat
# - Seeded on fresh boxes by tools/vault-seed-chat.sh
#
# Host volume:
# - chat-history /var/lib/chat/history (persists conversation history)
#
# Not the runtime yet: docker-compose.yml is still the factory's live stack
# until cutover. This file exists so CI can validate it and S5.2 can wire
# `disinto init --backend=nomad --with chat` to `nomad job run` it.
# =============================================================================
job "chat" {
type = "service"
datacenters = ["dc1"]
group "chat" {
count = 1
# Vault workload identity (S5.2, issue #989)
# Role `service-chat` defined in vault/roles.yaml, policy in
# vault/policies/service-chat.hcl. Bound claim pins nomad_job_id = "chat".
vault {
role = "service-chat"
}
# Network
# External port 8080 for chat UI access (via edge proxy or direct).
network {
port "http" {
static = 8080
to = 8080
}
}
# Host volumes
# chat-history volume: declared in nomad/client.hcl, path
# /srv/disinto/chat-history on the factory box.
volume "chat-history" {
type = "host"
source = "chat-history"
read_only = false
}
# Restart policy
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
# Service registration
service {
name = "chat"
port = "http"
provider = "nomad"
check {
type = "http"
path = "/health"
interval = "10s"
timeout = "3s"
}
}
task "chat" {
driver = "docker"
config {
image = "disinto/chat:local"
force_pull = false
# Sandbox hardening (#706): cap_drop ALL, pids_limit 128, tmpfs /tmp
# ReadonlyRootfs enforced via entrypoint script (fails if running as root)
cap_drop = ["ALL"]
pids_limit = 128
mount {
type = "tmpfs"
target = "/tmp"
readonly = false
tmpfs_options {
size = 67108864 # 64MB in bytes
}
}
# Security options for sandbox hardening
# apparmor=unconfined needed for Claude CLI ptrace access
# no-new-privileges prevents privilege escalation
security_opt = ["apparmor=unconfined", "no-new-privileges"]
}
# Volume mounts
# Mount chat-history for conversation persistence
volume_mount {
volume = "chat-history"
destination = "/var/lib/chat/history"
read_only = false
}
# Environment: secrets from Vault (S5.2)
# CHAT_OAUTH_CLIENT_ID, CHAT_OAUTH_CLIENT_SECRET, FORWARD_AUTH_SECRET
# rendered from kv/disinto/shared/chat via template stanza.
env {
FORGE_URL = "http://forgejo:3000"
CHAT_MAX_REQUESTS_PER_HOUR = "60"
CHAT_MAX_REQUESTS_PER_DAY = "1000"
}
# Vault-templated secrets (S5.2, issue #989)
# Renders chat-secrets.env from Vault KV v2 at kv/disinto/shared/chat.
# Placeholder values kept < 16 chars to avoid secret-scan CI failures.
template {
destination = "secrets/chat-secrets.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/shared/chat" -}}
CHAT_OAUTH_CLIENT_ID={{ .Data.data.chat_oauth_client_id }}
CHAT_OAUTH_CLIENT_SECRET={{ .Data.data.chat_oauth_client_secret }}
FORWARD_AUTH_SECRET={{ .Data.data.forward_auth_secret }}
{{- else -}}
# WARNING: run tools/vault-seed-chat.sh
CHAT_OAUTH_CLIENT_ID=seed-me
CHAT_OAUTH_CLIENT_SECRET=seed-me
FORWARD_AUTH_SECRET=seed-me
{{- end -}}
EOT
}
# Sandbox hardening (S5.2, #706)
# Memory = 512MB (matches docker-compose sandbox hardening)
resources {
cpu = 200
memory = 512
}
}
}
}

View file

@ -1,278 +0,0 @@
# =============================================================================
# nomad/jobs/edge.hcl Edge proxy (Caddy + dispatcher sidecar) (Nomad service job)
#
# Part of the Nomad+Vault migration (S5.1, issue #988). Caddy reverse proxy
# routes traffic to Forgejo, Woodpecker, staging, and chat services. The
# dispatcher sidecar polls disinto-ops for vault actions and dispatches them
# via Nomad batch jobs.
#
# Host networking (issue #1031):
# Caddy uses network_mode = "host" so upstreams are reached at
# 127.0.0.1:<port> (forgejo :3000, woodpecker :8000, chat :8080).
# Staging uses Nomad service discovery (S5-fix-7, issue #1018).
#
# Host_volume contract:
# This job mounts caddy-data from nomad/client.hcl. Path
# /srv/disinto/caddy-data is created by lib/init/nomad/cluster-up.sh before
# any job references it. Keep the `source = "caddy-data"` below in sync
# with the host_volume stanza in client.hcl.
#
# Build step (S5.1):
# docker/edge/Dockerfile is custom (adds bash, jq, curl, git, docker-cli,
# python3, openssh-client, autossh to caddy:latest). Build as
# disinto/edge:local using the same pattern as disinto/agents:local.
# Command: docker build -t disinto/edge:local -f docker/edge/Dockerfile docker/edge
#
# Not the runtime yet: docker-compose.yml is still the factory's live stack
# until cutover. This file exists so CI can validate it and S5.2 can wire
# `disinto init --backend=nomad --with edge` to `nomad job run` it.
# =============================================================================
job "edge" {
type = "service"
datacenters = ["dc1"]
group "edge" {
count = 1
# Vault workload identity for dispatcher (S5.1, issue #988)
# Service role for dispatcher task to fetch vault actions from KV v2.
# Role defined in vault/roles.yaml, policy in vault/policies/dispatcher.hcl.
vault {
role = "service-dispatcher"
}
# Network ports (S5.1, issue #988)
# Caddy listens on :80 and :443. Expose both on the host.
network {
port "http" {
static = 80
to = 80
}
port "https" {
static = 443
to = 443
}
}
# Host-volume mounts (S5.1, issue #988)
# caddy-data: ACME certificates, Caddy config state.
volume "caddy-data" {
type = "host"
source = "caddy-data"
read_only = false
}
# ops-repo: disinto-ops clone for vault actions polling.
volume "ops-repo" {
type = "host"
source = "ops-repo"
read_only = false
}
# Conservative restart policy
# Caddy should be stable; dispatcher may restart on errors.
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
# Service registration
# Caddy is an HTTP reverse proxy health check on port 80.
service {
name = "edge"
port = "http"
provider = "nomad"
check {
type = "http"
path = "/"
interval = "10s"
timeout = "3s"
}
}
# Caddy task (S5.1, issue #988)
task "caddy" {
driver = "docker"
config {
# Use pre-built disinto/edge:local image (custom Dockerfile adds
# bash, jq, curl, git, docker-cli, python3, openssh-client, autossh).
image = "disinto/edge:local"
force_pull = false
network_mode = "host"
ports = ["http", "https"]
# apparmor=unconfined matches docker-compose needed for autossh
# in the entrypoint script.
security_opt = ["apparmor=unconfined"]
}
# Mount caddy-data volume for ACME state and config directory.
# Caddyfile is mounted at /etc/caddy/Caddyfile by entrypoint-edge.sh.
volume_mount {
volume = "caddy-data"
destination = "/data"
read_only = false
}
# Caddyfile via Nomad service discovery (S5-fix-7, issue #1018)
# Renders staging upstream from Nomad service registration instead of
# hardcoded staging:80. Caddy picks up /local/Caddyfile via entrypoint.
# Forge URL via Nomad service discovery (issue #1034) resolves forgejo
# service address/port dynamically for bridge network compatibility.
template {
destination = "local/forge.env"
env = true
change_mode = "restart"
data = <<EOT
{{ range service "forgejo" -}}
FORGE_URL=http://{{ .Address }}:{{ .Port }}
{{- end }}
EOT
}
template {
destination = "local/Caddyfile"
change_mode = "restart"
data = <<EOT
# Caddyfile edge proxy configuration (Nomad-rendered)
# Staging upstream discovered via Nomad service registration.
:80 {
# Redirect root to Forgejo
handle / {
redir /forge/ 302
}
# Reverse proxy to Forgejo
handle /forge/* {
reverse_proxy 127.0.0.1:3000
}
# Reverse proxy to Woodpecker CI
handle /ci/* {
reverse_proxy 127.0.0.1:8000
}
# Reverse proxy to staging dynamic port via Nomad service discovery
handle /staging/* {
{{ range nomadService "staging" }} reverse_proxy {{ .Address }}:{{ .Port }}
{{ end }} }
# Chat service reverse proxy to disinto-chat backend (#705)
# OAuth routes bypass forward_auth unauthenticated users need these (#709)
handle /chat/login {
reverse_proxy 127.0.0.1:8080
}
handle /chat/oauth/callback {
reverse_proxy 127.0.0.1:8080
}
# Defense-in-depth: forward_auth stamps X-Forwarded-User from session (#709)
handle /chat/* {
forward_auth 127.0.0.1:8080 {
uri /chat/auth/verify
copy_headers X-Forwarded-User
header_up X-Forward-Auth-Secret {$FORWARD_AUTH_SECRET}
}
reverse_proxy 127.0.0.1:8080
}
}
EOT
}
# Non-secret env
env {
FORGE_REPO = "disinto-admin/disinto"
DISINTO_CONTAINER = "1"
PROJECT_NAME = "disinto"
}
# Caddy needs CPU + memory headroom for reverse proxy work.
resources {
cpu = 200
memory = 256
}
}
# Dispatcher task (S5.1, issue #988)
task "dispatcher" {
driver = "docker"
config {
# Use same disinto/agents:local image as other agents.
image = "disinto/agents:local"
force_pull = false
network_mode = "host"
# apparmor=unconfined matches docker-compose.
security_opt = ["apparmor=unconfined"]
# Mount docker.sock via bind-volume (not host volume) for legacy
# docker backend compat. Nomad host volumes require named volumes
# from client.hcl; socket files cannot be host volumes.
volumes = ["/var/run/docker.sock:/var/run/docker.sock:ro"]
}
# Mount ops-repo for vault actions polling.
volume_mount {
volume = "ops-repo"
destination = "/home/agent/repos/disinto-ops"
read_only = false
}
# Forge URL via Nomad service discovery (issue #1034)
# Resolves forgejo service address/port dynamically for bridge network
# compatibility. Template-scoped to dispatcher task (Nomad doesn't
# propagate templates across tasks).
template {
destination = "local/forge.env"
env = true
change_mode = "restart"
data = <<EOT
{{ range service "forgejo" -}}
FORGE_URL=http://{{ .Address }}:{{ .Port }}
{{- end }}
EOT
}
# Vault-templated secrets (S5.1, issue #988)
# Renders FORGE_TOKEN from Vault KV v2 for ops repo access.
template {
destination = "secrets/dispatcher.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/shared/ops-repo" -}}
FORGE_TOKEN={{ .Data.data.token }}
{{- else -}}
# WARNING: kv/disinto/shared/ops-repo is empty run tools/vault-seed-ops-repo.sh
FORGE_TOKEN=seed-me
{{- end }}
EOT
}
# Non-secret env
env {
DISPATCHER_BACKEND = "nomad"
FORGE_REPO = "disinto-admin/disinto"
FORGE_OPS_REPO = "disinto-admin/disinto-ops"
PRIMARY_BRANCH = "main"
DISINTO_CONTAINER = "1"
OPS_REPO_ROOT = "/home/agent/repos/disinto-ops"
FORGE_ADMIN_USERS = "vault-bot,admin"
}
# Dispatcher is lightweight minimal CPU + memory.
resources {
cpu = 100
memory = 256
}
}
}
}

View file

@ -154,18 +154,11 @@ job "forgejo" {
# this file. "seed-me" is < 16 chars and still distinctive enough
# to surface in a `grep FORGEJO__security__` audit. The template
# comment below carries the operator-facing fix pointer.
# `error_on_missing_key = false` stops consul-template from blocking
# the alloc on template-pending when the Vault KV path exists but a
# referenced key is absent (or the path itself is absent and the
# else-branch placeholders are used). Without this, a fresh-LXC
# `disinto init --with forgejo` against an empty Vault hangs on
# template-pending until deploy.sh times out (issue #912, bug #4).
template {
destination = "secrets/forgejo.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
destination = "secrets/forgejo.env"
env = true
change_mode = "restart"
data = <<EOT
{{- with secret "kv/data/disinto/shared/forgejo" -}}
FORGEJO__security__SECRET_KEY={{ .Data.data.secret_key }}
FORGEJO__security__INTERNAL_TOKEN={{ .Data.data.internal_token }}

View file

@ -1,86 +0,0 @@
# =============================================================================
# nomad/jobs/staging.hcl Staging file server (Nomad service job)
#
# Part of the Nomad+Vault migration (S5.2, issue #989). Lightweight service job
# for the staging file server using Caddy as a static file server.
#
# Mount contract:
# This job mounts the `docker/` directory as `/srv/site` (read-only).
# The docker/ directory contains static content (images, HTML, etc.)
# served to staging environment users.
#
# Network:
# Dynamic host port edge discovers via Nomad service registration.
# No static port to avoid collisions with edge (which owns 80/443).
#
# Not the runtime yet: docker-compose.yml is still the factory's live stack
# until cutover. This file exists so CI can validate it and S5.2 can wire
# `disinto init --backend=nomad --with staging` to `nomad job run` it.
# =============================================================================
job "staging" {
type = "service"
datacenters = ["dc1"]
group "staging" {
count = 1
# No Vault integration needed no secrets required (static file server)
# Internal service dynamic host port. Edge discovers via Nomad service.
network {
port "http" {
to = 80
}
}
volume "site-content" {
type = "host"
source = "site-content"
read_only = true
}
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
service {
name = "staging"
port = "http"
provider = "nomad"
check {
type = "http"
path = "/"
interval = "10s"
timeout = "3s"
}
}
task "staging" {
driver = "docker"
config {
image = "caddy:alpine"
ports = ["http"]
command = "caddy"
args = ["file-server", "--root", "/srv/site"]
}
# Mount docker/ directory as /srv/site:ro (static content)
volume_mount {
volume = "site-content"
destination = "/srv/site"
read_only = true
}
resources {
cpu = 100
memory = 256
}
}
}
}

View file

@ -1,137 +0,0 @@
# =============================================================================
# nomad/jobs/vault-runner.hcl Parameterized batch job for vault action dispatch
#
# Part of the Nomad+Vault migration (S5.3, issue #990). Replaces the
# `docker run --rm vault-runner-${action_id}` pattern in dispatcher.sh with
# a Nomad-native parameterized batch job. Dispatched by the edge dispatcher
# (S5.4) via `nomad job dispatch`.
#
# Parameterized meta:
# action_id vault action identifier (used by entrypoint-runner.sh)
# secrets_csv comma-separated secret names (e.g. "GITHUB_TOKEN,DEPLOY_KEY")
#
# Vault integration (approach A pre-defined templates):
# All 6 known runner secrets are rendered via template stanzas with
# error_on_missing_key = false. Secrets not granted by the dispatch's
# Vault policies render as empty strings. The dispatcher (S5.4) sets
# vault { policies = [...] } per-dispatch based on the action TOML's
# secrets=[...] list, scoping access to only the declared secrets.
#
# Cleanup: Nomad garbage-collects completed batch dispatches automatically.
# =============================================================================
job "vault-runner" {
type = "batch"
datacenters = ["dc1"]
parameterized {
meta_required = ["action_id", "secrets_csv"]
}
group "runner" {
count = 1
# Vault workload identity
# Per-dispatch policies are composed by the dispatcher (S5.4) based on the
# action TOML's secrets=[...] list. Each policy grants read access to
# exactly one kv/data/disinto/runner/<NAME> path. Roles defined in
# vault/roles.yaml (runner-<NAME>), policies in vault/policies/.
vault {}
volume "ops-repo" {
type = "host"
source = "ops-repo"
read_only = true
}
# No restart for batch fail fast, let the dispatcher handle retries.
restart {
attempts = 0
mode = "fail"
}
task "runner" {
driver = "docker"
config {
image = "disinto/agents:local"
force_pull = false
entrypoint = ["bash"]
args = [
"/home/agent/disinto/docker/runner/entrypoint-runner.sh",
"${NOMAD_META_action_id}",
]
}
volume_mount {
volume = "ops-repo"
destination = "/home/agent/ops"
read_only = true
}
# Non-secret env
env {
DISINTO_CONTAINER = "1"
FACTORY_ROOT = "/home/agent/disinto"
OPS_REPO_ROOT = "/home/agent/ops"
}
# Vault-templated runner secrets (approach A)
# Pre-defined templates for all 6 known runner secrets. Each renders
# from kv/data/disinto/runner/<NAME>. Secrets not granted by the
# dispatch's Vault policies produce empty env vars (harmless).
# error_on_missing_key = false prevents template-pending hangs when
# a secret path is absent or the policy doesn't grant access.
#
# Placeholder values kept < 16 chars to avoid secret-scan CI failures.
template {
destination = "secrets/runner.env"
env = true
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/runner/GITHUB_TOKEN" -}}
GITHUB_TOKEN={{ .Data.data.value }}
{{- else -}}
GITHUB_TOKEN=
{{- end }}
{{ with secret "kv/data/disinto/runner/CODEBERG_TOKEN" -}}
CODEBERG_TOKEN={{ .Data.data.value }}
{{- else -}}
CODEBERG_TOKEN=
{{- end }}
{{ with secret "kv/data/disinto/runner/CLAWHUB_TOKEN" -}}
CLAWHUB_TOKEN={{ .Data.data.value }}
{{- else -}}
CLAWHUB_TOKEN=
{{- end }}
{{ with secret "kv/data/disinto/runner/DEPLOY_KEY" -}}
DEPLOY_KEY={{ .Data.data.value }}
{{- else -}}
DEPLOY_KEY=
{{- end }}
{{ with secret "kv/data/disinto/runner/NPM_TOKEN" -}}
NPM_TOKEN={{ .Data.data.value }}
{{- else -}}
NPM_TOKEN=
{{- end }}
{{ with secret "kv/data/disinto/runner/DOCKER_HUB_TOKEN" -}}
DOCKER_HUB_TOKEN={{ .Data.data.value }}
{{- else -}}
DOCKER_HUB_TOKEN=
{{- end }}
EOT
}
# Formula execution headroom matches agents.hcl baseline.
resources {
cpu = 500
memory = 1024
}
}
}
}

View file

@ -1,144 +0,0 @@
# =============================================================================
# nomad/jobs/woodpecker-agent.hcl Woodpecker CI agent (Nomad service job)
#
# Part of the Nomad+Vault migration (S3.2, issue #935).
# Drop-in for the current docker-compose setup with host networking +
# docker.sock mount, enabling the agent to spawn containers via the
# mounted socket.
#
# Host networking:
# Uses network_mode = "host" to match the compose setup. The Woodpecker
# server gRPC endpoint is addressed via Nomad service discovery using
# the host's IP address (10.10.10.x:9000), since the server's port
# binding in Nomad binds to the allocation's IP, not localhost.
#
# Vault integration:
# - vault { role = "service-woodpecker-agent" } at the group scope the
# task's workload-identity JWT is exchanged for a Vault token carrying
# the policy named on that role. Role + policy are defined in
# vault/roles.yaml + vault/policies/service-woodpecker.hcl.
# - template stanza pulls WOODPECKER_AGENT_SECRET from Vault KV v2
# at kv/disinto/shared/woodpecker and writes it to secrets/agent.env.
# Seeded on fresh boxes by tools/vault-seed-woodpecker.sh.
# =============================================================================
job "woodpecker-agent" {
type = "service"
datacenters = ["dc1"]
group "woodpecker-agent" {
count = 1
# Vault workload identity
# `role = "service-woodpecker-agent"` is defined in vault/roles.yaml and
# applied by tools/vault-apply-roles.sh. The role's bound
# claim pins nomad_job_id = "woodpecker-agent" renaming this
# jobspec's `job "woodpecker-agent"` without updating vault/roles.yaml
# will make token exchange fail at placement with a "claim mismatch"
# error.
vault {
role = "service-woodpecker-agent"
}
# Health check port: static 3333 for Nomad service discovery. The agent
# exposes :3333/healthz for Nomad to probe.
network {
port "healthz" {
static = 3333
}
}
# Native Nomad service discovery for the health check endpoint.
service {
name = "woodpecker-agent"
port = "healthz"
provider = "nomad"
check {
type = "http"
path = "/healthz"
interval = "15s"
timeout = "3s"
}
}
# Conservative restart policy fail fast to the scheduler instead of
# spinning on a broken image/config. 3 attempts over 5m, then back off.
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
task "woodpecker-agent" {
driver = "docker"
config {
image = "woodpeckerci/woodpecker-agent:v3"
network_mode = "host"
privileged = true
volumes = ["/var/run/docker.sock:/var/run/docker.sock"]
}
# Non-secret env server address, gRPC security, concurrency limit,
# and health check endpoint. Nothing sensitive here.
#
# WOODPECKER_SERVER uses Nomad's attribute template to get the host's
# IP address (10.10.10.x). The server's gRPC port 9000 is bound via
# Nomad's port stanza to the allocation's IP (not localhost), so the
# agent must use the LXC's eth0 IP, not 127.0.0.1.
env {
WOODPECKER_SERVER = "${attr.unique.network.ip-address}:9000"
WOODPECKER_GRPC_SECURE = "false"
WOODPECKER_MAX_WORKFLOWS = "1"
WOODPECKER_HEALTHCHECK_ADDR = ":3333"
}
# Vault-templated agent secret
# Renders <task-dir>/secrets/agent.env (per-alloc secrets dir,
# never on disk on the host root filesystem, never in `nomad job
# inspect` output). `env = true` merges WOODPECKER_AGENT_SECRET
# from the file into the task environment.
#
# Vault path: `kv/data/disinto/shared/woodpecker`. The literal
# `/data/` segment is required by consul-template for KV v2 mounts.
#
# Empty-Vault fallback (`with ... else ...`): on a fresh LXC where
# the KV path is absent, consul-template's `with` short-circuits to
# the `else` branch. Emitting a visible placeholder means the
# container still boots, but with an obviously-bad secret that an
# operator will spot better than the agent failing silently with
# auth errors. Seed the path with tools/vault-seed-woodpecker.sh
# to replace the placeholder.
#
# Placeholder values are kept short on purpose: the repo-wide
# secret-scan (.woodpecker/secret-scan.yml lib/secret-scan.sh)
# flags `TOKEN=<16+ non-space chars>` as a plaintext secret, so a
# descriptive long placeholder would fail CI on every PR that touched
# this file. "seed-me" is < 16 chars and still distinctive enough
# to surface in a `grep WOODPECKER` audit.
template {
destination = "secrets/agent.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/shared/woodpecker" -}}
WOODPECKER_AGENT_SECRET={{ .Data.data.agent_secret }}
{{- else -}}
# WARNING: kv/disinto/shared/woodpecker is empty run tools/vault-seed-woodpecker.sh
WOODPECKER_AGENT_SECRET=seed-me
{{- end -}}
EOT
}
# Baseline tune once we have real usage numbers under nomad.
# Conservative limits so an unhealthy agent can't starve the node.
resources {
cpu = 200
memory = 256
}
}
}
}

View file

@ -1,173 +0,0 @@
# =============================================================================
# nomad/jobs/woodpecker-server.hcl Woodpecker CI server (Nomad service job)
#
# Part of the Nomad+Vault migration (S3.1, issue #934).
# Runs the Woodpecker CI web UI + gRPC endpoint as a Nomad service job,
# reading its Forgejo OAuth + agent secret from Vault via workload identity.
#
# Host_volume contract:
# This job mounts the `woodpecker-data` host_volume declared in
# nomad/client.hcl. That volume is backed by /srv/disinto/woodpecker-data
# on the factory box, created by lib/init/nomad/cluster-up.sh before any
# job references it. Keep the `source = "woodpecker-data"` below in sync
# with the host_volume stanza in client.hcl — drift = scheduling failures.
#
# Vault integration (S2.4 pattern):
# - vault { role = "service-woodpecker" } at the group scope the task's
# workload-identity JWT is exchanged for a Vault token carrying the
# policy named on that role. Role + policy are defined in
# vault/roles.yaml + vault/policies/service-woodpecker.hcl.
# - template { destination = "secrets/wp.env" env = true } pulls
# WOODPECKER_AGENT_SECRET, WOODPECKER_FORGEJO_CLIENT, and
# WOODPECKER_FORGEJO_SECRET out of Vault KV v2 at
# kv/disinto/shared/woodpecker and merges them into the task env.
# Agent secret seeded by tools/vault-seed-woodpecker.sh; OAuth
# client/secret seeded by S3.3 (wp-oauth-register.sh).
# - Non-secret env (DB driver, Forgejo URL, host URL, open registration)
# stays inline below not sensitive, not worth round-tripping through
# Vault.
#
# Not the runtime yet: docker-compose.yml is still the factory's live stack
# until cutover. This file exists so CI can validate it and S3.4 can wire
# `disinto init --backend=nomad --with woodpecker` to `nomad job run` it.
# =============================================================================
job "woodpecker-server" {
type = "service"
datacenters = ["dc1"]
group "woodpecker-server" {
count = 1
# Vault workload identity (S2.4 pattern)
# `role = "service-woodpecker"` is defined in vault/roles.yaml and
# applied by tools/vault-apply-roles.sh (S2.3). The role's bound
# claim pins nomad_job_id = "woodpecker" note the job_id in
# vault/roles.yaml is "woodpecker" (matching the roles.yaml entry),
# but the actual Nomad job name here is "woodpecker-server". Update
# vault/roles.yaml job_id to "woodpecker-server" if the bound claim
# enforces an exact match at placement.
vault {
role = "service-woodpecker"
}
# HTTP UI (:8000) + gRPC agent endpoint (:9000). Static ports match
# docker-compose's published ports so the rest of the factory keeps
# reaching woodpecker at the same host:port during and after cutover.
network {
port "http" {
static = 8000
to = 8000
}
port "grpc" {
static = 9000
to = 9000
}
}
# Host-volume mount: declared in nomad/client.hcl, path
# /srv/disinto/woodpecker-data on the factory box.
volume "woodpecker-data" {
type = "host"
source = "woodpecker-data"
read_only = false
}
# Conservative restart policy fail fast to the scheduler instead of
# spinning on a broken image/config. 3 attempts over 5m, then back off.
restart {
attempts = 3
interval = "5m"
delay = "15s"
mode = "delay"
}
# Native Nomad service discovery (no Consul in this factory cluster).
# Health check gates the service as healthy only after the HTTP API is
# up; initial_status is deliberately unset so Nomad waits for the first
# probe to pass before marking the allocation healthy on boot.
service {
name = "woodpecker"
port = "http"
provider = "nomad"
check {
type = "http"
path = "/healthz"
interval = "10s"
timeout = "3s"
}
}
task "woodpecker-server" {
driver = "docker"
config {
image = "woodpeckerci/woodpecker-server:v3"
ports = ["http", "grpc"]
}
volume_mount {
volume = "woodpecker-data"
destination = "/var/lib/woodpecker"
read_only = false
}
# Non-secret env Forgejo integration flags, public URL, DB driver.
# Nothing sensitive here, so this stays inline. Secret-bearing env
# (agent secret, OAuth client/secret) lives in the template stanza
# below and is merged into task env.
env {
WOODPECKER_FORGEJO = "true"
WOODPECKER_FORGEJO_URL = "http://forgejo:3000"
WOODPECKER_HOST = "http://woodpecker:8000"
WOODPECKER_OPEN = "true"
WOODPECKER_DATABASE_DRIVER = "sqlite3"
WOODPECKER_DATABASE_DATASOURCE = "/var/lib/woodpecker/woodpecker.sqlite"
}
# Vault-templated secrets env (S2.4 pattern)
# Renders `<task-dir>/secrets/wp.env` (per-alloc secrets dir, never on
# disk on the host root filesystem). `env = true` merges every KEY=VAL
# line into the task environment. `change_mode = "restart"` re-runs the
# task whenever a watched secret's value in Vault changes.
#
# Vault path: `kv/data/disinto/shared/woodpecker`. The literal `/data/`
# segment is required by consul-template for KV v2 mounts.
#
# Empty-Vault fallback (`with ... else ...`): on a fresh LXC where
# the KV path is absent, consul-template's `with` short-circuits to
# the `else` branch. Emitting visible placeholders means the container
# still boots, but with obviously-bad secrets. Seed the path with
# tools/vault-seed-woodpecker.sh (agent_secret) and S3.3's
# wp-oauth-register.sh (forgejo_client, forgejo_secret).
#
# Placeholder values are kept short on purpose: the repo-wide
# secret-scan flags `TOKEN=<16+ non-space chars>` as a plaintext
# secret; "seed-me" is < 16 chars and still distinctive.
template {
destination = "secrets/wp.env"
env = true
change_mode = "restart"
error_on_missing_key = false
data = <<EOT
{{- with secret "kv/data/disinto/shared/woodpecker" -}}
WOODPECKER_AGENT_SECRET={{ .Data.data.agent_secret }}
WOODPECKER_FORGEJO_CLIENT={{ .Data.data.forgejo_client }}
WOODPECKER_FORGEJO_SECRET={{ .Data.data.forgejo_secret }}
{{- else -}}
# WARNING: kv/disinto/shared/woodpecker is empty run tools/vault-seed-woodpecker.sh + S3.3
WOODPECKER_AGENT_SECRET=seed-me
WOODPECKER_FORGEJO_CLIENT=seed-me
WOODPECKER_FORGEJO_SECRET=seed-me
{{- end -}}
EOT
}
resources {
cpu = 300
memory = 512
}
}
}
}

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Planner Agent
**Role**: Strategic planning using a Prerequisite Tree (Theory of Constraints),

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Predictor Agent
**Role**: Abstract adversary (the "goblin"). Runs a 2-step formula

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Review Agent
**Role**: AI-powered PR review — post structured findings and formal

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# Supervisor Agent
**Role**: Health monitoring and auto-remediation, executed as a formula-driven
@ -24,18 +24,10 @@ Both invoke the same `supervisor-run.sh`. Sources `lib/guard.sh` and calls `chec
files for `PHASE:escalate` entries and auto-removes any whose linked issue
is confirmed closed (24h grace period after closure to avoid races). Reports
**stale crashed worktrees** (worktrees preserved after crash) — supervisor
housekeeping removes them after 24h. Collects **Woodpecker agent health**
(added #933): container `disinto-woodpecker-agent` health/running status,
gRPC error count in last 20 min, fast-failure pipeline count (<60s, last 15 min),
and overall health verdict (healthy/unhealthy). Unhealthy verdict triggers
automatic container restart + `blocked:ci_exhausted` issue recovery in
`supervisor-run.sh` before the Claude session starts.
housekeeping removes them after 24h
- `formulas/run-supervisor.toml` — Execution spec: five steps (preflight review,
health-assessment, decide-actions, report, journal) with `needs` dependencies.
Claude evaluates all metrics and takes actions in a single interactive session.
Health-assessment now includes P2 **Woodpecker agent unhealthy** classification
(container not running, ≥3 gRPC errors/20m, or ≥3 fast-failure pipelines/15m);
decide-actions documents the pre-session auto-recovery path
Claude evaluates all metrics and takes actions in a single interactive session
- `$OPS_REPO_ROOT/knowledge/*.md` — Domain-specific remediation guides (memory,
disk, CI, git, dev-agent, review-agent, forge)
@ -55,6 +47,5 @@ P3 (degraded PRs, circular deps, stale deps), P4 (housekeeping).
- Logs a WARNING message at startup indicating degraded mode
**Lifecycle**: supervisor-run.sh (invoked by polling loop every 20min, `check_active supervisor`)
→ lock + memory guard → run preflight.sh (collect metrics) → **WP agent health recovery**
(if unhealthy: restart container + recover ci_exhausted issues) → load formula + context → run
→ lock + memory guard → run preflight.sh (collect metrics) → load formula + context → run
claude -p via agent-sdk.sh → Claude assesses health, auto-fixes, writes journal → `PHASE:done`.

View file

@ -224,108 +224,3 @@ for _vf in "${_va_root}"/*.md; do
done
[ "$_found_vault" = false ] && echo " None"
echo ""
# ── Woodpecker Agent Health ────────────────────────────────────────────────
echo "## Woodpecker Agent Health"
# Check WP agent container health status
_wp_container="disinto-woodpecker-agent"
_wp_health_status="unknown"
_wp_health_start=""
if command -v docker &>/dev/null; then
# Get health status via docker inspect
_wp_health_status=$(docker inspect "$_wp_container" --format '{{.State.Health.Status}}' 2>/dev/null || echo "not_found")
if [ "$_wp_health_status" = "not_found" ] || [ -z "$_wp_health_status" ]; then
# Container may not exist or not have health check configured
_wp_health_status=$(docker inspect "$_wp_container" --format '{{.State.Status}}' 2>/dev/null || echo "not_found")
fi
# Get container start time for age calculation
_wp_start_time=$(docker inspect "$_wp_container" --format '{{.State.StartedAt}}' 2>/dev/null || echo "")
if [ -n "$_wp_start_time" ] && [ "$_wp_start_time" != "0001-01-01T00:00:00Z" ]; then
_wp_health_start=$(date -d "$_wp_start_time" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_start_time")
fi
fi
echo "Container: $_wp_container"
echo "Status: $_wp_health_status"
[ -n "$_wp_health_start" ] && echo "Started: $_wp_health_start"
# Check for gRPC errors in agent logs (last 20 minutes)
_wp_grpc_errors=0
if [ "$_wp_health_status" != "not_found" ] && [ -n "$_wp_health_status" ]; then
_wp_grpc_errors=$(docker logs --since 20m "$_wp_container" 2>&1 | grep -c 'grpc error' || echo "0")
echo "gRPC errors (last 20m): $_wp_grpc_errors"
fi
# Fast-failure heuristic: check for pipelines completing in <60s
_wp_fast_failures=0
_wp_recent_failures=""
if [ -n "${WOODPECKER_REPO_ID:-}" ] && [ "${WOODPECKER_REPO_ID}" != "0" ]; then
_now=$(date +%s)
_pipelines=$(woodpecker_api "/repos/${WOODPECKER_REPO_ID}/pipelines?perPage=100" 2>/dev/null || echo '[]')
# Count failures with duration < 60s in last 15 minutes
_wp_fast_failures=$(echo "$_pipelines" | jq --argjson now "$_now" '
[.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]
| length' 2>/dev/null || echo "0")
if [ "$_wp_fast_failures" -gt 0 ]; then
_wp_recent_failures=$(echo "$_pipelines" | jq -r --argjson now "$_now" '
[.[] | select(.status == "failure") | select((.finished - .started) < 60) | select(($now - .finished) < 900)]
| .[] | "\(.number)\t\((.finished - .started))s"' 2>/dev/null || echo "")
fi
fi
echo "Fast-fail pipelines (<60s, last 15m): $_wp_fast_failures"
if [ -n "$_wp_recent_failures" ] && [ "$_wp_fast_failures" -gt 0 ]; then
echo "Recent failures:"
echo "$_wp_recent_failures" | while IFS=$'\t' read -r _num _dur; do
echo " #$_num: ${_dur}"
done
fi
# Determine overall WP agent health
_wp_agent_healthy=true
_wp_health_reason=""
if [ "$_wp_health_status" = "not_found" ]; then
_wp_agent_healthy=false
_wp_health_reason="Container not running"
elif [ "$_wp_health_status" = "unhealthy" ]; then
_wp_agent_healthy=false
_wp_health_reason="Container health check failed"
elif [ "$_wp_health_status" != "running" ]; then
_wp_agent_healthy=false
_wp_health_reason="Container not in running state: $_wp_health_status"
elif [ "$_wp_grpc_errors" -ge 3 ]; then
_wp_agent_healthy=false
_wp_health_reason="High gRPC error count (>=3 in 20m)"
elif [ "$_wp_fast_failures" -ge 3 ]; then
_wp_agent_healthy=false
_wp_health_reason="High fast-failure count (>=3 in 15m)"
fi
echo ""
echo "WP Agent Health: $([ "$_wp_agent_healthy" = true ] && echo "healthy" || echo "UNHEALTHY")"
[ -n "$_wp_health_reason" ] && echo "Reason: $_wp_health_reason"
echo ""
# ── WP Agent Health History (for idempotency) ──────────────────────────────
echo "## WP Agent Health History"
# Track last restart timestamp to avoid duplicate restarts in same run
_WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
_wp_last_restart="never"
_wp_last_restart_ts=0
if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
_wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
if [ -n "$_wp_last_restart_ts" ] && [ "$_wp_last_restart_ts" -gt 0 ] 2>/dev/null; then
_wp_last_restart=$(date -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
fi
fi
echo "Last restart: $_wp_last_restart"
echo ""

View file

@ -47,9 +47,6 @@ SID_FILE="/tmp/supervisor-session-${PROJECT_NAME}.sid"
SCRATCH_FILE="/tmp/supervisor-${PROJECT_NAME}-scratch.md"
WORKTREE="/tmp/${PROJECT_NAME}-supervisor-run"
# WP agent container name (configurable via env var)
export WP_AGENT_CONTAINER_NAME="${WP_AGENT_CONTAINER_NAME:-disinto-woodpecker-agent}"
# Override LOG_AGENT for consistent agent identification
# shellcheck disable=SC2034 # consumed by agent-sdk.sh and env.sh log()
LOG_AGENT="supervisor"
@ -169,160 +166,6 @@ ${FORMULA_CONTENT}
${SCRATCH_INSTRUCTION}
${PROMPT_FOOTER}"
# ── WP Agent Health Recovery ──────────────────────────────────────────────
# Check preflight output for WP agent health issues and trigger recovery if needed
_WP_HEALTH_CHECK_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health-check.md"
echo "$PREFLIGHT_OUTPUT" > "$_WP_HEALTH_CHECK_FILE"
# Extract WP agent health status from preflight output
# Note: match exact "healthy" not "UNHEALTHY" (substring issue)
_wp_agent_healthy=$(grep "^WP Agent Health: healthy$" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null && echo "true" || echo "false")
_wp_health_reason=$(grep "^Reason:" "$_WP_HEALTH_CHECK_FILE" 2>/dev/null | sed 's/^Reason: //' || echo "")
if [ "$_wp_agent_healthy" = "false" ] && [ -n "$_wp_health_reason" ]; then
log "WP agent detected as UNHEALTHY: $_wp_health_reason"
# Check for idempotency guard - have we already restarted in this run?
_WP_HEALTH_HISTORY_FILE="${DISINTO_LOG_DIR}/supervisor/wp-agent-health.history"
_wp_last_restart_ts=0
_wp_last_restart="never"
if [ -f "$_WP_HEALTH_HISTORY_FILE" ]; then
_wp_last_restart_ts=$(grep -m1 '^LAST_RESTART_TS=' "$_WP_HEALTH_HISTORY_FILE" 2>/dev/null | cut -d= -f2 || echo "0")
if [ -n "$_wp_last_restart_ts" ] && [ "$_wp_last_restart_ts" != "0" ] 2>/dev/null; then
_wp_last_restart=$(date -d "@$_wp_last_restart_ts" '+%Y-%m-%d %H:%M UTC' 2>/dev/null || echo "$_wp_last_restart_ts")
fi
fi
_current_ts=$(date +%s)
_restart_threshold=300 # 5 minutes between restarts
if [ -z "$_wp_last_restart_ts" ] || [ "$_wp_last_restart_ts" = "0" ] || [ $((_current_ts - _wp_last_restart_ts)) -gt $_restart_threshold ]; then
log "Triggering WP agent restart..."
# Restart the WP agent container
if docker restart "$WP_AGENT_CONTAINER_NAME" >/dev/null 2>&1; then
_restart_time=$(date -u '+%Y-%m-%d %H:%M UTC')
log "Successfully restarted WP agent container: $WP_AGENT_CONTAINER_NAME"
# Update history file
echo "LAST_RESTART_TS=$_current_ts" > "$_WP_HEALTH_HISTORY_FILE"
echo "LAST_RESTART_TIME=$_restart_time" >> "$_WP_HEALTH_HISTORY_FILE"
# Post recovery notice to journal
_journal_file="${OPS_JOURNAL_ROOT}/$(date -u +%Y-%m-%d).md"
if [ -f "$_journal_file" ]; then
{
echo ""
echo "### WP Agent Recovery - $_restart_time"
echo ""
echo "WP agent was unhealthy: $_wp_health_reason"
echo "Container restarted automatically."
} >> "$_journal_file"
fi
# Scan for issues updated in the last 30 minutes with blocked: ci_exhausted label
log "Scanning for ci_exhausted issues updated in last 30 minutes..."
_now_epoch=$(date +%s)
_thirty_min_ago=$(( _now_epoch - 1800 ))
# Fetch open issues with blocked label
_blocked_issues=$(forge_api GET "/issues?state=open&labels=blocked&type=issues&limit=100" 2>/dev/null || echo "[]")
_blocked_count=$(echo "$_blocked_issues" | jq 'length' 2>/dev/null || echo "0")
_issues_processed=0
_issues_recovered=0
if [ "$_blocked_count" -gt 0 ]; then
# Process each blocked issue
echo "$_blocked_issues" | jq -c '.[]' 2>/dev/null | while IFS= read -r issue_json; do
[ -z "$issue_json" ] && continue
_issue_num=$(echo "$issue_json" | jq -r '.number // empty')
_issue_updated=$(echo "$issue_json" | jq -r '.updated_at // empty')
_issue_labels=$(echo "$issue_json" | jq -r '.labels | map(.name) | join(",")' 2>/dev/null || echo "")
# Check if issue has ci_exhausted label
if ! echo "$_issue_labels" | grep -q "ci_exhausted"; then
continue
fi
# Parse updated_at timestamp
_issue_updated_epoch=$(date -d "$_issue_updated" +%s 2>/dev/null || echo "0")
_time_since_update=$(( _now_epoch - _issue_updated_epoch ))
# Check if updated in last 30 minutes
if [ "$_time_since_update" -lt 1800 ] && [ "$_time_since_update" -ge 0 ]; then
_issues_processed=$(( _issues_processed + 1 ))
# Check for idempotency guard - already swept by supervisor?
_issue_body=$(echo "$issue_json" | jq -r '.body // ""' 2>/dev/null || echo "")
if echo "$_issue_body" | grep -q "<!-- supervisor-swept -->"; then
log "Issue #$_issue_num already swept by supervisor, skipping"
continue
fi
log "Processing ci_exhausted issue #$_issue_num (updated $_time_since_update seconds ago)"
# Get issue assignee
_issue_assignee=$(echo "$issue_json" | jq -r '.assignee.login // empty' 2>/dev/null || echo "")
# Unassign the issue
if [ -n "$_issue_assignee" ]; then
log "Unassigning issue #$_issue_num from $_issue_assignee"
curl -sf -X PATCH \
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
-H "Content-Type: application/json" \
"${FORGE_API}/issues/$_issue_num" \
-d '{"assignees":[]}' >/dev/null 2>&1 || true
fi
# Remove blocked label
_blocked_label_id=$(forge_api GET "/labels" 2>/dev/null | jq -r '.[] | select(.name == "blocked") | .id' 2>/dev/null || echo "")
if [ -n "$_blocked_label_id" ]; then
log "Removing blocked label from issue #$_issue_num"
curl -sf -X DELETE \
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
"${FORGE_API}/issues/$_issue_num/labels/$_blocked_label_id" >/dev/null 2>&1 || true
fi
# Add comment about infra-flake recovery
_recovery_comment=$(cat <<EOF
<!-- supervisor-swept -->
**Automated Recovery — $(date -u '+%Y-%m-%d %H:%M UTC')**
CI agent was unhealthy between $_restart_time and now. The prior retry budget may have been spent on infra flake, not real failures.
**Recovery Actions:**
- Unassigned from pool and returned for fresh attempt
- CI agent container restarted
- Related pipelines will be retriggered automatically
**Next Steps:**
Please re-attempt this issue. The CI environment has been refreshed.
EOF
)
curl -sf -X POST \
-H "Authorization: token ${FORGE_SUPERVISOR_TOKEN:-$FORGE_TOKEN}" \
-H "Content-Type: application/json" \
"${FORGE_API}/issues/$_issue_num/comments" \
-d "$(jq -n --arg body "$_recovery_comment" '{body: $body}')" >/dev/null 2>&1 || true
log "Recovered issue #$_issue_num - returned to pool"
fi
done
fi
log "WP agent restart and issue recovery complete"
else
log "ERROR: Failed to restart WP agent container"
fi
else
log "WP agent restart already performed in this run (since $_wp_last_restart), skipping"
fi
fi
# ── Run agent ─────────────────────────────────────────────────────────────
agent_run --worktree "$WORKTREE" "$PROMPT"
log "agent_run complete"

View file

@ -44,7 +44,7 @@ setup_file() {
[[ "$output" == *"[dry-run] Step 8/9: systemctl start nomad + poll until ≥1 node ready"* ]]
[[ "$output" == *"[dry-run] Step 9/9: write /etc/profile.d/disinto-nomad.sh"* ]]
[[ "$output" == *"Dry run complete no changes made."* ]]
[[ "$output" == *"Dry run complete - no changes made."* ]]
}
# ── --backend=nomad --empty --dry-run ────────────────────────────────────────
@ -58,7 +58,7 @@ setup_file() {
# both modes invoke the same cluster-up dry-run.
[[ "$output" == *"nomad backend: --empty (cluster-up only, no jobs)"* ]]
[[ "$output" == *"[dry-run] Step 1/9: install nomad + vault binaries + docker daemon"* ]]
[[ "$output" == *"Dry run complete no changes made."* ]]
[[ "$output" == *"Dry run complete - no changes made."* ]]
}
# ── --backend=docker (regression guard) ──────────────────────────────────────
@ -155,44 +155,6 @@ setup_file() {
[[ "$output" == *"[deploy] dry-run complete"* ]]
}
# S2.6 / #928 — every --with <svc> that ships tools/vault-seed-<svc>.sh
# must auto-invoke the seeder before deploy.sh runs. forgejo is the
# only service with a seeder today, so the dry-run plan must include
# its seed line when --with forgejo is set. The seed block must also
# appear BEFORE the deploy block (seeded secrets must exist before
# nomad reads the template stanza) — pinned here by scanning output
# order. Services without a seeder (e.g. unknown hypothetical future
# ones) are silently skipped by the loop convention.
@test "disinto init --backend=nomad --with forgejo --dry-run prints seed plan before deploy plan" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"Vault seed dry-run"* ]]
[[ "$output" == *"tools/vault-seed-forgejo.sh --dry-run"* ]]
# Order: seed header must appear before deploy header.
local seed_line deploy_line
seed_line=$(echo "$output" | grep -n "Vault seed dry-run" | head -1 | cut -d: -f1)
deploy_line=$(echo "$output" | grep -n "Deploy services dry-run" | head -1 | cut -d: -f1)
[ -n "$seed_line" ]
[ -n "$deploy_line" ]
[ "$seed_line" -lt "$deploy_line" ]
}
# Regression guard (PR #929 review): `sudo -n VAR=val -- cmd` is subject
# to sudoers env_reset policy and silently drops VAULT_ADDR unless it's
# in env_keep (it isn't in default configs). vault-seed-forgejo.sh
# requires VAULT_ADDR and dies at its own precondition check if unset,
# so the non-root branch MUST invoke `sudo -n -- env VAR=val cmd` so
# that `env` sets the variable in the child process regardless of
# sudoers policy. This grep-level guard catches a revert to the unsafe
# form that silently broke non-root seed runs on a fresh LXC.
@test "seed loop invokes sudo via 'env VAR=val' (bypasses sudoers env_reset)" {
run grep -F 'sudo -n -- env "VAULT_ADDR=' "$DISINTO_BIN"
[ "$status" -eq 0 ]
# Negative: no bare `sudo -n "VAR=val" --` form anywhere in the file.
run grep -F 'sudo -n "VAULT_ADDR=' "$DISINTO_BIN"
[ "$status" -ne 0 ]
}
@test "disinto init --backend=nomad --with forgejo,forgejo --dry-run handles comma-separated services" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,forgejo --dry-run
[ "$status" -eq 0 ]
@ -215,44 +177,7 @@ setup_file() {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with unknown-service --dry-run
[ "$status" -ne 0 ]
[[ "$output" == *"unknown service"* ]]
[[ "$output" == *"known: forgejo, woodpecker-server, woodpecker-agent, agents, staging, chat, edge"* ]]
}
# S3.4: woodpecker auto-expansion and forgejo auto-inclusion
@test "disinto init --backend=nomad --with woodpecker auto-expands to server+agent" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent"* ]]
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent"* ]]
}
@test "disinto init --backend=nomad --with woodpecker auto-includes forgejo with note" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"Note: --with woodpecker implies --with forgejo"* ]]
}
@test "disinto init --backend=nomad --with forgejo,woodpecker expands woodpecker" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,woodpecker --dry-run
[ "$status" -eq 0 ]
# Order follows input: forgejo first, then woodpecker expanded
[[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent"* ]]
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent"* ]]
}
@test "disinto init --backend=nomad --with woodpecker seeds both forgejo and woodpecker" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"tools/vault-seed-forgejo.sh --dry-run"* ]]
[[ "$output" == *"tools/vault-seed-woodpecker.sh --dry-run"* ]]
}
@test "disinto init --backend=nomad --with forgejo,woodpecker deploys all three services" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with forgejo,woodpecker --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"forgejo.hcl"* ]]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-server.hcl"* ]]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-agent.hcl"* ]]
[[ "$output" == *"known: forgejo"* ]]
}
@test "disinto init --backend=nomad --with forgejo (flag=value syntax) works" {
@ -267,21 +192,12 @@ setup_file() {
[[ "$output" == *"--empty and --with are mutually exclusive"* ]]
}
# ── --import-env / --import-sops / --age-key (S2.5, #883) ────────────────────
#
# Step 2.5 wires Vault policies + JWT auth + optional KV import into
# `disinto init --backend=nomad`. The tests below exercise the flag
# grammar (who-requires-whom + who-requires-backend=nomad) and the
# dry-run plan shape (each --import-* flag prints its own path line,
# independently). A prior attempt at this issue regressed the "print
# every set flag" invariant by using if/elif — covered by the
# "--import-env --import-sops --age-key" case.
# ── Import flag validation ────────────────────────────────────────────────────
@test "disinto init --backend=nomad --import-env only is accepted" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env /tmp/.env --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"--import-env"* ]]
[[ "$output" == *"env file: /tmp/.env"* ]]
}
@test "disinto init --backend=nomad --import-sops without --age-key errors" {
@ -308,26 +224,21 @@ setup_file() {
[[ "$output" == *"Vault import dry-run"* ]]
[[ "$output" == *"--import-sops"* ]]
[[ "$output" == *"--age-key"* ]]
[[ "$output" == *"sops file: /tmp/.env.vault.enc"* ]]
[[ "$output" == *"age key: /tmp/keys.txt"* ]]
}
# When all three flags are set, each one must print its own path line —
# if/elif regressed this to "only one printed" in a prior attempt (#883).
@test "disinto init --backend=nomad --import-env --import-sops --age-key --dry-run shows full import plan" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env /tmp/.env --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"Vault import dry-run"* ]]
[[ "$output" == *"env file: /tmp/.env"* ]]
[[ "$output" == *"env file: /tmp/.env"* ]]
[[ "$output" == *"sops file: /tmp/.env.vault.enc"* ]]
[[ "$output" == *"age key: /tmp/keys.txt"* ]]
[[ "$output" == *"age key: /tmp/keys.txt"* ]]
}
@test "disinto init --backend=nomad without import flags shows skip message" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"no --import-env/--import-sops"* ]]
[[ "$output" == *"skipping"* ]]
[[ "$output" == *"no --import-env/--import-sops - skipping"* ]]
}
@test "disinto init --backend=nomad --import-env --import-sops --age-key --with forgejo --dry-run shows all plans" {
@ -338,107 +249,3 @@ setup_file() {
[[ "$output" == *"Vault auth dry-run"* ]]
[[ "$output" == *"Deploy services dry-run"* ]]
}
@test "disinto init --backend=nomad --dry-run prints policies + auth plan even without --import-*" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --dry-run
[ "$status" -eq 0 ]
# Policies + auth run on every nomad path (idempotent), so the dry-run
# plan always lists them — regardless of whether --import-* is set.
[[ "$output" == *"Vault policies dry-run"* ]]
[[ "$output" == *"Vault auth dry-run"* ]]
[[ "$output" != *"Vault import dry-run"* ]]
}
# --import-env=PATH (=-form) must work alongside --import-env PATH.
@test "disinto init --backend=nomad --import-env=PATH (equals form) works" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --import-env=/tmp/.env --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"env file: /tmp/.env"* ]]
}
# --empty short-circuits after cluster-up: no policies, no auth, no
# import, no deploy. The dry-run plan must match that — cluster-up plan
# appears, but none of the S2.x section banners do.
@test "disinto init --backend=nomad --empty --dry-run skips policies/auth/import sections" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --dry-run
[ "$status" -eq 0 ]
# Cluster-up still runs (it's what --empty brings up).
[[ "$output" == *"Cluster-up dry-run"* ]]
# Policies + auth + import must NOT appear under --empty.
[[ "$output" != *"Vault policies dry-run"* ]]
[[ "$output" != *"Vault auth dry-run"* ]]
[[ "$output" != *"Vault import dry-run"* ]]
[[ "$output" != *"no --import-env/--import-sops"* ]]
}
# --empty + any --import-* flag silently does nothing (import is skipped),
# so the CLI rejects the combination up front rather than letting it
# look like the import "succeeded".
@test "disinto init --backend=nomad --empty --import-env errors" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --import-env /tmp/.env --dry-run
[ "$status" -ne 0 ]
[[ "$output" == *"--empty and --import-env/--import-sops/--age-key are mutually exclusive"* ]]
}
@test "disinto init --backend=nomad --empty --import-sops --age-key errors" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --empty --import-sops /tmp/.env.vault.enc --age-key /tmp/keys.txt --dry-run
[ "$status" -ne 0 ]
[[ "$output" == *"--empty and --import-env/--import-sops/--age-key are mutually exclusive"* ]]
}
# S4.2: agents service auto-expansion and dependencies
@test "disinto init --backend=nomad --with agents auto-includes forgejo and woodpecker" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with agents --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"services to deploy: forgejo,agents,woodpecker-server,woodpecker-agent"* ]]
[[ "$output" == *"Note: --with agents implies --with forgejo"* ]]
[[ "$output" == *"Note: --with agents implies --with woodpecker"* ]]
}
@test "disinto init --backend=nomad --with agents deploys in correct order" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with agents --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents"* ]]
}
@test "disinto init --backend=nomad --with agents seeds agents service" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with agents --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"tools/vault-seed-forgejo.sh --dry-run"* ]]
[[ "$output" == *"tools/vault-seed-woodpecker.sh --dry-run"* ]]
[[ "$output" == *"tools/vault-seed-agents.sh --dry-run"* ]]
}
@test "disinto init --backend=nomad --with agents deploys all four services" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with agents --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"forgejo.hcl"* ]]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-server.hcl"* ]]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"woodpecker-agent.hcl"* ]]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"agents.hcl"* ]]
}
@test "disinto init --backend=nomad --with woodpecker,agents expands correctly" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with woodpecker,agents --dry-run
[ "$status" -eq 0 ]
# woodpecker expands to server+agent, agents is already explicit
# forgejo is auto-included by agents
[[ "$output" == *"services to deploy: forgejo,woodpecker-server,woodpecker-agent,agents"* ]]
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents"* ]]
}
# S5.1 / #1035 — edge service seeds ops-repo (dispatcher FORGE_TOKEN)
@test "disinto init --backend=nomad --with edge deploys edge" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with edge --dry-run
[ "$status" -eq 0 ]
# edge depends on all backend services, so all are included
[[ "$output" == *"services to deploy: edge,forgejo"* ]]
[[ "$output" == *"deployment order: forgejo woodpecker-server woodpecker-agent agents staging chat edge"* ]]
[[ "$output" == *"[deploy] [dry-run] nomad job validate"*"edge.hcl"* ]]
}
@test "disinto init --backend=nomad --with edge seeds ops-repo" {
run "$DISINTO_BIN" init placeholder/repo --backend=nomad --with edge --dry-run
[ "$status" -eq 0 ]
[[ "$output" == *"tools/vault-seed-ops-repo.sh --dry-run"* ]]
}

View file

@ -126,7 +126,7 @@ setup() {
@test "hvault_policy_apply creates a policy" {
local pfile="${BATS_TEST_TMPDIR}/test-policy.hcl"
cat > "$pfile" <<'HCL'
path "kv/data/test/*" {
path "secret/data/test/*" {
capabilities = ["read"]
}
HCL
@ -138,12 +138,12 @@ HCL
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/sys/policies/acl/test-reader"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.data.policy' | grep -q "kv/data/test"
echo "$output" | jq -e '.data.policy' | grep -q "secret/data/test"
}
@test "hvault_policy_apply is idempotent" {
local pfile="${BATS_TEST_TMPDIR}/idem-policy.hcl"
printf 'path "kv/*" { capabilities = ["list"] }\n' > "$pfile"
printf 'path "secret/*" { capabilities = ["list"] }\n' > "$pfile"
run hvault_policy_apply "idem-policy" "$pfile"
[ "$status" -eq 0 ]

View file

@ -34,13 +34,6 @@ setup_file() {
return 1
fi
done
# Enable kv-v2 at path=kv (production mount per S2 migration). Dev-mode
# vault only auto-mounts kv-v2 at secret/; tests must mirror the real
# cluster layout so vault-import.sh writes land where we read them.
curl -sf -H "X-Vault-Token: test-root-token" \
-X POST -d '{"type":"kv","options":{"version":"2"}}' \
"${VAULT_ADDR}/v1/sys/mounts/kv" >/dev/null
}
teardown_file() {
@ -97,7 +90,7 @@ setup() {
# Verify nothing was written to Vault
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/bots/review"
"${VAULT_ADDR}/v1/secret/data/disinto/bots/review"
[ "$status" -ne 0 ]
}
@ -112,21 +105,21 @@ setup() {
# Check bots/review
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/bots/review"
"${VAULT_ADDR}/v1/secret/data/disinto/bots/review"
[ "$status" -eq 0 ]
echo "$output" | grep -q "review-token"
echo "$output" | grep -q "review-pass"
# Check bots/dev-qwen
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/bots/dev-qwen"
"${VAULT_ADDR}/v1/secret/data/disinto/bots/dev-qwen"
[ "$status" -eq 0 ]
echo "$output" | grep -q "llama-token"
echo "$output" | grep -q "llama-pass"
# Check forge
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/shared/forge"
"${VAULT_ADDR}/v1/secret/data/disinto/shared/forge"
[ "$status" -eq 0 ]
echo "$output" | grep -q "generic-forge-token"
echo "$output" | grep -q "generic-forge-pass"
@ -134,17 +127,16 @@ setup() {
# Check woodpecker
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/shared/woodpecker"
"${VAULT_ADDR}/v1/secret/data/disinto/shared/woodpecker"
[ "$status" -eq 0 ]
echo "$output" | grep -q "wp-agent-secret"
# Forgejo keys are normalized: WP_FORGEJO_* → forgejo_* (no wp_ prefix in key name)
echo "$output" | grep -q "wp-forgejo-client"
echo "$output" | grep -q "wp-forgejo-secret"
echo "$output" | grep -q "wp-token"
# Check chat
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/shared/chat"
"${VAULT_ADDR}/v1/secret/data/disinto/shared/chat"
[ "$status" -eq 0 ]
echo "$output" | grep -q "forward-auth-secret"
echo "$output" | grep -q "chat-client-id"
@ -152,7 +144,7 @@ setup() {
# Check runner tokens from sops
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/runner/GITHUB_TOKEN"
"${VAULT_ADDR}/v1/secret/data/disinto/runner/GITHUB_TOKEN"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.data.data.value == "github-test-token-abc123"'
}
@ -202,51 +194,11 @@ setup() {
# Verify the new value was written (path is disinto/bots/dev-qwen, key is token)
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/bots/dev-qwen"
"${VAULT_ADDR}/v1/secret/data/disinto/bots/dev-qwen"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.data.data.token == "MODIFIED-LLAMA-TOKEN"'
}
# --- Delimiter-in-value regression (#898) ────────────────────────────────────
@test "preserves secret values that contain a pipe character" {
# Regression: previous accumulator packed values into "value|status" and
# joined per-path kv pairs with '|', so any value containing '|' was
# silently truncated or misrouted.
local piped_env="${BATS_TEST_TMPDIR}/dot-env-piped"
cp "$FIXTURES_DIR/dot-env-complete" "$piped_env"
# Swap in values that contain the old delimiter. Exercise both:
# - a paired bot path (token + pass on same vault path, hitting the
# per-path kv-pair join)
# - a single-key path (admin token)
# Values are single-quoted so they survive `source` of the .env file;
# `|` is a shell metachar and unquoted would start a pipeline. That is
# orthogonal to the accumulator bug under test — users are expected to
# quote such values in .env, and the accumulator must then preserve them.
sed -i "s#^FORGE_REVIEW_TOKEN=.*#FORGE_REVIEW_TOKEN='abc|xyz'#" "$piped_env"
sed -i "s#^FORGE_REVIEW_PASS=.*#FORGE_REVIEW_PASS='p1|p2|p3'#" "$piped_env"
sed -i "s#^FORGE_ADMIN_TOKEN=.*#FORGE_ADMIN_TOKEN='admin|with|pipes'#" "$piped_env"
run "$IMPORT_SCRIPT" \
--env "$piped_env" \
--sops "$FIXTURES_DIR/.env.vault.enc" \
--age-key "$FIXTURES_DIR/age-keys.txt"
[ "$status" -eq 0 ]
# Verify each value round-trips intact.
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/bots/review"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.data.data.token == "abc|xyz"'
echo "$output" | jq -e '.data.data.pass == "p1|p2|p3"'
run curl -sf -H "X-Vault-Token: ${VAULT_TOKEN}" \
"${VAULT_ADDR}/v1/kv/data/disinto/shared/forge"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.data.data.admin_token == "admin|with|pipes"'
}
# --- Incomplete fixture ───────────────────────────────────────────────────────
@test "handles incomplete fixture gracefully" {
@ -295,8 +247,6 @@ setup() {
"deploy-key-test"
"npm-test-token"
"dockerhub-test-token"
# Note: forgejo-client and forgejo-secret are NOT in the output
# because they are read from Vault, not logged
)
for pattern in "${secret_patterns[@]}"; do

View file

@ -94,11 +94,8 @@ if [ "$dry_run" = true ]; then
fi
# ── Live run: Vault connectivity check ───────────────────────────────────────
# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
# `disinto init` does not export VAULT_ADDR before calling this script — the
# server is reachable on 127.0.0.1:8200 and the root token lives at
# /etc/vault.d/root.token in the common fresh-LXC case (issue #912).
_hvault_default_env
[ -n "${VAULT_ADDR:-}" ] \
|| die "VAULT_ADDR is not set — export VAULT_ADDR=http://127.0.0.1:8200"
# hvault_token_lookup both resolves the token (env or /etc/vault.d/root.token)
# and confirms the server is reachable with a valid token. Fail fast here so

View file

@ -219,10 +219,9 @@ if [ "$dry_run" = true ]; then
fi
# ── Live run: Vault connectivity check ───────────────────────────────────────
# Default the local-cluster Vault env (see lib/hvault.sh::_hvault_default_env).
# Called transitively from vault-nomad-auth.sh during `disinto init`, which
# does not export VAULT_ADDR in the common fresh-LXC case (issue #912).
_hvault_default_env
if [ -z "${VAULT_ADDR:-}" ]; then
die "VAULT_ADDR is not set — export VAULT_ADDR=http://127.0.0.1:8200"
fi
if ! hvault_token_lookup >/dev/null; then
die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
fi

View file

@ -8,13 +8,8 @@
# Usage:
# vault-import.sh \
# --env /path/to/.env \
# [--sops /path/to/.env.vault.enc] \
# [--age-key /path/to/age/keys.txt]
#
# Flag validation (S2.5, issue #883):
# --import-sops without --age-key → error.
# --age-key without --import-sops → error.
# --env alone (no sops) → OK; imports only the plaintext half.
# --sops /path/to/.env.vault.enc \
# --age-key /path/to/age/keys.txt
#
# Mapping:
# From .env:
@ -151,9 +146,9 @@ _kv_put_secret() {
-X POST \
-d "$payload" \
-o "$tmpfile" \
"${VAULT_ADDR}/v1/${VAULT_KV_MOUNT:-kv}/data/${path}")" || {
"${VAULT_ADDR}/v1/secret/data/${path}")" || {
rm -f "$tmpfile"
_err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: curl error"
_err "Failed to write to Vault at secret/data/${path}: curl error"
return 1
}
rm -f "$tmpfile"
@ -164,15 +159,15 @@ _kv_put_secret() {
return 0
;;
404)
_err "KV path not found: ${VAULT_KV_MOUNT:-kv}/data/${path}"
_err "KV path not found: secret/data/${path}"
return 1
;;
403)
_err "Permission denied writing to ${VAULT_KV_MOUNT:-kv}/data/${path}"
_err "Permission denied writing to secret/data/${path}"
return 1
;;
*)
_err "Failed to write to Vault at ${VAULT_KV_MOUNT:-kv}/data/${path}: HTTP $http_code"
_err "Failed to write to Vault at secret/data/${path}: HTTP $http_code"
return 1
;;
esac
@ -241,15 +236,14 @@ vault-import.sh — Import .env and sops-decrypted secrets into Vault KV
Usage:
vault-import.sh \
--env /path/to/.env \
[--sops /path/to/.env.vault.enc] \
[--age-key /path/to/age/keys.txt] \
--sops /path/to/.env.vault.enc \
--age-key /path/to/age/keys.txt \
[--dry-run]
Options:
--env Path to .env file (required)
--sops Path to sops-encrypted .env.vault.enc file (optional;
requires --age-key when set)
--age-key Path to age keys file (required when --sops is set)
--sops Path to sops-encrypted .env.vault.enc file (required)
--age-key Path to age keys file (required)
--dry-run Print import plan without writing to Vault (optional)
--help Show this help message
@ -278,62 +272,47 @@ EOF
esac
done
# Validate required arguments. --sops and --age-key are paired: if one
# is set, the other must be too. --env alone (no sops half) is valid —
# imports only the plaintext dotenv. Spec: S2.5 / issue #883 / #912.
# Validate required arguments
if [ -z "$env_file" ]; then
_die "Missing required argument: --env"
fi
if [ -n "$sops_file" ] && [ -z "$age_key_file" ]; then
_die "--sops requires --age-key"
if [ -z "$sops_file" ]; then
_die "Missing required argument: --sops"
fi
if [ -n "$age_key_file" ] && [ -z "$sops_file" ]; then
_die "--age-key requires --sops"
if [ -z "$age_key_file" ]; then
_die "Missing required argument: --age-key"
fi
# Validate files exist
if [ ! -f "$env_file" ]; then
_die "Environment file not found: $env_file"
fi
if [ -n "$sops_file" ] && [ ! -f "$sops_file" ]; then
if [ ! -f "$sops_file" ]; then
_die "Sops file not found: $sops_file"
fi
if [ -n "$age_key_file" ] && [ ! -f "$age_key_file" ]; then
if [ ! -f "$age_key_file" ]; then
_die "Age key file not found: $age_key_file"
fi
# Security check: age key permissions (only when an age key is provided —
# --env-only imports never touch the age key).
if [ -n "$age_key_file" ]; then
_validate_age_key_perms "$age_key_file"
fi
# Source the Vault helpers and default the local-cluster VAULT_ADDR +
# VAULT_TOKEN before the localhost safety check runs. `disinto init`
# does not export these in the common fresh-LXC case (issue #912).
source "$(dirname "$0")/../lib/hvault.sh"
_hvault_default_env
# Security check: age key permissions
_validate_age_key_perms "$age_key_file"
# Security check: VAULT_ADDR must be localhost
_check_vault_addr
# Source the Vault helpers
source "$(dirname "$0")/../lib/hvault.sh"
# Load .env file
_log "Loading environment from: $env_file"
_load_env_file "$env_file"
# Decrypt sops file when --sops was provided. On the --env-only path
# (empty $sops_file) the sops_env stays empty and the per-token loop
# below silently skips runner-token imports — exactly the "only
# plaintext half" spec from S2.5.
local sops_env=""
if [ -n "$sops_file" ]; then
_log "Decrypting sops file: $sops_file"
sops_env="$(_decrypt_sops "$sops_file" "$age_key_file")"
# shellcheck disable=SC2086
eval "$sops_env"
else
_log "No --sops flag — skipping sops decryption (importing plaintext .env only)"
fi
# Decrypt sops file
_log "Decrypting sops file: $sops_file"
local sops_env
sops_env="$(_decrypt_sops "$sops_file" "$age_key_file")"
# shellcheck disable=SC2086
eval "$sops_env"
# Collect all import operations
declare -a operations=()
@ -391,13 +370,7 @@ EOF
local val="${!key}"
if [ -n "$val" ]; then
local lowercase_key="${key,,}"
# Normalize WP_FORGEJO_* → forgejo_* (strip wp_ prefix to match template)
if [[ "$lowercase_key" =~ ^wp_(.+)$ ]]; then
vault_key="${BASH_REMATCH[1]}"
else
vault_key="$lowercase_key"
fi
operations+=("woodpecker|$vault_key|$env_file|$key")
operations+=("woodpecker|$lowercase_key|$env_file|$key")
fi
done
@ -424,12 +397,8 @@ EOF
if $dry_run; then
_log "=== DRY-RUN: Import plan ==="
_log "Environment file: $env_file"
if [ -n "$sops_file" ]; then
_log "Sops file: $sops_file"
_log "Age key: $age_key_file"
else
_log "Sops file: (none — --env-only import)"
fi
_log "Sops file: $sops_file"
_log "Age key: $age_key_file"
_log ""
_log "Planned operations:"
for op in "${operations[@]}"; do
@ -444,33 +413,21 @@ EOF
_log "=== Starting Vault import ==="
_log "Environment file: $env_file"
if [ -n "$sops_file" ]; then
_log "Sops file: $sops_file"
_log "Age key: $age_key_file"
else
_log "Sops file: (none — --env-only import)"
fi
_log "Sops file: $sops_file"
_log "Age key: $age_key_file"
_log ""
local created=0
local updated=0
local unchanged=0
# First pass: collect all operations with their parsed values.
# Store value and status in separate associative arrays keyed by
# "vault_path:kv_key". Secret values may contain any character, so we
# never pack them into a delimited string — the previous `value|status`
# encoding silently truncated values containing '|' (see issue #898).
declare -A ops_value
declare -A ops_status
declare -A path_seen
# First pass: collect all operations with their parsed values
# Store as: ops_data["vault_path:kv_key"] = "source_value|status"
declare -A ops_data
for op in "${operations[@]}"; do
# Parse operation: category|field|subkey|file|envvar (5 fields for bots/runner)
# or category|field|file|envvar (4 fields for forge/woodpecker/chat).
# These metadata strings are built from safe identifiers (role names,
# env-var names, file paths) and do not carry secret values, so '|' is
# still fine as a separator here.
# or category|field|file|envvar (4 fields for forge/woodpecker/chat)
local category field subkey file envvar=""
local field_count
field_count="$(printf '%s' "$op" | awk -F'|' '{print NF}')"
@ -537,40 +494,51 @@ EOF
fi
fi
# vault_path and vault_key are identifier-safe (no ':' in either), so
# the composite key round-trips cleanly via ${ck%:*} / ${ck#*:}.
local ck="${vault_path}:${vault_key}"
ops_value["$ck"]="$source_value"
ops_status["$ck"]="$status"
path_seen["$vault_path"]=1
# Store operation data: key = "vault_path:kv_key", value = "source_value|status"
ops_data["${vault_path}:${vault_key}"]="${source_value}|${status}"
done
# Second pass: group by vault_path and write.
# Second pass: group by vault_path and write
# IMPORTANT: Always write ALL keys for a path, not just changed ones.
# KV v2 POST replaces the entire document, so we must include unchanged keys
# to avoid dropping them. The idempotency guarantee comes from KV v2 versioning.
for vault_path in "${!path_seen[@]}"; do
# Collect this path's "vault_key=source_value" pairs into a bash
# indexed array. Each element is one kv pair; '=' inside the value is
# preserved because _kv_put_secret splits on the *first* '=' only.
local pairs_array=()
local path_has_changes=0
declare -A paths_to_write
declare -A path_has_changes
for ck in "${!ops_value[@]}"; do
[ "${ck%:*}" = "$vault_path" ] || continue
local vault_key="${ck#*:}"
pairs_array+=("${vault_key}=${ops_value[$ck]}")
if [ "${ops_status[$ck]}" != "unchanged" ]; then
path_has_changes=1
fi
done
for key in "${!ops_data[@]}"; do
local data="${ops_data[$key]}"
local source_value="${data%%|*}"
local status="${data##*|}"
local vault_path="${key%:*}"
local vault_key="${key#*:}"
# Always add to paths_to_write (all keys for this path)
if [ -z "${paths_to_write[$vault_path]:-}" ]; then
paths_to_write[$vault_path]="${vault_key}=${source_value}"
else
paths_to_write[$vault_path]="${paths_to_write[$vault_path]}|${vault_key}=${source_value}"
fi
# Track if this path has any changes (for status reporting)
if [ "$status" != "unchanged" ]; then
path_has_changes[$vault_path]=1
fi
done
# Write each path with all its key-value pairs
for vault_path in "${!paths_to_write[@]}"; do
# Determine effective status for this path (updated if any key changed)
local effective_status="unchanged"
if [ "$path_has_changes" = 1 ]; then
if [ "${path_has_changes[$vault_path]:-}" = "1" ]; then
effective_status="updated"
fi
# Read pipe-separated key-value pairs and write them
local pairs_string="${paths_to_write[$vault_path]}"
local pairs_array=()
local IFS='|'
read -r -a pairs_array <<< "$pairs_string"
if ! _kv_put_secret "$vault_path" "${pairs_array[@]}"; then
_err "Failed to write to $vault_path"
exit 1

View file

@ -1,176 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-seed-agents.sh — Idempotent seed for all bot KV paths
#
# Part of the Nomad+Vault migration (S4.1, issue #955). Populates
# kv/disinto/bots/<role> with token + pass for each of the 7 agent roles
# plus the vault bot. Handles the "fresh factory, no .env import" case.
#
# Companion to tools/vault-import.sh — when that runs against a box with
# an existing stack, it overwrites seeded values with real ones.
#
# Idempotency contract (per bot):
# - Both token and pass present → skip, log "<role> unchanged".
# - Either missing → generate random values for missing keys, preserve
# existing keys, write back atomically.
#
# Preconditions:
# - Vault reachable + unsealed at $VAULT_ADDR.
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
# - curl, jq, openssl
#
# Usage:
# tools/vault-seed-agents.sh
# tools/vault-seed-agents.sh --dry-run
#
# Exit codes:
# 0 success (seed applied, or already applied)
# 1 precondition / API / mount-mismatch failure
# =============================================================================
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
KV_MOUNT="kv"
TOKEN_BYTES=32 # 32 bytes → 64 hex chars
PASS_BYTES=16 # 16 bytes → 32 hex chars
# All bot roles seeded by this script.
BOT_ROLES=(dev review gardener architect planner predictor supervisor vault)
LOG_TAG="[vault-seed-agents]"
log() { printf '%s %s\n' "$LOG_TAG" "$*"; }
die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }
# ── Flag parsing ─────────────────────────────────────────────────────────────
# while/shift shape — distinct from forgejo (arity:value case) and
# woodpecker (for-loop).
DRY_RUN=0
while [ $# -gt 0 ]; do
case "$1" in
--dry-run) DRY_RUN=1 ;;
-h|--help)
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Seed kv/disinto/bots/<role> with token + pass for all agent\n'
printf 'roles. Idempotent: existing non-empty values are preserved.\n\n'
printf ' --dry-run Print planned actions without writing.\n'
exit 0
;;
*) die "invalid argument: ${1} (try --help)" ;;
esac
shift
done
# ── Preconditions ────────────────────────────────────────────────────────────
for bin in curl jq openssl; do
command -v "$bin" >/dev/null 2>&1 \
|| die "required binary not found: ${bin}"
done
[ -n "${VAULT_ADDR:-}" ] \
|| die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200"
hvault_token_lookup >/dev/null \
|| die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Step 1: ensure kv/ mount exists and is KV v2 ────────────────────────────
log "── Step 1: ensure ${KV_MOUNT}/ is KV v2 ──"
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "${LOG_TAG}" \
|| die "KV mount check failed"
# ── Step 2: seed each bot role ───────────────────────────────────────────────
total_generated=0
# Check if shared forge credentials exist for dev role fallback
shared_forge_exists=0
shared_forge_raw="$(hvault_get_or_empty "${KV_MOUNT}/data/disinto/shared/forge")" \
|| true
if [ -n "$shared_forge_raw" ]; then
shared_forge_token="$(printf '%s' "$shared_forge_raw" | jq -r '.data.data.token // ""')"
shared_forge_pass="$(printf '%s' "$shared_forge_raw" | jq -r '.data.data.pass // ""')"
if [ -n "$shared_forge_token" ] && [ -n "$shared_forge_pass" ]; then
shared_forge_exists=1
fi
fi
for role in "${BOT_ROLES[@]}"; do
kv_logical="disinto/bots/${role}"
kv_api="${KV_MOUNT}/data/${kv_logical}"
log "── seed ${kv_logical} ──"
existing_raw="$(hvault_get_or_empty "${kv_api}")" \
|| die "failed to read ${kv_api}"
existing_token=""
existing_pass=""
existing_data="{}"
if [ -n "$existing_raw" ]; then
existing_data="$(printf '%s' "$existing_raw" | jq '.data.data // {}')"
existing_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.token // ""')"
existing_pass="$(printf '%s' "$existing_raw" | jq -r '.data.data.pass // ""')"
fi
generated=()
desired_token="$existing_token"
desired_pass="$existing_pass"
# Special case: dev role uses shared forge credentials if available
if [ "$role" = "dev" ] && [ "$shared_forge_exists" -eq 1 ]; then
# Use shared FORGE_TOKEN + FORGE_PASS for dev role
if [ -z "$existing_token" ]; then
desired_token="$shared_forge_token"
generated+=("token")
fi
if [ -z "$existing_pass" ]; then
desired_pass="$shared_forge_pass"
generated+=("pass")
fi
else
# Generate random values for missing keys
if [ -z "$existing_token" ]; then
generated+=("token")
fi
if [ -z "$existing_pass" ]; then
generated+=("pass")
fi
for key in "${generated[@]}"; do
case "$key" in
token) desired_token="$(openssl rand -hex "$TOKEN_BYTES")" ;;
pass) desired_pass="$(openssl rand -hex "$PASS_BYTES")" ;;
esac
done
fi
if [ "${#generated[@]}" -eq 0 ]; then
log "${role}: unchanged"
continue
fi
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] ${role}: would generate ${generated[*]}"
total_generated=$(( total_generated + ${#generated[@]} ))
continue
fi
# Merge new keys into existing data to preserve any keys we don't own.
payload="$(printf '%s' "$existing_data" \
| jq --arg t "$desired_token" --arg p "$desired_pass" \
'{data: (. + {token: $t, pass: $p})}')"
_hvault_request POST "${kv_api}" "$payload" >/dev/null \
|| die "failed to write ${kv_api}"
log "${role}: generated ${generated[*]}"
total_generated=$(( total_generated + ${#generated[@]} ))
done
if [ "$total_generated" -eq 0 ]; then
log "all bot paths already seeded — no-op"
else
log "done — ${total_generated} key(s) seeded across ${#BOT_ROLES[@]} bot paths"
fi

View file

@ -1,115 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-seed-chat.sh — Idempotent seed for kv/disinto/shared/chat
#
# Part of the Nomad+Vault migration (S5.2, issue #989). Populates the KV v2
# path that nomad/jobs/chat.hcl reads from, so a clean-install factory
# (no old-stack secrets to import) still has per-key values for
# CHAT_OAUTH_CLIENT_ID, CHAT_OAUTH_CLIENT_SECRET, and FORWARD_AUTH_SECRET.
#
# Companion to tools/vault-import.sh (S2.2) — when that import runs against
# a box with an existing stack, it overwrites these seeded values with the
# real ones. Order doesn't matter: whichever runs last wins, and both
# scripts are idempotent in the sense that re-running never rotates an
# existing non-empty key.
#
# Uses _hvault_seed_key (lib/hvault.sh) for each key — the helper reads
# existing data and merges to preserve sibling keys (KV v2 replaces .data
# atomically).
#
# Preconditions:
# - Vault reachable + unsealed at $VAULT_ADDR.
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
# - The `kv/` mount is enabled as KV v2.
#
# Requires: VAULT_ADDR, VAULT_TOKEN, curl, jq, openssl
#
# Usage:
# tools/vault-seed-chat.sh
# tools/vault-seed-chat.sh --dry-run
#
# Exit codes:
# 0 success (seed applied, or already applied)
# 1 precondition / API / mount-mismatch failure
# =============================================================================
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
KV_MOUNT="kv"
KV_LOGICAL_PATH="disinto/shared/chat"
# Keys to seed — array-driven loop (structurally distinct from forgejo's
# sequential if-blocks and agents' role loop).
SEED_KEYS=(chat_oauth_client_id chat_oauth_client_secret forward_auth_secret)
LOG_TAG="[vault-seed-chat]"
log() { printf '%s %s\n' "$LOG_TAG" "$*"; }
die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }
# ── Flag parsing — [[ ]] guard + case: shape distinct from forgejo
# (arity:value case), woodpecker (for-loop), agents (while/shift).
DRY_RUN=0
if [[ $# -gt 0 ]]; then
case "$1" in
--dry-run) DRY_RUN=1 ;;
-h|--help)
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Seed kv/disinto/shared/chat with random OAuth client\n'
printf 'credentials and forward auth secret if missing.\n'
printf 'Idempotent: existing non-empty values are preserved.\n\n'
printf ' --dry-run Show what would be seeded without writing.\n'
exit 0
;;
*) die "invalid argument: ${1} (try --help)" ;;
esac
fi
# ── Preconditions — inline check-or-die (shape distinct from agents' array
# loop and forgejo's continuation-line style) ─────────────────────────────
command -v curl >/dev/null 2>&1 || die "curl not found"
command -v jq >/dev/null 2>&1 || die "jq not found"
command -v openssl >/dev/null 2>&1 || die "openssl not found"
[ -n "${VAULT_ADDR:-}" ] || die "VAULT_ADDR unset — export VAULT_ADDR=http://127.0.0.1:8200"
hvault_token_lookup >/dev/null || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Step 1/2: ensure kv/ mount exists and is KV v2 ───────────────────────────
log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──"
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "${LOG_TAG}" \
|| die "KV mount check failed"
# ── Step 2/2: seed missing keys via _hvault_seed_key helper ──────────────────
log "── Step 2/2: seed ${KV_LOGICAL_PATH} ──"
generated=()
for key in "${SEED_KEYS[@]}"; do
if [ "$DRY_RUN" -eq 1 ]; then
# Check existence without writing
existing=$(hvault_kv_get "$KV_LOGICAL_PATH" "$key" 2>/dev/null) || true
if [ -z "$existing" ]; then
generated+=("$key")
log "[dry-run] ${key} would be generated"
else
log "[dry-run] ${key} unchanged"
fi
else
rc=0
_hvault_seed_key "$KV_LOGICAL_PATH" "$key" || rc=$?
case "$rc" in
0) generated+=("$key"); log "${key} generated" ;;
1) log "${key} unchanged" ;;
*) die "API error seeding ${key} (rc=${rc})" ;;
esac
fi
done
if [ "${#generated[@]}" -eq 0 ]; then
log "all keys present — no-op"
else
log "done — ${#generated[@]} key(s) seeded at kv/${KV_LOGICAL_PATH}"
fi

View file

@ -118,9 +118,36 @@ hvault_token_lookup >/dev/null \
# wrong version or a different backend, fail loudly — silently
# re-enabling would destroy existing secrets.
log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──"
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-forgejo]" \
|| die "KV mount check failed"
mounts_json="$(hvault_get_or_empty "sys/mounts")" \
|| die "failed to list Vault mounts"
mount_exists=false
if printf '%s' "$mounts_json" | jq -e --arg m "${KV_MOUNT}/" '.[$m]' >/dev/null 2>&1; then
mount_exists=true
fi
if [ "$mount_exists" = true ]; then
mount_type="$(printf '%s' "$mounts_json" \
| jq -r --arg m "${KV_MOUNT}/" '.[$m].type // ""')"
mount_version="$(printf '%s' "$mounts_json" \
| jq -r --arg m "${KV_MOUNT}/" '.[$m].options.version // "1"')"
if [ "$mount_type" != "kv" ]; then
die "${KV_MOUNT}/ is mounted as type='${mount_type}', expected 'kv' — refuse to re-mount"
fi
if [ "$mount_version" != "2" ]; then
die "${KV_MOUNT}/ is KV v${mount_version}, expected v2 — refuse to upgrade in place (manual fix required)"
fi
log "${KV_MOUNT}/ already mounted (kv v2) — skipping enable"
else
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] would enable ${KV_MOUNT}/ as kv v2"
else
payload="$(jq -n '{type:"kv",options:{version:"2"},description:"disinto shared KV v2 (S2.4)"}')"
_hvault_request POST "sys/mounts/${KV_MOUNT}" "$payload" >/dev/null \
|| die "failed to enable ${KV_MOUNT}/ as kv v2"
log "${KV_MOUNT}/ enabled as kv v2"
fi
fi
# ── Step 2/2: seed missing keys at kv/data/disinto/shared/forgejo ────────────
log "── Step 2/2: seed ${KV_API_PATH} ──"

View file

@ -1,149 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-seed-ops-repo.sh — Idempotent seed for kv/disinto/shared/ops-repo
#
# Part of the Nomad+Vault migration (S5.1, issue #1035). Populates the KV v2
# path that nomad/jobs/edge.hcl dispatcher task reads from, so the edge
# proxy has FORGE_TOKEN for ops repo access.
#
# Seeds from kv/disinto/bots/vault (the vault bot credentials) — copies the
# token field to kv/disinto/shared/ops-repo. This is the "service" path that
# dispatcher uses, distinct from the "agent" path (bots/vault) used by
# agent tasks under the service-agents policy.
#
# Idempotency contract:
# - Key present with non-empty value → leave untouched, log "token unchanged".
# - Key missing or empty → copy from bots/vault, log "token copied".
# - If bots/vault is also empty → generate a random value, log "token generated".
#
# Preconditions:
# - Vault reachable + unsealed at $VAULT_ADDR.
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
# - The `kv/` mount is enabled as KV v2.
#
# Requires:
# - VAULT_ADDR (e.g. http://127.0.0.1:8200)
# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
# - curl, jq, openssl
#
# Usage:
# tools/vault-seed-ops-repo.sh
# tools/vault-seed-ops-repo.sh --dry-run
#
# Exit codes:
# 0 success (seed applied, or already applied)
# 1 precondition / API / mount-mismatch failure
# =============================================================================
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
# KV v2 mount + logical paths
KV_MOUNT="kv"
OPS_REPO_PATH="disinto/shared/ops-repo"
VAULT_BOT_PATH="disinto/bots/vault"
OPS_REPO_API="${KV_MOUNT}/data/${OPS_REPO_PATH}"
VAULT_BOT_API="${KV_MOUNT}/data/${VAULT_BOT_PATH}"
log() { printf '[vault-seed-ops-repo] %s\n' "$*"; }
die() { printf '[vault-seed-ops-repo] ERROR: %s\n' "$*" >&2; exit 1; }
# ── Flag parsing ─────────────────────────────────────────────────────────────
DRY_RUN=0
case "$#:${1-}" in
0:)
;;
1:--dry-run)
DRY_RUN=1
;;
1:-h|1:--help)
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Seed kv/disinto/shared/ops-repo with FORGE_TOKEN.\n\n'
printf 'Copies token from kv/disinto/bots/vault if present;\n'
printf 'otherwise generates a random value. Idempotent:\n'
printf 'existing non-empty values are left untouched.\n\n'
printf ' --dry-run Print planned actions without writing.\n'
exit 0
;;
*)
die "invalid arguments: $* (try --help)"
;;
esac
# ── Preconditions ────────────────────────────────────────────────────────────
for bin in curl jq openssl; do
command -v "$bin" >/dev/null 2>&1 \
|| die "required binary not found: ${bin}"
done
[ -n "${VAULT_ADDR:-}" ] \
|| die "VAULT_ADDR unset — e.g. export VAULT_ADDR=http://127.0.0.1:8200"
hvault_token_lookup >/dev/null \
|| die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Step 1/2: ensure kv/ mount exists and is KV v2 ───────────────────────────
log "── Step 1/2: ensure ${KV_MOUNT}/ is KV v2 ──"
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-ops-repo]" \
|| die "KV mount check failed"
# ── Step 2/2: seed ops-repo from vault bot ───────────────────────────────────
log "── Step 2/2: seed ${OPS_REPO_API} ──"
# Read existing ops-repo value
existing_raw="$(hvault_get_or_empty "${OPS_REPO_API}")" \
|| die "failed to read ${OPS_REPO_API}"
existing_token=""
if [ -n "$existing_raw" ]; then
existing_token="$(printf '%s' "$existing_raw" | jq -r '.data.data.token // ""')"
fi
desired_token="$existing_token"
action=""
if [ -z "$existing_token" ]; then
# Token missing — try to copy from vault bot
bot_raw="$(hvault_get_or_empty "${VAULT_BOT_API}")" || true
if [ -n "$bot_raw" ]; then
bot_token="$(printf '%s' "$bot_raw" | jq -r '.data.data.token // ""')"
if [ -n "$bot_token" ]; then
desired_token="$bot_token"
action="copied"
fi
fi
# If still no token, generate one
if [ -z "$desired_token" ]; then
if [ "$DRY_RUN" -eq 1 ]; then
action="generated (dry-run)"
else
desired_token="$(openssl rand -hex 32)"
action="generated"
fi
fi
fi
if [ -z "$action" ]; then
log "all keys present at ${OPS_REPO_API} — no-op"
log "token unchanged"
exit 0
fi
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] ${OPS_REPO_PATH}: would ${action} token"
exit 0
fi
# Write the token
payload="$(jq -n --arg t "$desired_token" '{data: {token: $t}}')"
_hvault_request POST "${OPS_REPO_API}" "$payload" >/dev/null \
|| die "failed to write ${OPS_REPO_API}"
log "${OPS_REPO_PATH}: ${action} token"
log "done — ${OPS_REPO_API} seeded"

View file

@ -1,145 +0,0 @@
#!/usr/bin/env bash
# =============================================================================
# tools/vault-seed-woodpecker.sh — Idempotent seed for kv/disinto/shared/woodpecker
#
# Part of the Nomad+Vault migration (S3.1 + S3.3, issues #934 + #936). Populates
# the KV v2 path read by nomad/jobs/woodpecker-server.hcl:
# - agent_secret: pre-shared secret for woodpecker-server ↔ agent communication
# - forgejo_client + forgejo_secret: OAuth2 client credentials from Forgejo
#
# This script handles BOTH:
# 1. S3.1: seeds `agent_secret` if missing
# 2. S3.3: calls wp-oauth-register.sh to create Forgejo OAuth app + store
# forgejo_client/forgejo_secret in Vault
#
# Idempotency contract:
# - agent_secret: missing → generate and write; present → skip, log unchanged
# - OAuth app + credentials: handled by wp-oauth-register.sh (idempotent)
# This script preserves any existing keys it doesn't own.
#
# Idempotency contract (per key):
# - Key missing or empty in Vault → generate a random value, write it,
# log "agent_secret generated".
# - Key present with a non-empty value → leave untouched, log
# "agent_secret unchanged".
#
# Preconditions:
# - Vault reachable + unsealed at $VAULT_ADDR.
# - VAULT_TOKEN set (env) or /etc/vault.d/root.token readable.
# - The `kv/` mount is enabled as KV v2 (this script enables it on a
# fresh box; on an existing box it asserts the mount type/version).
#
# Requires:
# - VAULT_ADDR (e.g. http://127.0.0.1:8200)
# - VAULT_TOKEN (env OR /etc/vault.d/root.token, resolved by lib/hvault.sh)
# - curl, jq, openssl
#
# Usage:
# tools/vault-seed-woodpecker.sh
# tools/vault-seed-woodpecker.sh --dry-run
#
# Exit codes:
# 0 success (seed applied, or already applied)
# 1 precondition / API / mount-mismatch failure
# =============================================================================
set -euo pipefail
SEED_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SEED_DIR}/.." && pwd)"
LIB_DIR="${REPO_ROOT}/lib/init/nomad"
# shellcheck source=../lib/hvault.sh
source "${REPO_ROOT}/lib/hvault.sh"
KV_MOUNT="kv"
KV_LOGICAL_PATH="disinto/shared/woodpecker"
KV_API_PATH="${KV_MOUNT}/data/${KV_LOGICAL_PATH}"
AGENT_SECRET_BYTES=32 # 32 bytes → 64 hex chars
LOG_TAG="[vault-seed-woodpecker]"
log() { printf '%s %s\n' "$LOG_TAG" "$*"; }
die() { printf '%s ERROR: %s\n' "$LOG_TAG" "$*" >&2; exit 1; }
# ── Flag parsing ─────────────────────────────────────────────────────────────
# for-over-"$@" loop — shape distinct from vault-seed-forgejo.sh (arity:value
# case) and vault-apply-roles.sh (if/elif).
DRY_RUN=0
for arg in "$@"; do
case "$arg" in
--dry-run) DRY_RUN=1 ;;
-h|--help)
printf 'Usage: %s [--dry-run]\n\n' "$(basename "$0")"
printf 'Seed kv/disinto/shared/woodpecker with secrets.\n\n'
printf 'Handles both S3.1 (agent_secret) and S3.3 (OAuth app + credentials):\n'
printf ' - agent_secret: generated if missing\n'
printf ' - forgejo_client/forgejo_secret: created via Forgejo API if missing\n\n'
printf ' --dry-run Print planned actions without writing.\n'
exit 0
;;
*) die "invalid argument: ${arg} (try --help)" ;;
esac
done
# ── Preconditions — binary + Vault connectivity checks ───────────────────────
required_bins=(curl jq openssl)
for bin in "${required_bins[@]}"; do
command -v "$bin" >/dev/null 2>&1 || die "required binary not found: ${bin}"
done
[ -n "${VAULT_ADDR:-}" ] || die "VAULT_ADDR unset — export VAULT_ADDR=http://127.0.0.1:8200"
hvault_token_lookup >/dev/null || die "Vault auth probe failed — check VAULT_ADDR + VAULT_TOKEN"
# ── Step 1/3: ensure kv/ mount exists and is KV v2 ───────────────────────────
log "── Step 1/3: ensure ${KV_MOUNT}/ is KV v2 ──"
export DRY_RUN
hvault_ensure_kv_v2 "$KV_MOUNT" "[vault-seed-woodpecker]" \
|| die "KV mount check failed"
# ── Step 2/3: seed agent_secret at kv/data/disinto/shared/woodpecker ─────────
log "── Step 2/3: seed agent_secret ──"
existing_raw="$(hvault_get_or_empty "${KV_API_PATH}")" \
|| die "failed to read ${KV_API_PATH}"
# Read all existing keys so we can preserve them on write (KV v2 replaces
# `.data` atomically). Missing path → empty object.
existing_data="{}"
existing_agent_secret=""
if [ -n "$existing_raw" ]; then
existing_data="$(printf '%s' "$existing_raw" | jq '.data.data // {}')"
existing_agent_secret="$(printf '%s' "$existing_raw" | jq -r '.data.data.agent_secret // ""')"
fi
if [ -n "$existing_agent_secret" ]; then
log "agent_secret unchanged"
else
# agent_secret is missing — generate it.
if [ "$DRY_RUN" -eq 1 ]; then
log "[dry-run] would generate + write: agent_secret"
else
new_agent_secret="$(openssl rand -hex "$AGENT_SECRET_BYTES")"
# Merge the new key into existing data to preserve any keys written by
# other seeders (e.g. S3.3's forgejo_client/forgejo_secret).
payload="$(printf '%s' "$existing_data" \
| jq --arg as "$new_agent_secret" '{data: (. + {agent_secret: $as})}')"
_hvault_request POST "${KV_API_PATH}" "$payload" >/dev/null \
|| die "failed to write ${KV_API_PATH}"
log "agent_secret generated"
fi
fi
# ── Step 3/3: register Forgejo OAuth app and store credentials ───────────────
log "── Step 3/3: register Forgejo OAuth app ──"
# Export DRY_RUN for the OAuth script and call it
export DRY_RUN
if "${LIB_DIR}/wp-oauth-register.sh" || [ "$DRY_RUN" -eq 1 ]; then
:
elif [ -n "${FORGE_URL:-}" ]; then
# Forgejo was configured but unavailable
log "OAuth registration check failed (Forgejo may not be running)"
log "This is expected if Forgejo is not available"
fi
log "done — agent_secret + OAuth credentials seeded"

View file

@ -1,4 +1,4 @@
<!-- last-reviewed: a467d613a44b9b475a60c14c4162621e846969ea -->
<!-- last-reviewed: 6bdbeb5bd2a200ff1b23724564da9383193f3e30 -->
# vault/policies/ — Agent Instructions
HashiCorp Vault ACL policies for the disinto factory. One `.hcl` file per
@ -30,9 +30,6 @@ KV v2). Vault addresses KV v2 data at `kv/data/<path>` and metadata at
|---|---|
| `service-forgejo` | `kv/data/disinto/shared/forgejo/*` |
| `service-woodpecker` | `kv/data/disinto/shared/woodpecker/*` |
| `service-agents` | All 7 `kv/data/disinto/bots/<role>/*` namespaces + `kv/data/disinto/shared/forge/*`; composite policy for the `agents` Nomad job (S4.1) |
| `service-chat` | `kv/data/disinto/shared/chat/*`; read-only OAuth client config + forward-auth secret for the chat Nomad job (S5.2, #989) |
| `service-dispatcher` | `kv/data/disinto/runner/*` (list+read) + `kv/data/disinto/shared/ops-repo/*` (read); used by edge dispatcher sidecar (S5.1, #988) |
| `bot-<role>` (dev, review, gardener, architect, planner, predictor, supervisor, vault, dev-qwen) | `kv/data/disinto/bots/<role>/*` + `kv/data/disinto/shared/forge/*` |
| `runner-<TOKEN>` (GITHUB\_TOKEN, CODEBERG\_TOKEN, CLAWHUB\_TOKEN, DEPLOY\_KEY, NPM\_TOKEN, DOCKER\_HUB\_TOKEN) | `kv/data/disinto/runner/<TOKEN>` (exactly one) |
| `dispatcher` | `kv/data/disinto/runner/*` + `kv/data/disinto/shared/ops-repo/*` |
@ -52,17 +49,12 @@ validation.
1. Drop a file matching one of the four naming patterns above. Use an
existing file in the same family as the template — comment header,
capability list, and KV path layout should match the family.
2. Run `vault policy fmt <file>` locally so the formatting matches what
the CI fmt-check (step 4 of `.woodpecker/nomad-validate.yml`) will
accept. The fmt check runs non-destructively in CI but a dirty file
fails the step; running `fmt` locally before pushing is the fastest
path.
3. Add the matching entry to `../roles.yaml` (see "JWT-auth roles" below)
so the CI role-reference check (step 6) stays green.
4. Run `tools/vault-apply-policies.sh --dry-run` to confirm the new
2. Run `tools/vault-apply-policies.sh --dry-run` to confirm the new
basename appears in the planned-work list with the expected SHA.
5. Run `tools/vault-apply-policies.sh` against a Vault instance to
3. Run `tools/vault-apply-policies.sh` against a Vault instance to
create it; re-run to confirm it reports `unchanged`.
4. The CI fmt + validate step lands in S2.6 (#884). Until then
`vault policy fmt <file>` locally is the fastest sanity check.
## JWT-auth roles (S2.3)
@ -126,56 +118,6 @@ would let one service's tokens outlive the others — add a field to
`vault/roles.yaml` and the applier at the same time if that ever
becomes necessary.
## Policy lifecycle
Adding a policy that an actual workload consumes is a three-step chain;
the CI pipeline guards each link.
1. **Add the policy HCL**`vault/policies/<name>.hcl`, formatted with
`vault policy fmt`. Capabilities must be drawn from the Vault-recognized
set (`read`, `list`, `create`, `update`, `delete`, `patch`, `sudo`,
`deny`); a typo fails CI step 5 (HCL written to an inline dev-mode Vault
via `vault policy write` — a real parser, not a regex).
2. **Update `../roles.yaml`** — add a JWT-auth role entry whose `policy:`
field matches the new basename (without `.hcl`). CI step 6 re-checks
every role in this file against the policy set, so a drift between the
two directories fails the step.
3. **Reference from a Nomad jobspec** — add `vault { role = "<name>" }` in
`nomad/jobs/<service>.hcl` (owned by S2.4). Policies do not take effect
until a Nomad job asks for a token via that role.
See the "Adding a new service" walkthrough below for the applier-script
flow once steps 13 are committed.
## CI enforcement (`.woodpecker/nomad-validate.yml`)
The pipeline triggers on any PR touching `vault/policies/**`,
`vault/roles.yaml`, or `lib/init/nomad/vault-*.sh` and runs four
vault-scoped checks (in addition to the nomad-scoped steps already in
place):
| Step | Tool | What it catches |
|---|---|---|
| 4. `vault-policy-fmt` | `vault policy fmt` + `diff` | formatting drift — trailing whitespace, wrong indentation, missing newlines |
| 5. `vault-policy-validate` | `vault policy write` against inline dev Vault | HCL syntax errors, unknown stanzas, invalid capability names (e.g. `"frobnicate"`), malformed `path "..." {}` blocks |
| 6. `vault-roles-validate` | yamllint + PyYAML | roles.yaml syntax drift, missing required fields, role→policy references with no matching `.hcl` |
| P11 | `lib/secret-scan.sh` via `.woodpecker/secret-scan.yml` | literal secret leaked into a policy HCL (rare copy-paste mistake) — already covers `vault/**/*`, no duplicate step here |
All four steps are fail-closed — any error blocks merge. The pipeline
pins `hashicorp/vault:1.18.5` (matching `lib/init/nomad/install.sh`);
bumping the runtime version without bumping the CI image is a CI-caught
drift.
## Common failure modes
| Symptom in CI logs | Root cause | Fix |
|---|---|---|
| `vault-policy-fmt: … is not formatted — run 'vault policy fmt <file>'` | Trailing whitespace / mixed indent in an HCL file | `vault policy fmt <file>` locally and re-commit |
| `vault-policy-validate: … failed validation` plus a `policy` error from Vault | Unknown capability (e.g. `"frobnicate"`), unknown stanza, malformed `path` block | Fix the HCL; valid capabilities are `read`, `list`, `create`, `update`, `delete`, `patch`, `sudo`, `deny` |
| `vault-roles-validate: ERROR: role 'X' references policy 'Y' but vault/policies/Y.hcl does not exist` | A role's `policy:` field does not match any file basename in `vault/policies/` | Either add the missing policy HCL or fix the typo in `roles.yaml` |
| `vault-roles-validate: ERROR: role entry missing required field 'Z'` | A role in `roles.yaml` is missing one of `name`, `policy`, `namespace`, `job_id` | Add the field; all four are required |
| P11 `secret-scan: detected potential secret …` on a `.hcl` file | A literal token/password was pasted into a policy | Policies must name KV paths, not carry secret values — move the literal into KV (S2.2) and have the policy grant `read` on the path |
## What this directory does NOT own
- **Attaching policies to Nomad jobs.** That's S2.4 (#882) via the
@ -183,3 +125,4 @@ drift.
name in `vault { role = "..." }` is what binds the policy.
- **Writing the secret values themselves.** That's S2.2 (#880) via
`tools/vault-import.sh`.
- **CI policy fmt + validate + roles.yaml check.** That's S2.6 (#884).

View file

@ -3,14 +3,14 @@
# Architect agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the architect-agent Nomad job via workload identity (S2.4).
path "kv/data/disinto/bots/architect" {
path "kv/data/disinto/bots/architect/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/architect" {
path "kv/metadata/disinto/bots/architect/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -5,14 +5,14 @@
# via workload identity (S2.4). KV path mirrors the bot basename:
# kv/disinto/bots/dev-qwen/*.
path "kv/data/disinto/bots/dev-qwen" {
path "kv/data/disinto/bots/dev-qwen/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/dev-qwen" {
path "kv/metadata/disinto/bots/dev-qwen/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -3,14 +3,14 @@
# Dev agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the dev-agent Nomad job via workload identity (S2.4).
path "kv/data/disinto/bots/dev" {
path "kv/data/disinto/bots/dev/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/dev" {
path "kv/metadata/disinto/bots/dev/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -3,14 +3,14 @@
# Gardener agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the gardener-agent Nomad job via workload identity (S2.4).
path "kv/data/disinto/bots/gardener" {
path "kv/data/disinto/bots/gardener/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/gardener" {
path "kv/metadata/disinto/bots/gardener/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -3,14 +3,14 @@
# Planner agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the planner-agent Nomad job via workload identity (S2.4).
path "kv/data/disinto/bots/planner" {
path "kv/data/disinto/bots/planner/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/planner" {
path "kv/metadata/disinto/bots/planner/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -3,14 +3,14 @@
# Predictor agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the predictor-agent Nomad job via workload identity (S2.4).
path "kv/data/disinto/bots/predictor" {
path "kv/data/disinto/bots/predictor/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/predictor" {
path "kv/metadata/disinto/bots/predictor/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -3,14 +3,14 @@
# Review agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the review-agent Nomad job via workload identity (S2.4).
path "kv/data/disinto/bots/review" {
path "kv/data/disinto/bots/review/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/review" {
path "kv/metadata/disinto/bots/review/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -3,14 +3,14 @@
# Supervisor agent: reads its own bot KV namespace + the shared forge URL.
# Attached to the supervisor-agent Nomad job via workload identity (S2.4).
path "kv/data/disinto/bots/supervisor" {
path "kv/data/disinto/bots/supervisor/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/supervisor" {
path "kv/metadata/disinto/bots/supervisor/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -7,14 +7,14 @@
# NOTE: distinct from the runner-* policies, which gate per-secret access
# for vault-runner ephemeral dispatches (Step 5).
path "kv/data/disinto/bots/vault" {
path "kv/data/disinto/bots/vault/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/vault" {
path "kv/metadata/disinto/bots/vault/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/forge" {
path "kv/data/disinto/shared/forge/*" {
capabilities = ["read"]
}

View file

@ -20,10 +20,10 @@ path "kv/metadata/disinto/runner/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/ops-repo" {
path "kv/data/disinto/shared/ops-repo/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/shared/ops-repo" {
path "kv/metadata/disinto/shared/ops-repo/*" {
capabilities = ["list", "read"]
}

View file

@ -1,76 +0,0 @@
# vault/policies/service-agents.hcl
#
# Composite policy for the `agents` Nomad job (S4.1, issue #955).
# Grants read access to all 7 bot KV namespaces + shared forge config,
# so a single job running all agent roles can pull per-bot tokens from
# Vault via workload identity.
# Per-bot KV paths (token + pass per role)
path "kv/data/disinto/bots/dev" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/dev" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/bots/review" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/review" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/bots/gardener" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/gardener" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/bots/architect" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/architect" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/bots/planner" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/planner" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/bots/predictor" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/predictor" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/bots/supervisor" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/supervisor" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/bots/vault" {
capabilities = ["read"]
}
path "kv/metadata/disinto/bots/vault" {
capabilities = ["list", "read"]
}
# Shared forge config (URL, bot usernames)
path "kv/data/disinto/shared/forge" {
capabilities = ["read"]
}

View file

@ -1,15 +0,0 @@
# vault/policies/service-chat.hcl
#
# Read-only access to shared Chat secrets (OAuth client config, forward auth
# secret). Attached to the Chat Nomad job via workload identity (S5.2).
#
# Scope: kv/disinto/shared/chat entries owned by the operator and
# shared between the chat service and edge proxy.
path "kv/data/disinto/shared/chat" {
capabilities = ["read"]
}
path "kv/metadata/disinto/shared/chat" {
capabilities = ["list", "read"]
}

View file

@ -1,29 +0,0 @@
# vault/policies/service-dispatcher.hcl
#
# Edge dispatcher policy: needs to enumerate the runner secret namespace
# (to check secret presence before dispatching) and read the shared
# ops-repo credentials (token + clone URL) it uses to fetch action TOMLs.
#
# Scope:
# - kv/disinto/runner/* read all per-secret values + list keys
# - kv/disinto/shared/ops-repo/* read the ops-repo creds bundle
#
# The actual ephemeral runner container created per dispatch gets the
# narrow runner-<NAME> policies, NOT this one. This policy stays bound
# to the long-running dispatcher only.
path "kv/data/disinto/runner/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/runner/*" {
capabilities = ["list", "read"]
}
path "kv/data/disinto/shared/ops-repo" {
capabilities = ["read"]
}
path "kv/metadata/disinto/shared/ops-repo" {
capabilities = ["list", "read"]
}

View file

@ -3,13 +3,13 @@
# Read-only access to shared Forgejo secrets (admin password, OAuth client
# config). Attached to the Forgejo Nomad job via workload identity (S2.4).
#
# Scope: kv/disinto/shared/forgejo entries owned by the operator and
# Scope: kv/disinto/shared/forgejo/* entries owned by the operator and
# shared between forgejo + the chat OAuth client (issue #855 lineage).
path "kv/data/disinto/shared/forgejo" {
path "kv/data/disinto/shared/forgejo/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/shared/forgejo" {
path "kv/metadata/disinto/shared/forgejo/*" {
capabilities = ["list", "read"]
}

View file

@ -6,10 +6,10 @@
# Scope: kv/disinto/shared/woodpecker/* entries owned by the operator
# and consumed by woodpecker-server + woodpecker-agent.
path "kv/data/disinto/shared/woodpecker" {
path "kv/data/disinto/shared/woodpecker/*" {
capabilities = ["read"]
}
path "kv/metadata/disinto/shared/woodpecker" {
path "kv/metadata/disinto/shared/woodpecker/*" {
capabilities = ["list", "read"]
}

View file

@ -55,27 +55,7 @@ roles:
- name: service-woodpecker
policy: service-woodpecker
namespace: default
job_id: woodpecker-server
- name: service-woodpecker-agent
policy: service-woodpecker
namespace: default
job_id: woodpecker-agent
# ── Agents composite (nomad/jobs/agents.hcl — S4.1) ──────────────────────
# Single job running all 7 agent roles. Uses a composite policy
# (vault/policies/service-agents.hcl) that unions all bot KV paths.
- name: service-agents
policy: service-agents
namespace: default
job_id: agents
# ── Chat UI (nomad/jobs/chat.hcl — S5.2) ─────────────────────────────────
# Claude chat UI service with OAuth secrets. Uses vault/policies/service-chat.hcl.
- name: service-chat
policy: service-chat
namespace: default
job_id: chat
job_id: woodpecker
# ── Per-agent bots (nomad/jobs/bot-<role>.hcl — land in later steps) ───────
# job_id placeholders match the policy name 1:1 until each bot's jobspec
@ -128,10 +108,10 @@ roles:
job_id: bot-vault
# ── Edge dispatcher ────────────────────────────────────────────────────────
- name: service-dispatcher
policy: service-dispatcher
- name: dispatcher
policy: dispatcher
namespace: default
job_id: edge
job_id: dispatcher
# ── Per-secret runner roles ────────────────────────────────────────────────
# vault-runner (Step 5) composes runner-<NAME> policies onto each